@evalgate/sdk 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +638 -0
- package/README.md +398 -0
- package/dist/assertions.d.ts +189 -0
- package/dist/assertions.js +662 -0
- package/dist/batch.d.ts +68 -0
- package/dist/batch.js +179 -0
- package/dist/cache.d.ts +65 -0
- package/dist/cache.js +131 -0
- package/dist/cli/api.d.ts +108 -0
- package/dist/cli/api.js +132 -0
- package/dist/cli/baseline.d.ts +10 -0
- package/dist/cli/baseline.js +172 -0
- package/dist/cli/check.d.ts +73 -0
- package/dist/cli/check.js +355 -0
- package/dist/cli/ci-context.d.ts +6 -0
- package/dist/cli/ci-context.js +112 -0
- package/dist/cli/ci.d.ts +45 -0
- package/dist/cli/ci.js +192 -0
- package/dist/cli/config.d.ts +30 -0
- package/dist/cli/config.js +230 -0
- package/dist/cli/constants.d.ts +15 -0
- package/dist/cli/constants.js +18 -0
- package/dist/cli/diff.d.ts +173 -0
- package/dist/cli/diff.js +685 -0
- package/dist/cli/discover.d.ts +84 -0
- package/dist/cli/discover.js +419 -0
- package/dist/cli/doctor.d.ts +88 -0
- package/dist/cli/doctor.js +675 -0
- package/dist/cli/env.d.ts +21 -0
- package/dist/cli/env.js +42 -0
- package/dist/cli/explain.d.ts +58 -0
- package/dist/cli/explain.js +561 -0
- package/dist/cli/formatters/github.d.ts +8 -0
- package/dist/cli/formatters/github.js +135 -0
- package/dist/cli/formatters/human.d.ts +6 -0
- package/dist/cli/formatters/human.js +110 -0
- package/dist/cli/formatters/json.d.ts +6 -0
- package/dist/cli/formatters/json.js +10 -0
- package/dist/cli/formatters/pr-comment.d.ts +12 -0
- package/dist/cli/formatters/pr-comment.js +103 -0
- package/dist/cli/formatters/types.d.ts +103 -0
- package/dist/cli/formatters/types.js +8 -0
- package/dist/cli/gate.d.ts +21 -0
- package/dist/cli/gate.js +179 -0
- package/dist/cli/impact-analysis.d.ts +63 -0
- package/dist/cli/impact-analysis.js +252 -0
- package/dist/cli/index.d.ts +9 -0
- package/dist/cli/index.js +332 -0
- package/dist/cli/init.d.ts +16 -0
- package/dist/cli/init.js +292 -0
- package/dist/cli/manifest.d.ts +103 -0
- package/dist/cli/manifest.js +282 -0
- package/dist/cli/migrate.d.ts +41 -0
- package/dist/cli/migrate.js +349 -0
- package/dist/cli/policy-packs.d.ts +23 -0
- package/dist/cli/policy-packs.js +89 -0
- package/dist/cli/print-config.d.ts +29 -0
- package/dist/cli/print-config.js +270 -0
- package/dist/cli/profiles.d.ts +28 -0
- package/dist/cli/profiles.js +30 -0
- package/dist/cli/reason-codes.d.ts +17 -0
- package/dist/cli/reason-codes.js +19 -0
- package/dist/cli/regression-gate.d.ts +15 -0
- package/dist/cli/regression-gate.js +341 -0
- package/dist/cli/render/snippet.d.ts +5 -0
- package/dist/cli/render/snippet.js +15 -0
- package/dist/cli/render/sort.d.ts +10 -0
- package/dist/cli/render/sort.js +24 -0
- package/dist/cli/report/build-check-report.d.ts +19 -0
- package/dist/cli/report/build-check-report.js +132 -0
- package/dist/cli/run.d.ts +101 -0
- package/dist/cli/run.js +395 -0
- package/dist/cli/share.d.ts +17 -0
- package/dist/cli/share.js +91 -0
- package/dist/cli/upgrade.d.ts +15 -0
- package/dist/cli/upgrade.js +492 -0
- package/dist/cli/workspace.d.ts +31 -0
- package/dist/cli/workspace.js +68 -0
- package/dist/client.d.ts +368 -0
- package/dist/client.js +893 -0
- package/dist/client.request.test.d.ts +1 -0
- package/dist/client.request.test.js +232 -0
- package/dist/context.d.ts +134 -0
- package/dist/context.js +215 -0
- package/dist/errors.d.ts +82 -0
- package/dist/errors.js +298 -0
- package/dist/export.d.ts +195 -0
- package/dist/export.js +344 -0
- package/dist/index.d.ts +44 -0
- package/dist/index.js +153 -0
- package/dist/integrations/anthropic.d.ts +91 -0
- package/dist/integrations/anthropic.js +163 -0
- package/dist/integrations/openai-eval.d.ts +57 -0
- package/dist/integrations/openai-eval.js +232 -0
- package/dist/integrations/openai.d.ts +92 -0
- package/dist/integrations/openai.js +160 -0
- package/dist/local.d.ts +39 -0
- package/dist/local.js +148 -0
- package/dist/logger.d.ts +128 -0
- package/dist/logger.js +227 -0
- package/dist/matchers/index.d.ts +1 -0
- package/dist/matchers/index.js +6 -0
- package/dist/matchers/to-pass-gate.d.ts +29 -0
- package/dist/matchers/to-pass-gate.js +35 -0
- package/dist/pagination.d.ts +74 -0
- package/dist/pagination.js +139 -0
- package/dist/regression.d.ts +100 -0
- package/dist/regression.js +44 -0
- package/dist/runtime/adapters/config-to-dsl.d.ts +33 -0
- package/dist/runtime/adapters/config-to-dsl.js +400 -0
- package/dist/runtime/adapters/testsuite-to-dsl.d.ts +63 -0
- package/dist/runtime/adapters/testsuite-to-dsl.js +276 -0
- package/dist/runtime/context.d.ts +26 -0
- package/dist/runtime/context.js +74 -0
- package/dist/runtime/eval.d.ts +46 -0
- package/dist/runtime/eval.js +244 -0
- package/dist/runtime/execution-mode.d.ts +80 -0
- package/dist/runtime/execution-mode.js +357 -0
- package/dist/runtime/executor.d.ts +16 -0
- package/dist/runtime/executor.js +152 -0
- package/dist/runtime/registry.d.ts +78 -0
- package/dist/runtime/registry.js +403 -0
- package/dist/runtime/run-report.d.ts +200 -0
- package/dist/runtime/run-report.js +222 -0
- package/dist/runtime/types.d.ts +356 -0
- package/dist/runtime/types.js +76 -0
- package/dist/snapshot.d.ts +176 -0
- package/dist/snapshot.js +322 -0
- package/dist/streaming.d.ts +173 -0
- package/dist/streaming.js +268 -0
- package/dist/testing.d.ts +273 -0
- package/dist/testing.js +317 -0
- package/dist/types.d.ts +754 -0
- package/dist/types.js +54 -0
- package/dist/utils/input-hash.d.ts +8 -0
- package/dist/utils/input-hash.js +41 -0
- package/dist/version.d.ts +7 -0
- package/dist/version.js +10 -0
- package/dist/workflows.d.ts +389 -0
- package/dist/workflows.js +671 -0
- package/package.json +117 -0
package/dist/cli/env.js
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* CORE-401: Centralized environment detection
|
|
4
|
+
*
|
|
5
|
+
* Provides unified environment detection for all EvalGate CLI commands
|
|
6
|
+
*/
|
|
7
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
8
|
+
exports.isCI = isCI;
|
|
9
|
+
exports.isGitHubActions = isGitHubActions;
|
|
10
|
+
exports.getGitHubStepSummaryPath = getGitHubStepSummaryPath;
|
|
11
|
+
exports.isGitRef = isGitRef;
|
|
12
|
+
/**
 * Report whether the process appears to be running under a CI system.
 *
 * Checks a fixed list of well-known CI environment variables. Any non-empty
 * value counts as "set", so e.g. `CI=false` is still reported as CI.
 *
 * @returns {boolean} `true` when at least one of the variables is non-empty.
 */
function isCI() {
    const ciMarkers = [
        "GITHUB_ACTIONS",
        "CI",
        "CONTINUOUS_INTEGRATION",
        "BUILDKITE",
        "CIRCLECI",
        "TRAVIS",
        "JENKINS_URL",
    ];
    return ciMarkers.some((name) => Boolean(process.env[name]));
}
|
|
24
|
+
/**
 * Report whether the process is running inside GitHub Actions.
 *
 * @returns {boolean} `true` when `GITHUB_ACTIONS` is set to a non-empty value.
 */
function isGitHubActions() {
    const marker = process.env.GITHUB_ACTIONS;
    return Boolean(marker);
}
|
|
30
|
+
/**
 * Read the GitHub step-summary file path from the environment.
 *
 * @returns {string | undefined} The raw value of `GITHUB_STEP_SUMMARY`, or
 *   `undefined` when the variable is not set. No validation is performed.
 */
function getGitHubStepSummaryPath() {
    const { GITHUB_STEP_SUMMARY: summaryPath } = process.env;
    return summaryPath;
}
|
|
36
|
+
/**
 * Heuristically decide whether a string looks like a git reference.
 *
 * This is a prefix match, not an exact match: any string that merely starts
 * with one of the listed names (e.g. "maintenance.json" starts with "main")
 * is accepted, as is a three-dot range such as "a...b".
 *
 * @param {string} ref Candidate string.
 * @returns {boolean} `true` when `ref` matches one of the known patterns.
 */
function isGitRef(ref) {
    // Branch names, remote/namespace prefixes, semver-style tags, three-dot ranges,
    // plus a few fixed branch names.
    const gitRefPattern = /^(main|master|develop|dev|origin\/|remotes\/|feature\/|hotfix\/|release\/|v\d+\.\d+\.\d+|.*\.\.\..*|nonexistent-branch|test-branch|ci-branch)/;
    return gitRefPattern.test(ref);
}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* evalgate explain — Offline report explainer.
|
|
3
|
+
*
|
|
4
|
+
* Reads the last check/gate report artifact and prints:
|
|
5
|
+
* 1. Top failing test cases (up to 3)
|
|
6
|
+
* 2. What changed (baseline vs current)
|
|
7
|
+
* 3. Likely root cause class
|
|
8
|
+
* 4. Suggested fix actions
|
|
9
|
+
*
|
|
10
|
+
* Works offline — no network calls. Designed for CI logs.
|
|
11
|
+
*
|
|
12
|
+
* Usage:
|
|
13
|
+
* evalgate explain # reads evals/regression-report.json or .evalgate/last-report.json
|
|
14
|
+
* evalgate explain --report path/to/report.json
|
|
15
|
+
* evalgate explain --format json
|
|
16
|
+
*
|
|
17
|
+
* Exit codes:
|
|
18
|
+
* 0 — Explained successfully
|
|
19
|
+
* 1 — Report not found or unreadable
|
|
20
|
+
*/
|
|
21
|
+
/** Options accepted by `evalgate explain`, as produced by `parseExplainFlags`. */
export interface ExplainFlags {
    /** Explicit report location from the command line, or `null` to use the default search paths. */
    reportPath: string | null;
    /** Output mode: human-readable text or a JSON document. */
    format: "human" | "json";
}
|
|
25
|
+
/** Coarse root-cause category attached to an explained report; `"unknown"` is the catch-all value. */
export type RootCauseClass = "prompt_drift" | "retrieval_drift" | "formatting_drift" | "tool_use_drift" | "safety_regression" | "cost_regression" | "latency_regression" | "coverage_drop" | "baseline_stale" | "unknown";
|
|
26
|
+
/** One remediation hint shown under "Suggested fixes". */
export interface SuggestedFix {
    /** Short imperative title of the fix. */
    action: string;
    /** Longer explanation, possibly including a command to run. */
    detail: string;
    /** Relative urgency of the fix. */
    priority: "high" | "medium" | "low";
}
|
|
31
|
+
/** Structured result of explaining a report; this is what `--format json` prints. */
export interface ExplainOutput {
    /** Overall verdict string taken from the report. */
    verdict: string;
    /** Current score, when the report carries one. */
    score?: number;
    /** Baseline score, when the report carries one. */
    baselineScore?: number;
    /** Score delta, when the report carries one. */
    delta?: number;
    /** Machine-readable reason code, when the report carries one. */
    reasonCode?: string;
    /** Human-readable reason, when available. */
    reasonMessage?: string;
    /** The first few failing cases, ranked starting at 1. */
    topFailures: Array<{
        rank: number;
        name?: string;
        input?: string;
        expected?: string;
        actual?: string;
        reason?: string;
    }>;
    /** Total number of failures in the report (may exceed `topFailures.length`). */
    totalFailures: number;
    /** Per-metric comparison between baseline and current values. */
    changes: Array<{
        metric: string;
        baseline: string;
        current: string;
        direction: "better" | "worse" | "same";
    }>;
    /** Likely root-cause categories. */
    rootCauses: RootCauseClass[];
    /** Remediation hints derived from `rootCauses`. */
    suggestedFixes: SuggestedFix[];
    /** Path of the report file that was explained. */
    reportPath: string;
}
|
|
57
|
+
/**
 * Parse the arguments of `evalgate explain` (e.g. `--report <path>`, `--format json`).
 *
 * @param argv Command-line arguments to parse.
 * @returns The parsed flags.
 */
export declare function parseExplainFlags(argv: string[]): ExplainFlags;
|
|
58
|
+
/**
 * Entry point of `evalgate explain`.
 *
 * @param argv Command-line arguments for the command.
 * @returns A promise for the process exit code: 0 when the report was explained,
 *   1 when the report was not found or unreadable.
 */
export declare function runExplain(argv: string[]): Promise<number>;
|
|
@@ -0,0 +1,561 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* evalgate explain — Offline report explainer.
|
|
4
|
+
*
|
|
5
|
+
* Reads the last check/gate report artifact and prints:
|
|
6
|
+
* 1. Top failing test cases (up to 3)
|
|
7
|
+
* 2. What changed (baseline vs current)
|
|
8
|
+
* 3. Likely root cause class
|
|
9
|
+
* 4. Suggested fix actions
|
|
10
|
+
*
|
|
11
|
+
* Works offline — no network calls. Designed for CI logs.
|
|
12
|
+
*
|
|
13
|
+
* Usage:
|
|
14
|
+
* evalgate explain # reads evals/regression-report.json or .evalgate/last-report.json
|
|
15
|
+
* evalgate explain --report path/to/report.json
|
|
16
|
+
* evalgate explain --format json
|
|
17
|
+
*
|
|
18
|
+
* Exit codes:
|
|
19
|
+
* 0 — Explained successfully
|
|
20
|
+
* 1 — Report not found or unreadable
|
|
21
|
+
*/
|
|
22
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
23
|
+
if (k2 === undefined) k2 = k;
|
|
24
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
25
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
26
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
27
|
+
}
|
|
28
|
+
Object.defineProperty(o, k2, desc);
|
|
29
|
+
}) : (function(o, m, k, k2) {
|
|
30
|
+
if (k2 === undefined) k2 = k;
|
|
31
|
+
o[k2] = m[k];
|
|
32
|
+
}));
|
|
33
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
34
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
35
|
+
}) : function(o, v) {
|
|
36
|
+
o["default"] = v;
|
|
37
|
+
});
|
|
38
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
39
|
+
var ownKeys = function(o) {
|
|
40
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
41
|
+
var ar = [];
|
|
42
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
43
|
+
return ar;
|
|
44
|
+
};
|
|
45
|
+
return ownKeys(o);
|
|
46
|
+
};
|
|
47
|
+
return function (mod) {
|
|
48
|
+
if (mod && mod.__esModule) return mod;
|
|
49
|
+
var result = {};
|
|
50
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
51
|
+
__setModuleDefault(result, mod);
|
|
52
|
+
return result;
|
|
53
|
+
};
|
|
54
|
+
})();
|
|
55
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
56
|
+
exports.parseExplainFlags = parseExplainFlags;
|
|
57
|
+
exports.runExplain = runExplain;
|
|
58
|
+
const fs = __importStar(require("node:fs"));
|
|
59
|
+
const path = __importStar(require("node:path"));
|
|
60
|
+
const types_1 = require("./formatters/types");
|
|
61
|
+
// ── Arg parsing ──
|
|
62
|
+
/**
 * Parse the argument vector of `evalgate explain`.
 *
 * Accepted forms for every flag:
 *   --key value   the next argument is the value, unless it starts with "--"
 *   --key=value   inline value (everything after the first "=")
 *   --key         bare switch with no value
 * Arguments that do not start with "--" and are not consumed as a value are ignored.
 *
 * @param {string[]} argv Arguments following the `explain` subcommand.
 * @returns {{ reportPath: string | null, format: "human" | "json" }}
 *   `reportPath` is the value of `--report` (preferred) or `--reportPath`, or `null`
 *   when neither was given a value; `format` is "json" only for an exact "json" value.
 */
function parseExplainFlags(argv) {
    // A Map keeps user-supplied keys (e.g. "__proto__") away from Object.prototype.
    // Values are strings, or the boolean `true` for a bare switch.
    const parsed = new Map();
    for (let i = 0; i < argv.length; i++) {
        const arg = argv[i];
        if (!arg.startsWith("--")) {
            continue;
        }
        const body = arg.slice(2);
        const eq = body.indexOf("=");
        if (eq > 0) {
            parsed.set(body.slice(0, eq), body.slice(eq + 1));
            continue;
        }
        const next = argv[i + 1];
        if (next !== undefined && !next.startsWith("--")) {
            parsed.set(body, next);
            i++;
        }
        else {
            parsed.set(body, true);
        }
    }
    // A bare `--report` carries no path; it must not turn into a file literally named "true".
    const explicitPath = [parsed.get("report"), parsed.get("reportPath")].find((value) => typeof value === "string" && value !== "");
    const reportPath = explicitPath ?? null;
    const format = parsed.get("format") === "json" ? "json" : "human";
    return { reportPath, format };
}
|
|
82
|
+
// ── Report discovery ──
|
|
83
|
+
/** Default report locations, relative to the working directory, in lookup order. */
const REPORT_SEARCH_PATHS = [
    "evals/regression-report.json",
    ".evalgate/last-report.json",
    ".evalgate/last_report.json",
];
/**
 * Locate the report file to explain.
 *
 * @param {string} cwd Directory against which relative paths are joined.
 * @param {string | null} explicitPath Path given by the user; when truthy, only
 *   this path is considered and the default locations are not searched.
 * @returns {string | null} Path of the first candidate that exists, or `null`.
 */
function findReport(cwd, explicitPath) {
    let candidates;
    if (explicitPath) {
        candidates = [path.isAbsolute(explicitPath) ? explicitPath : path.join(cwd, explicitPath)];
    }
    else {
        candidates = REPORT_SEARCH_PATHS.map((relative) => path.join(cwd, relative));
    }
    const hit = candidates.find((candidate) => fs.existsSync(candidate));
    return hit === undefined ? null : hit;
}
|
|
102
|
+
// ── Root cause classification ──
|
|
103
|
+
/**
 * Infer likely root-cause classes for a check report.
 *
 * Signals used:
 *  - `reasonCode` maps directly to safety / cost / latency / coverage / baseline causes;
 *  - a `breakdown01.safety` below 0.9 also counts as a safety regression;
 *  - the text of failed-case outputs is scanned for formatting, tool-use and retrieval hints;
 *  - a score `delta` below -2 with failed cases and no more specific drift is "prompt_drift".
 *
 * The report is read from a JSON file, so malformed shapes (non-array `failedCases`,
 * null entries, non-string outputs) are tolerated instead of throwing.
 *
 * @param {object} report Parsed check report.
 * @returns {string[]} De-duplicated cause names; `["unknown"]` when nothing matched.
 */
function classifyRootCauses(report) {
    const causes = [];
    const failedCases = Array.isArray(report.failedCases) ? report.failedCases : [];
    const reasonCode = report.reasonCode ?? "";
    const breakdown = report.breakdown01;
    const delta = report.delta;
    // Lower-cased text of a case field. Structured (non-string) values are serialized
    // so they can still be scanned, rather than crashing on a missing .toLowerCase().
    const textOf = (value) => {
        if (value == null) {
            return "";
        }
        const text = typeof value === "string" ? value : (JSON.stringify(value) ?? String(value));
        return text.toLowerCase();
    };
    // Safety regression
    if (reasonCode === "POLICY_FAILED" ||
        reasonCode === "SAFETY_RISK" ||
        (breakdown?.safety != null && breakdown.safety < 0.9)) {
        causes.push("safety_regression");
    }
    // Cost regression
    if (reasonCode === "COST_BUDGET_EXCEEDED" || reasonCode === "COST_RISK") {
        causes.push("cost_regression");
    }
    // Latency regression
    if (reasonCode === "LATENCY_BUDGET_EXCEEDED" ||
        reasonCode === "LATENCY_RISK") {
        causes.push("latency_regression");
    }
    // Coverage drop
    if (reasonCode === "LOW_SAMPLE_SIZE" ||
        reasonCode === "INSUFFICIENT_EVIDENCE") {
        causes.push("coverage_drop");
    }
    // Analyze failed cases for drift patterns
    if (failedCases.length > 0) {
        const outputs = failedCases
            .map((fc) => textOf(fc?.output))
            .filter(Boolean);
        const expectedOutputs = failedCases
            .map((fc) => textOf(fc?.expectedOutput))
            .filter(Boolean);
        // Whether ANY expected output carries each structural marker; computed once.
        const expectedHas = (marker) => expectedOutputs.some((e) => e.includes(marker));
        const expectsFence = expectedHas("```");
        const expectsBrace = expectedHas("{");
        const expectsAngle = expectedHas("<");
        // Formatting drift: an output's structural markers disagree with the expected set.
        // Requires at least two failed cases so a single odd case does not trigger it.
        const hasFormatIssue = outputs.some((o) => o.includes("```") !== expectsFence ||
            o.includes("{") !== expectsBrace ||
            o.includes("<") !== expectsAngle);
        if (hasFormatIssue && failedCases.length >= 2) {
            causes.push("formatting_drift");
        }
        // Tool use drift: output mentions tool calls or function calls
        const hasToolIssue = outputs.some((o) => o.includes("tool_call") ||
            o.includes("function_call") ||
            o.includes("tool_use"));
        if (hasToolIssue) {
            causes.push("tool_use_drift");
        }
        // Retrieval drift: output mentions "not found", "no results", context issues
        const hasRetrievalIssue = outputs.some((o) => o.includes("not found") ||
            o.includes("no results") ||
            o.includes("no relevant") ||
            o.includes("unable to find"));
        if (hasRetrievalIssue) {
            causes.push("retrieval_drift");
        }
        // Prompt drift: catch-all for score regression with failed cases
        if (delta != null &&
            delta < -2 &&
            !causes.includes("formatting_drift") &&
            !causes.includes("tool_use_drift") &&
            !causes.includes("retrieval_drift")) {
            causes.push("prompt_drift");
        }
    }
    // Baseline stale
    if (reasonCode === "BASELINE_MISSING") {
        causes.push("baseline_stale");
    }
    if (causes.length === 0) {
        causes.push("unknown");
    }
    return [...new Set(causes)];
}
|
|
177
|
+
// ── Suggested fixes ──
|
|
178
|
+
const ROOT_CAUSE_FIXES = {
|
|
179
|
+
prompt_drift: [
|
|
180
|
+
{
|
|
181
|
+
action: "Review prompt changes",
|
|
182
|
+
detail: "Compare current prompt with the version used in baseline run. Diff system/user messages.",
|
|
183
|
+
priority: "high",
|
|
184
|
+
},
|
|
185
|
+
{
|
|
186
|
+
action: "Pin model version",
|
|
187
|
+
detail: "Use a specific model snapshot (e.g. gpt-4-0613) instead of a rolling alias.",
|
|
188
|
+
priority: "medium",
|
|
189
|
+
},
|
|
190
|
+
{
|
|
191
|
+
action: "Update baseline",
|
|
192
|
+
detail: "If changes are intentional, run: npx evalgate baseline update",
|
|
193
|
+
priority: "low",
|
|
194
|
+
},
|
|
195
|
+
],
|
|
196
|
+
retrieval_drift: [
|
|
197
|
+
{
|
|
198
|
+
action: "Check retrieval pipeline",
|
|
199
|
+
detail: "Verify embeddings, index, and chunk strategy haven't changed.",
|
|
200
|
+
priority: "high",
|
|
201
|
+
},
|
|
202
|
+
{
|
|
203
|
+
action: "Update test case context",
|
|
204
|
+
detail: "If knowledge base changed, update expected outputs in test cases.",
|
|
205
|
+
priority: "medium",
|
|
206
|
+
},
|
|
207
|
+
{
|
|
208
|
+
action: "Add retrieval-specific tests",
|
|
209
|
+
detail: "Add test cases that verify document retrieval before generation.",
|
|
210
|
+
priority: "low",
|
|
211
|
+
},
|
|
212
|
+
],
|
|
213
|
+
formatting_drift: [
|
|
214
|
+
{
|
|
215
|
+
action: "Update output format instructions",
|
|
216
|
+
detail: "Check if system prompt format instructions match expected output structure.",
|
|
217
|
+
priority: "high",
|
|
218
|
+
},
|
|
219
|
+
{
|
|
220
|
+
action: "Add format validators",
|
|
221
|
+
detail: "Use schema assertions to validate output structure (JSON schema, regex).",
|
|
222
|
+
priority: "medium",
|
|
223
|
+
},
|
|
224
|
+
{
|
|
225
|
+
action: "Refresh baseline",
|
|
226
|
+
detail: "If new format is intentional, run: npx evalgate baseline update",
|
|
227
|
+
priority: "low",
|
|
228
|
+
},
|
|
229
|
+
],
|
|
230
|
+
tool_use_drift: [
|
|
231
|
+
{
|
|
232
|
+
action: "Verify tool definitions",
|
|
233
|
+
detail: "Check that tool/function schemas match what the model expects.",
|
|
234
|
+
priority: "high",
|
|
235
|
+
},
|
|
236
|
+
{
|
|
237
|
+
action: "Review tool call patterns",
|
|
238
|
+
detail: "Compare tool call sequences in failing vs passing cases.",
|
|
239
|
+
priority: "medium",
|
|
240
|
+
},
|
|
241
|
+
{
|
|
242
|
+
action: "Add tool-use assertions",
|
|
243
|
+
detail: "Assert specific tool calls are made (or not made) per test case.",
|
|
244
|
+
priority: "low",
|
|
245
|
+
},
|
|
246
|
+
],
|
|
247
|
+
safety_regression: [
|
|
248
|
+
{
|
|
249
|
+
action: "Review safety assertions",
|
|
250
|
+
detail: "Check which safety test cases are failing and why.",
|
|
251
|
+
priority: "high",
|
|
252
|
+
},
|
|
253
|
+
{
|
|
254
|
+
action: "Strengthen guardrails",
|
|
255
|
+
detail: "Add or update content filters, system prompt safety instructions.",
|
|
256
|
+
priority: "high",
|
|
257
|
+
},
|
|
258
|
+
{
|
|
259
|
+
action: "Update rubric",
|
|
260
|
+
detail: "If safety criteria changed, update the LLM judge rubric.",
|
|
261
|
+
priority: "medium",
|
|
262
|
+
},
|
|
263
|
+
],
|
|
264
|
+
cost_regression: [
|
|
265
|
+
{
|
|
266
|
+
action: "Check token usage",
|
|
267
|
+
detail: "Compare input/output token counts between baseline and current run.",
|
|
268
|
+
priority: "high",
|
|
269
|
+
},
|
|
270
|
+
{
|
|
271
|
+
action: "Optimize prompts",
|
|
272
|
+
detail: "Reduce prompt length or use a smaller model for non-critical paths.",
|
|
273
|
+
priority: "medium",
|
|
274
|
+
},
|
|
275
|
+
{
|
|
276
|
+
action: "Update cost budget",
|
|
277
|
+
detail: "If higher cost is expected, adjust --max-cost-usd threshold.",
|
|
278
|
+
priority: "low",
|
|
279
|
+
},
|
|
280
|
+
],
|
|
281
|
+
latency_regression: [
|
|
282
|
+
{
|
|
283
|
+
action: "Check response times",
|
|
284
|
+
detail: "Compare per-test-case latency between baseline and current run.",
|
|
285
|
+
priority: "high",
|
|
286
|
+
},
|
|
287
|
+
{
|
|
288
|
+
action: "Reduce prompt complexity",
|
|
289
|
+
detail: "Simplify prompts or use streaming to reduce perceived latency.",
|
|
290
|
+
priority: "medium",
|
|
291
|
+
},
|
|
292
|
+
{
|
|
293
|
+
action: "Update latency budget",
|
|
294
|
+
detail: "If higher latency is expected, adjust --max-latency-ms threshold.",
|
|
295
|
+
priority: "low",
|
|
296
|
+
},
|
|
297
|
+
],
|
|
298
|
+
coverage_drop: [
|
|
299
|
+
{
|
|
300
|
+
action: "Add test cases",
|
|
301
|
+
detail: "Current test count is below minimum. Add more test cases to the evaluation.",
|
|
302
|
+
priority: "high",
|
|
303
|
+
},
|
|
304
|
+
{
|
|
305
|
+
action: "Check test case filtering",
|
|
306
|
+
detail: "Verify no test cases were accidentally deleted or filtered out.",
|
|
307
|
+
priority: "medium",
|
|
308
|
+
},
|
|
309
|
+
],
|
|
310
|
+
baseline_stale: [
|
|
311
|
+
{
|
|
312
|
+
action: "Create baseline",
|
|
313
|
+
detail: "Run: npx evalgate baseline init (or publish a run from the dashboard)",
|
|
314
|
+
priority: "high",
|
|
315
|
+
},
|
|
316
|
+
{
|
|
317
|
+
action: "Use --baseline previous",
|
|
318
|
+
detail: "Compare against the previous run instead of a published baseline.",
|
|
319
|
+
priority: "medium",
|
|
320
|
+
},
|
|
321
|
+
],
|
|
322
|
+
unknown: [
|
|
323
|
+
{
|
|
324
|
+
action: "Run evalgate doctor",
|
|
325
|
+
detail: "Run: npx evalgate doctor to check your full CI/CD setup.",
|
|
326
|
+
priority: "high",
|
|
327
|
+
},
|
|
328
|
+
{
|
|
329
|
+
action: "Check logs",
|
|
330
|
+
detail: "Review CI logs for errors or unexpected behavior.",
|
|
331
|
+
priority: "medium",
|
|
332
|
+
},
|
|
333
|
+
{
|
|
334
|
+
action: "Update baseline",
|
|
335
|
+
detail: "If changes are intentional, run: npx evalgate baseline update",
|
|
336
|
+
priority: "low",
|
|
337
|
+
},
|
|
338
|
+
],
|
|
339
|
+
};
|
|
340
|
+
function suggestFixes(causes) {
|
|
341
|
+
const seen = new Set();
|
|
342
|
+
const fixes = [];
|
|
343
|
+
for (const cause of causes) {
|
|
344
|
+
for (const fix of ROOT_CAUSE_FIXES[cause] ?? []) {
|
|
345
|
+
if (!seen.has(fix.action)) {
|
|
346
|
+
seen.add(fix.action);
|
|
347
|
+
fixes.push(fix);
|
|
348
|
+
}
|
|
349
|
+
}
|
|
350
|
+
}
|
|
351
|
+
// Sort by priority
|
|
352
|
+
const pOrder = { high: 0, medium: 1, low: 2 };
|
|
353
|
+
return fixes.sort((a, b) => (pOrder[a.priority] ?? 9) - (pOrder[b.priority] ?? 9));
|
|
354
|
+
}
|
|
355
|
+
// ── Build explain output ──
|
|
356
|
+
/**
 * Build the explain output for either supported report shape.
 *
 * A report carrying both a `category` and a `deltas` key is treated as a
 * builtin gate report (from `evalgate gate`); anything else is treated as a
 * check report (from `evalgate check`).
 *
 * @param {object} report Parsed report object.
 * @param {string} reportPath Path to echo back in the output.
 * @returns {object} The explain output.
 */
function buildExplainOutput(report, reportPath) {
    const hasBuiltinShape = "category" in report && "deltas" in report;
    return hasBuiltinShape
        ? buildFromBuiltinReport(report, reportPath)
        : buildFromCheckReport(report, reportPath);
}
|
|
364
|
+
/**
 * Build the explain output from a check report.
 *
 * - `topFailures`: the first three failed cases, preferring the `*Snippet` fields.
 * - `changes`: a "Score" row when both scores are present, plus "Pass rate" and
 *   "Safety" rows from `breakdown01`. Those two rows have no baseline ("—");
 *   safety is flagged "worse" below 0.95.
 *
 * @param {object} report Parsed check report.
 * @param {string} reportPath Path to echo back in the output.
 * @returns {object} The explain output.
 */
function buildFromCheckReport(report, reportPath) {
    const failedCases = report.failedCases ?? [];
    const breakdown = report.breakdown01;
    const asPercent = (ratio) => `${Math.round(ratio * 100)}%`;
    // Top failures (up to 3)
    const topFailures = [];
    failedCases.slice(0, 3).forEach((failed, index) => {
        topFailures.push({
            rank: index + 1,
            name: failed.name,
            input: failed.inputSnippet || failed.input,
            expected: failed.expectedSnippet || failed.expectedOutput,
            actual: failed.outputSnippet || failed.output,
            reason: failed.reason,
        });
    });
    // Changes
    const changes = [];
    if (report.score != null && report.baselineScore != null) {
        const scoreDiff = report.score - report.baselineScore;
        let direction = "same";
        if (scoreDiff > 0) {
            direction = "better";
        }
        else if (scoreDiff < 0) {
            direction = "worse";
        }
        changes.push({
            metric: "Score",
            baseline: String(report.baselineScore),
            current: String(report.score),
            direction,
        });
    }
    if (breakdown?.passRate != null) {
        changes.push({
            metric: "Pass rate",
            baseline: "—",
            current: asPercent(breakdown.passRate),
            direction: "same",
        });
    }
    if (breakdown?.safety != null) {
        changes.push({
            metric: "Safety",
            baseline: "—",
            current: asPercent(breakdown.safety),
            direction: breakdown.safety < 0.95 ? "worse" : "same",
        });
    }
    const rootCauses = classifyRootCauses(report);
    return {
        verdict: report.verdict ?? "unknown",
        score: report.score,
        baselineScore: report.baselineScore,
        delta: report.delta,
        reasonCode: report.reasonCode,
        reasonMessage: report.reasonMessage ?? report.actionableMessage,
        topFailures,
        totalFailures: failedCases.length,
        changes,
        rootCauses,
        suggestedFixes: suggestFixes(rootCauses),
        reportPath,
    };
}
|
|
421
|
+
/**
 * Build the explain output from a builtin gate report.
 *
 * - `changes`: one row per entry of `deltas`; any status other than "pass" is "worse".
 * - `topFailures`: the first three failure messages, used as the `reason`.
 * - Root causes come from the failure text: "failing" maps to prompt drift,
 *   "count dropped" maps to coverage drop, otherwise "unknown".
 *
 * @param {object} report Parsed builtin report.
 * @param {string} reportPath Path to echo back in the output.
 * @returns {object} The explain output.
 */
function buildFromBuiltinReport(report, reportPath) {
    const failures = report.failures ?? [];
    const deltas = report.deltas ?? [];
    const changes = [];
    for (const entry of deltas) {
        changes.push({
            metric: entry.metric,
            baseline: String(entry.baseline),
            current: String(entry.current),
            direction: entry.status === "pass" ? "same" : "worse",
        });
    }
    const topFailures = failures
        .slice(0, 3)
        .map((message, index) => ({ rank: index + 1, reason: message }));
    // Simple root cause for builtin reports
    const anyFailureMentions = (needle) => failures.some((message) => message.includes(needle));
    const rootCauses = [];
    if (anyFailureMentions("failing")) {
        rootCauses.push("prompt_drift");
    }
    if (anyFailureMentions("count dropped")) {
        rootCauses.push("coverage_drop");
    }
    if (rootCauses.length === 0) {
        rootCauses.push("unknown");
    }
    return {
        verdict: report.passed ? "pass" : "fail",
        reasonCode: report.category ?? undefined,
        reasonMessage: failures[0],
        topFailures,
        totalFailures: failures.length,
        changes,
        rootCauses,
        suggestedFixes: suggestFixes(rootCauses),
        reportPath,
    };
}
|
|
455
|
+
// ── Output formatting ──
|
|
456
|
+
function printHuman(output) {
|
|
457
|
+
const verdictIcon = output.verdict === "pass"
|
|
458
|
+
? "\u2705"
|
|
459
|
+
: output.verdict === "warn"
|
|
460
|
+
? "\u26A0\uFE0F"
|
|
461
|
+
: "\u274C";
|
|
462
|
+
console.log(`\n evalgate explain\n`);
|
|
463
|
+
console.log(` ${verdictIcon} Verdict: ${output.verdict.toUpperCase()}`);
|
|
464
|
+
if (output.score != null) {
|
|
465
|
+
const scoreStr = output.baselineScore != null
|
|
466
|
+
? `${output.score} (baseline: ${output.baselineScore}, delta: ${output.delta ?? "n/a"})`
|
|
467
|
+
: `${output.score}`;
|
|
468
|
+
console.log(` Score: ${scoreStr}`);
|
|
469
|
+
}
|
|
470
|
+
if (output.reasonMessage) {
|
|
471
|
+
console.log(` Reason: ${output.reasonMessage}`);
|
|
472
|
+
}
|
|
473
|
+
// Changes
|
|
474
|
+
if (output.changes.length > 0) {
|
|
475
|
+
console.log("\n What changed:");
|
|
476
|
+
for (const c of output.changes) {
|
|
477
|
+
const arrow = c.direction === "worse"
|
|
478
|
+
? "\u2193"
|
|
479
|
+
: c.direction === "better"
|
|
480
|
+
? "\u2191"
|
|
481
|
+
: "\u2192";
|
|
482
|
+
console.log(` ${arrow} ${c.metric}: ${c.baseline} \u2192 ${c.current}`);
|
|
483
|
+
}
|
|
484
|
+
}
|
|
485
|
+
// Top failures
|
|
486
|
+
if (output.topFailures.length > 0) {
|
|
487
|
+
console.log(`\n Top failing cases (${output.topFailures.length} of ${output.totalFailures}):`);
|
|
488
|
+
for (const f of output.topFailures) {
|
|
489
|
+
console.log(`\n ${f.rank}. ${f.name ?? "unnamed"}`);
|
|
490
|
+
if (f.input)
|
|
491
|
+
console.log(` Input: ${f.input}`);
|
|
492
|
+
if (f.expected)
|
|
493
|
+
console.log(` Expected: ${f.expected}`);
|
|
494
|
+
if (f.actual)
|
|
495
|
+
console.log(` Actual: ${f.actual}`);
|
|
496
|
+
if (f.reason)
|
|
497
|
+
console.log(` Reason: ${f.reason}`);
|
|
498
|
+
}
|
|
499
|
+
}
|
|
500
|
+
// Root causes
|
|
501
|
+
if (output.rootCauses.length > 0 && output.rootCauses[0] !== "unknown") {
|
|
502
|
+
console.log("\n Likely root causes:");
|
|
503
|
+
for (const cause of output.rootCauses) {
|
|
504
|
+
console.log(` \u2022 ${cause.replace(/_/g, " ")}`);
|
|
505
|
+
}
|
|
506
|
+
}
|
|
507
|
+
// Suggested fixes
|
|
508
|
+
if (output.suggestedFixes.length > 0) {
|
|
509
|
+
console.log("\n Suggested fixes:");
|
|
510
|
+
for (const fix of output.suggestedFixes) {
|
|
511
|
+
const pIcon = fix.priority === "high"
|
|
512
|
+
? "\u203C\uFE0F"
|
|
513
|
+
: fix.priority === "medium"
|
|
514
|
+
? "\u2757"
|
|
515
|
+
: "\u2022";
|
|
516
|
+
console.log(` ${pIcon} ${fix.action}`);
|
|
517
|
+
console.log(` ${fix.detail}`);
|
|
518
|
+
}
|
|
519
|
+
}
|
|
520
|
+
console.log(`\n Report: ${output.reportPath}\n`);
|
|
521
|
+
}
|
|
522
|
+
// ── Main ──
|
|
523
|
+
/**
 * Entry point of `evalgate explain`: locate a report, explain it, print the result.
 *
 * @param {string[]} argv Arguments following the `explain` subcommand.
 * @returns {Promise<number>} Exit code: 0 when the report was explained; 1 when no
 *   report was found, the file could not be read or parsed, or its content is not
 *   a JSON object.
 */
async function runExplain(argv) {
    const flags = parseExplainFlags(argv);
    const cwd = process.cwd();
    const reportPath = findReport(cwd, flags.reportPath);
    if (!reportPath) {
        const searched = flags.reportPath
            ? flags.reportPath
            : REPORT_SEARCH_PATHS.join(", ");
        console.error(`\n \u274C No report found. Searched: ${searched}`);
        console.error(" Run a gate first:");
        console.error(" npx evalgate gate --format json");
        console.error(" npx evalgate check --format json > .evalgate/last-report.json\n");
        return 1;
    }
    let reportData;
    try {
        reportData = JSON.parse(fs.readFileSync(reportPath, "utf-8"));
    }
    catch {
        // Covers both read failures (e.g. the path is a directory) and invalid JSON.
        console.error(`\n \u274C Cannot parse report: ${reportPath}\n`);
        return 1;
    }
    // Valid JSON is not necessarily a report: `null`, numbers, strings and arrays would
    // otherwise crash below with a TypeError instead of returning exit code 1.
    if (reportData === null || typeof reportData !== "object" || Array.isArray(reportData)) {
        console.error(`\n \u274C Report is not a JSON object: ${reportPath}\n`);
        return 1;
    }
    // Schema version compatibility check: warn only, then explain on a best-effort basis.
    const reportSchema = typeof reportData.schemaVersion === "number"
        ? reportData.schemaVersion
        : undefined;
    if (reportSchema != null && reportSchema > types_1.CHECK_REPORT_SCHEMA_VERSION) {
        console.error(`\n \u26A0\uFE0F Report schema version ${reportSchema} is newer than this CLI supports (v${types_1.CHECK_REPORT_SCHEMA_VERSION}).`);
        console.error(" Update your SDK: npm install @evalgate/sdk@latest\n");
    }
    // The report path is shown relative to the working directory.
    const output = buildExplainOutput(reportData, path.relative(cwd, reportPath));
    if (flags.format === "json") {
        console.log(JSON.stringify(output, null, 2));
    }
    else {
        printHuman(output);
    }
    return 0;
}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* GitHub formatter for evalgate check.
|
|
3
|
+
* - stdout: minimal (verdict + score + link) + ::error annotations for failed cases
|
|
4
|
+
* - Step summary: full Markdown written to GITHUB_STEP_SUMMARY (not stdout)
|
|
5
|
+
*/
|
|
6
|
+
import type { CheckReport } from "./types";
|
|
7
|
+
/**
 * Step-summary half of the GitHub formatter.
 * NOTE(review): per the module header this writes the full Markdown summary to
 * GITHUB_STEP_SUMMARY rather than stdout — the implementation is not visible here; confirm.
 *
 * @param report Check report to summarize.
 */
export declare function appendStepSummary(report: CheckReport): void;
|
|
8
|
+
/**
 * Stdout half of the GitHub formatter.
 * NOTE(review): per the module header the text is minimal (verdict + score + link) plus
 * `::error` annotations for failed cases — the implementation is not visible here; confirm.
 *
 * @param report Check report to format.
 * @returns The formatted text.
 */
export declare function formatGitHub(report: CheckReport): string;
|