@pauly4010/evalai-sdk 1.4.1 → 1.5.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +85 -0
- package/README.md +205 -543
- package/dist/assertions.d.ts +2 -2
- package/dist/assertions.js +104 -71
- package/dist/batch.js +12 -17
- package/dist/cache.js +7 -11
- package/dist/cli/api.d.ts +108 -0
- package/dist/cli/api.js +130 -0
- package/dist/cli/check.d.ts +28 -13
- package/dist/cli/check.js +249 -142
- package/dist/cli/ci-context.d.ts +6 -0
- package/dist/cli/ci-context.js +110 -0
- package/dist/cli/config.d.ts +30 -0
- package/dist/cli/config.js +207 -0
- package/dist/cli/constants.d.ts +15 -0
- package/dist/cli/constants.js +18 -0
- package/dist/cli/doctor.d.ts +11 -0
- package/dist/cli/doctor.js +82 -0
- package/dist/cli/formatters/github.d.ts +8 -0
- package/dist/cli/formatters/github.js +130 -0
- package/dist/cli/formatters/human.d.ts +6 -0
- package/dist/cli/formatters/human.js +107 -0
- package/dist/cli/formatters/json.d.ts +6 -0
- package/dist/cli/formatters/json.js +10 -0
- package/dist/cli/formatters/pr-comment.d.ts +12 -0
- package/dist/cli/formatters/pr-comment.js +101 -0
- package/dist/cli/formatters/types.d.ts +100 -0
- package/dist/cli/formatters/types.js +5 -0
- package/dist/cli/gate.d.ts +21 -0
- package/dist/cli/gate.js +175 -0
- package/dist/cli/index.d.ts +1 -0
- package/dist/cli/index.js +67 -23
- package/dist/cli/init.d.ts +7 -0
- package/dist/cli/init.js +69 -0
- package/dist/cli/policy-packs.d.ts +23 -0
- package/dist/cli/policy-packs.js +83 -0
- package/dist/cli/profiles.d.ts +28 -0
- package/dist/cli/profiles.js +30 -0
- package/dist/cli/reason-codes.d.ts +17 -0
- package/dist/cli/reason-codes.js +19 -0
- package/dist/cli/render/snippet.d.ts +5 -0
- package/dist/cli/render/snippet.js +15 -0
- package/dist/cli/render/sort.d.ts +10 -0
- package/dist/cli/render/sort.js +24 -0
- package/dist/cli/report/build-check-report.d.ts +19 -0
- package/dist/cli/report/build-check-report.js +124 -0
- package/dist/cli/share.d.ts +17 -0
- package/dist/cli/share.js +83 -0
- package/dist/client.d.ts +2 -2
- package/dist/client.js +144 -132
- package/dist/context.d.ts +1 -1
- package/dist/context.js +4 -6
- package/dist/errors.d.ts +2 -0
- package/dist/errors.js +116 -107
- package/dist/export.d.ts +6 -6
- package/dist/export.js +39 -33
- package/dist/index.d.ts +25 -24
- package/dist/index.js +62 -56
- package/dist/integrations/anthropic.d.ts +1 -1
- package/dist/integrations/anthropic.js +23 -19
- package/dist/integrations/openai-eval.d.ts +57 -0
- package/dist/integrations/openai-eval.js +230 -0
- package/dist/integrations/openai.d.ts +1 -1
- package/dist/integrations/openai.js +23 -19
- package/dist/local.d.ts +2 -2
- package/dist/local.js +25 -25
- package/dist/logger.d.ts +1 -1
- package/dist/logger.js +24 -28
- package/dist/matchers/index.d.ts +1 -0
- package/dist/matchers/index.js +6 -0
- package/dist/matchers/to-pass-gate.d.ts +29 -0
- package/dist/matchers/to-pass-gate.js +35 -0
- package/dist/pagination.d.ts +1 -1
- package/dist/pagination.js +6 -6
- package/dist/snapshot.js +24 -24
- package/dist/streaming.js +11 -11
- package/dist/testing.d.ts +6 -2
- package/dist/testing.js +30 -12
- package/dist/types.d.ts +22 -22
- package/dist/types.js +13 -13
- package/dist/utils/input-hash.d.ts +8 -0
- package/dist/utils/input-hash.js +38 -0
- package/dist/version.d.ts +7 -0
- package/dist/version.js +10 -0
- package/dist/workflows.d.ts +7 -7
- package/dist/workflows.js +44 -44
- package/package.json +102 -90
- package/dist/__tests__/assertions.test.d.ts +0 -1
- package/dist/__tests__/assertions.test.js +0 -288
- package/dist/__tests__/client.test.d.ts +0 -1
- package/dist/__tests__/client.test.js +0 -185
- package/dist/__tests__/testing.test.d.ts +0 -1
- package/dist/__tests__/testing.test.js +0 -230
- package/dist/__tests__/workflows.test.d.ts +0 -1
- package/dist/__tests__/workflows.test.js +0 -222
package/dist/cli/check.js
CHANGED
|
@@ -15,10 +15,14 @@
|
|
|
15
15
|
* --minN <n> Fail if total test cases < n (low sample size)
|
|
16
16
|
* --allowWeakEvidence If false (default), fail when evidenceLevel is 'weak'
|
|
17
17
|
* --policy <name> Enforce a compliance policy (e.g. HIPAA, SOC2, GDPR)
|
|
18
|
-
* --baseline <mode>
|
|
18
|
+
 *   --baseline <mode>     Baseline comparison mode: "published" (default), "previous", "production", or "auto"
|
|
19
19
|
* --evaluationId <id> Required. The evaluation to gate on.
|
|
20
20
|
* --baseUrl <url> API base URL (default: EVALAI_BASE_URL or http://localhost:3000)
|
|
21
21
|
* --apiKey <key> API key (default: EVALAI_API_KEY env var)
|
|
22
|
+
* --share <mode> Share link: "always" | "fail" | "never" (default: never)
|
|
23
|
+
* fail = create public share link only when gate fails (CI-friendly)
|
|
24
|
+
* --pr-comment-out <file> Write PR comment markdown to file (for GitHub Action to post)
|
|
25
|
+
* --profile <name> Preset: strict (95/0/30), balanced (90/2/10), fast (85/5/5). Explicit flags override.
|
|
22
26
|
*
|
|
23
27
|
* Exit codes:
|
|
24
28
|
* 0 — Gate passed
|
|
@@ -29,187 +33,290 @@
|
|
|
29
33
|
* 5 — Invalid arguments
|
|
30
34
|
* 6 — Gate failed: total test cases < minN
|
|
31
35
|
* 7 — Gate failed: weak evidence (evidenceLevel === 'weak')
|
|
36
|
+
* 8 — Gate warned: near-regression (warnDrop ≤ drop < maxDrop)
|
|
32
37
|
*
|
|
33
38
|
* Environment:
|
|
34
39
|
* EVALAI_BASE_URL — API base URL (default: http://localhost:3000)
|
|
35
40
|
* EVALAI_API_KEY — API key for authentication
|
|
36
41
|
*/
|
|
42
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
43
|
+
if (k2 === undefined) k2 = k;
|
|
44
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
45
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
46
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
47
|
+
}
|
|
48
|
+
Object.defineProperty(o, k2, desc);
|
|
49
|
+
}) : (function(o, m, k, k2) {
|
|
50
|
+
if (k2 === undefined) k2 = k;
|
|
51
|
+
o[k2] = m[k];
|
|
52
|
+
}));
|
|
53
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
54
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
55
|
+
}) : function(o, v) {
|
|
56
|
+
o["default"] = v;
|
|
57
|
+
});
|
|
58
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
59
|
+
var ownKeys = function(o) {
|
|
60
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
61
|
+
var ar = [];
|
|
62
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
63
|
+
return ar;
|
|
64
|
+
};
|
|
65
|
+
return ownKeys(o);
|
|
66
|
+
};
|
|
67
|
+
return function (mod) {
|
|
68
|
+
if (mod && mod.__esModule) return mod;
|
|
69
|
+
var result = {};
|
|
70
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
71
|
+
__setModuleDefault(result, mod);
|
|
72
|
+
return result;
|
|
73
|
+
};
|
|
74
|
+
})();
|
|
37
75
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
38
76
|
exports.EXIT = void 0;
|
|
39
77
|
exports.parseArgs = parseArgs;
|
|
40
78
|
exports.runCheck = runCheck;
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
79
|
+
const fs = __importStar(require("node:fs"));
|
|
80
|
+
const api_1 = require("./api");
|
|
81
|
+
const ci_context_1 = require("./ci-context");
|
|
82
|
+
const config_1 = require("./config");
|
|
83
|
+
const constants_1 = require("./constants");
|
|
84
|
+
const github_1 = require("./formatters/github");
|
|
85
|
+
const human_1 = require("./formatters/human");
|
|
86
|
+
const json_1 = require("./formatters/json");
|
|
87
|
+
const pr_comment_1 = require("./formatters/pr-comment");
|
|
88
|
+
const gate_1 = require("./gate");
|
|
89
|
+
const build_check_report_1 = require("./report/build-check-report");
|
|
90
|
+
var constants_2 = require("./constants");
|
|
91
|
+
Object.defineProperty(exports, "EXIT", { enumerable: true, get: function () { return constants_2.EXIT; } });
|
|
52
92
|
/**
 * Parse `evalai check` CLI arguments and merge them with the discovered config.
 *
 * Flags are read as `--key value`; a flag with no following value (or followed
 * by another `--flag`) is stored as the string "true". Explicit CLI flags win
 * over values returned by `mergeConfigWithArgs`.
 *
 * @param argv Raw argument list (typically `process.argv.slice(2)`).
 * @returns `{ ok: true, args }` with the resolved options, or
 *          `{ ok: false, exitCode, message }` when a required value is missing
 *          or a numeric option is invalid.
 */
function parseArgs(argv) {
    // Collect `--key value` pairs into a flat string map.
    const args = {};
    for (let i = 0; i < argv.length; i++) {
        const arg = argv[i];
        if (arg.startsWith("--")) {
            const key = arg.slice(2);
            const next = argv[i + 1];
            if (next !== undefined && !next.startsWith("--")) {
                args[key] = next;
                i++;
            }
            else {
                args[key] = "true";
            }
        }
    }
    // Resolve each option from CLI flag → environment → built-in default.
    let baseUrl = args.baseUrl || process.env.EVALAI_BASE_URL || "http://localhost:3000";
    const apiKey = args.apiKey || process.env.EVALAI_API_KEY || "";
    let minScore = parseInt(args.minScore || "0", 10);
    let maxDrop = args.maxDrop ? parseInt(args.maxDrop, 10) : undefined;
    let warnDrop = args.warnDrop ? parseInt(args.warnDrop, 10) : undefined;
    let minN = args.minN ? parseInt(args.minN, 10) : undefined;
    let allowWeakEvidence = args.allowWeakEvidence === "true" || args.allowWeakEvidence === "1";
    let evaluationId = args.evaluationId || "";
    const policy = args.policy || undefined;
    // Unknown --format / --share values fall back to the defaults ("human" / "never").
    const formatRaw = args.format || "human";
    const format = formatRaw === "json" ? "json" : formatRaw === "github" ? "github" : "human";
    const explain = args.explain === "true" || args.explain === "1";
    const onFail = args.onFail === "import" ? "import" : undefined;
    const shareRaw = args.share || "never";
    const share = shareRaw === "always" ? "always" : shareRaw === "fail" ? "fail" : "never";
    // Both kebab-case and camelCase spellings are accepted for these flags.
    const prCommentOut = args["pr-comment-out"] || args.prCommentOut || undefined;
    const maxCostUsd = args["max-cost-usd"] || args.maxCostUsd
        ? parseFloat(args["max-cost-usd"] || args.maxCostUsd || "0")
        : undefined;
    const maxLatencyMs = args["max-latency-ms"] || args.maxLatencyMs
        ? parseInt(args["max-latency-ms"] || args.maxLatencyMs || "0", 10)
        : undefined;
    const maxCostDeltaUsd = args["max-cost-delta-usd"] || args.maxCostDeltaUsd
        ? parseFloat(args["max-cost-delta-usd"] || args.maxCostDeltaUsd || "0")
        : undefined;
    const profile = args.profile;
    let baseline = (args.baseline === "auto"
        ? "auto"
        : args.baseline === "previous"
            ? "previous"
            : args.baseline === "production"
                ? "production"
                : "published");
    // Layer in file-based config; a value from config is only used when the
    // corresponding CLI flag was not supplied.
    const config = (0, config_1.loadConfig)(process.cwd());
    const merged = (0, config_1.mergeConfigWithArgs)(config, {
        evaluationId: args.evaluationId,
        baseUrl: args.baseUrl || process.env.EVALAI_BASE_URL,
        minScore: args.minScore,
        maxDrop: args.maxDrop,
        warnDrop: args.warnDrop,
        minN: args.minN,
        allowWeakEvidence: args.allowWeakEvidence,
        baseline: args.baseline,
        profile: profile,
        prCommentOut: args["pr-comment-out"] ?? args.prCommentOut,
    });
    if (!evaluationId && merged.evaluationId)
        evaluationId = merged.evaluationId;
    if (merged.baseUrl)
        baseUrl = merged.baseUrl;
    if (merged.minScore != null && args.minScore === undefined)
        minScore = merged.minScore;
    if (merged.maxDrop != null && args.maxDrop === undefined)
        maxDrop = merged.maxDrop;
    if (merged.warnDrop != null && args.warnDrop === undefined)
        warnDrop = merged.warnDrop;
    if (merged.minN != null && args.minN === undefined)
        minN = merged.minN;
    if (merged.allowWeakEvidence != null && args.allowWeakEvidence === undefined)
        allowWeakEvidence = merged.allowWeakEvidence;
    if (merged.baseline && !args.baseline)
        baseline = merged.baseline;
    // Validation: every failure is reported as BAD_ARGS with a message.
    if (!apiKey) {
        return {
            ok: false,
            exitCode: constants_1.EXIT.BAD_ARGS,
            message: "Error: --apiKey or EVALAI_API_KEY is required",
        };
    }
    if (!evaluationId) {
        return {
            ok: false,
            exitCode: constants_1.EXIT.BAD_ARGS,
            message: "Run npx evalai init and paste your evaluationId, or pass --evaluationId.",
        };
    }
    if (Number.isNaN(minScore) || minScore < 0 || minScore > 100) {
        return { ok: false, exitCode: constants_1.EXIT.BAD_ARGS, message: "Error: --minScore must be 0-100" };
    }
    if (minN !== undefined && (Number.isNaN(minN) || minN < 1)) {
        return {
            ok: false,
            exitCode: constants_1.EXIT.BAD_ARGS,
            message: "Error: --minN must be a positive number",
        };
    }
    // A non-numeric --maxDrop / --warnDrop parses to NaN; reject it here instead
    // of handing a NaN threshold to the gate.
    if (maxDrop !== undefined && Number.isNaN(maxDrop)) {
        return { ok: false, exitCode: constants_1.EXIT.BAD_ARGS, message: "Error: --maxDrop must be a number" };
    }
    if (warnDrop !== undefined && Number.isNaN(warnDrop)) {
        return { ok: false, exitCode: constants_1.EXIT.BAD_ARGS, message: "Error: --warnDrop must be a number" };
    }
    return {
        ok: true,
        args: {
            baseUrl,
            apiKey,
            minScore,
            maxDrop,
            warnDrop,
            minN,
            allowWeakEvidence,
            evaluationId,
            policy,
            baseline,
            format,
            explain,
            onFail,
            share,
            prCommentOut,
            // Unparseable cost/latency limits are dropped (treated as not set).
            maxCostUsd: maxCostUsd != null && !Number.isNaN(maxCostUsd) ? maxCostUsd : undefined,
            maxLatencyMs: maxLatencyMs != null && !Number.isNaN(maxLatencyMs) ? maxLatencyMs : undefined,
            maxCostDeltaUsd: maxCostDeltaUsd != null && !Number.isNaN(maxCostDeltaUsd) ? maxCostDeltaUsd : undefined,
        },
    };
}
|
|
99
218
|
/**
 * Run the quality gate for one evaluation and print the report.
 *
 * Steps, in order: fetch the latest quality result, fetch run details when a
 * run id is present, evaluate the gate, optionally create a share link,
 * build and print the report in the requested format, optionally write the
 * PR-comment markdown file, and optionally import the failed run.
 *
 * @param args Resolved options as produced by `parseArgs` (the `args` field).
 * @returns The process exit code: `EXIT.API_ERROR` when the quality fetch
 *          fails, otherwise the exit code chosen by the gate.
 */
async function runCheck(args) {
    const qualityResult = await (0, api_1.fetchQualityLatest)(args.baseUrl, args.apiKey, args.evaluationId, args.baseline);
    if (!qualityResult.ok) {
        // A status of 0 is reported as a network failure rather than an HTTP error.
        if (qualityResult.status === 0) {
            console.error(`EvalAI gate ERROR: Network failure — ${qualityResult.body}`);
        }
        else {
            console.error(`EvalAI gate ERROR: API returned ${qualityResult.status} — ${qualityResult.body}`);
        }
        return constants_1.EXIT.API_ERROR;
    }
    const { data: quality, requestId } = qualityResult;
    const evaluationRunId = quality?.evaluationRunId;
    // Run details are optional: a failed fetch leaves runDetails as null.
    let runDetails = null;
    if (evaluationRunId != null) {
        const runRes = await (0, api_1.fetchRunDetails)(args.baseUrl, args.apiKey, args.evaluationId, evaluationRunId);
        if (runRes.ok)
            runDetails = runRes.data;
    }
    const gateResult = (0, gate_1.evaluateGate)(args, quality);
    // Create share before report when PR comment needs shareUrl (--pr-comment-out + --share fail + gate failed)
    let shareUrl;
    const shouldCreateShare = evaluationRunId != null &&
        (args.share === "always" || (args.share === "fail" && !gateResult.passed));
    if (shouldCreateShare) {
        const exportRes = await (0, api_1.fetchRunExport)(args.baseUrl, args.apiKey, args.evaluationId, evaluationRunId);
        if (!exportRes.ok) {
            // Sharing is best-effort, but say so instead of failing silently.
            console.error("EvalAI share: failed to export run; share link not created");
        }
        else {
            const publishRes = await (0, api_1.publishShare)(args.baseUrl, args.apiKey, args.evaluationId, exportRes.exportData, evaluationRunId);
            if (publishRes.ok) {
                shareUrl = publishRes.data.shareUrl;
                console.error(`\nPublic share link created: ${shareUrl}`);
            }
            else {
                console.error("EvalAI share: failed to publish share link");
            }
        }
    }
    const ci = (0, ci_context_1.captureCiContext)();
    const report = (0, build_check_report_1.buildCheckReport)({
        args,
        quality,
        runDetails,
        gateResult,
        requestId,
        shareUrl,
        baselineRunId: quality?.baselineRunId ?? undefined,
        ciRunUrl: ci?.runUrl ?? undefined,
    });
    const formatted = args.format === "json"
        ? (0, json_1.formatJson)(report)
        : args.format === "github"
            ? (0, github_1.formatGitHub)(report)
            : (0, human_1.formatHuman)(report);
    console.log(formatted);
    // --pr-comment-out: write markdown to file for GitHub Action to post
    if (args.prCommentOut) {
        try {
            const markdown = (0, pr_comment_1.buildPrComment)(report);
            fs.writeFileSync(args.prCommentOut, markdown, "utf8");
        }
        catch (err) {
            // A write failure is logged but does not change the exit code.
            console.error(`EvalAI: failed to write PR comment to ${args.prCommentOut}: ${err instanceof Error ? err.message : String(err)}`);
        }
    }
    // --onFail import: when gate fails, import run with CI context
    if (!gateResult.passed &&
        args.onFail === "import" &&
        runDetails?.results &&
        evaluationRunId) {
        // Only results tied to a test case with a final passed/failed status are imported.
        const importResults = runDetails.results
            .filter((r) => r.testCaseId != null && (r.status === "passed" || r.status === "failed"))
            .map((r) => ({
            testCaseId: r.testCaseId,
            status: r.status,
            output: r.output ?? "",
            latencyMs: r.durationMs,
            assertionsJson: r.assertionsJson,
        }));
        if (importResults.length > 0) {
            const idempotencyKey = ci ? (0, ci_context_1.computeIdempotencyKey)(args.evaluationId, ci) : undefined;
            const importRes = await (0, api_1.importRunOnFail)(args.baseUrl, args.apiKey, args.evaluationId, importResults, {
                idempotencyKey,
                ci,
                importClientVersion: "evalai-cli",
                checkReport: report,
            });
            if (!importRes.ok) {
                console.error(`EvalAI import (onFail): ${importRes.status} — ${importRes.body}`);
            }
        }
    }
    return gateResult.exitCode;
}
|
|
205
308
|
// Main entry point: the CLI only runs when this file is executed directly,
// not when it is loaded through require() by another module.
const isDirectRun = typeof require !== "undefined" && require.main === module;
if (isDirectRun) {
    const cliInput = parseArgs(process.argv.slice(2));
    if (!cliInput.ok) {
        // Invalid arguments: print the reason and exit with the code chosen by parseArgs.
        console.error(cliInput.message);
        process.exit(cliInput.exitCode);
    }
    // Any error escaping runCheck is reported and mapped to the API_ERROR exit code.
    const reportFatal = (err) => {
        console.error(`EvalAI gate ERROR: ${err instanceof Error ? err.message : String(err)}`);
        process.exit(constants_1.EXIT.API_ERROR);
    };
    runCheck(cliInput.args)
        .then((exitCode) => process.exit(exitCode))
        .catch(reportFatal);
}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CI context capture and idempotency key for --onFail import.
|
|
3
|
+
*/
|
|
4
|
+
import type { CiContext } from "./api";
|
|
5
|
+
/**
 * Capture CI metadata from environment variables.
 *
 * Reads the GITHUB_* variables (repository, SHA, ref, run id, actor) and
 * reports the provider as "github", "gitlab", "circle" or "unknown" depending
 * on whether GITHUB_ACTIONS, GITLAB_CI or CIRCLECI is set. The pull-request
 * number is only looked up when GITHUB_EVENT_NAME is "pull_request".
 *
 * @returns The captured context, or `undefined` when neither
 *          GITHUB_REPOSITORY nor GITHUB_SHA is set.
 */
export declare function captureCiContext(): CiContext | undefined;
|
|
6
|
+
/**
 * Derive an idempotency key for the --onFail import.
 *
 * The key is the SHA-256 hex digest of the repository, GITHUB_WORKFLOW,
 * GITHUB_JOB, commit SHA and evaluation id joined with ".". Repository and
 * SHA come from `ci`, falling back to GITHUB_REPOSITORY / GITHUB_SHA.
 *
 * @param evaluationId Evaluation the import belongs to.
 * @param ci CI context, e.g. as returned by `captureCiContext`.
 * @returns The hex digest, or `undefined` when the repository or SHA is unavailable.
 */
export declare function computeIdempotencyKey(evaluationId: string, ci: CiContext): string | undefined;
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* CI context capture and idempotency key for --onFail import.
|
|
4
|
+
*/
|
|
5
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
6
|
+
if (k2 === undefined) k2 = k;
|
|
7
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
8
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
9
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
10
|
+
}
|
|
11
|
+
Object.defineProperty(o, k2, desc);
|
|
12
|
+
}) : (function(o, m, k, k2) {
|
|
13
|
+
if (k2 === undefined) k2 = k;
|
|
14
|
+
o[k2] = m[k];
|
|
15
|
+
}));
|
|
16
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
17
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
18
|
+
}) : function(o, v) {
|
|
19
|
+
o["default"] = v;
|
|
20
|
+
});
|
|
21
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
22
|
+
var ownKeys = function(o) {
|
|
23
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
24
|
+
var ar = [];
|
|
25
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
26
|
+
return ar;
|
|
27
|
+
};
|
|
28
|
+
return ownKeys(o);
|
|
29
|
+
};
|
|
30
|
+
return function (mod) {
|
|
31
|
+
if (mod && mod.__esModule) return mod;
|
|
32
|
+
var result = {};
|
|
33
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
34
|
+
__setModuleDefault(result, mod);
|
|
35
|
+
return result;
|
|
36
|
+
};
|
|
37
|
+
})();
|
|
38
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
39
|
+
exports.captureCiContext = captureCiContext;
|
|
40
|
+
exports.computeIdempotencyKey = computeIdempotencyKey;
|
|
41
|
+
const node_crypto_1 = require("node:crypto");
|
|
42
|
+
const fs = __importStar(require("node:fs"));
|
|
43
|
+
/**
 * Read the pull-request number from the JSON event file named by
 * GITHUB_EVENT_PATH.
 *
 * @returns The value of `pull_request.number` when it is an integer;
 *          `undefined` when the variable is unset, the file cannot be read or
 *          parsed, or the field is missing or not an integer.
 */
function readPrFromEventPath() {
    const eventPath = process.env.GITHUB_EVENT_PATH;
    if (!eventPath)
        return undefined;
    try {
        const event = JSON.parse(fs.readFileSync(eventPath, "utf8"));
        const prNumber = event?.pull_request?.number;
        // The payload is parsed JSON, so check the type before treating it as a PR number.
        return typeof prNumber === "number" && Number.isInteger(prNumber) ? prNumber : undefined;
    }
    catch {
        // Best-effort lookup: unreadable or malformed event files simply yield no PR number.
        return undefined;
    }
}
|
|
56
|
+
/**
 * Extract the pull-request number from GITHUB_REF.
 *
 * @returns The number from a ref of the form `refs/pull/<n>/merge`, or
 *          `undefined` when GITHUB_REF is unset or has any other shape.
 */
function readPrFromRef() {
    const githubRef = process.env.GITHUB_REF;
    if (githubRef) {
        const parts = /^refs\/pull\/(\d+)\/merge$/.exec(githubRef);
        if (parts) {
            return parseInt(parts[1], 10);
        }
    }
    return undefined;
}
|
|
63
|
+
function captureCiContext() {
|
|
64
|
+
const repo = process.env.GITHUB_REPOSITORY;
|
|
65
|
+
const sha = process.env.GITHUB_SHA;
|
|
66
|
+
const ref = process.env.GITHUB_REF;
|
|
67
|
+
const runId = process.env.GITHUB_RUN_ID;
|
|
68
|
+
const _workflow = process.env.GITHUB_WORKFLOW;
|
|
69
|
+
const _job = process.env.GITHUB_JOB;
|
|
70
|
+
const actor = process.env.GITHUB_ACTOR;
|
|
71
|
+
if (!repo && !sha)
|
|
72
|
+
return undefined;
|
|
73
|
+
let provider = "unknown";
|
|
74
|
+
if (process.env.GITHUB_ACTIONS)
|
|
75
|
+
provider = "github";
|
|
76
|
+
else if (process.env.GITLAB_CI)
|
|
77
|
+
provider = "gitlab";
|
|
78
|
+
else if (process.env.CIRCLECI)
|
|
79
|
+
provider = "circle";
|
|
80
|
+
let runUrl;
|
|
81
|
+
if (repo && runId) {
|
|
82
|
+
runUrl = `https://github.com/${repo}/actions/runs/${runId}`;
|
|
83
|
+
}
|
|
84
|
+
let pr;
|
|
85
|
+
if (process.env.GITHUB_EVENT_NAME === "pull_request") {
|
|
86
|
+
pr = readPrFromEventPath() ?? readPrFromRef();
|
|
87
|
+
}
|
|
88
|
+
return {
|
|
89
|
+
provider,
|
|
90
|
+
repo,
|
|
91
|
+
sha,
|
|
92
|
+
branch: ref?.startsWith("refs/heads/") ? ref.slice("refs/heads/".length) : ref,
|
|
93
|
+
runUrl,
|
|
94
|
+
actor,
|
|
95
|
+
pr,
|
|
96
|
+
};
|
|
97
|
+
}
|
|
98
|
+
/**
 * Derive an idempotency key for the --onFail import.
 *
 * Repository and commit SHA are taken from `ci`, falling back to
 * GITHUB_REPOSITORY / GITHUB_SHA; workflow and job come from GITHUB_WORKFLOW
 * and GITHUB_JOB (empty string when unset).
 *
 * @param evaluationId Evaluation the import belongs to.
 * @param ci CI context providing `repo` and `sha`.
 * @returns SHA-256 hex digest of `repo.workflow.job.sha.evaluationId`, or
 *          `undefined` when the repository or SHA is unavailable.
 */
function computeIdempotencyKey(evaluationId, ci) {
    const env = process.env;
    const repository = ci.repo ?? env.GITHUB_REPOSITORY;
    const commitSha = ci.sha ?? env.GITHUB_SHA ?? "";
    // Without both a repository and a commit there is nothing stable to key on.
    if (!repository || !commitSha) {
        return undefined;
    }
    const workflowName = env.GITHUB_WORKFLOW ?? "";
    const jobName = env.GITHUB_JOB ?? "";
    const keyMaterial = `${repository}.${workflowName}.${jobName}.${commitSha}.${evaluationId}`;
    return hashSha256(keyMaterial);
}
|
|
108
|
+
/**
 * Hash a string with SHA-256.
 *
 * @param input Text to hash; encoded as UTF-8.
 * @returns The digest as a lowercase hex string.
 */
function hashSha256(input) {
    const hasher = (0, node_crypto_1.createHash)("sha256");
    hasher.update(input, "utf8");
    return hasher.digest("hex");
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* EvalAI config loader
|
|
3
|
+
* Discovery: evalai.config.json → evalai.config.js → evalai.config.cjs → package.json evalai
|
|
4
|
+
*/
|
|
5
|
+
import { type ProfileName } from "./profiles";
|
|
6
|
+
/**
 * Options that can be supplied through an EvalAI config file instead of CLI
 * flags. Every field is optional.
 */
export interface EvalAIConfig {
    /** Evaluation to gate on (same meaning as `--evaluationId`). */
    evaluationId?: string;
    /** API base URL (same meaning as `--baseUrl`). */
    baseUrl?: string;
    /** Minimum score; the CLI rejects values outside 0-100. */
    minScore?: number;
    /** Minimum number of test cases; the CLI rejects values below 1. */
    minN?: number;
    /** Drop threshold for the regression gate (same meaning as `--maxDrop`). */
    maxDrop?: number;
    /** Drop threshold for the near-regression warning (same meaning as `--warnDrop`). */
    warnDrop?: number;
    /** Whether a 'weak' evidence level is allowed to pass. */
    allowWeakEvidence?: boolean;
    /** Baseline comparison mode; the CLI defaults to "published". */
    baseline?: "published" | "previous" | "production" | "auto";
    /** Named preset of gate thresholds. */
    profile?: ProfileName;
    /** Monorepo: package path → config. Key = path relative to config dir (e.g. "apps/web", "packages/api"). */
    packages?: Record<string, Partial<EvalAIConfig>>;
}
|
|
19
|
+
/**
|
|
20
|
+
* Find config file path in directory, walking up to root
|
|
21
|
+
*/
|
|
22
|
+
export declare function findConfigPath(cwd?: string): string | null;
|
|
23
|
+
/**
|
|
24
|
+
* Load config from file system
|
|
25
|
+
*/
|
|
26
|
+
export declare function loadConfig(cwd?: string): EvalAIConfig | null;
|
|
27
|
+
/**
|
|
28
|
+
* Merge config with CLI args. Priority: args > profile > config > defaults.
|
|
29
|
+
*/
|
|
30
|
+
export declare function mergeConfigWithArgs(config: EvalAIConfig | null, args: Partial<Record<string, string | number | boolean>>): Partial<EvalAIConfig>;
|