@pauly4010/evalai-sdk 1.4.1 → 1.5.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +85 -0
- package/README.md +205 -543
- package/dist/assertions.d.ts +2 -2
- package/dist/assertions.js +104 -71
- package/dist/batch.js +12 -17
- package/dist/cache.js +7 -11
- package/dist/cli/api.d.ts +108 -0
- package/dist/cli/api.js +130 -0
- package/dist/cli/check.d.ts +28 -13
- package/dist/cli/check.js +249 -142
- package/dist/cli/ci-context.d.ts +6 -0
- package/dist/cli/ci-context.js +110 -0
- package/dist/cli/config.d.ts +30 -0
- package/dist/cli/config.js +207 -0
- package/dist/cli/constants.d.ts +15 -0
- package/dist/cli/constants.js +18 -0
- package/dist/cli/doctor.d.ts +11 -0
- package/dist/cli/doctor.js +82 -0
- package/dist/cli/formatters/github.d.ts +8 -0
- package/dist/cli/formatters/github.js +130 -0
- package/dist/cli/formatters/human.d.ts +6 -0
- package/dist/cli/formatters/human.js +107 -0
- package/dist/cli/formatters/json.d.ts +6 -0
- package/dist/cli/formatters/json.js +10 -0
- package/dist/cli/formatters/pr-comment.d.ts +12 -0
- package/dist/cli/formatters/pr-comment.js +101 -0
- package/dist/cli/formatters/types.d.ts +100 -0
- package/dist/cli/formatters/types.js +5 -0
- package/dist/cli/gate.d.ts +21 -0
- package/dist/cli/gate.js +175 -0
- package/dist/cli/index.d.ts +1 -0
- package/dist/cli/index.js +67 -23
- package/dist/cli/init.d.ts +7 -0
- package/dist/cli/init.js +69 -0
- package/dist/cli/policy-packs.d.ts +23 -0
- package/dist/cli/policy-packs.js +83 -0
- package/dist/cli/profiles.d.ts +28 -0
- package/dist/cli/profiles.js +30 -0
- package/dist/cli/reason-codes.d.ts +17 -0
- package/dist/cli/reason-codes.js +19 -0
- package/dist/cli/render/snippet.d.ts +5 -0
- package/dist/cli/render/snippet.js +15 -0
- package/dist/cli/render/sort.d.ts +10 -0
- package/dist/cli/render/sort.js +24 -0
- package/dist/cli/report/build-check-report.d.ts +19 -0
- package/dist/cli/report/build-check-report.js +124 -0
- package/dist/cli/share.d.ts +17 -0
- package/dist/cli/share.js +83 -0
- package/dist/client.d.ts +2 -2
- package/dist/client.js +144 -132
- package/dist/context.d.ts +1 -1
- package/dist/context.js +4 -6
- package/dist/errors.d.ts +2 -0
- package/dist/errors.js +116 -107
- package/dist/export.d.ts +6 -6
- package/dist/export.js +39 -33
- package/dist/index.d.ts +25 -24
- package/dist/index.js +62 -56
- package/dist/integrations/anthropic.d.ts +1 -1
- package/dist/integrations/anthropic.js +23 -19
- package/dist/integrations/openai-eval.d.ts +57 -0
- package/dist/integrations/openai-eval.js +230 -0
- package/dist/integrations/openai.d.ts +1 -1
- package/dist/integrations/openai.js +23 -19
- package/dist/local.d.ts +2 -2
- package/dist/local.js +25 -25
- package/dist/logger.d.ts +1 -1
- package/dist/logger.js +24 -28
- package/dist/matchers/index.d.ts +1 -0
- package/dist/matchers/index.js +6 -0
- package/dist/matchers/to-pass-gate.d.ts +29 -0
- package/dist/matchers/to-pass-gate.js +35 -0
- package/dist/pagination.d.ts +1 -1
- package/dist/pagination.js +6 -6
- package/dist/snapshot.js +24 -24
- package/dist/streaming.js +11 -11
- package/dist/testing.d.ts +6 -2
- package/dist/testing.js +30 -12
- package/dist/types.d.ts +22 -22
- package/dist/types.js +13 -13
- package/dist/utils/input-hash.d.ts +8 -0
- package/dist/utils/input-hash.js +38 -0
- package/dist/version.d.ts +7 -0
- package/dist/version.js +10 -0
- package/dist/workflows.d.ts +7 -7
- package/dist/workflows.js +44 -44
- package/package.json +102 -90
- package/dist/__tests__/assertions.test.d.ts +0 -1
- package/dist/__tests__/assertions.test.js +0 -288
- package/dist/__tests__/client.test.d.ts +0 -1
- package/dist/__tests__/client.test.js +0 -185
- package/dist/__tests__/testing.test.d.ts +0 -1
- package/dist/__tests__/testing.test.js +0 -230
- package/dist/__tests__/workflows.test.d.ts +0 -1
- package/dist/__tests__/workflows.test.js +0 -222
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* PR comment markdown builder for evalai check --pr-comment-out.
|
|
4
|
+
* Produces deterministic markdown for GitHub Action to post as PR comment.
|
|
5
|
+
*/
|
|
6
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
7
|
+
exports.PR_COMMENT_MARKER = void 0;
|
|
8
|
+
exports.buildPrComment = buildPrComment;
|
|
9
|
+
const snippet_1 = require("../render/snippet");
|
|
10
|
+
const TOP_FAILURES = 3;
|
|
11
|
+
/**
 * Makes free-form text safe to embed in a single markdown line.
 *
 * Every `|` is prefixed with a backslash, and every line break (LF, CRLF or a
 * lone CR) is collapsed into one space so the text cannot spill onto a new line.
 *
 * @param s Raw text such as a reason message or a test-case label.
 * @returns The text on a single line with pipes escaped.
 */
function escapeMarkdown(s) {
    // CRLF is listed first so a Windows line ending becomes one space, not two.
    return s.replace(/\|/g, "\\|").replace(/\r\n|\r|\n/g, " ");
}
|
|
14
|
+
/**
|
|
15
|
+
* Hidden marker for GitHub Action to find and update existing comment (sticky update).
|
|
16
|
+
* Action should: 1) read the comment body from the output file, 2) search the PR's comments for this marker, 3) update the matching comment if found, otherwise create a new one.
|
|
17
|
+
* Export for use in Action scripts.
|
|
18
|
+
*/
|
|
19
|
+
exports.PR_COMMENT_MARKER = "<!-- evalai-gate-comment -->";
|
|
20
|
+
/**
 * Builds the markdown body of the EvalAI PR comment from a check report.
 *
 * The body starts with the hidden PR_COMMENT_MARKER, followed by a verdict
 * heading, the score (with delta when a baseline score and delta are present),
 * the reason, and then the optional sections: policy, top failed cases,
 * score breakdown, dashboard link and share link.
 *
 * Heading selection:
 *  - gate not applied (report.gateApplied === false) → "NOT APPLIED"
 *  - verdict "pass" → "PASSED"
 *  - verdict "warn" → "PASSED WITH WARNING" (a warning is not a failure)
 *  - anything else → "FAILED"
 *
 * @param report Check report; the fields read here are verdict, gateApplied,
 *   actionableMessage, score, baselineScore, delta, reasonCode, reasonMessage,
 *   policy, failedCases, explain, contribPts, dashboardUrl and shareUrl.
 * @returns The markdown text, lines joined with "\n".
 */
function buildPrComment(report) {
    const lines = [];
    lines.push(exports.PR_COMMENT_MARKER);
    lines.push("");
    // Only an explicit `false` means "not applied"; a missing field counts as applied.
    const gateApplied = report.gateApplied !== false;
    // Verdict heading — distinguish "PASS", "WARN" and "NOT GATED"
    if (!gateApplied) {
        lines.push("## ⚠️ EvalAI Regression Gate — NOT APPLIED");
        lines.push("");
        lines.push("**Gate not applied: baseline missing.**");
        if (report.actionableMessage) {
            lines.push("");
            lines.push(report.actionableMessage);
        }
    }
    else if (report.verdict === "pass") {
        lines.push("## ✅ EvalAI Regression Gate — PASSED");
    }
    else if (report.verdict === "warn") {
        // A soft regression warning must not be reported as a hard failure.
        lines.push("## ⚠️ EvalAI Regression Gate — PASSED WITH WARNING");
    }
    else {
        lines.push("## 🚨 EvalAI Regression Gate — FAILED");
    }
    lines.push("");
    // Score is always shown; the delta suffix only when baseline score and delta are both present
    const deltaStr = report.baselineScore != null && report.delta != null
        ? ` (${report.delta >= 0 ? "+" : ""}${report.delta} from baseline ${report.baselineScore})`
        : "";
    lines.push(`**Score:** ${report.score ?? 0}/100${deltaStr}`);
    lines.push("");
    // Reason code, plus the free-form message in italics when present
    lines.push(`**Reason:** ${report.reasonCode}`);
    if (report.reasonMessage) {
        lines.push(`_${escapeMarkdown(report.reasonMessage)}_`);
    }
    lines.push("");
    // Policy (if any)
    if (report.policy) {
        lines.push(`**Policy:** ${report.policy}`);
        lines.push("");
    }
    // Top failures (at most TOP_FAILURES entries, then a "+ N more" line)
    const failedCases = report.failedCases ?? [];
    if (failedCases.length > 0) {
        lines.push("### Top Issues");
        lines.push("");
        for (const fc of failedCases.slice(0, TOP_FAILURES)) {
            const label = fc.name ?? fc.input ?? "(unnamed)";
            const reason = fc.reason ?? fc.outputSnippet ?? fc.output ?? "no output";
            lines.push(`- **${(0, snippet_1.truncateSnippet)(escapeMarkdown(label), 60)}** — ${(0, snippet_1.truncateSnippet)(escapeMarkdown(reason), 80)}`);
        }
        if (failedCases.length > TOP_FAILURES) {
            lines.push(`- _+ ${failedCases.length - TOP_FAILURES} more_`);
        }
        lines.push("");
    }
    // Explain summary (only when report.explain is set and contribution points exist)
    if (report.explain && report.contribPts) {
        const pts = report.contribPts;
        const parts = [];
        if (pts.passRatePts != null)
            parts.push(`pass rate: ${pts.passRatePts} pts`);
        if (pts.safetyPts != null)
            parts.push(`safety: ${pts.safetyPts} pts`);
        if (pts.compliancePts != null)
            parts.push(`compliance: ${pts.compliancePts} pts`);
        if (pts.performancePts != null)
            parts.push(`performance: ${pts.performancePts} pts`);
        if (parts.length > 0) {
            lines.push("### Breakdown");
            lines.push("");
            lines.push(parts.join(" | "));
            lines.push("");
        }
    }
    // Dashboard URL
    if (report.dashboardUrl) {
        lines.push(`🔎 [Dashboard](${report.dashboardUrl})`);
    }
    // Share URL (if exists)
    if (report.shareUrl) {
        lines.push(`🔗 [Share Snapshot](${report.shareUrl})`);
    }
    lines.push("");
    return lines.join("\n");
}
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CheckReport and related types for formatters.
|
|
3
|
+
*/
|
|
4
|
+
/** Overall outcome of a gate check; this is the type of `CheckReport.verdict`. */
export type GateVerdict = "pass" | "warn" | "fail";
|
|
5
|
+
/** "neutral" = exit 0 but gate not applied (e.g. baseline missing with --baseline auto) */
|
|
6
|
+
export type GateMode = "enforced" | "neutral";
|
|
7
|
+
/** Canonical reason codes. Import REASON_CODES from ../reason-codes for constants. */
|
|
8
|
+
export type FailureReasonCode = "PASS" | "WARN_REGRESSION" | "LOW_SAMPLE_SIZE" | "BASELINE_MISSING" | "SCORE_TOO_LOW" | "DELTA_TOO_HIGH" | "COST_BUDGET_EXCEEDED" | "LATENCY_BUDGET_EXCEEDED" | "POLICY_FAILED" | "UNKNOWN" | "LOW_SCORE" | "LOW_PASS_RATE" | "SAFETY_RISK" | "LATENCY_RISK" | "COST_RISK" | "MAX_DROP_EXCEEDED" | "INSUFFICIENT_EVIDENCE" | "POLICY_VIOLATION";
|
|
9
|
+
export type ScoreBreakdown01 = {
|
|
10
|
+
passRate?: number;
|
|
11
|
+
safety?: number;
|
|
12
|
+
judge?: number;
|
|
13
|
+
schema?: number;
|
|
14
|
+
latency?: number;
|
|
15
|
+
cost?: number;
|
|
16
|
+
};
|
|
17
|
+
export type ScoreContribPts = {
|
|
18
|
+
passRatePts?: number;
|
|
19
|
+
safetyPts?: number;
|
|
20
|
+
compliancePts?: number;
|
|
21
|
+
performancePts?: number;
|
|
22
|
+
};
|
|
23
|
+
export type GateThresholds = {
|
|
24
|
+
minScore?: number;
|
|
25
|
+
minPassRate?: number;
|
|
26
|
+
minSafety?: number;
|
|
27
|
+
maxDrop?: number;
|
|
28
|
+
warnDrop?: number;
|
|
29
|
+
minN?: number;
|
|
30
|
+
allowWeakEvidence?: boolean;
|
|
31
|
+
baseline?: "published" | "previous" | "production" | "auto";
|
|
32
|
+
maxCostUsd?: number;
|
|
33
|
+
maxLatencyMs?: number;
|
|
34
|
+
maxCostDeltaUsd?: number;
|
|
35
|
+
};
|
|
36
|
+
export type FailedCase = {
|
|
37
|
+
testCaseId?: number;
|
|
38
|
+
status?: "failed" | "error" | "skipped" | "passed";
|
|
39
|
+
name?: string;
|
|
40
|
+
input?: string;
|
|
41
|
+
inputSnippet?: string;
|
|
42
|
+
expectedOutput?: string;
|
|
43
|
+
expectedSnippet?: string;
|
|
44
|
+
output?: string;
|
|
45
|
+
outputSnippet?: string;
|
|
46
|
+
reason?: string;
|
|
47
|
+
};
|
|
48
|
+
export type CiContext = {
|
|
49
|
+
provider?: "github" | "gitlab" | "circle" | "unknown";
|
|
50
|
+
repo?: string;
|
|
51
|
+
sha?: string;
|
|
52
|
+
branch?: string;
|
|
53
|
+
pr?: number;
|
|
54
|
+
runUrl?: string;
|
|
55
|
+
actor?: string;
|
|
56
|
+
};
|
|
57
|
+
export type CheckReport = {
|
|
58
|
+
evaluationId: string;
|
|
59
|
+
runId?: number;
|
|
60
|
+
verdict: GateVerdict;
|
|
61
|
+
/** false when gate not applied (e.g. baseline missing, exit 0) — prevents false confidence */
|
|
62
|
+
gateApplied: boolean;
|
|
63
|
+
/** "enforced" = gate ran; "neutral" = exit 0, gate skipped */
|
|
64
|
+
gateMode: GateMode;
|
|
65
|
+
reasonCode: FailureReasonCode;
|
|
66
|
+
/** Actionable message for PR comment / UX */
|
|
67
|
+
actionableMessage?: string;
|
|
68
|
+
reasonMessage?: string;
|
|
69
|
+
score?: number;
|
|
70
|
+
baselineScore?: number;
|
|
71
|
+
delta?: number;
|
|
72
|
+
passRate?: number;
|
|
73
|
+
safetyPassRate?: number;
|
|
74
|
+
flags?: string[];
|
|
75
|
+
breakdown01?: ScoreBreakdown01;
|
|
76
|
+
contribPts?: ScoreContribPts;
|
|
77
|
+
thresholds?: GateThresholds;
|
|
78
|
+
n?: number;
|
|
79
|
+
evidenceLevel?: "strong" | "medium" | "weak";
|
|
80
|
+
baselineMissing?: boolean;
|
|
81
|
+
baselineStatus?: "found" | "missing";
|
|
82
|
+
dashboardUrl?: string;
|
|
83
|
+
failedCases?: FailedCase[];
|
|
84
|
+
failedCasesShown?: number;
|
|
85
|
+
failedCasesMore?: number;
|
|
86
|
+
requestId?: string;
|
|
87
|
+
durationMs?: number;
|
|
88
|
+
ci?: CiContext;
|
|
89
|
+
explain?: boolean;
|
|
90
|
+
shareUrl?: string;
|
|
91
|
+
policy?: string;
|
|
92
|
+
baselineRunId?: number;
|
|
93
|
+
ciRunUrl?: string;
|
|
94
|
+
/** When --explain and policy failed: which sub-check failed, remediation, snapshot */
|
|
95
|
+
policyEvidence?: {
|
|
96
|
+
failedCheck?: string;
|
|
97
|
+
remediation?: string;
|
|
98
|
+
snapshot?: Record<string, unknown>;
|
|
99
|
+
};
|
|
100
|
+
};
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pure gate evaluation. No console output.
|
|
3
|
+
* Baseline missing → configuration failure (BAD_ARGS), not API_ERROR.
|
|
4
|
+
*/
|
|
5
|
+
import type { QualityLatestData } from "./api";
|
|
6
|
+
import type { CheckArgs } from "./check";
|
|
7
|
+
export type GateResult = {
|
|
8
|
+
exitCode: number;
|
|
9
|
+
passed: boolean;
|
|
10
|
+
reasonCode: string;
|
|
11
|
+
reasonMessage: string | null;
|
|
12
|
+
/** true when gate was skipped (e.g. baseline missing + auto) */
|
|
13
|
+
gateSkipped?: boolean;
|
|
14
|
+
/** When policy failed: sub-check, remediation, snapshot for explain */
|
|
15
|
+
policyEvidence?: {
|
|
16
|
+
failedCheck: string;
|
|
17
|
+
remediation: string;
|
|
18
|
+
snapshot?: Record<string, unknown>;
|
|
19
|
+
};
|
|
20
|
+
};
|
|
21
|
+
/**
 * Evaluates the gate for the given check arguments against the latest quality data.
 * @returns The exit code, pass flag and reason for the decision (see GateResult).
 */
export declare function evaluateGate(args: CheckArgs, quality: QualityLatestData): GateResult;
|
package/dist/cli/gate.js
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Pure gate evaluation. No console output.
|
|
4
|
+
* Baseline missing → configuration failure (BAD_ARGS), not API_ERROR.
|
|
5
|
+
*/
|
|
6
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
7
|
+
exports.evaluateGate = evaluateGate;
|
|
8
|
+
const constants_1 = require("./constants");
|
|
9
|
+
const policy_packs_1 = require("./policy-packs");
|
|
10
|
+
const reason_codes_1 = require("./reason-codes");
|
|
11
|
+
/**
 * Decides what to return when the baseline is missing.
 *
 * `--baseline auto` yields a neutral result (exit PASS, gate skipped). Any other
 * baseline mode, or "published" combined with an explicit maxDrop, yields
 * BAD_ARGS. "published" without maxDrop returns null so evaluation continues.
 */
function baselineMissingOutcome(args) {
    const msg = args.baseline === "auto"
        ? "No baseline found. Tip: Publish a baseline from the dashboard, or run with --baseline previous once you have runs."
        : args.baseline === "production"
            ? "No prod runs exist for this evaluation. Tag runs with environment=prod before using --baseline production."
            : `Baseline (${args.baseline}) not found. Ensure a baseline run exists (e.g. published run, previous run, or prod-tagged run).`;
    if (args.baseline === "auto") {
        return {
            exitCode: constants_1.EXIT.PASS,
            passed: false,
            reasonCode: reason_codes_1.REASON_CODES.BASELINE_MISSING,
            reasonMessage: msg,
            gateSkipped: true,
        };
    }
    if (args.baseline !== "published" || args.maxDrop !== undefined) {
        return {
            exitCode: constants_1.EXIT.BAD_ARGS,
            passed: false,
            reasonCode: reason_codes_1.REASON_CODES.BASELINE_MISSING,
            reasonMessage: msg,
        };
    }
    return null;
}
/**
 * Checks the cost, latency and cost-delta budgets, in that order.
 * Returns the first violation as a gate result, or null when all budgets hold.
 * A budget is only checked when both its threshold and the measured value are present.
 */
function budgetViolation(args, quality) {
    const avgLatencyMs = quality?.avgLatencyMs ?? null;
    const costUsd = quality?.costUsd ?? null;
    const baselineCostUsd = quality?.baselineCostUsd ?? null;
    if (args.maxCostUsd != null && costUsd != null && costUsd > args.maxCostUsd) {
        return {
            exitCode: constants_1.EXIT.SCORE_BELOW,
            passed: false,
            reasonCode: reason_codes_1.REASON_CODES.COST_BUDGET_EXCEEDED,
            reasonMessage: `cost $${costUsd.toFixed(4)} exceeds maxCostUsd $${args.maxCostUsd.toFixed(4)}`,
        };
    }
    if (args.maxLatencyMs != null && avgLatencyMs != null && avgLatencyMs > args.maxLatencyMs) {
        return {
            exitCode: constants_1.EXIT.SCORE_BELOW,
            passed: false,
            reasonCode: reason_codes_1.REASON_CODES.LATENCY_BUDGET_EXCEEDED,
            reasonMessage: `avg latency ${avgLatencyMs}ms exceeds maxLatencyMs ${args.maxLatencyMs}`,
        };
    }
    if (args.maxCostDeltaUsd != null &&
        costUsd != null &&
        baselineCostUsd != null &&
        costUsd - baselineCostUsd > args.maxCostDeltaUsd) {
        return {
            exitCode: constants_1.EXIT.SCORE_BELOW,
            passed: false,
            reasonCode: reason_codes_1.REASON_CODES.COST_BUDGET_EXCEEDED,
            reasonMessage: `cost delta $${(costUsd - baselineCostUsd).toFixed(4)} exceeds maxCostDeltaUsd $${args.maxCostDeltaUsd.toFixed(4)}`,
        };
    }
    return null;
}
/**
 * Applies the policy pack named by args.policy.
 * Returns BAD_ARGS for an unknown policy/version, POLICY_VIOLATION when the
 * safety rate is below the pack's requirement or a restricted flag is present,
 * and null when the policy is satisfied.
 */
function policyViolation(args, quality) {
    const pack = (0, policy_packs_1.resolvePolicyPack)(args.policy);
    if (!pack) {
        const valid = (0, policy_packs_1.getValidPolicyVersions)().join(", ");
        return {
            exitCode: constants_1.EXIT.BAD_ARGS,
            passed: false,
            reasonCode: reason_codes_1.REASON_CODES.UNKNOWN,
            reasonMessage: `Unknown policy or version: ${args.policy}. Valid: ${valid}`,
        };
    }
    const { requiredSafetyRate, maxFlags } = pack.thresholds;
    const breakdown = quality?.breakdown ?? {};
    // A missing safety value is treated as 0, i.e. it fails any positive requirement.
    const safetyRate = breakdown?.safety ?? 0;
    if (safetyRate < requiredSafetyRate) {
        return {
            exitCode: constants_1.EXIT.POLICY_VIOLATION,
            passed: false,
            reasonCode: reason_codes_1.REASON_CODES.POLICY_FAILED,
            reasonMessage: `policy ${pack.policyId}@${pack.version}: safety ${Math.round(safetyRate * 100)}% < required ${Math.round(requiredSafetyRate * 100)}%`,
            policyEvidence: {
                failedCheck: "safety_rate",
                remediation: `Increase safety pass rate to at least ${Math.round(requiredSafetyRate * 100)}%. Review failing test cases for safety-related assertions.`,
                snapshot: {
                    safety: safetyRate,
                    required: requiredSafetyRate,
                    policy: `${pack.policyId}@${pack.version}`,
                },
            },
        };
    }
    const policyFlags = (quality?.flags ?? []);
    // Flags on the run that also appear in the pack's maxFlags list are violations.
    const violations = policyFlags.filter((f) => maxFlags.includes(f));
    if (violations.length > 0) {
        return {
            exitCode: constants_1.EXIT.POLICY_VIOLATION,
            passed: false,
            reasonCode: reason_codes_1.REASON_CODES.POLICY_FAILED,
            reasonMessage: `policy ${pack.policyId}@${pack.version}: ${violations.join(", ")}`,
            policyEvidence: {
                failedCheck: "flag_restrictions",
                remediation: `Resolve flags: ${violations.join(", ")}. These indicate policy violations that must be addressed.`,
                snapshot: { violations, policy: `${pack.policyId}@${pack.version}` },
            },
        };
    }
    return null;
}
/**
 * Pure gate evaluation: maps check arguments and quality data to a gate result.
 * Produces no console output.
 *
 * Checks run in this order and the first one that triggers decides the result:
 *  1. baseline missing (neutral for --baseline auto, otherwise BAD_ARGS)
 *  2. budgets: maxCostUsd, maxLatencyMs, maxCostDeltaUsd
 *  3. minN (sample size)
 *  4. weak evidence (unless allowWeakEvidence)
 *  5. minScore
 *  6. maxDrop (hard fail), then warnDrop (passes with a warning exit code)
 *  7. policy pack
 * If nothing triggers, the gate passes.
 *
 * @param args Parsed check arguments (thresholds, baseline mode, policy).
 * @param quality Latest quality data; may be null/undefined, in which case the
 *   score defaults to 0 and all other values are treated as absent.
 * @returns An object with exitCode, passed, reasonCode and reasonMessage, plus
 *   gateSkipped or policyEvidence where applicable.
 */
function evaluateGate(args, quality) {
    const score = quality?.score ?? 0;
    const total = quality?.total ?? null;
    const evidenceLevel = quality?.evidenceLevel ?? null;
    const regressionDelta = quality?.regressionDelta ?? null;
    const baselineMissing = quality?.baselineMissing === true;
    // Baseline missing FIRST: must run before budget gates so that
    // baseline missing + maxCostDeltaUsd ⇒ neutral, not budget failure.
    if (baselineMissing) {
        const outcome = baselineMissingOutcome(args);
        if (outcome) {
            return outcome;
        }
    }
    // Budget gates (after baseline check)
    const overBudget = budgetViolation(args, quality);
    if (overBudget) {
        return overBudget;
    }
    // minN gate
    if (args.minN !== undefined && total !== null && total < args.minN) {
        return {
            exitCode: constants_1.EXIT.LOW_N,
            passed: false,
            reasonCode: reason_codes_1.REASON_CODES.LOW_SAMPLE_SIZE,
            reasonMessage: `total test cases (${total}) < minN (${args.minN})`,
        };
    }
    // allowWeakEvidence gate
    if (!args.allowWeakEvidence && evidenceLevel === "weak") {
        return {
            exitCode: constants_1.EXIT.WEAK_EVIDENCE,
            passed: false,
            reasonCode: reason_codes_1.REASON_CODES.LOW_SAMPLE_SIZE,
            reasonMessage: "evidence level is 'weak' (use --allowWeakEvidence to permit)",
        };
    }
    // minScore gate (a minScore of 0 or less disables it)
    if (args.minScore > 0 && score < args.minScore) {
        return {
            exitCode: constants_1.EXIT.SCORE_BELOW,
            passed: false,
            reasonCode: reason_codes_1.REASON_CODES.SCORE_TOO_LOW,
            reasonMessage: `score ${score} < minScore ${args.minScore}`,
        };
    }
    // warnDrop: soft warning band; maxDrop: hard fail
    if (args.maxDrop !== undefined && regressionDelta !== null && regressionDelta < -args.maxDrop) {
        return {
            exitCode: constants_1.EXIT.REGRESSION,
            passed: false,
            reasonCode: reason_codes_1.REASON_CODES.DELTA_TOO_HIGH,
            reasonMessage: `score dropped ${Math.abs(regressionDelta)} pts from baseline (max allowed: ${args.maxDrop})`,
        };
    }
    if (args.warnDrop !== undefined &&
        regressionDelta !== null &&
        regressionDelta < -args.warnDrop &&
        (args.maxDrop === undefined || regressionDelta >= -args.maxDrop)) {
        return {
            exitCode: constants_1.EXIT.WARN_REGRESSION,
            passed: true, // gate passes but with warning
            reasonCode: reason_codes_1.REASON_CODES.WARN_REGRESSION,
            reasonMessage: `score dropped ${Math.abs(regressionDelta)} pts from baseline (warn threshold: ${args.warnDrop}${args.maxDrop != null ? `, fail at ${args.maxDrop}` : ""})`,
        };
    }
    if (args.policy) {
        const violation = policyViolation(args, quality);
        if (violation) {
            return violation;
        }
    }
    return {
        exitCode: constants_1.EXIT.PASS,
        passed: true,
        reasonCode: reason_codes_1.REASON_CODES.PASS,
        reasonMessage: null,
    };
}
|
package/dist/cli/index.d.ts
CHANGED
package/dist/cli/index.js
CHANGED
|
@@ -4,41 +4,85 @@
|
|
|
4
4
|
* evalai — EvalAI CLI
|
|
5
5
|
*
|
|
6
6
|
* Commands:
|
|
7
|
+
* evalai init — Create evalai.config.json
|
|
7
8
|
* evalai check — CI/CD evaluation gate (see evalai check --help)
|
|
8
9
|
*/
|
|
9
10
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
10
11
|
const check_1 = require("./check");
|
|
12
|
+
const doctor_1 = require("./doctor");
|
|
13
|
+
const init_1 = require("./init");
|
|
14
|
+
const share_1 = require("./share");
|
|
11
15
|
const argv = process.argv.slice(2);
|
|
12
16
|
const subcommand = argv[0];
|
|
13
|
-
if (subcommand ===
|
|
14
|
-
const
|
|
15
|
-
(0,
|
|
17
|
+
if (subcommand === "init") {
|
|
18
|
+
const cwd = process.cwd();
|
|
19
|
+
const ok = (0, init_1.runInit)(cwd);
|
|
20
|
+
process.exit(ok ? 0 : 1);
|
|
21
|
+
}
|
|
22
|
+
else if (subcommand === "doctor") {
|
|
23
|
+
(0, doctor_1.runDoctor)(argv.slice(1))
|
|
24
|
+
.then((code) => process.exit(code))
|
|
25
|
+
.catch((err) => {
|
|
26
|
+
console.error(`EvalAI ERROR: ${err instanceof Error ? err.message : String(err)}`);
|
|
27
|
+
process.exit(1);
|
|
28
|
+
});
|
|
29
|
+
}
|
|
30
|
+
else if (subcommand === "check") {
|
|
31
|
+
const parsed = (0, check_1.parseArgs)(argv.slice(1));
|
|
32
|
+
if (!parsed.ok) {
|
|
33
|
+
console.error(parsed.message);
|
|
34
|
+
process.exit(parsed.exitCode);
|
|
35
|
+
}
|
|
36
|
+
(0, check_1.runCheck)(parsed.args)
|
|
16
37
|
.then((code) => process.exit(code))
|
|
17
38
|
.catch((err) => {
|
|
18
39
|
console.error(`EvalAI ERROR: ${err instanceof Error ? err.message : String(err)}`);
|
|
19
40
|
process.exit(4);
|
|
20
41
|
});
|
|
21
42
|
}
|
|
43
|
+
else if (subcommand === "share") {
|
|
44
|
+
const parsed = (0, share_1.parseShareArgs)(argv.slice(1));
|
|
45
|
+
if ("error" in parsed) {
|
|
46
|
+
console.error(parsed.error);
|
|
47
|
+
process.exit(1);
|
|
48
|
+
}
|
|
49
|
+
(0, share_1.runShare)(parsed)
|
|
50
|
+
.then((code) => process.exit(code))
|
|
51
|
+
.catch((err) => {
|
|
52
|
+
console.error(`EvalAI ERROR: ${err instanceof Error ? err.message : String(err)}`);
|
|
53
|
+
process.exit(1);
|
|
54
|
+
});
|
|
55
|
+
}
|
|
22
56
|
else {
|
|
23
|
-
console.log(`EvalAI CLI
|
|
24
|
-
|
|
25
|
-
Usage:
|
|
26
|
-
evalai
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
--
|
|
33
|
-
--
|
|
34
|
-
--
|
|
35
|
-
--
|
|
36
|
-
--
|
|
37
|
-
--
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
57
|
+
console.log(`EvalAI CLI
|
|
58
|
+
|
|
59
|
+
Usage:
|
|
60
|
+
evalai init Create evalai.config.json
|
|
61
|
+
evalai doctor [options] Verify CI/CD setup (same endpoint as check)
|
|
62
|
+
evalai check [options] CI/CD evaluation gate
|
|
63
|
+
evalai share [options] Create share link for a run
|
|
64
|
+
|
|
65
|
+
Options for check:
|
|
66
|
+
--evaluationId <id> Evaluation to gate on (or from config)
|
|
67
|
+
--apiKey <key> API key (or EVALAI_API_KEY env)
|
|
68
|
+
--format <fmt> Output format: human (default), json, github
|
|
69
|
+
--explain Show score breakdown and thresholds
|
|
70
|
+
--onFail import When gate fails, import run with CI context
|
|
71
|
+
--minScore <n> Fail if score < n (0-100)
|
|
72
|
+
--maxDrop <n> Fail if score dropped > n from baseline
|
|
73
|
+
--warnDrop <n> Warn (exit 8) if score dropped > n but < maxDrop
|
|
74
|
+
--minN <n> Fail if total test cases < n
|
|
75
|
+
--allowWeakEvidence Allow weak evidence level
|
|
76
|
+
--policy <name> Enforce policy (HIPAA, SOC2, GDPR, etc.)
|
|
77
|
+
--baseline <mode> "published", "previous", or "production"
|
|
78
|
+
--share <mode> Share link: always | fail | never (fail = only when gate fails)
|
|
79
|
+
--baseUrl <url> API base URL
|
|
80
|
+
|
|
81
|
+
Examples:
|
|
82
|
+
evalai init
|
|
83
|
+
evalai check --minScore 92 --evaluationId 42 --apiKey $EVALAI_API_KEY
|
|
84
|
+
evalai check --policy HIPAA --evaluationId 42 --apiKey $EVALAI_API_KEY
|
|
85
|
+
evalai share --scope run --evaluationId 42 --runId 123 --expires 7d --apiKey $EVALAI_API_KEY
|
|
42
86
|
`);
|
|
43
|
-
process.exit(subcommand ===
|
|
87
|
+
process.exit(subcommand === "--help" || subcommand === "-h" ? 0 : 1);
|
|
44
88
|
}
|
package/dist/cli/init.js
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
"use strict";
|
|
3
|
+
/**
|
|
4
|
+
* evalai init — Create evalai.config.json
|
|
5
|
+
*
|
|
6
|
+
* Creates the smallest possible config file. Defaults belong in code.
|
|
7
|
+
*/
|
|
8
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
9
|
+
if (k2 === undefined) k2 = k;
|
|
10
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
11
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
12
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
13
|
+
}
|
|
14
|
+
Object.defineProperty(o, k2, desc);
|
|
15
|
+
}) : (function(o, m, k, k2) {
|
|
16
|
+
if (k2 === undefined) k2 = k;
|
|
17
|
+
o[k2] = m[k];
|
|
18
|
+
}));
|
|
19
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
20
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
21
|
+
}) : function(o, v) {
|
|
22
|
+
o["default"] = v;
|
|
23
|
+
});
|
|
24
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
25
|
+
var ownKeys = function(o) {
|
|
26
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
27
|
+
var ar = [];
|
|
28
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
29
|
+
return ar;
|
|
30
|
+
};
|
|
31
|
+
return ownKeys(o);
|
|
32
|
+
};
|
|
33
|
+
return function (mod) {
|
|
34
|
+
if (mod && mod.__esModule) return mod;
|
|
35
|
+
var result = {};
|
|
36
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
37
|
+
__setModuleDefault(result, mod);
|
|
38
|
+
return result;
|
|
39
|
+
};
|
|
40
|
+
})();
|
|
41
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
42
|
+
exports.runInit = runInit;
|
|
43
|
+
const fs = __importStar(require("node:fs"));
|
|
44
|
+
const path = __importStar(require("node:path"));
|
|
45
|
+
const CONFIG_CONTENT = `{
|
|
46
|
+
"evaluationId": ""
|
|
47
|
+
}
|
|
48
|
+
`;
|
|
49
|
+
function runInit(cwd = process.cwd()) {
|
|
50
|
+
const configPath = path.join(cwd, "evalai.config.json");
|
|
51
|
+
if (fs.existsSync(configPath)) {
|
|
52
|
+
console.log(`evalai.config.json already exists at ${path.resolve(configPath)}`);
|
|
53
|
+
return false;
|
|
54
|
+
}
|
|
55
|
+
fs.writeFileSync(configPath, CONFIG_CONTENT, "utf-8");
|
|
56
|
+
const resolvedPath = path.resolve(configPath);
|
|
57
|
+
console.log(`Wrote evalai.config.json at ${resolvedPath}`);
|
|
58
|
+
console.log("");
|
|
59
|
+
console.log("Next: paste evaluationId into evalai.config.json, then run npx -y @pauly4010/evalai-sdk@^1 check --format github --onFail import");
|
|
60
|
+
console.log("");
|
|
61
|
+
console.log("GitHub Actions snippet (add to your workflow):");
|
|
62
|
+
console.log(" - name: EvalAI gate");
|
|
63
|
+
console.log(" env:");
|
|
64
|
+
console.log(" EVALAI_API_KEY: ${{ secrets.EVALAI_API_KEY }}");
|
|
65
|
+
console.log(" run: npx -y @pauly4010/evalai-sdk@^1 check --format github --onFail import");
|
|
66
|
+
console.log("");
|
|
67
|
+
console.log("To uninstall: delete evalai.config.json.");
|
|
68
|
+
return true;
|
|
69
|
+
}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Versioned policy packs for evalai check --policy.
|
|
3
|
+
* Schema: policyId, version, thresholds, rationale, checks.
|
|
4
|
+
* Usage: --policy HIPAA@1
|
|
5
|
+
*/
|
|
6
|
+
export type PolicyPack = {
|
|
7
|
+
policyId: string;
|
|
8
|
+
version: number;
|
|
9
|
+
thresholds: {
|
|
10
|
+
requiredSafetyRate: number;
|
|
11
|
+
maxFlags: string[];
|
|
12
|
+
};
|
|
13
|
+
rationale: string;
|
|
14
|
+
checks: string[];
|
|
15
|
+
};
|
|
16
|
+
/** Registry of policy packs, indexed by a string key and then a numeric key — presumably policy id and version; confirm against policy-packs.js. */
export declare const POLICY_PACKS: Record<string, Record<number, PolicyPack>>;
|
|
17
|
+
/**
|
|
18
|
+
* Parse --policy flag (e.g. "HIPAA@1" or "HIPAA") and resolve to PolicyPack.
|
|
19
|
+
* Default version is 1 when omitted.
|
|
20
|
+
*/
|
|
21
|
+
export declare function resolvePolicyPack(spec: string): PolicyPack | null;
|
|
22
|
+
/** List valid policy@version specs for error messages */
|
|
23
|
+
export declare function getValidPolicyVersions(): string[];
|