@pauly4010/evalai-sdk 1.4.1 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +34 -0
- package/README.md +102 -8
- package/dist/cli/api.d.ts +79 -0
- package/dist/cli/api.js +74 -0
- package/dist/cli/check.d.ts +15 -12
- package/dist/cli/check.js +113 -134
- package/dist/cli/ci-context.d.ts +6 -0
- package/dist/cli/ci-context.js +51 -0
- package/dist/cli/config.d.ts +24 -0
- package/dist/cli/config.js +158 -0
- package/dist/cli/constants.d.ts +13 -0
- package/dist/cli/constants.js +16 -0
- package/dist/cli/doctor.d.ts +11 -0
- package/dist/cli/doctor.js +82 -0
- package/dist/cli/formatters/github.d.ts +8 -0
- package/dist/cli/formatters/github.js +119 -0
- package/dist/cli/formatters/human.d.ts +6 -0
- package/dist/cli/formatters/human.js +92 -0
- package/dist/cli/formatters/json.d.ts +6 -0
- package/dist/cli/formatters/json.js +10 -0
- package/dist/cli/formatters/types.d.ts +76 -0
- package/dist/cli/formatters/types.js +5 -0
- package/dist/cli/gate.d.ts +13 -0
- package/dist/cli/gate.js +108 -0
- package/dist/cli/index.d.ts +1 -0
- package/dist/cli/index.js +31 -5
- package/dist/cli/init.d.ts +7 -0
- package/dist/cli/init.js +69 -0
- package/dist/cli/render/snippet.d.ts +5 -0
- package/dist/cli/render/snippet.js +15 -0
- package/dist/cli/render/sort.d.ts +10 -0
- package/dist/cli/render/sort.js +24 -0
- package/dist/cli/report/build-check-report.d.ts +16 -0
- package/dist/cli/report/build-check-report.js +94 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.js +4 -1
- package/dist/integrations/openai-eval.d.ts +53 -0
- package/dist/integrations/openai-eval.js +226 -0
- package/dist/utils/input-hash.d.ts +8 -0
- package/dist/utils/input-hash.js +38 -0
- package/package.json +5 -1
- package/dist/__tests__/assertions.test.d.ts +0 -1
- package/dist/__tests__/assertions.test.js +0 -288
- package/dist/__tests__/client.test.d.ts +0 -1
- package/dist/__tests__/client.test.js +0 -185
- package/dist/__tests__/testing.test.d.ts +0 -1
- package/dist/__tests__/testing.test.js +0 -230
- package/dist/__tests__/workflows.test.d.ts +0 -1
- package/dist/__tests__/workflows.test.js +0 -222
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* GitHub formatter for evalai check.
|
|
4
|
+
* - stdout: minimal (verdict + score + link) + ::error annotations for failed cases
|
|
5
|
+
* - Step summary: full Markdown written to GITHUB_STEP_SUMMARY (not stdout)
|
|
6
|
+
*/
|
|
7
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
8
|
+
if (k2 === undefined) k2 = k;
|
|
9
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
10
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
11
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
12
|
+
}
|
|
13
|
+
Object.defineProperty(o, k2, desc);
|
|
14
|
+
}) : (function(o, m, k, k2) {
|
|
15
|
+
if (k2 === undefined) k2 = k;
|
|
16
|
+
o[k2] = m[k];
|
|
17
|
+
}));
|
|
18
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
19
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
20
|
+
}) : function(o, v) {
|
|
21
|
+
o["default"] = v;
|
|
22
|
+
});
|
|
23
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
24
|
+
var ownKeys = function(o) {
|
|
25
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
26
|
+
var ar = [];
|
|
27
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
28
|
+
return ar;
|
|
29
|
+
};
|
|
30
|
+
return ownKeys(o);
|
|
31
|
+
};
|
|
32
|
+
return function (mod) {
|
|
33
|
+
if (mod && mod.__esModule) return mod;
|
|
34
|
+
var result = {};
|
|
35
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
36
|
+
__setModuleDefault(result, mod);
|
|
37
|
+
return result;
|
|
38
|
+
};
|
|
39
|
+
})();
|
|
40
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
41
|
+
exports.appendStepSummary = appendStepSummary;
|
|
42
|
+
exports.formatGitHub = formatGitHub;
|
|
43
|
+
const fs = __importStar(require("fs"));
|
|
44
|
+
const snippet_1 = require("../render/snippet");
|
|
45
|
+
const ANNOTATION_MAX = 10;
|
|
46
|
+
/**
 * Escape text for use as the message part of a GitHub Actions workflow command.
 *
 * `%` is encoded first (as `%25`) so that percent sequences already present in
 * the text, such as a literal "%0A", are shown verbatim instead of being decoded
 * by the runner. Carriage returns are dropped and line feeds become `%0A`.
 *
 * @param s Raw message text.
 * @returns The escaped message.
 */
function escapeAnnotationMessage(s) {
    return s
        .replace(/%/g, '%25')
        .replace(/\r/g, '')
        .replace(/\n/g, '%0A');
}
|
|
49
|
+
/**
 * Build a single `::error` workflow-command line for one failed case.
 *
 * The case is identified by `testCaseId`, then `name`, then the literal
 * "unknown". The explanation is the first available of `reason`,
 * `outputSnippet`, `output`, or "no output", shortened to 100 characters.
 *
 * @param fc Failed case to describe.
 * @returns The annotation line.
 */
function formatAnnotation(fc) {
    const caseId = fc.testCaseId ?? fc.name ?? 'unknown';
    const detail = fc.reason ?? fc.outputSnippet ?? fc.output ?? 'no output';
    const shortDetail = (0, snippet_1.truncateSnippet)(detail, 100);
    const escaped = escapeAnnotationMessage(`TestCase ${caseId} failed - ${shortDetail}`);
    return '::error title=EvalAI regression::' + escaped;
}
|
|
55
|
+
/**
 * Append a Markdown summary of the report to the file named by the
 * GITHUB_STEP_SUMMARY environment variable.
 *
 * Does nothing when that variable is unset or empty. The summary contains the
 * verdict, the score (with baseline and delta when both are known), up to ten
 * failing cases, and a dashboard link when one is present. Write failures are
 * ignored: the summary is best-effort.
 *
 * @param report Check report to summarise.
 */
function appendStepSummary(report) {
    const summaryFile = typeof process !== 'undefined' && process.env?.GITHUB_STEP_SUMMARY;
    if (!summaryFile) {
        return;
    }
    const shorten = snippet_1.truncateSnippet;
    const isPass = report.verdict === 'pass';
    const headline = isPass
        ? '✅ **PASSED**'
        : `❌ **FAILED**: ${report.reasonMessage ?? report.reasonCode}`;
    let baselineNote = '';
    if (report.baselineScore != null && report.delta != null) {
        const sign = report.delta >= 0 ? '+' : '';
        baselineNote = ` (baseline ${report.baselineScore}, ${sign}${report.delta} pts)`;
    }
    const md = [
        '## EvalAI Gate',
        '',
        headline,
        '',
        `**Score:** ${report.score ?? 0}/100${baselineNote}`,
        '',
    ];
    const failures = report.failedCases ?? [];
    if (failures.length > 0) {
        const plural = failures.length === 1 ? '' : 's';
        md.push(`### ${failures.length} failing case${plural}`, '');
        failures.slice(0, 10).forEach((fc) => {
            const title = fc.name ?? fc.input ?? '(unnamed)';
            const expected = shorten(fc.expectedOutput ?? fc.expectedSnippet, 80);
            const actual = shorten(fc.output ?? fc.outputSnippet, 80);
            const outcome = actual ? `got "${actual}"` : 'no output';
            md.push(`- **${shorten(title, 60)}** — expected: ${expected || '(any)'}, ${outcome}`);
        });
        const hidden = failures.length - 10;
        if (hidden > 0) {
            md.push(`- _+ ${hidden} more_`);
        }
        md.push('');
    }
    if (report.dashboardUrl) {
        md.push(`[View Dashboard](${report.dashboardUrl})`, '');
    }
    try {
        fs.appendFileSync(summaryFile, md.join('\n'), 'utf8');
    }
    catch {
        // Intentionally ignored: a missing or unwritable summary file must not fail the check.
    }
}
|
|
97
|
+
/**
 * Produce the stdout text for `--format github` and write the step summary.
 *
 * Stdout holds one `::error` annotation per failed case (capped at
 * ANNOTATION_MAX), followed by the verdict line, the score line and, when
 * available, the dashboard URL. The Markdown summary is sent to the step
 * summary file as a side effect rather than to stdout.
 *
 * @param report Check report to render.
 * @returns Newline-joined stdout text.
 */
function formatGitHub(report) {
    const output = (report.failedCases ?? [])
        .slice(0, ANNOTATION_MAX)
        .map((fc) => formatAnnotation(fc));
    const isPass = report.verdict === 'pass';
    if (isPass) {
        output.push('\n✓ EvalAI gate PASSED');
    }
    else {
        output.push(`\n✗ EvalAI gate FAILED: ${report.reasonMessage ?? report.reasonCode}`);
    }
    let baselineNote = '';
    if (report.baselineScore != null && report.delta != null) {
        const sign = report.delta >= 0 ? '+' : '';
        baselineNote = ` (baseline ${report.baselineScore}, ${sign}${report.delta} pts)`;
    }
    output.push(`Score: ${report.score ?? 0}/100${baselineNote}`);
    if (report.dashboardUrl) {
        output.push(`Dashboard: ${report.dashboardUrl}`);
    }
    // The detailed Markdown goes to the step summary file, never to stdout.
    appendStepSummary(report);
    return output.join('\n');
}
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Human-readable formatter for evalai check output.
|
|
4
|
+
* Deterministic: verdict → score → failures → link → hint.
|
|
5
|
+
*/
|
|
6
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
7
|
+
exports.formatHuman = formatHuman;
|
|
8
|
+
const snippet_1 = require("../render/snippet");
|
|
9
|
+
const TOP_N = 3;
|
|
10
|
+
/**
 * Render `label<sep>value` pairs for every entry whose value is not null or
 * undefined, joined by ", ". Returns an empty string when nothing qualifies.
 */
function joinDefinedPairs(pairs, kvSeparator) {
    return pairs
        .filter(([, value]) => value != null)
        .map(([label, value]) => `${label}${kvSeparator}${value}`)
        .join(', ');
}
/**
 * Render a check report as plain text in a fixed order:
 * verdict, score, failing cases (first TOP_N), dashboard link, next-step hint,
 * and - when `report.explain` is set - an explain section.
 *
 * The explain section is emitted when the report carries a breakdown,
 * contribution points or flags, and also when it only carries thresholds that
 * have something to show. Previously a thresholds-only report printed nothing
 * for `--explain`.
 *
 * @param report Check report to render.
 * @returns Newline-joined text.
 */
function formatHuman(report) {
    const lines = [];
    const passed = report.verdict === 'pass';
    const failReason = report.reasonMessage;
    lines.push(passed ? '\n✓ EvalAI gate PASSED' : `\n✗ EvalAI gate FAILED: ${failReason ?? report.reasonCode}`);
    const deltaStr = report.baselineScore != null && report.delta != null
        ? ` (baseline ${report.baselineScore}, ${report.delta >= 0 ? '+' : ''}${report.delta} pts)`
        : '';
    lines.push(`Score: ${report.score ?? 0}/100${deltaStr}`);
    const failedCases = report.failedCases ?? [];
    if (failedCases.length > 0) {
        const toShow = failedCases.slice(0, TOP_N);
        lines.push(`${failedCases.length} failing case${failedCases.length === 1 ? '' : 's'}:`);
        for (const fc of toShow) {
            const label = fc.name ?? fc.input ?? '(unnamed)';
            const exp = (0, snippet_1.truncateSnippet)(fc.expectedOutput ?? fc.expectedSnippet, 50);
            const out = (0, snippet_1.truncateSnippet)(fc.output ?? fc.outputSnippet, 50);
            const reason = out ? `got "${out}"` : 'no output';
            lines.push(` - "${(0, snippet_1.truncateSnippet)(label, 50)}" → expected: ${exp || '(any)'}, ${reason}`);
        }
        if (failedCases.length > toShow.length) {
            lines.push(` + ${failedCases.length - toShow.length} more`);
        }
    }
    if (report.dashboardUrl) {
        lines.push(`Dashboard: ${report.dashboardUrl}`);
    }
    if (!passed) {
        lines.push('Next: View full report above, fix failing cases, or adjust gate with --minScore / --maxDrop');
    }
    if (report.explain) {
        const cp = report.contribPts;
        const contrib = cp
            ? joinDefinedPairs([
                ['passRate', cp.passRatePts],
                ['safety', cp.safetyPts],
                ['compliance', cp.compliancePts],
                ['performance', cp.performancePts],
            ], ': ')
            : '';
        const b = report.breakdown01;
        const breakdown = b
            ? joinDefinedPairs([
                ['passRate', b.passRate],
                ['safety', b.safety],
                ['judge', b.judge],
                ['schema', b.schema],
                ['latency', b.latency],
                ['cost', b.cost],
            ], '=')
            : '';
        const t = report.thresholds;
        const thresholds = t
            ? joinDefinedPairs([
                ['minScore', t.minScore],
                ['maxDrop', t.maxDrop],
                ['minN', t.minN],
            ], '=')
            : '';
        // Same trigger as before (any breakdown/contrib/flags), plus thresholds
        // when there is at least one threshold value to print.
        const hasScoreDetail = report.breakdown01 || report.contribPts || report.flags?.length;
        if (hasScoreDetail || thresholds) {
            lines.push('');
            lines.push('--- Explain ---');
            if (contrib)
                lines.push(`Contrib pts: ${contrib}`);
            if (breakdown)
                lines.push(`Breakdown: ${breakdown}`);
            if (report.flags && report.flags.length > 0) {
                lines.push(`Flags: ${report.flags.join(', ')}`);
            }
            if (thresholds)
                lines.push(`Thresholds: ${thresholds}`);
        }
    }
    return lines.join('\n');
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* JSON formatter for evalai check.
|
|
4
|
+
* Outputs only JSON, no extra logs.
|
|
5
|
+
*/
|
|
6
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
7
|
+
exports.formatJson = formatJson;
|
|
8
|
+
/**
 * Serialise the report as compact, single-line JSON with no extra logging.
 *
 * @param report Check report to serialise.
 * @returns The JSON text.
 */
function formatJson(report) {
    const compact = JSON.stringify(report);
    return compact;
}
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CheckReport and related types for formatters.
|
|
3
|
+
*/
|
|
4
|
+
/** Overall outcome of a gate check, stored in `CheckReport.verdict`. */
export type GateVerdict = 'pass' | 'fail';
|
|
5
|
+
/** Machine-readable reason attached to a report via `CheckReport.reasonCode`. */
export type FailureReasonCode = 'LOW_SCORE' | 'LOW_PASS_RATE' | 'SAFETY_RISK' | 'LATENCY_RISK' | 'COST_RISK' | 'BASELINE_MISSING' | 'MAX_DROP_EXCEEDED' | 'INSUFFICIENT_EVIDENCE' | 'POLICY_VIOLATION' | 'UNKNOWN';
|
|
6
|
+
/**
 * Optional per-dimension score components of a report.
 * NOTE(review): the `01` suffix suggests each value is normalised to 0..1 - confirm against the API.
 */
export type ScoreBreakdown01 = {
    passRate?: number;
    safety?: number;
    judge?: number;
    schema?: number;
    latency?: number;
    cost?: number;
};
|
|
14
|
+
/**
 * Optional point contributions per scoring component.
 * NOTE(review): presumably these sum towards the 0-100 score - confirm against the API.
 */
export type ScoreContribPts = {
    passRatePts?: number;
    safetyPts?: number;
    compliancePts?: number;
    performancePts?: number;
};
|
|
20
|
+
/** Gate settings recorded on a report; every field is optional. */
export type GateThresholds = {
    minScore?: number;
    minPassRate?: number;
    minSafety?: number;
    maxDrop?: number;
    minN?: number;
    allowWeakEvidence?: boolean;
    /** Which run the score is compared against. */
    baseline?: 'published' | 'previous' | 'production';
};
|
|
29
|
+
/**
 * One test case listed in a report's `failedCases`.
 * All fields are optional; full and `*Snippet` variants exist for input,
 * expected output and actual output.
 */
export type FailedCase = {
    testCaseId?: number;
    status?: 'failed' | 'error' | 'skipped' | 'passed';
    name?: string;
    input?: string;
    inputSnippet?: string;
    expectedOutput?: string;
    expectedSnippet?: string;
    output?: string;
    outputSnippet?: string;
    /** Free-text explanation of the failure. */
    reason?: string;
};
|
|
41
|
+
/** Optional description of the CI environment a check ran in. */
export type CiContext = {
    provider?: 'github' | 'gitlab' | 'circle' | 'unknown';
    repo?: string;
    sha?: string;
    branch?: string;
    /** NOTE(review): presumably the pull/merge request number - confirm. */
    pr?: number;
    runUrl?: string;
    actor?: string;
};
|
|
50
|
+
/**
 * Report object consumed by the check formatters.
 * Only `evaluationId`, `verdict` and `reasonCode` are required.
 */
export type CheckReport = {
    evaluationId: string;
    runId?: number;
    verdict: GateVerdict;
    reasonCode: FailureReasonCode;
    /** Human-readable failure text. */
    reasonMessage?: string;
    score?: number;
    baselineScore?: number;
    /** NOTE(review): presumably `score - baselineScore` - confirm. */
    delta?: number;
    passRate?: number;
    safetyPassRate?: number;
    flags?: string[];
    breakdown01?: ScoreBreakdown01;
    contribPts?: ScoreContribPts;
    thresholds?: GateThresholds;
    /** NOTE(review): presumably the number of test cases evaluated - confirm. */
    n?: number;
    evidenceLevel?: 'strong' | 'medium' | 'weak';
    baselineMissing?: boolean;
    dashboardUrl?: string;
    failedCases?: FailedCase[];
    failedCasesShown?: number;
    failedCasesMore?: number;
    requestId?: string;
    durationMs?: number;
    ci?: CiContext;
    /** Whether explain output was requested. */
    explain?: boolean;
};
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pure gate evaluation. No console output.
|
|
3
|
+
* Baseline missing → configuration failure (BAD_ARGS), not API_ERROR.
|
|
4
|
+
*/
|
|
5
|
+
import type { CheckArgs } from './check';
|
|
6
|
+
import type { QualityLatestData } from './api';
|
|
7
|
+
/** Outcome returned by `evaluateGate`. */
export type GateResult = {
    /** NOTE(review): presumably the process exit code for the CLI - confirm against callers. */
    exitCode: number;
    passed: boolean;
    reasonCode: string;
    reasonMessage: string | null;
};
|
|
13
|
+
/**
 * Evaluate the gate for the given CLI arguments and quality data.
 *
 * @param args Parsed `check` arguments.
 * @param quality Quality data to evaluate.
 * @returns The gate result.
 */
export declare function evaluateGate(args: CheckArgs, quality: QualityLatestData): GateResult;
|
package/dist/cli/gate.js
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Pure gate evaluation. No console output.
|
|
4
|
+
* Baseline missing → configuration failure (BAD_ARGS), not API_ERROR.
|
|
5
|
+
*/
|
|
6
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
7
|
+
exports.evaluateGate = evaluateGate;
|
|
8
|
+
const constants_1 = require("./constants");
|
|
9
|
+
/**
 * Decide whether the latest quality data passes the gate described by `args`.
 *
 * Checks run in a fixed order and the first failure wins:
 *   1. baseline missing (unless baseline is "published" and no maxDrop is set)
 *   2. fewer test cases than `minN`
 *   3. weak evidence without `allowWeakEvidence`
 *   4. score below `minScore`
 *   5. regression larger than `maxDrop`
 *   6. policy: unknown name, safety rate below the policy minimum, or a
 *      forbidden flag present
 * Produces no console output.
 *
 * @param args Parsed `check` arguments.
 * @param quality Latest quality data; missing fields fall back to neutral defaults.
 * @returns `{ exitCode, passed, reasonCode, reasonMessage }`.
 */
function evaluateGate(args, quality) {
    // All failing outcomes share one shape; only these three fields vary.
    const reject = (exitCode, reasonCode, reasonMessage) => ({
        exitCode,
        passed: false,
        reasonCode,
        reasonMessage,
    });
    const currentScore = quality?.score ?? 0;
    const caseCount = quality?.total ?? null;
    const evidence = quality?.evidenceLevel ?? null;
    const delta = quality?.regressionDelta ?? null;
    const reportedFlags = quality?.flags ?? [];
    // A missing baseline is a configuration problem, so it maps to BAD_ARGS.
    const publishedWithoutDrop = args.baseline === 'published' && args.maxDrop === undefined;
    if (quality?.baselineMissing === true && !publishedWithoutDrop) {
        let hint;
        if (args.baseline === 'production') {
            hint = 'No prod runs exist for this evaluation. Tag runs with environment=prod before using --baseline production.';
        }
        else {
            hint = `Baseline (${args.baseline}) not found. Ensure a baseline run exists (e.g. published run, previous run, or prod-tagged run).`;
        }
        return reject(constants_1.EXIT.BAD_ARGS, 'BASELINE_MISSING', hint);
    }
    // Sample-size gate; skipped when the total is unknown.
    if (args.minN !== undefined && caseCount !== null && caseCount < args.minN) {
        return reject(constants_1.EXIT.LOW_N, 'INSUFFICIENT_EVIDENCE', `total test cases (${caseCount}) < minN (${args.minN})`);
    }
    if (evidence === 'weak' && !args.allowWeakEvidence) {
        return reject(constants_1.EXIT.WEAK_EVIDENCE, 'INSUFFICIENT_EVIDENCE', "evidence level is 'weak' (use --allowWeakEvidence to permit)");
    }
    if (args.minScore > 0 && currentScore < args.minScore) {
        return reject(constants_1.EXIT.SCORE_BELOW, 'LOW_SCORE', `score ${currentScore} < minScore ${args.minScore}`);
    }
    if (args.maxDrop !== undefined && delta !== null && delta < -(args.maxDrop)) {
        return reject(constants_1.EXIT.REGRESSION, 'MAX_DROP_EXCEEDED', `score dropped ${Math.abs(delta)} pts from baseline (max allowed: ${args.maxDrop})`);
    }
    if (args.policy) {
        const policyChecks = {
            HIPAA: { requiredSafetyRate: 0.99, maxFlags: ['SAFETY_RISK'] },
            SOC2: { requiredSafetyRate: 0.95, maxFlags: ['SAFETY_RISK', 'LOW_PASS_RATE'] },
            GDPR: { requiredSafetyRate: 0.95, maxFlags: ['SAFETY_RISK'] },
            PCI_DSS: { requiredSafetyRate: 0.99, maxFlags: ['SAFETY_RISK', 'LOW_PASS_RATE'] },
            FINRA_4511: { requiredSafetyRate: 0.95, maxFlags: ['SAFETY_RISK'] },
        };
        // Policy names are matched case-insensitively.
        const policyKey = args.policy.toUpperCase();
        const rule = policyChecks[policyKey];
        if (!rule) {
            return reject(constants_1.EXIT.BAD_ARGS, 'UNKNOWN', `Unknown policy: ${args.policy}. Available: ${Object.keys(policyChecks).join(', ')}`);
        }
        const safety = quality?.breakdown?.safety ?? 0;
        if (safety < rule.requiredSafetyRate) {
            return reject(constants_1.EXIT.POLICY_VIOLATION, 'POLICY_VIOLATION', `policy ${policyKey}: safety ${Math.round(safety * 100)}% < required ${Math.round(rule.requiredSafetyRate * 100)}%`);
        }
        const forbidden = reportedFlags.filter((flag) => rule.maxFlags.includes(flag));
        if (forbidden.length > 0) {
            return reject(constants_1.EXIT.POLICY_VIOLATION, 'POLICY_VIOLATION', `policy ${policyKey}: ${forbidden.join(', ')}`);
        }
    }
    return {
        exitCode: constants_1.EXIT.PASS,
        passed: true,
        reasonCode: 'PASS',
        reasonMessage: null,
    };
}
|
package/dist/cli/index.d.ts
CHANGED
package/dist/cli/index.js
CHANGED
|
@@ -4,15 +4,35 @@
|
|
|
4
4
|
* evalai — EvalAI CLI
|
|
5
5
|
*
|
|
6
6
|
* Commands:
|
|
7
|
+
* evalai init — Create evalai.config.json
|
|
7
8
|
* evalai check — CI/CD evaluation gate (see evalai check --help)
|
|
8
9
|
*/
|
|
9
10
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
10
11
|
const check_1 = require("./check");
|
|
12
|
+
const init_1 = require("./init");
|
|
13
|
+
const doctor_1 = require("./doctor");
|
|
11
14
|
const argv = process.argv.slice(2);
|
|
12
15
|
const subcommand = argv[0];
|
|
13
|
-
if (subcommand === '
|
|
14
|
-
const
|
|
15
|
-
(0,
|
|
16
|
+
if (subcommand === 'init') {
|
|
17
|
+
const cwd = process.cwd();
|
|
18
|
+
const ok = (0, init_1.runInit)(cwd);
|
|
19
|
+
process.exit(ok ? 0 : 1);
|
|
20
|
+
}
|
|
21
|
+
else if (subcommand === 'doctor') {
|
|
22
|
+
(0, doctor_1.runDoctor)(argv.slice(1))
|
|
23
|
+
.then((code) => process.exit(code))
|
|
24
|
+
.catch((err) => {
|
|
25
|
+
console.error(`EvalAI ERROR: ${err instanceof Error ? err.message : String(err)}`);
|
|
26
|
+
process.exit(1);
|
|
27
|
+
});
|
|
28
|
+
}
|
|
29
|
+
else if (subcommand === 'check') {
|
|
30
|
+
const parsed = (0, check_1.parseArgs)(argv.slice(1));
|
|
31
|
+
if (!parsed.ok) {
|
|
32
|
+
console.error(parsed.message);
|
|
33
|
+
process.exit(parsed.exitCode);
|
|
34
|
+
}
|
|
35
|
+
(0, check_1.runCheck)(parsed.args)
|
|
16
36
|
.then((code) => process.exit(code))
|
|
17
37
|
.catch((err) => {
|
|
18
38
|
console.error(`EvalAI ERROR: ${err instanceof Error ? err.message : String(err)}`);
|
|
@@ -23,20 +43,26 @@ else {
|
|
|
23
43
|
console.log(`EvalAI CLI
|
|
24
44
|
|
|
25
45
|
Usage:
|
|
46
|
+
evalai init Create evalai.config.json
|
|
47
|
+
evalai doctor [options] Verify CI/CD setup (same endpoint as check)
|
|
26
48
|
evalai check [options] CI/CD evaluation gate
|
|
27
49
|
|
|
28
50
|
Options for check:
|
|
29
|
-
--evaluationId <id>
|
|
51
|
+
--evaluationId <id> Evaluation to gate on (or from config)
|
|
30
52
|
--apiKey <key> API key (or EVALAI_API_KEY env)
|
|
53
|
+
--format <fmt> Output format: human (default), json, github
|
|
54
|
+
--explain Show score breakdown and thresholds
|
|
55
|
+
--onFail import When gate fails, import run with CI context
|
|
31
56
|
--minScore <n> Fail if score < n (0-100)
|
|
32
57
|
--maxDrop <n> Fail if score dropped > n from baseline
|
|
33
58
|
--minN <n> Fail if total test cases < n
|
|
34
59
|
--allowWeakEvidence Allow weak evidence level
|
|
35
60
|
--policy <name> Enforce policy (HIPAA, SOC2, GDPR, etc.)
|
|
36
|
-
--baseline <mode> "published" or "
|
|
61
|
+
--baseline <mode> "published", "previous", or "production"
|
|
37
62
|
--baseUrl <url> API base URL
|
|
38
63
|
|
|
39
64
|
Examples:
|
|
65
|
+
evalai init
|
|
40
66
|
evalai check --minScore 92 --evaluationId 42 --apiKey $EVALAI_API_KEY
|
|
41
67
|
evalai check --policy HIPAA --evaluationId 42 --apiKey $EVALAI_API_KEY
|
|
42
68
|
`);
|
package/dist/cli/init.js
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
"use strict";
|
|
3
|
+
/**
|
|
4
|
+
* evalai init — Create evalai.config.json
|
|
5
|
+
*
|
|
6
|
+
* Creates the smallest possible config file. Defaults belong in code.
|
|
7
|
+
*/
|
|
8
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
9
|
+
if (k2 === undefined) k2 = k;
|
|
10
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
11
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
12
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
13
|
+
}
|
|
14
|
+
Object.defineProperty(o, k2, desc);
|
|
15
|
+
}) : (function(o, m, k, k2) {
|
|
16
|
+
if (k2 === undefined) k2 = k;
|
|
17
|
+
o[k2] = m[k];
|
|
18
|
+
}));
|
|
19
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
20
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
21
|
+
}) : function(o, v) {
|
|
22
|
+
o["default"] = v;
|
|
23
|
+
});
|
|
24
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
25
|
+
var ownKeys = function(o) {
|
|
26
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
27
|
+
var ar = [];
|
|
28
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
29
|
+
return ar;
|
|
30
|
+
};
|
|
31
|
+
return ownKeys(o);
|
|
32
|
+
};
|
|
33
|
+
return function (mod) {
|
|
34
|
+
if (mod && mod.__esModule) return mod;
|
|
35
|
+
var result = {};
|
|
36
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
37
|
+
__setModuleDefault(result, mod);
|
|
38
|
+
return result;
|
|
39
|
+
};
|
|
40
|
+
})();
|
|
41
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
42
|
+
exports.runInit = runInit;
|
|
43
|
+
const fs = __importStar(require("fs"));
|
|
44
|
+
const path = __importStar(require("path"));
|
|
45
|
+
const CONFIG_CONTENT = `{
|
|
46
|
+
"evaluationId": ""
|
|
47
|
+
}
|
|
48
|
+
`;
|
|
49
|
+
/**
 * Create `evalai.config.json` in the given directory and print next steps.
 *
 * If the file already exists it is left untouched, a notice is printed and
 * `false` is returned. Otherwise the minimal config is written, usage hints
 * (including a GitHub Actions snippet) are printed, and `true` is returned.
 *
 * @param cwd Directory to create the config in; defaults to the process cwd.
 * @returns `true` when the file was written, `false` when it already existed.
 */
function runInit(cwd = process.cwd()) {
    const target = path.join(cwd, 'evalai.config.json');
    const absoluteTarget = path.resolve(target);
    if (fs.existsSync(target)) {
        console.log(`evalai.config.json already exists at ${absoluteTarget}`);
        return false;
    }
    fs.writeFileSync(target, CONFIG_CONTENT, 'utf-8');
    const messages = [
        `Wrote evalai.config.json at ${absoluteTarget}`,
        '',
        'Next: paste evaluationId into evalai.config.json, then run npx -y @pauly4010/evalai-sdk@^1 check --format github --onFail import',
        '',
        'GitHub Actions snippet (add to your workflow):',
        ' - name: EvalAI gate',
        ' env:',
        ' EVALAI_API_KEY: ${{ secrets.EVALAI_API_KEY }}',
        ' run: npx -y @pauly4010/evalai-sdk@^1 check --format github --onFail import',
        '',
        'To uninstall: delete evalai.config.json.',
    ];
    for (const message of messages) {
        console.log(message);
    }
    return true;
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Truncate a string for deterministic output.
|
|
4
|
+
* Replaces newlines with space, caps length.
|
|
5
|
+
*/
|
|
6
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
7
|
+
exports.truncateSnippet = truncateSnippet;
|
|
8
|
+
/**
 * Collapse whitespace and cap the length of a string for single-line output.
 *
 * Every run of whitespace (including newlines) becomes one space and the
 * result is trimmed. If it is longer than `maxLen` UTF-16 code units it is cut
 * and an ellipsis ("…") is appended. The cut never falls between the two
 * halves of a surrogate pair, so the output contains no lone surrogate.
 *
 * @param s Text to shorten; `null`/`undefined` yields an empty string.
 * @param maxLen Maximum number of code units kept before the ellipsis (default 140).
 * @returns The normalised, possibly truncated text.
 */
function truncateSnippet(s, maxLen = 140) {
    if (s == null)
        return '';
    const normalized = s.replace(/\s+/g, ' ').trim();
    if (normalized.length <= maxLen)
        return normalized;
    let cut = maxLen;
    if (cut > 0) {
        const before = normalized.charCodeAt(cut - 1);
        const after = normalized.charCodeAt(cut);
        const splitsPair = before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
        // Back up one unit rather than emit half of an astral character (e.g. an emoji).
        if (splitsPair)
            cut -= 1;
    }
    return normalized.slice(0, cut) + '…';
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Deterministic ordering for failed cases.
|
|
3
|
+
* Sort by status severity (failed > error > skipped > passed), then by testCaseId asc.
|
|
4
|
+
*/
|
|
5
|
+
/**
 * Minimal shape accepted by `sortFailedCases`.
 * The index signature allows any additional properties.
 */
export interface SortableCase {
    status?: string;
    testCaseId?: number;
    [key: string]: unknown;
}
|
|
10
|
+
/**
 * Order cases by status severity (failed > error > skipped > passed), then by
 * ascending `testCaseId`.
 *
 * @param cases Cases to order.
 * @returns The ordered cases.
 */
export declare function sortFailedCases<T extends SortableCase>(cases: T[]): T[];
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Deterministic ordering for failed cases.
|
|
4
|
+
* Sort by status severity (failed > error > skipped > passed), then by testCaseId asc.
|
|
5
|
+
*/
|
|
6
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
7
|
+
exports.sortFailedCases = sortFailedCases;
|
|
8
|
+
/**
 * Sort rank per known status; lower sorts first.
 * A Map is used instead of a plain object so that status strings which happen
 * to match `Object.prototype` members (e.g. "constructor") are treated as
 * unknown instead of resolving to an inherited, non-numeric value.
 */
const STATUS_SEVERITY = new Map([
    ['failed', 0],
    ['error', 1],
    ['skipped', 2],
    ['passed', 3],
]);
/** Rank given to a missing or unrecognised status; sorts after all known ones. */
const UNKNOWN_STATUS_SEVERITY = 4;
/**
 * Return a new array of cases in deterministic order: by status severity
 * (failed, error, skipped, passed, then anything else), and within the same
 * severity by ascending `testCaseId` (a missing id counts as 0).
 * Status matching is case-insensitive. The input array is not modified.
 *
 * @param cases Cases to order.
 * @returns A sorted copy of `cases`.
 */
function sortFailedCases(cases) {
    const severityOf = (item) => STATUS_SEVERITY.get(item.status?.toLowerCase() ?? '') ?? UNKNOWN_STATUS_SEVERITY;
    return [...cases].sort((a, b) => {
        const bySeverity = severityOf(a) - severityOf(b);
        if (bySeverity !== 0)
            return bySeverity;
        return (a.testCaseId ?? 0) - (b.testCaseId ?? 0);
    });
}
|