@pauly4010/evalai-sdk 1.4.1 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. package/CHANGELOG.md +34 -0
  2. package/README.md +102 -8
  3. package/dist/cli/api.d.ts +79 -0
  4. package/dist/cli/api.js +74 -0
  5. package/dist/cli/check.d.ts +15 -12
  6. package/dist/cli/check.js +113 -134
  7. package/dist/cli/ci-context.d.ts +6 -0
  8. package/dist/cli/ci-context.js +51 -0
  9. package/dist/cli/config.d.ts +24 -0
  10. package/dist/cli/config.js +158 -0
  11. package/dist/cli/constants.d.ts +13 -0
  12. package/dist/cli/constants.js +16 -0
  13. package/dist/cli/doctor.d.ts +11 -0
  14. package/dist/cli/doctor.js +82 -0
  15. package/dist/cli/formatters/github.d.ts +8 -0
  16. package/dist/cli/formatters/github.js +119 -0
  17. package/dist/cli/formatters/human.d.ts +6 -0
  18. package/dist/cli/formatters/human.js +92 -0
  19. package/dist/cli/formatters/json.d.ts +6 -0
  20. package/dist/cli/formatters/json.js +10 -0
  21. package/dist/cli/formatters/types.d.ts +76 -0
  22. package/dist/cli/formatters/types.js +5 -0
  23. package/dist/cli/gate.d.ts +13 -0
  24. package/dist/cli/gate.js +108 -0
  25. package/dist/cli/index.d.ts +1 -0
  26. package/dist/cli/index.js +31 -5
  27. package/dist/cli/init.d.ts +7 -0
  28. package/dist/cli/init.js +69 -0
  29. package/dist/cli/render/snippet.d.ts +5 -0
  30. package/dist/cli/render/snippet.js +15 -0
  31. package/dist/cli/render/sort.d.ts +10 -0
  32. package/dist/cli/render/sort.js +24 -0
  33. package/dist/cli/report/build-check-report.d.ts +16 -0
  34. package/dist/cli/report/build-check-report.js +94 -0
  35. package/dist/index.d.ts +1 -0
  36. package/dist/index.js +4 -1
  37. package/dist/integrations/openai-eval.d.ts +53 -0
  38. package/dist/integrations/openai-eval.js +226 -0
  39. package/dist/utils/input-hash.d.ts +8 -0
  40. package/dist/utils/input-hash.js +38 -0
  41. package/package.json +5 -1
  42. package/dist/__tests__/assertions.test.d.ts +0 -1
  43. package/dist/__tests__/assertions.test.js +0 -288
  44. package/dist/__tests__/client.test.d.ts +0 -1
  45. package/dist/__tests__/client.test.js +0 -185
  46. package/dist/__tests__/testing.test.d.ts +0 -1
  47. package/dist/__tests__/testing.test.js +0 -230
  48. package/dist/__tests__/workflows.test.d.ts +0 -1
  49. package/dist/__tests__/workflows.test.js +0 -222
@@ -0,0 +1,119 @@
1
+ "use strict";
2
+ /**
3
+ * GitHub formatter for evalai check.
4
+ * - stdout: minimal (verdict + score + link) + ::error annotations for failed cases
5
+ * - Step summary: full Markdown written to GITHUB_STEP_SUMMARY (not stdout)
6
+ */
7
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
8
+ if (k2 === undefined) k2 = k;
9
+ var desc = Object.getOwnPropertyDescriptor(m, k);
10
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
11
+ desc = { enumerable: true, get: function() { return m[k]; } };
12
+ }
13
+ Object.defineProperty(o, k2, desc);
14
+ }) : (function(o, m, k, k2) {
15
+ if (k2 === undefined) k2 = k;
16
+ o[k2] = m[k];
17
+ }));
18
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
19
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
20
+ }) : function(o, v) {
21
+ o["default"] = v;
22
+ });
23
+ var __importStar = (this && this.__importStar) || (function () {
24
+ var ownKeys = function(o) {
25
+ ownKeys = Object.getOwnPropertyNames || function (o) {
26
+ var ar = [];
27
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
28
+ return ar;
29
+ };
30
+ return ownKeys(o);
31
+ };
32
+ return function (mod) {
33
+ if (mod && mod.__esModule) return mod;
34
+ var result = {};
35
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
36
+ __setModuleDefault(result, mod);
37
+ return result;
38
+ };
39
+ })();
40
+ Object.defineProperty(exports, "__esModule", { value: true });
41
+ exports.appendStepSummary = appendStepSummary;
42
+ exports.formatGitHub = formatGitHub;
43
+ const fs = __importStar(require("fs"));
44
+ const snippet_1 = require("../render/snippet");
45
+ const ANNOTATION_MAX = 10;
46
/**
 * Escape text for use as the message part of a GitHub Actions workflow
 * command (`::error ...::<message>`).
 *
 * The runner percent-decodes the message, so a literal `%` has to be encoded
 * as `%25` before any other escape is introduced; otherwise user text such as
 * "%0A" would be decoded into a line break. Carriage returns are dropped and
 * line feeds are encoded as `%0A` so the command stays on one line.
 *
 * @param {string} s Raw message text.
 * @returns {string} The escaped message.
 */
function escapeAnnotationMessage(s) {
    return s
        .replace(/%/g, '%25') // must run first so the escapes added below are not re-encoded
        .replace(/\r/g, '')
        .replace(/\n/g, '%0A');
}
49
/**
 * Build a single `::error` workflow-command line for one failed case.
 *
 * The case is identified by `testCaseId`, falling back to `name`, then to
 * the literal "unknown". The detail text is the first present value of
 * `reason`, `outputSnippet`, `output`, or "no output", shortened to 100
 * characters before the whole message is escaped.
 *
 * @param {object} fc Failed-case record.
 * @returns {string} One annotation line for stdout.
 */
function formatAnnotation(fc) {
    const caseId = fc.testCaseId ?? fc.name ?? 'unknown';
    const detail = fc.reason ?? fc.outputSnippet ?? fc.output ?? 'no output';
    const shortDetail = snippet_1.truncateSnippet(detail, 100);
    const escaped = escapeAnnotationMessage(`TestCase ${caseId} failed - ${shortDetail}`);
    return '::error title=EvalAI regression::' + escaped;
}
55
/**
 * Append a Markdown summary of the report to the file named by the
 * `GITHUB_STEP_SUMMARY` environment variable.
 *
 * Does nothing when the variable is unset or empty. The summary contains the
 * verdict, the score (with baseline/delta when both are present), up to ten
 * failing cases, and the dashboard link when one is available. Write errors
 * are swallowed on purpose: the step summary is best-effort.
 *
 * @param {object} report Check report to summarise.
 * @returns {void}
 */
function appendStepSummary(report) {
    const summaryFile = typeof process !== 'undefined' && process.env?.GITHUB_STEP_SUMMARY;
    if (!summaryFile) {
        return;
    }
    const verdictLine = report.verdict === 'pass'
        ? '✅ **PASSED**'
        : `❌ **FAILED**: ${report.reasonMessage ?? report.reasonCode}`;
    let deltaStr = '';
    if (report.baselineScore != null && report.delta != null) {
        const sign = report.delta >= 0 ? '+' : '';
        deltaStr = ` (baseline ${report.baselineScore}, ${sign}${report.delta} pts)`;
    }
    const md = [
        '## EvalAI Gate',
        '',
        verdictLine,
        '',
        `**Score:** ${report.score ?? 0}/100${deltaStr}`,
        '',
    ];
    const failures = report.failedCases ?? [];
    if (failures.length > 0) {
        const plural = failures.length === 1 ? '' : 's';
        md.push(`### ${failures.length} failing case${plural}`, '');
        failures.slice(0, 10).forEach((fc) => {
            const label = fc.name ?? fc.input ?? '(unnamed)';
            const expected = snippet_1.truncateSnippet(fc.expectedOutput ?? fc.expectedSnippet, 80);
            const actual = snippet_1.truncateSnippet(fc.output ?? fc.outputSnippet, 80);
            const reason = actual ? `got "${actual}"` : 'no output';
            md.push(`- **${snippet_1.truncateSnippet(label, 60)}** — expected: ${expected || '(any)'}, ${reason}`);
        });
        if (failures.length > 10) {
            md.push(`- _+ ${failures.length - 10} more_`);
        }
        md.push('');
    }
    if (report.dashboardUrl) {
        md.push(`[View Dashboard](${report.dashboardUrl})`, '');
    }
    try {
        fs.appendFileSync(summaryFile, md.join('\n'), 'utf8');
    }
    catch {
        // Non-fatal: a missing or unwritable summary file must not fail the gate.
    }
}
97
/**
 * Render the GitHub-flavoured stdout text for a check report.
 *
 * Output order: one `::error` annotation per failed case (at most
 * ANNOTATION_MAX), the verdict line, the score line, and the dashboard link
 * when present. As a side effect the full Markdown summary is appended to
 * the step-summary file via appendStepSummary.
 *
 * @param {object} report Check report to render.
 * @returns {string} Newline-joined text for stdout.
 */
function formatGitHub(report) {
    const annotated = (report.failedCases ?? []).slice(0, ANNOTATION_MAX);
    const out = annotated.map((fc) => formatAnnotation(fc));
    // Verdict, score and link follow the annotations.
    if (report.verdict === 'pass') {
        out.push('\n✓ EvalAI gate PASSED');
    }
    else {
        out.push(`\n✗ EvalAI gate FAILED: ${report.reasonMessage ?? report.reasonCode}`);
    }
    let deltaStr = '';
    if (report.baselineScore != null && report.delta != null) {
        const sign = report.delta >= 0 ? '+' : '';
        deltaStr = ` (baseline ${report.baselineScore}, ${sign}${report.delta} pts)`;
    }
    out.push(`Score: ${report.score ?? 0}/100${deltaStr}`);
    if (report.dashboardUrl) {
        out.push(`Dashboard: ${report.dashboardUrl}`);
    }
    // The detailed Markdown goes to GITHUB_STEP_SUMMARY, never to stdout.
    appendStepSummary(report);
    return out.join('\n');
}
@@ -0,0 +1,6 @@
1
+ /**
2
+ * Human-readable formatter for evalai check output.
3
+ * Deterministic: verdict → score → failures → link → hint.
4
+ */
5
+ import type { CheckReport } from './types';
6
+ export declare function formatHuman(report: CheckReport): string;
@@ -0,0 +1,92 @@
1
+ "use strict";
2
+ /**
3
+ * Human-readable formatter for evalai check output.
4
+ * Deterministic: verdict → score → failures → link → hint.
5
+ */
6
+ Object.defineProperty(exports, "__esModule", { value: true });
7
+ exports.formatHuman = formatHuman;
8
+ const snippet_1 = require("../render/snippet");
9
+ const TOP_N = 3;
10
/**
 * Join the non-null fields of `source` as `<label><sep><value>` pairs.
 *
 * @param {object} source Object to read values from.
 * @param {Array<[string, string]>} fields `[key, label]` pairs, in output order.
 * @param {string} sep Text placed between a label and its value.
 * @returns {string} Comma-separated pairs, or '' when no field is present.
 */
function joinPresentFields(source, fields, sep) {
    return fields
        .filter(([key]) => source[key] != null)
        .map(([key, label]) => `${label}${sep}${source[key]}`)
        .join(', ');
}
/**
 * Render a check report as plain text for a terminal.
 *
 * Output order is fixed: verdict, score (with baseline/delta when both are
 * known), the first TOP_N failing cases, the dashboard link, a "Next" hint on
 * failure, and - when `report.explain` is set - an Explain section.
 *
 * The Explain section is emitted when any of `breakdown01`, `contribPts`,
 * `flags` or `thresholds` is present, so thresholds are shown even when they
 * are the only explain data available.
 *
 * @param {object} report Check report to render.
 * @returns {string} Newline-joined text.
 */
function formatHuman(report) {
    const lines = [];
    const passed = report.verdict === 'pass';
    lines.push(passed
        ? '\n✓ EvalAI gate PASSED'
        : `\n✗ EvalAI gate FAILED: ${report.reasonMessage ?? report.reasonCode}`);
    const deltaStr = report.baselineScore != null && report.delta != null
        ? ` (baseline ${report.baselineScore}, ${report.delta >= 0 ? '+' : ''}${report.delta} pts)`
        : '';
    lines.push(`Score: ${report.score ?? 0}/100${deltaStr}`);
    const failedCases = report.failedCases ?? [];
    if (failedCases.length > 0) {
        const toShow = failedCases.slice(0, TOP_N);
        lines.push(`${failedCases.length} failing case${failedCases.length === 1 ? '' : 's'}:`);
        for (const fc of toShow) {
            const label = fc.name ?? fc.input ?? '(unnamed)';
            const exp = snippet_1.truncateSnippet(fc.expectedOutput ?? fc.expectedSnippet, 50);
            const out = snippet_1.truncateSnippet(fc.output ?? fc.outputSnippet, 50);
            const reason = out ? `got "${out}"` : 'no output';
            lines.push(` - "${snippet_1.truncateSnippet(label, 50)}" → expected: ${exp || '(any)'}, ${reason}`);
        }
        if (failedCases.length > toShow.length) {
            lines.push(` + ${failedCases.length - toShow.length} more`);
        }
    }
    if (report.dashboardUrl) {
        lines.push(`Dashboard: ${report.dashboardUrl}`);
    }
    if (!passed) {
        lines.push('Next: View full report above, fix failing cases, or adjust gate with --minScore / --maxDrop');
    }
    // thresholds is part of the guard: without it the Thresholds line below
    // could never print for a report that carries no breakdown, contribs or flags.
    const hasExplainData = report.breakdown01 || report.contribPts || report.flags?.length || report.thresholds;
    if (report.explain && hasExplainData) {
        lines.push('', '--- Explain ---');
        if (report.contribPts) {
            const pts = joinPresentFields(report.contribPts, [
                ['passRatePts', 'passRate'],
                ['safetyPts', 'safety'],
                ['compliancePts', 'compliance'],
                ['performancePts', 'performance'],
            ], ': ');
            if (pts) {
                lines.push(`Contrib pts: ${pts}`);
            }
        }
        if (report.breakdown01) {
            const parts = joinPresentFields(report.breakdown01, [
                ['passRate', 'passRate'],
                ['safety', 'safety'],
                ['judge', 'judge'],
                ['schema', 'schema'],
                ['latency', 'latency'],
                ['cost', 'cost'],
            ], '=');
            if (parts) {
                lines.push(`Breakdown: ${parts}`);
            }
        }
        if (report.flags && report.flags.length > 0) {
            lines.push(`Flags: ${report.flags.join(', ')}`);
        }
        if (report.thresholds) {
            const parts = joinPresentFields(report.thresholds, [
                ['minScore', 'minScore'],
                ['maxDrop', 'maxDrop'],
                ['minN', 'minN'],
            ], '=');
            if (parts) {
                lines.push(`Thresholds: ${parts}`);
            }
        }
    }
    return lines.join('\n');
}
@@ -0,0 +1,6 @@
1
+ /**
2
+ * JSON formatter for evalai check.
3
+ * Outputs only JSON, no extra logs.
4
+ */
5
+ import type { CheckReport } from './types';
6
+ export declare function formatJson(report: CheckReport): string;
@@ -0,0 +1,10 @@
1
+ "use strict";
2
+ /**
3
+ * JSON formatter for evalai check.
4
+ * Outputs only JSON, no extra logs.
5
+ */
6
+ Object.defineProperty(exports, "__esModule", { value: true });
7
+ exports.formatJson = formatJson;
8
/**
 * Serialise a check report as compact, single-line JSON.
 *
 * @param {object} report Check report to serialise.
 * @returns {string} JSON text with no indentation.
 */
function formatJson(report) {
    // Leaving out the replacer/indent arguments yields the same compact form.
    const compact = JSON.stringify(report);
    return compact;
}
@@ -0,0 +1,76 @@
1
+ /**
2
+ * CheckReport and related types for formatters.
3
+ */
4
/** Outcome of a gate check. */
export type GateVerdict = 'pass' | 'fail';

/** Machine-readable reason codes carried on a report. */
export type FailureReasonCode =
    | 'LOW_SCORE'
    | 'LOW_PASS_RATE'
    | 'SAFETY_RISK'
    | 'LATENCY_RISK'
    | 'COST_RISK'
    | 'BASELINE_MISSING'
    | 'MAX_DROP_EXCEEDED'
    | 'INSUFFICIENT_EVIDENCE'
    | 'POLICY_VIOLATION'
    | 'UNKNOWN';

/**
 * Optional per-dimension scores.
 * NOTE(review): the `01` suffix suggests values normalised to the 0-1 range - confirm against the API.
 */
export type ScoreBreakdown01 = {
    passRate?: number;
    safety?: number;
    judge?: number;
    schema?: number;
    latency?: number;
    cost?: number;
};

/** Optional point contributions, one per scoring category. */
export type ScoreContribPts = {
    passRatePts?: number;
    safetyPts?: number;
    compliancePts?: number;
    performancePts?: number;
};

/** Gate threshold settings attached to a report; every field is optional. */
export type GateThresholds = {
    minScore?: number;
    minPassRate?: number;
    minSafety?: number;
    maxDrop?: number;
    minN?: number;
    allowWeakEvidence?: boolean;
    baseline?: 'published' | 'previous' | 'production';
};

/**
 * One test case listed on a report. Full values (`input`, `expectedOutput`,
 * `output`) and their `*Snippet` counterparts are all optional.
 */
export type FailedCase = {
    testCaseId?: number;
    status?: 'failed' | 'error' | 'skipped' | 'passed';
    name?: string;
    input?: string;
    inputSnippet?: string;
    expectedOutput?: string;
    expectedSnippet?: string;
    output?: string;
    outputSnippet?: string;
    reason?: string;
};

/** Information about the CI run a report was produced in. */
export type CiContext = {
    provider?: 'github' | 'gitlab' | 'circle' | 'unknown';
    repo?: string;
    sha?: string;
    branch?: string;
    pr?: number;
    runUrl?: string;
    actor?: string;
};

/**
 * Report object handed to the check formatters. Only `evaluationId`,
 * `verdict` and `reasonCode` are required.
 */
export type CheckReport = {
    evaluationId: string;
    runId?: number;
    verdict: GateVerdict;
    reasonCode: FailureReasonCode;
    reasonMessage?: string;
    score?: number;
    baselineScore?: number;
    delta?: number;
    passRate?: number;
    safetyPassRate?: number;
    flags?: string[];
    breakdown01?: ScoreBreakdown01;
    contribPts?: ScoreContribPts;
    thresholds?: GateThresholds;
    n?: number;
    evidenceLevel?: 'strong' | 'medium' | 'weak';
    baselineMissing?: boolean;
    dashboardUrl?: string;
    failedCases?: FailedCase[];
    failedCasesShown?: number;
    failedCasesMore?: number;
    requestId?: string;
    durationMs?: number;
    ci?: CiContext;
    explain?: boolean;
};
@@ -0,0 +1,5 @@
1
+ "use strict";
2
+ /**
3
+ * CheckReport and related types for formatters.
4
+ */
5
+ Object.defineProperty(exports, "__esModule", { value: true });
@@ -0,0 +1,13 @@
1
+ /**
2
+ * Pure gate evaluation. No console output.
3
+ * Baseline missing → configuration failure (BAD_ARGS), not API_ERROR.
4
+ */
5
+ import type { CheckArgs } from './check';
6
+ import type { QualityLatestData } from './api';
7
+ export type GateResult = {
8
+ exitCode: number;
9
+ passed: boolean;
10
+ reasonCode: string;
11
+ reasonMessage: string | null;
12
+ };
13
+ export declare function evaluateGate(args: CheckArgs, quality: QualityLatestData): GateResult;
@@ -0,0 +1,108 @@
1
+ "use strict";
2
+ /**
3
+ * Pure gate evaluation. No console output.
4
+ * Baseline missing → configuration failure (BAD_ARGS), not API_ERROR.
5
+ */
6
+ Object.defineProperty(exports, "__esModule", { value: true });
7
+ exports.evaluateGate = evaluateGate;
8
+ const constants_1 = require("./constants");
9
/**
 * Decide whether a quality snapshot passes the gate described by `args`.
 *
 * Checks run in a fixed order and the first failure wins:
 *   1. baseline missing while a non-"published" baseline or --maxDrop is requested
 *   2. total test cases below --minN
 *   3. weak evidence without --allowWeakEvidence
 *   4. score below --minScore (only when minScore > 0)
 *   5. regression larger than --maxDrop
 *   6. --policy: unknown name, safety rate below the policy's floor, or a
 *      flag the policy forbids
 * The function performs no console output.
 *
 * @param {object} args Parsed `check` arguments.
 * @param {object} quality Latest quality data for the evaluation.
 * @returns {{exitCode: number, passed: boolean, reasonCode: string, reasonMessage: (string|null)}}
 */
function evaluateGate(args, quality) {
    // Every failing outcome has the same shape; only these three values vary.
    const fail = (exitCode, reasonCode, reasonMessage) => ({
        exitCode,
        passed: false,
        reasonCode,
        reasonMessage,
    });
    const score = quality?.score ?? 0;
    const total = quality?.total ?? null;
    const regressionDelta = quality?.regressionDelta ?? null;
    const isWeakEvidence = (quality?.evidenceLevel ?? null) === 'weak';
    const needsBaseline = args.baseline !== 'published' || args.maxDrop !== undefined;

    // A missing baseline is a configuration problem, not an API error.
    if (quality?.baselineMissing === true && needsBaseline) {
        let msg;
        if (args.baseline === 'production') {
            msg = 'No prod runs exist for this evaluation. Tag runs with environment=prod before using --baseline production.';
        }
        else {
            msg = `Baseline (${args.baseline}) not found. Ensure a baseline run exists (e.g. published run, previous run, or prod-tagged run).`;
        }
        return fail(constants_1.EXIT.BAD_ARGS, 'BASELINE_MISSING', msg);
    }
    if (args.minN !== undefined && total !== null && total < args.minN) {
        return fail(constants_1.EXIT.LOW_N, 'INSUFFICIENT_EVIDENCE', `total test cases (${total}) < minN (${args.minN})`);
    }
    if (isWeakEvidence && !args.allowWeakEvidence) {
        return fail(constants_1.EXIT.WEAK_EVIDENCE, 'INSUFFICIENT_EVIDENCE', "evidence level is 'weak' (use --allowWeakEvidence to permit)");
    }
    if (args.minScore > 0 && score < args.minScore) {
        return fail(constants_1.EXIT.SCORE_BELOW, 'LOW_SCORE', `score ${score} < minScore ${args.minScore}`);
    }
    if (args.maxDrop !== undefined && regressionDelta !== null && regressionDelta < -(args.maxDrop)) {
        return fail(constants_1.EXIT.REGRESSION, 'MAX_DROP_EXCEEDED', `score dropped ${Math.abs(regressionDelta)} pts from baseline (max allowed: ${args.maxDrop})`);
    }
    if (args.policy) {
        // Per-policy safety floor and the flags that count as violations.
        const policyChecks = {
            HIPAA: { requiredSafetyRate: 0.99, maxFlags: ['SAFETY_RISK'] },
            SOC2: { requiredSafetyRate: 0.95, maxFlags: ['SAFETY_RISK', 'LOW_PASS_RATE'] },
            GDPR: { requiredSafetyRate: 0.95, maxFlags: ['SAFETY_RISK'] },
            PCI_DSS: { requiredSafetyRate: 0.99, maxFlags: ['SAFETY_RISK', 'LOW_PASS_RATE'] },
            FINRA_4511: { requiredSafetyRate: 0.95, maxFlags: ['SAFETY_RISK'] },
        };
        const policyName = args.policy.toUpperCase();
        const check = policyChecks[policyName];
        if (!check) {
            const available = Object.keys(policyChecks).join(', ');
            return fail(constants_1.EXIT.BAD_ARGS, 'UNKNOWN', `Unknown policy: ${args.policy}. Available: ${available}`);
        }
        // A missing safety figure counts as 0, which fails every policy.
        const safetyRate = quality?.breakdown?.safety ?? 0;
        if (safetyRate < check.requiredSafetyRate) {
            return fail(constants_1.EXIT.POLICY_VIOLATION, 'POLICY_VIOLATION', `policy ${policyName}: safety ${Math.round(safetyRate * 100)}% < required ${Math.round(check.requiredSafetyRate * 100)}%`);
        }
        const violations = (quality?.flags ?? []).filter((flag) => check.maxFlags.includes(flag));
        if (violations.length > 0) {
            return fail(constants_1.EXIT.POLICY_VIOLATION, 'POLICY_VIOLATION', `policy ${policyName}: ${violations.join(', ')}`);
        }
    }
    return {
        exitCode: constants_1.EXIT.PASS,
        passed: true,
        reasonCode: 'PASS',
        reasonMessage: null,
    };
}
@@ -3,6 +3,7 @@
3
3
  * evalai — EvalAI CLI
4
4
  *
5
5
  * Commands:
6
+ * evalai init — Create evalai.config.json
6
7
  * evalai check — CI/CD evaluation gate (see evalai check --help)
7
8
  */
8
9
  export {};
package/dist/cli/index.js CHANGED
@@ -4,15 +4,35 @@
4
4
  * evalai — EvalAI CLI
5
5
  *
6
6
  * Commands:
7
+ * evalai init — Create evalai.config.json
7
8
  * evalai check — CI/CD evaluation gate (see evalai check --help)
8
9
  */
9
10
  Object.defineProperty(exports, "__esModule", { value: true });
10
11
  const check_1 = require("./check");
12
+ const init_1 = require("./init");
13
+ const doctor_1 = require("./doctor");
11
14
  const argv = process.argv.slice(2);
12
15
  const subcommand = argv[0];
13
- if (subcommand === 'check') {
14
- const args = (0, check_1.parseArgs)(argv.slice(1));
15
- (0, check_1.runCheck)(args)
16
+ if (subcommand === 'init') {
17
+ const cwd = process.cwd();
18
+ const ok = (0, init_1.runInit)(cwd);
19
+ process.exit(ok ? 0 : 1);
20
+ }
21
+ else if (subcommand === 'doctor') {
22
+ (0, doctor_1.runDoctor)(argv.slice(1))
23
+ .then((code) => process.exit(code))
24
+ .catch((err) => {
25
+ console.error(`EvalAI ERROR: ${err instanceof Error ? err.message : String(err)}`);
26
+ process.exit(1);
27
+ });
28
+ }
29
+ else if (subcommand === 'check') {
30
+ const parsed = (0, check_1.parseArgs)(argv.slice(1));
31
+ if (!parsed.ok) {
32
+ console.error(parsed.message);
33
+ process.exit(parsed.exitCode);
34
+ }
35
+ (0, check_1.runCheck)(parsed.args)
16
36
  .then((code) => process.exit(code))
17
37
  .catch((err) => {
18
38
  console.error(`EvalAI ERROR: ${err instanceof Error ? err.message : String(err)}`);
@@ -23,20 +43,26 @@ else {
23
43
  console.log(`EvalAI CLI
24
44
 
25
45
  Usage:
46
+ evalai init Create evalai.config.json
47
+ evalai doctor [options] Verify CI/CD setup (same endpoint as check)
26
48
  evalai check [options] CI/CD evaluation gate
27
49
 
28
50
  Options for check:
29
- --evaluationId <id> Required. Evaluation to gate on.
51
+ --evaluationId <id> Evaluation to gate on (or from config)
30
52
  --apiKey <key> API key (or EVALAI_API_KEY env)
53
+ --format <fmt> Output format: human (default), json, github
54
+ --explain Show score breakdown and thresholds
55
+ --onFail import When gate fails, import run with CI context
31
56
  --minScore <n> Fail if score < n (0-100)
32
57
  --maxDrop <n> Fail if score dropped > n from baseline
33
58
  --minN <n> Fail if total test cases < n
34
59
  --allowWeakEvidence Allow weak evidence level
35
60
  --policy <name> Enforce policy (HIPAA, SOC2, GDPR, etc.)
36
- --baseline <mode> "published" or "previous"
61
+ --baseline <mode> "published", "previous", or "production"
37
62
  --baseUrl <url> API base URL
38
63
 
39
64
  Examples:
65
+ evalai init
40
66
  evalai check --minScore 92 --evaluationId 42 --apiKey $EVALAI_API_KEY
41
67
  evalai check --policy HIPAA --evaluationId 42 --apiKey $EVALAI_API_KEY
42
68
  `);
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * evalai init — Create evalai.config.json
4
+ *
5
+ * Creates the smallest possible config file. Defaults belong in code.
6
+ */
7
+ export declare function runInit(cwd?: string): boolean;
@@ -0,0 +1,69 @@
1
+ #!/usr/bin/env node
2
+ "use strict";
3
+ /**
4
+ * evalai init — Create evalai.config.json
5
+ *
6
+ * Creates the smallest possible config file. Defaults belong in code.
7
+ */
8
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
9
+ if (k2 === undefined) k2 = k;
10
+ var desc = Object.getOwnPropertyDescriptor(m, k);
11
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
12
+ desc = { enumerable: true, get: function() { return m[k]; } };
13
+ }
14
+ Object.defineProperty(o, k2, desc);
15
+ }) : (function(o, m, k, k2) {
16
+ if (k2 === undefined) k2 = k;
17
+ o[k2] = m[k];
18
+ }));
19
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
20
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
21
+ }) : function(o, v) {
22
+ o["default"] = v;
23
+ });
24
+ var __importStar = (this && this.__importStar) || (function () {
25
+ var ownKeys = function(o) {
26
+ ownKeys = Object.getOwnPropertyNames || function (o) {
27
+ var ar = [];
28
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
29
+ return ar;
30
+ };
31
+ return ownKeys(o);
32
+ };
33
+ return function (mod) {
34
+ if (mod && mod.__esModule) return mod;
35
+ var result = {};
36
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
37
+ __setModuleDefault(result, mod);
38
+ return result;
39
+ };
40
+ })();
41
+ Object.defineProperty(exports, "__esModule", { value: true });
42
+ exports.runInit = runInit;
43
+ const fs = __importStar(require("fs"));
44
+ const path = __importStar(require("path"));
45
/**
 * Contents of a freshly created evalai.config.json: a single empty
 * `evaluationId`, pretty-printed with a trailing newline.
 */
const CONFIG_CONTENT = `${JSON.stringify({ evaluationId: '' }, null, 2)}\n`;
/**
 * Create `evalai.config.json` in `cwd` and print follow-up instructions.
 *
 * An existing file is never overwritten. The file is opened with the
 * exclusive-create flag (`wx`), so a config that appears between the
 * existence check and the write is also left untouched.
 *
 * @param {string} [cwd] Directory to create the config in; defaults to the
 *   current working directory.
 * @returns {boolean} `true` when the file was written, `false` when it
 *   already existed.
 * @throws Any file-system error other than "already exists" (for example a
 *   missing or read-only directory) is rethrown unchanged.
 */
function runInit(cwd = process.cwd()) {
    const configPath = path.join(cwd, 'evalai.config.json');
    const resolvedPath = path.resolve(configPath);
    const reportExisting = () => {
        console.log(`evalai.config.json already exists at ${resolvedPath}`);
        return false;
    };
    if (fs.existsSync(configPath)) {
        return reportExisting();
    }
    try {
        // 'wx' fails with EEXIST instead of truncating a file created in the meantime.
        fs.writeFileSync(configPath, CONFIG_CONTENT, { encoding: 'utf-8', flag: 'wx' });
    }
    catch (err) {
        if (err instanceof Error && 'code' in err && err.code === 'EEXIST') {
            return reportExisting();
        }
        throw err;
    }
    console.log(`Wrote evalai.config.json at ${resolvedPath}`);
    console.log('');
    console.log('Next: paste evaluationId into evalai.config.json, then run npx -y @pauly4010/evalai-sdk@^1 check --format github --onFail import');
    console.log('');
    console.log('GitHub Actions snippet (add to your workflow):');
    // Indentation matters here: the snippet must be valid YAML when pasted under `steps:`.
    console.log('  - name: EvalAI gate');
    console.log('    env:');
    console.log('      EVALAI_API_KEY: ${{ secrets.EVALAI_API_KEY }}');
    console.log('    run: npx -y @pauly4010/evalai-sdk@^1 check --format github --onFail import');
    console.log('');
    console.log('To uninstall: delete evalai.config.json.');
    return true;
}
@@ -0,0 +1,5 @@
1
+ /**
2
+ * Truncate a string for deterministic output.
3
+ * Collapses every run of whitespace (including newlines) into one space, trims the ends, then caps the length.
4
+ */
5
+ export declare function truncateSnippet(s: string | undefined | null, maxLen?: number): string;
@@ -0,0 +1,15 @@
1
+ "use strict";
2
+ /**
3
+ * Truncate a string for deterministic output.
4
+ * Collapses every run of whitespace (including newlines) into one space, trims the ends, then caps the length.
5
+ */
6
+ Object.defineProperty(exports, "__esModule", { value: true });
7
+ exports.truncateSnippet = truncateSnippet;
8
/**
 * Normalise and shorten a string for single-line, deterministic output.
 *
 * Every run of whitespace (including newlines) is collapsed to one space and
 * the ends are trimmed. If the result is longer than `maxLen` UTF-16 code
 * units it is cut and an ellipsis ("…") is appended, so a truncated result
 * may be one character longer than `maxLen`.
 *
 * The cut never lands in the middle of a surrogate pair: when the last kept
 * code unit is a high surrogate it is dropped as well, so the output never
 * contains half of an emoji or other astral character. A negative `maxLen`
 * is treated as 0.
 *
 * @param {string|null|undefined} s Text to shorten.
 * @param {number} [maxLen=140] Maximum number of code units kept before the ellipsis.
 * @returns {string} The shortened text, or '' for null/undefined input.
 */
function truncateSnippet(s, maxLen = 140) {
    if (s == null)
        return '';
    const normalized = s.replace(/\s+/g, ' ').trim();
    if (normalized.length <= maxLen)
        return normalized;
    let end = Math.max(0, maxLen);
    if (end > 0) {
        const lastKept = normalized.charCodeAt(end - 1);
        // 0xD800-0xDBFF is the high-surrogate range; its partner sits just past the cut.
        if (lastKept >= 0xd800 && lastKept <= 0xdbff)
            end -= 1;
    }
    return normalized.slice(0, end) + '…';
}
@@ -0,0 +1,10 @@
1
+ /**
2
+ * Deterministic ordering for failed cases.
3
+ * Sort by status severity (failed > error > skipped > passed), then by testCaseId asc.
4
+ */
5
+ export interface SortableCase {
6
+ status?: string;
7
+ testCaseId?: number;
8
+ [key: string]: unknown;
9
+ }
10
+ export declare function sortFailedCases<T extends SortableCase>(cases: T[]): T[];
@@ -0,0 +1,24 @@
1
+ "use strict";
2
+ /**
3
+ * Deterministic ordering for failed cases.
4
+ * Sort by status severity (failed > error > skipped > passed), then by testCaseId asc.
5
+ */
6
+ Object.defineProperty(exports, "__esModule", { value: true });
7
+ exports.sortFailedCases = sortFailedCases;
8
/**
 * Sort rank per status; lower ranks come first. A Map is used rather than a
 * plain object so that statuses such as "constructor" cannot resolve to
 * members inherited from Object.prototype.
 */
const STATUS_SEVERITY = new Map([
    ['failed', 0],
    ['error', 1],
    ['skipped', 2],
    ['passed', 3],
]);
/** Rank assigned to a missing, non-string or unrecognised status. */
const UNKNOWN_SEVERITY = 4;
/** Look up the rank of a status value, case-insensitively. */
function severityOf(status) {
    const key = typeof status === 'string' ? status.toLowerCase() : '';
    return STATUS_SEVERITY.get(key) ?? UNKNOWN_SEVERITY;
}
/**
 * Return a sorted copy of `cases`; the input array is not modified.
 *
 * Order: status severity (failed, error, skipped, passed, then anything
 * else), and within one severity by `testCaseId` ascending, with a missing
 * id treated as 0.
 *
 * @param {Array<object>} cases Cases to order.
 * @returns {Array<object>} New array in deterministic order.
 */
function sortFailedCases(cases) {
    return [...cases].sort((a, b) => {
        const bySeverity = severityOf(a.status) - severityOf(b.status);
        if (bySeverity !== 0)
            return bySeverity;
        return (a.testCaseId ?? 0) - (b.testCaseId ?? 0);
    });
}