@pauly4010/evalai-sdk 1.8.0 → 1.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +54 -0
- package/README.md +136 -23
- package/dist/assertions.js +51 -18
- package/dist/batch.js +8 -2
- package/dist/cli/api.js +3 -1
- package/dist/cli/check.js +19 -6
- package/dist/cli/ci-context.js +3 -1
- package/dist/cli/ci.d.ts +45 -0
- package/dist/cli/ci.js +192 -0
- package/dist/cli/config.js +28 -8
- package/dist/cli/diff.d.ts +173 -0
- package/dist/cli/diff.js +685 -0
- package/dist/cli/discover.d.ts +84 -0
- package/dist/cli/discover.js +419 -0
- package/dist/cli/doctor.js +62 -19
- package/dist/cli/env.d.ts +21 -0
- package/dist/cli/env.js +42 -0
- package/dist/cli/explain.js +168 -36
- package/dist/cli/formatters/human.js +4 -1
- package/dist/cli/formatters/pr-comment.js +3 -1
- package/dist/cli/gate.js +6 -2
- package/dist/cli/impact-analysis.d.ts +63 -0
- package/dist/cli/impact-analysis.js +252 -0
- package/dist/cli/index.js +185 -0
- package/dist/cli/manifest.d.ts +103 -0
- package/dist/cli/manifest.js +282 -0
- package/dist/cli/migrate.d.ts +41 -0
- package/dist/cli/migrate.js +349 -0
- package/dist/cli/policy-packs.js +8 -2
- package/dist/cli/print-config.js +33 -14
- package/dist/cli/regression-gate.js +8 -2
- package/dist/cli/report/build-check-report.js +8 -2
- package/dist/cli/run.d.ts +101 -0
- package/dist/cli/run.js +395 -0
- package/dist/cli/share.js +3 -1
- package/dist/cli/upgrade.js +2 -1
- package/dist/cli/workspace.d.ts +28 -0
- package/dist/cli/workspace.js +58 -0
- package/dist/client.d.ts +16 -19
- package/dist/client.js +60 -43
- package/dist/client.request.test.d.ts +1 -1
- package/dist/client.request.test.js +222 -147
- package/dist/context.js +3 -1
- package/dist/errors.js +11 -4
- package/dist/export.js +3 -1
- package/dist/index.d.ts +8 -2
- package/dist/index.js +30 -5
- package/dist/integrations/anthropic.d.ts +20 -1
- package/dist/integrations/openai-eval.js +4 -2
- package/dist/integrations/openai.d.ts +24 -1
- package/dist/local.js +3 -1
- package/dist/logger.js +6 -2
- package/dist/pagination.js +6 -2
- package/dist/runtime/adapters/config-to-dsl.d.ts +33 -0
- package/dist/runtime/adapters/config-to-dsl.js +394 -0
- package/dist/runtime/adapters/testsuite-to-dsl.d.ts +63 -0
- package/dist/runtime/adapters/testsuite-to-dsl.js +276 -0
- package/dist/runtime/context.d.ts +26 -0
- package/dist/runtime/context.js +74 -0
- package/dist/runtime/eval.d.ts +46 -0
- package/dist/runtime/eval.js +244 -0
- package/dist/runtime/execution-mode.d.ts +80 -0
- package/dist/runtime/execution-mode.js +357 -0
- package/dist/runtime/executor.d.ts +16 -0
- package/dist/runtime/executor.js +152 -0
- package/dist/runtime/registry.d.ts +78 -0
- package/dist/runtime/registry.js +403 -0
- package/dist/runtime/run-report.d.ts +200 -0
- package/dist/runtime/run-report.js +222 -0
- package/dist/runtime/types.d.ts +356 -0
- package/dist/runtime/types.js +76 -0
- package/dist/testing.d.ts +65 -0
- package/dist/testing.js +49 -2
- package/dist/types.d.ts +100 -69
- package/dist/utils/input-hash.js +4 -1
- package/dist/version.d.ts +1 -1
- package/dist/version.js +1 -1
- package/dist/workflows.js +62 -14
- package/package.json +115 -110
package/dist/cli/print-config.js
CHANGED
|
@@ -114,8 +114,10 @@ function buildResolvedConfig(cwd, flags) {
|
|
|
114
114
|
// Determine source of each field
|
|
115
115
|
const fields = [];
|
|
116
116
|
// evaluationId
|
|
117
|
-
const evalIdSource = flags.evaluationId
|
|
118
|
-
|
|
117
|
+
const evalIdSource = flags.evaluationId
|
|
118
|
+
? "arg"
|
|
119
|
+
: fileConfig?.evaluationId
|
|
120
|
+
? "file"
|
|
119
121
|
: "default";
|
|
120
122
|
fields.push({
|
|
121
123
|
key: "evaluationId",
|
|
@@ -124,20 +126,28 @@ function buildResolvedConfig(cwd, flags) {
|
|
|
124
126
|
});
|
|
125
127
|
// baseUrl
|
|
126
128
|
const envBaseUrl = process.env.EVALAI_BASE_URL;
|
|
127
|
-
const baseUrlSource = flags.baseUrl
|
|
128
|
-
|
|
129
|
-
|
|
129
|
+
const baseUrlSource = flags.baseUrl
|
|
130
|
+
? "arg"
|
|
131
|
+
: envBaseUrl
|
|
132
|
+
? "env"
|
|
133
|
+
: fileConfig?.baseUrl
|
|
134
|
+
? "file"
|
|
130
135
|
: "default";
|
|
131
136
|
fields.push({
|
|
132
137
|
key: "baseUrl",
|
|
133
|
-
value: flags.baseUrl ||
|
|
138
|
+
value: flags.baseUrl ||
|
|
139
|
+
envBaseUrl ||
|
|
140
|
+
fileConfig?.baseUrl ||
|
|
141
|
+
"http://localhost:3000",
|
|
134
142
|
source: baseUrlSource,
|
|
135
143
|
});
|
|
136
144
|
// apiKey (always redacted)
|
|
137
145
|
const envApiKey = process.env.EVALAI_API_KEY;
|
|
138
146
|
const rawApiKey = flags.apiKey || envApiKey || "";
|
|
139
|
-
const apiKeySource = flags.apiKey
|
|
140
|
-
|
|
147
|
+
const apiKeySource = flags.apiKey
|
|
148
|
+
? "arg"
|
|
149
|
+
: envApiKey
|
|
150
|
+
? "env"
|
|
141
151
|
: "default";
|
|
142
152
|
fields.push({
|
|
143
153
|
key: "apiKey",
|
|
@@ -147,7 +157,11 @@ function buildResolvedConfig(cwd, flags) {
|
|
|
147
157
|
});
|
|
148
158
|
// profile
|
|
149
159
|
const profileName = (flags.profile || fileConfig?.profile);
|
|
150
|
-
const profileSource = flags.profile
|
|
160
|
+
const profileSource = flags.profile
|
|
161
|
+
? "arg"
|
|
162
|
+
: fileConfig?.profile
|
|
163
|
+
? "file"
|
|
164
|
+
: "default";
|
|
151
165
|
fields.push({
|
|
152
166
|
key: "profile",
|
|
153
167
|
value: profileName ?? null,
|
|
@@ -167,9 +181,12 @@ function buildResolvedConfig(cwd, flags) {
|
|
|
167
181
|
const profileVal = profileName && profileName in profiles_1.PROFILES
|
|
168
182
|
? profiles_1.PROFILES[profileName][key]
|
|
169
183
|
: undefined;
|
|
170
|
-
const source = argVal !== undefined
|
|
171
|
-
|
|
172
|
-
|
|
184
|
+
const source = argVal !== undefined
|
|
185
|
+
? "arg"
|
|
186
|
+
: fileVal !== undefined
|
|
187
|
+
? "file"
|
|
188
|
+
: profileVal !== undefined
|
|
189
|
+
? "profile"
|
|
173
190
|
: "default";
|
|
174
191
|
fields.push({
|
|
175
192
|
key,
|
|
@@ -178,8 +195,10 @@ function buildResolvedConfig(cwd, flags) {
|
|
|
178
195
|
});
|
|
179
196
|
}
|
|
180
197
|
// baseline
|
|
181
|
-
const baselineSource = flags.baseline
|
|
182
|
-
|
|
198
|
+
const baselineSource = flags.baseline
|
|
199
|
+
? "arg"
|
|
200
|
+
: fileConfig?.baseline
|
|
201
|
+
? "file"
|
|
183
202
|
: "default";
|
|
184
203
|
fields.push({
|
|
185
204
|
key: "baseline",
|
|
@@ -137,7 +137,10 @@ function runBuiltinGate(cwd) {
|
|
|
137
137
|
};
|
|
138
138
|
}
|
|
139
139
|
const baselineMeta = baselineData.updatedAt
|
|
140
|
-
? {
|
|
140
|
+
? {
|
|
141
|
+
updatedAt: baselineData.updatedAt,
|
|
142
|
+
updatedBy: baselineData.updatedBy ?? "unknown",
|
|
143
|
+
}
|
|
141
144
|
: null;
|
|
142
145
|
// Run tests
|
|
143
146
|
const isWin = process.platform === "win32";
|
|
@@ -302,7 +305,10 @@ function runGate(argv) {
|
|
|
302
305
|
process.stdout.write(fs.readFileSync(reportPath, "utf-8"));
|
|
303
306
|
}
|
|
304
307
|
else {
|
|
305
|
-
console.error(JSON.stringify({
|
|
308
|
+
console.error(JSON.stringify({
|
|
309
|
+
error: "regression-report.json not found",
|
|
310
|
+
exitCode,
|
|
311
|
+
}));
|
|
306
312
|
}
|
|
307
313
|
}
|
|
308
314
|
else if (args.format === "github") {
|
|
@@ -60,7 +60,9 @@ function buildCheckReport(input) {
|
|
|
60
60
|
}
|
|
61
61
|
const failedCasesShown = Math.min(failedCases.length, TOP_N);
|
|
62
62
|
const failedCasesMore = failedCases.length - failedCasesShown;
|
|
63
|
-
const breakdown01 = Object.keys(breakdown).length > 0
|
|
63
|
+
const breakdown01 = Object.keys(breakdown).length > 0
|
|
64
|
+
? breakdown
|
|
65
|
+
: undefined;
|
|
64
66
|
const contribPts = args.explain && breakdown01 ? computeContribPts(breakdown01) : undefined;
|
|
65
67
|
const gateSkipped = gateResult.gateSkipped === true;
|
|
66
68
|
const gateApplied = !gateSkipped;
|
|
@@ -68,7 +70,11 @@ function buildCheckReport(input) {
|
|
|
68
70
|
const actionableMessage = gateSkipped
|
|
69
71
|
? "Gate not applied: baseline missing. Publish a baseline from the dashboard, or run with --baseline previous once you have runs."
|
|
70
72
|
: (gateResult.reasonMessage ?? undefined);
|
|
71
|
-
const verdict = gateResult.reasonCode === "WARN_REGRESSION"
|
|
73
|
+
const verdict = gateResult.reasonCode === "WARN_REGRESSION"
|
|
74
|
+
? "warn"
|
|
75
|
+
: gateResult.passed
|
|
76
|
+
? "pass"
|
|
77
|
+
: "fail";
|
|
72
78
|
const report = {
|
|
73
79
|
schemaVersion: types_1.CHECK_REPORT_SCHEMA_VERSION,
|
|
74
80
|
evaluationId: args.evaluationId,
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* TICKET 4 — Unified evalai run CLI Command
|
|
3
|
+
*
|
|
4
|
+
* Goal: Consolidated execution interface that consumes manifest
|
|
5
|
+
*
|
|
6
|
+
* Features:
|
|
7
|
+
* - Manifest loading and spec filtering
|
|
8
|
+
* - --impacted-only integration with impact analysis
|
|
9
|
+
* - Local executor integration
|
|
10
|
+
* - .evalai/last-run.json output
|
|
11
|
+
* - Legacy mode compatibility
|
|
12
|
+
*/
|
|
13
|
+
/**
 * Options controlling which specs a run executes and how the outcome is reported.
 * Every field is optional.
 */
export interface RunOptions {
    /** When set, only specs whose ID appears in this list are run. */
    specIds?: string[];
    /** Restrict the run to specs flagged by impact analysis (requires `baseBranch`). */
    impactedOnly?: boolean;
    /** Branch that impact analysis compares against. */
    baseBranch?: string;
    /** How the result is printed: human-readable text or JSON. */
    format?: "human" | "json";
    /** Persist the run result to disk when true. */
    writeResults?: boolean;
}
|
|
28
|
+
/**
 * Complete outcome of one evaluation run: identity, timing metadata,
 * per-spec results and aggregate counts.
 */
export interface RunResult {
    /** Version of this result schema, used for compatibility checks. */
    schemaVersion: number;
    /** Identifier that distinguishes this run from others. */
    runId: string;
    /** Timing and scope information about the run. */
    metadata: {
        /** When the run began. */
        startedAt: number;
        /** When the run finished. */
        completedAt: number;
        /** Elapsed time between start and completion. */
        duration: number;
        /** Number of specs listed in the manifest. */
        totalSpecs: number;
        /** Number of specs actually selected for this run. */
        executedSpecs: number;
        /** Runtime mode the run was executed in. */
        mode: "spec" | "legacy";
    };
    /** One entry per executed spec. */
    results: SpecResult[];
    /** Aggregate counts over `results`. */
    summary: {
        passed: number;
        failed: number;
        skipped: number;
        /** Fraction of results that passed. */
        passRate: number;
    };
}
|
|
55
|
+
/**
 * Outcome of executing a single spec.
 */
export interface SpecResult {
    /** ID of the spec that was run. */
    specId: string;
    /** Display name of the spec. */
    name: string;
    /** Path of the file the spec lives in. */
    filePath: string;
    /** What happened when the spec ran. */
    result: {
        /** Final state of the spec. */
        status: "passed" | "failed" | "skipped";
        /** Score reported by the spec, when there is one. */
        score?: number;
        /** Failure message, when there is one. */
        error?: string;
        /** Time the spec took to execute. */
        duration: number;
    };
}
|
|
73
|
+
/**
 * Execute evaluation specs and resolve with the collected run result.
 *
 * @param options - Spec selection and output options.
 * @param projectRoot - Optional project directory to run against.
 * @returns The result of the run.
 */
export declare function runEvaluations(options: RunOptions, projectRoot?: string): Promise<RunResult>;
|
|
77
|
+
/**
 * One row of the run index: a compact summary of a single run.
 */
export interface RunIndexEntry {
    /** ID of the run this row describes. */
    runId: string;
    /** Creation time of the run, used to order the index. */
    createdAt: number;
    /** Git commit SHA, when available. */
    gitSha?: string;
    /** Git branch name, when available. */
    branch?: string;
    /** Runtime mode the run was executed in. */
    mode: "spec" | "legacy";
    /** Number of specs covered by the run. */
    specCount: number;
    /** Fraction of specs that passed. */
    passRate: number;
    /** Average score across the run. */
    avgScore: number;
}
|
|
90
|
+
/**
 * Print a run result as human-readable text.
 *
 * @param result - The run result to print.
 */
export declare function printHumanResults(result: RunResult): void;
|
|
94
|
+
/**
 * Print a run result as JSON.
 *
 * @param result - The run result to print.
 */
export declare function printJsonResults(result: RunResult): void;
|
|
98
|
+
/**
 * Entry point used by the command-line interface to run evaluations.
 *
 * @param options - Spec selection and output options.
 */
export declare function runEvaluationsCLI(options: RunOptions): Promise<void>;
|
package/dist/cli/run.js
ADDED
|
@@ -0,0 +1,395 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* TICKET 4 — Unified evalai run CLI Command
|
|
4
|
+
*
|
|
5
|
+
* Goal: Consolidated execution interface that consumes manifest
|
|
6
|
+
*
|
|
7
|
+
* Features:
|
|
8
|
+
* - Manifest loading and spec filtering
|
|
9
|
+
* - --impacted-only integration with impact analysis
|
|
10
|
+
* - Local executor integration
|
|
11
|
+
* - .evalai/last-run.json output
|
|
12
|
+
* - Legacy mode compatibility
|
|
13
|
+
*/
|
|
14
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
15
|
+
if (k2 === undefined) k2 = k;
|
|
16
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
17
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
18
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
19
|
+
}
|
|
20
|
+
Object.defineProperty(o, k2, desc);
|
|
21
|
+
}) : (function(o, m, k, k2) {
|
|
22
|
+
if (k2 === undefined) k2 = k;
|
|
23
|
+
o[k2] = m[k];
|
|
24
|
+
}));
|
|
25
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
26
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
27
|
+
}) : function(o, v) {
|
|
28
|
+
o["default"] = v;
|
|
29
|
+
});
|
|
30
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
31
|
+
var ownKeys = function(o) {
|
|
32
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
33
|
+
var ar = [];
|
|
34
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
35
|
+
return ar;
|
|
36
|
+
};
|
|
37
|
+
return ownKeys(o);
|
|
38
|
+
};
|
|
39
|
+
return function (mod) {
|
|
40
|
+
if (mod && mod.__esModule) return mod;
|
|
41
|
+
var result = {};
|
|
42
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
43
|
+
__setModuleDefault(result, mod);
|
|
44
|
+
return result;
|
|
45
|
+
};
|
|
46
|
+
})();
|
|
47
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
48
|
+
exports.runEvaluations = runEvaluations;
|
|
49
|
+
exports.printHumanResults = printHumanResults;
|
|
50
|
+
exports.printJsonResults = printJsonResults;
|
|
51
|
+
exports.runEvaluationsCLI = runEvaluationsCLI;
|
|
52
|
+
const node_child_process_1 = require("node:child_process");
|
|
53
|
+
const fs = __importStar(require("node:fs/promises"));
|
|
54
|
+
const path = __importStar(require("node:path"));
|
|
55
|
+
const impact_analysis_1 = require("./impact-analysis");
|
|
56
|
+
/**
 * Generate a run ID of the form `run-<timestamp>-<suffix>`.
 *
 * The ID is NOT deterministic: `<timestamp>` is the current time in
 * milliseconds and `<suffix>` comes from `Math.random()`, both rendered in
 * base 36. The suffix is always exactly six characters long.
 *
 * @returns A new run identifier string.
 */
function generateRunId() {
    const timestamp = Date.now().toString(36);
    // Math.random().toString(36) can be very short (0.5 renders as "0.i"),
    // so pad the slice to keep the suffix at a fixed width.
    const random = Math.random().toString(36).substring(2, 8).padEnd(6, "0");
    return `run-${timestamp}-${random}`;
}
|
|
64
|
+
/**
 * Run the evaluation specs listed in the project's manifest.
 *
 * Spec selection, in priority order:
 *   1. `impactedOnly` together with `baseBranch`: only specs reported by impact analysis.
 *   2. a non-empty `specIds`: only specs whose ID is in the list.
 *   3. an empty `specIds` array: nothing is run.
 *   4. otherwise: every spec in the manifest.
 * `impactedOnly` without `baseBranch` does not trigger impact analysis; selection
 * falls through to rules 2-4.
 *
 * Progress messages go to stderr when `options.format === "json"` so the JSON
 * document printed on stdout is not preceded by progress text from this function.
 *
 * @param options - Spec selection and output options.
 * @param projectRoot - Directory containing `.evalai/`; defaults to the current working directory.
 * @returns The run result (metadata, per-spec results and summary).
 * @throws Error when no manifest can be loaded.
 */
async function runEvaluations(options, projectRoot = process.cwd()) {
    const startTime = Date.now();
    // Load manifest
    const manifest = await loadManifest(projectRoot);
    if (!manifest) {
        throw new Error("No evaluation manifest found. Run 'evalai discover --manifest' first.");
    }
    // Keep stdout clean for machine-readable output.
    const log = options.format === "json" ? console.error : console.log;
    // A manifest without a specs array is treated as having no specs instead of crashing.
    const allSpecs = Array.isArray(manifest.specs) ? manifest.specs : [];
    // Determine which specs to run
    let specsToRun = allSpecs;
    if (options.impactedOnly && options.baseBranch) {
        // Run impact analysis first, then keep only the specs it reports.
        const impactResult = await (0, impact_analysis_1.runImpactAnalysis)({
            baseBranch: options.baseBranch,
        }, projectRoot);
        const impactedSpecIds = new Set(impactResult.impactedSpecIds);
        specsToRun = allSpecs.filter((spec) => impactedSpecIds.has(spec.id));
        log(`🎯 Running ${specsToRun.length} impacted specs (out of ${allSpecs.length} total)`);
    }
    else if (options.specIds && options.specIds.length > 0) {
        // Filter to specific spec IDs
        const specIdSet = new Set(options.specIds);
        specsToRun = allSpecs.filter((spec) => specIdSet.has(spec.id));
        log(`🎯 Running ${specsToRun.length} specific specs`);
    }
    else if (options.specIds && options.specIds.length === 0) {
        // Explicit empty list means run nothing
        specsToRun = [];
        log(`🎯 Running 0 specs (explicit empty list)`);
    }
    else {
        log(`🎯 Running all ${specsToRun.length} specs`);
    }
    // Execute specs
    const results = await executeSpecs(specsToRun);
    const completedAt = Date.now();
    const duration = completedAt - startTime;
    // Calculate summary
    const summary = calculateSummary(results);
    const runResult = {
        schemaVersion: 1,
        runId: generateRunId(),
        metadata: {
            startedAt: startTime,
            completedAt,
            duration,
            totalSpecs: allSpecs.length,
            executedSpecs: specsToRun.length,
            mode: manifest.runtime.mode,
        },
        results,
        summary,
    };
    // Write results if requested
    if (options.writeResults) {
        await writeRunResults(runResult, projectRoot);
        await updateRunIndex(runResult, projectRoot);
    }
    return runResult;
}
|
|
127
|
+
/**
 * Load `.evalai/manifest.json` from the given project.
 *
 * @param projectRoot - Directory containing `.evalai/`; defaults to the current working directory.
 * @returns The parsed manifest, or `null` when the manifest file does not exist.
 * @throws Error when the file exists but cannot be read or is not valid JSON,
 *   so that a corrupt manifest is not misreported as a missing one.
 */
async function loadManifest(projectRoot = process.cwd()) {
    const manifestPath = path.join(projectRoot, ".evalai", "manifest.json");
    let content;
    try {
        content = await fs.readFile(manifestPath, "utf-8");
    }
    catch (error) {
        const code = error && typeof error === "object" && "code" in error ? error.code : undefined;
        // Only "file is not there" counts as "no manifest"; anything else is a real failure.
        if (code === "ENOENT" || code === "ENOTDIR") {
            return null;
        }
        throw error;
    }
    try {
        return JSON.parse(content);
    }
    catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        throw new Error(`Invalid evaluation manifest at ${manifestPath}: ${reason}`);
    }
}
|
|
140
|
+
/**
 * Run the given specs one after another, in order.
 *
 * @param specs - Specs to execute.
 * @returns One result per spec, in the same order as the input.
 */
async function executeSpecs(specs) {
    const collected = [];
    // Strictly sequential: each spec finishes before the next one starts.
    for (const current of specs) {
        collected.push(await executeSpec(current));
    }
    return collected;
}
|
|
151
|
+
/**
|
|
152
|
+
* Execute individual specification
|
|
153
|
+
*/
|
|
154
|
+
async function executeSpec(spec) {
|
|
155
|
+
const startTime = Date.now();
|
|
156
|
+
try {
|
|
157
|
+
// For now, simulate execution
|
|
158
|
+
// In a real implementation, this would:
|
|
159
|
+
// 1. Load the spec file
|
|
160
|
+
// 2. Execute the defineEval function
|
|
161
|
+
// 3. Capture the result
|
|
162
|
+
// Simulate some work
|
|
163
|
+
await new Promise((resolve) => setTimeout(resolve, Math.random() * 100 + 50));
|
|
164
|
+
// Simulate success/failure (90% success rate for demo)
|
|
165
|
+
const success = Math.random() > 0.1;
|
|
166
|
+
const duration = Date.now() - startTime;
|
|
167
|
+
if (success) {
|
|
168
|
+
return {
|
|
169
|
+
specId: spec.id,
|
|
170
|
+
name: spec.name,
|
|
171
|
+
filePath: spec.filePath,
|
|
172
|
+
result: {
|
|
173
|
+
status: "passed",
|
|
174
|
+
score: Math.random() * 0.3 + 0.7, // 0.7-1.0
|
|
175
|
+
duration,
|
|
176
|
+
},
|
|
177
|
+
};
|
|
178
|
+
}
|
|
179
|
+
else {
|
|
180
|
+
return {
|
|
181
|
+
specId: spec.id,
|
|
182
|
+
name: spec.name,
|
|
183
|
+
filePath: spec.filePath,
|
|
184
|
+
result: {
|
|
185
|
+
status: "failed",
|
|
186
|
+
error: "Simulated execution failure",
|
|
187
|
+
duration,
|
|
188
|
+
},
|
|
189
|
+
};
|
|
190
|
+
}
|
|
191
|
+
}
|
|
192
|
+
catch (error) {
|
|
193
|
+
return {
|
|
194
|
+
specId: spec.id,
|
|
195
|
+
name: spec.name,
|
|
196
|
+
filePath: spec.filePath,
|
|
197
|
+
result: {
|
|
198
|
+
status: "failed",
|
|
199
|
+
error: error instanceof Error ? error.message : String(error),
|
|
200
|
+
duration: Date.now() - startTime,
|
|
201
|
+
},
|
|
202
|
+
};
|
|
203
|
+
}
|
|
204
|
+
}
|
|
205
|
+
/**
|
|
206
|
+
* Calculate summary statistics
|
|
207
|
+
*/
|
|
208
|
+
function calculateSummary(results) {
|
|
209
|
+
const passed = results.filter((r) => r.result.status === "passed").length;
|
|
210
|
+
const failed = results.filter((r) => r.result.status === "failed").length;
|
|
211
|
+
const skipped = results.filter((r) => r.result.status === "skipped").length;
|
|
212
|
+
const passRate = results.length > 0 ? passed / results.length : 0;
|
|
213
|
+
return {
|
|
214
|
+
passed,
|
|
215
|
+
failed,
|
|
216
|
+
skipped,
|
|
217
|
+
passRate,
|
|
218
|
+
};
|
|
219
|
+
}
|
|
220
|
+
/**
|
|
221
|
+
* Write run results to file
|
|
222
|
+
*/
|
|
223
|
+
async function writeRunResults(result, projectRoot = process.cwd()) {
|
|
224
|
+
const evalaiDir = path.join(projectRoot, ".evalai");
|
|
225
|
+
await fs.mkdir(evalaiDir, { recursive: true });
|
|
226
|
+
// Write last-run.json (existing behavior)
|
|
227
|
+
const lastRunPath = path.join(evalaiDir, "last-run.json");
|
|
228
|
+
await fs.writeFile(lastRunPath, JSON.stringify(result, null, 2), "utf-8");
|
|
229
|
+
// Create runs directory and write timestamped artifact
|
|
230
|
+
if (result.runId) {
|
|
231
|
+
const runsDir = path.join(evalaiDir, "runs");
|
|
232
|
+
await fs.mkdir(runsDir, { recursive: true });
|
|
233
|
+
const timestampedPath = path.join(runsDir, `${result.runId}.json`);
|
|
234
|
+
await fs.writeFile(timestampedPath, JSON.stringify(result, null, 2), "utf-8");
|
|
235
|
+
// Optional: Create latest.json mirror
|
|
236
|
+
const latestPath = path.join(runsDir, "latest.json");
|
|
237
|
+
await fs.writeFile(latestPath, JSON.stringify(result, null, 2), "utf-8");
|
|
238
|
+
}
|
|
239
|
+
console.log(`ā
Run results written to .evalai/last-run.json`);
|
|
240
|
+
if (result.runId) {
|
|
241
|
+
console.log(`š Run artifact: .evalai/runs/${result.runId}.json`);
|
|
242
|
+
}
|
|
243
|
+
}
|
|
244
|
+
/**
 * Append an entry for this run to `<projectRoot>/.evalai/runs/index.json`.
 *
 * The entry records the run ID, start time, git SHA and branch (when they can
 * be determined), mode, number of results, pass rate and the mean of all
 * defined scores (0 when no result has a score). The index is kept sorted
 * newest-first and is replaced via a temp file followed by a rename.
 *
 * An index file that is missing, unreadable, not valid JSON, or not a JSON
 * array is treated as empty.
 *
 * @param result - Run result to index.
 * @param projectRoot - Project directory; defaults to the current working directory.
 */
async function updateRunIndex(result, projectRoot = process.cwd()) {
    const runsDir = path.join(projectRoot, ".evalai", "runs");
    const indexPath = path.join(runsDir, "index.json");
    await fs.mkdir(runsDir, { recursive: true });
    // Calculate average score over the results that actually carry one
    const scores = result.results
        .map((r) => r.result.score)
        .filter((score) => score !== undefined);
    const avgScore = scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : 0;
    // Get git info if available; the two lookups are independent, so run them together.
    let gitSha;
    let branch;
    try {
        [gitSha, branch] = await Promise.all([getGitSha(), getGitBranch()]);
    }
    catch {
        // Git commands not available, continue without git info
    }
    const indexEntry = {
        runId: result.runId,
        createdAt: result.metadata.startedAt,
        gitSha,
        branch,
        mode: result.metadata.mode,
        specCount: result.results.length,
        passRate: result.summary.passRate,
        avgScore,
    };
    // Read existing index or start a new one
    let index = [];
    try {
        const parsed = JSON.parse(await fs.readFile(indexPath, "utf-8"));
        // Guard against a hand-edited or corrupted index that is not an array.
        if (Array.isArray(parsed)) {
            index = parsed;
        }
    }
    catch (_error) {
        // Index doesn't exist yet (or is unreadable), start with empty array
    }
    // Add new entry and keep newest first
    index.push(indexEntry);
    index.sort((a, b) => b.createdAt - a.createdAt);
    // Write to temp file first, then rename for atomicity
    const tempPath = `${indexPath}.tmp`;
    await fs.writeFile(tempPath, JSON.stringify(index, null, 2), "utf-8");
    await fs.rename(tempPath, indexPath);
}
|
|
294
|
+
/**
 * Get the current git commit SHA by running `git rev-parse HEAD`.
 *
 * @returns The trimmed command output, or `undefined` when git cannot be
 *   started, exits with a non-zero code, or prints nothing.
 */
async function getGitSha() {
    return new Promise((resolve) => {
        const git = (0, node_child_process_1.spawn)("git", ["rev-parse", "HEAD"], {
            // Only stdout is consumed; stdin is never written and stderr is never read.
            stdio: ["ignore", "pipe", "ignore"],
        });
        let output = "";
        git.stdout.on("data", (data) => {
            output += data.toString();
        });
        // Without this listener a missing git binary raises an unhandled
        // 'error' event, which terminates the whole process.
        git.on("error", () => {
            resolve(undefined);
        });
        git.on("close", (code) => {
            const sha = output.trim();
            resolve(code === 0 && sha ? sha : undefined);
        });
    });
}
|
|
316
|
+
/**
 * Get the current git branch name by running `git rev-parse --abbrev-ref HEAD`.
 *
 * @returns The trimmed command output, or `undefined` when git cannot be
 *   started, exits with a non-zero code, or prints nothing.
 */
async function getGitBranch() {
    return new Promise((resolve) => {
        const git = (0, node_child_process_1.spawn)("git", ["rev-parse", "--abbrev-ref", "HEAD"], {
            // Only stdout is consumed; stdin is never written and stderr is never read.
            stdio: ["ignore", "pipe", "ignore"],
        });
        let output = "";
        git.stdout.on("data", (data) => {
            output += data.toString();
        });
        // Without this listener a missing git binary raises an unhandled
        // 'error' event, which terminates the whole process.
        git.on("error", () => {
            resolve(undefined);
        });
        git.on("close", (code) => {
            const branchName = output.trim();
            resolve(code === 0 && branchName ? branchName : undefined);
        });
    });
}
|
|
338
|
+
/**
|
|
339
|
+
* Print human-readable results
|
|
340
|
+
*/
|
|
341
|
+
function printHumanResults(result) {
|
|
342
|
+
console.log("\nš Evaluation Run Results");
|
|
343
|
+
console.log(`ā±ļø Duration: ${result.metadata.duration}ms`);
|
|
344
|
+
console.log(`š Specs: ${result.metadata.executedSpecs}/${result.metadata.totalSpecs} executed`);
|
|
345
|
+
console.log(`šÆ Mode: ${result.metadata.mode}`);
|
|
346
|
+
console.log("\nš Summary:");
|
|
347
|
+
console.log(` ā
Passed: ${result.summary.passed}`);
|
|
348
|
+
console.log(` ā Failed: ${result.summary.failed}`);
|
|
349
|
+
console.log(` āļø Skipped: ${result.summary.skipped}`);
|
|
350
|
+
console.log(` š Pass Rate: ${(result.summary.passRate * 100).toFixed(1)}%`);
|
|
351
|
+
console.log("\nš Individual Results:");
|
|
352
|
+
for (const spec of result.results) {
|
|
353
|
+
const status = spec.result.status === "passed"
|
|
354
|
+
? "ā
"
|
|
355
|
+
: spec.result.status === "failed"
|
|
356
|
+
? "ā"
|
|
357
|
+
: "āļø";
|
|
358
|
+
const score = spec.result.score
|
|
359
|
+
? ` (${(spec.result.score * 100).toFixed(1)}%)`
|
|
360
|
+
: "";
|
|
361
|
+
const error = spec.result.error ? ` - ${spec.result.error}` : "";
|
|
362
|
+
console.log(` ${status} ${spec.name}${score}${error}`);
|
|
363
|
+
}
|
|
364
|
+
}
|
|
365
|
+
/**
|
|
366
|
+
* Print JSON results
|
|
367
|
+
*/
|
|
368
|
+
function printJsonResults(result) {
|
|
369
|
+
console.log(JSON.stringify(result, null, 2));
|
|
370
|
+
}
|
|
371
|
+
/**
 * CLI entry point: run the evaluations, print the result, and exit.
 *
 * Exit codes: 0 when no spec failed, 1 when at least one spec failed,
 * 2 when the run itself threw.
 *
 * NOTE(review): process.exit() is called right after printing; when stdout is
 * a pipe, pending output may be cut off. Consider process.exitCode - confirm
 * with the callers first.
 *
 * @param options - Spec selection and output options; `format === "json"`
 *   selects JSON output, anything else prints human-readable text.
 */
async function runEvaluationsCLI(options) {
    try {
        const result = await runEvaluations(options);
        const render = options.format === "json" ? printJsonResults : printHumanResults;
        render(result);
        // Exit with appropriate code
        process.exit(result.summary.failed > 0 ? 1 : 0);
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        console.error("ā Run failed:", message);
        process.exit(2);
    }
}
|
package/dist/cli/share.js
CHANGED
|
@@ -50,7 +50,9 @@ function parseShareArgs(argv) {
|
|
|
50
50
|
if (!evaluationId)
|
|
51
51
|
return { error: "Error: --evaluationId is required" };
|
|
52
52
|
if (Number.isNaN(runId) || runId < 1)
|
|
53
|
-
return {
|
|
53
|
+
return {
|
|
54
|
+
error: "Error: --runId is required and must be a positive number",
|
|
55
|
+
};
|
|
54
56
|
const expiresInDays = parseExpires(expires);
|
|
55
57
|
if (expiresInDays == null || expiresInDays <= 0)
|
|
56
58
|
return { error: "Error: --expires must be e.g. 7d, 24h, 60m, 1s" };
|
package/dist/cli/upgrade.js
CHANGED
|
@@ -275,7 +275,8 @@ function addNpmScripts(cwd) {
|
|
|
275
275
|
changed = true;
|
|
276
276
|
}
|
|
277
277
|
if (!scripts["eval:baseline-update"]) {
|
|
278
|
-
scripts["eval:baseline-update"] =
|
|
278
|
+
scripts["eval:baseline-update"] =
|
|
279
|
+
"npx tsx scripts/regression-gate.ts --update-baseline";
|
|
279
280
|
changed = true;
|
|
280
281
|
}
|
|
281
282
|
if (changed) {
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CORE-402: Centralized .evalai workspace resolution
|
|
3
|
+
*
|
|
4
|
+
* Provides unified workspace path resolution for all EvalAI CLI commands
|
|
5
|
+
*/
|
|
6
|
+
/**
 * Set of paths that make up an EvalAI workspace.
 */
export interface EvalWorkspace {
    /** Root directory of the project. */
    root: string;
    /** The `.evalai` directory. */
    evalaiDir: string;
    /** Directory holding run artifacts. */
    runsDir: string;
    /** Path to `manifest.json`. */
    manifestPath: string;
    /** Path to `last-run.json`. */
    lastRunPath: string;
    /** Path to `runs/index.json`. */
    indexPath: string;
    /** Path to `baseline-run.json`. */
    baselinePath: string;
}
|
|
25
|
+
/**
 * Resolve the EvalAI workspace paths for a project.
 *
 * @param projectRoot - Optional project root directory.
 * @returns The resolved workspace paths.
 */
export declare function resolveEvalWorkspace(projectRoot?: string): EvalWorkspace;
|