@pauly4010/evalai-sdk 1.8.0 → 1.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +54 -0
- package/README.md +136 -23
- package/dist/assertions.js +51 -18
- package/dist/batch.js +8 -2
- package/dist/cli/api.js +3 -1
- package/dist/cli/check.js +19 -6
- package/dist/cli/ci-context.js +3 -1
- package/dist/cli/ci.d.ts +45 -0
- package/dist/cli/ci.js +192 -0
- package/dist/cli/config.js +28 -8
- package/dist/cli/diff.d.ts +173 -0
- package/dist/cli/diff.js +685 -0
- package/dist/cli/discover.d.ts +84 -0
- package/dist/cli/discover.js +419 -0
- package/dist/cli/doctor.js +62 -19
- package/dist/cli/env.d.ts +21 -0
- package/dist/cli/env.js +42 -0
- package/dist/cli/explain.js +168 -36
- package/dist/cli/formatters/human.js +4 -1
- package/dist/cli/formatters/pr-comment.js +3 -1
- package/dist/cli/gate.js +6 -2
- package/dist/cli/impact-analysis.d.ts +63 -0
- package/dist/cli/impact-analysis.js +252 -0
- package/dist/cli/index.js +185 -0
- package/dist/cli/manifest.d.ts +103 -0
- package/dist/cli/manifest.js +282 -0
- package/dist/cli/migrate.d.ts +41 -0
- package/dist/cli/migrate.js +349 -0
- package/dist/cli/policy-packs.js +8 -2
- package/dist/cli/print-config.js +33 -14
- package/dist/cli/regression-gate.js +8 -2
- package/dist/cli/report/build-check-report.js +8 -2
- package/dist/cli/run.d.ts +101 -0
- package/dist/cli/run.js +395 -0
- package/dist/cli/share.js +3 -1
- package/dist/cli/upgrade.js +2 -1
- package/dist/cli/workspace.d.ts +28 -0
- package/dist/cli/workspace.js +58 -0
- package/dist/client.d.ts +16 -19
- package/dist/client.js +60 -43
- package/dist/client.request.test.d.ts +1 -1
- package/dist/client.request.test.js +222 -147
- package/dist/context.js +3 -1
- package/dist/errors.js +11 -4
- package/dist/export.js +3 -1
- package/dist/index.d.ts +8 -2
- package/dist/index.js +30 -5
- package/dist/integrations/anthropic.d.ts +20 -1
- package/dist/integrations/openai-eval.js +4 -2
- package/dist/integrations/openai.d.ts +24 -1
- package/dist/local.js +3 -1
- package/dist/logger.js +6 -2
- package/dist/pagination.js +6 -2
- package/dist/runtime/adapters/config-to-dsl.d.ts +33 -0
- package/dist/runtime/adapters/config-to-dsl.js +394 -0
- package/dist/runtime/adapters/testsuite-to-dsl.d.ts +63 -0
- package/dist/runtime/adapters/testsuite-to-dsl.js +276 -0
- package/dist/runtime/context.d.ts +26 -0
- package/dist/runtime/context.js +74 -0
- package/dist/runtime/eval.d.ts +46 -0
- package/dist/runtime/eval.js +244 -0
- package/dist/runtime/execution-mode.d.ts +80 -0
- package/dist/runtime/execution-mode.js +357 -0
- package/dist/runtime/executor.d.ts +16 -0
- package/dist/runtime/executor.js +152 -0
- package/dist/runtime/registry.d.ts +78 -0
- package/dist/runtime/registry.js +403 -0
- package/dist/runtime/run-report.d.ts +200 -0
- package/dist/runtime/run-report.js +222 -0
- package/dist/runtime/types.d.ts +356 -0
- package/dist/runtime/types.js +76 -0
- package/dist/testing.d.ts +65 -0
- package/dist/testing.js +49 -2
- package/dist/types.d.ts +100 -69
- package/dist/utils/input-hash.js +4 -1
- package/dist/version.d.ts +1 -1
- package/dist/version.js +1 -1
- package/dist/workflows.js +62 -14
- package/package.json +115 -110
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* TICKET 1 — evalai discover
|
|
3
|
+
*
|
|
4
|
+
* Your first "holy shit" moment feature
|
|
5
|
+
*
|
|
6
|
+
* Goal:
|
|
7
|
+
* npm install
|
|
8
|
+
* evalai discover
|
|
9
|
+
*
|
|
10
|
+
* Output:
|
|
11
|
+
* Found 42 behavioral specifications
|
|
12
|
+
* Safety: 12
|
|
13
|
+
* Accuracy: 18
|
|
14
|
+
* Agents: 7
|
|
15
|
+
* Tools: 5
|
|
16
|
+
*
|
|
17
|
+
* Why this matters:
|
|
18
|
+
* - makes EvalAI feel alive
|
|
19
|
+
* - proves DSL works
|
|
20
|
+
* - enables intelligence layer
|
|
21
|
+
*
|
|
22
|
+
* This becomes your entry point command.
|
|
23
|
+
*/
|
|
24
|
+
/**
 * Aggregate result of a discovery run: what was found, how the project
 * executes evaluations, and basic facts about the project itself.
 */
export interface DiscoveryStats {
    /**
     * Number of spec files that were analyzed successfully. Discovery
     * produces one analysis per file, so a file containing several
     * `defineEval()` calls still counts once.
     */
    totalSpecs: number;
    /**
     * Number of specs per tag. When specs were analyzed but none of them
     * carries a tag, this holds a single `general` entry; it is empty when
     * no spec files exist at all.
     */
    categories: Record<string, number>;
    /** Number of analyses per spec file, keyed by the file path relative to the working directory. */
    files: Record<string, number>;
    /** Execution mode details, copied from the execution-mode detection result. */
    executionMode: {
        /** Mode label; shown upper-cased in the printed report. */
        mode: string;
        /** Whether a spec runtime was detected (as reported by execution-mode detection). */
        hasSpecRuntime: boolean;
        /** Whether a legacy runtime was detected (as reported by execution-mode detection). */
        hasLegacyRuntime: boolean;
        /** Spec files that were found; an empty list means nothing was discovered. */
        specFiles: string[];
        /** Path of the legacy config, when one was detected; only its basename is printed. */
        legacyConfig?: string;
    };
    /** Project metadata */
    project: {
        /** Project root; discovery uses the current working directory. */
        root: string;
        /** The `name` field of package.json, or `"unknown"` when it is unavailable. */
        name: string;
        /** Whether a usable package.json was found at the project root. */
        hasPackageJson: boolean;
        /** Whether a `.git` entry is accessible at the project root. */
        hasGit: boolean;
    };
}
|
|
50
|
+
/**
 * Result of statically analyzing one specification file. All flags are
 * text heuristics over the file content; the file is never executed.
 */
export interface SpecAnalysis {
    /** Short identifier derived from the file path relative to the working directory. */
    id: string;
    /**
     * Name of the first `defineEval(...)` call found in the file
     * (`"unnamed"` if the call has no recognizable name), or the file's
     * base name when the file contains no such call.
     */
    name: string;
    /** File path, relative to the working directory. */
    file: string;
    /**
     * Tags taken from the first `tags: [...]` literal in the file, plus
     * categories inferred from keywords in the `description`. Deduplicated.
     */
    tags: string[];
    /** True when the content mentions `assert`, `expect` or `should`. */
    hasAssertions: boolean;
    /** True when the content mentions `model:`, `model=`, `openai` or `anthropic`. */
    usesModels: boolean;
    /** True when the content mentions `tool:`, `function.` or `call(`. */
    usesTools: boolean;
    /** Rough complexity bucket computed from line count and the constructs found in the content. */
    complexity: "simple" | "medium" | "complex";
}
|
|
71
|
+
/**
 * Discover and analyze behavioral specifications in the current project.
 *
 * The project root is the current working directory. Results are printed
 * to the console as a side effect. When no spec files are found, a hint is
 * printed and zeroed statistics are returned (no manifest is generated).
 *
 * @param options.manifest When true, also generate and write the evaluation
 *   manifest (reported as `.evalai/manifest.json` plus its lock file).
 * @returns The computed discovery statistics.
 * @throws Re-throws any error raised during discovery after logging it.
 */
export declare function discoverSpecs(options?: {
    manifest?: boolean;
}): Promise<DiscoveryStats>;
|
|
77
|
+
/**
 * Print a human-readable discovery report to the console: spec count, the
 * ten most frequent categories, execution mode, project info and
 * recommendations.
 *
 * @param stats Statistics as returned by `discoverSpecs()`.
 */
export declare function printDiscoveryResults(stats: DiscoveryStats): void;
|
|
81
|
+
/**
 * CLI entry point for the discovery command.
 *
 * Runs discovery and then terminates the process: exit code 0 on success,
 * exit code 1 (after logging the error) on failure.
 */
export declare function runDiscover(): Promise<void>;
|
|
@@ -0,0 +1,419 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* TICKET 1 — evalai discover
|
|
4
|
+
*
|
|
5
|
+
* Your first "holy shit" moment feature
|
|
6
|
+
*
|
|
7
|
+
* Goal:
|
|
8
|
+
* npm install
|
|
9
|
+
* evalai discover
|
|
10
|
+
*
|
|
11
|
+
* Output:
|
|
12
|
+
* Found 42 behavioral specifications
|
|
13
|
+
* Safety: 12
|
|
14
|
+
* Accuracy: 18
|
|
15
|
+
* Agents: 7
|
|
16
|
+
* Tools: 5
|
|
17
|
+
*
|
|
18
|
+
* Why this matters:
|
|
19
|
+
* - makes EvalAI feel alive
|
|
20
|
+
* - proves DSL works
|
|
21
|
+
* - enables intelligence layer
|
|
22
|
+
*
|
|
23
|
+
* This becomes your entry point command.
|
|
24
|
+
*/
|
|
25
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
26
|
+
if (k2 === undefined) k2 = k;
|
|
27
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
28
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
29
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
30
|
+
}
|
|
31
|
+
Object.defineProperty(o, k2, desc);
|
|
32
|
+
}) : (function(o, m, k, k2) {
|
|
33
|
+
if (k2 === undefined) k2 = k;
|
|
34
|
+
o[k2] = m[k];
|
|
35
|
+
}));
|
|
36
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
37
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
38
|
+
}) : function(o, v) {
|
|
39
|
+
o["default"] = v;
|
|
40
|
+
});
|
|
41
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
42
|
+
var ownKeys = function(o) {
|
|
43
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
44
|
+
var ar = [];
|
|
45
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
46
|
+
return ar;
|
|
47
|
+
};
|
|
48
|
+
return ownKeys(o);
|
|
49
|
+
};
|
|
50
|
+
return function (mod) {
|
|
51
|
+
if (mod && mod.__esModule) return mod;
|
|
52
|
+
var result = {};
|
|
53
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
54
|
+
__setModuleDefault(result, mod);
|
|
55
|
+
return result;
|
|
56
|
+
};
|
|
57
|
+
})();
|
|
58
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
59
|
+
exports.discoverSpecs = discoverSpecs;
|
|
60
|
+
exports.printDiscoveryResults = printDiscoveryResults;
|
|
61
|
+
exports.runDiscover = runDiscover;
|
|
62
|
+
const node_crypto_1 = require("node:crypto");
const fs = __importStar(require("node:fs/promises"));
const path = __importStar(require("node:path"));
const execution_mode_1 = require("../runtime/execution-mode");
const manifest_1 = require("./manifest");
|
|
66
|
+
/**
 * Discover and analyze behavioral specifications in the current project.
 *
 * Uses the current working directory as the project root. With no spec
 * files, prints a hint and returns zeroed statistics. Otherwise analyzes
 * every spec file, optionally writes the manifest (`options.manifest`),
 * prints the report and returns the statistics. Errors are logged and
 * re-thrown.
 */
async function discoverSpecs(options = {}) {
    try {
        const root = process.cwd();
        const detected = await (0, execution_mode_1.getExecutionMode)(root);
        const project = await getProjectMetadata(root);
        const nothingFound = detected.specFiles.length === 0;
        if (nothingFound) {
            console.log("\n✨ No behavioral specifications found.");
            console.log("💡 Create files with defineEval() calls to get started.");
            const { mode, hasSpecRuntime, hasLegacyRuntime, specFiles, legacyConfig } = detected;
            return {
                totalSpecs: 0,
                categories: {},
                files: {},
                executionMode: { mode, hasSpecRuntime, hasLegacyRuntime, specFiles, legacyConfig },
                project,
            };
        }
        const analyses = await analyzeSpecifications(detected.specFiles);
        // The manifest is optional and is written before the report is printed.
        if (options.manifest) {
            console.log("🔧 Generating evaluation manifest...");
            const generated = await (0, manifest_1.generateManifest)(analyses, root, project.name, detected);
            await (0, manifest_1.writeManifest)(generated, root);
            console.log(`✅ Manifest written to .evalai/manifest.json`);
            console.log(`✅ Lock file written to .evalai/manifest.lock.json`);
        }
        const summary = calculateStats(analyses, detected, project);
        printDiscoveryResults(summary);
        return summary;
    }
    catch (failure) {
        const reason = failure instanceof Error ? failure.message : String(failure);
        console.error("❌ Discovery failed:", reason);
        throw failure;
    }
}
|
|
112
|
+
/**
 * Collect basic facts about the project rooted at `projectRoot`.
 *
 * This is best effort and never throws for a missing or broken
 * package.json:
 *  - `hasPackageJson` is true as soon as the file can be read, even if its
 *    content is not valid JSON;
 *  - `name` is the package's `name` when it is a non-empty string,
 *    otherwise `"unknown"`;
 *  - `hasGit` is true when a `.git` entry is accessible.
 *
 * @param projectRoot Directory to inspect.
 * @returns `{ root, name, hasPackageJson, hasGit }`.
 */
async function getProjectMetadata(projectRoot) {
    const packageJsonPath = path.join(projectRoot, "package.json");
    const gitPath = path.join(projectRoot, ".git");
    let hasPackageJson = false;
    let projectName = "unknown";
    try {
        const raw = await fs.readFile(packageJsonPath, "utf-8");
        // The file exists and is readable; a parse failure below must not
        // make the report claim that package.json is missing.
        hasPackageJson = true;
        const parsed = JSON.parse(raw);
        if (parsed && typeof parsed.name === "string" && parsed.name !== "") {
            projectName = parsed.name;
        }
    }
    catch (_error) {
        // Missing, unreadable or malformed package.json: keep the defaults.
    }
    const hasGit = await fs
        .access(gitPath)
        .then(() => true)
        .catch(() => false);
    return {
        root: projectRoot,
        name: projectName,
        hasPackageJson,
        hasGit,
    };
}
|
|
140
|
+
/**
 * Read and analyze each spec file in turn.
 *
 * Files are processed sequentially. A file that cannot be read or analyzed
 * is reported with a warning and left out of the result; it does not abort
 * the run.
 */
async function analyzeSpecifications(specFiles) {
    // Yields a one-element list on success and an empty list on failure.
    const analyzeOne = async (specPath) => {
        try {
            const source = await fs.readFile(specPath, "utf-8");
            return [analyzeSpecFile(specPath, source)];
        }
        catch (problem) {
            console.warn(`Warning: Could not analyze ${specPath}: ${problem instanceof Error ? problem.message : String(problem)}`);
            return [];
        }
    };
    const collected = [];
    for (const specPath of specFiles) {
        collected.push(...(await analyzeOne(specPath)));
    }
    return collected;
}
|
|
157
|
+
/**
 * Analyze a single specification file from its text content.
 *
 * The file is not executed; every field is a text heuristic:
 *  - `name` is the first quoted argument of the first `defineEval(...)`
 *    call (`"unnamed"` if that call has none), or the file's base name
 *    without its extension when the file has no `defineEval` call;
 *  - `usesModels`, `usesTools` and `hasAssertions` are substring checks.
 *
 * @param filePath Path of the spec file (used for id, name fallback and `file`).
 * @param content  Text content of the file.
 * @returns One analysis record for the file.
 */
function analyzeSpecFile(filePath, content) {
    const mentionsAny = (needles) => needles.some((needle) => content.includes(needle));
    // Only the first defineEval call contributes the name.
    const firstCall = content.match(/defineEval\s*\([^)]+\)/);
    let specName;
    if (firstCall) {
        // The closing quote must be the same character as the opening one.
        const nameMatch = firstCall[0].match(/(["'`])(.+?)\1(?:\s*,|\s*\))/);
        specName = nameMatch ? nameMatch[2] : "unnamed";
    }
    const tags = extractTags(content);
    const complexity = analyzeComplexity(content);
    const usesModels = mentionsAny(["model:", "model=", "openai", "anthropic"]);
    const usesTools = mentionsAny(["tool:", "function.", "call("]);
    const hasAssertions = mentionsAny(["assert", "expect", "should"]);
    const id = generateSpecId(filePath);
    return {
        id,
        // Strip whatever extension the file has (.ts, .js, .mjs, ...), not only ".ts".
        name: specName || path.basename(filePath, path.extname(filePath)),
        file: path.relative(process.cwd(), filePath),
        tags,
        hasAssertions,
        usesModels,
        usesTools,
        complexity,
    };
}
|
|
196
|
+
/**
 * Categories inferred from a spec's description: the tag is added when the
 * lower-cased description contains any of its keywords. Order matters, as
 * it determines the order of inferred tags in the result.
 */
const DESCRIPTION_TAG_KEYWORDS = [
    ["safety", ["safety", "security"]],
    ["accuracy", ["accuracy", "precision"]],
    ["agents", ["agent", "autonomous"]],
    ["tools", ["tool", "function"]],
    ["performance", ["latency", "speed"]],
    ["factual", ["hallucination", "fact"]],
    ["bias", ["bias", "fairness"]],
    ["privacy", ["privacy", "pii"]],
];
/**
 * Extract tags from specification content.
 *
 * Explicit tags come from the first `tags: [...]` literal; further tags are
 * inferred from keywords in the first `description: "..."` literal. String
 * literals must close with the same quote character they open with, so
 * values containing another quote character (e.g. an apostrophe) are read
 * in full.
 *
 * @param content Text content of a spec file.
 * @returns Deduplicated tags: explicit ones first, then inferred ones.
 */
function extractTags(content) {
    const tags = [];
    const tagsMatch = content.match(/tags\s*:\s*\[([^\]]+)\]/);
    if (tagsMatch) {
        for (const literal of tagsMatch[1].matchAll(/(["'`])(.+?)\1/g)) {
            tags.push(literal[2]);
        }
    }
    const descriptionMatch = content.match(/description\s*:\s*(["'`])(.+?)\1/);
    if (descriptionMatch) {
        const description = descriptionMatch[2].toLowerCase();
        for (const [tag, keywords] of DESCRIPTION_TAG_KEYWORDS) {
            if (keywords.some((keyword) => description.includes(keyword))) {
                tags.push(tag);
            }
        }
    }
    return [...new Set(tags)]; // Remove duplicates
}
|
|
232
|
+
// Language keywords are matched as whole words so that identifiers or prose
// containing them ("performance", "verify", "entry") do not count.
const ASYNC_KEYWORDS = /\b(?:async|await)\b/;
const LOOP_KEYWORDS = /\b(?:for|while)\b/;
const CONDITIONAL_KEYWORDS = /\b(?:if|switch)\b/;
const TRY_CATCH_KEYWORDS = /\b(?:try|catch)\b/;
/**
 * Estimate how complex a specification is from its text.
 *
 * Scoring: more than 50 lines +2, more than 100 lines a further +3,
 * async/await +2, loops +1, conditionals +1, try/catch +1, external calls
 * (mentions of `fetch`, `http` or `api`) +2. A score up to 2 is
 * `"simple"`, up to 5 is `"medium"`, anything above is `"complex"`.
 *
 * @param content Text content of a spec file.
 * @returns `"simple"`, `"medium"` or `"complex"`.
 */
function analyzeComplexity(content) {
    const lines = content.split("\n").length;
    // External calls stay substring checks: they target names and URLs
    // (e.g. "apiKey", "https://"), not standalone keywords.
    const hasExternalCalls = ["fetch", "http", "api"].some((needle) => content.includes(needle));
    let complexityScore = 0;
    if (lines > 50)
        complexityScore += 2;
    if (lines > 100)
        complexityScore += 3;
    if (ASYNC_KEYWORDS.test(content))
        complexityScore += 2;
    if (LOOP_KEYWORDS.test(content))
        complexityScore += 1;
    if (CONDITIONAL_KEYWORDS.test(content))
        complexityScore += 1;
    if (TRY_CATCH_KEYWORDS.test(content))
        complexityScore += 1;
    if (hasExternalCalls)
        complexityScore += 2;
    if (complexityScore <= 2)
        return "simple";
    if (complexityScore <= 5)
        return "medium";
    return "complex";
}
|
|
265
|
+
/**
 * Generate a short, stable specification ID from a file path.
 *
 * The ID is the first 8 hex characters of the SHA-256 digest of the path
 * relative to the working directory. Hashing the whole path (rather than
 * truncating an encoding of it) keeps files that share a directory prefix
 * from receiving the same ID. Separators are normalized to "/" so the ID
 * does not depend on the platform.
 *
 * @param filePath Absolute or working-directory-relative path of the spec file.
 * @returns An 8-character lower-case hexadecimal ID.
 */
function generateSpecId(filePath) {
    const relativePath = path
        .relative(process.cwd(), filePath)
        .split(path.sep)
        .join("/");
    return (0, node_crypto_1.createHash)("sha256")
        .update(relativePath)
        .digest("hex")
        .slice(0, 8);
}
|
|
276
|
+
/**
 * Calculate discovery statistics from the analyzed specs.
 *
 * Counts specs per tag and per file. Counting is done in Maps so that tags
 * or file names that collide with `Object.prototype` members (for example
 * `constructor` or `__proto__`) are counted like any other key. When no
 * spec carries a tag, a single `general` category holds the total.
 *
 * @param specs         Analysis records (each with `tags` and `file`).
 * @param executionMode Execution-mode detection result; its fields are copied.
 * @param project       Project metadata, passed through unchanged.
 * @returns The statistics object consumed by the report printer.
 */
function calculateStats(specs, executionMode, project) {
    const categoryCounts = new Map();
    const fileCounts = new Map();
    const bump = (counts, key) => counts.set(key, (counts.get(key) || 0) + 1);
    for (const spec of specs) {
        for (const tag of spec.tags) {
            bump(categoryCounts, tag);
        }
        bump(fileCounts, spec.file);
    }
    // Add default category if none found
    if (categoryCounts.size === 0) {
        categoryCounts.set("general", specs.length);
    }
    return {
        totalSpecs: specs.length,
        // Object.fromEntries defines own data properties, including for "__proto__".
        categories: Object.fromEntries(categoryCounts),
        files: Object.fromEntries(fileCounts),
        executionMode: {
            mode: executionMode.mode,
            hasSpecRuntime: executionMode.hasSpecRuntime,
            hasLegacyRuntime: executionMode.hasLegacyRuntime,
            specFiles: executionMode.specFiles,
            legacyConfig: executionMode.legacyConfig,
        },
        project,
    };
}
|
|
308
|
+
/**
 * Print the discovery report to the console.
 *
 * Sections, in order: headline with the spec count, up to ten categories
 * sorted by descending count, execution mode, project info, and finally
 * the recommendations.
 */
function printDiscoveryResults(stats) {
    const blankLine = () => console.log(``);
    console.log(`🔍 EvalAI Discovery Results`);
    blankLine();
    console.log(`📊 Found ${stats.totalSpecs} behavioral specifications`);
    blankLine();
    // Categories: most frequent first, at most ten.
    if (Object.keys(stats.categories).length > 0) {
        console.log(`📋 Categories:`);
        const ranked = Object.entries(stats.categories).sort((left, right) => right[1] - left[1]);
        for (const [category, count] of ranked.slice(0, 10)) {
            const icon = getCategoryIcon(category);
            console.log(` ${icon} ${category}: ${count}`);
        }
        blankLine();
    }
    // Execution mode
    const runtime = stats.executionMode;
    console.log(`⚙️ Execution Mode: ${runtime.mode.toUpperCase()}`);
    if (runtime.hasSpecRuntime) {
        console.log(` ✅ Spec runtime: ${runtime.specFiles.length} files`);
    }
    if (runtime.hasLegacyRuntime) {
        console.log(` ✅ Legacy runtime: ${runtime.legacyConfig ? path.basename(runtime.legacyConfig) : "config"}`);
    }
    blankLine();
    // Project info
    const project = stats.project;
    console.log(`📁 Project: ${project.name}`);
    console.log(` 📍 Root: ${project.root}`);
    console.log(` 📦 Package.json: ${project.hasPackageJson ? "✅" : "❌"}`);
    console.log(` 🔄 Git: ${project.hasGit ? "✅" : "❌"}`);
    blankLine();
    printRecommendations(stats);
}
|
|
346
|
+
/** Icon for each known category name (keys are lower-case). */
const CATEGORY_ICONS = new Map([
    ["safety", "🛡️"],
    ["security", "🔒"],
    ["accuracy", "🎯"],
    ["precision", "🎯"],
    ["agents", "🤖"],
    ["autonomous", "🤖"],
    ["tools", "🔧"],
    ["functions", "🔧"],
    ["performance", "⚡"],
    ["latency", "⚡"],
    ["speed", "⚡"],
    ["factual", "📊"],
    ["hallucination", "📊"],
    ["bias", "⚖️"],
    ["fairness", "⚖️"],
    ["privacy", "🔐"],
    ["pii", "🔐"],
    ["general", "📝"],
]);
/**
 * Get the icon for a category.
 *
 * The lookup is case-insensitive. Unknown categories get the generic "📝"
 * icon. A Map is used so that names such as `constructor` or `__proto__`
 * cannot resolve to inherited object members.
 *
 * @param category Category (tag) name.
 * @returns The icon string for the category.
 */
function getCategoryIcon(category) {
    return CATEGORY_ICONS.get(category.toLowerCase()) || "📝";
}
|
|
372
|
+
/**
 * Print next-step recommendations for the discovery report.
 *
 * Prints one piece of advice chosen by the number of specs, followed by
 * hints that depend on which runtimes were detected.
 */
function printRecommendations(stats) {
    // Pick the coverage advice for the given spec count.
    const coverageAdvice = (specCount) => {
        if (specCount === 0) {
            return ` 🚀 No specifications found. Create your first eval with:
echo 'import { defineEval } from "@pauly4010/evalai-sdk";
defineEval("hello-world", async (context) => {
return { pass: true, score: 100 };
});' > eval/hello.spec.ts`;
        }
        if (specCount < 5) {
            return ` 📈 Add more specifications to improve coverage`;
        }
        if (specCount < 20) {
            return ` 🎯 Good start! Consider organizing by categories`;
        }
        return ` 🏆 Excellent coverage! Consider running evalai run`;
    };
    console.log(`💡 Recommendations:`);
    console.log(coverageAdvice(stats.totalSpecs));
    const specReady = stats.executionMode.hasSpecRuntime;
    if (!(specReady || stats.executionMode.hasLegacyRuntime)) {
        console.log(` 🆕 New project? Try 'evalai init' to get started`);
    }
    if (stats.executionMode.hasLegacyRuntime && !specReady) {
        console.log(` 🔄 Legacy project detected. Try 'evalai migrate config' to upgrade`);
    }
    if (specReady) {
        console.log(` 🚀 Ready to run! Use 'evalai run' to execute specifications`);
    }
    console.log(``);
}
|
|
406
|
+
/**
 * Run the discovery command and terminate the process.
 *
 * Exits with code 0 on success and code 1 on failure (after logging the
 * error).
 */
async function runDiscover() {
    try {
        const stats = await discoverSpecs();
        // discoverSpecs() prints the report itself whenever spec files
        // exist. Only the empty-project path returns without printing, so
        // print here just for that case to avoid a duplicated report.
        if (stats.executionMode.specFiles.length === 0) {
            printDiscoveryResults(stats);
        }
        process.exit(0);
    }
    catch (error) {
        console.error(`❌ Discovery failed: ${error instanceof Error ? error.message : String(error)}`);
        process.exit(1);
    }
}
|
package/dist/cli/doctor.js
CHANGED
|
@@ -96,8 +96,10 @@ function parseFlags(argv) {
|
|
|
96
96
|
const baseUrl = raw.baseUrl || process.env.EVALAI_BASE_URL || "http://localhost:3000";
|
|
97
97
|
const apiKey = raw.apiKey || process.env.EVALAI_API_KEY || "";
|
|
98
98
|
let evaluationId = raw.evaluationId || "";
|
|
99
|
-
const baseline = (raw.baseline === "previous"
|
|
100
|
-
|
|
99
|
+
const baseline = (raw.baseline === "previous"
|
|
100
|
+
? "previous"
|
|
101
|
+
: raw.baseline === "production"
|
|
102
|
+
? "production"
|
|
101
103
|
: "published");
|
|
102
104
|
// Try to fill evaluationId from config
|
|
103
105
|
if (!evaluationId) {
|
|
@@ -111,7 +113,15 @@ function parseFlags(argv) {
|
|
|
111
113
|
evaluationId = String(merged.evaluationId);
|
|
112
114
|
}
|
|
113
115
|
const strict = raw.strict === "true" || raw.strict === "1";
|
|
114
|
-
return {
|
|
116
|
+
return {
|
|
117
|
+
report,
|
|
118
|
+
format: report ? "json" : fmt,
|
|
119
|
+
strict,
|
|
120
|
+
baseUrl,
|
|
121
|
+
apiKey,
|
|
122
|
+
evaluationId,
|
|
123
|
+
baseline,
|
|
124
|
+
};
|
|
115
125
|
}
|
|
116
126
|
// ── Individual checks ──
|
|
117
127
|
function checkProject(cwd) {
|
|
@@ -222,7 +232,10 @@ function checkBaseline(cwd) {
|
|
|
222
232
|
};
|
|
223
233
|
}
|
|
224
234
|
const schemaVersion = typeof data.schemaVersion === "number" ? data.schemaVersion : undefined;
|
|
225
|
-
const hash = (0, node_crypto_1.createHash)("sha256")
|
|
235
|
+
const hash = (0, node_crypto_1.createHash)("sha256")
|
|
236
|
+
.update(JSON.stringify(data))
|
|
237
|
+
.digest("hex")
|
|
238
|
+
.slice(0, 12);
|
|
226
239
|
const updatedAt = typeof data.updatedAt === "string" ? data.updatedAt : undefined;
|
|
227
240
|
// Staleness: warn if baseline older than 30 days
|
|
228
241
|
let stale = false;
|
|
@@ -237,7 +250,12 @@ function checkBaseline(cwd) {
|
|
|
237
250
|
status: "fail",
|
|
238
251
|
message: `Unsupported baseline schemaVersion: ${schemaVersion ?? "missing"}`,
|
|
239
252
|
remediation: "Run: npx evalai baseline init (creates schemaVersion 1)",
|
|
240
|
-
baselineInfo: {
|
|
253
|
+
baselineInfo: {
|
|
254
|
+
path: "evals/baseline.json",
|
|
255
|
+
exists: true,
|
|
256
|
+
hash,
|
|
257
|
+
schemaVersion,
|
|
258
|
+
},
|
|
241
259
|
};
|
|
242
260
|
}
|
|
243
261
|
if (stale) {
|
|
@@ -247,7 +265,13 @@ function checkBaseline(cwd) {
|
|
|
247
265
|
status: "warn",
|
|
248
266
|
message: `Baseline is stale (last updated ${updatedAt})`,
|
|
249
267
|
remediation: "Run: npx evalai baseline update",
|
|
250
|
-
baselineInfo: {
|
|
268
|
+
baselineInfo: {
|
|
269
|
+
path: "evals/baseline.json",
|
|
270
|
+
exists: true,
|
|
271
|
+
hash,
|
|
272
|
+
schemaVersion,
|
|
273
|
+
stale,
|
|
274
|
+
},
|
|
251
275
|
};
|
|
252
276
|
}
|
|
253
277
|
return {
|
|
@@ -255,7 +279,13 @@ function checkBaseline(cwd) {
|
|
|
255
279
|
label: "Baseline file",
|
|
256
280
|
status: "pass",
|
|
257
281
|
message: `schemaVersion ${schemaVersion}, hash ${hash}`,
|
|
258
|
-
baselineInfo: {
|
|
282
|
+
baselineInfo: {
|
|
283
|
+
path: "evals/baseline.json",
|
|
284
|
+
exists: true,
|
|
285
|
+
hash,
|
|
286
|
+
schemaVersion,
|
|
287
|
+
stale,
|
|
288
|
+
},
|
|
259
289
|
};
|
|
260
290
|
}
|
|
261
291
|
function checkAuth(apiKey) {
|
|
@@ -269,9 +299,7 @@ function checkAuth(apiKey) {
|
|
|
269
299
|
};
|
|
270
300
|
}
|
|
271
301
|
// Redact key for display
|
|
272
|
-
const redacted = apiKey.length > 8
|
|
273
|
-
? `${apiKey.slice(0, 4)}...${apiKey.slice(-4)}`
|
|
274
|
-
: "****";
|
|
302
|
+
const redacted = apiKey.length > 8 ? `${apiKey.slice(0, 4)}...${apiKey.slice(-4)}` : "****";
|
|
275
303
|
return {
|
|
276
304
|
id: "auth",
|
|
277
305
|
label: "Authentication",
|
|
@@ -437,7 +465,8 @@ function checkCiWiring(cwd) {
|
|
|
437
465
|
ciInfo: { workflowPath, exists: true },
|
|
438
466
|
};
|
|
439
467
|
}
|
|
440
|
-
if (!content.includes("evalai") &&
|
|
468
|
+
if (!content.includes("evalai") &&
|
|
469
|
+
!content.includes("@pauly4010/evalai-sdk")) {
|
|
441
470
|
return {
|
|
442
471
|
id: "ci_wiring",
|
|
443
472
|
label: "CI wiring",
|
|
@@ -480,10 +509,14 @@ function checkProviderEnv() {
|
|
|
480
509
|
// ── Output formatting ──
|
|
481
510
|
function icon(status) {
|
|
482
511
|
switch (status) {
|
|
483
|
-
case "pass":
|
|
484
|
-
|
|
485
|
-
case "
|
|
486
|
-
|
|
512
|
+
case "pass":
|
|
513
|
+
return "\u2705"; // ✅
|
|
514
|
+
case "fail":
|
|
515
|
+
return "\u274C"; // ❌
|
|
516
|
+
case "warn":
|
|
517
|
+
return "\u26A0\uFE0F"; // ⚠️
|
|
518
|
+
case "skip":
|
|
519
|
+
return "\u23ED\uFE0F"; // ⏭️
|
|
487
520
|
}
|
|
488
521
|
}
|
|
489
522
|
function printHuman(checks, overall) {
|
|
@@ -539,10 +572,17 @@ async function runDoctor(argv) {
|
|
|
539
572
|
message: "Infrastructure error during connectivity check",
|
|
540
573
|
});
|
|
541
574
|
infraError = true;
|
|
542
|
-
connectivityResult = {
|
|
575
|
+
connectivityResult = {
|
|
576
|
+
id: "connectivity",
|
|
577
|
+
label: "API connectivity",
|
|
578
|
+
status: "fail",
|
|
579
|
+
message: "",
|
|
580
|
+
};
|
|
543
581
|
}
|
|
544
582
|
// 7. Eval access (async, depends on auth + connectivity)
|
|
545
|
-
if (flags.apiKey &&
|
|
583
|
+
if (flags.apiKey &&
|
|
584
|
+
flags.evaluationId &&
|
|
585
|
+
connectivityResult.status !== "fail") {
|
|
546
586
|
try {
|
|
547
587
|
const accessResult = await checkEvalAccess(flags.baseUrl, flags.apiKey, flags.evaluationId, flags.baseline);
|
|
548
588
|
checks.push(accessResult);
|
|
@@ -583,7 +623,9 @@ async function runDoctor(argv) {
|
|
|
583
623
|
if (flags.report || flags.format === "json") {
|
|
584
624
|
const redactedConfig = {
|
|
585
625
|
...(configResult.config ?? {}),
|
|
586
|
-
path: configResult.configPath
|
|
626
|
+
path: configResult.configPath
|
|
627
|
+
? path.relative(cwd, configResult.configPath)
|
|
628
|
+
: null,
|
|
587
629
|
};
|
|
588
630
|
const bundle = {
|
|
589
631
|
timestamp: new Date().toISOString(),
|
|
@@ -595,7 +637,8 @@ async function runDoctor(argv) {
|
|
|
595
637
|
config: redactedConfig,
|
|
596
638
|
baseline: baselineResult.baselineInfo,
|
|
597
639
|
api: {
|
|
598
|
-
reachable: connectivityResult.status === "pass" ||
|
|
640
|
+
reachable: connectivityResult.status === "pass" ||
|
|
641
|
+
connectivityResult.status === "warn",
|
|
599
642
|
latencyMs: connectivityResult.latencyMs,
|
|
600
643
|
},
|
|
601
644
|
ci: ciResult.ciInfo,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CORE-401: Centralized environment detection
|
|
3
|
+
*
|
|
4
|
+
* Provides unified environment detection for all EvalAI CLI commands
|
|
5
|
+
*/
|
|
6
|
+
/**
|
|
7
|
+
* Check if running in CI environment
|
|
8
|
+
*/
|
|
9
|
+
export declare function isCI(): boolean;
|
|
10
|
+
/**
|
|
11
|
+
* Check if running in GitHub Actions
|
|
12
|
+
*/
|
|
13
|
+
export declare function isGitHubActions(): boolean;
|
|
14
|
+
/**
|
|
15
|
+
* Get GitHub Step Summary path if available
|
|
16
|
+
*/
|
|
17
|
+
export declare function getGitHubStepSummaryPath(): string | undefined;
|
|
18
|
+
/**
|
|
19
|
+
* Check if string looks like a git reference
|
|
20
|
+
*/
|
|
21
|
+
export declare function isGitRef(ref: string): boolean;
|