@pauly4010/evalai-sdk 1.9.0 → 1.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +136 -23
- package/dist/assertions.js +51 -18
- package/dist/batch.js +8 -2
- package/dist/cli/api.js +3 -1
- package/dist/cli/check.js +19 -6
- package/dist/cli/ci-context.js +3 -1
- package/dist/cli/config.js +28 -8
- package/dist/cli/diff.js +14 -9
- package/dist/cli/discover.js +18 -7
- package/dist/cli/doctor.js +43 -9
- package/dist/cli/explain.js +37 -11
- package/dist/cli/formatters/human.js +4 -1
- package/dist/cli/formatters/pr-comment.js +3 -1
- package/dist/cli/gate.js +6 -2
- package/dist/cli/impact-analysis.js +6 -5
- package/dist/cli/index.js +18 -6
- package/dist/cli/manifest.d.ts +3 -5
- package/dist/cli/manifest.js +21 -14
- package/dist/cli/migrate.js +4 -4
- package/dist/cli/policy-packs.js +8 -2
- package/dist/cli/print-config.js +19 -4
- package/dist/cli/regression-gate.js +8 -2
- package/dist/cli/report/build-check-report.js +8 -2
- package/dist/cli/run.js +11 -5
- package/dist/cli/share.js +3 -1
- package/dist/cli/upgrade.js +2 -1
- package/dist/client.d.ts +16 -19
- package/dist/client.js +60 -43
- package/dist/client.request.test.d.ts +1 -1
- package/dist/client.request.test.js +222 -147
- package/dist/context.js +3 -1
- package/dist/errors.js +11 -4
- package/dist/export.js +3 -1
- package/dist/index.d.ts +8 -8
- package/dist/index.js +19 -19
- package/dist/integrations/anthropic.d.ts +20 -1
- package/dist/integrations/openai-eval.js +4 -2
- package/dist/integrations/openai.d.ts +24 -1
- package/dist/local.js +3 -1
- package/dist/logger.js +6 -2
- package/dist/pagination.js +6 -2
- package/dist/runtime/adapters/config-to-dsl.js +12 -9
- package/dist/runtime/adapters/testsuite-to-dsl.d.ts +1 -1
- package/dist/runtime/adapters/testsuite-to-dsl.js +11 -6
- package/dist/runtime/eval.d.ts +1 -1
- package/dist/runtime/eval.js +12 -5
- package/dist/runtime/execution-mode.js +13 -9
- package/dist/runtime/registry.js +8 -21
- package/dist/runtime/run-report.d.ts +0 -2
- package/dist/runtime/run-report.js +12 -10
- package/dist/testing.js +7 -2
- package/dist/types.d.ts +100 -69
- package/dist/utils/input-hash.js +4 -1
- package/dist/version.d.ts +1 -1
- package/dist/version.js +1 -1
- package/dist/workflows.js +62 -14
- package/package.json +115 -111
package/dist/cli/discover.js
CHANGED
|
@@ -123,7 +123,7 @@ async function getProjectMetadata(projectRoot) {
|
|
|
123
123
|
hasPackageJson = true;
|
|
124
124
|
projectName = parsed.name || "unknown";
|
|
125
125
|
}
|
|
126
|
-
catch (
|
|
126
|
+
catch (_error) {
|
|
127
127
|
// No package.json
|
|
128
128
|
}
|
|
129
129
|
const hasGit = await fs
|
|
@@ -173,9 +173,13 @@ function analyzeSpecFile(filePath, content) {
|
|
|
173
173
|
content.includes("model=") ||
|
|
174
174
|
content.includes("openai") ||
|
|
175
175
|
content.includes("anthropic");
|
|
176
|
-
const usesTools = content.includes("tool:") ||
|
|
176
|
+
const usesTools = content.includes("tool:") ||
|
|
177
|
+
content.includes("function.") ||
|
|
178
|
+
content.includes("call(");
|
|
177
179
|
// Check for assertions
|
|
178
|
-
const hasAssertions = content.includes("assert") ||
|
|
180
|
+
const hasAssertions = content.includes("assert") ||
|
|
181
|
+
content.includes("expect") ||
|
|
182
|
+
content.includes("should");
|
|
179
183
|
// Generate ID from file path
|
|
180
184
|
const id = generateSpecId(filePath);
|
|
181
185
|
return {
|
|
@@ -234,7 +238,9 @@ function analyzeComplexity(content) {
|
|
|
234
238
|
const hasLoops = content.includes("for") || content.includes("while");
|
|
235
239
|
const hasConditionals = content.includes("if") || content.includes("switch");
|
|
236
240
|
const hasTryCatch = content.includes("try") || content.includes("catch");
|
|
237
|
-
const hasExternalCalls = content.includes("fetch") ||
|
|
241
|
+
const hasExternalCalls = content.includes("fetch") ||
|
|
242
|
+
content.includes("http") ||
|
|
243
|
+
content.includes("api");
|
|
238
244
|
let complexityScore = 0;
|
|
239
245
|
if (lines > 50)
|
|
240
246
|
complexityScore += 2;
|
|
@@ -261,7 +267,10 @@ function analyzeComplexity(content) {
|
|
|
261
267
|
*/
|
|
262
268
|
function generateSpecId(filePath) {
|
|
263
269
|
const relativePath = path.relative(process.cwd(), filePath);
|
|
264
|
-
const hash = Buffer.from(relativePath)
|
|
270
|
+
const hash = Buffer.from(relativePath)
|
|
271
|
+
.toString("base64")
|
|
272
|
+
.replace(/[+/=]/g, "")
|
|
273
|
+
.slice(0, 8);
|
|
265
274
|
return hash;
|
|
266
275
|
}
|
|
267
276
|
/**
|
|
@@ -381,10 +390,12 @@ function printRecommendations(stats) {
|
|
|
381
390
|
else {
|
|
382
391
|
console.log(` 🏆 Excellent coverage! Consider running evalai run`);
|
|
383
392
|
}
|
|
384
|
-
if (!stats.executionMode.hasSpecRuntime &&
|
|
393
|
+
if (!stats.executionMode.hasSpecRuntime &&
|
|
394
|
+
!stats.executionMode.hasLegacyRuntime) {
|
|
385
395
|
console.log(` 🆕 New project? Try 'evalai init' to get started`);
|
|
386
396
|
}
|
|
387
|
-
if (stats.executionMode.hasLegacyRuntime &&
|
|
397
|
+
if (stats.executionMode.hasLegacyRuntime &&
|
|
398
|
+
!stats.executionMode.hasSpecRuntime) {
|
|
388
399
|
console.log(` 🔄 Legacy project detected. Try 'evalai migrate config' to upgrade`);
|
|
389
400
|
}
|
|
390
401
|
if (stats.executionMode.hasSpecRuntime) {
|
package/dist/cli/doctor.js
CHANGED
|
@@ -113,7 +113,15 @@ function parseFlags(argv) {
|
|
|
113
113
|
evaluationId = String(merged.evaluationId);
|
|
114
114
|
}
|
|
115
115
|
const strict = raw.strict === "true" || raw.strict === "1";
|
|
116
|
-
return {
|
|
116
|
+
return {
|
|
117
|
+
report,
|
|
118
|
+
format: report ? "json" : fmt,
|
|
119
|
+
strict,
|
|
120
|
+
baseUrl,
|
|
121
|
+
apiKey,
|
|
122
|
+
evaluationId,
|
|
123
|
+
baseline,
|
|
124
|
+
};
|
|
117
125
|
}
|
|
118
126
|
// ── Individual checks ──
|
|
119
127
|
function checkProject(cwd) {
|
|
@@ -224,7 +232,10 @@ function checkBaseline(cwd) {
|
|
|
224
232
|
};
|
|
225
233
|
}
|
|
226
234
|
const schemaVersion = typeof data.schemaVersion === "number" ? data.schemaVersion : undefined;
|
|
227
|
-
const hash = (0, node_crypto_1.createHash)("sha256")
|
|
235
|
+
const hash = (0, node_crypto_1.createHash)("sha256")
|
|
236
|
+
.update(JSON.stringify(data))
|
|
237
|
+
.digest("hex")
|
|
238
|
+
.slice(0, 12);
|
|
228
239
|
const updatedAt = typeof data.updatedAt === "string" ? data.updatedAt : undefined;
|
|
229
240
|
// Staleness: warn if baseline older than 30 days
|
|
230
241
|
let stale = false;
|
|
@@ -239,7 +250,12 @@ function checkBaseline(cwd) {
|
|
|
239
250
|
status: "fail",
|
|
240
251
|
message: `Unsupported baseline schemaVersion: ${schemaVersion ?? "missing"}`,
|
|
241
252
|
remediation: "Run: npx evalai baseline init (creates schemaVersion 1)",
|
|
242
|
-
baselineInfo: {
|
|
253
|
+
baselineInfo: {
|
|
254
|
+
path: "evals/baseline.json",
|
|
255
|
+
exists: true,
|
|
256
|
+
hash,
|
|
257
|
+
schemaVersion,
|
|
258
|
+
},
|
|
243
259
|
};
|
|
244
260
|
}
|
|
245
261
|
if (stale) {
|
|
@@ -249,7 +265,13 @@ function checkBaseline(cwd) {
|
|
|
249
265
|
status: "warn",
|
|
250
266
|
message: `Baseline is stale (last updated ${updatedAt})`,
|
|
251
267
|
remediation: "Run: npx evalai baseline update",
|
|
252
|
-
baselineInfo: {
|
|
268
|
+
baselineInfo: {
|
|
269
|
+
path: "evals/baseline.json",
|
|
270
|
+
exists: true,
|
|
271
|
+
hash,
|
|
272
|
+
schemaVersion,
|
|
273
|
+
stale,
|
|
274
|
+
},
|
|
253
275
|
};
|
|
254
276
|
}
|
|
255
277
|
return {
|
|
@@ -257,7 +279,13 @@ function checkBaseline(cwd) {
|
|
|
257
279
|
label: "Baseline file",
|
|
258
280
|
status: "pass",
|
|
259
281
|
message: `schemaVersion ${schemaVersion}, hash ${hash}`,
|
|
260
|
-
baselineInfo: {
|
|
282
|
+
baselineInfo: {
|
|
283
|
+
path: "evals/baseline.json",
|
|
284
|
+
exists: true,
|
|
285
|
+
hash,
|
|
286
|
+
schemaVersion,
|
|
287
|
+
stale,
|
|
288
|
+
},
|
|
261
289
|
};
|
|
262
290
|
}
|
|
263
291
|
function checkAuth(apiKey) {
|
|
@@ -437,7 +465,8 @@ function checkCiWiring(cwd) {
|
|
|
437
465
|
ciInfo: { workflowPath, exists: true },
|
|
438
466
|
};
|
|
439
467
|
}
|
|
440
|
-
if (!content.includes("evalai") &&
|
|
468
|
+
if (!content.includes("evalai") &&
|
|
469
|
+
!content.includes("@pauly4010/evalai-sdk")) {
|
|
441
470
|
return {
|
|
442
471
|
id: "ci_wiring",
|
|
443
472
|
label: "CI wiring",
|
|
@@ -551,7 +580,9 @@ async function runDoctor(argv) {
|
|
|
551
580
|
};
|
|
552
581
|
}
|
|
553
582
|
// 7. Eval access (async, depends on auth + connectivity)
|
|
554
|
-
if (flags.apiKey &&
|
|
583
|
+
if (flags.apiKey &&
|
|
584
|
+
flags.evaluationId &&
|
|
585
|
+
connectivityResult.status !== "fail") {
|
|
555
586
|
try {
|
|
556
587
|
const accessResult = await checkEvalAccess(flags.baseUrl, flags.apiKey, flags.evaluationId, flags.baseline);
|
|
557
588
|
checks.push(accessResult);
|
|
@@ -592,7 +623,9 @@ async function runDoctor(argv) {
|
|
|
592
623
|
if (flags.report || flags.format === "json") {
|
|
593
624
|
const redactedConfig = {
|
|
594
625
|
...(configResult.config ?? {}),
|
|
595
|
-
path: configResult.configPath
|
|
626
|
+
path: configResult.configPath
|
|
627
|
+
? path.relative(cwd, configResult.configPath)
|
|
628
|
+
: null,
|
|
596
629
|
};
|
|
597
630
|
const bundle = {
|
|
598
631
|
timestamp: new Date().toISOString(),
|
|
@@ -604,7 +637,8 @@ async function runDoctor(argv) {
|
|
|
604
637
|
config: redactedConfig,
|
|
605
638
|
baseline: baselineResult.baselineInfo,
|
|
606
639
|
api: {
|
|
607
|
-
reachable: connectivityResult.status === "pass" ||
|
|
640
|
+
reachable: connectivityResult.status === "pass" ||
|
|
641
|
+
connectivityResult.status === "warn",
|
|
608
642
|
latencyMs: connectivityResult.latencyMs,
|
|
609
643
|
},
|
|
610
644
|
ci: ciResult.ciInfo,
|
package/dist/cli/explain.js
CHANGED
|
@@ -87,7 +87,9 @@ const REPORT_SEARCH_PATHS = [
|
|
|
87
87
|
];
|
|
88
88
|
function findReport(cwd, explicitPath) {
|
|
89
89
|
if (explicitPath) {
|
|
90
|
-
const abs = path.isAbsolute(explicitPath)
|
|
90
|
+
const abs = path.isAbsolute(explicitPath)
|
|
91
|
+
? explicitPath
|
|
92
|
+
: path.join(cwd, explicitPath);
|
|
91
93
|
return fs.existsSync(abs) ? abs : null;
|
|
92
94
|
}
|
|
93
95
|
for (const rel of REPORT_SEARCH_PATHS) {
|
|
@@ -115,16 +117,20 @@ function classifyRootCauses(report) {
|
|
|
115
117
|
causes.push("cost_regression");
|
|
116
118
|
}
|
|
117
119
|
// Latency regression
|
|
118
|
-
if (reasonCode === "LATENCY_BUDGET_EXCEEDED" ||
|
|
120
|
+
if (reasonCode === "LATENCY_BUDGET_EXCEEDED" ||
|
|
121
|
+
reasonCode === "LATENCY_RISK") {
|
|
119
122
|
causes.push("latency_regression");
|
|
120
123
|
}
|
|
121
124
|
// Coverage drop (test count decreased)
|
|
122
|
-
if (reasonCode === "LOW_SAMPLE_SIZE" ||
|
|
125
|
+
if (reasonCode === "LOW_SAMPLE_SIZE" ||
|
|
126
|
+
reasonCode === "INSUFFICIENT_EVIDENCE") {
|
|
123
127
|
causes.push("coverage_drop");
|
|
124
128
|
}
|
|
125
129
|
// Analyze failed cases for drift patterns
|
|
126
130
|
if (failedCases.length > 0) {
|
|
127
|
-
const outputs = failedCases
|
|
131
|
+
const outputs = failedCases
|
|
132
|
+
.map((fc) => (fc.output ?? "").toLowerCase())
|
|
133
|
+
.filter(Boolean);
|
|
128
134
|
const expectedOutputs = failedCases
|
|
129
135
|
.map((fc) => (fc.expectedOutput ?? "").toLowerCase())
|
|
130
136
|
.filter(Boolean);
|
|
@@ -136,7 +142,9 @@ function classifyRootCauses(report) {
|
|
|
136
142
|
causes.push("formatting_drift");
|
|
137
143
|
}
|
|
138
144
|
// Tool use drift: output mentions tool calls or function calls
|
|
139
|
-
const hasToolIssue = outputs.some((o) => o.includes("tool_call") ||
|
|
145
|
+
const hasToolIssue = outputs.some((o) => o.includes("tool_call") ||
|
|
146
|
+
o.includes("function_call") ||
|
|
147
|
+
o.includes("tool_use"));
|
|
140
148
|
if (hasToolIssue) {
|
|
141
149
|
causes.push("tool_use_drift");
|
|
142
150
|
}
|
|
@@ -356,7 +364,9 @@ function buildExplainOutput(report, reportPath) {
|
|
|
356
364
|
function buildFromCheckReport(report, reportPath) {
|
|
357
365
|
const failedCases = report.failedCases ?? [];
|
|
358
366
|
// Top failures (up to 3)
|
|
359
|
-
const topFailures = failedCases
|
|
367
|
+
const topFailures = failedCases
|
|
368
|
+
.slice(0, 3)
|
|
369
|
+
.map((fc, i) => ({
|
|
360
370
|
rank: i + 1,
|
|
361
371
|
name: fc.name,
|
|
362
372
|
input: fc.inputSnippet || fc.input,
|
|
@@ -444,7 +454,11 @@ function buildFromBuiltinReport(report, reportPath) {
|
|
|
444
454
|
}
|
|
445
455
|
// ── Output formatting ──
|
|
446
456
|
function printHuman(output) {
|
|
447
|
-
const verdictIcon = output.verdict === "pass"
|
|
457
|
+
const verdictIcon = output.verdict === "pass"
|
|
458
|
+
? "\u2705"
|
|
459
|
+
: output.verdict === "warn"
|
|
460
|
+
? "\u26A0\uFE0F"
|
|
461
|
+
: "\u274C";
|
|
448
462
|
console.log(`\n evalai explain\n`);
|
|
449
463
|
console.log(` ${verdictIcon} Verdict: ${output.verdict.toUpperCase()}`);
|
|
450
464
|
if (output.score != null) {
|
|
@@ -460,7 +474,11 @@ function printHuman(output) {
|
|
|
460
474
|
if (output.changes.length > 0) {
|
|
461
475
|
console.log("\n What changed:");
|
|
462
476
|
for (const c of output.changes) {
|
|
463
|
-
const arrow = c.direction === "worse"
|
|
477
|
+
const arrow = c.direction === "worse"
|
|
478
|
+
? "\u2193"
|
|
479
|
+
: c.direction === "better"
|
|
480
|
+
? "\u2191"
|
|
481
|
+
: "\u2192";
|
|
464
482
|
console.log(` ${arrow} ${c.metric}: ${c.baseline} \u2192 ${c.current}`);
|
|
465
483
|
}
|
|
466
484
|
}
|
|
@@ -490,7 +508,11 @@ function printHuman(output) {
|
|
|
490
508
|
if (output.suggestedFixes.length > 0) {
|
|
491
509
|
console.log("\n Suggested fixes:");
|
|
492
510
|
for (const fix of output.suggestedFixes) {
|
|
493
|
-
const pIcon = fix.priority === "high"
|
|
511
|
+
const pIcon = fix.priority === "high"
|
|
512
|
+
? "\u203C\uFE0F"
|
|
513
|
+
: fix.priority === "medium"
|
|
514
|
+
? "\u2757"
|
|
515
|
+
: "\u2022";
|
|
494
516
|
console.log(` ${pIcon} ${fix.action}`);
|
|
495
517
|
console.log(` ${fix.detail}`);
|
|
496
518
|
}
|
|
@@ -503,7 +525,9 @@ async function runExplain(argv) {
|
|
|
503
525
|
const cwd = process.cwd();
|
|
504
526
|
const reportPath = findReport(cwd, flags.reportPath);
|
|
505
527
|
if (!reportPath) {
|
|
506
|
-
const searched = flags.reportPath
|
|
528
|
+
const searched = flags.reportPath
|
|
529
|
+
? flags.reportPath
|
|
530
|
+
: REPORT_SEARCH_PATHS.join(", ");
|
|
507
531
|
console.error(`\n \u274C No report found. Searched: ${searched}`);
|
|
508
532
|
console.error(" Run a gate first:");
|
|
509
533
|
console.error(" npx evalai gate --format json");
|
|
@@ -519,7 +543,9 @@ async function runExplain(argv) {
|
|
|
519
543
|
return 1;
|
|
520
544
|
}
|
|
521
545
|
// Schema version compatibility check
|
|
522
|
-
const reportSchema = typeof reportData.schemaVersion === "number"
|
|
546
|
+
const reportSchema = typeof reportData.schemaVersion === "number"
|
|
547
|
+
? reportData.schemaVersion
|
|
548
|
+
: undefined;
|
|
523
549
|
if (reportSchema != null && reportSchema > types_1.CHECK_REPORT_SCHEMA_VERSION) {
|
|
524
550
|
console.error(`\n \u26A0\uFE0F Report schema version ${reportSchema} is newer than this CLI supports (v${types_1.CHECK_REPORT_SCHEMA_VERSION}).`);
|
|
525
551
|
console.error(" Update your SDK: npm install @pauly4010/evalai-sdk@latest\n");
|
package/dist/cli/formatters/human.js
CHANGED
|
@@ -43,7 +43,10 @@ function formatHuman(report) {
|
|
|
43
43
|
lines.push("Next: View full report above, fix failing cases, or adjust gate with --minScore / --maxDrop / --warnDrop");
|
|
44
44
|
}
|
|
45
45
|
if (report.explain &&
|
|
46
|
-
(report.breakdown01 ||
|
|
46
|
+
(report.breakdown01 ||
|
|
47
|
+
report.contribPts ||
|
|
48
|
+
report.flags?.length ||
|
|
49
|
+
report.policyEvidence)) {
|
|
47
50
|
lines.push("");
|
|
48
51
|
lines.push("--- Explain ---");
|
|
49
52
|
if (report.contribPts) {
|
package/dist/cli/formatters/pr-comment.js
CHANGED
|
@@ -34,7 +34,9 @@ function buildPrComment(report) {
|
|
|
34
34
|
}
|
|
35
35
|
}
|
|
36
36
|
else {
|
|
37
|
-
lines.push(passed
|
|
37
|
+
lines.push(passed
|
|
38
|
+
? "## ✅ EvalAI Regression Gate — PASSED"
|
|
39
|
+
: "## 🚨 EvalAI Regression Gate — FAILED");
|
|
38
40
|
}
|
|
39
41
|
lines.push("");
|
|
40
42
|
// Score + Delta (skip when gate not applied)
|
package/dist/cli/gate.js
CHANGED
|
@@ -55,7 +55,9 @@ function evaluateGate(args, quality) {
|
|
|
55
55
|
reasonMessage: `cost $${costUsd.toFixed(4)} exceeds maxCostUsd $${args.maxCostUsd.toFixed(4)}`,
|
|
56
56
|
};
|
|
57
57
|
}
|
|
58
|
-
if (args.maxLatencyMs != null &&
|
|
58
|
+
if (args.maxLatencyMs != null &&
|
|
59
|
+
avgLatencyMs != null &&
|
|
60
|
+
avgLatencyMs > args.maxLatencyMs) {
|
|
59
61
|
return {
|
|
60
62
|
exitCode: constants_1.EXIT.SCORE_BELOW,
|
|
61
63
|
passed: false,
|
|
@@ -102,7 +104,9 @@ function evaluateGate(args, quality) {
|
|
|
102
104
|
};
|
|
103
105
|
}
|
|
104
106
|
// warnDrop: soft warning band; maxDrop: hard fail
|
|
105
|
-
if (args.maxDrop !== undefined &&
|
|
107
|
+
if (args.maxDrop !== undefined &&
|
|
108
|
+
regressionDelta !== null &&
|
|
109
|
+
regressionDelta < -args.maxDrop) {
|
|
106
110
|
return {
|
|
107
111
|
exitCode: constants_1.EXIT.REGRESSION,
|
|
108
112
|
passed: false,
|
package/dist/cli/impact-analysis.js
CHANGED
|
@@ -48,9 +48,9 @@ exports.analyzeImpact = analyzeImpact;
|
|
|
48
48
|
exports.printHumanResults = printHumanResults;
|
|
49
49
|
exports.printJsonResults = printJsonResults;
|
|
50
50
|
exports.runImpactAnalysisCLI = runImpactAnalysisCLI;
|
|
51
|
+
const node_child_process_1 = require("node:child_process");
|
|
51
52
|
const fs = __importStar(require("node:fs/promises"));
|
|
52
53
|
const path = __importStar(require("node:path"));
|
|
53
|
-
const node_child_process_1 = require("node:child_process");
|
|
54
54
|
/**
|
|
55
55
|
* Run impact analysis
|
|
56
56
|
*/
|
|
@@ -87,7 +87,7 @@ async function readManifest(projectRoot = process.cwd()) {
|
|
|
87
87
|
const content = await fs.readFile(manifestPath, "utf-8");
|
|
88
88
|
return JSON.parse(content);
|
|
89
89
|
}
|
|
90
|
-
catch (
|
|
90
|
+
catch (_error) {
|
|
91
91
|
return null;
|
|
92
92
|
}
|
|
93
93
|
}
|
|
@@ -138,7 +138,7 @@ function analyzeImpact(changedFiles, manifest) {
|
|
|
138
138
|
if (!specsByFile.has(spec.filePath)) {
|
|
139
139
|
specsByFile.set(spec.filePath, []);
|
|
140
140
|
}
|
|
141
|
-
specsByFile.get(spec.filePath)
|
|
141
|
+
specsByFile.get(spec.filePath)?.push(spec);
|
|
142
142
|
// By dependencies
|
|
143
143
|
const deps = [
|
|
144
144
|
...spec.dependsOn.prompts,
|
|
@@ -150,7 +150,7 @@ function analyzeImpact(changedFiles, manifest) {
|
|
|
150
150
|
if (!specsByDependency.has(dep)) {
|
|
151
151
|
specsByDependency.set(dep, []);
|
|
152
152
|
}
|
|
153
|
-
specsByDependency.get(dep)
|
|
153
|
+
specsByDependency.get(dep)?.push(spec);
|
|
154
154
|
}
|
|
155
155
|
}
|
|
156
156
|
// Analyze each changed file
|
|
@@ -179,7 +179,8 @@ function analyzeImpact(changedFiles, manifest) {
|
|
|
179
179
|
// Add all specs
|
|
180
180
|
for (const spec of manifest.specs) {
|
|
181
181
|
impactedSpecIds.add(spec.id);
|
|
182
|
-
reasonBySpecId[spec.id] =
|
|
182
|
+
reasonBySpecId[spec.id] =
|
|
183
|
+
`Unknown file changed: ${changedFile} (safe fallback)`;
|
|
183
184
|
}
|
|
184
185
|
break; // No need to continue analyzing
|
|
185
186
|
}
|
package/dist/cli/index.js
CHANGED
|
@@ -160,9 +160,13 @@ else if (subcommand === "impact-analysis") {
|
|
|
160
160
|
const changedFilesIndex = args.indexOf("--changed-files");
|
|
161
161
|
const formatIndex = args.indexOf("--format");
|
|
162
162
|
const baseBranch = baseIndex !== -1 ? args[baseIndex + 1] : "main";
|
|
163
|
-
const changedFiles = changedFilesIndex !== -1
|
|
163
|
+
const changedFiles = changedFilesIndex !== -1
|
|
164
|
+
? args[changedFilesIndex + 1]?.split(",")
|
|
165
|
+
: undefined;
|
|
164
166
|
const format = formatIndex !== -1 ? args[formatIndex + 1] : "human";
|
|
165
|
-
(0, impact_analysis_1.runImpactAnalysisCLI)({ baseBranch, changedFiles, format })
|
|
167
|
+
(0, impact_analysis_1.runImpactAnalysisCLI)({ baseBranch, changedFiles, format })
|
|
168
|
+
.then(() => process.exit(0))
|
|
169
|
+
.catch((err) => {
|
|
166
170
|
console.error(`EvalAI ERROR: ${err instanceof Error ? err.message : String(err)}`);
|
|
167
171
|
process.exit(2);
|
|
168
172
|
});
|
|
@@ -186,7 +190,9 @@ else if (subcommand === "run") {
|
|
|
186
190
|
baseBranch,
|
|
187
191
|
format,
|
|
188
192
|
writeResults,
|
|
189
|
-
})
|
|
193
|
+
})
|
|
194
|
+
.then(() => process.exit(0))
|
|
195
|
+
.catch((err) => {
|
|
190
196
|
console.error(`EvalAI ERROR: ${err instanceof Error ? err.message : String(err)}`);
|
|
191
197
|
process.exit(2);
|
|
192
198
|
});
|
|
@@ -200,7 +206,9 @@ else if (subcommand === "diff") {
|
|
|
200
206
|
const base = baseIndex !== -1 ? args[baseIndex + 1] : undefined;
|
|
201
207
|
const head = headIndex !== -1 ? args[headIndex + 1] : undefined;
|
|
202
208
|
const format = formatIndex !== -1 ? args[formatIndex + 1] : "human";
|
|
203
|
-
(0, diff_1.runDiffCLI)({ base, head, format })
|
|
209
|
+
(0, diff_1.runDiffCLI)({ base, head, format })
|
|
210
|
+
.then(() => process.exit(0))
|
|
211
|
+
.catch((err) => {
|
|
204
212
|
console.error(`EvalAI ERROR: ${err instanceof Error ? err.message : String(err)}`);
|
|
205
213
|
process.exit(2);
|
|
206
214
|
});
|
|
@@ -214,9 +222,13 @@ else if (subcommand === "ci") {
|
|
|
214
222
|
const writeResultsIndex = args.indexOf("--write-results");
|
|
215
223
|
const base = baseIndex !== -1 ? args[baseIndex + 1] : undefined;
|
|
216
224
|
const impactedOnly = impactedOnlyIndex !== -1;
|
|
217
|
-
const format = formatIndex !== -1
|
|
225
|
+
const format = formatIndex !== -1
|
|
226
|
+
? args[formatIndex + 1]
|
|
227
|
+
: "human";
|
|
218
228
|
const writeResults = writeResultsIndex !== -1;
|
|
219
|
-
(0, ci_1.runCICLI)({ base, impactedOnly, format, writeResults })
|
|
229
|
+
(0, ci_1.runCICLI)({ base, impactedOnly, format, writeResults })
|
|
230
|
+
.then(() => process.exit(0))
|
|
231
|
+
.catch((err) => {
|
|
220
232
|
console.error(`EvalAI ERROR: ${err instanceof Error ? err.message : String(err)}`);
|
|
221
233
|
process.exit(2);
|
|
222
234
|
});
|
package/dist/cli/manifest.d.ts
CHANGED
|
@@ -6,16 +6,14 @@
|
|
|
6
6
|
*
|
|
7
7
|
* This is the compiler output that everything else consumes.
|
|
8
8
|
*/
|
|
9
|
-
import type { SpecAnalysis } from "./discover";
|
|
10
9
|
import type { ExecutionModeConfig } from "../runtime/execution-mode";
|
|
10
|
+
import { SDK_VERSION } from "../version";
|
|
11
|
+
import type { SpecAnalysis } from "./discover";
|
|
12
|
+
export { SDK_VERSION };
|
|
11
13
|
/**
|
|
12
14
|
* Manifest schema version
|
|
13
15
|
*/
|
|
14
16
|
export declare const MANIFEST_SCHEMA_VERSION = 1;
|
|
15
|
-
/**
|
|
16
|
-
* SDK version from package.json
|
|
17
|
-
*/
|
|
18
|
-
export declare const SDK_VERSION = "1.8.0";
|
|
19
17
|
/**
|
|
20
18
|
* Evaluation Manifest Schema
|
|
21
19
|
*/
|
package/dist/cli/manifest.js
CHANGED
|
@@ -41,22 +41,20 @@ var __importStar = (this && this.__importStar) || (function () {
|
|
|
41
41
|
};
|
|
42
42
|
})();
|
|
43
43
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
44
|
-
exports.
|
|
44
|
+
exports.MANIFEST_SCHEMA_VERSION = exports.SDK_VERSION = void 0;
|
|
45
45
|
exports.generateManifest = generateManifest;
|
|
46
46
|
exports.writeManifest = writeManifest;
|
|
47
47
|
exports.readManifest = readManifest;
|
|
48
48
|
exports.readLock = readLock;
|
|
49
|
+
const crypto = __importStar(require("node:crypto"));
|
|
49
50
|
const fs = __importStar(require("node:fs/promises"));
|
|
50
51
|
const path = __importStar(require("node:path"));
|
|
51
|
-
const
|
|
52
|
+
const version_1 = require("../version");
|
|
53
|
+
Object.defineProperty(exports, "SDK_VERSION", { enumerable: true, get: function () { return version_1.SDK_VERSION; } });
|
|
52
54
|
/**
|
|
53
55
|
* Manifest schema version
|
|
54
56
|
*/
|
|
55
57
|
exports.MANIFEST_SCHEMA_VERSION = 1;
|
|
56
|
-
/**
|
|
57
|
-
* SDK version from package.json
|
|
58
|
-
*/
|
|
59
|
-
exports.SDK_VERSION = "1.8.0";
|
|
60
58
|
/**
|
|
61
59
|
* Generate evaluation manifest from discovery results
|
|
62
60
|
*/
|
|
@@ -73,7 +71,7 @@ async function generateManifest(specs, projectRoot, projectName, executionMode)
|
|
|
73
71
|
if (!specsByFile.has(normalizedPath)) {
|
|
74
72
|
specsByFile.set(normalizedPath, []);
|
|
75
73
|
}
|
|
76
|
-
specsByFile.get(normalizedPath)
|
|
74
|
+
specsByFile.get(normalizedPath)?.push(spec);
|
|
77
75
|
}
|
|
78
76
|
// Process each file
|
|
79
77
|
for (const [filePath, fileSpecs] of specsByFile) {
|
|
@@ -100,7 +98,7 @@ async function generateManifest(specs, projectRoot, projectName, executionMode)
|
|
|
100
98
|
},
|
|
101
99
|
runtime: {
|
|
102
100
|
mode: executionMode.mode,
|
|
103
|
-
sdkVersion:
|
|
101
|
+
sdkVersion: version_1.SDK_VERSION,
|
|
104
102
|
},
|
|
105
103
|
specFiles,
|
|
106
104
|
specs: processedSpecs,
|
|
@@ -163,7 +161,8 @@ function extractDependencies(content) {
|
|
|
163
161
|
const dependsOnMatch = content.match(/dependsOn\s*:\s*({[^}]+})/s);
|
|
164
162
|
if (dependsOnMatch) {
|
|
165
163
|
try {
|
|
166
|
-
|
|
164
|
+
// Use JSON.parse instead of eval for safety
|
|
165
|
+
const deps = JSON.parse(dependsOnMatch[1]);
|
|
167
166
|
return {
|
|
168
167
|
prompts: deps.prompts || [],
|
|
169
168
|
datasets: deps.datasets || [],
|
|
@@ -171,8 +170,14 @@ function extractDependencies(content) {
|
|
|
171
170
|
code: deps.code || [],
|
|
172
171
|
};
|
|
173
172
|
}
|
|
174
|
-
catch (
|
|
175
|
-
//
|
|
173
|
+
catch (_error) {
|
|
174
|
+
// If parsing fails, return empty dependencies
|
|
175
|
+
return {
|
|
176
|
+
prompts: [],
|
|
177
|
+
datasets: [],
|
|
178
|
+
tools: [],
|
|
179
|
+
code: [],
|
|
180
|
+
};
|
|
176
181
|
}
|
|
177
182
|
}
|
|
178
183
|
// Simple extraction as fallback
|
|
@@ -184,8 +189,10 @@ function extractDependencies(content) {
|
|
|
184
189
|
};
|
|
185
190
|
for (const [type, pattern] of Object.entries(patterns)) {
|
|
186
191
|
let match;
|
|
187
|
-
|
|
192
|
+
match = pattern.exec(content);
|
|
193
|
+
while (match !== null) {
|
|
188
194
|
dependsOn[type].push(match[1]);
|
|
195
|
+
match = pattern.exec(content);
|
|
189
196
|
}
|
|
190
197
|
}
|
|
191
198
|
return dependsOn;
|
|
@@ -256,7 +263,7 @@ async function readManifest(projectRoot) {
|
|
|
256
263
|
const content = await fs.readFile(manifestPath, "utf-8");
|
|
257
264
|
return JSON.parse(content);
|
|
258
265
|
}
|
|
259
|
-
catch (
|
|
266
|
+
catch (_error) {
|
|
260
267
|
return null;
|
|
261
268
|
}
|
|
262
269
|
}
|
|
@@ -269,7 +276,7 @@ async function readLock(projectRoot) {
|
|
|
269
276
|
const content = await fs.readFile(lockPath, "utf-8");
|
|
270
277
|
return JSON.parse(content);
|
|
271
278
|
}
|
|
272
|
-
catch (
|
|
279
|
+
catch (_error) {
|
|
273
280
|
return null;
|
|
274
281
|
}
|
|
275
282
|
}
|
package/dist/cli/migrate.js
CHANGED
|
@@ -43,9 +43,9 @@ exports.migrateConfig = migrateConfig;
|
|
|
43
43
|
exports.createMigrateCommand = createMigrateCommand;
|
|
44
44
|
exports.validateConfigFile = validateConfigFile;
|
|
45
45
|
exports.previewMigration = previewMigration;
|
|
46
|
-
const commander_1 = require("commander");
|
|
47
46
|
const fs = __importStar(require("node:fs/promises"));
|
|
48
47
|
const path = __importStar(require("node:path"));
|
|
48
|
+
const commander_1 = require("commander");
|
|
49
49
|
const testsuite_to_dsl_1 = require("../runtime/adapters/testsuite-to-dsl");
|
|
50
50
|
const testing_1 = require("../testing");
|
|
51
51
|
/**
|
|
@@ -97,7 +97,7 @@ function extractTestSuitesFromConfig(config) {
|
|
|
97
97
|
/**
|
|
98
98
|
* Generate DSL file header
|
|
99
99
|
*/
|
|
100
|
-
function generateFileHeader(
|
|
100
|
+
function generateFileHeader(_config, options) {
|
|
101
101
|
const timestamp = new Date().toISOString();
|
|
102
102
|
const inputPath = path.resolve(options.input);
|
|
103
103
|
const outputPath = path.resolve(options.output);
|
|
@@ -127,7 +127,7 @@ function generateFileHeader(config, options) {
|
|
|
127
127
|
/**
|
|
128
128
|
* Generate helper functions for the entire file
|
|
129
129
|
*/
|
|
130
|
-
function generateGlobalHelpers(config,
|
|
130
|
+
function generateGlobalHelpers(config, _options) {
|
|
131
131
|
const helpers = [];
|
|
132
132
|
// Add executor helper if config has executor
|
|
133
133
|
if (config.executor) {
|
|
@@ -164,7 +164,7 @@ function generateGlobalHelpers(config, options) {
|
|
|
164
164
|
` * Legacy test evaluation function`,
|
|
165
165
|
` * TODO: Adapt based on your original test logic`,
|
|
166
166
|
` */`,
|
|
167
|
-
`async function evaluateLegacyTest(input: string, expected?: string): Promise<
|
|
167
|
+
`async function evaluateLegacyTest(input: string, expected?: string): Promise<unknown> {`,
|
|
168
168
|
` const output = await legacyExecutor(input);`,
|
|
169
169
|
` const passed = evaluateAssertions(output, expected);`,
|
|
170
170
|
` `,
|
package/dist/cli/policy-packs.js
CHANGED
|
@@ -22,7 +22,10 @@ exports.POLICY_PACKS = {
|
|
|
22
22
|
1: {
|
|
23
23
|
policyId: "SOC2",
|
|
24
24
|
version: 1,
|
|
25
|
-
thresholds: {
|
|
25
|
+
thresholds: {
|
|
26
|
+
requiredSafetyRate: 0.95,
|
|
27
|
+
maxFlags: ["SAFETY_RISK", "LOW_PASS_RATE"],
|
|
28
|
+
},
|
|
26
29
|
rationale: "SOC2 trust criteria for security and availability.",
|
|
27
30
|
checks: ["safety_rate", "flag_restrictions"],
|
|
28
31
|
},
|
|
@@ -40,7 +43,10 @@ exports.POLICY_PACKS = {
|
|
|
40
43
|
1: {
|
|
41
44
|
policyId: "PCI_DSS",
|
|
42
45
|
version: 1,
|
|
43
|
-
thresholds: {
|
|
46
|
+
thresholds: {
|
|
47
|
+
requiredSafetyRate: 0.99,
|
|
48
|
+
maxFlags: ["SAFETY_RISK", "LOW_PASS_RATE"],
|
|
49
|
+
},
|
|
44
50
|
rationale: "PCI DSS cardholder data security standards.",
|
|
45
51
|
checks: ["safety_rate", "flag_restrictions"],
|
|
46
52
|
},
|
package/dist/cli/print-config.js
CHANGED
|
@@ -135,13 +135,20 @@ function buildResolvedConfig(cwd, flags) {
|
|
|
135
135
|
: "default";
|
|
136
136
|
fields.push({
|
|
137
137
|
key: "baseUrl",
|
|
138
|
-
value: flags.baseUrl ||
|
|
138
|
+
value: flags.baseUrl ||
|
|
139
|
+
envBaseUrl ||
|
|
140
|
+
fileConfig?.baseUrl ||
|
|
141
|
+
"http://localhost:3000",
|
|
139
142
|
source: baseUrlSource,
|
|
140
143
|
});
|
|
141
144
|
// apiKey (always redacted)
|
|
142
145
|
const envApiKey = process.env.EVALAI_API_KEY;
|
|
143
146
|
const rawApiKey = flags.apiKey || envApiKey || "";
|
|
144
|
-
const apiKeySource = flags.apiKey
|
|
147
|
+
const apiKeySource = flags.apiKey
|
|
148
|
+
? "arg"
|
|
149
|
+
: envApiKey
|
|
150
|
+
? "env"
|
|
151
|
+
: "default";
|
|
145
152
|
fields.push({
|
|
146
153
|
key: "apiKey",
|
|
147
154
|
value: redact(rawApiKey) ?? "(not set)",
|
|
@@ -150,7 +157,11 @@ function buildResolvedConfig(cwd, flags) {
|
|
|
150
157
|
});
|
|
151
158
|
// profile
|
|
152
159
|
const profileName = (flags.profile || fileConfig?.profile);
|
|
153
|
-
const profileSource = flags.profile
|
|
160
|
+
const profileSource = flags.profile
|
|
161
|
+
? "arg"
|
|
162
|
+
: fileConfig?.profile
|
|
163
|
+
? "file"
|
|
164
|
+
: "default";
|
|
154
165
|
fields.push({
|
|
155
166
|
key: "profile",
|
|
156
167
|
value: profileName ?? null,
|
|
@@ -184,7 +195,11 @@ function buildResolvedConfig(cwd, flags) {
|
|
|
184
195
|
});
|
|
185
196
|
}
|
|
186
197
|
// baseline
|
|
187
|
-
const baselineSource = flags.baseline
|
|
198
|
+
const baselineSource = flags.baseline
|
|
199
|
+
? "arg"
|
|
200
|
+
: fileConfig?.baseline
|
|
201
|
+
? "file"
|
|
202
|
+
: "default";
|
|
188
203
|
fields.push({
|
|
189
204
|
key: "baseline",
|
|
190
205
|
value: merged.baseline ?? "published",
|