@traits-dev/cli 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/traits.js +236 -7
- package/package.json +2 -2
package/dist/traits.js
CHANGED
|
@@ -22,6 +22,8 @@ function printCompileUsage(out = process.stderr) {
|
|
|
22
22
|
" --model <model> Model target (required)",
|
|
23
23
|
" --json Output structured JSON",
|
|
24
24
|
" --strict Treat warnings as compile-blocking",
|
|
25
|
+
" --budget Print estimated token count (chars/4)",
|
|
26
|
+
" --budget-limit <tokens> Warn to stderr if estimate exceeds limit",
|
|
25
27
|
" --explain Include compilation trace output",
|
|
26
28
|
" --context key=value Activate context adaptation (repeatable)",
|
|
27
29
|
" --knowledge-base-dir Directory containing compiler pattern files",
|
|
@@ -52,6 +54,8 @@ function parseCompileArgs(args) {
|
|
|
52
54
|
model: null,
|
|
53
55
|
strict: false,
|
|
54
56
|
json: false,
|
|
57
|
+
budget: false,
|
|
58
|
+
budgetLimit: null,
|
|
55
59
|
explain: false,
|
|
56
60
|
verbose: false,
|
|
57
61
|
noColor: false,
|
|
@@ -70,6 +74,10 @@ function parseCompileArgs(args) {
|
|
|
70
74
|
result.json = true;
|
|
71
75
|
continue;
|
|
72
76
|
}
|
|
77
|
+
if (arg === "--budget") {
|
|
78
|
+
result.budget = true;
|
|
79
|
+
continue;
|
|
80
|
+
}
|
|
73
81
|
if (arg === "--explain") {
|
|
74
82
|
result.explain = true;
|
|
75
83
|
continue;
|
|
@@ -82,7 +90,7 @@ function parseCompileArgs(args) {
|
|
|
82
90
|
result.noColor = true;
|
|
83
91
|
continue;
|
|
84
92
|
}
|
|
85
|
-
if (arg === "--model" || arg === "--bundled-profiles-dir" || arg === "--context" || arg === "--knowledge-base-dir") {
|
|
93
|
+
if (arg === "--model" || arg === "--bundled-profiles-dir" || arg === "--context" || arg === "--knowledge-base-dir" || arg === "--budget-limit") {
|
|
86
94
|
const value = args[index + 1];
|
|
87
95
|
if (!value) return { error: `Missing value for "${arg}"` };
|
|
88
96
|
if (arg === "--model") {
|
|
@@ -91,6 +99,12 @@ function parseCompileArgs(args) {
|
|
|
91
99
|
result.bundledProfilesDir = value;
|
|
92
100
|
} else if (arg === "--knowledge-base-dir") {
|
|
93
101
|
result.knowledgeBaseDir = value;
|
|
102
|
+
} else if (arg === "--budget-limit") {
|
|
103
|
+
const parsedBudgetLimit = Number(value);
|
|
104
|
+
if (!Number.isFinite(parsedBudgetLimit) || parsedBudgetLimit <= 0) {
|
|
105
|
+
return { error: `Invalid value for "--budget-limit": "${value}"` };
|
|
106
|
+
}
|
|
107
|
+
result.budgetLimit = Math.round(parsedBudgetLimit);
|
|
94
108
|
} else {
|
|
95
109
|
const parsedContext = parseContextArg(value);
|
|
96
110
|
if ("error" in parsedContext) return { error: parsedContext.error };
|
|
@@ -111,8 +125,14 @@ function parseCompileArgs(args) {
|
|
|
111
125
|
if (!result.model) {
|
|
112
126
|
return { error: 'Missing required option "--model"' };
|
|
113
127
|
}
|
|
128
|
+
if (result.budgetLimit != null) {
|
|
129
|
+
result.budget = true;
|
|
130
|
+
}
|
|
114
131
|
return { value: result };
|
|
115
132
|
}
|
|
133
|
+
function estimateBudgetTokens(text) {
|
|
134
|
+
return Math.ceil(String(text ?? "").length / 4);
|
|
135
|
+
}
|
|
116
136
|
function runCompile(args, io = process) {
|
|
117
137
|
const parsed = parseCompileArgs(args);
|
|
118
138
|
if ("error" in parsed) {
|
|
@@ -156,10 +176,32 @@ function runCompile(args, io = process) {
|
|
|
156
176
|
if (options.json) {
|
|
157
177
|
io.stdout.write(`${JSON.stringify(compiled, null, 2)}
|
|
158
178
|
`);
|
|
179
|
+
if (options.budget) {
|
|
180
|
+
const budgetEstimate = estimateBudgetTokens(compiled.text);
|
|
181
|
+
io.stderr.write(`Estimated token count: ${budgetEstimate}
|
|
182
|
+
`);
|
|
183
|
+
if (options.budgetLimit != null && budgetEstimate > options.budgetLimit) {
|
|
184
|
+
io.stderr.write(
|
|
185
|
+
`Warning: Estimated token count ${budgetEstimate} exceeds budget limit ${options.budgetLimit}
|
|
186
|
+
`
|
|
187
|
+
);
|
|
188
|
+
}
|
|
189
|
+
}
|
|
159
190
|
return 0;
|
|
160
191
|
}
|
|
161
192
|
io.stdout.write(`${compiled.text}
|
|
162
193
|
`);
|
|
194
|
+
if (options.budget) {
|
|
195
|
+
const budgetEstimate = estimateBudgetTokens(compiled.text);
|
|
196
|
+
io.stderr.write(`Estimated token count: ${budgetEstimate}
|
|
197
|
+
`);
|
|
198
|
+
if (options.budgetLimit != null && budgetEstimate > options.budgetLimit) {
|
|
199
|
+
io.stderr.write(
|
|
200
|
+
`Warning: Estimated token count ${budgetEstimate} exceeds budget limit ${options.budgetLimit}
|
|
201
|
+
`
|
|
202
|
+
);
|
|
203
|
+
}
|
|
204
|
+
}
|
|
163
205
|
if (options.explain && compiled.trace) {
|
|
164
206
|
io.stdout.write(`
|
|
165
207
|
[TRACE]
|
|
@@ -211,6 +253,7 @@ import {
|
|
|
211
253
|
import {
|
|
212
254
|
detectEvalTierAvailability,
|
|
213
255
|
formatValidationResult as formatValidationResult2,
|
|
256
|
+
loadBuiltInEvalSuite,
|
|
214
257
|
resolveTierExecution,
|
|
215
258
|
runOfflineBaselineScaffold,
|
|
216
259
|
toValidationResultObject as toValidationResultObject2
|
|
@@ -224,6 +267,7 @@ function printEvalUsage(out = process.stderr) {
|
|
|
224
267
|
"Options:",
|
|
225
268
|
" --model <model> Model target (required)",
|
|
226
269
|
" --tier <1|2|3> Highest tier to run (default: highest available)",
|
|
270
|
+
" --suite <name> Built-in baseline suite: support|healthcare|developer",
|
|
227
271
|
" --provider <name> Judge provider for Tier 3: auto|openai|anthropic",
|
|
228
272
|
" --embedding-model <name> Embedding model for Tier 2 (OpenAI)",
|
|
229
273
|
" --judge-model <name> Judge model for Tier 3 provider",
|
|
@@ -236,6 +280,11 @@ function printEvalUsage(out = process.stderr) {
|
|
|
236
280
|
" --samples <path> JSON file with samples: [{ id, response }]",
|
|
237
281
|
" --scenarios <path> Alias for --samples in this scaffold",
|
|
238
282
|
" --json Output structured JSON",
|
|
283
|
+
" --format <text|json|junit> Output format (default: text)",
|
|
284
|
+
" --junit-threshold <num> Global JUnit pass threshold in [0,1] (default: 0.7)",
|
|
285
|
+
" --junit-threshold-tier1 <num> Tier 1 JUnit threshold override",
|
|
286
|
+
" --junit-threshold-tier2 <num> Tier 2 JUnit threshold override",
|
|
287
|
+
" --junit-threshold-tier3 <num> Tier 3 JUnit threshold override",
|
|
239
288
|
" --strict Treat validation warnings as errors",
|
|
240
289
|
" --verbose Include command metadata output",
|
|
241
290
|
" --no-color Disable colorized output",
|
|
@@ -251,6 +300,7 @@ function parseEvalArgs(args) {
|
|
|
251
300
|
profilePath: null,
|
|
252
301
|
model: null,
|
|
253
302
|
tier: null,
|
|
303
|
+
suite: null,
|
|
254
304
|
provider: "auto",
|
|
255
305
|
embeddingModel: null,
|
|
256
306
|
judgeModel: null,
|
|
@@ -260,6 +310,11 @@ function parseEvalArgs(args) {
|
|
|
260
310
|
maxRetries: null,
|
|
261
311
|
retryBaseMs: null,
|
|
262
312
|
json: false,
|
|
313
|
+
format: "text",
|
|
314
|
+
junitThreshold: null,
|
|
315
|
+
junitThresholdTier1: null,
|
|
316
|
+
junitThresholdTier2: null,
|
|
317
|
+
junitThresholdTier3: null,
|
|
263
318
|
strict: false,
|
|
264
319
|
verbose: false,
|
|
265
320
|
noColor: false,
|
|
@@ -274,6 +329,7 @@ function parseEvalArgs(args) {
|
|
|
274
329
|
const arg = args[index];
|
|
275
330
|
if (arg === "--json") {
|
|
276
331
|
result.json = true;
|
|
332
|
+
result.format = "json";
|
|
277
333
|
continue;
|
|
278
334
|
}
|
|
279
335
|
if (arg === "--strict") {
|
|
@@ -300,14 +356,18 @@ function parseEvalArgs(args) {
|
|
|
300
356
|
result.constraintImpact = true;
|
|
301
357
|
continue;
|
|
302
358
|
}
|
|
303
|
-
if (arg === "--model" || arg === "--tier" || arg === "--provider" || arg === "--embedding-model" || arg === "--judge-model" || arg === "--openai-base-url" || arg === "--anthropic-base-url" || arg === "--timeout-ms" || arg === "--max-retries" || arg === "--retry-base-ms" || arg === "--response" || arg === "--samples" || arg === "--scenarios") {
|
|
359
|
+
if (arg === "--model" || arg === "--tier" || arg === "--suite" || arg === "--provider" || arg === "--format" || arg === "--embedding-model" || arg === "--judge-model" || arg === "--openai-base-url" || arg === "--anthropic-base-url" || arg === "--timeout-ms" || arg === "--max-retries" || arg === "--retry-base-ms" || arg === "--junit-threshold" || arg === "--junit-threshold-tier1" || arg === "--junit-threshold-tier2" || arg === "--junit-threshold-tier3" || arg === "--response" || arg === "--samples" || arg === "--scenarios") {
|
|
304
360
|
const value = args[index + 1];
|
|
305
361
|
if (!value) return { error: `Missing value for "${arg}"` };
|
|
306
362
|
if (arg === "--model") result.model = value;
|
|
307
363
|
if (arg === "--tier") result.tier = Number(value);
|
|
364
|
+
if (arg === "--suite") result.suite = String(value).toLowerCase();
|
|
308
365
|
if (arg === "--provider") {
|
|
309
366
|
result.provider = String(value).toLowerCase();
|
|
310
367
|
}
|
|
368
|
+
if (arg === "--format") {
|
|
369
|
+
result.format = String(value).toLowerCase();
|
|
370
|
+
}
|
|
311
371
|
if (arg === "--embedding-model") result.embeddingModel = value;
|
|
312
372
|
if (arg === "--judge-model") result.judgeModel = value;
|
|
313
373
|
if (arg === "--openai-base-url") result.openaiBaseUrl = value;
|
|
@@ -315,6 +375,10 @@ function parseEvalArgs(args) {
|
|
|
315
375
|
if (arg === "--timeout-ms") result.timeoutMs = Number(value);
|
|
316
376
|
if (arg === "--max-retries") result.maxRetries = Number(value);
|
|
317
377
|
if (arg === "--retry-base-ms") result.retryBaseMs = Number(value);
|
|
378
|
+
if (arg === "--junit-threshold") result.junitThreshold = Number(value);
|
|
379
|
+
if (arg === "--junit-threshold-tier1") result.junitThresholdTier1 = Number(value);
|
|
380
|
+
if (arg === "--junit-threshold-tier2") result.junitThresholdTier2 = Number(value);
|
|
381
|
+
if (arg === "--junit-threshold-tier3") result.junitThresholdTier3 = Number(value);
|
|
318
382
|
if (arg === "--response") result.responses.push(value);
|
|
319
383
|
if (arg === "--samples" || arg === "--scenarios") result.samplesPath = value;
|
|
320
384
|
index += 1;
|
|
@@ -338,6 +402,20 @@ function parseEvalArgs(args) {
|
|
|
338
402
|
if (!["auto", "openai", "anthropic"].includes(result.provider)) {
|
|
339
403
|
return { error: 'Invalid "--provider" value. Expected auto, openai, or anthropic.' };
|
|
340
404
|
}
|
|
405
|
+
if (result.suite != null && !["support", "healthcare", "developer"].includes(
|
|
406
|
+
result.suite
|
|
407
|
+
)) {
|
|
408
|
+
return { error: 'Invalid "--suite" value. Expected support, healthcare, or developer.' };
|
|
409
|
+
}
|
|
410
|
+
if (result.suite != null && result.samplesPath != null) {
|
|
411
|
+
return { error: 'Use either "--suite" or "--samples/--scenarios", not both.' };
|
|
412
|
+
}
|
|
413
|
+
if (result.suite != null && result.responses.length > 0) {
|
|
414
|
+
return { error: 'Use either "--suite" or "--response", not both.' };
|
|
415
|
+
}
|
|
416
|
+
if (!["text", "json", "junit"].includes(result.format)) {
|
|
417
|
+
return { error: 'Invalid "--format" value. Expected text, json, or junit.' };
|
|
418
|
+
}
|
|
341
419
|
if (result.timeoutMs != null && (!Number.isInteger(result.timeoutMs) || result.timeoutMs < 0)) {
|
|
342
420
|
return { error: 'Invalid "--timeout-ms" value. Expected a non-negative integer.' };
|
|
343
421
|
}
|
|
@@ -347,9 +425,33 @@ function parseEvalArgs(args) {
|
|
|
347
425
|
if (result.retryBaseMs != null && (!Number.isInteger(result.retryBaseMs) || result.retryBaseMs < 0)) {
|
|
348
426
|
return { error: 'Invalid "--retry-base-ms" value. Expected a non-negative integer.' };
|
|
349
427
|
}
|
|
428
|
+
for (const [flag, value] of [
|
|
429
|
+
["--junit-threshold", result.junitThreshold],
|
|
430
|
+
["--junit-threshold-tier1", result.junitThresholdTier1],
|
|
431
|
+
["--junit-threshold-tier2", result.junitThresholdTier2],
|
|
432
|
+
["--junit-threshold-tier3", result.junitThresholdTier3]
|
|
433
|
+
]) {
|
|
434
|
+
if (value == null) continue;
|
|
435
|
+
if (!Number.isFinite(value) || value < 0 || value > 1) {
|
|
436
|
+
return { error: `Invalid "${flag}" value. Expected a number in [0, 1].` };
|
|
437
|
+
}
|
|
438
|
+
}
|
|
350
439
|
return { value: result };
|
|
351
440
|
}
|
|
352
441
|
function loadSamples(options, cwd) {
|
|
442
|
+
if (options.suite) {
|
|
443
|
+
const suite = loadBuiltInEvalSuite(options.suite);
|
|
444
|
+
if (!suite) {
|
|
445
|
+
throw new Error(
|
|
446
|
+
`Unknown suite "${options.suite}". Expected support, healthcare, or developer.`
|
|
447
|
+
);
|
|
448
|
+
}
|
|
449
|
+
return suite.scenarios.map((scenario) => ({
|
|
450
|
+
id: scenario.id,
|
|
451
|
+
prompt: scenario.messages.map((message) => `${message.role}: ${message.content}`).join("\n"),
|
|
452
|
+
response: scenario.expected_behavior ?? ""
|
|
453
|
+
}));
|
|
454
|
+
}
|
|
353
455
|
if (options.samplesPath) {
|
|
354
456
|
const sampleFile = path2.resolve(cwd, options.samplesPath);
|
|
355
457
|
const parsed = JSON.parse(fs.readFileSync(sampleFile, "utf8"));
|
|
@@ -377,10 +479,118 @@ function loadSamples(options, cwd) {
|
|
|
377
479
|
}));
|
|
378
480
|
}
|
|
379
481
|
function writeProgress(io, options, message) {
|
|
380
|
-
if (options.json) return;
|
|
482
|
+
if (options.format !== "text") return;
|
|
381
483
|
io.stderr.write(`${message}
|
|
382
484
|
`);
|
|
383
485
|
}
|
|
486
|
+
function escapeXml(value) {
|
|
487
|
+
return String(value ?? "").replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&apos;");
|
|
488
|
+
}
|
|
489
|
+
function resolveJUnitThresholds(options) {
|
|
490
|
+
const base = options.junitThreshold ?? 0.7;
|
|
491
|
+
return {
|
|
492
|
+
tier1: options.junitThresholdTier1 ?? base,
|
|
493
|
+
tier2: options.junitThresholdTier2 ?? base,
|
|
494
|
+
tier3: options.junitThresholdTier3 ?? base
|
|
495
|
+
};
|
|
496
|
+
}
|
|
497
|
+
function buildSampleScoreMap(samples) {
|
|
498
|
+
return new Map((samples ?? []).map((sample) => [String(sample.id), Number(sample.score)]));
|
|
499
|
+
}
|
|
500
|
+
function collectScenarioIds(reports) {
|
|
501
|
+
const ids = [];
|
|
502
|
+
const seen = /* @__PURE__ */ new Set();
|
|
503
|
+
for (const sample of reports.tier1?.samples ?? []) {
|
|
504
|
+
const id = String(sample.id);
|
|
505
|
+
if (seen.has(id)) continue;
|
|
506
|
+
seen.add(id);
|
|
507
|
+
ids.push(id);
|
|
508
|
+
}
|
|
509
|
+
for (const sample of reports.tier2?.samples ?? []) {
|
|
510
|
+
const id = String(sample.id);
|
|
511
|
+
if (seen.has(id)) continue;
|
|
512
|
+
seen.add(id);
|
|
513
|
+
ids.push(id);
|
|
514
|
+
}
|
|
515
|
+
for (const sample of reports.tier3?.samples ?? []) {
|
|
516
|
+
const id = String(sample.id);
|
|
517
|
+
if (seen.has(id)) continue;
|
|
518
|
+
seen.add(id);
|
|
519
|
+
ids.push(id);
|
|
520
|
+
}
|
|
521
|
+
return ids;
|
|
522
|
+
}
|
|
523
|
+
function buildJUnitReport(args) {
|
|
524
|
+
const ids = collectScenarioIds(args.tierReports);
|
|
525
|
+
const tier1Scores = buildSampleScoreMap(args.tierReports.tier1?.samples);
|
|
526
|
+
const tier2Scores = buildSampleScoreMap(args.tierReports.tier2?.samples);
|
|
527
|
+
const tier3Scores = buildSampleScoreMap(args.tierReports.tier3?.samples);
|
|
528
|
+
const className = `traits.eval.${path2.basename(args.profilePath, path2.extname(args.profilePath))}`;
|
|
529
|
+
let failures = 0;
|
|
530
|
+
const testCases = [];
|
|
531
|
+
for (const id of ids) {
|
|
532
|
+
const reasons = [];
|
|
533
|
+
const scoreLines = [];
|
|
534
|
+
const tier1 = tier1Scores.get(id);
|
|
535
|
+
if (tier1 != null) {
|
|
536
|
+
scoreLines.push(`tier1=${tier1.toFixed(3)} threshold=${args.thresholds.tier1.toFixed(3)}`);
|
|
537
|
+
if (tier1 < args.thresholds.tier1) {
|
|
538
|
+
reasons.push(
|
|
539
|
+
`Tier 1 score ${tier1.toFixed(3)} below threshold ${args.thresholds.tier1.toFixed(3)}`
|
|
540
|
+
);
|
|
541
|
+
}
|
|
542
|
+
}
|
|
543
|
+
const tier2 = tier2Scores.get(id);
|
|
544
|
+
if (tier2 != null) {
|
|
545
|
+
scoreLines.push(`tier2=${tier2.toFixed(3)} threshold=${args.thresholds.tier2.toFixed(3)}`);
|
|
546
|
+
if (tier2 < args.thresholds.tier2) {
|
|
547
|
+
reasons.push(
|
|
548
|
+
`Tier 2 score ${tier2.toFixed(3)} below threshold ${args.thresholds.tier2.toFixed(3)}`
|
|
549
|
+
);
|
|
550
|
+
}
|
|
551
|
+
}
|
|
552
|
+
const tier3 = tier3Scores.get(id);
|
|
553
|
+
if (tier3 != null) {
|
|
554
|
+
scoreLines.push(`tier3=${tier3.toFixed(3)} threshold=${args.thresholds.tier3.toFixed(3)}`);
|
|
555
|
+
if (tier3 < args.thresholds.tier3) {
|
|
556
|
+
reasons.push(
|
|
557
|
+
`Tier 3 score ${tier3.toFixed(3)} below threshold ${args.thresholds.tier3.toFixed(3)}`
|
|
558
|
+
);
|
|
559
|
+
}
|
|
560
|
+
}
|
|
561
|
+
const testcase = [];
|
|
562
|
+
testcase.push(
|
|
563
|
+
` <testcase classname="${escapeXml(className)}" name="${escapeXml(id)}" time="0">`
|
|
564
|
+
);
|
|
565
|
+
if (reasons.length > 0) {
|
|
566
|
+
failures += 1;
|
|
567
|
+
testcase.push(
|
|
568
|
+
` <failure message="${escapeXml("traits eval threshold failure")}">${escapeXml(
|
|
569
|
+
reasons.join(" | ")
|
|
570
|
+
)}</failure>`
|
|
571
|
+
);
|
|
572
|
+
}
|
|
573
|
+
if (scoreLines.length > 0) {
|
|
574
|
+
testcase.push(` <system-out>${escapeXml(scoreLines.join(" | "))}</system-out>`);
|
|
575
|
+
}
|
|
576
|
+
testcase.push(" </testcase>");
|
|
577
|
+
testCases.push(testcase.join("\n"));
|
|
578
|
+
}
|
|
579
|
+
const xml = [
|
|
580
|
+
'<?xml version="1.0" encoding="UTF-8"?>',
|
|
581
|
+
"<testsuites>",
|
|
582
|
+
` <testsuite name="traits.eval" tests="${ids.length}" failures="${failures}" errors="0" skipped="0" time="0">`,
|
|
583
|
+
` <properties><property name="profile" value="${escapeXml(args.profilePath)}" /><property name="model" value="${escapeXml(args.model)}" /><property name="threshold_tier1" value="${args.thresholds.tier1.toFixed(3)}" /><property name="threshold_tier2" value="${args.thresholds.tier2.toFixed(3)}" /><property name="threshold_tier3" value="${args.thresholds.tier3.toFixed(3)}" /></properties>`,
|
|
584
|
+
...testCases,
|
|
585
|
+
" </testsuite>",
|
|
586
|
+
"</testsuites>"
|
|
587
|
+
].join("\n");
|
|
588
|
+
return {
|
|
589
|
+
xml,
|
|
590
|
+
tests: ids.length,
|
|
591
|
+
failures
|
|
592
|
+
};
|
|
593
|
+
}
|
|
384
594
|
async function runEval(args, io = process) {
|
|
385
595
|
const parsed = parseEvalArgs(args);
|
|
386
596
|
if ("error" in parsed) {
|
|
@@ -515,6 +725,8 @@ async function runEval(args, io = process) {
|
|
|
515
725
|
const payload = {
|
|
516
726
|
profile: profilePath,
|
|
517
727
|
model: options.model,
|
|
728
|
+
format: options.format,
|
|
729
|
+
suite: options.suite,
|
|
518
730
|
tier_requested: requestedTier,
|
|
519
731
|
tier_executed: tierResolution.tier_executed,
|
|
520
732
|
tier_resolution: tierResolution,
|
|
@@ -531,11 +743,22 @@ async function runEval(args, io = process) {
|
|
|
531
743
|
errors: evaluation.validation.errors.length
|
|
532
744
|
};
|
|
533
745
|
}
|
|
534
|
-
if (options.json) {
|
|
746
|
+
if (options.format === "json") {
|
|
535
747
|
io.stdout.write(`${JSON.stringify(payload, null, 2)}
|
|
536
748
|
`);
|
|
537
749
|
return 0;
|
|
538
750
|
}
|
|
751
|
+
if (options.format === "junit") {
|
|
752
|
+
const junit = buildJUnitReport({
|
|
753
|
+
profilePath,
|
|
754
|
+
model: options.model,
|
|
755
|
+
tierReports,
|
|
756
|
+
thresholds: resolveJUnitThresholds(options)
|
|
757
|
+
});
|
|
758
|
+
io.stdout.write(`${junit.xml}
|
|
759
|
+
`);
|
|
760
|
+
return junit.failures > 0 ? 1 : 0;
|
|
761
|
+
}
|
|
539
762
|
if (tierReports.tier1) {
|
|
540
763
|
io.stdout.write(`Tier 1 average score: ${tierReports.tier1.average_score.toFixed(3)}
|
|
541
764
|
`);
|
|
@@ -547,10 +770,16 @@ async function runEval(args, io = process) {
|
|
|
547
770
|
if (tierReports.tier2) {
|
|
548
771
|
io.stdout.write(`Tier 2 average score: ${tierReports.tier2.average_score.toFixed(3)}
|
|
549
772
|
`);
|
|
773
|
+
io.stdout.write(
|
|
774
|
+
"Note: Tier 2 embedding scores are directionally useful but sensitive to model granularity.\n"
|
|
775
|
+
);
|
|
550
776
|
}
|
|
551
777
|
if (tierReports.tier3) {
|
|
552
778
|
io.stdout.write(`Tier 3 average score: ${tierReports.tier3.average_score.toFixed(3)}
|
|
553
779
|
`);
|
|
780
|
+
io.stdout.write(
|
|
781
|
+
"Note: Tier 3 judge scores are noisy across runs. Do not use as a sole merge gate.\n"
|
|
782
|
+
);
|
|
554
783
|
}
|
|
555
784
|
if (baselineReport?.tier1) {
|
|
556
785
|
io.stdout.write(
|
|
@@ -578,12 +807,12 @@ async function runEval(args, io = process) {
|
|
|
578
807
|
return 0;
|
|
579
808
|
} catch (error) {
|
|
580
809
|
const typedError = error;
|
|
581
|
-
if ((typedError.code === "E_EVAL_TIER2_UNAVAILABLE" || typedError.code === "E_EVAL_TIER3_UNAVAILABLE") && !options.json) {
|
|
810
|
+
if ((typedError.code === "E_EVAL_TIER2_UNAVAILABLE" || typedError.code === "E_EVAL_TIER3_UNAVAILABLE") && options.format !== "json") {
|
|
582
811
|
io.stderr.write(`Error: ${typedError.message ?? "Evaluation tier unavailable."}
|
|
583
812
|
`);
|
|
584
813
|
return 2;
|
|
585
814
|
}
|
|
586
|
-
if ((typedError.code === "E_EVAL_TIER2_UNAVAILABLE" || typedError.code === "E_EVAL_TIER3_UNAVAILABLE") && options.json) {
|
|
815
|
+
if ((typedError.code === "E_EVAL_TIER2_UNAVAILABLE" || typedError.code === "E_EVAL_TIER3_UNAVAILABLE") && options.format === "json") {
|
|
587
816
|
io.stdout.write(
|
|
588
817
|
`${JSON.stringify(
|
|
589
818
|
{
|
|
@@ -599,7 +828,7 @@ async function runEval(args, io = process) {
|
|
|
599
828
|
}
|
|
600
829
|
const validation = typedError.validation;
|
|
601
830
|
if (typedError.code === "E_EVAL_VALIDATION" && validation) {
|
|
602
|
-
if (options.json) {
|
|
831
|
+
if (options.format === "json") {
|
|
603
832
|
io.stdout.write(
|
|
604
833
|
`${JSON.stringify(
|
|
605
834
|
{
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@traits-dev/cli",
|
|
3
|
-
"version": "0.2.0",
|
|
3
|
+
"version": "0.4.0",
|
|
4
4
|
"description": "traits.dev command-line interface for voice profile init, validate, compile, eval, and import workflows.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"traits-dev",
|
|
@@ -41,7 +41,7 @@
|
|
|
41
41
|
"provenance": true
|
|
42
42
|
},
|
|
43
43
|
"dependencies": {
|
|
44
|
-
"@traits-dev/core": "^0.
|
|
44
|
+
"@traits-dev/core": "^0.4.0"
|
|
45
45
|
},
|
|
46
46
|
"devDependencies": {
|
|
47
47
|
"@types/node": "^25.2.3",
|