@traits-dev/cli 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/traits.js +194 -7
- package/package.json +5 -3
package/README.md
CHANGED
package/dist/traits.js
CHANGED
|
@@ -211,6 +211,7 @@ import {
|
|
|
211
211
|
import {
|
|
212
212
|
detectEvalTierAvailability,
|
|
213
213
|
formatValidationResult as formatValidationResult2,
|
|
214
|
+
loadBuiltInEvalSuite,
|
|
214
215
|
resolveTierExecution,
|
|
215
216
|
runOfflineBaselineScaffold,
|
|
216
217
|
toValidationResultObject as toValidationResultObject2
|
|
@@ -224,6 +225,7 @@ function printEvalUsage(out = process.stderr) {
|
|
|
224
225
|
"Options:",
|
|
225
226
|
" --model <model> Model target (required)",
|
|
226
227
|
" --tier <1|2|3> Highest tier to run (default: highest available)",
|
|
228
|
+
" --suite <name> Built-in baseline suite: support|healthcare|developer",
|
|
227
229
|
" --provider <name> Judge provider for Tier 3: auto|openai|anthropic",
|
|
228
230
|
" --embedding-model <name> Embedding model for Tier 2 (OpenAI)",
|
|
229
231
|
" --judge-model <name> Judge model for Tier 3 provider",
|
|
@@ -236,6 +238,11 @@ function printEvalUsage(out = process.stderr) {
|
|
|
236
238
|
" --samples <path> JSON file with samples: [{ id, response }]",
|
|
237
239
|
" --scenarios <path> Alias for --samples in this scaffold",
|
|
238
240
|
" --json Output structured JSON",
|
|
241
|
+
" --format <text|json|junit> Output format (default: text)",
|
|
242
|
+
" --junit-threshold <num> Global JUnit pass threshold in [0,1] (default: 0.7)",
|
|
243
|
+
" --junit-threshold-tier1 <num> Tier 1 JUnit threshold override",
|
|
244
|
+
" --junit-threshold-tier2 <num> Tier 2 JUnit threshold override",
|
|
245
|
+
" --junit-threshold-tier3 <num> Tier 3 JUnit threshold override",
|
|
239
246
|
" --strict Treat validation warnings as errors",
|
|
240
247
|
" --verbose Include command metadata output",
|
|
241
248
|
" --no-color Disable colorized output",
|
|
@@ -251,6 +258,7 @@ function parseEvalArgs(args) {
|
|
|
251
258
|
profilePath: null,
|
|
252
259
|
model: null,
|
|
253
260
|
tier: null,
|
|
261
|
+
suite: null,
|
|
254
262
|
provider: "auto",
|
|
255
263
|
embeddingModel: null,
|
|
256
264
|
judgeModel: null,
|
|
@@ -260,6 +268,11 @@ function parseEvalArgs(args) {
|
|
|
260
268
|
maxRetries: null,
|
|
261
269
|
retryBaseMs: null,
|
|
262
270
|
json: false,
|
|
271
|
+
format: "text",
|
|
272
|
+
junitThreshold: null,
|
|
273
|
+
junitThresholdTier1: null,
|
|
274
|
+
junitThresholdTier2: null,
|
|
275
|
+
junitThresholdTier3: null,
|
|
263
276
|
strict: false,
|
|
264
277
|
verbose: false,
|
|
265
278
|
noColor: false,
|
|
@@ -274,6 +287,7 @@ function parseEvalArgs(args) {
|
|
|
274
287
|
const arg = args[index];
|
|
275
288
|
if (arg === "--json") {
|
|
276
289
|
result.json = true;
|
|
290
|
+
result.format = "json";
|
|
277
291
|
continue;
|
|
278
292
|
}
|
|
279
293
|
if (arg === "--strict") {
|
|
@@ -300,14 +314,18 @@ function parseEvalArgs(args) {
|
|
|
300
314
|
result.constraintImpact = true;
|
|
301
315
|
continue;
|
|
302
316
|
}
|
|
303
|
-
if (arg === "--model" || arg === "--tier" || arg === "--provider" || arg === "--embedding-model" || arg === "--judge-model" || arg === "--openai-base-url" || arg === "--anthropic-base-url" || arg === "--timeout-ms" || arg === "--max-retries" || arg === "--retry-base-ms" || arg === "--response" || arg === "--samples" || arg === "--scenarios") {
|
|
317
|
+
if (arg === "--model" || arg === "--tier" || arg === "--suite" || arg === "--provider" || arg === "--format" || arg === "--embedding-model" || arg === "--judge-model" || arg === "--openai-base-url" || arg === "--anthropic-base-url" || arg === "--timeout-ms" || arg === "--max-retries" || arg === "--retry-base-ms" || arg === "--junit-threshold" || arg === "--junit-threshold-tier1" || arg === "--junit-threshold-tier2" || arg === "--junit-threshold-tier3" || arg === "--response" || arg === "--samples" || arg === "--scenarios") {
|
|
304
318
|
const value = args[index + 1];
|
|
305
319
|
if (!value) return { error: `Missing value for "${arg}"` };
|
|
306
320
|
if (arg === "--model") result.model = value;
|
|
307
321
|
if (arg === "--tier") result.tier = Number(value);
|
|
322
|
+
if (arg === "--suite") result.suite = String(value).toLowerCase();
|
|
308
323
|
if (arg === "--provider") {
|
|
309
324
|
result.provider = String(value).toLowerCase();
|
|
310
325
|
}
|
|
326
|
+
if (arg === "--format") {
|
|
327
|
+
result.format = String(value).toLowerCase();
|
|
328
|
+
}
|
|
311
329
|
if (arg === "--embedding-model") result.embeddingModel = value;
|
|
312
330
|
if (arg === "--judge-model") result.judgeModel = value;
|
|
313
331
|
if (arg === "--openai-base-url") result.openaiBaseUrl = value;
|
|
@@ -315,6 +333,10 @@ function parseEvalArgs(args) {
|
|
|
315
333
|
if (arg === "--timeout-ms") result.timeoutMs = Number(value);
|
|
316
334
|
if (arg === "--max-retries") result.maxRetries = Number(value);
|
|
317
335
|
if (arg === "--retry-base-ms") result.retryBaseMs = Number(value);
|
|
336
|
+
if (arg === "--junit-threshold") result.junitThreshold = Number(value);
|
|
337
|
+
if (arg === "--junit-threshold-tier1") result.junitThresholdTier1 = Number(value);
|
|
338
|
+
if (arg === "--junit-threshold-tier2") result.junitThresholdTier2 = Number(value);
|
|
339
|
+
if (arg === "--junit-threshold-tier3") result.junitThresholdTier3 = Number(value);
|
|
318
340
|
if (arg === "--response") result.responses.push(value);
|
|
319
341
|
if (arg === "--samples" || arg === "--scenarios") result.samplesPath = value;
|
|
320
342
|
index += 1;
|
|
@@ -338,6 +360,20 @@ function parseEvalArgs(args) {
|
|
|
338
360
|
if (!["auto", "openai", "anthropic"].includes(result.provider)) {
|
|
339
361
|
return { error: 'Invalid "--provider" value. Expected auto, openai, or anthropic.' };
|
|
340
362
|
}
|
|
363
|
+
if (result.suite != null && !["support", "healthcare", "developer"].includes(
|
|
364
|
+
result.suite
|
|
365
|
+
)) {
|
|
366
|
+
return { error: 'Invalid "--suite" value. Expected support, healthcare, or developer.' };
|
|
367
|
+
}
|
|
368
|
+
if (result.suite != null && result.samplesPath != null) {
|
|
369
|
+
return { error: 'Use either "--suite" or "--samples/--scenarios", not both.' };
|
|
370
|
+
}
|
|
371
|
+
if (result.suite != null && result.responses.length > 0) {
|
|
372
|
+
return { error: 'Use either "--suite" or "--response", not both.' };
|
|
373
|
+
}
|
|
374
|
+
if (!["text", "json", "junit"].includes(result.format)) {
|
|
375
|
+
return { error: 'Invalid "--format" value. Expected text, json, or junit.' };
|
|
376
|
+
}
|
|
341
377
|
if (result.timeoutMs != null && (!Number.isInteger(result.timeoutMs) || result.timeoutMs < 0)) {
|
|
342
378
|
return { error: 'Invalid "--timeout-ms" value. Expected a non-negative integer.' };
|
|
343
379
|
}
|
|
@@ -347,9 +383,33 @@ function parseEvalArgs(args) {
|
|
|
347
383
|
if (result.retryBaseMs != null && (!Number.isInteger(result.retryBaseMs) || result.retryBaseMs < 0)) {
|
|
348
384
|
return { error: 'Invalid "--retry-base-ms" value. Expected a non-negative integer.' };
|
|
349
385
|
}
|
|
386
|
+
for (const [flag, value] of [
|
|
387
|
+
["--junit-threshold", result.junitThreshold],
|
|
388
|
+
["--junit-threshold-tier1", result.junitThresholdTier1],
|
|
389
|
+
["--junit-threshold-tier2", result.junitThresholdTier2],
|
|
390
|
+
["--junit-threshold-tier3", result.junitThresholdTier3]
|
|
391
|
+
]) {
|
|
392
|
+
if (value == null) continue;
|
|
393
|
+
if (!Number.isFinite(value) || value < 0 || value > 1) {
|
|
394
|
+
return { error: `Invalid "${flag}" value. Expected a number in [0, 1].` };
|
|
395
|
+
}
|
|
396
|
+
}
|
|
350
397
|
return { value: result };
|
|
351
398
|
}
|
|
352
399
|
function loadSamples(options, cwd) {
|
|
400
|
+
if (options.suite) {
|
|
401
|
+
const suite = loadBuiltInEvalSuite(options.suite);
|
|
402
|
+
if (!suite) {
|
|
403
|
+
throw new Error(
|
|
404
|
+
`Unknown suite "${options.suite}". Expected support, healthcare, or developer.`
|
|
405
|
+
);
|
|
406
|
+
}
|
|
407
|
+
return suite.scenarios.map((scenario) => ({
|
|
408
|
+
id: scenario.id,
|
|
409
|
+
prompt: scenario.messages.map((message) => `${message.role}: ${message.content}`).join("\n"),
|
|
410
|
+
response: scenario.expected_behavior ?? ""
|
|
411
|
+
}));
|
|
412
|
+
}
|
|
353
413
|
if (options.samplesPath) {
|
|
354
414
|
const sampleFile = path2.resolve(cwd, options.samplesPath);
|
|
355
415
|
const parsed = JSON.parse(fs.readFileSync(sampleFile, "utf8"));
|
|
@@ -377,10 +437,118 @@ function loadSamples(options, cwd) {
|
|
|
377
437
|
}));
|
|
378
438
|
}
|
|
379
439
|
function writeProgress(io, options, message) {
|
|
380
|
-
if (options.json) return;
|
|
440
|
+
if (options.format !== "text") return;
|
|
381
441
|
io.stderr.write(`${message}
|
|
382
442
|
`);
|
|
383
443
|
}
|
|
444
|
+
function escapeXml(value) {
|
|
445
|
+
return String(value ?? "").replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&apos;");
|
|
446
|
+
}
|
|
447
|
+
function resolveJUnitThresholds(options) {
|
|
448
|
+
const base = options.junitThreshold ?? 0.7;
|
|
449
|
+
return {
|
|
450
|
+
tier1: options.junitThresholdTier1 ?? base,
|
|
451
|
+
tier2: options.junitThresholdTier2 ?? base,
|
|
452
|
+
tier3: options.junitThresholdTier3 ?? base
|
|
453
|
+
};
|
|
454
|
+
}
|
|
455
|
+
function buildSampleScoreMap(samples) {
|
|
456
|
+
return new Map((samples ?? []).map((sample) => [String(sample.id), Number(sample.score)]));
|
|
457
|
+
}
|
|
458
|
+
function collectScenarioIds(reports) {
|
|
459
|
+
const ids = [];
|
|
460
|
+
const seen = /* @__PURE__ */ new Set();
|
|
461
|
+
for (const sample of reports.tier1?.samples ?? []) {
|
|
462
|
+
const id = String(sample.id);
|
|
463
|
+
if (seen.has(id)) continue;
|
|
464
|
+
seen.add(id);
|
|
465
|
+
ids.push(id);
|
|
466
|
+
}
|
|
467
|
+
for (const sample of reports.tier2?.samples ?? []) {
|
|
468
|
+
const id = String(sample.id);
|
|
469
|
+
if (seen.has(id)) continue;
|
|
470
|
+
seen.add(id);
|
|
471
|
+
ids.push(id);
|
|
472
|
+
}
|
|
473
|
+
for (const sample of reports.tier3?.samples ?? []) {
|
|
474
|
+
const id = String(sample.id);
|
|
475
|
+
if (seen.has(id)) continue;
|
|
476
|
+
seen.add(id);
|
|
477
|
+
ids.push(id);
|
|
478
|
+
}
|
|
479
|
+
return ids;
|
|
480
|
+
}
|
|
481
|
+
function buildJUnitReport(args) {
|
|
482
|
+
const ids = collectScenarioIds(args.tierReports);
|
|
483
|
+
const tier1Scores = buildSampleScoreMap(args.tierReports.tier1?.samples);
|
|
484
|
+
const tier2Scores = buildSampleScoreMap(args.tierReports.tier2?.samples);
|
|
485
|
+
const tier3Scores = buildSampleScoreMap(args.tierReports.tier3?.samples);
|
|
486
|
+
const className = `traits.eval.${path2.basename(args.profilePath, path2.extname(args.profilePath))}`;
|
|
487
|
+
let failures = 0;
|
|
488
|
+
const testCases = [];
|
|
489
|
+
for (const id of ids) {
|
|
490
|
+
const reasons = [];
|
|
491
|
+
const scoreLines = [];
|
|
492
|
+
const tier1 = tier1Scores.get(id);
|
|
493
|
+
if (tier1 != null) {
|
|
494
|
+
scoreLines.push(`tier1=${tier1.toFixed(3)} threshold=${args.thresholds.tier1.toFixed(3)}`);
|
|
495
|
+
if (tier1 < args.thresholds.tier1) {
|
|
496
|
+
reasons.push(
|
|
497
|
+
`Tier 1 score ${tier1.toFixed(3)} below threshold ${args.thresholds.tier1.toFixed(3)}`
|
|
498
|
+
);
|
|
499
|
+
}
|
|
500
|
+
}
|
|
501
|
+
const tier2 = tier2Scores.get(id);
|
|
502
|
+
if (tier2 != null) {
|
|
503
|
+
scoreLines.push(`tier2=${tier2.toFixed(3)} threshold=${args.thresholds.tier2.toFixed(3)}`);
|
|
504
|
+
if (tier2 < args.thresholds.tier2) {
|
|
505
|
+
reasons.push(
|
|
506
|
+
`Tier 2 score ${tier2.toFixed(3)} below threshold ${args.thresholds.tier2.toFixed(3)}`
|
|
507
|
+
);
|
|
508
|
+
}
|
|
509
|
+
}
|
|
510
|
+
const tier3 = tier3Scores.get(id);
|
|
511
|
+
if (tier3 != null) {
|
|
512
|
+
scoreLines.push(`tier3=${tier3.toFixed(3)} threshold=${args.thresholds.tier3.toFixed(3)}`);
|
|
513
|
+
if (tier3 < args.thresholds.tier3) {
|
|
514
|
+
reasons.push(
|
|
515
|
+
`Tier 3 score ${tier3.toFixed(3)} below threshold ${args.thresholds.tier3.toFixed(3)}`
|
|
516
|
+
);
|
|
517
|
+
}
|
|
518
|
+
}
|
|
519
|
+
const testcase = [];
|
|
520
|
+
testcase.push(
|
|
521
|
+
` <testcase classname="${escapeXml(className)}" name="${escapeXml(id)}" time="0">`
|
|
522
|
+
);
|
|
523
|
+
if (reasons.length > 0) {
|
|
524
|
+
failures += 1;
|
|
525
|
+
testcase.push(
|
|
526
|
+
` <failure message="${escapeXml("traits eval threshold failure")}">${escapeXml(
|
|
527
|
+
reasons.join(" | ")
|
|
528
|
+
)}</failure>`
|
|
529
|
+
);
|
|
530
|
+
}
|
|
531
|
+
if (scoreLines.length > 0) {
|
|
532
|
+
testcase.push(` <system-out>${escapeXml(scoreLines.join(" | "))}</system-out>`);
|
|
533
|
+
}
|
|
534
|
+
testcase.push(" </testcase>");
|
|
535
|
+
testCases.push(testcase.join("\n"));
|
|
536
|
+
}
|
|
537
|
+
const xml = [
|
|
538
|
+
'<?xml version="1.0" encoding="UTF-8"?>',
|
|
539
|
+
"<testsuites>",
|
|
540
|
+
` <testsuite name="traits.eval" tests="${ids.length}" failures="${failures}" errors="0" skipped="0" time="0">`,
|
|
541
|
+
` <properties><property name="profile" value="${escapeXml(args.profilePath)}" /><property name="model" value="${escapeXml(args.model)}" /><property name="threshold_tier1" value="${args.thresholds.tier1.toFixed(3)}" /><property name="threshold_tier2" value="${args.thresholds.tier2.toFixed(3)}" /><property name="threshold_tier3" value="${args.thresholds.tier3.toFixed(3)}" /></properties>`,
|
|
542
|
+
...testCases,
|
|
543
|
+
" </testsuite>",
|
|
544
|
+
"</testsuites>"
|
|
545
|
+
].join("\n");
|
|
546
|
+
return {
|
|
547
|
+
xml,
|
|
548
|
+
tests: ids.length,
|
|
549
|
+
failures
|
|
550
|
+
};
|
|
551
|
+
}
|
|
384
552
|
async function runEval(args, io = process) {
|
|
385
553
|
const parsed = parseEvalArgs(args);
|
|
386
554
|
if ("error" in parsed) {
|
|
@@ -515,6 +683,8 @@ async function runEval(args, io = process) {
|
|
|
515
683
|
const payload = {
|
|
516
684
|
profile: profilePath,
|
|
517
685
|
model: options.model,
|
|
686
|
+
format: options.format,
|
|
687
|
+
suite: options.suite,
|
|
518
688
|
tier_requested: requestedTier,
|
|
519
689
|
tier_executed: tierResolution.tier_executed,
|
|
520
690
|
tier_resolution: tierResolution,
|
|
@@ -531,11 +701,22 @@ async function runEval(args, io = process) {
|
|
|
531
701
|
errors: evaluation.validation.errors.length
|
|
532
702
|
};
|
|
533
703
|
}
|
|
534
|
-
if (options.json) {
|
|
704
|
+
if (options.format === "json") {
|
|
535
705
|
io.stdout.write(`${JSON.stringify(payload, null, 2)}
|
|
536
706
|
`);
|
|
537
707
|
return 0;
|
|
538
708
|
}
|
|
709
|
+
if (options.format === "junit") {
|
|
710
|
+
const junit = buildJUnitReport({
|
|
711
|
+
profilePath,
|
|
712
|
+
model: options.model,
|
|
713
|
+
tierReports,
|
|
714
|
+
thresholds: resolveJUnitThresholds(options)
|
|
715
|
+
});
|
|
716
|
+
io.stdout.write(`${junit.xml}
|
|
717
|
+
`);
|
|
718
|
+
return junit.failures > 0 ? 1 : 0;
|
|
719
|
+
}
|
|
539
720
|
if (tierReports.tier1) {
|
|
540
721
|
io.stdout.write(`Tier 1 average score: ${tierReports.tier1.average_score.toFixed(3)}
|
|
541
722
|
`);
|
|
@@ -547,10 +728,16 @@ async function runEval(args, io = process) {
|
|
|
547
728
|
if (tierReports.tier2) {
|
|
548
729
|
io.stdout.write(`Tier 2 average score: ${tierReports.tier2.average_score.toFixed(3)}
|
|
549
730
|
`);
|
|
731
|
+
io.stdout.write(
|
|
732
|
+
"Note: Tier 2 embedding scores are directionally useful but sensitive to model granularity.\n"
|
|
733
|
+
);
|
|
550
734
|
}
|
|
551
735
|
if (tierReports.tier3) {
|
|
552
736
|
io.stdout.write(`Tier 3 average score: ${tierReports.tier3.average_score.toFixed(3)}
|
|
553
737
|
`);
|
|
738
|
+
io.stdout.write(
|
|
739
|
+
"Note: Tier 3 judge scores are noisy across runs. Do not use as a sole merge gate.\n"
|
|
740
|
+
);
|
|
554
741
|
}
|
|
555
742
|
if (baselineReport?.tier1) {
|
|
556
743
|
io.stdout.write(
|
|
@@ -578,12 +765,12 @@ async function runEval(args, io = process) {
|
|
|
578
765
|
return 0;
|
|
579
766
|
} catch (error) {
|
|
580
767
|
const typedError = error;
|
|
581
|
-
if ((typedError.code === "E_EVAL_TIER2_UNAVAILABLE" || typedError.code === "E_EVAL_TIER3_UNAVAILABLE") && !options.json) {
|
|
768
|
+
if ((typedError.code === "E_EVAL_TIER2_UNAVAILABLE" || typedError.code === "E_EVAL_TIER3_UNAVAILABLE") && options.format !== "json") {
|
|
582
769
|
io.stderr.write(`Error: ${typedError.message ?? "Evaluation tier unavailable."}
|
|
583
770
|
`);
|
|
584
771
|
return 2;
|
|
585
772
|
}
|
|
586
|
-
if ((typedError.code === "E_EVAL_TIER2_UNAVAILABLE" || typedError.code === "E_EVAL_TIER3_UNAVAILABLE") && options.json) {
|
|
773
|
+
if ((typedError.code === "E_EVAL_TIER2_UNAVAILABLE" || typedError.code === "E_EVAL_TIER3_UNAVAILABLE") && options.format === "json") {
|
|
587
774
|
io.stdout.write(
|
|
588
775
|
`${JSON.stringify(
|
|
589
776
|
{
|
|
@@ -599,7 +786,7 @@ async function runEval(args, io = process) {
|
|
|
599
786
|
}
|
|
600
787
|
const validation = typedError.validation;
|
|
601
788
|
if (typedError.code === "E_EVAL_VALIDATION" && validation) {
|
|
602
|
-
if (options.json) {
|
|
789
|
+
if (options.format === "json") {
|
|
603
790
|
io.stdout.write(
|
|
604
791
|
`${JSON.stringify(
|
|
605
792
|
{
|
|
@@ -1357,7 +1544,7 @@ function printRootUsage(out = process.stdout) {
|
|
|
1357
1544
|
" compile <profile-path> Compile a profile for a target model",
|
|
1358
1545
|
" eval <profile-path> Evaluate profile responses (Tier 1 scaffold)",
|
|
1359
1546
|
" import [prompt-path] Import a profile from an existing system prompt",
|
|
1360
|
-
" validate <profile-path> Validate a
|
|
1547
|
+
" validate <profile-path> Validate a voice profile",
|
|
1361
1548
|
"",
|
|
1362
1549
|
"Global flags:",
|
|
1363
1550
|
" --json Output JSON where supported",
|
package/package.json
CHANGED
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@traits-dev/cli",
|
|
3
|
-
"version": "0.1.0",
|
|
4
|
-
"description": "traits.dev command-line interface for profile init, validate, compile, eval, and import workflows.",
|
|
3
|
+
"version": "0.3.0",
|
|
4
|
+
"description": "traits.dev command-line interface for voice profile init, validate, compile, eval, and import workflows.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"traits-dev",
|
|
7
7
|
"cli",
|
|
8
|
+
"voice-profile",
|
|
9
|
+
"behavioral-policy",
|
|
8
10
|
"llm",
|
|
9
11
|
"prompt-engineering",
|
|
10
12
|
"evaluation"
|
|
@@ -39,7 +41,7 @@
|
|
|
39
41
|
"provenance": true
|
|
40
42
|
},
|
|
41
43
|
"dependencies": {
|
|
42
|
-
"@traits-dev/core": "^0.
|
|
44
|
+
"@traits-dev/core": "^0.3.0"
|
|
43
45
|
},
|
|
44
46
|
"devDependencies": {
|
|
45
47
|
"@types/node": "^25.2.3",
|