@traits-dev/cli 0.2.0 → 0.3.0

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (2)
  1. package/dist/traits.js +193 -6
  2. package/package.json +2 -2
package/dist/traits.js CHANGED
@@ -211,6 +211,7 @@ import {
211
211
  import {
212
212
  detectEvalTierAvailability,
213
213
  formatValidationResult as formatValidationResult2,
214
+ loadBuiltInEvalSuite,
214
215
  resolveTierExecution,
215
216
  runOfflineBaselineScaffold,
216
217
  toValidationResultObject as toValidationResultObject2
@@ -224,6 +225,7 @@ function printEvalUsage(out = process.stderr) {
224
225
  "Options:",
225
226
  " --model <model> Model target (required)",
226
227
  " --tier <1|2|3> Highest tier to run (default: highest available)",
228
+ " --suite <name> Built-in baseline suite: support|healthcare|developer",
227
229
  " --provider <name> Judge provider for Tier 3: auto|openai|anthropic",
228
230
  " --embedding-model <name> Embedding model for Tier 2 (OpenAI)",
229
231
  " --judge-model <name> Judge model for Tier 3 provider",
@@ -236,6 +238,11 @@ function printEvalUsage(out = process.stderr) {
236
238
  " --samples <path> JSON file with samples: [{ id, response }]",
237
239
  " --scenarios <path> Alias for --samples in this scaffold",
238
240
  " --json Output structured JSON",
241
+ " --format <text|json|junit> Output format (default: text)",
242
+ " --junit-threshold <num> Global JUnit pass threshold in [0,1] (default: 0.7)",
243
+ " --junit-threshold-tier1 <num> Tier 1 JUnit threshold override",
244
+ " --junit-threshold-tier2 <num> Tier 2 JUnit threshold override",
245
+ " --junit-threshold-tier3 <num> Tier 3 JUnit threshold override",
239
246
  " --strict Treat validation warnings as errors",
240
247
  " --verbose Include command metadata output",
241
248
  " --no-color Disable colorized output",
@@ -251,6 +258,7 @@ function parseEvalArgs(args) {
251
258
  profilePath: null,
252
259
  model: null,
253
260
  tier: null,
261
+ suite: null,
254
262
  provider: "auto",
255
263
  embeddingModel: null,
256
264
  judgeModel: null,
@@ -260,6 +268,11 @@ function parseEvalArgs(args) {
260
268
  maxRetries: null,
261
269
  retryBaseMs: null,
262
270
  json: false,
271
+ format: "text",
272
+ junitThreshold: null,
273
+ junitThresholdTier1: null,
274
+ junitThresholdTier2: null,
275
+ junitThresholdTier3: null,
263
276
  strict: false,
264
277
  verbose: false,
265
278
  noColor: false,
@@ -274,6 +287,7 @@ function parseEvalArgs(args) {
274
287
  const arg = args[index];
275
288
  if (arg === "--json") {
276
289
  result.json = true;
290
+ result.format = "json";
277
291
  continue;
278
292
  }
279
293
  if (arg === "--strict") {
@@ -300,14 +314,18 @@ function parseEvalArgs(args) {
300
314
  result.constraintImpact = true;
301
315
  continue;
302
316
  }
303
- if (arg === "--model" || arg === "--tier" || arg === "--provider" || arg === "--embedding-model" || arg === "--judge-model" || arg === "--openai-base-url" || arg === "--anthropic-base-url" || arg === "--timeout-ms" || arg === "--max-retries" || arg === "--retry-base-ms" || arg === "--response" || arg === "--samples" || arg === "--scenarios") {
317
+ if (arg === "--model" || arg === "--tier" || arg === "--suite" || arg === "--provider" || arg === "--format" || arg === "--embedding-model" || arg === "--judge-model" || arg === "--openai-base-url" || arg === "--anthropic-base-url" || arg === "--timeout-ms" || arg === "--max-retries" || arg === "--retry-base-ms" || arg === "--junit-threshold" || arg === "--junit-threshold-tier1" || arg === "--junit-threshold-tier2" || arg === "--junit-threshold-tier3" || arg === "--response" || arg === "--samples" || arg === "--scenarios") {
304
318
  const value = args[index + 1];
305
319
  if (!value) return { error: `Missing value for "${arg}"` };
306
320
  if (arg === "--model") result.model = value;
307
321
  if (arg === "--tier") result.tier = Number(value);
322
+ if (arg === "--suite") result.suite = String(value).toLowerCase();
308
323
  if (arg === "--provider") {
309
324
  result.provider = String(value).toLowerCase();
310
325
  }
326
+ if (arg === "--format") {
327
+ result.format = String(value).toLowerCase();
328
+ }
311
329
  if (arg === "--embedding-model") result.embeddingModel = value;
312
330
  if (arg === "--judge-model") result.judgeModel = value;
313
331
  if (arg === "--openai-base-url") result.openaiBaseUrl = value;
@@ -315,6 +333,10 @@ function parseEvalArgs(args) {
315
333
  if (arg === "--timeout-ms") result.timeoutMs = Number(value);
316
334
  if (arg === "--max-retries") result.maxRetries = Number(value);
317
335
  if (arg === "--retry-base-ms") result.retryBaseMs = Number(value);
336
+ if (arg === "--junit-threshold") result.junitThreshold = Number(value);
337
+ if (arg === "--junit-threshold-tier1") result.junitThresholdTier1 = Number(value);
338
+ if (arg === "--junit-threshold-tier2") result.junitThresholdTier2 = Number(value);
339
+ if (arg === "--junit-threshold-tier3") result.junitThresholdTier3 = Number(value);
318
340
  if (arg === "--response") result.responses.push(value);
319
341
  if (arg === "--samples" || arg === "--scenarios") result.samplesPath = value;
320
342
  index += 1;
@@ -338,6 +360,20 @@ function parseEvalArgs(args) {
338
360
  if (!["auto", "openai", "anthropic"].includes(result.provider)) {
339
361
  return { error: 'Invalid "--provider" value. Expected auto, openai, or anthropic.' };
340
362
  }
363
+ if (result.suite != null && !["support", "healthcare", "developer"].includes(
364
+ result.suite
365
+ )) {
366
+ return { error: 'Invalid "--suite" value. Expected support, healthcare, or developer.' };
367
+ }
368
+ if (result.suite != null && result.samplesPath != null) {
369
+ return { error: 'Use either "--suite" or "--samples/--scenarios", not both.' };
370
+ }
371
+ if (result.suite != null && result.responses.length > 0) {
372
+ return { error: 'Use either "--suite" or "--response", not both.' };
373
+ }
374
+ if (!["text", "json", "junit"].includes(result.format)) {
375
+ return { error: 'Invalid "--format" value. Expected text, json, or junit.' };
376
+ }
341
377
  if (result.timeoutMs != null && (!Number.isInteger(result.timeoutMs) || result.timeoutMs < 0)) {
342
378
  return { error: 'Invalid "--timeout-ms" value. Expected a non-negative integer.' };
343
379
  }
@@ -347,9 +383,33 @@ function parseEvalArgs(args) {
347
383
  if (result.retryBaseMs != null && (!Number.isInteger(result.retryBaseMs) || result.retryBaseMs < 0)) {
348
384
  return { error: 'Invalid "--retry-base-ms" value. Expected a non-negative integer.' };
349
385
  }
386
+ for (const [flag, value] of [
387
+ ["--junit-threshold", result.junitThreshold],
388
+ ["--junit-threshold-tier1", result.junitThresholdTier1],
389
+ ["--junit-threshold-tier2", result.junitThresholdTier2],
390
+ ["--junit-threshold-tier3", result.junitThresholdTier3]
391
+ ]) {
392
+ if (value == null) continue;
393
+ if (!Number.isFinite(value) || value < 0 || value > 1) {
394
+ return { error: `Invalid "${flag}" value. Expected a number in [0, 1].` };
395
+ }
396
+ }
350
397
  return { value: result };
351
398
  }
352
399
  function loadSamples(options, cwd) {
400
+ if (options.suite) {
401
+ const suite = loadBuiltInEvalSuite(options.suite);
402
+ if (!suite) {
403
+ throw new Error(
404
+ `Unknown suite "${options.suite}". Expected support, healthcare, or developer.`
405
+ );
406
+ }
407
+ return suite.scenarios.map((scenario) => ({
408
+ id: scenario.id,
409
+ prompt: scenario.messages.map((message) => `${message.role}: ${message.content}`).join("\n"),
410
+ response: scenario.expected_behavior ?? ""
411
+ }));
412
+ }
353
413
  if (options.samplesPath) {
354
414
  const sampleFile = path2.resolve(cwd, options.samplesPath);
355
415
  const parsed = JSON.parse(fs.readFileSync(sampleFile, "utf8"));
@@ -377,10 +437,118 @@ function loadSamples(options, cwd) {
377
437
  }));
378
438
  }
379
439
  function writeProgress(io, options, message) {
380
- if (options.json) return;
440
+ if (options.format !== "text") return;
381
441
  io.stderr.write(`${message}
382
442
  `);
383
443
  }
444
+ function escapeXml(value) {
445
+ return String(value ?? "").replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&apos;");
446
+ }
447
+ function resolveJUnitThresholds(options) {
448
+ const base = options.junitThreshold ?? 0.7;
449
+ return {
450
+ tier1: options.junitThresholdTier1 ?? base,
451
+ tier2: options.junitThresholdTier2 ?? base,
452
+ tier3: options.junitThresholdTier3 ?? base
453
+ };
454
+ }
455
+ function buildSampleScoreMap(samples) {
456
+ return new Map((samples ?? []).map((sample) => [String(sample.id), Number(sample.score)]));
457
+ }
458
+ function collectScenarioIds(reports) {
459
+ const ids = [];
460
+ const seen = /* @__PURE__ */ new Set();
461
+ for (const sample of reports.tier1?.samples ?? []) {
462
+ const id = String(sample.id);
463
+ if (seen.has(id)) continue;
464
+ seen.add(id);
465
+ ids.push(id);
466
+ }
467
+ for (const sample of reports.tier2?.samples ?? []) {
468
+ const id = String(sample.id);
469
+ if (seen.has(id)) continue;
470
+ seen.add(id);
471
+ ids.push(id);
472
+ }
473
+ for (const sample of reports.tier3?.samples ?? []) {
474
+ const id = String(sample.id);
475
+ if (seen.has(id)) continue;
476
+ seen.add(id);
477
+ ids.push(id);
478
+ }
479
+ return ids;
480
+ }
481
+ function buildJUnitReport(args) {
482
+ const ids = collectScenarioIds(args.tierReports);
483
+ const tier1Scores = buildSampleScoreMap(args.tierReports.tier1?.samples);
484
+ const tier2Scores = buildSampleScoreMap(args.tierReports.tier2?.samples);
485
+ const tier3Scores = buildSampleScoreMap(args.tierReports.tier3?.samples);
486
+ const className = `traits.eval.${path2.basename(args.profilePath, path2.extname(args.profilePath))}`;
487
+ let failures = 0;
488
+ const testCases = [];
489
+ for (const id of ids) {
490
+ const reasons = [];
491
+ const scoreLines = [];
492
+ const tier1 = tier1Scores.get(id);
493
+ if (tier1 != null) {
494
+ scoreLines.push(`tier1=${tier1.toFixed(3)} threshold=${args.thresholds.tier1.toFixed(3)}`);
495
+ if (tier1 < args.thresholds.tier1) {
496
+ reasons.push(
497
+ `Tier 1 score ${tier1.toFixed(3)} below threshold ${args.thresholds.tier1.toFixed(3)}`
498
+ );
499
+ }
500
+ }
501
+ const tier2 = tier2Scores.get(id);
502
+ if (tier2 != null) {
503
+ scoreLines.push(`tier2=${tier2.toFixed(3)} threshold=${args.thresholds.tier2.toFixed(3)}`);
504
+ if (tier2 < args.thresholds.tier2) {
505
+ reasons.push(
506
+ `Tier 2 score ${tier2.toFixed(3)} below threshold ${args.thresholds.tier2.toFixed(3)}`
507
+ );
508
+ }
509
+ }
510
+ const tier3 = tier3Scores.get(id);
511
+ if (tier3 != null) {
512
+ scoreLines.push(`tier3=${tier3.toFixed(3)} threshold=${args.thresholds.tier3.toFixed(3)}`);
513
+ if (tier3 < args.thresholds.tier3) {
514
+ reasons.push(
515
+ `Tier 3 score ${tier3.toFixed(3)} below threshold ${args.thresholds.tier3.toFixed(3)}`
516
+ );
517
+ }
518
+ }
519
+ const testcase = [];
520
+ testcase.push(
521
+ ` <testcase classname="${escapeXml(className)}" name="${escapeXml(id)}" time="0">`
522
+ );
523
+ if (reasons.length > 0) {
524
+ failures += 1;
525
+ testcase.push(
526
+ ` <failure message="${escapeXml("traits eval threshold failure")}">${escapeXml(
527
+ reasons.join(" | ")
528
+ )}</failure>`
529
+ );
530
+ }
531
+ if (scoreLines.length > 0) {
532
+ testcase.push(` <system-out>${escapeXml(scoreLines.join(" | "))}</system-out>`);
533
+ }
534
+ testcase.push(" </testcase>");
535
+ testCases.push(testcase.join("\n"));
536
+ }
537
+ const xml = [
538
+ '<?xml version="1.0" encoding="UTF-8"?>',
539
+ "<testsuites>",
540
+ ` <testsuite name="traits.eval" tests="${ids.length}" failures="${failures}" errors="0" skipped="0" time="0">`,
541
+ ` <properties><property name="profile" value="${escapeXml(args.profilePath)}" /><property name="model" value="${escapeXml(args.model)}" /><property name="threshold_tier1" value="${args.thresholds.tier1.toFixed(3)}" /><property name="threshold_tier2" value="${args.thresholds.tier2.toFixed(3)}" /><property name="threshold_tier3" value="${args.thresholds.tier3.toFixed(3)}" /></properties>`,
542
+ ...testCases,
543
+ " </testsuite>",
544
+ "</testsuites>"
545
+ ].join("\n");
546
+ return {
547
+ xml,
548
+ tests: ids.length,
549
+ failures
550
+ };
551
+ }
384
552
  async function runEval(args, io = process) {
385
553
  const parsed = parseEvalArgs(args);
386
554
  if ("error" in parsed) {
@@ -515,6 +683,8 @@ async function runEval(args, io = process) {
515
683
  const payload = {
516
684
  profile: profilePath,
517
685
  model: options.model,
686
+ format: options.format,
687
+ suite: options.suite,
518
688
  tier_requested: requestedTier,
519
689
  tier_executed: tierResolution.tier_executed,
520
690
  tier_resolution: tierResolution,
@@ -531,11 +701,22 @@ async function runEval(args, io = process) {
531
701
  errors: evaluation.validation.errors.length
532
702
  };
533
703
  }
534
- if (options.json) {
704
+ if (options.format === "json") {
535
705
  io.stdout.write(`${JSON.stringify(payload, null, 2)}
536
706
  `);
537
707
  return 0;
538
708
  }
709
+ if (options.format === "junit") {
710
+ const junit = buildJUnitReport({
711
+ profilePath,
712
+ model: options.model,
713
+ tierReports,
714
+ thresholds: resolveJUnitThresholds(options)
715
+ });
716
+ io.stdout.write(`${junit.xml}
717
+ `);
718
+ return junit.failures > 0 ? 1 : 0;
719
+ }
539
720
  if (tierReports.tier1) {
540
721
  io.stdout.write(`Tier 1 average score: ${tierReports.tier1.average_score.toFixed(3)}
541
722
  `);
@@ -547,10 +728,16 @@ async function runEval(args, io = process) {
547
728
  if (tierReports.tier2) {
548
729
  io.stdout.write(`Tier 2 average score: ${tierReports.tier2.average_score.toFixed(3)}
549
730
  `);
731
+ io.stdout.write(
732
+ "Note: Tier 2 embedding scores are directionally useful but sensitive to model granularity.\n"
733
+ );
550
734
  }
551
735
  if (tierReports.tier3) {
552
736
  io.stdout.write(`Tier 3 average score: ${tierReports.tier3.average_score.toFixed(3)}
553
737
  `);
738
+ io.stdout.write(
739
+ "Note: Tier 3 judge scores are noisy across runs. Do not use as a sole merge gate.\n"
740
+ );
554
741
  }
555
742
  if (baselineReport?.tier1) {
556
743
  io.stdout.write(
@@ -578,12 +765,12 @@ async function runEval(args, io = process) {
578
765
  return 0;
579
766
  } catch (error) {
580
767
  const typedError = error;
581
- if ((typedError.code === "E_EVAL_TIER2_UNAVAILABLE" || typedError.code === "E_EVAL_TIER3_UNAVAILABLE") && !options.json) {
768
+ if ((typedError.code === "E_EVAL_TIER2_UNAVAILABLE" || typedError.code === "E_EVAL_TIER3_UNAVAILABLE") && options.format !== "json") {
582
769
  io.stderr.write(`Error: ${typedError.message ?? "Evaluation tier unavailable."}
583
770
  `);
584
771
  return 2;
585
772
  }
586
- if ((typedError.code === "E_EVAL_TIER2_UNAVAILABLE" || typedError.code === "E_EVAL_TIER3_UNAVAILABLE") && options.json) {
773
+ if ((typedError.code === "E_EVAL_TIER2_UNAVAILABLE" || typedError.code === "E_EVAL_TIER3_UNAVAILABLE") && options.format === "json") {
587
774
  io.stdout.write(
588
775
  `${JSON.stringify(
589
776
  {
@@ -599,7 +786,7 @@ async function runEval(args, io = process) {
599
786
  }
600
787
  const validation = typedError.validation;
601
788
  if (typedError.code === "E_EVAL_VALIDATION" && validation) {
602
- if (options.json) {
789
+ if (options.format === "json") {
603
790
  io.stdout.write(
604
791
  `${JSON.stringify(
605
792
  {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@traits-dev/cli",
3
- "version": "0.2.0",
3
+ "version": "0.3.0",
4
4
  "description": "traits.dev command-line interface for voice profile init, validate, compile, eval, and import workflows.",
5
5
  "keywords": [
6
6
  "traits-dev",
@@ -41,7 +41,7 @@
41
41
  "provenance": true
42
42
  },
43
43
  "dependencies": {
44
- "@traits-dev/core": "^0.2.0"
44
+ "@traits-dev/core": "^0.3.0"
45
45
  },
46
46
  "devDependencies": {
47
47
  "@types/node": "^25.2.3",