npm - @traits-dev/cli - Versions diffs - 0.2.0 → 0.3.0 - Mend

@traits-dev/cli 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (2) hide show

package/dist/traits.js +193 -6
package/package.json +2 -2

package/dist/traits.js CHANGED Viewed

@@ -211,6 +211,7 @@ import {
 import {
   detectEvalTierAvailability,
   formatValidationResult as formatValidationResult2,
+  loadBuiltInEvalSuite,
   resolveTierExecution,
   runOfflineBaselineScaffold,
   toValidationResultObject as toValidationResultObject2
@@ -224,6 +225,7 @@ function printEvalUsage(out = process.stderr) {
       "Options:",
       "  --model <model>           Model target (required)",
       "  --tier <1|2|3>            Highest tier to run (default: highest available)",
+      "  --suite <name>            Built-in baseline suite: support|healthcare|developer",
       "  --provider <name>         Judge provider for Tier 3: auto|openai|anthropic",
       "  --embedding-model <name>  Embedding model for Tier 2 (OpenAI)",
       "  --judge-model <name>      Judge model for Tier 3 provider",
@@ -236,6 +238,11 @@ function printEvalUsage(out = process.stderr) {
       "  --samples <path>          JSON file with samples: [{ id, response }]",
       "  --scenarios <path>        Alias for --samples in this scaffold",
       "  --json                    Output structured JSON",
+      "  --format <text|json|junit> Output format (default: text)",
+      "  --junit-threshold <num>   Global JUnit pass threshold in [0,1] (default: 0.7)",
+      "  --junit-threshold-tier1 <num> Tier 1 JUnit threshold override",
+      "  --junit-threshold-tier2 <num> Tier 2 JUnit threshold override",
+      "  --junit-threshold-tier3 <num> Tier 3 JUnit threshold override",
       "  --strict                  Treat validation warnings as errors",
       "  --verbose                 Include command metadata output",
       "  --no-color                Disable colorized output",
@@ -251,6 +258,7 @@ function parseEvalArgs(args) {
     profilePath: null,
     model: null,
     tier: null,
+    suite: null,
     provider: "auto",
     embeddingModel: null,
     judgeModel: null,
@@ -260,6 +268,11 @@ function parseEvalArgs(args) {
     maxRetries: null,
     retryBaseMs: null,
     json: false,
+    format: "text",
+    junitThreshold: null,
+    junitThresholdTier1: null,
+    junitThresholdTier2: null,
+    junitThresholdTier3: null,
     strict: false,
     verbose: false,
     noColor: false,
@@ -274,6 +287,7 @@ function parseEvalArgs(args) {
     const arg = args[index];
     if (arg === "--json") {
       result.json = true;
+      result.format = "json";
       continue;
     }
     if (arg === "--strict") {
@@ -300,14 +314,18 @@ function parseEvalArgs(args) {
       result.constraintImpact = true;
       continue;
     }
-    if (arg === "--model" || arg === "--tier" || arg === "--provider" || arg === "--embedding-model" || arg === "--judge-model" || arg === "--openai-base-url" || arg === "--anthropic-base-url" || arg === "--timeout-ms" || arg === "--max-retries" || arg === "--retry-base-ms" || arg === "--response" || arg === "--samples" || arg === "--scenarios") {
+    if (arg === "--model" || arg === "--tier" || arg === "--suite" || arg === "--provider" || arg === "--format" || arg === "--embedding-model" || arg === "--judge-model" || arg === "--openai-base-url" || arg === "--anthropic-base-url" || arg === "--timeout-ms" || arg === "--max-retries" || arg === "--retry-base-ms" || arg === "--junit-threshold" || arg === "--junit-threshold-tier1" || arg === "--junit-threshold-tier2" || arg === "--junit-threshold-tier3" || arg === "--response" || arg === "--samples" || arg === "--scenarios") {
       const value = args[index + 1];
       if (!value) return { error: `Missing value for "${arg}"` };
       if (arg === "--model") result.model = value;
       if (arg === "--tier") result.tier = Number(value);
+      if (arg === "--suite") result.suite = String(value).toLowerCase();
       if (arg === "--provider") {
         result.provider = String(value).toLowerCase();
       }
+      if (arg === "--format") {
+        result.format = String(value).toLowerCase();
+      }
       if (arg === "--embedding-model") result.embeddingModel = value;
       if (arg === "--judge-model") result.judgeModel = value;
       if (arg === "--openai-base-url") result.openaiBaseUrl = value;
@@ -315,6 +333,10 @@ function parseEvalArgs(args) {
       if (arg === "--timeout-ms") result.timeoutMs = Number(value);
       if (arg === "--max-retries") result.maxRetries = Number(value);
       if (arg === "--retry-base-ms") result.retryBaseMs = Number(value);
+      if (arg === "--junit-threshold") result.junitThreshold = Number(value);
+      if (arg === "--junit-threshold-tier1") result.junitThresholdTier1 = Number(value);
+      if (arg === "--junit-threshold-tier2") result.junitThresholdTier2 = Number(value);
+      if (arg === "--junit-threshold-tier3") result.junitThresholdTier3 = Number(value);
       if (arg === "--response") result.responses.push(value);
       if (arg === "--samples" || arg === "--scenarios") result.samplesPath = value;
       index += 1;
@@ -338,6 +360,20 @@ function parseEvalArgs(args) {
   if (!["auto", "openai", "anthropic"].includes(result.provider)) {
     return { error: 'Invalid "--provider" value. Expected auto, openai, or anthropic.' };
   }
+  if (result.suite != null && !["support", "healthcare", "developer"].includes(
+    result.suite
+  )) {
+    return { error: 'Invalid "--suite" value. Expected support, healthcare, or developer.' };
+  }
+  if (result.suite != null && result.samplesPath != null) {
+    return { error: 'Use either "--suite" or "--samples/--scenarios", not both.' };
+  }
+  if (result.suite != null && result.responses.length > 0) {
+    return { error: 'Use either "--suite" or "--response", not both.' };
+  }
+  if (!["text", "json", "junit"].includes(result.format)) {
+    return { error: 'Invalid "--format" value. Expected text, json, or junit.' };
+  }
   if (result.timeoutMs != null && (!Number.isInteger(result.timeoutMs) || result.timeoutMs < 0)) {
     return { error: 'Invalid "--timeout-ms" value. Expected a non-negative integer.' };
   }
@@ -347,9 +383,33 @@ function parseEvalArgs(args) {
   if (result.retryBaseMs != null && (!Number.isInteger(result.retryBaseMs) || result.retryBaseMs < 0)) {
     return { error: 'Invalid "--retry-base-ms" value. Expected a non-negative integer.' };
   }
+  for (const [flag, value] of [
+    ["--junit-threshold", result.junitThreshold],
+    ["--junit-threshold-tier1", result.junitThresholdTier1],
+    ["--junit-threshold-tier2", result.junitThresholdTier2],
+    ["--junit-threshold-tier3", result.junitThresholdTier3]
+  ]) {
+    if (value == null) continue;
+    if (!Number.isFinite(value) || value < 0 || value > 1) {
+      return { error: `Invalid "${flag}" value. Expected a number in [0, 1].` };
+    }
+  }
   return { value: result };
 }
 function loadSamples(options, cwd) {
+  if (options.suite) {
+    const suite = loadBuiltInEvalSuite(options.suite);
+    if (!suite) {
+      throw new Error(
+        `Unknown suite "${options.suite}". Expected support, healthcare, or developer.`
+      );
+    }
+    return suite.scenarios.map((scenario) => ({
+      id: scenario.id,
+      prompt: scenario.messages.map((message) => `${message.role}: ${message.content}`).join("\n"),
+      response: scenario.expected_behavior ?? ""
+    }));
+  }
   if (options.samplesPath) {
     const sampleFile = path2.resolve(cwd, options.samplesPath);
     const parsed = JSON.parse(fs.readFileSync(sampleFile, "utf8"));
@@ -377,10 +437,118 @@ function loadSamples(options, cwd) {
   }));
 }
 function writeProgress(io, options, message) {
-  if (options.json) return;
+  if (options.format !== "text") return;
   io.stderr.write(`${message}
 `);
 }
+function escapeXml(value) {
+  return String(value ?? "").replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&apos;");
+}
+function resolveJUnitThresholds(options) {
+  const base = options.junitThreshold ?? 0.7;
+  return {
+    tier1: options.junitThresholdTier1 ?? base,
+    tier2: options.junitThresholdTier2 ?? base,
+    tier3: options.junitThresholdTier3 ?? base
+  };
+}
+function buildSampleScoreMap(samples) {
+  return new Map((samples ?? []).map((sample) => [String(sample.id), Number(sample.score)]));
+}
+function collectScenarioIds(reports) {
+  const ids = [];
+  const seen = /* @__PURE__ */ new Set();
+  for (const sample of reports.tier1?.samples ?? []) {
+    const id = String(sample.id);
+    if (seen.has(id)) continue;
+    seen.add(id);
+    ids.push(id);
+  }
+  for (const sample of reports.tier2?.samples ?? []) {
+    const id = String(sample.id);
+    if (seen.has(id)) continue;
+    seen.add(id);
+    ids.push(id);
+  }
+  for (const sample of reports.tier3?.samples ?? []) {
+    const id = String(sample.id);
+    if (seen.has(id)) continue;
+    seen.add(id);
+    ids.push(id);
+  }
+  return ids;
+}
+function buildJUnitReport(args) {
+  const ids = collectScenarioIds(args.tierReports);
+  const tier1Scores = buildSampleScoreMap(args.tierReports.tier1?.samples);
+  const tier2Scores = buildSampleScoreMap(args.tierReports.tier2?.samples);
+  const tier3Scores = buildSampleScoreMap(args.tierReports.tier3?.samples);
+  const className = `traits.eval.${path2.basename(args.profilePath, path2.extname(args.profilePath))}`;
+  let failures = 0;
+  const testCases = [];
+  for (const id of ids) {
+    const reasons = [];
+    const scoreLines = [];
+    const tier1 = tier1Scores.get(id);
+    if (tier1 != null) {
+      scoreLines.push(`tier1=${tier1.toFixed(3)} threshold=${args.thresholds.tier1.toFixed(3)}`);
+      if (tier1 < args.thresholds.tier1) {
+        reasons.push(
+          `Tier 1 score ${tier1.toFixed(3)} below threshold ${args.thresholds.tier1.toFixed(3)}`
+        );
+      }
+    }
+    const tier2 = tier2Scores.get(id);
+    if (tier2 != null) {
+      scoreLines.push(`tier2=${tier2.toFixed(3)} threshold=${args.thresholds.tier2.toFixed(3)}`);
+      if (tier2 < args.thresholds.tier2) {
+        reasons.push(
+          `Tier 2 score ${tier2.toFixed(3)} below threshold ${args.thresholds.tier2.toFixed(3)}`
+        );
+      }
+    }
+    const tier3 = tier3Scores.get(id);
+    if (tier3 != null) {
+      scoreLines.push(`tier3=${tier3.toFixed(3)} threshold=${args.thresholds.tier3.toFixed(3)}`);
+      if (tier3 < args.thresholds.tier3) {
+        reasons.push(
+          `Tier 3 score ${tier3.toFixed(3)} below threshold ${args.thresholds.tier3.toFixed(3)}`
+        );
+      }
+    }
+    const testcase = [];
+    testcase.push(
+      `  <testcase classname="${escapeXml(className)}" name="${escapeXml(id)}" time="0">`
+    );
+    if (reasons.length > 0) {
+      failures += 1;
+      testcase.push(
+        `    <failure message="${escapeXml("traits eval threshold failure")}">${escapeXml(
+          reasons.join(" | ")
+        )}</failure>`
+      );
+    }
+    if (scoreLines.length > 0) {
+      testcase.push(`    <system-out>${escapeXml(scoreLines.join(" | "))}</system-out>`);
+    }
+    testcase.push("  </testcase>");
+    testCases.push(testcase.join("\n"));
+  }
+  const xml = [
+    '<?xml version="1.0" encoding="UTF-8"?>',
+    "<testsuites>",
+    `  <testsuite name="traits.eval" tests="${ids.length}" failures="${failures}" errors="0" skipped="0" time="0">`,
+    `    <properties><property name="profile" value="${escapeXml(args.profilePath)}" /><property name="model" value="${escapeXml(args.model)}" /><property name="threshold_tier1" value="${args.thresholds.tier1.toFixed(3)}" /><property name="threshold_tier2" value="${args.thresholds.tier2.toFixed(3)}" /><property name="threshold_tier3" value="${args.thresholds.tier3.toFixed(3)}" /></properties>`,
+    ...testCases,
+    "  </testsuite>",
+    "</testsuites>"
+  ].join("\n");
+  return {
+    xml,
+    tests: ids.length,
+    failures
+  };
+}
 async function runEval(args, io = process) {
   const parsed = parseEvalArgs(args);
   if ("error" in parsed) {
@@ -515,6 +683,8 @@ async function runEval(args, io = process) {
     const payload = {
       profile: profilePath,
       model: options.model,
+      format: options.format,
+      suite: options.suite,
       tier_requested: requestedTier,
       tier_executed: tierResolution.tier_executed,
       tier_resolution: tierResolution,
@@ -531,11 +701,22 @@ async function runEval(args, io = process) {
         errors: evaluation.validation.errors.length
       };
     }
-    if (options.json) {
+    if (options.format === "json") {
       io.stdout.write(`${JSON.stringify(payload, null, 2)}
 `);
       return 0;
     }
+    if (options.format === "junit") {
+      const junit = buildJUnitReport({
+        profilePath,
+        model: options.model,
+        tierReports,
+        thresholds: resolveJUnitThresholds(options)
+      });
+      io.stdout.write(`${junit.xml}
+`);
+      return junit.failures > 0 ? 1 : 0;
+    }
     if (tierReports.tier1) {
       io.stdout.write(`Tier 1 average score: ${tierReports.tier1.average_score.toFixed(3)}
 `);
@@ -547,10 +728,16 @@ async function runEval(args, io = process) {
     if (tierReports.tier2) {
       io.stdout.write(`Tier 2 average score: ${tierReports.tier2.average_score.toFixed(3)}
 `);
+      io.stdout.write(
+        "Note: Tier 2 embedding scores are directionally useful but sensitive to model granularity.\n"
+      );
     }
     if (tierReports.tier3) {
       io.stdout.write(`Tier 3 average score: ${tierReports.tier3.average_score.toFixed(3)}
 `);
+      io.stdout.write(
+        "Note: Tier 3 judge scores are noisy across runs. Do not use as a sole merge gate.\n"
+      );
     }
     if (baselineReport?.tier1) {
       io.stdout.write(
@@ -578,12 +765,12 @@ async function runEval(args, io = process) {
     return 0;
   } catch (error) {
     const typedError = error;
-    if ((typedError.code === "E_EVAL_TIER2_UNAVAILABLE" || typedError.code === "E_EVAL_TIER3_UNAVAILABLE") && !options.json) {
+    if ((typedError.code === "E_EVAL_TIER2_UNAVAILABLE" || typedError.code === "E_EVAL_TIER3_UNAVAILABLE") && options.format !== "json") {
       io.stderr.write(`Error: ${typedError.message ?? "Evaluation tier unavailable."}
 `);
       return 2;
     }
-    if ((typedError.code === "E_EVAL_TIER2_UNAVAILABLE" || typedError.code === "E_EVAL_TIER3_UNAVAILABLE") && options.json) {
+    if ((typedError.code === "E_EVAL_TIER2_UNAVAILABLE" || typedError.code === "E_EVAL_TIER3_UNAVAILABLE") && options.format === "json") {
       io.stdout.write(
         `${JSON.stringify(
           {
@@ -599,7 +786,7 @@ async function runEval(args, io = process) {
     }
     const validation = typedError.validation;
     if (typedError.code === "E_EVAL_VALIDATION" && validation) {
-      if (options.json) {
+      if (options.format === "json") {
         io.stdout.write(
           `${JSON.stringify(
             {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@traits-dev/cli",
-  "version": "0.2.0",
+  "version": "0.3.0",
   "description": "traits.dev command-line interface for voice profile init, validate, compile, eval, and import workflows.",
   "keywords": [
     "traits-dev",
@@ -41,7 +41,7 @@
     "provenance": true
   },
   "dependencies": {
-    "@traits-dev/core": "^0.2.0"
+    "@traits-dev/core": "^0.3.0"
   },
   "devDependencies": {
     "@types/node": "^25.2.3",