npm - create-byan-agent - Versions diffs - 2.23.0 → 2.26.0 - Mend

create-byan-agent 2.23.0 → 2.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (172) hide show

package/install/templates/.claude/hooks/lib/autobench-config.json ADDED Viewed

@@ -0,0 +1,81 @@
+{
+  "_generated_by": "byan-sync-rules",
+  "_note": "Runtime subset read by autobench-stop-guard.js. Edit _byan/_config/autobench.yaml and regenerate; do not hand-edit. Regexes are {source, flags} pairs reconstructed into RegExp at load time.",
+  "version": 1,
+  "marker_patterns": {
+    "any": {
+      "source": "<!--\\s*BYAN-BENCH:(done|skip)\\b",
+      "flags": "i"
+    },
+    "done": {
+      "source": "<!--\\s*BYAN-BENCH:done\\b",
+      "flags": "i"
+    },
+    "skip": {
+      "source": "<!--\\s*BYAN-BENCH:skip\\b",
+      "flags": "i"
+    }
+  },
+  "marker_fields": {
+    "g1": {
+      "source": "g1=(\\d+)",
+      "flags": "i"
+    },
+    "g2": {
+      "source": "g2=(\\d+)",
+      "flags": "i"
+    },
+    "scope": {
+      "source": "scope=(internal|external)",
+      "flags": "i"
+    }
+  },
+  "never_list": [
+    {
+      "source": "\\b(yes/no|y/n|confirm|proceed\\?|continue\\?|ok\\?|on continue\\?|je confirme)\\b",
+      "flags": "i"
+    },
+    {
+      "source": "\\b(delete|drop|rm -rf|overwrite|force push|reset --hard|supprimer|écraser)\\b",
+      "flags": "i"
+    }
+  ],
+  "choice_language": [
+    {
+      "source": "\\boption\\s*[1-3a-c]\\b",
+      "flags": "ig",
+      "min_matches": 2
+    },
+    {
+      "source": "^[ \\t]*[-*][ \\t]+[A-Z][^\\n]{0,80}(:|[ \\t]-[ \\t])",
+      "flags": "gm",
+      "min_matches": 2
+    },
+    {
+      "source": "\\b(should I|veux-tu que je|do you want me to|préfères-tu|which (one|approach|option)|A or B|soit .* soit )\\b",
+      "flags": "i"
+    },
+    {
+      "source": "\\b(pros?|cons?|trade-?offs?|avantages?|inconvénients?)\\b",
+      "flags": "i",
+      "requires_candidates": 2
+    }
+  ],
+  "candidate_token": {
+    "source": "\\b(option|approach|approche|alternative|choix|solution|stack|library|librairie|vendor|standard)s?\\b",
+    "flags": "ig"
+  },
+  "escape_hatch": {
+    "session_flag": ".byan-autobench/off",
+    "disabled": false
+  },
+  "enforcement": {
+    "armed": false
+  },
+  "ledger": {
+    "path": "_byan-output/benchmark-ledger.jsonl"
+  },
+  "banners": {
+    "stop_block": "Auto-benchmark: you are presenting a choice between options but emitted no BYAN-BENCH marker. Re-present the fork as the compact 1-table benchmark (Option | criteria | Niv + best-first reco), then emit <!-- BYAN-BENCH:done g1=.. g2=.. scope=.. -->. If this is a confirm/destructive/obvious-default prompt, emit <!-- BYAN-BENCH:skip reason=.. -->. To disable for this session: touch .byan-autobench/off."
+  }
+}

package/install/templates/.claude/hooks/lib/autobench-fc-enrich.js ADDED Viewed

@@ -0,0 +1,251 @@
+// BYAN-only opt-in evidence enrichment for the byan-benchmark matrix (C5d).
+//
+// The native workflow (.claude/workflows/byan-benchmark.js) returns a DATA
+// matrix where each cell carries a self-graded evidence `level` (L1..L5) and an
+// `unverified` flag. That self-grade is the model judging its own claim. Inside
+// ~/BYAN the orchestrating skill can do better: it can call the byan_fc_check
+// MCP tool per factual cell and stamp an AUDITED evidence level onto the cell,
+// turning the Niv column into a fact-checked authority rather than a self-grade.
+//
+// This module is the pure, testable core of that wiring. It does NOT know about
+// MCP transport: the caller injects an async `check(text) -> { level, score, ...}`
+// function (in BYAN, a thin adapter over mcp__byan__byan_fc_check; in tests, a
+// mock). Without an injected checker the matrix is returned unchanged, which is
+// why the layer is OPT-IN and BYAN-only by construction: a platform that cannot
+// reach byan_fc_check simply does not pass a checker and gets the self-graded
+// matrix back, untouched.
+//
+// Strict-domain floors (mirrors .claude/rules/fact-check.md and the engine's
+// STRICT_FLOORS): a security/performance claim must reach L2, a compliance claim
+// L1, or the cell stays flagged [UNVERIFIED] no matter what the checker returned.
+// Enrichment can only RAISE authority or flag a shortfall; it never silently
+// upgrades a cell past its domain floor.
+'use strict';
+// Strict-domain minimum evidence levels, keyed by benchmark domain. Numeric so
+// the floor comparison is a plain number comparison (L1 is the strongest level
+// -> the smallest number). Kept in sync with the engine's STRICT_FLOORS map and
+// the fact-check rule doc.
+const STRICT_FLOORS = { security: 2, performance: 2, compliance: 1 };
+// Default heuristic: which cells are "hard claims" worth fact-checking. A cell
+// is a hard claim when it sits in a strict domain (every cell is then a claim
+// because the floor applies) OR its verdict text uses an absolute / superlative
+// the fact-check auto-detection also keys on. Low-stakes internal cells with a
+// hedged verdict are skipped to keep latency down (anti-bloat, C4).
+//
+// Two details of the pattern are deliberate:
+//   - `100%` lives outside the `\b(...)\b` group. `%` is not a word character,
+//     so a `\b` right after it only matches when a letter, digit or underscore
+//     follows; inside the group, "100%" followed by a space, punctuation or the
+//     end of the text could never match.
+//   - `forc[eé]ment` accepts the accented French spelling as well as the
+//     unaccented one.
+const ABSOLUTE_RE =
+  /\b(always|never|toujours|jamais|forc[eé]ment|obviously|guaranteed|fastest|slowest|best|worst|optimal|superior|plus rapide|le plus|mieux|meilleur|fully|completely|zero)\b|\b100%/i;
+// Domains that carry a floor, derived from STRICT_FLOORS so the two cannot drift.
+const STRICT_DOMAINS = Object.keys(STRICT_FLOORS);
+// Parse an "L3" / "l2" / 3 style level into the 1..5 integer, or null if absent.
+//   - number: must be finite and within [1, 5]; a fractional value is rounded.
+//   - string: the first standalone digit 1..5, optionally prefixed by "L" and
+//     whitespace. The digit must not be the tail of a longer number, so "15",
+//     "L15" or a date such as "2024-01-05" yield null instead of a bogus level.
+//   - anything else: null.
+function parseLevel(level) {
+  if (typeof level === 'number') {
+    const inRange = Number.isFinite(level) && level >= 1 && level <= 5;
+    return inRange ? Math.round(level) : null;
+  }
+  if (typeof level === 'string') {
+    // (?<!\d) rejects a digit preceded by another digit; the trailing \b rejects
+    // a digit followed by another word character (e.g. the "1" of "L10").
+    const m = level.match(/(?<!\d)L?\s*([1-5])\b/i);
+    if (m) return Number(m[1]);
+  }
+  return null;
+}
+// Format a numeric evidence level as the canonical "L{n}" string the matrix
+// uses. A missing level (null or undefined) is passed through as null.
+function levelLabel(n) {
+  if (n === null || n === undefined) {
+    return null;
+  }
+  return `L${n}`;
+}
+// Default claim selector: should this cell be sent to the fact checker?
+//   - a missing cell is never a claim;
+//   - an explicit `isHardClaim: true` flag on the cell forces inclusion;
+//   - every cell of a strict domain is a claim (its floor must be enforced);
+//   - otherwise the cell qualifies when its `verdict` or `claim` string contains
+//     an absolute / superlative matched by ABSOLUTE_RE.
+// Hedged, low-stakes cells return false so enrichment stays cheap.
+function isHardClaim(cell, domain) {
+  if (!cell) return false;
+  if (cell.isHardClaim === true || STRICT_DOMAINS.includes(domain)) return true;
+  // Non-string fields are ignored rather than coerced.
+  return [cell.verdict, cell.claim].some(
+    (text) => typeof text === 'string' && ABSOLUTE_RE.test(text)
+  );
+}
+// Produce the text handed to the checker for one cell. A non-blank string
+// `cell.claim` wins and is returned trimmed. Otherwise the truthy `criterion`
+// and `verdict` values are stringified and joined as "criterion: verdict" so
+// the checker receives a self-contained statement. Returns '' when the cell
+// offers nothing usable.
+function cellClaimText(cell) {
+  if (!cell) return '';
+  if (typeof cell.claim === 'string') {
+    const explicit = cell.claim.trim();
+    if (explicit) return explicit;
+  }
+  const parts = [];
+  for (const key of ['criterion', 'verdict']) {
+    if (!cell[key]) continue;
+    const piece = String(cell[key]);
+    // A truthy value can still stringify to '' (e.g. an empty array).
+    if (piece) parts.push(piece);
+  }
+  return parts.join(': ').trim();
+}
+// Apply a single fc_check result to a cell. PURE given the result: returns a NEW
+// cell object (the input is shallow-copied, never mutated).
+//
+// Fields written on the copy:
+//   - fcChecked        always true;
+//   - fcStatus / fcAssertionType / fcScore   copied from a non-null result
+//                      (fcScore only when result.score is a number);
+//   - level            replaced by the checker's level when one was parsed;
+//   - unverified / fcFloor / fcBelowFloor    see the floor logic below.
+//
+// @param {object} cell     the matrix cell to enrich.
+// @param {object|null} result  checker output ({ level, status, score,
+//                              assertionType }); may be null.
+// @param {string} domain   benchmark domain; only own keys of STRICT_FLOORS
+//                          carry a floor.
+// @returns {object} the enriched copy of the cell.
+function applyCheckToCell(cell, result, domain) {
+  const checkedLevel = result ? parseLevel(result.level) : null;
+  // Own-property lookup only: a domain named like an inherited Object member
+  // ("toString", "constructor", ...) must not be mistaken for a strict domain.
+  const isStrict = Object.prototype.hasOwnProperty.call(STRICT_FLOORS, domain);
+  const floor = isStrict ? STRICT_FLOORS[domain] || null : null;
+  // Below the domain floor (or unscored) -> the cell stays unverified regardless
+  // of the prior self-grade. A larger number is a WEAKER level, hence `>`.
+  const belowFloor =
+    floor != null && (checkedLevel == null || checkedLevel > floor);
+  const blocked = Boolean(result) && result.status === 'BLOCKED';
+  const out = Object.assign({}, cell);
+  out.fcChecked = true;
+  if (result) {
+    out.fcStatus = result.status;
+    out.fcAssertionType = result.assertionType;
+    if (typeof result.score === 'number') out.fcScore = result.score;
+  }
+  if (checkedLevel != null) {
+    out.level = levelLabel(checkedLevel);
+  }
+  if (belowFloor || blocked) {
+    // A BLOCKED result is flagged the same way as a floor shortfall, even in a
+    // domain without a floor (fcFloor is then null).
+    out.unverified = true;
+    out.fcFloor = floor != null ? `L${floor}` : null;
+    out.fcBelowFloor = true;
+  } else if (checkedLevel != null) {
+    // An audited level at or above the floor clears the unverified flag ONLY
+    // when the checker reported status 'CLAIM' or 'VERIFIED'. For any other
+    // status the flag is left exactly as it was on the input cell.
+    if (result && (result.status === 'CLAIM' || result.status === 'VERIFIED')) {
+      out.unverified = false;
+    }
+    out.fcBelowFloor = false;
+  }
+  return out;
+}
+/**
+ * Fact-check the hard-claim cells of a benchmark matrix and return a NEW
+ * benchmark object; neither the input benchmark nor its rows are mutated.
+ *
+ * Cells are processed one at a time, in matrix order, awaiting each checker
+ * call before starting the next.
+ *
+ * @param {object} params
+ * @param {object} params.benchmark   The DATA object the workflow returned
+ *                                    ({ matrix, domain, scope, ... }). A missing
+ *                                    domain is treated as 'general'.
+ * @param {(text: string) => Promise<object>} [params.check]
+ *                                    Async checker; in BYAN a thin adapter over
+ *                                    mcp__byan__byan_fc_check. If omitted, the
+ *                                    matrix is returned unchanged (opt-in).
+ * @param {boolean} [params.enabled=true]  Master opt-in switch.
+ * @param {(cell, domain) => boolean} [params.claimSelector]
+ *                                    Override the hard-claim heuristic.
+ * @returns {Promise<object>} A copy of the benchmark with an `enrichment`
+ *                            report. When enrichment ran: a rebuilt `matrix`
+ *                            and { enabled: true, checked, raised, flagged,
+ *                            skipped }. When it did not run: the original
+ *                            matrix and { enabled: false, reason, checked: 0,
+ *                            raised: 0, flagged: 0, skipped } where reason is
+ *                            'disabled', 'no-checker' or 'degenerate'.
+ * @throws {Error} (as a rejection) when params.benchmark is not an object.
+ */
+async function enrichMatrix(params) {
+  const options = params || {};
+  const benchmark = options.benchmark;
+  const check = options.check;
+  const enabled = options.enabled === undefined ? true : options.enabled;
+  const claimSelector =
+    options.claimSelector === undefined ? isHardClaim : options.claimSelector;
+  if (!benchmark || typeof benchmark !== 'object') {
+    throw new Error('enrichMatrix requires a benchmark object');
+  }
+  const domain = benchmark.domain || 'general';
+  const matrix = Array.isArray(benchmark.matrix) ? benchmark.matrix : [];
+  // Opt-in gate. Callers that never inject a checker end up here and get their
+  // benchmark back with an honest "nothing was checked" report.
+  const hasChecker = typeof check === 'function';
+  if (!enabled || !hasChecker || benchmark.degenerate) {
+    let reason = 'degenerate';
+    if (!enabled) reason = 'disabled';
+    else if (!hasChecker) reason = 'no-checker';
+    return Object.assign({}, benchmark, {
+      enrichment: {
+        enabled: false,
+        reason,
+        checked: 0,
+        raised: 0,
+        flagged: 0,
+        skipped: countCells(matrix),
+      },
+    });
+  }
+  const tally = { checked: 0, raised: 0, flagged: 0, skipped: 0 };
+  const enrichedRows = [];
+  for (const row of matrix) {
+    const sourceCells = row && Array.isArray(row.cells) ? row.cells : [];
+    const rebuilt = [];
+    for (const cell of sourceCells) {
+      // Not selected, or selected but with no text to check: keep the cell
+      // object as-is and count it as skipped.
+      const text = claimSelector(cell, domain) ? cellClaimText(cell) : '';
+      if (!text) {
+        tally.skipped += 1;
+        rebuilt.push(cell);
+        continue;
+      }
+      let verdict = null;
+      try {
+        verdict = await check(text);
+      } catch {
+        // A checker failure must never break the benchmark: keep the
+        // self-graded cell, mark the gap so it is auditable, and move on.
+        rebuilt.push(Object.assign({}, cell, { fcChecked: false, fcError: true }));
+        tally.skipped += 1;
+        continue;
+      }
+      const levelBefore = parseLevel(cell && cell.level);
+      const audited = applyCheckToCell(cell, verdict, domain);
+      tally.checked += 1;
+      const levelAfter = parseLevel(audited.level);
+      // A smaller number is a STRONGER level; a cell that had no parseable
+      // level before and has one now also counts as raised.
+      const wasRaised =
+        levelAfter != null && (levelBefore == null || levelAfter < levelBefore);
+      if (wasRaised) tally.raised += 1;
+      if (audited.fcBelowFloor === true) tally.flagged += 1;
+      rebuilt.push(audited);
+    }
+    enrichedRows.push(Object.assign({}, row, { cells: rebuilt }));
+  }
+  return Object.assign({}, benchmark, {
+    matrix: enrichedRows,
+    enrichment: Object.assign({ enabled: true }, tally),
+  });
+}
+// Total number of cells across all rows of a matrix. Anything that is not an
+// array counts as 0, and rows without a `cells` array contribute nothing.
+function countCells(matrix) {
+  if (!Array.isArray(matrix)) return 0;
+  let total = 0;
+  for (const row of matrix) {
+    if (row && Array.isArray(row.cells)) {
+      total += row.cells.length;
+    }
+  }
+  return total;
+}
+// Public surface of the module: the enrichMatrix entry point plus the floor
+// table and the helper functions it is built from.
+module.exports = {
+  STRICT_FLOORS,
+  parseLevel,
+  levelLabel,
+  isHardClaim,
+  cellClaimText,
+  applyCheckToCell,
+  enrichMatrix,
+  countCells,
+};

package/install/templates/.claude/hooks/lib/autobench-ledger-report.js ADDED Viewed

@@ -0,0 +1,253 @@
+#!/usr/bin/env node
+// BYAN Auto-Benchmark miss-ledger reader / aggregator (C5e).
+//
+// The Stop hook (autobench-stop-guard.js) appends ONE JSONL line per invocation
+// to _byan-output/benchmark-ledger.jsonl. Each line is an audit record of one
+// end-of-turn decision: did the agent benchmark a fork it should have, or did it
+// MISS. This module reads that trail and aggregates it into the fires / misses /
+// miss-rate summary required by the acceptance criteria, plus a small CLI-ish
+// `main()` so a human (or CI) can run a one-shot report.
+//
+// Event vocabulary is OWNED by the hook; this reader treats it as the contract:
+//   fired-block               -> a MISS (the agent offered a fork without a
+//                                benchmark marker; the hook forced a regen).
+//   satisfied-marker          -> a HIT  (a real benchmark was presented).
+//   satisfied-skip            -> a deliberate skip (fork considered, not tabled).
+//   satisfied-never           -> exempt (y/n confirm / destructive prompt).
+//   satisfied-escape          -> exempt (escape-hatch active).
+//   satisfied-already-blocked -> the forced regen pass (block-once accounting).
+//   no-choice                 -> no fork was present (the common case).
+// Anything else is bucketed under `unknown` so a future event type is surfaced,
+// not silently dropped.
+//
+// Pure read-only: this module NEVER writes the ledger. It reads what the hook
+// wrote. Robust to a partially-written / corrupt JSONL file: a malformed line is
+// counted under `malformed` and skipped, never thrown.
+'use strict';
+const fs = require('fs');
+const path = require('path');
+// Event classification tables. aggregate() tests each entry's `event` against
+// these sets in the order MISS, HIT, SKIP, EXEMPT; an event found in none of
+// them is counted as `unknown`.
+//
+// Events that count as a genuine MISS the agent must fix. Kept narrow on
+// purpose: only `fired-block` is a real failure. Everything else is either a
+// hit, an exempt case, or accounting.
+const MISS_EVENTS = new Set(['fired-block']);
+// Events that count as a real benchmark HIT (a fork was tabled).
+const HIT_EVENTS = new Set(['satisfied-marker']);
+// Events that mean "a fork was considered and deliberately not tabled".
+const SKIP_EVENTS = new Set(['satisfied-skip']);
+// Exempt / accounting events: present in the trail but neither a miss nor a hit.
+const EXEMPT_EVENTS = new Set([
+  'satisfied-never',
+  'satisfied-escape',
+  'satisfied-already-blocked',
+  'no-choice',
+]);
+// Project root directory: the CLAUDE_PROJECT_DIR environment variable when it is
+// set to a non-empty value, otherwise the current working directory.
+function projectRoot() {
+  const fromEnv = process.env.CLAUDE_PROJECT_DIR;
+  if (fromEnv) {
+    return fromEnv;
+  }
+  return process.cwd();
+}
+// Default ledger location: <project root>/_byan-output/benchmark-ledger.jsonl.
+function defaultLedgerPath() {
+  const segments = ['_byan-output', 'benchmark-ledger.jsonl'];
+  return path.join(projectRoot(), ...segments);
+}
+/**
+ * Load the JSONL ledger and parse it line by line.
+ *
+ * Blank lines are ignored. A line that is not valid JSON, or that parses to a
+ * non-object value (number, string, boolean, null), is counted in `malformed`
+ * and skipped. A file that does not exist or cannot be read is reported as
+ * `missing: true` with no entries; this function never throws over the file.
+ *
+ * @param {string} [filePath] absolute path; defaults to the project ledger.
+ * @returns {{entries: Array<object>, malformed: number, missing: boolean}}
+ */
+function readLedger(filePath) {
+  const target = filePath || defaultLedgerPath();
+  const absent = { entries: [], malformed: 0, missing: true };
+  let text;
+  try {
+    if (!fs.existsSync(target)) return absent;
+    text = fs.readFileSync(target, 'utf8');
+  } catch {
+    // An unreadable ledger is treated like an absent one: a reporting tool must
+    // never crash the caller over a permissions blip.
+    return absent;
+  }
+  const parsed = [];
+  let badLines = 0;
+  const candidates = text
+    .split('\n')
+    .map((line) => line.trim())
+    .filter(Boolean);
+  candidates.forEach((candidate) => {
+    let value;
+    try {
+      value = JSON.parse(candidate);
+    } catch {
+      badLines += 1;
+      return;
+    }
+    if (value && typeof value === 'object') parsed.push(value);
+    else badLines += 1;
+  });
+  return { entries: parsed, malformed: badLines, missing: false };
+}
+/**
+ * Aggregate a list of ledger entries into a summary.
+ *
+ * Robust to junk input: a non-array argument is treated as an empty list, and
+ * an entry that is not an object (null, undefined, a primitive) is counted as
+ * an `unknown` event instead of throwing.
+ *
+ * @param {Array<object>} entries
+ * @returns {object} {
+ *   total, fires, misses, hits, skips, exempt, unknown, missRate,
+ *   byEvent:{event:count}, byScope:{internal,external,unknown},
+ *   gates:{g1Total,g2Total,countWithGates,g1Avg,g2Avg}
+ * }
+ *   - fires  = decisions where a fork was present and acted on (hits + misses).
+ *   - misses = fired-block events (the agent must fix these).
+ *   - missRate = misses / fires (0 when no fork was ever present).
+ *   - gates.countWithGates = entries carrying a numeric g1 or g2; both averages
+ *     divide by that same count.
+ */
+function aggregate(entries) {
+  const list = Array.isArray(entries) ? entries : [];
+  const hasOwn = Object.prototype.hasOwnProperty;
+  const byEvent = {};
+  const byScope = { internal: 0, external: 0, unknown: 0 };
+  let misses = 0;
+  let hits = 0;
+  let skips = 0;
+  let exempt = 0;
+  let unknown = 0;
+  let g1Total = 0;
+  let g2Total = 0;
+  let countWithGates = 0;
+  for (const raw of list) {
+    // Normalise once so every field read below is safe: a null / undefined /
+    // primitive entry behaves like an entry with no fields at all.
+    const e = raw !== null && typeof raw === 'object' ? raw : {};
+    const event = typeof e.event === 'string' ? e.event : 'unknown';
+    // Own-property check so an event named like an inherited Object member
+    // (e.g. "constructor") still starts counting from 0.
+    byEvent[event] = (hasOwn.call(byEvent, event) ? byEvent[event] : 0) + 1;
+    if (MISS_EVENTS.has(event)) misses += 1;
+    else if (HIT_EVENTS.has(event)) hits += 1;
+    else if (SKIP_EVENTS.has(event)) skips += 1;
+    else if (EXEMPT_EVENTS.has(event)) exempt += 1;
+    else unknown += 1;
+    // Scope tally: entries without a scope are not counted at all; a scope
+    // other than internal/external lands in byScope.unknown.
+    const scope = e.scope;
+    if (scope === 'internal' || scope === 'external') byScope[scope] += 1;
+    else if (scope != null) byScope.unknown += 1;
+    // Gate totals: only entries with a numeric g1 or g2 take part.
+    const hasG1 = typeof e.g1 === 'number';
+    const hasG2 = typeof e.g2 === 'number';
+    if (hasG1 || hasG2) {
+      if (hasG1) g1Total += e.g1;
+      if (hasG2) g2Total += e.g2;
+      countWithGates += 1;
+    }
+  }
+  // A "fire" is a turn where a fork was genuinely present and the doctrine
+  // applied: a HIT (tabled) or a MISS (should have, didn't). Skips, exempts and
+  // no-choice turns are NOT fires, so the miss-rate is not diluted by the vast
+  // majority of turns that have no fork at all.
+  const fires = hits + misses;
+  const missRate = fires > 0 ? misses / fires : 0;
+  return {
+    total: list.length,
+    fires,
+    misses,
+    hits,
+    skips,
+    exempt,
+    unknown,
+    missRate,
+    byEvent,
+    byScope,
+    gates: {
+      g1Total,
+      g2Total,
+      countWithGates,
+      g1Avg: countWithGates > 0 ? g1Total / countWithGates : 0,
+      g2Avg: countWithGates > 0 ? g2Total / countWithGates : 0,
+    },
+  };
+}
+/**
+ * One-call helper: read the ledger, aggregate it, and attach the read metadata.
+ * @param {string} [filePath] ledger path; defaults to the project ledger.
+ * @returns {object} the aggregate(...) summary extended with
+ *                   { malformed, missing, path }.
+ */
+function report(filePath) {
+  const ledgerPath = filePath || defaultLedgerPath();
+  const ledger = readLedger(ledgerPath);
+  const summary = aggregate(ledger.entries);
+  summary.malformed = ledger.malformed;
+  summary.missing = ledger.missing;
+  summary.path = ledgerPath;
+  return summary;
+}
+// Render a ratio as a percentage with exactly one decimal place
+// (0.125 -> "12.5%", 0.5 -> "50.0%").
+function pct(ratio) {
+  const percent = ratio * 100;
+  return `${percent.toFixed(1)}%`;
+}
+/**
+ * Build the plain-text summary (no color, no emoji) for a report object.
+ * Returned as a string so it can be tested; main() is what prints it.
+ *
+ * A missing ledger yields a three-line block (title, file, status). Otherwise
+ * the block lists records, forks/hits/misses, miss-rate, skips/exempt/unknown
+ * and scope, then the average gates (only when some entry carried gates) and
+ * the per-event counts sorted by event name (only when there are any).
+ *
+ * @param {object} rep the object returned by report().
+ * @returns {string} lines joined with '\n', without a trailing newline.
+ */
+function formatReport(rep) {
+  const header = [
+    'BYAN Auto-Benchmark ledger report',
+    `  file        : ${rep.path}`,
+  ];
+  if (rep.missing) {
+    header.push('  status      : ledger not found (the Stop hook has not fired yet)');
+    return header.join('\n');
+  }
+  const malformedNote = rep.malformed ? ` (+${rep.malformed} malformed, skipped)` : '';
+  const noForkNote = rep.fires === 0 ? ' (no fork seen)' : '';
+  const body = [
+    `  records     : ${rep.total}${malformedNote}`,
+    `  forks (fires): ${rep.fires}   hits: ${rep.hits}   misses: ${rep.misses}`,
+    `  miss-rate   : ${pct(rep.missRate)}${noForkNote}`,
+    `  skips       : ${rep.skips}   exempt: ${rep.exempt}   unknown: ${rep.unknown}`,
+    `  scope       : internal=${rep.byScope.internal} external=${rep.byScope.external}`,
+  ];
+  const gates = rep.gates;
+  const gateLines =
+    gates.countWithGates > 0
+      ? [
+          `  avg gates   : g1=${gates.g1Avg.toFixed(1)} g2=${gates.g2Avg.toFixed(1)} (over ${gates.countWithGates} benchmarks)`,
+        ]
+      : [];
+  const eventNames = Object.keys(rep.byEvent).sort();
+  const eventLines =
+    eventNames.length === 0
+      ? []
+      : ['  by event    :'].concat(
+          eventNames.map((name) => `    ${name.padEnd(26)} ${rep.byEvent[name]}`)
+        );
+  return header.concat(body, gateLines, eventLines).join('\n');
+}
+// CLI entry: `node autobench-ledger-report.js [path]` prints the text report;
+// add `--json` for the raw report object as indented JSON. The first argument
+// that does not start with `--` is taken as the ledger path. Returns the report
+// object so callers (and tests) can inspect it.
+// Exit 0 always (a report tool never fails the shell over a read).
+function main(argv) {
+  const args = Array.isArray(argv) ? argv : process.argv.slice(2);
+  const rep = report(args.find((a) => a && !a.startsWith('--')));
+  const rendered = args.includes('--json')
+    ? JSON.stringify(rep, null, 2)
+    : formatReport(rep);
+  process.stdout.write(rendered + '\n');
+  return rep;
+}
+// Run the CLI only when this file is executed directly; when it is loaded with
+// require() nothing is printed and only the exports below are exposed.
+if (require.main === module) {
+  main();
+}
+// Public surface: the event classification sets, the read / aggregate / format
+// helpers, and main() for programmatic use of the CLI.
+module.exports = {
+  MISS_EVENTS,
+  HIT_EVENTS,
+  SKIP_EVENTS,
+  EXEMPT_EVENTS,
+  defaultLedgerPath,
+  readLedger,
+  aggregate,
+  report,
+  formatReport,
+  pct,
+  main,
+};