@tritard/waterbrother 0.13.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tritard/waterbrother",
3
- "version": "0.13.0",
3
+ "version": "0.14.0",
4
4
  "description": "Waterbrother: Grok-powered coding CLI with local tools, sessions, operator modes, and approval controls",
5
5
  "type": "module",
6
6
  "bin": {
package/src/scorecard.js CHANGED
@@ -3,7 +3,7 @@ import path from "node:path";
3
3
  import crypto from "node:crypto";
4
4
 
5
5
  const MAX_INDEX_ENTRIES = 200;
6
- const MAX_CALIBRATION_CHARS = 1500;
6
+ const MAX_CALIBRATION_CHARS = 2000;
7
7
 
8
8
  function scorecardsDir(cwd) {
9
9
  return path.join(cwd, ".waterbrother", "memory", "scorecards");
@@ -56,8 +56,64 @@ function computeUserScore(action) {
56
56
  return null;
57
57
  }
58
58
 
59
- function computeComposite({ verificationScore, sentinelScore, qualityScore, userScore }) {
60
- const weights = { verification: 0.25, sentinel: 0.25, quality: 0.25, user: 0.25 };
59
+ // --- Phase 4: Attribution decomposition ---
60
+ // Chain: prompt → plan → execution → verification → sentinel → user
61
+ // Each stage gets a score. Composite is precision-weighted.
62
+
63
+ function computeAttribution({ planQuality, executionQuality, verificationScore, sentinelScore, userScore }) {
64
+ return {
65
+ plan: planQuality !== null ? Math.round(planQuality * 100) / 100 : null,
66
+ execution: executionQuality !== null ? Math.round(executionQuality * 100) / 100 : null,
67
+ verification: verificationScore !== null ? Math.round(verificationScore * 100) / 100 : null,
68
+ sentinel: sentinelScore !== null ? Math.round(sentinelScore * 100) / 100 : null,
69
+ user: userScore !== null ? Math.round(userScore * 100) / 100 : null
70
+ };
71
+ }
72
+
73
+ function computePlanQuality({ receipt, sentinel, challenge }) {
74
+ // Plan is good if: files changed match contract scope, no scope leaks flagged
75
+ let score = 0.7; // baseline
76
+ if (sentinel?.verdict === "ship") score += 0.2;
77
+ if (sentinel?.verdict === "block") score -= 0.3;
78
+ // Scope leak = plan was too broad or too narrow
79
+ const scopeLeakConcerns = [...(sentinel?.concerns || []), ...(challenge?.concerns || [])]
80
+ .filter((c) => /scope|outside|unrelated|unnecessary/i.test(c));
81
+ score -= scopeLeakConcerns.length * 0.15;
82
+ return Math.max(0, Math.min(1, score));
83
+ }
84
+
85
+ function computeExecutionQuality({ receipt, verification }) {
86
+ // Execution is good if: code compiles, no runtime errors, verification passes
87
+ let score = 0.5; // baseline
88
+ if (Array.isArray(verification) && verification.length > 0) {
89
+ const passRate = verification.filter((v) => v.pass || v.ok).length / verification.length;
90
+ score = passRate;
91
+ }
92
+ // Bonus for clean diff (no empty files, no giant changes)
93
+ if (receipt?.changedFiles?.length > 0 && receipt.changedFiles.length <= 10) score += 0.1;
94
+ return Math.max(0, Math.min(1, score));
95
+ }
96
+
97
+ // --- Precision weighting ---
98
+ // Larger changes = more evidence = higher precision
99
+
100
+ function computePrecision(receipt) {
101
+ const fileCount = receipt?.changedFiles?.length || 0;
102
+ if (fileCount === 0) return 0.1;
103
+ if (fileCount <= 2) return 0.5;
104
+ if (fileCount <= 5) return 0.75;
105
+ if (fileCount <= 15) return 1.0;
106
+ return 1.0; // cap at 1.0
107
+ }
108
+
109
+ function computeComposite({ verificationScore, sentinelScore, qualityScore, userScore, precision }) {
110
+ // Precision-weighted blend
111
+ const weights = {
112
+ verification: 0.30 * (precision || 0.5),
113
+ sentinel: 0.25 * (precision || 0.5),
114
+ quality: 0.20,
115
+ user: 0.25
116
+ };
61
117
  let total = 0;
62
118
  let weightSum = 0;
63
119
 
@@ -69,7 +125,7 @@ function computeComposite({ verificationScore, sentinelScore, qualityScore, user
69
125
  return weightSum > 0 ? Math.round((total / weightSum) * 100) / 100 : null;
70
126
  }
71
127
 
72
- // --- Brier score (Layer 3) ---
128
+ // --- Phase 3: Brier scores ---
73
129
 
74
130
  export function computeBrierScores(predictions, outcomes) {
75
131
  if (!predictions || !outcomes) return null;
@@ -90,15 +146,48 @@ export function computeBrierScores(predictions, outcomes) {
90
146
  scores.userAcceptFirstTry = Math.round(Math.pow(predictions.userAcceptFirstTry - actual, 2) * 1000) / 1000;
91
147
  }
92
148
 
149
+ // Contrarian reward: predicted failure but it shipped clean
150
+ if (predictions.testPass !== undefined && predictions.testPass < 0.5) {
151
+ const actual = outcomes.verification?.every((v) => v.pass || v.ok) ? 1 : 0;
152
+ if (actual === 1) scores.contrarianReward = true;
153
+ }
154
+
93
155
  return Object.keys(scores).length > 0 ? scores : null;
94
156
  }
95
157
 
96
- // --- Scorecard creation (Layer 1: passive scoring) ---
158
+ // Generate predictions from historical data for a scope
159
+ export function generatePredictions(historicalCards) {
160
+ if (!historicalCards || historicalCards.length < 2) return null;
161
+
162
+ const testPassRates = historicalCards
163
+ .map((c) => c.scores.verificationScore)
164
+ .filter((v) => v !== null);
165
+ const sentinelShipRates = historicalCards
166
+ .map((c) => c.scores.sentinelScore)
167
+ .filter((v) => v !== null);
168
+ const userAcceptRates = historicalCards
169
+ .map((c) => c.scores.userScore)
170
+ .filter((v) => v !== null);
171
+
172
+ const avg = (arr) => arr.length > 0 ? arr.reduce((a, b) => a + b, 0) / arr.length : null;
173
+
174
+ const predictions = {};
175
+ const tp = avg(testPassRates);
176
+ const ss = avg(sentinelShipRates);
177
+ const ua = avg(userAcceptRates);
178
+
179
+ if (tp !== null) predictions.testPass = Math.round(tp * 100) / 100;
180
+ if (ss !== null) predictions.sentinelShip = Math.round(ss * 100) / 100;
181
+ if (ua !== null) predictions.userAcceptFirstTry = Math.round(Math.max(0, ua) * 100) / 100;
182
+
183
+ return Object.keys(predictions).length > 0 ? predictions : null;
184
+ }
185
+
186
+ // --- Scorecard creation ---
97
187
 
98
- export function computeScorecard({ task, receipt, qualityFindings, userAction }) {
188
+ export function computeScorecard({ task, receipt, qualityFindings, userAction, predictions }) {
99
189
  const id = makeId(task?.name || task?.id || "build");
100
190
 
101
- // Extract outcomes from receipt
102
191
  const verification = (receipt?.verification || []).map((v) => ({
103
192
  command: v.command || v.label || "check",
104
193
  pass: Boolean(v.ok)
@@ -123,14 +212,30 @@ export function computeScorecard({ task, receipt, qualityFindings, userAction })
123
212
  : []
124
213
  };
125
214
 
126
- // Compute scores
215
+ // Scores
127
216
  const verificationScore = computeVerificationScore(verification);
128
217
  const sentinelScore = computeSentinelScore(sentinel?.verdict);
129
218
  const qualityScore = computeQualityScore(warningCount);
130
219
  const userScoreVal = computeUserScore(userAction);
131
- const composite = computeComposite({ verificationScore, sentinelScore, qualityScore, userScore: userScoreVal });
220
+ const precision = computePrecision(receipt);
221
+ const composite = computeComposite({ verificationScore, sentinelScore, qualityScore, userScore: userScoreVal, precision });
222
+
223
+ // Attribution (Phase 4)
224
+ const planQuality = computePlanQuality({ receipt, sentinel, challenge });
225
+ const executionQuality = computeExecutionQuality({ receipt, verification });
226
+ const attribution = computeAttribution({
227
+ planQuality,
228
+ executionQuality,
229
+ verificationScore,
230
+ sentinelScore,
231
+ userScore: userScoreVal
232
+ });
132
233
 
133
- // Derive scope from receipt
234
+ // Brier (Phase 3)
235
+ const outcomes = { verification, sentinel, challenge, quality, userAction, designReview: receipt?.designReview ? { verdict: receipt.designReview.verdict } : null, experimentDelta: null };
236
+ const brierScores = predictions ? computeBrierScores(predictions, outcomes) : null;
237
+
238
+ // Scope
134
239
  const scope = [];
135
240
  if (receipt?.changedFiles?.length) {
136
241
  const dirs = new Set();
@@ -148,16 +253,9 @@ export function computeScorecard({ task, receipt, qualityFindings, userAction })
148
253
  scope,
149
254
  approach: task?.chosenOption || null,
150
255
  timestamp: new Date().toISOString(),
151
- predictions: null,
152
- outcomes: {
153
- verification,
154
- sentinel,
155
- challenge,
156
- quality,
157
- designReview: receipt?.designReview ? { verdict: receipt.designReview.verdict } : null,
158
- userAction: userAction || null,
159
- experimentDelta: null
160
- },
256
+ precision,
257
+ predictions: predictions || null,
258
+ outcomes,
161
259
  scores: {
162
260
  verificationScore,
163
261
  sentinelScore,
@@ -165,7 +263,8 @@ export function computeScorecard({ task, receipt, qualityFindings, userAction })
165
263
  userScore: userScoreVal,
166
264
  composite
167
265
  },
168
- brierScores: null
266
+ attribution,
267
+ brierScores
169
268
  };
170
269
  }
171
270
 
@@ -196,6 +295,7 @@ export async function saveScorecard({ cwd, scorecard }) {
196
295
  scope: scorecard.scope,
197
296
  approach: scorecard.approach,
198
297
  composite: scorecard.scores.composite,
298
+ precision: scorecard.precision,
199
299
  timestamp: scorecard.timestamp
200
300
  });
201
301
  if (index.length > MAX_INDEX_ENTRIES) index.length = MAX_INDEX_ENTRIES;
@@ -247,6 +347,28 @@ export async function loadRecentScorecards({ cwd, limit = 10 }) {
247
347
  return cards;
248
348
  }
249
349
 
350
+ // --- Scope variance (for autonomy adjustment) ---
351
+
352
+ export function computeScopeVariance(scorecards) {
353
+ if (!scorecards || scorecards.length < 2) return null;
354
+ const composites = scorecards.map((c) => c.scores.composite).filter((v) => v !== null);
355
+ if (composites.length < 2) return null;
356
+ const mean = composites.reduce((a, b) => a + b, 0) / composites.length;
357
+ const variance = composites.reduce((sum, v) => sum + Math.pow(v - mean, 2), 0) / composites.length;
358
+ return Math.round(variance * 1000) / 1000;
359
+ }
360
+
361
+ export function suggestAutonomyForScope(scorecards) {
362
+ if (!scorecards || scorecards.length < 3) return null;
363
+ const avg = scorecards.reduce((sum, c) => sum + (c.scores.composite || 0), 0) / scorecards.length;
364
+ const variance = computeScopeVariance(scorecards);
365
+ // High score + low variance = trust this scope
366
+ if (avg >= 0.8 && (variance === null || variance < 0.05)) return "auto";
367
+ // Low score or high variance = be careful
368
+ if (avg < 0.4 || (variance !== null && variance > 0.15)) return "ask";
369
+ return "scoped";
370
+ }
371
+
250
372
  // --- Layer 2: Context injection ---
251
373
 
252
374
  export function buildCalibrationBlock(scorecards) {
@@ -264,19 +386,50 @@ export function buildCalibrationBlock(scorecards) {
264
386
  }
265
387
 
266
388
  for (const [approach, cards] of Object.entries(byApproach)) {
267
- const avg = cards.reduce((sum, c) => sum + (c.scores.composite || 0), 0) / cards.length;
268
- const verdicts = cards.map((c) => c.outcomes.sentinel?.verdict).filter(Boolean);
269
- const actions = cards.map((c) => c.outcomes.userAction).filter(Boolean);
270
- const line = `- ${approach}: avg score ${avg.toFixed(2)} (${verdicts.join(", ")}) → user: ${actions.join(", ")}`;
389
+ // Precision-weighted average
390
+ const totalWeight = cards.reduce((sum, c) => sum + (c.precision || 0.5), 0);
391
+ const weightedAvg = totalWeight > 0
392
+ ? cards.reduce((sum, c) => sum + (c.scores.composite || 0) * (c.precision || 0.5), 0) / totalWeight
393
+ : 0;
394
+ const verdicts = cards.map((c) => c.outcomes?.sentinel?.verdict).filter(Boolean);
395
+ const actions = cards.map((c) => c.outcomes?.userAction).filter(Boolean);
396
+ const line = `- ${approach}: weighted avg ${weightedAvg.toFixed(2)} (${verdicts.join(", ")}) → user: ${actions.join(", ")}`;
271
397
  if (chars + line.length > MAX_CALIBRATION_CHARS) break;
272
398
  lines.push(line);
273
399
  chars += line.length;
274
400
  }
275
401
 
276
- // Aggregate blind spots
402
+ // Attribution insights — where does the system fail?
403
+ const attrCounts = { plan: 0, execution: 0, verification: 0, sentinel: 0, user: 0 };
404
+ const attrSums = { plan: 0, execution: 0, verification: 0, sentinel: 0, user: 0 };
405
+ for (const sc of scorecards) {
406
+ if (!sc.attribution) continue;
407
+ for (const key of Object.keys(attrCounts)) {
408
+ if (sc.attribution[key] !== null && sc.attribution[key] !== undefined) {
409
+ attrCounts[key]++;
410
+ attrSums[key] += sc.attribution[key];
411
+ }
412
+ }
413
+ }
414
+ const attrAvgs = {};
415
+ for (const key of Object.keys(attrCounts)) {
416
+ if (attrCounts[key] > 0) attrAvgs[key] = attrSums[key] / attrCounts[key];
417
+ }
418
+ // Find weakest stage
419
+ const stages = Object.entries(attrAvgs).sort((a, b) => a[1] - b[1]);
420
+ if (stages.length > 0 && stages[0][1] < 0.6) {
421
+ const weakest = stages[0];
422
+ const attrLine = `Weakest stage: ${weakest[0]} (avg ${weakest[1].toFixed(2)}). Strengthen ${weakest[0] === "plan" ? "planning — add missing error handling, edge cases" : weakest[0] === "execution" ? "execution — check for compile errors, runtime crashes" : weakest[0] === "verification" ? "verification — ensure all tests pass before submitting" : "this stage"}.`;
423
+ if (chars + attrLine.length <= MAX_CALIBRATION_CHARS) {
424
+ lines.push(attrLine);
425
+ chars += attrLine.length;
426
+ }
427
+ }
428
+
429
+ // Quality blind spots
277
430
  const allFindings = {};
278
431
  for (const sc of scorecards) {
279
- for (const f of (sc.outcomes.quality?.findings || [])) {
432
+ for (const f of (sc.outcomes?.quality?.findings || [])) {
280
433
  allFindings[f] = (allFindings[f] || 0) + 1;
281
434
  }
282
435
  }
@@ -292,18 +445,32 @@ export function buildCalibrationBlock(scorecards) {
292
445
  }
293
446
  }
294
447
 
295
- // Brier calibration note (Layer 3)
448
+ // Brier calibration (Phase 3)
296
449
  const brierCards = scorecards.filter((sc) => sc.brierScores);
297
450
  if (brierCards.length >= 3) {
298
451
  const avgBrier = brierCards.reduce((sum, sc) => {
299
- const vals = Object.values(sc.brierScores);
300
- return sum + (vals.reduce((a, b) => a + b, 0) / vals.length);
452
+ const vals = Object.values(sc.brierScores).filter((v) => typeof v === "number");
453
+ return sum + (vals.length > 0 ? vals.reduce((a, b) => a + b, 0) / vals.length : 0);
301
454
  }, 0) / brierCards.length;
302
- if (avgBrier > 0.3) {
303
- lines.push(`Calibration warning: avg Brier ${avgBrier.toFixed(2)} — lower your confidence estimates.`);
455
+ const brierLine = avgBrier > 0.3
456
+ ? `Calibration warning: avg Brier ${avgBrier.toFixed(2)} — you are overconfident. Lower predictions.`
457
+ : `Calibration: avg Brier ${avgBrier.toFixed(2)} — well calibrated.`;
458
+ lines.push(brierLine);
459
+
460
+ // Contrarian signals
461
+ const contrarianCount = brierCards.filter((sc) => sc.brierScores?.contrarianReward).length;
462
+ if (contrarianCount > 0) {
463
+ lines.push(`Contrarian note: ${contrarianCount} builds succeeded despite low confidence — you may be too conservative on this scope.`);
304
464
  }
305
465
  }
306
466
 
467
+ // Scope variance → autonomy suggestion
468
+ const variance = computeScopeVariance(scorecards);
469
+ const suggestedAutonomy = suggestAutonomyForScope(scorecards);
470
+ if (suggestedAutonomy && variance !== null) {
471
+ lines.push(`Scope variance: ${variance.toFixed(3)}. Suggested autonomy: ${suggestedAutonomy}.`);
472
+ }
473
+
307
474
  return lines.join("\n");
308
475
  }
309
476
 
@@ -317,5 +484,12 @@ export function formatScorecardSummary(scorecard) {
317
484
  if (s.qualityScore !== null) parts.push(`quality:${(s.qualityScore * 100).toFixed(0)}%`);
318
485
  if (s.userScore !== null) parts.push(`user:${(s.userScore * 100).toFixed(0)}%`);
319
486
  if (s.composite !== null) parts.push(`composite:${(s.composite * 100).toFixed(0)}%`);
487
+ if (scorecard.precision) parts.push(`precision:${scorecard.precision.toFixed(1)}`);
488
+ if (scorecard.attribution) {
489
+ const weakest = Object.entries(scorecard.attribution)
490
+ .filter(([, v]) => v !== null)
491
+ .sort((a, b) => a[1] - b[1])[0];
492
+ if (weakest && weakest[1] < 0.6) parts.push(`weak:${weakest[0]}`);
493
+ }
320
494
  return parts.join(" ");
321
495
  }
package/src/workflow.js CHANGED
@@ -16,7 +16,7 @@ import {
16
16
  } from "./frontend.js";
17
17
  import { runPlannerPass, formatPlanForExecutor, formatPlanForDisplay } from "./planner.js";
18
18
  import { runVerificationPass, formatVerifierResults, hasFailures } from "./verifier.js";
19
- import { computeScorecard, saveScorecard, findRelevantScorecards, buildCalibrationBlock } from "./scorecard.js";
19
+ import { computeScorecard, saveScorecard, findRelevantScorecards, buildCalibrationBlock, generatePredictions } from "./scorecard.js";
20
20
 
21
21
  export async function runBuildWorkflow({
22
22
  agent,
@@ -28,14 +28,16 @@ export async function runBuildWorkflow({
28
28
  if (!task) throw new Error("no active task");
29
29
  if (!promptText) throw new Error("build requires a prompt");
30
30
 
31
- // Layer 2: Inject calibration from scored memory before planning
31
+ // Layer 2+3: Inject calibration + generate predictions from scored memory
32
32
  let calibrationBlock = "";
33
+ let predictions = null;
33
34
  try {
34
35
  const contractPaths = task.activeContract?.paths || [];
35
36
  if (contractPaths.length > 0) {
36
- const relevantCards = await findRelevantScorecards({ cwd: context.cwd, filePatterns: contractPaths, limit: 5 });
37
+ const relevantCards = await findRelevantScorecards({ cwd: context.cwd, filePatterns: contractPaths, limit: 10 });
37
38
  if (relevantCards.length > 0) {
38
39
  calibrationBlock = buildCalibrationBlock(relevantCards);
40
+ predictions = generatePredictions(relevantCards);
39
41
  }
40
42
  }
41
43
  } catch {}
@@ -325,15 +327,16 @@ export async function runBuildWorkflow({
325
327
  context.runtime.lastImpact = impact || null;
326
328
  }
327
329
 
328
- // Layer 1: Compute and save scorecard (passive scoring)
330
+ // Compute and save scorecard with predictions + attribution
329
331
  let scorecard = null;
330
332
  if (finalReceipt?.mutated) {
331
333
  try {
332
334
  scorecard = computeScorecard({
333
335
  task,
334
336
  receipt: finalReceipt,
335
- qualityFindings: null, // quality findings come from the caller if available
336
- userAction: null // populated later when user acts
337
+ qualityFindings: null,
338
+ userAction: null,
339
+ predictions
337
340
  });
338
341
  await saveScorecard({ cwd: context.cwd, scorecard });
339
342
  } catch {}