@tritard/waterbrother 0.13.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tritard/waterbrother",
3
- "version": "0.13.0",
3
+ "version": "0.14.0",
4
4
  "description": "Waterbrother: Grok-powered coding CLI with local tools, sessions, operator modes, and approval controls",
5
5
  "type": "module",
6
6
  "bin": {
package/src/scorecard.js CHANGED
@@ -3,7 +3,7 @@ import path from "node:path";
3
3
  import crypto from "node:crypto";
4
4
 
5
5
  const MAX_INDEX_ENTRIES = 200;
6
- const MAX_CALIBRATION_CHARS = 1500;
6
+ const MAX_CALIBRATION_CHARS = 2000;
7
7
 
8
8
  function scorecardsDir(cwd) {
9
9
  return path.join(cwd, ".waterbrother", "memory", "scorecards");
@@ -56,8 +56,64 @@ function computeUserScore(action) {
56
56
  return null;
57
57
  }
58
58
 
59
- function computeComposite({ verificationScore, sentinelScore, qualityScore, userScore }) {
60
- const weights = { verification: 0.25, sentinel: 0.25, quality: 0.25, user: 0.25 };
59
+ // --- Phase 4: Attribution decomposition ---
60
+ // Chain: prompt → plan → execution → verification → sentinel → user
61
+ // Each stage gets a score. Composite is precision-weighted.
62
+
63
+ function computeAttribution({ planQuality, executionQuality, verificationScore, sentinelScore, userScore }) {
64
+ return {
65
+ plan: planQuality !== null ? Math.round(planQuality * 100) / 100 : null,
66
+ execution: executionQuality !== null ? Math.round(executionQuality * 100) / 100 : null,
67
+ verification: verificationScore !== null ? Math.round(verificationScore * 100) / 100 : null,
68
+ sentinel: sentinelScore !== null ? Math.round(sentinelScore * 100) / 100 : null,
69
+ user: userScore !== null ? Math.round(userScore * 100) / 100 : null
70
+ };
71
+ }
72
+
73
+ function computePlanQuality({ receipt, sentinel, challenge }) {
74
+ // Plan is good if: files changed match contract scope, no scope leaks flagged
75
+ let score = 0.7; // baseline
76
+ if (sentinel?.verdict === "ship") score += 0.2;
77
+ if (sentinel?.verdict === "block") score -= 0.3;
78
+ // Scope leak = plan was too broad or too narrow
79
+ const scopeLeakConcerns = [...(sentinel?.concerns || []), ...(challenge?.concerns || [])]
80
+ .filter((c) => /scope|outside|unrelated|unnecessary/i.test(c));
81
+ score -= scopeLeakConcerns.length * 0.15;
82
+ return Math.max(0, Math.min(1, score));
83
+ }
84
+
85
+ function computeExecutionQuality({ receipt, verification }) {
86
+ // Execution is good if: code compiles, no runtime errors, verification passes
87
+ let score = 0.5; // baseline
88
+ if (Array.isArray(verification) && verification.length > 0) {
89
+ const passRate = verification.filter((v) => v.pass || v.ok).length / verification.length;
90
+ score = passRate;
91
+ }
92
+ // Bonus for clean diff (no empty files, no giant changes)
93
+ if (receipt?.changedFiles?.length > 0 && receipt.changedFiles.length <= 10) score += 0.1;
94
+ return Math.max(0, Math.min(1, score));
95
+ }
96
+
97
+ // --- Precision weighting ---
98
+ // Larger changes = more evidence = higher precision
99
+
100
+ function computePrecision(receipt) {
101
+ const fileCount = receipt?.changedFiles?.length || 0;
102
+ if (fileCount === 0) return 0.1;
103
+ if (fileCount <= 2) return 0.5;
104
+ if (fileCount <= 5) return 0.75;
105
+ if (fileCount <= 15) return 1.0;
106
+ return 1.0; // cap at 1.0
107
+ }
108
+
109
+ function computeComposite({ verificationScore, sentinelScore, qualityScore, userScore, precision }) {
110
+ // Precision-weighted blend
111
+ const weights = {
112
+ verification: 0.30 * (precision || 0.5),
113
+ sentinel: 0.25 * (precision || 0.5),
114
+ quality: 0.20,
115
+ user: 0.25
116
+ };
61
117
  let total = 0;
62
118
  let weightSum = 0;
63
119
 
@@ -69,7 +125,7 @@ function computeComposite({ verificationScore, sentinelScore, qualityScore, user
69
125
  return weightSum > 0 ? Math.round((total / weightSum) * 100) / 100 : null;
70
126
  }
71
127
 
72
- // --- Brier score (Layer 3) ---
128
+ // --- Phase 3: Brier scores ---
73
129
 
74
130
  export function computeBrierScores(predictions, outcomes) {
75
131
  if (!predictions || !outcomes) return null;
@@ -90,15 +146,48 @@ export function computeBrierScores(predictions, outcomes) {
90
146
  scores.userAcceptFirstTry = Math.round(Math.pow(predictions.userAcceptFirstTry - actual, 2) * 1000) / 1000;
91
147
  }
92
148
 
149
+ // Contrarian reward: predicted failure but it shipped clean
150
+ if (predictions.testPass !== undefined && predictions.testPass < 0.5) {
151
+ const actual = outcomes.verification?.every((v) => v.pass || v.ok) ? 1 : 0;
152
+ if (actual === 1) scores.contrarianReward = true;
153
+ }
154
+
93
155
  return Object.keys(scores).length > 0 ? scores : null;
94
156
  }
95
157
 
96
- // --- Scorecard creation (Layer 1: passive scoring) ---
158
+ // Generate predictions from historical data for a scope
159
+ export function generatePredictions(historicalCards) {
160
+ if (!historicalCards || historicalCards.length < 2) return null;
161
+
162
+ const testPassRates = historicalCards
163
+ .map((c) => c.scores.verificationScore)
164
+ .filter((v) => v !== null);
165
+ const sentinelShipRates = historicalCards
166
+ .map((c) => c.scores.sentinelScore)
167
+ .filter((v) => v !== null);
168
+ const userAcceptRates = historicalCards
169
+ .map((c) => c.scores.userScore)
170
+ .filter((v) => v !== null);
171
+
172
+ const avg = (arr) => arr.length > 0 ? arr.reduce((a, b) => a + b, 0) / arr.length : null;
173
+
174
+ const predictions = {};
175
+ const tp = avg(testPassRates);
176
+ const ss = avg(sentinelShipRates);
177
+ const ua = avg(userAcceptRates);
178
+
179
+ if (tp !== null) predictions.testPass = Math.round(tp * 100) / 100;
180
+ if (ss !== null) predictions.sentinelShip = Math.round(ss * 100) / 100;
181
+ if (ua !== null) predictions.userAcceptFirstTry = Math.round(Math.max(0, ua) * 100) / 100;
182
+
183
+ return Object.keys(predictions).length > 0 ? predictions : null;
184
+ }
185
+
186
+ // --- Scorecard creation ---
97
187
 
98
- export function computeScorecard({ task, receipt, qualityFindings, userAction }) {
188
+ export function computeScorecard({ task, receipt, qualityFindings, userAction, predictions }) {
99
189
  const id = makeId(task?.name || task?.id || "build");
100
190
 
101
- // Extract outcomes from receipt
102
191
  const verification = (receipt?.verification || []).map((v) => ({
103
192
  command: v.command || v.label || "check",
104
193
  pass: Boolean(v.ok)
@@ -123,14 +212,30 @@ export function computeScorecard({ task, receipt, qualityFindings, userAction })
123
212
  : []
124
213
  };
125
214
 
126
- // Compute scores
215
+ // Scores
127
216
  const verificationScore = computeVerificationScore(verification);
128
217
  const sentinelScore = computeSentinelScore(sentinel?.verdict);
129
218
  const qualityScore = computeQualityScore(warningCount);
130
219
  const userScoreVal = computeUserScore(userAction);
131
- const composite = computeComposite({ verificationScore, sentinelScore, qualityScore, userScore: userScoreVal });
220
+ const precision = computePrecision(receipt);
221
+ const composite = computeComposite({ verificationScore, sentinelScore, qualityScore, userScore: userScoreVal, precision });
222
+
223
+ // Attribution (Phase 4)
224
+ const planQuality = computePlanQuality({ receipt, sentinel, challenge });
225
+ const executionQuality = computeExecutionQuality({ receipt, verification });
226
+ const attribution = computeAttribution({
227
+ planQuality,
228
+ executionQuality,
229
+ verificationScore,
230
+ sentinelScore,
231
+ userScore: userScoreVal
232
+ });
132
233
 
133
- // Derive scope from receipt
234
+ // Brier (Phase 3)
235
+ const outcomes = { verification, sentinel, challenge, quality, userAction, designReview: receipt?.designReview ? { verdict: receipt.designReview.verdict } : null, experimentDelta: null };
236
+ const brierScores = predictions ? computeBrierScores(predictions, outcomes) : null;
237
+
238
+ // Scope
134
239
  const scope = [];
135
240
  if (receipt?.changedFiles?.length) {
136
241
  const dirs = new Set();
@@ -148,16 +253,9 @@ export function computeScorecard({ task, receipt, qualityFindings, userAction })
148
253
  scope,
149
254
  approach: task?.chosenOption || null,
150
255
  timestamp: new Date().toISOString(),
151
- predictions: null,
152
- outcomes: {
153
- verification,
154
- sentinel,
155
- challenge,
156
- quality,
157
- designReview: receipt?.designReview ? { verdict: receipt.designReview.verdict } : null,
158
- userAction: userAction || null,
159
- experimentDelta: null
160
- },
256
+ precision,
257
+ predictions: predictions || null,
258
+ outcomes,
161
259
  scores: {
162
260
  verificationScore,
163
261
  sentinelScore,
@@ -165,7 +263,8 @@ export function computeScorecard({ task, receipt, qualityFindings, userAction })
165
263
  userScore: userScoreVal,
166
264
  composite
167
265
  },
168
- brierScores: null
266
+ attribution,
267
+ brierScores
169
268
  };
170
269
  }
171
270
 
@@ -196,6 +295,7 @@ export async function saveScorecard({ cwd, scorecard }) {
196
295
  scope: scorecard.scope,
197
296
  approach: scorecard.approach,
198
297
  composite: scorecard.scores.composite,
298
+ precision: scorecard.precision,
199
299
  timestamp: scorecard.timestamp
200
300
  });
201
301
  if (index.length > MAX_INDEX_ENTRIES) index.length = MAX_INDEX_ENTRIES;
@@ -247,6 +347,28 @@ export async function loadRecentScorecards({ cwd, limit = 10 }) {
247
347
  return cards;
248
348
  }
249
349
 
350
+ // --- Scope variance (for autonomy adjustment) ---
351
+
352
+ export function computeScopeVariance(scorecards) {
353
+ if (!scorecards || scorecards.length < 2) return null;
354
+ const composites = scorecards.map((c) => c.scores.composite).filter((v) => v !== null);
355
+ if (composites.length < 2) return null;
356
+ const mean = composites.reduce((a, b) => a + b, 0) / composites.length;
357
+ const variance = composites.reduce((sum, v) => sum + Math.pow(v - mean, 2), 0) / composites.length;
358
+ return Math.round(variance * 1000) / 1000;
359
+ }
360
+
361
+ export function suggestAutonomyForScope(scorecards) {
362
+ if (!scorecards || scorecards.length < 3) return null;
363
+ const avg = scorecards.reduce((sum, c) => sum + (c.scores.composite || 0), 0) / scorecards.length;
364
+ const variance = computeScopeVariance(scorecards);
365
+ // High score + low variance = trust this scope
366
+ if (avg >= 0.8 && (variance === null || variance < 0.05)) return "auto";
367
+ // Low score or high variance = be careful
368
+ if (avg < 0.4 || (variance !== null && variance > 0.15)) return "ask";
369
+ return "scoped";
370
+ }
371
+
250
372
  // --- Layer 2: Context injection ---
251
373
 
252
374
  export function buildCalibrationBlock(scorecards) {
@@ -264,19 +386,50 @@ export function buildCalibrationBlock(scorecards) {
264
386
  }
265
387
 
266
388
  for (const [approach, cards] of Object.entries(byApproach)) {
267
- const avg = cards.reduce((sum, c) => sum + (c.scores.composite || 0), 0) / cards.length;
268
- const verdicts = cards.map((c) => c.outcomes.sentinel?.verdict).filter(Boolean);
269
- const actions = cards.map((c) => c.outcomes.userAction).filter(Boolean);
270
- const line = `- ${approach}: avg score ${avg.toFixed(2)} (${verdicts.join(", ")}) → user: ${actions.join(", ")}`;
389
+ // Precision-weighted average
390
+ const totalWeight = cards.reduce((sum, c) => sum + (c.precision || 0.5), 0);
391
+ const weightedAvg = totalWeight > 0
392
+ ? cards.reduce((sum, c) => sum + (c.scores.composite || 0) * (c.precision || 0.5), 0) / totalWeight
393
+ : 0;
394
+ const verdicts = cards.map((c) => c.outcomes?.sentinel?.verdict).filter(Boolean);
395
+ const actions = cards.map((c) => c.outcomes?.userAction).filter(Boolean);
396
+ const line = `- ${approach}: weighted avg ${weightedAvg.toFixed(2)} (${verdicts.join(", ")}) → user: ${actions.join(", ")}`;
271
397
  if (chars + line.length > MAX_CALIBRATION_CHARS) break;
272
398
  lines.push(line);
273
399
  chars += line.length;
274
400
  }
275
401
 
276
- // Aggregate blind spots
402
+ // Attribution insights — where does the system fail?
403
+ const attrCounts = { plan: 0, execution: 0, verification: 0, sentinel: 0, user: 0 };
404
+ const attrSums = { plan: 0, execution: 0, verification: 0, sentinel: 0, user: 0 };
405
+ for (const sc of scorecards) {
406
+ if (!sc.attribution) continue;
407
+ for (const key of Object.keys(attrCounts)) {
408
+ if (sc.attribution[key] !== null && sc.attribution[key] !== undefined) {
409
+ attrCounts[key]++;
410
+ attrSums[key] += sc.attribution[key];
411
+ }
412
+ }
413
+ }
414
+ const attrAvgs = {};
415
+ for (const key of Object.keys(attrCounts)) {
416
+ if (attrCounts[key] > 0) attrAvgs[key] = attrSums[key] / attrCounts[key];
417
+ }
418
+ // Find weakest stage
419
+ const stages = Object.entries(attrAvgs).sort((a, b) => a[1] - b[1]);
420
+ if (stages.length > 0 && stages[0][1] < 0.6) {
421
+ const weakest = stages[0];
422
+ const attrLine = `Weakest stage: ${weakest[0]} (avg ${weakest[1].toFixed(2)}). Strengthen ${weakest[0] === "plan" ? "planning — add missing error handling, edge cases" : weakest[0] === "execution" ? "execution — check for compile errors, runtime crashes" : weakest[0] === "verification" ? "verification — ensure all tests pass before submitting" : "this stage"}.`;
423
+ if (chars + attrLine.length <= MAX_CALIBRATION_CHARS) {
424
+ lines.push(attrLine);
425
+ chars += attrLine.length;
426
+ }
427
+ }
428
+
429
+ // Quality blind spots
277
430
  const allFindings = {};
278
431
  for (const sc of scorecards) {
279
- for (const f of (sc.outcomes.quality?.findings || [])) {
432
+ for (const f of (sc.outcomes?.quality?.findings || [])) {
280
433
  allFindings[f] = (allFindings[f] || 0) + 1;
281
434
  }
282
435
  }
@@ -292,18 +445,32 @@ export function buildCalibrationBlock(scorecards) {
292
445
  }
293
446
  }
294
447
 
295
- // Brier calibration note (Layer 3)
448
+ // Brier calibration (Phase 3)
296
449
  const brierCards = scorecards.filter((sc) => sc.brierScores);
297
450
  if (brierCards.length >= 3) {
298
451
  const avgBrier = brierCards.reduce((sum, sc) => {
299
- const vals = Object.values(sc.brierScores);
300
- return sum + (vals.reduce((a, b) => a + b, 0) / vals.length);
452
+ const vals = Object.values(sc.brierScores).filter((v) => typeof v === "number");
453
+ return sum + (vals.length > 0 ? vals.reduce((a, b) => a + b, 0) / vals.length : 0);
301
454
  }, 0) / brierCards.length;
302
- if (avgBrier > 0.3) {
303
- lines.push(`Calibration warning: avg Brier ${avgBrier.toFixed(2)} — lower your confidence estimates.`);
455
+ const brierLine = avgBrier > 0.3
456
+ ? `Calibration warning: avg Brier ${avgBrier.toFixed(2)} — you are overconfident. Lower predictions.`
457
+ : `Calibration: avg Brier ${avgBrier.toFixed(2)} — well calibrated.`;
458
+ lines.push(brierLine);
459
+
460
+ // Contrarian signals
461
+ const contrarianCount = brierCards.filter((sc) => sc.brierScores?.contrarianReward).length;
462
+ if (contrarianCount > 0) {
463
+ lines.push(`Contrarian note: ${contrarianCount} builds succeeded despite low confidence — you may be too conservative on this scope.`);
304
464
  }
305
465
  }
306
466
 
467
+ // Scope variance → autonomy suggestion
468
+ const variance = computeScopeVariance(scorecards);
469
+ const suggestedAutonomy = suggestAutonomyForScope(scorecards);
470
+ if (suggestedAutonomy && variance !== null) {
471
+ lines.push(`Scope variance: ${variance.toFixed(3)}. Suggested autonomy: ${suggestedAutonomy}.`);
472
+ }
473
+
307
474
  return lines.join("\n");
308
475
  }
309
476
 
@@ -317,5 +484,12 @@ export function formatScorecardSummary(scorecard) {
317
484
  if (s.qualityScore !== null) parts.push(`quality:${(s.qualityScore * 100).toFixed(0)}%`);
318
485
  if (s.userScore !== null) parts.push(`user:${(s.userScore * 100).toFixed(0)}%`);
319
486
  if (s.composite !== null) parts.push(`composite:${(s.composite * 100).toFixed(0)}%`);
487
+ if (scorecard.precision) parts.push(`precision:${scorecard.precision.toFixed(1)}`);
488
+ if (scorecard.attribution) {
489
+ const weakest = Object.entries(scorecard.attribution)
490
+ .filter(([, v]) => v !== null)
491
+ .sort((a, b) => a[1] - b[1])[0];
492
+ if (weakest && weakest[1] < 0.6) parts.push(`weak:${weakest[0]}`);
493
+ }
320
494
  return parts.join(" ");
321
495
  }
package/src/workflow.js CHANGED
@@ -16,7 +16,7 @@ import {
16
16
  } from "./frontend.js";
17
17
  import { runPlannerPass, formatPlanForExecutor, formatPlanForDisplay } from "./planner.js";
18
18
  import { runVerificationPass, formatVerifierResults, hasFailures } from "./verifier.js";
19
- import { computeScorecard, saveScorecard, findRelevantScorecards, buildCalibrationBlock } from "./scorecard.js";
19
+ import { computeScorecard, saveScorecard, findRelevantScorecards, buildCalibrationBlock, generatePredictions } from "./scorecard.js";
20
20
 
21
21
  export async function runBuildWorkflow({
22
22
  agent,
@@ -28,14 +28,16 @@ export async function runBuildWorkflow({
28
28
  if (!task) throw new Error("no active task");
29
29
  if (!promptText) throw new Error("build requires a prompt");
30
30
 
31
- // Layer 2: Inject calibration from scored memory before planning
31
+ // Layer 2+3: Inject calibration + generate predictions from scored memory
32
32
  let calibrationBlock = "";
33
+ let predictions = null;
33
34
  try {
34
35
  const contractPaths = task.activeContract?.paths || [];
35
36
  if (contractPaths.length > 0) {
36
- const relevantCards = await findRelevantScorecards({ cwd: context.cwd, filePatterns: contractPaths, limit: 5 });
37
+ const relevantCards = await findRelevantScorecards({ cwd: context.cwd, filePatterns: contractPaths, limit: 10 });
37
38
  if (relevantCards.length > 0) {
38
39
  calibrationBlock = buildCalibrationBlock(relevantCards);
40
+ predictions = generatePredictions(relevantCards);
39
41
  }
40
42
  }
41
43
  } catch {}
@@ -325,15 +327,16 @@ export async function runBuildWorkflow({
325
327
  context.runtime.lastImpact = impact || null;
326
328
  }
327
329
 
328
- // Layer 1: Compute and save scorecard (passive scoring)
330
+ // Compute and save scorecard with predictions + attribution
329
331
  let scorecard = null;
330
332
  if (finalReceipt?.mutated) {
331
333
  try {
332
334
  scorecard = computeScorecard({
333
335
  task,
334
336
  receipt: finalReceipt,
335
- qualityFindings: null, // quality findings come from the caller if available
336
- userAction: null // populated later when user acts
337
+ qualityFindings: null,
338
+ userAction: null,
339
+ predictions
337
340
  });
338
341
  await saveScorecard({ cwd: context.cwd, scorecard });
339
342
  } catch {}