@tritard/waterbrother 0.12.8 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tritard/waterbrother",
3
- "version": "0.12.8",
3
+ "version": "0.13.0",
4
4
  "description": "Waterbrother: Grok-powered coding CLI with local tools, sessions, operator modes, and approval controls",
5
5
  "type": "module",
6
6
  "bin": {
package/src/agent.js CHANGED
@@ -136,6 +136,7 @@ function buildSystemPrompt(profile, experienceMode = "standard", autonomyMode =
136
136
  ctxLines.push(`Benchmark site-type rules:\n- ${frontend.benchmarkSiteTypeRules.join("\n- ")}`);
137
137
  }
138
138
  }
139
+ if (executionContext.calibration) ctxLines.push(`Scope calibration (from scored build history):\n${executionContext.calibration}`);
139
140
  if (executionContext.reminders) ctxLines.push(`Scope reminders:\n${executionContext.reminders}`);
140
141
  if (ctxLines.length > 0) base += `\n\nExecution context:\n${ctxLines.join("\n")}`;
141
142
  }
package/src/cli.js CHANGED
@@ -33,6 +33,7 @@ import { runBuildWorkflow, startFeatureTask, runChallengeWorkflow } from "./work
33
33
  import { createPanelRenderer, buildPanelState } from "./panel.js";
34
34
  import { deriveTaskNameFromPrompt, nextActionsForState, routeNaturalInput } from "./router.js";
35
35
  import { compressEpisode, compressSessionEpisode, saveEpisode, loadRecentEpisodes, findRelevantEpisodes, buildEpisodicMemoryBlock, buildReminderBlock } from "./episodic.js";
36
+ import { formatScorecardSummary } from "./scorecard.js";
36
37
  import { createProduct, loadProduct, saveProduct, hasProduct, generateBlueprint, buildProductContext, detectProductRequest, parseProductIntent, addSurface, createCampaign, getActiveCampaign, matchTemplate, applyTemplate, startPreview, killPreview } from "./product.js";
37
38
  import { runQualityChecks, formatQualityFindings, buildQualityFixPrompt } from "./quality.js";
38
39
  import { scanForInitiatives, formatInitiatives, buildInitiativeFixPrompt } from "./initiative.js";
@@ -5244,7 +5245,7 @@ Be concrete about surfaces — name actual pages/flows. Choose the best stack fo
5244
5245
  }
5245
5246
 
5246
5247
  // Refine: natural language changes to a built product
5247
- if (/^(fix these|fix quality|fix initiatives|fix product)$/.test(lower) && !context.runtime.activeTask) {
5248
+ if (/^(fix these|fix quality|fix initiatives|fix product|fix issues|fix the issues|fix it|fix everything|fix all)$/.test(lower) && !context.runtime.activeTask) {
5248
5249
  // Only handle product fixes when no task is active — otherwise let router handle "fix these" for task reviews
5249
5250
  if (context.runtime.pendingInitiatives?.length > 0) {
5250
5251
  const spinner = createProgressSpinner("fixing product gaps...");
@@ -5270,7 +5271,7 @@ Be concrete about surfaces — name actual pages/flows. Choose the best stack fo
5270
5271
  }
5271
5272
  }
5272
5273
 
5273
- if (/^(fix these|fix quality)$/.test(lower) && !context.runtime.activeTask) {
5274
+ if (/^(fix these|fix quality|fix issues|fix the issues|fix it|fix everything|fix all)$/.test(lower) && !context.runtime.activeTask) {
5274
5275
  const spinner = createProgressSpinner("fixing quality issues...");
5275
5276
  try {
5276
5277
  const findings = await runQualityChecks({ cwd: context.cwd });
@@ -6446,6 +6447,11 @@ Be concrete about surfaces — name actual pages/flows. Choose the best stack fo
6446
6447
  lines.push(`${dim("impact:")} ${parts.join(", ")}`);
6447
6448
  }
6448
6449
 
6450
+ // Scorecard
6451
+ if (buildResult.scorecard) {
6452
+ lines.push(`${dim("score:")} ${formatScorecardSummary(buildResult.scorecard)}`);
6453
+ }
6454
+
6449
6455
  // Sentinel verdict
6450
6456
  if (buildResult.review) {
6451
6457
  const v = buildResult.review.verdict;
@@ -0,0 +1,321 @@
1
+ import fs from "node:fs/promises";
2
+ import path from "node:path";
3
+ import crypto from "node:crypto";
4
+
5
// Upper bound on the number of summary rows kept in the scorecard index.
const MAX_INDEX_ENTRIES = 200;

// Character budget for the calibration text built from scorecards.
const MAX_CALIBRATION_CHARS = 1500;
7
+
8
/**
 * Resolve the directory that holds persisted scorecards for a project.
 *
 * @param {string} cwd - Project root directory.
 * @returns {string} `<cwd>/.waterbrother/memory/scorecards`.
 */
function scorecardsDir(cwd) {
  const segments = [".waterbrother", "memory", "scorecards"];
  return path.join(cwd, ...segments);
}
11
+
12
/**
 * Resolve the location of the scorecard index file.
 *
 * @param {string} cwd - Project root directory.
 * @returns {string} Path of `index.json` inside the scorecards directory.
 */
function indexPath(cwd) {
  const dir = scorecardsDir(cwd);
  return path.join(dir, "index.json");
}
15
+
16
/**
 * Resolve the file that stores a single scorecard.
 *
 * @param {string} cwd - Project root directory.
 * @param {string} id - Scorecard identifier; used verbatim as the file stem.
 * @returns {string} Path of `<id>.json` inside the scorecards directory.
 */
function scorecardPath(cwd, id) {
  const fileName = `${id}.json`;
  const dir = scorecardsDir(cwd);
  return path.join(dir, fileName);
}
19
+
20
/**
 * Build a scorecard id of the form `sc_<slug>-<6 hex chars>`.
 *
 * The slug is the lower-cased name with every run of non-alphanumeric
 * characters collapsed to a single hyphen, capped at 40 characters. When the
 * name yields no slug (empty, nullish, or only punctuation) the id is just
 * `sc_<6 hex chars>`.
 *
 * @param {unknown} name - Human-readable task name or id.
 * @returns {string} Identifier with a random 3-byte hex suffix.
 */
function makeId(name) {
  const slug = String(name || "")
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, "-")
    .replace(/^-/, "")
    .slice(0, 40)
    // Trim the trailing hyphen AFTER truncating: cutting at 40 characters can
    // land right on a separator, which would otherwise yield "…--<rand>".
    .replace(/-$/, "");
  const rand = crypto.randomBytes(3).toString("hex");
  return slug ? `sc_${slug}-${rand}` : `sc_${rand}`;
}
29
+
30
+ // --- Score computation ---
31
+
32
/**
 * Fraction of verification checks that passed.
 *
 * @param {Array<{pass?: unknown, ok?: unknown}>} verification - Check results;
 *   an entry counts as passed when either `pass` or `ok` is truthy.
 * @returns {number|null} Value in [0, 1], or null when the input is not a
 *   non-empty array.
 */
function computeVerificationScore(verification) {
  const hasChecks = Array.isArray(verification) && verification.length !== 0;
  if (!hasChecks) return null;

  const passedCount = verification.reduce(
    (count, check) => (check.pass || check.ok ? count + 1 : count),
    0
  );
  return passedCount / verification.length;
}
37
+
38
/**
 * Map a sentinel verdict onto a numeric score.
 *
 * @param {unknown} verdict - "ship", "caution" or "block".
 * @returns {number|null} 1 for ship, 0.5 for caution, 0 for block, and null
 *   for anything else.
 */
function computeSentinelScore(verdict) {
  switch (verdict) {
    case "ship":
      return 1.0;
    case "caution":
      return 0.5;
    case "block":
      return 0.0;
    default:
      return null;
  }
}
44
+
45
/**
 * Convert a warning count into a quality score.
 *
 * Each warning costs one tenth of the score; ten or more warnings score 0.
 *
 * @param {number|null|undefined} warningCount - Number of quality warnings.
 * @returns {number|null} Score with a floor of 0, or null when no count was
 *   supplied.
 */
function computeQualityScore(warningCount) {
  const isMissing = warningCount === null || warningCount === undefined;
  if (isMissing) return null;

  const WARNING_CEILING = 10;
  const penalty = warningCount / WARNING_CEILING;
  return Math.max(0, 1 - penalty);
}
50
+
51
/**
 * Map the user's reaction to a build onto a numeric score.
 *
 * @param {unknown} action - Recorded user action.
 * @returns {number|null} 1 for "accepted"/"ship it", 0 for
 *   "fix-these"/"fix these", -0.5 for "redo", 0.25 for "challenge", and null
 *   for any other value.
 */
function computeUserScore(action) {
  switch (action) {
    case "accepted":
    case "ship it":
      return 1.0;
    case "fix-these":
    case "fix these":
      return 0.0;
    case "redo":
      return -0.5;
    case "challenge":
      return 0.25;
    default:
      return null;
  }
}
58
+
59
/**
 * Combine the component scores into one composite, rounded to two decimals.
 *
 * Every component carries a weight of 0.25. Components that are exactly
 * `null` are left out and the remaining weights are renormalized.
 *
 * @param {{verificationScore: number|null, sentinelScore: number|null,
 *   qualityScore: number|null, userScore: number|null}} scores
 * @returns {number|null} Weighted mean of the non-null components, or null
 *   when every component is null.
 */
function computeComposite({ verificationScore, sentinelScore, qualityScore, userScore }) {
  const components = [
    [verificationScore, 0.25],
    [sentinelScore, 0.25],
    [qualityScore, 0.25],
    [userScore, 0.25],
  ];

  let weighted = 0;
  let totalWeight = 0;
  for (const [score, weight] of components) {
    if (score === null) continue;
    weighted += score * weight;
    totalWeight += weight;
  }

  if (!(totalWeight > 0)) return null;
  return Math.round((weighted / totalWeight) * 100) / 100;
}
71
+
72
+ // --- Brier score (Layer 3) ---
73
+
74
/**
 * Squared error between a predicted probability and a binary outcome,
 * rounded to three decimals.
 *
 * @param {number} probability - Predicted probability of the event.
 * @param {boolean} occurred - Whether the event actually happened.
 * @returns {number} `(probability - actual)^2` with `actual` being 1 or 0.
 */
function brierTerm(probability, occurred) {
  const actual = occurred ? 1 : 0;
  return Math.round((probability - actual) ** 2 * 1000) / 1000;
}

/**
 * Score each prediction against what actually happened.
 *
 * A prediction is scored only when it is a finite number AND the matching
 * outcome carries a signal:
 *   - `testPass` vs. every verification entry having a truthy `pass`/`ok`;
 *   - `sentinelShip` vs. the sentinel verdict being "ship";
 *   - `userAcceptFirstTry` vs. the user action being "accepted"/"ship it".
 *
 * @param {{testPass?: number, sentinelShip?: number,
 *   userAcceptFirstTry?: number}|null|undefined} predictions
 * @param {{verification?: Array<{pass?: unknown, ok?: unknown}>,
 *   sentinel?: {verdict?: string}|null, userAction?: string|null}|null|undefined} outcomes
 * @returns {Object<string, number>|null} Per-prediction scores, or null when
 *   nothing could be scored.
 */
export function computeBrierScores(predictions, outcomes) {
  if (!predictions || !outcomes) return null;

  const scores = {};
  const isProbability = (value) => typeof value === "number" && Number.isFinite(value);

  // An empty verification list is "no signal", not "all passed": `[].every()`
  // is vacuously true and would score the prediction against a pass that never
  // happened.
  const checks = outcomes.verification;
  if (isProbability(predictions.testPass) && Array.isArray(checks) && checks.length > 0) {
    const allPassed = checks.every((v) => v.pass || v.ok);
    scores.testPass = brierTerm(predictions.testPass, allPassed);
  }

  const verdict = outcomes.sentinel?.verdict;
  if (isProbability(predictions.sentinelShip) && verdict) {
    scores.sentinelShip = brierTerm(predictions.sentinelShip, verdict === "ship");
  }

  const action = outcomes.userAction;
  if (isProbability(predictions.userAcceptFirstTry) && action) {
    const accepted = action === "accepted" || action === "ship it";
    scores.userAcceptFirstTry = brierTerm(predictions.userAcceptFirstTry, accepted);
  }

  return Object.keys(scores).length > 0 ? scores : null;
}
95
+
96
+ // --- Scorecard creation (Layer 1: passive scoring) ---
97
+
98
/**
 * Build a scorecard for one build from its receipt and any known feedback.
 *
 * Nothing is persisted here. `predictions`, `brierScores` and
 * `outcomes.experimentDelta` are always initialized to null by this function.
 *
 * @param {object} args
 * @param {{id?: string, name?: string, chosenOption?: string}} [args.task]
 *   Task the build belongs to; supplies the id slug and the `approach`.
 * @param {object} [args.receipt] Build receipt. Fields read: `verification`,
 *   `review`, `challenge`, `designReview`, `changedFiles`.
 * @param {Array<{severity?: string, message?: string}>|null} [args.qualityFindings]
 *   Quality findings; only entries with severity "warning" are counted. A
 *   non-array value means "not measured" and yields a null quality score.
 * @param {string|null} [args.userAction] User reaction, when already known.
 * @returns {object} Scorecard with `id`, `taskId`, `taskName`, `scope`,
 *   `approach`, `timestamp`, `predictions`, `outcomes`, `scores`, `brierScores`.
 */
export function computeScorecard({ task, receipt, qualityFindings, userAction }) {
  const id = makeId(task?.name || task?.id || "build");

  // Normalize verification entries. Both `ok` and `pass` are honoured so this
  // agrees with the scorers in this module, which read `pass || ok`.
  const verification = (receipt?.verification || []).map((v) => ({
    command: v.command || v.label || "check",
    pass: Boolean(v.ok || v.pass),
  }));

  const sentinel = receipt?.review
    ? { verdict: receipt.review.verdict, concerns: receipt.review.concerns || [] }
    : null;

  const challenge = receipt?.challenge
    ? { concerns: receipt.challenge.concerns || [] }
    : null;

  // Filter warnings once; null keeps "not measured" distinct from "0 warnings".
  const warnings = Array.isArray(qualityFindings)
    ? qualityFindings.filter((f) => f?.severity === "warning")
    : null;
  const warningCount = warnings ? warnings.length : null;

  const quality = {
    warnings: warningCount ?? 0,
    findings: warnings ? warnings.map((f) => f.message).slice(0, 5) : [],
  };

  const verificationScore = computeVerificationScore(verification);
  const sentinelScore = computeSentinelScore(sentinel?.verdict);
  const qualityScore = computeQualityScore(warningCount);
  const userScoreVal = computeUserScore(userAction);
  const composite = computeComposite({
    verificationScore,
    sentinelScore,
    qualityScore,
    userScore: userScoreVal,
  });

  // Scope = the distinct parent directories of the changed files, as `dir/**`
  // globs. Files at the project root have no parent directory and add nothing.
  const dirs = new Set();
  const changedFiles = Array.isArray(receipt?.changedFiles) ? receipt.changedFiles : [];
  for (const file of changedFiles) {
    // Skip malformed entries instead of throwing and losing the whole card.
    if (typeof file !== "string") continue;
    const dir = file.replace(/\\/g, "/").split("/").slice(0, -1).join("/");
    if (dir) dirs.add(`${dir}/**`);
  }
  const scope = [...dirs];

  return {
    id,
    taskId: task?.id || null,
    taskName: task?.name || null,
    scope,
    approach: task?.chosenOption || null,
    timestamp: new Date().toISOString(),
    predictions: null,
    outcomes: {
      verification,
      sentinel,
      challenge,
      quality,
      designReview: receipt?.designReview ? { verdict: receipt.designReview.verdict } : null,
      userAction: userAction || null,
      experimentDelta: null,
    },
    scores: {
      verificationScore,
      sentinelScore,
      qualityScore,
      userScore: userScoreVal,
      composite,
    },
    brierScores: null,
  };
}
171
+
172
+ // --- Storage ---
173
+
174
/**
 * Load the scorecard index for a project.
 *
 * This is a best-effort read: a missing, unreadable or malformed index is
 * treated as empty so scoring never blocks a build.
 *
 * @param {string} cwd - Project root directory.
 * @returns {Promise<Array<object>>} Index entries, or an empty array.
 */
async function readIndex(cwd) {
  try {
    const raw = await fs.readFile(indexPath(cwd), "utf8");
    const parsed = JSON.parse(raw);
    // Valid JSON of the wrong shape (`{}`, `null`, a string) must not reach
    // callers, which call `unshift`, `slice` and iterate on the result.
    return Array.isArray(parsed) ? parsed : [];
  } catch {
    // Deliberately swallowed: absence or corruption of the index is expected.
    return [];
  }
}
182
+
183
/**
 * Persist the scorecard index, creating the scorecards directory if needed.
 *
 * @param {string} cwd - Project root directory.
 * @param {Array<object>} index - Entries to serialize (2-space indented JSON
 *   followed by a newline).
 * @returns {Promise<void>}
 */
async function writeIndex(cwd, index) {
  const dir = scorecardsDir(cwd);
  await fs.mkdir(dir, { recursive: true });

  const serialized = JSON.stringify(index, null, 2);
  await fs.writeFile(indexPath(cwd), `${serialized}\n`, "utf8");
}
187
+
188
/**
 * Write a scorecard to disk and record it at the front of the index.
 *
 * The full scorecard goes to `<id>.json`. The index gets a summary row
 * (id, task name, scope, approach, composite, timestamp) and is capped at
 * MAX_INDEX_ENTRIES rows, dropping the entries at the end.
 *
 * @param {object} args
 * @param {string} args.cwd - Project root directory.
 * @param {object} args.scorecard - Scorecard to persist.
 * @returns {Promise<void>}
 */
export async function saveScorecard({ cwd, scorecard }) {
  const dir = scorecardsDir(cwd);
  await fs.mkdir(dir, { recursive: true });

  const cardJson = JSON.stringify(scorecard, null, 2);
  await fs.writeFile(scorecardPath(cwd, scorecard.id), `${cardJson}\n`, "utf8");

  const index = await readIndex(cwd);
  const { id, taskName, scope, approach, timestamp } = scorecard;
  const summary = {
    id,
    taskName,
    scope,
    approach,
    composite: scorecard.scores.composite,
    timestamp,
  };
  index.unshift(summary);

  // Newest entries sit at the front, so truncating removes the tail.
  index.splice(MAX_INDEX_ENTRIES);
  await writeIndex(cwd, index);
}
204
+
205
/**
 * Reduce a scope pattern to a comparable directory string: forward slashes,
 * no trailing `**` segment, no trailing slash, lower case.
 *
 * @param {unknown} pattern - Pattern such as `src/app/**`.
 * @returns {string} Normalized directory; "" when the pattern is just `**`.
 */
function normalizeScopeDir(pattern) {
  return String(pattern)
    // Normalize separators first so a `\**` suffix is stripped as well.
    .replace(/\\/g, "/")
    .replace(/\/?\*\*$/, "")
    .replace(/\/+$/, "")
    .toLowerCase();
}

/**
 * Whether two normalized directories are equal or one contains the other.
 * The comparison respects path-segment boundaries, so `src/app` does not
 * overlap `src/application`. An empty directory overlaps everything.
 *
 * @param {string} a
 * @param {string} b
 * @returns {boolean}
 */
function scopeDirsOverlap(a, b) {
  if (a === b || a === "" || b === "") return true;
  return a.startsWith(`${b}/`) || b.startsWith(`${a}/`);
}

/**
 * Find stored scorecards whose recorded scope overlaps the given patterns.
 *
 * Each scope entry of an index row that overlaps at least one query directory
 * adds 3 to the row's relevance. Rows with no overlap are dropped. The rest
 * are ordered by relevance, then newest first, and the top `limit` scorecards
 * are loaded from disk. Scorecard files that cannot be read or parsed are
 * skipped.
 *
 * @param {object} args
 * @param {string} args.cwd - Project root directory.
 * @param {string[]} [args.filePatterns=[]] - Directory patterns to match.
 * @param {number} [args.limit=10] - Maximum number of scorecards to return.
 * @returns {Promise<object[]>} Matching scorecards, most relevant first.
 */
export async function findRelevantScorecards({ cwd, filePatterns = [], limit = 10 }) {
  const index = await readIndex(cwd);
  if (index.length === 0) return [];

  const queryDirs = filePatterns.map(normalizeScopeDir);
  if (queryDirs.length === 0) return [];

  const scored = [];
  for (const entry of index) {
    if (!Array.isArray(entry?.scope)) continue;
    let relevance = 0;
    for (const scopePattern of entry.scope) {
      const scopeDir = normalizeScopeDir(scopePattern);
      if (queryDirs.some((queryDir) => scopeDirsOverlap(scopeDir, queryDir))) relevance += 3;
    }
    if (relevance > 0) scored.push({ entry, relevance });
  }

  // Unparseable timestamps sort as oldest instead of producing a NaN
  // comparator result.
  const timeOf = (entry) => Date.parse(entry.timestamp) || 0;
  scored.sort((a, b) => b.relevance - a.relevance || timeOf(b.entry) - timeOf(a.entry));

  // The reads are independent, so run them together; Promise.all keeps order.
  const loaded = await Promise.all(
    scored.slice(0, limit).map(async ({ entry }) => {
      try {
        const raw = await fs.readFile(scorecardPath(cwd, entry.id), "utf8");
        return JSON.parse(raw);
      } catch {
        // Best effort: the index can reference a file that is gone or corrupt.
        return null;
      }
    })
  );
  return loaded.filter((card) => card !== null);
}
237
+
238
/**
 * Load the first `limit` scorecards listed in the index.
 *
 * Entries whose scorecard file cannot be read or parsed are skipped.
 *
 * @param {object} args
 * @param {string} args.cwd - Project root directory.
 * @param {number} [args.limit=10] - Maximum number of index entries to load.
 * @returns {Promise<object[]>} Scorecards in index order.
 */
export async function loadRecentScorecards({ cwd, limit = 10 }) {
  // Wrap the result in an array so a failed read (empty array) stays distinct
  // from a file whose parsed content is itself falsy.
  const tryLoad = async (entry) => {
    try {
      const text = await fs.readFile(scorecardPath(cwd, entry.id), "utf8");
      return [JSON.parse(text)];
    } catch {
      return [];
    }
  };

  const index = await readIndex(cwd);
  const recent = index.slice(0, limit);

  const loaded = [];
  for (const entry of recent) {
    loaded.push(...(await tryLoad(entry)));
  }
  return loaded;
}
249
+
250
+ // --- Layer 2: Context injection ---
251
+
252
/**
 * Render scored build history as a short text block for prompt injection.
 *
 * Output, in order:
 *   1. a header line (always present when there is at least one scorecard);
 *   2. one line per approach: average composite, sentinel verdicts, user actions;
 *   3. up to three of the most frequent quality findings;
 *   4. a calibration warning when at least three scorecards carry Brier scores
 *      and their mean exceeds 0.3.
 *
 * Every line after the header is added only if it fits in `maxChars`,
 * counting the newline that joins it to the previous line.
 *
 * @param {object[]|null|undefined} scorecards - Scorecards to summarize.
 * @param {number} [maxChars=MAX_CALIBRATION_CHARS] - Character budget for the
 *   whole block.
 * @returns {string} The block, or "" when there are no scorecards.
 */
export function buildCalibrationBlock(scorecards, maxChars = MAX_CALIBRATION_CHARS) {
  if (!scorecards || scorecards.length === 0) return "";

  const lines = ["Build history for this scope:"];
  let used = lines[0].length;

  // Append a line only when it fits; +1 accounts for the joining newline.
  const pushIfFits = (line) => {
    const cost = line.length + 1;
    if (used + cost > maxChars) return false;
    lines.push(line);
    used += cost;
    return true;
  };

  const mean = (values) => values.reduce((sum, v) => sum + v, 0) / values.length;

  // Group by approach. A Map keeps arbitrary approach strings (for example
  // "__proto__") from colliding with object prototype keys.
  const byApproach = new Map();
  for (const sc of scorecards) {
    const key = sc.approach || "unknown";
    const group = byApproach.get(key);
    if (group) group.push(sc);
    else byApproach.set(key, [sc]);
  }

  for (const [approach, cards] of byApproach) {
    // Average only the cards that have a composite; counting a missing score
    // as 0 would drag the average down.
    const composites = cards
      .map((c) => c.scores?.composite)
      .filter((value) => Number.isFinite(value));
    const avgLabel = composites.length > 0 ? mean(composites).toFixed(2) : "n/a";
    const verdicts = cards.map((c) => c.outcomes?.sentinel?.verdict).filter(Boolean);
    const actions = cards.map((c) => c.outcomes?.userAction).filter(Boolean);
    const line = `- ${approach}: avg score ${avgLabel} (${verdicts.join(", ")}) → user: ${actions.join(", ")}`;
    if (!pushIfFits(line)) break;
  }

  // Aggregate blind spots: the quality findings that recur most often.
  const findingCounts = new Map();
  for (const sc of scorecards) {
    for (const finding of sc.outcomes?.quality?.findings || []) {
      findingCounts.set(finding, (findingCounts.get(finding) || 0) + 1);
    }
  }
  const blindSpots = [...findingCounts.entries()]
    .sort((a, b) => b[1] - a[1])
    .slice(0, 3)
    .map(([finding, count]) => `${finding} (${count}x)`);
  if (blindSpots.length > 0) {
    pushIfFits(`Quality blind spots: ${blindSpots.join(", ")}`);
  }

  // Brier calibration note (Layer 3): needs at least three scored builds.
  const brierMeans = scorecards
    .map((sc) => (sc.brierScores ? Object.values(sc.brierScores) : []))
    .map((values) => values.filter((value) => Number.isFinite(value)))
    .filter((values) => values.length > 0)
    .map(mean);
  if (brierMeans.length >= 3) {
    const avgBrier = mean(brierMeans);
    if (avgBrier > 0.3) {
      pushIfFits(`Calibration warning: avg Brier ${avgBrier.toFixed(2)} — lower your confidence estimates.`);
    }
  }

  return lines.join("\n");
}
309
+
310
+ // --- Summary for display ---
311
+
312
/**
 * Format a scorecard's scores as a single display line.
 *
 * Each score that is not `null` is rendered as `label:NN%` (whole percent),
 * in the order verify, sentinel, quality, user, composite, separated by
 * single spaces.
 *
 * @param {{scores: {verificationScore: number|null, sentinelScore: number|null,
 *   qualityScore: number|null, userScore: number|null, composite: number|null}}} scorecard
 * @returns {string} Summary line; "" when every score is null.
 */
export function formatScorecardSummary(scorecard) {
  const scores = scorecard.scores;
  const fields = [
    ["verify", scores.verificationScore],
    ["sentinel", scores.sentinelScore],
    ["quality", scores.qualityScore],
    ["user", scores.userScore],
    ["composite", scores.composite],
  ];

  return fields
    .filter(([, value]) => value !== null)
    .map(([label, value]) => `${label}:${(value * 100).toFixed(0)}%`)
    .join(" ");
}
package/src/workflow.js CHANGED
@@ -16,6 +16,7 @@ import {
16
16
  } from "./frontend.js";
17
17
  import { runPlannerPass, formatPlanForExecutor, formatPlanForDisplay } from "./planner.js";
18
18
  import { runVerificationPass, formatVerifierResults, hasFailures } from "./verifier.js";
19
+ import { computeScorecard, saveScorecard, findRelevantScorecards, buildCalibrationBlock } from "./scorecard.js";
19
20
 
20
21
  export async function runBuildWorkflow({
21
22
  agent,
@@ -27,6 +28,18 @@ export async function runBuildWorkflow({
27
28
  if (!task) throw new Error("no active task");
28
29
  if (!promptText) throw new Error("build requires a prompt");
29
30
 
31
+ // Layer 2: Inject calibration from scored memory before planning
32
+ let calibrationBlock = "";
33
+ try {
34
+ const contractPaths = task.activeContract?.paths || [];
35
+ if (contractPaths.length > 0) {
36
+ const relevantCards = await findRelevantScorecards({ cwd: context.cwd, filePatterns: contractPaths, limit: 5 });
37
+ if (relevantCards.length > 0) {
38
+ calibrationBlock = buildCalibrationBlock(relevantCards);
39
+ }
40
+ }
41
+ } catch {}
42
+
30
43
  // Planner/Executor split: if plannerModel is configured, run planner first
31
44
  const plannerModel = context.runtime?.plannerModel;
32
45
  let planBlock = "";
@@ -68,6 +81,7 @@ export async function runBuildWorkflow({
68
81
  Object.assign(executionCtx, frontendCtx);
69
82
  }
70
83
  if (planBlock) executionCtx.plan = planBlock;
84
+ if (calibrationBlock) executionCtx.calibration = calibrationBlock;
71
85
  agent.setExecutionContext(executionCtx);
72
86
 
73
87
  // Pre-seed contract if task has one
@@ -311,6 +325,20 @@ export async function runBuildWorkflow({
311
325
  context.runtime.lastImpact = impact || null;
312
326
  }
313
327
 
328
+ // Layer 1: Compute and save scorecard (passive scoring)
329
+ let scorecard = null;
330
+ if (finalReceipt?.mutated) {
331
+ try {
332
+ scorecard = computeScorecard({
333
+ task,
334
+ receipt: finalReceipt,
335
+ qualityFindings: null, // quality findings come from the caller if available
336
+ userAction: null // populated later when user acts
337
+ });
338
+ await saveScorecard({ cwd: context.cwd, scorecard });
339
+ } catch {}
340
+ }
341
+
314
342
  return {
315
343
  response,
316
344
  receipt: finalReceipt,
@@ -320,7 +348,8 @@ export async function runBuildWorkflow({
320
348
  screenshotReview,
321
349
  impactSummary: impact ? summarizeImpactMap(impact) : null,
322
350
  verifierResults,
323
- verifierSummary: verifierResults ? formatVerifierResults(verifierResults) : null
351
+ verifierSummary: verifierResults ? formatVerifierResults(verifierResults) : null,
352
+ scorecard
324
353
  };
325
354
  }
326
355