@fredericboyer/dev-team 0.8.1 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/dist/create-agent.js +20 -6
  2. package/dist/create-agent.js.map +1 -1
  3. package/dist/init.d.ts +8 -1
  4. package/dist/init.js +71 -5
  5. package/dist/init.js.map +1 -1
  6. package/dist/status.js +12 -6
  7. package/dist/status.js.map +1 -1
  8. package/dist/update.d.ts +6 -0
  9. package/dist/update.js +107 -0
  10. package/dist/update.js.map +1 -1
  11. package/package.json +2 -2
  12. package/templates/CLAUDE.md +25 -11
  13. package/templates/agent-memory/dev-team-beck/MEMORY.md +20 -6
  14. package/templates/agent-memory/dev-team-borges/MEMORY.md +20 -6
  15. package/templates/agent-memory/dev-team-brooks/MEMORY.md +20 -6
  16. package/templates/agent-memory/dev-team-conway/MEMORY.md +20 -6
  17. package/templates/agent-memory/dev-team-deming/MEMORY.md +21 -7
  18. package/templates/agent-memory/dev-team-drucker/MEMORY.md +20 -6
  19. package/templates/agent-memory/dev-team-hamilton/MEMORY.md +20 -6
  20. package/templates/agent-memory/dev-team-knuth/MEMORY.md +20 -6
  21. package/templates/agent-memory/dev-team-mori/MEMORY.md +20 -6
  22. package/templates/agent-memory/dev-team-szabo/MEMORY.md +20 -6
  23. package/templates/agent-memory/dev-team-tufte/MEMORY.md +20 -6
  24. package/templates/agent-memory/dev-team-voss/MEMORY.md +20 -6
  25. package/templates/agents/dev-team-beck.md +3 -0
  26. package/templates/agents/dev-team-borges.md +119 -11
  27. package/templates/agents/dev-team-brooks.md +10 -0
  28. package/templates/agents/dev-team-conway.md +3 -0
  29. package/templates/agents/dev-team-deming.md +3 -0
  30. package/templates/agents/dev-team-drucker.md +114 -2
  31. package/templates/agents/dev-team-hamilton.md +3 -0
  32. package/templates/agents/dev-team-knuth.md +10 -0
  33. package/templates/agents/dev-team-mori.md +3 -0
  34. package/templates/agents/dev-team-szabo.md +10 -0
  35. package/templates/agents/dev-team-tufte.md +3 -0
  36. package/templates/agents/dev-team-voss.md +3 -0
  37. package/templates/dev-team-learnings.md +3 -1
  38. package/templates/dev-team-metrics.md +18 -0
  39. package/templates/hooks/dev-team-post-change-review.js +71 -0
  40. package/templates/skills/dev-team-assess/SKILL.md +20 -0
  41. package/templates/skills/dev-team-audit/SKILL.md +1 -1
  42. package/templates/skills/dev-team-review/SKILL.md +36 -3
  43. package/templates/skills/dev-team-task/SKILL.md +30 -10
  44. package/templates/{skills → workflow-skills}/dev-team-security-status/SKILL.md +1 -1
  45. package/templates/{skills → workflow-skills}/dev-team-merge/SKILL.md +0 -0
@@ -233,8 +233,79 @@ if (flags.length === 0) {
233
233
  process.exit(0);
234
234
  }
235
235
 
236
+ // ─── Complexity-based triage ─────────────────────────────────────────────────
237
+ // Score the change to determine review depth: LIGHT, STANDARD, or DEEP.
238
+ // Uses available tool_input data (old_string/new_string for Edit, content for Write).
239
+
240
+ function scoreComplexity(toolInput, filePath) {
241
+ let score = 0;
242
+
243
+ // Lines changed
244
+ const oldStr = toolInput.old_string || "";
245
+ const newStr = toolInput.new_string || toolInput.content || "";
246
+ const oldLines = oldStr ? oldStr.split("\n").length : 0;
247
+ const newLines = newStr ? newStr.split("\n").length : 0;
248
+ const linesChanged = Math.abs(newLines - oldLines) + Math.min(oldLines, newLines);
249
+ score += Math.min(linesChanged, 50); // Cap at 50 to avoid single large file dominating
250
+
251
+ // Complexity indicators in the new content
252
+ const complexityPatterns = [
253
+ /\bfunction\b/g, // new functions
254
+ /\bclass\b/g, // new classes
255
+ /\bif\b.*\belse\b/g, // control flow
256
+ /\bcatch\b/g, // error handling
257
+ /\bthrow\b/g, // error throwing
258
+ /\basync\b/g, // async operations
259
+ /\bawait\b/g, // async operations
260
+ /\bexport\b/g, // API surface changes
261
+ ];
262
+
263
+ for (const pattern of complexityPatterns) {
264
+ const matches = newStr.match(pattern);
265
+ if (matches) score += matches.length * 2;
266
+ }
267
+
268
+ // Security-sensitive files get a boost
269
+ if (SECURITY_PATTERNS.some((p) => p.test(filePath))) {
270
+ score += 20;
271
+ }
272
+
273
+ return score;
274
+ }
275
+
276
+ // Read configurable thresholds from config.json, or use defaults
277
+ let lightThreshold = 10;
278
+ let deepThreshold = 40;
279
+ try {
280
+ const fs = require("fs");
281
+ const configPath = path.join(process.cwd(), ".dev-team", "config.json");
282
+ const config = JSON.parse(fs.readFileSync(configPath, "utf-8"));
283
+ if (config.reviewThresholds) {
284
+ lightThreshold = config.reviewThresholds.light || lightThreshold;
285
+ deepThreshold = config.reviewThresholds.deep || deepThreshold;
286
+ }
287
+ } catch {
288
+ // Use defaults
289
+ }
290
+
291
+ const complexityScore = scoreComplexity(input.tool_input || {}, fullPath);
292
+ let reviewDepth = "STANDARD";
293
+ if (complexityScore < lightThreshold) {
294
+ reviewDepth = "LIGHT";
295
+ } else if (complexityScore >= deepThreshold) {
296
+ reviewDepth = "DEEP";
297
+ }
298
+
236
299
  // Output as a DIRECTIVE, not a suggestion. CLAUDE.md instructs the LLM to act on this.
237
300
  console.log(`[dev-team] ACTION REQUIRED — spawn these agents as background reviewers:`);
301
+ console.log(`[dev-team] Review depth: ${reviewDepth} (complexity score: ${complexityScore})`);
302
+ if (reviewDepth === "LIGHT") {
303
+ console.log(`[dev-team] LIGHT review: findings are advisory only — do not classify as [DEFECT].`);
304
+ } else if (reviewDepth === "DEEP") {
305
+ console.log(
306
+ `[dev-team] DEEP review: high complexity — request thorough analysis from all reviewers.`,
307
+ );
308
+ }
238
309
  for (const flag of flags) {
239
310
  console.log(` → ${flag}`);
240
311
  }
@@ -22,6 +22,7 @@ This skill audits **only update-safe files** — files that survive `dev-team up
22
22
  - All `.dev-team/agent-memory/*/MEMORY.md` files (use Glob to discover them)
23
23
  - The project's `CLAUDE.md` (root of repo)
24
24
  - `.dev-team/config.json` (to know which agents are installed)
25
+ - `.dev-team/metrics.md` (if it exists — calibration metrics log)
25
26
 
26
27
  2. If `$ARGUMENTS` specifies a focus area (e.g., "learnings", "memory", "claude.md"), scope the audit to that area only. Otherwise, audit all areas.
27
28
 
@@ -91,6 +92,24 @@ Check the project's `CLAUDE.md` for:
91
92
  ### Learnings promotion
92
93
  - Mature learnings that have been stable for multiple sessions and should be promoted to `CLAUDE.md` instructions
93
94
 
95
+ ## Phase 4: Calibration metrics audit (`.dev-team/metrics.md`)
96
+
97
+ If `.dev-team/metrics.md` exists and contains entries, analyze:
98
+
99
+ ### Acceptance rates per agent
100
+ - Calculate rolling acceptance rate (last 10 entries) for each reviewer agent
101
+ - Flag agents with acceptance rate below 50% — they may be generating more noise than signal
102
+ - Identify trend direction: improving, stable, or degrading
103
+
104
+ ### Signal quality
105
+ - Are DEFECT findings being overruled frequently? This suggests over-flagging
106
+ - Are SUGGESTION findings dominating? This suggests agents are not calibrated to the project's conventions
107
+ - Are review rounds consistently high (3+)? This suggests systemic quality issues or miscalibrated reviewers
108
+
109
+ ### Delegation patterns
110
+ - Which implementing agents are used most frequently?
111
+ - Are reviewers consistently finding issues in specific domains? This may indicate an implementing agent needs calibration
112
+
94
113
  ## Report
95
114
 
96
115
  Produce a structured health report:
@@ -145,6 +164,7 @@ Provide a simple health score:
145
164
  | Learnings | healthy / needs attention / unhealthy | count by severity |
146
165
  | Agent Memory | healthy / needs attention / unhealthy | count by severity |
147
166
  | CLAUDE.md | healthy / needs attention / unhealthy | count by severity |
167
+ | Metrics | healthy / needs attention / unhealthy | count by severity |
148
168
  | **Overall** | **status** | **total** |
149
169
 
150
170
  Thresholds:
@@ -86,7 +86,7 @@ Numbered list of concrete actions, ordered by priority. Each action should refer
86
86
 
87
87
  ### Security preamble
88
88
 
89
- Before starting the audit, check for open security alerts: run `/dev-team:security-status` if available, or check `gh api repos/{owner}/{repo}/code-scanning/alerts?state=open` and `gh api repos/{owner}/{repo}/dependabot/alerts?state=open`. Include these in the audit scope.
89
+ Before starting the audit, check for open security alerts: run `/dev-team:security-status` if available, or use the project's security monitoring tools. Include these in the audit scope.
90
90
 
91
91
  ### Completion
92
92
 
@@ -27,6 +27,12 @@ Run a multi-agent parallel review of: $ARGUMENTS
27
27
 
28
28
  3. Always include @dev-team-szabo and @dev-team-knuth — they review all code changes.
29
29
 
30
+ ## Pre-review validation
31
+
32
+ Before spawning reviewers, verify the changes are reviewable:
33
+ 1. **Non-empty diff**: The diff contains actual changes to review. If empty, report "nothing to review" and stop.
34
+ 2. **Tests pass**: If the project has a test command, confirm tests pass. Flag test failures in the review report header.
35
+
30
36
  ## Execution
31
37
 
32
38
  1. Spawn each selected agent as a **parallel background subagent** using the Agent tool with `subagent_type: "general-purpose"`.
@@ -39,6 +45,18 @@ Run a multi-agent parallel review of: $ARGUMENTS
39
45
 
40
46
  3. Wait for all agents to complete.
41
47
 
48
+ ## Filter findings (judge pass)
49
+
50
+ Before producing the report, filter raw findings to maximize signal quality:
51
+ 1. **Remove contradictions**: Drop findings that contradict existing ADRs (`docs/adr/`), learnings (`.dev-team/learnings.md`), or agent memory (`.dev-team/agent-memory/*/MEMORY.md`)
52
+ 2. **Deduplicate**: When multiple agents flag the same issue, keep the most specific finding
53
+ 3. **Consolidate suggestions**: Group `[SUGGESTION]`-level items into a single summary block
54
+ 4. **Suppress generated file findings**: Skip findings on generated, vendored, or build artifacts
55
+ 5. **Validate DEFECTs**: Each `[DEFECT]` must include a concrete scenario — downgrade to `[RISK]` if not
56
+ 6. **Accept silence**: "No substantive findings" from a reviewer is a valid positive signal — do not request re-review
57
+
58
+ Log filtered findings in a "Filtered" section for calibration tracking.
59
+
42
60
  ## Report
43
61
 
44
62
  Produce a unified review summary:
@@ -60,6 +78,14 @@ Group by severity:
60
78
  - **[QUESTION]** — decisions needing justification
61
79
  - **[SUGGESTION]** — specific improvements
62
80
 
81
+ ### Filtered
82
+
83
+ List findings removed during the judge pass, with the reason for filtering:
84
+ ```
85
+ **Filtered** @agent-name — reason (contradicts ADR-NNN / duplicate of above / no concrete scenario / generated file)
86
+ Original finding summary.
87
+ ```
88
+
63
89
  ### Verdict
64
90
 
65
91
  - **Approve** — No `[DEFECT]` findings. Advisory items noted.
@@ -69,12 +95,19 @@ State the verdict clearly. List what must be fixed for approval if requesting ch
69
95
 
70
96
  ### Security preamble
71
97
 
72
- Before starting the review, check for open security alerts: run `/dev-team:security-status` if available, or check `gh api repos/{owner}/{repo}/code-scanning/alerts?state=open` and `gh api repos/{owner}/{repo}/dependabot/alerts?state=open`. Flag any critical findings in the review report.
98
+ Before starting the review, check for open security alerts: run `/dev-team:security-status` if available, or use the project's security monitoring tools. Flag any critical findings in the review report.
73
99
 
74
100
  ### Completion
75
101
 
76
102
  After the review report is delivered:
77
- 1. You MUST spawn **@dev-team-borges** (Librarian) as the final step to review memory freshness and capture any learnings from the review findings. Do NOT skip this.
103
+ 1. You MUST spawn **@dev-team-borges** (Librarian) as the final step. Pass Borges the **finding outcome log**: every finding with its classification, source agent, and outcome (accepted/overruled/ignored), including reasoning for overrules. Borges will:
104
+ - **Extract structured memory entries** from the review findings (each classified finding becomes a memory entry for the reviewer who produced it)
105
+ - **Reinforce accepted patterns** and **record overruled findings** for reviewer calibration
106
+ - **Generate calibration rules** when 3+ findings on the same tag are overruled
107
+ - **Record metrics** to `.dev-team/metrics.md`
108
+ - Write entries to each participating agent's MEMORY.md using the structured format
109
+ - Update shared learnings in `.dev-team/learnings.md`
110
+ - Check cross-agent coherence
78
111
  2. If Borges was not spawned, the review is INCOMPLETE.
79
- 3. **Borges memory gate**: If Borges reports that any participating agent's MEMORY.md is empty or contains only boilerplate, this is a **[DEFECT]** that blocks review completion. The agent must write substantive learnings before the review can be marked done.
112
+ 3. **Memory formation gate**: After Borges runs, verify that each participating reviewer's MEMORY.md contains at least one new structured entry from this review.
80
113
  4. Include Borges's recommendations in the final report.
@@ -38,11 +38,21 @@ Before the first iteration, the implementing agent should research current best
38
38
  Track iterations in conversation context (no state files). For each iteration:
39
39
 
40
40
  1. The implementing agent works on the task.
41
- 2. After implementation, spawn review agents in parallel as background tasks.
42
- 3. Collect classified challenges from reviewers.
43
- 4. If any `[DEFECT]` challenges exist, address them in the next iteration.
44
- 5. If no `[DEFECT]` remains, output DONE to exit the loop.
45
- 6. If max iterations reached without convergence, report remaining defects and exit.
41
+ 2. **Validate implementation output** before spawning reviewers:
42
+ - Non-empty diff: `git diff` shows actual changes
43
+ - Tests pass: test command executed with exit code 0
44
+ - Relevance: changed files relate to the stated issue
45
+ - Clean working tree: no uncommitted debris
46
+ - If validation fails, route back to implementer with specific failure reason. If it fails twice, escalate to human.
47
+ 3. After validation passes, spawn review agents in parallel as background tasks.
48
+ 4. Collect classified challenges from reviewers.
49
+ 5. If any `[DEFECT]` challenges exist, **compact the context** before the next iteration:
50
+ - Produce a structured summary: DEFECTs found (agent, file, status), files changed, outstanding items
51
+ - New reviewers in subsequent waves receive: current diff + compact summary + agent definition
52
+ - They do NOT receive raw conversation history from prior waves
53
+ 6. Address defects in the next iteration.
54
+ 7. If no `[DEFECT]` remains, output DONE to exit the loop.
55
+ 8. If max iterations reached without convergence, report remaining defects and exit.
46
56
 
47
57
  The convergence check happens in conversation context: count iterations, check for `[DEFECT]` findings, and decide whether to continue or exit.
48
58
 
@@ -50,6 +60,8 @@ The convergence check happens in conversation context: count iterations, check f
50
60
 
51
61
  When multiple issues are being addressed in a single session, the task loop switches to parallel orchestration (see ADR-019). Drucker coordinates all phases in conversation context.
52
62
 
63
+ **Mode selection:** If agent teams are enabled (check `.dev-team/config.json` for `"agentTeams": true`), use team lead mode for batches of 3+ issues. Otherwise, use standard worktree subagent mode. For single issues, always use standard mode.
64
+
53
65
  ### Phase 0: Brooks pre-assessment (batch)
54
66
  Spawn @dev-team-brooks once with all issues. Brooks identifies:
55
67
  - **File independence**: which issues touch overlapping files (conflict groups that must run sequentially)
@@ -65,7 +77,7 @@ Drucker spawns one implementing agent per independent issue, each on its own bra
65
77
  Reviews do **not** start until **all** implementation agents have completed (Agent tool provides completion notifications as the sync barrier). Once all are done, spawn review agents (Szabo + Knuth, plus conditional reviewers) in parallel across all branches simultaneously. Each reviewer receives the diff for one specific branch and produces classified findings scoped to that branch.
66
78
 
67
79
  ### Phase 3: Defect routing
68
- Collect all findings. Route `[DEFECT]` items back to the original implementing agent for each branch. Agents fix defects on their own branch. After fixes, another review wave runs. Continue until no `[DEFECT]` findings remain or the per-branch iteration limit is reached.
80
+ Collect all findings. Route `[DEFECT]` items back to the original implementing agent for each branch. Agents fix defects on their own branch. Before spawning the next review wave, **compact context**: produce a structured summary of prior findings, their status (fixed/disputed/pending), and files changed. New reviewers receive current diff + compact summary only — not full conversation history from prior waves. Continue until no `[DEFECT]` findings remain or the per-branch iteration limit is reached.
69
81
 
70
82
  ### Phase 4: Borges completion
71
83
  Borges runs **once** across all branches after the final review wave clears. This ensures cross-branch coherence: memory files are consistent, learnings are not duplicated, and system improvement recommendations consider the full batch.
@@ -77,16 +89,24 @@ Parallel mode is complete when:
77
89
 
78
90
  ## Security preamble
79
91
 
80
- Before starting work, check for open security alerts: run `/dev-team:security-status` if available, or check `gh api repos/{owner}/{repo}/code-scanning/alerts?state=open` and `gh api repos/{owner}/{repo}/dependabot/alerts?state=open`. Flag any critical findings before proceeding.
92
+ Before starting work, check for open security alerts: run `/dev-team:security-status` if available, or use the project's security monitoring tools. Flag any critical findings before proceeding.
81
93
 
82
94
  ## Completion
83
95
 
84
96
  When the loop exits:
85
97
  1. **Deliver the work**: If changes are on a feature branch, create the PR (body must include `Closes #<issue>`). Ensure the PR is ready to merge: CI green, reviews passed, branch up to date. Then follow the project's merge workflow — use `/dev-team:merge` if the project has it configured, otherwise report readiness. If merge fails (CI failures, merge conflicts, branch protection), report the blocker to the human rather than leaving work unattended.
86
98
  2. **Clean up worktree**: If the work was done in a worktree, clean it up after the branch is pushed and the PR is created. Do not wait for merge to clean the worktree.
87
- 3. You MUST spawn **@dev-team-borges** (Librarian) as the final step to review memory freshness, cross-agent coherence, and system improvement opportunities. Do NOT skip this.
99
+ 3. You MUST spawn **@dev-team-borges** (Librarian) as the final step. Pass Borges the **finding outcome log**: every finding with its classification, source agent, and outcome (accepted/overruled/ignored), including the human's reasoning for overrules. Borges will:
100
+ - **Extract structured memory entries** from review findings and implementation decisions
101
+ - **Reinforce accepted patterns** in the reviewer's memory (calibration feedback)
102
+ - **Record overruled findings** with context so reviewers generate fewer false positives
103
+ - **Generate calibration rules** when 3+ findings on the same tag are overruled
104
+ - **Record metrics** to `.dev-team/metrics.md` (acceptance rates, rounds to convergence)
105
+ - Write entries to each participating agent's MEMORY.md using the structured format
106
+ - Update shared learnings in `.dev-team/learnings.md`
107
+ - Check cross-agent coherence
108
+ - Report system improvement opportunities
88
109
  4. If Borges was not spawned, the task is INCOMPLETE.
89
- 5. **Borges memory gate**: If Borges reports that any implementing agent's MEMORY.md is empty or contains only boilerplate after a task, this is a **[DEFECT]** that blocks task completion. The implementing agent must write substantive learnings before the task can be marked done. Empty agent memory after a task means the enforcement pipeline failed.
110
+ 5. **Memory formation gate**: After Borges runs, verify that each participating agent's MEMORY.md contains at least one new structured entry from this task. Empty agent memory after a completed task is a system failure — Borges prevents this by automating extraction.
90
111
  6. Summarize what was accomplished across all iterations.
91
112
  7. Report any remaining `[RISK]` or `[SUGGESTION]` items, including Borges's recommendations.
92
- 8. Write key learnings to agent MEMORY.md files.
@@ -1,5 +1,5 @@
1
1
  ---
2
- name: security-status
2
+ name: dev-team:security-status
3
3
  description: Check GitHub security signals — code scanning, Dependabot, secret scanning, and compliance status. Use at session start and before releases.
4
4
  user_invocable: true
5
5
  ---