claude-dev-env 1.25.2 → 1.26.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106) hide show
  1. package/CLAUDE.md +6 -0
  2. package/agents/clean-coder.md +1 -1
  3. package/docs/CODE_RULES.md +3 -1
  4. package/hooks/HOOK_SPECS_PROMPT_WORKFLOW.md +54 -0
  5. package/hooks/blocking/{code-rules-enforcer.py → code_rules_enforcer.py} +154 -5
  6. package/hooks/blocking/test_code_rules_enforcer.py +61 -0
  7. package/hooks/blocking/test_code_rules_enforcer_any_type_ignore.py +2 -2
  8. package/hooks/blocking/test_code_rules_enforcer_banned_identifier.py +2 -2
  9. package/hooks/blocking/test_code_rules_enforcer_conftest_anchor.py +1 -1
  10. package/hooks/blocking/test_code_rules_enforcer_dot_test_pattern.py +2 -2
  11. package/hooks/blocking/test_code_rules_enforcer_file_global_constants.py +183 -0
  12. package/hooks/blocking/test_code_rules_enforcer_fstring_scan.py +4 -4
  13. package/hooks/blocking/test_code_rules_enforcer_logger_fstring.py +1 -1
  14. package/hooks/blocking/test_code_rules_enforcer_magic_allowlist.py +1 -1
  15. package/hooks/blocking/test_code_rules_enforcer_magic_string_masking.py +104 -0
  16. package/hooks/blocking/test_code_rules_enforcer_naming_pattern.py +2 -2
  17. package/hooks/blocking/test_code_rules_enforcer_type_checking_scope.py +2 -2
  18. package/hooks/blocking/test_content_search_to_zoekt_redirector_integration.py +1 -1
  19. package/hooks/blocking/test_destructive_command_blocker.py +1 -1
  20. package/hooks/blocking/test_gh_body_arg_blocker.py +1 -1
  21. package/hooks/blocking/test_pr_description_enforcer.py +8 -8
  22. package/hooks/blocking/test_tdd_enforcer.py +1 -1
  23. package/hooks/github-action/pre-push-review.yml +27 -0
  24. package/hooks/hooks.json +28 -28
  25. package/hooks/lifecycle/{config-change-guard.py → config_change_guard.py} +26 -12
  26. package/hooks/lifecycle/test_config_change_guard.py +3 -3
  27. package/hooks/notification/{attention-needed-notify.py → attention_needed_notify.py} +7 -0
  28. package/hooks/notification/{claude-notification-handler.py → claude_notification_handler.py} +8 -0
  29. package/hooks/notification/notification_utils.py +56 -0
  30. package/hooks/notification/subagent_complete_notify.py +381 -0
  31. package/hooks/notification/test_attention_needed_notify.py +47 -0
  32. package/hooks/notification/test_claude_notification_handler.py +54 -0
  33. package/hooks/notification/test_notification_utils.py +45 -0
  34. package/hooks/notification/test_subagent_complete_notify.py +79 -0
  35. package/hooks/validators/README.md +5 -1
  36. package/hooks/validators/abbreviation_checks.py +1 -1
  37. package/hooks/validators/code_quality_checks.py +1 -1
  38. package/hooks/validators/config.py +5 -0
  39. package/hooks/validators/conftest.py +10 -0
  40. package/hooks/validators/exempt_paths.py +1 -1
  41. package/hooks/validators/git_checks.py +80 -0
  42. package/hooks/validators/magic_value_checks.py +2 -2
  43. package/hooks/validators/pr_reference_checks.py +1 -1
  44. package/hooks/validators/python_antipattern_checks.py +1 -1
  45. package/hooks/validators/run_all_validators.py +53 -105
  46. package/hooks/validators/security_checks.py +1 -1
  47. package/hooks/validators/test_abbreviation_checks.py +2 -2
  48. package/hooks/validators/test_code_quality_checks.py +2 -2
  49. package/hooks/validators/test_file_structure_checks.py +1 -1
  50. package/hooks/validators/test_git_checks.py +79 -13
  51. package/hooks/validators/test_health_check.py +1 -1
  52. package/hooks/validators/test_magic_value_checks.py +2 -2
  53. package/hooks/validators/test_mypy_integration.py +1 -1
  54. package/hooks/validators/test_output_formatter.py +3 -1
  55. package/hooks/validators/test_pr_reference_checks.py +2 -2
  56. package/hooks/validators/test_python_antipattern_checks.py +2 -2
  57. package/hooks/validators/test_python_style_checks.py +2 -4
  58. package/hooks/validators/test_react_checks.py +1 -1
  59. package/hooks/validators/test_ruff_integration.py +1 -1
  60. package/hooks/validators/test_run_all_validators.py +75 -43
  61. package/hooks/validators/test_run_all_validators_integration.py +14 -37
  62. package/hooks/validators/test_security_checks.py +2 -2
  63. package/hooks/validators/test_test_safety_checks.py +1 -1
  64. package/hooks/validators/test_todo_checks.py +2 -2
  65. package/hooks/validators/test_type_safety_checks.py +2 -2
  66. package/hooks/validators/test_useless_test_checks.py +2 -2
  67. package/hooks/validators/test_validator_base.py +1 -1
  68. package/hooks/validators/test_verify_paths.py +2 -4
  69. package/hooks/validators/todo_checks.py +1 -1
  70. package/hooks/validators/type_safety_checks.py +1 -1
  71. package/hooks/validators/useless_test_checks.py +1 -1
  72. package/package.json +1 -1
  73. package/rules/file-global-constants.md +71 -0
  74. package/rules/gh-body-file.md +1 -1
  75. package/rules/prompt-workflow-context-controls.md +48 -0
  76. package/scripts/sync_to_cursor/rules.py +2 -2
  77. package/scripts/tests/test_sync_to_cursor.py +2 -2
  78. package/skills/bugteam/CONSTRAINTS.md +37 -0
  79. package/skills/bugteam/EXAMPLES.md +64 -0
  80. package/skills/bugteam/PROMPTS.md +175 -0
  81. package/skills/bugteam/SKILL.md +204 -295
  82. package/skills/bugteam/SKILL_EVALS.md +346 -0
  83. package/skills/bugteam/scripts/README.md +37 -0
  84. package/skills/bugteam/scripts/bugteam_code_rules_gate.py +334 -0
  85. package/skills/bugteam/scripts/bugteam_preflight.py +135 -0
  86. package/skills/rule-audit/SKILL.md +4 -4
  87. /package/hooks/advisory/{migration-safety-advisor.py → migration_safety_advisor.py} +0 -0
  88. /package/hooks/advisory/{refactor-guard.py → refactor_guard.py} +0 -0
  89. /package/hooks/blocking/{block-main-commit.py → block_main_commit.py} +0 -0
  90. /package/hooks/blocking/{content-search-to-zoekt-redirector.py → content_search_to_zoekt_redirector.py} +0 -0
  91. /package/hooks/blocking/{destructive-command-blocker.py → destructive_command_blocker.py} +0 -0
  92. /package/hooks/blocking/{gh-body-arg-blocker.py → gh_body_arg_blocker.py} +0 -0
  93. /package/hooks/blocking/{hedging-language-blocker.py → hedging_language_blocker.py} +0 -0
  94. /package/hooks/blocking/{pr-description-enforcer.py → pr_description_enforcer.py} +0 -0
  95. /package/hooks/blocking/{sensitive-file-protector.py → sensitive_file_protector.py} +0 -0
  96. /package/hooks/blocking/{tdd-enforcer.py → tdd_enforcer.py} +0 -0
  97. /package/hooks/blocking/{test-preflight-check.py → test_preflight_check.py} +0 -0
  98. /package/hooks/blocking/{write-existing-file-blocker.py → write_existing_file_blocker.py} +0 -0
  99. /package/hooks/git-hooks/{post-commit.py → post_commit.py} +0 -0
  100. /package/hooks/lifecycle/{session-end-cleanup.py → session_end_cleanup.py} +0 -0
  101. /package/hooks/{rewrite-plugin-paths.py → rewrite_plugin_paths.py} +0 -0
  102. /package/hooks/session/{plugin-data-dir-cleanup.py → plugin_data_dir_cleanup.py} +0 -0
  103. /package/hooks/validation/{hook-format-validator.py → hook_format_validator.py} +0 -0
  104. /package/hooks/workflow/{auto-formatter.py → auto_formatter.py} +0 -0
  105. /package/hooks/workflow/{investigation-tracker-reset.py → investigation_tracker_reset.py} +0 -0
  106. /package/scripts/{sync-to-cursor.py → sync_to_cursor.py} +0 -0
@@ -0,0 +1,346 @@
1
+ # Bugteam — Evaluation Suite
2
+
3
+ Evaluation-driven iteration set for the `bugteam` skill, following [Anthropic — Agent Skills best practices: evaluation and iteration](https://platform.claude.com/docs/en/agents-and-tools/agent-skills/best-practices#evaluation-and-iteration).
4
+
5
+ ## Methodology
6
+
7
+ Evals are split into two layers. Both layers run against the same trace but carry different failure semantics.
8
+
9
+ **Layer A — Ironclad invariants.** Order-and-presence rules that MUST hold on every run regardless of fixture, regardless of model choice, regardless of the exact number of loops taken. Every Layer A assertion is cited to a specific line in `SKILL.md` — if the assertion fails, either the run diverged from the skill or the skill text is ambiguous and needs patching.
10
+
11
+ **Layer B — Fixture-dependent expectations.** The concrete tool trace predicted for a specific fixture (fixed PR state, canned audit XML, canned fix XML). Layer B is prediction — reality may diverge in small ways (extra `Bash("git rev-parse HEAD")` checkpoints the lead inserts for sanity; retry loops on transient failures; consolidated cleanup calls) without indicating a skill defect. Layer B failures trigger reconciliation, not auto-failure.
12
+
13
+ **Process note.** This document was drafted before running a real trace. Layer B predictions are labeled *predicted*, not *observed*. On the first real run, every Layer B prediction is reconciled against the observed trace and the diffs written back here — that reconciliation is Cycle 0 of the iteration protocol below.
14
+
15
+ ## Ironclad invariants (Layer A, apply to every eval)
16
+
17
+ Each invariant cites the skill line it derives from.
18
+
19
+ | # | Invariant | Citation |
20
+ |---|---|---|
21
+ | I-1 | `Bash(grant_project_claude_permissions.py)` precedes every `TeamCreate`. | SKILL.md Step 0 |
22
+ | I-2 | `Bash(revoke_project_claude_permissions.py)` runs exactly once per invocation, after the last `TeamDelete`, on every exit path (converged, stuck, cap reached, error). | SKILL.md Step 5 |
23
+ | I-3 | Exactly one `TeamCreate` and exactly one `TeamDelete` per invocation. | SKILL.md Step 2, Step 4 |
24
+ | I-4 | Every `Agent(name=...)` spawn is followed (before `TeamDelete`) by a matching `SendMessage(to=..., message={type: "shutdown_request", ...})`. No orphaned teammates. | SKILL.md AUDIT shutdown, FIX shutdown, Step 4.1 |
25
+ | I-5 | `Agent` calls are fresh per loop — the same `name` is never reused across loops without an intervening shutdown. | SKILL.md Constraints: "Fresh teammate per loop" |
26
+ | I-6 | Both audit and fix `Agent` calls pass `model="sonnet"`. | SKILL.md Step 2 roles list, Constraints: "Sonnet for both teammates" |
27
+ | I-7 | `TeamDelete()` is called with no arguments. | TeamDelete schema: no required params, no properties |
28
+ | I-8 | Loop count ≤ 10 audits. 11th audit never fires. | SKILL.md: "10-loop hard cap" |
29
+ | I-9 | From loop 4 onward without convergence, the audit phase emits three parallel `Agent` calls in a single assistant message with names `bugfind-loop-<N>-a/b/c`. | SKILL.md AUDIT action: "Parallel auditors from loop 4 onward" |
30
+ | I-10 | Lead reads `.bugteam-loop-<N>.outcomes.xml` with the `Read` tool after each audit, before the next action. | SKILL.md: "the lead reads `.bugteam-loop-<N>.outcomes.xml` with the `Read` tool" |
31
+ | I-11 | On exit of any kind, ordering is: `TeamDelete` → temp-dir cleanup → Step 4.5 PR rewrite → revoke. | SKILL.md Step 4, Step 4.5, Step 5 ordering |
32
+ | I-12 | Lead never posts PR review comments, finding comments, or fix replies. The only lead-side PR mutation is the final `gh pr edit --body-file` in Step 4.5. | SKILL.md Constraints: "Teammates own audit/fix comment posting", "Lead owns the final PR description rewrite only" |
33
+ | I-13 | Only the orchestrator (lead session) invokes `TeamCreate`. Every teammate `Agent(...)` call passes `team_name=<lead_team_name>`. No teammate ever calls `TeamCreate`. When supplementary work arises mid-cycle (parallel auditors, adjacent-file audits, infrastructure fixes), the lead spawns additional teammates into the existing team rather than creating a second team. | SKILL.md Constraints: "Orchestrator-only TeamCreate"; runtime error: `Already leading team "<name>". A leader can only manage one team at a time.` |
34
+
35
+ Any eval failing one or more Layer A invariants fails the run.
36
+
37
+ ## Observation strategy
38
+
39
+ Evals run in a harness that intercepts the tool layer:
40
+
41
+ - A **mock tool layer** records each tool call with its arguments and returns synthetic responses matching the real tool's response shape. Nothing hits GitHub; no real teammates spawn.
42
+ - A **fixture repo** supplies deterministic git state and a mocked `gh` CLI that returns canned JSON for `pr view`, `pr diff`, and `api` calls.
43
+ - **Assertions** run against the recorded call list, not against real PR state.
44
+
45
+ The harness does not yet exist; this document defines its contract.
46
+
47
+ ---
48
+
49
+ ## Eval 1 — Refusal: agent teams not enabled
50
+
51
+ **Scenario.** `CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS` is unset in both `claude config` and `~/.claude/settings.json`.
52
+
53
+ **Trigger.** `/bugteam`
54
+
55
+ **Layer A invariants.** None fire downstream — this is a pre-cycle refusal.
56
+
57
+ **Layer B predicted trace.**
58
+ 1. `Bash("claude config get env.CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS")` → empty.
59
+ 2. `Read("~/.claude/settings.json")` → settings without the env var.
60
+ 3. No grant script, no `TeamCreate`, no `Agent`.
61
+
62
+ **Pass criteria.**
63
+ - Final assistant message contains the exact string `CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS=1 not set. /bugteam requires the agent teams feature.`.
64
+ - Zero `TeamCreate`, `Agent`, `SendMessage`, `TeamDelete` calls.
65
+ - Zero invocations of the grant or revoke scripts.
66
+
67
+ ---
68
+
69
+ ## Eval 2 — Refusal: missing PR, no upstream diff
70
+
71
+ **Scenario.** Current branch is `main` with no PR and no upstream difference.
72
+
73
+ **Layer B predicted trace.**
74
+ 1. `Bash("gh pr view --json ...")` → non-zero exit.
75
+ 2. `Bash("git merge-base HEAD origin/main")` → empty.
76
+ 3. No grant script, no `TeamCreate`.
77
+
78
+ **Pass criteria.** Assistant message matches `No PR or upstream diff. /bugteam needs a target.`. Zero downstream tool calls.
79
+
80
+ ---
81
+
82
+ ## Eval 3 — Refusal: uncommitted changes in working tree
83
+
84
+ **Scenario.** Clean PR exists but `git status --porcelain` shows unstaged changes.
85
+
86
+ **Pass criteria.** Assistant message matches `Uncommitted changes detected. Stash, commit, or revert before /bugteam.`. Zero downstream tool calls.
87
+
88
+ ---
89
+
90
+ ## Eval 4 — Refusal: required subagent missing
91
+
92
+ **Scenario.** `code-quality-agent` is present in the available-agents list; `clean-coder` is not.
93
+
94
+ **Pass criteria.** Assistant message contains `Required subagent type clean-coder not installed.`. Zero grant script call, zero `TeamCreate`.
95
+
96
+ ---
97
+
98
+ ## Eval 5 — Happy path: converges in 2 loops
99
+
100
+ **Scenario.** PR #42 contains three P1 bugs all addressable by the mock fix teammate. Loop 1 audit returns 3 findings; loop 1 fix commits cleanly; loop 2 audit returns zero findings.
101
+
102
+ **Layer A invariants.** I-1, I-2, I-3, I-4, I-5, I-6, I-7, I-10, I-11, I-12.
103
+
104
+ **Layer B predicted trace.**
105
+
106
+ | # | Tool call | Source |
107
+ |---|---|---|
108
+ | 1 | `Bash("python .../grant_project_claude_permissions.py")` | SKILL.md Step 0 |
109
+ | 2 | `Bash("gh pr view --json number,baseRefName,headRefName,url")` | SKILL.md Step 1 |
110
+ | 3 | `Bash("git rev-parse HEAD")` → captures `starting_sha` | SKILL.md:125 |
111
+ | 4 | `TeamCreate(team_name="bugteam-pr-42-<ts>", description=..., agent_type="team-lead")` | SKILL.md Step 2 |
112
+ | 5 | `Bash("mkdir -p <team_temp_dir>")` | SKILL.md AUDIT action |
113
+ | 6 | `Bash("gh pr diff 42 -R ... > <team_temp_dir>/loop-1.patch")` | SKILL.md AUDIT action |
114
+ | 7 | `Agent(subagent_type="code-quality-agent", name="bugfind", team_name=..., model="sonnet", description=..., prompt=<audit XML loop 1>)` | SKILL.md AUDIT action |
115
+ | 8 | `Read(".bugteam-loop-1.outcomes.xml")` | SKILL.md AUDIT action |
116
+ | 9 | `SendMessage(to="bugfind", message={type: "shutdown_request", reason: "audit loop 1 complete; outcome XML captured"})` | SKILL.md AUDIT shutdown |
117
+ | 10 | `Agent(subagent_type="clean-coder", name="bugfix", team_name=..., model="sonnet", description=..., prompt=<fix XML loop 1>)` | SKILL.md FIX action |
118
+ | 11 | `Read(".bugteam-loop-1.outcomes.xml")` — bugfix outcome XML overwrites same filename | SKILL.md FIX action |
119
+ | 12 | `Bash("git rev-parse HEAD")` → verify HEAD advanced | SKILL.md:499, 502 |
120
+ | 13 | `Bash("git fetch origin <branch> && git rev-parse origin/<branch>")` → verify push landed | SKILL.md:500 |
121
+ | 14 | `SendMessage(to="bugfix", message={type: "shutdown_request", reason: "fix loop 1 complete; commit <sha7> pushed"})` | SKILL.md FIX shutdown |
122
+ | 15 | `Bash("gh pr diff 42 -R ... > <team_temp_dir>/loop-2.patch")` | SKILL.md AUDIT action |
123
+ | 16 | `Agent(subagent_type="code-quality-agent", name="bugfind", ...)` (loop 2) | SKILL.md AUDIT action |
124
+ | 17 | `Read(".bugteam-loop-2.outcomes.xml")` — zero findings | SKILL.md AUDIT action |
125
+ | 18 | `SendMessage(to="bugfind", message={type: "shutdown_request", reason: "audit loop 2 complete; zero findings"})` | SKILL.md AUDIT shutdown |
126
+ | 19 | `TeamDelete()` | SKILL.md Step 4 |
127
+ | 20 | `Bash("python -c \"import shutil; shutil.rmtree(r'<team_temp_dir>', ignore_errors=True)\"")` | SKILL.md Step 4 |
128
+ | 21 | `Bash("gh pr diff 42 -R ... > .bugteam-final.diff")` | SKILL.md:564 |
129
+ | 22 | `Bash("gh pr view 42 -R ... --json body --jq .body > .bugteam-original-body.md")` | SKILL.md:565 |
130
+ | 23 | `Agent(subagent_type="pr-description-writer", description=..., prompt=<brief>)` | SKILL.md Step 4.5 |
131
+ | 24 | `Write(".bugteam-final-body.md", <returned body>)` | SKILL.md:571 |
132
+ | 25 | `Bash("gh pr edit 42 -R ... --body-file .bugteam-final-body.md")` | SKILL.md:572 |
133
+ | 26 | `Bash("rm .bugteam-final.diff .bugteam-original-body.md .bugteam-final-body.md .bugteam-loop-*.outcomes.xml")` | SKILL.md:573, 621 |
134
+ | 27 | `Bash("python .../revoke_project_claude_permissions.py")` | SKILL.md Step 5 |
135
+
136
+ **Pass criteria.**
137
+ - All Layer A invariants hold.
138
+ - Exactly 2 `Agent(name="bugfind"...)` calls, exactly 1 `Agent(name="bugfix"...)` call.
139
+ - Exactly 2 bugfind shutdown messages + 1 bugfix shutdown message.
140
+ - Final report contains `/bugteam exit: converged` and `Loops: 2`.
141
+
142
+ **Process check after first real run.** Compare the observed trace against steps 1–27. Common expected divergences that should not fail the eval:
143
+ - Extra `Bash("git rev-parse HEAD")` calls the lead inserts for bookkeeping.
144
+ - Split or consolidated `Bash` calls (for example, step 26 may split into two or three calls).
145
+ - Extra `Read` calls when the lead re-reads an outcome XML to quote specific findings.
146
+ - Reordered but still-Layer-A-compliant cleanup sequencing.
147
+
148
+ Patch this table to match observation and annotate each correction.
149
+
150
+ ---
151
+
152
+ ## Eval 6 — Stuck path: fix teammate produces no commit
153
+
154
+ **Scenario.** Loop 1 audit finds 2 P1 bugs; the mock fix teammate reports both as `could_not_address` (no commit created).
155
+
156
+ **Layer A invariants.** I-1, I-2, I-3, I-4, I-5, I-6, I-7, I-10, I-11, I-12. I-8 trivially holds.
157
+
158
+ **Layer B predicted trace.** Identical to Eval 5 steps 1–14 with this divergence:
159
+ - Step 11 bugfix outcome XML marks every finding `status="could_not_address"`.
160
+ - Step 12 `Bash("git rev-parse HEAD")` returns the pre-fix SHA unchanged.
161
+ - Skill sets exit reason = `stuck`, skips loop 2, and falls through to `TeamDelete()`.
162
+
163
+ **Pass criteria.**
164
+ - Loop count stops at 1.
165
+ - Final report contains `/bugteam exit: stuck` and names the two unresolved findings.
166
+ - Steps 19–27 fire despite the stuck exit — I-2 and I-11 enforce this.
167
+
168
+ ---
169
+
170
+ ## Eval 7 — Cap reached: 10 loops, no convergence
171
+
172
+ **Scenario.** Mock audit returns one P2 finding every loop. Mock fix teammate always commits but never clears the finding.
173
+
174
+ **Layer A invariants.** All of I-1 through I-13.
175
+
176
+ **Layer B predicted behavior.**
177
+ - Loops 1–3: single `Agent(name="bugfind")` per loop.
178
+ - Loops 4–10: three parallel `Agent(name="bugfind-loop-<N>-a/b/c")` in a single assistant message per loop, followed by two parallel `-b`/`-c` shutdowns and one `-a` shutdown.
179
+ - Each loop produces one `Agent(name="bugfix")` and its matching shutdown.
180
+ - Exactly 10 audit phases, exactly 10 fix phases.
181
+ - Steps 19–27 from Eval 5 fire at teardown.
182
+
183
+ **Pass criteria.**
184
+ - I-8 holds: exactly 10 audit phases.
185
+ - I-9 holds: loops 4–10 each emit three audit `Agent` calls in a single assistant message.
186
+ - Final report contains `/bugteam exit: cap reached` and the remaining bug count.
187
+
188
+ **Process check.** The distinct `Agent(name=...)` audit-call count is a prediction: 3 single-auditor loops × 1 call + 7 parallel-auditor loops × 3 calls = 24 audit `Agent` calls. On the first real run, record the exact count and rewrite this formula here.
189
+
190
+ ---
191
+
192
+ ## Eval 8 — Clean on first audit
193
+
194
+ **Scenario.** Loop 1 audit returns zero findings.
195
+
196
+ **Layer A invariants.** I-1 through I-7, I-10, I-11, I-12.
197
+
198
+ **Layer B predicted trace.** Eval 5 steps 1–9 and 19–27 only. There is no FIX phase: zero findings means the skill exits the loop when `last_action == "audited"` and `last_findings.total == 0`.
199
+
200
+ **Pass criteria.**
201
+ - Exactly 1 `Agent(name="bugfind"...)` call, 0 `Agent(name="bugfix"...)` calls, 1 bugfind shutdown.
202
+ - Bugfind's outcome XML records zero findings; the per-loop review POST carries body `## /bugteam loop 1 audit: 0P0 / 0P1 / 0P2 → clean`.
203
+ - Step 4.5 and Step 5 still fire.
204
+
205
+ ---
206
+
207
+ ## Eval 9 — Anchor fallback: finding outside diff
208
+
209
+ **Scenario.** Loop 1 audit returns 3 findings; 1 anchors to a line outside the captured diff.
210
+
211
+ **Layer A invariants.** Same as Eval 5.
212
+
213
+ **Layer B predicted teammate-side behavior** (observed via the recorded `gh api ... /reviews` POST payload in the bugfind teammate fixture).
214
+ - `comments[]` length in the POST body = 2 (anchored findings only).
215
+ - Review body contains a `### Findings without a diff anchor` section listing the third finding.
216
+ - Bugfix outcome XML marks all 3 findings with a `reply_comment_url`; the unanchored finding carries `used_fallback="true"`, and its `finding_comment_url` equals the parent review URL.
217
+
218
+ **Pass criteria.** Confirmed in the fixture's canned teammate outcome XML; Layer A invariants hold on the lead side.
219
+
220
+ ---
221
+
222
+ ## Eval 10 — Review POST failure fallback
223
+
224
+ **Scenario.** The first `POST /pulls/42/reviews` call from the bugfind teammate returns HTTP 422.
225
+
226
+ **Layer B predicted teammate-side behavior.**
227
+ - Bugfind teammate retries via the issue-comments endpoint `POST /repos/.../issues/42/comments` with a single body carrying the review header and every finding inline.
228
+ - Every finding's outcome XML carries `used_fallback="true"` and the issue-comment URL as `finding_comment_url`.
229
+ - Cycle continues to the FIX action without aborting.
230
+
231
+ **Open item for the real run.** The exact `gh api` payload shape for the issue-comments fallback is specified at `SKILL.md:164-171` as `jq -Rs | gh api .../issues/<number>/comments --input -`. Before running Eval 10 for real, confirm the teammate obeys this shape — the fixture must assert the endpoint path and the `--input -` pattern.
232
+
233
+ ---
234
+
235
+ ## Eval 11 — Hook-blocked fix commit
236
+
237
+ **Scenario.** Bugfix stages edits but `git commit` fails because a `pre-commit` hook returns non-zero.
238
+
239
+ **Layer B predicted behavior.**
240
+ - Bugfix teammate outcome XML marks every finding `status="hook_blocked"` with populated `<hook_output>`.
241
+ - Bugfix teammate posts `Hook blocked the fix commit: <one-line summary>` to each finding comment.
242
+ - Lead's `Bash("git rev-parse HEAD")` after fix detects no SHA change → exit reason `stuck`.
243
+ - Steps 19–27 from Eval 5 fire at teardown.
244
+
245
+ **Pass criteria.** Layer A I-2 and I-11 hold. Final report contains `/bugteam exit: stuck` and surfaces the hook_output summary.
246
+
247
+ ---
248
+
249
+ ## Eval 12 — `pr-description-writer` unavailable, `general-purpose` available
250
+
251
+ **Scenario.** The available-agents list does not include `pr-description-writer` but does include `general-purpose`.
252
+
253
+ **Layer B predicted trace.** Eval 5 steps 1–22 identical; step 23 becomes:
254
+
255
+ ```
256
+ Agent(subagent_type="general-purpose", description="Rewrite PR 42 body from cumulative diff", prompt=<same brief>)
257
+ ```
258
+
259
+ Steps 24–27 follow normally.
260
+
261
+ **Pass criteria.** Exactly 1 `Agent(subagent_type="general-purpose", ...)` call for the description rewrite. `gh pr edit` fires. Final report carries no Step 4.5 skip warning.
262
+
263
+ ---
264
+
265
+ ## Eval 13 — Neither PR-description agent available
266
+
267
+ **Scenario.** Neither `pr-description-writer` nor `general-purpose` appears in the available-agents list.
268
+
269
+ **Layer B predicted trace.** Eval 5 steps 1–22, then skip steps 23–25. Steps 26–27 still fire.
270
+
271
+ **Pass criteria.**
272
+ - Zero `Agent` calls for PR description rewriting.
273
+ - Zero `gh pr edit` calls.
274
+ - Final report carries the Step 4.5 skip warning.
275
+ - Layer A I-2 holds: revoke still fires.
276
+
277
+ ---
278
+
279
+ ## Eval 14 — Permissions revoke on error path
280
+
281
+ **Scenario.** Bugfind teammate refuses `shutdown_request` during loop 1, returning `{type: "shutdown_response", approve: false}`.
282
+
283
+ **Layer B predicted trace.** Eval 5 steps 1–8, then:
284
+ - Step 9 `SendMessage(to="bugfind", ...)` receives `approve: false`.
285
+ - Skill sets exit reason = `error: bugfind teammate refused shutdown`.
286
+ - Steps 19–27 all fire (Layer A I-2 and I-11 mandate this).
287
+
288
+ **Pass criteria.** Final report surfaces the error and the loop number. Revoke fires despite the error.
289
+
290
+ ---
291
+
292
+ ## Eval 15 — Orchestrator-only `TeamCreate` (supplementary work path)
293
+
294
+ **Scenario.** A loop 1 audit surfaces a P0/P1 finding whose root cause sits in adjacent infrastructure the lead needs to fix before the cycle can converge (e.g., a broken CI hook, a misbehaving lint config, a wrong GitHub API shape in a teammate's own dependency). The lead recognizes supplementary work is needed and decides to spawn additional teammates to handle it.
295
+
296
+ **Layer A invariants.** I-1, I-3, I-4, I-5, I-6, I-7, I-11, I-12, **I-13 (primary focus)**.
297
+
298
+ **Layer B predicted trace.** Eval 5 steps 1–9 identical. At step 10 (where a standard cycle spawns `bugfix`), the lead decides the finding requires adjacent infrastructure work first. Rather than call `TeamCreate` for a new team, the lead spawns a supplementary teammate into the existing team:
299
+
300
+ ```
301
+ Agent(
302
+ subagent_type="code-quality-agent",
303
+ name="bugfind-adjacent",
304
+ team_name="<lead_team_name>", // same team as bugfind/bugfix
305
+ model="sonnet",
306
+ description="Supplementary audit of adjacent infrastructure",
307
+ prompt=<brief naming the specific adjacent files + observed symptom>
308
+ )
309
+ ```
310
+
311
+ The adjacent-audit teammate writes its own outcome XML and self-terminates. The lead reads the XML, decides on a fix strategy, and spawns an adjacent-fix teammate into the same team. The cycle eventually returns to the standard `bugfix` spawn for the original finding(s). All spawns pass the same `team_name`.
312
+
313
+ **Pass criteria.**
314
+ - Layer A I-13 holds: zero `TeamCreate` calls beyond the single one at skill Step 2.
315
+ - Every `Agent(...)` call in the session carries `team_name="<lead_team_name>"`. No teammate spawn omits `team_name`.
316
+ - If the lead attempts a second `TeamCreate` call, the runtime returns the exact error quoted in I-13's citation; the lead treats this as a signal to spawn a teammate into the existing team instead.
317
+ - Working behavior is unchanged from a single-set cycle: grant → TeamCreate (once) → Agent spawns (many, all same team_name) → SendMessage shutdowns as needed → TeamDelete (once) → temp cleanup → Step 4.5 → revoke.
318
+
319
+ **Failure mode.** A second `TeamCreate` call in the session, or any `Agent(...)` call without `team_name` once the team exists. Either signals the orchestrator-only invariant has been violated and the clean-room/team semantics are broken.
320
+
321
+ **Observation source for this eval.** This eval was added after a real /bugteam run on PR #184 where the lead discovered a broken hook mid-cycle and initially spawned a standalone subagent (no `team_name`) for the adjacent audit — a direct violation. The runtime had already prevented a second `TeamCreate` with the error quoted in I-13. The eval codifies the correct path (spawn as teammate into existing team) so future runs do not repeat the violation.
322
+
323
+ ---
324
+
325
+ ## Iteration protocol
326
+
327
+ 1. **Cycle 0 — Reconcile predictions with reality.** On the first real run, diff every Layer B predicted trace against the observed trace. Patch this file to match reality and annotate each correction with a reason.
328
+ 2. **Baseline.** Run every eval with the skill unloaded. Record which cases the base model handles from memory versus which it gets wrong.
329
+ 3. **Treatment.** Run every eval with the skill loaded. Layer A invariants must pass on every case. Layer B mismatches trigger Cycle 0 reconciliation.
330
+ 4. **Regress on change.** Every edit to `SKILL.md` re-runs the full suite. A passing→failing transition on any Layer A invariant blocks the change. A Layer B mismatch after a `SKILL.md` edit triggers a patch to the affected eval trace in the same commit.
331
+ 5. **Extend on gotcha.** When the skill misfires in real use, add a new eval that reproduces the miss before patching `SKILL.md`.
332
+
333
+ ## Harness sketch (future work)
334
+
335
+ A minimal Python harness under `packages/claude-dev-env/skills/bugteam/evals/`:
336
+
337
+ - `harness.py` — loads a fixture, injects a mock tool layer that records calls and returns canned responses, invokes the lead with the trigger, collects the recorded trace, evaluates pass criteria.
338
+ - `fixtures/` — one subdirectory per eval with canned `gh` responses, canned audit XML, canned fix XML, and the expected trace JSON.
339
+ - `run_evals.py` — discovery + pass/fail reporting, exits non-zero on any failure for CI.
340
+ - `invariants.py` — the Layer A assertion bank, imported by every fixture.
341
+
342
+ ## Open research items flagged during this pass
343
+
344
+ 1. **GitHub REST review-POST payload shape.** Eval 9 and Eval 10 depend on the exact body shape of `POST /pulls/<number>/reviews`. `SKILL.md:138-158` specifies the `jq -n --rawfile ... --argjson ... | gh api ... --input -` shape. Before running Eval 9/10 for real, fetch the current GitHub REST reference to confirm the request schema (fields `commit_id`, `event`, `body`, `comments[]`) and the multi-line anchor `{path, start_line, start_side, line, side, body}` shape still apply. Record the confirmed version and URL here.
345
+ 2. **`SendMessage` shutdown origination — RESOLVED.** `SendMessage` tool docs include the line "Don't originate `shutdown_request` unless asked." `TeamCreate` tool docs explicitly direct the lead to originate `{type: "shutdown_request"}` for teammate cleanup. Real-run observation (loop 1 of eval run 2026-04-18) resolved the contradiction: teammates self-terminate when their task is complete — the `Agent` call returns and the teammate's session ends without any `SendMessage`. The cycle proceeded correctly without the lead ever needing to originate a `shutdown_request`. `SKILL.md` has been updated to document self-termination as the expected path and lead-originated `SendMessage(shutdown_request)` as a fallback for teammates that do not self-terminate after the `Agent` call returns. The literal `SendMessage` call shapes remain in `SKILL.md` for that fallback case.
346
+ 3. **Model override redundancy.** `code-quality-agent` and `clean-coder` may already pin `model` in their agent definitions. The explicit `model="sonnet"` in every spawn is insurance, but on the first real run confirm no conflict between the lead-passed model and the agent-frontmatter model.
@@ -0,0 +1,37 @@
1
+ # Bugteam utility scripts
2
+
3
+ Scripts in this directory are **executed** by the lead or teammates. They are not loaded into context as instructions (see Anthropic [Skill authoring best practices — Progressive disclosure](https://platform.claude.com/docs/en/agents-and-tools/agent-skills/best-practices#progressive-disclosure-patterns)).
4
+
5
+ | Script | Purpose |
6
+ |--------|---------|
7
+ | `bugteam_preflight.py` | Run pytest (when configured) and optional `pre-commit` before `/bugteam`. |
8
+ | `bugteam_code_rules_gate.py` | Run `validate_content` from `code_rules_enforcer.py` on PR-scoped files (`git diff` vs merge-base). Exit `1` if any mandatory rule fails. Invoked **before each audit**; the fixer clears it before the auditor runs. |
9
+
10
+ ## `bugteam_preflight.py`
11
+
12
+ From the repository root:
13
+
14
+ ```bash
15
+ python "${CLAUDE_SKILL_DIR}/scripts/bugteam_preflight.py"
16
+ ```
17
+
18
+ - Skips pytest when `BUGTEAM_PREFLIGHT_SKIP=1`.
19
+ - Skips pytest when `pytest.ini` / `pyproject.toml` exists but no `test_*.py` / `*_test.py` files are found under the repo root.
20
+ - Pytest exit code `5` (no tests collected) is treated as success.
21
+ - Add `--pre-commit` to run `pre-commit run --all-files` when `.pre-commit-config.yaml` exists.
22
+
23
+ ## `bugteam_code_rules_gate.py`
24
+
25
+ From the repository root (same merge-base rules as the PR head vs base — default `--base origin/main`):
26
+
27
+ ```bash
28
+ python "${CLAUDE_SKILL_DIR}/scripts/bugteam_code_rules_gate.py"
29
+ ```
30
+
31
+ Optional explicit files instead of `git diff`:
32
+
33
+ ```bash
34
+ python "${CLAUDE_SKILL_DIR}/scripts/bugteam_code_rules_gate.py" path/to/a.py path/to/b.ts
35
+ ```
36
+
37
+ This loads `validate_content` from `hooks/blocking/code_rules_enforcer.py` inside `claude-dev-env` (same logic as the PreToolUse hook). Exit `0` = mandatory checks pass on scanned files; exit `1` = violations printed to stderr.