buildlog 0.6.1__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. {buildlog-0.6.1 → buildlog-0.7.0}/.gitignore +8 -0
  2. {buildlog-0.6.1 → buildlog-0.7.0}/PKG-INFO +71 -17
  3. {buildlog-0.6.1 → buildlog-0.7.0}/README.md +70 -16
  4. {buildlog-0.6.1 → buildlog-0.7.0}/pyproject.toml +1 -1
  5. {buildlog-0.6.1 → buildlog-0.7.0}/src/buildlog/cli.py +153 -0
  6. {buildlog-0.6.1 → buildlog-0.7.0}/src/buildlog/core/__init__.py +10 -0
  7. {buildlog-0.6.1 → buildlog-0.7.0}/src/buildlog/core/operations.py +234 -0
  8. {buildlog-0.6.1 → buildlog-0.7.0}/src/buildlog/mcp/server.py +6 -0
  9. {buildlog-0.6.1 → buildlog-0.7.0}/src/buildlog/mcp/tools.py +105 -0
  10. {buildlog-0.6.1 → buildlog-0.7.0}/src/buildlog/render/claude_md.py +17 -4
  11. {buildlog-0.6.1 → buildlog-0.7.0}/src/buildlog/render/tracking.py +20 -1
  12. {buildlog-0.6.1 → buildlog-0.7.0}/src/buildlog/seeds.py +41 -0
  13. {buildlog-0.6.1 → buildlog-0.7.0}/LICENSE +0 -0
  14. {buildlog-0.6.1 → buildlog-0.7.0}/copier.yml +0 -0
  15. {buildlog-0.6.1 → buildlog-0.7.0}/post_gen.py +0 -0
  16. {buildlog-0.6.1 → buildlog-0.7.0}/src/buildlog/__init__.py +0 -0
  17. {buildlog-0.6.1 → buildlog-0.7.0}/src/buildlog/confidence.py +0 -0
  18. {buildlog-0.6.1 → buildlog-0.7.0}/src/buildlog/data/__init__.py +0 -0
  19. {buildlog-0.6.1 → buildlog-0.7.0}/src/buildlog/data/seeds/security_karen.yaml +0 -0
  20. {buildlog-0.6.1 → buildlog-0.7.0}/src/buildlog/data/seeds/test_terrorist.yaml +0 -0
  21. {buildlog-0.6.1 → buildlog-0.7.0}/src/buildlog/distill.py +0 -0
  22. {buildlog-0.6.1 → buildlog-0.7.0}/src/buildlog/embeddings.py +0 -0
  23. {buildlog-0.6.1 → buildlog-0.7.0}/src/buildlog/mcp/__init__.py +0 -0
  24. {buildlog-0.6.1 → buildlog-0.7.0}/src/buildlog/render/__init__.py +0 -0
  25. {buildlog-0.6.1 → buildlog-0.7.0}/src/buildlog/render/base.py +0 -0
  26. {buildlog-0.6.1 → buildlog-0.7.0}/src/buildlog/render/settings_json.py +0 -0
  27. {buildlog-0.6.1 → buildlog-0.7.0}/src/buildlog/render/skill.py +0 -0
  28. {buildlog-0.6.1 → buildlog-0.7.0}/src/buildlog/seed_engine/__init__.py +0 -0
  29. {buildlog-0.6.1 → buildlog-0.7.0}/src/buildlog/seed_engine/categorizers.py +0 -0
  30. {buildlog-0.6.1 → buildlog-0.7.0}/src/buildlog/seed_engine/extractors.py +0 -0
  31. {buildlog-0.6.1 → buildlog-0.7.0}/src/buildlog/seed_engine/generators.py +0 -0
  32. {buildlog-0.6.1 → buildlog-0.7.0}/src/buildlog/seed_engine/models.py +0 -0
  33. {buildlog-0.6.1 → buildlog-0.7.0}/src/buildlog/seed_engine/pipeline.py +0 -0
  34. {buildlog-0.6.1 → buildlog-0.7.0}/src/buildlog/seed_engine/sources.py +0 -0
  35. {buildlog-0.6.1 → buildlog-0.7.0}/src/buildlog/skills.py +0 -0
  36. {buildlog-0.6.1 → buildlog-0.7.0}/src/buildlog/stats.py +0 -0
  37. {buildlog-0.6.1 → buildlog-0.7.0}/template/buildlog/.gitkeep +0 -0
  38. {buildlog-0.6.1 → buildlog-0.7.0}/template/buildlog/2026-01-01-example.md +0 -0
  39. {buildlog-0.6.1 → buildlog-0.7.0}/template/buildlog/BUILDLOG_SYSTEM.md +0 -0
  40. {buildlog-0.6.1 → buildlog-0.7.0}/template/buildlog/_TEMPLATE.md +0 -0
  41. {buildlog-0.6.1 → buildlog-0.7.0}/template/buildlog/assets/.gitkeep +0 -0
@@ -44,3 +44,11 @@ htmlcov/
44
44
 
45
45
  # Build artifacts
46
46
  *.whl
47
+
48
+ # Development artifacts
49
+ CHAT.txt
50
+ results/
51
+ sketches/
52
+
53
+ # buildlog runtime data (in project root, not in src/)
54
+ buildlog/.buildlog/
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: buildlog
3
- Version: 0.6.1
3
+ Version: 0.7.0
4
4
  Summary: Engineering notebook for AI-assisted development
5
5
  Project-URL: Homepage, https://github.com/Peleke/buildlog-template
6
6
  Project-URL: Repository, https://github.com/Peleke/buildlog-template
@@ -123,11 +123,30 @@ RMR is not the only metric that matters. But it's one we can measure, and measur
123
123
 
124
124
  ## The Mechanism
125
125
 
126
- buildlog uses **contextual bandits** to select which rules to surface.
126
+ buildlog is building toward **contextual bandits** for automatic rule selection. Here's where we are:
127
+
128
+ ### What Exists Today (v0.7)
127
129
 
128
130
  ```
129
131
  ┌─────────────────────────────────────────────────────────────────┐
130
- │ CONTEXTUAL BANDIT SETUP │
132
+ │ CURRENT INFRASTRUCTURE │
133
+ ├─────────────────────────────────────────────────────────────────┤
134
+ │ │
135
+ │ ✅ Rule extraction From entries, reviews, curated seeds │
136
+ │ ✅ Confidence scoring Frequency + recency based │
137
+ │ ✅ Reward logging Accept/reject/revision signals │
138
+ │ ✅ Experiment tracking Sessions, mistakes, RMR calculation │
139
+ │ ✅ Review gauntlet Curated persona-based code review │
140
+ │ ⏳ Manual promotion Human selects rules to surface │
141
+ │ │
142
+ └─────────────────────────────────────────────────────────────────┘
143
+ ```
144
+
145
+ ### What's Coming (v0.8+)
146
+
147
+ ```
148
+ ┌─────────────────────────────────────────────────────────────────┐
149
+ │ CONTEXTUAL BANDIT (PLANNED) │
131
150
  ├─────────────────────────────────────────────────────────────────┤
132
151
  │ │
133
152
  │ Context (c): Error class, file type, task category │
@@ -147,9 +166,9 @@ buildlog uses **contextual bandits** to select which rules to surface.
147
166
 
148
167
  **Reward** = did surfacing this rule actually help?
149
168
 
150
- The system explores (tries uncertain rules) and exploits (uses proven rules) based on accumulated evidence. Thompson Sampling provides theoretical guarantees: O(√(KT log K)) regret bounds.
169
+ The reward infrastructure exists. The bandit policy is next. Thompson Sampling will provide theoretical guarantees: O(√(KT log K)) regret bounds.
151
170
 
152
- This isn't magic. It's a well-understood framework with decades of research. We're applying it to agent rule selection.
171
+ We're building in public—the bandit implementation will be developed with full documentation of the process.
153
172
 
154
173
  ---
155
174
 
@@ -161,16 +180,20 @@ buildlog captures signal at every stage:
161
180
  flowchart LR
162
181
  A["Work Sessions"] --> B["Structured Entries"]
163
182
  B --> C["Extracted Rules"]
164
- C --> D["Bandit Selection"]
183
+ C --> D["Manual Promotion"]
165
184
  D --> E["Rule Surfaced"]
166
185
  E --> F["Human Feedback"]
167
- F --> G["Posterior Update"]
168
- G --> D
186
+ F --> G["Reward Logged"]
187
+ G -.-> H["Bandit Policy"]
188
+ H -.-> D
169
189
 
170
190
  style F fill:#ff6b6b,color:#fff
171
191
  style G fill:#4ecdc4,color:#fff
192
+ style H fill:#666,color:#fff,stroke-dasharray: 5 5
172
193
  ```
173
194
 
195
+ *Dashed: Coming in v0.8 — automatic rule selection via Thompson Sampling*
196
+
174
197
  ### Stage 1: Capture
175
198
  Document your work. Include the fuckups—they're the most valuable signal.
176
199
 
@@ -269,6 +292,27 @@ buildlog gauntlet rules --format markdown -o review_checklist.md
269
292
  buildlog gauntlet learn review_issues.json --source "PR#42"
270
293
  ```
271
294
 
295
+ ### Gauntlet Loop (Agent Integration)
296
+
297
+ For AI agents, the gauntlet loop automates the fix-rerun cycle:
298
+
299
+ ```bash
300
+ buildlog gauntlet loop src/ --persona security_karen --persona test_terrorist
301
+ ```
302
+
303
+ The loop provides structured checkpoints:
304
+
305
+ | Severity | Action | Human Needed? |
306
+ |----------|--------|---------------|
307
+ | **Critical** | Agent fixes, reruns | No |
308
+ | **Major** | Checkpoint: continue? | Yes |
309
+ | **Minor** | Accept risk or fix? | Yes |
310
+ | **Clean** | Done | No |
311
+
312
+ MCP tools for agent integration:
313
+ - `buildlog_gauntlet_issues` — Report findings, get next action
314
+ - `buildlog_gauntlet_accept_risk` — Accept remaining issues (optionally create GitHub issues)
315
+
272
316
  The gauntlet integrates with the learning loop—issues found become rules that accumulate confidence.
273
317
 
274
318
  ---
@@ -359,6 +403,8 @@ Available tools:
359
403
  | `buildlog_start_session` | Begin tracked experiment |
360
404
  | `buildlog_log_mistake` | Record mistake during session |
361
405
  | `buildlog_experiment_report` | Full experiment report |
406
+ | `buildlog_gauntlet_issues` | Report gauntlet findings, get next action |
407
+ | `buildlog_gauntlet_accept_risk` | Accept remaining issues, optionally create GH issues |
362
408
 
363
409
  ### CLI Commands
364
410
 
@@ -382,6 +428,7 @@ buildlog gauntlet list # Show reviewers
382
428
  buildlog gauntlet rules # Export rules
383
429
  buildlog gauntlet prompt <path> # Generate review prompt
384
430
  buildlog gauntlet learn <file> # Persist learnings
431
+ buildlog gauntlet loop <path> # Auto-fix loop with HITL checkpoints
385
432
  ```
386
433
 
387
434
  ---
@@ -421,21 +468,28 @@ This is how you know. Not vibes. Data.
421
468
 
422
469
  For the technically curious:
423
470
 
424
- | Concept | Application in buildlog |
425
- |---------|------------------------|
426
- | **Thompson Sampling** | Rule selection under uncertainty |
427
- | **Beta-Bernoulli model** | Posterior updates from binary reward |
428
- | **Contextual bandits** | Context-dependent rule selection |
429
- | **Regret bounds** | O(√(KT log K)) theoretical guarantee |
430
- | **Semantic hashing** | Mistake deduplication for RMR |
471
+ | Concept | Application in buildlog | Status |
472
+ |---------|------------------------|--------|
473
+ | **Confidence scoring** | Frequency + recency decay | ✅ Implemented |
474
+ | **Semantic hashing** | Mistake deduplication for RMR | ✅ Implemented |
475
+ | **Reward signals** | Binary feedback infrastructure | ✅ Implemented |
476
+ | **Thompson Sampling** | Rule selection under uncertainty | ⏳ Planned (v0.8) |
477
+ | **Beta-Bernoulli model** | Posterior updates from binary reward | ⏳ Planned (v0.8) |
478
+ | **Contextual bandits** | Context-dependent rule selection | ⏳ Planned (v0.8) |
479
+ | **Regret bounds** | O(√(KT log K)) theoretical guarantee | ⏳ Planned (v0.8) |
431
480
 
432
- We're not inventing new math. We're applying proven frameworks to a new domain.
481
+ We're not inventing new math. We're applying proven frameworks to a new domain. The infrastructure for reward collection is live; the bandit policy is the next milestone.
433
482
 
434
483
  ---
435
484
 
436
485
  ## Honest Limitations
437
486
 
438
- Things we don't have figured out yet:
487
+ ### Not Yet Implemented
488
+
489
+ - **Automatic rule selection**: Currently manual promotion; Thompson Sampling bandit planned for v0.8
490
+ - **Context-aware surfacing**: Rules are surfaced globally, not based on task context
491
+
492
+ ### Hard Problems We're Working On
439
493
 
440
494
  - **Credit assignment**: When multiple rules are active, which one helped?
441
495
  - **Non-stationarity**: Developer skill changes over time
@@ -75,11 +75,30 @@ RMR is not the only metric that matters. But it's one we can measure, and measur
75
75
 
76
76
  ## The Mechanism
77
77
 
78
- buildlog uses **contextual bandits** to select which rules to surface.
78
+ buildlog is building toward **contextual bandits** for automatic rule selection. Here's where we are:
79
+
80
+ ### What Exists Today (v0.7)
79
81
 
80
82
  ```
81
83
  ┌─────────────────────────────────────────────────────────────────┐
82
- │ CONTEXTUAL BANDIT SETUP │
84
+ │ CURRENT INFRASTRUCTURE │
85
+ ├─────────────────────────────────────────────────────────────────┤
86
+ │ │
87
+ │ ✅ Rule extraction From entries, reviews, curated seeds │
88
+ │ ✅ Confidence scoring Frequency + recency based │
89
+ │ ✅ Reward logging Accept/reject/revision signals │
90
+ │ ✅ Experiment tracking Sessions, mistakes, RMR calculation │
91
+ │ ✅ Review gauntlet Curated persona-based code review │
92
+ │ ⏳ Manual promotion Human selects rules to surface │
93
+ │ │
94
+ └─────────────────────────────────────────────────────────────────┘
95
+ ```
96
+
97
+ ### What's Coming (v0.8+)
98
+
99
+ ```
100
+ ┌─────────────────────────────────────────────────────────────────┐
101
+ │ CONTEXTUAL BANDIT (PLANNED) │
83
102
  ├─────────────────────────────────────────────────────────────────┤
84
103
  │ │
85
104
  │ Context (c): Error class, file type, task category │
@@ -99,9 +118,9 @@ buildlog uses **contextual bandits** to select which rules to surface.
99
118
 
100
119
  **Reward** = did surfacing this rule actually help?
101
120
 
102
- The system explores (tries uncertain rules) and exploits (uses proven rules) based on accumulated evidence. Thompson Sampling provides theoretical guarantees: O(√(KT log K)) regret bounds.
121
+ The reward infrastructure exists. The bandit policy is next. Thompson Sampling will provide theoretical guarantees: O(√(KT log K)) regret bounds.
103
122
 
104
- This isn't magic. It's a well-understood framework with decades of research. We're applying it to agent rule selection.
123
+ We're building in public—the bandit implementation will be developed with full documentation of the process.
105
124
 
106
125
  ---
107
126
 
@@ -113,16 +132,20 @@ buildlog captures signal at every stage:
113
132
  flowchart LR
114
133
  A["Work Sessions"] --> B["Structured Entries"]
115
134
  B --> C["Extracted Rules"]
116
- C --> D["Bandit Selection"]
135
+ C --> D["Manual Promotion"]
117
136
  D --> E["Rule Surfaced"]
118
137
  E --> F["Human Feedback"]
119
- F --> G["Posterior Update"]
120
- G --> D
138
+ F --> G["Reward Logged"]
139
+ G -.-> H["Bandit Policy"]
140
+ H -.-> D
121
141
 
122
142
  style F fill:#ff6b6b,color:#fff
123
143
  style G fill:#4ecdc4,color:#fff
144
+ style H fill:#666,color:#fff,stroke-dasharray: 5 5
124
145
  ```
125
146
 
147
+ *Dashed: Coming in v0.8 — automatic rule selection via Thompson Sampling*
148
+
126
149
  ### Stage 1: Capture
127
150
  Document your work. Include the fuckups—they're the most valuable signal.
128
151
 
@@ -221,6 +244,27 @@ buildlog gauntlet rules --format markdown -o review_checklist.md
221
244
  buildlog gauntlet learn review_issues.json --source "PR#42"
222
245
  ```
223
246
 
247
+ ### Gauntlet Loop (Agent Integration)
248
+
249
+ For AI agents, the gauntlet loop automates the fix-rerun cycle:
250
+
251
+ ```bash
252
+ buildlog gauntlet loop src/ --persona security_karen --persona test_terrorist
253
+ ```
254
+
255
+ The loop provides structured checkpoints:
256
+
257
+ | Severity | Action | Human Needed? |
258
+ |----------|--------|---------------|
259
+ | **Critical** | Agent fixes, reruns | No |
260
+ | **Major** | Checkpoint: continue? | Yes |
261
+ | **Minor** | Accept risk or fix? | Yes |
262
+ | **Clean** | Done | No |
263
+
264
+ MCP tools for agent integration:
265
+ - `buildlog_gauntlet_issues` — Report findings, get next action
266
+ - `buildlog_gauntlet_accept_risk` — Accept remaining issues (optionally create GitHub issues)
267
+
224
268
  The gauntlet integrates with the learning loop—issues found become rules that accumulate confidence.
225
269
 
226
270
  ---
@@ -311,6 +355,8 @@ Available tools:
311
355
  | `buildlog_start_session` | Begin tracked experiment |
312
356
  | `buildlog_log_mistake` | Record mistake during session |
313
357
  | `buildlog_experiment_report` | Full experiment report |
358
+ | `buildlog_gauntlet_issues` | Report gauntlet findings, get next action |
359
+ | `buildlog_gauntlet_accept_risk` | Accept remaining issues, optionally create GH issues |
314
360
 
315
361
  ### CLI Commands
316
362
 
@@ -334,6 +380,7 @@ buildlog gauntlet list # Show reviewers
334
380
  buildlog gauntlet rules # Export rules
335
381
  buildlog gauntlet prompt <path> # Generate review prompt
336
382
  buildlog gauntlet learn <file> # Persist learnings
383
+ buildlog gauntlet loop <path> # Auto-fix loop with HITL checkpoints
337
384
  ```
338
385
 
339
386
  ---
@@ -373,21 +420,28 @@ This is how you know. Not vibes. Data.
373
420
 
374
421
  For the technically curious:
375
422
 
376
- | Concept | Application in buildlog |
377
- |---------|------------------------|
378
- | **Thompson Sampling** | Rule selection under uncertainty |
379
- | **Beta-Bernoulli model** | Posterior updates from binary reward |
380
- | **Contextual bandits** | Context-dependent rule selection |
381
- | **Regret bounds** | O(√(KT log K)) theoretical guarantee |
382
- | **Semantic hashing** | Mistake deduplication for RMR |
423
+ | Concept | Application in buildlog | Status |
424
+ |---------|------------------------|--------|
425
+ | **Confidence scoring** | Frequency + recency decay | ✅ Implemented |
426
+ | **Semantic hashing** | Mistake deduplication for RMR | ✅ Implemented |
427
+ | **Reward signals** | Binary feedback infrastructure | ✅ Implemented |
428
+ | **Thompson Sampling** | Rule selection under uncertainty | ⏳ Planned (v0.8) |
429
+ | **Beta-Bernoulli model** | Posterior updates from binary reward | ⏳ Planned (v0.8) |
430
+ | **Contextual bandits** | Context-dependent rule selection | ⏳ Planned (v0.8) |
431
+ | **Regret bounds** | O(√(KT log K)) theoretical guarantee | ⏳ Planned (v0.8) |
383
432
 
384
- We're not inventing new math. We're applying proven frameworks to a new domain.
433
+ We're not inventing new math. We're applying proven frameworks to a new domain. The infrastructure for reward collection is live; the bandit policy is the next milestone.
385
434
 
386
435
  ---
387
436
 
388
437
  ## Honest Limitations
389
438
 
390
- Things we don't have figured out yet:
439
+ ### Not Yet Implemented
440
+
441
+ - **Automatic rule selection**: Currently manual promotion; Thompson Sampling bandit planned for v0.8
442
+ - **Context-aware surfacing**: Rules are surfaced globally, not based on task context
443
+
444
+ ### Hard Problems We're Working On
391
445
 
392
446
  - **Credit assignment**: When multiple rules are active, which one helped?
393
447
  - **Non-stationarity**: Developer skill changes over time
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "buildlog"
7
- version = "0.6.1"
7
+ version = "0.7.0"
8
8
  description = "Engineering notebook for AI-assisted development"
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -1264,5 +1264,158 @@ def gauntlet_learn(issues_file: str, source: str | None, output_json: bool):
1264
1264
  click.echo(f" Total processed: {result.total_issues_processed}")
1265
1265
 
1266
1266
 
1267
+ @gauntlet.command("loop")
1268
+ @click.argument("target", type=click.Path(exists=True))
1269
+ @click.option(
1270
+ "--persona",
1271
+ "-p",
1272
+ multiple=True,
1273
+ help="Personas to run (default: all)",
1274
+ )
1275
+ @click.option(
1276
+ "--max-iterations",
1277
+ "-n",
1278
+ default=10,
1279
+ help="Maximum iterations to prevent infinite loops (default: 10)",
1280
+ )
1281
+ @click.option(
1282
+ "--stop-at",
1283
+ type=click.Choice(["criticals", "majors", "minors"]),
1284
+ default="minors",
1285
+ help="Stop after clearing this severity level (default: minors)",
1286
+ )
1287
+ @click.option(
1288
+ "--auto-gh-issues",
1289
+ is_flag=True,
1290
+ help="Create GitHub issues for remaining items when accepting risk",
1291
+ )
1292
+ @click.option("--json", "output_json", is_flag=True, help="Output as JSON")
1293
+ def gauntlet_loop(
1294
+ target: str,
1295
+ persona: tuple[str, ...],
1296
+ max_iterations: int,
1297
+ stop_at: str,
1298
+ auto_gh_issues: bool,
1299
+ output_json: bool,
1300
+ ):
1301
+ """Run the gauntlet loop: review, fix, repeat until clean.
1302
+
1303
+ This command orchestrates the gauntlet loop workflow:
1304
+
1305
+ 1. Generate review prompt for target code
1306
+ 2. Process issues and determine action
1307
+ 3. On criticals: output fix instructions, expect re-run
1308
+ 4. On majors only: checkpoint (ask to continue)
1309
+ 5. On minors only: checkpoint (accept risk?)
1310
+ 6. Optionally create GitHub issues for remaining items
1311
+
1312
+ The loop is designed to be run interactively with an agent
1313
+ (Claude Code, Cursor, etc.) that does the actual fixing.
1314
+
1315
+ Examples:
1316
+
1317
+ buildlog gauntlet loop src/
1318
+ buildlog gauntlet loop tests/ --stop-at majors
1319
+ buildlog gauntlet loop . --auto-gh-issues
1320
+ """
1321
+ import json as json_module
1322
+
1323
+ from buildlog.seeds import get_default_seeds_dir, load_all_seeds
1324
+
1325
+ # Find seeds directory
1326
+ seeds_dir = get_default_seeds_dir()
1327
+
1328
+ if seeds_dir is None:
1329
+ click.echo("No seed files found.", err=True)
1330
+ raise SystemExit(1)
1331
+
1332
+ seeds = load_all_seeds(seeds_dir)
1333
+
1334
+ if not seeds:
1335
+ click.echo("No seed files found in directory.", err=True)
1336
+ raise SystemExit(1)
1337
+
1338
+ # Filter personas
1339
+ if persona:
1340
+ seeds = {k: v for k, v in seeds.items() if k in persona}
1341
+ if not seeds:
1342
+ click.echo(f"No matching personas: {', '.join(persona)}", err=True)
1343
+ raise SystemExit(1)
1344
+
1345
+ target_path = Path(target)
1346
+
1347
+ # Generate persona rules summary
1348
+ rules_by_persona: dict[str, list[dict[str, str]]] = {}
1349
+ for name, sf in seeds.items():
1350
+ rules_by_persona[name] = [
1351
+ {"rule": r.rule, "antipattern": r.antipattern, "category": r.category}
1352
+ for r in sf.rules
1353
+ ]
1354
+
1355
+ # Loop instructions
1356
+ instructions = [
1357
+ "1. Review the target code using the rules from each persona",
1358
+ "2. Report all violations as JSON issues with: severity, category, description, rule_learned, location",
1359
+ "3. Call `buildlog_gauntlet_issues` with the issues list to determine next action",
1360
+ "4. If action='fix_criticals': Fix critical+major issues, then re-run gauntlet",
1361
+ "5. If action='checkpoint_majors': Ask user whether to continue fixing majors",
1362
+ "6. If action='checkpoint_minors': Ask user whether to accept risk or continue",
1363
+ "7. If user accepts risk and --auto-gh-issues: Call `buildlog_gauntlet_accept_risk` with remaining issues",
1364
+ "8. Repeat until action='clean' or max_iterations reached",
1365
+ ]
1366
+
1367
+ # Expected issue format
1368
+ issue_format = {
1369
+ "severity": "critical|major|minor|nitpick",
1370
+ "category": "security|testing|architectural|workflow|...",
1371
+ "description": "Concrete description of what's wrong",
1372
+ "rule_learned": "Generalizable rule for the future",
1373
+ "location": "file:line (optional)",
1374
+ }
1375
+
1376
+ # Build the loop output
1377
+ output = {
1378
+ "command": "gauntlet_loop",
1379
+ "target": str(target_path),
1380
+ "personas": list(seeds.keys()),
1381
+ "max_iterations": max_iterations,
1382
+ "stop_at": stop_at,
1383
+ "auto_gh_issues": auto_gh_issues,
1384
+ "rules_by_persona": rules_by_persona,
1385
+ "instructions": instructions,
1386
+ "issue_format": issue_format,
1387
+ }
1388
+
1389
+ if output_json:
1390
+ click.echo(json_module.dumps(output, indent=2))
1391
+ else:
1392
+ # Human-readable output
1393
+ click.echo("=" * 60)
1394
+ click.echo("GAUNTLET LOOP")
1395
+ click.echo("=" * 60)
1396
+ click.echo(f"\nTarget: {target_path}")
1397
+ click.echo(f"Personas: {', '.join(seeds.keys())}")
1398
+ click.echo(f"Max iterations: {max_iterations}")
1399
+ click.echo(f"Stop at: {stop_at}")
1400
+ click.echo(f"Auto GH issues: {auto_gh_issues}")
1401
+
1402
+ click.echo("\n--- RULES ---")
1403
+ for name, rules in rules_by_persona.items():
1404
+ click.echo(f"\n## {name.replace('_', ' ').title()}")
1405
+ for r in rules:
1406
+ click.echo(f" • {r['rule']}")
1407
+
1408
+ click.echo("\n--- LOOP WORKFLOW ---")
1409
+ for instruction in instructions:
1410
+ click.echo(f" {instruction}")
1411
+
1412
+ click.echo("\n--- ISSUE FORMAT ---")
1413
+ click.echo(json_module.dumps(issue_format, indent=2))
1414
+
1415
+ click.echo("\n" + "=" * 60)
1416
+ click.echo("Ready. Run gauntlet review and process issues.")
1417
+ click.echo("=" * 60)
1418
+
1419
+
1267
1420
  if __name__ == "__main__":
1268
1421
  main()
@@ -3,6 +3,8 @@
3
3
  from buildlog.core.operations import (
4
4
  DiffResult,
5
5
  EndSessionResult,
6
+ GauntletAcceptRiskResult,
7
+ GauntletLoopResult,
6
8
  LearnFromReviewResult,
7
9
  LogMistakeResult,
8
10
  LogRewardResult,
@@ -20,6 +22,8 @@ from buildlog.core.operations import (
20
22
  diff,
21
23
  end_session,
22
24
  find_skills_by_ids,
25
+ gauntlet_accept_risk,
26
+ gauntlet_process_issues,
23
27
  get_experiment_report,
24
28
  get_rewards,
25
29
  get_session_metrics,
@@ -50,6 +54,9 @@ __all__ = [
50
54
  "StartSessionResult",
51
55
  "EndSessionResult",
52
56
  "LogMistakeResult",
57
+ # Gauntlet loop
58
+ "GauntletLoopResult",
59
+ "GauntletAcceptRiskResult",
53
60
  "status",
54
61
  "promote",
55
62
  "reject",
@@ -64,4 +71,7 @@ __all__ = [
64
71
  "log_mistake",
65
72
  "get_session_metrics",
66
73
  "get_experiment_report",
74
+ # Gauntlet loop operations
75
+ "gauntlet_process_issues",
76
+ "gauntlet_accept_risk",
67
77
  ]
@@ -35,6 +35,9 @@ __all__ = [
35
35
  "StartSessionResult",
36
36
  "EndSessionResult",
37
37
  "LogMistakeResult",
38
+ # Gauntlet loop
39
+ "GauntletLoopResult",
40
+ "GauntletAcceptRiskResult",
38
41
  "status",
39
42
  "promote",
40
43
  "reject",
@@ -49,6 +52,9 @@ __all__ = [
49
52
  "log_mistake",
50
53
  "get_session_metrics",
51
54
  "get_experiment_report",
55
+ # Gauntlet loop operations
56
+ "gauntlet_process_issues",
57
+ "gauntlet_accept_risk",
52
58
  ]
53
59
 
54
60
 
@@ -1652,3 +1658,231 @@ def get_experiment_report(buildlog_dir: Path) -> dict:
1652
1658
  "sessions": session_metrics,
1653
1659
  "error_classes": error_classes,
1654
1660
  }
1661
+
1662
+
1663
+ # =============================================================================
1664
+ # Gauntlet Loop Operations
1665
+ # =============================================================================
1666
+
1667
+
1668
+ @dataclass
1669
+ class GauntletLoopResult:
1670
+ """Result of processing gauntlet issues.
1671
+
1672
+ Attributes:
1673
+ action: What to do next:
1674
+ - "fix_criticals": Criticals remain, auto-fix and loop
1675
+ - "checkpoint_majors": No criticals, but majors remain (HITL)
1676
+ - "checkpoint_minors": Only minors remain (HITL)
1677
+ - "clean": No issues remain
1678
+ criticals: List of critical severity issues
1679
+ majors: List of major severity issues
1680
+ minors: List of minor/nitpick severity issues
1681
+ iteration: Current iteration number
1682
+ learnings_persisted: Number of learnings persisted this iteration
1683
+ message: Human-readable summary
1684
+ """
1685
+
1686
+ action: Literal["fix_criticals", "checkpoint_majors", "checkpoint_minors", "clean"]
1687
+ criticals: list[dict]
1688
+ majors: list[dict]
1689
+ minors: list[dict]
1690
+ iteration: int
1691
+ learnings_persisted: int
1692
+ message: str
1693
+
1694
+
1695
+ @dataclass
1696
+ class GauntletAcceptRiskResult:
1697
+ """Result of accepting risk with remaining issues.
1698
+
1699
+ Attributes:
1700
+ accepted_issues: Number of issues accepted as risk
1701
+ github_issues_created: Number of GitHub issues created (if enabled)
1702
+ github_issue_urls: URLs of created GitHub issues
1703
+ message: Human-readable summary
1704
+ error: Error message if operation failed
1705
+ """
1706
+
1707
+ accepted_issues: int
1708
+ github_issues_created: int
1709
+ github_issue_urls: list[str]
1710
+ message: str
1711
+ error: str | None = None
1712
+
1713
+
1714
+ def gauntlet_process_issues(
1715
+ buildlog_dir: Path,
1716
+ issues: list[dict],
1717
+ iteration: int = 1,
1718
+ source: str | None = None,
1719
+ ) -> GauntletLoopResult:
1720
+ """Process gauntlet issues and determine next action.
1721
+
1722
+ Categorizes issues by severity, persists learnings, and returns
1723
+ the appropriate next action for the gauntlet loop.
1724
+
1725
+ Args:
1726
+ buildlog_dir: Path to buildlog directory.
1727
+ issues: List of issues from the gauntlet review.
1728
+ iteration: Current iteration number (for tracking).
1729
+ source: Optional source identifier for learnings.
1730
+
1731
+ Returns:
1732
+ GauntletLoopResult with categorized issues and next action.
1733
+ """
1734
+ # Categorize by severity
1735
+ criticals = [i for i in issues if i.get("severity") == "critical"]
1736
+ majors = [i for i in issues if i.get("severity") == "major"]
1737
+ minors = [i for i in issues if i.get("severity") in ("minor", "nitpick", None)]
1738
+
1739
+ # Persist learnings for this iteration
1740
+ learn_source = source or f"gauntlet:iteration-{iteration}"
1741
+ learn_result = learn_from_review(buildlog_dir, issues, learn_source)
1742
+ learnings_persisted = len(learn_result.new_learnings) + len(
1743
+ learn_result.reinforced_learnings
1744
+ )
1745
+
1746
+ # Determine action
1747
+ if criticals:
1748
+ action: Literal[
1749
+ "fix_criticals", "checkpoint_majors", "checkpoint_minors", "clean"
1750
+ ] = "fix_criticals"
1751
+ message = (
1752
+ f"Iteration {iteration}: {len(criticals)} critical, "
1753
+ f"{len(majors)} major, {len(minors)} minor. "
1754
+ f"Fix criticals (and majors) then re-run."
1755
+ )
1756
+ elif majors:
1757
+ action = "checkpoint_majors"
1758
+ message = (
1759
+ f"Iteration {iteration}: No criticals! "
1760
+ f"{len(majors)} major, {len(minors)} minor remain. "
1761
+ f"Continue clearing majors?"
1762
+ )
1763
+ elif minors:
1764
+ action = "checkpoint_minors"
1765
+ message = (
1766
+ f"Iteration {iteration}: Only {len(minors)} minor issues remain. "
1767
+ f"Accept risk or continue?"
1768
+ )
1769
+ else:
1770
+ action = "clean"
1771
+ message = f"Iteration {iteration}: All clear! No issues found."
1772
+
1773
+ return GauntletLoopResult(
1774
+ action=action,
1775
+ criticals=criticals,
1776
+ majors=majors,
1777
+ minors=minors,
1778
+ iteration=iteration,
1779
+ learnings_persisted=learnings_persisted,
1780
+ message=message,
1781
+ )
1782
+
1783
+
1784
+ def gauntlet_accept_risk(
1785
+ remaining_issues: list[dict],
1786
+ create_github_issues: bool = False,
1787
+ repo: str | None = None,
1788
+ ) -> GauntletAcceptRiskResult:
1789
+ """Accept risk for remaining issues, optionally creating GitHub issues.
1790
+
1791
+ Args:
1792
+ remaining_issues: Issues being accepted as risk.
1793
+ create_github_issues: Whether to create GitHub issues for tracking.
1794
+ repo: Repository for GitHub issues (uses current repo if None).
1795
+
1796
+ Returns:
1797
+ GauntletAcceptRiskResult with created issue info.
1798
+ """
1799
+ import subprocess
1800
+
1801
+ github_urls: list[str] = []
1802
+ error: str | None = None
1803
+
1804
+ if create_github_issues and remaining_issues:
1805
+ for issue in remaining_issues:
1806
+ severity = issue.get("severity", "minor")
1807
+ rule = issue.get("rule_learned", issue.get("description", "Unknown"))
1808
+ description = issue.get("description", "")
1809
+ location = issue.get("location", "")
1810
+
1811
+ # Sanitize inputs for GitHub issue creation
1812
+ # Note: We use list args (not shell=True), so this is defense-in-depth
1813
+ def _sanitize_for_gh(text: str, max_len: int = 256) -> str:
1814
+ """Sanitize text for GitHub issue fields."""
1815
+ # Remove/replace problematic characters
1816
+ sanitized = text.replace("\n", " ").replace("\r", " ")
1817
+ # Truncate to max length
1818
+ if len(sanitized) > max_len:
1819
+ sanitized = sanitized[: max_len - 3] + "..."
1820
+ return sanitized.strip()
1821
+
1822
+ safe_severity = _sanitize_for_gh(str(severity), 20)
1823
+ safe_rule = _sanitize_for_gh(str(rule), 200)
1824
+ safe_description = _sanitize_for_gh(str(description), 1000)
1825
+ safe_location = _sanitize_for_gh(str(location), 100)
1826
+
1827
+ # Build issue body
1828
+ body_parts = [
1829
+ f"**Severity:** {safe_severity}",
1830
+ f"**Rule:** {safe_rule}",
1831
+ "",
1832
+ "## Description",
1833
+ safe_description,
1834
+ ]
1835
+ if safe_location:
1836
+ body_parts.extend(["", f"**Location:** `{safe_location}`"])
1837
+
1838
+ body_parts.extend(
1839
+ [
1840
+ "",
1841
+ "---",
1842
+ "_Created by buildlog gauntlet loop (accepted risk)_",
1843
+ ]
1844
+ )
1845
+
1846
+ body = "\n".join(body_parts)
1847
+ title = f"[Gauntlet/{safe_severity}] {safe_rule[:60]}"
1848
+
1849
+ # Create GitHub issue
1850
+ cmd = [
1851
+ "gh",
1852
+ "issue",
1853
+ "create",
1854
+ "--title",
1855
+ title,
1856
+ "--body",
1857
+ body,
1858
+ "--label",
1859
+ severity,
1860
+ ]
1861
+ if repo:
1862
+ cmd.extend(["--repo", repo])
1863
+
1864
+ try:
1865
+ result = subprocess.run(cmd, capture_output=True, text=True, check=True)
1866
+ # gh issue create outputs the URL
1867
+ url = result.stdout.strip()
1868
+ if url:
1869
+ github_urls.append(url)
1870
+ except subprocess.CalledProcessError as e:
1871
+ # Don't fail entirely, just note the error
1872
+ error = f"Failed to create some GitHub issues: {e.stderr}"
1873
+ except FileNotFoundError:
1874
+ error = "gh CLI not found. Install GitHub CLI to create issues."
1875
+ break
1876
+
1877
+ return GauntletAcceptRiskResult(
1878
+ accepted_issues=len(remaining_issues),
1879
+ github_issues_created=len(github_urls),
1880
+ github_issue_urls=github_urls,
1881
+ message=(
1882
+ f"Accepted {len(remaining_issues)} issues as risk. "
1883
+ f"Created {len(github_urls)} GitHub issues."
1884
+ if create_github_issues
1885
+ else f"Accepted {len(remaining_issues)} issues as risk."
1886
+ ),
1887
+ error=error,
1888
+ )
@@ -8,6 +8,8 @@ from buildlog.mcp.tools import (
8
8
  buildlog_diff,
9
9
  buildlog_end_session,
10
10
  buildlog_experiment_report,
11
+ buildlog_gauntlet_accept_risk,
12
+ buildlog_gauntlet_issues,
11
13
  buildlog_learn_from_review,
12
14
  buildlog_log_mistake,
13
15
  buildlog_log_reward,
@@ -37,6 +39,10 @@ mcp.tool()(buildlog_log_mistake)
37
39
  mcp.tool()(buildlog_session_metrics)
38
40
  mcp.tool()(buildlog_experiment_report)
39
41
 
42
+ # Gauntlet loop tools
43
+ mcp.tool()(buildlog_gauntlet_issues)
44
+ mcp.tool()(buildlog_gauntlet_accept_risk)
45
+
40
46
 
41
47
  def main() -> None:
42
48
  """Run the MCP server."""
@@ -405,3 +405,108 @@ def buildlog_experiment_report(
405
405
  buildlog_experiment_report()
406
406
  """
407
407
  return get_experiment_report(Path(buildlog_dir))
408
+
409
+
410
+ # -----------------------------------------------------------------------------
411
+ # Gauntlet Loop MCP Tools
412
+ # -----------------------------------------------------------------------------
413
+
414
+
415
+ def buildlog_gauntlet_issues(
416
+ issues: list[dict],
417
+ iteration: int = 1,
418
+ source: str | None = None,
419
+ buildlog_dir: str = "buildlog",
420
+ ) -> dict:
421
+ """Process gauntlet review issues and determine next action.
422
+
423
+ Call this after running a gauntlet review. It categorizes issues by
424
+ severity, persists learnings, and returns the appropriate next action.
425
+
426
+ Args:
427
+ issues: List of issues from the gauntlet review, each with:
428
+ {
429
+ "severity": "critical|major|minor|nitpick",
430
+ "category": "security|testing|architectural|...",
431
+ "description": "What's wrong",
432
+ "rule_learned": "Generalizable rule",
433
+ "location": "file:line (optional)"
434
+ }
435
+ iteration: Current iteration number (for tracking loops)
436
+ source: Optional source identifier for learnings
437
+ buildlog_dir: Path to buildlog directory
438
+
439
+ Returns:
440
+ Dict with:
441
+ - action: What to do next:
442
+ - "fix_criticals": Criticals remain, auto-fix and loop
443
+ - "checkpoint_majors": No criticals, majors remain (ask user)
444
+ - "checkpoint_minors": Only minors remain (ask user)
445
+ - "clean": No issues remain
446
+ - criticals: List of critical issues
447
+ - majors: List of major issues
448
+ - minors: List of minor/nitpick issues
449
+ - iteration: Current iteration number
450
+ - learnings_persisted: Number of learnings saved
451
+ - message: Human-readable summary
452
+
453
+ Example:
454
+ # After running gauntlet review
455
+ result = buildlog_gauntlet_issues(
456
+ issues=[
457
+ {"severity": "critical", "category": "security", ...},
458
+ {"severity": "major", "category": "testing", ...},
459
+ ],
460
+ iteration=1
461
+ )
462
+ # result["action"] tells you what to do next
463
+ """
464
+ from buildlog.core import gauntlet_process_issues
465
+
466
+ result = gauntlet_process_issues(
467
+ Path(buildlog_dir),
468
+ issues=issues,
469
+ iteration=iteration,
470
+ source=source,
471
+ )
472
+ return asdict(result)
473
+
474
+
475
+ def buildlog_gauntlet_accept_risk(
476
+ remaining_issues: list[dict],
477
+ create_github_issues: bool = False,
478
+ repo: str | None = None,
479
+ ) -> dict:
480
+ """Accept risk for remaining issues, optionally creating GitHub issues.
481
+
482
+ Call this when the user decides to accept remaining issues as risk
483
+ (e.g., only minors remain and they want to move on).
484
+
485
+ Args:
486
+ remaining_issues: Issues being accepted as risk
487
+ create_github_issues: Whether to create GitHub issues for tracking
488
+ repo: Repository for GitHub issues (uses current repo if None)
489
+
490
+ Returns:
491
+ Dict with:
492
+ - accepted_issues: Number of issues accepted
493
+ - github_issues_created: Number of GitHub issues created
494
+ - github_issue_urls: URLs of created issues
495
+ - message: Human-readable summary
496
+ - error: Error message if GitHub issue creation failed
497
+
498
+ Example:
499
+ # User accepts risk with minors, wants GitHub issues
500
+ result = buildlog_gauntlet_accept_risk(
501
+ remaining_issues=[...],
502
+ create_github_issues=True
503
+ )
504
+ """
505
+ from buildlog.core import gauntlet_accept_risk
506
+
507
+ result = gauntlet_accept_risk(
508
+ remaining_issues=remaining_issues,
509
+ create_github_issues=create_github_issues,
510
+ repo=repo,
511
+ )
512
+ return asdict(result)
@@ -6,7 +6,7 @@ from datetime import datetime
6
6
  from pathlib import Path
7
7
  from typing import TYPE_CHECKING
8
8
 
9
- from buildlog.render.tracking import track_promoted
9
+ from buildlog.render.tracking import get_promoted_ids, track_promoted
10
10
  from buildlog.skills import _to_imperative
11
11
 
12
12
  if TYPE_CHECKING:
@@ -33,6 +33,8 @@ class ClaudeMdRenderer:
33
33
  def render(self, skills: list[Skill]) -> str:
34
34
  """Append skills to CLAUDE.md.
35
35
 
36
+ Filters out skills that have already been promoted to prevent duplicates.
37
+
36
38
  Args:
37
39
  skills: List of skills to append.
38
40
 
@@ -42,9 +44,16 @@ class ClaudeMdRenderer:
42
44
  if not skills:
43
45
  return "No skills to promote"
44
46
 
47
+ # Filter out already-promoted skills
48
+ already_promoted = get_promoted_ids(self.tracking_path)
49
+ new_skills = [s for s in skills if s.id not in already_promoted]
50
+
51
+ if not new_skills:
52
+ return f"All {len(skills)} skills already promoted"
53
+
45
54
  # Group by category
46
55
  by_category: dict[str, list[Skill]] = {}
47
- for skill in skills:
56
+ for skill in new_skills:
48
57
  by_category.setdefault(skill.category, []).append(skill)
49
58
 
50
59
  # Build section
@@ -80,6 +89,10 @@ class ClaudeMdRenderer:
80
89
  self.path.write_text(content)
81
90
 
82
91
  # Track promoted skill IDs using shared utility
83
- track_promoted(skills, self.tracking_path)
92
+ track_promoted(new_skills, self.tracking_path)
84
93
 
85
- return f"Appended {len(skills)} rules to {self.path}"
94
+ skipped = len(skills) - len(new_skills)
95
+ msg = f"Appended {len(new_skills)} rules to {self.path}"
96
+ if skipped > 0:
97
+ msg += f" ({skipped} already promoted, skipped)"
98
+ return msg
@@ -10,7 +10,26 @@ from typing import TYPE_CHECKING
10
10
  if TYPE_CHECKING:
11
11
  from buildlog.skills import Skill
12
12
 
13
- __all__ = ["track_promoted"]
13
+ __all__ = ["track_promoted", "get_promoted_ids"]
14
+
15
+
16
+ def get_promoted_ids(tracking_path: Path) -> set[str]:
17
+ """Get the set of already-promoted skill IDs.
18
+
19
+ Args:
20
+ tracking_path: Path to the tracking JSON file.
21
+
22
+ Returns:
23
+ Set of skill IDs that have been promoted.
24
+ """
25
+ if not tracking_path.exists():
26
+ return set()
27
+
28
+ try:
29
+ tracking = json.loads(tracking_path.read_text())
30
+ return set(tracking.get("skill_ids", []))
31
+ except json.JSONDecodeError:
32
+ return set()
14
33
 
15
34
 
16
35
  def track_promoted(skills: list[Skill], tracking_path: Path) -> None:
@@ -156,6 +156,36 @@ class SeedFile:
156
156
  )
157
157
 
158
158
 
159
+ def _validate_seed_schema(data: dict) -> bool:
160
+ """Validate seed file has expected schema structure.
161
+
162
+ Defense-in-depth validation for seed files. While yaml.safe_load
163
+ prevents code execution, this ensures data structure matches expectations.
164
+
165
+ Args:
166
+ data: Parsed YAML data.
167
+
168
+ Returns:
169
+ True if schema is valid, False otherwise.
170
+ """
171
+ if not isinstance(data, dict):
172
+ return False
173
+
174
+ # Rules must be a list if present
175
+ rules = data.get("rules", [])
176
+ if not isinstance(rules, list):
177
+ return False
178
+
179
+ # Each rule must be a dict with at least a "rule" key
180
+ for rule in rules:
181
+ if not isinstance(rule, dict):
182
+ return False
183
+ if "rule" not in rule:
184
+ return False
185
+
186
+ return True
187
+
188
+
159
189
  def load_seed_file(path: Path) -> SeedFile | None:
160
190
  """Load a single seed file from disk.
161
191
 
@@ -164,6 +194,10 @@ def load_seed_file(path: Path) -> SeedFile | None:
164
194
 
165
195
  Returns:
166
196
  Parsed SeedFile or None if loading fails.
197
+
198
+ Note:
199
+ Uses yaml.safe_load which is safe from code execution attacks.
200
+ Additional schema validation ensures data structure is as expected.
167
201
  """
168
202
  if not path.exists():
169
203
  logger.warning(f"Seed file not found: {path}")
@@ -171,7 +205,14 @@ def load_seed_file(path: Path) -> SeedFile | None:
171
205
 
172
206
  try:
173
207
  with open(path) as f:
208
+ # yaml.safe_load is safe - no arbitrary code execution
174
209
  data = yaml.safe_load(f)
210
+
211
+ # Validate schema before parsing
212
+ if not _validate_seed_schema(data):
213
+ logger.error(f"Invalid seed file schema: {path}")
214
+ return None
215
+
175
216
  return SeedFile.from_dict(data)
176
217
  except (yaml.YAMLError, KeyError, TypeError) as e:
177
218
  logger.error(f"Failed to parse seed file {path}: {e}")
File without changes
File without changes
File without changes
File without changes