buildlog 0.7.0-py3-none-any.whl → 0.9.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. buildlog/__init__.py +1 -1
  2. buildlog/cli.py +659 -48
  3. buildlog/confidence.py +27 -0
  4. buildlog/core/__init__.py +2 -0
  5. buildlog/core/bandit.py +699 -0
  6. buildlog/core/operations.py +284 -24
  7. buildlog/distill.py +80 -1
  8. buildlog/engine/__init__.py +61 -0
  9. buildlog/engine/bandit.py +23 -0
  10. buildlog/engine/confidence.py +28 -0
  11. buildlog/engine/embeddings.py +28 -0
  12. buildlog/engine/experiments.py +619 -0
  13. buildlog/engine/types.py +31 -0
  14. buildlog/llm.py +508 -0
  15. buildlog/mcp/server.py +10 -6
  16. buildlog/mcp/tools.py +61 -13
  17. buildlog/render/__init__.py +19 -2
  18. buildlog/render/claude_md.py +67 -32
  19. buildlog/render/continue_dev.py +102 -0
  20. buildlog/render/copilot.py +100 -0
  21. buildlog/render/cursor.py +105 -0
  22. buildlog/render/windsurf.py +95 -0
  23. buildlog/seed_engine/__init__.py +2 -0
  24. buildlog/seed_engine/llm_extractor.py +121 -0
  25. buildlog/seed_engine/pipeline.py +45 -1
  26. buildlog/skills.py +69 -6
  27. {buildlog-0.7.0.data → buildlog-0.9.0.data}/data/share/buildlog/copier.yml +0 -4
  28. buildlog-0.9.0.data/data/share/buildlog/template/buildlog/_TEMPLATE_QUICK.md +21 -0
  29. buildlog-0.9.0.dist-info/METADATA +248 -0
  30. buildlog-0.9.0.dist-info/RECORD +55 -0
  31. buildlog-0.7.0.dist-info/METADATA +0 -544
  32. buildlog-0.7.0.dist-info/RECORD +0 -41
  33. {buildlog-0.7.0.data → buildlog-0.9.0.data}/data/share/buildlog/post_gen.py +0 -0
  34. {buildlog-0.7.0.data → buildlog-0.9.0.data}/data/share/buildlog/template/buildlog/.gitkeep +0 -0
  35. {buildlog-0.7.0.data → buildlog-0.9.0.data}/data/share/buildlog/template/buildlog/2026-01-01-example.md +0 -0
  36. {buildlog-0.7.0.data → buildlog-0.9.0.data}/data/share/buildlog/template/buildlog/BUILDLOG_SYSTEM.md +0 -0
  37. {buildlog-0.7.0.data → buildlog-0.9.0.data}/data/share/buildlog/template/buildlog/_TEMPLATE.md +0 -0
  38. {buildlog-0.7.0.data → buildlog-0.9.0.data}/data/share/buildlog/template/buildlog/assets/.gitkeep +0 -0
  39. {buildlog-0.7.0.dist-info → buildlog-0.9.0.dist-info}/WHEEL +0 -0
  40. {buildlog-0.7.0.dist-info → buildlog-0.9.0.dist-info}/entry_points.txt +0 -0
  41. {buildlog-0.7.0.dist-info → buildlog-0.9.0.dist-info}/licenses/LICENSE +0 -0
buildlog/core/operations.py CHANGED
@@ -14,6 +14,7 @@ from pathlib import Path
 from typing import Literal, TypedDict
 
 from buildlog.confidence import ConfidenceMetrics, merge_confidence_metrics
+from buildlog.core.bandit import ThompsonSamplingBandit
 from buildlog.render import get_renderer
 from buildlog.skills import Skill, SkillSet, generate_skills
 
@@ -52,6 +53,7 @@ __all__ = [
     "log_mistake",
     "get_session_metrics",
     "get_experiment_report",
+    "get_bandit_status",
     # Gauntlet loop operations
     "gauntlet_process_issues",
     "gauntlet_accept_risk",
@@ -558,7 +560,7 @@ def status(
 def promote(
     buildlog_dir: Path,
     skill_ids: list[str],
-    target: Literal["claude_md", "settings_json", "skill"] = "claude_md",
+    target: str = "claude_md",
     target_path: Path | None = None,
 ) -> PromoteResult:
     """Promote skills to agent rules.
@@ -566,7 +568,8 @@ def promote(
     Args:
         buildlog_dir: Path to buildlog directory.
         skill_ids: List of skill IDs to promote.
-        target: Where to write rules ("claude_md", "settings_json", or "skill").
+        target: Where to write rules. One of: claude_md, settings_json,
+            skill, cursor, copilot, windsurf, continue_dev.
         target_path: Optional custom path for the target file.
 
     Returns:
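
The `target` parameter is now a plain string so the new renderer targets (cursor, copilot, windsurf, continue_dev, matching the new modules under buildlog/render/) can be passed through. A hypothetical call, assuming `promote` is importable from buildlog.core.operations and using placeholder skill IDs:

```python
# Hypothetical usage; the module path and skill IDs are placeholders inferred
# from this diff, not confirmed API documentation.
from pathlib import Path

from buildlog.core.operations import promote

result = promote(
    buildlog_dir=Path("buildlog"),
    skill_ids=["skill-001", "skill-002"],  # placeholder IDs
    target="cursor",  # one of the renderer targets added in 0.9.0
)
print(result)
```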
@@ -650,7 +653,7 @@ def reject(
         rejected = {"rejected_at": {}, "skill_ids": []}
 
     # Add new rejections
-    now = datetime.now().isoformat()
+    now = datetime.now(timezone.utc).isoformat()
     newly_rejected: list[str] = []
     for skill_id in skill_ids:
         if skill_id not in rejected["skill_ids"]:
@@ -938,14 +941,27 @@ def log_reward(
 ) -> LogRewardResult:
     """Log a reward event for bandit learning.
 
-    Appends to reward_events.jsonl for later analysis.
+    This is where the bandit learns from EXPLICIT feedback:
+
+    The reward signal comes from the outcome:
+    - accepted (reward=1.0): Rules helped produce good output
+    - rejected (reward=0.0): Rules failed to prevent bad output
+    - revision (reward=1-distance): Partial credit based on correction needed
+
+    Unlike log_mistake() which gives implicit negative feedback, this allows
+    direct positive feedback when rules DO help. This is crucial for learning
+    which rules are genuinely effective, not just which ones don't fail.
+
+    Appends to reward_events.jsonl for analysis AND updates the bandit.
 
     Args:
         buildlog_dir: Path to buildlog directory.
         outcome: Type of feedback (accepted/revision/rejected).
         rules_active: List of rule IDs that were in context.
+            If None, tries to use session's selected_rules.
         revision_distance: How much correction was needed (0-1, for revisions).
         error_class: Category of error if applicable.
+            If None, tries to use session's error_class.
         notes: Optional notes about the feedback.
         source: Where this feedback came from.
 
@@ -956,6 +972,15 @@ def log_reward(
     reward_id = _generate_reward_id(outcome, now)
     reward_value = _compute_reward_value(outcome, revision_distance)
 
+    # Try to get rules and context from active session if not provided
+    active_path = _get_active_session_path(buildlog_dir)
+    if active_path.exists():
+        session_data = json.loads(active_path.read_text())
+        if rules_active is None:
+            rules_active = session_data.get("selected_rules", [])
+        if error_class is None:
+            error_class = session_data.get("error_class")
+
     event = RewardEvent(
         id=reward_id,
         timestamp=now,
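
The reward mapping described in the new log_reward docstring is applied by `_compute_reward_value`, which is called above but whose body is not part of this diff. A minimal sketch of what that mapping could look like, assuming the docstring is the full specification:

```python
# Sketch only: _compute_reward_value's real body is not shown in this diff;
# the mapping below is inferred from the docstring (accepted=1.0,
# rejected=0.0, revision=1-distance).
def _compute_reward_value(outcome: str, revision_distance: float | None) -> float:
    if outcome == "accepted":
        return 1.0  # rules helped produce good output
    if outcome == "rejected":
        return 0.0  # rules failed to prevent bad output
    if outcome == "revision":
        distance = 0.5 if revision_distance is None else revision_distance
        return max(0.0, min(1.0, 1.0 - distance))  # partial credit
    raise ValueError(f"unknown outcome: {outcome}")
```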
@@ -975,6 +1000,32 @@ def log_reward(
     with open(rewards_path, "a") as f:
         f.write(json.dumps(event.to_dict()) + "\n")
 
+    # =========================================================================
+    # BANDIT LEARNING: Update with explicit reward
+    # =========================================================================
+    #
+    # For accepted (reward=1): Beta(α, β) → Beta(α + 1, β)
+    # → Distribution shifts RIGHT, increasing expected value
+    # → Rule becomes MORE likely to be selected
+    #
+    # For rejected (reward=0): Beta(α, β) → Beta(α, β + 1)
+    # → Distribution shifts LEFT, decreasing expected value
+    # → Rule becomes LESS likely to be selected
+    #
+    # For revision (0 < reward < 1): Both α and β increase proportionally
+    # → Distribution narrows (more confident) with moderate expected value
+    # =========================================================================
+
+    if rules_active:
+        bandit_path = buildlog_dir / "bandit_state.jsonl"
+        bandit = ThompsonSamplingBandit(bandit_path)
+
+        bandit.batch_update(
+            rule_ids=rules_active,
+            reward=reward_value,
+            context=error_class or "general",
+        )
+
     # Count total events
     total_events = 0
     if rewards_path.exists():
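
`ThompsonSamplingBandit.batch_update` lives in the new buildlog/core/bandit.py, which is not included in this excerpt. Below is a minimal, self-contained sketch of the fractional Beta update the comment block describes; the class name, state layout, and prior are assumptions for illustration only.

```python
# Minimal sketch of the Beta update described above; the real implementation
# is in buildlog/core/bandit.py and may differ.
from dataclasses import dataclass


@dataclass
class BetaArm:
    alpha: float = 1.0  # prior "successes" + 1
    beta: float = 1.0   # prior "failures" + 1

    def update(self, reward: float) -> None:
        # reward=1.0 -> alpha += 1 (shift right); reward=0.0 -> beta += 1
        # (shift left); a fractional reward splits the observation between them.
        self.alpha += reward
        self.beta += 1.0 - reward

    @property
    def mean(self) -> float:
        return self.alpha / (self.alpha + self.beta)


arm = BetaArm()
arm.update(0.7)            # e.g. a revision with distance 0.3
print(round(arm.mean, 2))  # expected value moves toward the observed reward
```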
@@ -982,11 +1033,16 @@ def log_reward(
             1 for line in rewards_path.read_text().strip().split("\n") if line
         )
 
+    rules_count = len(rules_active) if rules_active else 0
+    message = f"Logged {outcome} (reward={reward_value:.2f})"
+    if rules_count > 0:
+        message += f" | Updated bandit: {rules_count} rules"
+
     return LogRewardResult(
         reward_id=reward_id,
         reward_value=reward_value,
         total_events=total_events,
-        message=f"Logged {outcome} (reward={reward_value:.2f})",
+        message=message,
     )
 
 
@@ -1061,6 +1117,7 @@ class SessionDict(TypedDict, total=False):
     entry_file: str | None
     rules_at_start: list[str]
     rules_at_end: list[str]
+    selected_rules: list[str]  # Bandit-selected subset for this session
     error_class: str | None
     notes: str | None
 
@@ -1070,15 +1127,17 @@ class Session:
     """A coding session for experiment tracking.
 
     Tracks the state of rules before and after a session to measure
-    learning effectiveness.
+    learning effectiveness. The bandit selects a subset of rules
+    (selected_rules) to be "active" for this session based on context.
 
     Attributes:
         id: Unique identifier for this session.
        started_at: When the session started.
         ended_at: When the session ended (None if still active).
         entry_file: Corresponding buildlog entry file, if any.
-        rules_at_start: Rule IDs active at session start.
-        rules_at_end: Rule IDs active at session end.
+        rules_at_start: All rule IDs available at session start.
+        rules_at_end: All rule IDs available at session end.
+        selected_rules: Bandit-selected subset active for this session.
         error_class: Error class being targeted (e.g., "missing_test").
         notes: Optional notes about the session.
     """
@@ -1089,6 +1148,7 @@ class Session:
     entry_file: str | None = None
     rules_at_start: list[str] = field(default_factory=list)
     rules_at_end: list[str] = field(default_factory=list)
+    selected_rules: list[str] = field(default_factory=list)
     error_class: str | None = None
     notes: str | None = None
 
@@ -1101,6 +1161,8 @@ class Session:
             "rules_at_start": self.rules_at_start,
             "rules_at_end": self.rules_at_end,
         }
+        if self.selected_rules:
+            result["selected_rules"] = self.selected_rules
         if self.entry_file is not None:
             result["entry_file"] = self.entry_file
         if self.error_class is not None:
@@ -1130,6 +1192,7 @@ class Session:
             entry_file=data.get("entry_file"),
             rules_at_start=data.get("rules_at_start", []),
             rules_at_end=data.get("rules_at_end", []),
+            selected_rules=data.get("selected_rules", []),
             error_class=data.get("error_class"),
             notes=data.get("notes"),
         )
@@ -1233,11 +1296,15 @@ class SessionMetrics:
 
 @dataclass
 class StartSessionResult:
-    """Result of starting a new session."""
+    """Result of starting a new session.
+
+    Includes both the full rule set and the bandit-selected subset.
+    """
 
     session_id: str
     error_class: str | None
     rules_count: int
+    selected_rules: list[str]  # Bandit-selected rules for this session
     message: str
 
 
@@ -1316,6 +1383,31 @@ def _get_current_rules(buildlog_dir: Path) -> list[str]:
     return list(_load_json_set(promoted_path, "skill_ids"))
 
 
+def _get_seed_rule_ids(buildlog_dir: Path) -> set[str]:
+    """Get IDs of rules that come from seed personas.
+
+    Seed rules (from gauntlet personas like Test Terrorist, Security Karen)
+    have non-empty persona_tags. These rules get boosted priors in the
+    bandit because they represent curated, expert knowledge.
+
+    Returns:
+        Set of rule IDs that have persona_tags.
+    """
+    try:
+        skill_set = generate_skills(buildlog_dir)
+        seed_ids: set[str] = set()
+
+        for category_skills in skill_set.skills.values():
+            for skill in category_skills:
+                if skill.persona_tags:  # Non-empty means it's from a seed
+                    seed_ids.add(skill.id)
+
+        return seed_ids
+    except Exception:
+        # If skill generation fails, treat no rules as seeds
+        return set()
+
+
 def _load_sessions(buildlog_dir: Path) -> list[Session]:
     """Load all sessions from JSONL file."""
     sessions_path = _get_sessions_path(buildlog_dir)
@@ -1389,25 +1481,78 @@ def start_session(
     buildlog_dir: Path,
     error_class: str | None = None,
     notes: str | None = None,
+    select_k: int = 3,
 ) -> StartSessionResult:
-    """Start a new experiment session.
+    """Start a new experiment session with bandit-selected rules.
+
+    This is where Thompson Sampling kicks in:
+
+    1. Load all available rules (candidates)
+    2. Identify which rules are from seeds (get boosted priors)
+    3. Use bandit to select top-k rules for this error_class context
+    4. Store selected rules in session for later attribution
+
+    The selected rules are the ones "active" for this session. When a
+    mistake occurs, we'll give negative feedback to these rules (they
+    didn't prevent the mistake). This teaches the bandit which rules
+    are effective for which error classes.
 
     Args:
         buildlog_dir: Path to buildlog directory.
         error_class: Error class being targeted (e.g., "missing_test").
+            This is the CONTEXT for contextual bandits - rules
+            are evaluated per-context.
         notes: Optional notes about the session.
+        select_k: Number of rules to select via Thompson Sampling.
+            Default 3 balances coverage with attribution clarity.
 
     Returns:
-        StartSessionResult with session ID and current rules count.
+        StartSessionResult with session ID, rules count, and selected rules.
     """
     now = datetime.now(timezone.utc)
     session_id = _generate_session_id(now)
     current_rules = _get_current_rules(buildlog_dir)
 
+    # =========================================================================
+    # THOMPSON SAMPLING: Select rules for this session
+    # =========================================================================
+    #
+    # The bandit maintains a Beta distribution for each (context, rule) pair.
+    # At session start, we SAMPLE from each distribution and pick the top-k.
+    #
+    # Why sample instead of using the mean?
+    # - Arms we're uncertain about have high variance
+    # - High variance means occasional high samples
+    # - This causes us to explore uncertain arms
+    # - As we gather data, variance shrinks, and we exploit
+    #
+    # This is the elegant explore-exploit balance of Thompson Sampling.
+    # =========================================================================
+
+    selected_rules: list[str] = []
+
+    if current_rules:
+        # Initialize bandit
+        bandit_path = buildlog_dir / "bandit_state.jsonl"
+        bandit = ThompsonSamplingBandit(bandit_path)
+
+        # Identify seed rules (those with persona_tags from gauntlet)
+        # Seeds get boosted priors - we believe curated rules are good
+        seed_rule_ids = _get_seed_rule_ids(buildlog_dir)
+
+        # SELECT: Sample from Beta distributions, pick top-k
+        selected_rules = bandit.select(
+            candidates=current_rules,
+            context=error_class or "general",
+            k=min(select_k, len(current_rules)),
+            seed_rule_ids=seed_rule_ids,
+        )
+
     session = Session(
         id=session_id,
         started_at=now,
         rules_at_start=current_rules,
+        selected_rules=selected_rules,
         error_class=error_class,
         notes=notes,
     )
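
`bandit.select()` is also defined in buildlog/core/bandit.py and is not shown here. A sketch of the selection step the comment block describes, drawing one sample per candidate and keeping the top-k; the per-arm state layout and the optimistic Beta(3, 1) seed prior are illustrative assumptions, not the library's actual defaults.

```python
# Sketch of Thompson Sampling selection under assumed state layout.
import random


def select_rules(
    arms: dict[tuple[str, str], tuple[float, float]],  # (context, rule) -> (alpha, beta)
    candidates: list[str],
    context: str,
    k: int,
    seed_rule_ids: set[str],
) -> list[str]:
    samples: dict[str, float] = {}
    for rule_id in candidates:
        # Unseen arms fall back to a prior; seed rules get an optimistic one.
        default = (3.0, 1.0) if rule_id in seed_rule_ids else (1.0, 1.0)
        alpha, beta = arms.get((context, rule_id), default)
        # One draw per arm: uncertain arms occasionally sample high (explore),
        # well-observed good arms sample high consistently (exploit).
        samples[rule_id] = random.betavariate(alpha, beta)
    return sorted(samples, key=samples.get, reverse=True)[:k]
```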
@@ -1421,7 +1566,11 @@ def start_session(
         session_id=session_id,
         error_class=error_class,
         rules_count=len(current_rules),
-        message=f"Started session {session_id} with {len(current_rules)} active rules",
+        selected_rules=selected_rules,
+        message=(
+            f"Started session {session_id}: selected {len(selected_rules)}/"
+            f"{len(current_rules)} rules via Thompson Sampling"
+        ),
     )
 
 
@@ -1493,6 +1642,16 @@ def log_mistake(
 ) -> LogMistakeResult:
     """Log a mistake during an experiment session.
 
+    This is where the bandit learns from NEGATIVE feedback:
+
+    When a mistake occurs, the selected rules for this session FAILED
+    to prevent it. We update the bandit with reward=0 for each selected
+    rule, teaching it that these rules aren't effective for this context.
+
+    Over time, rules that consistently fail to prevent mistakes will
+    have their Beta distributions shift left (lower expected value),
+    and the bandit will stop selecting them.
+
     Args:
         buildlog_dir: Path to buildlog directory.
         error_class: Category of error (e.g., "missing_test").
@@ -1539,9 +1698,39 @@ def log_mistake(
     with open(mistakes_path, "a") as f:
         f.write(json.dumps(mistake.to_dict()) + "\n")
 
+    # =========================================================================
+    # BANDIT LEARNING: Negative feedback for selected rules
+    # =========================================================================
+    #
+    # The selected rules were supposed to help prevent mistakes. A mistake
+    # occurred anyway, so we give them reward=0 (failure).
+    #
+    # Bayesian update: Beta(α, β) → Beta(α + 0, β + 1) = Beta(α, β + 1)
+    #
+    # This shifts the distribution LEFT, decreasing the expected value.
+    # Rules that repeatedly fail will become less likely to be selected.
+    # =========================================================================
+
+    selected_rules = session_data.get("selected_rules", [])
+    if selected_rules:
+        bandit_path = buildlog_dir / "bandit_state.jsonl"
+        bandit = ThompsonSamplingBandit(bandit_path)
+
+        # Use session's error_class as context, not the mistake's
+        # (they should match, but session context is authoritative)
+        context = session_data.get("error_class") or "general"
+
+        bandit.batch_update(
+            rule_ids=selected_rules,
+            reward=0.0,  # Failure: rules didn't prevent mistake
+            context=context,
+        )
+
     message = f"Logged mistake: {error_class}"
     if similar:
         message += f" (REPEAT of {similar.id})"
+    if selected_rules:
+        message += f" | Updated bandit: {len(selected_rules)} rules got reward=0"
 
     return LogMistakeResult(
         mistake_id=mistake_id,
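
Taken together, start_session, log_mistake, and log_reward form the learning loop: selection at session start, implicit negative feedback on mistakes, explicit feedback on outcomes. A hypothetical end-to-end sketch using only the parameters documented in this diff (module path assumed, remaining parameters left at their defaults; log_mistake's full signature isn't shown here, so it is only referenced in comments):

```python
# Hypothetical flow; module path assumed, values illustrative.
from pathlib import Path

from buildlog.core.operations import log_reward, start_session

buildlog_dir = Path("buildlog")

# 1. Session start: the bandit samples per-rule Beta distributions for the
#    "missing_test" context and keeps select_k of them as selected_rules.
started = start_session(buildlog_dir, error_class="missing_test", select_k=3)
print(started.selected_rules)

# 2. (If a mistake happens, log_mistake() gives those rules reward=0.)

# 3. Explicit feedback: with rules_active left unset, log_reward pulls
#    selected_rules from the active session and updates the bandit with
#    reward=1.0 for each of them.
result = log_reward(buildlog_dir, outcome="accepted")
print(result.message)
```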
@@ -1660,6 +1849,71 @@ def get_experiment_report(buildlog_dir: Path) -> dict:
     }
 
 
+def get_bandit_status(
+    buildlog_dir: Path,
+    context: str | None = None,
+    top_k: int = 10,
+) -> dict:
+    """Get current bandit state and statistics.
+
+    Provides insight into the Thompson Sampling bandit's learned beliefs.
+    Useful for debugging and understanding which rules are being favored.
+
+    Args:
+        buildlog_dir: Path to buildlog directory.
+        context: Specific error class to show. If None, shows all contexts.
+        top_k: Number of top rules to show per context.
+
+    Returns:
+        Dictionary with:
+        - summary: Overall bandit statistics
+        - contexts: Per-context rule rankings
+        - top_rules: Top rules by expected value per context
+    """
+    bandit_path = buildlog_dir / "bandit_state.jsonl"
+    bandit = ThompsonSamplingBandit(bandit_path)
+
+    stats = bandit.get_stats(context)
+
+    # Group stats by context
+    contexts: dict[str, list[dict]] = {}
+    for key, rule_stats in stats.items():
+        ctx = rule_stats["context"]
+        if ctx not in contexts:
+            contexts[ctx] = []
+        contexts[ctx].append(
+            {
+                "rule_id": key.split(":")[-1] if ":" in key else key,
+                **{k: v for k, v in rule_stats.items() if k != "context"},
+            }
+        )
+
+    # Sort by mean (descending) and take top_k
+    top_rules: dict[str, list[dict]] = {}
+    for ctx, rules in contexts.items():
+        sorted_rules = sorted(rules, key=lambda x: x["mean"], reverse=True)
+        top_rules[ctx] = sorted_rules[:top_k]
+
+    # Summary stats
+    total_arms = sum(len(rules) for rules in contexts.values())
+    total_observations = sum(
+        rule.get("total_observations", 0)
+        for rules in contexts.values()
+        for rule in rules
+    )
+
+    return {
+        "summary": {
+            "total_contexts": len(contexts),
+            "total_arms": total_arms,
+            "total_observations": total_observations,
+            "state_file": str(bandit_path),
+        },
+        "top_rules": top_rules,
+        "all_rules": contexts if context else None,  # Only include all if filtering
+    }
+
+
 # =============================================================================
 # Gauntlet Loop Operations
 # =============================================================================
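
A hypothetical way to read the structure returned by get_bandit_status; the keys used below (summary, top_rules, rule_id, mean, total_observations) are the ones visible in the function above, and the module path is assumed:

```python
# Hypothetical usage; module path and argument values are illustrative.
from pathlib import Path

from buildlog.core.operations import get_bandit_status

status = get_bandit_status(Path("buildlog"), context="missing_test", top_k=5)

print(status["summary"]["total_observations"])
for ctx, rules in status["top_rules"].items():
    for rule in rules:
        print(ctx, rule["rule_id"], round(rule["mean"], 3))
```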
@@ -1781,6 +2035,18 @@ def gauntlet_process_issues(
     )
 
 
+def _sanitize_for_gh(text: str, max_len: int = 256) -> str:
+    """Sanitize text for GitHub issue fields.
+
+    Defense-in-depth: we use list args (not shell=True) for subprocess,
+    but sanitize anyway to prevent injection via gh's argument parsing.
+    """
+    sanitized = text.replace("\n", " ").replace("\r", " ")
+    if len(sanitized) > max_len:
+        sanitized = sanitized[: max_len - 3] + "..."
+    return sanitized.strip()
+
+
 def gauntlet_accept_risk(
     remaining_issues: list[dict],
     create_github_issues: bool = False,
@@ -1808,17 +2074,6 @@ def gauntlet_accept_risk(
         description = issue.get("description", "")
         location = issue.get("location", "")
 
-        # Sanitize inputs for GitHub issue creation
-        # Note: We use list args (not shell=True), so this is defense-in-depth
-        def _sanitize_for_gh(text: str, max_len: int = 256) -> str:
-            """Sanitize text for GitHub issue fields."""
-            # Remove/replace problematic characters
-            sanitized = text.replace("\n", " ").replace("\r", " ")
-            # Truncate to max length
-            if len(sanitized) > max_len:
-                sanitized = sanitized[: max_len - 3] + "..."
-            return sanitized.strip()
-
         safe_severity = _sanitize_for_gh(str(severity), 20)
         safe_rule = _sanitize_for_gh(str(rule), 200)
         safe_description = _sanitize_for_gh(str(description), 1000)
@@ -1862,7 +2117,9 @@ def gauntlet_accept_risk(
             cmd.extend(["--repo", repo])
 
         try:
-            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
+            result = subprocess.run(
+                cmd, capture_output=True, text=True, check=True, timeout=30
+            )
             # gh issue create outputs the URL
             url = result.stdout.strip()
             if url:
@@ -1870,6 +2127,9 @@ def gauntlet_accept_risk(
         except subprocess.CalledProcessError as e:
             # Don't fail entirely, just note the error
             error = f"Failed to create some GitHub issues: {e.stderr}"
+        except subprocess.TimeoutExpired:
+            error = "GitHub issue creation timed out (30s limit)."
+            break
         except FileNotFoundError:
             error = "gh CLI not found. Install GitHub CLI to create issues."
             break
buildlog/distill.py CHANGED
@@ -8,6 +8,7 @@ __all__ = [
     "distill_all",
     "format_output",
     "parse_improvements",
+    "parse_improvements_llm",
     "parse_date_from_filename",
     "iter_buildlog_entries",
 ]
@@ -19,7 +20,10 @@ from collections.abc import Iterator
 from dataclasses import dataclass, field
 from datetime import date, datetime, timezone
 from pathlib import Path
-from typing import Final, Literal, TypedDict
+from typing import TYPE_CHECKING, Final, Literal, TypedDict
+
+if TYPE_CHECKING:
+    from buildlog.llm import ExtractedRule, LLMBackend
 
 logger = logging.getLogger(__name__)
 
@@ -191,6 +195,36 @@ def parse_improvements(content: str) -> dict[str, list[str]]:
     return result
 
 
+def parse_improvements_llm(content: str, backend: LLMBackend) -> list[ExtractedRule]:
+    """Extract improvements using an LLM backend for richer extraction.
+
+    Sends the Improvements section to the LLM for structured extraction
+    of rules with severity, scope, applicability, and defensibility fields.
+
+    Args:
+        content: The full markdown content of a buildlog entry.
+        backend: An LLM backend implementing the LLMBackend protocol.
+
+    Returns:
+        List of ExtractedRule objects with rich metadata.
+    """
+    # Extract the Improvements section
+    improvements_match = re.search(
+        r"^##\s+Improvements\s*\n(.*?)(?=^#{1,2}\s|\Z)",
+        content,
+        re.MULTILINE | re.DOTALL,
+    )
+
+    if not improvements_match:
+        return []
+
+    improvements_text = improvements_match.group(1).strip()
+    if not improvements_text:
+        return []
+
+    return backend.extract_rules(improvements_text)
+
+
 def parse_date_from_filename(filename: str) -> str | None:
     """Extract date from buildlog filename (YYYY-MM-DD-slug.md format)."""
     match = re.match(r"^(\d{4}-\d{2}-\d{2})-", filename)
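
ExtractedRule and LLMBackend come from the new buildlog/llm.py, which is not included in this excerpt, so the shapes below are inferred purely from how parse_improvements_llm uses them (extract_rules(), .rule, .category) and are simplified; per the docstring above, the real ExtractedRule also carries severity, scope, applicability, and defensibility fields.

```python
# Assumed, simplified shapes of LLMBackend and ExtractedRule; the real
# definitions live in buildlog/llm.py, which is not shown in this diff.
from dataclasses import dataclass
from typing import Protocol


@dataclass
class ExtractedRule:
    rule: str
    category: str


class LLMBackend(Protocol):
    def extract_rules(self, text: str) -> list[ExtractedRule]: ...


class KeywordBackend:
    """Toy stand-in backend: one rule per bullet line, no LLM call."""

    def extract_rules(self, text: str) -> list[ExtractedRule]:
        rules = []
        for line in text.splitlines():
            line = line.strip()
            if line.startswith("- "):
                rules.append(ExtractedRule(rule=line[2:], category="architectural"))
        return rules
```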
@@ -290,6 +324,7 @@ def distill_all(
     buildlog_dir: Path,
     since: date | None = None,
     category_filter: str | None = None,
+    llm: bool = False,
 ) -> DistillResult:
     """Parse all buildlog entries and aggregate patterns.
 
@@ -297,10 +332,23 @@ def distill_all(
         buildlog_dir: Path to the buildlog directory.
         since: If provided, only include entries from this date onward.
         category_filter: If provided, only include patterns from this category.
+        llm: If True and an LLM backend is available, use LLM extraction.
+            Falls back to regex on failure or if no backend is available.
 
     Returns:
         DistillResult with aggregated patterns and statistics.
     """
+    # Resolve LLM backend if requested
+    llm_backend: LLMBackend | None = None
+    if llm:
+        from buildlog.llm import get_llm_backend
+
+        llm_backend = get_llm_backend(buildlog_dir=buildlog_dir)
+        if llm_backend is None:
+            logger.warning(
+                "--llm requested but no LLM provider available, using regex fallback"
+            )
+
     patterns: dict[str, list[PatternDict]] = {cat: [] for cat in CATEGORIES}
     by_month: dict[str, int] = {}
     entry_count = 0
@@ -318,6 +366,37 @@ def distill_all(
         month_key = _extract_month_key(date_str)
         by_month[month_key] = by_month.get(month_key, 0) + 1
 
+        # Try LLM extraction first, fall back to regex
+        if llm_backend is not None:
+            try:
+                extracted = parse_improvements_llm(content, llm_backend)
+                if extracted:
+                    # Convert ExtractedRule objects to standard PatternDict format
+                    for rule in extracted:
+                        cat = (
+                            rule.category
+                            if rule.category in CATEGORIES
+                            else "architectural"
+                        )
+                        if cat not in patterns:
+                            patterns[cat] = []
+                        patterns[cat].append(
+                            PatternDict(
+                                insight=rule.rule,
+                                source=str(entry_path),
+                                date=date_str,
+                                context=context,
+                            )
+                        )
+                    continue  # Skip regex if LLM succeeded
+            except Exception as e:
+                logger.warning(
+                    "LLM extraction failed for %s, falling back to regex: %s",
+                    entry_path,
+                    e,
+                )
+
+        # Regex fallback (default behavior)
         try:
             improvements = parse_improvements(content)
         except re.error as e:
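
Using the new llm flag is then a one-liner; if no provider is configured, distill_all logs the warning shown above and falls back to the regex parser:

```python
# Usage sketch; the buildlog_dir value is illustrative.
from pathlib import Path

from buildlog.distill import distill_all

result = distill_all(Path("buildlog"), llm=True)
```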