buildlog 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- buildlog/__init__.py +1 -1
- buildlog/cli.py +659 -48
- buildlog/confidence.py +27 -0
- buildlog/core/__init__.py +2 -0
- buildlog/core/bandit.py +699 -0
- buildlog/core/operations.py +284 -24
- buildlog/distill.py +80 -1
- buildlog/engine/__init__.py +61 -0
- buildlog/engine/bandit.py +23 -0
- buildlog/engine/confidence.py +28 -0
- buildlog/engine/embeddings.py +28 -0
- buildlog/engine/experiments.py +619 -0
- buildlog/engine/types.py +31 -0
- buildlog/llm.py +508 -0
- buildlog/mcp/server.py +10 -6
- buildlog/mcp/tools.py +61 -13
- buildlog/render/__init__.py +19 -2
- buildlog/render/claude_md.py +67 -32
- buildlog/render/continue_dev.py +102 -0
- buildlog/render/copilot.py +100 -0
- buildlog/render/cursor.py +105 -0
- buildlog/render/windsurf.py +95 -0
- buildlog/seed_engine/__init__.py +2 -0
- buildlog/seed_engine/llm_extractor.py +121 -0
- buildlog/seed_engine/pipeline.py +45 -1
- buildlog/skills.py +69 -6
- {buildlog-0.7.0.data → buildlog-0.9.0.data}/data/share/buildlog/copier.yml +0 -4
- buildlog-0.9.0.data/data/share/buildlog/template/buildlog/_TEMPLATE_QUICK.md +21 -0
- buildlog-0.9.0.dist-info/METADATA +248 -0
- buildlog-0.9.0.dist-info/RECORD +55 -0
- buildlog-0.7.0.dist-info/METADATA +0 -544
- buildlog-0.7.0.dist-info/RECORD +0 -41
- {buildlog-0.7.0.data → buildlog-0.9.0.data}/data/share/buildlog/post_gen.py +0 -0
- {buildlog-0.7.0.data → buildlog-0.9.0.data}/data/share/buildlog/template/buildlog/.gitkeep +0 -0
- {buildlog-0.7.0.data → buildlog-0.9.0.data}/data/share/buildlog/template/buildlog/2026-01-01-example.md +0 -0
- {buildlog-0.7.0.data → buildlog-0.9.0.data}/data/share/buildlog/template/buildlog/BUILDLOG_SYSTEM.md +0 -0
- {buildlog-0.7.0.data → buildlog-0.9.0.data}/data/share/buildlog/template/buildlog/_TEMPLATE.md +0 -0
- {buildlog-0.7.0.data → buildlog-0.9.0.data}/data/share/buildlog/template/buildlog/assets/.gitkeep +0 -0
- {buildlog-0.7.0.dist-info → buildlog-0.9.0.dist-info}/WHEEL +0 -0
- {buildlog-0.7.0.dist-info → buildlog-0.9.0.dist-info}/entry_points.txt +0 -0
- {buildlog-0.7.0.dist-info → buildlog-0.9.0.dist-info}/licenses/LICENSE +0 -0
buildlog/core/operations.py
CHANGED
|
@@ -14,6 +14,7 @@ from pathlib import Path
|
|
|
14
14
|
from typing import Literal, TypedDict
|
|
15
15
|
|
|
16
16
|
from buildlog.confidence import ConfidenceMetrics, merge_confidence_metrics
|
|
17
|
+
from buildlog.core.bandit import ThompsonSamplingBandit
|
|
17
18
|
from buildlog.render import get_renderer
|
|
18
19
|
from buildlog.skills import Skill, SkillSet, generate_skills
|
|
19
20
|
|
|
@@ -52,6 +53,7 @@ __all__ = [
|
|
|
52
53
|
"log_mistake",
|
|
53
54
|
"get_session_metrics",
|
|
54
55
|
"get_experiment_report",
|
|
56
|
+
"get_bandit_status",
|
|
55
57
|
# Gauntlet loop operations
|
|
56
58
|
"gauntlet_process_issues",
|
|
57
59
|
"gauntlet_accept_risk",
|
|
@@ -558,7 +560,7 @@ def status(
|
|
|
558
560
|
def promote(
|
|
559
561
|
buildlog_dir: Path,
|
|
560
562
|
skill_ids: list[str],
|
|
561
|
-
target:
|
|
563
|
+
target: str = "claude_md",
|
|
562
564
|
target_path: Path | None = None,
|
|
563
565
|
) -> PromoteResult:
|
|
564
566
|
"""Promote skills to agent rules.
|
|
@@ -566,7 +568,8 @@ def promote(
|
|
|
566
568
|
Args:
|
|
567
569
|
buildlog_dir: Path to buildlog directory.
|
|
568
570
|
skill_ids: List of skill IDs to promote.
|
|
569
|
-
target: Where to write rules
|
|
571
|
+
target: Where to write rules. One of: claude_md, settings_json,
|
|
572
|
+
skill, cursor, copilot, windsurf, continue_dev.
|
|
570
573
|
target_path: Optional custom path for the target file.
|
|
571
574
|
|
|
572
575
|
Returns:
|
|
@@ -650,7 +653,7 @@ def reject(
|
|
|
650
653
|
rejected = {"rejected_at": {}, "skill_ids": []}
|
|
651
654
|
|
|
652
655
|
# Add new rejections
|
|
653
|
-
now = datetime.now().isoformat()
|
|
656
|
+
now = datetime.now(timezone.utc).isoformat()
|
|
654
657
|
newly_rejected: list[str] = []
|
|
655
658
|
for skill_id in skill_ids:
|
|
656
659
|
if skill_id not in rejected["skill_ids"]:
|
|
@@ -938,14 +941,27 @@ def log_reward(
|
|
|
938
941
|
) -> LogRewardResult:
|
|
939
942
|
"""Log a reward event for bandit learning.
|
|
940
943
|
|
|
941
|
-
|
|
944
|
+
This is where the bandit learns from EXPLICIT feedback:
|
|
945
|
+
|
|
946
|
+
The reward signal comes from the outcome:
|
|
947
|
+
- accepted (reward=1.0): Rules helped produce good output
|
|
948
|
+
- rejected (reward=0.0): Rules failed to prevent bad output
|
|
949
|
+
- revision (reward=1-distance): Partial credit based on correction needed
|
|
950
|
+
|
|
951
|
+
Unlike log_mistake() which gives implicit negative feedback, this allows
|
|
952
|
+
direct positive feedback when rules DO help. This is crucial for learning
|
|
953
|
+
which rules are genuinely effective, not just which ones don't fail.
|
|
954
|
+
|
|
955
|
+
Appends to reward_events.jsonl for analysis AND updates the bandit.
|
|
942
956
|
|
|
943
957
|
Args:
|
|
944
958
|
buildlog_dir: Path to buildlog directory.
|
|
945
959
|
outcome: Type of feedback (accepted/revision/rejected).
|
|
946
960
|
rules_active: List of rule IDs that were in context.
|
|
961
|
+
If None, tries to use session's selected_rules.
|
|
947
962
|
revision_distance: How much correction was needed (0-1, for revisions).
|
|
948
963
|
error_class: Category of error if applicable.
|
|
964
|
+
If None, tries to use session's error_class.
|
|
949
965
|
notes: Optional notes about the feedback.
|
|
950
966
|
source: Where this feedback came from.
|
|
951
967
|
|
|
@@ -956,6 +972,15 @@ def log_reward(
|
|
|
956
972
|
reward_id = _generate_reward_id(outcome, now)
|
|
957
973
|
reward_value = _compute_reward_value(outcome, revision_distance)
|
|
958
974
|
|
|
975
|
+
# Try to get rules and context from active session if not provided
|
|
976
|
+
active_path = _get_active_session_path(buildlog_dir)
|
|
977
|
+
if active_path.exists():
|
|
978
|
+
session_data = json.loads(active_path.read_text())
|
|
979
|
+
if rules_active is None:
|
|
980
|
+
rules_active = session_data.get("selected_rules", [])
|
|
981
|
+
if error_class is None:
|
|
982
|
+
error_class = session_data.get("error_class")
|
|
983
|
+
|
|
959
984
|
event = RewardEvent(
|
|
960
985
|
id=reward_id,
|
|
961
986
|
timestamp=now,
|
|
@@ -975,6 +1000,32 @@ def log_reward(
|
|
|
975
1000
|
with open(rewards_path, "a") as f:
|
|
976
1001
|
f.write(json.dumps(event.to_dict()) + "\n")
|
|
977
1002
|
|
|
1003
|
+
# =========================================================================
|
|
1004
|
+
# BANDIT LEARNING: Update with explicit reward
|
|
1005
|
+
# =========================================================================
|
|
1006
|
+
#
|
|
1007
|
+
# For accepted (reward=1): Beta(α, β) → Beta(α + 1, β)
|
|
1008
|
+
# → Distribution shifts RIGHT, increasing expected value
|
|
1009
|
+
# → Rule becomes MORE likely to be selected
|
|
1010
|
+
#
|
|
1011
|
+
# For rejected (reward=0): Beta(α, β) → Beta(α, β + 1)
|
|
1012
|
+
# → Distribution shifts LEFT, decreasing expected value
|
|
1013
|
+
# → Rule becomes LESS likely to be selected
|
|
1014
|
+
#
|
|
1015
|
+
# For revision (0 < reward < 1): Both α and β increase proportionally
|
|
1016
|
+
# → Distribution narrows (more confident) with moderate expected value
|
|
1017
|
+
# =========================================================================
|
|
1018
|
+
|
|
1019
|
+
if rules_active:
|
|
1020
|
+
bandit_path = buildlog_dir / "bandit_state.jsonl"
|
|
1021
|
+
bandit = ThompsonSamplingBandit(bandit_path)
|
|
1022
|
+
|
|
1023
|
+
bandit.batch_update(
|
|
1024
|
+
rule_ids=rules_active,
|
|
1025
|
+
reward=reward_value,
|
|
1026
|
+
context=error_class or "general",
|
|
1027
|
+
)
|
|
1028
|
+
|
|
978
1029
|
# Count total events
|
|
979
1030
|
total_events = 0
|
|
980
1031
|
if rewards_path.exists():
|
|
@@ -982,11 +1033,16 @@ def log_reward(
|
|
|
982
1033
|
1 for line in rewards_path.read_text().strip().split("\n") if line
|
|
983
1034
|
)
|
|
984
1035
|
|
|
1036
|
+
rules_count = len(rules_active) if rules_active else 0
|
|
1037
|
+
message = f"Logged {outcome} (reward={reward_value:.2f})"
|
|
1038
|
+
if rules_count > 0:
|
|
1039
|
+
message += f" | Updated bandit: {rules_count} rules"
|
|
1040
|
+
|
|
985
1041
|
return LogRewardResult(
|
|
986
1042
|
reward_id=reward_id,
|
|
987
1043
|
reward_value=reward_value,
|
|
988
1044
|
total_events=total_events,
|
|
989
|
-
message=
|
|
1045
|
+
message=message,
|
|
990
1046
|
)
|
|
991
1047
|
|
|
992
1048
|
|
|
@@ -1061,6 +1117,7 @@ class SessionDict(TypedDict, total=False):
|
|
|
1061
1117
|
entry_file: str | None
|
|
1062
1118
|
rules_at_start: list[str]
|
|
1063
1119
|
rules_at_end: list[str]
|
|
1120
|
+
selected_rules: list[str] # Bandit-selected subset for this session
|
|
1064
1121
|
error_class: str | None
|
|
1065
1122
|
notes: str | None
|
|
1066
1123
|
|
|
@@ -1070,15 +1127,17 @@ class Session:
|
|
|
1070
1127
|
"""A coding session for experiment tracking.
|
|
1071
1128
|
|
|
1072
1129
|
Tracks the state of rules before and after a session to measure
|
|
1073
|
-
learning effectiveness.
|
|
1130
|
+
learning effectiveness. The bandit selects a subset of rules
|
|
1131
|
+
(selected_rules) to be "active" for this session based on context.
|
|
1074
1132
|
|
|
1075
1133
|
Attributes:
|
|
1076
1134
|
id: Unique identifier for this session.
|
|
1077
1135
|
started_at: When the session started.
|
|
1078
1136
|
ended_at: When the session ended (None if still active).
|
|
1079
1137
|
entry_file: Corresponding buildlog entry file, if any.
|
|
1080
|
-
rules_at_start:
|
|
1081
|
-
rules_at_end:
|
|
1138
|
+
rules_at_start: All rule IDs available at session start.
|
|
1139
|
+
rules_at_end: All rule IDs available at session end.
|
|
1140
|
+
selected_rules: Bandit-selected subset active for this session.
|
|
1082
1141
|
error_class: Error class being targeted (e.g., "missing_test").
|
|
1083
1142
|
notes: Optional notes about the session.
|
|
1084
1143
|
"""
|
|
@@ -1089,6 +1148,7 @@ class Session:
|
|
|
1089
1148
|
entry_file: str | None = None
|
|
1090
1149
|
rules_at_start: list[str] = field(default_factory=list)
|
|
1091
1150
|
rules_at_end: list[str] = field(default_factory=list)
|
|
1151
|
+
selected_rules: list[str] = field(default_factory=list)
|
|
1092
1152
|
error_class: str | None = None
|
|
1093
1153
|
notes: str | None = None
|
|
1094
1154
|
|
|
@@ -1101,6 +1161,8 @@ class Session:
|
|
|
1101
1161
|
"rules_at_start": self.rules_at_start,
|
|
1102
1162
|
"rules_at_end": self.rules_at_end,
|
|
1103
1163
|
}
|
|
1164
|
+
if self.selected_rules:
|
|
1165
|
+
result["selected_rules"] = self.selected_rules
|
|
1104
1166
|
if self.entry_file is not None:
|
|
1105
1167
|
result["entry_file"] = self.entry_file
|
|
1106
1168
|
if self.error_class is not None:
|
|
@@ -1130,6 +1192,7 @@ class Session:
|
|
|
1130
1192
|
entry_file=data.get("entry_file"),
|
|
1131
1193
|
rules_at_start=data.get("rules_at_start", []),
|
|
1132
1194
|
rules_at_end=data.get("rules_at_end", []),
|
|
1195
|
+
selected_rules=data.get("selected_rules", []),
|
|
1133
1196
|
error_class=data.get("error_class"),
|
|
1134
1197
|
notes=data.get("notes"),
|
|
1135
1198
|
)
|
|
@@ -1233,11 +1296,15 @@ class SessionMetrics:
|
|
|
1233
1296
|
|
|
1234
1297
|
@dataclass
|
|
1235
1298
|
class StartSessionResult:
|
|
1236
|
-
"""Result of starting a new session.
|
|
1299
|
+
"""Result of starting a new session.
|
|
1300
|
+
|
|
1301
|
+
Includes both the full rule set and the bandit-selected subset.
|
|
1302
|
+
"""
|
|
1237
1303
|
|
|
1238
1304
|
session_id: str
|
|
1239
1305
|
error_class: str | None
|
|
1240
1306
|
rules_count: int
|
|
1307
|
+
selected_rules: list[str] # Bandit-selected rules for this session
|
|
1241
1308
|
message: str
|
|
1242
1309
|
|
|
1243
1310
|
|
|
@@ -1316,6 +1383,31 @@ def _get_current_rules(buildlog_dir: Path) -> list[str]:
|
|
|
1316
1383
|
return list(_load_json_set(promoted_path, "skill_ids"))
|
|
1317
1384
|
|
|
1318
1385
|
|
|
1386
|
+
def _get_seed_rule_ids(buildlog_dir: Path) -> set[str]:
|
|
1387
|
+
"""Get IDs of rules that come from seed personas.
|
|
1388
|
+
|
|
1389
|
+
Seed rules (from gauntlet personas like Test Terrorist, Security Karen)
|
|
1390
|
+
have non-empty persona_tags. These rules get boosted priors in the
|
|
1391
|
+
bandit because they represent curated, expert knowledge.
|
|
1392
|
+
|
|
1393
|
+
Returns:
|
|
1394
|
+
Set of rule IDs that have persona_tags.
|
|
1395
|
+
"""
|
|
1396
|
+
try:
|
|
1397
|
+
skill_set = generate_skills(buildlog_dir)
|
|
1398
|
+
seed_ids: set[str] = set()
|
|
1399
|
+
|
|
1400
|
+
for category_skills in skill_set.skills.values():
|
|
1401
|
+
for skill in category_skills:
|
|
1402
|
+
if skill.persona_tags: # Non-empty means it's from a seed
|
|
1403
|
+
seed_ids.add(skill.id)
|
|
1404
|
+
|
|
1405
|
+
return seed_ids
|
|
1406
|
+
except Exception:
|
|
1407
|
+
# If skill generation fails, treat no rules as seeds
|
|
1408
|
+
return set()
|
|
1409
|
+
|
|
1410
|
+
|
|
1319
1411
|
def _load_sessions(buildlog_dir: Path) -> list[Session]:
|
|
1320
1412
|
"""Load all sessions from JSONL file."""
|
|
1321
1413
|
sessions_path = _get_sessions_path(buildlog_dir)
|
|
@@ -1389,25 +1481,78 @@ def start_session(
|
|
|
1389
1481
|
buildlog_dir: Path,
|
|
1390
1482
|
error_class: str | None = None,
|
|
1391
1483
|
notes: str | None = None,
|
|
1484
|
+
select_k: int = 3,
|
|
1392
1485
|
) -> StartSessionResult:
|
|
1393
|
-
"""Start a new experiment session.
|
|
1486
|
+
"""Start a new experiment session with bandit-selected rules.
|
|
1487
|
+
|
|
1488
|
+
This is where Thompson Sampling kicks in:
|
|
1489
|
+
|
|
1490
|
+
1. Load all available rules (candidates)
|
|
1491
|
+
2. Identify which rules are from seeds (get boosted priors)
|
|
1492
|
+
3. Use bandit to select top-k rules for this error_class context
|
|
1493
|
+
4. Store selected rules in session for later attribution
|
|
1494
|
+
|
|
1495
|
+
The selected rules are the ones "active" for this session. When a
|
|
1496
|
+
mistake occurs, we'll give negative feedback to these rules (they
|
|
1497
|
+
didn't prevent the mistake). This teaches the bandit which rules
|
|
1498
|
+
are effective for which error classes.
|
|
1394
1499
|
|
|
1395
1500
|
Args:
|
|
1396
1501
|
buildlog_dir: Path to buildlog directory.
|
|
1397
1502
|
error_class: Error class being targeted (e.g., "missing_test").
|
|
1503
|
+
This is the CONTEXT for contextual bandits - rules
|
|
1504
|
+
are evaluated per-context.
|
|
1398
1505
|
notes: Optional notes about the session.
|
|
1506
|
+
select_k: Number of rules to select via Thompson Sampling.
|
|
1507
|
+
Default 3 balances coverage with attribution clarity.
|
|
1399
1508
|
|
|
1400
1509
|
Returns:
|
|
1401
|
-
StartSessionResult with session ID and
|
|
1510
|
+
StartSessionResult with session ID, rules count, and selected rules.
|
|
1402
1511
|
"""
|
|
1403
1512
|
now = datetime.now(timezone.utc)
|
|
1404
1513
|
session_id = _generate_session_id(now)
|
|
1405
1514
|
current_rules = _get_current_rules(buildlog_dir)
|
|
1406
1515
|
|
|
1516
|
+
# =========================================================================
|
|
1517
|
+
# THOMPSON SAMPLING: Select rules for this session
|
|
1518
|
+
# =========================================================================
|
|
1519
|
+
#
|
|
1520
|
+
# The bandit maintains a Beta distribution for each (context, rule) pair.
|
|
1521
|
+
# At session start, we SAMPLE from each distribution and pick the top-k.
|
|
1522
|
+
#
|
|
1523
|
+
# Why sample instead of using the mean?
|
|
1524
|
+
# - Arms we're uncertain about have high variance
|
|
1525
|
+
# - High variance means occasional high samples
|
|
1526
|
+
# - This causes us to explore uncertain arms
|
|
1527
|
+
# - As we gather data, variance shrinks, and we exploit
|
|
1528
|
+
#
|
|
1529
|
+
# This is the elegant explore-exploit balance of Thompson Sampling.
|
|
1530
|
+
# =========================================================================
|
|
1531
|
+
|
|
1532
|
+
selected_rules: list[str] = []
|
|
1533
|
+
|
|
1534
|
+
if current_rules:
|
|
1535
|
+
# Initialize bandit
|
|
1536
|
+
bandit_path = buildlog_dir / "bandit_state.jsonl"
|
|
1537
|
+
bandit = ThompsonSamplingBandit(bandit_path)
|
|
1538
|
+
|
|
1539
|
+
# Identify seed rules (those with persona_tags from gauntlet)
|
|
1540
|
+
# Seeds get boosted priors - we believe curated rules are good
|
|
1541
|
+
seed_rule_ids = _get_seed_rule_ids(buildlog_dir)
|
|
1542
|
+
|
|
1543
|
+
# SELECT: Sample from Beta distributions, pick top-k
|
|
1544
|
+
selected_rules = bandit.select(
|
|
1545
|
+
candidates=current_rules,
|
|
1546
|
+
context=error_class or "general",
|
|
1547
|
+
k=min(select_k, len(current_rules)),
|
|
1548
|
+
seed_rule_ids=seed_rule_ids,
|
|
1549
|
+
)
|
|
1550
|
+
|
|
1407
1551
|
session = Session(
|
|
1408
1552
|
id=session_id,
|
|
1409
1553
|
started_at=now,
|
|
1410
1554
|
rules_at_start=current_rules,
|
|
1555
|
+
selected_rules=selected_rules,
|
|
1411
1556
|
error_class=error_class,
|
|
1412
1557
|
notes=notes,
|
|
1413
1558
|
)
|
|
@@ -1421,7 +1566,11 @@ def start_session(
|
|
|
1421
1566
|
session_id=session_id,
|
|
1422
1567
|
error_class=error_class,
|
|
1423
1568
|
rules_count=len(current_rules),
|
|
1424
|
-
|
|
1569
|
+
selected_rules=selected_rules,
|
|
1570
|
+
message=(
|
|
1571
|
+
f"Started session {session_id}: selected {len(selected_rules)}/"
|
|
1572
|
+
f"{len(current_rules)} rules via Thompson Sampling"
|
|
1573
|
+
),
|
|
1425
1574
|
)
|
|
1426
1575
|
|
|
1427
1576
|
|
|
@@ -1493,6 +1642,16 @@ def log_mistake(
|
|
|
1493
1642
|
) -> LogMistakeResult:
|
|
1494
1643
|
"""Log a mistake during an experiment session.
|
|
1495
1644
|
|
|
1645
|
+
This is where the bandit learns from NEGATIVE feedback:
|
|
1646
|
+
|
|
1647
|
+
When a mistake occurs, the selected rules for this session FAILED
|
|
1648
|
+
to prevent it. We update the bandit with reward=0 for each selected
|
|
1649
|
+
rule, teaching it that these rules aren't effective for this context.
|
|
1650
|
+
|
|
1651
|
+
Over time, rules that consistently fail to prevent mistakes will
|
|
1652
|
+
have their Beta distributions shift left (lower expected value),
|
|
1653
|
+
and the bandit will stop selecting them.
|
|
1654
|
+
|
|
1496
1655
|
Args:
|
|
1497
1656
|
buildlog_dir: Path to buildlog directory.
|
|
1498
1657
|
error_class: Category of error (e.g., "missing_test").
|
|
@@ -1539,9 +1698,39 @@ def log_mistake(
|
|
|
1539
1698
|
with open(mistakes_path, "a") as f:
|
|
1540
1699
|
f.write(json.dumps(mistake.to_dict()) + "\n")
|
|
1541
1700
|
|
|
1701
|
+
# =========================================================================
|
|
1702
|
+
# BANDIT LEARNING: Negative feedback for selected rules
|
|
1703
|
+
# =========================================================================
|
|
1704
|
+
#
|
|
1705
|
+
# The selected rules were supposed to help prevent mistakes. A mistake
|
|
1706
|
+
# occurred anyway, so we give them reward=0 (failure).
|
|
1707
|
+
#
|
|
1708
|
+
# Bayesian update: Beta(α, β) → Beta(α + 0, β + 1) = Beta(α, β + 1)
|
|
1709
|
+
#
|
|
1710
|
+
# This shifts the distribution LEFT, decreasing the expected value.
|
|
1711
|
+
# Rules that repeatedly fail will become less likely to be selected.
|
|
1712
|
+
# =========================================================================
|
|
1713
|
+
|
|
1714
|
+
selected_rules = session_data.get("selected_rules", [])
|
|
1715
|
+
if selected_rules:
|
|
1716
|
+
bandit_path = buildlog_dir / "bandit_state.jsonl"
|
|
1717
|
+
bandit = ThompsonSamplingBandit(bandit_path)
|
|
1718
|
+
|
|
1719
|
+
# Use session's error_class as context, not the mistake's
|
|
1720
|
+
# (they should match, but session context is authoritative)
|
|
1721
|
+
context = session_data.get("error_class") or "general"
|
|
1722
|
+
|
|
1723
|
+
bandit.batch_update(
|
|
1724
|
+
rule_ids=selected_rules,
|
|
1725
|
+
reward=0.0, # Failure: rules didn't prevent mistake
|
|
1726
|
+
context=context,
|
|
1727
|
+
)
|
|
1728
|
+
|
|
1542
1729
|
message = f"Logged mistake: {error_class}"
|
|
1543
1730
|
if similar:
|
|
1544
1731
|
message += f" (REPEAT of {similar.id})"
|
|
1732
|
+
if selected_rules:
|
|
1733
|
+
message += f" | Updated bandit: {len(selected_rules)} rules got reward=0"
|
|
1545
1734
|
|
|
1546
1735
|
return LogMistakeResult(
|
|
1547
1736
|
mistake_id=mistake_id,
|
|
@@ -1660,6 +1849,71 @@ def get_experiment_report(buildlog_dir: Path) -> dict:
|
|
|
1660
1849
|
}
|
|
1661
1850
|
|
|
1662
1851
|
|
|
1852
|
+
def get_bandit_status(
|
|
1853
|
+
buildlog_dir: Path,
|
|
1854
|
+
context: str | None = None,
|
|
1855
|
+
top_k: int = 10,
|
|
1856
|
+
) -> dict:
|
|
1857
|
+
"""Get current bandit state and statistics.
|
|
1858
|
+
|
|
1859
|
+
Provides insight into the Thompson Sampling bandit's learned beliefs.
|
|
1860
|
+
Useful for debugging and understanding which rules are being favored.
|
|
1861
|
+
|
|
1862
|
+
Args:
|
|
1863
|
+
buildlog_dir: Path to buildlog directory.
|
|
1864
|
+
context: Specific error class to show. If None, shows all contexts.
|
|
1865
|
+
top_k: Number of top rules to show per context.
|
|
1866
|
+
|
|
1867
|
+
Returns:
|
|
1868
|
+
Dictionary with:
|
|
1869
|
+
- summary: Overall bandit statistics
|
|
1870
|
+
- contexts: Per-context rule rankings
|
|
1871
|
+
- top_rules: Top rules by expected value per context
|
|
1872
|
+
"""
|
|
1873
|
+
bandit_path = buildlog_dir / "bandit_state.jsonl"
|
|
1874
|
+
bandit = ThompsonSamplingBandit(bandit_path)
|
|
1875
|
+
|
|
1876
|
+
stats = bandit.get_stats(context)
|
|
1877
|
+
|
|
1878
|
+
# Group stats by context
|
|
1879
|
+
contexts: dict[str, list[dict]] = {}
|
|
1880
|
+
for key, rule_stats in stats.items():
|
|
1881
|
+
ctx = rule_stats["context"]
|
|
1882
|
+
if ctx not in contexts:
|
|
1883
|
+
contexts[ctx] = []
|
|
1884
|
+
contexts[ctx].append(
|
|
1885
|
+
{
|
|
1886
|
+
"rule_id": key.split(":")[-1] if ":" in key else key,
|
|
1887
|
+
**{k: v for k, v in rule_stats.items() if k != "context"},
|
|
1888
|
+
}
|
|
1889
|
+
)
|
|
1890
|
+
|
|
1891
|
+
# Sort by mean (descending) and take top_k
|
|
1892
|
+
top_rules: dict[str, list[dict]] = {}
|
|
1893
|
+
for ctx, rules in contexts.items():
|
|
1894
|
+
sorted_rules = sorted(rules, key=lambda x: x["mean"], reverse=True)
|
|
1895
|
+
top_rules[ctx] = sorted_rules[:top_k]
|
|
1896
|
+
|
|
1897
|
+
# Summary stats
|
|
1898
|
+
total_arms = sum(len(rules) for rules in contexts.values())
|
|
1899
|
+
total_observations = sum(
|
|
1900
|
+
rule.get("total_observations", 0)
|
|
1901
|
+
for rules in contexts.values()
|
|
1902
|
+
for rule in rules
|
|
1903
|
+
)
|
|
1904
|
+
|
|
1905
|
+
return {
|
|
1906
|
+
"summary": {
|
|
1907
|
+
"total_contexts": len(contexts),
|
|
1908
|
+
"total_arms": total_arms,
|
|
1909
|
+
"total_observations": total_observations,
|
|
1910
|
+
"state_file": str(bandit_path),
|
|
1911
|
+
},
|
|
1912
|
+
"top_rules": top_rules,
|
|
1913
|
+
"all_rules": contexts if context else None, # Only include all if filtering
|
|
1914
|
+
}
|
|
1915
|
+
|
|
1916
|
+
|
|
1663
1917
|
# =============================================================================
|
|
1664
1918
|
# Gauntlet Loop Operations
|
|
1665
1919
|
# =============================================================================
|
|
@@ -1781,6 +2035,18 @@ def gauntlet_process_issues(
|
|
|
1781
2035
|
)
|
|
1782
2036
|
|
|
1783
2037
|
|
|
2038
|
+
def _sanitize_for_gh(text: str, max_len: int = 256) -> str:
|
|
2039
|
+
"""Sanitize text for GitHub issue fields.
|
|
2040
|
+
|
|
2041
|
+
Defense-in-depth: we use list args (not shell=True) for subprocess,
|
|
2042
|
+
but sanitize anyway to prevent injection via gh's argument parsing.
|
|
2043
|
+
"""
|
|
2044
|
+
sanitized = text.replace("\n", " ").replace("\r", " ")
|
|
2045
|
+
if len(sanitized) > max_len:
|
|
2046
|
+
sanitized = sanitized[: max_len - 3] + "..."
|
|
2047
|
+
return sanitized.strip()
|
|
2048
|
+
|
|
2049
|
+
|
|
1784
2050
|
def gauntlet_accept_risk(
|
|
1785
2051
|
remaining_issues: list[dict],
|
|
1786
2052
|
create_github_issues: bool = False,
|
|
@@ -1808,17 +2074,6 @@ def gauntlet_accept_risk(
|
|
|
1808
2074
|
description = issue.get("description", "")
|
|
1809
2075
|
location = issue.get("location", "")
|
|
1810
2076
|
|
|
1811
|
-
# Sanitize inputs for GitHub issue creation
|
|
1812
|
-
# Note: We use list args (not shell=True), so this is defense-in-depth
|
|
1813
|
-
def _sanitize_for_gh(text: str, max_len: int = 256) -> str:
|
|
1814
|
-
"""Sanitize text for GitHub issue fields."""
|
|
1815
|
-
# Remove/replace problematic characters
|
|
1816
|
-
sanitized = text.replace("\n", " ").replace("\r", " ")
|
|
1817
|
-
# Truncate to max length
|
|
1818
|
-
if len(sanitized) > max_len:
|
|
1819
|
-
sanitized = sanitized[: max_len - 3] + "..."
|
|
1820
|
-
return sanitized.strip()
|
|
1821
|
-
|
|
1822
2077
|
safe_severity = _sanitize_for_gh(str(severity), 20)
|
|
1823
2078
|
safe_rule = _sanitize_for_gh(str(rule), 200)
|
|
1824
2079
|
safe_description = _sanitize_for_gh(str(description), 1000)
|
|
@@ -1862,7 +2117,9 @@ def gauntlet_accept_risk(
|
|
|
1862
2117
|
cmd.extend(["--repo", repo])
|
|
1863
2118
|
|
|
1864
2119
|
try:
|
|
1865
|
-
result = subprocess.run(
|
|
2120
|
+
result = subprocess.run(
|
|
2121
|
+
cmd, capture_output=True, text=True, check=True, timeout=30
|
|
2122
|
+
)
|
|
1866
2123
|
# gh issue create outputs the URL
|
|
1867
2124
|
url = result.stdout.strip()
|
|
1868
2125
|
if url:
|
|
@@ -1870,6 +2127,9 @@ def gauntlet_accept_risk(
|
|
|
1870
2127
|
except subprocess.CalledProcessError as e:
|
|
1871
2128
|
# Don't fail entirely, just note the error
|
|
1872
2129
|
error = f"Failed to create some GitHub issues: {e.stderr}"
|
|
2130
|
+
except subprocess.TimeoutExpired:
|
|
2131
|
+
error = "GitHub issue creation timed out (30s limit)."
|
|
2132
|
+
break
|
|
1873
2133
|
except FileNotFoundError:
|
|
1874
2134
|
error = "gh CLI not found. Install GitHub CLI to create issues."
|
|
1875
2135
|
break
|
buildlog/distill.py
CHANGED
|
@@ -8,6 +8,7 @@ __all__ = [
|
|
|
8
8
|
"distill_all",
|
|
9
9
|
"format_output",
|
|
10
10
|
"parse_improvements",
|
|
11
|
+
"parse_improvements_llm",
|
|
11
12
|
"parse_date_from_filename",
|
|
12
13
|
"iter_buildlog_entries",
|
|
13
14
|
]
|
|
@@ -19,7 +20,10 @@ from collections.abc import Iterator
|
|
|
19
20
|
from dataclasses import dataclass, field
|
|
20
21
|
from datetime import date, datetime, timezone
|
|
21
22
|
from pathlib import Path
|
|
22
|
-
from typing import Final, Literal, TypedDict
|
|
23
|
+
from typing import TYPE_CHECKING, Final, Literal, TypedDict
|
|
24
|
+
|
|
25
|
+
if TYPE_CHECKING:
|
|
26
|
+
from buildlog.llm import ExtractedRule, LLMBackend
|
|
23
27
|
|
|
24
28
|
logger = logging.getLogger(__name__)
|
|
25
29
|
|
|
@@ -191,6 +195,36 @@ def parse_improvements(content: str) -> dict[str, list[str]]:
|
|
|
191
195
|
return result
|
|
192
196
|
|
|
193
197
|
|
|
198
|
+
def parse_improvements_llm(content: str, backend: LLMBackend) -> list[ExtractedRule]:
|
|
199
|
+
"""Extract improvements using an LLM backend for richer extraction.
|
|
200
|
+
|
|
201
|
+
Sends the Improvements section to the LLM for structured extraction
|
|
202
|
+
of rules with severity, scope, applicability, and defensibility fields.
|
|
203
|
+
|
|
204
|
+
Args:
|
|
205
|
+
content: The full markdown content of a buildlog entry.
|
|
206
|
+
backend: An LLM backend implementing the LLMBackend protocol.
|
|
207
|
+
|
|
208
|
+
Returns:
|
|
209
|
+
List of ExtractedRule objects with rich metadata.
|
|
210
|
+
"""
|
|
211
|
+
# Extract the Improvements section
|
|
212
|
+
improvements_match = re.search(
|
|
213
|
+
r"^##\s+Improvements\s*\n(.*?)(?=^#{1,2}\s|\Z)",
|
|
214
|
+
content,
|
|
215
|
+
re.MULTILINE | re.DOTALL,
|
|
216
|
+
)
|
|
217
|
+
|
|
218
|
+
if not improvements_match:
|
|
219
|
+
return []
|
|
220
|
+
|
|
221
|
+
improvements_text = improvements_match.group(1).strip()
|
|
222
|
+
if not improvements_text:
|
|
223
|
+
return []
|
|
224
|
+
|
|
225
|
+
return backend.extract_rules(improvements_text)
|
|
226
|
+
|
|
227
|
+
|
|
194
228
|
def parse_date_from_filename(filename: str) -> str | None:
|
|
195
229
|
"""Extract date from buildlog filename (YYYY-MM-DD-slug.md format)."""
|
|
196
230
|
match = re.match(r"^(\d{4}-\d{2}-\d{2})-", filename)
|
|
@@ -290,6 +324,7 @@ def distill_all(
|
|
|
290
324
|
buildlog_dir: Path,
|
|
291
325
|
since: date | None = None,
|
|
292
326
|
category_filter: str | None = None,
|
|
327
|
+
llm: bool = False,
|
|
293
328
|
) -> DistillResult:
|
|
294
329
|
"""Parse all buildlog entries and aggregate patterns.
|
|
295
330
|
|
|
@@ -297,10 +332,23 @@ def distill_all(
|
|
|
297
332
|
buildlog_dir: Path to the buildlog directory.
|
|
298
333
|
since: If provided, only include entries from this date onward.
|
|
299
334
|
category_filter: If provided, only include patterns from this category.
|
|
335
|
+
llm: If True and an LLM backend is available, use LLM extraction.
|
|
336
|
+
Falls back to regex on failure or if no backend is available.
|
|
300
337
|
|
|
301
338
|
Returns:
|
|
302
339
|
DistillResult with aggregated patterns and statistics.
|
|
303
340
|
"""
|
|
341
|
+
# Resolve LLM backend if requested
|
|
342
|
+
llm_backend: LLMBackend | None = None
|
|
343
|
+
if llm:
|
|
344
|
+
from buildlog.llm import get_llm_backend
|
|
345
|
+
|
|
346
|
+
llm_backend = get_llm_backend(buildlog_dir=buildlog_dir)
|
|
347
|
+
if llm_backend is None:
|
|
348
|
+
logger.warning(
|
|
349
|
+
"--llm requested but no LLM provider available, using regex fallback"
|
|
350
|
+
)
|
|
351
|
+
|
|
304
352
|
patterns: dict[str, list[PatternDict]] = {cat: [] for cat in CATEGORIES}
|
|
305
353
|
by_month: dict[str, int] = {}
|
|
306
354
|
entry_count = 0
|
|
@@ -318,6 +366,37 @@ def distill_all(
|
|
|
318
366
|
month_key = _extract_month_key(date_str)
|
|
319
367
|
by_month[month_key] = by_month.get(month_key, 0) + 1
|
|
320
368
|
|
|
369
|
+
# Try LLM extraction first, fall back to regex
|
|
370
|
+
if llm_backend is not None:
|
|
371
|
+
try:
|
|
372
|
+
extracted = parse_improvements_llm(content, llm_backend)
|
|
373
|
+
if extracted:
|
|
374
|
+
# Convert ExtractedRule objects to standard PatternDict format
|
|
375
|
+
for rule in extracted:
|
|
376
|
+
cat = (
|
|
377
|
+
rule.category
|
|
378
|
+
if rule.category in CATEGORIES
|
|
379
|
+
else "architectural"
|
|
380
|
+
)
|
|
381
|
+
if cat not in patterns:
|
|
382
|
+
patterns[cat] = []
|
|
383
|
+
patterns[cat].append(
|
|
384
|
+
PatternDict(
|
|
385
|
+
insight=rule.rule,
|
|
386
|
+
source=str(entry_path),
|
|
387
|
+
date=date_str,
|
|
388
|
+
context=context,
|
|
389
|
+
)
|
|
390
|
+
)
|
|
391
|
+
continue # Skip regex if LLM succeeded
|
|
392
|
+
except Exception as e:
|
|
393
|
+
logger.warning(
|
|
394
|
+
"LLM extraction failed for %s, falling back to regex: %s",
|
|
395
|
+
entry_path,
|
|
396
|
+
e,
|
|
397
|
+
)
|
|
398
|
+
|
|
399
|
+
# Regex fallback (default behavior)
|
|
321
400
|
try:
|
|
322
401
|
improvements = parse_improvements(content)
|
|
323
402
|
except re.error as e:
|