buildlog 0.6.1__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- buildlog/__init__.py +1 -1
- buildlog/cli.py +589 -44
- buildlog/confidence.py +27 -0
- buildlog/core/__init__.py +12 -0
- buildlog/core/bandit.py +699 -0
- buildlog/core/operations.py +499 -11
- buildlog/distill.py +80 -1
- buildlog/engine/__init__.py +61 -0
- buildlog/engine/bandit.py +23 -0
- buildlog/engine/confidence.py +28 -0
- buildlog/engine/embeddings.py +28 -0
- buildlog/engine/experiments.py +619 -0
- buildlog/engine/types.py +31 -0
- buildlog/llm.py +461 -0
- buildlog/mcp/server.py +12 -6
- buildlog/mcp/tools.py +166 -13
- buildlog/render/__init__.py +19 -2
- buildlog/render/claude_md.py +74 -26
- buildlog/render/continue_dev.py +102 -0
- buildlog/render/copilot.py +100 -0
- buildlog/render/cursor.py +105 -0
- buildlog/render/tracking.py +20 -1
- buildlog/render/windsurf.py +95 -0
- buildlog/seeds.py +41 -0
- buildlog/skills.py +69 -6
- {buildlog-0.6.1.data → buildlog-0.8.0.data}/data/share/buildlog/copier.yml +0 -4
- buildlog-0.8.0.data/data/share/buildlog/template/buildlog/_TEMPLATE_QUICK.md +21 -0
- buildlog-0.8.0.dist-info/METADATA +151 -0
- buildlog-0.8.0.dist-info/RECORD +54 -0
- buildlog-0.6.1.dist-info/METADATA +0 -490
- buildlog-0.6.1.dist-info/RECORD +0 -41
- {buildlog-0.6.1.data → buildlog-0.8.0.data}/data/share/buildlog/post_gen.py +0 -0
- {buildlog-0.6.1.data → buildlog-0.8.0.data}/data/share/buildlog/template/buildlog/.gitkeep +0 -0
- {buildlog-0.6.1.data → buildlog-0.8.0.data}/data/share/buildlog/template/buildlog/2026-01-01-example.md +0 -0
- {buildlog-0.6.1.data → buildlog-0.8.0.data}/data/share/buildlog/template/buildlog/BUILDLOG_SYSTEM.md +0 -0
- {buildlog-0.6.1.data → buildlog-0.8.0.data}/data/share/buildlog/template/buildlog/_TEMPLATE.md +0 -0
- {buildlog-0.6.1.data → buildlog-0.8.0.data}/data/share/buildlog/template/buildlog/assets/.gitkeep +0 -0
- {buildlog-0.6.1.dist-info → buildlog-0.8.0.dist-info}/WHEEL +0 -0
- {buildlog-0.6.1.dist-info → buildlog-0.8.0.dist-info}/entry_points.txt +0 -0
- {buildlog-0.6.1.dist-info → buildlog-0.8.0.dist-info}/licenses/LICENSE +0 -0
buildlog/core/operations.py
CHANGED
|
@@ -14,6 +14,7 @@ from pathlib import Path
|
|
|
14
14
|
from typing import Literal, TypedDict
|
|
15
15
|
|
|
16
16
|
from buildlog.confidence import ConfidenceMetrics, merge_confidence_metrics
|
|
17
|
+
from buildlog.core.bandit import ThompsonSamplingBandit
|
|
17
18
|
from buildlog.render import get_renderer
|
|
18
19
|
from buildlog.skills import Skill, SkillSet, generate_skills
|
|
19
20
|
|
|
@@ -35,6 +36,9 @@ __all__ = [
|
|
|
35
36
|
"StartSessionResult",
|
|
36
37
|
"EndSessionResult",
|
|
37
38
|
"LogMistakeResult",
|
|
39
|
+
# Gauntlet loop
|
|
40
|
+
"GauntletLoopResult",
|
|
41
|
+
"GauntletAcceptRiskResult",
|
|
38
42
|
"status",
|
|
39
43
|
"promote",
|
|
40
44
|
"reject",
|
|
@@ -49,6 +53,10 @@ __all__ = [
|
|
|
49
53
|
"log_mistake",
|
|
50
54
|
"get_session_metrics",
|
|
51
55
|
"get_experiment_report",
|
|
56
|
+
"get_bandit_status",
|
|
57
|
+
# Gauntlet loop operations
|
|
58
|
+
"gauntlet_process_issues",
|
|
59
|
+
"gauntlet_accept_risk",
|
|
52
60
|
]
|
|
53
61
|
|
|
54
62
|
|
|
@@ -552,7 +560,7 @@ def status(
|
|
|
552
560
|
def promote(
|
|
553
561
|
buildlog_dir: Path,
|
|
554
562
|
skill_ids: list[str],
|
|
555
|
-
target:
|
|
563
|
+
target: str = "claude_md",
|
|
556
564
|
target_path: Path | None = None,
|
|
557
565
|
) -> PromoteResult:
|
|
558
566
|
"""Promote skills to agent rules.
|
|
@@ -560,7 +568,8 @@ def promote(
|
|
|
560
568
|
Args:
|
|
561
569
|
buildlog_dir: Path to buildlog directory.
|
|
562
570
|
skill_ids: List of skill IDs to promote.
|
|
563
|
-
target: Where to write rules
|
|
571
|
+
target: Where to write rules. One of: claude_md, settings_json,
|
|
572
|
+
skill, cursor, copilot, windsurf, continue_dev.
|
|
564
573
|
target_path: Optional custom path for the target file.
|
|
565
574
|
|
|
566
575
|
Returns:
|
|
@@ -932,14 +941,27 @@ def log_reward(
|
|
|
932
941
|
) -> LogRewardResult:
|
|
933
942
|
"""Log a reward event for bandit learning.
|
|
934
943
|
|
|
935
|
-
|
|
944
|
+
This is where the bandit learns from EXPLICIT feedback:
|
|
945
|
+
|
|
946
|
+
The reward signal comes from the outcome:
|
|
947
|
+
- accepted (reward=1.0): Rules helped produce good output
|
|
948
|
+
- rejected (reward=0.0): Rules failed to prevent bad output
|
|
949
|
+
- revision (reward=1-distance): Partial credit based on correction needed
|
|
950
|
+
|
|
951
|
+
Unlike log_mistake() which gives implicit negative feedback, this allows
|
|
952
|
+
direct positive feedback when rules DO help. This is crucial for learning
|
|
953
|
+
which rules are genuinely effective, not just which ones don't fail.
|
|
954
|
+
|
|
955
|
+
Appends to reward_events.jsonl for analysis AND updates the bandit.
|
|
936
956
|
|
|
937
957
|
Args:
|
|
938
958
|
buildlog_dir: Path to buildlog directory.
|
|
939
959
|
outcome: Type of feedback (accepted/revision/rejected).
|
|
940
960
|
rules_active: List of rule IDs that were in context.
|
|
961
|
+
If None, tries to use session's selected_rules.
|
|
941
962
|
revision_distance: How much correction was needed (0-1, for revisions).
|
|
942
963
|
error_class: Category of error if applicable.
|
|
964
|
+
If None, tries to use session's error_class.
|
|
943
965
|
notes: Optional notes about the feedback.
|
|
944
966
|
source: Where this feedback came from.
|
|
945
967
|
|
|
@@ -950,6 +972,15 @@ def log_reward(
|
|
|
950
972
|
reward_id = _generate_reward_id(outcome, now)
|
|
951
973
|
reward_value = _compute_reward_value(outcome, revision_distance)
|
|
952
974
|
|
|
975
|
+
# Try to get rules and context from active session if not provided
|
|
976
|
+
active_path = _get_active_session_path(buildlog_dir)
|
|
977
|
+
if active_path.exists():
|
|
978
|
+
session_data = json.loads(active_path.read_text())
|
|
979
|
+
if rules_active is None:
|
|
980
|
+
rules_active = session_data.get("selected_rules", [])
|
|
981
|
+
if error_class is None:
|
|
982
|
+
error_class = session_data.get("error_class")
|
|
983
|
+
|
|
953
984
|
event = RewardEvent(
|
|
954
985
|
id=reward_id,
|
|
955
986
|
timestamp=now,
|
|
@@ -969,6 +1000,32 @@ def log_reward(
|
|
|
969
1000
|
with open(rewards_path, "a") as f:
|
|
970
1001
|
f.write(json.dumps(event.to_dict()) + "\n")
|
|
971
1002
|
|
|
1003
|
+
# =========================================================================
|
|
1004
|
+
# BANDIT LEARNING: Update with explicit reward
|
|
1005
|
+
# =========================================================================
|
|
1006
|
+
#
|
|
1007
|
+
# For accepted (reward=1): Beta(α, β) → Beta(α + 1, β)
|
|
1008
|
+
# → Distribution shifts RIGHT, increasing expected value
|
|
1009
|
+
# → Rule becomes MORE likely to be selected
|
|
1010
|
+
#
|
|
1011
|
+
# For rejected (reward=0): Beta(α, β) → Beta(α, β + 1)
|
|
1012
|
+
# → Distribution shifts LEFT, decreasing expected value
|
|
1013
|
+
# → Rule becomes LESS likely to be selected
|
|
1014
|
+
#
|
|
1015
|
+
# For revision (0 < reward < 1): Both α and β increase proportionally
|
|
1016
|
+
# → Distribution narrows (more confident) with moderate expected value
|
|
1017
|
+
# =========================================================================
|
|
1018
|
+
|
|
1019
|
+
if rules_active:
|
|
1020
|
+
bandit_path = buildlog_dir / "bandit_state.jsonl"
|
|
1021
|
+
bandit = ThompsonSamplingBandit(bandit_path)
|
|
1022
|
+
|
|
1023
|
+
bandit.batch_update(
|
|
1024
|
+
rule_ids=rules_active,
|
|
1025
|
+
reward=reward_value,
|
|
1026
|
+
context=error_class or "general",
|
|
1027
|
+
)
|
|
1028
|
+
|
|
972
1029
|
# Count total events
|
|
973
1030
|
total_events = 0
|
|
974
1031
|
if rewards_path.exists():
|
|
@@ -976,11 +1033,16 @@ def log_reward(
|
|
|
976
1033
|
1 for line in rewards_path.read_text().strip().split("\n") if line
|
|
977
1034
|
)
|
|
978
1035
|
|
|
1036
|
+
rules_count = len(rules_active) if rules_active else 0
|
|
1037
|
+
message = f"Logged {outcome} (reward={reward_value:.2f})"
|
|
1038
|
+
if rules_count > 0:
|
|
1039
|
+
message += f" | Updated bandit: {rules_count} rules"
|
|
1040
|
+
|
|
979
1041
|
return LogRewardResult(
|
|
980
1042
|
reward_id=reward_id,
|
|
981
1043
|
reward_value=reward_value,
|
|
982
1044
|
total_events=total_events,
|
|
983
|
-
message=
|
|
1045
|
+
message=message,
|
|
984
1046
|
)
|
|
985
1047
|
|
|
986
1048
|
|
|
@@ -1055,6 +1117,7 @@ class SessionDict(TypedDict, total=False):
|
|
|
1055
1117
|
entry_file: str | None
|
|
1056
1118
|
rules_at_start: list[str]
|
|
1057
1119
|
rules_at_end: list[str]
|
|
1120
|
+
selected_rules: list[str] # Bandit-selected subset for this session
|
|
1058
1121
|
error_class: str | None
|
|
1059
1122
|
notes: str | None
|
|
1060
1123
|
|
|
@@ -1064,15 +1127,17 @@ class Session:
|
|
|
1064
1127
|
"""A coding session for experiment tracking.
|
|
1065
1128
|
|
|
1066
1129
|
Tracks the state of rules before and after a session to measure
|
|
1067
|
-
learning effectiveness.
|
|
1130
|
+
learning effectiveness. The bandit selects a subset of rules
|
|
1131
|
+
(selected_rules) to be "active" for this session based on context.
|
|
1068
1132
|
|
|
1069
1133
|
Attributes:
|
|
1070
1134
|
id: Unique identifier for this session.
|
|
1071
1135
|
started_at: When the session started.
|
|
1072
1136
|
ended_at: When the session ended (None if still active).
|
|
1073
1137
|
entry_file: Corresponding buildlog entry file, if any.
|
|
1074
|
-
rules_at_start:
|
|
1075
|
-
rules_at_end:
|
|
1138
|
+
rules_at_start: All rule IDs available at session start.
|
|
1139
|
+
rules_at_end: All rule IDs available at session end.
|
|
1140
|
+
selected_rules: Bandit-selected subset active for this session.
|
|
1076
1141
|
error_class: Error class being targeted (e.g., "missing_test").
|
|
1077
1142
|
notes: Optional notes about the session.
|
|
1078
1143
|
"""
|
|
@@ -1083,6 +1148,7 @@ class Session:
|
|
|
1083
1148
|
entry_file: str | None = None
|
|
1084
1149
|
rules_at_start: list[str] = field(default_factory=list)
|
|
1085
1150
|
rules_at_end: list[str] = field(default_factory=list)
|
|
1151
|
+
selected_rules: list[str] = field(default_factory=list)
|
|
1086
1152
|
error_class: str | None = None
|
|
1087
1153
|
notes: str | None = None
|
|
1088
1154
|
|
|
@@ -1095,6 +1161,8 @@ class Session:
|
|
|
1095
1161
|
"rules_at_start": self.rules_at_start,
|
|
1096
1162
|
"rules_at_end": self.rules_at_end,
|
|
1097
1163
|
}
|
|
1164
|
+
if self.selected_rules:
|
|
1165
|
+
result["selected_rules"] = self.selected_rules
|
|
1098
1166
|
if self.entry_file is not None:
|
|
1099
1167
|
result["entry_file"] = self.entry_file
|
|
1100
1168
|
if self.error_class is not None:
|
|
@@ -1124,6 +1192,7 @@ class Session:
|
|
|
1124
1192
|
entry_file=data.get("entry_file"),
|
|
1125
1193
|
rules_at_start=data.get("rules_at_start", []),
|
|
1126
1194
|
rules_at_end=data.get("rules_at_end", []),
|
|
1195
|
+
selected_rules=data.get("selected_rules", []),
|
|
1127
1196
|
error_class=data.get("error_class"),
|
|
1128
1197
|
notes=data.get("notes"),
|
|
1129
1198
|
)
|
|
@@ -1227,11 +1296,15 @@ class SessionMetrics:
|
|
|
1227
1296
|
|
|
1228
1297
|
@dataclass
class StartSessionResult:
    """Outcome returned to the caller when a session has been started.

    Carries the size of the full rule set together with the subset of
    rules the bandit picked for this session.

    Attributes:
        session_id: Identifier of the newly started session.
        error_class: Error class the session targets, or None.
        rules_count: Number of rules available when the session started.
        selected_rules: Rule IDs the bandit selected for this session.
        message: Human-readable summary.
    """

    session_id: str
    error_class: str | None
    rules_count: int
    # Bandit-selected rules for this session
    selected_rules: list[str]
    message: str
|
|
1236
1309
|
|
|
1237
1310
|
|
|
@@ -1310,6 +1383,31 @@ def _get_current_rules(buildlog_dir: Path) -> list[str]:
|
|
|
1310
1383
|
return list(_load_json_set(promoted_path, "skill_ids"))
|
|
1311
1384
|
|
|
1312
1385
|
|
|
1386
|
+
def _get_seed_rule_ids(buildlog_dir: Path) -> set[str]:
|
|
1387
|
+
"""Get IDs of rules that come from seed personas.
|
|
1388
|
+
|
|
1389
|
+
Seed rules (from gauntlet personas like Test Terrorist, Security Karen)
|
|
1390
|
+
have non-empty persona_tags. These rules get boosted priors in the
|
|
1391
|
+
bandit because they represent curated, expert knowledge.
|
|
1392
|
+
|
|
1393
|
+
Returns:
|
|
1394
|
+
Set of rule IDs that have persona_tags.
|
|
1395
|
+
"""
|
|
1396
|
+
try:
|
|
1397
|
+
skill_set = generate_skills(buildlog_dir)
|
|
1398
|
+
seed_ids: set[str] = set()
|
|
1399
|
+
|
|
1400
|
+
for category_skills in skill_set.skills.values():
|
|
1401
|
+
for skill in category_skills:
|
|
1402
|
+
if skill.persona_tags: # Non-empty means it's from a seed
|
|
1403
|
+
seed_ids.add(skill.id)
|
|
1404
|
+
|
|
1405
|
+
return seed_ids
|
|
1406
|
+
except Exception:
|
|
1407
|
+
# If skill generation fails, treat no rules as seeds
|
|
1408
|
+
return set()
|
|
1409
|
+
|
|
1410
|
+
|
|
1313
1411
|
def _load_sessions(buildlog_dir: Path) -> list[Session]:
|
|
1314
1412
|
"""Load all sessions from JSONL file."""
|
|
1315
1413
|
sessions_path = _get_sessions_path(buildlog_dir)
|
|
@@ -1383,25 +1481,78 @@ def start_session(
|
|
|
1383
1481
|
buildlog_dir: Path,
|
|
1384
1482
|
error_class: str | None = None,
|
|
1385
1483
|
notes: str | None = None,
|
|
1484
|
+
select_k: int = 3,
|
|
1386
1485
|
) -> StartSessionResult:
|
|
1387
|
-
"""Start a new experiment session.
|
|
1486
|
+
"""Start a new experiment session with bandit-selected rules.
|
|
1487
|
+
|
|
1488
|
+
This is where Thompson Sampling kicks in:
|
|
1489
|
+
|
|
1490
|
+
1. Load all available rules (candidates)
|
|
1491
|
+
2. Identify which rules are from seeds (get boosted priors)
|
|
1492
|
+
3. Use bandit to select top-k rules for this error_class context
|
|
1493
|
+
4. Store selected rules in session for later attribution
|
|
1494
|
+
|
|
1495
|
+
The selected rules are the ones "active" for this session. When a
|
|
1496
|
+
mistake occurs, we'll give negative feedback to these rules (they
|
|
1497
|
+
didn't prevent the mistake). This teaches the bandit which rules
|
|
1498
|
+
are effective for which error classes.
|
|
1388
1499
|
|
|
1389
1500
|
Args:
|
|
1390
1501
|
buildlog_dir: Path to buildlog directory.
|
|
1391
1502
|
error_class: Error class being targeted (e.g., "missing_test").
|
|
1503
|
+
This is the CONTEXT for contextual bandits - rules
|
|
1504
|
+
are evaluated per-context.
|
|
1392
1505
|
notes: Optional notes about the session.
|
|
1506
|
+
select_k: Number of rules to select via Thompson Sampling.
|
|
1507
|
+
Default 3 balances coverage with attribution clarity.
|
|
1393
1508
|
|
|
1394
1509
|
Returns:
|
|
1395
|
-
StartSessionResult with session ID and
|
|
1510
|
+
StartSessionResult with session ID, rules count, and selected rules.
|
|
1396
1511
|
"""
|
|
1397
1512
|
now = datetime.now(timezone.utc)
|
|
1398
1513
|
session_id = _generate_session_id(now)
|
|
1399
1514
|
current_rules = _get_current_rules(buildlog_dir)
|
|
1400
1515
|
|
|
1516
|
+
# =========================================================================
|
|
1517
|
+
# THOMPSON SAMPLING: Select rules for this session
|
|
1518
|
+
# =========================================================================
|
|
1519
|
+
#
|
|
1520
|
+
# The bandit maintains a Beta distribution for each (context, rule) pair.
|
|
1521
|
+
# At session start, we SAMPLE from each distribution and pick the top-k.
|
|
1522
|
+
#
|
|
1523
|
+
# Why sample instead of using the mean?
|
|
1524
|
+
# - Arms we're uncertain about have high variance
|
|
1525
|
+
# - High variance means occasional high samples
|
|
1526
|
+
# - This causes us to explore uncertain arms
|
|
1527
|
+
# - As we gather data, variance shrinks, and we exploit
|
|
1528
|
+
#
|
|
1529
|
+
# This is the elegant explore-exploit balance of Thompson Sampling.
|
|
1530
|
+
# =========================================================================
|
|
1531
|
+
|
|
1532
|
+
selected_rules: list[str] = []
|
|
1533
|
+
|
|
1534
|
+
if current_rules:
|
|
1535
|
+
# Initialize bandit
|
|
1536
|
+
bandit_path = buildlog_dir / "bandit_state.jsonl"
|
|
1537
|
+
bandit = ThompsonSamplingBandit(bandit_path)
|
|
1538
|
+
|
|
1539
|
+
# Identify seed rules (those with persona_tags from gauntlet)
|
|
1540
|
+
# Seeds get boosted priors - we believe curated rules are good
|
|
1541
|
+
seed_rule_ids = _get_seed_rule_ids(buildlog_dir)
|
|
1542
|
+
|
|
1543
|
+
# SELECT: Sample from Beta distributions, pick top-k
|
|
1544
|
+
selected_rules = bandit.select(
|
|
1545
|
+
candidates=current_rules,
|
|
1546
|
+
context=error_class or "general",
|
|
1547
|
+
k=min(select_k, len(current_rules)),
|
|
1548
|
+
seed_rule_ids=seed_rule_ids,
|
|
1549
|
+
)
|
|
1550
|
+
|
|
1401
1551
|
session = Session(
|
|
1402
1552
|
id=session_id,
|
|
1403
1553
|
started_at=now,
|
|
1404
1554
|
rules_at_start=current_rules,
|
|
1555
|
+
selected_rules=selected_rules,
|
|
1405
1556
|
error_class=error_class,
|
|
1406
1557
|
notes=notes,
|
|
1407
1558
|
)
|
|
@@ -1415,7 +1566,11 @@ def start_session(
|
|
|
1415
1566
|
session_id=session_id,
|
|
1416
1567
|
error_class=error_class,
|
|
1417
1568
|
rules_count=len(current_rules),
|
|
1418
|
-
|
|
1569
|
+
selected_rules=selected_rules,
|
|
1570
|
+
message=(
|
|
1571
|
+
f"Started session {session_id}: selected {len(selected_rules)}/"
|
|
1572
|
+
f"{len(current_rules)} rules via Thompson Sampling"
|
|
1573
|
+
),
|
|
1419
1574
|
)
|
|
1420
1575
|
|
|
1421
1576
|
|
|
@@ -1487,6 +1642,16 @@ def log_mistake(
|
|
|
1487
1642
|
) -> LogMistakeResult:
|
|
1488
1643
|
"""Log a mistake during an experiment session.
|
|
1489
1644
|
|
|
1645
|
+
This is where the bandit learns from NEGATIVE feedback:
|
|
1646
|
+
|
|
1647
|
+
When a mistake occurs, the selected rules for this session FAILED
|
|
1648
|
+
to prevent it. We update the bandit with reward=0 for each selected
|
|
1649
|
+
rule, teaching it that these rules aren't effective for this context.
|
|
1650
|
+
|
|
1651
|
+
Over time, rules that consistently fail to prevent mistakes will
|
|
1652
|
+
have their Beta distributions shift left (lower expected value),
|
|
1653
|
+
and the bandit will stop selecting them.
|
|
1654
|
+
|
|
1490
1655
|
Args:
|
|
1491
1656
|
buildlog_dir: Path to buildlog directory.
|
|
1492
1657
|
error_class: Category of error (e.g., "missing_test").
|
|
@@ -1533,9 +1698,39 @@ def log_mistake(
|
|
|
1533
1698
|
with open(mistakes_path, "a") as f:
|
|
1534
1699
|
f.write(json.dumps(mistake.to_dict()) + "\n")
|
|
1535
1700
|
|
|
1701
|
+
# =========================================================================
|
|
1702
|
+
# BANDIT LEARNING: Negative feedback for selected rules
|
|
1703
|
+
# =========================================================================
|
|
1704
|
+
#
|
|
1705
|
+
# The selected rules were supposed to help prevent mistakes. A mistake
|
|
1706
|
+
# occurred anyway, so we give them reward=0 (failure).
|
|
1707
|
+
#
|
|
1708
|
+
# Bayesian update: Beta(α, β) → Beta(α + 0, β + 1) = Beta(α, β + 1)
|
|
1709
|
+
#
|
|
1710
|
+
# This shifts the distribution LEFT, decreasing the expected value.
|
|
1711
|
+
# Rules that repeatedly fail will become less likely to be selected.
|
|
1712
|
+
# =========================================================================
|
|
1713
|
+
|
|
1714
|
+
selected_rules = session_data.get("selected_rules", [])
|
|
1715
|
+
if selected_rules:
|
|
1716
|
+
bandit_path = buildlog_dir / "bandit_state.jsonl"
|
|
1717
|
+
bandit = ThompsonSamplingBandit(bandit_path)
|
|
1718
|
+
|
|
1719
|
+
# Use session's error_class as context, not the mistake's
|
|
1720
|
+
# (they should match, but session context is authoritative)
|
|
1721
|
+
context = session_data.get("error_class") or "general"
|
|
1722
|
+
|
|
1723
|
+
bandit.batch_update(
|
|
1724
|
+
rule_ids=selected_rules,
|
|
1725
|
+
reward=0.0, # Failure: rules didn't prevent mistake
|
|
1726
|
+
context=context,
|
|
1727
|
+
)
|
|
1728
|
+
|
|
1536
1729
|
message = f"Logged mistake: {error_class}"
|
|
1537
1730
|
if similar:
|
|
1538
1731
|
message += f" (REPEAT of {similar.id})"
|
|
1732
|
+
if selected_rules:
|
|
1733
|
+
message += f" | Updated bandit: {len(selected_rules)} rules got reward=0"
|
|
1539
1734
|
|
|
1540
1735
|
return LogMistakeResult(
|
|
1541
1736
|
mistake_id=mistake_id,
|
|
@@ -1652,3 +1847,296 @@ def get_experiment_report(buildlog_dir: Path) -> dict:
|
|
|
1652
1847
|
"sessions": session_metrics,
|
|
1653
1848
|
"error_classes": error_classes,
|
|
1654
1849
|
}
|
|
1850
|
+
|
|
1851
|
+
|
|
1852
|
+
def get_bandit_status(
    buildlog_dir: Path,
    context: str | None = None,
    top_k: int = 10,
) -> dict:
    """Get current bandit state and statistics.

    Loads the Thompson Sampling bandit backed by
    ``<buildlog_dir>/bandit_state.jsonl``, asks it for its stats and
    regroups them per context for inspection/debugging.

    Args:
        buildlog_dir: Path to buildlog directory.
        context: Specific error class to show. If None, shows all contexts.
        top_k: Number of top rules to show per context. Values below zero
            are treated as zero (no rules listed).

    Returns:
        Dictionary with:
        - summary: ``total_contexts``, ``total_arms``,
          ``total_observations`` and ``state_file`` (path as string).
        - top_rules: Per-context list of the ``top_k`` rules, sorted by
          ``mean`` descending.
        - all_rules: Per-context list of every rule when ``context`` was
          given, otherwise None.
    """
    bandit_path = buildlog_dir / "bandit_state.jsonl"
    bandit = ThompsonSamplingBandit(bandit_path)

    stats = bandit.get_stats(context)

    # Group stats by context. Each entry drops its own "context" field
    # (it becomes the grouping key) and gains a "rule_id".
    contexts: dict[str, list[dict]] = {}
    for key, rule_stats in stats.items():
        entry = {
            # NOTE(review): keys look like "<context>:<rule_id>"; the text
            # after the last ":" is used as the rule id — confirm against
            # the bandit's key format.
            "rule_id": key.rsplit(":", 1)[-1],
            **{k: v for k, v in rule_stats.items() if k != "context"},
        }
        contexts.setdefault(rule_stats["context"], []).append(entry)

    # Sort by mean (descending) and take top_k. A negative top_k would
    # otherwise slice from the end and silently drop the worst rules.
    limit = max(top_k, 0)
    top_rules: dict[str, list[dict]] = {
        ctx: sorted(rules, key=lambda rule: rule["mean"], reverse=True)[:limit]
        for ctx, rules in contexts.items()
    }

    # Summary stats
    total_arms = sum(len(rules) for rules in contexts.values())
    total_observations = sum(
        rule.get("total_observations", 0)
        for rules in contexts.values()
        for rule in rules
    )

    return {
        "summary": {
            "total_contexts": len(contexts),
            "total_arms": total_arms,
            "total_observations": total_observations,
            "state_file": str(bandit_path),
        },
        "top_rules": top_rules,
        "all_rules": contexts if context else None,  # Only include all if filtering
    }
|
|
1915
|
+
|
|
1916
|
+
|
|
1917
|
+
# =============================================================================
|
|
1918
|
+
# Gauntlet Loop Operations
|
|
1919
|
+
# =============================================================================
|
|
1920
|
+
|
|
1921
|
+
|
|
1922
|
+
@dataclass
class GauntletLoopResult:
    """Outcome of one pass of gauntlet issue processing.

    Attributes:
        action: The next step for the loop, one of:
            - "fix_criticals": critical issues remain; fix and loop again
            - "checkpoint_majors": no criticals, majors remain (HITL)
            - "checkpoint_minors": only minors remain (HITL)
            - "clean": nothing left to address
        criticals: Issues with critical severity.
        majors: Issues with major severity.
        minors: Issues with minor/nitpick severity.
        iteration: Iteration number this result belongs to.
        learnings_persisted: How many learnings were persisted this iteration.
        message: Human-readable summary.
    """

    action: Literal["fix_criticals", "checkpoint_majors", "checkpoint_minors", "clean"]
    criticals: list[dict]
    majors: list[dict]
    minors: list[dict]
    iteration: int
    learnings_persisted: int
    message: str
|
|
1947
|
+
|
|
1948
|
+
|
|
1949
|
+
@dataclass
class GauntletAcceptRiskResult:
    """Outcome of accepting the remaining gauntlet issues as risk.

    Attributes:
        accepted_issues: Count of issues accepted as risk.
        github_issues_created: Count of GitHub issues created (if enabled).
        github_issue_urls: URLs of the GitHub issues that were created.
        message: Human-readable summary.
        error: Error message when the operation failed, otherwise None.
    """

    accepted_issues: int
    github_issues_created: int
    github_issue_urls: list[str]
    message: str
    error: str | None = None
|
|
1966
|
+
|
|
1967
|
+
|
|
1968
|
+
def gauntlet_process_issues(
    buildlog_dir: Path,
    issues: list[dict],
    iteration: int = 1,
    source: str | None = None,
) -> GauntletLoopResult:
    """Process gauntlet issues and determine next action.

    Buckets the issues by severity, persists learnings for all of them via
    ``learn_from_review`` and picks the next action for the gauntlet loop:
    criticals take precedence over majors, which take precedence over
    minors; with no issues at all the result is "clean".

    Severity matching ignores case and surrounding whitespace. Anything
    that is not "critical" or "major" (including a missing, non-string or
    unrecognised severity) is counted as a minor, so no reported issue can
    vanish and produce a false "clean" result.

    Args:
        buildlog_dir: Path to buildlog directory.
        issues: List of issues from the gauntlet review.
        iteration: Current iteration number (for tracking).
        source: Optional source identifier for learnings. Defaults to
            ``"gauntlet:iteration-<iteration>"``.

    Returns:
        GauntletLoopResult with categorized issues and next action.
    """
    # Categorize by severity in a single pass.
    criticals: list[dict] = []
    majors: list[dict] = []
    minors: list[dict] = []
    for issue in issues:
        severity = issue.get("severity")
        label = severity.strip().lower() if isinstance(severity, str) else None
        if label == "critical":
            criticals.append(issue)
        elif label == "major":
            majors.append(issue)
        else:
            # "minor", "nitpick", missing, or unknown severities.
            minors.append(issue)

    # Persist learnings for this iteration
    learn_source = source or f"gauntlet:iteration-{iteration}"
    learn_result = learn_from_review(buildlog_dir, issues, learn_source)
    learnings_persisted = len(learn_result.new_learnings) + len(
        learn_result.reinforced_learnings
    )

    # Determine action
    action: Literal["fix_criticals", "checkpoint_majors", "checkpoint_minors", "clean"]
    if criticals:
        action = "fix_criticals"
        message = (
            f"Iteration {iteration}: {len(criticals)} critical, "
            f"{len(majors)} major, {len(minors)} minor. "
            f"Fix criticals (and majors) then re-run."
        )
    elif majors:
        action = "checkpoint_majors"
        message = (
            f"Iteration {iteration}: No criticals! "
            f"{len(majors)} major, {len(minors)} minor remain. "
            f"Continue clearing majors?"
        )
    elif minors:
        action = "checkpoint_minors"
        message = (
            f"Iteration {iteration}: Only {len(minors)} minor issues remain. "
            f"Accept risk or continue?"
        )
    else:
        action = "clean"
        message = f"Iteration {iteration}: All clear! No issues found."

    return GauntletLoopResult(
        action=action,
        criticals=criticals,
        majors=majors,
        minors=minors,
        iteration=iteration,
        learnings_persisted=learnings_persisted,
        message=message,
    )
|
|
2036
|
+
|
|
2037
|
+
|
|
2038
|
+
def gauntlet_accept_risk(
    remaining_issues: list[dict],
    create_github_issues: bool = False,
    repo: str | None = None,
) -> GauntletAcceptRiskResult:
    """Accept risk for remaining issues, optionally creating GitHub issues.

    When ``create_github_issues`` is true, one ``gh issue create`` call is
    made per remaining issue. A failing call does not abort the run: its
    stderr is collected and reported in ``error``. If the ``gh`` executable
    is missing, issue creation stops immediately.

    Args:
        remaining_issues: Issues being accepted as risk. Each is read for
            ``severity``, ``rule_learned``, ``description`` and ``location``.
            A missing or empty severity is treated as "minor".
        create_github_issues: Whether to create GitHub issues for tracking.
        repo: Repository for GitHub issues, passed as ``--repo``; omitted
            from the command when None.

    Returns:
        GauntletAcceptRiskResult with the number of accepted issues, the
        URLs of created GitHub issues and an error message if any call
        failed.
    """
    import subprocess

    def _sanitize_for_gh(text: str, max_len: int = 256) -> str:
        """Flatten newlines and cap the length of a GitHub issue field."""
        sanitized = text.replace("\n", " ").replace("\r", " ")
        if len(sanitized) > max_len:
            sanitized = sanitized[: max_len - 3] + "..."
        return sanitized.strip()

    github_urls: list[str] = []
    failures: list[str] = []
    error: str | None = None

    if create_github_issues:
        for issue in remaining_issues:
            severity = issue.get("severity") or "minor"
            rule = issue.get("rule_learned", issue.get("description", "Unknown"))
            description = issue.get("description", "")
            location = issue.get("location", "")

            # Sanitize inputs for GitHub issue creation.
            # The command is passed as an argument list (no shell), so this
            # is defense-in-depth; str() also guards against non-string
            # values, which subprocess would reject.
            safe_severity = _sanitize_for_gh(str(severity), 20)
            safe_rule = _sanitize_for_gh(str(rule), 200)
            safe_description = _sanitize_for_gh(str(description), 1000)
            safe_location = _sanitize_for_gh(str(location), 100)

            # Build issue body
            body_parts = [
                f"**Severity:** {safe_severity}",
                f"**Rule:** {safe_rule}",
                "",
                "## Description",
                safe_description,
            ]
            if safe_location:
                body_parts.extend(["", f"**Location:** `{safe_location}`"])
            body_parts.extend(
                [
                    "",
                    "---",
                    "_Created by buildlog gauntlet loop (accepted risk)_",
                ]
            )

            body = "\n".join(body_parts)
            title = f"[Gauntlet/{safe_severity}] {safe_rule[:60]}"

            # Create GitHub issue. The label uses the sanitized severity,
            # the same value shown in the title and body.
            cmd = [
                "gh",
                "issue",
                "create",
                "--title",
                title,
                "--body",
                body,
                "--label",
                safe_severity,
            ]
            if repo:
                cmd.extend(["--repo", repo])

            try:
                result = subprocess.run(cmd, capture_output=True, text=True, check=True)
            except subprocess.CalledProcessError as e:
                # Don't fail entirely, just note the error and carry on
                # with the remaining issues.
                failures.append(str(e.stderr))
            except FileNotFoundError:
                error = "gh CLI not found. Install GitHub CLI to create issues."
                break
            else:
                # gh issue create outputs the URL
                url = result.stdout.strip()
                if url:
                    github_urls.append(url)

    if error is None and failures:
        error = "Failed to create some GitHub issues: " + "; ".join(failures)

    accepted = len(remaining_issues)
    if create_github_issues:
        message = (
            f"Accepted {accepted} issues as risk. "
            f"Created {len(github_urls)} GitHub issues."
        )
    else:
        message = f"Accepted {accepted} issues as risk."

    return GauntletAcceptRiskResult(
        accepted_issues=accepted,
        github_issues_created=len(github_urls),
        github_issue_urls=github_urls,
        message=message,
        error=error,
    )
|