buildlog 0.6.1-py3-none-any.whl → 0.8.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- buildlog/__init__.py +1 -1
- buildlog/cli.py +589 -44
- buildlog/confidence.py +27 -0
- buildlog/core/__init__.py +12 -0
- buildlog/core/bandit.py +699 -0
- buildlog/core/operations.py +499 -11
- buildlog/distill.py +80 -1
- buildlog/engine/__init__.py +61 -0
- buildlog/engine/bandit.py +23 -0
- buildlog/engine/confidence.py +28 -0
- buildlog/engine/embeddings.py +28 -0
- buildlog/engine/experiments.py +619 -0
- buildlog/engine/types.py +31 -0
- buildlog/llm.py +461 -0
- buildlog/mcp/server.py +12 -6
- buildlog/mcp/tools.py +166 -13
- buildlog/render/__init__.py +19 -2
- buildlog/render/claude_md.py +74 -26
- buildlog/render/continue_dev.py +102 -0
- buildlog/render/copilot.py +100 -0
- buildlog/render/cursor.py +105 -0
- buildlog/render/tracking.py +20 -1
- buildlog/render/windsurf.py +95 -0
- buildlog/seeds.py +41 -0
- buildlog/skills.py +69 -6
- {buildlog-0.6.1.data → buildlog-0.8.0.data}/data/share/buildlog/copier.yml +0 -4
- buildlog-0.8.0.data/data/share/buildlog/template/buildlog/_TEMPLATE_QUICK.md +21 -0
- buildlog-0.8.0.dist-info/METADATA +151 -0
- buildlog-0.8.0.dist-info/RECORD +54 -0
- buildlog-0.6.1.dist-info/METADATA +0 -490
- buildlog-0.6.1.dist-info/RECORD +0 -41
- {buildlog-0.6.1.data → buildlog-0.8.0.data}/data/share/buildlog/post_gen.py +0 -0
- {buildlog-0.6.1.data → buildlog-0.8.0.data}/data/share/buildlog/template/buildlog/.gitkeep +0 -0
- {buildlog-0.6.1.data → buildlog-0.8.0.data}/data/share/buildlog/template/buildlog/2026-01-01-example.md +0 -0
- {buildlog-0.6.1.data → buildlog-0.8.0.data}/data/share/buildlog/template/buildlog/BUILDLOG_SYSTEM.md +0 -0
- {buildlog-0.6.1.data → buildlog-0.8.0.data}/data/share/buildlog/template/buildlog/_TEMPLATE.md +0 -0
- {buildlog-0.6.1.data → buildlog-0.8.0.data}/data/share/buildlog/template/buildlog/assets/.gitkeep +0 -0
- {buildlog-0.6.1.dist-info → buildlog-0.8.0.dist-info}/WHEEL +0 -0
- {buildlog-0.6.1.dist-info → buildlog-0.8.0.dist-info}/entry_points.txt +0 -0
- {buildlog-0.6.1.dist-info → buildlog-0.8.0.dist-info}/licenses/LICENSE +0 -0
buildlog/core/bandit.py
ADDED
@@ -0,0 +1,699 @@

"""Thompson Sampling Bandit for Contextual Rule Selection.

=============================================================================
CANONICAL EXAMPLE: Thompson Sampling with Beta-Bernoulli Distributions
=============================================================================

This module implements a contextual multi-armed bandit using Thompson Sampling
for automatic rule selection in buildlog. It serves as an instructive,
production-ready example of these fundamental concepts.

BACKGROUND: THE MULTI-ARMED BANDIT PROBLEM
------------------------------------------
Imagine you're in a casino with multiple slot machines ("arms"). Each machine
has an unknown probability of paying out. You want to maximize your winnings,
but you face a fundamental tension:

- EXPLOITATION: Play the machine that has paid best so far
- EXPLORATION: Try other machines to learn if they're actually better

This is the "explore-exploit tradeoff" - one of the most important concepts
in decision-making under uncertainty.

WHY THOMPSON SAMPLING?
----------------------
Thompson Sampling is an elegant Bayesian approach that naturally balances
exploration and exploitation:

1. Maintain a probability distribution over each arm's true reward rate
2. Sample from each distribution
3. Pick the arm with the highest sample

The magic: arms we're uncertain about have high-variance distributions,
so they occasionally produce high samples, causing us to explore them.
As we gather data, distributions narrow, and we naturally exploit.

BETA-BERNOULLI MODEL
--------------------
For binary outcomes (success/failure), we use:

- Prior: Beta(α, β) - our belief before seeing data
- Likelihood: Bernoulli - each observation is success (1) or failure (0)
- Posterior: Beta(α + successes, β + failures)

The Beta distribution is "conjugate" to Bernoulli, meaning the posterior
has the same form as the prior. This makes updates trivial:

    After observing a success: α → α + 1
    After observing a failure: β → β + 1

CONTEXTUAL EXTENSION
--------------------
"Contextual" means we maintain separate distributions per context. In buildlog:

- Context = error class (e.g., "type-errors", "api-design")
- Arms = rules (skills that should prevent mistakes)

A rule might be excellent for type errors but useless for API design.
Separate distributions let us learn this.

USAGE IN BUILDLOG
-----------------
1. Session starts → bandit.select() picks top-k rules for this error class
2. Mistake logged → bandit.update(reward=0) for rules that didn't help
3. Explicit reward → bandit.update(reward=value) for direct feedback

References:
- Thompson (1933). "On the likelihood that one unknown probability exceeds another"
- Russo et al. (2018). "A Tutorial on Thompson Sampling"
- https://en.wikipedia.org/wiki/Thompson_sampling
"""

from __future__ import annotations

import json
import random
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterator

__all__ = [
    "BetaParams",
    "BanditState",
    "ThompsonSamplingBandit",
    "DEFAULT_SEED_BOOST",
    "DEFAULT_CONTEXT",
]

# ============================================================================
# CONSTANTS
# ============================================================================

DEFAULT_SEED_BOOST = 2.0  # Extra α for seed rules (higher prior confidence)
DEFAULT_CONTEXT = "general"  # Fallback when no error class specified


# ============================================================================
# BETA DISTRIBUTION PARAMETERS
# ============================================================================


@dataclass
class BetaParams:
    """Parameters for a Beta distribution representing belief about a rule's effectiveness.

    The Beta distribution is parameterized by α (alpha) and β (beta):

        Beta(α, β) has mean = α / (α + β)

    Interpretation:
    - α represents "pseudo-successes" (prior + observed successes)
    - β represents "pseudo-failures" (prior + observed failures)

    With uninformative prior Beta(1, 1):
    - Uniform distribution over [0, 1]
    - Mean = 0.5 (maximum uncertainty)

    As we observe outcomes:
    - Success → α += 1 (distribution shifts right)
    - Failure → β += 1 (distribution shifts left)
    - More observations → distribution narrows (less uncertainty)

    Example evolution:
        Beta(1, 1)   → Uniform, mean=0.5, high variance
        Beta(3, 2)   → Skewed right, mean=0.6, moderate variance
        Beta(30, 20) → Peaked at 0.6, low variance (high confidence)

    Attributes:
        alpha: Pseudo-count of successes (must be > 0)
        beta: Pseudo-count of failures (must be > 0)
    """

    alpha: float = 1.0
    beta: float = 1.0

    def __post_init__(self) -> None:
        """Validate parameters."""
        if self.alpha <= 0 or self.beta <= 0:
            raise ValueError(
                f"Alpha and beta must be positive: α={self.alpha}, β={self.beta}"
            )

    def sample(self) -> float:
        """Draw a random sample from Beta(α, β).

        This is the core of Thompson Sampling: we sample from our belief
        distribution rather than using the mean. This naturally balances
        exploration (high variance → occasional high samples) and
        exploitation (high mean → consistently high samples).

        Returns:
            A value in [0, 1] representing a possible true reward rate.
        """
        return random.betavariate(self.alpha, self.beta)

    def update(self, reward: float) -> None:
        """Update posterior with observed reward.

        For Bernoulli rewards (0 or 1), this is exact Bayesian inference.
        For continuous rewards in [0, 1], this is an approximation that
        still works well in practice.

        Args:
            reward: Observed reward, typically in [0, 1].
                - 1.0 = full success (rule helped)
                - 0.0 = failure (rule didn't help)
                - Values in between for partial credit
        """
        self.alpha += reward
        self.beta += 1.0 - reward

    def mean(self) -> float:
        """Expected value of the distribution.

        This is our best point estimate of the arm's true reward rate.
        We don't use this for selection (we sample instead), but it's
        useful for reporting and debugging.

        Returns:
            E[X] = α / (α + β)
        """
        return self.alpha / (self.alpha + self.beta)

    def variance(self) -> float:
        """Variance of the distribution.

        Higher variance means more uncertainty. Thompson Sampling
        naturally explores high-variance arms because their samples
        occasionally exceed the mean.

        Returns:
            Var[X] = αβ / ((α + β)² × (α + β + 1))
        """
        total = self.alpha + self.beta
        return (self.alpha * self.beta) / (total * total * (total + 1))

    def confidence_interval(self, level: float = 0.95) -> tuple[float, float]:
        """Approximate confidence interval using normal approximation.

        For large α + β, the Beta distribution approaches normal.
        This gives us a quick sense of our uncertainty range.

        Args:
            level: Confidence level (default 0.95 for 95% CI).

        Returns:
            (lower, upper) bounds of the interval.
        """
        import math

        mean = self.mean()
        std = math.sqrt(self.variance())
        # Z-score for 95% CI is approximately 1.96
        z = 1.96 if level == 0.95 else 2.576 if level == 0.99 else 1.645

        lower = max(0.0, mean - z * std)
        upper = min(1.0, mean + z * std)
        return (lower, upper)

    def to_dict(self) -> dict[str, float]:
        """Serialize for storage."""
        return {"alpha": self.alpha, "beta": self.beta}

    @classmethod
    def from_dict(cls, data: dict[str, float]) -> BetaParams:
        """Deserialize from storage."""
        return cls(alpha=data["alpha"], beta=data["beta"])


# ============================================================================
# BANDIT STATE PERSISTENCE
# ============================================================================


@dataclass
class ArmRecord:
    """A single arm's state record for persistence.

    Stored as one line in the JSONL file.
    """

    context: str
    rule_id: str
    params: BetaParams
    is_seed: bool = False
    updated_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))

    def to_dict(self) -> dict:
        """Serialize for JSONL storage."""
        return {
            "context": self.context,
            "rule_id": self.rule_id,
            "alpha": self.params.alpha,
            "beta": self.params.beta,
            "is_seed": self.is_seed,
            "updated_at": self.updated_at.isoformat(),
        }

    @classmethod
    def from_dict(cls, data: dict) -> ArmRecord:
        """Deserialize from JSONL storage."""
        updated_at = datetime.fromisoformat(data["updated_at"])
        if updated_at.tzinfo is None:
            updated_at = updated_at.replace(tzinfo=timezone.utc)

        return cls(
            context=data["context"],
            rule_id=data["rule_id"],
            params=BetaParams(alpha=data["alpha"], beta=data["beta"]),
            is_seed=data.get("is_seed", False),
            updated_at=updated_at,
        )


@dataclass
class BanditState:
    """Persisted state for the contextual bandit.

    Structure:
        arms[context][rule_id] = BetaParams

    This allows O(1) lookup for any (context, rule) pair while
    maintaining separate belief distributions per context.

    Storage Format (JSONL):
        Each line is a JSON object representing one arm's state.
        We use append-only writes and compact on load to handle
        concurrent access and crash recovery gracefully.

    Example .buildlog/bandit_state.jsonl:
        {"context": "type-errors", "rule_id": "arch-123", "alpha": 3.0, "beta": 2.0, ...}
        {"context": "type-errors", "rule_id": "arch-123", "alpha": 4.0, "beta": 2.0, ...}

    The second line supersedes the first (same context + rule_id).
    """

    arms: dict[str, dict[str, BetaParams]] = field(default_factory=dict)
    seed_flags: dict[str, dict[str, bool]] = field(default_factory=dict)

    def get_params(self, context: str, rule_id: str) -> BetaParams | None:
        """Get parameters for a (context, rule) pair, if they exist."""
        return self.arms.get(context, {}).get(rule_id)

    def set_params(
        self,
        context: str,
        rule_id: str,
        params: BetaParams,
        is_seed: bool = False,
    ) -> None:
        """Set parameters for a (context, rule) pair."""
        if context not in self.arms:
            self.arms[context] = {}
            self.seed_flags[context] = {}
        self.arms[context][rule_id] = params
        self.seed_flags[context][rule_id] = is_seed

    def is_seed(self, context: str, rule_id: str) -> bool:
        """Check if a rule was initialized as a seed rule."""
        return self.seed_flags.get(context, {}).get(rule_id, False)

    def all_arms(self) -> Iterator[tuple[str, str, BetaParams]]:
        """Iterate over all (context, rule_id, params) tuples."""
        for context, rules in self.arms.items():
            for rule_id, params in rules.items():
                yield context, rule_id, params

    @classmethod
    def load(cls, path: Path) -> BanditState:
        """Load state from JSONL file, compacting duplicate entries.

        Because we append updates, the file may contain multiple entries
        for the same (context, rule_id). We keep only the latest.
        """
        state = cls()

        if not path.exists():
            return state

        # Read all records, keeping only the latest per (context, rule_id)
        records: dict[tuple[str, str], ArmRecord] = {}

        for line in path.read_text().strip().split("\n"):
            if not line:
                continue
            try:
                data = json.loads(line)
                record = ArmRecord.from_dict(data)
                key = (record.context, record.rule_id)

                # Keep if newer or first seen
                if key not in records or record.updated_at > records[key].updated_at:
                    records[key] = record
            except (json.JSONDecodeError, KeyError, ValueError):
                # Skip malformed lines (crash recovery)
                continue

        # Populate state from compacted records
        for (context, rule_id), record in records.items():
            state.set_params(context, rule_id, record.params, record.is_seed)

        return state

    def save(self, path: Path) -> None:
        """Save full state to JSONL file (compacted).

        This writes a fresh file with one line per arm, removing
        any historical duplicates from append-only updates.
        """
        path.parent.mkdir(parents=True, exist_ok=True)

        lines = []
        for context, rule_id, params in self.all_arms():
            record = ArmRecord(
                context=context,
                rule_id=rule_id,
                params=params,
                is_seed=self.is_seed(context, rule_id),
            )
            lines.append(json.dumps(record.to_dict()))

        path.write_text("\n".join(lines) + "\n" if lines else "")

    def append_update(self, path: Path, context: str, rule_id: str) -> None:
        """Append a single arm's update to the JSONL file.

        This is more efficient than rewriting the entire file for
        each update. The file will be compacted on next load.
        """
        path.parent.mkdir(parents=True, exist_ok=True)

        params = self.get_params(context, rule_id)
        if params is None:
            return

        record = ArmRecord(
            context=context,
            rule_id=rule_id,
            params=params,
            is_seed=self.is_seed(context, rule_id),
        )

        with open(path, "a") as f:
            f.write(json.dumps(record.to_dict()) + "\n")


# ============================================================================
# THOMPSON SAMPLING BANDIT
# ============================================================================


class ThompsonSamplingBandit:
    """Thompson Sampling bandit for contextual rule selection.

    This is the main interface for the bandit. It handles:

    1. SELECTION: Pick top-k rules for a given context
       - Sample from each rule's Beta distribution
       - Return rules with highest samples
       - Initialize new rules with appropriate priors

    2. UPDATES: Learn from feedback
       - Success (reward=1): rule helped prevent mistakes
       - Failure (reward=0): mistake occurred despite rule
       - Partial (0 < reward < 1): for nuanced feedback

    3. PERSISTENCE: State survives across sessions
       - Append-only writes for crash safety
       - Compact on load for efficiency

    Example usage:
        bandit = ThompsonSamplingBandit(buildlog_dir / "bandit_state.jsonl")

        # At session start: select rules
        selected = bandit.select(
            candidates=["rule-1", "rule-2", "rule-3"],
            context="type-errors",
            k=2,
        )
        # selected might be ["rule-2", "rule-1"] based on sampling

        # On mistake: negative feedback
        for rule_id in selected:
            bandit.update(rule_id, reward=0.0, context="type-errors")

        # On success: positive feedback
        bandit.update("rule-2", reward=1.0, context="type-errors")
    """

    def __init__(
        self,
        state_path: Path,
        seed_boost: float = DEFAULT_SEED_BOOST,
        default_context: str = DEFAULT_CONTEXT,
    ):
        """Initialize the bandit.

        Args:
            state_path: Path to JSONL file for persistence.
            seed_boost: Extra α for seed rules. Higher values mean
                seed rules start with higher assumed success rates.
                Default 2.0 means seed rules start as if they've
                already had 2 extra successes.
            default_context: Fallback context when none specified.
        """
        self.state_path = state_path
        self.seed_boost = seed_boost
        self.default_context = default_context
        self.state = BanditState.load(state_path)

    def select(
        self,
        candidates: list[str],
        context: str | None = None,
        k: int = 3,
        seed_rule_ids: set[str] | None = None,
    ) -> list[str]:
        """Select top-k rules using Thompson Sampling.

        This is where the magic happens:

        1. For each candidate rule, get or create its Beta distribution
        2. Sample from each distribution (not the mean!)
        3. Return the k rules with highest samples

        The sampling step is crucial: it means rules we're uncertain about
        (high variance) will occasionally beat rules with higher means,
        ensuring we explore enough to learn their true values.

        Args:
            candidates: List of rule IDs to choose from.
            context: Error class for contextual selection.
                Different contexts have independent distributions.
            k: Number of rules to select.
            seed_rule_ids: Set of rule IDs that are from seeds (axioms).
                These get boosted priors.

        Returns:
            List of k rule IDs, ordered by their sampled values (best first).
            If fewer than k candidates, returns all of them.
        """
        ctx = context or self.default_context
        seed_ids = seed_rule_ids or set()

        # Sample from each candidate's distribution
        samples: list[tuple[str, float]] = []

        for rule_id in candidates:
            params = self.state.get_params(ctx, rule_id)

            if params is None:
                # Initialize new arm
                is_seed = rule_id in seed_ids
                params = self._create_prior(is_seed)
                self.state.set_params(ctx, rule_id, params, is_seed)

            # THE KEY STEP: sample, don't use mean
            sample = params.sample()
            samples.append((rule_id, sample))

        # Sort by sampled value (descending) and take top k
        samples.sort(key=lambda x: x[1], reverse=True)
        selected = [rule_id for rule_id, _ in samples[:k]]

        # Persist any new arms we created
        self.state.save(self.state_path)

        return selected

    def update(
        self,
        rule_id: str,
        reward: float,
        context: str | None = None,
    ) -> None:
        """Update posterior for a rule based on observed reward.

        This is Bayesian learning in action:

            Prior: Beta(α, β)
            + Observation: reward r
            = Posterior: Beta(α + r, β + (1 - r))

        Over time, rules that consistently help will have high α,
        rules that don't help will have high β, and the bandit will
        naturally favor effective rules.

        Args:
            rule_id: The rule to update.
            reward: Observed reward in [0, 1].
                - 1.0: Rule helped (full success)
                - 0.0: Rule didn't help (failure)
                - 0.5: Partial credit
            context: Error class context.
        """
        ctx = context or self.default_context
        params = self.state.get_params(ctx, rule_id)

        if params is None:
            # Rule wasn't initialized yet - create with default prior
            params = self._create_prior(is_seed=False)
            self.state.set_params(ctx, rule_id, params, is_seed=False)

        # Bayesian update
        params.update(reward)

        # Persist (append-only for efficiency)
        self.state.append_update(self.state_path, ctx, rule_id)

    def batch_update(
        self,
        rule_ids: list[str],
        reward: float,
        context: str | None = None,
    ) -> None:
        """Update multiple rules with the same reward.

        Convenience method for updating all rules active during a session
        when a mistake occurs (reward=0) or when giving positive feedback
        (reward>0) to all active rules.

        Args:
            rule_ids: Rules to update.
            reward: Reward value for all rules.
            context: Error class context.
        """
        for rule_id in rule_ids:
            self.update(rule_id, reward, context)

    def get_stats(self, context: str | None = None) -> dict[str, dict]:
        """Get statistics for all rules in a context.

        Useful for debugging and reporting.

        Args:
            context: Error class to get stats for.
                If None, returns stats for all contexts.

        Returns:
            Dict mapping rule_id to stats dict with:
            - mean: Expected reward rate
            - alpha, beta: Distribution parameters
            - variance: Uncertainty measure
            - is_seed: Whether this is a seed rule
            - confidence_interval: 95% CI
        """
        stats: dict[str, dict] = {}

        if context is not None:
            contexts = [context]
        else:
            contexts = list(self.state.arms.keys())

        for ctx in contexts:
            rules = self.state.arms.get(ctx, {})
            for rule_id, params in rules.items():
                key = f"{ctx}:{rule_id}" if context is None else rule_id
                ci_low, ci_high = params.confidence_interval()
                stats[key] = {
                    "context": ctx,
                    "mean": round(params.mean(), 4),
                    "alpha": params.alpha,
                    "beta": params.beta,
                    "variance": round(params.variance(), 6),
                    "is_seed": self.state.is_seed(ctx, rule_id),
                    "confidence_interval": (round(ci_low, 4), round(ci_high, 4)),
                    "total_observations": params.alpha
                    + params.beta
                    - 2,  # Subtract prior
                }

        return stats

    def get_top_rules(
        self,
        context: str,
        k: int = 10,
    ) -> list[tuple[str, float]]:
        """Get top rules by expected value (not sampled).

        Unlike select(), this uses the mean rather than sampling.
        Useful for reporting "best rules so far" without the
        exploration randomness.

        Args:
            context: Error class.
            k: Number of rules to return.

        Returns:
            List of (rule_id, mean) tuples, sorted by mean descending.
        """
        rules = self.state.arms.get(context, {})
        ranked = [(rule_id, params.mean()) for rule_id, params in rules.items()]
        ranked.sort(key=lambda x: x[1], reverse=True)
        return ranked[:k]

    def _create_prior(self, is_seed: bool) -> BetaParams:
        """Create prior distribution for a new arm.

        Seed rules (from gauntlet personas / axioms) get a boosted prior,
        reflecting our belief that curated rules are likely effective.

        Non-seed rules get the uninformative Beta(1, 1) prior,
        meaning we start with maximum uncertainty about their value.

        Args:
            is_seed: Whether this rule comes from seeds.

        Returns:
            BetaParams with appropriate prior.
        """
        if is_seed:
            # Boosted prior: as if rule already had seed_boost successes
            # Beta(1 + boost, 1) → mean = (1 + boost) / (2 + boost)
            # With boost=2: mean = 3/4 = 0.75 (optimistic)
            return BetaParams(alpha=1.0 + self.seed_boost, beta=1.0)
        else:
            # Uninformative prior: maximum uncertainty
            # Beta(1, 1) is uniform → mean = 0.5
            return BetaParams(alpha=1.0, beta=1.0)

    def reset(self, context: str | None = None) -> None:
        """Reset bandit state.

        Use with caution - this discards learned information.

        Args:
            context: If provided, only reset this context.
                If None, reset everything.
        """
        if context is None:
            self.state = BanditState()
        else:
            if context in self.state.arms:
                del self.state.arms[context]
            if context in self.state.seed_flags:
                del self.state.seed_flags[context]

        self.state.save(self.state_path)
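
To make the Beta-Bernoulli mechanics described in the module docstring concrete, here is a minimal, self-contained sketch. It is not part of the wheel: the rule names and their "true" success rates are invented, and only the standard library is used. It runs the sample-argmax-update loop the docstring describes and shows the posteriors converging toward the best arm.

# Illustrative sketch only -- not included in the buildlog package.
import random

random.seed(0)

# Hypothetical arms with true success rates unknown to the bandit.
true_rates = {"rule-a": 0.7, "rule-b": 0.4, "rule-c": 0.55}

# Beta(1, 1) priors: alpha counts pseudo-successes, beta pseudo-failures.
posterior = {rule: {"alpha": 1.0, "beta": 1.0} for rule in true_rates}

for _ in range(500):
    # Thompson Sampling: draw one sample per arm, play the argmax.
    draws = {
        rule: random.betavariate(p["alpha"], p["beta"])
        for rule, p in posterior.items()
    }
    chosen = max(draws, key=draws.get)

    # Observe a Bernoulli reward and apply the conjugate update.
    reward = 1.0 if random.random() < true_rates[chosen] else 0.0
    posterior[chosen]["alpha"] += reward
    posterior[chosen]["beta"] += 1.0 - reward

for rule, p in posterior.items():
    mean = p["alpha"] / (p["alpha"] + p["beta"])
    pulls = p["alpha"] + p["beta"] - 2  # subtract the Beta(1, 1) prior
    print(f"{rule}: posterior mean={mean:.2f} after {pulls:.0f} pulls")

Arms that pay off accumulate α faster than β, so their posterior means rise and they are chosen more often, while the uniform priors keep early selection close to random.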
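
The select/update cycle described under USAGE IN BUILDLOG can also be sketched end to end. Only the ThompsonSamplingBandit API itself comes from the diff above; the rule IDs, the "type-errors" context, and the .buildlog/bandit_state.jsonl location are assumptions for illustration, and the actual CLI/MCP wiring added in this release is not shown here.

# Hypothetical end-to-end usage, adapted from the module docstring's example.
from pathlib import Path

from buildlog.core.bandit import ThompsonSamplingBandit

bandit = ThompsonSamplingBandit(Path(".buildlog") / "bandit_state.jsonl")

# Session start: pick the two most promising rules for this error class.
selected = bandit.select(
    candidates=["rule-1", "rule-2", "rule-3"],
    context="type-errors",
    k=2,
)

# A mistake slipped through despite the selected rules: negative feedback.
bandit.batch_update(selected, reward=0.0, context="type-errors")

# Explicit positive feedback for one rule that clearly helped.
bandit.update("rule-2", reward=1.0, context="type-errors")

# Inspect learned state (posterior means, variances, 95% CIs).
print(bandit.get_stats(context="type-errors"))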
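
The append-then-compact persistence that BanditState documents can be exercised directly as well. This is another hedged sketch, not package code: the temporary location and the "arch-123" rule ID are made up, and it simply drives the load/save/append_update methods shown in the diff.

# Hypothetical demonstration of the append-only JSONL model.
import tempfile
from pathlib import Path

from buildlog.core.bandit import BanditState, BetaParams

state_file = Path(tempfile.mkdtemp()) / "bandit_state.jsonl"

state = BanditState()
state.set_params("type-errors", "arch-123", BetaParams(alpha=3.0, beta=2.0))
state.save(state_file)  # writes one compacted line

# An incremental update mutates the arm and appends a second line
# for the same (context, rule_id) instead of rewriting the file.
state.get_params("type-errors", "arch-123").update(1.0)  # alpha 3.0 -> 4.0
state.append_update(state_file, "type-errors", "arch-123")

# load() compacts duplicates, keeping the newest record per (context, rule_id).
reloaded = BanditState.load(state_file)
params = reloaded.get_params("type-errors", "arch-123")
print(params.alpha, params.beta)  # expected: 4.0 2.0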