buildlog-0.6.1-py3-none-any.whl → buildlog-0.8.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. buildlog/__init__.py +1 -1
  2. buildlog/cli.py +589 -44
  3. buildlog/confidence.py +27 -0
  4. buildlog/core/__init__.py +12 -0
  5. buildlog/core/bandit.py +699 -0
  6. buildlog/core/operations.py +499 -11
  7. buildlog/distill.py +80 -1
  8. buildlog/engine/__init__.py +61 -0
  9. buildlog/engine/bandit.py +23 -0
  10. buildlog/engine/confidence.py +28 -0
  11. buildlog/engine/embeddings.py +28 -0
  12. buildlog/engine/experiments.py +619 -0
  13. buildlog/engine/types.py +31 -0
  14. buildlog/llm.py +461 -0
  15. buildlog/mcp/server.py +12 -6
  16. buildlog/mcp/tools.py +166 -13
  17. buildlog/render/__init__.py +19 -2
  18. buildlog/render/claude_md.py +74 -26
  19. buildlog/render/continue_dev.py +102 -0
  20. buildlog/render/copilot.py +100 -0
  21. buildlog/render/cursor.py +105 -0
  22. buildlog/render/tracking.py +20 -1
  23. buildlog/render/windsurf.py +95 -0
  24. buildlog/seeds.py +41 -0
  25. buildlog/skills.py +69 -6
  26. {buildlog-0.6.1.data → buildlog-0.8.0.data}/data/share/buildlog/copier.yml +0 -4
  27. buildlog-0.8.0.data/data/share/buildlog/template/buildlog/_TEMPLATE_QUICK.md +21 -0
  28. buildlog-0.8.0.dist-info/METADATA +151 -0
  29. buildlog-0.8.0.dist-info/RECORD +54 -0
  30. buildlog-0.6.1.dist-info/METADATA +0 -490
  31. buildlog-0.6.1.dist-info/RECORD +0 -41
  32. {buildlog-0.6.1.data → buildlog-0.8.0.data}/data/share/buildlog/post_gen.py +0 -0
  33. {buildlog-0.6.1.data → buildlog-0.8.0.data}/data/share/buildlog/template/buildlog/.gitkeep +0 -0
  34. {buildlog-0.6.1.data → buildlog-0.8.0.data}/data/share/buildlog/template/buildlog/2026-01-01-example.md +0 -0
  35. {buildlog-0.6.1.data → buildlog-0.8.0.data}/data/share/buildlog/template/buildlog/BUILDLOG_SYSTEM.md +0 -0
  36. {buildlog-0.6.1.data → buildlog-0.8.0.data}/data/share/buildlog/template/buildlog/_TEMPLATE.md +0 -0
  37. {buildlog-0.6.1.data → buildlog-0.8.0.data}/data/share/buildlog/template/buildlog/assets/.gitkeep +0 -0
  38. {buildlog-0.6.1.dist-info → buildlog-0.8.0.dist-info}/WHEEL +0 -0
  39. {buildlog-0.6.1.dist-info → buildlog-0.8.0.dist-info}/entry_points.txt +0 -0
  40. {buildlog-0.6.1.dist-info → buildlog-0.8.0.dist-info}/licenses/LICENSE +0 -0
buildlog/core/bandit.py
@@ -0,0 +1,699 @@
+"""Thompson Sampling Bandit for Contextual Rule Selection.
+
+=============================================================================
+CANONICAL EXAMPLE: Thompson Sampling with Beta-Bernoulli Distributions
+=============================================================================
+
+This module implements a contextual multi-armed bandit using Thompson Sampling
+for automatic rule selection in buildlog. It serves as an instructive,
+production-ready example of these fundamental concepts.
+
+BACKGROUND: THE MULTI-ARMED BANDIT PROBLEM
+------------------------------------------
+Imagine you're in a casino with multiple slot machines ("arms"). Each machine
+has an unknown probability of paying out. You want to maximize your winnings,
+but you face a fundamental tension:
+
+- EXPLOITATION: Play the machine that has paid best so far
+- EXPLORATION: Try other machines to learn if they're actually better
+
+This is the "explore-exploit tradeoff" - one of the most important concepts
+in decision-making under uncertainty.
+
+WHY THOMPSON SAMPLING?
+----------------------
+Thompson Sampling is an elegant Bayesian approach that naturally balances
+exploration and exploitation:
+
+1. Maintain a probability distribution over each arm's true reward rate
+2. Sample from each distribution
+3. Pick the arm with the highest sample
+
+The magic: arms we're uncertain about have high-variance distributions,
+so they occasionally produce high samples, causing us to explore them.
+As we gather data, distributions narrow, and we naturally exploit.
+
+BETA-BERNOULLI MODEL
+--------------------
+For binary outcomes (success/failure), we use:
+
+- Prior: Beta(α, β) - our belief before seeing data
+- Likelihood: Bernoulli - each observation is success (1) or failure (0)
+- Posterior: Beta(α + successes, β + failures)
+
+The Beta distribution is "conjugate" to Bernoulli, meaning the posterior
+has the same form as the prior. This makes updates trivial:
+
+    After observing a success: α → α + 1
+    After observing a failure: β → β + 1
+
+CONTEXTUAL EXTENSION
+--------------------
+"Contextual" means we maintain separate distributions per context. In buildlog:
+
+- Context = error class (e.g., "type-errors", "api-design")
+- Arms = rules (skills that should prevent mistakes)
+
+A rule might be excellent for type errors but useless for API design.
+Separate distributions let us learn this.
+
+USAGE IN BUILDLOG
+-----------------
+1. Session starts → bandit.select() picks top-k rules for this error class
+2. Mistake logged → bandit.update(reward=0) for rules that didn't help
+3. Explicit reward → bandit.update(reward=value) for direct feedback
+
+References:
+- Thompson (1933). "On the likelihood that one unknown probability exceeds another"
+- Russo et al. (2018). "A Tutorial on Thompson Sampling"
+- https://en.wikipedia.org/wiki/Thompson_sampling
+"""
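For readers skimming this new module, the three-step loop described in the docstring can be demonstrated with a few lines of standalone Python. The sketch below is illustrative only and is not part of the package; the arm names and their true payout rates are invented:

    import random

    # Invented arms with hidden true success rates (unknown to the algorithm).
    true_rates = {"rule-a": 0.7, "rule-b": 0.4, "rule-c": 0.55}

    # Every arm starts at the uninformative prior Beta(1, 1).
    posterior = {arm: [1.0, 1.0] for arm in true_rates}  # [alpha, beta]

    for _ in range(1000):
        # Steps 1-2: draw one sample from each arm's current Beta distribution.
        samples = {arm: random.betavariate(a, b) for arm, (a, b) in posterior.items()}
        # Step 3: play the arm whose sample is highest.
        chosen = max(samples, key=samples.get)
        reward = 1.0 if random.random() < true_rates[chosen] else 0.0
        # Conjugate update: a success bumps alpha, a failure bumps beta.
        posterior[chosen][0] += reward
        posterior[chosen][1] += 1.0 - reward

    for arm, (a, b) in posterior.items():
        print(arm, round(a / (a + b), 3), "after", int(a + b - 2), "plays")

After enough rounds most pulls concentrate on the best arm, while the other arms keep receiving occasional exploratory draws.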
+
+from __future__ import annotations
+
+import json
+import random
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Iterator
+
+__all__ = [
+    "BetaParams",
+    "BanditState",
+    "ThompsonSamplingBandit",
+    "DEFAULT_SEED_BOOST",
+    "DEFAULT_CONTEXT",
+]
+
+# ============================================================================
+# CONSTANTS
+# ============================================================================
+
+DEFAULT_SEED_BOOST = 2.0  # Extra α for seed rules (higher prior confidence)
+DEFAULT_CONTEXT = "general"  # Fallback when no error class specified
+
+
+# ============================================================================
+# BETA DISTRIBUTION PARAMETERS
+# ============================================================================
+
+
+@dataclass
+class BetaParams:
+    """Parameters for a Beta distribution representing belief about a rule's effectiveness.
+
+    The Beta distribution is parameterized by α (alpha) and β (beta):
+
+        Beta(α, β) has mean = α / (α + β)
+
+    Interpretation:
+    - α represents "pseudo-successes" (prior + observed successes)
+    - β represents "pseudo-failures" (prior + observed failures)
+
+    With uninformative prior Beta(1, 1):
+    - Uniform distribution over [0, 1]
+    - Mean = 0.5 (maximum uncertainty)
+
+    As we observe outcomes:
+    - Success → α += 1 (distribution shifts right)
+    - Failure → β += 1 (distribution shifts left)
+    - More observations → distribution narrows (less uncertainty)
+
+    Example evolution:
+        Beta(1, 1) → Uniform, mean=0.5, high variance
+        Beta(3, 2) → Skewed right, mean=0.6, moderate variance
+        Beta(30, 20) → Peaked at 0.6, low variance (high confidence)
+
+    Attributes:
+        alpha: Pseudo-count of successes (must be > 0)
+        beta: Pseudo-count of failures (must be > 0)
+    """
+
+    alpha: float = 1.0
+    beta: float = 1.0
+
+    def __post_init__(self) -> None:
+        """Validate parameters."""
+        if self.alpha <= 0 or self.beta <= 0:
+            raise ValueError(
+                f"Alpha and beta must be positive: α={self.alpha}, β={self.beta}"
+            )
+
+    def sample(self) -> float:
+        """Draw a random sample from Beta(α, β).
+
+        This is the core of Thompson Sampling: we sample from our belief
+        distribution rather than using the mean. This naturally balances
+        exploration (high variance → occasional high samples) and
+        exploitation (high mean → consistently high samples).
+
+        Returns:
+            A value in [0, 1] representing a possible true reward rate.
+        """
+        return random.betavariate(self.alpha, self.beta)
+
+    def update(self, reward: float) -> None:
+        """Update posterior with observed reward.
+
+        For Bernoulli rewards (0 or 1), this is exact Bayesian inference.
+        For continuous rewards in [0, 1], this is an approximation that
+        still works well in practice.
+
+        Args:
+            reward: Observed reward, typically in [0, 1].
+                - 1.0 = full success (rule helped)
+                - 0.0 = failure (rule didn't help)
+                - Values in between for partial credit
+        """
+        self.alpha += reward
+        self.beta += 1.0 - reward
+
+    def mean(self) -> float:
+        """Expected value of the distribution.
+
+        This is our best point estimate of the arm's true reward rate.
+        We don't use this for selection (we sample instead), but it's
+        useful for reporting and debugging.
+
+        Returns:
+            E[X] = α / (α + β)
+        """
+        return self.alpha / (self.alpha + self.beta)
+
+    def variance(self) -> float:
+        """Variance of the distribution.
+
+        Higher variance means more uncertainty. Thompson Sampling
+        naturally explores high-variance arms because their samples
+        occasionally exceed the mean.
+
+        Returns:
+            Var[X] = αβ / ((α + β)² × (α + β + 1))
+        """
+        total = self.alpha + self.beta
+        return (self.alpha * self.beta) / (total * total * (total + 1))
+
+    def confidence_interval(self, level: float = 0.95) -> tuple[float, float]:
+        """Approximate confidence interval using normal approximation.
+
+        For large α + β, the Beta distribution approaches normal.
+        This gives us a quick sense of our uncertainty range.
+
+        Args:
+            level: Confidence level (default 0.95 for 95% CI).
+
+        Returns:
+            (lower, upper) bounds of the interval.
+        """
+        import math
+
+        mean = self.mean()
+        std = math.sqrt(self.variance())
+        # Z-score for 95% CI is approximately 1.96
+        z = 1.96 if level == 0.95 else 2.576 if level == 0.99 else 1.645
+
+        lower = max(0.0, mean - z * std)
+        upper = min(1.0, mean + z * std)
+        return (lower, upper)
+
+    def to_dict(self) -> dict[str, float]:
+        """Serialize for storage."""
+        return {"alpha": self.alpha, "beta": self.beta}
+
+    @classmethod
+    def from_dict(cls, data: dict[str, float]) -> BetaParams:
+        """Deserialize from storage."""
+        return cls(alpha=data["alpha"], beta=data["beta"])
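A quick worked example of the class above, assuming the module is importable as buildlog.core.bandit once this wheel is installed; the reward sequence is invented:

    from buildlog.core.bandit import BetaParams

    p = BetaParams()                 # uninformative prior Beta(1, 1), mean 0.5
    for r in (1.0, 1.0, 0.0, 1.0):   # three successes, one failure
        p.update(r)                  # conjugate update: alpha += r, beta += 1 - r
    print(p.alpha, p.beta)           # 4.0 2.0, i.e. posterior Beta(4, 2)
    print(round(p.mean(), 3))        # 0.667
    print(p.confidence_interval())   # about (0.32, 1.0), still wide after 4 observations

Four observations shift the mean to roughly 0.67, but the interval stays wide; that residual uncertainty is exactly what keeps Thompson Sampling exploring early on.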
+
+
+# ============================================================================
+# BANDIT STATE PERSISTENCE
+# ============================================================================
+
+
+@dataclass
+class ArmRecord:
+    """A single arm's state record for persistence.
+
+    Stored as one line in the JSONL file.
+    """
+
+    context: str
+    rule_id: str
+    params: BetaParams
+    is_seed: bool = False
+    updated_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
+
+    def to_dict(self) -> dict:
+        """Serialize for JSONL storage."""
+        return {
+            "context": self.context,
+            "rule_id": self.rule_id,
+            "alpha": self.params.alpha,
+            "beta": self.params.beta,
+            "is_seed": self.is_seed,
+            "updated_at": self.updated_at.isoformat(),
+        }
+
+    @classmethod
+    def from_dict(cls, data: dict) -> ArmRecord:
+        """Deserialize from JSONL storage."""
+        updated_at = datetime.fromisoformat(data["updated_at"])
+        if updated_at.tzinfo is None:
+            updated_at = updated_at.replace(tzinfo=timezone.utc)
+
+        return cls(
+            context=data["context"],
+            rule_id=data["rule_id"],
+            params=BetaParams(alpha=data["alpha"], beta=data["beta"]),
+            is_seed=data.get("is_seed", False),
+            updated_at=updated_at,
+        )
+
+
+@dataclass
+class BanditState:
+    """Persisted state for the contextual bandit.
+
+    Structure:
+        arms[context][rule_id] = BetaParams
+
+    This allows O(1) lookup for any (context, rule) pair while
+    maintaining separate belief distributions per context.
+
+    Storage Format (JSONL):
+        Each line is a JSON object representing one arm's state.
+        We use append-only writes and compact on load to handle
+        concurrent access and crash recovery gracefully.
+
+    Example .buildlog/bandit_state.jsonl:
+        {"context": "type-errors", "rule_id": "arch-123", "alpha": 3.0, "beta": 2.0, ...}
+        {"context": "type-errors", "rule_id": "arch-123", "alpha": 4.0, "beta": 2.0, ...}
+
+    The second line supersedes the first (same context + rule_id).
+    """
+
+    arms: dict[str, dict[str, BetaParams]] = field(default_factory=dict)
+    seed_flags: dict[str, dict[str, bool]] = field(default_factory=dict)
+
+    def get_params(self, context: str, rule_id: str) -> BetaParams | None:
+        """Get parameters for a (context, rule) pair, if they exist."""
+        return self.arms.get(context, {}).get(rule_id)
+
+    def set_params(
+        self,
+        context: str,
+        rule_id: str,
+        params: BetaParams,
+        is_seed: bool = False,
+    ) -> None:
+        """Set parameters for a (context, rule) pair."""
+        if context not in self.arms:
+            self.arms[context] = {}
+            self.seed_flags[context] = {}
+        self.arms[context][rule_id] = params
+        self.seed_flags[context][rule_id] = is_seed
+
+    def is_seed(self, context: str, rule_id: str) -> bool:
+        """Check if a rule was initialized as a seed rule."""
+        return self.seed_flags.get(context, {}).get(rule_id, False)
+
+    def all_arms(self) -> Iterator[tuple[str, str, BetaParams]]:
+        """Iterate over all (context, rule_id, params) tuples."""
+        for context, rules in self.arms.items():
+            for rule_id, params in rules.items():
+                yield context, rule_id, params
+
+    @classmethod
+    def load(cls, path: Path) -> BanditState:
+        """Load state from JSONL file, compacting duplicate entries.
+
+        Because we append updates, the file may contain multiple entries
+        for the same (context, rule_id). We keep only the latest.
+        """
+        state = cls()
+
+        if not path.exists():
+            return state
+
+        # Read all records, keeping only the latest per (context, rule_id)
+        records: dict[tuple[str, str], ArmRecord] = {}
+
+        for line in path.read_text().strip().split("\n"):
+            if not line:
+                continue
+            try:
+                data = json.loads(line)
+                record = ArmRecord.from_dict(data)
+                key = (record.context, record.rule_id)
+
+                # Keep if newer or first seen
+                if key not in records or record.updated_at > records[key].updated_at:
+                    records[key] = record
+            except (json.JSONDecodeError, KeyError, ValueError):
+                # Skip malformed lines (crash recovery)
+                continue
+
+        # Populate state from compacted records
+        for (context, rule_id), record in records.items():
+            state.set_params(context, rule_id, record.params, record.is_seed)
+
+        return state
+
+    def save(self, path: Path) -> None:
+        """Save full state to JSONL file (compacted).
+
+        This writes a fresh file with one line per arm, removing
+        any historical duplicates from append-only updates.
+        """
+        path.parent.mkdir(parents=True, exist_ok=True)
+
+        lines = []
+        for context, rule_id, params in self.all_arms():
+            record = ArmRecord(
+                context=context,
+                rule_id=rule_id,
+                params=params,
+                is_seed=self.is_seed(context, rule_id),
+            )
+            lines.append(json.dumps(record.to_dict()))
+
+        path.write_text("\n".join(lines) + "\n" if lines else "")
+
+    def append_update(self, path: Path, context: str, rule_id: str) -> None:
+        """Append a single arm's update to the JSONL file.
+
+        This is more efficient than rewriting the entire file for
+        each update. The file will be compacted on next load.
+        """
+        path.parent.mkdir(parents=True, exist_ok=True)
+
+        params = self.get_params(context, rule_id)
+        if params is None:
+            return
+
+        record = ArmRecord(
+            context=context,
+            rule_id=rule_id,
+            params=params,
+            is_seed=self.is_seed(context, rule_id),
+        )
+
+        with open(path, "a") as f:
+            f.write(json.dumps(record.to_dict()) + "\n")
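A small round trip of the append-then-compact behaviour documented above, assuming the classes in this module; the path, context, and rule ID are placeholders:

    from pathlib import Path
    from buildlog.core.bandit import BanditState, BetaParams

    path = Path("/tmp/bandit_demo.jsonl")  # throwaway path for the demo

    state = BanditState()
    state.set_params("type-errors", "arch-123", BetaParams(alpha=3.0, beta=2.0))
    state.append_update(path, "type-errors", "arch-123")  # first JSONL line
    state.set_params("type-errors", "arch-123", BetaParams(alpha=4.0, beta=2.0))
    state.append_update(path, "type-errors", "arch-123")  # second line, newer timestamp

    reloaded = BanditState.load(path)  # compaction keeps only the latest record per arm
    print(reloaded.get_params("type-errors", "arch-123").alpha)  # 4.0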
+
+
+# ============================================================================
+# THOMPSON SAMPLING BANDIT
+# ============================================================================
+
+
+class ThompsonSamplingBandit:
+    """Thompson Sampling bandit for contextual rule selection.
+
+    This is the main interface for the bandit. It handles:
+
+    1. SELECTION: Pick top-k rules for a given context
+       - Sample from each rule's Beta distribution
+       - Return rules with highest samples
+       - Initialize new rules with appropriate priors
+
+    2. UPDATES: Learn from feedback
+       - Success (reward=1): rule helped prevent mistakes
+       - Failure (reward=0): mistake occurred despite rule
+       - Partial (0 < reward < 1): for nuanced feedback
+
+    3. PERSISTENCE: State survives across sessions
+       - Append-only writes for crash safety
+       - Compact on load for efficiency
+
+    Example usage:
+        bandit = ThompsonSamplingBandit(buildlog_dir / "bandit_state.jsonl")
+
+        # At session start: select rules
+        selected = bandit.select(
+            candidates=["rule-1", "rule-2", "rule-3"],
+            context="type-errors",
+            k=2,
+        )
+        # selected might be ["rule-2", "rule-1"] based on sampling
+
+        # On mistake: negative feedback
+        for rule_id in selected:
+            bandit.update(rule_id, reward=0.0, context="type-errors")
+
+        # On success: positive feedback
+        bandit.update("rule-2", reward=1.0, context="type-errors")
+    """
+
+    def __init__(
+        self,
+        state_path: Path,
+        seed_boost: float = DEFAULT_SEED_BOOST,
+        default_context: str = DEFAULT_CONTEXT,
+    ):
+        """Initialize the bandit.
+
+        Args:
+            state_path: Path to JSONL file for persistence.
+            seed_boost: Extra α for seed rules. Higher values mean
+                seed rules start with higher assumed success rates.
+                Default 2.0 means seed rules start as if they've
+                already had 2 extra successes.
+            default_context: Fallback context when none specified.
+        """
+        self.state_path = state_path
+        self.seed_boost = seed_boost
+        self.default_context = default_context
+        self.state = BanditState.load(state_path)
+
+    def select(
+        self,
+        candidates: list[str],
+        context: str | None = None,
+        k: int = 3,
+        seed_rule_ids: set[str] | None = None,
+    ) -> list[str]:
+        """Select top-k rules using Thompson Sampling.
+
+        This is where the magic happens:
+
+        1. For each candidate rule, get or create its Beta distribution
+        2. Sample from each distribution (not the mean!)
+        3. Return the k rules with highest samples
+
+        The sampling step is crucial: it means rules we're uncertain about
+        (high variance) will occasionally beat rules with higher means,
+        ensuring we explore enough to learn their true values.
+
+        Args:
+            candidates: List of rule IDs to choose from.
+            context: Error class for contextual selection.
+                Different contexts have independent distributions.
+            k: Number of rules to select.
+            seed_rule_ids: Set of rule IDs that are from seeds (axioms).
+                These get boosted priors.
+
+        Returns:
+            List of k rule IDs, ordered by their sampled values (best first).
+            If fewer than k candidates, returns all of them.
+        """
+        ctx = context or self.default_context
+        seed_ids = seed_rule_ids or set()
+
+        # Sample from each candidate's distribution
+        samples: list[tuple[str, float]] = []
+
+        for rule_id in candidates:
+            params = self.state.get_params(ctx, rule_id)
+
+            if params is None:
+                # Initialize new arm
+                is_seed = rule_id in seed_ids
+                params = self._create_prior(is_seed)
+                self.state.set_params(ctx, rule_id, params, is_seed)
+
+            # THE KEY STEP: sample, don't use mean
+            sample = params.sample()
+            samples.append((rule_id, sample))
+
+        # Sort by sampled value (descending) and take top k
+        samples.sort(key=lambda x: x[1], reverse=True)
+        selected = [rule_id for rule_id, _ in samples[:k]]
+
+        # Persist any new arms we created
+        self.state.save(self.state_path)
+
+        return selected
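To see why select() samples rather than ranking by the mean, here is a toy comparison outside the package; the two arms and their counts are invented:

    import random
    from collections import Counter

    proven = (12.0, 4.0)  # Beta(12, 4): mean 0.75, fairly narrow
    fresh = (1.0, 1.0)    # Beta(1, 1): uninformative prior, mean 0.5, very wide

    wins = Counter()
    for _ in range(10_000):
        if random.betavariate(*fresh) > random.betavariate(*proven):
            wins["fresh"] += 1
        else:
            wins["proven"] += 1
    print(wins)

Even though its mean is noticeably lower, the uncertain arm still produces the higher sample in roughly a quarter of the draws, so a new rule keeps getting selected often enough for its posterior to sharpen.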
+
+    def update(
+        self,
+        rule_id: str,
+        reward: float,
+        context: str | None = None,
+    ) -> None:
+        """Update posterior for a rule based on observed reward.
+
+        This is Bayesian learning in action:
+
+            Prior: Beta(α, β)
+            + Observation: reward r
+            = Posterior: Beta(α + r, β + (1 - r))
+
+        Over time, rules that consistently help will have high α,
+        rules that don't help will have high β, and the bandit will
+        naturally favor effective rules.
+
+        Args:
+            rule_id: The rule to update.
+            reward: Observed reward in [0, 1].
+                - 1.0: Rule helped (full success)
+                - 0.0: Rule didn't help (failure)
+                - 0.5: Partial credit
+            context: Error class context.
+        """
+        ctx = context or self.default_context
+        params = self.state.get_params(ctx, rule_id)
+
+        if params is None:
+            # Rule wasn't initialized yet - create with default prior
+            params = self._create_prior(is_seed=False)
+            self.state.set_params(ctx, rule_id, params, is_seed=False)
+
+        # Bayesian update
+        params.update(reward)
+
+        # Persist (append-only for efficiency)
+        self.state.append_update(self.state_path, ctx, rule_id)
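A minimal sketch of this update path end to end, again assuming the buildlog.core.bandit import path; the rule ID, error class, and temporary state file are placeholders:

    import tempfile
    from pathlib import Path
    from buildlog.core.bandit import ThompsonSamplingBandit

    state_file = Path(tempfile.mkdtemp()) / "bandit_state.jsonl"
    bandit = ThompsonSamplingBandit(state_file)

    # A rule never selected before: update() first creates it at Beta(1, 1).
    bandit.update("rule-docstrings", reward=1.0, context="type-errors")
    bandit.update("rule-docstrings", reward=0.0, context="type-errors")

    # One success and one failure on top of the prior leaves Beta(2, 2).
    print(bandit.get_stats("type-errors")["rule-docstrings"]["mean"])  # 0.5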
+
+    def batch_update(
+        self,
+        rule_ids: list[str],
+        reward: float,
+        context: str | None = None,
+    ) -> None:
+        """Update multiple rules with the same reward.
+
+        Convenience method for updating all rules active during a session
+        when a mistake occurs (reward=0) or when giving positive feedback
+        (reward>0) to all active rules.
+
+        Args:
+            rule_ids: Rules to update.
+            reward: Reward value for all rules.
+            context: Error class context.
+        """
+        for rule_id in rule_ids:
+            self.update(rule_id, reward, context)
+
+    def get_stats(self, context: str | None = None) -> dict[str, dict]:
+        """Get statistics for all rules in a context.
+
+        Useful for debugging and reporting.
+
+        Args:
+            context: Error class to get stats for.
+                If None, returns stats for all contexts.
+
+        Returns:
+            Dict mapping rule_id to stats dict with:
+            - mean: Expected reward rate
+            - alpha, beta: Distribution parameters
+            - variance: Uncertainty measure
+            - is_seed: Whether this is a seed rule
+            - confidence_interval: 95% CI
+        """
+        stats: dict[str, dict] = {}
+
+        if context is not None:
+            contexts = [context]
+        else:
+            contexts = list(self.state.arms.keys())
+
+        for ctx in contexts:
+            rules = self.state.arms.get(ctx, {})
+            for rule_id, params in rules.items():
+                key = f"{ctx}:{rule_id}" if context is None else rule_id
+                ci_low, ci_high = params.confidence_interval()
+                stats[key] = {
+                    "context": ctx,
+                    "mean": round(params.mean(), 4),
+                    "alpha": params.alpha,
+                    "beta": params.beta,
+                    "variance": round(params.variance(), 6),
+                    "is_seed": self.state.is_seed(ctx, rule_id),
+                    "confidence_interval": (round(ci_low, 4), round(ci_high, 4)),
+                    "total_observations": params.alpha
+                    + params.beta
+                    - 2,  # Subtract prior
+                }
+
+        return stats
+
+    def get_top_rules(
+        self,
+        context: str,
+        k: int = 10,
+    ) -> list[tuple[str, float]]:
+        """Get top rules by expected value (not sampled).
+
+        Unlike select(), this uses the mean rather than sampling.
+        Useful for reporting "best rules so far" without the
+        exploration randomness.
+
+        Args:
+            context: Error class.
+            k: Number of rules to return.
+
+        Returns:
+            List of (rule_id, mean) tuples, sorted by mean descending.
+        """
+        rules = self.state.arms.get(context, {})
+        ranked = [(rule_id, params.mean()) for rule_id, params in rules.items()]
+        ranked.sort(key=lambda x: x[1], reverse=True)
+        return ranked[:k]
+
+    def _create_prior(self, is_seed: bool) -> BetaParams:
+        """Create prior distribution for a new arm.
+
+        Seed rules (from gauntlet personas / axioms) get a boosted prior,
+        reflecting our belief that curated rules are likely effective.
+
+        Non-seed rules get the uninformative Beta(1, 1) prior,
+        meaning we start with maximum uncertainty about their value.
+
+        Args:
+            is_seed: Whether this rule comes from seeds.
+
+        Returns:
+            BetaParams with appropriate prior.
+        """
+        if is_seed:
+            # Boosted prior: as if rule already had seed_boost successes
+            # Beta(1 + boost, 1) → mean = (1 + boost) / (2 + boost)
+            # With boost=2: mean = 3/4 = 0.75 (optimistic)
+            return BetaParams(alpha=1.0 + self.seed_boost, beta=1.0)
+        else:
+            # Uninformative prior: maximum uncertainty
+            # Beta(1, 1) is uniform → mean = 0.5
+            return BetaParams(alpha=1.0, beta=1.0)
+
+    def reset(self, context: str | None = None) -> None:
+        """Reset bandit state.
+
+        Use with caution - this discards learned information.
+
+        Args:
+            context: If provided, only reset this context.
+                If None, reset everything.
+        """
+        if context is None:
+            self.state = BanditState()
+        else:
+            if context in self.state.arms:
+                del self.state.arms[context]
+            if context in self.state.seed_flags:
+                del self.state.seed_flags[context]
+
+        self.state.save(self.state_path)
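Finally, an illustrative end-to-end example of the session flow described under USAGE IN BUILDLOG in the module docstring, assuming the buildlog.core.bandit import path; the rule IDs, error class, and state location are placeholders rather than names buildlog itself uses:

    import tempfile
    from pathlib import Path
    from buildlog.core.bandit import ThompsonSamplingBandit

    state_file = Path(tempfile.mkdtemp()) / "bandit_state.jsonl"
    bandit = ThompsonSamplingBandit(state_file)

    # Session start: pick 2 of 3 candidate rules for this error class.
    # "axiom-1" is flagged as a seed rule, so it starts from the boosted prior
    # Beta(1 + seed_boost, 1) = Beta(3, 1), an optimistic mean of 0.75.
    selected = bandit.select(
        candidates=["axiom-1", "rule-2", "rule-3"],
        context="type-errors",
        k=2,
        seed_rule_ids={"axiom-1"},
    )

    # A mistake was logged: every selected rule failed to prevent it.
    bandit.batch_update(selected, reward=0.0, context="type-errors")

    # Explicit positive feedback for one rule later in the session.
    bandit.update("rule-2", reward=1.0, context="type-errors")

    for rule_id, stats in bandit.get_stats("type-errors").items():
        print(rule_id, stats["mean"], stats["is_seed"])

State is persisted to the JSONL file on every call, so a new ThompsonSamplingBandit pointed at the same path picks up where this one left off.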