buildlog 0.7.0-py3-none-any.whl → 0.9.0-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- buildlog/__init__.py +1 -1
- buildlog/cli.py +659 -48
- buildlog/confidence.py +27 -0
- buildlog/core/__init__.py +2 -0
- buildlog/core/bandit.py +699 -0
- buildlog/core/operations.py +284 -24
- buildlog/distill.py +80 -1
- buildlog/engine/__init__.py +61 -0
- buildlog/engine/bandit.py +23 -0
- buildlog/engine/confidence.py +28 -0
- buildlog/engine/embeddings.py +28 -0
- buildlog/engine/experiments.py +619 -0
- buildlog/engine/types.py +31 -0
- buildlog/llm.py +508 -0
- buildlog/mcp/server.py +10 -6
- buildlog/mcp/tools.py +61 -13
- buildlog/render/__init__.py +19 -2
- buildlog/render/claude_md.py +67 -32
- buildlog/render/continue_dev.py +102 -0
- buildlog/render/copilot.py +100 -0
- buildlog/render/cursor.py +105 -0
- buildlog/render/windsurf.py +95 -0
- buildlog/seed_engine/__init__.py +2 -0
- buildlog/seed_engine/llm_extractor.py +121 -0
- buildlog/seed_engine/pipeline.py +45 -1
- buildlog/skills.py +69 -6
- {buildlog-0.7.0.data → buildlog-0.9.0.data}/data/share/buildlog/copier.yml +0 -4
- buildlog-0.9.0.data/data/share/buildlog/template/buildlog/_TEMPLATE_QUICK.md +21 -0
- buildlog-0.9.0.dist-info/METADATA +248 -0
- buildlog-0.9.0.dist-info/RECORD +55 -0
- buildlog-0.7.0.dist-info/METADATA +0 -544
- buildlog-0.7.0.dist-info/RECORD +0 -41
- {buildlog-0.7.0.data → buildlog-0.9.0.data}/data/share/buildlog/post_gen.py +0 -0
- {buildlog-0.7.0.data → buildlog-0.9.0.data}/data/share/buildlog/template/buildlog/.gitkeep +0 -0
- {buildlog-0.7.0.data → buildlog-0.9.0.data}/data/share/buildlog/template/buildlog/2026-01-01-example.md +0 -0
- {buildlog-0.7.0.data → buildlog-0.9.0.data}/data/share/buildlog/template/buildlog/BUILDLOG_SYSTEM.md +0 -0
- {buildlog-0.7.0.data → buildlog-0.9.0.data}/data/share/buildlog/template/buildlog/_TEMPLATE.md +0 -0
- {buildlog-0.7.0.data → buildlog-0.9.0.data}/data/share/buildlog/template/buildlog/assets/.gitkeep +0 -0
- {buildlog-0.7.0.dist-info → buildlog-0.9.0.dist-info}/WHEEL +0 -0
- {buildlog-0.7.0.dist-info → buildlog-0.9.0.dist-info}/entry_points.txt +0 -0
- {buildlog-0.7.0.dist-info → buildlog-0.9.0.dist-info}/licenses/LICENSE +0 -0
buildlog/llm.py
ADDED
@@ -0,0 +1,508 @@
+"""LLM-backed rule extraction, deduplication, and scoring.
+
+Provides a provider-agnostic interface for using LLMs to:
+- Extract structured rules from buildlog entries
+- Select canonical forms when deduplicating similar rules
+- Score rules with severity/scope/applicability
+
+Provider cascade:
+1. Explicit config (.buildlog/config.yml or env)
+2. Injected at call site (API parameter)
+3. Auto-detect: Ollama -> Anthropic -> None (regex fallback)
+"""
+
+from __future__ import annotations
+
+__all__ = [
+    "ExtractedRule",
+    "RuleScoring",
+    "LLMConfig",
+    "LLMBackend",
+    "OllamaBackend",
+    "AnthropicBackend",
+    "PROVIDERS",
+    "register_provider",
+    "get_llm_backend",
+]
+
+import json
+import logging
+import os
+import time
+from dataclasses import dataclass, field
+from pathlib import Path
+from types import MappingProxyType
+from typing import Protocol, runtime_checkable
+
+logger = logging.getLogger(__name__)
+
+# --- Data types (provider-agnostic) ---
+
+VALID_SEVERITIES = ("critical", "major", "minor", "info")
+VALID_SCOPES = ("global", "module", "function")
+VALID_CATEGORIES = ("architectural", "workflow", "tool_usage", "domain_knowledge")
+
+
+@dataclass
+class ExtractedRule:
+    """A rule extracted from buildlog text by an LLM."""
+
+    rule: str
+    category: str  # architectural/workflow/tool_usage/domain_knowledge
+    severity: str = "info"  # critical/major/minor/info
+    scope: str = "global"  # global/module/function
+    applicability: list[str] = field(default_factory=list)
+    context: str | None = None  # when to apply
+    antipattern: str | None = None  # what violation looks like
+    rationale: str | None = None  # why it matters
+
+    def __post_init__(self) -> None:
+        if self.severity not in VALID_SEVERITIES:
+            self.severity = "info"
+        if self.scope not in VALID_SCOPES:
+            self.scope = "global"
+        if self.category not in VALID_CATEGORIES:
+            self.category = "architectural"
+
+
+@dataclass
+class RuleScoring:
+    """Severity/scope/applicability scoring for a rule."""
+
+    severity: str = "info"
+    scope: str = "global"
+    applicability: list[str] = field(default_factory=list)
+
+
+# --- Provider config ---
+
+
+@dataclass
+class LLMConfig:
+    """Configuration for an LLM provider."""
+
+    provider: str  # "ollama", "anthropic", "openai", ...
+    model: str | None = None  # None = auto-detect or provider default
+    base_url: str | None = None  # Override endpoint
+    api_key: str | None = None  # From config or env var
+
+    def __repr__(self) -> str:
+        """Redact api_key to prevent accidental exposure in logs/tracebacks."""
+        key_display = "***" if self.api_key else "None"
+        return (
+            f"LLMConfig(provider={self.provider!r}, model={self.model!r}, "
+            f"base_url={self.base_url!r}, api_key={key_display})"
+        )
+
+    @classmethod
+    def from_buildlog_config(cls, buildlog_dir: Path) -> LLMConfig | None:
+        """Read from .buildlog/config.yml [llm] section."""
+        config_path = buildlog_dir / ".buildlog" / "config.yml"
+        if not config_path.exists():
+            return None
+
+        try:
+            import yaml
+        except ImportError:
+            logger.debug("PyYAML not available, skipping config file")
+            return None
+
+        try:
+            data = yaml.safe_load(config_path.read_text())
+        except Exception:
+            logger.warning("Failed to parse %s", config_path)
+            return None
+
+        if not isinstance(data, dict):
+            return None
+
+        llm_config = data.get("llm")
+        if not isinstance(llm_config, dict):
+            return None
+
+        provider = llm_config.get("provider")
+        if not provider:
+            return None
+
+        return cls(
+            provider=str(provider),
+            model=llm_config.get("model"),
+            base_url=llm_config.get("base_url"),
+            api_key=llm_config.get("api_key"),
+        )
+
+    @classmethod
+    def auto_detect(cls) -> LLMConfig | None:
+        """Ollama running? -> use it. ANTHROPIC_API_KEY? -> use that. Else None."""
+        # Try Ollama first (local, no API key needed)
+        if _is_ollama_available():
+            return cls(provider="ollama")
+
+        # Try Anthropic (cloud)
+        api_key = os.environ.get("ANTHROPIC_API_KEY")
+        if api_key:
+            return cls(provider="anthropic", api_key=api_key)
+
+        return None
+
+
+def _is_ollama_available() -> bool:
+    """Check if Ollama is running and accessible."""
+    try:
+        import ollama as ollama_lib
+
+        ollama_lib.list()
+        return True
+    except Exception:
+        return False
+
+
+# --- Interface ---
+
+
+@runtime_checkable
+class LLMBackend(Protocol):
+    """Protocol for LLM backends."""
+
+    def extract_rules(self, entry_text: str) -> list[ExtractedRule]:
+        """Extract structured rules from buildlog entry text."""
+        ...
+
+    def select_canonical(self, candidates: list[str]) -> str:
+        """Given similar rules, produce the single best canonical form."""
+        ...
+
+    def score_rule(self, rule: str, context: str) -> RuleScoring:
+        """Score a rule with severity/scope/applicability."""
+        ...
+
+
+# --- Prompts ---
+
+EXTRACT_RULES_PROMPT = """\
+You are analyzing a buildlog entry's Improvements section. Extract actionable rules.
+
+For each rule, return a JSON array of objects with these fields:
+- "rule": string — the actionable rule in imperative form
+- "category": string — one of: architectural, workflow, tool_usage, domain_knowledge
+- "severity": string — one of: critical, major, minor, info
+- "scope": string — one of: global, module, function
+- "applicability": array of strings — contexts where relevant (e.g., "python", "api-design")
+- "context": string or null — when to apply this rule
+- "antipattern": string or null — what violation looks like
+- "rationale": string or null — why it matters
+
+Return ONLY a JSON array. No markdown, no explanation.
+
+Text to analyze:
+{text}
+"""
+
+SELECT_CANONICAL_PROMPT = """\
+Given these similar rules, produce the single best canonical form.
+The canonical rule should be clear, concise, and actionable.
+
+Similar rules:
+{candidates}
+
+Return ONLY the canonical rule text as a plain string. No JSON, no quotes, no explanation.
+"""
+
+SCORE_RULE_PROMPT = """\
+Score this rule for severity, scope, and applicability.
+
+Rule: {rule}
+Context: {context}
+
+Return ONLY a JSON object with:
+- "severity": one of: critical, major, minor, info
+- "scope": one of: global, module, function
+- "applicability": array of strings (contexts where relevant)
+
+No markdown, no explanation.
+"""
+
+
+def _parse_json_response(text: str) -> list | dict:
+    """Parse JSON from LLM response, handling markdown code blocks."""
+    text = text.strip()
+    # Strip markdown code blocks
+    if text.startswith("```"):
+        lines = text.split("\n")
+        # Remove first and last lines (``` markers)
+        lines = [ln for ln in lines[1:] if not ln.strip().startswith("```")]
+        text = "\n".join(lines)
+    return json.loads(text)
+
+
+# --- Rate limiting ---
+
+# Minimum seconds between API calls (per-backend instance).
+_MIN_CALL_INTERVAL = 0.5
+
+
+class _RateLimiter:
+    """Simple per-instance rate limiter to prevent API abuse."""
+
+    def __init__(self, min_interval: float = _MIN_CALL_INTERVAL):
+        self._min_interval = min_interval
+        self._last_call: float = 0.0
+
+    def wait(self) -> None:
+        """Block until min_interval has elapsed since last call."""
+        now = time.monotonic()
+        elapsed = now - self._last_call
+        if elapsed < self._min_interval:
+            time.sleep(self._min_interval - elapsed)
+        self._last_call = time.monotonic()
+
+
+# --- Implementations ---
+
+
+class OllamaBackend:
+    """LLM backend using Ollama (local)."""
+
+    def __init__(self, model: str | None = None, base_url: str | None = None):
+        self._model = model
+        self._base_url = base_url
+        self._resolved_model: str | None = None
+        self._rate_limiter = _RateLimiter()
+
+    def _get_model(self) -> str:
+        """Resolve model name, auto-detecting largest if not specified."""
+        if self._resolved_model:
+            return self._resolved_model
+
+        if self._model:
+            self._resolved_model = self._model
+            return self._resolved_model
+
+        # Auto-detect: pick largest pulled model
+        try:
+            import ollama as ollama_lib
+
+            models = ollama_lib.list()
+            if not models or not models.get("models"):
+                raise RuntimeError(
+                    "No Ollama models found. Pull one with: ollama pull llama3.2"
+                )
+
+            model_list = models["models"]
+            # Sort by size descending, pick largest
+            largest = max(model_list, key=lambda m: m.get("size", 0))
+            model_name: str = largest["name"]
+            self._resolved_model = model_name
+            logger.info("Auto-detected Ollama model: %s", model_name)
+            return model_name
+        except ImportError:
+            raise ImportError(
+                "ollama package is required. Install with: pip install buildlog[ollama]"
+            )
+
+    def _chat(self, prompt: str) -> str:
+        """Send a prompt to Ollama and return the response text."""
+        self._rate_limiter.wait()
+        import ollama as ollama_lib
+
+        kwargs = {
+            "model": self._get_model(),
+            "messages": [{"role": "user", "content": prompt}],
+        }
+        if self._base_url:
+            client = ollama_lib.Client(host=self._base_url)
+            response = client.chat(**kwargs)
+        else:
+            response = ollama_lib.chat(**kwargs)
+        return response["message"]["content"]
+
+    def extract_rules(self, entry_text: str) -> list[ExtractedRule]:
+        """Extract structured rules from buildlog entry text."""
+        prompt = EXTRACT_RULES_PROMPT.format(text=entry_text)
+        try:
+            response = self._chat(prompt)
+            parsed = _parse_json_response(response)
+            if not isinstance(parsed, list):
+                parsed = [parsed]
+            return [ExtractedRule(**item) for item in parsed]
+        except Exception as e:
+            logger.warning("Ollama extraction failed: %s", e)
+            return []
+
+    def select_canonical(self, candidates: list[str]) -> str:
+        """Given similar rules, produce the single best canonical form."""
+        numbered = "\n".join(f"{i+1}. {c}" for i, c in enumerate(candidates))
+        prompt = SELECT_CANONICAL_PROMPT.format(candidates=numbered)
+        try:
+            response = self._chat(prompt)
+            return response.strip().strip('"').strip("'")
+        except Exception as e:
+            logger.warning("Ollama canonical selection failed: %s", e)
+            return min(candidates, key=len)
+
+    def score_rule(self, rule: str, context: str) -> RuleScoring:
+        """Score a rule with severity/scope/applicability."""
+        prompt = SCORE_RULE_PROMPT.format(rule=rule, context=context)
+        try:
+            response = self._chat(prompt)
+            parsed = _parse_json_response(response)
+            if isinstance(parsed, dict):
+                return RuleScoring(
+                    severity=parsed.get("severity", "info"),
+                    scope=parsed.get("scope", "global"),
+                    applicability=parsed.get("applicability", []),
+                )
+        except Exception as e:
+            logger.warning("Ollama scoring failed: %s", e)
+        return RuleScoring()
+
+
+class AnthropicBackend:
+    """LLM backend using Anthropic Claude API."""
+
+    def __init__(
+        self,
+        model: str | None = None,
+        api_key: str | None = None,
+    ):
+        self._model = model or "claude-haiku-4-20250514"
+        self._api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
+        self._client = None
+        self._rate_limiter = _RateLimiter()
+
+    def __repr__(self) -> str:
+        """Redact API key from repr to prevent exposure in logs/tracebacks."""
+        return f"AnthropicBackend(model={self._model!r}, api_key=***)"
+
+    def _get_client(self):
+        """Lazy-load the Anthropic client."""
+        if self._client is None:
+            try:
+                import anthropic
+            except ImportError:
+                raise ImportError(
+                    "anthropic package is required. Install with: pip install buildlog[anthropic]"
+                )
+            if not self._api_key:
+                raise ValueError("ANTHROPIC_API_KEY is required for Anthropic backend")
+            self._client = anthropic.Anthropic(api_key=self._api_key)
+        return self._client
+
+    def _chat(self, prompt: str) -> str:
+        """Send a prompt to Claude and return the response text."""
+        self._rate_limiter.wait()
+        client = self._get_client()
+        response = client.messages.create(
+            model=self._model,
+            max_tokens=2048,
+            messages=[{"role": "user", "content": prompt}],
+        )
+        return response.content[0].text
+
+    def extract_rules(self, entry_text: str) -> list[ExtractedRule]:
+        """Extract structured rules from buildlog entry text."""
+        prompt = EXTRACT_RULES_PROMPT.format(text=entry_text)
+        try:
+            response = self._chat(prompt)
+            parsed = _parse_json_response(response)
+            if not isinstance(parsed, list):
+                parsed = [parsed]
+            return [ExtractedRule(**item) for item in parsed]
+        except Exception as e:
+            logger.warning("Anthropic extraction failed: %s", e)
+            return []
+
+    def select_canonical(self, candidates: list[str]) -> str:
+        """Given similar rules, produce the single best canonical form."""
+        numbered = "\n".join(f"{i+1}. {c}" for i, c in enumerate(candidates))
+        prompt = SELECT_CANONICAL_PROMPT.format(candidates=numbered)
+        try:
+            response = self._chat(prompt)
+            return response.strip().strip('"').strip("'")
+        except Exception as e:
+            logger.warning("Anthropic canonical selection failed: %s", e)
+            return min(candidates, key=len)
+
+    def score_rule(self, rule: str, context: str) -> RuleScoring:
+        """Score a rule with severity/scope/applicability."""
+        prompt = SCORE_RULE_PROMPT.format(rule=rule, context=context)
+        try:
+            response = self._chat(prompt)
+            parsed = _parse_json_response(response)
+            if isinstance(parsed, dict):
+                return RuleScoring(
+                    severity=parsed.get("severity", "info"),
+                    scope=parsed.get("scope", "global"),
+                    applicability=parsed.get("applicability", []),
+                )
+        except Exception as e:
+            logger.warning("Anthropic scoring failed: %s", e)
+        return RuleScoring()
+
+
+# --- Registry ---
+
+_PROVIDERS: dict[str, type] = {
+    "ollama": OllamaBackend,
+    "anthropic": AnthropicBackend,
+}
+# Public read-only view. Use register_provider() to add entries.
+PROVIDERS: MappingProxyType[str, type] = MappingProxyType(_PROVIDERS)
+
+
+def register_provider(name: str, cls: type) -> None:
+    """Register a new LLM provider backend.
+
+    This is the only sanctioned way to mutate the provider registry.
+    """
+    if not isinstance(name, str) or not name.strip():
+        raise ValueError("Provider name must be a non-empty string")
+    _PROVIDERS[name] = cls
+
+
+def get_llm_backend(
+    config: LLMConfig | None = None,
+    buildlog_dir: Path | None = None,
+) -> LLMBackend | None:
+    """Get an LLM backend using the provider cascade.
+
+    Resolution order:
+    1. Explicit config parameter (highest priority)
+    2. Config file (.buildlog/config.yml)
+    3. Auto-detect: Ollama -> Anthropic -> None
+
+    Returns None if no provider is available (regex fallback).
+    """
+    # 1. Explicit config
+    if config is None and buildlog_dir is not None:
+        # 2. Config file
+        config = LLMConfig.from_buildlog_config(buildlog_dir)
+
+    if config is None:
+        # 3. Auto-detect
+        config = LLMConfig.auto_detect()
+
+    if config is None:
+        logger.info("No LLM provider available, using regex fallback")
+        return None
+
+    provider_cls = _PROVIDERS.get(config.provider)
+    if provider_cls is None:
+        logger.warning("Unknown LLM provider: %s", config.provider)
+        return None
+
+    try:
+        kwargs: dict = {}
+        if config.model:
+            kwargs["model"] = config.model
+        if config.provider == "ollama" and config.base_url:
+            kwargs["base_url"] = config.base_url
+        if config.provider == "anthropic" and config.api_key:
+            kwargs["api_key"] = config.api_key
+
+        backend = provider_cls(**kwargs)
+        logger.info("Using LLM provider: %s", config.provider)
+        return backend
+    except Exception as e:
+        logger.warning("Failed to initialize %s backend: %s", config.provider, e)
+        return None
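The cascade in get_llm_backend() is easiest to see from the caller's side. A minimal usage sketch (illustrative only, not shipped in the wheel; the project path, model names, and entry text below are assumptions):

from pathlib import Path

from buildlog.llm import LLMConfig, get_llm_backend

# 1. An explicit LLMConfig wins over everything else.
backend = get_llm_backend(config=LLMConfig(provider="ollama", model="llama3.2"))

# 2. Otherwise <buildlog_dir>/.buildlog/config.yml is consulted, e.g.:
#      llm:
#        provider: anthropic
#        model: claude-haiku-4-20250514
backend = get_llm_backend(buildlog_dir=Path("my-project"))

# 3. With neither, auto-detection tries a running Ollama, then ANTHROPIC_API_KEY.
backend = get_llm_backend()

if backend is None:
    print("no LLM provider available; callers fall back to regex extraction")
else:
    rules = backend.extract_rules("## Improvements\n- Pin CI dependency versions")
    for r in rules:
        print(r.severity, r.category, r.rule)

Third-party backends can be added through the sanctioned registry hook, e.g. register_provider("openai", MyOpenAIBackend), where MyOpenAIBackend is a hypothetical class implementing the LLMBackend protocol; after that, LLMConfig(provider="openai") resolves to it.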
buildlog/mcp/server.py
CHANGED
@@ -5,9 +5,12 @@ from __future__ import annotations
 from mcp.server.fastmcp import FastMCP
 
 from buildlog.mcp.tools import (
+    buildlog_bandit_status,
     buildlog_diff,
-
+    buildlog_experiment_end,
+    buildlog_experiment_metrics,
     buildlog_experiment_report,
+    buildlog_experiment_start,
     buildlog_gauntlet_accept_risk,
     buildlog_gauntlet_issues,
     buildlog_learn_from_review,
@@ -16,8 +19,6 @@ from buildlog.mcp.tools import (
     buildlog_promote,
     buildlog_reject,
     buildlog_rewards,
-    buildlog_session_metrics,
-    buildlog_start_session,
     buildlog_status,
 )
 
@@ -33,16 +34,19 @@ mcp.tool()(buildlog_log_reward)
 mcp.tool()(buildlog_rewards)
 
 # Session tracking tools (experiment infrastructure)
-mcp.tool()(
-mcp.tool()(
+mcp.tool()(buildlog_experiment_start)
+mcp.tool()(buildlog_experiment_end)
 mcp.tool()(buildlog_log_mistake)
-mcp.tool()(
+mcp.tool()(buildlog_experiment_metrics)
 mcp.tool()(buildlog_experiment_report)
 
 # Gauntlet loop tools
 mcp.tool()(buildlog_gauntlet_issues)
 mcp.tool()(buildlog_gauntlet_accept_risk)
 
+# Bandit tools
+mcp.tool()(buildlog_bandit_status)
+
 
 def main() -> None:
     """Run the MCP server."""
buildlog/mcp/tools.py
CHANGED
@@ -12,6 +12,7 @@ from typing import Literal
 from buildlog.core import (
     diff,
     end_session,
+    get_bandit_status,
     get_experiment_report,
     get_rewards,
     get_session_metrics,
@@ -52,17 +53,17 @@ def buildlog_status(
 
 def buildlog_promote(
     skill_ids: list[str],
-    target:
+    target: str = "claude_md",
     buildlog_dir: str = "buildlog",
 ) -> dict:
     """Promote skills to your agent's rules.
 
-    Writes selected skills to
-    .claude/skills/buildlog-learned/SKILL.md (Anthropic Agent Skills format).
+    Writes selected skills to agent-specific rule files.
 
     Args:
         skill_ids: List of skill IDs to promote (e.g., ["arch-b0fcb62a1e"])
-        target: Where to write rules
+        target: Where to write rules. One of: claude_md, settings_json,
+            skill, cursor, copilot, windsurf, continue_dev.
         buildlog_dir: Path to buildlog directory
 
     Returns:
@@ -262,36 +263,47 @@ def buildlog_rewards(
 # -----------------------------------------------------------------------------
 
 
-def
+def buildlog_experiment_start(
     error_class: str | None = None,
     notes: str | None = None,
+    select_k: int = 3,
     buildlog_dir: str = "buildlog",
 ) -> dict:
-    """Start a new experiment session.
+    """Start a new experiment session with Thompson Sampling rule selection.
 
-    Begins tracking for a learning experiment.
-
+    Begins tracking for a learning experiment. Uses Thompson Sampling
+    to select which rules will be "active" for this session based on
+    the error class context.
+
+    The selected rules will receive feedback:
+    - Negative feedback (reward=0) when log_mistake() is called
+    - Explicit feedback when log_reward() is called
+
+    This teaches the bandit which rules are effective for which contexts.
 
     Args:
-        error_class: Error class being targeted (e.g., "missing_test")
+        error_class: Error class being targeted (e.g., "missing_test").
+            This is the CONTEXT for contextual bandits.
         notes: Notes about this session
+        select_k: Number of rules to select via Thompson Sampling
        buildlog_dir: Path to buildlog directory
 
     Returns:
-        Dict with session_id, error_class, rules_count, message
+        Dict with session_id, error_class, rules_count, selected_rules, message
 
     Example:
-        buildlog_start_session(error_class="
+        buildlog_start_session(error_class="type-errors", select_k=5)
     """
     result = start_session(
         Path(buildlog_dir),
         error_class=error_class,
         notes=notes,
+        select_k=select_k,
     )
     return asdict(result)
 
 
-def
+def buildlog_experiment_end(
     entry_file: str | None = None,
     notes: str | None = None,
     buildlog_dir: str = "buildlog",
@@ -358,7 +370,7 @@ def buildlog_log_mistake(
     return asdict(result)
 
 
-def
+def buildlog_experiment_metrics(
     session_id: str | None = None,
     buildlog_dir: str = "buildlog",
 ) -> dict:
@@ -407,6 +419,42 @@ def buildlog_experiment_report(
     return get_experiment_report(Path(buildlog_dir))
 
 
+def buildlog_bandit_status(
+    buildlog_dir: str = "buildlog",
+    context: str | None = None,
+    top_k: int = 10,
+) -> dict:
+    """Get Thompson Sampling bandit status and rule rankings.
+
+    Shows the bandit's learned beliefs about which rules are effective
+    for each error class context. Higher mean = bandit believes rule
+    is more effective.
+
+    The bandit uses Beta distributions to model uncertainty:
+    - High variance (wide CI) = uncertain, will explore more
+    - Low variance (narrow CI) = confident, will exploit
+
+    Args:
+        buildlog_dir: Path to buildlog directory
+        context: Specific error class to filter by (optional)
+        top_k: Number of top rules to show per context
+
+    Returns:
+        Dict with:
+        - summary: Total contexts, arms, observations
+        - top_rules: Best rules per context by expected value
+        - all_rules: Full stats if filtering by context
+
+    Example:
+        # See all bandit state
+        buildlog_bandit_status()
+
+        # See state for specific error class
+        buildlog_bandit_status(context="type-errors")
+    """
+    return get_bandit_status(Path(buildlog_dir), context, top_k)
+
+
 # -----------------------------------------------------------------------------
 # Gauntlet Loop MCP Tools
 # -----------------------------------------------------------------------------
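The docstrings above describe Thompson Sampling over Beta posteriors, but the mechanics live in the new buildlog/core/bandit.py, which this excerpt does not show. As orientation only, here is a minimal self-contained sketch of that selection rule; the class, field, and rule names are illustrative assumptions, not buildlog's actual API:

import random
from dataclasses import dataclass


@dataclass
class Arm:
    """Beta posterior over one rule's effectiveness in one error-class context."""

    rule_id: str
    alpha: float = 1.0  # prior + observed successes (explicit rewards)
    beta: float = 1.0  # prior + observed failures (logged mistakes)

    def sample(self) -> float:
        # Draw from Beta(alpha, beta). Few observations give a wide posterior
        # and noisy draws, which is what drives exploration.
        return random.betavariate(self.alpha, self.beta)


def select_rules(arms: list[Arm], k: int = 3) -> list[Arm]:
    """Thompson Sampling: rank arms by one posterior draw each, keep the top k."""
    return sorted(arms, key=lambda a: a.sample(), reverse=True)[:k]


def update(arm: Arm, reward: float) -> None:
    """Feedback: reward=1.0 for success, reward=0.0 when a mistake is logged."""
    arm.alpha += reward
    arm.beta += 1.0 - reward


arms = [Arm("arch-b0fcb62a1e"), Arm("wf-1234abcd"), Arm("tool-5678efab")]
active = select_rules(arms, k=2)  # roughly what experiment_start(select_k=2) marks active
update(active[0], reward=0.0)  # a mistake was logged against this rule
update(active[1], reward=1.0)  # this rule received an explicit positive reward

As an arm accumulates observations its Beta posterior narrows, so its mean alpha / (alpha + beta) dominates the draws and the sampler shifts from exploring to exploiting, which is the wide-CI versus narrow-CI behavior the buildlog_bandit_status docstring describes.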