buildlog 0.6.1__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- buildlog/__init__.py +1 -1
- buildlog/cli.py +589 -44
- buildlog/confidence.py +27 -0
- buildlog/core/__init__.py +12 -0
- buildlog/core/bandit.py +699 -0
- buildlog/core/operations.py +499 -11
- buildlog/distill.py +80 -1
- buildlog/engine/__init__.py +61 -0
- buildlog/engine/bandit.py +23 -0
- buildlog/engine/confidence.py +28 -0
- buildlog/engine/embeddings.py +28 -0
- buildlog/engine/experiments.py +619 -0
- buildlog/engine/types.py +31 -0
- buildlog/llm.py +461 -0
- buildlog/mcp/server.py +12 -6
- buildlog/mcp/tools.py +166 -13
- buildlog/render/__init__.py +19 -2
- buildlog/render/claude_md.py +74 -26
- buildlog/render/continue_dev.py +102 -0
- buildlog/render/copilot.py +100 -0
- buildlog/render/cursor.py +105 -0
- buildlog/render/tracking.py +20 -1
- buildlog/render/windsurf.py +95 -0
- buildlog/seeds.py +41 -0
- buildlog/skills.py +69 -6
- {buildlog-0.6.1.data → buildlog-0.8.0.data}/data/share/buildlog/copier.yml +0 -4
- buildlog-0.8.0.data/data/share/buildlog/template/buildlog/_TEMPLATE_QUICK.md +21 -0
- buildlog-0.8.0.dist-info/METADATA +151 -0
- buildlog-0.8.0.dist-info/RECORD +54 -0
- buildlog-0.6.1.dist-info/METADATA +0 -490
- buildlog-0.6.1.dist-info/RECORD +0 -41
- {buildlog-0.6.1.data → buildlog-0.8.0.data}/data/share/buildlog/post_gen.py +0 -0
- {buildlog-0.6.1.data → buildlog-0.8.0.data}/data/share/buildlog/template/buildlog/.gitkeep +0 -0
- {buildlog-0.6.1.data → buildlog-0.8.0.data}/data/share/buildlog/template/buildlog/2026-01-01-example.md +0 -0
- {buildlog-0.6.1.data → buildlog-0.8.0.data}/data/share/buildlog/template/buildlog/BUILDLOG_SYSTEM.md +0 -0
- {buildlog-0.6.1.data → buildlog-0.8.0.data}/data/share/buildlog/template/buildlog/_TEMPLATE.md +0 -0
- {buildlog-0.6.1.data → buildlog-0.8.0.data}/data/share/buildlog/template/buildlog/assets/.gitkeep +0 -0
- {buildlog-0.6.1.dist-info → buildlog-0.8.0.dist-info}/WHEEL +0 -0
- {buildlog-0.6.1.dist-info → buildlog-0.8.0.dist-info}/entry_points.txt +0 -0
- {buildlog-0.6.1.dist-info → buildlog-0.8.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,619 @@
|
|
|
1
|
+
"""Agent-agnostic experiment tracking engine.
|
|
2
|
+
|
|
3
|
+
This module contains the core session tracking, mistake logging, and reward
|
|
4
|
+
signal logic decoupled from any specific agent or skill generation mechanism.
|
|
5
|
+
|
|
6
|
+
The key difference from core/operations.py: functions here accept
|
|
7
|
+
`available_rules: list[str]` as a parameter rather than calling
|
|
8
|
+
`generate_skills()` internally. The caller (CLI, MCP, etc.) is responsible
|
|
9
|
+
for getting the rule list however it wants. The engine doesn't care where
|
|
10
|
+
rules come from.
|
|
11
|
+
|
|
12
|
+
Usage:
|
|
13
|
+
from buildlog.engine.experiments import start_session, end_session, log_mistake
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import hashlib
|
|
19
|
+
import json
|
|
20
|
+
from datetime import datetime, timezone
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
from typing import Literal
|
|
23
|
+
|
|
24
|
+
from buildlog.core.bandit import ThompsonSamplingBandit
|
|
25
|
+
from buildlog.core.operations import (
|
|
26
|
+
EndSessionResult,
|
|
27
|
+
LogMistakeResult,
|
|
28
|
+
LogRewardResult,
|
|
29
|
+
Mistake,
|
|
30
|
+
RewardEvent,
|
|
31
|
+
RewardSummary,
|
|
32
|
+
Session,
|
|
33
|
+
SessionMetrics,
|
|
34
|
+
StartSessionResult,
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
__all__ = [
|
|
38
|
+
"start_session",
|
|
39
|
+
"end_session",
|
|
40
|
+
"log_mistake",
|
|
41
|
+
"log_reward",
|
|
42
|
+
"get_rewards",
|
|
43
|
+
"session_metrics",
|
|
44
|
+
"experiment_report",
|
|
45
|
+
]
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# ---------------------------------------------------------------------------
|
|
49
|
+
# Path helpers (duplicated from operations to avoid tight coupling)
|
|
50
|
+
# ---------------------------------------------------------------------------
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _get_sessions_path(buildlog_dir: Path) -> Path:
    """Return the location of the completed-sessions log.

    Args:
        buildlog_dir: Path to buildlog directory.

    Returns:
        ``<buildlog_dir>/.buildlog/sessions.jsonl``.
    """
    return buildlog_dir.joinpath(".buildlog", "sessions.jsonl")
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _get_mistakes_path(buildlog_dir: Path) -> Path:
    """Return the location of the mistakes log.

    Args:
        buildlog_dir: Path to buildlog directory.

    Returns:
        ``<buildlog_dir>/.buildlog/mistakes.jsonl``.
    """
    return buildlog_dir.joinpath(".buildlog", "mistakes.jsonl")
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _get_active_session_path(buildlog_dir: Path) -> Path:
    """Return the location of the active-session marker file.

    Args:
        buildlog_dir: Path to buildlog directory.

    Returns:
        ``<buildlog_dir>/.buildlog/active_session.json``.
    """
    return buildlog_dir.joinpath(".buildlog", "active_session.json")
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _get_rewards_path(buildlog_dir: Path) -> Path:
    """Return the location of the reward-events log.

    Args:
        buildlog_dir: Path to buildlog directory.

    Returns:
        ``<buildlog_dir>/.buildlog/reward_events.jsonl``.
    """
    return buildlog_dir.joinpath(".buildlog", "reward_events.jsonl")
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _get_promoted_path(buildlog_dir: Path) -> Path:
    """Return the location of the promoted-rules file.

    Args:
        buildlog_dir: Path to buildlog directory.

    Returns:
        ``<buildlog_dir>/.buildlog/promoted.json``.
    """
    return buildlog_dir.joinpath(".buildlog", "promoted.json")
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _load_json_set(path: Path, key: str) -> set[str]:
    """Read the JSON array stored under ``key`` in ``path`` as a set.

    This is a best-effort reader: any file that cannot supply a usable
    array yields an empty set instead of raising.

    Args:
        path: JSON file expected to contain a top-level object.
        key: Name of the member holding the array of values.

    Returns:
        The distinct values found under ``key``. Empty when the file is
        missing, unreadable or not valid JSON, when the top-level value is
        not an object, or when ``key`` is absent or does not hold an array
        of hashable values.
    """
    try:
        data = json.loads(path.read_text())
    except (OSError, ValueError):
        # OSError covers a missing/unreadable file; ValueError covers both
        # json.JSONDecodeError and undecodable bytes.
        return set()

    # A top-level list/number/string has no ``.get``; treat it as "no data".
    if not isinstance(data, dict):
        return set()

    values = data.get(key, [])
    # Only a JSON array is meaningful here. A string would otherwise be
    # exploded into single characters and ``null`` would raise TypeError.
    if not isinstance(values, list):
        return set()

    try:
        return set(values)
    except TypeError:
        # Array contained unhashable members (nested arrays/objects).
        return set()
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _get_current_rules(buildlog_dir: Path) -> list[str]:
    """Return the rule IDs listed under ``skill_ids`` in the promoted file.

    The IDs pass through a set, so duplicates are dropped and the order of
    the returned list is unspecified.

    Args:
        buildlog_dir: Path to buildlog directory.

    Returns:
        List of promoted rule IDs (empty when none could be loaded).
    """
    promoted_ids = _load_json_set(_get_promoted_path(buildlog_dir), "skill_ids")
    return [*promoted_ids]
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _generate_session_id(now: datetime) -> str:
    """Build a session ID from a timestamp.

    Args:
        now: Moment the session starts.

    Returns:
        ``session-YYYYMMDD-HHMMSS-ffffff`` where the last field is the
        zero-padded microsecond component.
    """
    # %f renders the microsecond as six zero-padded digits.
    return f"session-{now:%Y%m%d-%H%M%S-%f}"
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _generate_mistake_id(error_class: str, now: datetime) -> str:
    """Build a mistake ID from an error class and a timestamp.

    Args:
        error_class: Category of error; only its first 10 characters are used.
        now: Moment the mistake is logged.

    Returns:
        ``mistake-<prefix>-YYYYMMDD-HHMMSS-ffffff``.
    """
    prefix = error_class[:10]
    # %f renders the microsecond as six zero-padded digits.
    return f"mistake-{prefix}-{now:%Y%m%d-%H%M%S-%f}"
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _compute_semantic_hash(description: str) -> str:
    """Fingerprint a description, ignoring case and whitespace layout.

    Args:
        description: Free-text description of a mistake.

    Returns:
        First 16 hex characters of the SHA-256 of the lower-cased text with
        every whitespace run collapsed to a single space.
    """
    tokens = description.lower().split()
    digest = hashlib.sha256(" ".join(tokens).encode()).hexdigest()
    return digest[:16]
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _generate_reward_id(outcome: str, timestamp: datetime) -> str:
    """Derive a reward-event ID from the outcome and its timestamp.

    Args:
        outcome: Feedback outcome label.
        timestamp: Moment the reward is logged.

    Returns:
        ``rew-`` followed by the first 10 hex characters of the SHA-256 of
        ``"<outcome>:<ISO-8601 timestamp>"``.
    """
    payload = "{}:{}".format(outcome, timestamp.isoformat()).encode("utf-8")
    return "rew-" + hashlib.sha256(payload).hexdigest()[:10]
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _compute_reward_value(
    outcome: Literal["accepted", "revision", "rejected"],
    revision_distance: float | None,
) -> float:
    """Map a feedback outcome to a reward in ``[0.0, 1.0]``.

    Args:
        outcome: ``"accepted"`` gives 1.0 and ``"rejected"`` gives 0.0. Any
            other value is scored as a revision.
        revision_distance: How much correction was needed. Used only for
            revisions; ``None`` or NaN falls back to 0.5.

    Returns:
        ``1.0 - revision_distance`` clamped to ``[0.0, 1.0]`` for revisions,
        otherwise the fixed value for the outcome.
    """
    if outcome == "accepted":
        return 1.0
    if outcome == "rejected":
        return 0.0

    distance = revision_distance
    # NaN is the only value unequal to itself. Without this guard a NaN
    # distance slips through min()/max() and is scored as a perfect 1.0.
    if distance is None or distance != distance:
        distance = 0.5
    return max(0.0, min(1.0, 1.0 - distance))
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _load_sessions(buildlog_dir: Path) -> list[Session]:
    """Load every completed session recorded in the sessions log.

    The log is read as JSON Lines. Lines that cannot be turned into a
    session are skipped so that one bad record does not hide the rest.

    Args:
        buildlog_dir: Path to buildlog directory.

    Returns:
        Sessions in file order; empty when the log does not exist.
    """
    sessions_path = _get_sessions_path(buildlog_dir)
    if not sessions_path.exists():
        return []

    sessions: list[Session] = []
    for line in sessions_path.read_text().strip().split("\n"):
        if not line:
            continue
        try:
            data = json.loads(line)
        except json.JSONDecodeError:
            continue
        # Valid JSON that is not an object (e.g. ``null`` or ``[]``) cannot
        # describe a session; skip it like any other malformed line.
        if not isinstance(data, dict):
            continue
        try:
            sessions.append(Session.from_dict(data))
        except KeyError:
            # Record is missing a required field.
            continue
    return sessions
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _load_mistakes(buildlog_dir: Path) -> list[Mistake]:
    """Load every mistake recorded in the mistakes log.

    The log is read as JSON Lines. Lines that cannot be turned into a
    mistake are skipped so that one bad record does not hide the rest.

    Args:
        buildlog_dir: Path to buildlog directory.

    Returns:
        Mistakes in file order; empty when the log does not exist.
    """
    mistakes_path = _get_mistakes_path(buildlog_dir)
    if not mistakes_path.exists():
        return []

    mistakes: list[Mistake] = []
    for line in mistakes_path.read_text().strip().split("\n"):
        if not line:
            continue
        try:
            data = json.loads(line)
        except json.JSONDecodeError:
            continue
        # Valid JSON that is not an object (e.g. ``null`` or ``[]``) cannot
        # describe a mistake; skip it like any other malformed line.
        if not isinstance(data, dict):
            continue
        try:
            mistakes.append(Mistake.from_dict(data))
        except KeyError:
            # Record is missing a required field.
            continue
    return mistakes
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _find_similar_prior_mistake(
    description: str,
    error_class: str,
    current_session_id: str,
    all_mistakes: list[Mistake],
) -> Mistake | None:
    """Find an earlier mistake that looks like the one being logged.

    Only mistakes from *other* sessions with the same error class are
    considered. A candidate matches when its semantic hash equals that of
    ``description``, or when more than 70% of the distinct lower-cased
    words in ``description`` also appear in the candidate's description.

    Args:
        description: Description of the new mistake.
        error_class: Error class of the new mistake.
        current_session_id: ID of the session the new mistake belongs to.
        all_mistakes: Previously logged mistakes to search, in order.

    Returns:
        The first matching mistake, or None when nothing matches.
    """
    semantic_hash = _compute_semantic_hash(description)
    # Both values depend only on the new description, so compute them once
    # rather than for every candidate.
    desc_words = set(description.lower().split())
    # max(..., 1) keeps the ratio defined for an empty description.
    word_count = max(len(desc_words), 1)

    for mistake in all_mistakes:
        if mistake.session_id == current_session_id:
            continue
        if mistake.error_class != error_class:
            continue
        if mistake.semantic_hash == semantic_hash:
            return mistake
        mistake_words = set(mistake.description.lower().split())
        if len(desc_words & mistake_words) / word_count > 0.7:
            return mistake
    return None
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
# ---------------------------------------------------------------------------
|
|
173
|
+
# Public API — agent-agnostic experiment functions
|
|
174
|
+
# ---------------------------------------------------------------------------
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def start_session(
    buildlog_dir: Path,
    error_class: str | None = None,
    notes: str | None = None,
    select_k: int = 3,
    available_rules: list[str] | None = None,
    seed_rule_ids: set[str] | None = None,
) -> StartSessionResult:
    """Open a new experiment session and pick the rules to trial in it.

    Candidate rules are supplied by the caller through ``available_rules``;
    this function never generates skills itself. When no list is given,
    the promoted rule IDs stored in .buildlog/promoted.json are used. The
    new session is written to the active-session file, replacing whatever
    was there.

    Args:
        buildlog_dir: Path to buildlog directory.
        error_class: Error class being targeted; used as the bandit
            context ("general" when not given).
        notes: Optional notes about the session.
        select_k: Upper bound on the number of rules to select.
        available_rules: Explicit candidate rule IDs, or None to read the
            promoted IDs from disk.
        seed_rule_ids: Rule IDs passed to the bandit as seeds.

    Returns:
        StartSessionResult carrying the session ID, the candidate count and
        the selected rules.
    """
    started = datetime.now(timezone.utc)
    new_id = _generate_session_id(started)

    # An explicit list (even an empty one) wins over the promoted file.
    if available_rules is None:
        candidates = _get_current_rules(buildlog_dir)
    else:
        candidates = available_rules

    chosen: list[str] = []
    if candidates:
        sampler = ThompsonSamplingBandit(buildlog_dir / "bandit_state.jsonl")
        chosen = sampler.select(
            candidates=candidates,
            context=error_class or "general",
            # Never ask for more rules than there are candidates.
            k=min(select_k, len(candidates)),
            seed_rule_ids=seed_rule_ids or set(),
        )

    record = Session(
        id=new_id,
        started_at=started,
        rules_at_start=candidates,
        selected_rules=chosen,
        error_class=error_class,
        notes=notes,
    )

    marker = _get_active_session_path(buildlog_dir)
    marker.parent.mkdir(parents=True, exist_ok=True)
    marker.write_text(json.dumps(record.to_dict(), indent=2))

    candidate_count = len(candidates)
    summary = (
        f"Started session {new_id}: selected {len(chosen)}/"
        f"{candidate_count} rules via Thompson Sampling"
    )
    return StartSessionResult(
        session_id=new_id,
        error_class=error_class,
        rules_count=candidate_count,
        selected_rules=chosen,
        message=summary,
    )
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def end_session(
    buildlog_dir: Path,
    entry_file: str | None = None,
    notes: str | None = None,
) -> EndSessionResult:
    """Close the active experiment session and archive it.

    The active session is stamped with its end time and the currently
    promoted rules, appended to the sessions log, and the active-session
    file is removed.

    Args:
        buildlog_dir: Path to buildlog directory.
        entry_file: Corresponding buildlog entry file, if any.
        notes: Additional notes, appended on a new line after existing ones.

    Returns:
        EndSessionResult with duration, mistake counts and rule counts.

    Raises:
        ValueError: If there is no active session.
    """
    marker = _get_active_session_path(buildlog_dir)
    if not marker.exists():
        raise ValueError("No active session to end")

    session = Session.from_dict(json.loads(marker.read_text()))
    session.ended_at = datetime.now(timezone.utc)
    session.rules_at_end = _get_current_rules(buildlog_dir)
    if entry_file:
        session.entry_file = entry_file
    if notes:
        session.notes = f"{session.notes or ''}\n{notes}".strip()

    archive = _get_sessions_path(buildlog_dir)
    archive.parent.mkdir(parents=True, exist_ok=True)
    with archive.open("a") as log:
        log.write(json.dumps(session.to_dict()) + "\n")

    # Archive first, then drop the marker.
    marker.unlink()

    # Tally this session's mistakes and how many were repeats.
    logged = 0
    repeats = 0
    for entry in _load_mistakes(buildlog_dir):
        if entry.session_id == session.id:
            logged += 1
            if entry.was_repeat:
                repeats += 1

    elapsed = session.ended_at - session.started_at
    minutes = elapsed.total_seconds() / 60

    return EndSessionResult(
        session_id=session.id,
        duration_minutes=round(minutes, 1),
        mistakes_logged=logged,
        repeated_mistakes=repeats,
        rules_at_start=len(session.rules_at_start),
        rules_at_end=len(session.rules_at_end),
        message=f"Ended session {session.id} ({minutes:.1f}min, {logged} mistakes, {repeats} repeats)",
    )
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def log_mistake(
    buildlog_dir: Path,
    error_class: str,
    description: str,
    corrected_by_rule: str | None = None,
) -> LogMistakeResult:
    """Record a mistake against the active experiment session.

    The mistake is appended to the mistakes log and flagged as a repeat
    when a similar mistake exists in another session. Every rule selected
    for the active session then receives a bandit update with reward 0.

    Args:
        buildlog_dir: Path to buildlog directory.
        error_class: Category of error.
        description: Description of the mistake.
        corrected_by_rule: Rule ID that should have prevented this.

    Returns:
        LogMistakeResult saying whether this was a repeat and of which
        prior mistake.

    Raises:
        ValueError: If there is no active session.
    """
    marker = _get_active_session_path(buildlog_dir)
    if not marker.exists():
        raise ValueError(
            "No active session - start one with 'buildlog experiment start'"
        )

    active = json.loads(marker.read_text())
    session_id = active["id"]

    stamp = datetime.now(timezone.utc)
    mistake_id = _generate_mistake_id(error_class, stamp)

    # Look for a look-alike before the new record is written to the log.
    prior = _find_similar_prior_mistake(
        description, error_class, session_id, _load_mistakes(buildlog_dir)
    )
    is_repeat = prior is not None

    record = Mistake(
        id=mistake_id,
        session_id=session_id,
        timestamp=stamp,
        error_class=error_class,
        description=description,
        semantic_hash=_compute_semantic_hash(description),
        was_repeat=is_repeat,
        corrected_by_rule=corrected_by_rule,
    )

    ledger = _get_mistakes_path(buildlog_dir)
    ledger.parent.mkdir(parents=True, exist_ok=True)
    with ledger.open("a") as sink:
        sink.write(json.dumps(record.to_dict()) + "\n")

    selected_rules = active.get("selected_rules", [])
    if selected_rules:
        sampler = ThompsonSamplingBandit(buildlog_dir / "bandit_state.jsonl")
        sampler.batch_update(
            rule_ids=selected_rules,
            reward=0.0,
            # The context comes from the session, not from this mistake.
            context=active.get("error_class") or "general",
        )

    parts = [f"Logged mistake: {error_class}"]
    if prior:
        parts.append(f" (REPEAT of {prior.id})")
    if selected_rules:
        parts.append(f" | Updated bandit: {len(selected_rules)} rules got reward=0")

    return LogMistakeResult(
        mistake_id=mistake_id,
        session_id=session_id,
        was_repeat=is_repeat,
        similar_prior=prior.id if prior else None,
        message="".join(parts),
    )
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
def log_reward(
    buildlog_dir: Path,
    outcome: Literal["accepted", "revision", "rejected"],
    rules_active: list[str] | None = None,
    revision_distance: float | None = None,
    error_class: str | None = None,
    notes: str | None = None,
    source: str | None = None,
) -> LogRewardResult:
    """Record a reward event and feed it to the bandit.

    When a session is active, ``rules_active`` and ``error_class`` default
    to that session's selected rules and error class. The event is appended
    to the reward log, and every active rule is updated with the computed
    reward value.

    Args:
        buildlog_dir: Path to buildlog directory.
        outcome: Type of feedback (accepted/revision/rejected).
        rules_active: Rule IDs in context. If None, the active session's
            selected rules are used when a session exists.
        revision_distance: How much correction was needed (0-1).
        error_class: Category of error if applicable; also the bandit
            context ("general" when unset).
        notes: Optional notes.
        source: Where this feedback came from; recorded as "manual" when
            not given.

    Returns:
        LogRewardResult with the event ID, reward value and the number of
        non-empty lines now in the reward log.
    """
    stamp = datetime.now(timezone.utc)
    reward_id = _generate_reward_id(outcome, stamp)
    reward_value = _compute_reward_value(outcome, revision_distance)

    # Fill unset arguments from the active session, if there is one.
    marker = _get_active_session_path(buildlog_dir)
    if marker.exists():
        active = json.loads(marker.read_text())
        if rules_active is None:
            rules_active = active.get("selected_rules", [])
        if error_class is None:
            error_class = active.get("error_class")

    rules = rules_active or []

    event = RewardEvent(
        id=reward_id,
        timestamp=stamp,
        outcome=outcome,
        reward_value=reward_value,
        rules_active=rules,
        revision_distance=revision_distance,
        error_class=error_class,
        notes=notes,
        source=source or "manual",
    )

    ledger = _get_rewards_path(buildlog_dir)
    ledger.parent.mkdir(parents=True, exist_ok=True)
    with ledger.open("a") as sink:
        sink.write(json.dumps(event.to_dict()) + "\n")

    if rules:
        sampler = ThompsonSamplingBandit(buildlog_dir / "bandit_state.jsonl")
        sampler.batch_update(
            rule_ids=rules,
            reward=reward_value,
            context=error_class or "general",
        )

    # Count the events on disk, including the one just written.
    total_events = 0
    if ledger.exists():
        recorded = [ln for ln in ledger.read_text().strip().split("\n") if ln]
        total_events = len(recorded)

    rules_count = len(rules)
    message = f"Logged {outcome} (reward={reward_value:.2f})"
    if rules_count > 0:
        message = message + f" | Updated bandit: {rules_count} rules"

    return LogRewardResult(
        reward_id=reward_id,
        reward_value=reward_value,
        total_events=total_events,
        message=message,
    )
|
|
463
|
+
|
|
464
|
+
|
|
465
|
+
def get_rewards(
    buildlog_dir: Path,
    limit: int | None = None,
) -> RewardSummary:
    """Get reward events with summary statistics.

    Statistics are computed over every readable event in the reward log;
    ``limit`` only trims the list of events returned. Lines that cannot be
    turned into an event are skipped.

    Args:
        buildlog_dir: Path to buildlog directory.
        limit: Maximum number of events to return (most recent first).
            None returns all of them.

    Returns:
        RewardSummary with per-outcome counts, the mean reward and the
        events sorted newest first. All zeros and an empty list when the
        reward log does not exist.
    """
    rewards_path = _get_rewards_path(buildlog_dir)

    if not rewards_path.exists():
        return RewardSummary(
            total_events=0,
            accepted=0,
            revisions=0,
            rejected=0,
            mean_reward=0.0,
            events=[],
        )

    events: list[RewardEvent] = []
    for line in rewards_path.read_text().strip().split("\n"):
        if not line:
            continue
        try:
            data = json.loads(line)
        except json.JSONDecodeError:
            continue
        # Valid JSON that is not an object (e.g. ``null`` or ``[]``) cannot
        # describe a reward event; skip it like any other malformed line.
        if not isinstance(data, dict):
            continue
        try:
            events.append(RewardEvent.from_dict(data))
        except KeyError:
            # Record is missing a required field.
            continue

    total = len(events)
    accepted = sum(1 for e in events if e.outcome == "accepted")
    revisions = sum(1 for e in events if e.outcome == "revision")
    rejected = sum(1 for e in events if e.outcome == "rejected")
    mean_reward = sum(e.reward_value for e in events) / total if total > 0 else 0.0

    # Newest first, then trim; the statistics above cover all events.
    events.sort(key=lambda e: e.timestamp, reverse=True)
    if limit is not None:
        events = events[:limit]

    return RewardSummary(
        total_events=total,
        accepted=accepted,
        revisions=revisions,
        rejected=rejected,
        mean_reward=mean_reward,
        events=events,
    )
|
|
517
|
+
|
|
518
|
+
|
|
519
|
+
def session_metrics(
    buildlog_dir: Path,
    session_id: str | None = None,
) -> SessionMetrics:
    """Compute mistake and rule metrics for one session or for all of them.

    Args:
        buildlog_dir: Path to buildlog directory.
        session_id: ID of a completed session. A falsy value (None or an
            empty string) requests aggregate metrics instead.

    Returns:
        SessionMetrics for the requested session, or with
        ``session_id="aggregate"`` covering every logged mistake. In the
        aggregate case the rule counts come from the first session's start
        and the last session's end.

    Raises:
        ValueError: If ``session_id`` is given but not in the sessions log.
    """
    all_sessions = _load_sessions(buildlog_dir)
    all_mistakes = _load_mistakes(buildlog_dir)

    if not session_id:
        mistake_count = len(all_mistakes)
        repeat_count = len([m for m in all_mistakes if m.was_repeat])

        if all_sessions:
            first_rules = all_sessions[0].rules_at_start
            last_rules = all_sessions[-1].rules_at_end
        else:
            first_rules = []
            last_rules = []

        return SessionMetrics(
            session_id="aggregate",
            total_mistakes=mistake_count,
            repeated_mistakes=repeat_count,
            repeated_mistake_rate=(
                repeat_count / mistake_count if mistake_count > 0 else 0.0
            ),
            rules_at_start=len(first_rules),
            rules_at_end=len(last_rules),
            rules_added=len(last_rules) - len(first_rules),
        )

    # First archived session with a matching ID, if any.
    found = None
    for candidate in all_sessions:
        if candidate.id == session_id:
            found = candidate
            break
    if not found:
        raise ValueError(f"Session not found: {session_id}")

    own_mistakes = [m for m in all_mistakes if m.session_id == session_id]
    mistake_count = len(own_mistakes)
    repeat_count = len([m for m in own_mistakes if m.was_repeat])

    return SessionMetrics(
        session_id=session_id,
        total_mistakes=mistake_count,
        repeated_mistakes=repeat_count,
        repeated_mistake_rate=(
            repeat_count / mistake_count if mistake_count > 0 else 0.0
        ),
        rules_at_start=len(found.rules_at_start),
        rules_at_end=len(found.rules_at_end),
        rules_added=len(found.rules_at_end) - len(found.rules_at_start),
    )
|
|
569
|
+
|
|
570
|
+
|
|
571
|
+
def experiment_report(buildlog_dir: Path) -> dict:
    """Generate a comprehensive experiment report.

    Args:
        buildlog_dir: Path to buildlog directory.

    Returns:
        Dictionary with three keys:

        * ``"summary"``: total sessions, total mistakes, total repeated
          mistakes and the overall repeat rate.
        * ``"sessions"``: one entry per completed session, in log order,
          with its mistake counts, repeat rate and net rules added.
        * ``"error_classes"``: for each error class, its ``"total"`` and
          ``"repeated"`` mistake counts.
    """
    sessions = _load_sessions(buildlog_dir)
    mistakes = _load_mistakes(buildlog_dir)

    # Tally everything in a single pass over the mistakes instead of
    # re-filtering the full list once per session.
    per_session: dict[str, list[int]] = {}  # session_id -> [total, repeated]
    error_classes: dict[str, dict] = {}
    total_repeated = 0
    for mistake in mistakes:
        is_repeat = 1 if mistake.was_repeat else 0
        total_repeated += is_repeat

        counts = per_session.setdefault(mistake.session_id, [0, 0])
        counts[0] += 1
        counts[1] += is_repeat

        bucket = error_classes.setdefault(
            mistake.error_class, {"total": 0, "repeated": 0}
        )
        bucket["total"] += 1
        bucket["repeated"] += is_repeat

    session_metrics_list = []
    for session in sessions:
        # Sessions without any logged mistake report zeros.
        total, repeated = per_session.get(session.id, (0, 0))
        session_metrics_list.append(
            {
                "session_id": session.id,
                "started_at": session.started_at.isoformat(),
                "error_class": session.error_class,
                "total_mistakes": total,
                "repeated_mistakes": repeated,
                "repeated_mistake_rate": repeated / total if total > 0 else 0.0,
                "rules_added": len(session.rules_at_end) - len(session.rules_at_start),
            }
        )

    total_mistakes = len(mistakes)

    return {
        "summary": {
            "total_sessions": len(sessions),
            "total_mistakes": total_mistakes,
            "total_repeated": total_repeated,
            "overall_repeat_rate": (
                total_repeated / total_mistakes if total_mistakes > 0 else 0.0
            ),
        },
        "sessions": session_metrics_list,
        "error_classes": error_classes,
    }
|
buildlog/engine/types.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Pure data types for the buildlog engine.
|
|
2
|
+
|
|
3
|
+
Re-exports dataclasses from their canonical locations. These are pure data
|
|
4
|
+
structures with no I/O dependencies, suitable for use in any context.
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
from buildlog.engine.types import Skill, Session, Mistake, RewardEvent
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from buildlog.confidence import ConfidenceConfig, ConfidenceMetrics
|
|
11
|
+
from buildlog.core.bandit import BetaParams
|
|
12
|
+
from buildlog.core.operations import (
|
|
13
|
+
Mistake,
|
|
14
|
+
RewardEvent,
|
|
15
|
+
RewardSummary,
|
|
16
|
+
Session,
|
|
17
|
+
SessionMetrics,
|
|
18
|
+
)
|
|
19
|
+
from buildlog.skills import Skill
|
|
20
|
+
|
|
21
|
+
__all__ = [
|
|
22
|
+
"Skill",
|
|
23
|
+
"Session",
|
|
24
|
+
"SessionMetrics",
|
|
25
|
+
"Mistake",
|
|
26
|
+
"RewardEvent",
|
|
27
|
+
"RewardSummary",
|
|
28
|
+
"BetaParams",
|
|
29
|
+
"ConfidenceMetrics",
|
|
30
|
+
"ConfidenceConfig",
|
|
31
|
+
]
|