buildlog 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
buildlog/mcp/__init__.py CHANGED
@@ -2,6 +2,7 @@
2
2
 
3
3
  from buildlog.mcp.tools import (
4
4
  buildlog_diff,
5
+ buildlog_learn_from_review,
5
6
  buildlog_promote,
6
7
  buildlog_reject,
7
8
  buildlog_status,
@@ -12,4 +13,5 @@ __all__ = [
12
13
  "buildlog_promote",
13
14
  "buildlog_reject",
14
15
  "buildlog_diff",
16
+ "buildlog_learn_from_review",
15
17
  ]
buildlog/mcp/server.py CHANGED
@@ -6,8 +6,16 @@ from mcp.server.fastmcp import FastMCP
6
6
 
7
7
  from buildlog.mcp.tools import (
8
8
  buildlog_diff,
9
+ buildlog_end_session,
10
+ buildlog_experiment_report,
11
+ buildlog_learn_from_review,
12
+ buildlog_log_mistake,
13
+ buildlog_log_reward,
9
14
  buildlog_promote,
10
15
  buildlog_reject,
16
+ buildlog_rewards,
17
+ buildlog_session_metrics,
18
+ buildlog_start_session,
11
19
  buildlog_status,
12
20
  )
13
21
 
@@ -18,6 +26,16 @@ mcp.tool()(buildlog_status)
18
26
  mcp.tool()(buildlog_promote)
19
27
  mcp.tool()(buildlog_reject)
20
28
  mcp.tool()(buildlog_diff)
29
+ mcp.tool()(buildlog_learn_from_review)
30
+ mcp.tool()(buildlog_log_reward)
31
+ mcp.tool()(buildlog_rewards)
32
+
33
+ # Session tracking tools (experiment infrastructure)
34
+ mcp.tool()(buildlog_start_session)
35
+ mcp.tool()(buildlog_end_session)
36
+ mcp.tool()(buildlog_log_mistake)
37
+ mcp.tool()(buildlog_session_metrics)
38
+ mcp.tool()(buildlog_experiment_report)
21
39
 
22
40
 
23
41
  def main() -> None:
buildlog/mcp/tools.py CHANGED
@@ -9,7 +9,20 @@ from dataclasses import asdict
9
9
  from pathlib import Path
10
10
  from typing import Literal
11
11
 
12
- from buildlog.core import diff, promote, reject, status
12
+ from buildlog.core import (
13
+ diff,
14
+ end_session,
15
+ get_experiment_report,
16
+ get_rewards,
17
+ get_session_metrics,
18
+ learn_from_review,
19
+ log_mistake,
20
+ log_reward,
21
+ promote,
22
+ reject,
23
+ start_session,
24
+ status,
25
+ )
13
26
 
14
27
 
15
28
  def _validate_skill_ids(skill_ids: list[str]) -> list[str]:
@@ -95,3 +108,300 @@ def buildlog_diff(
95
108
  """
96
109
  result = diff(Path(buildlog_dir))
97
110
  return asdict(result)
111
+
112
+
113
def buildlog_learn_from_review(
    issues: list[dict],
    source: str | None = None,
    buildlog_dir: str = "buildlog",
) -> dict:
    """Persist the learnings extracted from code review feedback.

    Intended to be called once a review loop has finished. The
    ``rule_learned`` of every issue is tracked as a learning, and a
    learning gains confidence each time it is reinforced.

    Args:
        issues: List of issue dicts shaped like:
            {
                "severity": "critical|major|minor|nitpick",
                "category": "architectural|workflow|tool_usage|domain_knowledge",
                "description": "What's wrong",
                "rule_learned": "Generalizable rule",
                "location": "file:line (optional)",
                "why_it_matters": "Why this matters (optional)",
                "functional_principle": "FP principle (optional)"
            }
        source: Optional identifier of the review (e.g., "PR#13").
        buildlog_dir: Path to the buildlog directory.

    Returns:
        Result with new_learnings, reinforced_learnings, total processed.

    Example:
        buildlog_learn_from_review(
            issues=[
                {
                    "severity": "critical",
                    "category": "architectural",
                    "description": "Score bounds not validated",
                    "rule_learned": "Validate invariants at function boundaries"
                }
            ],
            source="PR#13"
        )
    """
    # Tool arguments arrive as plain strings; the core API expects a Path.
    review_root = Path(buildlog_dir)
    outcome = learn_from_review(review_root, issues, source)
    # Flatten the core's dataclass result into a plain dict for the caller.
    return asdict(outcome)
156
+
157
+
158
+ def buildlog_log_reward(
159
+ outcome: str,
160
+ rules_active: list[str] | None = None,
161
+ revision_distance: float | None = None,
162
+ error_class: str | None = None,
163
+ notes: str | None = None,
164
+ buildlog_dir: str = "buildlog",
165
+ ) -> dict:
166
+ """Log a reward signal for bandit learning.
167
+
168
+ Call this after agent work to provide feedback on the outcome.
169
+ This enables learning which rules are effective in which contexts.
170
+
171
+ Args:
172
+ outcome: Type of feedback:
173
+ - "accepted": Work was accepted as-is (reward=1.0)
174
+ - "revision": Work needed changes (reward=1-distance)
175
+ - "rejected": Work was rejected entirely (reward=0.0)
176
+ rules_active: List of rule IDs that were in context during the work
177
+ revision_distance: How much correction was needed (0-1, 0=minor tweak, 1=complete redo)
178
+ error_class: Category of error if applicable (e.g., "missing_test", "validation_boundary")
179
+ notes: Optional notes about the feedback
180
+ buildlog_dir: Path to buildlog directory
181
+
182
+ Returns:
183
+ Dict with reward_id, reward_value, total_events
184
+
185
+ Example:
186
+ # Work was accepted
187
+ buildlog_log_reward(outcome="accepted", rules_active=["arch-123", "wf-456"])
188
+
189
+ # Work needed revision
190
+ buildlog_log_reward(
191
+ outcome="revision",
192
+ revision_distance=0.3,
193
+ error_class="missing_test",
194
+ notes="Forgot to test error path"
195
+ )
196
+
197
+ # Work was rejected
198
+ buildlog_log_reward(outcome="rejected", notes="Completely wrong approach")
199
+ """
200
+ # Validate outcome
201
+ if outcome not in ("accepted", "revision", "rejected"):
202
+ return {
203
+ "reward_id": "",
204
+ "reward_value": 0.0,
205
+ "total_events": 0,
206
+ "message": "",
207
+ "error": f"Invalid outcome: {outcome}. Must be 'accepted', 'revision', or 'rejected'",
208
+ }
209
+
210
+ result = log_reward(
211
+ Path(buildlog_dir),
212
+ outcome=outcome, # type: ignore[arg-type]
213
+ rules_active=rules_active,
214
+ revision_distance=revision_distance,
215
+ error_class=error_class,
216
+ notes=notes,
217
+ source="mcp",
218
+ )
219
+ return asdict(result)
220
+
221
+
222
def buildlog_rewards(
    limit: int | None = None,
    buildlog_dir: str = "buildlog",
) -> dict:
    """Return reward events together with summary statistics.

    Provides the recent reward events plus aggregate counts, which is
    useful for understanding learning progress.

    Args:
        limit: Maximum number of events to return (most recent first)
        buildlog_dir: Path to buildlog directory

    Returns:
        Dict with:
            - total_events: Total count of reward events
            - accepted: Count of accepted outcomes
            - revisions: Count of revision outcomes
            - rejected: Count of rejected outcomes
            - mean_reward: Average reward value
            - events: List of recent events (limited)

    Example:
        buildlog_rewards(limit=10)  # Get 10 most recent events with stats
    """
    summary = get_rewards(Path(buildlog_dir), limit)

    # Each event serializes itself through its own to_dict().
    serialized_events = [event.to_dict() for event in summary.events]

    # Assemble the response field by field instead of using asdict().
    response = {
        "total_events": summary.total_events,
        "accepted": summary.accepted,
        "revisions": summary.revisions,
        "rejected": summary.rejected,
        "mean_reward": summary.mean_reward,
        "events": serialized_events,
    }
    return response
258
+
259
+
260
+ # -----------------------------------------------------------------------------
261
+ # Session Tracking MCP Tools (Experiment Infrastructure)
262
+ # -----------------------------------------------------------------------------
263
+
264
+
265
def buildlog_start_session(
    error_class: str | None = None,
    notes: str | None = None,
    buildlog_dir: str = "buildlog",
) -> dict:
    """Open a new experiment session.

    Starts tracking for a learning experiment and captures the current
    set of active rules so learning can be measured over time.

    Args:
        error_class: Error class being targeted (e.g., "missing_test")
        notes: Notes about this session
        buildlog_dir: Path to buildlog directory

    Returns:
        Dict with session_id, error_class, rules_count, message

    Example:
        buildlog_start_session(error_class="missing_test")
    """
    session_root = Path(buildlog_dir)
    started = start_session(session_root, error_class=error_class, notes=notes)
    # Flatten the core's dataclass result into a plain dict for the caller.
    return asdict(started)
292
+
293
+
294
def buildlog_end_session(
    entry_file: str | None = None,
    notes: str | None = None,
    buildlog_dir: str = "buildlog",
) -> dict:
    """Close the current experiment session.

    Finalizes the session and calculates its metrics, including:
    - Total mistakes logged
    - Repeated mistakes (from prior sessions)
    - Rules added during session

    Args:
        entry_file: Corresponding buildlog entry file, if any
        notes: Additional notes to append
        buildlog_dir: Path to buildlog directory

    Returns:
        Dict with session_id, duration_minutes, mistakes_logged,
        repeated_mistakes, rules_at_start, rules_at_end, message

    Example:
        buildlog_end_session(entry_file="2026-01-21.md")
    """
    session_root = Path(buildlog_dir)
    finished = end_session(session_root, entry_file=entry_file, notes=notes)
    # Flatten the core's dataclass result into a plain dict for the caller.
    return asdict(finished)
324
+
325
+
326
def buildlog_log_mistake(
    error_class: str,
    description: str,
    corrected_by_rule: str | None = None,
    buildlog_dir: str = "buildlog",
) -> dict:
    """Record a mistake made during the current session.

    The mistake is stored and checked against mistakes from earlier
    sessions to see whether it is a repeat, which makes it possible to
    measure repeated-mistake rates.

    Args:
        error_class: Category of error (e.g., "missing_test")
        description: Description of the mistake
        corrected_by_rule: Rule ID that should have prevented this
        buildlog_dir: Path to buildlog directory

    Returns:
        Dict with mistake_id, session_id, was_repeat, similar_prior, message

    Example:
        buildlog_log_mistake(
            error_class="missing_test",
            description="Forgot to add unit tests for new helper function"
        )
    """
    session_root = Path(buildlog_dir)
    logged = log_mistake(
        session_root,
        error_class=error_class,
        description=description,
        corrected_by_rule=corrected_by_rule,
    )
    # Flatten the core's dataclass result into a plain dict for the caller.
    return asdict(logged)
359
+
360
+
361
def buildlog_session_metrics(
    session_id: str | None = None,
    buildlog_dir: str = "buildlog",
) -> dict:
    """Fetch metrics for one session or for all sessions.

    Reports mistake rates and rule changes for analysis.

    Args:
        session_id: Specific session ID, or None for aggregate metrics
        buildlog_dir: Path to buildlog directory

    Returns:
        Dict with session_id, total_mistakes, repeated_mistakes,
        repeated_mistake_rate, rules_at_start, rules_at_end, rules_added

    Example:
        buildlog_session_metrics()  # Aggregate metrics
        buildlog_session_metrics(session_id="session-20260121-140000")
    """
    session_root = Path(buildlog_dir)
    metrics = get_session_metrics(session_root, session_id=session_id)
    # Flatten the core's dataclass result into a plain dict for the caller.
    return asdict(metrics)
386
+
387
+
388
def buildlog_experiment_report(
    buildlog_dir: str = "buildlog",
) -> dict:
    """Produce a comprehensive experiment report.

    Covers summary statistics, a per-session breakdown, and an analysis
    by error class.

    Args:
        buildlog_dir: Path to buildlog directory

    Returns:
        Dict with:
            - summary: Overall statistics
            - sessions: Per-session breakdown
            - error_classes: Breakdown by error class

    Example:
        buildlog_experiment_report()
    """
    report_root = Path(buildlog_dir)
    # The core's return value is passed through unchanged (no asdict()).
    report = get_experiment_report(report_root)
    return report
buildlog/skills.py CHANGED
@@ -33,6 +33,18 @@ from buildlog.embeddings import EmbeddingBackend, get_backend, get_default_backe
33
33
 
34
34
  logger = logging.getLogger(__name__)
35
35
 
36
+
37
def _load_review_learnings(buildlog_dir: Path) -> dict:
    """Load review learnings from .buildlog/review_learnings.json.

    This is a best-effort loader: a missing, unreadable, undecodable or
    malformed file never raises. In every such case an empty payload
    ``{"learnings": {}}`` is returned so callers can always use
    ``.get("learnings", {})`` and iterate the result as a mapping.

    Args:
        buildlog_dir: Directory that contains the ``.buildlog`` folder.

    Returns:
        The parsed JSON object, or ``{"learnings": {}}`` when the file is
        absent or unusable. A fresh dict is returned on every call.
    """
    learnings_path = buildlog_dir / ".buildlog" / "review_learnings.json"
    log = logging.getLogger(__name__)

    try:
        # Read first and handle the failure (EAFP) instead of exists() +
        # read, which races with concurrent deletion. The encoding is left
        # at the platform default, as before, so it stays in step with
        # whatever writes this file (writer not visible here - TODO confirm
        # and pin to UTF-8 on both sides).
        raw_text = learnings_path.read_text()
    except FileNotFoundError:
        # No review learnings recorded yet - the normal, silent case.
        return {"learnings": {}}
    except (OSError, ValueError) as exc:
        # ValueError covers UnicodeDecodeError, which read_text() raises
        # for undecodable bytes and which is not an OSError.
        log.warning("Could not read review learnings %s: %s", learnings_path, exc)
        return {"learnings": {}}

    try:
        payload = json.loads(raw_text)
    except ValueError as exc:  # json.JSONDecodeError subclasses ValueError
        log.warning("Ignoring malformed review learnings %s: %s", learnings_path, exc)
        return {"learnings": {}}

    # Valid JSON can still have the wrong shape (e.g. a top-level list, or
    # "learnings" being a list); the caller needs mappings at both levels.
    if not isinstance(payload, dict) or not isinstance(
        payload.get("learnings", {}), dict
    ):
        log.warning("Ignoring review learnings with unexpected shape: %s", learnings_path)
        return {"learnings": {}}

    return payload
46
+
47
+
36
48
  # Configuration constants
37
49
  MIN_SIMILARITY_THRESHOLD: Final[float] = 0.7
38
50
  HIGH_CONFIDENCE_FREQUENCY: Final[int] = 3
@@ -398,8 +410,9 @@ def generate_skills(
398
410
  since_date: date | None = None,
399
411
  embedding_backend: str | None = None,
400
412
  confidence_config: ConfidenceConfig | None = None,
413
+ include_review_learnings: bool = True,
401
414
  ) -> SkillSet:
402
- """Generate skills from buildlog patterns.
415
+ """Generate skills from buildlog patterns and review learnings.
403
416
 
404
417
  Args:
405
418
  buildlog_dir: Path to the buildlog directory.
@@ -410,6 +423,9 @@ def generate_skills(
410
423
  confidence_config: Configuration for continuous confidence scoring.
411
424
  If provided, skills will include confidence_score and confidence_tier.
412
425
  If None, only discrete confidence levels (high/medium/low) are computed.
426
+ include_review_learnings: Whether to include learnings from code reviews.
427
+ When True, loads .buildlog/review_learnings.json and merges
428
+ review learnings into the skill set.
413
429
 
414
430
  Returns:
415
431
  SkillSet with generated skills.
@@ -468,6 +484,111 @@ def generate_skills(
468
484
  skills.sort(key=lambda s: (-s.frequency, s.rule))
469
485
  skills_by_category[category] = skills
470
486
 
487
+ # Merge review learnings if requested
488
+ if include_review_learnings:
489
+ review_data = _load_review_learnings(buildlog_dir)
490
+ learnings = review_data.get("learnings", {})
491
+
492
+ for learning_id, learning_dict in learnings.items():
493
+ category = learning_dict.get("category", "workflow")
494
+ rule = learning_dict.get("rule", "")
495
+
496
+ if not rule:
497
+ continue
498
+
499
+ # Parse timestamps for confidence calculation
500
+ first_seen_str = learning_dict.get("first_seen", "")
501
+ last_reinforced_str = learning_dict.get("last_reinforced", "")
502
+
503
+ try:
504
+ first_seen = datetime.fromisoformat(first_seen_str)
505
+ if first_seen.tzinfo is None:
506
+ first_seen = first_seen.replace(tzinfo=timezone.utc)
507
+ except (ValueError, TypeError):
508
+ first_seen = datetime.now(timezone.utc)
509
+
510
+ try:
511
+ last_reinforced = datetime.fromisoformat(last_reinforced_str)
512
+ if last_reinforced.tzinfo is None:
513
+ last_reinforced = last_reinforced.replace(tzinfo=timezone.utc)
514
+ except (ValueError, TypeError):
515
+ last_reinforced = datetime.now(timezone.utc)
516
+
517
+ # Get frequency from reinforcement count
518
+ frequency = learning_dict.get("reinforcement_count", 1)
519
+
520
+ # Check for duplicate rules in existing skills (by ID match)
521
+ existing_skill = None
522
+ if category in skills_by_category:
523
+ for skill in skills_by_category[category]:
524
+ if skill.id == learning_id:
525
+ existing_skill = skill
526
+ break
527
+
528
+ if existing_skill is not None:
529
+ # Merge: boost the existing skill's frequency
530
+ existing_skill = Skill(
531
+ id=existing_skill.id,
532
+ category=existing_skill.category,
533
+ rule=existing_skill.rule,
534
+ frequency=existing_skill.frequency + frequency,
535
+ confidence=existing_skill.confidence,
536
+ sources=existing_skill.sources
537
+ + [learning_dict.get("source", "review")],
538
+ tags=existing_skill.tags,
539
+ confidence_score=existing_skill.confidence_score,
540
+ confidence_tier=existing_skill.confidence_tier,
541
+ )
542
+ # Replace in list
543
+ skills_by_category[category] = [
544
+ existing_skill if s.id == existing_skill.id else s
545
+ for s in skills_by_category[category]
546
+ ]
547
+ else:
548
+ # Create new skill from review learning
549
+ review_conf_score: float | None = None
550
+ review_conf_tier: str | None = None
551
+
552
+ if confidence_config is not None and t_now is not None:
553
+ metrics = ConfidenceMetrics(
554
+ reinforcement_count=frequency,
555
+ last_reinforced=last_reinforced,
556
+ contradiction_count=learning_dict.get("contradiction_count", 0),
557
+ first_seen=first_seen,
558
+ )
559
+ review_conf_score = calculate_continuous_confidence(
560
+ metrics, confidence_config, t_now
561
+ )
562
+ review_conf_tier = get_confidence_tier(
563
+ review_conf_score, confidence_config
564
+ ).value
565
+
566
+ # Calculate discrete confidence from most recent date
567
+ discrete_confidence = _calculate_confidence(
568
+ frequency, last_reinforced.date()
569
+ )
570
+
571
+ skill = Skill(
572
+ id=learning_id,
573
+ category=category,
574
+ rule=rule,
575
+ frequency=frequency,
576
+ confidence=discrete_confidence,
577
+ sources=[learning_dict.get("source", "review")],
578
+ tags=_extract_tags(rule),
579
+ confidence_score=review_conf_score,
580
+ confidence_tier=review_conf_tier,
581
+ )
582
+
583
+ # Add to category
584
+ if category not in skills_by_category:
585
+ skills_by_category[category] = []
586
+ skills_by_category[category].append(skill)
587
+
588
+ # Re-sort categories after adding review learnings
589
+ for category in skills_by_category:
590
+ skills_by_category[category].sort(key=lambda s: (-s.frequency, s.rule))
591
+
471
592
  return SkillSet(
472
593
  generated_at=datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
473
594
  source_entries=result.entry_count,