buildlog 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,12 +6,14 @@ MCP, CLI, HTTP, or any other interface.
 
 from __future__ import annotations
 
+import hashlib
 import json
 from dataclasses import dataclass, field
-from datetime import datetime
+from datetime import datetime, timezone
 from pathlib import Path
-from typing import Literal
+from typing import Literal, TypedDict
 
+from buildlog.confidence import ConfidenceMetrics, merge_confidence_metrics
 from buildlog.render import get_renderer
 from buildlog.skills import Skill, SkillSet, generate_skills
 
@@ -20,11 +22,33 @@ __all__ = [
     "PromoteResult",
     "RejectResult",
     "DiffResult",
+    "ReviewIssue",
+    "ReviewLearning",
+    "LearnFromReviewResult",
+    "RewardEvent",
+    "LogRewardResult",
+    "RewardSummary",
+    # Session tracking (experiment infrastructure)
+    "Session",
+    "Mistake",
+    "SessionMetrics",
+    "StartSessionResult",
+    "EndSessionResult",
+    "LogMistakeResult",
     "status",
     "promote",
     "reject",
     "diff",
     "find_skills_by_ids",
+    "learn_from_review",
+    "log_reward",
+    "get_rewards",
+    # Session tracking operations
+    "start_session",
+    "end_session",
+    "log_mistake",
+    "get_session_metrics",
+    "get_experiment_report",
 ]
 
 
@@ -108,6 +132,302 @@ class DiffResult:
     """Error message if operation failed."""
 
 
+# -----------------------------------------------------------------------------
+# Review Learning Data Structures
+# -----------------------------------------------------------------------------
+
+
+class ReviewIssueDict(TypedDict, total=False):
+    """Serializable form of ReviewIssue."""
+
+    severity: str
+    category: str
+    description: str
+    rule_learned: str
+    location: str | None
+    why_it_matters: str | None
+    functional_principle: str | None
+
+
+@dataclass
+class ReviewIssue:
+    """A single issue identified during code review.
+
+    Attributes:
+        severity: How serious the issue is (critical/major/minor/nitpick).
+        category: What kind of issue (architectural/workflow/tool_usage/domain_knowledge).
+        description: What's wrong (concrete).
+        rule_learned: The generalizable rule extracted from this issue.
+        location: File:line where the issue was found.
+        why_it_matters: Why this issue matters (consequences).
+        functional_principle: Related FP principle, if applicable.
+    """
+
+    severity: Literal["critical", "major", "minor", "nitpick"]
+    category: Literal["architectural", "workflow", "tool_usage", "domain_knowledge"]
+    description: str
+    rule_learned: str
+    location: str | None = None
+    why_it_matters: str | None = None
+    functional_principle: str | None = None
+
+    @classmethod
+    def from_dict(cls, data: dict) -> "ReviewIssue":
+        """Construct from dictionary (e.g., from JSON)."""
+        return cls(
+            severity=data.get("severity", "minor"),
+            category=data.get("category", "workflow"),
+            description=data.get("description", ""),
+            rule_learned=data.get("rule_learned", ""),
+            location=data.get("location"),
+            why_it_matters=data.get("why_it_matters"),
+            functional_principle=data.get("functional_principle"),
+        )
+
+
+class ReviewLearningDict(TypedDict, total=False):
+    """Serializable form of ReviewLearning."""
+
+    id: str
+    rule: str
+    category: str
+    severity: str
+    source: str
+    first_seen: str
+    last_reinforced: str
+    reinforcement_count: int
+    contradiction_count: int
+    functional_principle: str | None
+
+
+@dataclass
+class ReviewLearning:
+    """A learning extracted from review, with confidence tracking.
+
+    Attributes:
+        id: Deterministic hash of rule_learned (category prefix + hash).
+        rule: The generalizable rule text.
+        category: Category of the learning.
+        severity: Severity of the original issue.
+        source: Where this learning came from (e.g., "review:PR#13").
+        first_seen: When this rule was first identified.
+        last_reinforced: When this rule was last seen/reinforced.
+        reinforcement_count: How many times this rule has been seen.
+        contradiction_count: How many times this rule was contradicted.
+        functional_principle: Related FP principle, if applicable.
+    """
+
+    id: str
+    rule: str
+    category: str
+    severity: str
+    source: str
+    first_seen: datetime
+    last_reinforced: datetime
+    reinforcement_count: int = 1
+    contradiction_count: int = 0
+    functional_principle: str | None = None
+
+    def to_confidence_metrics(self) -> ConfidenceMetrics:
+        """Convert to ConfidenceMetrics for scoring."""
+        return ConfidenceMetrics(
+            reinforcement_count=self.reinforcement_count,
+            last_reinforced=self.last_reinforced,
+            contradiction_count=self.contradiction_count,
+            first_seen=self.first_seen,
+        )
+
+    def to_dict(self) -> ReviewLearningDict:
+        """Convert to serializable dictionary."""
+        result: ReviewLearningDict = {
+            "id": self.id,
+            "rule": self.rule,
+            "category": self.category,
+            "severity": self.severity,
+            "source": self.source,
+            "first_seen": self.first_seen.isoformat(),
+            "last_reinforced": self.last_reinforced.isoformat(),
+            "reinforcement_count": self.reinforcement_count,
+            "contradiction_count": self.contradiction_count,
+        }
+        if self.functional_principle:
+            result["functional_principle"] = self.functional_principle
+        return result
+
+    @classmethod
+    def from_dict(cls, data: ReviewLearningDict) -> "ReviewLearning":
+        """Reconstruct from serialized dictionary."""
+        first_seen = datetime.fromisoformat(data["first_seen"])
+        last_reinforced = datetime.fromisoformat(data["last_reinforced"])
+
+        # Ensure timezone awareness
+        if first_seen.tzinfo is None:
+            first_seen = first_seen.replace(tzinfo=timezone.utc)
+        if last_reinforced.tzinfo is None:
+            last_reinforced = last_reinforced.replace(tzinfo=timezone.utc)
+
+        return cls(
+            id=data["id"],
+            rule=data["rule"],
+            category=data["category"],
+            severity=data["severity"],
+            source=data["source"],
+            first_seen=first_seen,
+            last_reinforced=last_reinforced,
+            reinforcement_count=data.get("reinforcement_count", 1),
+            contradiction_count=data.get("contradiction_count", 0),
+            functional_principle=data.get("functional_principle"),
+        )
+
+
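
Note: a minimal round-trip sketch for the new ReviewLearning class. The module path and field values are illustrative, not from the package; the point is that from_dict coerces naive ISO timestamps to UTC and fills in the count defaults:

    from buildlog.operations import ReviewLearning  # assumed module path

    record = {
        "id": "wf-0123456789",
        "rule": "Every bug fix ships with a regression test",
        "category": "workflow",
        "severity": "major",
        "source": "review:PR#13",
        "first_seen": "2024-01-01T00:00:00",       # naive -> coerced to UTC
        "last_reinforced": "2024-01-02T00:00:00",
    }
    learning = ReviewLearning.from_dict(record)
    assert learning.first_seen.tzinfo is not None           # timezone-aware after load
    assert learning.to_dict()["reinforcement_count"] == 1   # default applied
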
+@dataclass
+class LearnFromReviewResult:
+    """Result of learning from a review.
+
+    Attributes:
+        new_learnings: IDs of newly created learnings.
+        reinforced_learnings: IDs of existing learnings that were reinforced.
+        total_issues_processed: Total number of issues processed.
+        source: Review source identifier.
+        message: Human-readable summary.
+        error: Error message if operation failed.
+    """
+
+    new_learnings: list[str]
+    reinforced_learnings: list[str]
+    total_issues_processed: int
+    source: str
+    message: str = ""
+    error: str | None = None
+
+
+# -----------------------------------------------------------------------------
+# Reward Signal Data Structures (for Bandit Learning)
+# -----------------------------------------------------------------------------
+
+
+class RewardEventDict(TypedDict, total=False):
+    """Serializable form of RewardEvent."""
+
+    id: str
+    timestamp: str
+    outcome: str  # "accepted" | "revision" | "rejected"
+    reward_value: float
+    rules_active: list[str]
+    revision_distance: float | None
+    error_class: str | None
+    notes: str | None
+    source: str | None
+
+
+@dataclass
+class RewardEvent:
+    """A single reward/feedback event for bandit learning.
+
+    This tracks human feedback on agent work to enable learning
+    which rules are effective in which contexts.
+
+    Attributes:
+        id: Unique identifier for this event.
+        timestamp: When the feedback was recorded.
+        outcome: The feedback type (accepted/revision/rejected).
+        reward_value: Numeric reward (1.0=accepted, 0=rejected, in between for revision).
+        rules_active: IDs of rules that were in context when work was done.
+        revision_distance: How much correction was needed (0-1, lower is better).
+        error_class: Category of error if applicable.
+        notes: Optional notes about the feedback.
+        source: Where this feedback came from (manual, review_loop, etc.).
+    """
+
+    id: str
+    timestamp: datetime
+    outcome: Literal["accepted", "revision", "rejected"]
+    reward_value: float
+    rules_active: list[str] = field(default_factory=list)
+    revision_distance: float | None = None
+    error_class: str | None = None
+    notes: str | None = None
+    source: str | None = None
+
+    def to_dict(self) -> RewardEventDict:
+        """Convert to serializable dictionary."""
+        result: RewardEventDict = {
+            "id": self.id,
+            "timestamp": self.timestamp.isoformat(),
+            "outcome": self.outcome,
+            "reward_value": self.reward_value,
+            "rules_active": self.rules_active,
+        }
+        if self.revision_distance is not None:
+            result["revision_distance"] = self.revision_distance
+        if self.error_class is not None:
+            result["error_class"] = self.error_class
+        if self.notes is not None:
+            result["notes"] = self.notes
+        if self.source is not None:
+            result["source"] = self.source
+        return result
+
+    @classmethod
+    def from_dict(cls, data: RewardEventDict) -> "RewardEvent":
+        """Reconstruct from serialized dictionary."""
+        timestamp = datetime.fromisoformat(data["timestamp"])
+        if timestamp.tzinfo is None:
+            timestamp = timestamp.replace(tzinfo=timezone.utc)
+
+        return cls(
+            id=data["id"],
+            timestamp=timestamp,
+            outcome=data["outcome"],  # type: ignore[arg-type]
+            reward_value=data["reward_value"],
+            rules_active=data.get("rules_active", []),
+            revision_distance=data.get("revision_distance"),
+            error_class=data.get("error_class"),
+            notes=data.get("notes"),
+            source=data.get("source"),
+        )
+
+
+@dataclass
+class LogRewardResult:
+    """Result of logging a reward event.
+
+    Attributes:
+        reward_id: ID of the logged reward event.
+        reward_value: The computed reward value.
+        total_events: Total reward events logged so far.
+        message: Human-readable confirmation.
+        error: Error message if operation failed.
+    """
+
+    reward_id: str
+    reward_value: float
+    total_events: int
+    message: str = ""
+    error: str | None = None
+
+
+@dataclass
+class RewardSummary:
+    """Summary statistics for reward events.
+
+    Attributes:
+        total_events: Total number of reward events.
+        accepted: Count of accepted outcomes.
+        revisions: Count of revision outcomes.
+        rejected: Count of rejected outcomes.
+        mean_reward: Average reward value across all events.
+        events: List of reward events (limited by query).
+    """
+
+    total_events: int
+    accepted: int
+    revisions: int
+    rejected: int
+    mean_reward: float
+    events: list[RewardEvent] = field(default_factory=list)
+
+
 def _get_rejected_path(buildlog_dir: Path) -> Path:
     """Get path to rejected.json file."""
     return buildlog_dir / ".buildlog" / "rejected.json"
@@ -386,3 +706,949 @@ def diff(
         already_promoted=len(promoted_ids),
         already_rejected=len(rejected_ids),
     )
+
+
+# -----------------------------------------------------------------------------
+# Review Learning Operations
+# -----------------------------------------------------------------------------
+
+
+def _get_learnings_path(buildlog_dir: Path) -> Path:
+    """Get path to review_learnings.json file."""
+    return buildlog_dir / ".buildlog" / "review_learnings.json"
+
+
+def _generate_learning_id(category: str, rule: str) -> str:
+    """Generate deterministic ID for a learning.
+
+    Uses category prefix + first 10 chars of SHA256 hash.
+    """
+    # Normalize: lowercase, strip whitespace
+    normalized = rule.lower().strip()
+    hash_input = f"{category}:{normalized}".encode("utf-8")
+    hash_hex = hashlib.sha256(hash_input).hexdigest()[:10]
+
+    # Category prefix mapping
+    prefix_map = {
+        "architectural": "arch",
+        "workflow": "wf",
+        "tool_usage": "tool",
+        "domain_knowledge": "dom",
+    }
+    prefix = prefix_map.get(category, category[:4])
+
+    return f"{prefix}-{hash_hex}"
+
+
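
Note: a worked example of the ID scheme above, as a standalone sketch that mirrors _generate_learning_id (the digest is whatever SHA-256 yields; the point is that case and surrounding whitespace do not change the ID):

    import hashlib

    def learning_id(category: str, rule: str) -> str:
        # Same normalization and prefix rules as _generate_learning_id.
        normalized = rule.lower().strip()
        digest = hashlib.sha256(f"{category}:{normalized}".encode("utf-8")).hexdigest()[:10]
        prefix = {"architectural": "arch", "workflow": "wf",
                  "tool_usage": "tool", "domain_knowledge": "dom"}.get(category, category[:4])
        return f"{prefix}-{digest}"

    assert learning_id("workflow", "Add tests first") == learning_id("workflow", "  add tests FIRST ")
    assert learning_id("workflow", "Add tests first").startswith("wf-")
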
+def _load_learnings(path: Path) -> dict:
+    """Load learnings from JSON file."""
+    if not path.exists():
+        return {"learnings": {}, "review_history": []}
+    try:
+        return json.loads(path.read_text())
+    except (json.JSONDecodeError, OSError):
+        return {"learnings": {}, "review_history": []}
+
+
+def _save_learnings(path: Path, data: dict) -> None:
+    """Save learnings to JSON file."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(json.dumps(data, indent=2))
+
+
+def learn_from_review(
+    buildlog_dir: Path,
+    issues: list[dict],
+    source: str | None = None,
+) -> LearnFromReviewResult:
+    """Capture learnings from a code review and update confidence metrics.
+
+    For each issue:
+    1. Generate deterministic ID from rule text
+    2. If exists: reinforce (increment count, update timestamp)
+    3. If new: create ReviewLearning with initial metrics
+    4. Persist to .buildlog/review_learnings.json
+
+    Args:
+        buildlog_dir: Path to buildlog directory.
+        issues: List of review issues with rule_learned field.
+        source: Optional source identifier (defaults to timestamp).
+
+    Returns:
+        LearnFromReviewResult with new/reinforced learning IDs.
+    """
+    if not issues:
+        return LearnFromReviewResult(
+            new_learnings=[],
+            reinforced_learnings=[],
+            total_issues_processed=0,
+            source=source or "",
+            error="No issues provided",
+        )
+
+    # Default source to timestamp
+    now = datetime.now(timezone.utc)
+    if source is None:
+        source = f"review:{now.isoformat()}"
+    elif not source.startswith("review:"):
+        source = f"review:{source}"
+
+    learnings_path = _get_learnings_path(buildlog_dir)
+    data = _load_learnings(learnings_path)
+
+    new_ids: list[str] = []
+    reinforced_ids: list[str] = []
+    processed = 0
+
+    for issue_dict in issues:
+        # Skip issues without rule_learned
+        rule = issue_dict.get("rule_learned", "").strip()
+        if not rule:
+            continue
+
+        # Parse issue
+        issue = ReviewIssue.from_dict(issue_dict)
+        learning_id = _generate_learning_id(issue.category, rule)
+
+        if learning_id in data["learnings"]:
+            # Reinforce existing learning
+            existing_data = data["learnings"][learning_id]
+            existing = ReviewLearning.from_dict(existing_data)
+
+            # Use merge_confidence_metrics pattern
+            updated_metrics = merge_confidence_metrics(
+                existing.to_confidence_metrics(), now
+            )
+
+            # Update the learning
+            existing_data["last_reinforced"] = now.isoformat()
+            existing_data["reinforcement_count"] = updated_metrics.reinforcement_count
+            reinforced_ids.append(learning_id)
+        else:
+            # Create new learning
+            learning = ReviewLearning(
+                id=learning_id,
+                rule=rule,
+                category=issue.category,
+                severity=issue.severity,
+                source=source,
+                first_seen=now,
+                last_reinforced=now,
+                reinforcement_count=1,
+                contradiction_count=0,
+                functional_principle=issue.functional_principle,
+            )
+            data["learnings"][learning_id] = learning.to_dict()
+            new_ids.append(learning_id)
+
+        processed += 1
+
+    # Record in review history
+    data["review_history"].append(
+        {
+            "timestamp": now.isoformat(),
+            "source": source,
+            "issues_count": processed,
+            "new_learning_ids": new_ids,
+            "reinforced_learning_ids": reinforced_ids,
+        }
+    )
+
+    # Persist
+    _save_learnings(learnings_path, data)
+
+    # Build message
+    msg_parts = []
+    if new_ids:
+        msg_parts.append(f"{len(new_ids)} new learning(s)")
+    if reinforced_ids:
+        msg_parts.append(f"{len(reinforced_ids)} reinforced")
+    message = ", ".join(msg_parts) if msg_parts else "No learnings captured"
+
+    return LearnFromReviewResult(
+        new_learnings=new_ids,
+        reinforced_learnings=reinforced_ids,
+        total_issues_processed=processed,
+        source=source,
+        message=message,
+    )
+
+
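
Note: a hypothetical call to learn_from_review (the path and issue payload are illustrative). On first run the rule is new; calling again with the same rule text reinforces it instead:

    from pathlib import Path

    result = learn_from_review(
        Path("."),
        issues=[{
            "severity": "major",
            "category": "workflow",
            "description": "Bug fix landed without a regression test",
            "rule_learned": "Every bug fix ships with a regression test",
        }],
        source="PR#13",  # stored as "review:PR#13"
    )
    print(result.message)  # "1 new learning(s)" first time, "1 reinforced" after
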
+# -----------------------------------------------------------------------------
+# Reward Signal Operations (for Bandit Learning)
+# -----------------------------------------------------------------------------
+
+
+def _get_rewards_path(buildlog_dir: Path) -> Path:
+    """Get path to reward_events.jsonl file."""
+    return buildlog_dir / ".buildlog" / "reward_events.jsonl"
+
+
+def _generate_reward_id(outcome: str, timestamp: datetime) -> str:
+    """Generate unique ID for a reward event.
+
+    Uses outcome + timestamp to ensure uniqueness while allowing
+    multiple events with the same outcome.
+    """
+    ts_str = timestamp.isoformat()
+    normalized = f"{outcome}:{ts_str}"
+    hash_hex = hashlib.sha256(normalized.encode("utf-8")).hexdigest()[:10]
+    return f"rew-{hash_hex}"
+
+
+def _compute_reward_value(
+    outcome: Literal["accepted", "revision", "rejected"],
+    revision_distance: float | None,
+) -> float:
+    """Compute numeric reward from outcome.
+
+    Args:
+        outcome: The feedback type.
+        revision_distance: How much correction needed (0-1).
+
+    Returns:
+        Reward value in [0, 1].
+        - accepted: 1.0
+        - rejected: 0.0
+        - revision: 1.0 - distance (default distance 0.5 if not provided)
+    """
+    if outcome == "accepted":
+        return 1.0
+    elif outcome == "rejected":
+        return 0.0
+    else:  # revision
+        distance = revision_distance if revision_distance is not None else 0.5
+        return max(0.0, min(1.0, 1.0 - distance))
+
+
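
Note: concretely, the mapping above gives the following (distance values chosen to be exact in binary floating point):

    assert _compute_reward_value("accepted", None) == 1.0
    assert _compute_reward_value("rejected", None) == 0.0
    assert _compute_reward_value("revision", None) == 0.5    # default distance 0.5
    assert _compute_reward_value("revision", 0.25) == 0.75   # light edits, high reward
    assert _compute_reward_value("revision", 1.5) == 0.0     # clamped to [0, 1]
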
+def log_reward(
+    buildlog_dir: Path,
+    outcome: Literal["accepted", "revision", "rejected"],
+    rules_active: list[str] | None = None,
+    revision_distance: float | None = None,
+    error_class: str | None = None,
+    notes: str | None = None,
+    source: str | None = None,
+) -> LogRewardResult:
+    """Log a reward event for bandit learning.
+
+    Appends to reward_events.jsonl for later analysis.
+
+    Args:
+        buildlog_dir: Path to buildlog directory.
+        outcome: Type of feedback (accepted/revision/rejected).
+        rules_active: List of rule IDs that were in context.
+        revision_distance: How much correction was needed (0-1, for revisions).
+        error_class: Category of error if applicable.
+        notes: Optional notes about the feedback.
+        source: Where this feedback came from.
+
+    Returns:
+        LogRewardResult with confirmation.
+    """
+    now = datetime.now(timezone.utc)
+    reward_id = _generate_reward_id(outcome, now)
+    reward_value = _compute_reward_value(outcome, revision_distance)
+
+    event = RewardEvent(
+        id=reward_id,
+        timestamp=now,
+        outcome=outcome,
+        reward_value=reward_value,
+        rules_active=rules_active or [],
+        revision_distance=revision_distance,
+        error_class=error_class,
+        notes=notes,
+        source=source or "manual",
+    )
+
+    # Append to JSONL file
+    rewards_path = _get_rewards_path(buildlog_dir)
+    rewards_path.parent.mkdir(parents=True, exist_ok=True)
+
+    with open(rewards_path, "a") as f:
+        f.write(json.dumps(event.to_dict()) + "\n")
+
+    # Count total events
+    total_events = 0
+    if rewards_path.exists():
+        total_events = sum(
+            1 for line in rewards_path.read_text().strip().split("\n") if line
+        )
+
+    return LogRewardResult(
+        reward_id=reward_id,
+        reward_value=reward_value,
+        total_events=total_events,
+        message=f"Logged {outcome} (reward={reward_value:.2f})",
+    )
+
+
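
Note: a hypothetical log_reward call (the rule IDs are illustrative); a revision distance of 0.25 yields the 0.75 reward shown in the message:

    from pathlib import Path

    result = log_reward(
        Path("."),
        outcome="revision",
        rules_active=["wf-ab12cd34ef", "arch-0011223344"],
        revision_distance=0.25,
        source="review_loop",
    )
    print(result.message)  # "Logged revision (reward=0.75)"
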
+def get_rewards(
+    buildlog_dir: Path,
+    limit: int | None = None,
+) -> RewardSummary:
+    """Get reward events with summary statistics.
+
+    Args:
+        buildlog_dir: Path to buildlog directory.
+        limit: Maximum number of events to return (most recent first).
+
+    Returns:
+        RewardSummary with events and statistics.
+    """
+    rewards_path = _get_rewards_path(buildlog_dir)
+
+    if not rewards_path.exists():
+        return RewardSummary(
+            total_events=0,
+            accepted=0,
+            revisions=0,
+            rejected=0,
+            mean_reward=0.0,
+            events=[],
+        )
+
+    # Parse all events
+    events: list[RewardEvent] = []
+    for line in rewards_path.read_text().strip().split("\n"):
+        if line:
+            try:
+                data = json.loads(line)
+                events.append(RewardEvent.from_dict(data))
+            except (json.JSONDecodeError, KeyError):
+                continue  # Skip malformed lines
+
+    # Calculate statistics
+    total = len(events)
+    accepted = sum(1 for e in events if e.outcome == "accepted")
+    revisions = sum(1 for e in events if e.outcome == "revision")
+    rejected = sum(1 for e in events if e.outcome == "rejected")
+    mean_reward = sum(e.reward_value for e in events) / total if total > 0 else 0.0
+
+    # Sort by timestamp (most recent first) and limit
+    events.sort(key=lambda e: e.timestamp, reverse=True)
+    if limit is not None:
+        events = events[:limit]
+
+    return RewardSummary(
+        total_events=total,
+        accepted=accepted,
+        revisions=revisions,
+        rejected=rejected,
+        mean_reward=mean_reward,
+        events=events,
+    )
+
+
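
Note: a usage sketch for get_rewards. The counts and mean_reward always cover every event on disk; only the events list is truncated by limit:

    from pathlib import Path

    summary = get_rewards(Path("."), limit=5)
    print(f"{summary.accepted} accepted / {summary.revisions} revisions / "
          f"{summary.rejected} rejected, mean reward {summary.mean_reward:.2f}")
    for event in summary.events:  # most recent first
        print(event.timestamp.isoformat(), event.outcome, event.reward_value)
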
+# -----------------------------------------------------------------------------
+# Session Tracking Data Structures (for Experimental Infrastructure)
+# -----------------------------------------------------------------------------
+
+
+class SessionDict(TypedDict, total=False):
+    """Serializable form of Session."""
+
+    id: str
+    started_at: str
+    ended_at: str | None
+    entry_file: str | None
+    rules_at_start: list[str]
+    rules_at_end: list[str]
+    error_class: str | None
+    notes: str | None
+
+
+@dataclass
+class Session:
+    """A coding session for experiment tracking.
+
+    Tracks the state of rules before and after a session to measure
+    learning effectiveness.
+
+    Attributes:
+        id: Unique identifier for this session.
+        started_at: When the session started.
+        ended_at: When the session ended (None if still active).
+        entry_file: Corresponding buildlog entry file, if any.
+        rules_at_start: Rule IDs active at session start.
+        rules_at_end: Rule IDs active at session end.
+        error_class: Error class being targeted (e.g., "missing_test").
+        notes: Optional notes about the session.
+    """
+
+    id: str
+    started_at: datetime
+    ended_at: datetime | None = None
+    entry_file: str | None = None
+    rules_at_start: list[str] = field(default_factory=list)
+    rules_at_end: list[str] = field(default_factory=list)
+    error_class: str | None = None
+    notes: str | None = None
+
+    def to_dict(self) -> SessionDict:
+        """Convert to serializable dictionary."""
+        result: SessionDict = {
+            "id": self.id,
+            "started_at": self.started_at.isoformat(),
+            "ended_at": self.ended_at.isoformat() if self.ended_at else None,
+            "rules_at_start": self.rules_at_start,
+            "rules_at_end": self.rules_at_end,
+        }
+        if self.entry_file is not None:
+            result["entry_file"] = self.entry_file
+        if self.error_class is not None:
+            result["error_class"] = self.error_class
+        if self.notes is not None:
+            result["notes"] = self.notes
+        return result
+
+    @classmethod
+    def from_dict(cls, data: SessionDict) -> "Session":
+        """Reconstruct from serialized dictionary."""
+        started_at = datetime.fromisoformat(data["started_at"])
+        if started_at.tzinfo is None:
+            started_at = started_at.replace(tzinfo=timezone.utc)
+
+        ended_at = None
+        ended_at_str = data.get("ended_at")
+        if ended_at_str:
+            ended_at = datetime.fromisoformat(ended_at_str)
+            if ended_at.tzinfo is None:
+                ended_at = ended_at.replace(tzinfo=timezone.utc)
+
+        return cls(
+            id=data["id"],
+            started_at=started_at,
+            ended_at=ended_at,
+            entry_file=data.get("entry_file"),
+            rules_at_start=data.get("rules_at_start", []),
+            rules_at_end=data.get("rules_at_end", []),
+            error_class=data.get("error_class"),
+            notes=data.get("notes"),
+        )
+
+
+class MistakeDict(TypedDict, total=False):
+    """Serializable form of Mistake."""
+
+    id: str
+    session_id: str
+    timestamp: str
+    error_class: str
+    description: str
+    semantic_hash: str  # Simplified from embedding - hash of description
+    was_repeat: bool
+    corrected_by_rule: str | None
+
+
+@dataclass
+class Mistake:
+    """A logged mistake during a session.
+
+    Tracks mistakes to measure repeated-mistake rate.
+
+    Attributes:
+        id: Unique identifier for this mistake.
+        session_id: Session in which this mistake occurred.
+        timestamp: When the mistake was logged.
+        error_class: Category of error (e.g., "missing_test").
+        description: Description of the mistake.
+        semantic_hash: Hash of description for similarity matching.
+        was_repeat: Whether this was a repeat of a prior mistake.
+        corrected_by_rule: Rule ID that should have prevented this, if any.
+    """
+
+    id: str
+    session_id: str
+    timestamp: datetime
+    error_class: str
+    description: str
+    semantic_hash: str
+    was_repeat: bool = False
+    corrected_by_rule: str | None = None
+
+    def to_dict(self) -> MistakeDict:
+        """Convert to serializable dictionary."""
+        result: MistakeDict = {
+            "id": self.id,
+            "session_id": self.session_id,
+            "timestamp": self.timestamp.isoformat(),
+            "error_class": self.error_class,
+            "description": self.description,
+            "semantic_hash": self.semantic_hash,
+            "was_repeat": self.was_repeat,
+        }
+        if self.corrected_by_rule is not None:
+            result["corrected_by_rule"] = self.corrected_by_rule
+        return result
+
+    @classmethod
+    def from_dict(cls, data: MistakeDict) -> "Mistake":
+        """Reconstruct from serialized dictionary."""
+        timestamp = datetime.fromisoformat(data["timestamp"])
+        if timestamp.tzinfo is None:
+            timestamp = timestamp.replace(tzinfo=timezone.utc)
+
+        return cls(
+            id=data["id"],
+            session_id=data["session_id"],
+            timestamp=timestamp,
+            error_class=data["error_class"],
+            description=data["description"],
+            semantic_hash=data["semantic_hash"],
+            was_repeat=data.get("was_repeat", False),
+            corrected_by_rule=data.get("corrected_by_rule"),
+        )
+
+
+@dataclass
+class SessionMetrics:
+    """Metrics for a session or aggregated across sessions.
+
+    Attributes:
+        session_id: Session ID (or "aggregate" for combined metrics).
+        total_mistakes: Total mistakes in the session(s).
+        repeated_mistakes: Mistakes that were repeats.
+        repeated_mistake_rate: Ratio of repeated to total mistakes.
+        rules_at_start: Number of rules at session start.
+        rules_at_end: Number of rules at session end.
+        rules_added: Net rules added during session(s).
+    """
+
+    session_id: str
+    total_mistakes: int
+    repeated_mistakes: int
+    repeated_mistake_rate: float
+    rules_at_start: int
+    rules_at_end: int
+    rules_added: int
+
+
+@dataclass
+class StartSessionResult:
+    """Result of starting a new session."""
+
+    session_id: str
+    error_class: str | None
+    rules_count: int
+    message: str
+
+
+@dataclass
+class EndSessionResult:
+    """Result of ending a session."""
+
+    session_id: str
+    duration_minutes: float
+    mistakes_logged: int
+    repeated_mistakes: int
+    rules_at_start: int
+    rules_at_end: int
+    message: str
+
+
+@dataclass
+class LogMistakeResult:
+    """Result of logging a mistake."""
+
+    mistake_id: str
+    session_id: str
+    was_repeat: bool
+    similar_prior: str | None  # ID of similar prior mistake if repeat
+    message: str
+
+
+# -----------------------------------------------------------------------------
+# Session Tracking Helper Functions
+# -----------------------------------------------------------------------------
+
+
+def _get_sessions_path(buildlog_dir: Path) -> Path:
+    """Get path to sessions JSONL file."""
+    return buildlog_dir / ".buildlog" / "sessions.jsonl"
+
+
+def _get_mistakes_path(buildlog_dir: Path) -> Path:
+    """Get path to mistakes JSONL file."""
+    return buildlog_dir / ".buildlog" / "mistakes.jsonl"
+
+
+def _get_active_session_path(buildlog_dir: Path) -> Path:
+    """Get path to active session marker file."""
+    return buildlog_dir / ".buildlog" / "active_session.json"
+
+
+def _generate_session_id(now: datetime) -> str:
+    """Generate a unique session ID."""
+    # Include microseconds for uniqueness when sessions are created quickly
+    return f"session-{now.strftime('%Y%m%d-%H%M%S')}-{now.microsecond:06d}"
+
+
+def _generate_mistake_id(error_class: str, now: datetime) -> str:
+    """Generate a unique mistake ID."""
+    # Include microseconds for uniqueness
+    return f"mistake-{error_class[:10]}-{now.strftime('%Y%m%d-%H%M%S')}-{now.microsecond:06d}"
+
+
+def _compute_semantic_hash(description: str) -> str:
+    """Compute a hash for semantic similarity matching.
+
+    This is a simplified approach - in production, you'd use embeddings.
+    For now, we normalize and hash the description.
+    """
+    import hashlib
+
+    # Normalize: lowercase, remove extra whitespace
+    normalized = " ".join(description.lower().split())
+    return hashlib.sha256(normalized.encode()).hexdigest()[:16]
+
+
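
Note: what the normalization buys (and what it does not). Case and whitespace variants hash identically; paraphrases do not, which is why the matcher further down also checks word overlap:

    a = _compute_semantic_hash("Forgot to add a   regression TEST")
    b = _compute_semantic_hash("forgot to add a regression test")
    c = _compute_semantic_hash("missed the regression test")
    assert a == b   # normalization collapses case and whitespace
    assert a != c   # paraphrases need the word-overlap fallback
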
+def _get_current_rules(buildlog_dir: Path) -> list[str]:
+    """Get list of current promoted rule IDs."""
+    promoted_path = _get_promoted_path(buildlog_dir)
+    return list(_load_json_set(promoted_path, "skill_ids"))
+
+
+def _load_sessions(buildlog_dir: Path) -> list[Session]:
+    """Load all sessions from JSONL file."""
+    sessions_path = _get_sessions_path(buildlog_dir)
+    if not sessions_path.exists():
+        return []
+
+    sessions = []
+    for line in sessions_path.read_text().strip().split("\n"):
+        if line:
+            try:
+                data = json.loads(line)
+                sessions.append(Session.from_dict(data))
+            except (json.JSONDecodeError, KeyError):
+                continue
+    return sessions
+
+
+def _load_mistakes(buildlog_dir: Path) -> list[Mistake]:
+    """Load all mistakes from JSONL file."""
+    mistakes_path = _get_mistakes_path(buildlog_dir)
+    if not mistakes_path.exists():
+        return []
+
+    mistakes = []
+    for line in mistakes_path.read_text().strip().split("\n"):
+        if line:
+            try:
+                data = json.loads(line)
+                mistakes.append(Mistake.from_dict(data))
+            except (json.JSONDecodeError, KeyError):
+                continue
+    return mistakes
+
+
+def _find_similar_prior_mistake(
+    description: str,
+    error_class: str,
+    current_session_id: str,
+    all_mistakes: list[Mistake],
+) -> Mistake | None:
+    """Find a similar mistake from a prior session.
+
+    Uses semantic hash for similarity matching (simplified approach).
+    """
+    semantic_hash = _compute_semantic_hash(description)
+
+    for mistake in all_mistakes:
+        # Only check mistakes from prior sessions with same error class
+        if (
+            mistake.session_id != current_session_id
+            and mistake.error_class == error_class
+        ):
+            # Check for semantic similarity (hash match or high description overlap)
+            if mistake.semantic_hash == semantic_hash:
+                return mistake
+            # Also check for high word overlap
+            desc_words = set(description.lower().split())
+            mistake_words = set(mistake.description.lower().split())
+            if len(desc_words & mistake_words) / max(len(desc_words), 1) > 0.7:
+                return mistake
+
+    return None
+
+
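
Note: the 0.7 threshold measures overlap relative to the new description's distinct words. A worked example with illustrative descriptions:

    new_desc = "forgot to update the changelog entry"        # 6 distinct words
    old_desc = "forgot to update the changelog for release"  # shares 5 of them
    new_words = set(new_desc.split())
    overlap = len(new_words & set(old_desc.split())) / max(len(new_words), 1)
    assert overlap > 0.7  # 5/6 is about 0.83, so the prior mistake counts as a repeat
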
+# -----------------------------------------------------------------------------
+# Session Tracking Operations
+# -----------------------------------------------------------------------------
+
+
+def start_session(
+    buildlog_dir: Path,
+    error_class: str | None = None,
+    notes: str | None = None,
+) -> StartSessionResult:
+    """Start a new experiment session.
+
+    Args:
+        buildlog_dir: Path to buildlog directory.
+        error_class: Error class being targeted (e.g., "missing_test").
+        notes: Optional notes about the session.
+
+    Returns:
+        StartSessionResult with session ID and current rules count.
+    """
+    now = datetime.now(timezone.utc)
+    session_id = _generate_session_id(now)
+    current_rules = _get_current_rules(buildlog_dir)
+
+    session = Session(
+        id=session_id,
+        started_at=now,
+        rules_at_start=current_rules,
+        error_class=error_class,
+        notes=notes,
+    )
+
+    # Save as active session
+    active_path = _get_active_session_path(buildlog_dir)
+    active_path.parent.mkdir(parents=True, exist_ok=True)
+    active_path.write_text(json.dumps(session.to_dict(), indent=2))
+
+    return StartSessionResult(
+        session_id=session_id,
+        error_class=error_class,
+        rules_count=len(current_rules),
+        message=f"Started session {session_id} with {len(current_rules)} active rules",
+    )
+
+
+def end_session(
+    buildlog_dir: Path,
+    entry_file: str | None = None,
+    notes: str | None = None,
+) -> EndSessionResult:
+    """End the current experiment session.
+
+    Args:
+        buildlog_dir: Path to buildlog directory.
+        entry_file: Corresponding buildlog entry file, if any.
+        notes: Additional notes to append.
+
+    Returns:
+        EndSessionResult with session metrics.
+    """
+    active_path = _get_active_session_path(buildlog_dir)
+
+    if not active_path.exists():
+        raise ValueError("No active session to end")
+
+    # Load active session
+    session_data = json.loads(active_path.read_text())
+    session = Session.from_dict(session_data)
+
+    # Update session with end info
+    now = datetime.now(timezone.utc)
+    session.ended_at = now
+    session.rules_at_end = _get_current_rules(buildlog_dir)
+    if entry_file:
+        session.entry_file = entry_file
+    if notes:
+        session.notes = f"{session.notes or ''}\n{notes}".strip()
+
+    # Append to sessions log
+    sessions_path = _get_sessions_path(buildlog_dir)
+    sessions_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(sessions_path, "a") as f:
+        f.write(json.dumps(session.to_dict()) + "\n")
+
+    # Remove active session marker
+    active_path.unlink()
+
+    # Calculate session metrics
+    all_mistakes = _load_mistakes(buildlog_dir)
+    session_mistakes = [m for m in all_mistakes if m.session_id == session.id]
+    repeated = sum(1 for m in session_mistakes if m.was_repeat)
+
+    duration = (session.ended_at - session.started_at).total_seconds() / 60
+
+    return EndSessionResult(
+        session_id=session.id,
+        duration_minutes=round(duration, 1),
+        mistakes_logged=len(session_mistakes),
+        repeated_mistakes=repeated,
+        rules_at_start=len(session.rules_at_start),
+        rules_at_end=len(session.rules_at_end),
+        message=f"Ended session {session.id} ({duration:.1f}min, {len(session_mistakes)} mistakes, {repeated} repeats)",
+    )
+
+
+def log_mistake(
+    buildlog_dir: Path,
+    error_class: str,
+    description: str,
+    corrected_by_rule: str | None = None,
+) -> LogMistakeResult:
+    """Log a mistake during an experiment session.
+
+    Args:
+        buildlog_dir: Path to buildlog directory.
+        error_class: Category of error (e.g., "missing_test").
+        description: Description of the mistake.
+        corrected_by_rule: Rule ID that should have prevented this.
+
+    Returns:
+        LogMistakeResult indicating if this was a repeat.
+    """
+    active_path = _get_active_session_path(buildlog_dir)
+
+    if not active_path.exists():
+        raise ValueError(
+            "No active session - start one with 'buildlog experiment start'"
+        )
+
+    # Get current session
+    session_data = json.loads(active_path.read_text())
+    session_id = session_data["id"]
+
+    now = datetime.now(timezone.utc)
+    mistake_id = _generate_mistake_id(error_class, now)
+
+    # Check for similar prior mistakes
+    all_mistakes = _load_mistakes(buildlog_dir)
+    similar = _find_similar_prior_mistake(
+        description, error_class, session_id, all_mistakes
+    )
+
+    mistake = Mistake(
+        id=mistake_id,
+        session_id=session_id,
+        timestamp=now,
+        error_class=error_class,
+        description=description,
+        semantic_hash=_compute_semantic_hash(description),
+        was_repeat=similar is not None,
+        corrected_by_rule=corrected_by_rule,
+    )
+
+    # Append to mistakes log
+    mistakes_path = _get_mistakes_path(buildlog_dir)
+    mistakes_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(mistakes_path, "a") as f:
+        f.write(json.dumps(mistake.to_dict()) + "\n")
+
+    message = f"Logged mistake: {error_class}"
+    if similar:
+        message += f" (REPEAT of {similar.id})"
+
+    return LogMistakeResult(
+        mistake_id=mistake_id,
+        session_id=session_id,
+        was_repeat=similar is not None,
+        similar_prior=similar.id if similar else None,
+        message=message,
+    )
+
+
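
Note: a full session lifecycle, assuming an active project directory (the error class, description, and entry file name are illustrative):

    from pathlib import Path

    start_session(Path("."), error_class="missing_test")
    log_mistake(Path("."), "missing_test", "shipped fix without a regression test")
    result = end_session(Path("."), entry_file="2024-01-15-auth-fix.md")
    print(result.message)  # duration, mistake count, and repeat count
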
+def get_session_metrics(
+    buildlog_dir: Path,
+    session_id: str | None = None,
+) -> SessionMetrics:
+    """Get metrics for a session or all sessions.
+
+    Args:
+        buildlog_dir: Path to buildlog directory.
+        session_id: Specific session ID, or None for aggregate metrics.
+
+    Returns:
+        SessionMetrics with mistake rates and rule changes.
+    """
+    sessions = _load_sessions(buildlog_dir)
+    mistakes = _load_mistakes(buildlog_dir)
+
+    if session_id:
+        # Filter to specific session
+        session = next((s for s in sessions if s.id == session_id), None)
+        if not session:
+            raise ValueError(f"Session not found: {session_id}")
+
+        session_mistakes = [m for m in mistakes if m.session_id == session_id]
+        total = len(session_mistakes)
+        repeated = sum(1 for m in session_mistakes if m.was_repeat)
+
+        return SessionMetrics(
+            session_id=session_id,
+            total_mistakes=total,
+            repeated_mistakes=repeated,
+            repeated_mistake_rate=repeated / total if total > 0 else 0.0,
+            rules_at_start=len(session.rules_at_start),
+            rules_at_end=len(session.rules_at_end),
+            rules_added=len(session.rules_at_end) - len(session.rules_at_start),
+        )
+    else:
+        # Aggregate across all sessions
+        total = len(mistakes)
+        repeated = sum(1 for m in mistakes if m.was_repeat)
+
+        rules_start = sessions[0].rules_at_start if sessions else []
+        rules_end = sessions[-1].rules_at_end if sessions else []
+
+        return SessionMetrics(
+            session_id="aggregate",
+            total_mistakes=total,
+            repeated_mistakes=repeated,
+            repeated_mistake_rate=repeated / total if total > 0 else 0.0,
+            rules_at_start=len(rules_start),
+            rules_at_end=len(rules_end),
+            rules_added=len(rules_end) - len(rules_start),
+        )
+
+
+def get_experiment_report(buildlog_dir: Path) -> dict:
+    """Generate a comprehensive experiment report.
+
+    Returns:
+        Dictionary with sessions, metrics, and analysis.
+    """
+    sessions = _load_sessions(buildlog_dir)
+    mistakes = _load_mistakes(buildlog_dir)
+
+    # Per-session metrics
+    session_metrics = []
+    for session in sessions:
+        session_mistakes = [m for m in mistakes if m.session_id == session.id]
+        total = len(session_mistakes)
+        repeated = sum(1 for m in session_mistakes if m.was_repeat)
+        session_metrics.append(
+            {
+                "session_id": session.id,
+                "started_at": session.started_at.isoformat(),
+                "error_class": session.error_class,
+                "total_mistakes": total,
+                "repeated_mistakes": repeated,
+                "repeated_mistake_rate": repeated / total if total > 0 else 0.0,
+                "rules_added": len(session.rules_at_end) - len(session.rules_at_start),
+            }
+        )
+
+    # Aggregate metrics
+    total_mistakes = len(mistakes)
+    total_repeated = sum(1 for m in mistakes if m.was_repeat)
+
+    # Error class breakdown
+    error_classes: dict[str, dict] = {}
+    for mistake in mistakes:
+        if mistake.error_class not in error_classes:
+            error_classes[mistake.error_class] = {"total": 0, "repeated": 0}
+        error_classes[mistake.error_class]["total"] += 1
+        if mistake.was_repeat:
+            error_classes[mistake.error_class]["repeated"] += 1
+
+    return {
+        "summary": {
+            "total_sessions": len(sessions),
+            "total_mistakes": total_mistakes,
+            "total_repeated": total_repeated,
+            "overall_repeat_rate": (
+                total_repeated / total_mistakes if total_mistakes > 0 else 0.0
+            ),
+        },
+        "sessions": session_metrics,
+        "error_classes": error_classes,
+    }