buildlog 0.4.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- buildlog/cli.py +799 -3
- buildlog/core/__init__.py +34 -0
- buildlog/core/operations.py +925 -0
- buildlog/mcp/server.py +16 -0
- buildlog/mcp/tools.py +266 -1
- buildlog/seed_engine/__init__.py +74 -0
- buildlog/seed_engine/categorizers.py +145 -0
- buildlog/seed_engine/extractors.py +148 -0
- buildlog/seed_engine/generators.py +144 -0
- buildlog/seed_engine/models.py +113 -0
- buildlog/seed_engine/pipeline.py +202 -0
- buildlog/seed_engine/sources.py +362 -0
- buildlog/seeds.py +211 -0
- buildlog/skills.py +26 -3
- buildlog-0.6.0.dist-info/METADATA +490 -0
- buildlog-0.6.0.dist-info/RECORD +38 -0
- buildlog-0.4.0.dist-info/METADATA +0 -894
- buildlog-0.4.0.dist-info/RECORD +0 -30
- {buildlog-0.4.0.data → buildlog-0.6.0.data}/data/share/buildlog/copier.yml +0 -0
- {buildlog-0.4.0.data → buildlog-0.6.0.data}/data/share/buildlog/post_gen.py +0 -0
- {buildlog-0.4.0.data → buildlog-0.6.0.data}/data/share/buildlog/template/buildlog/.gitkeep +0 -0
- {buildlog-0.4.0.data → buildlog-0.6.0.data}/data/share/buildlog/template/buildlog/2026-01-01-example.md +0 -0
- {buildlog-0.4.0.data → buildlog-0.6.0.data}/data/share/buildlog/template/buildlog/BUILDLOG_SYSTEM.md +0 -0
- {buildlog-0.4.0.data → buildlog-0.6.0.data}/data/share/buildlog/template/buildlog/_TEMPLATE.md +0 -0
- {buildlog-0.4.0.data → buildlog-0.6.0.data}/data/share/buildlog/template/buildlog/assets/.gitkeep +0 -0
- {buildlog-0.4.0.dist-info → buildlog-0.6.0.dist-info}/WHEEL +0 -0
- {buildlog-0.4.0.dist-info → buildlog-0.6.0.dist-info}/entry_points.txt +0 -0
- {buildlog-0.4.0.dist-info → buildlog-0.6.0.dist-info}/licenses/LICENSE +0 -0
buildlog/core/operations.py
CHANGED
|
@@ -25,12 +25,30 @@ __all__ = [
|
|
|
25
25
|
"ReviewIssue",
|
|
26
26
|
"ReviewLearning",
|
|
27
27
|
"LearnFromReviewResult",
|
|
28
|
+
"RewardEvent",
|
|
29
|
+
"LogRewardResult",
|
|
30
|
+
"RewardSummary",
|
|
31
|
+
# Session tracking (experiment infrastructure)
|
|
32
|
+
"Session",
|
|
33
|
+
"Mistake",
|
|
34
|
+
"SessionMetrics",
|
|
35
|
+
"StartSessionResult",
|
|
36
|
+
"EndSessionResult",
|
|
37
|
+
"LogMistakeResult",
|
|
28
38
|
"status",
|
|
29
39
|
"promote",
|
|
30
40
|
"reject",
|
|
31
41
|
"diff",
|
|
32
42
|
"find_skills_by_ids",
|
|
33
43
|
"learn_from_review",
|
|
44
|
+
"log_reward",
|
|
45
|
+
"get_rewards",
|
|
46
|
+
# Session tracking operations
|
|
47
|
+
"start_session",
|
|
48
|
+
"end_session",
|
|
49
|
+
"log_mistake",
|
|
50
|
+
"get_session_metrics",
|
|
51
|
+
"get_experiment_report",
|
|
34
52
|
]
|
|
35
53
|
|
|
36
54
|
|
|
@@ -283,6 +301,133 @@ class LearnFromReviewResult:
|
|
|
283
301
|
error: str | None = None
|
|
284
302
|
|
|
285
303
|
|
|
304
|
+
# -----------------------------------------------------------------------------
|
|
305
|
+
# Reward Signal Data Structures (for Bandit Learning)
|
|
306
|
+
# -----------------------------------------------------------------------------
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
class RewardEventDict(TypedDict, total=False):
|
|
310
|
+
"""Serializable form of RewardEvent."""
|
|
311
|
+
|
|
312
|
+
id: str
|
|
313
|
+
timestamp: str
|
|
314
|
+
outcome: str # "accepted" | "revision" | "rejected"
|
|
315
|
+
reward_value: float
|
|
316
|
+
rules_active: list[str]
|
|
317
|
+
revision_distance: float | None
|
|
318
|
+
error_class: str | None
|
|
319
|
+
notes: str | None
|
|
320
|
+
source: str | None
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
@dataclass
|
|
324
|
+
class RewardEvent:
|
|
325
|
+
"""A single reward/feedback event for bandit learning.
|
|
326
|
+
|
|
327
|
+
This tracks human feedback on agent work to enable learning
|
|
328
|
+
which rules are effective in which contexts.
|
|
329
|
+
|
|
330
|
+
Attributes:
|
|
331
|
+
id: Unique identifier for this event.
|
|
332
|
+
timestamp: When the feedback was recorded.
|
|
333
|
+
outcome: The feedback type (accepted/revision/rejected).
|
|
334
|
+
reward_value: Numeric reward (1.0=accepted, 0=rejected, in between for revision).
|
|
335
|
+
rules_active: IDs of rules that were in context when work was done.
|
|
336
|
+
revision_distance: How much correction was needed (0-1, lower is better).
|
|
337
|
+
error_class: Category of error if applicable.
|
|
338
|
+
notes: Optional notes about the feedback.
|
|
339
|
+
source: Where this feedback came from (manual, review_loop, etc.).
|
|
340
|
+
"""
|
|
341
|
+
|
|
342
|
+
id: str
|
|
343
|
+
timestamp: datetime
|
|
344
|
+
outcome: Literal["accepted", "revision", "rejected"]
|
|
345
|
+
reward_value: float
|
|
346
|
+
rules_active: list[str] = field(default_factory=list)
|
|
347
|
+
revision_distance: float | None = None
|
|
348
|
+
error_class: str | None = None
|
|
349
|
+
notes: str | None = None
|
|
350
|
+
source: str | None = None
|
|
351
|
+
|
|
352
|
+
def to_dict(self) -> RewardEventDict:
|
|
353
|
+
"""Convert to serializable dictionary."""
|
|
354
|
+
result: RewardEventDict = {
|
|
355
|
+
"id": self.id,
|
|
356
|
+
"timestamp": self.timestamp.isoformat(),
|
|
357
|
+
"outcome": self.outcome,
|
|
358
|
+
"reward_value": self.reward_value,
|
|
359
|
+
"rules_active": self.rules_active,
|
|
360
|
+
}
|
|
361
|
+
if self.revision_distance is not None:
|
|
362
|
+
result["revision_distance"] = self.revision_distance
|
|
363
|
+
if self.error_class is not None:
|
|
364
|
+
result["error_class"] = self.error_class
|
|
365
|
+
if self.notes is not None:
|
|
366
|
+
result["notes"] = self.notes
|
|
367
|
+
if self.source is not None:
|
|
368
|
+
result["source"] = self.source
|
|
369
|
+
return result
|
|
370
|
+
|
|
371
|
+
@classmethod
|
|
372
|
+
def from_dict(cls, data: RewardEventDict) -> "RewardEvent":
|
|
373
|
+
"""Reconstruct from serialized dictionary."""
|
|
374
|
+
timestamp = datetime.fromisoformat(data["timestamp"])
|
|
375
|
+
if timestamp.tzinfo is None:
|
|
376
|
+
timestamp = timestamp.replace(tzinfo=timezone.utc)
|
|
377
|
+
|
|
378
|
+
return cls(
|
|
379
|
+
id=data["id"],
|
|
380
|
+
timestamp=timestamp,
|
|
381
|
+
outcome=data["outcome"], # type: ignore[arg-type]
|
|
382
|
+
reward_value=data["reward_value"],
|
|
383
|
+
rules_active=data.get("rules_active", []),
|
|
384
|
+
revision_distance=data.get("revision_distance"),
|
|
385
|
+
error_class=data.get("error_class"),
|
|
386
|
+
notes=data.get("notes"),
|
|
387
|
+
source=data.get("source"),
|
|
388
|
+
)
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
@dataclass
|
|
392
|
+
class LogRewardResult:
|
|
393
|
+
"""Result of logging a reward event.
|
|
394
|
+
|
|
395
|
+
Attributes:
|
|
396
|
+
reward_id: ID of the logged reward event.
|
|
397
|
+
reward_value: The computed reward value.
|
|
398
|
+
total_events: Total reward events logged so far.
|
|
399
|
+
message: Human-readable confirmation.
|
|
400
|
+
error: Error message if operation failed.
|
|
401
|
+
"""
|
|
402
|
+
|
|
403
|
+
reward_id: str
|
|
404
|
+
reward_value: float
|
|
405
|
+
total_events: int
|
|
406
|
+
message: str = ""
|
|
407
|
+
error: str | None = None
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
@dataclass
class RewardSummary:
    """Aggregate statistics over a set of reward events.

    Attributes:
        total_events: Total number of reward events.
        accepted: Count of accepted outcomes.
        revisions: Count of revision outcomes.
        rejected: Count of rejected outcomes.
        mean_reward: Average reward value across all events.
        events: List of reward events (limited by query).
    """

    total_events: int
    accepted: int
    revisions: int
    rejected: int
    mean_reward: float
    events: list[RewardEvent] = field(default_factory=list)
|
|
429
|
+
|
|
430
|
+
|
|
286
431
|
def _get_rejected_path(buildlog_dir: Path) -> Path:
|
|
287
432
|
"""Get path to rejected.json file."""
|
|
288
433
|
return buildlog_dir / ".buildlog" / "rejected.json"
|
|
@@ -727,3 +872,783 @@ def learn_from_review(
|
|
|
727
872
|
source=source,
|
|
728
873
|
message=message,
|
|
729
874
|
)
|
|
875
|
+
|
|
876
|
+
|
|
877
|
+
# -----------------------------------------------------------------------------
|
|
878
|
+
# Reward Signal Operations (for Bandit Learning)
|
|
879
|
+
# -----------------------------------------------------------------------------
|
|
880
|
+
|
|
881
|
+
|
|
882
|
+
def _get_rewards_path(buildlog_dir: Path) -> Path:
|
|
883
|
+
"""Get path to reward_events.jsonl file."""
|
|
884
|
+
return buildlog_dir / ".buildlog" / "reward_events.jsonl"
|
|
885
|
+
|
|
886
|
+
|
|
887
|
+
def _generate_reward_id(outcome: str, timestamp: datetime) -> str:
|
|
888
|
+
"""Generate unique ID for a reward event.
|
|
889
|
+
|
|
890
|
+
Uses outcome + timestamp to ensure uniqueness while allowing
|
|
891
|
+
multiple events with the same outcome.
|
|
892
|
+
"""
|
|
893
|
+
ts_str = timestamp.isoformat()
|
|
894
|
+
normalized = f"{outcome}:{ts_str}"
|
|
895
|
+
hash_hex = hashlib.sha256(normalized.encode("utf-8")).hexdigest()[:10]
|
|
896
|
+
return f"rew-{hash_hex}"
|
|
897
|
+
|
|
898
|
+
|
|
899
|
+
def _compute_reward_value(
|
|
900
|
+
outcome: Literal["accepted", "revision", "rejected"],
|
|
901
|
+
revision_distance: float | None,
|
|
902
|
+
) -> float:
|
|
903
|
+
"""Compute numeric reward from outcome.
|
|
904
|
+
|
|
905
|
+
Args:
|
|
906
|
+
outcome: The feedback type.
|
|
907
|
+
revision_distance: How much correction needed (0-1).
|
|
908
|
+
|
|
909
|
+
Returns:
|
|
910
|
+
Reward value in [0, 1].
|
|
911
|
+
- accepted: 1.0
|
|
912
|
+
- rejected: 0.0
|
|
913
|
+
- revision: 1.0 - distance (default distance 0.5 if not provided)
|
|
914
|
+
"""
|
|
915
|
+
if outcome == "accepted":
|
|
916
|
+
return 1.0
|
|
917
|
+
elif outcome == "rejected":
|
|
918
|
+
return 0.0
|
|
919
|
+
else: # revision
|
|
920
|
+
distance = revision_distance if revision_distance is not None else 0.5
|
|
921
|
+
return max(0.0, min(1.0, 1.0 - distance))
|
|
922
|
+
|
|
923
|
+
|
|
924
|
+
def log_reward(
    buildlog_dir: Path,
    outcome: Literal["accepted", "revision", "rejected"],
    rules_active: list[str] | None = None,
    revision_distance: float | None = None,
    error_class: str | None = None,
    notes: str | None = None,
    source: str | None = None,
) -> LogRewardResult:
    """Log a reward event for bandit learning.

    Appends one JSON line to reward_events.jsonl for later analysis.

    Args:
        buildlog_dir: Path to buildlog directory.
        outcome: Type of feedback (accepted/revision/rejected).
        rules_active: List of rule IDs that were in context.
        revision_distance: How much correction was needed (0-1, for revisions).
        error_class: Category of error if applicable.
        notes: Optional notes about the feedback.
        source: Where this feedback came from (defaults to "manual").

    Returns:
        LogRewardResult with confirmation and running event count.
    """
    now = datetime.now(timezone.utc)
    reward_id = _generate_reward_id(outcome, now)
    reward_value = _compute_reward_value(outcome, revision_distance)

    event = RewardEvent(
        id=reward_id,
        timestamp=now,
        outcome=outcome,
        reward_value=reward_value,
        rules_active=rules_active or [],
        revision_distance=revision_distance,
        error_class=error_class,
        notes=notes,
        source=source or "manual",
    )

    # Append to the JSONL event log, creating parent dirs on first use.
    rewards_path = _get_rewards_path(buildlog_dir)
    rewards_path.parent.mkdir(parents=True, exist_ok=True)
    with open(rewards_path, "a") as f:
        f.write(json.dumps(event.to_dict()) + "\n")

    # Count total events. The file necessarily exists here (we just appended
    # to it), so no exists() guard is needed; stream line-by-line instead of
    # loading the whole log into memory.
    with open(rewards_path) as f:
        total_events = sum(1 for line in f if line.strip())

    return LogRewardResult(
        reward_id=reward_id,
        reward_value=reward_value,
        total_events=total_events,
        message=f"Logged {outcome} (reward={reward_value:.2f})",
    )
|
|
985
|
+
|
|
986
|
+
|
|
987
|
+
def get_rewards(
    buildlog_dir: Path,
    limit: int | None = None,
) -> RewardSummary:
    """Get reward events with summary statistics.

    Args:
        buildlog_dir: Path to buildlog directory.
        limit: Maximum number of events to return (most recent first).

    Returns:
        RewardSummary with events and statistics.
    """
    rewards_path = _get_rewards_path(buildlog_dir)

    if not rewards_path.exists():
        return RewardSummary(
            total_events=0,
            accepted=0,
            revisions=0,
            rejected=0,
            mean_reward=0.0,
            events=[],
        )

    # Parse every well-formed JSONL record, silently skipping bad lines.
    events: list[RewardEvent] = []
    for raw in rewards_path.read_text().strip().split("\n"):
        if not raw:
            continue
        try:
            events.append(RewardEvent.from_dict(json.loads(raw)))
        except (json.JSONDecodeError, KeyError):
            pass

    # Summary statistics are computed over the full event set,
    # before any limit is applied.
    total = len(events)
    counts = {"accepted": 0, "revision": 0, "rejected": 0}
    for event in events:
        if event.outcome in counts:
            counts[event.outcome] += 1
    mean_reward = (sum(e.reward_value for e in events) / total) if total > 0 else 0.0

    # Newest first, optionally truncated.
    events.sort(key=lambda e: e.timestamp, reverse=True)
    returned = events if limit is None else events[:limit]

    return RewardSummary(
        total_events=total,
        accepted=counts["accepted"],
        revisions=counts["revision"],
        rejected=counts["rejected"],
        mean_reward=mean_reward,
        events=returned,
    )
|
|
1042
|
+
|
|
1043
|
+
|
|
1044
|
+
# -----------------------------------------------------------------------------
|
|
1045
|
+
# Session Tracking Data Structures (for Experimental Infrastructure)
|
|
1046
|
+
# -----------------------------------------------------------------------------
|
|
1047
|
+
|
|
1048
|
+
|
|
1049
|
+
class SessionDict(TypedDict, total=False):
|
|
1050
|
+
"""Serializable form of Session."""
|
|
1051
|
+
|
|
1052
|
+
id: str
|
|
1053
|
+
started_at: str
|
|
1054
|
+
ended_at: str | None
|
|
1055
|
+
entry_file: str | None
|
|
1056
|
+
rules_at_start: list[str]
|
|
1057
|
+
rules_at_end: list[str]
|
|
1058
|
+
error_class: str | None
|
|
1059
|
+
notes: str | None
|
|
1060
|
+
|
|
1061
|
+
|
|
1062
|
+
@dataclass
|
|
1063
|
+
class Session:
|
|
1064
|
+
"""A coding session for experiment tracking.
|
|
1065
|
+
|
|
1066
|
+
Tracks the state of rules before and after a session to measure
|
|
1067
|
+
learning effectiveness.
|
|
1068
|
+
|
|
1069
|
+
Attributes:
|
|
1070
|
+
id: Unique identifier for this session.
|
|
1071
|
+
started_at: When the session started.
|
|
1072
|
+
ended_at: When the session ended (None if still active).
|
|
1073
|
+
entry_file: Corresponding buildlog entry file, if any.
|
|
1074
|
+
rules_at_start: Rule IDs active at session start.
|
|
1075
|
+
rules_at_end: Rule IDs active at session end.
|
|
1076
|
+
error_class: Error class being targeted (e.g., "missing_test").
|
|
1077
|
+
notes: Optional notes about the session.
|
|
1078
|
+
"""
|
|
1079
|
+
|
|
1080
|
+
id: str
|
|
1081
|
+
started_at: datetime
|
|
1082
|
+
ended_at: datetime | None = None
|
|
1083
|
+
entry_file: str | None = None
|
|
1084
|
+
rules_at_start: list[str] = field(default_factory=list)
|
|
1085
|
+
rules_at_end: list[str] = field(default_factory=list)
|
|
1086
|
+
error_class: str | None = None
|
|
1087
|
+
notes: str | None = None
|
|
1088
|
+
|
|
1089
|
+
def to_dict(self) -> SessionDict:
|
|
1090
|
+
"""Convert to serializable dictionary."""
|
|
1091
|
+
result: SessionDict = {
|
|
1092
|
+
"id": self.id,
|
|
1093
|
+
"started_at": self.started_at.isoformat(),
|
|
1094
|
+
"ended_at": self.ended_at.isoformat() if self.ended_at else None,
|
|
1095
|
+
"rules_at_start": self.rules_at_start,
|
|
1096
|
+
"rules_at_end": self.rules_at_end,
|
|
1097
|
+
}
|
|
1098
|
+
if self.entry_file is not None:
|
|
1099
|
+
result["entry_file"] = self.entry_file
|
|
1100
|
+
if self.error_class is not None:
|
|
1101
|
+
result["error_class"] = self.error_class
|
|
1102
|
+
if self.notes is not None:
|
|
1103
|
+
result["notes"] = self.notes
|
|
1104
|
+
return result
|
|
1105
|
+
|
|
1106
|
+
@classmethod
|
|
1107
|
+
def from_dict(cls, data: SessionDict) -> "Session":
|
|
1108
|
+
"""Reconstruct from serialized dictionary."""
|
|
1109
|
+
started_at = datetime.fromisoformat(data["started_at"])
|
|
1110
|
+
if started_at.tzinfo is None:
|
|
1111
|
+
started_at = started_at.replace(tzinfo=timezone.utc)
|
|
1112
|
+
|
|
1113
|
+
ended_at = None
|
|
1114
|
+
ended_at_str = data.get("ended_at")
|
|
1115
|
+
if ended_at_str:
|
|
1116
|
+
ended_at = datetime.fromisoformat(ended_at_str)
|
|
1117
|
+
if ended_at.tzinfo is None:
|
|
1118
|
+
ended_at = ended_at.replace(tzinfo=timezone.utc)
|
|
1119
|
+
|
|
1120
|
+
return cls(
|
|
1121
|
+
id=data["id"],
|
|
1122
|
+
started_at=started_at,
|
|
1123
|
+
ended_at=ended_at,
|
|
1124
|
+
entry_file=data.get("entry_file"),
|
|
1125
|
+
rules_at_start=data.get("rules_at_start", []),
|
|
1126
|
+
rules_at_end=data.get("rules_at_end", []),
|
|
1127
|
+
error_class=data.get("error_class"),
|
|
1128
|
+
notes=data.get("notes"),
|
|
1129
|
+
)
|
|
1130
|
+
|
|
1131
|
+
|
|
1132
|
+
class MistakeDict(TypedDict, total=False):
|
|
1133
|
+
"""Serializable form of Mistake."""
|
|
1134
|
+
|
|
1135
|
+
id: str
|
|
1136
|
+
session_id: str
|
|
1137
|
+
timestamp: str
|
|
1138
|
+
error_class: str
|
|
1139
|
+
description: str
|
|
1140
|
+
semantic_hash: str # Simplified from embedding - hash of description
|
|
1141
|
+
was_repeat: bool
|
|
1142
|
+
corrected_by_rule: str | None
|
|
1143
|
+
|
|
1144
|
+
|
|
1145
|
+
@dataclass
|
|
1146
|
+
class Mistake:
|
|
1147
|
+
"""A logged mistake during a session.
|
|
1148
|
+
|
|
1149
|
+
Tracks mistakes to measure repeated-mistake rate.
|
|
1150
|
+
|
|
1151
|
+
Attributes:
|
|
1152
|
+
id: Unique identifier for this mistake.
|
|
1153
|
+
session_id: Session in which this mistake occurred.
|
|
1154
|
+
timestamp: When the mistake was logged.
|
|
1155
|
+
error_class: Category of error (e.g., "missing_test").
|
|
1156
|
+
description: Description of the mistake.
|
|
1157
|
+
semantic_hash: Hash of description for similarity matching.
|
|
1158
|
+
was_repeat: Whether this was a repeat of a prior mistake.
|
|
1159
|
+
corrected_by_rule: Rule ID that should have prevented this, if any.
|
|
1160
|
+
"""
|
|
1161
|
+
|
|
1162
|
+
id: str
|
|
1163
|
+
session_id: str
|
|
1164
|
+
timestamp: datetime
|
|
1165
|
+
error_class: str
|
|
1166
|
+
description: str
|
|
1167
|
+
semantic_hash: str
|
|
1168
|
+
was_repeat: bool = False
|
|
1169
|
+
corrected_by_rule: str | None = None
|
|
1170
|
+
|
|
1171
|
+
def to_dict(self) -> MistakeDict:
|
|
1172
|
+
"""Convert to serializable dictionary."""
|
|
1173
|
+
result: MistakeDict = {
|
|
1174
|
+
"id": self.id,
|
|
1175
|
+
"session_id": self.session_id,
|
|
1176
|
+
"timestamp": self.timestamp.isoformat(),
|
|
1177
|
+
"error_class": self.error_class,
|
|
1178
|
+
"description": self.description,
|
|
1179
|
+
"semantic_hash": self.semantic_hash,
|
|
1180
|
+
"was_repeat": self.was_repeat,
|
|
1181
|
+
}
|
|
1182
|
+
if self.corrected_by_rule is not None:
|
|
1183
|
+
result["corrected_by_rule"] = self.corrected_by_rule
|
|
1184
|
+
return result
|
|
1185
|
+
|
|
1186
|
+
@classmethod
|
|
1187
|
+
def from_dict(cls, data: MistakeDict) -> "Mistake":
|
|
1188
|
+
"""Reconstruct from serialized dictionary."""
|
|
1189
|
+
timestamp = datetime.fromisoformat(data["timestamp"])
|
|
1190
|
+
if timestamp.tzinfo is None:
|
|
1191
|
+
timestamp = timestamp.replace(tzinfo=timezone.utc)
|
|
1192
|
+
|
|
1193
|
+
return cls(
|
|
1194
|
+
id=data["id"],
|
|
1195
|
+
session_id=data["session_id"],
|
|
1196
|
+
timestamp=timestamp,
|
|
1197
|
+
error_class=data["error_class"],
|
|
1198
|
+
description=data["description"],
|
|
1199
|
+
semantic_hash=data["semantic_hash"],
|
|
1200
|
+
was_repeat=data.get("was_repeat", False),
|
|
1201
|
+
corrected_by_rule=data.get("corrected_by_rule"),
|
|
1202
|
+
)
|
|
1203
|
+
|
|
1204
|
+
|
|
1205
|
+
@dataclass
class SessionMetrics:
    """Mistake/rule metrics for one session or an aggregate of sessions.

    Attributes:
        session_id: Session ID (or "aggregate" for combined metrics).
        total_mistakes: Total mistakes in the session(s).
        repeated_mistakes: Mistakes that were repeats.
        repeated_mistake_rate: Ratio of repeated to total mistakes.
        rules_at_start: Number of rules at session start.
        rules_at_end: Number of rules at session end.
        rules_added: Net rules added during session(s).
    """

    session_id: str
    total_mistakes: int
    repeated_mistakes: int
    repeated_mistake_rate: float
    rules_at_start: int
    rules_at_end: int
    rules_added: int
|
|
1226
|
+
|
|
1227
|
+
|
|
1228
|
+
@dataclass
|
|
1229
|
+
class StartSessionResult:
|
|
1230
|
+
"""Result of starting a new session."""
|
|
1231
|
+
|
|
1232
|
+
session_id: str
|
|
1233
|
+
error_class: str | None
|
|
1234
|
+
rules_count: int
|
|
1235
|
+
message: str
|
|
1236
|
+
|
|
1237
|
+
|
|
1238
|
+
@dataclass
class EndSessionResult:
    """Outcome of end_session(): duration and mistake/rule counts."""

    session_id: str
    duration_minutes: float
    mistakes_logged: int
    repeated_mistakes: int
    rules_at_start: int
    rules_at_end: int
    message: str
|
|
1249
|
+
|
|
1250
|
+
|
|
1251
|
+
@dataclass
|
|
1252
|
+
class LogMistakeResult:
|
|
1253
|
+
"""Result of logging a mistake."""
|
|
1254
|
+
|
|
1255
|
+
mistake_id: str
|
|
1256
|
+
session_id: str
|
|
1257
|
+
was_repeat: bool
|
|
1258
|
+
similar_prior: str | None # ID of similar prior mistake if repeat
|
|
1259
|
+
message: str
|
|
1260
|
+
|
|
1261
|
+
|
|
1262
|
+
# -----------------------------------------------------------------------------
|
|
1263
|
+
# Session Tracking Helper Functions
|
|
1264
|
+
# -----------------------------------------------------------------------------
|
|
1265
|
+
|
|
1266
|
+
|
|
1267
|
+
def _get_sessions_path(buildlog_dir: Path) -> Path:
|
|
1268
|
+
"""Get path to sessions JSONL file."""
|
|
1269
|
+
return buildlog_dir / ".buildlog" / "sessions.jsonl"
|
|
1270
|
+
|
|
1271
|
+
|
|
1272
|
+
def _get_mistakes_path(buildlog_dir: Path) -> Path:
|
|
1273
|
+
"""Get path to mistakes JSONL file."""
|
|
1274
|
+
return buildlog_dir / ".buildlog" / "mistakes.jsonl"
|
|
1275
|
+
|
|
1276
|
+
|
|
1277
|
+
def _get_active_session_path(buildlog_dir: Path) -> Path:
|
|
1278
|
+
"""Get path to active session marker file."""
|
|
1279
|
+
return buildlog_dir / ".buildlog" / "active_session.json"
|
|
1280
|
+
|
|
1281
|
+
|
|
1282
|
+
def _generate_session_id(now: datetime) -> str:
|
|
1283
|
+
"""Generate a unique session ID."""
|
|
1284
|
+
# Include microseconds for uniqueness when sessions are created quickly
|
|
1285
|
+
return f"session-{now.strftime('%Y%m%d-%H%M%S')}-{now.microsecond:06d}"
|
|
1286
|
+
|
|
1287
|
+
|
|
1288
|
+
def _generate_mistake_id(error_class: str, now: datetime) -> str:
|
|
1289
|
+
"""Generate a unique mistake ID."""
|
|
1290
|
+
# Include microseconds for uniqueness
|
|
1291
|
+
return f"mistake-{error_class[:10]}-{now.strftime('%Y%m%d-%H%M%S')}-{now.microsecond:06d}"
|
|
1292
|
+
|
|
1293
|
+
|
|
1294
|
+
def _compute_semantic_hash(description: str) -> str:
|
|
1295
|
+
"""Compute a hash for semantic similarity matching.
|
|
1296
|
+
|
|
1297
|
+
This is a simplified approach - in production, you'd use embeddings.
|
|
1298
|
+
For now, we normalize and hash the description.
|
|
1299
|
+
"""
|
|
1300
|
+
import hashlib
|
|
1301
|
+
|
|
1302
|
+
# Normalize: lowercase, remove extra whitespace
|
|
1303
|
+
normalized = " ".join(description.lower().split())
|
|
1304
|
+
return hashlib.sha256(normalized.encode()).hexdigest()[:16]
|
|
1305
|
+
|
|
1306
|
+
|
|
1307
|
+
def _get_current_rules(buildlog_dir: Path) -> list[str]:
    """Return the IDs of all currently promoted rules."""
    return list(_load_json_set(_get_promoted_path(buildlog_dir), "skill_ids"))
|
|
1311
|
+
|
|
1312
|
+
|
|
1313
|
+
def _load_sessions(buildlog_dir: Path) -> list[Session]:
    """Read every recorded session from the sessions JSONL log.

    Malformed lines are silently skipped; a missing log yields [].
    """
    path = _get_sessions_path(buildlog_dir)
    if not path.exists():
        return []

    loaded: list[Session] = []
    for raw in path.read_text().strip().split("\n"):
        if not raw:
            continue
        try:
            loaded.append(Session.from_dict(json.loads(raw)))
        except (json.JSONDecodeError, KeyError):
            pass  # skip malformed lines
    return loaded
|
|
1328
|
+
|
|
1329
|
+
|
|
1330
|
+
def _load_mistakes(buildlog_dir: Path) -> list[Mistake]:
    """Read every recorded mistake from the mistakes JSONL log.

    Malformed lines are silently skipped; a missing log yields [].
    """
    path = _get_mistakes_path(buildlog_dir)
    if not path.exists():
        return []

    loaded: list[Mistake] = []
    for raw in path.read_text().strip().split("\n"):
        if not raw:
            continue
        try:
            loaded.append(Mistake.from_dict(json.loads(raw)))
        except (json.JSONDecodeError, KeyError):
            pass  # skip malformed lines
    return loaded
|
|
1345
|
+
|
|
1346
|
+
|
|
1347
|
+
def _find_similar_prior_mistake(
    description: str,
    error_class: str,
    current_session_id: str,
    all_mistakes: list[Mistake],
) -> Mistake | None:
    """Find a similar mistake from a prior session.

    Similarity is either an exact semantic-hash match or >70% word
    overlap with the new description (a simplified stand-in for
    embedding similarity). Only mistakes from other sessions with the
    same error class are considered.

    Args:
        description: Description of the new mistake.
        error_class: Category of the new mistake.
        current_session_id: Session the new mistake belongs to (excluded).
        all_mistakes: All previously logged mistakes to search.

    Returns:
        The first matching prior mistake, or None.
    """
    semantic_hash = _compute_semantic_hash(description)
    # Hoisted out of the loop: the new description's word set (and the
    # overlap denominator) are loop-invariant.
    desc_words = set(description.lower().split())
    denom = max(len(desc_words), 1)

    for mistake in all_mistakes:
        # Only check mistakes from prior sessions with the same error class.
        if (
            mistake.session_id == current_session_id
            or mistake.error_class != error_class
        ):
            continue
        # Exact semantic-hash match.
        if mistake.semantic_hash == semantic_hash:
            return mistake
        # High word overlap also counts as a repeat.
        mistake_words = set(mistake.description.lower().split())
        if len(desc_words & mistake_words) / denom > 0.7:
            return mistake

    return None
|
|
1375
|
+
|
|
1376
|
+
|
|
1377
|
+
# -----------------------------------------------------------------------------
|
|
1378
|
+
# Session Tracking Operations
|
|
1379
|
+
# -----------------------------------------------------------------------------
|
|
1380
|
+
|
|
1381
|
+
|
|
1382
|
+
def start_session(
    buildlog_dir: Path,
    error_class: str | None = None,
    notes: str | None = None,
) -> StartSessionResult:
    """Start a new experiment session.

    Captures the current promoted-rule set as the session baseline and
    writes the session as the active-session marker file.

    Args:
        buildlog_dir: Path to buildlog directory.
        error_class: Error class being targeted (e.g., "missing_test").
        notes: Optional notes about the session.

    Returns:
        StartSessionResult with session ID and current rules count.
    """
    started = datetime.now(timezone.utc)
    sid = _generate_session_id(started)
    baseline_rules = _get_current_rules(buildlog_dir)

    new_session = Session(
        id=sid,
        started_at=started,
        rules_at_start=baseline_rules,
        error_class=error_class,
        notes=notes,
    )

    # Persist as the active-session marker, creating dirs on first use.
    marker = _get_active_session_path(buildlog_dir)
    marker.parent.mkdir(parents=True, exist_ok=True)
    marker.write_text(json.dumps(new_session.to_dict(), indent=2))

    rule_count = len(baseline_rules)
    return StartSessionResult(
        session_id=sid,
        error_class=error_class,
        rules_count=rule_count,
        message=f"Started session {sid} with {rule_count} active rules",
    )
|
|
1420
|
+
|
|
1421
|
+
|
|
1422
|
+
def end_session(
    buildlog_dir: Path,
    entry_file: str | None = None,
    notes: str | None = None,
) -> EndSessionResult:
    """End the current experiment session.

    Finalizes the active session (end time, closing rule set), appends
    it to the sessions log, removes the active-session marker, and
    reports per-session mistake metrics.

    Args:
        buildlog_dir: Path to buildlog directory.
        entry_file: Corresponding buildlog entry file, if any.
        notes: Additional notes to append.

    Returns:
        EndSessionResult with session metrics.

    Raises:
        ValueError: If no session is currently active.
    """
    marker = _get_active_session_path(buildlog_dir)
    if not marker.exists():
        raise ValueError("No active session to end")

    # Load and finalize the active session.
    session = Session.from_dict(json.loads(marker.read_text()))
    finished = datetime.now(timezone.utc)
    session.ended_at = finished
    session.rules_at_end = _get_current_rules(buildlog_dir)
    if entry_file:
        session.entry_file = entry_file
    if notes:
        # Appended to any existing notes rather than replacing them.
        session.notes = f"{session.notes or ''}\n{notes}".strip()

    # Persist to the append-only sessions log, then clear the marker.
    sessions_path = _get_sessions_path(buildlog_dir)
    sessions_path.parent.mkdir(parents=True, exist_ok=True)
    with open(sessions_path, "a") as log:
        log.write(json.dumps(session.to_dict()) + "\n")
    marker.unlink()

    # Per-session mistake metrics.
    own_mistakes = [
        m for m in _load_mistakes(buildlog_dir) if m.session_id == session.id
    ]
    repeats = sum(1 for m in own_mistakes if m.was_repeat)
    minutes = (session.ended_at - session.started_at).total_seconds() / 60

    return EndSessionResult(
        session_id=session.id,
        duration_minutes=round(minutes, 1),
        mistakes_logged=len(own_mistakes),
        repeated_mistakes=repeats,
        rules_at_start=len(session.rules_at_start),
        rules_at_end=len(session.rules_at_end),
        message=f"Ended session {session.id} ({minutes:.1f}min, {len(own_mistakes)} mistakes, {repeats} repeats)",
    )
|
|
1480
|
+
|
|
1481
|
+
|
|
1482
|
+
def log_mistake(
    buildlog_dir: Path,
    error_class: str,
    description: str,
    corrected_by_rule: str | None = None,
) -> LogMistakeResult:
    """Log a mistake during an experiment session.

    Args:
        buildlog_dir: Path to buildlog directory.
        error_class: Category of error (e.g., "missing_test").
        description: Description of the mistake.
        corrected_by_rule: Rule ID that should have prevented this.

    Returns:
        LogMistakeResult indicating if this was a repeat.

    Raises:
        ValueError: If no experiment session is currently active.
    """
    marker_path = _get_active_session_path(buildlog_dir)

    # Guard: mistakes can only be logged against an active session.
    if not marker_path.exists():
        raise ValueError(
            "No active session - start one with 'buildlog experiment start'"
        )

    # Resolve which session this mistake belongs to.
    session_id = json.loads(marker_path.read_text())["id"]

    logged_at = datetime.now(timezone.utc)
    new_id = _generate_mistake_id(error_class, logged_at)

    # A semantically similar prior mistake marks this one as a repeat.
    history = _load_mistakes(buildlog_dir)
    prior = _find_similar_prior_mistake(
        description, error_class, session_id, history
    )
    is_repeat = prior is not None

    record = Mistake(
        id=new_id,
        session_id=session_id,
        timestamp=logged_at,
        error_class=error_class,
        description=description,
        semantic_hash=_compute_semantic_hash(description),
        was_repeat=is_repeat,
        corrected_by_rule=corrected_by_rule,
    )

    # Persist as one JSON line appended to the mistakes log.
    log_path = _get_mistakes_path(buildlog_dir)
    log_path.parent.mkdir(parents=True, exist_ok=True)
    with open(log_path, "a") as fh:
        fh.write(json.dumps(record.to_dict()) + "\n")

    message = f"Logged mistake: {error_class}"
    if prior:
        message += f" (REPEAT of {prior.id})"

    return LogMistakeResult(
        mistake_id=new_id,
        session_id=session_id,
        was_repeat=is_repeat,
        similar_prior=prior.id if prior else None,
        message=message,
    )
|
|
1547
|
+
|
|
1548
|
+
|
|
1549
|
+
def get_session_metrics(
    buildlog_dir: Path,
    session_id: str | None = None,
) -> SessionMetrics:
    """Get metrics for a session or all sessions.

    Args:
        buildlog_dir: Path to buildlog directory.
        session_id: Specific session ID, or None for aggregate metrics.

    Returns:
        SessionMetrics with mistake rates and rule changes.

    Raises:
        ValueError: If session_id is given but no such session exists.
    """
    sessions = _load_sessions(buildlog_dir)
    mistakes = _load_mistakes(buildlog_dir)

    if not session_id:
        # Aggregate view: count every logged mistake, and measure rule
        # growth from the first session's start to the last session's end.
        total = len(mistakes)
        repeats = sum(1 for m in mistakes if m.was_repeat)
        start_rules = sessions[0].rules_at_start if sessions else []
        end_rules = sessions[-1].rules_at_end if sessions else []

        return SessionMetrics(
            session_id="aggregate",
            total_mistakes=total,
            repeated_mistakes=repeats,
            repeated_mistake_rate=repeats / total if total > 0 else 0.0,
            rules_at_start=len(start_rules),
            rules_at_end=len(end_rules),
            rules_added=len(end_rules) - len(start_rules),
        )

    # Single-session view.
    target = next((s for s in sessions if s.id == session_id), None)
    if target is None:
        raise ValueError(f"Session not found: {session_id}")

    own_mistakes = [m for m in mistakes if m.session_id == session_id]
    total = len(own_mistakes)
    repeats = sum(1 for m in own_mistakes if m.was_repeat)

    return SessionMetrics(
        session_id=session_id,
        total_mistakes=total,
        repeated_mistakes=repeats,
        repeated_mistake_rate=repeats / total if total > 0 else 0.0,
        rules_at_start=len(target.rules_at_start),
        rules_at_end=len(target.rules_at_end),
        rules_added=len(target.rules_at_end) - len(target.rules_at_start),
    )
|
|
1601
|
+
|
|
1602
|
+
|
|
1603
|
+
def get_experiment_report(buildlog_dir: Path) -> dict:
    """Generate a comprehensive experiment report.

    Args:
        buildlog_dir: Path to buildlog directory.

    Returns:
        Dictionary with sessions, metrics, and analysis:
        - "summary": session/mistake totals plus the overall repeat rate.
        - "sessions": one metrics dict per recorded session.
        - "error_classes": per-class {"total", "repeated"} tallies.
    """
    sessions = _load_sessions(buildlog_dir)
    mistakes = _load_mistakes(buildlog_dir)

    # Index mistakes by session once, instead of rescanning the whole
    # mistake list for every session (previously O(sessions * mistakes)).
    mistakes_by_session: dict[str, list] = {}
    for m in mistakes:
        mistakes_by_session.setdefault(m.session_id, []).append(m)

    # Per-session metrics
    session_metrics = []
    for session in sessions:
        session_mistakes = mistakes_by_session.get(session.id, [])
        total = len(session_mistakes)
        repeated = sum(1 for m in session_mistakes if m.was_repeat)
        session_metrics.append(
            {
                "session_id": session.id,
                "started_at": session.started_at.isoformat(),
                "error_class": session.error_class,
                "total_mistakes": total,
                "repeated_mistakes": repeated,
                "repeated_mistake_rate": repeated / total if total > 0 else 0.0,
                "rules_added": len(session.rules_at_end) - len(session.rules_at_start),
            }
        )

    # Aggregate metrics
    total_mistakes = len(mistakes)
    total_repeated = sum(1 for m in mistakes if m.was_repeat)

    # Error class breakdown; setdefault replaces the manual membership check.
    error_classes: dict[str, dict] = {}
    for mistake in mistakes:
        bucket = error_classes.setdefault(
            mistake.error_class, {"total": 0, "repeated": 0}
        )
        bucket["total"] += 1
        if mistake.was_repeat:
            bucket["repeated"] += 1

    return {
        "summary": {
            "total_sessions": len(sessions),
            "total_mistakes": total_mistakes,
            "total_repeated": total_repeated,
            "overall_repeat_rate": (
                total_repeated / total_mistakes if total_mistakes > 0 else 0.0
            ),
        },
        "sessions": session_metrics,
        "error_classes": error_classes,
    }
|