buildlog 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,12 +6,14 @@ MCP, CLI, HTTP, or any other interface.
 
 from __future__ import annotations
 
+import hashlib
 import json
 from dataclasses import dataclass, field
-from datetime import datetime
+from datetime import datetime, timezone
 from pathlib import Path
-from typing import Literal
+from typing import Literal, TypedDict
 
+from buildlog.confidence import ConfidenceMetrics, merge_confidence_metrics
 from buildlog.render import get_renderer
 from buildlog.skills import Skill, SkillSet, generate_skills
 
@@ -20,11 +22,33 @@ __all__ = [
     "PromoteResult",
     "RejectResult",
     "DiffResult",
+    "ReviewIssue",
+    "ReviewLearning",
+    "LearnFromReviewResult",
+    "RewardEvent",
+    "LogRewardResult",
+    "RewardSummary",
+    # Session tracking (experiment infrastructure)
+    "Session",
+    "Mistake",
+    "SessionMetrics",
+    "StartSessionResult",
+    "EndSessionResult",
+    "LogMistakeResult",
     "status",
     "promote",
     "reject",
     "diff",
     "find_skills_by_ids",
+    "learn_from_review",
+    "log_reward",
+    "get_rewards",
+    # Session tracking operations
+    "start_session",
+    "end_session",
+    "log_mistake",
+    "get_session_metrics",
+    "get_experiment_report",
 ]
 
 
@@ -108,6 +132,302 @@ class DiffResult:
     """Error message if operation failed."""
 
 
+# -----------------------------------------------------------------------------
+# Review Learning Data Structures
+# -----------------------------------------------------------------------------
+
+
+class ReviewIssueDict(TypedDict, total=False):
+    """Serializable form of ReviewIssue."""
+
+    severity: str
+    category: str
+    description: str
+    rule_learned: str
+    location: str | None
+    why_it_matters: str | None
+    functional_principle: str | None
+
+
+@dataclass
+class ReviewIssue:
+    """A single issue identified during code review.
+
+    Attributes:
+        severity: How serious the issue is (critical/major/minor/nitpick).
+        category: What kind of issue (architectural/workflow/tool_usage/domain_knowledge).
+        description: What's wrong (concrete).
+        rule_learned: The generalizable rule extracted from this issue.
+        location: File:line where the issue was found.
+        why_it_matters: Why this issue matters (consequences).
+        functional_principle: Related FP principle, if applicable.
+    """
+
+    severity: Literal["critical", "major", "minor", "nitpick"]
+    category: Literal["architectural", "workflow", "tool_usage", "domain_knowledge"]
+    description: str
+    rule_learned: str
+    location: str | None = None
+    why_it_matters: str | None = None
+    functional_principle: str | None = None
+
+    @classmethod
+    def from_dict(cls, data: dict) -> "ReviewIssue":
+        """Construct from dictionary (e.g., from JSON)."""
+        return cls(
+            severity=data.get("severity", "minor"),
+            category=data.get("category", "workflow"),
+            description=data.get("description", ""),
+            rule_learned=data.get("rule_learned", ""),
+            location=data.get("location"),
+            why_it_matters=data.get("why_it_matters"),
+            functional_principle=data.get("functional_principle"),
+        )
+
+
+class ReviewLearningDict(TypedDict, total=False):
+    """Serializable form of ReviewLearning."""
+
+    id: str
+    rule: str
+    category: str
+    severity: str
+    source: str
+    first_seen: str
+    last_reinforced: str
+    reinforcement_count: int
+    contradiction_count: int
+    functional_principle: str | None
+
+
+@dataclass
+class ReviewLearning:
+    """A learning extracted from review, with confidence tracking.
+
+    Attributes:
+        id: Deterministic hash of rule_learned (category prefix + hash).
+        rule: The generalizable rule text.
+        category: Category of the learning.
+        severity: Severity of the original issue.
+        source: Where this learning came from (e.g., "review:PR#13").
+        first_seen: When this rule was first identified.
+        last_reinforced: When this rule was last seen/reinforced.
+        reinforcement_count: How many times this rule has been seen.
+        contradiction_count: How many times this rule was contradicted.
+        functional_principle: Related FP principle, if applicable.
+    """
+
+    id: str
+    rule: str
+    category: str
+    severity: str
+    source: str
+    first_seen: datetime
+    last_reinforced: datetime
+    reinforcement_count: int = 1
+    contradiction_count: int = 0
+    functional_principle: str | None = None
+
+    def to_confidence_metrics(self) -> ConfidenceMetrics:
+        """Convert to ConfidenceMetrics for scoring."""
+        return ConfidenceMetrics(
+            reinforcement_count=self.reinforcement_count,
+            last_reinforced=self.last_reinforced,
+            contradiction_count=self.contradiction_count,
+            first_seen=self.first_seen,
+        )
+
+    def to_dict(self) -> ReviewLearningDict:
+        """Convert to serializable dictionary."""
+        result: ReviewLearningDict = {
+            "id": self.id,
+            "rule": self.rule,
+            "category": self.category,
+            "severity": self.severity,
+            "source": self.source,
+            "first_seen": self.first_seen.isoformat(),
+            "last_reinforced": self.last_reinforced.isoformat(),
+            "reinforcement_count": self.reinforcement_count,
+            "contradiction_count": self.contradiction_count,
+        }
+        if self.functional_principle:
+            result["functional_principle"] = self.functional_principle
+        return result
+
+    @classmethod
+    def from_dict(cls, data: ReviewLearningDict) -> "ReviewLearning":
+        """Reconstruct from serialized dictionary."""
+        first_seen = datetime.fromisoformat(data["first_seen"])
+        last_reinforced = datetime.fromisoformat(data["last_reinforced"])
+
+        # Ensure timezone awareness
+        if first_seen.tzinfo is None:
+            first_seen = first_seen.replace(tzinfo=timezone.utc)
+        if last_reinforced.tzinfo is None:
+            last_reinforced = last_reinforced.replace(tzinfo=timezone.utc)
+
+        return cls(
+            id=data["id"],
+            rule=data["rule"],
+            category=data["category"],
+            severity=data["severity"],
+            source=data["source"],
+            first_seen=first_seen,
+            last_reinforced=last_reinforced,
+            reinforcement_count=data.get("reinforcement_count", 1),
+            contradiction_count=data.get("contradiction_count", 0),
+            functional_principle=data.get("functional_principle"),
+        )
+
+
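
Note: a minimal round-trip sketch for the new ReviewLearning class. The module path and field values are illustrative, not from the package; the point is that from_dict coerces naive ISO timestamps to UTC and fills in the count defaults:

    from buildlog.operations import ReviewLearning  # assumed module path

    record = {
        "id": "wf-0123456789",
        "rule": "Every bug fix ships with a regression test",
        "category": "workflow",
        "severity": "major",
        "source": "review:PR#13",
        "first_seen": "2024-01-01T00:00:00",       # naive -> coerced to UTC
        "last_reinforced": "2024-01-02T00:00:00",
    }
    learning = ReviewLearning.from_dict(record)
    assert learning.first_seen.tzinfo is not None           # timezone-aware after load
    assert learning.to_dict()["reinforcement_count"] == 1   # default applied
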
+@dataclass
+class LearnFromReviewResult:
+    """Result of learning from a review.
+
+    Attributes:
+        new_learnings: IDs of newly created learnings.
+        reinforced_learnings: IDs of existing learnings that were reinforced.
+        total_issues_processed: Total number of issues processed.
+        source: Review source identifier.
+        message: Human-readable summary.
+        error: Error message if operation failed.
+    """
+
+    new_learnings: list[str]
+    reinforced_learnings: list[str]
+    total_issues_processed: int
+    source: str
+    message: str = ""
+    error: str | None = None
+
+
+# -----------------------------------------------------------------------------
+# Reward Signal Data Structures (for Bandit Learning)
+# -----------------------------------------------------------------------------
+
+
+class RewardEventDict(TypedDict, total=False):
+    """Serializable form of RewardEvent."""
+
+    id: str
+    timestamp: str
+    outcome: str  # "accepted" | "revision" | "rejected"
+    reward_value: float
+    rules_active: list[str]
+    revision_distance: float | None
+    error_class: str | None
+    notes: str | None
+    source: str | None
+
+
+@dataclass
+class RewardEvent:
+    """A single reward/feedback event for bandit learning.
+
+    This tracks human feedback on agent work to enable learning
+    which rules are effective in which contexts.
+
+    Attributes:
+        id: Unique identifier for this event.
+        timestamp: When the feedback was recorded.
+        outcome: The feedback type (accepted/revision/rejected).
+        reward_value: Numeric reward (1.0=accepted, 0=rejected, in between for revision).
+        rules_active: IDs of rules that were in context when work was done.
+        revision_distance: How much correction was needed (0-1, lower is better).
+        error_class: Category of error if applicable.
+        notes: Optional notes about the feedback.
+        source: Where this feedback came from (manual, review_loop, etc.).
+    """
+
+    id: str
+    timestamp: datetime
+    outcome: Literal["accepted", "revision", "rejected"]
+    reward_value: float
+    rules_active: list[str] = field(default_factory=list)
+    revision_distance: float | None = None
+    error_class: str | None = None
+    notes: str | None = None
+    source: str | None = None
+
+    def to_dict(self) -> RewardEventDict:
+        """Convert to serializable dictionary."""
+        result: RewardEventDict = {
+            "id": self.id,
+            "timestamp": self.timestamp.isoformat(),
+            "outcome": self.outcome,
+            "reward_value": self.reward_value,
+            "rules_active": self.rules_active,
+        }
+        if self.revision_distance is not None:
+            result["revision_distance"] = self.revision_distance
+        if self.error_class is not None:
+            result["error_class"] = self.error_class
+        if self.notes is not None:
+            result["notes"] = self.notes
+        if self.source is not None:
+            result["source"] = self.source
+        return result
+
+    @classmethod
+    def from_dict(cls, data: RewardEventDict) -> "RewardEvent":
+        """Reconstruct from serialized dictionary."""
+        timestamp = datetime.fromisoformat(data["timestamp"])
+        if timestamp.tzinfo is None:
+            timestamp = timestamp.replace(tzinfo=timezone.utc)
+
+        return cls(
+            id=data["id"],
+            timestamp=timestamp,
+            outcome=data["outcome"],  # type: ignore[arg-type]
+            reward_value=data["reward_value"],
+            rules_active=data.get("rules_active", []),
+            revision_distance=data.get("revision_distance"),
+            error_class=data.get("error_class"),
+            notes=data.get("notes"),
+            source=data.get("source"),
+        )
+
+
+@dataclass
+class LogRewardResult:
+    """Result of logging a reward event.
+
+    Attributes:
+        reward_id: ID of the logged reward event.
+        reward_value: The computed reward value.
+        total_events: Total reward events logged so far.
+        message: Human-readable confirmation.
+        error: Error message if operation failed.
+    """
+
+    reward_id: str
+    reward_value: float
+    total_events: int
+    message: str = ""
+    error: str | None = None
+
+
+@dataclass
+class RewardSummary:
+    """Summary statistics for reward events.
+
+    Attributes:
+        total_events: Total number of reward events.
+        accepted: Count of accepted outcomes.
+        revisions: Count of revision outcomes.
+        rejected: Count of rejected outcomes.
+        mean_reward: Average reward value across all events.
+        events: List of reward events (limited by query).
+    """
+
+    total_events: int
+    accepted: int
+    revisions: int
+    rejected: int
+    mean_reward: float
+    events: list[RewardEvent] = field(default_factory=list)
+
+
 def _get_rejected_path(buildlog_dir: Path) -> Path:
     """Get path to rejected.json file."""
     return buildlog_dir / ".buildlog" / "rejected.json"
@@ -386,3 +706,949 @@ def diff(
         already_promoted=len(promoted_ids),
         already_rejected=len(rejected_ids),
     )
+
+
+# -----------------------------------------------------------------------------
+# Review Learning Operations
+# -----------------------------------------------------------------------------
+
+
+def _get_learnings_path(buildlog_dir: Path) -> Path:
+    """Get path to review_learnings.json file."""
+    return buildlog_dir / ".buildlog" / "review_learnings.json"
+
+
+def _generate_learning_id(category: str, rule: str) -> str:
+    """Generate deterministic ID for a learning.
+
+    Uses category prefix + first 10 chars of SHA256 hash.
+    """
+    # Normalize: lowercase, strip whitespace
+    normalized = rule.lower().strip()
+    hash_input = f"{category}:{normalized}".encode("utf-8")
+    hash_hex = hashlib.sha256(hash_input).hexdigest()[:10]
+
+    # Category prefix mapping
+    prefix_map = {
+        "architectural": "arch",
+        "workflow": "wf",
+        "tool_usage": "tool",
+        "domain_knowledge": "dom",
+    }
+    prefix = prefix_map.get(category, category[:4])
+
+    return f"{prefix}-{hash_hex}"
+
+
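
Note: a worked example of the ID scheme above, as a standalone sketch that mirrors _generate_learning_id (the digest is whatever SHA-256 yields; the point is that case and surrounding whitespace do not change the ID):

    import hashlib

    def learning_id(category: str, rule: str) -> str:
        # Same normalization and prefix rules as _generate_learning_id.
        normalized = rule.lower().strip()
        digest = hashlib.sha256(f"{category}:{normalized}".encode("utf-8")).hexdigest()[:10]
        prefix = {"architectural": "arch", "workflow": "wf",
                  "tool_usage": "tool", "domain_knowledge": "dom"}.get(category, category[:4])
        return f"{prefix}-{digest}"

    assert learning_id("workflow", "Add tests first") == learning_id("workflow", "  add tests FIRST ")
    assert learning_id("workflow", "Add tests first").startswith("wf-")
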
+def _load_learnings(path: Path) -> dict:
+    """Load learnings from JSON file."""
+    if not path.exists():
+        return {"learnings": {}, "review_history": []}
+    try:
+        return json.loads(path.read_text())
+    except (json.JSONDecodeError, OSError):
+        return {"learnings": {}, "review_history": []}
+
+
+def _save_learnings(path: Path, data: dict) -> None:
+    """Save learnings to JSON file."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(json.dumps(data, indent=2))
+
+
+def learn_from_review(
+    buildlog_dir: Path,
+    issues: list[dict],
+    source: str | None = None,
+) -> LearnFromReviewResult:
+    """Capture learnings from a code review and update confidence metrics.
+
+    For each issue:
+    1. Generate deterministic ID from rule text
+    2. If exists: reinforce (increment count, update timestamp)
+    3. If new: create ReviewLearning with initial metrics
+    4. Persist to .buildlog/review_learnings.json
+
+    Args:
+        buildlog_dir: Path to buildlog directory.
+        issues: List of review issues with rule_learned field.
+        source: Optional source identifier (defaults to timestamp).
+
+    Returns:
+        LearnFromReviewResult with new/reinforced learning IDs.
+    """
+    if not issues:
+        return LearnFromReviewResult(
+            new_learnings=[],
+            reinforced_learnings=[],
+            total_issues_processed=0,
+            source=source or "",
+            error="No issues provided",
+        )
+
+    # Default source to timestamp
+    now = datetime.now(timezone.utc)
+    if source is None:
+        source = f"review:{now.isoformat()}"
+    elif not source.startswith("review:"):
+        source = f"review:{source}"
+
+    learnings_path = _get_learnings_path(buildlog_dir)
+    data = _load_learnings(learnings_path)
+
+    new_ids: list[str] = []
+    reinforced_ids: list[str] = []
+    processed = 0
+
+    for issue_dict in issues:
+        # Skip issues without rule_learned
+        rule = issue_dict.get("rule_learned", "").strip()
+        if not rule:
+            continue
+
+        # Parse issue
+        issue = ReviewIssue.from_dict(issue_dict)
+        learning_id = _generate_learning_id(issue.category, rule)
+
+        if learning_id in data["learnings"]:
+            # Reinforce existing learning
+            existing_data = data["learnings"][learning_id]
+            existing = ReviewLearning.from_dict(existing_data)
+
+            # Use merge_confidence_metrics pattern
+            updated_metrics = merge_confidence_metrics(
+                existing.to_confidence_metrics(), now
+            )
+
+            # Update the learning
+            existing_data["last_reinforced"] = now.isoformat()
+            existing_data["reinforcement_count"] = updated_metrics.reinforcement_count
+            reinforced_ids.append(learning_id)
+        else:
+            # Create new learning
+            learning = ReviewLearning(
+                id=learning_id,
+                rule=rule,
+                category=issue.category,
+                severity=issue.severity,
+                source=source,
+                first_seen=now,
+                last_reinforced=now,
+                reinforcement_count=1,
+                contradiction_count=0,
+                functional_principle=issue.functional_principle,
+            )
+            data["learnings"][learning_id] = learning.to_dict()
+            new_ids.append(learning_id)
+
+        processed += 1
+
+    # Record in review history
+    data["review_history"].append(
+        {
+            "timestamp": now.isoformat(),
+            "source": source,
+            "issues_count": processed,
+            "new_learning_ids": new_ids,
+            "reinforced_learning_ids": reinforced_ids,
+        }
+    )
+
+    # Persist
+    _save_learnings(learnings_path, data)
+
+    # Build message
+    msg_parts = []
+    if new_ids:
+        msg_parts.append(f"{len(new_ids)} new learning(s)")
+    if reinforced_ids:
+        msg_parts.append(f"{len(reinforced_ids)} reinforced")
+    message = ", ".join(msg_parts) if msg_parts else "No learnings captured"
+
+    return LearnFromReviewResult(
+        new_learnings=new_ids,
+        reinforced_learnings=reinforced_ids,
+        total_issues_processed=processed,
+        source=source,
+        message=message,
+    )
+
+
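
Note: a hypothetical call to learn_from_review (the path and issue payload are illustrative). On first run the rule is new; calling again with the same rule text reinforces it instead:

    from pathlib import Path

    result = learn_from_review(
        Path("."),
        issues=[{
            "severity": "major",
            "category": "workflow",
            "description": "Bug fix landed without a regression test",
            "rule_learned": "Every bug fix ships with a regression test",
        }],
        source="PR#13",  # stored as "review:PR#13"
    )
    print(result.message)  # "1 new learning(s)" first time, "1 reinforced" after
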
+# -----------------------------------------------------------------------------
+# Reward Signal Operations (for Bandit Learning)
+# -----------------------------------------------------------------------------
+
+
+def _get_rewards_path(buildlog_dir: Path) -> Path:
+    """Get path to reward_events.jsonl file."""
+    return buildlog_dir / ".buildlog" / "reward_events.jsonl"
+
+
+def _generate_reward_id(outcome: str, timestamp: datetime) -> str:
+    """Generate unique ID for a reward event.
+
+    Uses outcome + timestamp to ensure uniqueness while allowing
+    multiple events with the same outcome.
+    """
+    ts_str = timestamp.isoformat()
+    normalized = f"{outcome}:{ts_str}"
+    hash_hex = hashlib.sha256(normalized.encode("utf-8")).hexdigest()[:10]
+    return f"rew-{hash_hex}"
+
+
+def _compute_reward_value(
+    outcome: Literal["accepted", "revision", "rejected"],
+    revision_distance: float | None,
+) -> float:
+    """Compute numeric reward from outcome.
+
+    Args:
+        outcome: The feedback type.
+        revision_distance: How much correction needed (0-1).
+
+    Returns:
+        Reward value in [0, 1].
+        - accepted: 1.0
+        - rejected: 0.0
+        - revision: 1.0 - distance (default distance 0.5 if not provided)
+    """
+    if outcome == "accepted":
+        return 1.0
+    elif outcome == "rejected":
+        return 0.0
+    else:  # revision
+        distance = revision_distance if revision_distance is not None else 0.5
+        return max(0.0, min(1.0, 1.0 - distance))
+
+
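
Note: concretely, the mapping above gives the following (distance values chosen to be exact in binary floating point):

    assert _compute_reward_value("accepted", None) == 1.0
    assert _compute_reward_value("rejected", None) == 0.0
    assert _compute_reward_value("revision", None) == 0.5    # default distance 0.5
    assert _compute_reward_value("revision", 0.25) == 0.75   # light edits, high reward
    assert _compute_reward_value("revision", 1.5) == 0.0     # clamped to [0, 1]
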
+def log_reward(
+    buildlog_dir: Path,
+    outcome: Literal["accepted", "revision", "rejected"],
+    rules_active: list[str] | None = None,
+    revision_distance: float | None = None,
+    error_class: str | None = None,
+    notes: str | None = None,
+    source: str | None = None,
+) -> LogRewardResult:
+    """Log a reward event for bandit learning.
+
+    Appends to reward_events.jsonl for later analysis.
+
+    Args:
+        buildlog_dir: Path to buildlog directory.
+        outcome: Type of feedback (accepted/revision/rejected).
+        rules_active: List of rule IDs that were in context.
+        revision_distance: How much correction was needed (0-1, for revisions).
+        error_class: Category of error if applicable.
+        notes: Optional notes about the feedback.
+        source: Where this feedback came from.
+
+    Returns:
+        LogRewardResult with confirmation.
+    """
+    now = datetime.now(timezone.utc)
+    reward_id = _generate_reward_id(outcome, now)
+    reward_value = _compute_reward_value(outcome, revision_distance)
+
+    event = RewardEvent(
+        id=reward_id,
+        timestamp=now,
+        outcome=outcome,
+        reward_value=reward_value,
+        rules_active=rules_active or [],
+        revision_distance=revision_distance,
+        error_class=error_class,
+        notes=notes,
+        source=source or "manual",
+    )
+
+    # Append to JSONL file
+    rewards_path = _get_rewards_path(buildlog_dir)
+    rewards_path.parent.mkdir(parents=True, exist_ok=True)
+
+    with open(rewards_path, "a") as f:
+        f.write(json.dumps(event.to_dict()) + "\n")
+
+    # Count total events
+    total_events = 0
+    if rewards_path.exists():
+        total_events = sum(
+            1 for line in rewards_path.read_text().strip().split("\n") if line
+        )
+
+    return LogRewardResult(
+        reward_id=reward_id,
+        reward_value=reward_value,
+        total_events=total_events,
+        message=f"Logged {outcome} (reward={reward_value:.2f})",
+    )
+
+
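
Note: a hypothetical log_reward call (the rule IDs are illustrative); a revision distance of 0.25 yields the 0.75 reward shown in the message:

    from pathlib import Path

    result = log_reward(
        Path("."),
        outcome="revision",
        rules_active=["wf-ab12cd34ef", "arch-0011223344"],
        revision_distance=0.25,
        source="review_loop",
    )
    print(result.message)  # "Logged revision (reward=0.75)"
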
+def get_rewards(
+    buildlog_dir: Path,
+    limit: int | None = None,
+) -> RewardSummary:
+    """Get reward events with summary statistics.
+
+    Args:
+        buildlog_dir: Path to buildlog directory.
+        limit: Maximum number of events to return (most recent first).
+
+    Returns:
+        RewardSummary with events and statistics.
+    """
+    rewards_path = _get_rewards_path(buildlog_dir)
+
+    if not rewards_path.exists():
+        return RewardSummary(
+            total_events=0,
+            accepted=0,
+            revisions=0,
+            rejected=0,
+            mean_reward=0.0,
+            events=[],
+        )
+
+    # Parse all events
+    events: list[RewardEvent] = []
+    for line in rewards_path.read_text().strip().split("\n"):
+        if line:
+            try:
+                data = json.loads(line)
+                events.append(RewardEvent.from_dict(data))
+            except (json.JSONDecodeError, KeyError):
+                continue  # Skip malformed lines
+
+    # Calculate statistics
+    total = len(events)
+    accepted = sum(1 for e in events if e.outcome == "accepted")
+    revisions = sum(1 for e in events if e.outcome == "revision")
+    rejected = sum(1 for e in events if e.outcome == "rejected")
+    mean_reward = sum(e.reward_value for e in events) / total if total > 0 else 0.0
+
+    # Sort by timestamp (most recent first) and limit
+    events.sort(key=lambda e: e.timestamp, reverse=True)
+    if limit is not None:
+        events = events[:limit]
+
+    return RewardSummary(
+        total_events=total,
+        accepted=accepted,
+        revisions=revisions,
+        rejected=rejected,
+        mean_reward=mean_reward,
+        events=events,
+    )
+
+
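
Note: a usage sketch for get_rewards. The counts and mean_reward always cover every event on disk; only the events list is truncated by limit:

    from pathlib import Path

    summary = get_rewards(Path("."), limit=5)
    print(f"{summary.accepted} accepted / {summary.revisions} revisions / "
          f"{summary.rejected} rejected, mean reward {summary.mean_reward:.2f}")
    for event in summary.events:  # most recent first
        print(event.timestamp.isoformat(), event.outcome, event.reward_value)
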
+# -----------------------------------------------------------------------------
+# Session Tracking Data Structures (for Experimental Infrastructure)
+# -----------------------------------------------------------------------------
+
+
+class SessionDict(TypedDict, total=False):
+    """Serializable form of Session."""
+
+    id: str
+    started_at: str
+    ended_at: str | None
+    entry_file: str | None
+    rules_at_start: list[str]
+    rules_at_end: list[str]
+    error_class: str | None
+    notes: str | None
+
+
+@dataclass
+class Session:
+    """A coding session for experiment tracking.
+
+    Tracks the state of rules before and after a session to measure
+    learning effectiveness.
+
+    Attributes:
+        id: Unique identifier for this session.
+        started_at: When the session started.
+        ended_at: When the session ended (None if still active).
+        entry_file: Corresponding buildlog entry file, if any.
+        rules_at_start: Rule IDs active at session start.
+        rules_at_end: Rule IDs active at session end.
+        error_class: Error class being targeted (e.g., "missing_test").
+        notes: Optional notes about the session.
+    """
+
+    id: str
+    started_at: datetime
+    ended_at: datetime | None = None
+    entry_file: str | None = None
+    rules_at_start: list[str] = field(default_factory=list)
+    rules_at_end: list[str] = field(default_factory=list)
+    error_class: str | None = None
+    notes: str | None = None
+
+    def to_dict(self) -> SessionDict:
+        """Convert to serializable dictionary."""
+        result: SessionDict = {
+            "id": self.id,
+            "started_at": self.started_at.isoformat(),
+            "ended_at": self.ended_at.isoformat() if self.ended_at else None,
+            "rules_at_start": self.rules_at_start,
+            "rules_at_end": self.rules_at_end,
+        }
+        if self.entry_file is not None:
+            result["entry_file"] = self.entry_file
+        if self.error_class is not None:
+            result["error_class"] = self.error_class
+        if self.notes is not None:
+            result["notes"] = self.notes
+        return result
+
+    @classmethod
+    def from_dict(cls, data: SessionDict) -> "Session":
+        """Reconstruct from serialized dictionary."""
+        started_at = datetime.fromisoformat(data["started_at"])
+        if started_at.tzinfo is None:
+            started_at = started_at.replace(tzinfo=timezone.utc)
+
+        ended_at = None
+        ended_at_str = data.get("ended_at")
+        if ended_at_str:
+            ended_at = datetime.fromisoformat(ended_at_str)
+            if ended_at.tzinfo is None:
+                ended_at = ended_at.replace(tzinfo=timezone.utc)
+
+        return cls(
+            id=data["id"],
+            started_at=started_at,
+            ended_at=ended_at,
+            entry_file=data.get("entry_file"),
+            rules_at_start=data.get("rules_at_start", []),
+            rules_at_end=data.get("rules_at_end", []),
+            error_class=data.get("error_class"),
+            notes=data.get("notes"),
+        )
+
+
+class MistakeDict(TypedDict, total=False):
+    """Serializable form of Mistake."""
+
+    id: str
+    session_id: str
+    timestamp: str
+    error_class: str
+    description: str
+    semantic_hash: str  # Simplified from embedding - hash of description
+    was_repeat: bool
+    corrected_by_rule: str | None
+
+
+@dataclass
+class Mistake:
+    """A logged mistake during a session.
+
+    Tracks mistakes to measure repeated-mistake rate.
+
+    Attributes:
+        id: Unique identifier for this mistake.
+        session_id: Session in which this mistake occurred.
+        timestamp: When the mistake was logged.
+        error_class: Category of error (e.g., "missing_test").
+        description: Description of the mistake.
+        semantic_hash: Hash of description for similarity matching.
+        was_repeat: Whether this was a repeat of a prior mistake.
+        corrected_by_rule: Rule ID that should have prevented this, if any.
+    """
+
+    id: str
+    session_id: str
+    timestamp: datetime
+    error_class: str
+    description: str
+    semantic_hash: str
+    was_repeat: bool = False
+    corrected_by_rule: str | None = None
+
+    def to_dict(self) -> MistakeDict:
+        """Convert to serializable dictionary."""
+        result: MistakeDict = {
+            "id": self.id,
+            "session_id": self.session_id,
+            "timestamp": self.timestamp.isoformat(),
+            "error_class": self.error_class,
+            "description": self.description,
+            "semantic_hash": self.semantic_hash,
+            "was_repeat": self.was_repeat,
+        }
+        if self.corrected_by_rule is not None:
+            result["corrected_by_rule"] = self.corrected_by_rule
+        return result
+
+    @classmethod
+    def from_dict(cls, data: MistakeDict) -> "Mistake":
+        """Reconstruct from serialized dictionary."""
+        timestamp = datetime.fromisoformat(data["timestamp"])
+        if timestamp.tzinfo is None:
+            timestamp = timestamp.replace(tzinfo=timezone.utc)
+
+        return cls(
+            id=data["id"],
+            session_id=data["session_id"],
+            timestamp=timestamp,
+            error_class=data["error_class"],
+            description=data["description"],
+            semantic_hash=data["semantic_hash"],
+            was_repeat=data.get("was_repeat", False),
+            corrected_by_rule=data.get("corrected_by_rule"),
+        )
+
+
+@dataclass
+class SessionMetrics:
+    """Metrics for a session or aggregated across sessions.
+
+    Attributes:
+        session_id: Session ID (or "aggregate" for combined metrics).
+        total_mistakes: Total mistakes in the session(s).
+        repeated_mistakes: Mistakes that were repeats.
+        repeated_mistake_rate: Ratio of repeated to total mistakes.
+        rules_at_start: Number of rules at session start.
+        rules_at_end: Number of rules at session end.
+        rules_added: Net rules added during session(s).
+    """
+
+    session_id: str
+    total_mistakes: int
+    repeated_mistakes: int
+    repeated_mistake_rate: float
+    rules_at_start: int
+    rules_at_end: int
+    rules_added: int
+
+
+@dataclass
+class StartSessionResult:
+    """Result of starting a new session."""
+
+    session_id: str
+    error_class: str | None
+    rules_count: int
+    message: str
+
+
+@dataclass
+class EndSessionResult:
+    """Result of ending a session."""
+
+    session_id: str
+    duration_minutes: float
+    mistakes_logged: int
+    repeated_mistakes: int
+    rules_at_start: int
+    rules_at_end: int
+    message: str
+
+
+@dataclass
+class LogMistakeResult:
+    """Result of logging a mistake."""
+
+    mistake_id: str
+    session_id: str
+    was_repeat: bool
+    similar_prior: str | None  # ID of similar prior mistake if repeat
+    message: str
+
+
+# -----------------------------------------------------------------------------
+# Session Tracking Helper Functions
+# -----------------------------------------------------------------------------
+
+
+def _get_sessions_path(buildlog_dir: Path) -> Path:
+    """Get path to sessions JSONL file."""
+    return buildlog_dir / ".buildlog" / "sessions.jsonl"
+
+
+def _get_mistakes_path(buildlog_dir: Path) -> Path:
+    """Get path to mistakes JSONL file."""
+    return buildlog_dir / ".buildlog" / "mistakes.jsonl"
+
+
+def _get_active_session_path(buildlog_dir: Path) -> Path:
+    """Get path to active session marker file."""
+    return buildlog_dir / ".buildlog" / "active_session.json"
+
+
+def _generate_session_id(now: datetime) -> str:
+    """Generate a unique session ID."""
+    # Include microseconds for uniqueness when sessions are created quickly
+    return f"session-{now.strftime('%Y%m%d-%H%M%S')}-{now.microsecond:06d}"
+
+
+def _generate_mistake_id(error_class: str, now: datetime) -> str:
+    """Generate a unique mistake ID."""
+    # Include microseconds for uniqueness
+    return f"mistake-{error_class[:10]}-{now.strftime('%Y%m%d-%H%M%S')}-{now.microsecond:06d}"
+
+
+def _compute_semantic_hash(description: str) -> str:
+    """Compute a hash for semantic similarity matching.
+
+    This is a simplified approach - in production, you'd use embeddings.
+    For now, we normalize and hash the description.
+    """
+    import hashlib
+
+    # Normalize: lowercase, remove extra whitespace
+    normalized = " ".join(description.lower().split())
+    return hashlib.sha256(normalized.encode()).hexdigest()[:16]
+
+
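
Note: what the normalization buys (and what it does not). Case and whitespace variants hash identically; paraphrases do not, which is why the matcher further down also checks word overlap:

    a = _compute_semantic_hash("Forgot to add a   regression TEST")
    b = _compute_semantic_hash("forgot to add a regression test")
    c = _compute_semantic_hash("missed the regression test")
    assert a == b   # normalization collapses case and whitespace
    assert a != c   # paraphrases need the word-overlap fallback
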
+def _get_current_rules(buildlog_dir: Path) -> list[str]:
+    """Get list of current promoted rule IDs."""
+    promoted_path = _get_promoted_path(buildlog_dir)
+    return list(_load_json_set(promoted_path, "skill_ids"))
+
+
+def _load_sessions(buildlog_dir: Path) -> list[Session]:
+    """Load all sessions from JSONL file."""
+    sessions_path = _get_sessions_path(buildlog_dir)
+    if not sessions_path.exists():
+        return []
+
+    sessions = []
+    for line in sessions_path.read_text().strip().split("\n"):
+        if line:
+            try:
+                data = json.loads(line)
+                sessions.append(Session.from_dict(data))
+            except (json.JSONDecodeError, KeyError):
+                continue
+    return sessions
+
+
+def _load_mistakes(buildlog_dir: Path) -> list[Mistake]:
+    """Load all mistakes from JSONL file."""
+    mistakes_path = _get_mistakes_path(buildlog_dir)
+    if not mistakes_path.exists():
+        return []
+
+    mistakes = []
+    for line in mistakes_path.read_text().strip().split("\n"):
+        if line:
+            try:
+                data = json.loads(line)
+                mistakes.append(Mistake.from_dict(data))
+            except (json.JSONDecodeError, KeyError):
+                continue
+    return mistakes
+
+
+def _find_similar_prior_mistake(
+    description: str,
+    error_class: str,
+    current_session_id: str,
+    all_mistakes: list[Mistake],
+) -> Mistake | None:
+    """Find a similar mistake from a prior session.
+
+    Uses semantic hash for similarity matching (simplified approach).
+    """
+    semantic_hash = _compute_semantic_hash(description)
+
+    for mistake in all_mistakes:
+        # Only check mistakes from prior sessions with same error class
+        if (
+            mistake.session_id != current_session_id
+            and mistake.error_class == error_class
+        ):
+            # Check for semantic similarity (hash match or high description overlap)
+            if mistake.semantic_hash == semantic_hash:
+                return mistake
+            # Also check for high word overlap
+            desc_words = set(description.lower().split())
+            mistake_words = set(mistake.description.lower().split())
+            if len(desc_words & mistake_words) / max(len(desc_words), 1) > 0.7:
+                return mistake
+
+    return None
+
+
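
Note: the 0.7 threshold measures overlap relative to the new description's distinct words. A worked example with illustrative descriptions:

    new_desc = "forgot to update the changelog entry"        # 6 distinct words
    old_desc = "forgot to update the changelog for release"  # shares 5 of them
    new_words = set(new_desc.split())
    overlap = len(new_words & set(old_desc.split())) / max(len(new_words), 1)
    assert overlap > 0.7  # 5/6 is about 0.83, so the prior mistake counts as a repeat
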
+# -----------------------------------------------------------------------------
+# Session Tracking Operations
+# -----------------------------------------------------------------------------
+
+
+def start_session(
+    buildlog_dir: Path,
+    error_class: str | None = None,
+    notes: str | None = None,
+) -> StartSessionResult:
+    """Start a new experiment session.
+
+    Args:
+        buildlog_dir: Path to buildlog directory.
+        error_class: Error class being targeted (e.g., "missing_test").
+        notes: Optional notes about the session.
+
+    Returns:
+        StartSessionResult with session ID and current rules count.
+    """
+    now = datetime.now(timezone.utc)
+    session_id = _generate_session_id(now)
+    current_rules = _get_current_rules(buildlog_dir)
+
+    session = Session(
+        id=session_id,
+        started_at=now,
+        rules_at_start=current_rules,
+        error_class=error_class,
+        notes=notes,
+    )
+
+    # Save as active session
+    active_path = _get_active_session_path(buildlog_dir)
+    active_path.parent.mkdir(parents=True, exist_ok=True)
+    active_path.write_text(json.dumps(session.to_dict(), indent=2))
+
+    return StartSessionResult(
+        session_id=session_id,
+        error_class=error_class,
+        rules_count=len(current_rules),
+        message=f"Started session {session_id} with {len(current_rules)} active rules",
+    )
+
+
+def end_session(
+    buildlog_dir: Path,
+    entry_file: str | None = None,
+    notes: str | None = None,
+) -> EndSessionResult:
+    """End the current experiment session.
+
+    Args:
+        buildlog_dir: Path to buildlog directory.
+        entry_file: Corresponding buildlog entry file, if any.
+        notes: Additional notes to append.
+
+    Returns:
+        EndSessionResult with session metrics.
+    """
+    active_path = _get_active_session_path(buildlog_dir)
+
+    if not active_path.exists():
+        raise ValueError("No active session to end")
+
+    # Load active session
+    session_data = json.loads(active_path.read_text())
+    session = Session.from_dict(session_data)
+
+    # Update session with end info
+    now = datetime.now(timezone.utc)
+    session.ended_at = now
+    session.rules_at_end = _get_current_rules(buildlog_dir)
+    if entry_file:
+        session.entry_file = entry_file
+    if notes:
+        session.notes = f"{session.notes or ''}\n{notes}".strip()
+
+    # Append to sessions log
+    sessions_path = _get_sessions_path(buildlog_dir)
+    sessions_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(sessions_path, "a") as f:
+        f.write(json.dumps(session.to_dict()) + "\n")
+
+    # Remove active session marker
+    active_path.unlink()
+
+    # Calculate session metrics
+    all_mistakes = _load_mistakes(buildlog_dir)
+    session_mistakes = [m for m in all_mistakes if m.session_id == session.id]
+    repeated = sum(1 for m in session_mistakes if m.was_repeat)
+
+    duration = (session.ended_at - session.started_at).total_seconds() / 60
+
+    return EndSessionResult(
+        session_id=session.id,
+        duration_minutes=round(duration, 1),
+        mistakes_logged=len(session_mistakes),
+        repeated_mistakes=repeated,
+        rules_at_start=len(session.rules_at_start),
+        rules_at_end=len(session.rules_at_end),
+        message=f"Ended session {session.id} ({duration:.1f}min, {len(session_mistakes)} mistakes, {repeated} repeats)",
+    )
+
+
+def log_mistake(
+    buildlog_dir: Path,
+    error_class: str,
+    description: str,
+    corrected_by_rule: str | None = None,
+) -> LogMistakeResult:
+    """Log a mistake during an experiment session.
+
+    Args:
+        buildlog_dir: Path to buildlog directory.
+        error_class: Category of error (e.g., "missing_test").
+        description: Description of the mistake.
+        corrected_by_rule: Rule ID that should have prevented this.
+
+    Returns:
+        LogMistakeResult indicating if this was a repeat.
+    """
+    active_path = _get_active_session_path(buildlog_dir)
+
+    if not active_path.exists():
+        raise ValueError(
+            "No active session - start one with 'buildlog experiment start'"
+        )
+
+    # Get current session
+    session_data = json.loads(active_path.read_text())
+    session_id = session_data["id"]
+
+    now = datetime.now(timezone.utc)
+    mistake_id = _generate_mistake_id(error_class, now)
+
+    # Check for similar prior mistakes
+    all_mistakes = _load_mistakes(buildlog_dir)
+    similar = _find_similar_prior_mistake(
+        description, error_class, session_id, all_mistakes
+    )
+
+    mistake = Mistake(
+        id=mistake_id,
+        session_id=session_id,
+        timestamp=now,
+        error_class=error_class,
+        description=description,
+        semantic_hash=_compute_semantic_hash(description),
+        was_repeat=similar is not None,
+        corrected_by_rule=corrected_by_rule,
+    )
+
+    # Append to mistakes log
+    mistakes_path = _get_mistakes_path(buildlog_dir)
+    mistakes_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(mistakes_path, "a") as f:
+        f.write(json.dumps(mistake.to_dict()) + "\n")
+
+    message = f"Logged mistake: {error_class}"
+    if similar:
+        message += f" (REPEAT of {similar.id})"
+
+    return LogMistakeResult(
+        mistake_id=mistake_id,
+        session_id=session_id,
+        was_repeat=similar is not None,
+        similar_prior=similar.id if similar else None,
+        message=message,
+    )
+
+
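
Note: a full session lifecycle, assuming an active project directory (the error class, description, and entry file name are illustrative):

    from pathlib import Path

    start_session(Path("."), error_class="missing_test")
    log_mistake(Path("."), "missing_test", "shipped fix without a regression test")
    result = end_session(Path("."), entry_file="2024-01-15-auth-fix.md")
    print(result.message)  # duration, mistake count, and repeat count
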
+def get_session_metrics(
+    buildlog_dir: Path,
+    session_id: str | None = None,
+) -> SessionMetrics:
+    """Get metrics for a session or all sessions.
+
+    Args:
+        buildlog_dir: Path to buildlog directory.
+        session_id: Specific session ID, or None for aggregate metrics.
+
+    Returns:
+        SessionMetrics with mistake rates and rule changes.
+    """
+    sessions = _load_sessions(buildlog_dir)
+    mistakes = _load_mistakes(buildlog_dir)
+
+    if session_id:
+        # Filter to specific session
+        session = next((s for s in sessions if s.id == session_id), None)
+        if not session:
+            raise ValueError(f"Session not found: {session_id}")
+
+        session_mistakes = [m for m in mistakes if m.session_id == session_id]
+        total = len(session_mistakes)
+        repeated = sum(1 for m in session_mistakes if m.was_repeat)
+
+        return SessionMetrics(
+            session_id=session_id,
+            total_mistakes=total,
+            repeated_mistakes=repeated,
+            repeated_mistake_rate=repeated / total if total > 0 else 0.0,
+            rules_at_start=len(session.rules_at_start),
+            rules_at_end=len(session.rules_at_end),
+            rules_added=len(session.rules_at_end) - len(session.rules_at_start),
+        )
+    else:
+        # Aggregate across all sessions
+        total = len(mistakes)
+        repeated = sum(1 for m in mistakes if m.was_repeat)
+
+        rules_start = sessions[0].rules_at_start if sessions else []
+        rules_end = sessions[-1].rules_at_end if sessions else []
+
+        return SessionMetrics(
+            session_id="aggregate",
+            total_mistakes=total,
+            repeated_mistakes=repeated,
+            repeated_mistake_rate=repeated / total if total > 0 else 0.0,
+            rules_at_start=len(rules_start),
+            rules_at_end=len(rules_end),
+            rules_added=len(rules_end) - len(rules_start),
+        )
+
+
+def get_experiment_report(buildlog_dir: Path) -> dict:
+    """Generate a comprehensive experiment report.
+
+    Returns:
+        Dictionary with sessions, metrics, and analysis.
+    """
+    sessions = _load_sessions(buildlog_dir)
+    mistakes = _load_mistakes(buildlog_dir)
+
+    # Per-session metrics
+    session_metrics = []
+    for session in sessions:
+        session_mistakes = [m for m in mistakes if m.session_id == session.id]
+        total = len(session_mistakes)
+        repeated = sum(1 for m in session_mistakes if m.was_repeat)
+        session_metrics.append(
+            {
+                "session_id": session.id,
+                "started_at": session.started_at.isoformat(),
+                "error_class": session.error_class,
+                "total_mistakes": total,
+                "repeated_mistakes": repeated,
+                "repeated_mistake_rate": repeated / total if total > 0 else 0.0,
+                "rules_added": len(session.rules_at_end) - len(session.rules_at_start),
+            }
+        )
+
+    # Aggregate metrics
+    total_mistakes = len(mistakes)
+    total_repeated = sum(1 for m in mistakes if m.was_repeat)
+
+    # Error class breakdown
+    error_classes: dict[str, dict] = {}
+    for mistake in mistakes:
+        if mistake.error_class not in error_classes:
+            error_classes[mistake.error_class] = {"total": 0, "repeated": 0}
+        error_classes[mistake.error_class]["total"] += 1
+        if mistake.was_repeat:
+            error_classes[mistake.error_class]["repeated"] += 1
+
+    return {
+        "summary": {
+            "total_sessions": len(sessions),
+            "total_mistakes": total_mistakes,
+            "total_repeated": total_repeated,
+            "overall_repeat_rate": (
+                total_repeated / total_mistakes if total_mistakes > 0 else 0.0
+            ),
+        },
+        "sessions": session_metrics,
+        "error_classes": error_classes,
+    }