PyPI - suitable-loop - Versions diffs - 0.1.0__py3-none-any.whl - Mend

suitable-loop 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

suitable_loop/__init__.py +3 -0
suitable_loop/__main__.py +5 -0
suitable_loop/analyzers/__init__.py +1 -0
suitable_loop/analyzers/code_analyzer.py +652 -0
suitable_loop/analyzers/git_analyzer.py +510 -0
suitable_loop/analyzers/log_analyzer.py +663 -0
suitable_loop/config.py +60 -0
suitable_loop/db.py +497 -0
suitable_loop/graph/__init__.py +1 -0
suitable_loop/graph/engine.py +341 -0
suitable_loop/models.py +131 -0
suitable_loop/server.py +46 -0
suitable_loop/tools/__init__.py +1 -0
suitable_loop/tools/code_tools.py +104 -0
suitable_loop/tools/git_tools.py +52 -0
suitable_loop/tools/log_tools.py +53 -0
suitable_loop/tools/util_tools.py +49 -0
suitable_loop-0.1.0.dist-info/METADATA +12 -0
suitable_loop-0.1.0.dist-info/RECORD +21 -0
suitable_loop-0.1.0.dist-info/WHEEL +4 -0
suitable_loop-0.1.0.dist-info/entry_points.txt +2 -0

suitable_loop/analyzers/log_analyzer.py ADDED Viewed

@@ -0,0 +1,663 @@
+"""Log and error analysis engine for CodeZero."""
+from __future__ import annotations
+import hashlib
+import json
+import logging
+import os
+import re
+import time
+from pathlib import Path
+from suitable_loop.config import SuitableLoopConfig
+from suitable_loop.db import Database
+from suitable_loop.models import ErrorCodeLink, ErrorGroup, LogEntry
+logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# Regex patterns for stdlib logging formats
+# ---------------------------------------------------------------------------
+# Every pattern captures the named groups ``timestamp``, ``logger``, ``level``
+# and ``message``.  Patterns are ordered from most specific to least specific;
+# ``LogAnalyzer._parse_stdlib_line`` uses the first pattern that matches.
+_STDLIB_PATTERNS: list[re.Pattern[str]] = [
+    # 2024-01-15 10:30:45,123 - module - ERROR - message
+    # (dash-separated fields; fractional seconds are required)
+    re.compile(
+        r"^(?P<timestamp>\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}[,\.]\d{1,6})"
+        r"\s+-\s+(?P<logger>\S+)"
+        r"\s+-\s+(?P<level>[A-Z]+)"
+        r"\s+-\s+(?P<message>.+)$"
+    ),
+    # 2024-01-15 10:30:45,123 module ERROR message
+    # (whitespace-separated fields, so the level is restricted to the five
+    # standard level names)
+    re.compile(
+        r"^(?P<timestamp>\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}[,\.]\d{1,6})"
+        r"\s+(?P<logger>\S+)"
+        r"\s+(?P<level>DEBUG|INFO|WARNING|ERROR|CRITICAL)"
+        r"\s+(?P<message>.+)$"
+    ),
+    # [2024-01-15 10:30:45] ERROR module: message
+    # (bracketed timestamp; fractional seconds are optional)
+    re.compile(
+        r"^\[(?P<timestamp>\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}[,\.]?\d{0,6})\]"
+        r"\s+(?P<level>[A-Z]+)"
+        r"\s+(?P<logger>\S+?):\s+(?P<message>.+)$"
+    ),
+    # ERROR 2024-01-15 10:30:45 module - message
+    # (level first; fractional seconds are optional)
+    re.compile(
+        r"^(?P<level>DEBUG|INFO|WARNING|ERROR|CRITICAL)"
+        r"\s+(?P<timestamp>\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}[,\.]?\d{0,6})"
+        r"\s+(?P<logger>\S+)"
+        r"\s+-\s+(?P<message>.+)$"
+    ),
+]
+# Traceback frame line: File "path", line N, in func
+# The line must start with whitespace; captures ``path``, ``lineno`` and
+# ``func`` (the function name is taken up to the first whitespace).
+_FRAME_RE = re.compile(
+    r'^\s+File "(?P<path>[^"]+)",\s+line\s+(?P<lineno>\d+),\s+in\s+(?P<func>\S+)'
+)
+# Timestamp parsing formats (tried in order) by ``_parse_timestamp``.
+# The ``T``-separated variants cannot come from the patterns above, which
+# require whitespace between date and time; they serve string timestamps
+# found in JSON log lines.
+_TIMESTAMP_FORMATS = [
+    "%Y-%m-%d %H:%M:%S,%f",
+    "%Y-%m-%d %H:%M:%S.%f",
+    "%Y-%m-%d %H:%M:%S",
+    "%Y-%m-%dT%H:%M:%S,%f",
+    "%Y-%m-%dT%H:%M:%S.%f",
+    "%Y-%m-%dT%H:%M:%S",
+]
+def _parse_timestamp(raw: str) -> float | None:
+    """Try to convert a timestamp string to a Unix epoch float."""
+    raw = raw.strip()
+    for fmt in _TIMESTAMP_FORMATS:
+        try:
+            from datetime import datetime, timezone
+            dt = datetime.strptime(raw, fmt).replace(tzinfo=timezone.utc)
+            return dt.timestamp()
+        except ValueError:
+            continue
+    return None
+class LogAnalyzer:
+    """Ingests log files, groups errors, and maps stack frames to code."""
+    def __init__(self, db: Database, config: SuitableLoopConfig) -> None:
+        self.db = db
+        self.config = config
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+    def ingest_logs(self, path: str) -> dict:
+        """Ingest log files from *path* (file or directory).
+        Returns a summary dict with keys: entries_parsed, errors_found,
+        error_groups_new, error_groups_updated.
+        """
+        target = Path(path)
+        if target.is_dir():
+            files = sorted(
+                p
+                for p in target.iterdir()
+                if p.is_file() and p.suffix in (".log", ".txt")
+            )
+        elif target.is_file():
+            files = [target]
+        else:
+            logger.warning("Path does not exist or is not readable: %s", path)
+            return {
+                "entries_parsed": 0,
+                "errors_found": 0,
+                "error_groups_new": 0,
+                "error_groups_updated": 0,
+            }
+        max_entries = self.config.logging.max_entries_per_ingest
+        entries_parsed = 0
+        errors_found = 0
+        error_groups_new = 0
+        error_groups_updated = 0
+        for file_path in files:
+            if entries_parsed >= max_entries:
+                logger.info(
+                    "Reached max_entries_per_ingest limit (%d). Stopping.",
+                    max_entries,
+                )
+                break
+            logger.info("Ingesting %s", file_path)
+            try:
+                lines = file_path.read_text(errors="replace").splitlines()
+            except OSError:
+                logger.exception("Failed to read %s", file_path)
+                continue
+            # 1. Extract tracebacks so we can create error groups.
+            tracebacks = self._extract_tracebacks(lines)
+            # Build a lookup: line index -> traceback dict (keyed on the
+            # exception line index, i.e., the last line of each traceback).
+            tb_by_exc_line: dict[int, dict] = {}
+            for tb in tracebacks:
+                tb_by_exc_line[tb["end_line"]] = tb
+            # 2. Persist error groups and remember mapping from signature
+            #    to group_id so we can link log entries later.
+            sig_to_group: dict[str, int] = {}
+            existing_sigs: set[str] = set()
+            # Pre-check which signatures already exist.
+            for tb in tracebacks:
+                row = self.db.conn.execute(
+                    "SELECT id FROM error_groups WHERE signature = ?",
+                    (tb["signature"],),
+                ).fetchone()
+                if row:
+                    existing_sigs.add(tb["signature"])
+            for tb in tracebacks:
+                now = time.time()
+                eg = ErrorGroup(
+                    signature=tb["signature"],
+                    exception_type=tb["exception_type"],
+                    exception_message=tb["exception_message"],
+                    traceback=tb["raw"],
+                    first_seen=now,
+                    last_seen=now,
+                    occurrence_count=1,
+                )
+                group_id = self.db.upsert_error_group(eg)
+                sig_to_group[tb["signature"]] = group_id
+                errors_found += 1
+                if tb["signature"] in existing_sigs:
+                    error_groups_updated += 1
+                else:
+                    error_groups_new += 1
+                    existing_sigs.add(tb["signature"])
+                # Map frames to indexed code.
+                self._map_frames_to_code(tb["frames"], group_id)
+            # 3. Detect format and parse individual lines.
+            auto_detect = self.config.logging.auto_detect_format
+            detected_format: str | None = None
+            if auto_detect:
+                detected_format = self._detect_format(lines)
+            for line_idx, line in enumerate(lines):
+                if entries_parsed >= max_entries:
+                    break
+                parsed = self._parse_line(line, detected_format)
+                if parsed is None:
+                    continue
+                # Check if this line is the exception line of a traceback.
+                error_group_id: int | None = None
+                if line_idx in tb_by_exc_line:
+                    tb = tb_by_exc_line[line_idx]
+                    error_group_id = sig_to_group.get(tb["signature"])
+                entry = LogEntry(
+                    source_file=str(file_path),
+                    timestamp=parsed.get("timestamp"),
+                    level=parsed.get("level", ""),
+                    logger_name=parsed.get("logger", ""),
+                    message=parsed.get("message", ""),
+                    raw_line=line,
+                    error_group_id=error_group_id,
+                )
+                self.db.insert_log_entry(entry)
+                entries_parsed += 1
+            self.db.commit()
+        return {
+            "entries_parsed": entries_parsed,
+            "errors_found": errors_found,
+            "error_groups_new": error_groups_new,
+            "error_groups_updated": error_groups_updated,
+        }
+    def get_error_groups(self, limit: int = 20) -> list[dict]:
+        """Return error groups sorted by frequency, enriched with code links."""
+        groups = self.db.get_error_groups(limit=limit)
+        results: list[dict] = []
+        for eg in groups:
+            links = self.db.get_error_code_links(eg.id)  # type: ignore[arg-type]
+            results.append(
+                {
+                    "id": eg.id,
+                    "signature": eg.signature,
+                    "exception_type": eg.exception_type,
+                    "exception_message": eg.exception_message,
+                    "first_seen": eg.first_seen,
+                    "last_seen": eg.last_seen,
+                    "occurrence_count": eg.occurrence_count,
+                    "code_links": links,
+                }
+            )
+        return results
+    def error_detail(self, error_group_id: int) -> dict | None:
+        """Full detail for a single error group."""
+        eg = self.db.get_error_group_by_id(error_group_id)
+        if eg is None:
+            return None
+        code_links = self.db.get_error_code_links(error_group_id)
+        # Affected functions (from code links).
+        affected_functions: list[dict] = []
+        for link in code_links:
+            func_id = link.get("function_id")
+            if func_id is not None:
+                func = self.db.get_function_by_id(func_id)
+                if func:
+                    affected_functions.append(
+                        {
+                            "function_id": func.id,
+                            "qualified_name": func.qualified_name,
+                            "file_id": func.file_id,
+                            "line_start": func.line_start,
+                            "line_end": func.line_end,
+                            "frame_position": link.get("frame_position"),
+                        }
+                    )
+        # Sample log entries.
+        rows = self.db.conn.execute(
+            "SELECT * FROM log_entries WHERE error_group_id = ? ORDER BY timestamp DESC LIMIT 10",
+            (error_group_id,),
+        ).fetchall()
+        sample_entries = [dict(r) for r in rows]
+        return {
+            "id": eg.id,
+            "signature": eg.signature,
+            "exception_type": eg.exception_type,
+            "exception_message": eg.exception_message,
+            "traceback": eg.traceback,
+            "first_seen": eg.first_seen,
+            "last_seen": eg.last_seen,
+            "occurrence_count": eg.occurrence_count,
+            "code_links": code_links,
+            "affected_functions": affected_functions,
+            "sample_entries": sample_entries,
+        }
+    def correlate_error(self, error_text: str) -> dict:
+        """Parse raw error text and find matching error groups / code paths."""
+        lines = error_text.splitlines()
+        tracebacks = self._extract_tracebacks(lines)
+        matched_groups: list[dict] = []
+        mapped_frames: list[dict] = []
+        for tb in tracebacks:
+            # Look for an existing error group with the same signature.
+            row = self.db.conn.execute(
+                "SELECT * FROM error_groups WHERE signature = ?",
+                (tb["signature"],),
+            ).fetchone()
+            if row:
+                eg = ErrorGroup(**dict(row))
+                links = self.db.get_error_code_links(eg.id)  # type: ignore[arg-type]
+                matched_groups.append(
+                    {
+                        "id": eg.id,
+                        "signature": eg.signature,
+                        "exception_type": eg.exception_type,
+                        "exception_message": eg.exception_message,
+                        "occurrence_count": eg.occurrence_count,
+                        "code_links": links,
+                    }
+                )
+            else:
+                # No existing group -- try to map frames to code anyway.
+                for pos, (fpath, lineno, func_name) in enumerate(tb["frames"]):
+                    match = self._find_function_for_frame(fpath, lineno)
+                    if match:
+                        file_entity, func_entity = match
+                        mapped_frames.append(
+                            {
+                                "frame_position": pos,
+                                "file_path": fpath,
+                                "line_number": lineno,
+                                "frame_function": func_name,
+                                "matched_function": func_entity.qualified_name,
+                                "function_id": func_entity.id,
+                                "file_id": file_entity.id,
+                            }
+                        )
+        # If no tracebacks found, try a simple text search against existing
+        # error messages.
+        if not tracebacks:
+            search_text = error_text.strip()[:200]
+            rows = self.db.conn.execute(
+                "SELECT * FROM error_groups WHERE exception_message LIKE ? "
+                "ORDER BY occurrence_count DESC LIMIT 5",
+                (f"%{search_text}%",),
+            ).fetchall()
+            for row in rows:
+                eg = ErrorGroup(**dict(row))
+                links = self.db.get_error_code_links(eg.id)  # type: ignore[arg-type]
+                matched_groups.append(
+                    {
+                        "id": eg.id,
+                        "signature": eg.signature,
+                        "exception_type": eg.exception_type,
+                        "exception_message": eg.exception_message,
+                        "occurrence_count": eg.occurrence_count,
+                        "code_links": links,
+                    }
+                )
+        return {
+            "tracebacks_parsed": len(tracebacks),
+            "matched_groups": matched_groups,
+            "unmapped_frame_matches": mapped_frames,
+        }
+    def error_timeline(self, days: int = 7) -> list[dict]:
+        """Return error counts grouped by day and error type for the last *days* days."""
+        cutoff = time.time() - days * 86400
+        rows = self.db.conn.execute(
+            """
+            SELECT
+                date(le.timestamp, 'unixepoch') AS day,
+                eg.exception_type,
+                eg.id AS error_group_id,
+                COUNT(*) AS count
+            FROM log_entries le
+            JOIN error_groups eg ON le.error_group_id = eg.id
+            WHERE le.timestamp >= ?
+            GROUP BY day, eg.id
+            ORDER BY day, count DESC
+            """,
+            (cutoff,),
+        ).fetchall()
+        return [dict(r) for r in rows]
+    # ------------------------------------------------------------------
+    # Private helpers
+    # ------------------------------------------------------------------
+    def _parse_stdlib_line(self, line: str) -> dict | None:
+        """Parse a stdlib-style log line and return a dict or *None*."""
+        for pattern in _STDLIB_PATTERNS:
+            m = pattern.match(line)
+            if m:
+                groups = m.groupdict()
+                ts = _parse_timestamp(groups["timestamp"])
+                return {
+                    "timestamp": ts,
+                    "level": groups["level"].upper(),
+                    "logger": groups.get("logger", ""),
+                    "message": groups["message"],
+                }
+        return None
+    def _parse_json_line(self, line: str) -> dict | None:
+        """Parse a JSON log line and return a normalised dict or *None*."""
+        stripped = line.strip()
+        if not stripped.startswith("{"):
+            return None
+        try:
+            obj = json.loads(stripped)
+        except (json.JSONDecodeError, ValueError):
+            return None
+        if not isinstance(obj, dict):
+            return None
+        # Normalise common key names.
+        level = (
+            obj.get("level")
+            or obj.get("levelname")
+            or obj.get("severity")
+            or ""
+        )
+        message = (
+            obj.get("message")
+            or obj.get("msg")
+            or obj.get("text")
+            or ""
+        )
+        logger_name = (
+            obj.get("logger")
+            or obj.get("name")
+            or obj.get("logger_name")
+            or ""
+        )
+        raw_ts = obj.get("timestamp") or obj.get("time") or obj.get("ts")
+        ts: float | None = None
+        if isinstance(raw_ts, (int, float)):
+            ts = float(raw_ts)
+        elif isinstance(raw_ts, str):
+            ts = _parse_timestamp(raw_ts)
+        return {
+            "timestamp": ts,
+            "level": str(level).upper(),
+            "logger": str(logger_name),
+            "message": str(message),
+        }
+    def _extract_tracebacks(self, lines: list[str]) -> list[dict]:
+        """Extract Python traceback blocks from *lines*.
+        Each returned dict contains:
+          - start_line / end_line: indices in *lines*
+          - raw: the full traceback text
+          - exception_type, exception_message: parsed from the last line
+          - frames: list of (file_path, line_number, function_name) tuples
+          - signature: dedup hash
+        """
+        tracebacks: list[dict] = []
+        i = 0
+        while i < len(lines):
+            if lines[i].strip() == "Traceback (most recent call last):":
+                start = i
+                frames: list[tuple[str, int, str]] = []
+                i += 1
+                # Consume frame lines and code lines.
+                while i < len(lines):
+                    frame_match = _FRAME_RE.match(lines[i])
+                    if frame_match:
+                        frames.append(
+                            (
+                                frame_match.group("path"),
+                                int(frame_match.group("lineno")),
+                                frame_match.group("func"),
+                            )
+                        )
+                        i += 1
+                        # Skip the source-code line that follows the frame.
+                        if i < len(lines) and lines[i].startswith("    "):
+                            i += 1
+                        continue
+                    # A line starting with whitespace that is not a frame is
+                    # still part of the traceback (e.g., chained exceptions or
+                    # "During handling..." blocks).  But a non-indented line is
+                    # the exception line (or end of block).
+                    if lines[i] and not lines[i][0].isspace():
+                        break
+                    i += 1
+                # The current line should be the exception line.
+                if i < len(lines):
+                    exc_line = lines[i].strip()
+                    end = i
+                else:
+                    exc_line = ""
+                    end = i - 1
+                exc_type, _, exc_msg = exc_line.partition(":")
+                exc_type = exc_type.strip()
+                exc_msg = exc_msg.strip()
+                raw = "\n".join(lines[start : end + 1])
+                signature = self._compute_error_signature(exc_type, frames)
+                tracebacks.append(
+                    {
+                        "start_line": start,
+                        "end_line": end,
+                        "raw": raw,
+                        "exception_type": exc_type,
+                        "exception_message": exc_msg,
+                        "frames": frames,
+                        "signature": signature,
+                    }
+                )
+            i += 1
+        return tracebacks
+    def _compute_error_signature(
+        self, exception_type: str, frames: list[tuple[str, int, str]]
+    ) -> str:
+        """Return a hex-digest hash that uniquely identifies this error shape.
+        The signature is derived from the exception type and the top 3 stack
+        frames (file path, line number, function name) so that identical
+        tracebacks always map to the same error group.
+        """
+        top_frames = frames[-3:] if len(frames) > 3 else frames
+        key_parts = [exception_type]
+        for fpath, lineno, func_name in top_frames:
+            key_parts.append(f"{fpath}:{lineno}:{func_name}")
+        key = "|".join(key_parts)
+        return hashlib.sha256(key.encode()).hexdigest()
+    def _map_frames_to_code(
+        self, frames: list[tuple[str, int, str]], error_group_id: int
+    ) -> None:
+        """Try to match each stack frame to an indexed function and persist links."""
+        # Avoid creating duplicate links if we re-ingest.
+        existing = self.db.get_error_code_links(error_group_id)
+        existing_positions: set[int] = {
+            link.get("frame_position", -1) for link in existing
+        }
+        for pos, (fpath, lineno, _func_name) in enumerate(frames):
+            if pos in existing_positions:
+                continue
+            match = self._find_function_for_frame(fpath, lineno)
+            if match:
+                file_entity, func_entity = match
+                link = ErrorCodeLink(
+                    error_group_id=error_group_id,
+                    function_id=func_entity.id,
+                    file_id=file_entity.id,
+                    line_number=lineno,
+                    frame_position=pos,
+                )
+                self.db.insert_error_code_link(link)
+            else:
+                # Even without a function match, try to link by file alone.
+                file_entity = self._find_file_for_frame(fpath)
+                if file_entity:
+                    link = ErrorCodeLink(
+                        error_group_id=error_group_id,
+                        function_id=None,
+                        file_id=file_entity.id,
+                        line_number=lineno,
+                        frame_position=pos,
+                    )
+                    self.db.insert_error_code_link(link)
+        self.db.commit()
+    # ------------------------------------------------------------------
+    # Internal utilities
+    # ------------------------------------------------------------------
+    def _detect_format(self, lines: list[str]) -> str | None:
+        """Sample the first non-empty lines to guess the log format.
+        Returns ``"json"``, ``"stdlib"``, or ``None``.
+        """
+        sample_count = 0
+        json_hits = 0
+        stdlib_hits = 0
+        for line in lines:
+            stripped = line.strip()
+            if not stripped:
+                continue
+            if self._parse_json_line(stripped) is not None:
+                json_hits += 1
+            elif self._parse_stdlib_line(stripped) is not None:
+                stdlib_hits += 1
+            sample_count += 1
+            if sample_count >= 20:
+                break
+        if json_hits > stdlib_hits and json_hits > 0:
+            return "json"
+        if stdlib_hits > 0:
+            return "stdlib"
+        return None
+    def _parse_line(self, line: str, detected_format: str | None) -> dict | None:
+        """Parse a single log line using the detected (or both) format(s)."""
+        stripped = line.strip()
+        if not stripped:
+            return None
+        if detected_format == "json":
+            return self._parse_json_line(stripped)
+        if detected_format == "stdlib":
+            return self._parse_stdlib_line(stripped)
+        # Fallback: try both.
+        result = self._parse_json_line(stripped)
+        if result is not None:
+            return result
+        return self._parse_stdlib_line(stripped)
+    def _find_function_for_frame(self, fpath: str, lineno: int):
+        """Return ``(FileEntity, FunctionEntity)`` or *None*."""
+        file_entity = self._find_file_for_frame(fpath)
+        if file_entity is None:
+            return None
+        # Find the function whose line range contains *lineno*.
+        funcs = self.db.get_functions_by_file(file_entity.id)  # type: ignore[arg-type]
+        for func in funcs:
+            if func.line_start <= lineno <= func.line_end:
+                return file_entity, func
+        return None
+    def _find_file_for_frame(self, fpath: str):
+        """Try to find an indexed ``FileEntity`` matching *fpath*."""
+        # Try exact match first.
+        fe = self.db.get_file_by_path(fpath)
+        if fe:
+            return fe
+        # Try matching by filename suffix (the traceback path might be
+        # absolute while the indexed path is relative, or vice-versa).
+        basename = os.path.basename(fpath)
+        rows = self.db.conn.execute(
+            "SELECT * FROM files WHERE path LIKE ? LIMIT 1",
+            (f"%/{basename}",),
+        ).fetchall()
+        if rows:
+            from suitable_loop.models import FileEntity
+            return FileEntity(**dict(rows[0]))
+        return None