PyPI - mnemosyne-engine - Versions diffs - 0.3.0__py3-none-any.whl - Mend

mnemosyne-engine 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57) hide show

mnemosyne/__init__.py +14 -0
mnemosyne/__main__.py +14 -0
mnemosyne/analytics.py +271 -0
mnemosyne/audit.py +202 -0
mnemosyne/bloom.py +265 -0
mnemosyne/cache.py +252 -0
mnemosyne/chunkers/__init__.py +153 -0
mnemosyne/chunkers/brace_chunker.py +535 -0
mnemosyne/chunkers/code_chunker.py +509 -0
mnemosyne/chunkers/csharp_chunker.py +145 -0
mnemosyne/chunkers/generic_chunker.py +143 -0
mnemosyne/chunkers/go_chunker.py +177 -0
mnemosyne/chunkers/java_chunker.py +234 -0
mnemosyne/chunkers/js_chunker.py +794 -0
mnemosyne/chunkers/rust_chunker.py +134 -0
mnemosyne/chunkers/text_chunker.py +315 -0
mnemosyne/cli.py +931 -0
mnemosyne/compress.py +483 -0
mnemosyne/config.py +315 -0
mnemosyne/daemon.py +342 -0
mnemosyne/delta.py +238 -0
mnemosyne/density.py +253 -0
mnemosyne/embeddings/__init__.py +36 -0
mnemosyne/embeddings/tfidf_backend.py +430 -0
mnemosyne/formatter.py +176 -0
mnemosyne/hasher.py +150 -0
mnemosyne/ingest.py +421 -0
mnemosyne/models.py +230 -0
mnemosyne/prefetch.py +115 -0
mnemosyne/py.typed +0 -0
mnemosyne/ranking.py +198 -0
mnemosyne/retrieval.py +889 -0
mnemosyne/schema.py +409 -0
mnemosyne/store.py +1058 -0
mnemosyne/tests/__init__.py +0 -0
mnemosyne/tests/benchmark.py +675 -0
mnemosyne/tests/benchmark_suite.py +676 -0
mnemosyne/tests/test_analytics.py +252 -0
mnemosyne/tests/test_brace_chunkers.py +638 -0
mnemosyne/tests/test_cache.py +245 -0
mnemosyne/tests/test_chunkers.py +321 -0
mnemosyne/tests/test_compression.py +343 -0
mnemosyne/tests/test_core.py +463 -0
mnemosyne/tests/test_daemon.py +376 -0
mnemosyne/tests/test_integration.py +479 -0
mnemosyne/tests/test_retrieval.py +667 -0
mnemosyne/tests/test_store.py +479 -0
mnemosyne/tests/test_tfidf.py +259 -0
mnemosyne/tiers.py +136 -0
mnemosyne/vectorstore.py +177 -0
mnemosyne_engine-0.3.0.dist-info/METADATA +1248 -0
mnemosyne_engine-0.3.0.dist-info/RECORD +57 -0
mnemosyne_engine-0.3.0.dist-info/WHEEL +5 -0
mnemosyne_engine-0.3.0.dist-info/entry_points.txt +2 -0
mnemosyne_engine-0.3.0.dist-info/licenses/LICENSE +683 -0
mnemosyne_engine-0.3.0.dist-info/licenses/NOTICE +10 -0
mnemosyne_engine-0.3.0.dist-info/top_level.txt +1 -0

mnemosyne/__init__.py ADDED Viewed

@@ -0,0 +1,14 @@
+# Copyright 2026 Cast Rock Innovation L.L.C.
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""
+Mnemosyne — LLM Context Compression and Retrieval Engine.
+A foundation layer for intelligent codebase indexing, chunking, embedding,
+compression, and retrieval — built entirely on the Python standard library.
+"""
+# Release version string of the package.
+__version__ = "0.3.0"
+# Import name of the package (the top-level ``mnemosyne`` directory).
+__package_name__ = "mnemosyne"
+# Names exported by ``from mnemosyne import *``.
+__all__ = ["__version__", "__package_name__"]

mnemosyne/__main__.py ADDED Viewed

@@ -0,0 +1,14 @@
+# Copyright 2026 Cast Rock Innovation L.L.C.
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""
+Entry point for ``python -m mnemosyne``.
+The full CLI is implemented in ``mnemosyne.cli`` (built separately).
+This module's sole responsibility is to invoke it.
+"""
+from mnemosyne.cli import main
+# Invoke the CLI only when this module is executed (``python -m mnemosyne``),
+# not when it is merely imported.
+if __name__ == "__main__":
+    main()

mnemosyne/analytics.py ADDED Viewed

@@ -0,0 +1,271 @@
+# Copyright 2026 Cast Rock Innovation L.L.C.
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""
+Usage analytics and decay-weighted scoring for Mnemosyne.
+Tracks ``UsageEvent`` records and derives exponentially-decayed frequency
+scores that feed back into the retrieval ranking pipeline.
+"""
+from __future__ import annotations
+import math
+import uuid
+from datetime import datetime, timezone
+from typing import TYPE_CHECKING
+from mnemosyne.models import UsageEvent
+if TYPE_CHECKING:
+    from mnemosyne.store import Store
+class Analytics:
+    """
+    Session-aware usage tracker with exponential time-decay scoring.
+    Decay formula (half-life model)::
+        score(event) = 2 ^ (-max(age_days, 0) / halflife)
+    The age is clamped at zero so that an event stamped in the future
+    (clock skew between writers) weighs exactly ``1.0`` instead of growing
+    without bound.
+    For each chunk the per-event contributions of ``'selected'`` and
+    ``'used'`` event types are summed.  ``'retrieved'`` and ``'discarded'``
+    events are stored but do not contribute to the usage score.
+    Args:
+        store:  The persistent :class:`~mnemosyne.store.Store` instance.
+        config: Mnemosyne :class:`~mnemosyne.config.Config` instance.
+                Reads ``config.analytics.decay_halflife_days``.
+    """
+    def __init__(self, store: "Store", config) -> None:
+        self.store = store
+        self.halflife: float = float(config.analytics.decay_halflife_days)
+        # No session is active until start_session() is called; events
+        # recorded before that are saved with session_id=None.
+        self._session_id: str | None = None
+    # ------------------------------------------------------------------
+    # Session management
+    # ------------------------------------------------------------------
+    def start_session(self, session_id: str | None = None) -> str:
+        """
+        Start or resume a usage-tracking session.
+        Args:
+            session_id: Explicit session identifier.  A new 8-hex-char UUID
+                        fragment is generated when this is ``None`` or an
+                        empty string.
+        Returns:
+            The active session ID string.
+        """
+        self._session_id = session_id or self._generate_session_id()
+        return self._session_id
+    # ------------------------------------------------------------------
+    # Event recording
+    # ------------------------------------------------------------------
+    def record(
+        self,
+        chunk_id: int,
+        event_type: str,
+        query_text: str | None = None,
+    ) -> None:
+        """
+        Record a usage event for *chunk_id*.
+        The event is stamped with the current UTC time (second precision,
+        ``Z`` suffix) and the active session ID, then handed to
+        ``store.save_usage_event``.  *event_type* is stored as given; it is
+        not validated here.
+        Args:
+            chunk_id:    The chunk that was interacted with.
+            event_type:  One of ``'retrieved'``, ``'selected'``, ``'used'``,
+                         ``'discarded'``.
+            query_text:  The raw query string (may be None).
+        """
+        now_iso = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+        event = UsageEvent(
+            event_id=None,
+            chunk_id=chunk_id,
+            query_text=query_text,
+            session_id=self._session_id,
+            event_type=event_type,
+            timestamp=now_iso,
+        )
+        self.store.save_usage_event(event)
+    # ------------------------------------------------------------------
+    # Scoring
+    # ------------------------------------------------------------------
+    def get_usage_scores(self) -> dict[int, float]:
+        """
+        Compute exponentially-decayed usage scores for all chunks.
+        Only ``'selected'`` and ``'used'`` events contribute to the score.
+        The contribution of each event decays with time::
+            contribution = 2 ^ (-max(age_days, 0) / halflife)
+        Events whose timestamp is missing or unparseable are ignored.
+        Returns:
+            Mapping of ``chunk_id -> score``.  Chunks with no qualifying
+            events are absent from the dict.
+        """
+        now_utc = datetime.now(timezone.utc)
+        # Guard against a zero or negative configured half-life, which would
+        # divide by zero or turn the decay into growth.
+        halflife = max(1e-9, self.halflife)
+        scores: dict[int, float] = {}
+        for event in self.store.get_usage_events(event_types=["selected", "used"]):
+            event_time = self._parse_timestamp(event.timestamp)
+            if event_time is None:
+                continue
+            age_days = (now_utc - event_time).total_seconds() / 86400.0
+            # A future timestamp gives a negative age; unclamped it would
+            # score above 1.0 and can overflow math.pow for large skews.
+            age_days = max(0.0, age_days)
+            contribution = math.pow(2.0, -age_days / halflife)
+            scores[event.chunk_id] = scores.get(event.chunk_id, 0.0) + contribution
+        return scores
+    # ------------------------------------------------------------------
+    # Co-occurrence analysis
+    # ------------------------------------------------------------------
+    def get_co_occurrence(self, chunk_ids: list[int]) -> dict[int, int]:
+        """
+        Find chunks that share sessions with the given chunks.
+        Collects the session IDs of every usage event the store returns for
+        the provided *chunk_ids*, then tallies the events of all other
+        chunks within those sessions.
+        Args:
+            chunk_ids: Reference set of chunk IDs.
+        Returns:
+            Mapping of ``co_chunk_id -> event_count`` for all chunks that
+            appeared alongside the input set (excluding the input IDs
+            themselves).  The count is per usage event, so a chunk with
+            several events in one session is counted several times.
+        """
+        if not chunk_ids:
+            return {}
+        reference_set = set(chunk_ids)
+        # Get sessions that involved any of our reference chunks.  Iterating
+        # the set avoids repeated store lookups for duplicate IDs.
+        reference_sessions: set[str] = set()
+        for cid in reference_set:
+            for event in self.store.get_usage_events_for_chunk(cid):
+                # Events recorded outside a session carry no session ID.
+                if event.session_id:
+                    reference_sessions.add(event.session_id)
+        if not reference_sessions:
+            return {}
+        # Count co-occurrences within those sessions
+        co_counts: dict[int, int] = {}
+        for session_id in reference_sessions:
+            for event in self.store.get_usage_events_for_session(session_id):
+                if event.chunk_id not in reference_set:
+                    co_counts[event.chunk_id] = co_counts.get(event.chunk_id, 0) + 1
+        return co_counts
+    # ------------------------------------------------------------------
+    # Precision / feedback analytics
+    # ------------------------------------------------------------------
+    def compute_precision_at_k(self, session_id: str | None = None) -> dict:
+        """
+        Compute precision from feedback events.
+        Precision is defined as ``used / (used + discarded)``.  When both
+        counts are zero the precision is reported as ``0.0``.  Despite the
+        method name, no cut-off *k* is applied: every matching event counts.
+        Args:
+            session_id: If provided, only events for this session are
+                        considered.  ``None`` aggregates across all sessions.
+        Returns:
+            Dict with keys ``precision``, ``total_retrieved``,
+            ``total_used``, ``total_discarded``, ``total_selected``.
+        """
+        if session_id is not None:
+            events = self.store.get_usage_events_for_session(session_id)
+        else:
+            events = self.store.get_usage_events()
+        counts: dict[str, int] = {
+            "retrieved": 0,
+            "used": 0,
+            "discarded": 0,
+            "selected": 0,
+        }
+        for event in events:
+            # Event types outside the four known ones are ignored.
+            if event.event_type in counts:
+                counts[event.event_type] += 1
+        denominator = counts["used"] + counts["discarded"]
+        precision = counts["used"] / denominator if denominator > 0 else 0.0
+        return {
+            "precision": precision,
+            "total_retrieved": counts["retrieved"],
+            "total_used": counts["used"],
+            "total_discarded": counts["discarded"],
+            "total_selected": counts["selected"],
+        }
+    def get_top_used_chunks(self, limit: int = 5) -> list[dict]:
+        """
+        Return the *limit* most-used chunks ranked by ``'used'`` event count.
+        Each entry is a dict with ``chunk_id``, ``use_count``, ``file_path``,
+        ``symbol_name``, ``line_start``, and ``line_end``.  When the chunk or
+        its file record is no longer in the store, the location fields keep
+        their placeholders (``""``, ``None``, ``0``, ``0``).
+        Args:
+            limit: Maximum number of entries; zero or a negative value
+                   yields an empty list.
+        Returns:
+            Entries sorted by descending ``use_count``.
+        """
+        events = self.store.get_usage_events(event_types=["used"])
+        # Tally per chunk_id
+        chunk_counts: dict[int, int] = {}
+        for event in events:
+            chunk_counts[event.chunk_id] = chunk_counts.get(event.chunk_id, 0) + 1
+        # Sort descending by count.  The limit is clamped because a negative
+        # slice bound would drop entries from the tail instead of returning
+        # nothing.
+        ranked = sorted(chunk_counts.items(), key=lambda x: -x[1])[:max(0, limit)]
+        results: list[dict] = []
+        for chunk_id, count in ranked:
+            chunk = self.store.get_chunk(chunk_id)
+            file_path = ""
+            symbol_name = None
+            line_start = 0
+            line_end = 0
+            if chunk is not None:
+                line_start = chunk.line_start
+                line_end = chunk.line_end
+                symbol_name = chunk.symbol_name
+                file_rec = self.store.get_file_by_id(chunk.file_id)
+                if file_rec is not None:
+                    file_path = file_rec.rel_path
+            results.append({
+                "chunk_id": chunk_id,
+                "use_count": count,
+                "file_path": file_path,
+                "symbol_name": symbol_name,
+                "line_start": line_start,
+                "line_end": line_end,
+            })
+        return results
+    # ------------------------------------------------------------------
+    # Internal helpers
+    # ------------------------------------------------------------------
+    @staticmethod
+    def _parse_timestamp(raw) -> datetime | None:
+        """Parse an ISO-8601 string into an aware datetime; None if unusable."""
+        if not raw:
+            return None
+        try:
+            # Rewrite a "Z" suffix as "+00:00" because fromisoformat() only
+            # accepts "Z" itself from Python 3.11 on.
+            parsed = datetime.fromisoformat(raw.replace("Z", "+00:00"))
+        except (ValueError, AttributeError):
+            return None
+        if parsed.tzinfo is None:
+            # Naive timestamps are taken to be UTC.
+            parsed = parsed.replace(tzinfo=timezone.utc)
+        return parsed
+    def _generate_session_id(self) -> str:
+        """Return the first 8 characters of a random UUID4 string."""
+        return str(uuid.uuid4())[:8]

mnemosyne/audit.py ADDED Viewed

@@ -0,0 +1,202 @@
+# Copyright 2026 Cast Rock Innovation L.L.C.
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""
+Append-only JSONL audit logger for Mnemosyne.
+Design:
+  - Every operation is written as one JSON object per line (JSONL format).
+  - Writes are atomic at the line level: each ``log()`` call opens, writes,
+    and closes (or flushes) the file — there is no open file handle held
+    between calls, so concurrent processes can append safely on most OS.
+  - ``rotate()`` renames the current log to ``<name>.1.jsonl`` (keeping one
+    backup), preventing unbounded growth.
+  - ``read()`` supports tail-N filtering and operation-type filtering without
+    loading the entire file into memory first.
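+Thread safety: individual ``log()`` writes are protected by a
+``threading.Lock``.  Cross-process safety relies on the OS append mode
+(``O_APPEND``), which positions every write at the current end of file;
+a short single-line write is not interleaved with other writers in
+practice on common platforms.  (The POSIX ``PIPE_BUF`` atomicity guarantee
+applies to pipes and FIFOs, not to regular files.)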
+"""
+from __future__ import annotations
+import json
+import os
+import threading
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+def _now_utc() -> str:
+    """Format the current UTC instant as ``YYYY-MM-DDTHH:MM:SSZ`` (second precision)."""
+    current = datetime.now(tz=timezone.utc)
+    return current.strftime("%Y-%m-%dT%H:%M:%SZ")
+class AuditLog:
+    """
+    Append-only, JSONL-format audit log.
+    Args:
+        path: Filesystem path to the log file.  Parent directories are created
+              automatically on the first write.
+    Usage::
+        log = AuditLog("/path/to/.mnemosyne/audit.jsonl")
+        log.log("index_file", rel_path="src/main.py", chunks=42)
+        log.log("query",      query="auth middleware", results=5)
+        recent = log.read(last_n=100, op_filter="query")
+    """
+    def __init__(self, path: str | Path) -> None:
+        self.path = Path(path)
+        # Serialises log() and rotate() across the threads of this process.
+        self._lock = threading.Lock()
+    # ------------------------------------------------------------------
+    # Write
+    # ------------------------------------------------------------------
+    def log(self, operation: str, **details: Any) -> None:
+        """
+        Append one audit event to the log.
+        The record is a single JSON object containing at minimum:
+          - ``"op"``:        the *operation* name (e.g. ``"index_file"``)
+          - ``"ts"``:        ISO-8601 UTC timestamp of when ``log()`` was called
+          - **details:       any keyword arguments passed by the caller
+        Details are merged last, so a detail named ``op`` or ``ts`` replaces
+        the corresponding built-in field.
+        Args:
+            operation: Short operation identifier; should be a lowercase
+                       snake_case string (e.g. ``"query"``, ``"cache_evict"``).
+            **details: Arbitrary key/value pairs to include in the record.
+                       Values must be JSON-serialisable.
+        Raises:
+            TypeError: If any value in *details* is not JSON-serialisable.
+        """
+        record: dict[str, Any] = {
+            "op": operation,
+            "ts": _now_utc(),
+        }
+        record.update(details)
+        # Serialise outside the lock so the critical section covers only the
+        # file I/O; a serialisation error also leaves the file untouched.
+        line = json.dumps(record, ensure_ascii=False, separators=(",", ":")) + "\n"
+        with self._lock:
+            self.path.parent.mkdir(parents=True, exist_ok=True)
+            # Append mode: the whole line goes out in one write call when the
+            # handle is closed, positioned at the current end of file.
+            with open(self.path, "a", encoding="utf-8") as fh:
+                fh.write(line)
+    # ------------------------------------------------------------------
+    # Read
+    # ------------------------------------------------------------------
+    def read(
+        self,
+        last_n: int | None = None,
+        op_filter: str | None = None,
+    ) -> list[dict[str, Any]]:
+        """
+        Read audit records from the log file.
+        The file is streamed line by line; when *last_n* is given only the
+        most recent *n* matching records are kept in memory.
+        Args:
+            last_n:    When provided, return only the last *n* matching records
+                       (tail semantics — most recent *n* entries that satisfy
+                       *op_filter*).  Pass None to return all matching records;
+                       zero or a negative value is treated like None.
+            op_filter: When provided, return only records whose ``"op"`` field
+                       equals this string (exact match, case-sensitive).
+        Returns:
+            List of record dicts in chronological order (oldest first).
+            Returns an empty list if the log file does not exist.
+        Note:
+            Malformed JSON lines, and lines holding valid JSON that is not an
+            object, are silently skipped so that a single corrupt line does
+            not prevent reading the rest of the log.
+        """
+        from collections import deque
+        keep_tail = last_n is not None and last_n > 0
+        # A bounded deque discards the oldest match as each new one arrives.
+        records: Any = deque(maxlen=last_n) if keep_tail else []
+        try:
+            with open(self.path, "r", encoding="utf-8", errors="replace") as fh:
+                for raw_line in fh:
+                    raw_line = raw_line.strip()
+                    if not raw_line:
+                        continue
+                    try:
+                        record = json.loads(raw_line)
+                    except json.JSONDecodeError:
+                        # Corrupt line — skip rather than raising.
+                        continue
+                    if not isinstance(record, dict):
+                        # Valid JSON but not an audit record (e.g. a bare
+                        # list or number); ``.get`` would fail on it.
+                        continue
+                    if op_filter is not None and record.get("op") != op_filter:
+                        continue
+                    records.append(record)
+        except FileNotFoundError:
+            # Never written, or rotated away by a concurrent rotate().
+            return []
+        return list(records)
+    # ------------------------------------------------------------------
+    # Rotation
+    # ------------------------------------------------------------------
+    def rotate(self, max_size_mb: float = 10.0) -> bool:
+        """
+        Rotate the log file if it has reached *max_size_mb* mebibytes.
+        Rotation renames the current log to ``<stem>.1<suffix>`` (overwriting
+        any existing backup), then the next ``log()`` call will create a fresh
+        empty file.
+        Args:
+            max_size_mb: Threshold in mebibytes (1 MiB = 1024 * 1024 bytes).
+                         If the current file is smaller than this, no
+                         rotation occurs.
+        Returns:
+            True if rotation was performed, False if not needed or file absent.
+        """
+        # Unlocked pre-check: the common "nothing to do" case returns without
+        # contending for the lock.
+        if not self._needs_rotation(max_size_mb):
+            return False
+        backup_path = self.path.with_name(
+            self.path.stem + ".1" + self.path.suffix
+        )
+        with self._lock:
+            # Re-check inside the lock: another thread may have rotated
+            # between the pre-check and here.
+            if not self._needs_rotation(max_size_mb):
+                return False
+            try:
+                # replace() overwrites an existing backup on every platform;
+                # rename() raises FileExistsError on Windows in that case.
+                self.path.replace(backup_path)
+            except FileNotFoundError:
+                # Another process moved the file after our size check.
+                return False
+        return True
+    def _needs_rotation(self, max_size_mb: float) -> bool:
+        """Return True when the log exists and is not below the threshold."""
+        try:
+            size_mb = self.path.stat().st_size / (1024 * 1024)
+        except FileNotFoundError:
+            return False
+        return not (size_mb < max_size_mb)
+    # ------------------------------------------------------------------
+    # Diagnostics
+    # ------------------------------------------------------------------
+    def size_bytes(self) -> int:
+        """Return the current log file size in bytes, or 0 if absent."""
+        try:
+            return self.path.stat().st_size
+        except FileNotFoundError:
+            return 0
+    def __repr__(self) -> str:  # pragma: no cover
+        return f"AuditLog(path={str(self.path)!r})"