mnemosyne-engine 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. mnemosyne/__init__.py +14 -0
  2. mnemosyne/__main__.py +14 -0
  3. mnemosyne/analytics.py +271 -0
  4. mnemosyne/audit.py +202 -0
  5. mnemosyne/bloom.py +265 -0
  6. mnemosyne/cache.py +252 -0
  7. mnemosyne/chunkers/__init__.py +153 -0
  8. mnemosyne/chunkers/brace_chunker.py +535 -0
  9. mnemosyne/chunkers/code_chunker.py +509 -0
  10. mnemosyne/chunkers/csharp_chunker.py +145 -0
  11. mnemosyne/chunkers/generic_chunker.py +143 -0
  12. mnemosyne/chunkers/go_chunker.py +177 -0
  13. mnemosyne/chunkers/java_chunker.py +234 -0
  14. mnemosyne/chunkers/js_chunker.py +794 -0
  15. mnemosyne/chunkers/rust_chunker.py +134 -0
  16. mnemosyne/chunkers/text_chunker.py +315 -0
  17. mnemosyne/cli.py +931 -0
  18. mnemosyne/compress.py +483 -0
  19. mnemosyne/config.py +315 -0
  20. mnemosyne/daemon.py +342 -0
  21. mnemosyne/delta.py +238 -0
  22. mnemosyne/density.py +253 -0
  23. mnemosyne/embeddings/__init__.py +36 -0
  24. mnemosyne/embeddings/tfidf_backend.py +430 -0
  25. mnemosyne/formatter.py +176 -0
  26. mnemosyne/hasher.py +150 -0
  27. mnemosyne/ingest.py +421 -0
  28. mnemosyne/models.py +230 -0
  29. mnemosyne/prefetch.py +115 -0
  30. mnemosyne/py.typed +0 -0
  31. mnemosyne/ranking.py +198 -0
  32. mnemosyne/retrieval.py +889 -0
  33. mnemosyne/schema.py +409 -0
  34. mnemosyne/store.py +1058 -0
  35. mnemosyne/tests/__init__.py +0 -0
  36. mnemosyne/tests/benchmark.py +675 -0
  37. mnemosyne/tests/benchmark_suite.py +676 -0
  38. mnemosyne/tests/test_analytics.py +252 -0
  39. mnemosyne/tests/test_brace_chunkers.py +638 -0
  40. mnemosyne/tests/test_cache.py +245 -0
  41. mnemosyne/tests/test_chunkers.py +321 -0
  42. mnemosyne/tests/test_compression.py +343 -0
  43. mnemosyne/tests/test_core.py +463 -0
  44. mnemosyne/tests/test_daemon.py +376 -0
  45. mnemosyne/tests/test_integration.py +479 -0
  46. mnemosyne/tests/test_retrieval.py +667 -0
  47. mnemosyne/tests/test_store.py +479 -0
  48. mnemosyne/tests/test_tfidf.py +259 -0
  49. mnemosyne/tiers.py +136 -0
  50. mnemosyne/vectorstore.py +177 -0
  51. mnemosyne_engine-0.3.0.dist-info/METADATA +1248 -0
  52. mnemosyne_engine-0.3.0.dist-info/RECORD +57 -0
  53. mnemosyne_engine-0.3.0.dist-info/WHEEL +5 -0
  54. mnemosyne_engine-0.3.0.dist-info/entry_points.txt +2 -0
  55. mnemosyne_engine-0.3.0.dist-info/licenses/LICENSE +683 -0
  56. mnemosyne_engine-0.3.0.dist-info/licenses/NOTICE +10 -0
  57. mnemosyne_engine-0.3.0.dist-info/top_level.txt +1 -0
mnemosyne/__init__.py ADDED
@@ -0,0 +1,14 @@
1
+ # Copyright 2026 Cast Rock Innovation L.L.C.
2
+ # SPDX-License-Identifier: AGPL-3.0-or-later
3
+
4
+ """
5
+ Mnemosyne — LLM Context Compression and Retrieval Engine.
6
+
7
+ A foundation layer for intelligent codebase indexing, chunking, embedding,
8
+ compression, and retrieval — built entirely on the Python standard library.
9
+ """
10
+
11
# Version string of this package release.
__version__ = "0.3.0"
# Name of the top-level package.
__package_name__ = "mnemosyne"

# Names exported by ``from mnemosyne import *``; listing the dunder names
# explicitly is required because star-imports skip underscore-prefixed names.
__all__ = ["__version__", "__package_name__"]
mnemosyne/__main__.py ADDED
@@ -0,0 +1,14 @@
1
+ # Copyright 2026 Cast Rock Innovation L.L.C.
2
+ # SPDX-License-Identifier: AGPL-3.0-or-later
3
+
4
+ """
5
+ Entry point for ``python -m mnemosyne``.
6
+
7
+ The full CLI is implemented in ``mnemosyne.cli`` (built separately).
8
+ This module's sole responsibility is to invoke it.
9
+ """
10
+
11
+ from mnemosyne.cli import main
12
+
13
# Invoke the CLI only when this module is executed as a script (e.g. via
# ``python -m mnemosyne``); importing the module has no side effect beyond
# importing ``mnemosyne.cli``.
if __name__ == "__main__":
    main()
mnemosyne/analytics.py ADDED
@@ -0,0 +1,271 @@
1
+ # Copyright 2026 Cast Rock Innovation L.L.C.
2
+ # SPDX-License-Identifier: AGPL-3.0-or-later
3
+
4
+ """
5
+ Usage analytics and decay-weighted scoring for Mnemosyne.
6
+
7
+ Tracks ``UsageEvent`` records and derives exponentially-decayed frequency
8
+ scores that feed back into the retrieval ranking pipeline.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import math
14
+ import uuid
15
+ from datetime import datetime, timezone
16
+ from typing import TYPE_CHECKING
17
+
18
+ from mnemosyne.models import UsageEvent
19
+
20
+ if TYPE_CHECKING:
21
+ from mnemosyne.store import Store
22
+
23
+
24
+ class Analytics:
25
+ """
26
+ Session-aware usage tracker with exponential time-decay scoring.
27
+
28
+ Decay formula (half-life model)::
29
+
30
+ score(event) = 2 ^ (-age_days / halflife)
31
+
32
+ For each chunk the per-event contributions of ``'selected'`` and
33
+ ``'used'`` event types are summed. ``'retrieved'`` and ``'discarded'``
34
+ events are stored but do not contribute to the usage score.
35
+
36
+ Args:
37
+ store: The persistent :class:`~mnemosyne.store.Store` instance.
38
+ config: Mnemosyne :class:`~mnemosyne.config.Config` instance.
39
+ Reads ``config.analytics.decay_halflife_days``.
40
+ """
41
+
42
+ def __init__(self, store: "Store", config) -> None:
43
+ self.store = store
44
+ self.halflife: float = float(config.analytics.decay_halflife_days)
45
+ self._session_id: str | None = None
46
+
47
+ # ------------------------------------------------------------------
48
+ # Session management
49
+ # ------------------------------------------------------------------
50
+
51
+ def start_session(self, session_id: str | None = None) -> str:
52
+ """
53
+ Start or resume a usage-tracking session.
54
+
55
+ Args:
56
+ session_id: Explicit session identifier. A new 8-hex-char UUID
57
+ fragment is generated when this is ``None``.
58
+
59
+ Returns:
60
+ The active session ID string.
61
+ """
62
+ self._session_id = session_id or self._generate_session_id()
63
+ return self._session_id
64
+
65
+ # ------------------------------------------------------------------
66
+ # Event recording
67
+ # ------------------------------------------------------------------
68
+
69
+ def record(
70
+ self,
71
+ chunk_id: int,
72
+ event_type: str,
73
+ query_text: str | None = None,
74
+ ) -> None:
75
+ """
76
+ Record a usage event for *chunk_id*.
77
+
78
+ Args:
79
+ chunk_id: The chunk that was interacted with.
80
+ event_type: One of ``'retrieved'``, ``'selected'``, ``'used'``,
81
+ ``'discarded'``.
82
+ query_text: The raw query string (may be None).
83
+ """
84
+ now_iso = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
85
+ event = UsageEvent(
86
+ event_id=None,
87
+ chunk_id=chunk_id,
88
+ query_text=query_text,
89
+ session_id=self._session_id,
90
+ event_type=event_type,
91
+ timestamp=now_iso,
92
+ )
93
+ self.store.save_usage_event(event)
94
+
95
+ # ------------------------------------------------------------------
96
+ # Scoring
97
+ # ------------------------------------------------------------------
98
+
99
+ def get_usage_scores(self) -> dict[int, float]:
100
+ """
101
+ Compute exponentially-decayed usage scores for all chunks.
102
+
103
+ Only ``'selected'`` and ``'used'`` events contribute to the score.
104
+ The contribution of each event decays with time::
105
+
106
+ contribution = 2 ^ (-age_days / halflife)
107
+
108
+ Returns:
109
+ Mapping of ``chunk_id -> score``. Chunks with no qualifying
110
+ events are absent from the dict.
111
+ """
112
+ now_utc = datetime.now(timezone.utc)
113
+ # Fetch all 'selected' and 'used' events from the store
114
+ events = self.store.get_usage_events(event_types=["selected", "used"])
115
+
116
+ scores: dict[int, float] = {}
117
+ for event in events:
118
+ if not event.timestamp:
119
+ continue
120
+ try:
121
+ # Parse ISO-8601 timestamps (handle both Z and +00:00 suffixes)
122
+ ts_str = event.timestamp.replace("Z", "+00:00")
123
+ event_time = datetime.fromisoformat(ts_str)
124
+ if event_time.tzinfo is None:
125
+ event_time = event_time.replace(tzinfo=timezone.utc)
126
+ except (ValueError, AttributeError):
127
+ continue
128
+
129
+ age_days = (now_utc - event_time).total_seconds() / 86400.0
130
+ contribution = math.pow(2.0, -age_days / max(1e-9, self.halflife))
131
+ scores[event.chunk_id] = scores.get(event.chunk_id, 0.0) + contribution
132
+
133
+ return scores
134
+
135
+ # ------------------------------------------------------------------
136
+ # Co-occurrence analysis
137
+ # ------------------------------------------------------------------
138
+
139
+ def get_co_occurrence(self, chunk_ids: list[int]) -> dict[int, int]:
140
+ """
141
+ Find chunks frequently co-retrieved with the given chunks.
142
+
143
+ Looks up sessions in which any of the provided *chunk_ids* were
144
+ retrieved, then counts how often other chunks appeared in those same
145
+ sessions.
146
+
147
+ Args:
148
+ chunk_ids: Reference set of chunk IDs.
149
+
150
+ Returns:
151
+ Mapping of ``co_chunk_id -> session_co_occurrence_count`` for
152
+ all chunks that appeared alongside the input set (excluding the
153
+ input IDs themselves).
154
+ """
155
+ if not chunk_ids:
156
+ return {}
157
+
158
+ # Get sessions that involved any of our reference chunks
159
+ reference_sessions: set[str] = set()
160
+ for cid in chunk_ids:
161
+ events = self.store.get_usage_events_for_chunk(cid)
162
+ for event in events:
163
+ if event.session_id:
164
+ reference_sessions.add(event.session_id)
165
+
166
+ if not reference_sessions:
167
+ return {}
168
+
169
+ # Count co-occurrences within those sessions
170
+ co_counts: dict[int, int] = {}
171
+ reference_set = set(chunk_ids)
172
+ for session_id in reference_sessions:
173
+ session_events = self.store.get_usage_events_for_session(session_id)
174
+ for event in session_events:
175
+ if event.chunk_id not in reference_set:
176
+ co_counts[event.chunk_id] = co_counts.get(event.chunk_id, 0) + 1
177
+
178
+ return co_counts
179
+
180
+ # ------------------------------------------------------------------
181
+ # Precision / feedback analytics
182
+ # ------------------------------------------------------------------
183
+
184
+ def compute_precision_at_k(self, session_id: str | None = None) -> dict:
185
+ """
186
+ Compute precision from feedback events.
187
+
188
+ Precision is defined as ``used / (used + discarded)``. When both
189
+ counts are zero the precision is reported as ``0.0``.
190
+
191
+ Args:
192
+ session_id: If provided, only events for this session are
193
+ considered. ``None`` aggregates across all sessions.
194
+
195
+ Returns:
196
+ Dict with keys ``precision``, ``total_retrieved``,
197
+ ``total_used``, ``total_discarded``, ``total_selected``.
198
+ """
199
+ if session_id is not None:
200
+ events = self.store.get_usage_events_for_session(session_id)
201
+ else:
202
+ events = self.store.get_usage_events()
203
+
204
+ counts: dict[str, int] = {
205
+ "retrieved": 0,
206
+ "used": 0,
207
+ "discarded": 0,
208
+ "selected": 0,
209
+ }
210
+ for event in events:
211
+ if event.event_type in counts:
212
+ counts[event.event_type] += 1
213
+
214
+ denominator = counts["used"] + counts["discarded"]
215
+ precision = counts["used"] / denominator if denominator > 0 else 0.0
216
+
217
+ return {
218
+ "precision": precision,
219
+ "total_retrieved": counts["retrieved"],
220
+ "total_used": counts["used"],
221
+ "total_discarded": counts["discarded"],
222
+ "total_selected": counts["selected"],
223
+ }
224
+
225
+ def get_top_used_chunks(self, limit: int = 5) -> list[dict]:
226
+ """
227
+ Return the *limit* most-used chunks ranked by ``'used'`` event count.
228
+
229
+ Each entry is a dict with ``chunk_id``, ``use_count``, ``file_path``,
230
+ ``symbol_name``, ``line_start``, and ``line_end``.
231
+ """
232
+ events = self.store.get_usage_events(event_types=["used"])
233
+
234
+ # Tally per chunk_id
235
+ chunk_counts: dict[int, int] = {}
236
+ for event in events:
237
+ chunk_counts[event.chunk_id] = chunk_counts.get(event.chunk_id, 0) + 1
238
+
239
+ # Sort descending by count
240
+ ranked = sorted(chunk_counts.items(), key=lambda x: -x[1])[:limit]
241
+
242
+ results: list[dict] = []
243
+ for chunk_id, count in ranked:
244
+ chunk = self.store.get_chunk(chunk_id)
245
+ file_path = ""
246
+ symbol_name = None
247
+ line_start = 0
248
+ line_end = 0
249
+ if chunk is not None:
250
+ line_start = chunk.line_start
251
+ line_end = chunk.line_end
252
+ symbol_name = chunk.symbol_name
253
+ file_rec = self.store.get_file_by_id(chunk.file_id)
254
+ if file_rec is not None:
255
+ file_path = file_rec.rel_path
256
+ results.append({
257
+ "chunk_id": chunk_id,
258
+ "use_count": count,
259
+ "file_path": file_path,
260
+ "symbol_name": symbol_name,
261
+ "line_start": line_start,
262
+ "line_end": line_end,
263
+ })
264
+ return results
265
+
266
+ # ------------------------------------------------------------------
267
+ # Internal helpers
268
+ # ------------------------------------------------------------------
269
+
270
+ def _generate_session_id(self) -> str:
271
+ return str(uuid.uuid4())[:8]
mnemosyne/audit.py ADDED
@@ -0,0 +1,202 @@
1
+ # Copyright 2026 Cast Rock Innovation L.L.C.
2
+ # SPDX-License-Identifier: AGPL-3.0-or-later
3
+
4
+ """
5
+ Append-only JSONL audit logger for Mnemosyne.
6
+
7
+ Design:
8
+ - Every operation is written as one JSON object per line (JSONL format).
9
+ - Writes are atomic at the line level: each ``log()`` call opens, writes,
10
+ and closes (or flushes) the file — there is no open file handle held
11
+ between calls, so concurrent processes can append safely on most OS.
12
+ - ``rotate()`` renames the current log to ``<name>.1.jsonl`` (keeping one
13
+ backup), preventing unbounded growth.
14
+ - ``read()`` supports tail-N filtering and operation-type filtering without
15
+ loading the entire file into memory first.
16
+
17
+ Thread safety: individual ``log()`` writes are protected by a
18
+ ``threading.Lock``. Cross-process safety relies on OS-level append
19
+ atomicity (guaranteed for lines < PIPE_BUF on POSIX; safe enough for audit
20
+ use on all common platforms).
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import json
26
+ import os
27
+ import threading
28
+ from datetime import datetime, timezone
29
+ from pathlib import Path
30
+ from typing import Any
31
+
32
+
33
def _now_utc() -> str:
    """Format the current UTC instant as ``YYYY-MM-DDTHH:MM:SSZ`` (second precision)."""
    current = datetime.now(tz=timezone.utc)
    return current.strftime("%Y-%m-%dT%H:%M:%SZ")
36
+
37
+
38
+ class AuditLog:
39
+ """
40
+ Append-only, JSONL-format audit log.
41
+
42
+ Args:
43
+ path: Filesystem path to the log file. Parent directories are created
44
+ automatically on the first write.
45
+
46
+ Usage::
47
+
48
+ log = AuditLog("/path/to/.mnemosyne/audit.jsonl")
49
+ log.log("index_file", rel_path="src/main.py", chunks=42)
50
+ log.log("query", query="auth middleware", results=5)
51
+
52
+ recent = log.read(last_n=100, op_filter="query")
53
+ """
54
+
55
+ def __init__(self, path: str | Path) -> None:
56
+ self.path = Path(path)
57
+ self._lock = threading.Lock()
58
+
59
+ # ------------------------------------------------------------------
60
+ # Write
61
+ # ------------------------------------------------------------------
62
+
63
+ def log(self, operation: str, **details: Any) -> None:
64
+ """
65
+ Append one audit event to the log.
66
+
67
+ The record is a single JSON object containing at minimum:
68
+ - ``"op"``: the *operation* name (e.g. ``"index_file"``)
69
+ - ``"ts"``: ISO-8601 UTC timestamp of when ``log()`` was called
70
+ - **details: any keyword arguments passed by the caller
71
+
72
+ Args:
73
+ operation: Short operation identifier; should be a lowercase
74
+ snake_case string (e.g. ``"query"``, ``"cache_evict"``).
75
+ **details: Arbitrary key/value pairs to include in the record.
76
+ Values must be JSON-serialisable.
77
+
78
+ Raises:
79
+ TypeError: If any value in *details* is not JSON-serialisable.
80
+ """
81
+ record: dict[str, Any] = {
82
+ "op": operation,
83
+ "ts": _now_utc(),
84
+ }
85
+ record.update(details)
86
+
87
+ line = json.dumps(record, ensure_ascii=False, separators=(",", ":")) + "\n"
88
+
89
+ with self._lock:
90
+ self.path.parent.mkdir(parents=True, exist_ok=True)
91
+ # Open in append mode; 'a' is atomic at line granularity on POSIX.
92
+ with open(self.path, "a", encoding="utf-8") as fh:
93
+ fh.write(line)
94
+
95
+ # ------------------------------------------------------------------
96
+ # Read
97
+ # ------------------------------------------------------------------
98
+
99
+ def read(
100
+ self,
101
+ last_n: int | None = None,
102
+ op_filter: str | None = None,
103
+ ) -> list[dict[str, Any]]:
104
+ """
105
+ Read audit records from the log file.
106
+
107
+ Args:
108
+ last_n: When provided, return only the last *n* matching records
109
+ (tail semantics — most recent *n* entries that satisfy
110
+ *op_filter*). Pass None to return all matching records.
111
+ op_filter: When provided, return only records whose ``"op"`` field
112
+ equals this string (exact match, case-sensitive).
113
+
114
+ Returns:
115
+ List of record dicts in chronological order (oldest first).
116
+ Returns an empty list if the log file does not exist.
117
+
118
+ Note:
119
+ Malformed JSON lines are silently skipped so that a single corrupt
120
+ line does not prevent reading the rest of the log.
121
+ """
122
+ if not self.path.exists():
123
+ return []
124
+
125
+ records: list[dict[str, Any]] = []
126
+
127
+ with open(self.path, "r", encoding="utf-8", errors="replace") as fh:
128
+ for raw_line in fh:
129
+ raw_line = raw_line.strip()
130
+ if not raw_line:
131
+ continue
132
+ try:
133
+ record = json.loads(raw_line)
134
+ except json.JSONDecodeError:
135
+ # Corrupt line — skip rather than raising.
136
+ continue
137
+
138
+ if op_filter is not None and record.get("op") != op_filter:
139
+ continue
140
+
141
+ records.append(record)
142
+
143
+ if last_n is not None and last_n > 0:
144
+ records = records[-last_n:]
145
+
146
+ return records
147
+
148
+ # ------------------------------------------------------------------
149
+ # Rotation
150
+ # ------------------------------------------------------------------
151
+
152
+ def rotate(self, max_size_mb: float = 10.0) -> bool:
153
+ """
154
+ Rotate the log file if it exceeds *max_size_mb* megabytes.
155
+
156
+ Rotation renames the current log to ``<stem>.1<suffix>`` (overwriting
157
+ any existing backup), then the next ``log()`` call will create a fresh
158
+ empty file.
159
+
160
+ Args:
161
+ max_size_mb: Threshold in mebibytes. If the current file is
162
+ smaller than this, no rotation occurs.
163
+
164
+ Returns:
165
+ True if rotation was performed, False if not needed or file absent.
166
+ """
167
+ if not self.path.exists():
168
+ return False
169
+
170
+ size_mb = self.path.stat().st_size / (1024 * 1024)
171
+ if size_mb < max_size_mb:
172
+ return False
173
+
174
+ backup_path = self.path.with_name(
175
+ self.path.stem + ".1" + self.path.suffix
176
+ )
177
+
178
+ with self._lock:
179
+ # Re-check size inside the lock to avoid TOCTOU race.
180
+ if not self.path.exists():
181
+ return False
182
+ if self.path.stat().st_size / (1024 * 1024) < max_size_mb:
183
+ return False
184
+
185
+ # Rename current → backup (atomic on most POSIX filesystems).
186
+ self.path.rename(backup_path)
187
+
188
+ return True
189
+
190
+ # ------------------------------------------------------------------
191
+ # Diagnostics
192
+ # ------------------------------------------------------------------
193
+
194
+ def size_bytes(self) -> int:
195
+ """Return the current log file size in bytes, or 0 if absent."""
196
+ try:
197
+ return self.path.stat().st_size
198
+ except FileNotFoundError:
199
+ return 0
200
+
201
+ def __repr__(self) -> str: # pragma: no cover
202
+ return f"AuditLog(path={str(self.path)!r})"