code-context-engine 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. code_context_engine-0.4.0.dist-info/METADATA +389 -0
  2. code_context_engine-0.4.0.dist-info/RECORD +63 -0
  3. code_context_engine-0.4.0.dist-info/WHEEL +5 -0
  4. code_context_engine-0.4.0.dist-info/entry_points.txt +4 -0
  5. code_context_engine-0.4.0.dist-info/licenses/LICENSE +21 -0
  6. code_context_engine-0.4.0.dist-info/top_level.txt +1 -0
  7. context_engine/__init__.py +3 -0
  8. context_engine/cli.py +2848 -0
  9. context_engine/cli_style.py +66 -0
  10. context_engine/compression/__init__.py +0 -0
  11. context_engine/compression/compressor.py +144 -0
  12. context_engine/compression/ollama_client.py +33 -0
  13. context_engine/compression/output_rules.py +77 -0
  14. context_engine/compression/prompts.py +9 -0
  15. context_engine/compression/quality.py +37 -0
  16. context_engine/config.py +198 -0
  17. context_engine/dashboard/__init__.py +0 -0
  18. context_engine/dashboard/_page.py +1548 -0
  19. context_engine/dashboard/server.py +429 -0
  20. context_engine/editors.py +265 -0
  21. context_engine/event_bus.py +24 -0
  22. context_engine/indexer/__init__.py +0 -0
  23. context_engine/indexer/chunker.py +147 -0
  24. context_engine/indexer/embedder.py +154 -0
  25. context_engine/indexer/embedding_cache.py +168 -0
  26. context_engine/indexer/git_hooks.py +73 -0
  27. context_engine/indexer/git_indexer.py +136 -0
  28. context_engine/indexer/ignorefile.py +96 -0
  29. context_engine/indexer/manifest.py +78 -0
  30. context_engine/indexer/pipeline.py +624 -0
  31. context_engine/indexer/secrets.py +332 -0
  32. context_engine/indexer/watcher.py +109 -0
  33. context_engine/integration/__init__.py +0 -0
  34. context_engine/integration/bootstrap.py +76 -0
  35. context_engine/integration/git_context.py +132 -0
  36. context_engine/integration/mcp_server.py +1825 -0
  37. context_engine/integration/session_capture.py +306 -0
  38. context_engine/memory/__init__.py +6 -0
  39. context_engine/memory/compressor.py +344 -0
  40. context_engine/memory/db.py +922 -0
  41. context_engine/memory/extractive.py +106 -0
  42. context_engine/memory/grammar.py +419 -0
  43. context_engine/memory/hook_installer.py +258 -0
  44. context_engine/memory/hook_server.py +83 -0
  45. context_engine/memory/hooks.py +327 -0
  46. context_engine/memory/migrate.py +268 -0
  47. context_engine/models.py +96 -0
  48. context_engine/pricing.py +104 -0
  49. context_engine/project_commands.py +296 -0
  50. context_engine/retrieval/__init__.py +0 -0
  51. context_engine/retrieval/confidence.py +47 -0
  52. context_engine/retrieval/query_parser.py +105 -0
  53. context_engine/retrieval/retriever.py +199 -0
  54. context_engine/serve_http.py +208 -0
  55. context_engine/services.py +252 -0
  56. context_engine/storage/__init__.py +0 -0
  57. context_engine/storage/backend.py +39 -0
  58. context_engine/storage/fts_store.py +112 -0
  59. context_engine/storage/graph_store.py +219 -0
  60. context_engine/storage/local_backend.py +109 -0
  61. context_engine/storage/remote_backend.py +117 -0
  62. context_engine/storage/vector_store.py +357 -0
  63. context_engine/utils.py +72 -0
@@ -0,0 +1,106 @@
1
+ """Extractive summarisation using the embedding model already loaded for the index.
2
+
3
+ The summary is always real text from the source — no synthesis, no
4
+ hallucination. Algorithm:
5
+
6
+ 1. Split candidate text into sentences.
7
+ 2. Embed each with bge-small (or whatever embedder is passed in).
8
+ 3. Compute the centroid as the mean of all embeddings.
9
+ 4. Rank sentences by cosine similarity to the centroid.
10
+ 5. Take the top K, restored to their original order.
11
+
12
+ Failure modes:
13
+ - Empty input, or input with at most `top_k` sentences: returned whole (sentences re-joined with single spaces); the embedder is not called.
14
+ - Embedder raises: caller falls back to truncation.
15
+
16
+ This module has no dependency on the rest of the memory package; it operates
17
+ on plain strings and any object exposing `embed_query(str) -> Iterable[float]`.
18
+ """
19
+ from __future__ import annotations
20
+
21
+ import re
22
+ from typing import Iterable, Protocol
23
+
24
+
25
# Sentence boundary: a run of whitespace that directly follows ".", "!" or "?".
# The lookbehind is zero-width, so the terminator stays attached to the
# sentence it ends and only the whitespace is consumed by the split.
_SENTENCE_SPLIT = re.compile(r"(?<=[.!?])\s+")
26
+
27
+
28
+ class _EmbedderLike(Protocol):
29
+ def embed_query(self, query: str) -> Iterable[float]: ...
30
+
31
+
32
def split_sentences(text: str) -> list[str]:
    """Split `text` into rough sentences, suitable for chat-shaped text.

    Every line break is a hard boundary (multi-line tool output usually has
    one statement per line). Within a line, `_SENTENCE_SPLIT` cuts on the
    whitespace that follows ".", "!" or "?". Blank lines and empty fragments
    are dropped, and every returned sentence is stripped.
    """
    stripped_lines = (raw_line.strip() for raw_line in text.splitlines())
    return [
        fragment.strip()
        for line in stripped_lines
        if line
        for fragment in _SENTENCE_SPLIT.split(line)
        if fragment.strip()
    ]
45
+
46
+
47
+ def _cosine(a: list[float], b: list[float]) -> float:
48
+ # No numpy here — keep this module standalone and free of imports the
49
+ # caller might not want to pay for in cold paths.
50
+ dot = 0.0
51
+ na = 0.0
52
+ nb = 0.0
53
+ for x, y in zip(a, b):
54
+ dot += x * y
55
+ na += x * x
56
+ nb += y * y
57
+ if na <= 0.0 or nb <= 0.0:
58
+ return 0.0
59
+ return dot / ((na ** 0.5) * (nb ** 0.5))
60
+
61
+
62
def extractive_summary(
    text: str,
    *,
    embedder: _EmbedderLike,
    top_k: int = 3,
) -> str:
    """Return the `top_k` most central sentences of `text`, in source order.

    A sentence's centrality is its cosine similarity to the centroid (mean)
    of all sentence embeddings. The summary is built only from sentences
    that appear in `text`; nothing is synthesised.

    Args:
        text: Text to summarise; split with `split_sentences`.
        embedder: Object whose `embed_query(str)` returns the embedding of
            one sentence. Called once per sentence.
        top_k: Maximum number of sentences to keep.

    Returns:
        - "" when `top_k` is zero or negative.
        - All sentences joined by single spaces when there are at most
          `top_k` of them (or `text.strip()` when there are none); the
          embedder is not called in this case.
        - The first `top_k` sentences when the embedder returns an empty
          vector for the first sentence.
        - Otherwise the `top_k` highest-scoring sentences, joined by single
          spaces in their original order.

    Raises:
        Any exception raised by `embedder.embed_query` propagates unchanged.
        IndexError: if an embedding is shorter than the first one.
    """
    # A non-positive budget selects nothing. Without this guard a negative
    # `top_k` reaches `scored[:top_k]` below, which slices from the wrong end
    # and returns almost every sentence; zero would embed every sentence only
    # to return "".
    if top_k <= 0:
        return ""

    sentences = split_sentences(text)
    if len(sentences) <= top_k:
        return " ".join(sentences) if sentences else text.strip()

    embeddings: list[list[float]] = [
        list(embedder.embed_query(sentence)) for sentence in sentences
    ]
    if not embeddings or not embeddings[0]:
        # Nothing usable to rank against; fall back to the leading sentences.
        return " ".join(sentences[:top_k])

    # Centroid = component-wise mean over all sentence embeddings. The first
    # embedding fixes the dimensionality.
    dim = len(embeddings[0])
    centroid = [0.0] * dim
    for emb in embeddings:
        for i in range(dim):
            centroid[i] += emb[i]
    count = len(embeddings)
    centroid = [component / count for component in centroid]

    scored = [
        (index, _cosine(emb, centroid))
        for index, emb in enumerate(embeddings)
    ]
    # Stable sort: sentences with equal scores keep their source order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    chosen_indices = sorted(index for index, _ in scored[:top_k])
    return " ".join(sentences[index] for index in chosen_indices)
99
+
100
+
101
def truncation_summary(text: str, *, max_chars: int = 200) -> str:
    """Final fallback when the embedder isn't available.

    Args:
        text: Text to shorten; surrounding whitespace is stripped first.
        max_chars: Maximum length of the result, in characters.

    Returns:
        The stripped text if it already fits. Otherwise its first
        `max_chars - 1` characters (right-stripped) followed by "…", so the
        result never exceeds `max_chars`. "" when `max_chars` is zero or
        negative.
    """
    text = text.strip()
    # With max_chars <= 0 the slice bound below would be negative and cut
    # from the END of the string, returning nearly the whole text plus an
    # ellipsis. Nothing fits in a non-positive budget.
    if max_chars <= 0:
        return ""
    if len(text) <= max_chars:
        return text
    return text[: max_chars - 1].rstrip() + "…"
@@ -0,0 +1,419 @@
1
+ """Deterministic, model-free prose compression for memory.db rows.
2
+
3
+ Approach (modelled after cavemem's "caveman grammar"):
4
+
5
+ 1. Tokenise input into typed tokens. Code, paths, URLs, versions, dates,
6
+ identifiers, numbers, and headings are *structured* — they're
7
+ preserved byte-for-byte through compress(). Only `prose` tokens are
8
+ transformed.
9
+
10
+ 2. Apply prose transformations at one of three intensity levels:
11
+ lite — drop articles only (a, an, the)
12
+ full — drop articles + grammatical fillers (default)
13
+ ultra — full + abbreviation lexicon
14
+
15
+ 3. Round-trip property: structured tokens survive compress() unchanged.
16
+ expand() is a light reversal that only restores abbreviations; it does
17
+ not re-space the text and is *not* required to recover dropped articles/fillers —
18
+ the compression is intentionally lossy on non-content words.
19
+
20
+ The whole module is pure: no IO, no external deps, no LLM call. It's
21
+ deterministic — same input + same level always yields the same output.
22
+
23
+ Used by:
24
+ · record_decision dual-write (mcp_server.py)
25
+ · compress_turn after extractive summary (compressor.py)
26
+ · session_recall / session_timeline output formatting (read-side expand)
27
+
28
+ See `tests/memory/test_grammar.py` for the corpus + round-trip tests.
29
+ """
30
+ from __future__ import annotations
31
+
32
+ import re
33
+ from typing import Literal
34
+
35
+
36
# Compression intensity accepted by compress(): "lite" drops articles,
# "full" drops articles and grammatical fillers, "ultra" uses the larger
# ultra drop set and also applies the abbreviation lexicon.
Level = Literal["lite", "full", "ultra"]

# Single source of truth for the compression level applied to memory.db
# storage. Imported by mcp_server (record_decision), compressor (turn +
# rollup), migrate (legacy import), and the bench (canonical-form match).
# All five paths must agree, otherwise the bench gives misleading numbers
# and the same decision can land in storage at different shapes.
#
# Savings-bucket invariant: at "lite" the dropped tokens are articles only,
# and `expand()` does NOT restore them — so wire savings (what the model
# sees on read) match storage savings (what was written). The `grammar`
# bucket in `cce savings` reports both honestly.
#
# If this is bumped to "ultra", abbreviations like config↔configuration
# round-trip via `expand()`, so the wire form is roughly the same length
# as raw input and the bucket would over-claim. Update the renderer to
# subtract the abbreviation round-trip before bumping the level.
#
# NOTE: compress() and compress_with_counts() default their own `level`
# parameter to "full", not to this constant; storage paths have to pass
# DEFAULT_LEVEL explicitly.
DEFAULT_LEVEL: Level = "lite"
54
+
55
+
56
+ # ── Token classes ──────────────────────────────────────────────────────────
57
+ # Any string fragment that matches a structured-token pattern is preserved
58
+ # byte-for-byte through compress(). Order matters — patterns earlier in the
59
+ # list win.
60
+
61
# Fenced code block: an opening ``` through the next closing ```, across
# newlines ([\s\S] matches any character). Non-greedy, so adjacent fences
# are matched separately.
_FENCE_RE = re.compile(r"```[\s\S]*?```", re.MULTILINE)
# Inline code: a single-line backtick span containing at least one character.
_INLINE_CODE_RE = re.compile(r"`[^`\n]+?`")

# URL: "http://", "https://" or "www." followed by everything up to the next
# whitespace (trailing punctuation is therefore part of the match).
_URL_RE = re.compile(r"https?://\S+|www\.\S+")
# Date YYYY-MM-DD, optionally followed by "T" or a space and HH:MM[:SS], and
# then optionally by "Z" or a ±HH:MM / ±HHMM offset.
_DATETIME_RE = re.compile(
    r"\d{4}-\d{2}-\d{2}(?:[T ]\d{2}:\d{2}(?::\d{2})?(?:Z|[+\-]\d{2}:?\d{2})?)?"
)
# Version: optional "v", MAJOR.MINOR, optional .PATCH, optional "-suffix"
# made of letters, digits and dots.
_VERSION_RE = re.compile(r"v?\d+\.\d+(?:\.\d+)?(?:-[A-Za-z0-9.]+)?")
# Path: requires *substantive* segments (≥2 chars each, or a clear file
# extension). Single-slash bare-word pairs like the abbreviations `b/c`,
# `w/o`, `w/` deliberately don't match — they flow through as prose so
# the lexicon expansion sees them. Same constraint excludes `/c` etc.
_PATH_RE = re.compile(
    r"(?:\.{1,2}/|/)[A-Za-z0-9_\-]{2,}(?:/[A-Za-z0-9_./\-]+)*"
    r"|[A-Za-z0-9_\-]{2,}/[A-Za-z0-9_\-]{2,}(?:/[A-Za-z0-9_./\-]+)*"
    r"|[A-Za-z0-9_\-]{2,}/[A-Za-z0-9_\-]+\.[A-Za-z]{1,5}\b"
)
# Number with unit attached: 100ms, 5GB, 0.3s. Distinct from a bare number
# so we can let bare numbers stay in prose.
_NUMBER_UNIT_RE = re.compile(r"\d+(?:\.\d+)?[A-Za-z]{1,4}\b")
# Identifier: CamelCase / snake_case / dotted.path. Must contain an
# uppercase, a digit, an underscore, or a dot to be flagged as identifier
# (otherwise a single lowercase word is just prose).
_IDENT_RE = re.compile(
    r"[A-Za-z][A-Za-z0-9]*(?:[._][A-Za-z0-9]+)+|"
    r"[A-Za-z][A-Za-z0-9]*[A-Z][A-Za-z0-9]*|"
    r"[A-Za-z]+_[A-Za-z0-9_]+"
)
89
+
90
+ # ── Stop-word lexicons ─────────────────────────────────────────────────────
91
+
92
# Drop set for the "lite" level. Entries are lowercase; _transform_prose
# lowercases each word before the membership test.
_ARTICLES = frozenset({"a", "an", "the"})

# Grammatical fillers that carry no topic signal in coding-session prose.
# Conservative — topic words (code, auth, database, scale, etc.) are NOT here.
# Drop set for the "full" level (a superset of _ARTICLES).
_FILLERS = _ARTICLES | frozenset({
    # auxiliaries / modals
    "is", "are", "was", "were", "be", "been", "being", "am",
    "have", "has", "had",
    "will", "would", "shall", "should", "may", "might", "must",
    # prepositions / conjunctions / demonstratives
    "of", "to", "as", "by", "on", "in", "at", "for", "with", "from",
    "and", "or", "but", "if", "than", "then",
    "that", "this", "these", "those",
    # mild redundancies frequently in decision text
    "just", "very", "quite", "really", "actually", "basically",
})

# Aggressive drop set used at ultra. Adds discourse fillers, common
# pronouns, weak verbs. Trades a few recall points for ~3× the byte
# savings on conversational prose.
_FILLERS_ULTRA = _FILLERS | frozenset({
    "also", "still", "now", "when", "while", "since",
    "i", "we", "you", "he", "she", "it", "they", "them",
    "our", "their", "its", "my", "your", "his", "her",
    "me", "us", "him",
    "do", "does", "did",
    "via", "into", "onto", "upon", "over", "under", "through",
    "much", "more", "most", "less", "least", "some", "any", "all",
    "such", "both", "each", "every", "other", "another",
    "here", "there",
    "let", "get", "got", "take", "took", "give", "gave",
    "truly", "absolutely", "completely", "totally", "entirely",
})
125
+
126
+ # Abbreviation lexicon for ultra. expand() uses the inverse to restore on read.
127
+ _ABBREVIATE: dict[str, str] = {
128
+ # Connectives / discourse
129
+ "because": "b/c",
130
+ "however": "but",
131
+ "therefore": "so",
132
+ "additionally": "+",
133
+ "approximately": "~",
134
+ "particularly": "esp",
135
+ "specifically": "esp",
136
+ "currently": "now",
137
+ "previously": "before",
138
+ "subsequently": "then",
139
+ "throughout": "thru",
140
+ "instead": "vs",
141
+ "without": "w/o",
142
+ "with": "w/",
143
+ "between": "btwn",
144
+ "probably": "likely",
145
+ # Programming jargon
146
+ "configuration": "config",
147
+ "implementation": "impl",
148
+ "documentation": "docs",
149
+ "repository": "repo",
150
+ "performance": "perf",
151
+ "production": "prod",
152
+ "development": "dev",
153
+ "environment": "env",
154
+ "infrastructure": "infra",
155
+ "architecture": "arch",
156
+ # Note: deliberately NOT abbreviating "authentication"/"authorization"/
157
+ # "library" — their natural abbreviations ("auth"/"authz"/"lib") are
158
+ # already real domain words, so expanding "auth" back to "authentication"
159
+ # corrupts text the user wrote as "auth" intentionally.
160
+ "framework": "fw",
161
+ "function": "fn",
162
+ "variable": "var",
163
+ "parameter": "param",
164
+ "argument": "arg",
165
+ "request": "req",
166
+ "response": "resp",
167
+ "database": "db",
168
+ "language": "lang",
169
+ "directory": "dir",
170
+ "execution": "exec",
171
+ "operation": "op",
172
+ "management": "mgmt",
173
+ "deployment": "deploy",
174
+ "synchronisation": "sync",
175
+ "synchronization": "sync",
176
+ "asynchronous": "async",
177
+ "synchronous": "sync",
178
+ "concurrent": "conc",
179
+ "optimisation": "opt",
180
+ "optimization": "opt",
181
+ "automatically": "auto",
182
+ "available": "avail",
183
+ "compatibility": "compat",
184
+ "incompatible": "incompat",
185
+ "information": "info",
186
+ "reference": "ref",
187
+ "different": "diff",
188
+ "specific": "spec",
189
+ "important": "imp",
190
+ "consider": "cons",
191
+ "additional": "extra",
192
+ "responsible": "resp",
193
+ }
194
+
195
+ _EXPAND: dict[str, str] = {abbr: word for word, abbr in _ABBREVIATE.items()}
196
+
197
+
198
+ # ── Tokenisation ───────────────────────────────────────────────────────────
199
+
200
+ # A token is a (kind, text) tuple. `kind` drives whether it survives
201
+ # compress() unchanged or is transformed.
202
# Closed set of token kinds produced by _tokenise. "prose" is the only kind
# that compress()/expand() rewrite; every other kind is copied through as-is.
# NOTE(review): _tokenise annotates its return as list[tuple[str, str]], so
# this alias is not enforced by the type checker there.
_TokenKind = Literal[
    "fence", "inline_code", "url", "datetime", "version", "path",
    "number_unit", "identifier", "prose",
]
206
+
207
+
208
+ def _tokenise(text: str) -> list[tuple[str, str]]:
209
+ """Split `text` into (kind, fragment) tuples. Concatenating all
210
+ fragments reproduces the input exactly.
211
+
212
+ Strategy: in priority order, find structured-token spans and treat the
213
+ gaps between them as `prose`. The priority order is the order patterns
214
+ appear in `_PATTERNS` below — fenced code wins over everything else,
215
+ then inline code, then URL/datetime/version/path/number+unit/identifier.
216
+ """
217
+ if not text:
218
+ return []
219
+ patterns = [
220
+ ("fence", _FENCE_RE),
221
+ ("inline_code", _INLINE_CODE_RE),
222
+ ("url", _URL_RE),
223
+ ("datetime", _DATETIME_RE),
224
+ ("version", _VERSION_RE),
225
+ ("path", _PATH_RE),
226
+ ("number_unit", _NUMBER_UNIT_RE),
227
+ ("identifier", _IDENT_RE),
228
+ ]
229
+ spans: list[tuple[int, int, str]] = [] # (start, end, kind)
230
+ occupied = [False] * len(text)
231
+ for kind, regex in patterns:
232
+ for m in regex.finditer(text):
233
+ s, e = m.start(), m.end()
234
+ if any(occupied[s:e]):
235
+ continue # overlaps with a higher-priority span
236
+ spans.append((s, e, kind))
237
+ for i in range(s, e):
238
+ occupied[i] = True
239
+ spans.sort()
240
+
241
+ tokens: list[tuple[str, str]] = []
242
+ cursor = 0
243
+ for s, e, kind in spans:
244
+ if cursor < s:
245
+ tokens.append(("prose", text[cursor:s]))
246
+ tokens.append((kind, text[s:e]))
247
+ cursor = e
248
+ if cursor < len(text):
249
+ tokens.append(("prose", text[cursor:]))
250
+ return tokens
251
+
252
+
253
+ # ── Prose transformation ───────────────────────────────────────────────────
254
+
255
+
256
+ def _transform_prose(text: str, level: Level) -> str:
257
+ """Apply level-specific transformations to a prose fragment.
258
+
259
+ Splits on whitespace and filters / abbreviates words. Punctuation
260
+ attached to a word (e.g. "auth,") is preserved by separating it before
261
+ matching against the stop-word set.
262
+ """
263
+ if not text.strip():
264
+ return text # pure whitespace passthrough
265
+ if level == "lite":
266
+ drop_set = _ARTICLES
267
+ do_abbreviate = False
268
+ elif level == "full":
269
+ drop_set = _FILLERS
270
+ do_abbreviate = False
271
+ else: # "ultra"
272
+ drop_set = _FILLERS_ULTRA
273
+ do_abbreviate = True
274
+
275
+ out: list[str] = []
276
+ # Tokenise on whitespace boundaries while preserving the whitespace
277
+ # itself, so a prose fragment like "use the JWT" becomes "use JWT"
278
+ # not "useJWT".
279
+ parts = re.split(r"(\s+)", text)
280
+ for part in parts:
281
+ if not part:
282
+ continue
283
+ if part.isspace():
284
+ out.append(part)
285
+ continue
286
+ # Strip leading/trailing punctuation so "the," / "auth." match.
287
+ leading_match = re.match(r"^[^\w]*", part)
288
+ trailing_match = re.search(r"[^\w]*$", part)
289
+ leading = leading_match.group() if leading_match else ""
290
+ trailing = trailing_match.group() if trailing_match else ""
291
+ core = part[len(leading): len(part) - len(trailing)]
292
+ if not core:
293
+ out.append(part)
294
+ continue
295
+ lower = core.lower()
296
+ if lower in drop_set:
297
+ # Drop the word but keep attached punctuation if any (e.g. ",").
298
+ kept = leading + trailing
299
+ if kept:
300
+ out.append(kept)
301
+ continue
302
+ if do_abbreviate and lower in _ABBREVIATE:
303
+ replaced = _ABBREVIATE[lower]
304
+ # Preserve original capitalisation (Title-case → Title-case)
305
+ if core[0].isupper():
306
+ replaced = replaced[0].upper() + replaced[1:] if replaced else replaced
307
+ out.append(leading + replaced + trailing)
308
+ continue
309
+ out.append(part)
310
+
311
+ result = "".join(out)
312
+ # Collapse runs of internal whitespace that resulted from drops, AND
313
+ # collapse multi-space runs in the leading/trailing whitespace so that
314
+ # when this fragment sits next to a structured token (e.g. an
315
+ # identifier) the seam reads as a single space, not two. Newlines are
316
+ # preserved so block layout survives.
317
+ leading_ws = re.match(r"^\s*", result).group()
318
+ trailing_ws = re.search(r"\s*$", result).group()
319
+ middle = result[len(leading_ws): len(result) - len(trailing_ws) if trailing_ws else None]
320
+ middle = re.sub(r"\s+", " ", middle)
321
+ leading_ws = re.sub(r"[ \t]{2,}", " ", leading_ws)
322
+ trailing_ws = re.sub(r"[ \t]{2,}", " ", trailing_ws)
323
+ return leading_ws + middle + trailing_ws
324
+
325
+
326
+ # ── Public API ─────────────────────────────────────────────────────────────
327
+
328
+
329
def compress(text: str, level: Level = "full") -> str:
    """Deterministically shrink the prose in `text`, leaving structure alone.

    `text` is tokenised with `_tokenise`; fragments of kind "prose" go
    through `_transform_prose` and every other fragment (code, paths, URLs,
    versions, dates, identifiers, ...) is copied through byte-for-byte.

    Levels:
      · "lite"  — drop articles (a/an/the)
      · "full"  — drop articles + grammatical fillers (default)
      · "ultra" — full + lexicon abbreviations

    Empty input is returned as-is.
    """
    if not text:
        return text
    # No whitespace normalisation is applied across the joined output: that
    # would also rewrite spacing *inside* fenced code blocks. Each prose
    # fragment tidies its own whitespace in _transform_prose.
    return "".join(
        _transform_prose(fragment, level) if kind == "prose" else fragment
        for kind, fragment in _tokenise(text)
    )
356
+
357
+
358
def compress_with_counts(text: str, level: Level = "full") -> tuple[str, int, int]:
    """Run `compress()` and also report approximate token counts.

    Returns `(compressed, raw_tokens, compressed_tokens)`. Counts use a
    chars // 4 heuristic with a floor of 1 for any non-empty string and 0
    for an empty one. The function itself performs no IO.
    """

    def approx_tokens(value: str) -> int:
        # Non-empty strings always count as at least one token.
        if not value:
            return 0
        return max(1, len(value) // 4)

    compressed = compress(text, level=level)
    return compressed, approx_tokens(text), approx_tokens(compressed)
369
+
370
+
371
def expand(text: str) -> str:
    """Replace known abbreviations in prose with their full forms.

    `text` is tokenised with `_tokenise`; structured tokens are copied
    through unchanged. In prose fragments each whitespace-separated word is
    looked up (case-insensitively, with surrounding punctuation other than
    "/" peeled off) in `_EXPAND`. A leading capital on the abbreviation is
    carried over to the expansion. Whitespace is left exactly as found.

    Lossy by design: articles and fillers removed by `compress()` are not
    recovered. Empty input is returned as-is.
    """
    if not text:
        return text

    pieces: list[str] = []
    for kind, fragment in _tokenise(text):
        if kind != "prose":
            pieces.append(fragment)
            continue
        # Reverse the abbreviations one word at a time; the capturing split
        # keeps the whitespace runs as their own chunks.
        for chunk in re.split(r"(\s+)", fragment):
            if not chunk or chunk.isspace():
                pieces.append(chunk)
                continue
            # "/" stays part of the word so "w/o", "b/c" and "w/" can match.
            # Both patterns can match the empty string, so they always succeed.
            prefix = re.match(r"^[^\w/]*", chunk).group()
            suffix = re.search(r"[^\w/]*$", chunk).group()
            word = chunk[len(prefix): len(chunk) - len(suffix)]
            restored = _EXPAND.get(word.lower())
            if restored is None:
                pieces.append(chunk)
                continue
            if word[:1].isupper():
                restored = restored[0].upper() + restored[1:] if restored else restored
            pieces.append(prefix + restored + suffix)
    return "".join(pieces)
407
+
408
+
409
+ # ── Helpers ────────────────────────────────────────────────────────────────
410
+
411
+
412
def compression_ratio(original: str, compressed: str) -> float:
    """Return the fraction of `original`'s length saved by `compressed`.

    Lengths are measured with `len()`, i.e. in characters. 0.0 means no
    saving, 1.0 means compressed to empty, and the value is negative when
    `compressed` is longer than `original`. An empty `original` yields 0.0.
    """
    original_length = len(original)
    if original_length == 0:
        return 0.0
    return 1.0 - len(compressed) / original_length