code-context-engine 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_context_engine-0.4.0.dist-info/METADATA +389 -0
- code_context_engine-0.4.0.dist-info/RECORD +63 -0
- code_context_engine-0.4.0.dist-info/WHEEL +5 -0
- code_context_engine-0.4.0.dist-info/entry_points.txt +4 -0
- code_context_engine-0.4.0.dist-info/licenses/LICENSE +21 -0
- code_context_engine-0.4.0.dist-info/top_level.txt +1 -0
- context_engine/__init__.py +3 -0
- context_engine/cli.py +2848 -0
- context_engine/cli_style.py +66 -0
- context_engine/compression/__init__.py +0 -0
- context_engine/compression/compressor.py +144 -0
- context_engine/compression/ollama_client.py +33 -0
- context_engine/compression/output_rules.py +77 -0
- context_engine/compression/prompts.py +9 -0
- context_engine/compression/quality.py +37 -0
- context_engine/config.py +198 -0
- context_engine/dashboard/__init__.py +0 -0
- context_engine/dashboard/_page.py +1548 -0
- context_engine/dashboard/server.py +429 -0
- context_engine/editors.py +265 -0
- context_engine/event_bus.py +24 -0
- context_engine/indexer/__init__.py +0 -0
- context_engine/indexer/chunker.py +147 -0
- context_engine/indexer/embedder.py +154 -0
- context_engine/indexer/embedding_cache.py +168 -0
- context_engine/indexer/git_hooks.py +73 -0
- context_engine/indexer/git_indexer.py +136 -0
- context_engine/indexer/ignorefile.py +96 -0
- context_engine/indexer/manifest.py +78 -0
- context_engine/indexer/pipeline.py +624 -0
- context_engine/indexer/secrets.py +332 -0
- context_engine/indexer/watcher.py +109 -0
- context_engine/integration/__init__.py +0 -0
- context_engine/integration/bootstrap.py +76 -0
- context_engine/integration/git_context.py +132 -0
- context_engine/integration/mcp_server.py +1825 -0
- context_engine/integration/session_capture.py +306 -0
- context_engine/memory/__init__.py +6 -0
- context_engine/memory/compressor.py +344 -0
- context_engine/memory/db.py +922 -0
- context_engine/memory/extractive.py +106 -0
- context_engine/memory/grammar.py +419 -0
- context_engine/memory/hook_installer.py +258 -0
- context_engine/memory/hook_server.py +83 -0
- context_engine/memory/hooks.py +327 -0
- context_engine/memory/migrate.py +268 -0
- context_engine/models.py +96 -0
- context_engine/pricing.py +104 -0
- context_engine/project_commands.py +296 -0
- context_engine/retrieval/__init__.py +0 -0
- context_engine/retrieval/confidence.py +47 -0
- context_engine/retrieval/query_parser.py +105 -0
- context_engine/retrieval/retriever.py +199 -0
- context_engine/serve_http.py +208 -0
- context_engine/services.py +252 -0
- context_engine/storage/__init__.py +0 -0
- context_engine/storage/backend.py +39 -0
- context_engine/storage/fts_store.py +112 -0
- context_engine/storage/graph_store.py +219 -0
- context_engine/storage/local_backend.py +109 -0
- context_engine/storage/remote_backend.py +117 -0
- context_engine/storage/vector_store.py +357 -0
- context_engine/utils.py +72 -0
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""Extractive summarisation using the embedding model already loaded for the index.
|
|
2
|
+
|
|
3
|
+
The summary is always real text from the source — no synthesis, no
|
|
4
|
+
hallucination. Algorithm:
|
|
5
|
+
|
|
6
|
+
1. Split candidate text into sentences.
|
|
7
|
+
2. Embed each with bge-small (or whatever embedder is passed in).
|
|
8
|
+
3. Compute the centroid as the mean of all embeddings.
|
|
9
|
+
4. Rank sentences by cosine similarity to the centroid.
|
|
10
|
+
5. Take the top K, restored to their original order.
|
|
11
|
+
|
|
12
|
+
Failure modes:
|
|
13
|
+
- Empty / single-sentence input: return the input verbatim.
|
|
14
|
+
- Embedder raises: caller falls back to truncation.
|
|
15
|
+
|
|
16
|
+
This module has no dependency on the rest of the memory package; it operates
|
|
17
|
+
on plain strings and any object exposing `embed_query(str) -> tuple[float]`.
|
|
18
|
+
"""
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import re
|
|
22
|
+
from typing import Iterable, Protocol
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Sentence boundary: a run of whitespace that directly follows ".", "!" or
# "?". The lookbehind keeps the punctuation attached to the preceding
# sentence; only the whitespace is consumed by the split.
_SENTENCE_SPLIT = re.compile(r"(?<=[.!?])\s+")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class _EmbedderLike(Protocol):
    """Structural type for the embedder argument of `extractive_summary`.

    Any object with a compatible `embed_query` method satisfies it; no
    inheritance is required.
    """

    def embed_query(self, query: str) -> Iterable[float]:
        """Return the embedding of `query` as an iterable of floats."""
        ...
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def split_sentences(text: str) -> list[str]:
    """Break `text` into a flat list of trimmed, non-empty sentences.

    The split is deliberately coarse: every line is treated as its own
    unit (multi-line tool output tends to carry one statement per line),
    and each line is further divided wherever `_SENTENCE_SPLIT` finds a
    boundary. Blank lines and empty fragments are discarded.
    """
    sentences: list[str] = []
    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if stripped:
            for fragment in _SENTENCE_SPLIT.split(stripped):
                candidate = fragment.strip()
                if candidate:
                    sentences.append(candidate)
    return sentences
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _cosine(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity of `a` and `b`.

    Only the positions both vectors share are considered (`zip` stops at
    the shorter one). If either vector has no magnitude over those
    positions the result is 0.0 rather than a division by zero.
    """
    # Hand-rolled on purpose: the module stays importable without numpy.
    dot_product = 0.0
    sq_norm_a = 0.0
    sq_norm_b = 0.0
    for left, right in zip(a, b):
        dot_product += left * right
        sq_norm_a += left * left
        sq_norm_b += right * right

    if sq_norm_a <= 0.0 or sq_norm_b <= 0.0:
        return 0.0

    magnitude = (sq_norm_a ** 0.5) * (sq_norm_b ** 0.5)
    return dot_product / magnitude
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def extractive_summary(
    text: str,
    *,
    embedder: _EmbedderLike,
    top_k: int = 3,
) -> str:
    """Return the `top_k` most central sentences of `text`, in source order.

    Sentences are embedded one at a time with `embedder.embed_query`, the
    centroid (component-wise mean) of all embeddings is computed, and the
    sentences whose embeddings are most cosine-similar to that centroid
    are kept. The survivors are joined with single spaces in the order
    they appeared in `text`.

    Args:
        text: Source text to summarise.
        embedder: Object exposing `embed_query(str)` returning floats.
        top_k: Maximum number of sentences to keep. Values <= 0 yield "".

    Returns:
        The selected sentences joined by spaces. When `text` has no more
        than `top_k` sentences, all of them are returned (or the stripped
        input if no sentence was found) and the embedder is never called.

    Raises:
        Whatever `embedder.embed_query` raises; it is not caught here.
    """
    if top_k <= 0:
        # Nothing was asked for. Without this guard a negative top_k
        # reached the `[:top_k]` slice below and returned all but the
        # |top_k| least central sentences, and top_k == 0 embedded every
        # sentence only to return an empty string.
        return ""

    sentences = split_sentences(text)
    if len(sentences) <= top_k:
        return " ".join(sentences) if sentences else text.strip()

    # From here on there are at least top_k + 1 >= 2 sentences.
    embeddings: list[list[float]] = [
        list(embedder.embed_query(sentence)) for sentence in sentences
    ]

    if not embeddings[0]:
        # Embedder returned an empty vector: nothing to rank on, so fall
        # back to the leading sentences.
        return " ".join(sentences[:top_k])

    # Centroid = component-wise mean. The dimensionality is taken from the
    # first embedding; every embedding is indexed over that many positions.
    dim = len(embeddings[0])
    centroid = [0.0] * dim
    for emb in embeddings:
        for i in range(dim):
            centroid[i] += emb[i]
    count = len(embeddings)
    centroid = [component / count for component in centroid]

    scored = [
        (index, _cosine(emb, centroid))
        for index, emb in enumerate(embeddings)
    ]
    # Stable sort: sentences with equal scores keep their source order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    chosen_indices = sorted(index for index, _ in scored[:top_k])
    return " ".join(sentences[i] for i in chosen_indices)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def truncation_summary(text: str, *, max_chars: int = 200) -> str:
    """Shorten `text` to at most `max_chars` characters.

    Final fallback when the embedder isn't available. The input is
    stripped first; if it already fits it is returned as-is. Otherwise it
    is cut to `max_chars - 1` characters, trailing whitespace is removed,
    and a single "…" is appended.

    Args:
        text: Text to shorten.
        max_chars: Upper bound on the length of the result. Values <= 0
            yield "".

    Returns:
        The stripped text, or a truncated prefix ending in "…".
    """
    if max_chars <= 0:
        # No room for any character. Without this guard `max_chars - 1`
        # became a negative slice bound, so almost the entire text (plus
        # the ellipsis) was returned instead of a shortened one.
        return ""
    text = text.strip()
    if len(text) <= max_chars:
        return text
    return text[: max_chars - 1].rstrip() + "…"
|
|
@@ -0,0 +1,419 @@
|
|
|
1
|
+
"""Deterministic, model-free prose compression for memory.db rows.
|
|
2
|
+
|
|
3
|
+
Approach (modelled after cavemem's "caveman grammar"):
|
|
4
|
+
|
|
5
|
+
1. Tokenise input into typed tokens. Code, paths, URLs, versions, dates,
|
|
6
|
+
identifiers, and unit-suffixed numbers are *structured* — they're
|
|
7
|
+
preserved byte-for-byte through compress(). Only `prose` tokens are
|
|
8
|
+
transformed.
|
|
9
|
+
|
|
10
|
+
2. Apply prose transformations at one of three intensity levels:
|
|
11
|
+
lite — drop articles only (a, an, the)
|
|
12
|
+
full — drop articles + grammatical fillers (default)
|
|
13
|
+
ultra — full + abbreviation lexicon
|
|
14
|
+
|
|
15
|
+
3. Round-trip property: structured tokens survive compress() unchanged.
|
|
16
|
+
expand() is a light reversal that restores abbreviations and tidies
|
|
17
|
+
spacing, but is *not* required to recover dropped articles/fillers —
|
|
18
|
+
the compression is intentionally lossy on non-content words.
|
|
19
|
+
|
|
20
|
+
The whole module is pure: no IO, no external deps, no LLM call. It's
|
|
21
|
+
deterministic — same input + same level always yields the same output.
|
|
22
|
+
|
|
23
|
+
Used by:
|
|
24
|
+
· record_decision dual-write (mcp_server.py)
|
|
25
|
+
· compress_turn after extractive summary (compressor.py)
|
|
26
|
+
· session_recall / session_timeline output formatting (read-side expand)
|
|
27
|
+
|
|
28
|
+
See `tests/memory/test_grammar.py` for the corpus + round-trip tests.
|
|
29
|
+
"""
|
|
30
|
+
from __future__ import annotations
|
|
31
|
+
|
|
32
|
+
import re
|
|
33
|
+
from typing import Literal
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# The three supported compression intensities, from least to most
# aggressive (see the module docstring for what each one drops).
Level = Literal["lite", "full", "ultra"]
|
|
37
|
+
|
|
38
|
+
# Single source of truth for the compression level applied to memory.db
|
|
39
|
+
# storage. Imported by mcp_server (record_decision), compressor (turn +
|
|
40
|
+
# rollup), migrate (legacy import), and the bench (canonical-form match).
|
|
41
|
+
# All five paths must agree, otherwise the bench gives misleading numbers
|
|
42
|
+
# and the same decision can land in storage at different shapes.
|
|
43
|
+
#
|
|
44
|
+
# Savings-bucket invariant: at "lite" the dropped tokens are articles only,
|
|
45
|
+
# and `expand()` does NOT restore them — so wire savings (what the model
|
|
46
|
+
# sees on read) match storage savings (what was written). The `grammar`
|
|
47
|
+
# bucket in `cce savings` reports both honestly.
|
|
48
|
+
#
|
|
49
|
+
# If this is bumped to "ultra", abbreviations like config↔configuration
|
|
50
|
+
# round-trip via `expand()`, so the wire form is roughly the same length
|
|
51
|
+
# as raw input and the bucket would over-claim. Update the renderer to
|
|
52
|
+
# subtract the abbreviation round-trip before bumping the level.
|
|
53
|
+
# NOTE(review): compress() and compress_with_counts() in this module
# default to level="full", not to this constant — call sites that write to
# storage have to pass level=DEFAULT_LEVEL explicitly.
DEFAULT_LEVEL: Level = "lite"
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# ── Token classes ──────────────────────────────────────────────────────────
|
|
57
|
+
# Any string fragment that matches a structured-token pattern is preserved
|
|
58
|
+
# byte-for-byte through compress(). Order matters — patterns earlier in the
|
|
59
|
+
# list win.
|
|
60
|
+
|
|
61
|
+
# Fenced code block: three backticks, anything (including newlines), three
# backticks. Non-greedy so adjacent fences stay separate.
_FENCE_RE = re.compile(r"```[\s\S]*?```", re.MULTILINE)

# Inline code: a single-line backtick pair with at least one character.
_INLINE_CODE_RE = re.compile(r"`[^`\n]+?`")

# URL: http(s)://… or a bare www.… up to the next whitespace.
_URL_RE = re.compile(r"https?://\S+|www\.\S+")

# ISO-style date, optionally followed by a time (seconds optional) and a
# "Z" or numeric UTC offset.
_DATETIME_RE = re.compile(r"\d{4}-\d{2}-\d{2}(?:[T ]\d{2}:\d{2}(?::\d{2})?(?:Z|[+\-]\d{2}:?\d{2})?)?")

# Version: optional "v", two or three dotted numeric parts, optional
# "-suffix".
_VERSION_RE = re.compile(r"v?\d+\.\d+(?:\.\d+)?(?:-[A-Za-z0-9.]+)?")

# Path. Segments must be substantive (two or more characters, or a file
# name with an extension), so slash abbreviations such as `b/c`, `w/o`
# and `w/` do not match and reach the prose lexicon instead.
_PATH_RE = re.compile(
    "|".join((
        # rooted or dot-relative: /usr/lib, ./src/x, ../pkg
        r"(?:\.{1,2}/|/)[A-Za-z0-9_\-]{2,}(?:/[A-Za-z0-9_./\-]+)*",
        # bare relative with two substantive leading segments: src/app/…
        r"[A-Za-z0-9_\-]{2,}/[A-Za-z0-9_\-]{2,}(?:/[A-Za-z0-9_./\-]+)*",
        # directory + file name with a 1–5 letter extension: lib/x.py
        r"[A-Za-z0-9_\-]{2,}/[A-Za-z0-9_\-]+\.[A-Za-z]{1,5}\b",
    ))
)

# Number with a unit glued on (100ms, 5GB, 0.3s). Bare numbers are not
# matched here and therefore stay in prose.
_NUMBER_UNIT_RE = re.compile(r"\d+(?:\.\d+)?[A-Za-z]{1,4}\b")

# Identifier. A plain lowercase word never matches; one of the three
# shapes below is required.
_IDENT_RE = re.compile(
    "|".join((
        # dotted.path or under_scored pieces: os.path, foo_bar.baz
        r"[A-Za-z][A-Za-z0-9]*(?:[._][A-Za-z0-9]+)+",
        # an uppercase letter after the first character: camelCase, JWT
        r"[A-Za-z][A-Za-z0-9]*[A-Z][A-Za-z0-9]*",
        # letters, an underscore, then more word characters: snake_case_
        r"[A-Za-z]+_[A-Za-z0-9_]+",
    ))
)
|
|
89
|
+
|
|
90
|
+
# ── Stop-word lexicons ─────────────────────────────────────────────────────
|
|
91
|
+
|
|
92
|
+
# Dropped at every level.
_ARTICLES = frozenset(("a", "an", "the"))

# Dropped at "full" and above: articles plus grammatical glue that carries
# no topic signal. Kept conservative — topic words (code, auth, database,
# scale, …) are intentionally absent.
_FILLERS = _ARTICLES.union((
    # auxiliaries / modals
    "is", "are", "was", "were", "be", "been", "being", "am",
    "have", "has", "had",
    "will", "would", "shall", "should", "may", "might", "must",
    # connectives / weak verbs
    "of", "to", "as", "by", "on", "in", "at", "for", "with", "from",
    "and", "or", "but", "if", "than", "then",
    "that", "this", "these", "those",
    # mild redundancies frequently in decision text
    "just", "very", "quite", "really", "actually", "basically",
))

# Dropped at "ultra": everything above plus discourse fillers, common
# pronouns and weak verbs. More aggressive, so some recall is traded for
# extra savings on conversational prose.
_FILLERS_ULTRA = _FILLERS.union((
    "also", "still", "now", "when", "while", "since",
    "i", "we", "you", "he", "she", "it", "they", "them",
    "our", "their", "its", "my", "your", "his", "her",
    "me", "us", "him",
    "do", "does", "did",
    "via", "into", "onto", "upon", "over", "under", "through",
    "much", "more", "most", "less", "least", "some", "any", "all",
    "such", "both", "each", "every", "other", "another",
    "here", "there",
    "let", "get", "got", "take", "took", "give", "gave",
    "truly", "absolutely", "completely", "totally", "entirely",
))
|
|
125
|
+
|
|
126
|
+
# Abbreviation lexicon for ultra. expand() uses the inverse to restore on read.
|
|
127
|
+
_ABBREVIATE: dict[str, str] = {
    # Connectives / discourse
    "because": "b/c",
    "however": "but",
    "therefore": "so",
    "additionally": "+",
    "approximately": "~",
    "particularly": "esp",
    "specifically": "esp",
    "currently": "now",
    "previously": "before",
    "subsequently": "then",
    "throughout": "thru",
    "instead": "vs",
    "without": "w/o",
    "with": "w/",
    "between": "btwn",
    "probably": "likely",
    # Programming jargon
    "configuration": "config",
    "implementation": "impl",
    "documentation": "docs",
    "repository": "repo",
    "performance": "perf",
    "production": "prod",
    "development": "dev",
    "environment": "env",
    "infrastructure": "infra",
    "architecture": "arch",
    # Note: deliberately NOT abbreviating "authentication"/"authorization"/
    # "library" — their natural abbreviations ("auth"/"authz"/"lib") are
    # already real domain words, so expanding "auth" back to "authentication"
    # corrupts text the user wrote as "auth" intentionally.
    "framework": "fw",
    "function": "fn",
    "variable": "var",
    "parameter": "param",
    "argument": "arg",
    "request": "req",
    "response": "resp",
    "database": "db",
    "language": "lang",
    "directory": "dir",
    "execution": "exec",
    "operation": "op",
    "management": "mgmt",
    "deployment": "deploy",
    "synchronisation": "sync",
    "synchronization": "sync",
    "asynchronous": "async",
    "synchronous": "sync",
    "concurrent": "conc",
    "optimisation": "opt",
    "optimization": "opt",
    "automatically": "auto",
    "available": "avail",
    "compatibility": "compat",
    "incompatible": "incompat",
    "information": "info",
    "reference": "ref",
    "different": "diff",
    "specific": "spec",
    "important": "imp",
    "consider": "cons",
    "additional": "extra",
    "responsible": "resp",
}

# Inverse lexicon consulted by expand(). Several words above share one
# abbreviation ("response"/"responsible" -> "resp", the three "sync"
# words, "opt", "esp"). A plain dict inversion silently kept whichever
# word was listed last, so a stored "resp" that came from "response" was
# read back as "responsible". An abbreviation with more than one possible
# source cannot be reversed reliably, so it is left out here and expand()
# shows it as stored.
_EXPAND: dict[str, str] = {
    abbr: word
    for word, abbr in _ABBREVIATE.items()
    if list(_ABBREVIATE.values()).count(abbr) == 1
}
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
# ── Tokenisation ───────────────────────────────────────────────────────────
|
|
199
|
+
|
|
200
|
+
# A token is a (kind, text) tuple. `kind` drives whether it survives
|
|
201
|
+
# compress() unchanged or is transformed.
|
|
202
|
+
# Closed set of token kinds. Everything except "prose" is a structured
# kind that compress() and expand() pass through untouched.
_TokenKind = Literal[
    "fence",
    "inline_code",
    "url",
    "datetime",
    "version",
    "path",
    "number_unit",
    "identifier",
    "prose",
]
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def _tokenise(text: str) -> list[tuple[str, str]]:
    """Partition `text` into consecutive (kind, fragment) pairs.

    Joining every fragment in order gives back `text` exactly.

    Structured spans are claimed pattern by pattern, highest priority
    first (fence, inline code, URL, datetime, version, path, number+unit,
    identifier). A match that touches any character already claimed by an
    earlier pattern is discarded whole. Whatever is left between the
    claimed spans becomes `prose` fragments.
    """
    if not text:
        return []

    prioritised = (
        ("fence", _FENCE_RE),
        ("inline_code", _INLINE_CODE_RE),
        ("url", _URL_RE),
        ("datetime", _DATETIME_RE),
        ("version", _VERSION_RE),
        ("path", _PATH_RE),
        ("number_unit", _NUMBER_UNIT_RE),
        ("identifier", _IDENT_RE),
    )

    # One flag per character: non-zero once a structured span owns it.
    claimed = bytearray(len(text))
    structured: list[tuple[int, int, str]] = []  # (start, end, kind)
    for kind, pattern in prioritised:
        for match in pattern.finditer(text):
            start, end = match.span()
            if any(claimed[start:end]):
                continue  # collides with a higher-priority span
            structured.append((start, end, kind))
            claimed[start:end] = b"\x01" * (end - start)
    structured.sort()

    # Walk the spans left to right, emitting the gaps as prose.
    pieces: list[tuple[str, str]] = []
    position = 0
    for start, end, kind in structured:
        if start > position:
            pieces.append(("prose", text[position:start]))
        pieces.append((kind, text[start:end]))
        position = end
    if position < len(text):
        pieces.append(("prose", text[position:]))
    return pieces
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
# ── Prose transformation ───────────────────────────────────────────────────
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def _transform_prose(text: str, level: Level) -> str:
    """Apply the level-specific word drops/abbreviations to one prose fragment.

    The fragment is split on whitespace (the whitespace itself is kept so
    words never fuse). For each word, leading and trailing non-word
    characters are set aside, the remaining core is lower-cased and looked
    up: words in the level's drop set are removed (attached punctuation
    stays), and at "ultra" words in `_ABBREVIATE` are replaced, keeping an
    initial capital. Afterwards whitespace runs inside the fragment are
    collapsed to a single space, and runs of spaces/tabs at either edge
    are collapsed to one space; newlines at the edges are kept.

    Args:
        text: A prose fragment.
        level: "lite" drops articles, "full" drops `_FILLERS`, any other
            value is treated as "ultra" (`_FILLERS_ULTRA` + abbreviations).

    Returns:
        The transformed fragment. Whitespace-only input is returned
        unchanged.
    """
    if not text.strip():
        return text  # pure whitespace passthrough
    if level == "lite":
        drop_set = _ARTICLES
        do_abbreviate = False
    elif level == "full":
        drop_set = _FILLERS
        do_abbreviate = False
    else:  # "ultra"
        drop_set = _FILLERS_ULTRA
        do_abbreviate = True

    out: list[str] = []
    # Split on whitespace but keep the separators, so "use the JWT"
    # becomes "use JWT" rather than "useJWT".
    for part in re.split(r"(\s+)", text):
        if not part:
            continue
        if part.isspace():
            out.append(part)
            continue
        # Peel punctuation off both ends so "the," / "auth." still match.
        # Both patterns can match the empty string, so they always match.
        leading = re.match(r"^[^\w]*", part).group()
        trailing = re.search(r"[^\w]*$", part).group()
        core = part[len(leading): len(part) - len(trailing)]
        if not core:
            # Punctuation-only chunk: keep verbatim.
            out.append(part)
            continue
        lower = core.lower()
        if lower in drop_set:
            # Drop the word but keep attached punctuation if any (e.g. ",").
            kept = leading + trailing
            if kept:
                out.append(kept)
            continue
        if do_abbreviate and lower in _ABBREVIATE:
            replaced = _ABBREVIATE[lower]
            # Preserve original capitalisation (Title-case → Title-case)
            if core[0].isupper():
                replaced = replaced[0].upper() + replaced[1:] if replaced else replaced
            out.append(leading + replaced + trailing)
            continue
        out.append(part)

    result = "".join(out)
    if not result.strip():
        # Every word was dropped, so only whitespace is left. The edge
        # patterns below would each match the whole string and the
        # whitespace would be emitted twice (" the " -> two spaces,
        # "\nthe\n" -> four newlines). Collapse space/tab runs once and
        # keep newlines as they are.
        return re.sub(r"[ \t]{2,}", " ", result)

    # Collapse whitespace runs inside the fragment, and collapse multi-space
    # runs at the edges so the seam with a neighbouring structured token
    # reads as a single space. Newlines at the edges are preserved.
    leading_ws = re.match(r"^\s*", result).group()
    trailing_ws = re.search(r"\s*$", result).group()
    middle = result[len(leading_ws): len(result) - len(trailing_ws) if trailing_ws else None]
    middle = re.sub(r"\s+", " ", middle)
    leading_ws = re.sub(r"[ \t]{2,}", " ", leading_ws)
    trailing_ws = re.sub(r"[ \t]{2,}", " ", trailing_ws)
    return leading_ws + middle + trailing_ws
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
# ── Public API ─────────────────────────────────────────────────────────────
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def compress(text: str, level: Level = "full") -> str:
    """Shrink the prose in `text` deterministically, leaving structure alone.

    The input is tokenised; every non-prose token (code fences, inline
    code, URLs, dates, versions, paths, unit-suffixed numbers,
    identifiers) is copied to the output byte-for-byte, and every prose
    fragment goes through `_transform_prose` at the requested level.

    Levels:
      · "lite"  — drop articles (a/an/the)
      · "full"  — drop articles + grammatical fillers (default)
      · "ultra" — full + lexicon abbreviations

    Empty input is returned unchanged.
    """
    if not text:
        return text
    # No whitespace clean-up is applied across the joined output: doing so
    # would also rewrite spacing inside fenced code. Each prose fragment
    # tidies its own whitespace inside _transform_prose.
    return "".join(
        _transform_prose(fragment, level) if kind == "prose" else fragment
        for kind, fragment in _tokenise(text)
    )
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
def compress_with_counts(text: str, level: Level = "full") -> tuple[str, int, int]:
    """Run `compress()` and report approximate token counts alongside.

    Token counts use a characters // 4 estimate: 0 for an empty string,
    otherwise at least 1. (Per the original author's note this is meant to
    mirror mcp_server._count_tokens — not verifiable from this module.)

    Returns:
        (compressed_text, raw_token_estimate, compressed_token_estimate)
    """

    def _estimate(fragment: str) -> int:
        # Empty -> 0; anything else -> len // 4, floored at 1.
        if not fragment:
            return 0
        return max(1, len(fragment) // 4)

    compressed = compress(text, level=level)
    return compressed, _estimate(text), _estimate(compressed)
|
|
369
|
+
|
|
370
|
+
|
|
371
|
+
def expand(text: str) -> str:
    """Replace known abbreviations in the prose parts of `text` with their
    full words.

    Structured tokens are copied through unchanged. Within prose, each
    whitespace-separated chunk has its surrounding punctuation set aside
    ("/" counts as part of the word so `w/o` and `b/c` stay intact); if the
    lower-cased core is a key of `_EXPAND` it is replaced, keeping an
    initial capital. Whitespace is emitted exactly as found. Lossy by
    design: words that compress() dropped are not restored.

    NOTE(review): the lookup is unconditional — a prose word that merely
    equals an abbreviation (e.g. one the author typed themselves) is
    rewritten too, regardless of whether compress() produced it.
    """
    if not text:
        return text

    rebuilt: list[str] = []
    for kind, fragment in _tokenise(text):
        if kind != "prose":
            rebuilt.append(fragment)
            continue
        # Reverse abbreviations one chunk at a time, keeping separators.
        for chunk in re.split(r"(\s+)", fragment):
            if not chunk or chunk.isspace():
                rebuilt.append(chunk)
                continue
            # Both patterns can match the empty string, so they always match.
            prefix = re.match(r"^[^\w/]*", chunk).group()
            suffix = re.search(r"[^\w/]*$", chunk).group()
            stem = chunk[len(prefix): len(chunk) - len(suffix)]
            restored = _EXPAND.get(stem.lower())
            if restored is None:
                rebuilt.append(chunk)
                continue
            if stem[:1].isupper():
                restored = restored[0].upper() + restored[1:] if restored else restored
            rebuilt.append(prefix + restored + suffix)
    return "".join(rebuilt)
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
# ── Helpers ────────────────────────────────────────────────────────────────
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
def compression_ratio(original: str, compressed: str) -> float:
    """Return the fraction of characters saved by compression.

    0.0 means nothing was saved, 1.0 means `compressed` is empty, and a
    negative value means `compressed` is longer than `original`. Lengths
    are `len()` of the strings, i.e. characters rather than encoded
    bytes. An empty `original` yields 0.0.
    """
    original_length = len(original)
    if original_length == 0:
        return 0.0
    return 1.0 - len(compressed) / original_length
|