ccllm 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. ccllm-0.1.0/LICENSE +21 -0
  2. ccllm-0.1.0/PKG-INFO +32 -0
  3. ccllm-0.1.0/pyproject.toml +51 -0
  4. ccllm-0.1.0/setup.cfg +4 -0
  5. ccllm-0.1.0/src/ccllm/__init__.py +0 -0
  6. ccllm-0.1.0/src/ccllm/api/__init__.py +3 -0
  7. ccllm-0.1.0/src/ccllm/api/app.py +218 -0
  8. ccllm-0.1.0/src/ccllm/compression/__init__.py +31 -0
  9. ccllm-0.1.0/src/ccllm/compression/compressor.py +300 -0
  10. ccllm-0.1.0/src/ccllm/compression/decompressor.py +184 -0
  11. ccllm-0.1.0/src/ccllm/compression/dictionary.py +196 -0
  12. ccllm-0.1.0/src/ccllm/compression/tokenizer.py +71 -0
  13. ccllm-0.1.0/src/ccllm/memory/__init__.py +3 -0
  14. ccllm-0.1.0/src/ccllm/memory/memory_manager.py +179 -0
  15. ccllm-0.1.0/src/ccllm/retrieval/__init__.py +17 -0
  16. ccllm-0.1.0/src/ccllm/retrieval/embedder.py +150 -0
  17. ccllm-0.1.0/src/ccllm/retrieval/retriever.py +230 -0
  18. ccllm-0.1.0/src/ccllm/retrieval/vector_store.py +228 -0
  19. ccllm-0.1.0/src/ccllm/storage/__init__.py +8 -0
  20. ccllm-0.1.0/src/ccllm/storage/db.py +258 -0
  21. ccllm-0.1.0/src/ccllm/storage/models.py +58 -0
  22. ccllm-0.1.0/src/ccllm.egg-info/PKG-INFO +32 -0
  23. ccllm-0.1.0/src/ccllm.egg-info/SOURCES.txt +36 -0
  24. ccllm-0.1.0/src/ccllm.egg-info/dependency_links.txt +1 -0
  25. ccllm-0.1.0/src/ccllm.egg-info/requires.txt +12 -0
  26. ccllm-0.1.0/src/ccllm.egg-info/top_level.txt +1 -0
  27. ccllm-0.1.0/tests/test_compressor.py +312 -0
  28. ccllm-0.1.0/tests/test_decompressor.py +558 -0
  29. ccllm-0.1.0/tests/test_dictionary.py +113 -0
  30. ccllm-0.1.0/tests/test_embedder.py +200 -0
  31. ccllm-0.1.0/tests/test_integration_compression.py +189 -0
  32. ccllm-0.1.0/tests/test_integration_retrieval.py +183 -0
  33. ccllm-0.1.0/tests/test_integration_storage.py +155 -0
  34. ccllm-0.1.0/tests/test_memory_manager.py +304 -0
  35. ccllm-0.1.0/tests/test_retriever.py +244 -0
  36. ccllm-0.1.0/tests/test_storage.py +268 -0
  37. ccllm-0.1.0/tests/test_tokenizer.py +104 -0
  38. ccllm-0.1.0/tests/test_vector_store.py +392 -0
ccllm-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Wasi_Ahmad
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
ccllm-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,32 @@
1
+ Metadata-Version: 2.4
2
+ Name: ccllm
3
+ Version: 0.1.0
4
+ Summary: Lossless text compression, storage, memory management, and retrieval toolkit
5
+ Author-email: Wasi Ahmad <wasiahamd0569@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/<your-user-or-org>/<repo>
8
+ Project-URL: Repository, https://github.com/<your-user-or-org>/<repo>
9
+ Project-URL: Issues, https://github.com/<your-user-or-org>/<repo>/issues
10
+ Keywords: compression,retrieval,memory,storage,nlp
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Requires-Python: >=3.10
19
+ Description-Content-Type: text/markdown
20
+ License-File: LICENSE
21
+ Requires-Dist: fastapi<1.0,>=0.110
22
+ Requires-Dist: pydantic<3.0,>=2.7
23
+ Requires-Dist: numpy<2.0,>=1.26
24
+ Requires-Dist: scikit-learn<2.0,>=1.4
25
+ Requires-Dist: uvicorn<1.0,>=0.29
26
+ Provides-Extra: dev
27
+ Requires-Dist: pytest<9,>=8; extra == "dev"
28
+ Requires-Dist: pytest-cov<6,>=5; extra == "dev"
29
+ Requires-Dist: ruff<0.6,>=0.4; extra == "dev"
30
+ Requires-Dist: build>=1.2; extra == "dev"
31
+ Requires-Dist: twine<6,>=5; extra == "dev"
32
+ Dynamic: license-file
@@ -0,0 +1,51 @@
1
[build-system]
# NOTE: "build" removed from requires — it is the PEP 517 *frontend* used to
# invoke the backend, not a build-time dependency of this package. It remains
# available to developers via the "dev" extra below.
requires = ["setuptools>=69", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "ccllm"
version = "0.1.0"
description = "Lossless text compression, storage, memory management, and retrieval toolkit"
readme = "README.md"
requires-python = ">=3.10"
license = { text = "MIT" }
authors = [
  { name = "Wasi Ahmad", email = "wasiahamd0569@gmail.com" }
]
keywords = ["compression", "retrieval", "memory", "storage", "nlp"]
classifiers = [
  "Development Status :: 3 - Alpha",
  "Intended Audience :: Developers",
  "License :: OSI Approved :: MIT License",
  "Programming Language :: Python :: 3",
  "Programming Language :: Python :: 3.10",
  "Programming Language :: Python :: 3.11",
  "Programming Language :: Python :: 3.12"
]
dependencies = [
  "fastapi>=0.110,<1.0",
  "pydantic>=2.7,<3.0",
  "numpy>=1.26,<2.0",
  "scikit-learn>=1.4,<2.0",
  "uvicorn>=0.29,<1.0"
]

[project.optional-dependencies]
dev = [
  "pytest>=8,<9",
  "pytest-cov<6,>=5",
  "ruff>=0.4,<0.6",
  "build>=1.2",
  "twine>=5,<6"
]

[tool.setuptools]
package-dir = {"" = "src"}

[tool.setuptools.packages.find]
where = ["src"]

# TODO: replace the placeholder repository URLs before release.
[project.urls]
Homepage = "https://github.com/<your-user-or-org>/<repo>"
Repository = "https://github.com/<your-user-or-org>/<repo>"
Issues = "https://github.com/<your-user-or-org>/<repo>/issues"
ccllm-0.1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
File without changes
@@ -0,0 +1,3 @@
1
"""Public surface of :mod:`ccllm.api` — re-exports the FastAPI application."""

# Import via the installed package name. The previous "src.ccllm" prefix only
# resolved when running from the repository root; with the src-layout declared
# in pyproject.toml (package-dir = {"" = "src"}) the installed package is
# importable as "ccllm", so the "src."-prefixed import breaks for users.
from ccllm.api.app import app

__all__ = ["app"]
@@ -0,0 +1,218 @@
1
from __future__ import annotations

from typing import Any, Literal

from fastapi import FastAPI, HTTPException, Query
from pydantic import BaseModel, Field

# Import via the installed package name: the former "src.ccllm" prefix only
# resolved from a repo-root checkout and breaks after `pip install` (the
# src-layout in pyproject.toml installs the package as "ccllm").
from ccllm.memory import MemoryManager
from ccllm.retrieval import MemoryRetriever
from ccllm.storage import CompressionStorage

# -----------------------------
# App setup
# -----------------------------

# Module-level singletons shared by all request handlers.
# NOTE(review): assumes CompressionStorage creates the "data/" directory if
# it is missing — confirm against its implementation.
storage = CompressionStorage("data/compression.db")
memory_manager = MemoryManager(storage=storage, default_method="zlib")

app = FastAPI(
    title="Compression Memory API",
    version="1.0.0",
    description=(
        "API for storing, restoring, listing, deleting, and retrieving "
        "compressed text memories."
    ),
)
27
+
28
+
29
# -----------------------------
# Request / Response Models
# -----------------------------

# Closed sets of accepted values; FastAPI/pydantic validate request fields
# against these Literal aliases.
CompressionMethod = Literal["none", "zlib", "lzma", "dictionary"]
RetrievalMode = Literal["lexical", "vector", "hybrid"]
35
+
36
+
37
class StoreTextRequest(BaseModel):
    """Payload for POST /store: the text to compress plus optional knobs."""

    text: str = Field(..., description="Original text to compress and store.")
    method: CompressionMethod | None = Field(
        default=None,
        description="Compression method to use. Falls back to default if omitted.",
    )
    metadata: dict[str, Any] | None = Field(
        default=None,
        description="Optional metadata stored with the record.",
    )
    record_id: str | None = Field(
        default=None,
        description="Optional custom record ID.",
    )
    compressor_kwargs: dict[str, Any] | None = Field(
        default=None,
        description="Optional compressor configuration.",
    )
55
+
56
+
57
class StoreTextResponse(BaseModel):
    """Summary of a newly stored record, returned by POST /store."""

    record_id: str            # identifier of the stored record
    created_at: str           # creation timestamp (string-encoded)
    method: str               # compression method actually used
    original_sha256: str      # integrity hash of the original UTF-8 bytes
    original_length: int      # character count of the original text
    token_count: int          # token count of the original text
    compressed_bytes: int     # size of the compressed payload in bytes
    compression_ratio: float  # compressed / original size ratio
    metadata_json: str        # caller metadata serialized as a JSON string
67
+
68
+
69
class RetrieveRequest(BaseModel):
    """Payload for POST /retrieve: query text plus search tuning knobs."""

    query: str = Field(..., description="Search query text.")
    mode: RetrievalMode = Field(
        default="hybrid",
        description="Retrieval mode: lexical, vector, or hybrid.",
    )
    limit: int = Field(default=5, ge=1, description="Maximum number of results.")
    search_limit: int = Field(
        default=100,
        ge=1,
        description="Maximum number of stored memories to inspect.",
    )
    metadata_filter: dict[str, Any] | None = Field(
        default=None,
        description="Optional exact-match metadata filter.",
    )
    alpha: float = Field(
        default=0.5,
        ge=0.0,
        le=1.0,
        description="Hybrid weighting. Used only in hybrid mode.",
    )
91
+
92
+
93
class RetrievalItem(BaseModel):
    """One scored search hit returned by POST /retrieve."""

    record_id: str
    score: float              # relevance score from the retriever (scale depends on mode — TODO confirm)
    method: str               # compression method the record was stored with
    created_at: str
    metadata: dict[str, Any]
    text: str                 # decompressed original text
100
+
101
+
102
class DeleteResponse(BaseModel):
    """Acknowledgement body for DELETE /memories/{record_id}."""

    deleted: bool   # always True on success (a missing record raises 404 instead)
    record_id: str
105
+
106
+
107
class HealthResponse(BaseModel):
    """Body of GET /health."""

    status: str          # "ok" when the service is up
    total_memories: int  # current number of stored records
110
+
111
+
112
+ # -----------------------------
113
+ # Routes
114
+ # -----------------------------
115
+
116
+
117
@app.get("/health", response_model=HealthResponse)
def health() -> HealthResponse:
    """Liveness probe: reports service status and the stored-memory count."""
    total = memory_manager.count_memories()
    return HealthResponse(status="ok", total_memories=total)
123
+
124
+
125
@app.post("/store", response_model=StoreTextResponse)
def store_text(payload: StoreTextRequest) -> StoreTextResponse:
    """Compress *payload.text* and persist it; any failure maps to HTTP 400."""
    try:
        record = memory_manager.save_text(
            text=payload.text,
            method=payload.method,
            metadata=payload.metadata,
            record_id=payload.record_id,
            compressor_kwargs=payload.compressor_kwargs,
        )
    except Exception as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    # Mirror the record's summary fields straight into the response model.
    summary_fields = (
        "record_id",
        "created_at",
        "method",
        "original_sha256",
        "original_length",
        "token_count",
        "compressed_bytes",
        "compression_ratio",
        "metadata_json",
    )
    return StoreTextResponse(
        **{name: getattr(record, name) for name in summary_fields}
    )
149
+
150
+
151
@app.get("/memories")
def list_memories(
    limit: int = Query(default=100, ge=1, description="Maximum number of records."),
) -> list[dict[str, Any]]:
    """Return up to *limit* stored records as plain dictionaries (400 on failure)."""
    try:
        rows = memory_manager.list_memories(limit=limit)
    except Exception as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    return [row.to_dict() for row in rows]
161
+
162
+
163
@app.get("/memories/{record_id}")
def get_memory(record_id: str) -> dict[str, Any]:
    """Export the full stored bundle for *record_id*; 404 when unknown."""
    bundle = memory_manager.export_record_bundle(record_id)
    if bundle is not None:
        return bundle
    raise HTTPException(status_code=404, detail="Record not found")
169
+
170
+
171
@app.get("/memories/{record_id}/text")
def get_memory_text(record_id: str) -> dict[str, Any]:
    """Decompress and return the original text for *record_id*; 404 when unknown."""
    restored = memory_manager.get_text(record_id)
    if restored is None:
        raise HTTPException(status_code=404, detail="Record not found")
    return {"record_id": record_id, "text": restored}
177
+
178
+
179
@app.delete("/memories/{record_id}", response_model=DeleteResponse)
def delete_memory(record_id: str) -> DeleteResponse:
    """Remove a stored memory; 404 when no record matches *record_id*."""
    if not memory_manager.delete_memory(record_id):
        raise HTTPException(status_code=404, detail="Record not found")
    return DeleteResponse(deleted=True, record_id=record_id)
189
+
190
+
191
@app.post("/retrieve", response_model=list[RetrievalItem])
def retrieve_texts(payload: RetrieveRequest) -> list[RetrievalItem]:
    """Run a lexical/vector/hybrid search and return scored matches (400 on failure)."""
    try:
        finder = MemoryRetriever(
            memory_manager=memory_manager,
            mode=payload.mode,
            alpha=payload.alpha,
        )
        matches = finder.retrieve(
            query=payload.query,
            limit=payload.limit,
            search_limit=payload.search_limit,
            metadata_filter=payload.metadata_filter,
        )
    except Exception as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    # Copy the result attributes field-by-field into the response model.
    item_fields = ("record_id", "score", "method", "created_at", "metadata", "text")
    return [
        RetrievalItem(**{name: getattr(match, name) for name in item_fields})
        for match in matches
    ]
@@ -0,0 +1,31 @@
1
"""Compression subpackage: tokenizer, phrase dictionary, compressor, decompressor."""

# Import via the installed package name; the former "src.ccllm" prefix only
# worked from a repo-root checkout and breaks once the src-layout package is
# installed as "ccllm".
from ccllm.compression.compressor import (
    CompressionMethod,
    CompressionPackage,
    CompressionStats,
    TextCompressor,
)
from ccllm.compression.decompressor import TextDecompressor
from ccllm.compression.dictionary import (
    DictionaryEntry,
    PhraseDictionaryBuilder,
)
from ccllm.compression.tokenizer import (
    TextTokenizer,
    TokenizationResult,
)

__all__ = [
    # tokenizer
    "TextTokenizer",
    "TokenizationResult",
    # dictionary
    "PhraseDictionaryBuilder",
    "DictionaryEntry",
    # compressor
    "TextCompressor",
    "CompressionPackage",
    "CompressionStats",
    "CompressionMethod",
    # decompressor
    "TextDecompressor",
]
@@ -0,0 +1,300 @@
1
from __future__ import annotations

import base64
import hashlib
import json
import lzma
import zlib
from dataclasses import asdict, dataclass
from typing import Any, Literal

# Use the installed package name ("ccllm"); the previous "src." prefix only
# resolves when running from the repository root and breaks for installed users.
from ccllm.compression.dictionary import DictionaryEntry, PhraseDictionaryBuilder
from ccllm.compression.tokenizer import TextTokenizer

# Compression backends understood by TextCompressor.
CompressionMethod = Literal["none", "zlib", "lzma", "dictionary"]
15
+
16
+
17
@dataclass(frozen=True)
class CompressionStats:
    """Size accounting for one compression run (byte and token counts)."""

    original_bytes: int        # UTF-8 byte length of the input text
    compressed_bytes: int      # byte length of the serialized payload
    compression_ratio: float   # compressed / original (1.0 for empty input)
    space_saving_ratio: float  # 1.0 - compression_ratio (0.0 for empty input)
    original_token_count: int  # tokens produced by TextTokenizer
    compressed_token_count: int | None = None  # set only by the "dictionary" method
25
+
26
+
27
@dataclass(frozen=True)
class CompressionPackage:
    """Self-contained, JSON-serializable result of one compression run.

    Carries everything needed for round-tripping: the method, the base64
    payload, the SHA-256 of the original UTF-8 bytes for verification, and
    (for the "dictionary" method) the phrase-ID -> token-list mapping.
    """

    version: str                # package format version string (e.g. "1.1.0")
    method: CompressionMethod   # backend that produced the payload
    original_length: int        # character count of the original text
    original_sha256: str        # hex digest of the original UTF-8 bytes
    token_count: int            # tokens in the original text
    compressed_payload_b64: str # base64-encoded compressed payload
    stats: CompressionStats     # size accounting for this run
    metadata: dict[str, Any]    # caller-supplied metadata
    dictionary: dict[str, list[str]] | None = None  # phrase map ("dictionary" method only)

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict form; nested dataclasses are converted too."""
        return asdict(self)

    def to_json(self) -> str:
        """Serialize to pretty-printed JSON with non-ASCII kept as-is."""
        return json.dumps(self.to_dict(), ensure_ascii=False, indent=2)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> CompressionPackage:
        """Rebuild a package (and its nested stats) from `to_dict` output."""
        stats = CompressionStats(**data["stats"])
        return cls(
            version=data["version"],
            method=data["method"],
            original_length=data["original_length"],
            original_sha256=data["original_sha256"],
            token_count=data["token_count"],
            compressed_payload_b64=data["compressed_payload_b64"],
            stats=stats,
            # Tolerate packages serialized without optional fields.
            metadata=data.get("metadata", {}),
            dictionary=data.get("dictionary"),
        )

    @classmethod
    def from_json(cls, json_str: str) -> CompressionPackage:
        """Inverse of `to_json`."""
        return cls.from_dict(json.loads(json_str))
63
+
64
+
65
class TextCompressor:
    """
    Baseline + custom dictionary text compressor.

    Supported methods:
    - none
    - zlib
    - lzma
    - dictionary

    The dictionary method is token-aware:
    - tokenize text losslessly
    - build repeated phrase dictionary
    - replace repeated phrases with phrase IDs
    - serialize encoded token stream as UTF-8 JSON
    """

    # Closed set of accepted values for the `method` argument; validated in __init__.
    SUPPORTED_METHODS: tuple[CompressionMethod, ...] = (
        "none",
        "zlib",
        "lzma",
        "dictionary",
    )

    def __init__(
        self,
        method: CompressionMethod = "zlib",
        zlib_level: int = 9,
        lzma_preset: int = 6,
        min_phrase_len: int = 2,
        max_phrase_len: int = 8,
        min_frequency: int = 2,
        max_dictionary_size: int = 256,
        min_estimated_savings: int = 1,
        skip_all_whitespace_phrases: bool = True,
    ) -> None:
        """Configure the compressor.

        The phrase-related parameters are forwarded unchanged to
        ``PhraseDictionaryBuilder`` and only affect the "dictionary" method.

        Raises:
            ValueError: if ``method`` is not in ``SUPPORTED_METHODS``, or if
                ``zlib_level`` / ``lzma_preset`` falls outside 0-9.
        """
        if method not in self.SUPPORTED_METHODS:
            raise ValueError(
                f"Unsupported compression method: {method}. "
                f"Supported: {self.SUPPORTED_METHODS}"
            )

        if not 0 <= zlib_level <= 9:
            raise ValueError("zlib_level must be between 0 and 9")

        if not 0 <= lzma_preset <= 9:
            raise ValueError("lzma_preset must be between 0 and 9")

        self.method = method
        self.zlib_level = zlib_level
        self.lzma_preset = lzma_preset
        self.tokenizer = TextTokenizer()
        self.dictionary_builder = PhraseDictionaryBuilder(
            min_phrase_len=min_phrase_len,
            max_phrase_len=max_phrase_len,
            min_frequency=min_frequency,
            max_dictionary_size=max_dictionary_size,
            min_estimated_savings=min_estimated_savings,
            skip_all_whitespace_phrases=skip_all_whitespace_phrases,
        )

    def compress(
        self,
        text: str,
        metadata: dict[str, Any] | None = None,
    ) -> CompressionPackage:
        """Compress ``text`` with the configured method.

        Returns a self-contained :class:`CompressionPackage` (base64 payload,
        stats, SHA-256 of the original bytes, optional phrase dictionary).

        Raises:
            TypeError: if ``text`` is not a ``str``.
        """
        if not isinstance(text, str):
            raise TypeError("text must be a string")

        metadata = metadata or {}
        # Tokenize for every method (not just "dictionary"): token_count is
        # recorded in the package and stats regardless of backend.
        tokenization = self.tokenizer.tokenize(text)
        original_bytes = text.encode("utf-8")

        if self.method == "dictionary":
            return self._compress_with_dictionary(
                text=text,
                original_bytes=original_bytes,
                token_count=tokenization.token_count,
                tokens=tokenization.tokens,
                metadata=metadata,
            )

        compressed_bytes = self._compress_bytes(original_bytes)
        payload_b64 = base64.b64encode(compressed_bytes).decode("ascii")

        stats = self._build_stats(
            original_bytes_len=len(original_bytes),
            compressed_bytes_len=len(compressed_bytes),
            original_token_count=tokenization.token_count,
            compressed_token_count=None,
        )

        return CompressionPackage(
            version="1.1.0",
            method=self.method,
            original_length=len(text),
            original_sha256=self._sha256_hex(original_bytes),
            token_count=tokenization.token_count,
            compressed_payload_b64=payload_b64,
            stats=stats,
            metadata=metadata,
            dictionary=None,
        )

    def compress_to_json(
        self,
        text: str,
        metadata: dict[str, Any] | None = None,
    ) -> str:
        """Convenience wrapper: :meth:`compress` then serialize to JSON."""
        return self.compress(text=text, metadata=metadata).to_json()

    def _compress_with_dictionary(
        self,
        text: str,
        original_bytes: bytes,
        token_count: int,
        tokens: list[str],
        metadata: dict[str, Any],
    ) -> CompressionPackage:
        """Dictionary-method path: phrase-encode tokens, serialize as JSON.

        The payload is the encoded token stream as compact UTF-8 JSON; the
        phrase-ID -> token-list map is stored on the package so the
        decompressor can invert the substitution.
        """
        entries = self.dictionary_builder.build(tokens)
        dictionary_map = self.dictionary_builder.build_reverse_lookup(entries)
        encoded_tokens = self._encode_with_dictionary(tokens, entries)

        # Compact separators (no whitespace) keep the serialized stream small.
        serialized_stream = json.dumps(
            encoded_tokens,
            ensure_ascii=False,
            separators=(",", ":"),
        ).encode("utf-8")
        payload_b64 = base64.b64encode(serialized_stream).decode("ascii")

        stats = self._build_stats(
            original_bytes_len=len(original_bytes),
            compressed_bytes_len=len(serialized_stream),
            original_token_count=token_count,
            compressed_token_count=len(encoded_tokens),
        )

        return CompressionPackage(
            version="1.1.0",
            method="dictionary",
            original_length=len(text),
            original_sha256=self._sha256_hex(original_bytes),
            token_count=token_count,
            compressed_payload_b64=payload_b64,
            stats=stats,
            metadata=metadata,
            dictionary=dictionary_map,
        )

    def _encode_with_dictionary(
        self,
        tokens: list[str],
        entries: list[DictionaryEntry],
    ) -> list[str]:
        """Replace known phrases with their IDs, greedy longest-match first.

        At each position the longest dictionary phrase starting there wins;
        tokens not starting any known phrase pass through unchanged.
        """
        if not entries:
            return list(tokens)

        phrase_to_id = self.dictionary_builder.build_lookup(entries)
        # Distinct phrase lengths, longest first, so the greedy scan prefers
        # the longest available match at each position.
        available_lengths = sorted(
            {len(entry.phrase) for entry in entries},
            reverse=True,
        )

        encoded_tokens: list[str] = []
        token_index = 0
        total_tokens = len(tokens)

        while token_index < total_tokens:
            matched_phrase_id: str | None = None
            matched_phrase_len = 0

            for phrase_len in available_lengths:
                if token_index + phrase_len > total_tokens:
                    continue

                candidate = tuple(tokens[token_index : token_index + phrase_len])
                phrase_id = phrase_to_id.get(candidate)

                if phrase_id is not None:
                    matched_phrase_id = phrase_id
                    matched_phrase_len = phrase_len
                    break

            if matched_phrase_id is not None:
                encoded_tokens.append(matched_phrase_id)
                token_index += matched_phrase_len
            else:
                encoded_tokens.append(tokens[token_index])
                token_index += 1

        return encoded_tokens

    def _compress_bytes(self, data: bytes) -> bytes:
        """Apply the configured byte-level codec ("none" / "zlib" / "lzma")."""
        if self.method == "none":
            return data

        if self.method == "zlib":
            return zlib.compress(data, level=self.zlib_level)

        if self.method == "lzma":
            return lzma.compress(data, preset=self.lzma_preset)

        # Unreachable when constructed through __init__ (method is validated),
        # kept as a guard against direct attribute mutation.
        raise ValueError(f"Unhandled compression method: {self.method}")

    @staticmethod
    def _sha256_hex(data: bytes) -> str:
        """Hex SHA-256 digest of ``data``."""
        return hashlib.sha256(data).hexdigest()

    @staticmethod
    def _build_stats(
        original_bytes_len: int,
        compressed_bytes_len: int,
        original_token_count: int,
        compressed_token_count: int | None,
    ) -> CompressionStats:
        """Compute size statistics; empty input yields ratio 1.0, saving 0.0.

        Raises:
            ValueError: if either byte length is negative.
        """
        if original_bytes_len < 0 or compressed_bytes_len < 0:
            raise ValueError("byte lengths must be non-negative")

        if original_bytes_len == 0:
            compression_ratio = 1.0
            space_saving_ratio = 0.0
        else:
            compression_ratio = compressed_bytes_len / original_bytes_len
            space_saving_ratio = 1.0 - compression_ratio

        return CompressionStats(
            original_bytes=original_bytes_len,
            compressed_bytes=compressed_bytes_len,
            compression_ratio=compression_ratio,
            space_saving_ratio=space_saving_ratio,
            original_token_count=original_token_count,
            compressed_token_count=compressed_token_count,
        )

    def available_methods(self) -> list[str]:
        """Return the supported method names as a fresh list."""
        return list(self.SUPPORTED_METHODS)