ccllm 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ccllm-0.1.0/LICENSE +21 -0
- ccllm-0.1.0/PKG-INFO +32 -0
- ccllm-0.1.0/pyproject.toml +51 -0
- ccllm-0.1.0/setup.cfg +4 -0
- ccllm-0.1.0/src/ccllm/__init__.py +0 -0
- ccllm-0.1.0/src/ccllm/api/__init__.py +3 -0
- ccllm-0.1.0/src/ccllm/api/app.py +218 -0
- ccllm-0.1.0/src/ccllm/compression/__init__.py +31 -0
- ccllm-0.1.0/src/ccllm/compression/compressor.py +300 -0
- ccllm-0.1.0/src/ccllm/compression/decompressor.py +184 -0
- ccllm-0.1.0/src/ccllm/compression/dictionary.py +196 -0
- ccllm-0.1.0/src/ccllm/compression/tokenizer.py +71 -0
- ccllm-0.1.0/src/ccllm/memory/__init__.py +3 -0
- ccllm-0.1.0/src/ccllm/memory/memory_manager.py +179 -0
- ccllm-0.1.0/src/ccllm/retrieval/__init__.py +17 -0
- ccllm-0.1.0/src/ccllm/retrieval/embedder.py +150 -0
- ccllm-0.1.0/src/ccllm/retrieval/retriever.py +230 -0
- ccllm-0.1.0/src/ccllm/retrieval/vector_store.py +228 -0
- ccllm-0.1.0/src/ccllm/storage/__init__.py +8 -0
- ccllm-0.1.0/src/ccllm/storage/db.py +258 -0
- ccllm-0.1.0/src/ccllm/storage/models.py +58 -0
- ccllm-0.1.0/src/ccllm.egg-info/PKG-INFO +32 -0
- ccllm-0.1.0/src/ccllm.egg-info/SOURCES.txt +36 -0
- ccllm-0.1.0/src/ccllm.egg-info/dependency_links.txt +1 -0
- ccllm-0.1.0/src/ccllm.egg-info/requires.txt +12 -0
- ccllm-0.1.0/src/ccllm.egg-info/top_level.txt +1 -0
- ccllm-0.1.0/tests/test_compressor.py +312 -0
- ccllm-0.1.0/tests/test_decompressor.py +558 -0
- ccllm-0.1.0/tests/test_dictionary.py +113 -0
- ccllm-0.1.0/tests/test_embedder.py +200 -0
- ccllm-0.1.0/tests/test_integration_compression.py +189 -0
- ccllm-0.1.0/tests/test_integration_retrieval.py +183 -0
- ccllm-0.1.0/tests/test_integration_storage.py +155 -0
- ccllm-0.1.0/tests/test_memory_manager.py +304 -0
- ccllm-0.1.0/tests/test_retriever.py +244 -0
- ccllm-0.1.0/tests/test_storage.py +268 -0
- ccllm-0.1.0/tests/test_tokenizer.py +104 -0
- ccllm-0.1.0/tests/test_vector_store.py +392 -0
ccllm-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Wasi_Ahmad
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
ccllm-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ccllm
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Lossless text compression, storage, memory management, and retrieval toolkit
|
|
5
|
+
Author-email: Wasi Ahmad <wasiahamd0569@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/<your-user-or-org>/<repo>
|
|
8
|
+
Project-URL: Repository, https://github.com/<your-user-or-org>/<repo>
|
|
9
|
+
Project-URL: Issues, https://github.com/<your-user-or-org>/<repo>/issues
|
|
10
|
+
Keywords: compression,retrieval,memory,storage,nlp
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: fastapi<1.0,>=0.110
|
|
22
|
+
Requires-Dist: pydantic<3.0,>=2.7
|
|
23
|
+
Requires-Dist: numpy<2.0,>=1.26
|
|
24
|
+
Requires-Dist: scikit-learn<2.0,>=1.4
|
|
25
|
+
Requires-Dist: uvicorn<1.0,>=0.29
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest<9,>=8; extra == "dev"
|
|
28
|
+
Requires-Dist: pytest-cov<6,>=5; extra == "dev"
|
|
29
|
+
Requires-Dist: ruff<0.6,>=0.4; extra == "dev"
|
|
30
|
+
Requires-Dist: build>=1.2; extra == "dev"
|
|
31
|
+
Requires-Dist: twine<6,>=5; extra == "dev"
|
|
32
|
+
Dynamic: license-file
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
[build-system]
# NOTE: "build" is a build *frontend* (the tool that invokes this backend) and
# must not be listed as a backend requirement; "wheel" is redundant with modern
# setuptools but harmless, so it is kept for older environments.
requires = ["setuptools>=69", "wheel"]
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "ccllm"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Lossless text compression, storage, memory management, and retrieval toolkit"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Wasi Ahmad", email = "wasiahamd0569@gmail.com" }
|
|
14
|
+
]
|
|
15
|
+
keywords = ["compression", "retrieval", "memory", "storage", "nlp"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 3 - Alpha",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"License :: OSI Approved :: MIT License",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.10",
|
|
22
|
+
"Programming Language :: Python :: 3.11",
|
|
23
|
+
"Programming Language :: Python :: 3.12"
|
|
24
|
+
]
|
|
25
|
+
dependencies = [
|
|
26
|
+
"fastapi>=0.110,<1.0",
|
|
27
|
+
"pydantic>=2.7,<3.0",
|
|
28
|
+
"numpy>=1.26,<2.0",
|
|
29
|
+
"scikit-learn>=1.4,<2.0",
|
|
30
|
+
"uvicorn>=0.29,<1.0"
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.optional-dependencies]
|
|
34
|
+
dev = [
|
|
35
|
+
"pytest>=8,<9",
|
|
36
|
+
"pytest-cov>=5,<6",
|
|
37
|
+
"ruff>=0.4,<0.6",
|
|
38
|
+
"build>=1.2",
|
|
39
|
+
"twine>=5,<6"
|
|
40
|
+
]
|
|
41
|
+
|
|
42
|
+
[tool.setuptools]
|
|
43
|
+
package-dir = {"" = "src"}
|
|
44
|
+
|
|
45
|
+
[tool.setuptools.packages.find]
|
|
46
|
+
where = ["src"]
|
|
47
|
+
|
|
48
|
+
[project.urls]
|
|
49
|
+
Homepage = "https://github.com/<your-user-or-org>/<repo>"
|
|
50
|
+
Repository = "https://github.com/<your-user-or-org>/<repo>"
|
|
51
|
+
Issues = "https://github.com/<your-user-or-org>/<repo>/issues"
|
ccllm-0.1.0/setup.cfg
ADDED
|
File without changes
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
from __future__ import annotations

from typing import Any, Literal

from fastapi import FastAPI, HTTPException, Query
from pydantic import BaseModel, Field

# FIX: pyproject maps package-dir "" -> "src", so the distribution installs as
# the top-level package "ccllm". The former "src.ccllm.*" imports only resolve
# when running from a repository checkout and break for installed users.
from ccllm.memory import MemoryManager
from ccllm.retrieval import MemoryRetriever
from ccllm.storage import CompressionStorage

# -----------------------------
# App setup
# -----------------------------

# Module-level singletons shared by every route handler.
# NOTE(review): the SQLite path is hard-coded and relative to the process CWD;
# consider making it configurable (e.g. via an environment variable).
storage = CompressionStorage("data/compression.db")
memory_manager = MemoryManager(storage=storage, default_method="zlib")

app = FastAPI(
    title="Compression Memory API",
    version="1.0.0",
    description=(
        "API for storing, restoring, listing, deleting, and retrieving "
        "compressed text memories."
    ),
)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# -----------------------------
# Request / Response Models
# -----------------------------

# Mirrors the CompressionMethod Literal declared in compression/compressor.py;
# keep the two in sync when adding a backend.
CompressionMethod = Literal["none", "zlib", "lzma", "dictionary"]
# Retrieval strategies accepted by MemoryRetriever's "mode" argument.
RetrievalMode = Literal["lexical", "vector", "hybrid"]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class StoreTextRequest(BaseModel):
    """Body of POST /store: the text to persist plus optional tuning knobs."""

    text: str = Field(..., description="Original text to compress and store.")
    # None means "use MemoryManager's default_method" (zlib as configured above).
    method: CompressionMethod | None = Field(
        default=None,
        description="Compression method to use. Falls back to default if omitted.",
    )
    metadata: dict[str, Any] | None = Field(
        default=None,
        description="Optional metadata stored with the record.",
    )
    # When omitted the manager presumably generates an ID — confirm in MemoryManager.
    record_id: str | None = Field(
        default=None,
        description="Optional custom record ID.",
    )
    # Forwarded verbatim to MemoryManager.save_text(compressor_kwargs=...).
    compressor_kwargs: dict[str, Any] | None = Field(
        default=None,
        description="Optional compressor configuration.",
    )
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class StoreTextResponse(BaseModel):
    """Summary of a freshly stored record, echoed back by POST /store.

    Field values are copied one-to-one from the record object returned by
    MemoryManager.save_text().
    """

    record_id: str
    created_at: str
    method: str
    # SHA-256 of the original text (hex digest, as produced by the compressor).
    original_sha256: str
    original_length: int
    token_count: int
    compressed_bytes: int
    compression_ratio: float
    # Metadata serialized as a JSON string by the storage layer.
    metadata_json: str
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class RetrieveRequest(BaseModel):
    """Body of POST /retrieve: a search query plus retrieval configuration."""

    query: str = Field(..., description="Search query text.")
    mode: RetrievalMode = Field(
        default="hybrid",
        description="Retrieval mode: lexical, vector, or hybrid.",
    )
    limit: int = Field(default=5, ge=1, description="Maximum number of results.")
    # Upper bound on candidates scanned before scoring/ranking.
    search_limit: int = Field(
        default=100,
        ge=1,
        description="Maximum number of stored memories to inspect.",
    )
    metadata_filter: dict[str, Any] | None = Field(
        default=None,
        description="Optional exact-match metadata filter.",
    )
    # Lexical-vs-vector weighting; ignored outside hybrid mode.
    alpha: float = Field(
        default=0.5,
        ge=0.0,
        le=1.0,
        description="Hybrid weighting. Used only in hybrid mode.",
    )
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class RetrievalItem(BaseModel):
    """One scored search hit returned by POST /retrieve."""

    record_id: str
    # Relevance score produced by the retriever (higher = better match).
    score: float
    method: str
    created_at: str
    metadata: dict[str, Any]
    # Decompressed original text of the matching record.
    text: str
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class DeleteResponse(BaseModel):
    """Acknowledgement returned by DELETE /memories/{record_id}."""

    deleted: bool
    record_id: str
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
class HealthResponse(BaseModel):
    """Payload of GET /health: service status plus stored-memory count."""

    status: str
    total_memories: int
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
# -----------------------------
|
|
113
|
+
# Routes
|
|
114
|
+
# -----------------------------
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
@app.get("/health", response_model=HealthResponse)
def health() -> HealthResponse:
    """Liveness probe that also reports how many memories are stored."""
    total = memory_manager.count_memories()
    return HealthResponse(status="ok", total_memories=total)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
@app.post("/store", response_model=StoreTextResponse)
def store_text(payload: StoreTextRequest) -> StoreTextResponse:
    """Compress *payload.text* and persist it as a new memory record."""
    try:
        record = memory_manager.save_text(
            text=payload.text,
            method=payload.method,
            metadata=payload.metadata,
            record_id=payload.record_id,
            compressor_kwargs=payload.compressor_kwargs,
        )
    except Exception as exc:  # surfaced to the client as a 400
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    # The response fields mirror attributes of the stored record one-to-one.
    copied = (
        "record_id",
        "created_at",
        "method",
        "original_sha256",
        "original_length",
        "token_count",
        "compressed_bytes",
        "compression_ratio",
        "metadata_json",
    )
    return StoreTextResponse(**{name: getattr(record, name) for name in copied})
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
@app.get("/memories")
def list_memories(
    limit: int = Query(default=100, ge=1, description="Maximum number of records."),
) -> list[dict[str, Any]]:
    """Return up to *limit* stored memory records as plain dictionaries."""
    try:
        rows = memory_manager.list_memories(limit=limit)
    except Exception as exc:  # surfaced to the client as a 400
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    return [row.to_dict() for row in rows]
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
@app.get("/memories/{record_id}")
def get_memory(record_id: str) -> dict[str, Any]:
    """Return the full exported bundle for one record, or 404 if absent."""
    bundle = memory_manager.export_record_bundle(record_id)
    if bundle is not None:
        return bundle
    raise HTTPException(status_code=404, detail="Record not found")
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
@app.get("/memories/{record_id}/text")
def get_memory_text(record_id: str) -> dict[str, Any]:
    """Decompress and return the original text of one record, or 404."""
    restored = memory_manager.get_text(record_id)
    if restored is None:
        raise HTTPException(status_code=404, detail="Record not found")
    return {"record_id": record_id, "text": restored}
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
@app.delete("/memories/{record_id}", response_model=DeleteResponse)
def delete_memory(record_id: str) -> DeleteResponse:
    """Delete one record; responds 404 when the ID is unknown."""
    if not memory_manager.delete_memory(record_id):
        raise HTTPException(status_code=404, detail="Record not found")
    return DeleteResponse(deleted=True, record_id=record_id)
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
@app.post("/retrieve", response_model=list[RetrievalItem])
def retrieve_texts(payload: RetrieveRequest) -> list[RetrievalItem]:
    """Run a lexical/vector/hybrid search and return scored matches."""
    try:
        # A fresh retriever is built per request so that mode and alpha can
        # vary call-to-call without mutating shared state.
        searcher = MemoryRetriever(
            memory_manager=memory_manager,
            mode=payload.mode,
            alpha=payload.alpha,
        )
        hits = searcher.retrieve(
            query=payload.query,
            limit=payload.limit,
            search_limit=payload.search_limit,
            metadata_filter=payload.metadata_filter,
        )
    except Exception as exc:  # surfaced to the client as a 400
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    items: list[RetrievalItem] = []
    for hit in hits:
        items.append(
            RetrievalItem(
                record_id=hit.record_id,
                score=hit.score,
                method=hit.method,
                created_at=hit.created_at,
                metadata=hit.metadata,
                text=hit.text,
            )
        )
    return items
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Public compression API: tokenizer, phrase dictionary, compressor, decompressor."""

# FIX: the distribution installs this package as "ccllm" (pyproject maps
# package-dir "" -> "src"), so the former "src.ccllm.*" absolute imports broke
# installed users. Relative imports resolve both from a checkout and installed.
from .compressor import (
    CompressionMethod,
    CompressionPackage,
    CompressionStats,
    TextCompressor,
)
from .decompressor import TextDecompressor
from .dictionary import (
    DictionaryEntry,
    PhraseDictionaryBuilder,
)
from .tokenizer import (
    TextTokenizer,
    TokenizationResult,
)

__all__ = [
    # tokenizer
    "TextTokenizer",
    "TokenizationResult",
    # dictionary
    "PhraseDictionaryBuilder",
    "DictionaryEntry",
    # compressor
    "TextCompressor",
    "CompressionPackage",
    "CompressionStats",
    "CompressionMethod",
    # decompressor
    "TextDecompressor",
]
|
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
from __future__ import annotations

import base64
import hashlib
import json
import lzma
import zlib
from dataclasses import asdict, dataclass
from typing import Any, Literal

# FIX: the package installs as "ccllm" (pyproject maps package-dir "" -> "src"),
# so "src.ccllm.*" only resolves from a repo checkout. Use the real package name.
from ccllm.compression.dictionary import DictionaryEntry, PhraseDictionaryBuilder
from ccllm.compression.tokenizer import TextTokenizer

# Compression backends understood by TextCompressor; mirrored by the API layer.
CompressionMethod = Literal["none", "zlib", "lzma", "dictionary"]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass(frozen=True)
class CompressionStats:
    """Size and token accounting for one compression run.

    compression_ratio = compressed_bytes / original_bytes (1.0 for empty
    input); space_saving_ratio = 1 - compression_ratio (0.0 for empty input).
    See TextCompressor._build_stats for the exact computation.
    """

    original_bytes: int
    compressed_bytes: int
    compression_ratio: float
    space_saving_ratio: float
    original_token_count: int
    # Only populated by the "dictionary" method: length of the encoded stream.
    compressed_token_count: int | None = None
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass(frozen=True)
class CompressionPackage:
    """Self-describing, JSON-serializable envelope for one compressed text."""

    version: str
    method: CompressionMethod
    original_length: int
    original_sha256: str
    token_count: int
    compressed_payload_b64: str
    stats: CompressionStats
    metadata: dict[str, Any]
    dictionary: dict[str, list[str]] | None = None

    def to_dict(self) -> dict[str, Any]:
        """Convert the package (including nested stats) to plain dicts."""
        return asdict(self)

    def to_json(self) -> str:
        """Serialize the package as human-readable JSON."""
        return json.dumps(self.to_dict(), ensure_ascii=False, indent=2)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> CompressionPackage:
        """Rebuild a package from `to_dict` output; optional keys may be absent."""
        return cls(
            version=data["version"],
            method=data["method"],
            original_length=data["original_length"],
            original_sha256=data["original_sha256"],
            token_count=data["token_count"],
            compressed_payload_b64=data["compressed_payload_b64"],
            stats=CompressionStats(**data["stats"]),
            metadata=data.get("metadata", {}),
            dictionary=data.get("dictionary"),
        )

    @classmethod
    def from_json(cls, json_str: str) -> CompressionPackage:
        """Inverse of `to_json`."""
        return cls.from_dict(json.loads(json_str))
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class TextCompressor:
    """
    Baseline + custom dictionary text compressor.

    Supported methods:
    - none
    - zlib
    - lzma
    - dictionary

    The dictionary method is token-aware:
    - tokenize text losslessly
    - build repeated phrase dictionary
    - replace repeated phrases with phrase IDs
    - serialize encoded token stream as UTF-8 JSON
    """

    # Closed set of method names; __init__ validates against this tuple.
    SUPPORTED_METHODS: tuple[CompressionMethod, ...] = (
        "none",
        "zlib",
        "lzma",
        "dictionary",
    )

    def __init__(
        self,
        method: CompressionMethod = "zlib",
        zlib_level: int = 9,
        lzma_preset: int = 6,
        min_phrase_len: int = 2,
        max_phrase_len: int = 8,
        min_frequency: int = 2,
        max_dictionary_size: int = 256,
        min_estimated_savings: int = 1,
        skip_all_whitespace_phrases: bool = True,
    ) -> None:
        """Configure the compressor; raises ValueError on bad configuration.

        The phrase-related parameters are forwarded to PhraseDictionaryBuilder
        and only matter when method == "dictionary".
        """
        # Fail fast on configuration errors rather than at compress() time.
        if method not in self.SUPPORTED_METHODS:
            raise ValueError(
                f"Unsupported compression method: {method}. "
                f"Supported: {self.SUPPORTED_METHODS}"
            )

        if not 0 <= zlib_level <= 9:
            raise ValueError("zlib_level must be between 0 and 9")

        if not 0 <= lzma_preset <= 9:
            raise ValueError("lzma_preset must be between 0 and 9")

        self.method = method
        self.zlib_level = zlib_level
        self.lzma_preset = lzma_preset
        self.tokenizer = TextTokenizer()
        self.dictionary_builder = PhraseDictionaryBuilder(
            min_phrase_len=min_phrase_len,
            max_phrase_len=max_phrase_len,
            min_frequency=min_frequency,
            max_dictionary_size=max_dictionary_size,
            min_estimated_savings=min_estimated_savings,
            skip_all_whitespace_phrases=skip_all_whitespace_phrases,
        )

    def compress(
        self,
        text: str,
        metadata: dict[str, Any] | None = None,
    ) -> CompressionPackage:
        """Compress *text* with the configured method into a CompressionPackage.

        Raises TypeError when *text* is not a str.
        """
        if not isinstance(text, str):
            raise TypeError("text must be a string")

        metadata = metadata or {}
        # Tokenization happens for every method so token_count is always filled.
        tokenization = self.tokenizer.tokenize(text)
        original_bytes = text.encode("utf-8")

        # The dictionary method has its own payload format (JSON token stream).
        if self.method == "dictionary":
            return self._compress_with_dictionary(
                text=text,
                original_bytes=original_bytes,
                token_count=tokenization.token_count,
                tokens=tokenization.tokens,
                metadata=metadata,
            )

        compressed_bytes = self._compress_bytes(original_bytes)
        # base64 keeps the payload JSON-safe regardless of method.
        payload_b64 = base64.b64encode(compressed_bytes).decode("ascii")

        stats = self._build_stats(
            original_bytes_len=len(original_bytes),
            compressed_bytes_len=len(compressed_bytes),
            original_token_count=tokenization.token_count,
            compressed_token_count=None,
        )

        return CompressionPackage(
            version="1.1.0",  # envelope format version; bump on layout changes
            method=self.method,
            original_length=len(text),
            original_sha256=self._sha256_hex(original_bytes),
            token_count=tokenization.token_count,
            compressed_payload_b64=payload_b64,
            stats=stats,
            metadata=metadata,
            dictionary=None,
        )

    def compress_to_json(
        self,
        text: str,
        metadata: dict[str, Any] | None = None,
    ) -> str:
        """Convenience wrapper: compress *text* and serialize the package to JSON."""
        return self.compress(text=text, metadata=metadata).to_json()

    def _compress_with_dictionary(
        self,
        text: str,
        original_bytes: bytes,
        token_count: int,
        tokens: list[str],
        metadata: dict[str, Any],
    ) -> CompressionPackage:
        """Build a phrase dictionary from *tokens* and emit the encoded stream.

        The payload is the JSON-encoded token/phrase-ID list (compact
        separators, UTF-8), base64-wrapped like the byte-oriented methods.
        """
        entries = self.dictionary_builder.build(tokens)
        # id -> phrase tokens; shipped in the package so decompression is
        # self-contained.
        dictionary_map = self.dictionary_builder.build_reverse_lookup(entries)
        encoded_tokens = self._encode_with_dictionary(tokens, entries)

        serialized_stream = json.dumps(
            encoded_tokens,
            ensure_ascii=False,
            separators=(",", ":"),
        ).encode("utf-8")
        payload_b64 = base64.b64encode(serialized_stream).decode("ascii")

        stats = self._build_stats(
            original_bytes_len=len(original_bytes),
            compressed_bytes_len=len(serialized_stream),
            original_token_count=token_count,
            compressed_token_count=len(encoded_tokens),
        )

        return CompressionPackage(
            version="1.1.0",
            method="dictionary",
            original_length=len(text),
            original_sha256=self._sha256_hex(original_bytes),
            token_count=token_count,
            compressed_payload_b64=payload_b64,
            stats=stats,
            metadata=metadata,
            dictionary=dictionary_map,
        )

    def _encode_with_dictionary(
        self,
        tokens: list[str],
        entries: list[DictionaryEntry],
    ) -> list[str]:
        """Greedily replace phrase occurrences in *tokens* with phrase IDs.

        Longest-match-first: candidate lengths are tried in descending order at
        each position; unmatched tokens pass through verbatim.
        NOTE(review): assumes IDs produced by PhraseDictionaryBuilder cannot
        collide with literal tokens — confirm in dictionary.py.
        """
        if not entries:
            return list(tokens)

        phrase_to_id = self.dictionary_builder.build_lookup(entries)
        # Distinct phrase lengths, longest first, so the greedy scan prefers
        # the longest available match at each position.
        available_lengths = sorted(
            {len(entry.phrase) for entry in entries},
            reverse=True,
        )

        encoded_tokens: list[str] = []
        token_index = 0
        total_tokens = len(tokens)

        while token_index < total_tokens:
            matched_phrase_id: str | None = None
            matched_phrase_len = 0

            for phrase_len in available_lengths:
                # A shorter length may still fit near the end of the stream.
                if token_index + phrase_len > total_tokens:
                    continue

                candidate = tuple(tokens[token_index : token_index + phrase_len])
                phrase_id = phrase_to_id.get(candidate)

                if phrase_id is not None:
                    matched_phrase_id = phrase_id
                    matched_phrase_len = phrase_len
                    break

            if matched_phrase_id is not None:
                encoded_tokens.append(matched_phrase_id)
                token_index += matched_phrase_len
            else:
                # No phrase starts here; keep the literal token.
                encoded_tokens.append(tokens[token_index])
                token_index += 1

        return encoded_tokens

    def _compress_bytes(self, data: bytes) -> bytes:
        """Byte-level compression for the non-dictionary methods."""
        if self.method == "none":
            return data

        if self.method == "zlib":
            return zlib.compress(data, level=self.zlib_level)

        if self.method == "lzma":
            return lzma.compress(data, preset=self.lzma_preset)

        # "dictionary" is routed earlier in compress(); reaching this line
        # indicates an internal inconsistency.
        raise ValueError(f"Unhandled compression method: {self.method}")

    @staticmethod
    def _sha256_hex(data: bytes) -> str:
        """Hex SHA-256 digest, used for lossless round-trip verification."""
        return hashlib.sha256(data).hexdigest()

    @staticmethod
    def _build_stats(
        original_bytes_len: int,
        compressed_bytes_len: int,
        original_token_count: int,
        compressed_token_count: int | None,
    ) -> CompressionStats:
        """Compute size ratios; empty input is defined as ratio 1.0 / saving 0.0."""
        if original_bytes_len < 0 or compressed_bytes_len < 0:
            raise ValueError("byte lengths must be non-negative")

        if original_bytes_len == 0:
            compression_ratio = 1.0
            space_saving_ratio = 0.0
        else:
            compression_ratio = compressed_bytes_len / original_bytes_len
            space_saving_ratio = 1.0 - compression_ratio

        return CompressionStats(
            original_bytes=original_bytes_len,
            compressed_bytes=compressed_bytes_len,
            compression_ratio=compression_ratio,
            space_saving_ratio=space_saving_ratio,
            original_token_count=original_token_count,
            compressed_token_count=compressed_token_count,
        )

    def available_methods(self) -> list[str]:
        """Return the supported method names as a fresh list."""
        return list(self.SUPPORTED_METHODS)
|