lopace-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lopace/__init__.py +11 -0
- lopace/compressor.py +517 -0
- lopace-0.1.0.dist-info/METADATA +402 -0
- lopace-0.1.0.dist-info/RECORD +9 -0
- lopace-0.1.0.dist-info/WHEEL +5 -0
- lopace-0.1.0.dist-info/licenses/LICENSE +21 -0
- lopace-0.1.0.dist-info/top_level.txt +2 -0
- tests/__init__.py +1 -0
- tests/test_compressor.py +198 -0
lopace/__init__.py
ADDED
@@ -0,0 +1,11 @@
"""
LoPace - Lossless Optimized Prompt Accurate Compression Engine

A professional Python package for compressing and decompressing prompts
using multiple techniques: Zstd, Token-based (BPE), and Hybrid methods.
"""

from .compressor import PromptCompressor, CompressionMethod

__version__ = "0.1.0"
__all__ = ["PromptCompressor", "CompressionMethod"]

lopace/compressor.py
ADDED
@@ -0,0 +1,517 @@
"""
Main compression module implementing Zstd, Token-based, and Hybrid compression methods.

The compression algorithms used:
- Zstd: Uses LZ77 (sliding window) and FSE (Finite State Entropy, a variant of Huffman coding)
  internally via the zstandard library
- Token-based: Uses BPE (Byte-Pair Encoding) via tiktoken
- Hybrid: Combines tokenization + Zstd compression
"""

import struct
import math
from collections import Counter
from enum import Enum
from typing import Union, Tuple, Optional, Dict

try:
    import zstandard as zstd
except ImportError:
    zstd = None

try:
    import tiktoken
except ImportError:
    tiktoken = None


class CompressionMethod(Enum):
    """Compression methods available."""
    ZSTD = "zstd"
    TOKEN = "token"
    HYBRID = "hybrid"


class PromptCompressor:
    """
    Professional prompt compressor supporting multiple compression techniques.

    Methods:
    - Zstd: Dictionary-based compression using Zstandard
    - Token: Byte-Pair Encoding (BPE) tokenization with binary packing
    - Hybrid: Combination of tokenization and Zstd compression

    Args:
        model: Tokenizer model name (default: "cl100k_base")
            Options: "cl100k_base", "p50k_base", "r50k_base", "gpt2", etc.
        zstd_level: Zstd compression level (1-22, default: 15)
            Higher levels provide better compression but are slower.
    """

    def __init__(
        self,
        model: str = "cl100k_base",
        zstd_level: int = 15
    ):
        if zstd is None:
            raise ImportError(
                "zstandard is required. Install it with: pip install zstandard"
            )

        if tiktoken is None:
            raise ImportError(
                "tiktoken is required. Install it with: pip install tiktoken"
            )

        self.tokenizer = tiktoken.get_encoding(model)
        self.zstd_level = zstd_level
        self.model = model

        # Validate zstd_level
        if not (1 <= zstd_level <= 22):
            raise ValueError("zstd_level must be between 1 and 22")

    def compress_zstd(self, text: str) -> bytes:
        """
        Compress prompt using Zstandard algorithm.

        Args:
            text: Original prompt string

        Returns:
            Compressed bytes

        Example:
            >>> compressor = PromptCompressor()
            >>> compressed = compressor.compress_zstd("Your prompt here")
            >>> original = compressor.decompress_zstd(compressed)
        """
        data_bytes = text.encode('utf-8')
        compressed_blob = zstd.compress(data_bytes, level=self.zstd_level)
        return compressed_blob

    def decompress_zstd(self, compressed_blob: bytes) -> str:
        """
        Decompress Zstandard-compressed prompt.

        Args:
            compressed_blob: Compressed bytes from compress_zstd()

        Returns:
            Original prompt string
        """
        raw_bytes = zstd.decompress(compressed_blob)
        return raw_bytes.decode('utf-8')

    def compress_token(self, text: str) -> bytes:
        """
        Compress prompt using BPE tokenization and binary packing.

        This method:
        1. Converts text to token IDs using the tokenizer
        2. Packs token IDs as unsigned integers (uint16 or uint32)
           - Uses uint16 (2 bytes) if all token IDs <= 65535
           - Uses uint32 (4 bytes) if any token ID > 65535

        Args:
            text: Original prompt string

        Returns:
            Compressed bytes (format byte + binary-packed token IDs)
            Format: [1 byte format flag: 0=uint16, 1=uint32][packed token IDs]

        Example:
            >>> compressor = PromptCompressor()
            >>> compressed = compressor.compress_token("Your prompt here")
            >>> original = compressor.decompress_token(compressed)
        """
        # Step 1: Convert text to list of token IDs
        token_ids = list(self.tokenizer.encode(text))  # Ensure it's a list

        if not token_ids:
            # Empty token list - return just format byte
            return struct.pack('B', 0)  # uint16 format

        # Step 2: Determine if we can use uint16 or need uint32
        # Check if ANY token ID exceeds uint16 range (0-65535)
        max_token_id = max(token_ids)
        min_token_id = min(token_ids)
        use_uint32 = (max_token_id > 65535) or (min_token_id < 0)

        # Step 3: Pack token IDs (format byte + token data)
        format_byte = 1 if use_uint32 else 0  # 0 = uint16, 1 = uint32

        try:
            if use_uint32:
                # Use uint32 (4 bytes per token) - format 'I'
                token_data = struct.pack(f'{len(token_ids)}I', *token_ids)
            else:
                # Use uint16 (2 bytes per token) - format 'H'
                # Double-check all IDs fit in uint16 range
                if max(token_ids) > 65535:
                    # Fallback to uint32 if somehow we got here
                    format_byte = 1
                    token_data = struct.pack(f'{len(token_ids)}I', *token_ids)
                else:
                    token_data = struct.pack(f'{len(token_ids)}H', *token_ids)
        except (struct.error, OverflowError) as e:
            # If packing fails, fallback to uint32
            format_byte = 1
            token_data = struct.pack(f'{len(token_ids)}I', *token_ids)

        # Combine format byte with token data
        binary_payload = struct.pack('B', format_byte) + token_data

        return binary_payload

    def decompress_token(self, binary_payload: bytes) -> str:
        """
        Decompress token-based compressed prompt.

        Args:
            binary_payload: Compressed bytes from compress_token()
                Format: [1 byte format flag: 0=uint16, 1=uint32][packed token IDs]

        Returns:
            Original prompt string
        """
        if len(binary_payload) < 1:
            raise ValueError("Invalid compressed data: missing format byte")

        # Step 1: Read format byte
        format_byte = struct.unpack('B', binary_payload[0:1])[0]
        token_data = binary_payload[1:]

        if format_byte == 1:
            # uint32 format (4 bytes per token)
            if len(token_data) % 4 != 0:
                raise ValueError("Invalid compressed data: uint32 data length not divisible by 4")
            num_tokens = len(token_data) // 4
            token_ids = struct.unpack(f'{num_tokens}I', token_data)
        else:
            # uint16 format (2 bytes per token)
            if len(token_data) % 2 != 0:
                raise ValueError("Invalid compressed data: uint16 data length not divisible by 2")
            num_tokens = len(token_data) // 2
            token_ids = struct.unpack(f'{num_tokens}H', token_data)

        # Step 2: Decode token IDs back to string
        return self.tokenizer.decode(list(token_ids))

    def compress_hybrid(self, text: str) -> bytes:
        """
        Compress prompt using hybrid approach (Token + Zstd).

        This is the most efficient method:
        1. Tokenizes text to reduce redundancy
        2. Packs tokens as binary
        3. Applies Zstd compression on the binary data

        Provides the best compression ratio for database storage.

        Args:
            text: Original prompt string

        Returns:
            Compressed bytes

        Example:
            >>> compressor = PromptCompressor()
            >>> compressed = compressor.compress_hybrid("Your prompt here")
            >>> original = compressor.decompress_hybrid(compressed)
        """
        # Step 1: Tokenize
        tokens = list(self.tokenizer.encode(text))  # Ensure it's a list

        if not tokens:
            # Empty token list - return compressed empty data
            empty_data = struct.pack('B', 0)  # uint16 format
            return zstd.compress(empty_data, level=self.zstd_level)

        # Step 2: Convert to binary (determine uint16 or uint32)
        max_token_id = max(tokens)
        min_token_id = min(tokens)
        use_uint32 = (max_token_id > 65535) or (min_token_id < 0)

        format_byte = 1 if use_uint32 else 0  # 0 = uint16, 1 = uint32

        try:
            if use_uint32:
                # Use uint32 (4 bytes per token)
                token_data = struct.pack('B', format_byte) + struct.pack(f'{len(tokens)}I', *tokens)
            else:
                # Use uint16 (2 bytes per token)
                # Double-check all IDs fit in uint16 range
                if max(tokens) > 65535:
                    # Fallback to uint32 if somehow we got here
                    format_byte = 1
                    token_data = struct.pack('B', format_byte) + struct.pack(f'{len(tokens)}I', *tokens)
                else:
                    token_data = struct.pack('B', format_byte) + struct.pack(f'{len(tokens)}H', *tokens)
        except (struct.error, OverflowError) as e:
            # If packing fails, fallback to uint32
            format_byte = 1
            token_data = struct.pack('B', format_byte) + struct.pack(f'{len(tokens)}I', *tokens)

        # Step 3: Final Zstd compression
        compressed_blob = zstd.compress(token_data, level=self.zstd_level)

        return compressed_blob

    def decompress_hybrid(self, blob: bytes) -> str:
        """
        Decompress hybrid-compressed prompt.

        Args:
            blob: Compressed bytes from compress_hybrid()

        Returns:
            Original prompt string
        """
        # Step 1: Decompress Zstd
        token_data = zstd.decompress(blob)

        if len(token_data) < 1:
            raise ValueError("Invalid compressed data: missing format byte")

        # Step 2: Read format byte and unpack token IDs
        format_byte = struct.unpack('B', token_data[0:1])[0]
        packed_data = token_data[1:]

        if format_byte == 1:
            # uint32 format (4 bytes per token)
            if len(packed_data) % 4 != 0:
                raise ValueError("Invalid compressed data: uint32 data length not divisible by 4")
            num_tokens = len(packed_data) // 4
            tokens = struct.unpack(f'{num_tokens}I', packed_data)
        else:
            # uint16 format (2 bytes per token)
            if len(packed_data) % 2 != 0:
                raise ValueError("Invalid compressed data: uint16 data length not divisible by 2")
            num_tokens = len(packed_data) // 2
            tokens = struct.unpack(f'{num_tokens}H', packed_data)

        # Step 3: Decode to text
        return self.tokenizer.decode(list(tokens))

    def compress(
        self,
        text: str,
        method: CompressionMethod = CompressionMethod.HYBRID
    ) -> bytes:
        """
        Compress prompt using the specified method.

        Args:
            text: Original prompt string
            method: Compression method to use (default: HYBRID)

        Returns:
            Compressed bytes

        Example:
            >>> compressor = PromptCompressor()
            >>> compressed = compressor.compress("Your prompt", CompressionMethod.HYBRID)
            >>> original = compressor.decompress(compressed, CompressionMethod.HYBRID)
        """
        if method == CompressionMethod.ZSTD:
            return self.compress_zstd(text)
        elif method == CompressionMethod.TOKEN:
            return self.compress_token(text)
        elif method == CompressionMethod.HYBRID:
            return self.compress_hybrid(text)
        else:
            raise ValueError(f"Unknown compression method: {method}")

    def decompress(
        self,
        compressed_data: bytes,
        method: CompressionMethod = CompressionMethod.HYBRID
    ) -> str:
        """
        Decompress prompt using the specified method.

        Args:
            compressed_data: Compressed bytes
            method: Compression method used for compression

        Returns:
            Original prompt string
        """
        if method == CompressionMethod.ZSTD:
            return self.decompress_zstd(compressed_data)
        elif method == CompressionMethod.TOKEN:
            return self.decompress_token(compressed_data)
        elif method == CompressionMethod.HYBRID:
            return self.decompress_hybrid(compressed_data)
        else:
            raise ValueError(f"Unknown compression method: {method}")

    def compress_and_return_both(
        self,
        text: str,
        method: CompressionMethod = CompressionMethod.HYBRID
    ) -> Tuple[str, bytes]:
        """
        Compress prompt and return both original and compressed versions.

        Args:
            text: Original prompt string
            method: Compression method to use (default: HYBRID)

        Returns:
            Tuple of (original_prompt, compressed_bytes)

        Example:
            >>> compressor = PromptCompressor()
            >>> original, compressed = compressor.compress_and_return_both("Your prompt")
        """
        compressed = self.compress(text, method)
        return (text, compressed)

    def get_compression_stats(
        self,
        text: str,
        method: Optional[CompressionMethod] = None
    ) -> dict:
        """
        Get compression statistics for a given prompt.

        Args:
            text: Original prompt string
            method: Compression method to analyze (None = all methods)

        Returns:
            Dictionary with compression statistics
        """
        methods = [method] if method else [
            CompressionMethod.ZSTD,
            CompressionMethod.TOKEN,
            CompressionMethod.HYBRID
        ]

        original_size = len(text.encode('utf-8'))
        stats = {
            'original_size_bytes': original_size,
            'original_size_tokens': len(self.tokenizer.encode(text)),
            'methods': {}
        }

        for m in methods:
            compressed = self.compress(text, m)
            compressed_size = len(compressed)
            compression_ratio = compressed_size / original_size if original_size > 0 else 0
            space_saved = 1 - compression_ratio

            stats['methods'][m.value] = {
                'compressed_size_bytes': compressed_size,
                'compression_ratio': compression_ratio,
                'space_saved_percent': space_saved * 100,
                'bytes_saved': original_size - compressed_size
            }

        return stats

    def calculate_shannon_entropy(self, text: str, unit: str = 'character') -> float:
        """
        Calculate Shannon Entropy of the input text.

        Shannon Entropy formula: H(X) = -∑ P(x_i) * log₂(P(x_i))

        This determines the theoretical lower limit of compression based on
        character/byte frequency distribution.

        Args:
            text: Input text to analyze
            unit: Unit to analyze ('character' or 'byte')
                - 'character': Analyze individual characters
                - 'byte': Analyze bytes (for binary data)

        Returns:
            Shannon entropy in bits

        Example:
            >>> compressor = PromptCompressor()
            >>> entropy = compressor.calculate_shannon_entropy("Hello world")
            >>> print(f"Theoretical compression limit: {entropy:.2f} bits per character")
        """
        if not text:
            return 0.0

        if unit == 'byte':
            # Analyze bytes
            data = text.encode('utf-8')
            frequencies = Counter(data)
        else:  # unit == 'character'
            # Analyze characters
            frequencies = Counter(text)

        # Calculate probabilities
        length = len(text) if unit == 'character' else len(data)
        probabilities = [count / length for count in frequencies.values()]

        # Calculate Shannon Entropy: H(X) = -∑ P(x_i) * log₂(P(x_i))
        entropy = -sum(p * math.log2(p) for p in probabilities if p > 0)

        return entropy

    def get_theoretical_compression_limit(self, text: str, unit: str = 'character') -> Dict[str, float]:
        """
        Calculate theoretical compression limit using Shannon Entropy.

        This provides the theoretical minimum size achievable through entropy coding.

        Args:
            text: Input text to analyze
            unit: Unit to analyze ('character' or 'byte')

        Returns:
            Dictionary with theoretical limits:
            - entropy_bits_per_unit: Shannon entropy in bits
            - theoretical_min_bits: Minimum total bits needed
            - theoretical_min_bytes: Minimum bytes needed (theoretical limit)
            - original_size_bytes: Original size in bytes
            - theoretical_compression_ratio: Theoretical best compression ratio

        Example:
            >>> compressor = PromptCompressor()
            >>> limits = compressor.get_theoretical_compression_limit("Your prompt")
            >>> print(f"Theoretical minimum: {limits['theoretical_min_bytes']:.2f} bytes")
        """
        if not text:
            return {
                'entropy_bits_per_unit': 0.0,
                'theoretical_min_bits': 0.0,
                'theoretical_min_bytes': 0.0,
                'original_size_bytes': 0.0,
                'theoretical_compression_ratio': 0.0
            }

        # Calculate Shannon Entropy
        entropy = self.calculate_shannon_entropy(text, unit)

        # Calculate theoretical minimums
        if unit == 'byte':
            num_units = len(text.encode('utf-8'))
            original_size_bytes = len(text.encode('utf-8'))
        else:  # character
            num_units = len(text)
            original_size_bytes = len(text.encode('utf-8'))

        theoretical_min_bits = entropy * num_units
        theoretical_min_bytes = theoretical_min_bits / 8.0

        # Theoretical compression ratio
        theoretical_compression_ratio = (
            theoretical_min_bytes / original_size_bytes
            if original_size_bytes > 0 else 0.0
        )

        return {
            'entropy_bits_per_unit': entropy,
            'theoretical_min_bits': theoretical_min_bits,
            'theoretical_min_bytes': theoretical_min_bytes,
            'original_size_bytes': original_size_bytes,
            'theoretical_compression_ratio': theoretical_compression_ratio,
            'theoretical_space_savings_percent': (1 - theoretical_compression_ratio) * 100
        }

lopace-0.1.0.dist-info/METADATA
ADDED
@@ -0,0 +1,402 @@
Metadata-Version: 2.4
Name: lopace
Version: 0.1.0
Summary: Lossless Optimized Prompt Accurate Compression Engine
Home-page: https://github.com/connectaman/LoPace
Author: Aman Ulla
License: MIT
Project-URL: Homepage, https://github.com/amanulla/lopace
Project-URL: Repository, https://github.com/amanulla/lopace
Project-URL: Issues, https://github.com/amanulla/lopace/issues
Keywords: prompt,compression,tokenization,zstd,bpe,nlp
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Topic :: Text Processing :: Linguistic
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Requires-Python: >=3.8
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: zstandard>=0.22.0
Requires-Dist: tiktoken>=0.5.0
Dynamic: home-page
Dynamic: license-file
Dynamic: requires-python

# LoPace

**Lossless Optimized Prompt Accurate Compression Engine**

A professional, open-source Python package for compressing and decompressing prompts using multiple techniques: Zstd, Token-based (BPE), and Hybrid methods. Achieve up to 80% space reduction while maintaining perfect lossless reconstruction.

[License: MIT](https://opensource.org/licenses/MIT)
[Python 3.8+](https://www.python.org/downloads/)

## Features

- 🚀 **Three Compression Methods**:
  - **Zstd**: Dictionary-based compression using Zstandard algorithm
  - **Token**: Byte-Pair Encoding (BPE) tokenization with binary packing
  - **Hybrid**: Combination of tokenization and Zstd (best compression ratio)

- ✅ **Lossless**: Perfect reconstruction of original prompts
- 📊 **Compression Statistics**: Analyze compression ratios and space savings
- 🔧 **Simple API**: Easy-to-use interface for all compression methods
- 🎯 **Database-Ready**: Optimized for storing prompts in databases

## Installation

```bash
pip install lopace
```

### Dependencies

- `zstandard>=0.22.0` - For Zstd compression
- `tiktoken>=0.5.0` - For BPE tokenization

## Quick Start

```python
from lopace import PromptCompressor, CompressionMethod

# Initialize compressor
compressor = PromptCompressor(model="cl100k_base", zstd_level=15)

# Your prompt
prompt = "You are a helpful AI assistant..."

# Compress using hybrid method (recommended)
compressed = compressor.compress(prompt, CompressionMethod.HYBRID)

# Decompress back to original
original = compressor.decompress(compressed, CompressionMethod.HYBRID)

# Verify losslessness
assert original == prompt  # ✓ True
```

## Usage Examples

### Basic Compression/Decompression

```python
from lopace import PromptCompressor, CompressionMethod

compressor = PromptCompressor()

# Compress and return both original and compressed
original, compressed = compressor.compress_and_return_both(
    "Your prompt here",
    CompressionMethod.HYBRID
)

# Decompress
recovered = compressor.decompress(compressed, CompressionMethod.HYBRID)
```

### Using Different Methods

```python
compressor = PromptCompressor()

prompt = "Your system prompt here..."

# Method 1: Zstd only
zstd_compressed = compressor.compress_zstd(prompt)
zstd_decompressed = compressor.decompress_zstd(zstd_compressed)

# Method 2: Token-based (BPE)
token_compressed = compressor.compress_token(prompt)
token_decompressed = compressor.decompress_token(token_compressed)

# Method 3: Hybrid (recommended - best compression)
hybrid_compressed = compressor.compress_hybrid(prompt)
hybrid_decompressed = compressor.decompress_hybrid(hybrid_compressed)
```

### Get Compression Statistics

```python
compressor = PromptCompressor()
prompt = "Your long system prompt..."

# Get stats for all methods
stats = compressor.get_compression_stats(prompt)

print(f"Original Size: {stats['original_size_bytes']} bytes")
print(f"Original Tokens: {stats['original_size_tokens']}")

for method, method_stats in stats['methods'].items():
    print(f"\n{method}:")
    print(f"  Compressed: {method_stats['compressed_size_bytes']} bytes")
    print(f"  Space Saved: {method_stats['space_saved_percent']:.2f}%")
```

## Compression Methods Explained

### 1. Zstd Compression

Uses Zstandard's dictionary-based algorithm to find repeated patterns and replace them with shorter references.

**Best for**: General text compression, when tokenization overhead is not needed.

```python
compressed = compressor.compress_zstd(prompt)
original = compressor.decompress_zstd(compressed)
```

### 2. Token-Based Compression

Uses Byte-Pair Encoding (BPE) to convert text to token IDs, then packs them as binary data.

**Best for**: When you need token IDs anyway, or when working with LLM tokenizers.

```python
compressed = compressor.compress_token(prompt)
original = compressor.decompress_token(compressed)
```
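
The payload produced by `compress_token` (see `lopace/compressor.py` above) is a one-byte format flag (0 = uint16, 1 = uint32) followed by the token IDs packed at that width. Below is a minimal round-trip sketch of that layout using only `struct` and `tiktoken`; it is shown for illustration, not as a replacement for the library call.

```python
import struct

import tiktoken

enc = tiktoken.get_encoding("cl100k_base")
text = "Your prompt here"
token_ids = enc.encode(text)

# Pick the narrowest fixed width that holds every ID.
use_uint32 = max(token_ids) > 65535
flag, width = (1, "I") if use_uint32 else (0, "H")
payload = struct.pack("B", flag) + struct.pack(f"{len(token_ids)}{width}", *token_ids)

# Decoding reverses the steps: read the flag, unpack the IDs, detokenize.
flag = payload[0]
body = payload[1:]
count = len(body) // (4 if flag == 1 else 2)
ids = struct.unpack(f"{count}{'I' if flag == 1 else 'H'}", body)
assert enc.decode(list(ids)) == text
```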

### 3. Hybrid Compression (Recommended)

Combines tokenization and Zstd compression for maximum efficiency:

1. Tokenizes text to reduce redundancy
2. Packs tokens as binary (2 bytes per token)
3. Applies Zstd compression on the binary data

**Best for**: Database storage where maximum compression is needed.

```python
compressed = compressor.compress_hybrid(prompt)
original = compressor.decompress_hybrid(compressed)
```

## API Reference

### `PromptCompressor`

Main compressor class.

#### Constructor

```python
PromptCompressor(
    model: str = "cl100k_base",
    zstd_level: int = 15
)
```

**Parameters:**
- `model`: Tokenizer model name (default: `"cl100k_base"`)
  - Options: `"cl100k_base"`, `"p50k_base"`, `"r50k_base"`, `"gpt2"`, etc.
- `zstd_level`: Zstd compression level 1-22 (default: `15`)
  - Higher = better compression but slower

#### Methods

##### `compress(text: str, method: CompressionMethod) -> bytes`

Compress a prompt using the specified method.

##### `decompress(compressed_data: bytes, method: CompressionMethod) -> str`

Decompress a compressed prompt.

##### `compress_and_return_both(text: str, method: CompressionMethod) -> Tuple[str, bytes]`

Compress and return both original and compressed versions.

##### `get_compression_stats(text: str, method: Optional[CompressionMethod]) -> dict`

Get detailed compression statistics for analysis.

### `CompressionMethod`

Enumeration of available compression methods:

- `CompressionMethod.ZSTD` - Zstandard compression
- `CompressionMethod.TOKEN` - Token-based compression
- `CompressionMethod.HYBRID` - Hybrid compression (recommended)

## How It Works

### Compression Pipeline (Hybrid Method)

```
Input: Raw System Prompt String (100%)
    ↓
Tokenization: Convert to Tiktoken IDs (~70% reduced)
    ↓
Binary Packing: Convert IDs to uint16 (~50% of above)
    ↓
Zstd: Final compression (~30% further reduction)
    ↓
Output: Compressed Binary Blob
```
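
The stage-by-stage percentages above are indicative and vary with the prompt; a quick way to see them for your own text is to measure each intermediate size directly. A minimal sketch using `tiktoken`, `struct`, and `zstandard` (it mirrors the stages only; the actual hybrid payload also prepends the one-byte format flag, and the sample prompt is made up):

```python
import struct

import tiktoken
import zstandard as zstd

prompt = "You are a helpful AI assistant. " * 20  # illustrative sample text

raw = prompt.encode("utf-8")
ids = tiktoken.get_encoding("cl100k_base").encode(prompt)
packed = struct.pack(f"{len(ids)}H", *ids)   # assumes every ID fits in uint16
blob = zstd.compress(packed, level=15)

print(f"raw:    {len(raw)} bytes")
print(f"tokens: {len(ids)} ids -> packed: {len(packed)} bytes")
print(f"zstd:   {len(blob)} bytes")
```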

### Why Hybrid is Best for Databases

1. **Searchability**: Token IDs can be searched without full decompression
2. **Consistency**: Fixed tokenizer ensures stable compression ratios
3. **Efficiency**: Maximum space savings for millions of prompts

## Example Output

```python
# Original prompt: 500 bytes
# After compression:
# Zstd: 180 bytes (64% space saved)
# Token: 240 bytes (52% space saved)
# Hybrid: 120 bytes (76% space saved) ← Best!
```

## Running the Example

```bash
python example.py
```

This will demonstrate all compression methods and show statistics.

## Interactive Web App (Streamlit)

LoPace includes an interactive Streamlit web application with comprehensive evaluation metrics:

### Features

- **Interactive Interface**: Enter prompts and see real-time compression results
- **Comprehensive Metrics**: All four industry-standard metrics (a worked sketch follows this list):
  - Compression Ratio (CR): $CR = \frac{S_{original}}{S_{compressed}}$
  - Space Savings (SS): $SS = 1 - \frac{S_{compressed}}{S_{original}}$
  - Bits Per Character (BPC): $BPC = \frac{\text{Total Bits}}{\text{Total Characters}}$
  - Throughput (MB/s): $T = \frac{\text{Data Size}}{\text{Time}}$
- **Lossless Verification**:
  - SHA-256 Hash Verification
  - Exact Match (Character-by-Character)
  - Reconstruction Error: $E = \frac{1}{N} \sum_{i=1}^{N} \mathbb{1}(x_i \neq \hat{x}_i) = 0$
- **Side-by-Side Comparison**: Compare all three compression methods
- **Real-time Configuration**: Adjust tokenizer model and Zstd level
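
As noted above, the four metrics are plain arithmetic over byte counts and timing. A minimal sketch of how they can be computed with LoPace itself, for reference; the Streamlit app may implement them differently, and the sample text is made up:

```python
import time

from lopace import PromptCompressor, CompressionMethod

compressor = PromptCompressor()
text = "You are a helpful AI assistant. " * 50  # illustrative sample text

start = time.perf_counter()
compressed = compressor.compress(text, CompressionMethod.HYBRID)
elapsed = time.perf_counter() - start

s_orig = len(text.encode("utf-8"))
s_comp = len(compressed)

cr = s_orig / s_comp                     # Compression Ratio
ss = 1 - s_comp / s_orig                 # Space Savings
bpc = (s_comp * 8) / len(text)           # compressed bits per input character
throughput = (s_orig / 1e6) / elapsed    # MB/s, rough single-run estimate

print(f"CR={cr:.2f}  SS={ss:.2%}  BPC={bpc:.2f}  T={throughput:.1f} MB/s")
```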

### Running the Streamlit App

```bash
streamlit run streamlit_app.py
```

The app will open in your default web browser at `http://localhost:8501`

### Screenshot Preview

The app features:
- **Left Panel**: Text input area for entering prompts
- **Right Panel**: Results with tabs for each compression method
- **Metrics Dashboard**: Real-time calculation of all evaluation metrics
- **Verification Section**: Hash matching and exact match verification
- **Comparison Table**: Side-by-side comparison of all methods

## Development

### Setup Development Environment

```bash
git clone https://github.com/amanulla/lopace.git
cd lopace
pip install -r requirements-dev.txt
```

### Running Tests

```bash
pytest
```

### CI/CD Pipeline

This project uses GitHub Actions for automated testing and publishing:

- **Tests run automatically** on every push and pull request
- **Publishing to PyPI** happens automatically when:
  - All tests pass ✅
  - Push is to `main`/`master` branch or a version tag (e.g., `v0.1.0`)

See [.github/workflows/README.md](.github/workflows/README.md) for detailed setup instructions.

## Mathematical Background

### Compression Techniques Used

LoPace uses the following compression techniques:

1. **LZ77 (Sliding Window)**: Used **indirectly** through Zstandard
   - Zstandard internally uses LZ77-style algorithms to find repeated patterns
   - Instead of storing "assistant" again, it stores a tuple: (distance_back, length)
   - We use this by calling `zstandard.compress()` - the LZ77 is handled internally

2. **Huffman Coding / FSE (Finite State Entropy)**: Used **indirectly** through Zstandard
   - Zstandard uses FSE, a variant of Huffman coding
   - Assigns shorter binary codes to characters/patterns that appear most frequently
   - Again, handled internally by the zstandard library

3. **BPE Tokenization**: Used **directly** via tiktoken
   - Byte-Pair Encoding converts text to token IDs
   - Reduces vocabulary size before compression
   - Implemented by OpenAI's tiktoken library

### Shannon Entropy

The theoretical compression limit is determined by Shannon Entropy:

$H(X) = -\sum_{i=1}^{n} P(x_i) \log_2 P(x_i)$

Where:
- $H(X)$ is the entropy of the source
- $P(x_i)$ is the probability of character/pattern $x_i$

LoPace **calculates** Shannon Entropy to show theoretical compression limits:

```python
compressor = PromptCompressor()
entropy = compressor.calculate_shannon_entropy("Your prompt")
limits = compressor.get_theoretical_compression_limit("Your prompt")
print(f"Theoretical minimum: {limits['theoretical_min_bytes']:.2f} bytes")
```

This allows you to compare actual compression against the theoretical limit.
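
For example, a small sketch of that comparison, combining `get_theoretical_compression_limit` with `get_compression_stats` (illustrative only; the sample text is made up):

```python
from lopace import PromptCompressor

compressor = PromptCompressor()
text = "You are a helpful AI assistant. " * 30  # illustrative sample text

limits = compressor.get_theoretical_compression_limit(text)
stats = compressor.get_compression_stats(text)

print(f"Theoretical minimum: {limits['theoretical_min_bytes']:.1f} bytes")
for name, s in stats["methods"].items():
    print(f"{name:>7}: {s['compressed_size_bytes']} bytes "
          f"({s['space_saved_percent']:.1f}% saved)")
```

Note that the per-character entropy estimate ignores correlations between characters, so dictionary-based methods can sometimes land below it on highly repetitive prompts.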

## License

MIT License - see [LICENSE](LICENSE) file for details.

## Contributing

Contributions are welcome! We appreciate your help in making LoPace better.

Please read our [Contributing Guidelines](CONTRIBUTING.md) and [Code of Conduct](CODE_OF_CONDUCT.md) before contributing.

### Quick Start for Contributors

1. Fork the repository
2. Create a feature branch (`git checkout -b feature/amazing-feature`)
3. Make your changes
4. Run tests (`pytest tests/ -v`)
5. Commit your changes (`git commit -m 'Add amazing feature'`)
6. Push to the branch (`git push origin feature/amazing-feature`)
7. Open a Pull Request

For more details, see [CONTRIBUTING.md](CONTRIBUTING.md).

## Author

Aman Ulla

## Acknowledgments

- Built on top of [zstandard](https://github.com/facebook/zstd) and [tiktoken](https://github.com/openai/tiktoken)
- Inspired by the need for efficient prompt storage in LLM applications

lopace-0.1.0.dist-info/RECORD
ADDED
@@ -0,0 +1,9 @@
lopace/__init__.py,sha256=PYjZWZHhSITNgag9sF0qZ_yXgZaMa3R8_3FuasiH0Nc,351
lopace/compressor.py,sha256=nUTWDcAPYvQaeSFKx_lne-D2xIQ02IMVGE4yLODo8qE,19060
lopace-0.1.0.dist-info/licenses/LICENSE,sha256=uFUrlsfsOwx_8Nzhq2pUgNaJghcJxXBMML3l7T39Tm0,1067
tests/__init__.py,sha256=yXNVJE20E2iHo0qbit5SgRE35eXWq89F1kkhNHy7VJA,31
tests/test_compressor.py,sha256=-vMztSzY89n5dpShcACrFboEQOlfJ6FxF7eQOEU3swM,8273
lopace-0.1.0.dist-info/METADATA,sha256=yXy0jt23uvVWkGlEeCb8KEUSx1_o3N02wZZEFj5weEI,12199
lopace-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
lopace-0.1.0.dist-info/top_level.txt,sha256=8CLB5czxmmAfR7ayh3TO5qyB1-xJoYNxabufJ37Xh5o,13
lopace-0.1.0.dist-info/RECORD,,

lopace-0.1.0.dist-info/licenses/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2026 Aman Ulla

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

tests/__init__.py
ADDED
@@ -0,0 +1 @@
"""Tests for LoPace package."""

tests/test_compressor.py
ADDED
@@ -0,0 +1,198 @@
"""Tests for PromptCompressor class."""

import pytest
from lopace import PromptCompressor, CompressionMethod


@pytest.fixture
def compressor():
    """Create a PromptCompressor instance for testing."""
    return PromptCompressor(model="cl100k_base", zstd_level=15)


@pytest.fixture
def sample_prompt():
    """Sample prompt for testing."""
    return """You are a helpful AI assistant designed to provide accurate,
detailed, and helpful responses to user queries. Your goal is to assist users
by understanding their questions and providing relevant information."""


class TestZstdCompression:
    """Test Zstd compression/decompression."""

    def test_compress_decompress_zstd(self, compressor, sample_prompt):
        """Test that Zstd compression is lossless."""
        compressed = compressor.compress_zstd(sample_prompt)
        decompressed = compressor.decompress_zstd(compressed)
        assert decompressed == sample_prompt

    def test_zstd_compression_ratio(self, compressor, sample_prompt):
        """Test that Zstd actually compresses."""
        compressed = compressor.compress_zstd(sample_prompt)
        original_size = len(sample_prompt.encode('utf-8'))
        compressed_size = len(compressed)
        assert compressed_size < original_size


class TestTokenCompression:
    """Test Token-based compression/decompression."""

    def test_compress_decompress_token(self, compressor, sample_prompt):
        """Test that Token compression is lossless."""
        compressed = compressor.compress_token(sample_prompt)
        decompressed = compressor.decompress_token(compressed)
        assert decompressed == sample_prompt

    def test_token_binary_format(self, compressor, sample_prompt):
        """Test that token compression produces binary data with format byte."""
        compressed = compressor.compress_token(sample_prompt)
        assert isinstance(compressed, bytes)
        # Should have at least format byte (1 byte)
        assert len(compressed) >= 1
        # Format byte should be 0 (uint16) or 1 (uint32)
        import struct
        format_byte = struct.unpack('B', compressed[0:1])[0]
        assert format_byte in [0, 1]


class TestHybridCompression:
    """Test Hybrid compression/decompression."""

    def test_compress_decompress_hybrid(self, compressor, sample_prompt):
        """Test that Hybrid compression is lossless."""
        compressed = compressor.compress_hybrid(sample_prompt)
        decompressed = compressor.decompress_hybrid(compressed)
        assert decompressed == sample_prompt

    def test_hybrid_compression(self, compressor, sample_prompt):
        """Test that Hybrid compression works and compares with other methods."""
        zstd_compressed = compressor.compress_zstd(sample_prompt)
        token_compressed = compressor.compress_token(sample_prompt)
        hybrid_compressed = compressor.compress_hybrid(sample_prompt)

        # All methods should compress (smaller than original for longer prompts)
        original_size = len(sample_prompt.encode('utf-8'))

        # Verify all compression methods produce valid output
        assert len(zstd_compressed) > 0
        assert len(token_compressed) > 0
        assert len(hybrid_compressed) > 0

        # For very long prompts (>500 chars), hybrid should typically be better than token alone
        # But for short prompts, Zstd overhead can make hybrid larger
        if len(sample_prompt) > 500:
            # On very long prompts, hybrid should generally compress well
            assert len(hybrid_compressed) < original_size

        # Zstd alone should compress for longer prompts
        if len(sample_prompt) > 100:
            assert len(zstd_compressed) < original_size


class TestGenericMethods:
    """Test generic compress/decompress methods."""

    def test_compress_with_method(self, compressor, sample_prompt):
        """Test generic compress method with all methods."""
        for method in CompressionMethod:
            compressed = compressor.compress(sample_prompt, method)
            assert isinstance(compressed, bytes)
            assert len(compressed) > 0

    def test_decompress_with_method(self, compressor, sample_prompt):
        """Test generic decompress method with all methods."""
        for method in CompressionMethod:
            compressed = compressor.compress(sample_prompt, method)
            decompressed = compressor.decompress(compressed, method)
            assert decompressed == sample_prompt

    def test_compress_and_return_both(self, compressor, sample_prompt):
        """Test compress_and_return_both method."""
        original, compressed = compressor.compress_and_return_both(
            sample_prompt,
            CompressionMethod.HYBRID
        )
        assert original == sample_prompt
        assert isinstance(compressed, bytes)
        assert len(compressed) > 0


class TestCompressionStats:
    """Test compression statistics."""

    def test_get_compression_stats_all_methods(self, compressor, sample_prompt):
        """Test getting stats for all methods."""
        stats = compressor.get_compression_stats(sample_prompt)

        assert 'original_size_bytes' in stats
        assert 'original_size_tokens' in stats
        assert 'methods' in stats

        assert len(stats['methods']) == 3  # ZSTD, TOKEN, HYBRID

        for method_name, method_stats in stats['methods'].items():
            assert 'compressed_size_bytes' in method_stats
            assert 'compression_ratio' in method_stats
            assert 'space_saved_percent' in method_stats
            assert 'bytes_saved' in method_stats

    def test_get_compression_stats_single_method(self, compressor, sample_prompt):
        """Test getting stats for a single method."""
        stats = compressor.get_compression_stats(
            sample_prompt,
            CompressionMethod.HYBRID
        )

        assert len(stats['methods']) == 1
        assert CompressionMethod.HYBRID.value in stats['methods']


class TestEdgeCases:
    """Test edge cases and error handling."""

    def test_empty_string(self, compressor):
        """Test compression of empty string."""
        for method in CompressionMethod:
            compressed = compressor.compress("", method)
            decompressed = compressor.decompress(compressed, method)
            assert decompressed == ""

    def test_large_token_ids(self, compressor):
        """Test compression with token IDs that exceed uint16 range."""
        # Create a prompt that might trigger large token IDs
        # Using various special characters and unicode
        large_prompt = "Hello " * 1000 + "世界 🌍 مرحبا " * 100

        # This should work without error, handling uint32 if needed
        compressed = compressor.compress_token(large_prompt)
        decompressed = compressor.decompress_token(compressed)
        assert decompressed == large_prompt

        # Test hybrid method too
        compressed_hybrid = compressor.compress_hybrid(large_prompt)
        decompressed_hybrid = compressor.decompress_hybrid(compressed_hybrid)
        assert decompressed_hybrid == large_prompt

    def test_single_character(self, compressor):
        """Test compression of single character."""
        for method in CompressionMethod:
            compressed = compressor.compress("a", method)
            decompressed = compressor.decompress(compressed, method)
            assert decompressed == "a"

    def test_unicode_characters(self, compressor):
        """Test compression of unicode characters."""
        unicode_prompt = "你好世界 🌍 مرحبا"
        for method in CompressionMethod:
            compressed = compressor.compress(unicode_prompt, method)
            decompressed = compressor.decompress(compressed, method)
            assert decompressed == unicode_prompt

    def test_invalid_zstd_level(self):
        """Test that invalid zstd_level raises error."""
        with pytest.raises(ValueError):
            PromptCompressor(zstd_level=0)

        with pytest.raises(ValueError):
            PromptCompressor(zstd_level=23)