lopace-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lopace/__init__.py ADDED
@@ -0,0 +1,11 @@
+ """
+ LoPace - Lossless Optimized Prompt Accurate Compression Engine
+
+ A professional Python package for compressing and decompressing prompts
+ using multiple techniques: Zstd, Token-based (BPE), and Hybrid methods.
+ """
+
+ from .compressor import PromptCompressor, CompressionMethod
+
+ __version__ = "0.1.0"
+ __all__ = ["PromptCompressor", "CompressionMethod"]
lopace/compressor.py ADDED
@@ -0,0 +1,517 @@
+ """
+ Main compression module implementing Zstd, Token-based, and Hybrid compression methods.
+
+ The compression algorithms used:
+ - Zstd: Uses LZ77 (sliding window) matching and entropy coding (Huffman and FSE,
+   Finite State Entropy) internally via the zstandard library
+ - Token-based: Uses BPE (Byte-Pair Encoding) via tiktoken
+ - Hybrid: Combines tokenization + Zstd compression
+ """
+
+ import struct
+ import math
+ from collections import Counter
+ from enum import Enum
+ from typing import Union, Tuple, Optional, Dict
+
+ try:
+     import zstandard as zstd
+ except ImportError:
+     zstd = None
+
+ try:
+     import tiktoken
+ except ImportError:
+     tiktoken = None
+
+
+ class CompressionMethod(Enum):
+     """Compression methods available."""
+     ZSTD = "zstd"
+     TOKEN = "token"
+     HYBRID = "hybrid"
+
+
+ class PromptCompressor:
+     """
+     Professional prompt compressor supporting multiple compression techniques.
+
+     Methods:
+     - Zstd: Dictionary-based compression using Zstandard
+     - Token: Byte-Pair Encoding (BPE) tokenization with binary packing
+     - Hybrid: Combination of tokenization and Zstd compression
+
+     Args:
+         model: Tokenizer model name (default: "cl100k_base")
+             Options: "cl100k_base", "p50k_base", "r50k_base", "gpt2", etc.
+         zstd_level: Zstd compression level (1-22, default: 15)
+             Higher levels provide better compression but are slower.
+     """
+
+     def __init__(
+         self,
+         model: str = "cl100k_base",
+         zstd_level: int = 15
+     ):
+         if zstd is None:
+             raise ImportError(
+                 "zstandard is required. Install it with: pip install zstandard"
+             )
+
+         if tiktoken is None:
+             raise ImportError(
+                 "tiktoken is required. Install it with: pip install tiktoken"
+             )
+
+         self.tokenizer = tiktoken.get_encoding(model)
+         self.zstd_level = zstd_level
+         self.model = model
+
+         # Validate zstd_level
+         if not (1 <= zstd_level <= 22):
+             raise ValueError("zstd_level must be between 1 and 22")
+
+     def compress_zstd(self, text: str) -> bytes:
+         """
+         Compress prompt using Zstandard algorithm.
+
+         Args:
+             text: Original prompt string
+
+         Returns:
+             Compressed bytes
+
+         Example:
+             >>> compressor = PromptCompressor()
+             >>> compressed = compressor.compress_zstd("Your prompt here")
+             >>> original = compressor.decompress_zstd(compressed)
+         """
+         data_bytes = text.encode('utf-8')
+         compressed_blob = zstd.compress(data_bytes, level=self.zstd_level)
+         return compressed_blob
+
+     def decompress_zstd(self, compressed_blob: bytes) -> str:
+         """
+         Decompress Zstandard-compressed prompt.
+
+         Args:
+             compressed_blob: Compressed bytes from compress_zstd()
+
+         Returns:
+             Original prompt string
+         """
+         raw_bytes = zstd.decompress(compressed_blob)
+         return raw_bytes.decode('utf-8')
+
+     def compress_token(self, text: str) -> bytes:
+         """
+         Compress prompt using BPE tokenization and binary packing.
+
+         This method:
+         1. Converts text to token IDs using the tokenizer
+         2. Packs token IDs as unsigned integers (uint16 or uint32)
+            - Uses uint16 (2 bytes) if all token IDs <= 65535
+            - Uses uint32 (4 bytes) if any token ID > 65535
+
+         Args:
+             text: Original prompt string
+
+         Returns:
+             Compressed bytes (format byte + binary-packed token IDs)
+             Format: [1 byte format flag: 0=uint16, 1=uint32][packed token IDs]
+
+         Example:
+             >>> compressor = PromptCompressor()
+             >>> compressed = compressor.compress_token("Your prompt here")
+             >>> original = compressor.decompress_token(compressed)
+         """
+         # Step 1: Convert text to list of token IDs
+         token_ids = list(self.tokenizer.encode(text))  # Ensure it's a list
+
+         if not token_ids:
+             # Empty token list - return just format byte
+             return struct.pack('B', 0)  # uint16 format
+
+         # Step 2: Determine if we can use uint16 or need uint32
+         # Check if ANY token ID exceeds uint16 range (0-65535)
+         max_token_id = max(token_ids)
+         min_token_id = min(token_ids)
+         use_uint32 = (max_token_id > 65535) or (min_token_id < 0)
+
+         # Step 3: Pack token IDs (format byte + token data)
+         format_byte = 1 if use_uint32 else 0  # 0 = uint16, 1 = uint32
+
+         try:
+             if use_uint32:
+                 # Use uint32 (4 bytes per token) - format 'I'
+                 token_data = struct.pack(f'{len(token_ids)}I', *token_ids)
+             else:
+                 # Use uint16 (2 bytes per token) - format 'H'
+                 # Double-check all IDs fit in uint16 range
+                 if max(token_ids) > 65535:
+                     # Fallback to uint32 if somehow we got here
+                     format_byte = 1
+                     token_data = struct.pack(f'{len(token_ids)}I', *token_ids)
+                 else:
+                     token_data = struct.pack(f'{len(token_ids)}H', *token_ids)
+         except (struct.error, OverflowError) as e:
+             # If packing fails, fallback to uint32
+             format_byte = 1
+             token_data = struct.pack(f'{len(token_ids)}I', *token_ids)
+
+         # Combine format byte with token data
+         binary_payload = struct.pack('B', format_byte) + token_data
+
+         return binary_payload
+
+     def decompress_token(self, binary_payload: bytes) -> str:
+         """
+         Decompress token-based compressed prompt.
+
+         Args:
+             binary_payload: Compressed bytes from compress_token()
+                 Format: [1 byte format flag: 0=uint16, 1=uint32][packed token IDs]
+
+         Returns:
+             Original prompt string
+         """
+         if len(binary_payload) < 1:
+             raise ValueError("Invalid compressed data: missing format byte")
+
+         # Step 1: Read format byte
+         format_byte = struct.unpack('B', binary_payload[0:1])[0]
+         token_data = binary_payload[1:]
+
+         if format_byte == 1:
+             # uint32 format (4 bytes per token)
+             if len(token_data) % 4 != 0:
+                 raise ValueError("Invalid compressed data: uint32 data length not divisible by 4")
+             num_tokens = len(token_data) // 4
+             token_ids = struct.unpack(f'{num_tokens}I', token_data)
+         else:
+             # uint16 format (2 bytes per token)
+             if len(token_data) % 2 != 0:
+                 raise ValueError("Invalid compressed data: uint16 data length not divisible by 2")
+             num_tokens = len(token_data) // 2
+             token_ids = struct.unpack(f'{num_tokens}H', token_data)
+
+         # Step 2: Decode token IDs back to string
+         return self.tokenizer.decode(list(token_ids))
+
+     def compress_hybrid(self, text: str) -> bytes:
+         """
+         Compress prompt using hybrid approach (Token + Zstd).
+
+         This is the most efficient method:
+         1. Tokenizes text to reduce redundancy
+         2. Packs tokens as binary
+         3. Applies Zstd compression on the binary data
+
+         Provides the best compression ratio for database storage.
+
+         Args:
+             text: Original prompt string
+
+         Returns:
+             Compressed bytes
+
+         Example:
+             >>> compressor = PromptCompressor()
+             >>> compressed = compressor.compress_hybrid("Your prompt here")
+             >>> original = compressor.decompress_hybrid(compressed)
+         """
+         # Step 1: Tokenize
+         tokens = list(self.tokenizer.encode(text))  # Ensure it's a list
+
+         if not tokens:
+             # Empty token list - return compressed empty data
+             empty_data = struct.pack('B', 0)  # uint16 format
+             return zstd.compress(empty_data, level=self.zstd_level)
+
+         # Step 2: Convert to binary (determine uint16 or uint32)
+         max_token_id = max(tokens)
+         min_token_id = min(tokens)
+         use_uint32 = (max_token_id > 65535) or (min_token_id < 0)
+
+         format_byte = 1 if use_uint32 else 0  # 0 = uint16, 1 = uint32
+
+         try:
+             if use_uint32:
+                 # Use uint32 (4 bytes per token)
+                 token_data = struct.pack('B', format_byte) + struct.pack(f'{len(tokens)}I', *tokens)
+             else:
+                 # Use uint16 (2 bytes per token)
+                 # Double-check all IDs fit in uint16 range
+                 if max(tokens) > 65535:
+                     # Fallback to uint32 if somehow we got here
+                     format_byte = 1
+                     token_data = struct.pack('B', format_byte) + struct.pack(f'{len(tokens)}I', *tokens)
+                 else:
+                     token_data = struct.pack('B', format_byte) + struct.pack(f'{len(tokens)}H', *tokens)
+         except (struct.error, OverflowError) as e:
+             # If packing fails, fallback to uint32
+             format_byte = 1
+             token_data = struct.pack('B', format_byte) + struct.pack(f'{len(tokens)}I', *tokens)
+
+         # Step 3: Final Zstd compression
+         compressed_blob = zstd.compress(token_data, level=self.zstd_level)
+
+         return compressed_blob
+
+     def decompress_hybrid(self, blob: bytes) -> str:
+         """
+         Decompress hybrid-compressed prompt.
+
+         Args:
+             blob: Compressed bytes from compress_hybrid()
+
+         Returns:
+             Original prompt string
+         """
+         # Step 1: Decompress Zstd
+         token_data = zstd.decompress(blob)
+
+         if len(token_data) < 1:
+             raise ValueError("Invalid compressed data: missing format byte")
+
+         # Step 2: Read format byte and unpack token IDs
+         format_byte = struct.unpack('B', token_data[0:1])[0]
+         packed_data = token_data[1:]
+
+         if format_byte == 1:
+             # uint32 format (4 bytes per token)
+             if len(packed_data) % 4 != 0:
+                 raise ValueError("Invalid compressed data: uint32 data length not divisible by 4")
+             num_tokens = len(packed_data) // 4
+             tokens = struct.unpack(f'{num_tokens}I', packed_data)
+         else:
+             # uint16 format (2 bytes per token)
+             if len(packed_data) % 2 != 0:
+                 raise ValueError("Invalid compressed data: uint16 data length not divisible by 2")
+             num_tokens = len(packed_data) // 2
+             tokens = struct.unpack(f'{num_tokens}H', packed_data)
+
+         # Step 3: Decode to text
+         return self.tokenizer.decode(list(tokens))
+
+     def compress(
+         self,
+         text: str,
+         method: CompressionMethod = CompressionMethod.HYBRID
+     ) -> bytes:
+         """
+         Compress prompt using the specified method.
+
+         Args:
+             text: Original prompt string
+             method: Compression method to use (default: HYBRID)
+
+         Returns:
+             Compressed bytes
+
+         Example:
+             >>> compressor = PromptCompressor()
+             >>> compressed = compressor.compress("Your prompt", CompressionMethod.HYBRID)
+             >>> original = compressor.decompress(compressed, CompressionMethod.HYBRID)
+         """
+         if method == CompressionMethod.ZSTD:
+             return self.compress_zstd(text)
+         elif method == CompressionMethod.TOKEN:
+             return self.compress_token(text)
+         elif method == CompressionMethod.HYBRID:
+             return self.compress_hybrid(text)
+         else:
+             raise ValueError(f"Unknown compression method: {method}")
+
+     def decompress(
+         self,
+         compressed_data: bytes,
+         method: CompressionMethod = CompressionMethod.HYBRID
+     ) -> str:
+         """
+         Decompress prompt using the specified method.
+
+         Args:
+             compressed_data: Compressed bytes
+             method: Compression method used for compression
+
+         Returns:
+             Original prompt string
+         """
+         if method == CompressionMethod.ZSTD:
+             return self.decompress_zstd(compressed_data)
+         elif method == CompressionMethod.TOKEN:
+             return self.decompress_token(compressed_data)
+         elif method == CompressionMethod.HYBRID:
+             return self.decompress_hybrid(compressed_data)
+         else:
+             raise ValueError(f"Unknown compression method: {method}")
+
+     def compress_and_return_both(
+         self,
+         text: str,
+         method: CompressionMethod = CompressionMethod.HYBRID
+     ) -> Tuple[str, bytes]:
+         """
+         Compress prompt and return both original and compressed versions.
+
+         Args:
+             text: Original prompt string
+             method: Compression method to use (default: HYBRID)
+
+         Returns:
+             Tuple of (original_prompt, compressed_bytes)
+
+         Example:
+             >>> compressor = PromptCompressor()
+             >>> original, compressed = compressor.compress_and_return_both("Your prompt")
+         """
+         compressed = self.compress(text, method)
+         return (text, compressed)
+
+     def get_compression_stats(
+         self,
+         text: str,
+         method: Optional[CompressionMethod] = None
+     ) -> dict:
+         """
+         Get compression statistics for a given prompt.
+
+         Args:
+             text: Original prompt string
+             method: Compression method to analyze (None = all methods)
+
+         Returns:
+             Dictionary with compression statistics
+         """
+         methods = [method] if method else [
+             CompressionMethod.ZSTD,
+             CompressionMethod.TOKEN,
+             CompressionMethod.HYBRID
+         ]
+
+         original_size = len(text.encode('utf-8'))
+         stats = {
+             'original_size_bytes': original_size,
+             'original_size_tokens': len(self.tokenizer.encode(text)),
+             'methods': {}
+         }
+
+         for m in methods:
+             compressed = self.compress(text, m)
+             compressed_size = len(compressed)
+             compression_ratio = compressed_size / original_size if original_size > 0 else 0
+             space_saved = 1 - compression_ratio
+
+             stats['methods'][m.value] = {
+                 'compressed_size_bytes': compressed_size,
+                 'compression_ratio': compression_ratio,
+                 'space_saved_percent': space_saved * 100,
+                 'bytes_saved': original_size - compressed_size
+             }
+
+         return stats
+
+     def calculate_shannon_entropy(self, text: str, unit: str = 'character') -> float:
+         """
+         Calculate Shannon Entropy of the input text.
+
+         Shannon Entropy formula: H(X) = -∑ P(x_i) * log₂(P(x_i))
+
+         This determines the theoretical lower limit of compression based on
+         character/byte frequency distribution.
+
+         Args:
+             text: Input text to analyze
+             unit: Unit to analyze ('character' or 'byte')
+                 - 'character': Analyze individual characters
+                 - 'byte': Analyze bytes (for binary data)
+
+         Returns:
+             Shannon entropy in bits
+
+         Example:
+             >>> compressor = PromptCompressor()
+             >>> entropy = compressor.calculate_shannon_entropy("Hello world")
+             >>> print(f"Theoretical compression limit: {entropy:.2f} bits per character")
+         """
+         if not text:
+             return 0.0
+
+         if unit == 'byte':
+             # Analyze bytes
+             data = text.encode('utf-8')
+             frequencies = Counter(data)
+         else:  # unit == 'character'
+             # Analyze characters
+             frequencies = Counter(text)
+
+         # Calculate probabilities
+         length = len(text) if unit == 'character' else len(data)
+         probabilities = [count / length for count in frequencies.values()]
+
+         # Calculate Shannon Entropy: H(X) = -∑ P(x_i) * log₂(P(x_i))
+         entropy = -sum(p * math.log2(p) for p in probabilities if p > 0)
+
+         return entropy
+
+     def get_theoretical_compression_limit(self, text: str, unit: str = 'character') -> Dict[str, float]:
+         """
+         Calculate theoretical compression limit using Shannon Entropy.
+
+         This provides the theoretical minimum size achievable through entropy coding.
+
+         Args:
+             text: Input text to analyze
+             unit: Unit to analyze ('character' or 'byte')
+
+         Returns:
+             Dictionary with theoretical limits:
+             - entropy_bits_per_unit: Shannon entropy in bits
+             - theoretical_min_bits: Minimum total bits needed
+             - theoretical_min_bytes: Minimum bytes needed (theoretical limit)
+             - original_size_bytes: Original size in bytes
+             - theoretical_compression_ratio: Theoretical best compression ratio
+
+         Example:
+             >>> compressor = PromptCompressor()
+             >>> limits = compressor.get_theoretical_compression_limit("Your prompt")
+             >>> print(f"Theoretical minimum: {limits['theoretical_min_bytes']:.2f} bytes")
+         """
+         if not text:
+             return {
+                 'entropy_bits_per_unit': 0.0,
+                 'theoretical_min_bits': 0.0,
+                 'theoretical_min_bytes': 0.0,
+                 'original_size_bytes': 0.0,
+                 'theoretical_compression_ratio': 0.0
+             }
+
+         # Calculate Shannon Entropy
+         entropy = self.calculate_shannon_entropy(text, unit)
+
+         # Calculate theoretical minimums
+         if unit == 'byte':
+             num_units = len(text.encode('utf-8'))
+             original_size_bytes = len(text.encode('utf-8'))
+         else:  # character
+             num_units = len(text)
+             original_size_bytes = len(text.encode('utf-8'))
+
+         theoretical_min_bits = entropy * num_units
+         theoretical_min_bytes = theoretical_min_bits / 8.0
+
+         # Theoretical compression ratio
+         theoretical_compression_ratio = (
+             theoretical_min_bytes / original_size_bytes
+             if original_size_bytes > 0 else 0.0
+         )
+
+         return {
+             'entropy_bits_per_unit': entropy,
+             'theoretical_min_bits': theoretical_min_bits,
+             'theoretical_min_bytes': theoretical_min_bytes,
+             'original_size_bytes': original_size_bytes,
+             'theoretical_compression_ratio': theoretical_compression_ratio,
+             'theoretical_space_savings_percent': (1 - theoretical_compression_ratio) * 100
+         }
lopace-0.1.0.dist-info/METADATA ADDED
@@ -0,0 +1,402 @@
+ Metadata-Version: 2.4
+ Name: lopace
+ Version: 0.1.0
+ Summary: Lossless Optimized Prompt Accurate Compression Engine
+ Home-page: https://github.com/connectaman/LoPace
+ Author: Aman Ulla
+ License: MIT
+ Project-URL: Homepage, https://github.com/amanulla/lopace
+ Project-URL: Repository, https://github.com/amanulla/lopace
+ Project-URL: Issues, https://github.com/amanulla/lopace/issues
+ Keywords: prompt,compression,tokenization,zstd,bpe,nlp
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Classifier: Topic :: Text Processing :: Linguistic
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: zstandard>=0.22.0
+ Requires-Dist: tiktoken>=0.5.0
+ Dynamic: home-page
+ Dynamic: license-file
+ Dynamic: requires-python
+
+ # LoPace
+
+ **Lossless Optimized Prompt Accurate Compression Engine**
+
+ A professional, open-source Python package for compressing and decompressing prompts using multiple techniques: Zstd, Token-based (BPE), and Hybrid methods. Achieve up to 80% space reduction while maintaining perfect lossless reconstruction.
+
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+ [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
+
+ ## Features
+
+ - 🚀 **Three Compression Methods**:
+   - **Zstd**: Dictionary-based compression using Zstandard algorithm
+   - **Token**: Byte-Pair Encoding (BPE) tokenization with binary packing
+   - **Hybrid**: Combination of tokenization and Zstd (best compression ratio)
+
+ - ✅ **Lossless**: Perfect reconstruction of original prompts
+ - 📊 **Compression Statistics**: Analyze compression ratios and space savings
+ - 🔧 **Simple API**: Easy-to-use interface for all compression methods
+ - 🎯 **Database-Ready**: Optimized for storing prompts in databases
+
+ ## Installation
+
+ ```bash
+ pip install lopace
+ ```
+
+ ### Dependencies
+
+ - `zstandard>=0.22.0` - For Zstd compression
+ - `tiktoken>=0.5.0` - For BPE tokenization
+
+ ## Quick Start
+
+ ```python
+ from lopace import PromptCompressor, CompressionMethod
+
+ # Initialize compressor
+ compressor = PromptCompressor(model="cl100k_base", zstd_level=15)
+
+ # Your prompt
+ prompt = "You are a helpful AI assistant..."
+
+ # Compress using hybrid method (recommended)
+ compressed = compressor.compress(prompt, CompressionMethod.HYBRID)
+
+ # Decompress back to original
+ original = compressor.decompress(compressed, CompressionMethod.HYBRID)
+
+ # Verify losslessness
+ assert original == prompt  # ✓ True
+ ```
+
+ ## Usage Examples
+
+ ### Basic Compression/Decompression
+
+ ```python
+ from lopace import PromptCompressor, CompressionMethod
+
+ compressor = PromptCompressor()
+
+ # Compress and return both original and compressed
+ original, compressed = compressor.compress_and_return_both(
+     "Your prompt here",
+     CompressionMethod.HYBRID
+ )
+
+ # Decompress
+ recovered = compressor.decompress(compressed, CompressionMethod.HYBRID)
+ ```
+
+ ### Using Different Methods
+
+ ```python
+ compressor = PromptCompressor()
+
+ prompt = "Your system prompt here..."
+
+ # Method 1: Zstd only
+ zstd_compressed = compressor.compress_zstd(prompt)
+ zstd_decompressed = compressor.decompress_zstd(zstd_compressed)
+
+ # Method 2: Token-based (BPE)
+ token_compressed = compressor.compress_token(prompt)
+ token_decompressed = compressor.decompress_token(token_compressed)
+
+ # Method 3: Hybrid (recommended - best compression)
+ hybrid_compressed = compressor.compress_hybrid(prompt)
+ hybrid_decompressed = compressor.decompress_hybrid(hybrid_compressed)
+ ```
+
+ ### Get Compression Statistics
+
+ ```python
+ compressor = PromptCompressor()
+ prompt = "Your long system prompt..."
+
+ # Get stats for all methods
+ stats = compressor.get_compression_stats(prompt)
+
+ print(f"Original Size: {stats['original_size_bytes']} bytes")
+ print(f"Original Tokens: {stats['original_size_tokens']}")
+
+ for method, method_stats in stats['methods'].items():
+     print(f"\n{method}:")
+     print(f" Compressed: {method_stats['compressed_size_bytes']} bytes")
+     print(f" Space Saved: {method_stats['space_saved_percent']:.2f}%")
+ ```
+
+ ## Compression Methods Explained
+
+ ### 1. Zstd Compression
+
+ Uses Zstandard's dictionary-based algorithm to find repeated patterns and replace them with shorter references.
+
+ **Best for**: General text compression, when tokenization overhead is not needed.
+
+ ```python
+ compressed = compressor.compress_zstd(prompt)
+ original = compressor.decompress_zstd(compressed)
+ ```
+
+ ### 2. Token-Based Compression
+
+ Uses Byte-Pair Encoding (BPE) to convert text to token IDs, then packs them as binary data.
+
+ **Best for**: When you need token IDs anyway, or when working with LLM tokenizers.
+
+ ```python
+ compressed = compressor.compress_token(prompt)
+ original = compressor.decompress_token(compressed)
+ ```
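+
+ As an illustration of the payload layout used by the token method (a one-byte width flag followed by the packed IDs), here is a minimal sketch using only `tiktoken` and `struct`; it is an independent example, not a call into LoPace's internals:
+
+ ```python
+ import struct
+ import tiktoken
+
+ enc = tiktoken.get_encoding("cl100k_base")
+ ids = enc.encode("Your prompt here")
+
+ # Pick the narrowest unsigned width that holds every ID (flag 0 = uint16, 1 = uint32).
+ fmt, flag = ("H", 0) if max(ids) <= 0xFFFF else ("I", 1)
+ payload = struct.pack("B", flag) + struct.pack(f"{len(ids)}{fmt}", *ids)
+
+ # Decode: read the flag, unpack at the matching width, then detokenize.
+ width = 2 if payload[0] == 0 else 4
+ ids_back = struct.unpack(f"{(len(payload) - 1) // width}{'H' if width == 2 else 'I'}", payload[1:])
+ assert enc.decode(list(ids_back)) == "Your prompt here"
+ ```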
+
+ ### 3. Hybrid Compression (Recommended)
+
+ Combines tokenization and Zstd compression for maximum efficiency:
+
+ 1. Tokenizes text to reduce redundancy
+ 2. Packs tokens as binary (2 bytes per token, or 4 bytes when any token ID exceeds 65535)
+ 3. Applies Zstd compression on the binary data
+
+ **Best for**: Database storage where maximum compression is needed.
+
+ ```python
+ compressed = compressor.compress_hybrid(prompt)
+ original = compressor.decompress_hybrid(compressed)
+ ```
+
+ ## API Reference
+
+ ### `PromptCompressor`
+
+ Main compressor class.
+
+ #### Constructor
+
+ ```python
+ PromptCompressor(
+     model: str = "cl100k_base",
+     zstd_level: int = 15
+ )
+ ```
+
+ **Parameters:**
+ - `model`: Tokenizer model name (default: `"cl100k_base"`)
+   - Options: `"cl100k_base"`, `"p50k_base"`, `"r50k_base"`, `"gpt2"`, etc.
+ - `zstd_level`: Zstd compression level 1-22 (default: `15`)
+   - Higher = better compression but slower
+
+ #### Methods
+
+ ##### `compress(text: str, method: CompressionMethod) -> bytes`
+
+ Compress a prompt using the specified method.
+
+ ##### `decompress(compressed_data: bytes, method: CompressionMethod) -> str`
+
+ Decompress a compressed prompt.
+
+ ##### `compress_and_return_both(text: str, method: CompressionMethod) -> Tuple[str, bytes]`
+
+ Compress and return both original and compressed versions.
+
+ ##### `get_compression_stats(text: str, method: Optional[CompressionMethod]) -> dict`
+
+ Get detailed compression statistics for analysis.
+
+ ### `CompressionMethod`
+
+ Enumeration of available compression methods:
+
+ - `CompressionMethod.ZSTD` - Zstandard compression
+ - `CompressionMethod.TOKEN` - Token-based compression
+ - `CompressionMethod.HYBRID` - Hybrid compression (recommended)
+
+ ## How It Works
+
+ ### Compression Pipeline (Hybrid Method)
+
+ ```
+ Input: Raw System Prompt String (100%)
+         ↓
+ Tokenization: Convert to Tiktoken IDs (~70% reduced)
+         ↓
+ Binary Packing: Convert IDs to uint16 (~50% of above)
+         ↓
+ Zstd: Final compression (~30% further reduction)
+         ↓
+ Output: Compressed Binary Blob
+ ```
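+
+ A rough sketch of the same pipeline using `tiktoken`, `struct`, and `zstandard` directly (an illustration, not LoPace's internals; actual sizes depend on the prompt):
+
+ ```python
+ import struct
+ import tiktoken
+ import zstandard
+
+ prompt = "You are a helpful AI assistant..." * 20      # illustrative input
+ enc = tiktoken.get_encoding("cl100k_base")
+
+ ids = enc.encode(prompt)                                # 1. tokenize
+ fmt = "H" if max(ids) <= 0xFFFF else "I"                # 2. pack as uint16 when the IDs fit
+ packed = struct.pack(f"{len(ids)}{fmt}", *ids)
+ blob = zstandard.compress(packed, level=15)             # 3. final Zstd pass
+
+ # Compare raw UTF-8 size, packed size, and final compressed size.
+ print(len(prompt.encode("utf-8")), len(packed), len(blob))
+ ```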
+
+ ### Why Hybrid is Best for Databases
+
+ 1. **Searchability**: Token IDs can be searched without full decompression
+ 2. **Consistency**: Fixed tokenizer ensures stable compression ratios
+ 3. **Efficiency**: Maximum space savings for millions of prompts
+
+ ## Example Output
+
+ ```python
+ # Original prompt: 500 bytes
+ # After compression:
+ #   Zstd:   180 bytes (64% space saved)
+ #   Token:  240 bytes (52% space saved)
+ #   Hybrid: 120 bytes (76% space saved)  ← Best!
+ ```
+
+ ## Running the Example
+
+ ```bash
+ python example.py
+ ```
+
+ This will demonstrate all compression methods and show statistics.
+
+ ## Interactive Web App (Streamlit)
+
+ LoPace includes an interactive Streamlit web application with comprehensive evaluation metrics:
+
+ ### Features
+
+ - **Interactive Interface**: Enter prompts and see real-time compression results
+ - **Comprehensive Metrics**: All four industry-standard metrics (see the sketch after this list):
+   - Compression Ratio (CR): $CR = \frac{S_{original}}{S_{compressed}}$
+   - Space Savings (SS): $SS = 1 - \frac{S_{compressed}}{S_{original}}$
+   - Bits Per Character (BPC): $BPC = \frac{\text{Total Bits}}{\text{Total Characters}}$
+   - Throughput (MB/s): $T = \frac{\text{Data Size}}{\text{Time}}$
+ - **Lossless Verification**:
+   - SHA-256 Hash Verification
+   - Exact Match (Character-by-Character)
+   - Reconstruction Error: $E = \frac{1}{N} \sum_{i=1}^{N} \mathbb{1}(x_i \neq \hat{x}_i) = 0$
+ - **Side-by-Side Comparison**: Compare all three compression methods
+ - **Real-time Configuration**: Adjust tokenizer model and Zstd level
+
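+ A minimal sketch of how these metrics can be computed for a single compression run (variable names here are illustrative, not the app's internals):
+
+ ```python
+ import hashlib
+ import time
+ from lopace import PromptCompressor, CompressionMethod
+
+ compressor = PromptCompressor()
+ prompt = "Your long system prompt..." * 50
+
+ start = time.perf_counter()
+ blob = compressor.compress(prompt, CompressionMethod.HYBRID)
+ elapsed = time.perf_counter() - start
+
+ original = prompt.encode("utf-8")
+ cr = len(original) / len(blob)                        # Compression Ratio
+ ss = 1 - len(blob) / len(original)                    # Space Savings
+ bpc = (len(blob) * 8) / len(prompt)                   # compressed bits per original character
+ throughput = (len(original) / 1_000_000) / elapsed    # MB/s
+
+ # Lossless verification: exact match and SHA-256 hash comparison.
+ restored = compressor.decompress(blob, CompressionMethod.HYBRID)
+ assert restored == prompt
+ assert hashlib.sha256(restored.encode("utf-8")).digest() == hashlib.sha256(original).digest()
+ ```
+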
+ ### Running the Streamlit App
+
+ ```bash
+ streamlit run streamlit_app.py
+ ```
+
+ The app will open in your default web browser at `http://localhost:8501`
+
+ ### Screenshot Preview
+
+ The app features:
+ - **Left Panel**: Text input area for entering prompts
+ - **Right Panel**: Results with tabs for each compression method
+ - **Metrics Dashboard**: Real-time calculation of all evaluation metrics
+ - **Verification Section**: Hash matching and exact match verification
+ - **Comparison Table**: Side-by-side comparison of all methods
+
+ ## Development
+
+ ### Setup Development Environment
+
+ ```bash
+ git clone https://github.com/amanulla/lopace.git
+ cd lopace
+ pip install -r requirements-dev.txt
+ ```
+
+ ### Running Tests
+
+ ```bash
+ pytest
+ ```
+
+ ### CI/CD Pipeline
+
+ This project uses GitHub Actions for automated testing and publishing:
+
+ - **Tests run automatically** on every push and pull request
+ - **Publishing to PyPI** happens automatically when:
+   - All tests pass ✅
+   - Push is to `main`/`master` branch or a version tag (e.g., `v0.1.0`)
+
+ See [.github/workflows/README.md](.github/workflows/README.md) for detailed setup instructions.
+
+ ## Mathematical Background
+
+ ### Compression Techniques Used
+
+ LoPace uses the following compression techniques:
+
+ 1. **LZ77 (Sliding Window)**: Used **indirectly** through Zstandard (see the sketch after this list)
+    - Zstandard internally uses LZ77-style algorithms to find repeated patterns
+    - Instead of storing "assistant" again, it stores a tuple: (distance_back, length)
+    - We use this by calling `zstandard.compress()` - the LZ77 is handled internally
+
+ 2. **Huffman Coding / FSE (Finite State Entropy)**: Used **indirectly** through Zstandard
+    - Zstandard uses Huffman coding for literals and FSE (an entropy coder based on asymmetric numeral systems) for sequence symbols
+    - Assigns shorter binary codes to characters/patterns that appear most frequently
+    - Again, handled internally by the zstandard library
+
+ 3. **BPE Tokenization**: Used **directly** via tiktoken
+    - Byte-Pair Encoding converts text to token IDs
+    - Reduces vocabulary size before compression
+    - Implemented by OpenAI's tiktoken library
+
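+ As a small illustrative check of the sliding-window claim in item 1 (not part of LoPace), repetitive prompt text compresses far better than patternless data of the same length:
+
+ ```python
+ import os
+ import zstandard
+
+ repetitive = b"You are a helpful assistant. " * 200   # many repeated patterns for LZ77 to match
+ noise = os.urandom(len(repetitive))                    # random bytes of the same length
+
+ print(len(zstandard.compress(repetitive, level=15)))   # far smaller than the input
+ print(len(zstandard.compress(noise, level=15)))        # roughly the input size: nothing to match
+ ```
+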
+ ### Shannon Entropy
+
+ The theoretical compression limit is determined by Shannon Entropy:
+
+ $H(X) = -\sum_{i=1}^{n} P(x_i) \log_2 P(x_i)$
+
+ Where:
+ - $H(X)$ is the entropy of the source
+ - $P(x_i)$ is the probability of character/pattern $x_i$
+
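+ As a quick worked example, for the 3-character string "aab" the probabilities are $P(a) = \tfrac{2}{3}$ and $P(b) = \tfrac{1}{3}$, giving $H \approx 0.918$ bits per character, so the entropy-coding floor for the whole string is about $2.75$ bits, or roughly $0.34$ bytes.
+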
+ LoPace **calculates** Shannon Entropy to show theoretical compression limits:
+
+ ```python
+ compressor = PromptCompressor()
+ entropy = compressor.calculate_shannon_entropy("Your prompt")
+ limits = compressor.get_theoretical_compression_limit("Your prompt")
+ print(f"Theoretical minimum: {limits['theoretical_min_bytes']:.2f} bytes")
+ ```
+
+ This allows you to compare actual compression against the theoretical limit.
+
+ ## License
+
+ MIT License - see [LICENSE](LICENSE) file for details.
+
+ ## Contributing
+
+ Contributions are welcome! We appreciate your help in making LoPace better.
+
+ Please read our [Contributing Guidelines](CONTRIBUTING.md) and [Code of Conduct](CODE_OF_CONDUCT.md) before contributing.
+
+ ### Quick Start for Contributors
+
+ 1. Fork the repository
+ 2. Create a feature branch (`git checkout -b feature/amazing-feature`)
+ 3. Make your changes
+ 4. Run tests (`pytest tests/ -v`)
+ 5. Commit your changes (`git commit -m 'Add amazing feature'`)
+ 6. Push to the branch (`git push origin feature/amazing-feature`)
+ 7. Open a Pull Request
+
+ For more details, see [CONTRIBUTING.md](CONTRIBUTING.md).
+
+ ## Author
+
+ Aman Ulla
+
+ ## Acknowledgments
+
+ - Built on top of [zstandard](https://github.com/facebook/zstd) and [tiktoken](https://github.com/openai/tiktoken)
+ - Inspired by the need for efficient prompt storage in LLM applications
lopace-0.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,9 @@
+ lopace/__init__.py,sha256=PYjZWZHhSITNgag9sF0qZ_yXgZaMa3R8_3FuasiH0Nc,351
+ lopace/compressor.py,sha256=nUTWDcAPYvQaeSFKx_lne-D2xIQ02IMVGE4yLODo8qE,19060
+ lopace-0.1.0.dist-info/licenses/LICENSE,sha256=uFUrlsfsOwx_8Nzhq2pUgNaJghcJxXBMML3l7T39Tm0,1067
+ tests/__init__.py,sha256=yXNVJE20E2iHo0qbit5SgRE35eXWq89F1kkhNHy7VJA,31
+ tests/test_compressor.py,sha256=-vMztSzY89n5dpShcACrFboEQOlfJ6FxF7eQOEU3swM,8273
+ lopace-0.1.0.dist-info/METADATA,sha256=yXy0jt23uvVWkGlEeCb8KEUSx1_o3N02wZZEFj5weEI,12199
+ lopace-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ lopace-0.1.0.dist-info/top_level.txt,sha256=8CLB5czxmmAfR7ayh3TO5qyB1-xJoYNxabufJ37Xh5o,13
+ lopace-0.1.0.dist-info/RECORD,,
lopace-0.1.0.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.9.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
lopace-0.1.0.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2026 Aman Ulla
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
lopace-0.1.0.dist-info/top_level.txt ADDED
@@ -0,0 +1,2 @@
+ lopace
+ tests
tests/__init__.py ADDED
@@ -0,0 +1 @@
+ """Tests for LoPace package."""
tests/test_compressor.py ADDED
@@ -0,0 +1,198 @@
+ """Tests for PromptCompressor class."""
+
+ import pytest
+ from lopace import PromptCompressor, CompressionMethod
+
+
+ @pytest.fixture
+ def compressor():
+     """Create a PromptCompressor instance for testing."""
+     return PromptCompressor(model="cl100k_base", zstd_level=15)
+
+
+ @pytest.fixture
+ def sample_prompt():
+     """Sample prompt for testing."""
+     return """You are a helpful AI assistant designed to provide accurate,
+ detailed, and helpful responses to user queries. Your goal is to assist users
+ by understanding their questions and providing relevant information."""
+
+
+ class TestZstdCompression:
+     """Test Zstd compression/decompression."""
+
+     def test_compress_decompress_zstd(self, compressor, sample_prompt):
+         """Test that Zstd compression is lossless."""
+         compressed = compressor.compress_zstd(sample_prompt)
+         decompressed = compressor.decompress_zstd(compressed)
+         assert decompressed == sample_prompt
+
+     def test_zstd_compression_ratio(self, compressor, sample_prompt):
+         """Test that Zstd actually compresses."""
+         compressed = compressor.compress_zstd(sample_prompt)
+         original_size = len(sample_prompt.encode('utf-8'))
+         compressed_size = len(compressed)
+         assert compressed_size < original_size
+
+
+ class TestTokenCompression:
+     """Test Token-based compression/decompression."""
+
+     def test_compress_decompress_token(self, compressor, sample_prompt):
+         """Test that Token compression is lossless."""
+         compressed = compressor.compress_token(sample_prompt)
+         decompressed = compressor.decompress_token(compressed)
+         assert decompressed == sample_prompt
+
+     def test_token_binary_format(self, compressor, sample_prompt):
+         """Test that token compression produces binary data with format byte."""
+         compressed = compressor.compress_token(sample_prompt)
+         assert isinstance(compressed, bytes)
+         # Should have at least format byte (1 byte)
+         assert len(compressed) >= 1
+         # Format byte should be 0 (uint16) or 1 (uint32)
+         import struct
+         format_byte = struct.unpack('B', compressed[0:1])[0]
+         assert format_byte in [0, 1]
+
+
+ class TestHybridCompression:
+     """Test Hybrid compression/decompression."""
+
+     def test_compress_decompress_hybrid(self, compressor, sample_prompt):
+         """Test that Hybrid compression is lossless."""
+         compressed = compressor.compress_hybrid(sample_prompt)
+         decompressed = compressor.decompress_hybrid(compressed)
+         assert decompressed == sample_prompt
+
+     def test_hybrid_compression(self, compressor, sample_prompt):
+         """Test that Hybrid compression works and compares with other methods."""
+         zstd_compressed = compressor.compress_zstd(sample_prompt)
+         token_compressed = compressor.compress_token(sample_prompt)
+         hybrid_compressed = compressor.compress_hybrid(sample_prompt)
+
+         # All methods should compress (smaller than original for longer prompts)
+         original_size = len(sample_prompt.encode('utf-8'))
+
+         # Verify all compression methods produce valid output
+         assert len(zstd_compressed) > 0
+         assert len(token_compressed) > 0
+         assert len(hybrid_compressed) > 0
+
+         # For very long prompts (>500 chars), hybrid should typically be better than token alone
+         # But for short prompts, Zstd overhead can make hybrid larger
+         if len(sample_prompt) > 500:
+             # On very long prompts, hybrid should generally compress well
+             assert len(hybrid_compressed) < original_size
+
+         # Zstd alone should compress for longer prompts
+         if len(sample_prompt) > 100:
+             assert len(zstd_compressed) < original_size
+
+
+ class TestGenericMethods:
+     """Test generic compress/decompress methods."""
+
+     def test_compress_with_method(self, compressor, sample_prompt):
+         """Test generic compress method with all methods."""
+         for method in CompressionMethod:
+             compressed = compressor.compress(sample_prompt, method)
+             assert isinstance(compressed, bytes)
+             assert len(compressed) > 0
+
+     def test_decompress_with_method(self, compressor, sample_prompt):
+         """Test generic decompress method with all methods."""
+         for method in CompressionMethod:
+             compressed = compressor.compress(sample_prompt, method)
+             decompressed = compressor.decompress(compressed, method)
+             assert decompressed == sample_prompt
+
+     def test_compress_and_return_both(self, compressor, sample_prompt):
+         """Test compress_and_return_both method."""
+         original, compressed = compressor.compress_and_return_both(
+             sample_prompt,
+             CompressionMethod.HYBRID
+         )
+         assert original == sample_prompt
+         assert isinstance(compressed, bytes)
+         assert len(compressed) > 0
+
+
+ class TestCompressionStats:
+     """Test compression statistics."""
+
+     def test_get_compression_stats_all_methods(self, compressor, sample_prompt):
+         """Test getting stats for all methods."""
+         stats = compressor.get_compression_stats(sample_prompt)
+
+         assert 'original_size_bytes' in stats
+         assert 'original_size_tokens' in stats
+         assert 'methods' in stats
+
+         assert len(stats['methods']) == 3  # ZSTD, TOKEN, HYBRID
+
+         for method_name, method_stats in stats['methods'].items():
+             assert 'compressed_size_bytes' in method_stats
+             assert 'compression_ratio' in method_stats
+             assert 'space_saved_percent' in method_stats
+             assert 'bytes_saved' in method_stats
+
+     def test_get_compression_stats_single_method(self, compressor, sample_prompt):
+         """Test getting stats for a single method."""
+         stats = compressor.get_compression_stats(
+             sample_prompt,
+             CompressionMethod.HYBRID
+         )
+
+         assert len(stats['methods']) == 1
+         assert CompressionMethod.HYBRID.value in stats['methods']
+
+
+ class TestEdgeCases:
+     """Test edge cases and error handling."""
+
+     def test_empty_string(self, compressor):
+         """Test compression of empty string."""
+         for method in CompressionMethod:
+             compressed = compressor.compress("", method)
+             decompressed = compressor.decompress(compressed, method)
+             assert decompressed == ""
+
+     def test_large_token_ids(self, compressor):
+         """Test compression with token IDs that exceed uint16 range."""
+         # Create a prompt that might trigger large token IDs
+         # Using various special characters and unicode
+         large_prompt = "Hello " * 1000 + "世界 🌍 مرحبا " * 100
+
+         # This should work without error, handling uint32 if needed
+         compressed = compressor.compress_token(large_prompt)
+         decompressed = compressor.decompress_token(compressed)
+         assert decompressed == large_prompt
+
+         # Test hybrid method too
+         compressed_hybrid = compressor.compress_hybrid(large_prompt)
+         decompressed_hybrid = compressor.decompress_hybrid(compressed_hybrid)
+         assert decompressed_hybrid == large_prompt
+
+     def test_single_character(self, compressor):
+         """Test compression of single character."""
+         for method in CompressionMethod:
+             compressed = compressor.compress("a", method)
+             decompressed = compressor.decompress(compressed, method)
+             assert decompressed == "a"
+
+     def test_unicode_characters(self, compressor):
+         """Test compression of unicode characters."""
+         unicode_prompt = "你好世界 🌍 مرحبا"
+         for method in CompressionMethod:
+             compressed = compressor.compress(unicode_prompt, method)
+             decompressed = compressor.decompress(compressed, method)
+             assert decompressed == unicode_prompt
+
+     def test_invalid_zstd_level(self):
+         """Test that invalid zstd_level raises error."""
+         with pytest.raises(ValueError):
+             PromptCompressor(zstd_level=0)
+
+         with pytest.raises(ValueError):
+             PromptCompressor(zstd_level=23)