simple-compression 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- simple_compression-0.1.0/LICENSE +22 -0
- simple_compression-0.1.0/PKG-INFO +97 -0
- simple_compression-0.1.0/README.md +83 -0
- simple_compression-0.1.0/pyproject.toml +29 -0
- simple_compression-0.1.0/setup.cfg +4 -0
- simple_compression-0.1.0/simple_compression/__init__.py +4 -0
- simple_compression-0.1.0/simple_compression/algorithms/__init__.py +3 -0
- simple_compression-0.1.0/simple_compression/algorithms/lz77.py +123 -0
- simple_compression-0.1.0/simple_compression/algorithms/rle.py +77 -0
- simple_compression-0.1.0/simple_compression/compression.py +93 -0
- simple_compression-0.1.0/simple_compression/probe.py +35 -0
- simple_compression-0.1.0/simple_compression.egg-info/PKG-INFO +97 -0
- simple_compression-0.1.0/simple_compression.egg-info/SOURCES.txt +23 -0
- simple_compression-0.1.0/simple_compression.egg-info/dependency_links.txt +1 -0
- simple_compression-0.1.0/simple_compression.egg-info/top_level.txt +1 -0
- simple_compression-0.1.0/tests/test_auto_pipeline.py +41 -0
- simple_compression-0.1.0/tests/test_lz77.py +12 -0
- simple_compression-0.1.0/tests/test_manual_pipeline.py +31 -0
- simple_compression-0.1.0/tests/test_rle.py +11 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Malek Yehya (th3f0rk)
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the “Software”), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
22
|
+
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: simple-compression
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A byte-level compression library implementing RLE and LZ77 with explicit encoding formats, manual pipelines, and automatic algorithm selection.
|
|
5
|
+
Author: Malek Yehya
|
|
6
|
+
License: MIT
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Requires-Python: >=3.9
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Dynamic: license-file
|
|
14
|
+
|
|
15
|
+
# simple-compression
|
|
16
|
+
|
|
17
|
+
`simple-compression` is a small Python library that implements classic byte-oriented compression algorithms with an explicit and composable API.
|
|
18
|
+
|
|
19
|
+
The library is designed to operate directly on `bytearray` data and provides both manual and automatic compression pipelines. All encoded outputs are self-describing and can be decoded without external metadata.
|
|
20
|
+
|
|
21
|
+
Current version: **v0.1.0**
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## Scope and goals
|
|
26
|
+
|
|
27
|
+
This project focuses on:
|
|
28
|
+
|
|
29
|
+
- Correct, deterministic implementations of classic compression algorithms
|
|
30
|
+
- Explicit encoding formats that are easy to inspect and reason about
|
|
31
|
+
- A simple API for chaining multiple compression stages
|
|
32
|
+
- Safe and strict decoding
|
|
33
|
+
|
|
34
|
+
This library does **not** attempt to compete with production compressors in performance. It is intended for correctness, clarity, and control.
|
|
35
|
+
|
|
36
|
+
---
|
|
37
|
+
|
|
38
|
+
## Implemented algorithms
|
|
39
|
+
|
|
40
|
+
- Run-Length Encoding (RLE)
|
|
41
|
+
- LZ77
|
|
42
|
+
|
|
43
|
+
Each algorithm has a fully defined binary format and a strict decoder.
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## Installation
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
pip install simple-compression
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
## Basic Usage
|
|
56
|
+
|
|
57
|
+
**Both of these features are expected to improve in effectiveness with more testing and tuning as well as the future implementation of a Huffman Algorithm**
|
|
58
|
+
The first usage passes `auto=True`, which does a quick pass over the data to gather metrics and then automatically selects the algorithms and their sequence.
|
|
59
|
+
The second usage passes the algorithm names through the `sequence` argument to manually select the algorithms and the order in which they are applied.
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
from simple_compression.compression import SimpleCompression
|
|
63
|
+
|
|
64
|
+
compress = SimpleCompression()
|
|
65
|
+
|
|
66
|
+
data = bytearray(b"AAAAAABBBBBCCDSADDDDDSSSCVZCSSSSWEEEFWEWAFZCVAGQWTQL")
|
|
67
|
+
|
|
68
|
+
encoded = compress.encode(data, auto=True)
|
|
69
|
+
decoded = compress.decode(encoded)
|
|
70
|
+
|
|
71
|
+
encoded = compress.encode(data, sequence=["RLE", "LZ77"])
|
|
72
|
+
decoded = compress.decode(encoded)
|
|
73
|
+
```
|
|
74
|
+
The decoder reads header tokens embedded at the start of the bitstream to determine which algorithms were applied and in which order.
|
|
75
|
+
This allows for a really robust decoder which when combined with the spec documentation for each algorithm can be helpful in implementing decoders in other languages.
|
|
76
|
+
|
|
77
|
+
---
|
|
78
|
+
|
|
79
|
+
## Algorithm Formatting
|
|
80
|
+
|
|
81
|
+
Detailed binary formats for each algorithm are documented below.
|
|
82
|
+
[RLE Format](documentation/rle_format.md)
|
|
83
|
+
[LZ77 Format](documentation/lz77_format.md)
|
|
84
|
+
|
|
85
|
+
---
|
|
86
|
+
|
|
87
|
+
## Road Map
|
|
88
|
+
|
|
89
|
+
First is implementing a Huffman algorithm to really take advantage of the sequencing ability of this library.
|
|
90
|
+
As features are added, the automatic sequencing feature will be continuously tuned to ensure that RLE is only enabled when it doesn't inflate the input.
|
|
91
|
+
As algorithms are added sequencing logic and metrics will develop alongside.
|
|
92
|
+
|
|
93
|
+
---
|
|
94
|
+
|
|
95
|
+
## License
|
|
96
|
+
|
|
97
|
+
**MIT**
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# simple-compression
|
|
2
|
+
|
|
3
|
+
`simple-compression` is a small Python library that implements classic byte-oriented compression algorithms with an explicit and composable API.
|
|
4
|
+
|
|
5
|
+
The library is designed to operate directly on `bytearray` data and provides both manual and automatic compression pipelines. All encoded outputs are self-describing and can be decoded without external metadata.
|
|
6
|
+
|
|
7
|
+
Current version: **v0.1.0**
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## Scope and goals
|
|
12
|
+
|
|
13
|
+
This project focuses on:
|
|
14
|
+
|
|
15
|
+
- Correct, deterministic implementations of classic compression algorithms
|
|
16
|
+
- Explicit encoding formats that are easy to inspect and reason about
|
|
17
|
+
- A simple API for chaining multiple compression stages
|
|
18
|
+
- Safe and strict decoding
|
|
19
|
+
|
|
20
|
+
This library does **not** attempt to compete with production compressors in performance. It is intended for correctness, clarity, and control.
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## Implemented algorithms
|
|
25
|
+
|
|
26
|
+
- Run-Length Encoding (RLE)
|
|
27
|
+
- LZ77
|
|
28
|
+
|
|
29
|
+
Each algorithm has a fully defined binary format and a strict decoder.
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## Installation
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install simple-compression
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## Basic Usage
|
|
42
|
+
|
|
43
|
+
**Both of these features are expected to improve in effectiveness with more testing and tuning as well as the future implementation of a Huffman Algorithm**
|
|
44
|
+
The first usage passes `auto=True`, which does a quick pass over the data to gather metrics and then automatically selects the algorithms and their sequence.
|
|
45
|
+
The second usage passes the algorithm names through the `sequence` argument to manually select the algorithms and the order in which they are applied.
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
from simple_compression.compression import SimpleCompression
|
|
49
|
+
|
|
50
|
+
compress = SimpleCompression()
|
|
51
|
+
|
|
52
|
+
data = bytearray(b"AAAAAABBBBBCCDSADDDDDSSSCVZCSSSSWEEEFWEWAFZCVAGQWTQL")
|
|
53
|
+
|
|
54
|
+
encoded = compress.encode(data, auto=True)
|
|
55
|
+
decoded = compress.decode(encoded)
|
|
56
|
+
|
|
57
|
+
encoded = compress.encode(data, sequence=["RLE", "LZ77"])
|
|
58
|
+
decoded = compress.decode(encoded)
|
|
59
|
+
```
|
|
60
|
+
The decoder reads header tokens embedded at the start of the bitstream to determine which algorithms were applied and in which order.
|
|
61
|
+
This allows for a really robust decoder which when combined with the spec documentation for each algorithm can be helpful in implementing decoders in other languages.
|
|
62
|
+
|
|
63
|
+
---
|
|
64
|
+
|
|
65
|
+
## Algorithm Formatting
|
|
66
|
+
|
|
67
|
+
Detailed binary formats for each algorithm are documented below.
|
|
68
|
+
[RLE Format](documentation/rle_format.md)
|
|
69
|
+
[LZ77 Format](documentation/lz77_format.md)
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
73
|
+
## Road Map
|
|
74
|
+
|
|
75
|
+
First is implementing a Huffman algorithm to really take advantage of the sequencing ability of this library.
|
|
76
|
+
As features are added, the automatic sequencing feature will be continuously tuned to ensure that RLE is only enabled when it doesn't inflate the input.
|
|
77
|
+
As algorithms are added sequencing logic and metrics will develop alongside.
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
## License
|
|
82
|
+
|
|
83
|
+
**MIT**
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "simple-compression"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "A byte-level compression library implementing RLE and LZ77 with explicit encoding formats, manual pipelines, and automatic algorithm selection."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
|
|
13
|
+
authors = [
|
|
14
|
+
{ name = "Malek Yehya" }
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
classifiers = [
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"License :: OSI Approved :: MIT License",
|
|
20
|
+
"Operating System :: OS Independent",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
[tool.setuptools]
|
|
24
|
+
package-dir = {"" = "."}
|
|
25
|
+
|
|
26
|
+
[tool.setuptools.packages.find]
|
|
27
|
+
where = ["."]
|
|
28
|
+
include = ["simple_compression*"]
|
|
29
|
+
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
class LZ77:
    """Byte-oriented LZ77 codec using a fixed, byte-aligned token format.

    Token layout produced by :meth:`encode` and consumed by :meth:`decode`:

    * ``0x00, value`` -- one literal byte.
    * ``0x01, dist_high, dist_low, length`` -- copy ``length`` bytes starting
      ``distance`` bytes before the current end of the output, where
      ``distance = (dist_high << 8) | dist_low``.
    """

    # Largest values representable by the 16-bit distance / 8-bit length
    # fields of a match token.
    _MAX_DISTANCE = 0xFFFF
    _MAX_LENGTH = 0xFF
    # Number of most-recent positions remembered per 3-byte key.
    _MAX_CANDIDATES = 64

    def __init__(self, window_size=4096, min_match=3, max_match=255):
        """Store the tuning parameters.

        Args:
            window_size: How far back (in bytes) a match may start.
            min_match: Shortest match worth emitting as a match token.
            max_match: Longest match emitted in a single token.
        """
        self.window_size = window_size
        self.min_match = min_match
        self.max_match = max_match

    def encode(self, data):
        """Compress ``data`` into a new ``bytearray`` of LZ77 tokens.

        Args:
            data: Indexable sequence of byte values (e.g. ``bytearray``).

        Returns:
            A new ``bytearray`` holding literal and match tokens.
        """
        output = bytearray()
        size = len(data)

        # Clamp to what the token fields can hold so that oversized settings
        # cannot produce a truncated distance or an out-of-range length byte.
        window = min(self.window_size, self._MAX_DISTANCE)
        longest = min(self.max_match, self._MAX_LENGTH)

        # Maps a 3-byte key to the ascending list of positions where it starts.
        index = {}
        cursor = 0

        while cursor < size:
            back_start = max(cursor - window, 0)
            ahead_end = min(cursor + longest, size)

            best_length = 0
            best_distance = 0

            if cursor + 2 < size:
                key = (data[cursor], data[cursor + 1], data[cursor + 2])
                candidates = index.get(key, [])
            else:
                candidates = []

            # Newest candidates first; positions are stored in ascending
            # order, so the first one outside the window ends the search.
            for start in reversed(candidates):
                if start < back_start:
                    break

                # A match is never allowed to run past the cursor, i.e. the
                # source and destination ranges do not overlap.
                limit = min(ahead_end - cursor, cursor - start)
                matched = 0
                while matched < limit and data[start + matched] == data[cursor + matched]:
                    matched += 1

                if matched > best_length:
                    best_length = matched
                    best_distance = cursor - start

            # ``best_length > 0`` guarantees forward progress even when
            # min_match is configured as zero or negative.
            if best_length >= self.min_match and best_length > 0:
                output.append(0x01)
                output.append((best_distance >> 8) & 0xFF)
                output.append(best_distance & 0xFF)
                output.append(best_length)
                step = best_length
            else:
                output.append(0x00)
                output.append(data[cursor])
                step = 1

            # Register every consumed position that still has 3 bytes ahead.
            for pos in range(cursor, cursor + step):
                if pos + 2 < size:
                    positions = index.setdefault(
                        (data[pos], data[pos + 1], data[pos + 2]), []
                    )
                    positions.append(pos)
                    if len(positions) > self._MAX_CANDIDATES:
                        del positions[0]

            cursor += step

        return output

    def decode(self, data):
        """Expand a token stream produced by :meth:`encode`.

        Args:
            data: Indexable sequence of byte values holding LZ77 tokens.

        Returns:
            A new ``bytearray`` with the decompressed bytes.

        Raises:
            ValueError: If a token byte is unknown, a token is truncated, or
                a match refers to data before the start of the output.
        """
        output = bytearray()
        size = len(data)
        cursor = 0

        while cursor < size:
            token = data[cursor]
            cursor += 1

            if token == 0x00:
                if cursor >= size:
                    raise ValueError("Truncated LZ77 literal token")
                output.append(data[cursor])
                cursor += 1

            elif token == 0x01:
                if cursor + 3 > size:
                    raise ValueError("Truncated LZ77 match token")

                distance = (data[cursor] << 8) | data[cursor + 1]
                length = data[cursor + 2]
                cursor += 3

                # Without this check a too-large distance becomes a negative
                # index and silently copies from the wrong end of the buffer.
                if distance == 0 or distance > len(output):
                    raise ValueError("Invalid LZ77 match distance")

                copy_index = len(output) - distance
                # Copy byte by byte so a match may read bytes it just wrote.
                for _ in range(length):
                    output.append(output[copy_index])
                    copy_index += 1

            else:
                raise ValueError("Invalid LZ77 token")

        return output
|
|
123
|
+
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
class RLE:
    """Run-length codec using a fixed, byte-aligned token format.

    Token layout produced by :meth:`encode` and consumed by :meth:`decode`:

    * ``0x00, value`` -- one literal byte.
    * ``0x01, count, value`` -- ``value`` repeated ``count`` times.
    """

    def __init__(self, min_run=3):
        """Store ``min_run``, the shortest run emitted as a run token."""
        self.min_run = min_run

    def _emit_run(self, output, count, value):
        """Append tokens for ``count`` repetitions of ``value`` to ``output``."""
        # The count field is a single byte, so long runs are split into
        # chunks of 255.
        while count > 255:
            output.append(0x01)
            output.append(255)
            output.append(value)
            count -= 255

        if count >= self.min_run:
            output.append(0x01)
            output.append(count)
            output.append(value)
        else:
            # Too short for a run token: emit each byte as a literal.
            for _ in range(count):
                output.append(0x00)
                output.append(value)

    def encode(self, data):
        """Encode ``data`` into a new ``bytearray`` of RLE tokens.

        Every input, including short and empty ones, is tokenized so the
        result can always be passed to :meth:`decode`. The input buffer is
        never returned or modified.

        Args:
            data: The ``bytearray`` to encode.

        Returns:
            A new ``bytearray`` of tokens (empty for empty input).

        Raises:
            Exception: If ``data`` is not a ``bytearray``.
        """
        if not isinstance(data, bytearray):
            raise Exception("RLE encode requires bytearray input")

        output = bytearray()
        current_value = None
        count = 0

        for value in data:
            if count and value == current_value:
                count += 1
                continue
            if count:
                self._emit_run(output, count, current_value)
            current_value = value
            count = 1

        # Flush the final run; count stays 0 only for empty input.
        if count:
            self._emit_run(output, count, current_value)

        return output

    def decode(self, data):
        """Expand a token stream produced by :meth:`encode`.

        Args:
            data: The ``bytearray`` of RLE tokens.

        Returns:
            A new ``bytearray`` with the decoded bytes.

        Raises:
            ValueError: If the stream ends in the middle of a token.
            Exception: If ``data`` is not a ``bytearray`` or a token byte is
                unknown.
        """
        if not isinstance(data, bytearray):
            raise Exception("RLE decode requires bytearray input")

        output = bytearray()
        size = len(data)
        cursor = 0

        while cursor < size:
            token = data[cursor]
            cursor += 1

            if token == 0x00:
                if cursor >= size:
                    raise ValueError("Truncated RLE literal token")
                output.append(data[cursor])
                cursor += 1

            elif token == 0x01:
                if cursor + 2 > size:
                    raise ValueError("Truncated RLE run token")
                count = data[cursor]
                value = data[cursor + 1]
                cursor += 2
                for _ in range(count):
                    output.append(value)

            else:
                raise Exception("Invalid RLE token")

        return output
|
|
77
|
+
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
from simple_compression.algorithms.rle import RLE
|
|
2
|
+
from simple_compression.algorithms.lz77 import LZ77
|
|
3
|
+
from simple_compression.probe import CompressionProbe
|
|
4
|
+
|
|
5
|
+
# Header bytes identifying each pipeline stage inside a frame.
HDR_RLE = 0x01
HDR_LZ77 = 0x02

# Two-byte frame signature (ASCII "SC").
MAGIC_0 = 0x53
MAGIC_1 = 0x43


class SimpleCompression:
    """Chain RLE and LZ77 stages behind a small self-describing frame.

    Frame layout: ``MAGIC_0, MAGIC_1, stage_count, stage headers..., payload``.
    Stage headers are stored in the order the stages were applied; decoding
    undoes them in reverse.
    """

    def __init__(self, min_run=3, window_size=4096):
        """Create the stage codecs and the probe used for automatic selection.

        Args:
            min_run: Passed to the RLE codec and to the probe.
            window_size: Passed to the LZ77 codec.
        """
        self.rle = RLE(min_run=min_run)
        self.lz77 = LZ77(window_size=window_size)
        self.probe = CompressionProbe(min_run=min_run)

    @staticmethod
    def _looks_framed(data):
        """Return True when ``data`` would be parsed as a frame by decode."""
        return len(data) >= 3 and data[0] == MAGIC_0 and data[1] == MAGIC_1

    def encode(self, data, sequence=None, auto=False):
        """Apply the selected stages to ``data`` and frame the result.

        Args:
            data: The ``bytearray`` to compress.
            sequence: Iterable of stage names (``"RLE"``, ``"LZ77"``) applied
                in order. Ignored when ``auto`` is true.
            auto: When true, the probe chooses the stage sequence.

        Returns:
            A framed ``bytearray``. When no stage is selected the input is
            returned unchanged, unless it begins with the frame signature;
            in that case it is wrapped in a zero-stage frame so that
            :meth:`decode` does not mistake the payload for a frame.

        Raises:
            ValueError: If more than 255 stages are requested.
            Exception: If ``data`` is not a ``bytearray`` or a stage name is
                unknown.
        """
        if not isinstance(data, bytearray):
            raise Exception("encode requires bytearray input")

        if auto:
            sequence = self._auto_select(data)

        if not sequence:
            if not self._looks_framed(data):
                return data
            # Raw pass-through would be ambiguous here: fall through and
            # emit a frame that lists no stages.
            sequence = ()

        output = data
        headers = bytearray()

        for alg in sequence:
            if alg == "RLE":
                output = self.rle.encode(output)
                headers.append(HDR_RLE)
            elif alg == "LZ77":
                output = self.lz77.encode(output)
                headers.append(HDR_LZ77)
            else:
                raise Exception("Unknown algorithm")

        # The stage count is stored in a single byte.
        if len(headers) > 255:
            raise ValueError("sequence may contain at most 255 algorithms")

        framed = bytearray((MAGIC_0, MAGIC_1, len(headers)))
        framed.extend(headers)
        framed.extend(output)
        return framed

    def decode(self, data):
        """Undo :meth:`encode`.

        Args:
            data: A ``bytearray`` returned by :meth:`encode`.

        Returns:
            The decoded ``bytearray``. Input shorter than three bytes, or not
            starting with the frame signature, is returned unchanged.

        Raises:
            Exception: If ``data`` is not a ``bytearray``, the frame is
                shorter than its declared header list, or a stage header is
                unknown.
        """
        if not isinstance(data, bytearray):
            raise Exception("decode requires bytearray input")

        if not self._looks_framed(data):
            return data

        header_count = data[2]
        payload_start = 3 + header_count

        if len(data) < payload_start:
            raise Exception("Invalid frame")

        headers = data[3:payload_start]
        output = data[payload_start:]

        # Stages are undone last-applied-first.
        for header in reversed(headers):
            if header == HDR_RLE:
                output = self.rle.decode(output)
            elif header == HDR_LZ77:
                output = self.lz77.decode(output)
            else:
                raise Exception("Invalid header")

        return output

    def _auto_select(self, data):
        """Return the stage names suggested by the probe, RLE before LZ77."""
        metrics = self.probe.analyze(data)
        sequence = []

        if metrics["use_rle"]:
            sequence.append("RLE")
        if metrics["use_lz77"]:
            sequence.append("LZ77")

        return sequence
|
|
93
|
+
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
class CompressionProbe:
    """Cheap pre-pass that suggests which compression stages to enable."""

    def __init__(self, min_run=3):
        """Store ``min_run``, the run length that counts as a real run."""
        self.min_run = min_run

    def analyze(self, data):
        """Inspect ``data`` and report which stages look worthwhile.

        Args:
            data: The ``bytearray`` to inspect.

        Returns:
            A dict with two booleans: ``"use_rle"`` is true when some run of
            equal bytes reaches ``min_run``; ``"use_lz77"`` is true when some
            byte value occurs more than twice.

        Raises:
            Exception: If ``data`` is not a ``bytearray``.
        """
        if not isinstance(data, bytearray):
            raise Exception("probe requires bytearray input")

        total = len(data)

        # Walk run by run; stop at the first run that is long enough.
        found_run = False
        start = 0
        while start < total - 1:
            stop = start + 1
            while stop < total and data[stop] == data[start]:
                stop += 1

            if stop - start >= self.min_run:
                found_run = True
                break

            start = stop

        # Count occurrences per byte value; stop at the first third sighting.
        found_repeat = False
        counts = {}
        for value in data:
            occurrences = counts.get(value, 0) + 1
            counts[value] = occurrences
            if occurrences > 2:
                found_repeat = True
                break

        return {"use_rle": found_run, "use_lz77": found_repeat}
|
|
35
|
+
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: simple-compression
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A byte-level compression library implementing RLE and LZ77 with explicit encoding formats, manual pipelines, and automatic algorithm selection.
|
|
5
|
+
Author: Malek Yehya
|
|
6
|
+
License: MIT
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Requires-Python: >=3.9
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Dynamic: license-file
|
|
14
|
+
|
|
15
|
+
# simple-compression
|
|
16
|
+
|
|
17
|
+
`simple-compression` is a small Python library that implements classic byte-oriented compression algorithms with an explicit and composable API.
|
|
18
|
+
|
|
19
|
+
The library is designed to operate directly on `bytearray` data and provides both manual and automatic compression pipelines. All encoded outputs are self-describing and can be decoded without external metadata.
|
|
20
|
+
|
|
21
|
+
Current version: **v0.1.0**
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## Scope and goals
|
|
26
|
+
|
|
27
|
+
This project focuses on:
|
|
28
|
+
|
|
29
|
+
- Correct, deterministic implementations of classic compression algorithms
|
|
30
|
+
- Explicit encoding formats that are easy to inspect and reason about
|
|
31
|
+
- A simple API for chaining multiple compression stages
|
|
32
|
+
- Safe and strict decoding
|
|
33
|
+
|
|
34
|
+
This library does **not** attempt to compete with production compressors in performance. It is intended for correctness, clarity, and control.
|
|
35
|
+
|
|
36
|
+
---
|
|
37
|
+
|
|
38
|
+
## Implemented algorithms
|
|
39
|
+
|
|
40
|
+
- Run-Length Encoding (RLE)
|
|
41
|
+
- LZ77
|
|
42
|
+
|
|
43
|
+
Each algorithm has a fully defined binary format and a strict decoder.
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## Installation
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
pip install simple-compression
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
## Basic Usage
|
|
56
|
+
|
|
57
|
+
**Both of these features are expected to improve in effectiveness with more testing and tuning as well as the future implementation of a Huffman Algorithm**
|
|
58
|
+
The first usage passes `auto=True`, which does a quick pass over the data to gather metrics and then automatically selects the algorithms and their sequence.
|
|
59
|
+
The second usage passes the algorithm names through the `sequence` argument to manually select the algorithms and the order in which they are applied.
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
from simple_compression.compression import SimpleCompression
|
|
63
|
+
|
|
64
|
+
compress = SimpleCompression()
|
|
65
|
+
|
|
66
|
+
data = bytearray(b"AAAAAABBBBBCCDSADDDDDSSSCVZCSSSSWEEEFWEWAFZCVAGQWTQL")
|
|
67
|
+
|
|
68
|
+
encoded = compress.encode(data, auto=True)
|
|
69
|
+
decoded = compress.decode(encoded)
|
|
70
|
+
|
|
71
|
+
encoded = compress.encode(data, sequence=["RLE", "LZ77"])
|
|
72
|
+
decoded = compress.decode(encoded)
|
|
73
|
+
```
|
|
74
|
+
The decoder reads header tokens embedded at the start of the bitstream to determine which algorithms were applied and in which order.
|
|
75
|
+
This allows for a really robust decoder which when combined with the spec documentation for each algorithm can be helpful in implementing decoders in other languages.
|
|
76
|
+
|
|
77
|
+
---
|
|
78
|
+
|
|
79
|
+
## Algorithm Formatting
|
|
80
|
+
|
|
81
|
+
Detailed binary formats for each algorithm are documented below.
|
|
82
|
+
[RLE Format](documentation/rle_format.md)
|
|
83
|
+
[LZ77 Format](documentation/lz77_format.md)
|
|
84
|
+
|
|
85
|
+
---
|
|
86
|
+
|
|
87
|
+
## Road Map
|
|
88
|
+
|
|
89
|
+
First is implementing a Huffman algorithm to really take advantage of the sequencing ability of this library.
|
|
90
|
+
As features are added, the automatic sequencing feature will be continuously tuned to ensure that RLE is only enabled when it doesn't inflate the input.
|
|
91
|
+
As algorithms are added sequencing logic and metrics will develop alongside.
|
|
92
|
+
|
|
93
|
+
---
|
|
94
|
+
|
|
95
|
+
## License
|
|
96
|
+
|
|
97
|
+
**MIT**
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
./simple_compression/__init__.py
|
|
5
|
+
./simple_compression/compression.py
|
|
6
|
+
./simple_compression/probe.py
|
|
7
|
+
./simple_compression/algorithms/__init__.py
|
|
8
|
+
./simple_compression/algorithms/lz77.py
|
|
9
|
+
./simple_compression/algorithms/rle.py
|
|
10
|
+
simple_compression/__init__.py
|
|
11
|
+
simple_compression/compression.py
|
|
12
|
+
simple_compression/probe.py
|
|
13
|
+
simple_compression.egg-info/PKG-INFO
|
|
14
|
+
simple_compression.egg-info/SOURCES.txt
|
|
15
|
+
simple_compression.egg-info/dependency_links.txt
|
|
16
|
+
simple_compression.egg-info/top_level.txt
|
|
17
|
+
simple_compression/algorithms/__init__.py
|
|
18
|
+
simple_compression/algorithms/lz77.py
|
|
19
|
+
simple_compression/algorithms/rle.py
|
|
20
|
+
tests/test_auto_pipeline.py
|
|
21
|
+
tests/test_lz77.py
|
|
22
|
+
tests/test_manual_pipeline.py
|
|
23
|
+
tests/test_rle.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
simple_compression
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
from simple_compression.compression import SimpleCompression
|
|
2
|
+
|
|
3
|
+
def test_auto_uses_rle():
    """Run-heavy input must survive an automatic encode/decode round trip."""
    original = bytearray(b"AAAAAAAB")
    codec = SimpleCompression()

    restored = codec.decode(codec.encode(original, auto=True))

    assert restored == original
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def test_auto_uses_lz77():
    """Repeating-pattern input must survive an automatic round trip."""
    original = bytearray(b"ABCABCABCABC")
    codec = SimpleCompression()

    restored = codec.decode(codec.encode(original, auto=True))

    assert restored == original
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def test_auto_uses_rle_then_lz77():
    """Input with both runs and repeated blocks must round-trip in auto mode."""
    original = bytearray(b"AAAAABBBBBCCCCCAAAAABBBBB")
    codec = SimpleCompression()

    restored = codec.decode(codec.encode(original, auto=True))

    assert restored == original
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def test_auto_negative_tech_passthrough():
    """Input with no runs or repeated bytes must round-trip in auto mode."""
    original = bytearray(b"\x01\xA9\x3F\x7C\xD2\x88\x10")
    codec = SimpleCompression()

    restored = codec.decode(codec.encode(original, auto=True))

    assert restored == original
|
|
41
|
+
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from simple_compression.compression import SimpleCompression
|
|
2
|
+
|
|
3
|
+
def test_manual_rle_only():
    """An explicit RLE-only pipeline must round-trip."""
    original = bytearray(b"AAAAAAAB")
    codec = SimpleCompression()

    restored = codec.decode(codec.encode(original, sequence=["RLE"]))

    assert restored == original
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def test_manual_lz77_only():
    """An explicit LZ77-only pipeline must round-trip."""
    original = bytearray(b"ABCABCABCABC")
    codec = SimpleCompression()

    restored = codec.decode(codec.encode(original, sequence=["LZ77"]))

    assert restored == original
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def test_manual_rle_then_lz77():
    """An explicit RLE-then-LZ77 pipeline must round-trip."""
    original = bytearray(b"AAAAABBBBBCCCCCAAAAABBBBB")
    codec = SimpleCompression()

    restored = codec.decode(codec.encode(original, sequence=["RLE", "LZ77"]))

    assert restored == original
|
|
31
|
+
|