sub-byte 0.0.7__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
sub_byte/__init__.py ADDED
File without changes
sub_byte/factories.py ADDED
@@ -0,0 +1,288 @@
1
+ import itertools
2
+ from collections.abc import Iterable, Iterator, Callable, Hashable, Sequence
3
+ import typing
4
+
5
+ import more_itertools
6
+
7
+
8
def get_bits(x: int) -> str:
    """Return the binary digits of x without the '0b' prefix.

    E.g. get_bits(13) == '1101' and get_bits(0) == '0'.
    """
    binary_literal = bin(x)
    return binary_literal.removeprefix("0b")
11
+
12
+
13
def all_ones_bit_mask(n: int) -> int:
    """Return the integer whose binary form is n ones.

    E.g. all_ones_bit_mask(8) == 255 (0b11111111), and
    all_ones_bit_mask(0) == 0.

    Invariant property (for n >= 1):
    len(get_bits(all_ones_bit_mask(n))) == n
    """
    # 1 followed by n zeros ...
    power_of_two = 1 << n
    # ... minus one is n ones.
    return power_of_two - 1
18
+
19
+
20
def int_encoder(
    integers: Iterable[int],
    uint_bit_widths: Iterable[int],
) -> Iterator[int]:
    """Pack unsigned integers into bytes, most significant bits first.

    Each integer is written using the next bit width taken from
    uint_bit_widths.  The widths are wrapped in itertools.cycle, so
    they are repeated for as long as there are integers left to
    encode (a once only iterator is cached by cycle as it is consumed).

    Whole bytes are yielded as soon as 8 bits are available.  If the
    total number of bits is not a multiple of 8, the final byte holds
    the remaining bits in its highest order positions, followed by
    zero bits.

    Raises:
        Exception: if uint_bit_widths is empty, or supplies a width of 0.
        ValueError: if an integer is negative, or does not fit in its
            bit width (it would otherwise overwrite the bits of the
            integers packed before it).
    """
    bit_widths = itertools.cycle(uint_bit_widths)

    # The buffer is an ordinary int holding the bits not yet yielded;
    # bits_used counts how many of its low order bits are meaningful.
    buffer = 0
    bits_used = 0

    for i, integer in enumerate(integers):
        # The default of 0 is only returned when uint_bit_widths is empty.
        bit_width = next(bit_widths, 0)

        if bit_width == 0:
            raise Exception(
                f"No bit width specified for integer: {integer}, number: {i}"
            )

        # Reject values that cannot be represented in bit_width bits,
        # instead of silently corrupting neighbouring values.
        if not 0 <= integer < (1 << bit_width):
            raise ValueError(
                f"Integer: {integer}, number: {i} does not fit in "
                f"an unsigned bit width of: {bit_width}"
            )

        # Left bitshift to make room for the integer, and add it in.
        buffer = (buffer << bit_width) | integer
        bits_used += bit_width

        # Yield every complete byte, highest order bits first.
        while bits_used >= 8:
            bits_used -= 8
            yield (buffer >> bits_used) & 0xFF

        # Drop the yielded bytes (only keep the low bits_used bits).
        buffer &= (1 << bits_used) - 1

    # Flush a final partial byte, left aligned so the data starts at
    # the highest order bit and the padding zeros come last.
    if bits_used >= 1:
        yield buffer << (8 - bits_used)
65
+
66
+
67
+ def int_decoder(
68
+ encoded: Iterable[int],
69
+ num_ints: int | None,
70
+ uint_bit_widths: Iterable[int]
71
+ ) -> Iterator[int]:
72
+ """If uint_bit_widths is an Iterable that is not a Container, e.g.
73
+ a once only iterator from a generator, the total of all its
74
+ widths yielded, must be >= (8 * the number of bytes from encoded)
75
+ i.e. as for int_encoder above, the caller must handle caching
76
+ of bit widths (or repeating them without caching).
77
+ When iteration of the decoder terminates, can be controlled by
78
+ by specifying the precise number of uints to decode, in num_ints.
79
+ encoded is always interpreted as whole bytes, so for example to decode
80
+ precisely 3 (and no more) 2-bit zeros (3* u2 0, or 3* 0b00) from a whole byte
81
+ (0b00000000), ignoring the last two bits, num_ints can be set to 3.
82
+ Alternatively, to support custom schemas, e.g. with dynamic data controlled
83
+ bit widths, setting num_ints = None causes the int_decoder to decode uints
84
+ from encoded indefinitely. In this case, the caller must terminate the
85
+ (otherwise infinite) loop themselves.
86
+ """
87
+ bit_widths: Iterator[int] = itertools.cycle(uint_bit_widths)
88
+
89
+ if num_ints is not None:
90
+ bit_widths = itertools.islice(bit_widths, num_ints)
91
+
92
+ bytes_ = iter(encoded)
93
+
94
+ # Initialise a buffer (an ordinary Number)
95
+ # and a bit counter.
96
+ buffer = 0
97
+ buffer_width_in_bits = 0
98
+
99
+ j = 0
100
+
101
+ bit_width = next(bit_widths, 0)
102
+
103
+ for i, byte in enumerate(bytes_):
104
+ # Left shift 8 bits to make room for byte
105
+ buffer <<= 8
106
+ # Bump counter by 8
107
+ buffer_width_in_bits += 8
108
+ # Add in byte to buffer
109
+ buffer |= byte
110
+
111
+ if buffer_width_in_bits < bit_width:
112
+ continue
113
+
114
+ while buffer_width_in_bits >= bit_width and bit_width > 0:
115
+ buffer_width_in_bits -= bit_width
116
+ # mask is bit_width 1s followed by buffer_width_in_bits 0s up
117
+ # the same total width as the original value of buffer_width_in_bits
118
+ # before the previous line.
119
+ mask = all_ones_bit_mask(bit_width)
120
+ yield (buffer >> buffer_width_in_bits) & mask
121
+ j += 1
122
+ # Clear buffer of the bits that made up the yielded integer
123
+ # (the left most bit_width bits)
124
+ buffer &= all_ones_bit_mask(buffer_width_in_bits)
125
+
126
+ bit_width = next(bit_widths, 0)
127
+
128
+ if bit_width == 0:
129
+ if num_ints is not None and buffer_width_in_bits >= 1 and j < num_ints:
130
+ raise Exception(
131
+ f"Not enough uint bit widths to decode remaining bits {buffer_width_in_bits} with.",
132
+ f"Got: {num_ints=}. Total ints decoded so far: {j=}. ",
133
+ )
134
+
135
+ break
136
+
137
+
138
+ def get_bit_widths_encodings_and_decodings[H: Hashable](
139
+ value_sets: Iterable[Iterable[H]],
140
+ ) -> tuple[list[int], list[dict[H, int]], list[list[H]]]:
141
+ bit_widths = []
142
+ decodings = []
143
+ encodings = []
144
+
145
+ for value_set in value_sets:
146
+ # A set would not preserve the order of the elements
147
+ # in value_set, hence dict.fromkeys is used.
148
+ decoding = list(dict.fromkeys((value_set)))
149
+ if len(decoding) <= 1:
150
+ raise Exception(
151
+ "All symbols are the same, or no symbols have been given."
152
+ f"Value set: {value_set}"
153
+ )
154
+ decodings.append(decoding)
155
+
156
+ # Mapping starts at zero, so we subtract one from num of
157
+ # total symbols to get highest int.
158
+ binary_of_highest_mapped_int = get_bits(len(decoding) - 1)
159
+ bit_width = len(binary_of_highest_mapped_int)
160
+ bit_widths.append(bit_width)
161
+
162
+ encoding = {symbol: i for i, symbol in enumerate(decoding)}
163
+ encodings.append(encoding)
164
+
165
+ return bit_widths, encodings, decodings
166
+
167
+
168
+ def map_symbols_to_integers[H: Hashable](
169
+ symbols: Iterable[H],
170
+ encodings: Iterable[dict[H, int]]
171
+ ) -> Iterator[int]:
172
+ for symbol, encoding in zip(symbols, itertools.cycle(encodings)):
173
+ yield encoding[symbol]
174
+
175
+
176
+ def map_integers_to_symbols[H: Hashable](
177
+ unsigned_integers: Iterable[int],
178
+ decodings: Iterable[Sequence[H]]
179
+ ) -> Iterator[H]:
180
+ for unsigned_integer, decoding in zip(
181
+ unsigned_integers, itertools.cycle(decodings)
182
+ ):
183
+ yield decoding[unsigned_integer]
184
+
185
+
186
+ def encoder_and_decoder_from_bit_widths_and_mappings[H: Hashable](
187
+ bit_widths: Iterable[int],
188
+ encodings: Iterable[dict[H, int]],
189
+ decodings: Iterable[list[H]],
190
+ ):
191
+
192
+ def encoder(
193
+ symbols: Iterable[H],
194
+ ) -> Iterator[int]:
195
+ for unsigned_integer in int_encoder(
196
+ map_symbols_to_integers(symbols, encodings), bit_widths
197
+ ):
198
+ yield unsigned_integer
199
+
200
+ def decoder(
201
+ encoded: Iterable[int],
202
+ number_of_symbols: int,
203
+ ) -> Iterator[H]:
204
+ for symbol in map_integers_to_symbols(
205
+ int_decoder(encoded, number_of_symbols, bit_widths), decodings
206
+ ):
207
+ yield symbol
208
+
209
+ return encoder, decoder
210
+
211
+
212
+ def make_sub_byte_encoder_and_decoder[H: Hashable](
213
+ value_sets: Iterable[Iterable[H]],
214
+ ) -> tuple[Callable[[Iterable[H]], Iterator[int]],
215
+ Callable[[Iterable[int], int], Iterator[H]],
216
+ list[int],
217
+ list[dict[H, int]],
218
+ list[list[H]]
219
+ ]:
220
+
221
+ bit_widths, encodings, decodings = get_bit_widths_encodings_and_decodings(
222
+ value_sets
223
+ )
224
+
225
+ encoder, decoder = encoder_and_decoder_from_bit_widths_and_mappings(
226
+ bit_widths, encodings, decodings)
227
+
228
+ return encoder, decoder, bit_widths, encodings, decodings
229
+
230
+
231
+
232
def possible_numbers_of_symbols(
    b: Sequence[int],
    bit_widths: Iterable[int],  # Must be positive integers
) -> Iterator[int]:
    """Yield candidate numbers of symbols encoded in the bytes b.

    The first number yielded is the largest number of symbols whose
    bit widths (bit_widths, repeated cyclically) fit in 8 * len(b)
    bits.  Smaller numbers are then yielded, in decreasing order,
    for as long as the trailing zero bits of the last byte of b
    could be accounted for by dropping further symbols.

    If b is empty, the only number yielded is 0.

    Raises:
        Exception: if bit_widths is empty, or contains a value that is
            None, zero or negative.
    """
    # A sliding window of the 9 most recently read bit widths (newest
    # last), padded with None until 9 widths have been read.
    bit_widths_subsequence: tuple[int | None, ...] = (None,) * 9

    num_symbols = 0
    num_bits = 0
    total_bits = 8 * len(b)

    for last_bit_width in itertools.cycle(bit_widths):
        bit_widths_subsequence = (*bit_widths_subsequence[1:], last_bit_width)

        if last_bit_width is None or last_bit_width <= 0:
            raise Exception(
                f'Bit widths must be non-zero positive integers. Got: {last_bit_width=} and {bit_widths_subsequence=})'
            )

        if num_bits + last_bit_width > total_bits:
            # This width does not fit; it stays as the window's last item.
            break

        num_symbols += 1
        num_bits += last_bit_width
    else:
        # cycle only terminates when bit_widths yields nothing at all.
        raise Exception(
            'Could not find last bit widths (up to 8). '
            'Ensure bit_widths is non-empty '
            f'(got: {bit_widths=}).'
        )

    if not b:
        # No bytes at all, so there is no last byte to inspect.
        yield 0
        return

    # Pad to 8 binary digits, so that a last byte of 0 is counted as
    # 8 trailing zero bits (bin(0) has only a single digit).
    last_byte_bits = format(b[-1], "08b")
    __, __, last_byte_trailing_zero_bits = last_byte_bits.rpartition("1")
    num_zero_bits = len(last_byte_trailing_zero_bits)

    # NOTE(review): the window's last item is the width that did NOT
    # fit, and any padding bits after the last symbol are not
    # subtracted from num_zero_bits first - confirm this is intended.
    for one_of_last_bit_widths in reversed(bit_widths_subsequence):
        yield num_symbols

        if one_of_last_bit_widths is None:
            break

        num_zero_bits -= one_of_last_bit_widths

        if num_zero_bits < 0:
            break

        num_symbols -= 1
sub_byte/py.typed ADDED
File without changes
@@ -0,0 +1,144 @@
1
+ Metadata-Version: 2.4
2
+ Name: sub-byte
3
+ Version: 0.0.7
4
+ Summary: Encodes and decodes sequences of unsigned integers with known widths (and sequences of symbols from finite sets).
5
+ Project-URL: Homepage, https://github.com/NumberzGame/sub_byte
6
+ Project-URL: Bug Tracker, https://github.com/NumberzGame/sub_byte/issues
7
+ Author-email: James Parrott <james.parrott@proton.me>
8
+ License: MIT License
9
+
10
+ Copyright (c) 2024-present James Parrott
11
+
12
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
13
+
14
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
17
+ License-File: LICENSE.md
18
+ Keywords: Encoders,Serialization
19
+ Classifier: Programming Language :: Python
20
+ Requires-Python: >=3.12
21
+ Requires-Dist: more-itertools
22
+ Provides-Extra: mypy
23
+ Requires-Dist: mypy; extra == 'mypy'
24
+ Provides-Extra: test
25
+ Requires-Dist: hypothesis; extra == 'test'
26
+ Requires-Dist: pytest; extra == 'test'
27
+ Description-Content-Type: text/markdown
28
+
29
+ # Sub_Byte
30
+
31
+ Bit packer and depacker. Encodes and decodes sequences of integers with known bit-widths (and sequences of symbols equivalent to integers under some mapping).
32
+
33
+ ## Overview
34
+
35
+ Sub Byte stores data without wasting bits, while preserving its structure, without requiring compression or decompression. Simple bit packing is used, supporting using less than a byte of storage for <=7 bit fields, crossing byte
36
+ boundaries if necessary.
37
+
38
+ A bit width for each symbol is required. The bit width sequence (a simple codec) can be associated with the encoded data as meta data. The decoder can be passed the total number of symbols to decode (e.g. whether a null byte (0b00000000), is 8 1-bit zeros, 4 2-bit zeros, 2 u4 zeros or a single u8 zero).
39
+
40
+ Alternatively, more dynamic codecs can be supported by passing null for the number of symbols to the decoder. Extra custom code
41
+ must then be written by the user, to determine when iteration ceases. This can be used e.g. to encode the actual bit widths first (in some other fixed bit widths), to encode the number of symbols or cycles, and to implement any other codec that determines bit widths, and termination of iteration, according to the user's code.
42
+
43
+ Data validation (e.g. checksums or hashes) must be done by the user, but an extra field can easily be appended to a bit width cycle.
44
+
45
+ ## Implementations
46
+
47
+ ### Python
48
+ Calculate a cache of data in Python.
49
+
50
+ ```shell
51
+ uv pip install sub_byte
52
+ ```
53
+
54
+
55
+ ### Typescript/Javascript
56
+ Decode a cache of data in Javascript, even in browser.
57
+
58
+ ```shell
59
+ npm i sub_byte
60
+ ```
61
+
62
+
63
+ ## Alternatives
64
+
65
+ ### Sub 4kB datasets
66
+ This library is not needed for data storage. Neither Sub_byte nor anything else, will reduce the disk space used.
67
+ If the size of the un-encoded data set is less than 4kB for example (or the page size of the file system on which the data will be stored, e.g. ext4, NTFS, APFS) then it is already below the minimum file size for that file system.
68
+
69
+ ### A bespoke protocol using custom width integer types
70
+
71
+ Up to 8 u1s (bits), up to 4 u2s, or up to 2 u3s or u4s per byte.
72
+ Each developer must create their own implementation and tests.
73
+ Interoperability between different private implementations is untestable.
74
+
75
+ ### Protocol buffers
76
+
77
+ Encodes max symbol per byte. Variable byte encoding - uses continuation bits.
78
+
79
+ ### Zipping (data compression)
80
+
81
+ - Exploits statistical distributions (e.g. "E" being more common in English text than "Q") and patterns.
82
+ - Unstructured until the end user unzips the archive.
83
+
84
+ ## Changelog
85
+ ### v0.05
86
+ Configured npm module for Typescript.
87
+ ### v0.04
88
+ Support dynamic codecs (null/None number of elements to decode).
89
+
90
+ ## Development
91
+
92
+ ### Type checking and linting:
93
+ #### Python
94
+ ##### MyPy
95
+ ```shell
96
+ mypy --python-executable=path/to/venv/where/deps/installed/python.exe src/sub_byte
97
+ ```
98
+
99
+ ##### Pyright
100
+ Activate venv where deps installed
101
+ ```shell
102
+ pyright src/sub_byte/factories.py
103
+ ```
104
+
105
+ #### TS
106
+ ##### TypeScript compiler
107
+ ```shell
108
+ npm run typecheck
109
+ ```
110
+ ##### Eslint
111
+ ```shell
112
+ npm run eslint
113
+ ```
114
+
115
+ ##### Prettier
116
+ ###### Check
117
+ ```shell
118
+ npm run prettier
119
+ ```
120
+
121
+ ###### Auto fix
122
+ ```shell
123
+ npm run prettier:write
124
+ ```
125
+
126
+ ### Publishing
127
+
128
+ Bump version in package.json to x.y.z
129
+
130
+ #### NPM
131
+ ```shell
132
+ npm run prepublish
133
+ npm pack
134
+ ```
135
+ Double check contents of sub_byte-x.y.z.tgz
136
+
137
+ ```shell
138
+ npm publish
139
+ ```
140
+ Sign in (currently requires being the author).
141
+
142
+ #### PyPi
143
+
144
+
@@ -0,0 +1,7 @@
1
+ sub_byte/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ sub_byte/factories.py,sha256=IxrnsVsxSyUdUXnOFYoyt9mHtF0mAkKvSdjaD1rrj2E,9183
3
+ sub_byte/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ sub_byte-0.0.7.dist-info/METADATA,sha256=YbM68APsyREAEjdWN7iMV78A4RIEFDIx2YcoX9nJyx0,5230
5
+ sub_byte-0.0.7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
6
+ sub_byte-0.0.7.dist-info/licenses/LICENSE.md,sha256=j8A9Ejfu8YaNnQvQRJFdWixi8XViJtdekrqa--pweiQ,1078
7
+ sub_byte-0.0.7.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.27.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,9 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024-present James Parrott
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6
+
7
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8
+
9
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.