chimp-encoding 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,17 @@
1
+ """Chimp time-series floating point compression for Python."""
2
+
3
+ from chimp_encoding.chimp import ChimpDecoder, ChimpEncoder
4
+ from chimp_encoding.chimp32 import Chimp32Decoder, Chimp32Encoder
5
+ from chimp_encoding.chimp_n import ChimpNDecoder, ChimpNEncoder
6
+ from chimp_encoding.chimp_n32 import ChimpN32Decoder, ChimpN32Encoder
7
+
8
# Public API of the package: one encoder/decoder pair per Chimp variant.
__all__ = [
    # 64-bit doubles, base variant
    "ChimpEncoder",
    "ChimpDecoder",
    # 32-bit floats, base variant
    "Chimp32Encoder",
    "Chimp32Decoder",
    # 64-bit doubles, "N" variant
    "ChimpNEncoder",
    "ChimpNDecoder",
    # 32-bit floats, "N" variant
    "ChimpN32Encoder",
    "ChimpN32Decoder",
]
@@ -0,0 +1,130 @@
1
+ """Big-endian bit-level I/O streams for Chimp compression."""
2
+
3
+ from __future__ import annotations
4
+
5
+
6
class OutputBitStream:
    """Big-endian (most-significant-bit first) bit writer over a growable buffer.

    Bits are staged in a single-byte register (``_current``) and copied into
    the byte buffer every time eight of them have accumulated.  Call
    :meth:`flush` to force out a partially filled final byte; its unused low
    bits are zero.
    """

    def __init__(self, initial_capacity: int = 8000) -> None:
        """Create a stream whose buffer starts with *initial_capacity* bytes.

        The buffer grows on demand, so any non-negative capacity (including
        zero) is acceptable.
        """
        self._buffer: bytearray = bytearray(initial_capacity)
        # Staged bits, left-aligned inside one byte.
        self._current: int = 0
        # Number of still-unused bit slots in ``_current`` (1..8 between calls).
        self._free: int = 8
        # Index in ``_buffer`` where the next complete byte will be stored.
        self._pos: int = 0

    def _ensure_capacity(self, needed: int) -> None:
        """Grow the buffer until it is strictly longer than ``_pos + needed``.

        Growth is by doubling, starting from at least one byte so that a
        zero-length buffer can still grow (doubling zero would never end).
        """
        required = self._pos + needed + 1
        size = len(self._buffer)
        if size >= required:
            return
        new_size = max(size, 1)
        while new_size < required:
            new_size *= 2
        self._buffer.extend(bytes(new_size - size))

    def _write_in_current(self, value: int, length: int) -> None:
        """Stage the low *length* bits of *value*; requires ``length <= _free``.

        When the staging byte becomes full it is appended to the buffer and
        the register is reset.
        """
        self._current |= (value & ((1 << length) - 1)) << (self._free - length)
        self._free -= length
        if self._free == 0:
            self._ensure_capacity(1)
            self._buffer[self._pos] = self._current & 0xFF
            self._pos += 1
            self._free = 8
            self._current = 0

    def write_bit(self, bit: bool) -> None:
        """Append a single bit (``1`` if *bit* is truthy, else ``0``)."""
        self._write_in_current(1 if bit else 0, 1)

    def write_int(self, value: int, length: int) -> None:
        """Append the low *length* bits of *value*, most significant bit first.

        Raises:
            ValueError: if *length* is negative.
        """
        if length < 0:
            raise ValueError("length must be non-negative")
        if length == 0:
            return
        if length <= self._free:
            self._write_in_current(value, length)
            return

        # Head: top bits that complete the staging byte.  ``_free`` is always
        # in 1..8 here, so after this call the stream is byte-aligned.
        remaining = length - self._free
        self._write_in_current(value >> remaining, self._free)

        # Middle: whole bytes are copied straight into the buffer in one go.
        full_bytes, tail_bits = divmod(remaining, 8)
        if full_bytes:
            self._ensure_capacity(full_bytes)
            chunk = (value >> tail_bits) & ((1 << (8 * full_bytes)) - 1)
            end = self._pos + full_bytes
            self._buffer[self._pos:end] = chunk.to_bytes(full_bytes, "big")
            self._pos = end

        # Tail: leftover low bits start a fresh staging byte.
        if tail_bits:
            self._write_in_current(value, tail_bits)

    def write_long(self, value: int, length: int) -> None:
        """Alias of :meth:`write_int`; Python integers have no width limit."""
        self.write_int(value, length)

    def flush(self) -> None:
        """Write out a partially filled staging byte, zero-padded on the right.

        Does nothing when the stream is already byte-aligned.
        """
        if self._free < 8:
            self._ensure_capacity(1)
            self._buffer[self._pos] = self._current & 0xFF
            self._pos += 1
            self._current = 0
            self._free = 8

    @property
    def buffer(self) -> bytes:
        """Return a copy of everything written so far.

        If bits are still staged (no :meth:`flush` yet), they are included as
        a final byte padded with zero bits, so the result always reflects the
        actual bits written.
        """
        written = bytes(self._buffer[: self._pos])
        if self._free < 8:
            return written + bytes((self._current & 0xFF,))
        return written
69
+
70
+
71
class InputBitStream:
    """Big-endian (most-significant-bit first) bit reader over a byte buffer.

    Unread bits are kept in an integer register (``_current``) whose low
    ``_fill`` bits are the pending data.  Reading past the end of the buffer
    does not raise: missing bytes are treated as zero.
    """

    def __init__(self, data: bytes | bytearray) -> None:
        """Wrap *data*; the content is copied into an immutable ``bytes``."""
        self._buffer: bytes = bytes(data)
        # Register holding bits fetched from the buffer but not yet returned.
        self._current: int = 0
        # Number of valid (unread) low bits in ``_current``.
        self._fill: int = 0
        # Index of the next byte to fetch from ``_buffer``.
        self._pos: int = 0

    def _read_byte(self) -> int:
        """Return the next buffer byte, or ``0`` once the buffer is exhausted."""
        if self._pos >= len(self._buffer):
            return 0
        b = self._buffer[self._pos]
        self._pos += 1
        return b

    def _refill(self) -> None:
        """Top the register up to at least 16 bits while input bytes remain."""
        while self._fill < 16 and self._pos < len(self._buffer):
            self._current = (self._current << 8) | self._read_byte()
            self._fill += 8

    def _read_from_current(self, length: int) -> int:
        """Pop *length* bits off the register; requires ``length <= _fill``."""
        if length == 0:
            return 0
        self._fill -= length
        bits = (self._current >> self._fill) & ((1 << length) - 1)
        # Drop the bits just consumed.  Without this the register would keep
        # every byte ever read and grow without bound on long streams.
        self._current &= (1 << self._fill) - 1
        return bits

    def read_bit(self) -> int:
        """Read one bit and return it as ``0`` or ``1``."""
        if self._fill == 0:
            self._current = self._read_byte()
            self._fill = 8
        return self._read_from_current(1)

    def read_int(self, length: int) -> int:
        """Read *length* bits and return them as a non-negative integer.

        Bits beyond the end of the buffer read as zero.

        Raises:
            ValueError: if *length* is negative.
        """
        if length < 0:
            raise ValueError("length must be non-negative")
        if length == 0:
            return 0
        if self._fill < 16:
            self._refill()
        if length <= self._fill:
            return self._read_from_current(length)

        # Head: drain whatever is left in the register.
        remaining = length - self._fill
        result = self._read_from_current(self._fill)

        # Middle: whole bytes bypass the register.
        while remaining >= 8:
            result = (result << 8) | self._read_byte()
            remaining -= 8

        # Tail: the register is empty here, so load one byte and take the
        # leading bits of it.
        if remaining > 0:
            if self._fill == 0:
                self._current = self._read_byte()
                self._fill = 8
            result = (result << remaining) | self._read_from_current(remaining)

        return result

    def read_long(self, length: int) -> int:
        """Alias of :meth:`read_int`; Python integers have no width limit."""
        return self.read_int(length)
@@ -0,0 +1,143 @@
1
+ """Shared lookup tables for Chimp encoding/decoding."""
2
+
3
+ # Maps a leading-zero count (0..63) to a 3-bit representation code (0..7).
4
+ # Used by encoders to quantize leading zero counts.
5
LEADING_REPRESENTATION_ENCODE: tuple[int, ...] = (
    (0,) * 8  # counts 0..7
    + (1,) * 4  # counts 8..11
    + (2,) * 4  # counts 12..15
    + (3,) * 2  # counts 16..17
    + (4,) * 2  # counts 18..19
    + (5,) * 2  # counts 20..21
    + (6,) * 2  # counts 22..23
    + (7,) * 40  # counts 24..63
)
71
+
72
+ # Rounds a leading-zero count down to the nearest representable value.
73
+ # Used by encoders before storing leading zeros.
74
LEADING_ROUND: tuple[int, ...] = (
    (0,) * 8  # counts 0..7
    + (8,) * 4  # counts 8..11
    + (12,) * 4  # counts 12..15
    + (16,) * 2  # counts 16..17
    + (18,) * 2  # counts 18..19
    + (20,) * 2  # counts 20..21
    + (22,) * 2  # counts 22..23
    + (24,) * 40  # counts 24..63
)
140
+
141
+ # Inverse decode table: maps a 3-bit code back to the leading-zero count.
142
+ # Used by decoders to reconstruct leading zero counts.
143
LEADING_REPRESENTATION_DECODE: tuple[int, ...] = (
    0,  # code 0
    8,  # code 1
    12,  # code 2
    16,  # code 3
    18,  # code 4
    20,  # code 5
    22,  # code 6
    24,  # code 7
)
@@ -0,0 +1,191 @@
1
+ """Chimp encoder/decoder for 64-bit doubles (base variant)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import struct
6
+
7
+ from chimp_encoding._bit_stream import InputBitStream, OutputBitStream
8
+ from chimp_encoding._tables import (
9
+ LEADING_REPRESENTATION_DECODE,
10
+ LEADING_REPRESENTATION_ENCODE,
11
+ LEADING_ROUND,
12
+ )
13
+
14
+ _NAN_LONG = 0x7FF8000000000000
15
+ _MASK_64 = 0xFFFFFFFFFFFFFFFF
16
+
17
+
18
+ def _double_to_long_bits(value: float) -> int:
19
+ return struct.unpack(">Q", struct.pack(">d", value))[0]
20
+
21
+
22
def _long_bits_to_double(bits: int) -> float:
    """Reinterpret the low 64 bits of *bits* as an IEEE 754 binary64 value."""
    raw = (bits & _MASK_64).to_bytes(8, "big")
    (value,) = struct.unpack(">d", raw)
    return value
24
+
25
+
26
+ def _leading_zeros_64(x: int) -> int:
27
+ if x == 0:
28
+ return 64
29
+ return 64 - x.bit_length()
30
+
31
+
32
+ def _trailing_zeros_64(x: int) -> int:
33
+ if x == 0:
34
+ return 64
35
+ return (x & -x).bit_length() - 1
36
+
37
+
38
class ChimpEncoder:
    """Compresses a series of 64-bit doubles using the Chimp algorithm.

    The first value is stored verbatim (64 bits).  Every later value is
    XORed with its predecessor and written with a 2-bit flag:

    * ``00`` - XOR is zero (value repeats).
    * ``01`` - more than ``THRESHOLD`` trailing zeros: 3-bit leading-zero
      code, 6-bit significant-bit count, then the significant bits.
    * ``10`` - same rounded leading-zero count as the previous ``10``/``11``
      record: the ``64 - leading`` low bits of the XOR.
    * ``11`` - new rounded leading-zero count: 3-bit code, then the
      ``64 - leading`` low bits of the XOR.

    ``close()`` terminates the stream by encoding ``float("nan")``.  The
    decoder treats that bit pattern as end-of-stream, so a NaN with the same
    bit pattern passed to ``add_value`` ends the decoded series early.
    """

    THRESHOLD: int = 6

    def __init__(self) -> None:
        self._out = OutputBitStream()
        # 65 can never equal a rounded leading-zero count, so it acts as
        # "no reusable leading-zero count".
        self._stored_leading_zeros: int = 65
        self._stored_val: int = 0
        self._first: bool = True
        # Running count of payload bits written by add_value().
        self._size: int = 0

    def add_value(self, value: float) -> None:
        """Append *value* to the compressed stream."""
        bits = _double_to_long_bits(value)
        if not self._first:
            self._compress_value(bits)
            return
        self._write_first(bits)

    def _write_first(self, value: int) -> None:
        """Store the very first value uncompressed."""
        self._first = False
        self._stored_val = value
        self._out.write_long(value, 64)
        self._size += 64

    def _compress_value(self, value: int) -> None:
        """Encode *value* (raw bits) relative to the previously stored value."""
        out = self._out
        delta = (self._stored_val ^ value) & _MASK_64

        if delta == 0:
            out.write_int(0b00, 2)
            self._size += 2
            self._stored_leading_zeros = 65
        else:
            lead = LEADING_ROUND[_leading_zeros_64(delta)]
            trail = _trailing_zeros_64(delta)

            if trail > self.THRESHOLD:
                # Enough trailing zeros to be worth a 6-bit length field.
                width = 64 - lead - trail
                out.write_int(0b01, 2)
                out.write_int(LEADING_REPRESENTATION_ENCODE[lead], 3)
                out.write_int(width, 6)
                out.write_long(delta >> trail, width)
                self._size += 11 + width
                self._stored_leading_zeros = 65
            else:
                width = 64 - lead
                if lead == self._stored_leading_zeros:
                    out.write_int(0b10, 2)
                    self._size += 2 + width
                else:
                    self._stored_leading_zeros = lead
                    out.write_int(0b11, 2)
                    out.write_int(LEADING_REPRESENTATION_ENCODE[lead], 3)
                    self._size += 5 + width
                out.write_long(delta, width)

        self._stored_val = value

    def close(self) -> None:
        """Write the NaN end marker plus one zero bit, then flush the stream."""
        self.add_value(float("nan"))
        self._out.write_bit(False)
        self._out.flush()

    def get_size(self) -> int:
        """Return the number of bits written by ``add_value`` so far."""
        return self._size

    def get_bytes(self) -> bytes:
        """Return the compressed bytes produced so far."""
        return self._out.buffer
111
+
112
+
113
class ChimpDecoder:
    """Decompresses a Chimp-compressed stream of 64-bit doubles.

    Decoding stops at the NaN end marker written by ``ChimpEncoder.close()``.
    Input that lacks the marker (empty, truncated, or taken from an encoder
    that was never closed) also ends cleanly once every input bit has been
    consumed, instead of repeating the last value forever.
    """

    def __init__(self, data: bytes) -> None:
        payload = bytes(data)
        self._in = InputBitStream(payload)
        self._stored_leading_zeros: int = 65
        self._stored_trailing_zeros: int = 0
        self._stored_val: int = 0
        self._first: bool = True
        self._end_of_stream: bool = False
        # The bit stream pads with zeros past its end, so the decoder keeps
        # its own bit budget to recognise exhausted input.
        self._bits_total: int = len(payload) * 8
        self._bits_read: int = 0

    def _read_bits(self, length: int) -> int:
        """Read *length* bits from the stream and account for them."""
        self._bits_read += length
        return self._in.read_long(length)

    def read_value(self) -> float | None:
        """Return the next decoded value, or ``None`` at end of stream.

        Raises:
            ValueError: if the stream contains an inconsistent record header.
        """
        if self._end_of_stream:
            return None

        if self._first:
            self._first = False
            if self._bits_total == 0:
                self._end_of_stream = True
                return None
            self._stored_val = self._read_bits(64)
            if self._stored_val == _NAN_LONG:
                self._end_of_stream = True
        else:
            self._next_value()

        if self._end_of_stream:
            return None
        return _long_bits_to_double(self._stored_val)

    def _next_value(self) -> None:
        """Decode one record and update ``_stored_val`` / ``_end_of_stream``."""
        if self._bits_read >= self._bits_total:
            # No real input left: anything read now would be zero padding.
            self._end_of_stream = True
            return

        flag = self._read_bits(2)
        if flag == 0:
            # Identical value, nothing to do.
            return

        if flag == 1:
            # Trailing zeros > threshold: explicit significant-bit count.
            self._stored_leading_zeros = LEADING_REPRESENTATION_DECODE[self._read_bits(3)]
            significant_bits = self._read_bits(6) or 64
            trailing = 64 - significant_bits - self._stored_leading_zeros
            if trailing < 0:
                raise ValueError("corrupt Chimp stream: significant bits exceed 64")
            self._stored_trailing_zeros = trailing
            xor = (self._read_bits(significant_bits) << trailing) & _MASK_64
        else:
            if flag == 3:
                # New leading zeros.
                self._stored_leading_zeros = LEADING_REPRESENTATION_DECODE[self._read_bits(3)]
            elif self._stored_leading_zeros > 64:
                # flag == 2 reuses a leading-zero count that was never set.
                raise ValueError("corrupt Chimp stream: no stored leading-zero count")
            xor = self._read_bits(64 - self._stored_leading_zeros)

        value = (self._stored_val ^ xor) & _MASK_64
        if value == _NAN_LONG:
            self._end_of_stream = True
        else:
            self._stored_val = value

    def get_values(self) -> list[float]:
        """Decode and return every remaining value in the stream."""
        return list(iter(self.read_value, None))
@@ -0,0 +1,175 @@
1
+ """Chimp encoder/decoder for 32-bit floats (base variant)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import struct
6
+
7
+ from chimp_encoding._bit_stream import InputBitStream, OutputBitStream
8
+ from chimp_encoding._tables import (
9
+ LEADING_REPRESENTATION_DECODE,
10
+ LEADING_REPRESENTATION_ENCODE,
11
+ LEADING_ROUND,
12
+ )
13
+
14
+ _NAN_INT = 0x7FC00000
15
+ _MASK_32 = 0xFFFFFFFF
16
+
17
+
18
+ def _float_to_int_bits(value: float) -> int:
19
+ return struct.unpack(">I", struct.pack(">f", value))[0]
20
+
21
+
22
def _int_bits_to_float(bits: int) -> float:
    """Reinterpret the low 32 bits of *bits* as an IEEE 754 binary32 value."""
    raw = (bits & _MASK_32).to_bytes(4, "big")
    (value,) = struct.unpack(">f", raw)
    return value
24
+
25
+
26
+ def _leading_zeros_32(x: int) -> int:
27
+ if x == 0:
28
+ return 32
29
+ return 32 - x.bit_length()
30
+
31
+
32
+ def _trailing_zeros_32(x: int) -> int:
33
+ if x == 0:
34
+ return 32
35
+ return (x & -x).bit_length() - 1
36
+
37
+
38
class Chimp32Encoder:
    """Compresses a series of 32-bit floats using the Chimp algorithm.

    The first value is stored verbatim (32 bits).  Every later value is
    XORed with its predecessor and written with a 2-bit flag:

    * ``00`` - XOR is zero (value repeats).
    * ``01`` - more than ``THRESHOLD`` trailing zeros: 3-bit leading-zero
      code, 5-bit significant-bit count, then the significant bits.
    * ``10`` - same rounded leading-zero count as the previous ``10``/``11``
      record: the ``32 - leading`` low bits of the XOR.
    * ``11`` - new rounded leading-zero count: 3-bit code, then the
      ``32 - leading`` low bits of the XOR.

    ``close()`` terminates the stream by encoding ``float("nan")``.  The
    decoder treats that bit pattern as end-of-stream, so a NaN with the same
    bit pattern passed to ``add_value`` ends the decoded series early.
    """

    THRESHOLD: int = 5

    def __init__(self) -> None:
        self._out = OutputBitStream()
        # 33 can never equal a rounded leading-zero count, so it acts as
        # "no reusable leading-zero count".
        self._stored_leading_zeros: int = 33
        self._stored_val: int = 0
        self._first: bool = True
        # Running count of payload bits written by add_value().
        self._size: int = 0

    def add_value(self, value: float) -> None:
        """Append *value* (converted to float32) to the compressed stream."""
        bits = _float_to_int_bits(value)
        if not self._first:
            self._compress_value(bits)
            return
        self._write_first(bits)

    def _write_first(self, value: int) -> None:
        """Store the very first value uncompressed."""
        self._first = False
        self._stored_val = value
        self._out.write_int(value, 32)
        self._size += 32

    def _compress_value(self, value: int) -> None:
        """Encode *value* (raw bits) relative to the previously stored value."""
        out = self._out
        delta = (self._stored_val ^ value) & _MASK_32

        if delta == 0:
            out.write_int(0b00, 2)
            self._size += 2
            self._stored_leading_zeros = 33
        else:
            lead = LEADING_ROUND[_leading_zeros_32(delta)]
            trail = _trailing_zeros_32(delta)

            if trail > self.THRESHOLD:
                # Enough trailing zeros to be worth a 5-bit length field.
                width = 32 - lead - trail
                out.write_int(0b01, 2)
                out.write_int(LEADING_REPRESENTATION_ENCODE[lead], 3)
                out.write_int(width, 5)
                out.write_int(delta >> trail, width)
                self._size += 10 + width
                self._stored_leading_zeros = 33
            else:
                width = 32 - lead
                if lead == self._stored_leading_zeros:
                    out.write_int(0b10, 2)
                    self._size += 2 + width
                else:
                    self._stored_leading_zeros = lead
                    # 24 == 0b11000: the "11" flag fused with the 3-bit code.
                    out.write_int(24 + LEADING_REPRESENTATION_ENCODE[lead], 5)
                    self._size += 5 + width
                out.write_int(delta, width)

        self._stored_val = value

    def close(self) -> None:
        """Write the NaN end marker plus one zero bit, then flush the stream."""
        self.add_value(float("nan"))
        self._out.write_bit(False)
        self._out.flush()

    def get_size(self) -> int:
        """Return the number of bits written by ``add_value`` so far."""
        return self._size

    def get_bytes(self) -> bytes:
        """Return the compressed bytes produced so far."""
        return self._out.buffer
109
+
110
+
111
class Chimp32Decoder:
    """Decompresses a Chimp-compressed stream of 32-bit floats.

    Decoding stops at the NaN end marker written by ``Chimp32Encoder.close()``.
    Input that lacks the marker (empty, truncated, or taken from an encoder
    that was never closed) also ends cleanly once every input bit has been
    consumed, instead of repeating the last value forever.
    """

    def __init__(self, data: bytes) -> None:
        payload = bytes(data)
        self._in = InputBitStream(payload)
        self._stored_leading_zeros: int = 33
        self._stored_trailing_zeros: int = 0
        self._stored_val: int = 0
        self._first: bool = True
        self._end_of_stream: bool = False
        # The bit stream pads with zeros past its end, so the decoder keeps
        # its own bit budget to recognise exhausted input.
        self._bits_total: int = len(payload) * 8
        self._bits_read: int = 0

    def _read_bits(self, length: int) -> int:
        """Read *length* bits from the stream and account for them."""
        self._bits_read += length
        return self._in.read_int(length)

    def read_value(self) -> float | None:
        """Return the next decoded value, or ``None`` at end of stream.

        Raises:
            ValueError: if the stream contains an inconsistent record header.
        """
        if self._end_of_stream:
            return None

        if self._first:
            self._first = False
            if self._bits_total == 0:
                self._end_of_stream = True
                return None
            self._stored_val = self._read_bits(32)
            if self._stored_val == _NAN_INT:
                self._end_of_stream = True
        else:
            self._next_value()

        if self._end_of_stream:
            return None
        return _int_bits_to_float(self._stored_val)

    def _next_value(self) -> None:
        """Decode one record and update ``_stored_val`` / ``_end_of_stream``."""
        if self._bits_read >= self._bits_total:
            # No real input left: anything read now would be zero padding.
            self._end_of_stream = True
            return

        # Every record starts with a 2-bit flag, most significant bit first.
        flag = self._read_bits(2)
        if flag == 0:
            # Identical value, nothing to do.
            return

        if flag == 1:
            # Trailing zeros > threshold: explicit significant-bit count.
            self._stored_leading_zeros = LEADING_REPRESENTATION_DECODE[self._read_bits(3)]
            significant_bits = self._read_bits(5) or 32
            trailing = 32 - significant_bits - self._stored_leading_zeros
            if trailing < 0:
                raise ValueError("corrupt Chimp stream: significant bits exceed 32")
            self._stored_trailing_zeros = trailing
            xor = (self._read_bits(significant_bits) << trailing) & _MASK_32
        else:
            if flag == 3:
                # New leading zeros.
                self._stored_leading_zeros = LEADING_REPRESENTATION_DECODE[self._read_bits(3)]
            elif self._stored_leading_zeros > 32:
                # flag == 2 reuses a leading-zero count that was never set.
                raise ValueError("corrupt Chimp stream: no stored leading-zero count")
            xor = self._read_bits(32 - self._stored_leading_zeros)

        value = (self._stored_val ^ xor) & _MASK_32
        if value == _NAN_INT:
            self._end_of_stream = True
        else:
            self._stored_val = value

    def get_values(self) -> list[float]:
        """Decode and return every remaining value in the stream."""
        return list(iter(self.read_value, None))