dissect.util 3.24.dev1__cp314-cp314t-manylinux_2_28_s390x.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dissect/util/__init__.py +20 -0
- dissect/util/_build.py +17 -0
- dissect/util/_native/__init__.pyi +3 -0
- dissect/util/_native/compression/__init__.pyi +3 -0
- dissect/util/_native/compression/lz4.pyi +7 -0
- dissect/util/_native/compression/lzo.pyi +3 -0
- dissect/util/_native/hash/__init__.py +3 -0
- dissect/util/_native/hash/crc32c.py +2 -0
- dissect/util/_native.cpython-314t-s390x-linux-gnu.so +0 -0
- dissect/util/compression/__init__.py +45 -0
- dissect/util/compression/lz4.py +95 -0
- dissect/util/compression/lzbitmap.py +130 -0
- dissect/util/compression/lzfse.py +467 -0
- dissect/util/compression/lznt1.py +92 -0
- dissect/util/compression/lzo.py +118 -0
- dissect/util/compression/lzvn.py +241 -0
- dissect/util/compression/lzxpress.py +80 -0
- dissect/util/compression/lzxpress_huffman.py +184 -0
- dissect/util/compression/sevenbit.py +77 -0
- dissect/util/compression/xz.py +112 -0
- dissect/util/cpio.py +226 -0
- dissect/util/encoding/__init__.py +0 -0
- dissect/util/encoding/surrogateescape.py +21 -0
- dissect/util/exceptions.py +6 -0
- dissect/util/hash/__init__.py +28 -0
- dissect/util/hash/crc32.py +55 -0
- dissect/util/hash/crc32c.py +60 -0
- dissect/util/hash/jenkins.py +102 -0
- dissect/util/ldap.py +237 -0
- dissect/util/plist.py +156 -0
- dissect/util/sid.py +81 -0
- dissect/util/stream.py +671 -0
- dissect/util/tools/__init__.py +0 -0
- dissect/util/tools/dump_nskeyedarchiver.py +61 -0
- dissect/util/ts.py +295 -0
- dissect/util/xmemoryview.py +117 -0
- dissect_util-3.24.dev1.dist-info/METADATA +89 -0
- dissect_util-3.24.dev1.dist-info/RECORD +43 -0
- dissect_util-3.24.dev1.dist-info/WHEEL +5 -0
- dissect_util-3.24.dev1.dist-info/entry_points.txt +2 -0
- dissect_util-3.24.dev1.dist-info/licenses/COPYRIGHT +5 -0
- dissect_util-3.24.dev1.dist-info/licenses/LICENSE +201 -0
- dissect_util-3.24.dev1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,467 @@
|
|
|
1
|
+
# References:
|
|
2
|
+
# - https://github.com/lzfse/lzfse
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import io
|
|
6
|
+
import struct
|
|
7
|
+
from typing import BinaryIO, NamedTuple
|
|
8
|
+
|
|
9
|
+
from dissect.util.compression import lzvn
|
|
10
|
+
|
|
11
|
+
# Block magic values; every block in the stream starts with one of these 4-byte tags.
LZFSE_ENDOFSTREAM_BLOCK_MAGIC = b"bvx$"  # 0x24787662 (end of stream)
LZFSE_UNCOMPRESSED_BLOCK_MAGIC = b"bvx-"  # 0x2d787662 (raw data)
LZFSE_COMPRESSEDV1_BLOCK_MAGIC = b"bvx1"  # 0x31787662 (lzfse compressed, uncompressed tables)
LZFSE_COMPRESSEDV2_BLOCK_MAGIC = b"bvx2"  # 0x32787662 (lzfse compressed, compressed tables)
LZFSE_COMPRESSEDLZVN_BLOCK_MAGIC = b"bvxn"  # 0x6e787662 (lzvn compressed)

# Throughout LZFSE we refer to "L", "M" and "D"; these will always appear as
# a triplet, and represent a "usual" LZ-style literal and match pair. "L"
# is the number of literal bytes, "M" is the number of match bytes, and "D"
# is the match "distance"; the distance in bytes between the current pointer
# and the start of the match.
#
# The *_SYMBOLS values are the lengths of the per-stream frequency tables in a block header,
# the *_STATES values are the sizes of the matching decoder tables.
LZFSE_ENCODE_L_SYMBOLS = 20
LZFSE_ENCODE_M_SYMBOLS = 20
LZFSE_ENCODE_D_SYMBOLS = 64
LZFSE_ENCODE_LITERAL_SYMBOLS = 256
LZFSE_ENCODE_L_STATES = 64
LZFSE_ENCODE_M_STATES = 64
LZFSE_ENCODE_D_STATES = 256
LZFSE_ENCODE_LITERAL_STATES = 1024
LZFSE_MATCHES_PER_BLOCK = 10000
LZFSE_LITERALS_PER_BLOCK = 4 * LZFSE_MATCHES_PER_BLOCK

# fmt: off
# Both tables below are indexed by the low 5 bits of the compressed (v2) frequency table stream.
# The first gives the number of bits the encoded value occupies, the second the decoded value.
# Entries of -1 belong to the 8 and 14 bit encodings, whose value is computed from the stream instead.
_lzfse_freq_nbits_table = (
    2, 3, 2, 5, 2, 3, 2, 8, 2, 3, 2, 5, 2, 3, 2, 14,
    2, 3, 2, 5, 2, 3, 2, 8, 2, 3, 2, 5, 2, 3, 2, 14
)
_lzfse_freq_value_table = (
    0, 2, 1, 4, 0, 3, 1, -1, 0, 2, 1, 5, 0, 3, 1, -1,
    0, 2, 1, 6, 0, 3, 1, -1, 0, 2, 1, 7, 0, 3, 1, -1
)

# The L, M, D data streams are all encoded as a "base" value, which is
# FSE-encoded, and an "extra bits" value, which is the difference between
# value and base, and is simply represented as a raw bit value (because it
# is the low-order bits of a larger number, not much entropy can be
# extracted from these bits by more complex encoding schemes). The following
# tables represent the number of low-order bits to encode separately and the
# base values for each of L, M, and D.
_l_extra_bits = (
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 5, 8
)
_l_base_value = (
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 28, 60
)
_m_extra_bits = (
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 5, 8, 11
)
_m_base_value = (
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 24, 56, 312
)
_d_extra_bits = (
    0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3,
    4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7,
    8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11,
    12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15
)
_d_base_value = (
    0, 1, 2, 3, 4, 6, 8, 10, 12, 16,
    20, 24, 28, 36, 44, 52, 60, 76, 92, 108,
    124, 156, 188, 220, 252, 316, 380, 444, 508, 636,
    764, 892, 1020, 1276, 1532, 1788, 2044, 2556, 3068, 3580,
    4092, 5116, 6140, 7164, 8188, 10236, 12284, 14332, 16380, 20476,
    24572, 28668, 32764, 40956, 49148, 57340, 65532, 81916, 98300, 114684,
    131068, 163836, 196604, 229372
)
# fmt: on


# Little-endian unsigned 32-bit integer, used for the size fields of raw and LZVN blocks.
_I = struct.Struct("<I")
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _clz(n: int) -> int:
    """Count the leading zero bits of ``n`` when viewed as a 32-bit unsigned integer.

    Args:
        n: The value to inspect.

    Returns:
        The number of leading zero bits, which is 32 for ``n == 0``.
    """
    return 32 - n.bit_length() if n else 32
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class LZFSECompressedBlockHeader(NamedTuple):
    """LZFSE compressed block header.

    Holds the header fields in decoded form, regardless of whether the block used a v1 header (plain fields
    and frequency tables) or a v2 header (bit-packed fields and compressed frequency tables).
    Both parsers read from the position directly after the 4-byte block magic.
    """

    # v1 layout after the magic: six uint32 counters, literal_bits (int32), four uint16 literal states,
    # lmd_bits (int32), three uint16 L/M/D states, followed by the four uint16 frequency tables.
    __struct_v1__ = struct.Struct(
        "<IIIIIIi4HiHHH"
        f"{LZFSE_ENCODE_L_SYMBOLS}H"
        f"{LZFSE_ENCODE_M_SYMBOLS}H"
        f"{LZFSE_ENCODE_D_SYMBOLS}H"
        f"{LZFSE_ENCODE_LITERAL_SYMBOLS}H"
    )
    # v2 fixed part after the magic: n_raw_bytes (uint32) followed by three packed 64-bit fields.
    __struct_v2__ = struct.Struct("<I3Q")

    n_raw_bytes: int
    n_payload_bytes: int
    n_literals: int
    n_matches: int
    n_literal_payload_bytes: int
    n_lmd_payload_bytes: int
    literal_bits: int
    literal_state: tuple[int, int, int, int]
    lmd_bits: int
    l_state: int
    m_state: int
    d_state: int
    l_freq: tuple[int, ...]
    m_freq: tuple[int, ...]
    d_freq: tuple[int, ...]
    literal_freq: tuple[int, ...]

    @classmethod
    def parse_v1(cls, fh: BinaryIO) -> LZFSECompressedBlockHeader:
        """Decode all fields from a v1 header.

        Args:
            fh: File-like object positioned directly after the block magic.

        Returns:
            The parsed header.

        Raises:
            struct.error: If fewer bytes than a complete v1 header could be read.
        """
        values = cls.__struct_v1__.unpack(fh.read(cls.__struct_v1__.size))

        # The four frequency tables follow the 15 scalar values back to back
        l_start = 15
        m_start = l_start + LZFSE_ENCODE_L_SYMBOLS
        d_start = m_start + LZFSE_ENCODE_M_SYMBOLS
        literal_start = d_start + LZFSE_ENCODE_D_SYMBOLS

        return cls(
            n_raw_bytes=values[0],
            n_payload_bytes=values[1],
            n_literals=values[2],
            n_matches=values[3],
            n_literal_payload_bytes=values[4],
            n_lmd_payload_bytes=values[5],
            literal_bits=values[6],
            literal_state=values[7:11],
            lmd_bits=values[11],
            l_state=values[12],
            m_state=values[13],
            d_state=values[14],
            l_freq=values[l_start:m_start],
            m_freq=values[m_start:d_start],
            d_freq=values[d_start:literal_start],
            literal_freq=values[literal_start : literal_start + LZFSE_ENCODE_LITERAL_SYMBOLS],
        )

    @classmethod
    def parse_v2(cls, fh: BinaryIO) -> LZFSECompressedBlockHeader:
        """Decode all fields from a v2 header.

        Args:
            fh: File-like object positioned directly after the block magic.

        Returns:
            The parsed header, with the compressed frequency tables expanded.

        Raises:
            struct.error: If fewer bytes than the fixed part of a v2 header could be read.
            ValueError: If the header size field is smaller than the fixed part of the header,
                or if the frequency tables are cut short.
        """
        values = cls.__struct_v2__.unpack(fh.read(cls.__struct_v2__.size))
        v0, v1, v2 = values[1:4]

        n_literal_payload_bytes = _get_field(v0, 20, 20)
        n_lmd_payload_bytes = _get_field(v1, 40, 20)
        n_payload_bytes = n_literal_payload_bytes + n_lmd_payload_bytes

        # The header size field covers the magic, the fixed part and the compressed frequency tables
        freq_tables_size = _get_field(v2, 0, 32) - cls.__struct_v2__.size - 4  # exclude magic
        if freq_tables_size < 0:
            # A negative size would make ``read()`` consume the entire remainder of the stream
            raise ValueError("Invalid LZFSE v2 header size")

        m_start = LZFSE_ENCODE_L_SYMBOLS
        d_start = m_start + LZFSE_ENCODE_M_SYMBOLS
        literal_start = d_start + LZFSE_ENCODE_D_SYMBOLS
        n_symbols = literal_start + LZFSE_ENCODE_LITERAL_SYMBOLS

        if freq_tables_size == 0:
            # Frequency tables were omitted entirely
            freqs = [0] * n_symbols
        else:
            packed = fh.read(freq_tables_size)
            if len(packed) != freq_tables_size:
                raise ValueError("Truncated LZFSE v2 frequency tables")

            # Treat the tables as one little-endian bit stream, consumed from the least significant bit
            accum = int.from_bytes(packed, "little")
            freqs = []
            for _ in range(n_symbols):
                nbits, value = _decode_v1_freq_value(accum)
                freqs.append(value)
                accum >>= nbits

        return cls(
            n_raw_bytes=values[0],
            n_payload_bytes=n_payload_bytes,
            n_literals=_get_field(v0, 0, 20),
            n_matches=_get_field(v0, 40, 20),
            n_literal_payload_bytes=n_literal_payload_bytes,
            n_lmd_payload_bytes=n_lmd_payload_bytes,
            literal_bits=_get_field(v0, 60, 3) - 7,
            literal_state=(
                _get_field(v1, 0, 10),
                _get_field(v1, 10, 10),
                _get_field(v1, 20, 10),
                _get_field(v1, 30, 10),
            ),
            lmd_bits=_get_field(v1, 60, 3) - 7,
            l_state=_get_field(v2, 32, 10),
            m_state=_get_field(v2, 42, 10),
            d_state=_get_field(v2, 52, 10),
            l_freq=tuple(freqs[:m_start]),
            m_freq=tuple(freqs[m_start:d_start]),
            d_freq=tuple(freqs[d_start:literal_start]),
            literal_freq=tuple(freqs[literal_start:n_symbols]),
        )
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def _get_field(value: int, offset: int, nbits: int) -> int:
    """Extract an ``nbits`` wide unsigned bit field from ``value``, starting at bit ``offset``.

    Bits are numbered from 0 (least significant), so the result consists of
    bits ``offset`` through ``offset + nbits - 1`` of ``value``.

    Args:
        value: The packed integer to extract from.
        offset: Index of the lowest bit of the field.
        nbits: Width of the field in bits.

    Returns:
        The field value as a non-negative integer.
    """
    mask = (1 << nbits) - 1
    shifted = value >> offset
    return shifted & mask
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def _decode_v1_freq_value(bits: int) -> tuple[int, int]:
    """Decode one frequency table entry from the low bits of ``bits``.

    Args:
        bits: The remaining bit stream, with the next entry in its least significant bits.

    Returns:
        A tuple of the number of bits the entry occupies and the decoded value.
    """
    low5 = bits & 31
    nbits = _lzfse_freq_nbits_table[low5]

    # The two long encodings carry their value in the bits following the 4-bit prefix
    if nbits == 14:
        return nbits, 24 + ((bits >> 4) & 0x3FF)
    if nbits == 8:
        return nbits, 8 + ((bits >> 4) & 0xF)

    # Everything else is a short encoding that is looked up directly
    return nbits, _lzfse_freq_value_table[low5]
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
class DecoderEntry(NamedTuple):
    """Entry for one state in the decoder table.

    Entries are built by ``_init_decoder_table`` and consumed by ``_decode``, which emits ``symbol`` and
    computes the next state as ``delta`` plus ``k`` bits pulled from the input stream.
    """

    k: int  # Number of bits to read
    symbol: int  # Emitted symbol
    delta: int  # Signed increment used to compute next state (+bias)
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
class ValueDecoderEntry(NamedTuple):
    """Entry for one state in the value decoder table.

    Entries are built by ``_init_value_decoder_table`` and consumed by ``_value_decode``, which pulls
    ``total_bits`` bits at once: the low ``value_bits`` bits are added to ``vbase`` to form the value,
    the remaining high bits are added to ``delta`` to form the next state.
    """

    total_bits: int  # state bits + extra value bits = shift for next decode
    value_bits: int  # extra value bits
    delta: int  # state base (delta)
    vbase: int  # value base
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def _init_decoder_table(nstates: int, freq: tuple[int, ...]) -> list[DecoderEntry]:
    """Build the decoder table ``T[nstates]`` for a normalized frequency histogram.

    ``nstates`` is the number of states (a power of 2) and is expected to equal ``sum(freq)``.
    Symbols with a frequency of 0 get no entries; they should not be present in the data.

    Args:
        nstates: The number of decoder states.
        freq: Normalized symbol frequencies, with ``freq[i] >= 0``.

    Returns:
        The decoder entries, ``freq[i]`` consecutive entries for every symbol ``i``.

    Raises:
        ValueError: If the frequencies add up to more than ``nstates``.
    """
    entries = []
    states_clz = _clz(nstates)
    total = 0

    for symbol, count in enumerate(freq):
        if not count:
            # Symbol never occurs, nothing to emit for it
            continue

        total += count
        if total > nstates:
            raise ValueError("Invalid frequency table")

        # Shift needed to ensure N <= (F << K) < 2 * N
        shift = _clz(count) - states_clz
        split = ((2 * nstates) >> shift) - count

        # The first ``split`` states of this symbol read ``shift`` bits, the remaining ones read one bit less
        for j in range(count):
            if j < split:
                entry = DecoderEntry(k=shift, symbol=symbol, delta=((count + j) << shift) - nstates)
            else:
                entry = DecoderEntry(k=shift - 1, symbol=symbol, delta=(j - split) << (shift - 1))
            entries.append(entry)

    return entries
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
def _init_value_decoder_table(
    nstates: int, freq: tuple[int, ...], symbol_vbits: tuple[int, ...], symbol_vbase: tuple[int, ...]
) -> list[ValueDecoderEntry]:
    """Initialize value decoder table ``T[nstates]``.

    ``nstates = sum freq[i]`` is the number of states (a power of 2).
    ``freq`` is a normalized histogram of symbol frequencies, with ``freq[i] >= 0``.
    ``symbol_vbits`` and ``symbol_vbase`` are the number of value bits to read and the base value for each symbol.

    Some symbols may have a 0 frequency. In that case, they should not be present in the data.

    Args:
        nstates: The number of decoder states.
        freq: Normalized symbol frequencies.
        symbol_vbits: Number of extra value bits per symbol.
        symbol_vbase: Base value per symbol.

    Returns:
        The value decoder entries, ``freq[i]`` consecutive entries for every symbol ``i``.

    Raises:
        ValueError: If the frequencies add up to more than ``nstates``.
    """
    table = []

    n_clz = _clz(nstates)
    sum_of_freq = 0

    for i, f in enumerate(freq):
        if f == 0:
            # skip this symbol, no occurrences
            continue

        # Reject tables that would produce more entries than there are states
        sum_of_freq += f
        if sum_of_freq > nstates:
            raise ValueError("Invalid frequency table")

        k = _clz(f) - n_clz  # shift needed to ensure N <= (F<<K) < 2*N
        j0 = ((2 * nstates) >> k) - f
        vbits = symbol_vbits[i]
        vbase = symbol_vbase[i]

        # Initialize all states S reached by this symbol: OFFSET <= S < OFFSET + F
        table.extend(
            ValueDecoderEntry(
                total_bits=(k if j < j0 else k - 1) + vbits,
                value_bits=vbits,
                delta=(((f + j) << k) - nstates) if j < j0 else ((j - j0) << (k - 1)),
                vbase=vbase,
            )
            for j in range(f)
        )

    return table
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
class _BitStream:
    """Bit reader over a little-endian byte buffer that hands out bits starting from the most significant end."""

    def __init__(self, data: bytes, nbits: int):
        """Load ``data`` as one little-endian integer.

        Args:
            data: The payload bytes.
            nbits: Adjustment added to the number of bits in ``data`` to get the number of available bits.
        """
        self.nbits = len(data) * 8 + nbits
        self.accum = int.from_bytes(data, "little")

    def pull(self, n: int) -> int:
        """Remove the top ``n`` available bits from the stream and return them as an integer."""
        self.nbits = remaining = self.nbits - n
        top = self.accum >> remaining
        # Drop the bits that were just handed out, keeping only the ``remaining`` low bits
        self.accum -= top << remaining
        return top
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
def _decode(state: int, decoder_table: list[DecoderEntry], in_stream: _BitStream) -> tuple[int, int]:
    """Decode one symbol for ``state`` and advance the state.

    Args:
        state: The current decoder state, used as index into ``decoder_table``.
        decoder_table: The decoder table.
        in_stream: The bit stream to pull the state bits from.

    Returns:
        A tuple of the next state and the decoded symbol.
    """
    entry = decoder_table[state]

    # The next state is DELTA plus K fresh bits of input
    next_state = entry.delta + in_stream.pull(entry.k)

    return next_state, entry.symbol
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
def _value_decode(state: int, decoder_table: list[ValueDecoderEntry], in_stream: _BitStream) -> tuple[int, int]:
    """Decode one value for ``state`` and advance the state.

    Args:
        state: The current decoder state, used as index into ``decoder_table``.
        decoder_table: The value decoder table.
        in_stream: The bit stream to pull the state and value bits from.

    Returns:
        A tuple of the next state and the decoded value.
    """
    e = decoder_table[state]

    # State bits and extra value bits are pulled in one go: value bits low, state bits high
    pulled = in_stream.pull(e.total_bits)
    value_mask = (1 << e.value_bits) - 1

    next_state = e.delta + (pulled >> e.value_bits)
    value = e.vbase + (pulled & value_mask)

    return next_state, value
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
# Largest match distance that can be encoded: the last D base value with all of its extra bits set.
_LZFSE_MAX_MATCH_DISTANCE = _d_base_value[-1] + (1 << _d_extra_bits[-1]) - 1


def _decode_lmd(
    header: LZFSECompressedBlockHeader,
    literals: list[int],
    l_decoder: list[ValueDecoderEntry],
    m_decoder: list[ValueDecoderEntry],
    d_decoder: list[ValueDecoderEntry],
    in_stream: _BitStream,
    history: bytes = b"",
) -> bytes:
    """Decode the L, M, D triplets of a compressed block and execute them.

    Args:
        header: The header of the block being decoded.
        literals: The decoded literal bytes of the block.
        l_decoder: Value decoder table for the literal lengths.
        m_decoder: Value decoder table for the match lengths.
        d_decoder: Value decoder table for the match distances.
        in_stream: Bit stream over the LMD payload of the block.
        history: Output produced by preceding blocks. Matches may reach back into this data,
            it is not part of the returned bytes.

    Returns:
        The bytes produced by this block.

    Raises:
        ValueError: If the literal buffer would overflow, or if a match distance points before
            the start of the available output.
    """
    l_state = header.l_state
    m_state = header.m_state
    d_state = header.d_state

    lit = io.BytesIO(bytes(literals))

    # ruff: noqa: N806
    D = None

    # Seed the output with the history so that match distances can cross the block boundary
    dst = bytearray(history)
    block_start = len(dst)

    for _ in range(header.n_matches):
        # Decode the next L, M, D symbol from the input stream
        l_state, L = _value_decode(l_state, l_decoder, in_stream)
        if (lit.tell() + L) >= LZFSE_LITERALS_PER_BLOCK + 64:
            raise ValueError("Literal overflow")

        m_state, M = _value_decode(m_state, m_decoder, in_stream)
        d_state, new_d = _value_decode(d_state, d_decoder, in_stream)

        # A distance of 0 means "reuse the previous distance"
        if new_d != 0:
            D = new_d

        if D is None or len(dst) + L < D:
            raise ValueError("Invalid match distance")

        dst += lit.read(L)

        # Copy in chunks of at most D bytes, so that overlapping matches repeat the freshly written data
        remaining = M
        while remaining > 0:
            match_size = min(remaining, D)
            dst += dst[-D : (-D + match_size) or None]
            remaining -= match_size

    return bytes(dst[block_start:])


def decompress(src: bytes | BinaryIO) -> bytes:
    """LZFSE decompress from a file-like object or bytes.

    Decompresses until EOF or EOS of the input data.

    Args:
        src: File-like object or bytes to decompress.

    Returns:
        The decompressed data.

    Raises:
        ValueError: If a block starts with an unknown magic, or if a compressed block is invalid.
    """
    if not hasattr(src, "read"):
        src = io.BytesIO(src)

    dst = bytearray()

    while True:
        magic = src.read(4)

        # Running out of input on a block boundary ends the stream just like an explicit end of stream block
        if not magic or magic == LZFSE_ENDOFSTREAM_BLOCK_MAGIC:
            break

        if magic == LZFSE_UNCOMPRESSED_BLOCK_MAGIC:
            (n_raw_bytes,) = _I.unpack(src.read(4))
            dst += src.read(n_raw_bytes)

        elif magic in (LZFSE_COMPRESSEDV1_BLOCK_MAGIC, LZFSE_COMPRESSEDV2_BLOCK_MAGIC):
            if magic == LZFSE_COMPRESSEDV1_BLOCK_MAGIC:
                header = LZFSECompressedBlockHeader.parse_v1(src)
            else:
                header = LZFSECompressedBlockHeader.parse_v2(src)

            literal_decoder = _init_decoder_table(LZFSE_ENCODE_LITERAL_STATES, header.literal_freq)
            l_decoder = _init_value_decoder_table(LZFSE_ENCODE_L_STATES, header.l_freq, _l_extra_bits, _l_base_value)
            m_decoder = _init_value_decoder_table(LZFSE_ENCODE_M_STATES, header.m_freq, _m_extra_bits, _m_base_value)
            d_decoder = _init_value_decoder_table(LZFSE_ENCODE_D_STATES, header.d_freq, _d_extra_bits, _d_base_value)

            in_stream = _BitStream(src.read(header.n_literal_payload_bytes), header.literal_bits)

            # Literals are produced by four interleaved decoder states, four literals per round
            literals = []
            states = list(header.literal_state)
            for _ in range(0, header.n_literals, 4):
                for i in range(4):
                    states[i], symbol = _decode(states[i], literal_decoder, in_stream)
                    literals.append(symbol)

            in_stream = _BitStream(src.read(header.n_lmd_payload_bytes), header.lmd_bits)

            # Only the tail of the output can be referenced, so there is no need to hand over all of it
            history = bytes(dst[-_LZFSE_MAX_MATCH_DISTANCE:])
            dst += _decode_lmd(header, literals, l_decoder, m_decoder, d_decoder, in_stream, history)

        elif magic == LZFSE_COMPRESSEDLZVN_BLOCK_MAGIC:
            (n_raw_bytes,) = _I.unpack(src.read(4))
            (n_payload_bytes,) = _I.unpack(src.read(4))

            dst += lzvn.decompress(src.read(n_payload_bytes))

        else:
            raise ValueError(f"Invalid LZFSE block magic: {magic!r}")

    return bytes(dst)
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# Reference: https://github.com/google/rekall/blob/master/rekall-core/rekall/plugins/filesystems/lznt1.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import array
|
|
5
|
+
import io
|
|
6
|
+
import struct
|
|
7
|
+
from typing import BinaryIO
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _get_displacement(offset: int) -> int:
    """Calculate the displacement.

    Args:
        offset: The offset to calculate the displacement for.

    Returns:
        The number of times ``offset`` has to be halved to drop below ``0x10``.
    """
    if offset < 0x10:
        return 0

    # Halving until the value fits in 4 bits is the same as counting the bits beyond the fourth
    return offset.bit_length() - 4
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Displacement for every offset from 0 up to and including 8191, precomputed once at import time.
DISPLACEMENT_TABLE = array.array("B", map(_get_displacement, range(8192)))

# Layout of the 16-bit chunk header.
COMPRESSED_MASK = 0x8000  # bit 15: chunk data is compressed
SIGNATURE_MASK = 0x3000  # bits 12 and 13: must both be set in a valid chunk header
SIZE_MASK = 0x0FFF  # bits 0-11: chunk size field
# One mask for each of the 8 flag bits of a tag byte, lowest bit first.
TAG_MASKS = [1 << bit for bit in range(8)]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def decompress(src: bytes | BinaryIO) -> bytes:
    """LZNT1 decompress from a file-like object or bytes.

    Decompresses everything from the current position of ``src`` up to its end. Decompression stops early at
    the first chunk header that does not carry the LZNT1 signature bits, or that is cut short.

    Args:
        src: File-like object or bytes to decompress. File-like objects must be seekable.

    Returns:
        The decompressed data.
    """
    if not hasattr(src, "read"):
        src = io.BytesIO(src)

    # Determine the amount of input available from the current position
    offset = src.tell()
    src.seek(0, io.SEEK_END)
    size = src.tell() - offset
    src.seek(offset)

    dst = io.BytesIO()

    while src.tell() - offset < size:
        block_offset = src.tell()
        uncompressed_chunk_offset = dst.tell()

        raw_header = src.read(2)
        if len(raw_header) < 2:
            # A lone trailing byte cannot hold a chunk header, treat it as the end of the data
            break

        block_header = struct.unpack("<H", raw_header)[0]
        if block_header & SIGNATURE_MASK != SIGNATURE_MASK:
            break

        hsize = block_header & SIZE_MASK

        # The size field excludes the 2 header bytes and is stored minus one
        block_end = block_offset + hsize + 3

        if block_header & COMPRESSED_MASK:
            while src.tell() < block_end:
                # Every tag byte describes up to 8 following items: a set bit is a back reference, a clear bit a literal
                header = ord(src.read(1))
                for mask in TAG_MASKS:
                    if src.tell() >= block_end:
                        break

                    if header & mask:
                        pointer = struct.unpack("<H", src.read(2))[0]

                        # The split between offset and length bits depends on how far into the chunk we are
                        displacement = DISPLACEMENT_TABLE[dst.tell() - uncompressed_chunk_offset - 1]

                        symbol_offset = (pointer >> (12 - displacement)) + 1
                        symbol_length = (pointer & (0xFFF >> displacement)) + 3

                        dst.seek(-symbol_offset, io.SEEK_END)
                        data = dst.read(symbol_length)

                        # Pad the data to make it fit.
                        if 0 < len(data) < symbol_length:
                            data = data * (symbol_length // len(data) + 1)
                            data = data[:symbol_length]

                        dst.seek(0, io.SEEK_END)
                        dst.write(data)
                    else:
                        data = src.read(1)
                        dst.write(data)

        else:
            # Block is not compressed
            data = src.read(hsize + 1)
            dst.write(data)

    return dst.getvalue()
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
# References:
|
|
2
|
+
# - https://github.com/FFmpeg/FFmpeg/blob/master/libavutil/lzo.c
|
|
3
|
+
# - https://docs.kernel.org/staging/lzo.html
|
|
4
|
+
# - https://github.com/torvalds/linux/blob/master/lib/lzo/lzo1x_decompress_safe.c
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import io
|
|
8
|
+
import struct
|
|
9
|
+
from typing import BinaryIO
|
|
10
|
+
|
|
11
|
+
# Upper bound for the accumulated length of a zero-byte run in an encoded length.
MAX_READ_LENGTH = 2**32 - 1000


def _read_length(src: BinaryIO, val: int, mask: int) -> int:
    """Read a length that is either embedded in an instruction byte or extended by following bytes.

    Args:
        src: File-like object to read extension bytes from.
        val: The instruction byte.
        mask: Mask selecting the length bits of ``val``.

    Returns:
        The masked bits of ``val`` if they are non-zero. Otherwise 255 for every zero byte that follows,
        plus ``mask``, plus the first non-zero byte.

    Raises:
        ValueError: If the run of zero bytes pushes the length to ``MAX_READ_LENGTH`` or beyond.
    """
    length = val & mask
    if length:
        return length

    while True:
        val = src.read(1)[0]
        if val:
            break

        if length >= MAX_READ_LENGTH:
            raise ValueError("Invalid encoded length")
        length += 255

    return length + mask + val
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def decompress(src: bytes | BinaryIO, header: bool = True, buflen: int = -1) -> bytes:
    """LZO decompress from a file-like object or bytes.

    Arguments are largely compatible with python-lzo API.

    Args:
        src: File-like object or bytes to decompress. File-like objects must be seekable.
        header: Whether the metadata header is included in the input. The header consists of a marker byte
            (``0xF0`` or ``0xF1``) followed by the decompressed length as a big-endian 32-bit integer.
        buflen: If ``header`` is ``False``, a buffer length in bytes must be given that will fit the output.

    Returns:
        The decompressed data.

    Raises:
        ValueError: If the header marker or the compressed stream is invalid.
        IndexError: If the input ends in the middle of the stream.
    """
    if not hasattr(src, "read"):
        src = io.BytesIO(src)

    dst = bytearray()

    if header:
        byte = src.read(1)[0]
        if byte not in (0xF0, 0xF1):
            raise ValueError("Invalid header value")
        # python-lzo stores the decompressed length most significant byte first
        out_len = struct.unpack(">I", src.read(4))[0]
    else:
        out_len = buflen

    # Find out whether at least 5 bytes of compressed data are available, without consuming them
    data_start = src.tell()
    available = len(src.read(5))
    src.seek(data_start)

    val = src.read(1)[0]
    if available >= 5 and val == 17:
        # A leading 17 on a stream of at least 5 bytes announces a bitstream version byte
        _ = src.read(1)  # This would be the bitstream version but we don't currently use it
        val = src.read(1)[0]

    if val > 17:
        # The stream starts with a short run of literals
        dst += src.read(val - 17)
        val = src.read(1)[0]

        if val < 16:
            raise ValueError("Invalid LZO stream")

    state = 0
    while True:
        if val > 15:
            if val > 63:
                # Copy 3-8 bytes from block within 2kb distance
                length = (val >> 5) - 1
                dist = (src.read(1)[0] << 3) + ((val >> 2) & 7) + 1
            elif val > 31:
                # Copy of small block within 16kb distance
                length = _read_length(src, val, 31)
                val = src.read(1)[0]
                dist = (src.read(1)[0] << 6) + (val >> 2) + 1
            else:
                # Copy of a block within 16...48kB distance
                length = _read_length(src, val, 7)
                dist = (1 << 14) + ((val & 8) << 11)
                val = src.read(1)[0]
                dist += (src.read(1)[0] << 6) + (val >> 2)
                if dist == (1 << 14):
                    # A distance of exactly 16kB is the end of stream marker
                    if length != 1:
                        raise ValueError("Invalid LZO stream")
                    break
        elif not state:
            # Copy 4 or more literals, depending on the last 4 bits
            length = _read_length(src, val, 15)
            dst += src.read(length + 3)
            val = src.read(1)[0]
            if val > 15:
                continue
            length = 1
            dist = (1 << 11) + (src.read(1)[0] << 2) + (val >> 2) + 1
        else:
            length = 0
            dist = (src.read(1)[0] << 2) + (val >> 2) + 1

        # Copy in chunks of at most ``dist`` bytes, so that overlapping matches repeat the freshly written data
        remaining = length + 2
        while remaining > 0:
            match_size = min(remaining, dist)
            dst += dst[-dist : (-dist + match_size) or None]
            remaining -= match_size

        # State is often encoded in the last 2 bits of the value, and used in subsequent iterations
        state = length = val & 3
        dst += src.read(length)

        if len(dst) == out_len:
            break

        val = src.read(1)[0]

    return bytes(dst)
|