dissect.util 3.24.dev2__cp314-cp314t-manylinux_2_31_armv7l.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. dissect/util/__init__.py +20 -0
  2. dissect/util/_build.py +17 -0
  3. dissect/util/_native/__init__.pyi +3 -0
  4. dissect/util/_native/compression/__init__.pyi +3 -0
  5. dissect/util/_native/compression/lz4.pyi +7 -0
  6. dissect/util/_native/compression/lzo.pyi +3 -0
  7. dissect/util/_native/hash/__init__.py +3 -0
  8. dissect/util/_native/hash/crc32c.py +2 -0
  9. dissect/util/_native.cpython-314t-arm-linux-gnueabihf.so +0 -0
  10. dissect/util/compression/__init__.py +45 -0
  11. dissect/util/compression/lz4.py +95 -0
  12. dissect/util/compression/lzbitmap.py +130 -0
  13. dissect/util/compression/lzfse.py +467 -0
  14. dissect/util/compression/lznt1.py +92 -0
  15. dissect/util/compression/lzo.py +118 -0
  16. dissect/util/compression/lzvn.py +241 -0
  17. dissect/util/compression/lzxpress.py +80 -0
  18. dissect/util/compression/lzxpress_huffman.py +184 -0
  19. dissect/util/compression/sevenbit.py +77 -0
  20. dissect/util/compression/xz.py +112 -0
  21. dissect/util/cpio.py +226 -0
  22. dissect/util/encoding/__init__.py +0 -0
  23. dissect/util/encoding/surrogateescape.py +21 -0
  24. dissect/util/exceptions.py +6 -0
  25. dissect/util/hash/__init__.py +28 -0
  26. dissect/util/hash/crc32.py +55 -0
  27. dissect/util/hash/crc32c.py +60 -0
  28. dissect/util/hash/jenkins.py +102 -0
  29. dissect/util/ldap.py +237 -0
  30. dissect/util/plist.py +156 -0
  31. dissect/util/sid.py +81 -0
  32. dissect/util/stream.py +772 -0
  33. dissect/util/tools/__init__.py +0 -0
  34. dissect/util/tools/dump_nskeyedarchiver.py +61 -0
  35. dissect/util/ts.py +295 -0
  36. dissect/util/xmemoryview.py +117 -0
  37. dissect_util-3.24.dev2.dist-info/METADATA +89 -0
  38. dissect_util-3.24.dev2.dist-info/RECORD +43 -0
  39. dissect_util-3.24.dev2.dist-info/WHEEL +5 -0
  40. dissect_util-3.24.dev2.dist-info/entry_points.txt +2 -0
  41. dissect_util-3.24.dev2.dist-info/licenses/COPYRIGHT +5 -0
  42. dissect_util-3.24.dev2.dist-info/licenses/LICENSE +201 -0
  43. dissect_util-3.24.dev2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,241 @@
1
+ # References:
2
+ # - https://github.com/lzfse/lzfse
3
+ from __future__ import annotations
4
+
5
+ import io
6
+ import struct
7
+ from typing import BinaryIO
8
+
9
+ # fmt: off
10
+ OP_SML_D = (
11
+ 0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19,
12
+ 20, 21, 24, 25, 26, 27, 28, 29, 32, 33, 34, 35, 36, 37, 40, 41,
13
+ 42, 43, 44, 45, 48, 49, 50, 51, 52, 53, 56, 57, 58, 59, 60, 61,
14
+ 64, 65, 66, 67, 68, 69, 72, 73, 74, 75, 76, 77, 80, 81, 82, 83,
15
+ 84, 85, 88, 89, 90, 91, 92, 93, 96, 97, 98, 99, 100, 101, 104, 105,
16
+ 106, 107, 108, 109, 128, 129, 130, 131, 132, 133, 136, 137, 138, 139, 140, 141,
17
+ 144, 145, 146, 147, 148, 149, 152, 153, 154, 155, 156, 157, 192, 193, 194, 195,
18
+ 196, 197, 200, 201, 202, 203, 204, 205,
19
+ )
20
+ OP_MED_D = (
21
+ 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
22
+ 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
23
+ )
24
+ OP_LRG_D = (
25
+ 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 135, 143,
26
+ 151, 159, 199, 207,
27
+ )
28
+ OP_PRE_D = (
29
+ 70, 78, 86, 94, 102, 110, 134, 142, 150, 158, 198, 206,
30
+ )
31
+ OP_SML_M = (
32
+ 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
33
+ )
34
+ OP_LRG_M = (
35
+ 240,
36
+ )
37
+ OP_SML_L = (
38
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
39
+ )
40
+ OP_LRG_L = (
41
+ 224,
42
+ )
43
+ OP_NOP = (
44
+ 14, 22,
45
+ )
46
+ OP_EOS = (
47
+ 6,
48
+ )
49
+ OP_UDEF = (
50
+ 30, 38, 46, 54, 62, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
51
+ 122, 123, 124, 125, 126, 127, 208, 209, 210, 211, 212, 213, 214, 215, 216,
52
+ 217, 218, 219, 220, 221, 222, 223,
53
+ )
54
+ # fmt: on
55
+
56
+ _H = struct.Struct("<H")
57
+
58
+
59
+ def decompress(src: bytes | BinaryIO) -> bytes:
60
+ """LZVN decompress from a file-like object or bytes.
61
+
62
+ Decompresses until EOF or EOS of the input data.
63
+
64
+ Args:
65
+ src: File-like object or bytes to decompress.
66
+
67
+ Returns:
68
+ The decompressed data.
69
+ """
70
+ if not hasattr(src, "read"):
71
+ src = io.BytesIO(src)
72
+
73
+ offset = src.tell()
74
+ src.seek(0, io.SEEK_END)
75
+ src_size = src.tell() - offset
76
+ src.seek(offset)
77
+
78
+ dst = bytearray()
79
+
80
+ opc_len = 0
81
+
82
+ # ruff: noqa: N806
83
+ L = None
84
+ M = None
85
+ D = 0
86
+
87
+ while src_size > 0:
88
+ opc = src.read(1)[0]
89
+
90
+ if opc in OP_SML_D:
91
+ # "small distance": This opcode has the structure LLMMMDDD DDDDDDDD LITERAL
92
+ # where the length of literal (0-3 bytes) is encoded by the high 2 bits of
93
+ # the first byte. We first extract the literal length so we know how long
94
+ # the opcode is, then check that the source can hold both this opcode and
95
+ # at least one byte of the next (because any valid input stream must be
96
+ # terminated with an eos token).
97
+ opc_len = 2
98
+ L = _extract(opc, 8, 6, 2)
99
+ M = _extract(opc, 8, 3, 3) + 3
100
+
101
+ if src_size <= opc_len + L:
102
+ break
103
+
104
+ D = _extract(opc, 8, 0, 3) << 8 | src.read(1)[0]
105
+
106
+ elif opc in OP_MED_D:
107
+ # "medium distance": This is a minor variant of the "small distance"
108
+ # encoding, where we will now use two extra bytes instead of one to encode
109
+ # the restof the match length and distance. This allows an extra two bits
110
+ # for the match length, and an extra three bits for the match distance. The
111
+ # full structure of the opcode is 101LLMMM DDDDDDMM DDDDDDDD LITERAL.
112
+ opc_len = 3
113
+ L = _extract(opc, 8, 3, 2)
114
+
115
+ if src_size <= opc_len + L:
116
+ break
117
+
118
+ (opc23,) = _H.unpack(src.read(2))
119
+ M = (_extract(opc, 8, 0, 3) << 2 | _extract(opc23, 16, 0, 2)) + 3
120
+ D = _extract(opc23, 16, 2, 14)
121
+
122
+ elif opc in OP_LRG_D:
123
+ # "large distance": This is another variant of the "small distance"
124
+ # encoding, where we will now use two extra bytes to encode the match
125
+ # distance, which allows distances up to 65535 to be represented. The full
126
+ # structure of the opcode is LLMMM111 DDDDDDDD DDDDDDDD LITERAL.
127
+ opc_len = 3
128
+ L = _extract(opc, 8, 6, 2)
129
+ M = _extract(opc, 8, 3, 3) + 3
130
+
131
+ if src_size <= opc_len + L:
132
+ break
133
+
134
+ (D,) = _H.unpack(src.read(2))
135
+
136
+ elif opc in OP_PRE_D:
137
+ # "previous distance": This opcode has the structure LLMMM110, where the
138
+ # length of the literal (0-3 bytes) is encoded by the high 2 bits of the
139
+ # first byte. We first extract the literal length so we know how long
140
+ # the opcode is, then check that the source can hold both this opcode and
141
+ # at least one byte of the next (because any valid input stream must be
142
+ # terminated with an eos token).
143
+ opc_len = 1
144
+ L = _extract(opc, 8, 6, 2)
145
+ M = _extract(opc, 8, 3, 3) + 3
146
+
147
+ if src_size <= opc_len + L:
148
+ break
149
+
150
+ elif opc in OP_SML_M:
151
+ # "small match": This opcode has no literal, and uses the previous match
152
+ # distance (i.e. it encodes only the match length), in a single byte as
153
+ # 1111MMMM.
154
+ opc_len = 1
155
+ L = None
156
+ M = _extract(opc, 8, 0, 4)
157
+
158
+ if src_size <= opc_len:
159
+ break
160
+
161
+ elif opc in OP_LRG_M:
162
+ # "large match": This opcode has no literal, and uses the previous match
163
+ # distance (i.e. it encodes only the match length). It is encoded in two
164
+ # bytes as 11110000 MMMMMMMM. Because matches smaller than 16 bytes can
165
+ # be represented by sml_m, there is an implicit bias of 16 on the match
166
+ # length; the representable values are [16,271].
167
+ opc_len = 2
168
+ L = None
169
+
170
+ if src_size <= opc_len:
171
+ break
172
+
173
+ M = src.read(1)[0] + 16
174
+
175
+ elif opc in OP_SML_L:
176
+ # "small literal": This opcode has no match, and encodes only a literal
177
+ # of length up to 15 bytes. The format is 1110LLLL LITERAL.
178
+ opc_len = 1
179
+ L = _extract(opc, 8, 0, 4)
180
+ M = None
181
+
182
+ elif opc in OP_LRG_L:
183
+ # "large literal": This opcode has no match, and uses the previous match
184
+ # distance (i.e. it encodes only the match length). It is encoded in two
185
+ # bytes as 11100000 LLLLLLLL LITERAL. Because literals smaller than 16
186
+ # bytes can be represented by sml_l, there is an implicit bias of 16 on
187
+ # the literal length; the representable values are [16,271].
188
+ opc_len = 2
189
+
190
+ if src_size <= opc_len:
191
+ break
192
+
193
+ L = src.read(1)[0] + 16
194
+ M = None
195
+
196
+ elif opc in OP_NOP:
197
+ opc_len = 1
198
+ L = None
199
+ M = None
200
+
201
+ if src_size <= opc_len:
202
+ break
203
+
204
+ elif opc in OP_EOS:
205
+ opc_len = 8
206
+
207
+ if src_size < opc_len:
208
+ break
209
+
210
+ src_size -= opc_len + L
211
+ break
212
+
213
+ elif opc in OP_UDEF:
214
+ raise ValueError("Undefined opcode")
215
+
216
+ # Update remaining source size
217
+ src_size -= opc_len
218
+
219
+ # Copy literal
220
+ if L is not None:
221
+ src_size -= L
222
+ dst += src.read(L)
223
+
224
+ # Match
225
+ if M is not None:
226
+ if len(dst) < D or D == 0:
227
+ raise ValueError("Invalid match distance")
228
+
229
+ remaining = M
230
+ while remaining > 0:
231
+ match_size = min(remaining, D)
232
+ dst += dst[-D : (-D + match_size) or None]
233
+ remaining -= match_size
234
+
235
+ return bytes(dst)
236
+
237
+
238
+ def _extract(container: int, container_width: int, lsb: int, width: int) -> int:
239
+ if width == container_width:
240
+ return container
241
+ return (container >> lsb) & ((1 << width) - 1)
@@ -0,0 +1,80 @@
1
+ # Reference: [MS-XCA]
2
+ from __future__ import annotations
3
+
4
+ import io
5
+ import struct
6
+ from typing import BinaryIO
7
+
8
+
9
+ def decompress(src: bytes | BinaryIO) -> bytes:
10
+ """LZXPRESS decompress from a file-like object or bytes.
11
+
12
+ Args:
13
+ src: File-like object or bytes to decompress.
14
+
15
+ Returns:
16
+ The decompressed data.
17
+ """
18
+ if not hasattr(src, "read"):
19
+ src = io.BytesIO(src)
20
+
21
+ offset = src.tell()
22
+ src.seek(0, io.SEEK_END)
23
+ size = src.tell() - offset
24
+ src.seek(offset)
25
+
26
+ dst = bytearray()
27
+
28
+ buffered_flags = 0
29
+ buffered_flags_count = 0
30
+ last_length_half_byte = 0
31
+
32
+ while src.tell() - offset < size:
33
+ if buffered_flags_count == 0:
34
+ buffered_flags = struct.unpack("<I", src.read(4))[0]
35
+ buffered_flags_count = 32
36
+
37
+ buffered_flags_count -= 1
38
+ if buffered_flags & (1 << buffered_flags_count) == 0:
39
+ dst.append(ord(src.read(1)))
40
+ else:
41
+ if src.tell() - offset == size:
42
+ break
43
+
44
+ match = struct.unpack("<H", src.read(2))[0]
45
+ match_offset, match_length = divmod(match, 8)
46
+ match_offset += 1
47
+
48
+ if match_length == 7:
49
+ if last_length_half_byte == 0:
50
+ last_length_half_byte = src.tell()
51
+ match_length = ord(src.read(1)) % 16
52
+ else:
53
+ rewind = src.tell()
54
+ src.seek(last_length_half_byte)
55
+ match_length = ord(src.read(1)) // 16
56
+ src.seek(rewind)
57
+ last_length_half_byte = 0
58
+
59
+ if match_length == 15:
60
+ match_length = ord(src.read(1))
61
+ if match_length == 255:
62
+ match_length = struct.unpack("<H", src.read(2))[0]
63
+ if match_length == 0:
64
+ match_length = struct.unpack("<I", src.read(4))[0]
65
+
66
+ if match_length < 15 + 7:
67
+ raise ValueError("wrong match length")
68
+
69
+ match_length -= 15 + 7
70
+ match_length += 15
71
+ match_length += 7
72
+ match_length += 3
73
+
74
+ remaining = match_length
75
+ while remaining > 0:
76
+ match_size = min(remaining, match_offset)
77
+ dst += dst[-match_offset : (-match_offset + match_size) or None]
78
+ remaining -= match_size
79
+
80
+ return bytes(dst)
# LZXPRESS Huffman ([MS-XCA] "LZ77 + Huffman") decompression.
#
# https://docs.microsoft.com/en-us/openspecs/windows_protocols/ms-frs2/8cb5bae9-edf3-4833-9f0a-9d7e24218d3d
# https://winprotocoldoc.blob.core.windows.net/productionwindowsarchives/MS-XCA/[MS-XCA].pdf
from __future__ import annotations

import io
import struct
from typing import BinaryIO, NamedTuple


class Symbol(NamedTuple):
    # One entry of the Huffman code table: the code length in bits and the
    # symbol value it encodes. Because ``length`` is the first field, sorting
    # a list of Symbols orders them by (length, symbol) — the canonical
    # Huffman ordering that _build_tree relies on.
    length: int
    symbol: int


def _read_16_bit(fh: BinaryIO) -> int:
    # Read a little-endian uint16. A short read at EOF is left-padded with
    # zero bytes instead of raising.
    # NOTE(review): rjust pads on the left, which places a single available
    # byte in the most significant position — presumably only hit on trailing
    # padding; confirm against the reference implementation.
    return struct.unpack("<H", fh.read(2).rjust(2, b"\x00"))[0]


class Node:
    """A node of the Huffman decode tree: either an inner node with two
    children or a leaf carrying a decoded symbol value."""

    __slots__ = ("children", "is_leaf", "symbol")

    def __init__(self, symbol: Symbol | None = None, is_leaf: bool = False):
        self.symbol = symbol
        self.is_leaf = is_leaf
        self.children = [None, None]


def _add_leaf(nodes: list[Node], idx: int, mask: int, bits: int) -> int:
    """Insert leaf ``nodes[idx]`` at the position addressed by the ``bits``
    most significant bits of ``mask``, creating inner nodes from the pool as
    needed. Returns the index of the next free node in the pool."""
    node = nodes[0]
    i = idx + 1

    # Walk down the tree, one code bit at a time (MSB first), materializing
    # missing inner nodes from the preallocated pool.
    while bits > 1:
        bits -= 1
        childidx = (mask >> bits) & 1
        if node.children[childidx] is None:
            node.children[childidx] = nodes[i]
            nodes[i].is_leaf = False
            i += 1
        node = node.children[childidx]

    # The last code bit selects where the leaf itself is attached.
    node.children[mask & 1] = nodes[idx]
    return i


def _build_tree(buf: bytes) -> Node:
    """Build the Huffman decode tree from a 256-byte code-length table.

    Each of the 256 table bytes packs two 4-bit code lengths, giving the
    lengths for all 512 symbols (0-255 literals, 256-511 match tokens).
    """
    if len(buf) != 256:
        raise ValueError("Not enough data for Huffman code tree")

    # Preallocated node pool: 512 leaves plus room for inner nodes.
    nodes = [Node() for _ in range(1024)]
    symbols: list[Symbol] = []

    # Symbol 2*i uses the low nibble of byte i, symbol 2*i+1 the high nibble.
    for i, c in enumerate(buf):
        symbols.append(Symbol(c & 0x0F, i * 2))
        symbols.append(Symbol((c >> 4) & 0x0F, i * 2 + 1))

    # Canonical Huffman order: ascending (code length, symbol value).
    symbols = sorted(symbols)

    # Skip symbols with length 0 — they do not occur in the stream.
    symbol_index_start = 0
    for s in symbols:
        if s.length > 0:
            break
        symbol_index_start += 1

    mask = 0
    bits = 1

    root = nodes[0]

    # Assign canonical codes: each next symbol's code is the previous code
    # plus one, left-shifted whenever the code length grows.
    tree_index = 1
    for symbol_index in range(symbol_index_start, 512):
        s = symbols[symbol_index]

        node = nodes[tree_index]
        node.symbol = s.symbol
        node.is_leaf = True

        # Shift by the growth in code length (s.length - bits bits).
        mask = (mask << s.length - bits) & 0xFFFFFFFF
        bits = s.length

        tree_index = _add_leaf(nodes, tree_index, mask, bits)
        mask += 1

    return root


class BitString:
    """A 32-bit big-endian-style bit window over a byte stream, refilled
    16 bits at a time, as prescribed by [MS-XCA]."""

    def __init__(self):
        # Underlying stream; set by init().
        self.source = None
        # Current 32-bit lookahead window (most significant bit first).
        self.mask = 0
        # Number of valid bits currently in the window.
        self.bits = 0

    @property
    def index(self) -> int:
        # Current byte position in the underlying stream.
        return self.source.tell()

    def init(self, fh: BinaryIO) -> None:
        # Prime the window with two little-endian 16-bit reads.
        self.mask = (_read_16_bit(fh) << 16) + _read_16_bit(fh)
        self.bits = 32
        self.source = fh

    def read(self, n: int) -> bytes:
        # Raw (byte-aligned) read from the underlying stream, bypassing the
        # bit window — used for the extra length bytes.
        return self.source.read(n)

    def lookup(self, n: int) -> int:
        """Peek at the next ``n`` bits without consuming them."""
        if n == 0:
            return 0

        return self.mask >> (32 - n)

    def skip(self, n: int) -> None:
        """Consume ``n`` bits, refilling the window with 16 more bits from
        the stream whenever fewer than 16 valid bits remain."""
        self.mask = (self.mask << n) & 0xFFFFFFFF
        self.bits -= n
        if self.bits < 16:
            self.mask += _read_16_bit(self.source) << (16 - self.bits)
            self.bits += 16

    def decode(self, root: Node) -> Symbol:
        """Walk the Huffman tree one bit at a time until a leaf is reached
        and return its symbol value."""
        node = root
        while not node.is_leaf:
            bit = self.lookup(1)
            self.skip(1)
            node = node.children[bit]
        return node.symbol


def decompress(src: bytes | BinaryIO) -> bytes:
    """LZXPRESS decompress from a file-like object or bytes.

    Decompresses until EOF of the input data.

    Args:
        src: File-like object or bytes to decompress.

    Returns:
        The decompressed data.
    """
    if not hasattr(src, "read"):
        src = io.BytesIO(src)

    dst = bytearray()

    start_offset = src.tell()
    src.seek(0, io.SEEK_END)
    size = src.tell() - start_offset
    src.seek(start_offset, io.SEEK_SET)

    bitstring = BitString()

    # The input consists of 64 KiB chunks, each preceded by a fresh 256-byte
    # Huffman code-length table.
    while src.tell() - start_offset < size:
        root = _build_tree(src.read(256))
        bitstring.init(src)

        chunk_size = 0
        while chunk_size < 65536 and src.tell() - start_offset < size:
            symbol = bitstring.decode(root)
            if symbol < 256:
                # Symbols 0-255 are literal bytes.
                dst.append(symbol)
                chunk_size += 1
            else:
                # Symbols 256-511 encode a match: low 4 bits hold the length
                # code, the remaining bits the log2 bucket of the offset.
                symbol -= 256
                length = symbol & 0x0F
                symbol >>= 4

                offset = (1 << symbol) + bitstring.lookup(symbol)

                if length == 15:
                    # Extended length: one extra byte, then possibly a
                    # 16-bit length.
                    length = ord(bitstring.read(1)) + 15

                    if length == 270:
                        length = _read_16_bit(bitstring.source)

                # Consume the offset bits only after the extra length bytes
                # have been read (order mandated by the format).
                bitstring.skip(symbol)

                # Minimum match length is 3.
                length += 3

                # Overlap-safe match copy in chunks of at most ``offset``.
                remaining = length
                while remaining > 0:
                    match_size = min(remaining, offset)
                    dst += dst[-offset : (-offset + match_size) or None]
                    remaining -= match_size

                chunk_size += length

    return bytes(dst)
@@ -0,0 +1,77 @@
1
+ from __future__ import annotations
2
+
3
+ from io import BytesIO
4
+ from typing import BinaryIO
5
+
6
+
7
+ def compress(src: bytes | BinaryIO) -> bytes:
8
+ """Sevenbit compress from a file-like object or bytes.
9
+
10
+ Args:
11
+ src: File-like object or bytes to compress.
12
+
13
+ Returns:
14
+ The compressed data.
15
+ """
16
+ if not hasattr(src, "read"):
17
+ src = BytesIO(src)
18
+
19
+ dst = bytearray()
20
+
21
+ val = 0
22
+ shift = 0
23
+ while True:
24
+ _byte = src.read(1)
25
+ if not len(_byte):
26
+ break
27
+
28
+ val |= (_byte[0] & 0x7F) << shift
29
+ shift += 7
30
+
31
+ if shift >= 8:
32
+ dst.append(val & 0xFF)
33
+ val >>= 8
34
+ shift -= 8
35
+
36
+ if val:
37
+ dst.append(val & 0xFF)
38
+
39
+ return bytes(dst)
40
+
41
+
42
+ def decompress(src: bytes | BinaryIO, wide: bool = False) -> bytes:
43
+ """Sevenbit decompress from a file-like object or bytes.
44
+
45
+ Args:
46
+ src: File-like object or bytes to decompress.
47
+
48
+ Returns:
49
+ The decompressed data.
50
+ """
51
+ if not hasattr(src, "read"):
52
+ src = BytesIO(src)
53
+
54
+ dst = bytearray()
55
+
56
+ val = 0
57
+ shift = 0
58
+ while True:
59
+ _byte = src.read(1)
60
+ if not len(_byte):
61
+ break
62
+
63
+ val |= _byte[0] << shift
64
+ dst.append(val & 0x7F)
65
+ if wide:
66
+ dst.append(0)
67
+
68
+ val >>= 7
69
+ shift += 1
70
+ if shift == 7:
71
+ dst.append(val & 0x7F)
72
+ if wide:
73
+ dst.append(0)
74
+ val >>= 7
75
+ shift = 0
76
+
77
+ return bytes(dst)
import io
from binascii import crc32
from typing import BinaryIO

from dissect.util.stream import OverlayStream

# Size of the XZ stream header and stream footer, both 12 bytes.
HEADER_FOOTER_SIZE = 12
# Size of a CRC32 checksum field.
CRC_SIZE = 4


def repair_checksum(fh: BinaryIO) -> BinaryIO:
    """Repair CRC32 checksums for all headers in an XZ stream.

    FortiOS XZ files have (on purpose) corrupt streams which they read using a modified ``xz`` binary.
    The only thing changed are the CRC32 checksums, so partially parse the XZ file and fix all of them.

    The original file is never modified: all recomputed checksums are layered
    on top of it with an :class:`OverlayStream`.

    References:
        - https://tukaani.org/xz/xz-file-format-1.1.0.txt
        - https://github.com/Rogdham/python-xz

    Args:
        fh: A file-like object of an LZMA stream to repair.

    Returns:
        A file-like object (overlay over ``fh``) with corrected checksums.

    Raises:
        ValueError: If the stream is not an XZ file or its index is malformed.
    """
    file_size = fh.seek(0, io.SEEK_END)
    repaired = OverlayStream(fh, file_size)
    fh.seek(0)

    header = fh.read(HEADER_FOOTER_SIZE)
    # Check header magic
    magic = b"\xfd7zXZ\x00"
    if header[: len(magic)] != magic:
        raise ValueError("Not an XZ file")

    # Add correct header CRC32 (covers the 2 stream flag bytes after the magic)
    repaired.add(fh.tell() - CRC_SIZE, _crc32(header[len(magic) : HEADER_FOOTER_SIZE - CRC_SIZE]))

    footer_offset = fh.seek(-HEADER_FOOTER_SIZE, io.SEEK_END)
    footer = fh.read(HEADER_FOOTER_SIZE)

    # Check footer magic
    footer_magic = b"YZ"
    if footer[HEADER_FOOTER_SIZE - len(footer_magic) : HEADER_FOOTER_SIZE] != footer_magic:
        raise ValueError("Not an XZ file")

    # Add correct footer CRC32 (covers backward size + stream flags;
    # the footer CRC is stored at the START of the footer)
    repaired.add(footer_offset, _crc32(footer[CRC_SIZE : HEADER_FOOTER_SIZE - len(footer_magic)]))

    # "Backward size" is stored as (real_size / 4) - 1 and locates the index
    # immediately preceding the footer.
    backward_size = (int.from_bytes(footer[4:8], "little") + 1) * 4
    fh.seek(-HEADER_FOOTER_SIZE - backward_size, io.SEEK_END)
    index = fh.read(backward_size)

    # Add correct index CRC32 (stored in the last 4 bytes of the index)
    repaired.add(fh.tell() - CRC_SIZE, _crc32(index[:-CRC_SIZE]))

    # Parse the index: indicator byte (index[0]), then the record count as a
    # multibyte integer, then (unpadded size, uncompressed size) per record.
    isize, num_records = _mbi(index[1:])
    index = index[1 + isize : -4]
    records = []
    for _ in range(num_records):
        if not index:
            raise ValueError("Missing index size")

        isize, unpadded_size = _mbi(index)
        if not unpadded_size:
            raise ValueError("Missing index record unpadded size")

        index = index[isize:]
        if not index:
            raise ValueError("Missing index size")

        isize, uncompressed_size = _mbi(index)
        if not uncompressed_size:
            raise ValueError("Missing index record uncompressed size")

        index = index[isize:]
        records.append((unpadded_size, uncompressed_size))

    # Blocks sit between the stream header and the index; each block occupies
    # its unpadded size rounded up to a multiple of 4.
    block_start = file_size - HEADER_FOOTER_SIZE - backward_size
    blocks_len = sum((unpadded_size + 3) & ~3 for unpadded_size, _ in records)
    block_start -= blocks_len

    # Iterate over all blocks and add the correct block header CRC32
    for unpadded_size, _ in records:
        fh.seek(block_start)

        # Block header size is stored as (real_size / 4) - 1 in the first byte.
        block_header = fh.read(1)
        block_header_size = (block_header[0] + 1) * 4
        block_header += fh.read(block_header_size - 1)
        repaired.add(fh.tell() - CRC_SIZE, _crc32(block_header[:-CRC_SIZE]))

        block_start += (unpadded_size + 3) & ~3

    return repaired
+
95
+
96
+ def _mbi(data: bytes) -> tuple[int, int]:
97
+ """Decode a multibyte integer.
98
+
99
+ The encoding is similar to most other "varint" encodings. For each byte, the 7 least significant bits are used for
100
+ the integer value. The most significant bit is used to indicate if the integer continues in the next byte.
101
+ Bytes are ordered in little endian byte order, meaning the least significant byte comes first.
102
+ """
103
+ value = 0
104
+ for size, byte in enumerate(data):
105
+ value |= (byte & 0x7F) << (size * 7)
106
+ if not byte & 0x80:
107
+ return size + 1, value
108
+ raise ValueError("Invalid mbi")
def _crc32(data: bytes) -> bytes:
    """Return the CRC32 checksum of ``data`` as 4 little-endian bytes."""
    return crc32(data).to_bytes(CRC_SIZE, "little")