dissect.util 3.24.dev1__cp310-abi3-manylinux_2_28_s390x.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dissect.util might be problematic. Click here for more details.
- dissect/util/__init__.py +20 -0
- dissect/util/_build.py +17 -0
- dissect/util/_native/__init__.pyi +3 -0
- dissect/util/_native/compression/__init__.pyi +3 -0
- dissect/util/_native/compression/lz4.pyi +7 -0
- dissect/util/_native/compression/lzo.pyi +3 -0
- dissect/util/_native/hash/__init__.py +3 -0
- dissect/util/_native/hash/crc32c.py +2 -0
- dissect/util/_native.abi3.so +0 -0
- dissect/util/compression/__init__.py +45 -0
- dissect/util/compression/lz4.py +95 -0
- dissect/util/compression/lzbitmap.py +130 -0
- dissect/util/compression/lzfse.py +467 -0
- dissect/util/compression/lznt1.py +92 -0
- dissect/util/compression/lzo.py +118 -0
- dissect/util/compression/lzvn.py +241 -0
- dissect/util/compression/lzxpress.py +80 -0
- dissect/util/compression/lzxpress_huffman.py +184 -0
- dissect/util/compression/sevenbit.py +77 -0
- dissect/util/compression/xz.py +112 -0
- dissect/util/cpio.py +226 -0
- dissect/util/encoding/__init__.py +0 -0
- dissect/util/encoding/surrogateescape.py +21 -0
- dissect/util/exceptions.py +6 -0
- dissect/util/hash/__init__.py +28 -0
- dissect/util/hash/crc32.py +55 -0
- dissect/util/hash/crc32c.py +60 -0
- dissect/util/hash/jenkins.py +102 -0
- dissect/util/ldap.py +237 -0
- dissect/util/plist.py +156 -0
- dissect/util/sid.py +81 -0
- dissect/util/stream.py +671 -0
- dissect/util/tools/__init__.py +0 -0
- dissect/util/tools/dump_nskeyedarchiver.py +61 -0
- dissect/util/ts.py +295 -0
- dissect/util/xmemoryview.py +117 -0
- dissect_util-3.24.dev1.dist-info/METADATA +89 -0
- dissect_util-3.24.dev1.dist-info/RECORD +43 -0
- dissect_util-3.24.dev1.dist-info/WHEEL +5 -0
- dissect_util-3.24.dev1.dist-info/entry_points.txt +2 -0
- dissect_util-3.24.dev1.dist-info/licenses/COPYRIGHT +5 -0
- dissect_util-3.24.dev1.dist-info/licenses/LICENSE +201 -0
- dissect_util-3.24.dev1.dist-info/top_level.txt +1 -0
dissect/util/stream.py
ADDED
|
@@ -0,0 +1,671 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import io
|
|
4
|
+
import os
|
|
5
|
+
import sys
|
|
6
|
+
import zlib
|
|
7
|
+
from bisect import bisect_left, bisect_right
|
|
8
|
+
from threading import Lock
|
|
9
|
+
from typing import BinaryIO
|
|
10
|
+
|
|
11
|
+
STREAM_BUFFER_SIZE = int(os.getenv("DISSECT_STREAM_BUFFER_SIZE", io.DEFAULT_BUFFER_SIZE))
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class AlignedStream(io.RawIOBase):
    """Basic buffered stream that provides aligned reads.

    Must be subclassed for various stream implementations. Subclasses can implement:
    - :meth:`~AlignedStream._read`
    - :meth:`~AlignedStream._seek`

    The offset and length for ``_read`` are guaranteed to be aligned for streams of a known size.
    If your stream has an unknown size (i.e. ``size == None``), reads of length ``-1`` (i.e. read until EOF) will be
    passed through to your implementation of ``_read``.
    The only time that overriding ``_seek`` would make sense is if there's no known size of your stream,
    but still want to provide ``SEEK_END`` functionality.

    Most subclasses of ``AlignedStream`` take one or more file-like objects as source.
    Operations on these subclasses, like reading, will modify the source file-like object as a side effect.

    Args:
        size: The size of the stream. This is used in read and seek operations. ``None`` if unknown.
        align: The alignment size. Read operations are aligned on this boundary. Also determines buffer size.

    .. automethod:: _read
    .. automethod:: _seek
    """

    def __init__(self, size: int | None = None, align: int = STREAM_BUFFER_SIZE):
        super().__init__()
        self.size = size
        self.align = align

        # _pos is the absolute stream position; _pos_align is _pos rounded down to the
        # nearest alignment boundary (the offset the internal buffer was filled from).
        self._pos = 0
        self._pos_align = 0

        # Holds one block of up to ``align`` bytes read at ``_pos_align``, or None when invalid.
        self._buf = None
        # Serializes seek() and read() so position and buffer state stay consistent.
        self._lock = Lock()

    def readable(self) -> bool:
        """Indicate that the stream is readable."""
        return True

    def seekable(self) -> bool:
        """Indicate that the stream is seekable."""
        return True

    def seek(self, pos: int, whence: int = io.SEEK_SET) -> int:
        """Seek the stream to the specified position.

        Returns:
            The new stream position after seeking.
        """
        with self._lock:
            pos = self._seek(pos, whence)
            self._set_pos(pos)

        return pos

    def _seek(self, pos: int, whence: int = io.SEEK_SET) -> int:
        """Calculate and return the new stream position after a seek."""
        if whence == io.SEEK_SET:
            if pos < 0:
                raise ValueError(f"negative seek position {pos}")
        elif whence == io.SEEK_CUR:
            # Relative seeks are clamped at 0; seeking before the start is not an error here.
            pos = max(0, self._pos + pos)
        elif whence == io.SEEK_END:
            # SEEK_END requires a known size; subclasses may override _seek to support it otherwise.
            if self.size is None:
                raise IOError("invalid whence value for stream with no size")
            pos = max(0, self.size + pos)
        else:
            raise IOError("invalid whence value")

        return pos

    def _set_pos(self, pos: int) -> None:
        """Update the position and aligned position within the stream."""
        new_pos_align = pos - (pos % self.align)

        # Crossing an alignment boundary invalidates the buffered block.
        if self._pos_align != new_pos_align:
            self._pos_align = new_pos_align
            self._buf = None

        self._pos = pos

    def tell(self) -> int:
        """Return current stream position."""
        return self._pos

    def _fill_buf(self) -> None:
        """Fill the alignment buffer if we can."""
        if self._buf or (self.size is not None and (self.size <= self._pos or self.size <= self._pos_align)):
            # Don't fill the buffer if:
            # - We already have a buffer
            # - The stream position is at the end (or beyond) the stream size
            return

        self._buf = self._read(self._pos_align, self.align)

    def read(self, n: int = -1) -> bytes:
        """Read and return up to ``n`` bytes, or read to the end of the stream if ``n`` is ``-1``.

        Returns an empty bytes object on EOF.
        """
        if n is not None and n < -1:
            raise ValueError("invalid number of bytes to read")

        r = []
        size = self.size
        align = self.align

        with self._lock:
            if size is None and n == -1:
                # Unknown size and read-to-EOF: drain any buffered bytes, then delegate
                # the unbounded read straight to the subclass ``_read``.
                r = []
                if self._buf:
                    buffer_pos = self._pos - self._pos_align
                    r.append(self._buf[buffer_pos:])
                    self._set_pos(self._pos_align + align)

                r.append(self._read(self._pos_align, -1))

                buf = b"".join(r)
                self._set_pos(self._pos + len(buf))
                return buf

            # If we know the stream size, adjust n
            if size is not None:
                remaining = size - self._pos
                n = remaining if n == -1 else min(n, remaining)

            # Short path for when it turns out we don't need to read anything
            if n == 0 or (size is not None and size <= self._pos):
                return b""

            # Read misaligned start from buffer
            if self._pos != self._pos_align:
                self._fill_buf()

                buffer_pos = self._pos - self._pos_align
                remaining = align - buffer_pos
                buffer_len = min(n, remaining)

                r.append(self._buf[buffer_pos : buffer_pos + buffer_len])

                n -= buffer_len
                self._set_pos(self._pos + buffer_len)

            # Aligned blocks
            if n >= align:
                # Read all whole aligned blocks in one _read call; the remainder (< align)
                # is handled below through the buffer.
                count, n = divmod(n, align)

                read_len = count * align
                r.append(self._read(self._pos, read_len))

                self._set_pos(self._pos + read_len)

            # Misaligned remaining bytes
            if n > 0:
                self._fill_buf()
                r.append(self._buf[:n])
                self._set_pos(self._pos + n)

            return b"".join(r)

    def readinto(self, b: bytearray) -> int:
        """Read bytes into a pre-allocated bytes-like object b.

        Returns an int representing the number of bytes read (0 for EOF).
        """
        # NOTE(review): annotated as bytearray but any writable buffer supporting
        # slice assignment (e.g. memoryview) works here.
        buf = self.read(len(b))
        length = len(buf)
        b[:length] = buf
        return length

    def _read(self, offset: int, length: int) -> bytes:
        """Read method that backs this aligned stream."""
        raise NotImplementedError("_read needs to be implemented by subclass")

    def readall(self) -> bytes:
        """Read until end of stream."""
        return self.read()

    def readoffset(self, offset: int, length: int) -> bytes:
        """Convenience method to read from a given offset.

        Args:
            offset: The offset in the stream to read from.
            length: The number of bytes to read.
        """
        self.seek(offset)
        return self.read(length)

    def peek(self, n: int) -> bytes:
        """Convenience method to peek from the current offset without advancing the stream position.

        Args:
            n: The number of bytes to peek.
        """
        # Restore the position directly (without the lock) after the locked read.
        pos = self._pos
        data = self.read(n)
        self._set_pos(pos)
        return data

    def close(self) -> None:
        """Close the stream. Does nothing by default."""
|
|
217
|
+
class RangeStream(AlignedStream):
    """Expose a sub-range of another file-like object as its own stream.

    ASCII representation::

        Source file-like object
        |................................................|
        RangeStream with offset and size
                 |............................|

    Args:
        fh: The source file-like object.
        offset: The offset the stream should start from on the source file-like object.
        size: The size the stream should be.
        align: The alignment size.
    """

    def __init__(self, fh: BinaryIO, offset: int, size: int | None, align: int = STREAM_BUFFER_SIZE):
        super().__init__(size, align)
        self.offset = offset
        self._fh = fh

    def _seek(self, pos: int, whence: int = io.SEEK_SET) -> int:
        # Without a known size, SEEK_END can only be answered by the underlying fh.
        if whence == io.SEEK_END and self.size is None:
            result = self._fh.seek(pos, whence)
            if result is None:
                # Some file-like objects return None from seek(); fall back to tell().
                result = self._fh.tell()
            return max(0, result - self.offset)
        return super()._seek(pos, whence)

    def _read(self, offset: int, length: int) -> bytes:
        # Clamp the read so it never extends past the configured range size.
        if self.size:
            read_length = min(length, self.size - offset)
        else:
            read_length = length
        self._fh.seek(self.offset + offset)
        return self._fh.read(read_length)
|
+
|
|
252
|
+
class RelativeStream(RangeStream):
    """A :class:`RangeStream` that starts at an offset and runs to the end of the source.

    ASCII representation::

        Source file-like object
        |................................................|
        RelativeStream with offset
                |........................................|

    Args:
        fh: The source file-like object.
        offset: The offset the stream should start from on the source file-like object.
        size: Optional size the stream should be.
        align: The alignment size.
    """

    def __init__(self, fh: BinaryIO, offset: int, size: int | None = None, align: int = STREAM_BUFFER_SIZE):
        # Identical to RangeStream, except ``size`` defaults to None (unknown/unbounded).
        super().__init__(fh, offset, size, align)
|
+
|
|
273
|
+
class BufferedStream(RelativeStream):
    """A :class:`RelativeStream` whose offset defaults to the start of the source.

    Wraps another file-like object to add aligned, buffered reads,
    optionally starting from a specific offset.

    Args:
        fh: The source file-like object.
        offset: The offset the stream should start from.
        size: The size the stream should be.
        align: The alignment size.
    """

    def __init__(self, fh: BinaryIO, offset: int = 0, size: int | None = None, align: int = STREAM_BUFFER_SIZE):
        # Identical to RelativeStream, except ``offset`` defaults to 0.
        super().__init__(fh, offset, size, align)
288
|
+
|
|
289
|
+
class MappingStream(AlignedStream):
    """Create a stream from multiple mapped file-like objects.

    Args:
        size: The size the stream should be.
        align: The alignment size.
    """

    def __init__(self, size: int | None = None, align: int = STREAM_BUFFER_SIZE):
        super().__init__(size, align)
        # Each run is (stream_offset, size, fh, file_offset), kept sorted by stream_offset.
        self._runs: list[tuple[int, int, BinaryIO, int]] = []

    def add(self, offset: int, size: int, fh: BinaryIO, file_offset: int = 0) -> None:
        """Add a file-like object to the stream.

        Args:
            offset: The offset in the stream this fh maps to.
            size: The size that this mapped fh spans in the stream.
            fh: The file-like object to map.
            file_offset: The offset in the fh to start from.

        Note that there is no check on overlapping offsets and/or sizes.
        """
        self._runs.append((offset, size, fh, file_offset))
        self._runs = sorted(self._runs, key=lambda run: run[0])
        # Invalidate the read buffer and extend the stream size to cover the last run.
        self._buf = None
        self.size = self._runs[-1][0] + self._runs[-1][1]

    def _get_run_idx(self, offset: int) -> int:
        """Find the index of the mapping run for a given offset.

        Args:
            offset: The offset to find a mapping for.

        Returns:
            The index into ``self._runs`` of the run containing ``offset``.

        Raises:
            EOFError: If no mapping is found for the given offset.
        """
        # Linear scan over the (sorted) runs; runs may have gaps between them.
        for idx, run in enumerate(self._runs):
            if run[0] <= offset < run[0] + run[1]:
                return idx

        raise EOFError(f"No mapping for offset {offset}")

    def _read(self, offset: int, length: int) -> bytes:
        result = []

        run_idx = self._get_run_idx(offset)
        runlist_len = len(self._runs)
        size = self.size

        while length > 0:
            if run_idx >= runlist_len:
                # We somehow requested more data than we have runs for
                break

            run_offset, run_size, run_fh, run_file_offset = self._runs[run_idx]

            if run_offset > offset:
                # We landed in a gap, stop reading
                break

            # Position within the current run and how much of it is left.
            run_pos = offset - run_offset
            run_remaining = run_size - run_pos

            if run_remaining < 0:
                break

            # Never read past the end of the run, the requested length, or the stream size.
            read_count = min(size - offset, min(run_remaining, length))

            run_fh.seek(run_file_offset + run_pos)
            result.append(run_fh.read(read_count))

            offset += read_count
            length -= read_count
            run_idx += 1

        return b"".join(result)
|
+
|
|
371
|
+
class RunlistStream(AlignedStream):
    """Create a stream from multiple runs on another file-like object.

    This is common in filesystems, where file data information is stored in "runs".
    A run is a ``(block_offset, block_count)`` tuple, meaning the amount of consecutive blocks from a
    specific starting block. A block_offset of ``None`` represents a sparse run, meaning it must simply
    return all ``\\x00`` bytes.

    Args:
        fh: The source file-like object.
        runlist: The runlist for this stream in block units.
        size: The size of the stream. This can be smaller than the total sum of blocks (to account for slack space).
        block_size: The block size in bytes.
        align: Optional alignment that differs from the block size, otherwise ``block_size`` is used as alignment.
    """

    def __init__(
        self, fh: BinaryIO, runlist: list[tuple[int, int]], size: int, block_size: int, align: int | None = None
    ):
        super().__init__(size, align or block_size)

        # Unwrap nested RunlistStreams so reads go straight to the underlying fh.
        if isinstance(fh, RunlistStream):
            self._fh = fh._fh
        else:
            self._fh = fh

        self._runlist = []
        self._runlist_offsets = []

        self.runlist = runlist
        self.block_size = block_size

    @property
    def runlist(self) -> list[tuple[int, int]]:
        return self._runlist

    @runlist.setter
    def runlist(self, runlist: list[tuple[int, int]]) -> None:
        self._runlist = runlist
        self._runlist_offsets = []

        offset = 0
        # Create a list of starting offsets for each run so we can bisect that quickly when reading
        # Note: the first run's starting offset (0) is intentionally not stored, so
        # _runlist_offsets holds the starting block of runs 1..n-1 for bisect_right.
        for _, block_count in self._runlist:
            if offset != 0:
                self._runlist_offsets.append(offset)
            offset += block_count

        # Changing the runlist invalidates any buffered data.
        self._buf = None

    def _read(self, offset: int, length: int) -> bytes:
        r = []

        # Translate the byte offset into a block number to locate the starting run.
        block_offset = offset // self.block_size

        run_idx = bisect_right(self._runlist_offsets, block_offset)
        runlist_len = len(self.runlist)
        size = self.size

        while length > 0:
            if run_idx >= runlist_len:
                # We somehow requested more data than we have runs for
                break

            # If run_idx == 0, we only have a single run
            run_block_pos = 0 if run_idx == 0 else self._runlist_offsets[run_idx - 1]
            run_block_offset, run_block_count = self.runlist[run_idx]

            # Byte-level size of this run, position within it, and bytes left in it.
            run_size = run_block_count * self.block_size
            run_pos = offset - run_block_pos * self.block_size
            run_remaining = run_size - run_pos

            # Sometimes the self.size is way larger than what we actually have runs for?
            # Stop reading if we reach a negative run_remaining
            if run_remaining < 0:
                break

            # Never read past the end of the run, the requested length, or the stream size.
            read_count = min(size - offset, min(run_remaining, length))

            # Sparse run
            if run_block_offset is None:
                r.append(b"\x00" * read_count)
            else:
                self._fh.seek(run_block_offset * self.block_size + run_pos)
                r.append(self._fh.read(read_count))

            offset += read_count
            length -= read_count
            run_idx += 1

        return b"".join(r)
|
+
|
|
464
|
+
class OverlayStream(AlignedStream):
    """Create a stream from another file-like object with the ability to overlay other streams or bytes.

    Useful for patching large file-like objects without having to cache the entire contents.
    First wrap the original stream in this class, and then call ``add()`` with the offset and data to overlay.

    Args:
        fh: The source file-like object.
        size: The size the stream should be.
        align: The alignment size.
    """

    def __init__(self, fh: BinaryIO, size: int | None = None, align: int = STREAM_BUFFER_SIZE):
        super().__init__(size, align)
        self._fh = fh
        # Maps overlay offset -> (size, file-like object with the overlay data).
        self.overlays: dict[int, tuple[int, BinaryIO]] = {}
        # Sorted list of overlay offsets, used to bisect during reads.
        self._lookup: list[int] = []

    def add(self, offset: int, data: bytes | BinaryIO, size: int | None = None) -> OverlayStream | None:
        """Add an overlay at the given offset.

        Args:
            offset: The offset in bytes to add an overlay at.
            data: The bytes or file-like object to overlay.
            size: Optional size specification of the overlay, if it can't be inferred.

        Returns:
            ``self`` (for chaining), or ``None`` when the overlay is empty and nothing was added.

        Raises:
            ValueError: If the size is negative or the overlay overlaps an existing one.
        """
        # Wrap raw bytes in a BytesIO so overlays are uniformly file-like.
        if not hasattr(data, "read"):
            size = size or len(data)
            data = io.BytesIO(data)
        elif size is None:
            # Infer size from a .size attribute or by seeking to the end.
            size = data.size if hasattr(data, "size") else data.seek(0, io.SEEK_END)

        # NOTE(review): empty overlays return None while successful adds return self;
        # callers chaining add() on empty data would get None — confirm intended.
        if not size:
            return None

        if size < 0:
            raise ValueError("Size must be positive")

        # Check if there are overlapping overlays
        for other_offset, (other_size, _) in self.overlays.items():
            if other_offset < offset + size and offset < other_offset + other_size:
                raise ValueError(f"Overlap with existing overlay: ({other_offset, other_size})")

        self.overlays[offset] = (size, data)
        self._lookup.append(offset)
        self._lookup.sort()

        # Clear the buffer if we add an overlay at our current position
        if self._buf and (self._pos_align <= offset + size and offset <= self._pos_align + len(self._buf)):
            self._buf = None

        return self

    def _read(self, offset: int, length: int) -> bytes:
        result = []

        fh = self._fh
        overlays = self.overlays
        lookup = self._lookup

        overlay_len = len(overlays)
        # First overlay whose offset is >= our read offset; the one before it may still
        # cover our offset (we check that below).
        overlay_idx = bisect_left(lookup, offset)

        while length > 0:
            prev_overlay_offset = None if overlay_idx == 0 else lookup[overlay_idx - 1]
            next_overlay_offset = None if overlay_idx >= overlay_len else lookup[overlay_idx]

            if prev_overlay_offset is not None:
                prev_overlay_size, prev_overlay_data = overlays[prev_overlay_offset]
                prev_overlay_end = prev_overlay_offset + prev_overlay_size

                if prev_overlay_end > offset:
                    # Currently in an overlay
                    offset_in_prev_overlay = offset - prev_overlay_offset
                    prev_overlay_remaining = prev_overlay_size - offset_in_prev_overlay
                    prev_overlay_read_size = min(length, prev_overlay_remaining)

                    prev_overlay_data.seek(offset_in_prev_overlay)
                    result.append(prev_overlay_data.read(prev_overlay_read_size))

                    offset += prev_overlay_read_size
                    length -= prev_overlay_read_size

                    if length == 0:
                        break

            if next_overlay_offset:
                next_overlay_size, next_overlay_data = overlays[next_overlay_offset]
                gap_to_next_overlay = next_overlay_offset - offset

                if 0 <= gap_to_next_overlay < length:
                    # Fill the gap between here and the next overlay from the source fh.
                    if gap_to_next_overlay:
                        fh.seek(offset)
                        result.append(fh.read(gap_to_next_overlay))

                    # read remaining from overlay
                    next_overlay_read_size = min(next_overlay_size, length - gap_to_next_overlay)
                    next_overlay_data.seek(0)
                    result.append(next_overlay_data.read(next_overlay_read_size))

                    offset += next_overlay_read_size + gap_to_next_overlay
                    length -= next_overlay_read_size + gap_to_next_overlay
                else:
                    # Next overlay is too far away, complete read
                    fh.seek(offset)
                    result.append(fh.read(length))
                    break
            else:
                # No next overlay, complete read
                fh.seek(offset)
                result.append(fh.read(length))
                break

            overlay_idx += 1

        return b"".join(result)
|
+
|
|
582
|
+
class ZlibStream(AlignedStream):
    """Create a zlib stream from another file-like object.

    Basically the same as ``gzip.GzipFile`` but for raw zlib streams.
    Due to the nature of zlib streams, seeking backwards requires resetting the decompression context.

    Args:
        fh: The source file-like object.
        size: The size the stream should be.
    """

    def __init__(self, fh: BinaryIO, size: int | None = None, align: int = STREAM_BUFFER_SIZE, **kwargs):
        self._fh = fh

        # Decompression state: the decompressobj, how many decompressed bytes have been
        # produced so far, and any compressed input zlib did not consume yet (the
        # "prepend" buffer, replayed by _read_fh before reading more from fh).
        self._zlib = None
        self._zlib_args = kwargs
        self._zlib_offset = 0
        self._zlib_prepend = b""
        self._zlib_prepend_offset = None
        self._rewind()

        super().__init__(size, align)

    def _rewind(self) -> None:
        """Reset the source fh and the decompression context to the start of the stream."""
        self._fh.seek(0)
        self._zlib = zlib.decompressobj(**self._zlib_args)
        self._zlib_offset = 0
        self._zlib_prepend = b""
        self._zlib_prepend_offset = None

    def _seek_zlib(self, offset: int) -> None:
        """Position the decompressor at ``offset`` (decompressed bytes).

        Seeking backwards restarts decompression from the beginning.
        """
        if offset < self._zlib_offset:
            self._rewind()

        # Decompress and discard until the target offset (or EOF) is reached.
        while self._zlib_offset < offset:
            read_size = min(offset - self._zlib_offset, self.align)
            if self._read_zlib(read_size) == b"":
                break

    def _read_fh(self, length: int) -> bytes:
        """Read compressed input, first replaying unconsumed tail bytes, then from the fh."""
        if self._zlib_prepend_offset is None:
            return self._fh.read(length)

        if self._zlib_prepend_offset + length <= len(self._zlib_prepend):
            # Request is fully satisfied by the prepend buffer.
            offset = self._zlib_prepend_offset
            self._zlib_prepend_offset += length
            return self._zlib_prepend[offset : self._zlib_prepend_offset]

        # Exhaust the prepend buffer and top up from the fh.
        offset = self._zlib_prepend_offset
        self._zlib_prepend_offset = None
        return self._zlib_prepend[offset:] + self._fh.read(length - len(self._zlib_prepend) + offset)

    def _read_zlib(self, length: int) -> bytes:
        """Decompress and return up to ``length`` bytes, advancing ``_zlib_offset``."""
        if length < 0:
            return self.readall()

        result = []
        while length > 0:
            buf = self._read_fh(io.DEFAULT_BUFFER_SIZE)
            # max_length=length caps the output; leftover input lands in unconsumed_tail.
            decompressed = self._zlib.decompress(buf, length)

            if self._zlib.unconsumed_tail != b"":
                # Save unconsumed input so the next _read_fh call replays it first.
                self._zlib_prepend = self._zlib.unconsumed_tail
                self._zlib_prepend_offset = 0

            if buf == b"":
                break

            result.append(decompressed)
            length -= len(decompressed)

        buf = b"".join(result)
        self._zlib_offset += len(buf)
        return buf

    def _read(self, offset: int, length: int) -> bytes:
        self._seek_zlib(offset)
        return self._read_zlib(length)

    def readall(self) -> bytes:
        self._seek_zlib(self.tell())

        chunks = []
        # sys.maxsize means the max length of output buffer is unlimited,
        # so that the whole input buffer can be decompressed within one
        # .decompress() call.
        while data := self._read_zlib(sys.maxsize):
            chunks.append(data)

        return b"".join(chunks)
|
|
File without changes
|