zipremove 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
zipremove/__init__.py
ADDED
|
@@ -0,0 +1,647 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
import io
|
|
3
|
+
import os
|
|
4
|
+
import struct
|
|
5
|
+
from zipfile import *
|
|
6
|
+
from zipfile import ( # noqa: F401
|
|
7
|
+
_DD_SIGNATURE,
|
|
8
|
+
_FH_COMPRESSED_SIZE,
|
|
9
|
+
_FH_COMPRESSION_METHOD,
|
|
10
|
+
_FH_CRC,
|
|
11
|
+
_FH_EXTRA_FIELD_LENGTH,
|
|
12
|
+
_FH_FILENAME_LENGTH,
|
|
13
|
+
_FH_GENERAL_PURPOSE_FLAG_BITS,
|
|
14
|
+
_FH_SIGNATURE,
|
|
15
|
+
_FH_UNCOMPRESSED_SIZE,
|
|
16
|
+
LZMADecompressor,
|
|
17
|
+
_get_compressor,
|
|
18
|
+
_get_decompressor,
|
|
19
|
+
crc32,
|
|
20
|
+
sizeFileHeader,
|
|
21
|
+
stringFileHeader,
|
|
22
|
+
structFileHeader,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
# polyfills
|
|
26
|
+
try:
|
|
27
|
+
ZIP_ZSTANDARD
|
|
28
|
+
except NameError:
|
|
29
|
+
# polyfill for Python < 3.14
|
|
30
|
+
ZIP_ZSTANDARD = 93
|
|
31
|
+
|
|
32
|
+
try:
|
|
33
|
+
from zipfile import _MASK_USE_DATA_DESCRIPTOR
|
|
34
|
+
except ImportError:
|
|
35
|
+
# polyfill for Python < 3.11
|
|
36
|
+
_MASK_USE_DATA_DESCRIPTOR = 1 << 3
|
|
37
|
+
|
|
38
|
+
try:
|
|
39
|
+
from zipfile import _sanitize_filename
|
|
40
|
+
except ImportError:
|
|
41
|
+
# polyfill for Python < 3.11
|
|
42
|
+
def _sanitize_filename(filename):
|
|
43
|
+
null_byte = filename.find(chr(0))
|
|
44
|
+
if null_byte >= 0:
|
|
45
|
+
filename = filename[0:null_byte]
|
|
46
|
+
if os.sep != "/" and os.sep in filename:
|
|
47
|
+
filename = filename.replace(os.sep, "/")
|
|
48
|
+
if os.altsep and os.altsep != "/" and os.altsep in filename:
|
|
49
|
+
filename = filename.replace(os.altsep, "/")
|
|
50
|
+
return filename
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class _ZipRepacker:
|
|
54
|
+
"""Class for ZipFile repacking."""
|
|
55
|
+
def __init__(self, *, strict_descriptor=False, chunk_size=2**20, debug=0):
|
|
56
|
+
self.debug = debug # Level of printing: 0 through 3
|
|
57
|
+
self.chunk_size = chunk_size
|
|
58
|
+
self.strict_descriptor = strict_descriptor
|
|
59
|
+
|
|
60
|
+
def _debug(self, level, *msg):
|
|
61
|
+
if self.debug >= level:
|
|
62
|
+
print(*msg)
|
|
63
|
+
|
|
64
|
+
def copy(self, zfile, zinfo, filename):
|
|
65
|
+
# make a copy of zinfo
|
|
66
|
+
zinfo2 = copy.deepcopy(zinfo)
|
|
67
|
+
|
|
68
|
+
# apply sanitized new filename as in `ZipInfo.__init__`
|
|
69
|
+
zinfo2.orig_filename = filename
|
|
70
|
+
zinfo2.filename = _sanitize_filename(filename)
|
|
71
|
+
|
|
72
|
+
zinfo2.header_offset = zfile.start_dir
|
|
73
|
+
|
|
74
|
+
# polyfill: update zinfo2._end_offset if exists
|
|
75
|
+
# (Python >= 3.8 with fix #109858)
|
|
76
|
+
if hasattr(zinfo2, '_end_offset'):
|
|
77
|
+
zinfo2._end_offset = None
|
|
78
|
+
|
|
79
|
+
# write to a new local file header
|
|
80
|
+
fp = zfile.fp
|
|
81
|
+
sizes = self._calc_local_file_entry_size(fp, zinfo)
|
|
82
|
+
fp.seek(zinfo2.header_offset)
|
|
83
|
+
fp.write(zinfo2.FileHeader())
|
|
84
|
+
self._copy_bytes(fp, zinfo.header_offset + sum(sizes[:3]), fp.tell(), sum(sizes[3:]))
|
|
85
|
+
zfile.start_dir = fp.tell()
|
|
86
|
+
|
|
87
|
+
# add to filelist
|
|
88
|
+
zfile.filelist.append(zinfo2)
|
|
89
|
+
zfile.NameToInfo[zinfo2.filename] = zinfo2
|
|
90
|
+
|
|
91
|
+
zfile._didModify = True
|
|
92
|
+
|
|
93
|
+
def repack(self, zfile, removed=None):
|
|
94
|
+
"""
|
|
95
|
+
Repack the ZIP file, stripping unreferenced local file entries.
|
|
96
|
+
|
|
97
|
+
Assumes that local file entries are stored consecutively, with no gaps
|
|
98
|
+
or overlaps.
|
|
99
|
+
|
|
100
|
+
Behavior:
|
|
101
|
+
|
|
102
|
+
1. If any referenced entry overlaps with another, a `BadZipFile` error
|
|
103
|
+
is raised since safe repacking cannot be guaranteed.
|
|
104
|
+
|
|
105
|
+
2. Data before the first referenced entry is stripped only when it
|
|
106
|
+
appears to be a sequence of consecutive entries with no extra
|
|
107
|
+
following bytes; extra preceeding bytes are preserved.
|
|
108
|
+
|
|
109
|
+
3. Data between referenced entries is stripped only when it appears to
|
|
110
|
+
be a sequence of consecutive entries with no extra preceding bytes;
|
|
111
|
+
extra following bytes are preserved.
|
|
112
|
+
|
|
113
|
+
4. This is to prevent an unexpected data removal (false positive),
|
|
114
|
+
though a false negative may happen in certain rare cases.
|
|
115
|
+
|
|
116
|
+
Examples:
|
|
117
|
+
|
|
118
|
+
Stripping before the first referenced entry:
|
|
119
|
+
|
|
120
|
+
[random bytes]
|
|
121
|
+
[unreferenced local file entry]
|
|
122
|
+
[random bytes]
|
|
123
|
+
<-- stripping start
|
|
124
|
+
[unreferenced local file entry]
|
|
125
|
+
[unreferenced local file entry]
|
|
126
|
+
<-- stripping end
|
|
127
|
+
[local file entry 1] (or central directory)
|
|
128
|
+
...
|
|
129
|
+
|
|
130
|
+
Stripping between referenced entries:
|
|
131
|
+
|
|
132
|
+
...
|
|
133
|
+
[local file entry]
|
|
134
|
+
<-- stripping start
|
|
135
|
+
[unreferenced local file entry]
|
|
136
|
+
[unreferenced local file entry]
|
|
137
|
+
<-- stripping end
|
|
138
|
+
[random bytes]
|
|
139
|
+
[unreferenced local file entry]
|
|
140
|
+
[random bytes]
|
|
141
|
+
[local file entry] (or central directory)
|
|
142
|
+
...
|
|
143
|
+
|
|
144
|
+
No stripping:
|
|
145
|
+
|
|
146
|
+
[unreferenced local file entry]
|
|
147
|
+
[random bytes]
|
|
148
|
+
[local file entry 1] (or central directory)
|
|
149
|
+
...
|
|
150
|
+
|
|
151
|
+
No stripping:
|
|
152
|
+
|
|
153
|
+
...
|
|
154
|
+
[local file entry]
|
|
155
|
+
[random bytes]
|
|
156
|
+
[unreferenced local file entry]
|
|
157
|
+
[local file entry] (or central directory)
|
|
158
|
+
...
|
|
159
|
+
|
|
160
|
+
Side effects:
|
|
161
|
+
- Modifies the ZIP file in place.
|
|
162
|
+
- Updates zfile.start_dir to account for removed data.
|
|
163
|
+
- Sets zfile._didModify to True.
|
|
164
|
+
- Updates header_offset and _end_offset of referenced ZipInfo
|
|
165
|
+
instances.
|
|
166
|
+
|
|
167
|
+
Parameters:
|
|
168
|
+
zfile: A ZipFile object representing the archive to repack.
|
|
169
|
+
removed: Optional. A sequence of ZipInfo instances representing
|
|
170
|
+
the previously removed entries. When provided, only their
|
|
171
|
+
corresponding local file entries are stripped.
|
|
172
|
+
"""
|
|
173
|
+
removed_zinfos = set(removed or ())
|
|
174
|
+
|
|
175
|
+
fp = zfile.fp
|
|
176
|
+
|
|
177
|
+
# get a sorted filelist by header offset, in case the dir order
|
|
178
|
+
# doesn't match the actual entry order
|
|
179
|
+
filelist = (*zfile.filelist, *removed_zinfos)
|
|
180
|
+
filelist = sorted(filelist, key=lambda x: x.header_offset)
|
|
181
|
+
|
|
182
|
+
# calculate each entry size and validate
|
|
183
|
+
entry_size_list = []
|
|
184
|
+
used_entry_size_list = []
|
|
185
|
+
for i, zinfo in enumerate(filelist):
|
|
186
|
+
try:
|
|
187
|
+
offset = filelist[i + 1].header_offset
|
|
188
|
+
except IndexError:
|
|
189
|
+
offset = zfile.start_dir
|
|
190
|
+
entry_size = offset - zinfo.header_offset
|
|
191
|
+
|
|
192
|
+
# may raise on an invalid local file header
|
|
193
|
+
used_entry_size = sum(self._calc_local_file_entry_size(fp, zinfo))
|
|
194
|
+
|
|
195
|
+
self._debug(3, i, zinfo.orig_filename, zinfo.header_offset, entry_size, used_entry_size)
|
|
196
|
+
if used_entry_size > entry_size:
|
|
197
|
+
raise BadZipFile(
|
|
198
|
+
f"Overlapped entries: {zinfo.orig_filename!r} ")
|
|
199
|
+
|
|
200
|
+
if removed is not None and zinfo not in removed_zinfos:
|
|
201
|
+
used_entry_size = entry_size
|
|
202
|
+
|
|
203
|
+
entry_size_list.append(entry_size)
|
|
204
|
+
used_entry_size_list.append(used_entry_size)
|
|
205
|
+
|
|
206
|
+
# calculate the starting entry offset (bytes to skip)
|
|
207
|
+
if removed is None:
|
|
208
|
+
try:
|
|
209
|
+
offset = filelist[0].header_offset
|
|
210
|
+
except IndexError:
|
|
211
|
+
offset = zfile.start_dir
|
|
212
|
+
entry_offset = self._calc_initial_entry_offset(fp, offset)
|
|
213
|
+
else:
|
|
214
|
+
entry_offset = 0
|
|
215
|
+
|
|
216
|
+
# move file entries
|
|
217
|
+
for i, zinfo in enumerate(filelist):
|
|
218
|
+
entry_size = entry_size_list[i]
|
|
219
|
+
used_entry_size = used_entry_size_list[i]
|
|
220
|
+
|
|
221
|
+
# update the header and move entry data to the new position
|
|
222
|
+
old_header_offset = zinfo.header_offset
|
|
223
|
+
zinfo.header_offset -= entry_offset
|
|
224
|
+
|
|
225
|
+
if zinfo in removed_zinfos:
|
|
226
|
+
self._copy_bytes(
|
|
227
|
+
fp,
|
|
228
|
+
old_header_offset + used_entry_size,
|
|
229
|
+
zinfo.header_offset,
|
|
230
|
+
entry_size - used_entry_size
|
|
231
|
+
)
|
|
232
|
+
|
|
233
|
+
# update entry_offset for subsequent files to follow
|
|
234
|
+
entry_offset += used_entry_size
|
|
235
|
+
|
|
236
|
+
else:
|
|
237
|
+
if entry_offset > 0:
|
|
238
|
+
self._copy_bytes(fp, old_header_offset, zinfo.header_offset, used_entry_size)
|
|
239
|
+
|
|
240
|
+
if used_entry_size < entry_size:
|
|
241
|
+
stale_entry_size = self._validate_local_file_entry_sequence(
|
|
242
|
+
fp,
|
|
243
|
+
old_header_offset + used_entry_size,
|
|
244
|
+
old_header_offset + entry_size,
|
|
245
|
+
)
|
|
246
|
+
else:
|
|
247
|
+
stale_entry_size = 0
|
|
248
|
+
|
|
249
|
+
if stale_entry_size > 0:
|
|
250
|
+
self._copy_bytes(
|
|
251
|
+
fp,
|
|
252
|
+
old_header_offset + used_entry_size + stale_entry_size,
|
|
253
|
+
zinfo.header_offset + used_entry_size,
|
|
254
|
+
entry_size - used_entry_size - stale_entry_size,
|
|
255
|
+
)
|
|
256
|
+
|
|
257
|
+
# update entry_offset for subsequent files to follow
|
|
258
|
+
entry_offset += stale_entry_size
|
|
259
|
+
|
|
260
|
+
# update state
|
|
261
|
+
zfile.start_dir -= entry_offset
|
|
262
|
+
zfile._didModify = True
|
|
263
|
+
|
|
264
|
+
# polyfill: update ZipInfo._end_offset if exists
|
|
265
|
+
# (Python >= 3.8 with fix #109858)
|
|
266
|
+
if hasattr(ZipInfo, '_end_offset'):
|
|
267
|
+
end_offset = zfile.start_dir
|
|
268
|
+
for zinfo in reversed(filelist):
|
|
269
|
+
if zinfo in removed_zinfos:
|
|
270
|
+
zinfo._end_offset = None
|
|
271
|
+
else:
|
|
272
|
+
if zinfo._end_offset is not None:
|
|
273
|
+
zinfo._end_offset = end_offset
|
|
274
|
+
end_offset = zinfo.header_offset
|
|
275
|
+
|
|
276
|
+
def _calc_initial_entry_offset(self, fp, data_offset):
|
|
277
|
+
checked_offsets = {}
|
|
278
|
+
if data_offset > 0:
|
|
279
|
+
self._debug(3, 'scanning file signatures before:', data_offset)
|
|
280
|
+
for pos in self._iter_scan_signature(fp, stringFileHeader, 0, data_offset):
|
|
281
|
+
self._debug(3, 'checking file signature at:', pos)
|
|
282
|
+
entry_size = self._validate_local_file_entry_sequence(fp, pos, data_offset, checked_offsets)
|
|
283
|
+
if entry_size == data_offset - pos:
|
|
284
|
+
return entry_size
|
|
285
|
+
return 0
|
|
286
|
+
|
|
287
|
+
def _iter_scan_signature(self, fp, signature, start_offset, end_offset, chunk_size=4096):
|
|
288
|
+
sig_len = len(signature)
|
|
289
|
+
remainder = b''
|
|
290
|
+
pos = start_offset
|
|
291
|
+
|
|
292
|
+
while pos < end_offset:
|
|
293
|
+
# required for each loop since fp may be changed during each yield
|
|
294
|
+
fp.seek(pos)
|
|
295
|
+
|
|
296
|
+
chunk = remainder + fp.read(min(chunk_size, end_offset - pos))
|
|
297
|
+
|
|
298
|
+
delta = pos - len(remainder)
|
|
299
|
+
idx = 0
|
|
300
|
+
while True:
|
|
301
|
+
idx = chunk.find(signature, idx)
|
|
302
|
+
if idx == -1:
|
|
303
|
+
break
|
|
304
|
+
|
|
305
|
+
yield delta + idx
|
|
306
|
+
idx += 1
|
|
307
|
+
|
|
308
|
+
remainder = chunk[-(sig_len - 1):]
|
|
309
|
+
pos += chunk_size
|
|
310
|
+
|
|
311
|
+
def _validate_local_file_entry_sequence(self, fp, start_offset, end_offset, checked_offsets=None):
|
|
312
|
+
offset = start_offset
|
|
313
|
+
|
|
314
|
+
while offset < end_offset:
|
|
315
|
+
self._debug(3, 'checking local file entry at:', offset)
|
|
316
|
+
|
|
317
|
+
# Cache checked offsets to improve performance.
|
|
318
|
+
try:
|
|
319
|
+
entry_size = checked_offsets[offset]
|
|
320
|
+
except (KeyError, TypeError):
|
|
321
|
+
entry_size = self._validate_local_file_entry(fp, offset, end_offset)
|
|
322
|
+
if checked_offsets is not None:
|
|
323
|
+
checked_offsets[offset] = entry_size
|
|
324
|
+
else:
|
|
325
|
+
self._debug(3, 'read from checked cache:', offset)
|
|
326
|
+
|
|
327
|
+
if entry_size is None:
|
|
328
|
+
break
|
|
329
|
+
|
|
330
|
+
offset += entry_size
|
|
331
|
+
|
|
332
|
+
return offset - start_offset
|
|
333
|
+
|
|
334
|
+
def _validate_local_file_entry(self, fp, offset, end_offset):
|
|
335
|
+
fp.seek(offset)
|
|
336
|
+
try:
|
|
337
|
+
fheader = self._read_local_file_header(fp)
|
|
338
|
+
except BadZipFile:
|
|
339
|
+
return None
|
|
340
|
+
|
|
341
|
+
# Create a dummy ZipInfo to utilize parsing.
|
|
342
|
+
# Flush only the required information.
|
|
343
|
+
zinfo = ZipInfo()
|
|
344
|
+
zinfo.header_offset = offset
|
|
345
|
+
zinfo.flag_bits = fheader[_FH_GENERAL_PURPOSE_FLAG_BITS]
|
|
346
|
+
zinfo.compress_size = fheader[_FH_COMPRESSED_SIZE]
|
|
347
|
+
zinfo.file_size = fheader[_FH_UNCOMPRESSED_SIZE]
|
|
348
|
+
zinfo.CRC = fheader[_FH_CRC]
|
|
349
|
+
|
|
350
|
+
filename = fp.read(fheader[_FH_FILENAME_LENGTH])
|
|
351
|
+
zinfo.extra = fp.read(fheader[_FH_EXTRA_FIELD_LENGTH])
|
|
352
|
+
pos = fp.tell()
|
|
353
|
+
|
|
354
|
+
if pos > end_offset:
|
|
355
|
+
return None
|
|
356
|
+
|
|
357
|
+
try:
|
|
358
|
+
# parse zip64
|
|
359
|
+
try:
|
|
360
|
+
zinfo._decodeExtra(crc32(filename))
|
|
361
|
+
except TypeError:
|
|
362
|
+
# polyfill for Python < 3.12
|
|
363
|
+
zinfo._decodeExtra()
|
|
364
|
+
except BadZipFile:
|
|
365
|
+
return None
|
|
366
|
+
|
|
367
|
+
dd_size = 0
|
|
368
|
+
|
|
369
|
+
if zinfo.flag_bits & _MASK_USE_DATA_DESCRIPTOR:
|
|
370
|
+
# According to the spec, these fields should be zero when data
|
|
371
|
+
# descriptor is used. Otherwise treat as a false positive on
|
|
372
|
+
# random bytes to return early, as scanning for data descriptor
|
|
373
|
+
# is rather expensive.
|
|
374
|
+
if not (zinfo.CRC == zinfo.compress_size == zinfo.file_size == 0):
|
|
375
|
+
return None
|
|
376
|
+
|
|
377
|
+
zip64 = fheader[_FH_UNCOMPRESSED_SIZE] == 0xffffffff
|
|
378
|
+
|
|
379
|
+
dd = self._scan_data_descriptor(fp, pos, end_offset, zip64)
|
|
380
|
+
if dd is None:
|
|
381
|
+
dd = self._scan_data_descriptor_no_sig_by_decompression(
|
|
382
|
+
fp, pos, end_offset, zip64, fheader[_FH_COMPRESSION_METHOD])
|
|
383
|
+
if dd is False:
|
|
384
|
+
if not self.strict_descriptor:
|
|
385
|
+
dd = self._scan_data_descriptor_no_sig(fp, pos, end_offset, zip64)
|
|
386
|
+
else:
|
|
387
|
+
dd = None
|
|
388
|
+
if dd is None:
|
|
389
|
+
return None
|
|
390
|
+
|
|
391
|
+
zinfo.CRC, zinfo.compress_size, zinfo.file_size, dd_size = dd
|
|
392
|
+
|
|
393
|
+
return (
|
|
394
|
+
sizeFileHeader +
|
|
395
|
+
fheader[_FH_FILENAME_LENGTH] + fheader[_FH_EXTRA_FIELD_LENGTH] +
|
|
396
|
+
zinfo.compress_size +
|
|
397
|
+
dd_size
|
|
398
|
+
)
|
|
399
|
+
|
|
400
|
+
def _read_local_file_header(self, fp):
|
|
401
|
+
fheader = fp.read(sizeFileHeader)
|
|
402
|
+
if len(fheader) != sizeFileHeader:
|
|
403
|
+
raise BadZipFile("Truncated file header")
|
|
404
|
+
fheader = struct.unpack(structFileHeader, fheader)
|
|
405
|
+
if fheader[_FH_SIGNATURE] != stringFileHeader:
|
|
406
|
+
raise BadZipFile("Bad magic number for file header")
|
|
407
|
+
return fheader
|
|
408
|
+
|
|
409
|
+
def _scan_data_descriptor(self, fp, offset, end_offset, zip64):
|
|
410
|
+
dd_fmt = '<LLQQ' if zip64 else '<LLLL'
|
|
411
|
+
dd_size = struct.calcsize(dd_fmt)
|
|
412
|
+
|
|
413
|
+
# scan for signature and take the first valid descriptor
|
|
414
|
+
for pos in self._iter_scan_signature(
|
|
415
|
+
fp, struct.pack('<L', _DD_SIGNATURE), offset, end_offset
|
|
416
|
+
):
|
|
417
|
+
fp.seek(pos)
|
|
418
|
+
dd = fp.read(min(dd_size, end_offset - pos))
|
|
419
|
+
try:
|
|
420
|
+
_, crc, compress_size, file_size = struct.unpack(dd_fmt, dd)
|
|
421
|
+
except struct.error:
|
|
422
|
+
continue
|
|
423
|
+
|
|
424
|
+
# @TODO: also check CRC to better guard from a false positive?
|
|
425
|
+
if pos - offset != compress_size:
|
|
426
|
+
continue
|
|
427
|
+
|
|
428
|
+
return crc, compress_size, file_size, dd_size
|
|
429
|
+
|
|
430
|
+
return None
|
|
431
|
+
|
|
432
|
+
def _scan_data_descriptor_no_sig(self, fp, offset, end_offset, zip64, chunk_size=8192):
|
|
433
|
+
dd_fmt = '<LQQ' if zip64 else '<LLL'
|
|
434
|
+
dd_size = struct.calcsize(dd_fmt)
|
|
435
|
+
|
|
436
|
+
pos = offset
|
|
437
|
+
remainder = b''
|
|
438
|
+
|
|
439
|
+
fp.seek(offset)
|
|
440
|
+
while pos < end_offset:
|
|
441
|
+
chunk = remainder + fp.read(min(chunk_size, end_offset - pos))
|
|
442
|
+
|
|
443
|
+
delta = pos - len(remainder) - offset
|
|
444
|
+
mv = memoryview(chunk)
|
|
445
|
+
for i in range(len(chunk) - dd_size + 1):
|
|
446
|
+
dd = mv[i:i + dd_size]
|
|
447
|
+
try:
|
|
448
|
+
crc, compress_size, file_size = struct.unpack(dd_fmt, dd)
|
|
449
|
+
except struct.error:
|
|
450
|
+
continue
|
|
451
|
+
if delta + i != compress_size:
|
|
452
|
+
continue
|
|
453
|
+
|
|
454
|
+
return crc, compress_size, file_size, dd_size
|
|
455
|
+
|
|
456
|
+
remainder = chunk[-(dd_size - 1):]
|
|
457
|
+
pos += chunk_size
|
|
458
|
+
|
|
459
|
+
return None
|
|
460
|
+
|
|
461
|
+
def _scan_data_descriptor_no_sig_by_decompression(self, fp, offset, end_offset, zip64, method):
|
|
462
|
+
try:
|
|
463
|
+
decompressor = _get_decompressor(method)
|
|
464
|
+
except RuntimeError:
|
|
465
|
+
return False
|
|
466
|
+
|
|
467
|
+
if decompressor is None:
|
|
468
|
+
return False
|
|
469
|
+
|
|
470
|
+
# Current LZMADecompressor is unreliable since it's `.eof` is usually
|
|
471
|
+
# not set as expected.
|
|
472
|
+
if isinstance(decompressor, LZMADecompressor):
|
|
473
|
+
return False
|
|
474
|
+
|
|
475
|
+
dd_fmt = '<LQQ' if zip64 else '<LLL'
|
|
476
|
+
dd_size = struct.calcsize(dd_fmt)
|
|
477
|
+
|
|
478
|
+
if end_offset - dd_size < offset:
|
|
479
|
+
return None
|
|
480
|
+
|
|
481
|
+
try:
|
|
482
|
+
pos = self._trace_compressed_block_end(fp, offset, end_offset - dd_size, decompressor)
|
|
483
|
+
except Exception:
|
|
484
|
+
return None
|
|
485
|
+
|
|
486
|
+
fp.seek(pos)
|
|
487
|
+
dd = fp.read(dd_size)
|
|
488
|
+
try:
|
|
489
|
+
crc, compress_size, file_size = struct.unpack(dd_fmt, dd)
|
|
490
|
+
except struct.error:
|
|
491
|
+
return None
|
|
492
|
+
if pos - offset != compress_size:
|
|
493
|
+
return None
|
|
494
|
+
|
|
495
|
+
return crc, compress_size, file_size, dd_size
|
|
496
|
+
|
|
497
|
+
def _trace_compressed_block_end(self, fp, offset, end_offset, decompressor, chunk_size=4096):
|
|
498
|
+
fp.seek(offset)
|
|
499
|
+
read_size = 0
|
|
500
|
+
while True:
|
|
501
|
+
chunk = fp.read(min(chunk_size, end_offset - offset - read_size))
|
|
502
|
+
if not chunk:
|
|
503
|
+
raise EOFError('Unexpected EOF while decompressing')
|
|
504
|
+
|
|
505
|
+
# may raise on error
|
|
506
|
+
decompressor.decompress(chunk)
|
|
507
|
+
|
|
508
|
+
read_size += len(chunk)
|
|
509
|
+
|
|
510
|
+
if decompressor.eof:
|
|
511
|
+
unused_len = len(decompressor.unused_data)
|
|
512
|
+
return offset + read_size - unused_len
|
|
513
|
+
|
|
514
|
+
def _calc_local_file_entry_size(self, fp, zinfo):
|
|
515
|
+
fp.seek(zinfo.header_offset)
|
|
516
|
+
fheader = self._read_local_file_header(fp)
|
|
517
|
+
|
|
518
|
+
if zinfo.flag_bits & _MASK_USE_DATA_DESCRIPTOR:
|
|
519
|
+
zip64 = fheader[_FH_UNCOMPRESSED_SIZE] == 0xffffffff
|
|
520
|
+
dd_fmt = '<LLQQ' if zip64 else '<LLLL'
|
|
521
|
+
fp.seek(
|
|
522
|
+
fheader[_FH_FILENAME_LENGTH] + fheader[_FH_EXTRA_FIELD_LENGTH] +
|
|
523
|
+
zinfo.compress_size,
|
|
524
|
+
os.SEEK_CUR,
|
|
525
|
+
)
|
|
526
|
+
if fp.read(struct.calcsize('<L')) != struct.pack('<L', _DD_SIGNATURE):
|
|
527
|
+
dd_fmt = '<LQQ' if zip64 else '<LLL'
|
|
528
|
+
dd_size = struct.calcsize(dd_fmt)
|
|
529
|
+
else:
|
|
530
|
+
dd_size = 0
|
|
531
|
+
|
|
532
|
+
return (
|
|
533
|
+
sizeFileHeader,
|
|
534
|
+
fheader[_FH_FILENAME_LENGTH],
|
|
535
|
+
fheader[_FH_EXTRA_FIELD_LENGTH],
|
|
536
|
+
zinfo.compress_size,
|
|
537
|
+
dd_size,
|
|
538
|
+
)
|
|
539
|
+
|
|
540
|
+
def _copy_bytes(self, fp, old_offset, new_offset, size):
|
|
541
|
+
read_size = 0
|
|
542
|
+
while read_size < size:
|
|
543
|
+
fp.seek(old_offset + read_size)
|
|
544
|
+
data = fp.read(min(size - read_size, self.chunk_size))
|
|
545
|
+
fp.seek(new_offset + read_size)
|
|
546
|
+
fp.write(data)
|
|
547
|
+
fp.flush()
|
|
548
|
+
read_size += len(data)
|
|
549
|
+
|
|
550
|
+
|
|
551
|
+
class ZipFile(ZipFile):
    """`zipfile.ZipFile` extended with `copy`, `remove` and `repack`."""

    def copy(self, zinfo_or_arcname, filename, *, chunk_size=2**20):
        """Copy a member in the archive.

        *zinfo_or_arcname* is the member to copy, given as a `ZipInfo` of
        this archive or as an archive name, and *filename* is the name of
        the new member. *chunk_size* is the buffer size used when copying
        the entry data.

        Returns the `ZipInfo` of the source member.

        Raises `ValueError` on a wrong mode, a closed archive or an open
        writing handle, `io.UnsupportedOperation` on an unseekable stream,
        and `KeyError` when the member does not exist.
        """
        writable_modes = ('w', 'x', 'a')
        if self.mode not in writable_modes:
            raise ValueError("copy() requires mode 'w', 'x', or 'a'")
        if not self.fp:
            raise ValueError(
                "Attempt to write to ZIP archive that was already closed")
        if self._writing:
            raise ValueError(
                "Can't write to ZIP archive while an open writing handle exists."
            )
        if not self._seekable:
            raise io.UnsupportedOperation("copy() requires a seekable stream.")

        with self._lock:
            # Resolve the source member; `getinfo` raises KeyError for an
            # unknown archive name.
            if not isinstance(zinfo_or_arcname, ZipInfo):
                source = self.getinfo(zinfo_or_arcname)
            elif zinfo_or_arcname in self.filelist:
                source = zinfo_or_arcname
            else:
                raise KeyError(
                    f'There is no item {zinfo_or_arcname!r} in the archive')

            # Mark the archive as being written for the duration of the copy.
            self._writing = True
            try:
                repacker = _ZipRepacker(chunk_size=chunk_size)
                repacker.copy(self, source, filename)
            finally:
                self._writing = False

            return source

    def remove(self, zinfo_or_arcname):
        """Remove a member from the archive.

        *zinfo_or_arcname* is a `ZipInfo` of this archive or an archive
        name. Only the listing of the member is dropped; its local file
        entry stays in the file.

        Returns the removed `ZipInfo`.

        Raises `ValueError` on a wrong mode, a closed archive or an open
        writing handle, and `KeyError` when the member does not exist.
        """
        writable_modes = ('w', 'x', 'a')
        if self.mode not in writable_modes:
            raise ValueError("remove() requires mode 'w', 'x', or 'a'")
        if not self.fp:
            raise ValueError(
                "Attempt to write to ZIP archive that was already closed")
        if self._writing:
            raise ValueError(
                "Can't write to ZIP archive while an open writing handle exists."
            )

        with self._lock:
            # Resolve the member; `getinfo` raises KeyError for an unknown
            # archive name.
            if not isinstance(zinfo_or_arcname, ZipInfo):
                target = self.getinfo(zinfo_or_arcname)
            elif zinfo_or_arcname in self.filelist:
                target = zinfo_or_arcname
            else:
                raise KeyError(
                    f'There is no item {zinfo_or_arcname!r} in the archive')

            self.filelist.remove(target)
            self.NameToInfo.pop(target.filename, None)

            # If another member carries the same name, map the name to the
            # last such member (NameToInfo normally stores the last added
            # one) so that name lookups, e.g. by `testzip()`, keep working.
            survivor = next(
                (zi for zi in reversed(self.filelist)
                 if zi.filename == target.filename),
                None,
            )
            if survivor is not None:
                self.NameToInfo.setdefault(survivor.filename, survivor)

            self._didModify = True

            return target

    def repack(self, removed=None, **opts):
        """Repack a zip file, removing non-referenced file entries.

        *removed* is an optional sequence of `ZipInfo` instances passed on
        to the repacker; *opts* are keyword options for the repacker.

        The archive must be opened with mode 'a', as mode 'w'/'x' do not
        truncate the file when closed. This cannot simply be changed as
        they may be used on an unseekable file buffer, which disallows
        truncation.

        Raises `ValueError` on a wrong mode, a closed archive or an open
        writing handle.
        """
        if self.mode != 'a':
            raise ValueError("repack() requires mode 'a'")
        if not self.fp:
            raise ValueError(
                "Attempt to write to ZIP archive that was already closed")
        if self._writing:
            raise ValueError(
                "Can't write to ZIP archive while an open writing handle exists"
            )

        with self._lock:
            # Mark the archive as being written for the duration of the work.
            self._writing = True
            try:
                repacker = _ZipRepacker(**opts)
                repacker.repack(self, removed)
            finally:
                self._writing = False
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: zipremove
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Extend `zipfile` with `remove`-related functionalities
|
|
5
|
+
Home-page: https://github.com/danny0838/zipremove
|
|
6
|
+
Author: Danny Lin
|
|
7
|
+
Author-email: danny0838@gmail.com
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: Development Status :: 4 - Beta
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Topic :: System :: Archiving :: Compression
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
19
|
+
Classifier: Operating System :: OS Independent
|
|
20
|
+
Requires-Python: ~=3.9
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE.txt
|
|
23
|
+
Provides-Extra: dev
|
|
24
|
+
Requires-Dist: tox>=4.0; extra == "dev"
|
|
25
|
+
Requires-Dist: build; extra == "dev"
|
|
26
|
+
Requires-Dist: twine>=4.0; extra == "dev"
|
|
27
|
+
Requires-Dist: flake8>=5.0; extra == "dev"
|
|
28
|
+
Requires-Dist: flake8-comprehensions>=3.12; extra == "dev"
|
|
29
|
+
Requires-Dist: flake8-bugbear>=22.0; extra == "dev"
|
|
30
|
+
Requires-Dist: flake8-isort>=6.0; extra == "dev"
|
|
31
|
+
Requires-Dist: isort>=5.5; extra == "dev"
|
|
32
|
+
Dynamic: license-file
|
|
33
|
+
|
|
34
|
+
This package extends `zipfile` with `remove`-related functionalities.
|
|
35
|
+
|
|
36
|
+
## API
|
|
37
|
+
|
|
38
|
+
* `ZipFile.remove(zinfo_or_arcname)`
|
|
39
|
+
|
|
40
|
+
Removes a member from the archive. *zinfo_or_arcname* may be the full path
|
|
41
|
+
of the member or a `ZipInfo` instance.
|
|
42
|
+
|
|
43
|
+
If multiple members share the same full path, only one is removed when
|
|
44
|
+
a path is provided.
|
|
45
|
+
|
|
46
|
+
This does not physically remove the local file entry from the archive;
|
|
47
|
+
the ZIP file size remains unchanged. Call `ZipFile.repack` afterwards
|
|
48
|
+
to reclaim space.
|
|
49
|
+
|
|
50
|
+
The archive must be opened with mode ``'w'``, ``'x'`` or ``'a'``.
|
|
51
|
+
|
|
52
|
+
Returns the removed `ZipInfo` instance.
|
|
53
|
+
|
|
54
|
+
Calling `remove` on a closed ZipFile will raise a `ValueError`.
|
|
55
|
+
|
|
56
|
+
* `ZipFile.repack(removed=None, *, strict_descriptor=False[, chunk_size])`
|
|
57
|
+
|
|
58
|
+
Rewrites the archive to remove stale local file entries, shrinking the ZIP
|
|
59
|
+
file size.
|
|
60
|
+
|
|
61
|
+
If *removed* is provided, it must be a sequence of `ZipInfo` objects
|
|
62
|
+
representing removed entries; only their corresponding local file entries
|
|
63
|
+
will be removed.
|
|
64
|
+
|
|
65
|
+
If *removed* is not provided, local file entries no longer referenced in the
|
|
66
|
+
central directory will be removed. The algorithm assumes that local file
|
|
67
|
+
entries are stored consecutively:
|
|
68
|
+
|
|
69
|
+
1. Data before the first referenced entry is removed only when it appears to
|
|
70
|
+
be a sequence of consecutive entries with no extra following bytes; extra
|
|
71
|
+
preceding bytes are preserved.
|
|
72
|
+
2. Data between referenced entries is removed only when it appears to
|
|
73
|
+
be a sequence of consecutive entries with no extra preceding bytes; extra
|
|
74
|
+
following bytes are preserved.
|
|
75
|
+
|
|
76
|
+
``strict_descriptor=True`` can be provided to skip the slower scan for an
|
|
77
|
+
unsigned data descriptor (deprecated in the latest ZIP specification and is
|
|
78
|
+
only used by legacy tools) when checking for bytes resembling a valid local
|
|
79
|
+
file entry. This improves performance, but may cause some stale local file
|
|
80
|
+
entries to be preserved, as any entry using an unsigned descriptor cannot
|
|
81
|
+
be detected.
|
|
82
|
+
|
|
83
|
+
*chunk_size* may be specified to control the buffer size when moving
|
|
84
|
+
entry data (default is 1 MiB).
|
|
85
|
+
|
|
86
|
+
The archive must be opened with mode ``'a'``.
|
|
87
|
+
|
|
88
|
+
Calling `repack` on a closed ZipFile will raise a `ValueError`.
|
|
89
|
+
|
|
90
|
+
* `ZipFile.copy(zinfo_or_arcname, new_arcname, *[, chunk_size])`
|
|
91
|
+
|
|
92
|
+
Copies a member *zinfo_or_arcname* to *new_arcname* in the archive.
|
|
93
|
+
*zinfo_or_arcname* may be the full path of the member or a `ZipInfo`
|
|
94
|
+
instance.
|
|
95
|
+
|
|
96
|
+
*chunk_size* may be specified to control the buffer size when copying
|
|
97
|
+
entry data (default is 1 MiB).
|
|
98
|
+
|
|
99
|
+
The archive must be opened with mode ``'w'``, ``'x'`` or ``'a'``, and the
|
|
100
|
+
underlying stream must be seekable.
|
|
101
|
+
|
|
102
|
+
Returns the original version of the copied `ZipInfo` instance.
|
|
103
|
+
|
|
104
|
+
Calling `copy` on a closed ZipFile will raise a `ValueError`.
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
## Examples
|
|
108
|
+
|
|
109
|
+
### Remove files and reclaim space
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
import os
|
|
113
|
+
import zipremove as zipfile
|
|
114
|
+
|
|
115
|
+
with zipfile.ZipFile('archive.zip', 'w') as zh:
|
|
116
|
+
zh.writestr('file1', 'content1')
|
|
117
|
+
zh.writestr('file2', 'content2')
|
|
118
|
+
zh.writestr('file3', 'content3')
|
|
119
|
+
zh.writestr('file4', 'content4')
|
|
120
|
+
|
|
121
|
+
print(os.path.getsize('archive.zip')) # 398
|
|
122
|
+
|
|
123
|
+
with zipfile.ZipFile('archive.zip', 'a') as zh:
|
|
124
|
+
zh.remove('file1')
|
|
125
|
+
zh.remove('file2')
|
|
126
|
+
zh.remove('file3')
|
|
127
|
+
zh.repack()
|
|
128
|
+
|
|
129
|
+
print(os.path.getsize('archive.zip')) # 116
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
### Remove files under a directory and reclaim space
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
import os
|
|
136
|
+
import zipremove as zipfile
|
|
137
|
+
|
|
138
|
+
with zipfile.ZipFile('archive.zip', 'w') as zh:
|
|
139
|
+
zh.writestr('file0', 'content0')
|
|
140
|
+
zh.writestr('folder/file1', 'content1')
|
|
141
|
+
zh.writestr('folder/file2', 'content2')
|
|
142
|
+
zh.writestr('folder/file3', 'content3')
|
|
143
|
+
|
|
144
|
+
print(os.path.getsize('archive.zip')) # 440
|
|
145
|
+
|
|
146
|
+
with zipfile.ZipFile('archive.zip', 'a') as zh:
|
|
147
|
+
zinfos = [zh.remove(n) for n in zh.namelist() if n.startswith('folder/')]
|
|
148
|
+
zh.repack(zinfos)
|
|
149
|
+
|
|
150
|
+
print(os.path.getsize('archive.zip')) # 116
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
### Rename files under a directory and reclaim space
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
import os
|
|
157
|
+
import zipremove as zipfile
|
|
158
|
+
|
|
159
|
+
with zipfile.ZipFile('archive.zip', 'w') as zh:
|
|
160
|
+
zh.writestr('file0', 'content0')
|
|
161
|
+
zh.writestr('folder1/file1', 'content1')
|
|
162
|
+
zh.writestr('folder1/file2', 'content2')
|
|
163
|
+
zh.writestr('folder1/file3', 'content3')
|
|
164
|
+
|
|
165
|
+
print(os.path.getsize('archive.zip')) # 446
|
|
166
|
+
|
|
167
|
+
with zipfile.ZipFile('archive.zip', 'a') as zh:
|
|
168
|
+
for n in zh.namelist():
|
|
169
|
+
if n.startswith('folder1/'):
|
|
170
|
+
n2 = 'folder2/' + n[len('folder1/'):]
|
|
171
|
+
zh.copy(n, n2)
|
|
172
|
+
zh.remove(n)
|
|
173
|
+
zh.repack()
|
|
174
|
+
|
|
175
|
+
print(os.path.getsize('archive.zip')) # 446
|
|
176
|
+
```
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
zipremove/__init__.py,sha256=7Zsq41HokO_OXm8hTOJzLfSQzYmoF9CYgf3855YVq4Q,23190
|
|
2
|
+
zipremove-0.1.0.dist-info/licenses/LICENSE.txt,sha256=DPhKIVISoyY27Og_OjvjwUeaoLmQkV0A5ZooG-0qyU8,1087
|
|
3
|
+
zipremove-0.1.0.dist-info/METADATA,sha256=SchVYUvN_ZzIpw9lV6HgbKvwTTTCBEad8WGrc_Vz7V8,6012
|
|
4
|
+
zipremove-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
5
|
+
zipremove-0.1.0.dist-info/top_level.txt,sha256=o5uNDGXkYnXQwCAqW36Y39ETXOC54gNiCohsM8Uzk80,10
|
|
6
|
+
zipremove-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Danny Lin
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
zipremove
|