zipinspect 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
zipinspect/__init__.py ADDED
@@ -0,0 +1,233 @@
1
+ import asyncio
2
+ import os.path
3
+ import sys
4
+ import textwrap
5
+ import time
6
+
7
+ from tabulate import tabulate
8
+ from progress.bar import Bar
9
+ from aioconsole import ainput
10
+
11
+ from .zipread import HTTPZipReader
12
+
13
+ from .utils.asyncio import TaskPool
14
+ from .utils.misc import PaginatedCollection
15
+
16
+
17
def dostime_to_rfc3339(t):
    """Format a ``(year, month, day, hour, minute, second)`` tuple.

    Returns the timestamp as ``YYYY-MM-DDTHH:MM:SS`` (no timezone suffix).
    """
    # time.strftime() wants a 9-field time tuple, so the weekday, day-of-year
    # and DST fields are filled in with placeholders (0, 0, "unknown").
    padded = t + (0, 0, -1)
    return time.strftime("%Y-%m-%dT%H:%M:%S", padded)
19
+
20
def numfmt_iec(n):
    """Format a byte count with a single-letter binary (1024-based) unit.

    The value is divided by 1024 until it drops below 1024 and is then
    rendered with one decimal, e.g. ``1536 -> '1.5K'``.

    Previously the function fell off the end of the unit list and returned
    ``None`` for anything of 1 TiB or more; larger units are now available
    and the last unit is used as a catch-all, so a string is always returned.
    """
    units = 'BKMGTPEZ'

    for u in units[:-1]:
        if abs(n) < 1024.0:
            return f'{n:3.1f}{u}'
        n /= 1024.0

    # Whatever is left is expressed in the largest known unit.
    return f'{n:3.1f}{units[-1]}'
25
+
26
def sanitized_open(path, *args, **kwargs):
    """Open *path* only if it lies inside the current working directory.

    Missing parent directories are created.  Extra positional and keyword
    arguments are forwarded to :func:`open`.

    Returns the opened file object, or ``None`` (after printing a warning)
    when the path would escape the working directory.
    """
    path = os.path.abspath(path)
    cwd = os.getcwd()

    try:
        inside_cwd = os.path.commonpath((path, cwd)) == cwd
    except ValueError:
        # commonpath() refuses paths without a common root (for instance two
        # different drives on Windows); such a path is certainly outside.
        inside_cwd = False

    if not inside_cwd:
        print(f"WARNING: Path {path} is dangerous; ignoring")
        return None

    os.makedirs(os.path.dirname(path), exist_ok=True)
    return open(path, *args, **kwargs)
34
+
35
def parse_repl_args(line):
    """Space delimited argument parser like the shell, but minimal.

    Arguments are separated by one or more spaces.  A double-quoted section
    may contain spaces; inside quotes a backslash escapes ``"`` and ``\\``.
    Quoted and unquoted text that touch each other form a single argument.

    Returns the list of parsed arguments.  When an invalid escape sequence
    is found an error is printed to stderr and an empty list is returned, so
    the caller does not act on a half-parsed command.
    """
    parsed = []
    accum = ''
    escape = False
    quote = False

    for i, token in enumerate(line):
        if quote:
            if escape:
                # The escape applies to exactly one character.
                escape = False
                if token not in '\\"':
                    print(f"ERROR: Cannot escape {token} at column {i}",
                          file=sys.stderr)
                    return []
                accum += token
            elif token == '\\':
                escape = True
            elif token == '"':
                quote = False  # Quote end
            else:
                accum += token
        elif token == ' ':
            # A run of spaces is a single separator; never store the space.
            if accum:
                parsed.append(accum)
                accum = ''  # Reset
        elif token == '"':
            quote = True  # Quote start
        else:
            accum += token

    # Push the last argument
    if accum:
        parsed.append(accum)

    return parsed
72
+
73
async def extract_entries(z, entries, *, out_dir=None, concurrency=10):
    """Extract *entries* from the archive reader *z*, showing a progress bar.

    A directory entry selects every file of ``z.entries`` whose path starts
    with the directory's path.  Each file is written to ``out_dir/<path>``
    (or ``<path>`` when *out_dir* is not given) through ``sanitized_open``,
    which skips paths it considers unsafe.  At most *concurrency*
    extractions run at the same time.
    """
    async def extract_entry(entry, *, progress_cb):
        final_path = f'{out_dir}/{entry.path}' if out_dir else entry.path

        if not (output := sanitized_open(final_path, 'wb')):
            return

        # NOTE Multiple running async-for loops (i.e. async generators) are cumbersome,
        # and thus to make matters manageable we use the good 'ol callbacks instead.
        with output:
            async for processed in z.extract(entry, output):
                progress_cb(processed)

    # Keyed by path so that a file selected more than once (listed twice, or
    # reachable through several selected directories) is extracted a single
    # time instead of being written by two tasks concurrently.
    selected = {}
    for info in entries:
        if info.is_dir:
            # Recursing into directories
            for nested_info in z.entries:
                if (not nested_info.is_dir and
                        nested_info.path.startswith(info.path)):
                    selected.setdefault(nested_info.path, nested_info)
        else:  # Single files
            selected.setdefault(info.path, info)

    total_tx = sum(info.file_size for info in selected.values())
    bar = Bar(max=total_tx, width=80, suffix='%(percent)d%%')

    try:
        async with TaskPool(maxsize=concurrency) as pool:
            for info in selected.values():
                pool.create_task(extract_entry(info, progress_cb=bar.next))
    finally:
        # The tasks only run while the pool is being exited, so the bar must
        # be finished after the ``async with`` block, not inside it.
        bar.finish()
120
+
121
+
122
def print_entries(pages):
    """Print the current page of *pages* as a table, followed by a page counter.

    Each row shows the entry's overall index, its path, its size (``N/A``
    for directories) and its modification timestamp.
    """
    rows = []
    for number, info in enumerate(pages.current(), start=pages.current_offset):
        if info.is_dir:
            size = 'N/A'
        else:
            size = numfmt_iec(info.file_size)
        timestamp = dostime_to_rfc3339(info.modified_date)
        rows.append((number, info.path, size, timestamp))

    print(tabulate(rows, headers=['#', 'entry', 'size', 'modified date']))
    print(f"(Page {pages.current_page + 1}/{pages.n_pages})")
136
+
137
def int_safe(v, *args, **kwargs):
    """Convert *v* with :func:`int`, reporting failures instead of raising.

    Extra arguments are passed on to :func:`int`.  Returns the integer, or
    ``None`` after printing an error to stderr when *v* is not a valid
    integer literal.
    """
    try:
        return int(v, *args, **kwargs)
    except ValueError:
        pass

    # A stray ellipsis gets a dedicated message.
    if v == '...':
        message = "ERROR: Ellipsis (...) is used in a wrong way"
    else:
        message = f"ERROR: {v!r} is not an integer value"
    print(message, file=sys.stderr)

    return None
149
+
150
+ async def app(url):
151
+ async with HTTPZipReader(url) as z:
152
+ pages = PaginatedCollection(z.entries)
153
+
154
+ while True:
155
+ try:
156
+ args = parse_repl_args(await ainput("> "))
157
+ except EOFError:
158
+ break
159
+
160
+ # Skip empty prompts.
161
+ if len(args) < 1:
162
+ continue
163
+
164
+ match args[0]:
165
+ case 'help':
166
+ print(textwrap.dedent("""\
167
+ This is the REPL, and the following commands are available.
168
+
169
+ list List entries in the current page
170
+ prev Go backward one page and show entries
171
+ next Go forward one page and show entries
172
+ extract <index> [dir] Extract entry with index <index>
173
+ extract <start>,...,<end> [dir] Extract entries from <start> to <end>
174
+ extract <i0>,<i1>,...<in> [dir] Extract entries with specified indices
175
+
176
+ NOTE: The extract command accepts an optinal path to the directory to extract into.
177
+ If not provided, it extracts into the current working directory"""))
178
+ case 'list':
179
+ print_entries(pages)
180
+ case 'prev':
181
+ pages.previous()
182
+ print_entries(pages)
183
+ case 'next':
184
+ pages.next()
185
+ print_entries(pages)
186
+ case 'extract':
187
+ if len(args) < 2:
188
+ print("ERROR: Nothing to extract, forgot an argument?", file=sys.stderr)
189
+ continue
190
+
191
+ indices = args[1].split(',')
192
+ out_dir = None
193
+
194
+ if len(args) > 2:
195
+ out_dir = args[2]
196
+
197
+ if len(indices) == 1:
198
+ start = int_safe(indices[0])
199
+ if start is None:
200
+ continue
201
+
202
+ if not 0 <= start < len(z.entries):
203
+ print(f"ERROR: Index {start} is out of bounds", file=sys.stderr)
204
+ continue
205
+
206
+ await extract_entries(z, (z.entries[start],), out_dir=out_dir)
207
+ else:
208
+ if len(indices) == 3 and indices[1] == '...':
209
+ start, end = int_safe(indices[0]), int_safe(indices[2])
210
+
211
+ if start is None or end is None:
212
+ continue
213
+ if not(0 <= start < end < len(z.entries)):
214
+ print(f"ERROR: Range {start},...,{end} is out of bounds", file=sys.stderr)
215
+ continue
216
+
217
+ await extract_entries(z, z.entries[start:end], out_dir=out_dir)
218
+ else:
219
+ # Filter out invalid and out-of-bounds indices
220
+ entries = [z.entries[iv] for s in indices
221
+ if (iv := int_safe(s)) is not None and 0 <= iv < len(z.entries)]
222
+ await extract_entries(z, entries, out_dir=out_dir)
223
+
224
+ # FIXME For some reason, Bar.finish() doesn't end with a newline.
225
+ sys.stdout.write('\n\n')
226
+ case wrong_cmd:
227
+ print(f"ERROR: Not a valid command {wrong_cmd}; try again.", file=sys.stderr)
228
+
229
def main():
    """Command-line entry point: ``zipinspect <url>``.

    Exits with status 1 when the URL argument is missing; previously the
    message was printed and the function went on to fail with an
    ``IndexError`` on ``sys.argv[1]``.
    """
    if len(sys.argv) < 2:
        print("Forgot thy URL?", file=sys.stderr)
        sys.exit(1)

    asyncio.run(app(sys.argv[1]), debug=True)
zipinspect/__main__.py ADDED
@@ -0,0 +1,4 @@
1
+ from . import main
2
+
3
# Start the command-line interface only when this file is run as the main
# module, not when it is merely imported.
if __name__ == "__main__":
    main()
File without changes
@@ -0,0 +1,19 @@
1
+ from asyncio import TaskGroup, Semaphore
2
+
3
+ class TaskPool(TaskGroup):
4
+ def __init__(self, *, maxsize):
5
+ self._semaphore = Semaphore(maxsize)
6
+ super().__init__()
7
+
8
+ def create_task(self, coro, **kwargs):
9
+ async def wrapper_coro():
10
+ await self._semaphore.acquire()
11
+ try:
12
+ result = await coro
13
+ finally:
14
+ self._semaphore.release()
15
+
16
+ return result
17
+
18
+ return super().create_task(wrapper_coro(), **kwargs)
19
+
@@ -0,0 +1,34 @@
1
class PaginatedCollection:
    """Fixed-size pages over a sequence, with a wrap-around cursor.

    ``sequence`` must support ``len()`` and slicing.  There is always at
    least one page, which is empty for an empty sequence.
    """

    def __init__(self, sequence, *, page_size = 25):
        if page_size < 1:
            raise ValueError(f"Page size must be positive, got {page_size}")

        self.sequence = sequence
        self.current_page = 0
        # Ceiling division: a length that is an exact multiple of page_size
        # must not produce a trailing empty page.
        self.n_pages = max(1, -(-len(sequence) // page_size))
        self.page_size = page_size

    def previous(self):
        """Move to the previous page, wrapping from the first to the last."""
        if self.current_page == 0:
            self.current_page = self.n_pages - 1
        else:
            self.current_page -= 1

    def next(self):
        """Move to the next page, wrapping from the last to the first."""
        if self.current_page == self.n_pages - 1:
            self.current_page = 0
        else:
            self.current_page += 1

    @property
    def current_offset(self):
        """Index within the sequence of the current page's first item."""
        return self.current_page * self.page_size

    def current(self):
        """Return the items of the current page."""
        return self.index(self.current_page)

    def index(self, page: int):
        """Return the items of page number *page* (zero-based).

        Raises ValueError when *page* is not in ``range(n_pages)``.
        """
        if not 0 <= page < self.n_pages:
            raise ValueError(f"Page index {page} out of bounds")

        begin = page * self.page_size
        end = begin + self.page_size
        return self.sequence[begin:end]
@@ -0,0 +1,293 @@
1
+ import zlib
2
+ import bz2
3
+ import lzma
4
+ import compression.zstd
5
+
6
+ import httpx
7
+
8
+ from enum import Enum
9
+ from struct import Struct
10
+ from dataclasses import dataclass
11
+
12
+ from .stubs import (
13
+ _LFHStub,
14
+ _CDFHStub,
15
+ _EOCDStub,
16
+ _EOCD64Stub
17
+ )
18
+
19
+ _LFHStruct = Struct('<4sHHHHHIIIHH')
20
+ _CDFHStruct = Struct('<4sHHHHHHIIIHHHHHII')
21
+ _EOCDStruct = Struct('<4sHHHHII')
22
+ _EOCD64Struct = Struct('<4sQHHIIQQQQ')
23
+ _EOCD64LocatorStruct = Struct('<4sIQI')
24
+
25
class ZipCompression(Enum):
    """Compression methods this package recognises.

    Each member's value is the numeric compression mode read from an
    entry's header.
    """

    NONE = 0  # stored as-is
    DEFLATE = 8
    BZIP2 = 12
    LZMA = 14
    ZSTANDARD = 93
31
+
32
@dataclass
class ZipEntryInfo:
    """Description of one archive member, built from its central directory record."""

    path: str  # decoded member name; directories end with '/'
    raw_offset: int  # position of the member's local file header in the archive
    file_size: int  # uncompressed size in bytes
    encrypted: int  # truthy when bit 0 of the general purpose flag is set
    checksum: int
    compression: ZipCompression
    compressed_size: int  # number of bytes stored in the archive
    modified_date: tuple  # (year, month, day, hour, minute, second)
    internal_attrs: int
    external_attrs: int

    @property
    def is_dir(self):
        """Whether the entry is a directory, i.e. its path ends with a slash."""
        return self.path.endswith('/')
48
+
49
class ZipError(Exception):
    """Raised when the archive is malformed or uses an unsupported feature."""
51
+
52
class HTTPError(Exception):
    """Raised when the server's response cannot be used to read the archive."""
54
+
55
+
56
+ class HTTPZipReader:
57
+ def __init__(self, url: str, *, httpx_args=None):
58
+ httpx_args = httpx_args or {}
59
+
60
+ self.url = url
61
+ self.entries = None
62
+ self.size = 0
63
+ self.client = httpx.AsyncClient(http2=True, **httpx_args)
64
+
65
+ async def _request(self, start, end=None, *, stream=False, httpx_args=None):
66
+ if httpx_args is None:
67
+ httpx_args = {}
68
+ if start < 0:
69
+ raise ValueError(f"Range can't beginning with {start}; clamping.")
70
+ if end is None:
71
+ end = self.size
72
+ if start >= end:
73
+ raise ValueError(f"Invalid range {start}-{end}")
74
+
75
+ httpx_args = httpx_args or {}
76
+ headers = httpx_args.setdefault('headers', {})
77
+ headers['Range'] = f'bytes={int(start)}-{int(end) - 1}'
78
+
79
+ request = self.client.build_request('GET', self.url, **httpx_args)
80
+ r = await self.client.send(request, stream=stream)
81
+ if r.status_code != 206:
82
+ raise HTTPError(f"Got status code {r.status_code} for {self.url}")
83
+
84
+ return r
85
+
86
+ async def _parse_eocd(self):
87
+ r = await self._request(max(0, self.size - 65557))
88
+ start_offset = r.content.rfind(b'\x50\x4B\x05\x06')
89
+
90
+ if start_offset == -1:
91
+ raise ZipError(f"EOCD Signature not found")
92
+
93
+ stub = _EOCDStub._make(_EOCDStruct.unpack(r.content[start_offset:start_offset + 20]))
94
+ if (stub.disk != stub.begin_disk or
95
+ stub.ents_on_disk != stub.ents_total):
96
+ raise ZipError("Multipart Zip files aren't supported")
97
+
98
+ return stub, (self.size - len(r.content) + start_offset)
99
+
100
+ async def _parse_eocd64(self, offset):
101
+ r = await self._request(offset, offset + 56)
102
+ stub = _EOCD64Stub._make(_EOCDStruct.unpack(r.content))
103
+
104
+ if stub.signature != b'\x50\x4B\x06\x06':
105
+ raise ZipError(f"Invalid EOCD signature: {stub.signature.hex()}")
106
+
107
+ if (stub.disk != stub.begin_disk or
108
+ stub.ents_on_disk != stub.ents_total):
109
+ raise ZipError("Multipart Zip files are not supported")
110
+
111
+ async def _parse_eocd64_locator(self, eocd_start):
112
+ r = await self._request(eocd_start - 20, eocd_start)
113
+
114
+ signature, disk, offset, n_disks = _EOCD64LocatorStruct.unpack(r.content)
115
+ if signature != b'\x50\x4B\x06\x07':
116
+ raise ZipError(f"Invalid EOCD64 signature: {signature.hex()}")
117
+ if disk != 0 or n_disks == 1:
118
+ raise ZipError("Multipart Zip files aren't supported")
119
+
120
+ return offset
121
+
122
+ @staticmethod
123
+ def _detect_zip64_from_eocd(stub: _EOCDStub):
124
+ if (stub.disk == 0xFFFF and stub.ents_total == 0xFFF and
125
+ stub.cd_size == 0xFFFFFFF and stub.cd_offset == 0xFFFFFFFF):
126
+ return True
127
+ else:
128
+ return False
129
+
130
+ @staticmethod
131
+ def _parse_extras(extras):
132
+ offset = 0
133
+ size = len(extras)
134
+
135
+ while offset < size:
136
+ eid = extras[offset:offset + 2]
137
+ size = int.from_bytes(extras[offset + 2:offset + 4], byteorder='little')
138
+ data = extras[offset + 4:offset + size]
139
+
140
+ yield eid, data
141
+
142
+ @staticmethod
143
+ def _parse_zip64_extra(data):
144
+ size = len(data)
145
+ uncompressed, compressed, offset = None, None, None
146
+
147
+ if size >= 8:
148
+ uncompressed = int.from_bytes(data[:8], byteorder='little')
149
+ if size >= 16:
150
+ compressed = int.from_bytes(data[8:16], byteorder='little')
151
+ if size >= 32:
152
+ offset = int.from_bytes(data[16:24], byteorder='little')
153
+
154
+ return uncompressed, compressed, offset
155
+
156
+ async def _parse_cd_ents(self, offset, size):
157
+ r = await self._request(offset, offset + size)
158
+ cd = r.content
159
+
160
+ offset = 0
161
+ while offset < size:
162
+ stub = _CDFHStub._make(_CDFHStruct.unpack(cd[offset:offset + 46]))
163
+ offset += 46
164
+
165
+ raw_path = cd[offset:offset + stub.path_size]
166
+ offset += stub.path_size
167
+
168
+ extras = dict(self._parse_extras(cd[offset:offset + stub.extra_size]))
169
+ offset += stub.extra_size
170
+ offset += stub.comment_size
171
+
172
+ # TODO Consider "UPath" extra field for completeness sake
173
+ if stub.bitflag & 0b10000000000:
174
+ path = raw_path.decode('utf-8', errors='replace')
175
+ else:
176
+ path = raw_path.decode('cp437')
177
+
178
+ if zip64_extra := extras.get(b'\x01\x00'):
179
+ uncompressed, compressed, offset = self._parse_zip64_extra(zip64_extra)
180
+
181
+ if stub.compressed_size == 0xFF:
182
+ stub.compressed_size = compressed
183
+ if stub.uncompressed_size == 0xFF:
184
+ stub.uncompressed_size = uncompressed
185
+ if stub.offset == 0xFF:
186
+ stub.offset = offset
187
+
188
+ yield path, stub
189
+
190
+ async def _calc_data_offset(self, offset: int) -> int:
191
+ r = await self._request(offset, offset + 30)
192
+ lfh = _LFHStub._make(_LFHStruct.unpack(r.content))
193
+
194
+ # NOTE An encrypted file has encryption header following LFH.
195
+ return (offset
196
+ + 30 # LFH
197
+ + lfh.path_size
198
+ + lfh.extra_size)
199
+
200
+ @staticmethod
201
+ def _parse_msdos_date(date, time):
202
+ # See: https://learn.microsoft.com/en-us/windows/win32/api/winbase/nf-winbase-dosdatetimetofiletime
203
+ # Microsoft was retarded from its early days.
204
+ year = (date >> 9) + 1980
205
+ month = date >> 5 & 0xF
206
+ day = date & 0x1F
207
+ hour = time >> 11
208
+ minute = time >> 5 & 0x3F
209
+ second = time & 0x1F
210
+
211
+ return year, month, day, hour, minute, second * 2
212
+
213
+ async def load_entries(self):
214
+ if self.entries is not None:
215
+ return
216
+
217
+ r = await self.client.head(self.url)
218
+ if r.status_code != 200:
219
+ raise HTTPError(f"Got status code {r.status_code} for {self.url}")
220
+ if r.headers.get('Accept-Ranges') != 'bytes':
221
+ raise HTTPError(f"Range requests not supported on {self.url}")
222
+
223
+ if size := r.headers.get('Content-Length'):
224
+ self.size = int(size)
225
+ else:
226
+ raise HTTPError(f"Unable to determine length of zip file for {self.url}")
227
+
228
+ eocd, eocd_start = await self._parse_eocd()
229
+
230
+ # Load EOCD64 if Zip64 detected, and replace original EOCD.
231
+ if self._detect_zip64_from_eocd(eocd):
232
+ eocd64_start = await self._parse_eocd64_locator(eocd_start)
233
+ eocd = await self._parse_eocd64(eocd64_start)
234
+
235
+ self.entries = [ZipEntryInfo(path=path,
236
+ raw_offset=info.offset,
237
+ file_size=info.uncompressed_size,
238
+ checksum=info.checksum,
239
+ encrypted=bool(info.bitflag & 1),
240
+ compression=ZipCompression(info.compression_mode),
241
+ compressed_size=info.compressed_size,
242
+ modified_date=self._parse_msdos_date(info.file_mdate, info.file_mtime),
243
+ internal_attrs=info.internal_attrs,
244
+ external_attrs=info.external_attrs)
245
+ async for path, info in self._parse_cd_ents(eocd.cd_offset, eocd.cd_size)]
246
+
247
+ async def extract(self, info, output):
248
+ offset = await self._calc_data_offset(info.raw_offset)
249
+
250
+ # Nothing to do, so exit.
251
+ if not info.compressed_size:
252
+ return
253
+ if info.encrypted:
254
+ raise ZipError("Encrypted files are not supported")
255
+
256
+ r = await self._request(offset, offset + info.compressed_size, stream=True)
257
+
258
+ match info.compression:
259
+ case ZipCompression.NONE:
260
+ decompressor = None
261
+ case ZipCompression.DEFLATE:
262
+ # Negative value for raw DEFLATE
263
+ decompressor = zlib.decompressobj(-15)
264
+ case ZipCompression.BZIP2:
265
+ decompressor = bz2.BZ2Decompressor()
266
+ case ZipCompression.LZMA:
267
+ decompressor = lzma.LZMADecompressor()
268
+ case ZipCompression.ZSTANDARD:
269
+ decompressor = compression.zstd.ZstdDecompressor()
270
+ case _:
271
+ raise NotImplementedError
272
+
273
+ if decompressor:
274
+ async for chunk in r.aiter_bytes():
275
+ decompressed = decompressor.decompress(chunk)
276
+ output.write(decompressed)
277
+ yield len(decompressed)
278
+
279
+ decompressed = decompressor.flush()
280
+ output.write(decompressed)
281
+ yield len(decompressed)
282
+ else:
283
+ async for chunk in r.aiter_bytes():
284
+ output.write(chunk)
285
+ yield len(chunk)
286
+
287
+ async def __aenter__(self):
288
+ await self.load_entries()
289
+
290
+ return self
291
+
292
+ async def __aexit__(self, *args):
293
+ pass
@@ -0,0 +1,54 @@
1
+ from typing import NamedTuple
2
+
3
+ class _CDFHStub(NamedTuple):
4
+ signature: bytes
5
+ maker_version: int
6
+ version_needed: int
7
+ bitflag: int
8
+ compression_mode: int
9
+ file_mtime: int
10
+ file_mdate: int
11
+ checksum: int
12
+ compressed_size: int
13
+ uncompressed_size: int
14
+ path_size: int
15
+ extra_size: int
16
+ comment_size: int
17
+ begin_disk: int
18
+ internal_attrs: int
19
+ external_attrs: int
20
+ offset: int
21
+
22
+ class _EOCDStub(NamedTuple):
23
+ signature: bytes
24
+ disk: int
25
+ begin_disk: int
26
+ ents_on_disk: int
27
+ ents_total: int
28
+ cd_size: int
29
+ cd_offset: int
30
+
31
+ class _EOCD64Stub(NamedTuple):
32
+ signature: bytes
33
+ size: int
34
+ make_version: int
35
+ version_needed: int
36
+ disk: int
37
+ begin_disk: int
38
+ ents_on_disk: int
39
+ ents_total: int
40
+ cd_size: int
41
+ cd_offset: int
42
+
43
+ class _LFHStub(NamedTuple):
44
+ signature: bytes
45
+ version: int
46
+ bitflag: int
47
+ compression_mode: int
48
+ file_mtime: int
49
+ file_mdate: int
50
+ checksum: int
51
+ compressed_size: int
52
+ uncomprssed_size: int
53
+ path_size: int
54
+ extra_size: int