PyPI - zipinspect - Versions diffs - 0.1.0__py3-none-any.whl - Mend

zipinspect 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

zipinspect/__init__.py +233 -0
zipinspect/__main__.py +4 -0
zipinspect/utils/__init__.py +0 -0
zipinspect/utils/asyncio.py +19 -0
zipinspect/utils/misc.py +34 -0
zipinspect/zipread/__init__.py +293 -0
zipinspect/zipread/stubs.py +54 -0
zipinspect-0.1.0.dist-info/METADATA +107 -0
zipinspect-0.1.0.dist-info/RECORD +12 -0
zipinspect-0.1.0.dist-info/WHEEL +4 -0
zipinspect-0.1.0.dist-info/entry_points.txt +3 -0
zipinspect-0.1.0.dist-info/licenses/LICENSE +674 -0

zipinspect/__init__.py ADDED Viewed

@@ -0,0 +1,233 @@
+import asyncio
+import os.path
+import sys
+import textwrap
+import time
+from tabulate import tabulate
+from progress.bar import Bar
+from aioconsole import ainput
+from .zipread import HTTPZipReader
+from .utils.asyncio import TaskPool
+from .utils.misc import PaginatedCollection
+def dostime_to_rfc3339(t):
+    return time.strftime("%Y-%m-%dT%H:%M:%S", t + (0, 0, -1))
+def numfmt_iec(n):
+    for u in 'BKMG':
+        if abs(n) < 1024.0:
+            return f'{n:3.1f}{u}'
+        n /= 1024.0
+def sanitized_open(path, *args, **kwargs):
+    path = os.path.abspath(path)
+    if os.path.commonpath((path, os.getcwd())) != os.getcwd():
+        print(f"WARNING: Path {path} is dangerous; ignoring")
+        return None
+    os.makedirs(os.path.dirname(path), exist_ok=True)
+    return open(path, *args, **kwargs)
+def parse_repl_args(line):
+    """Space delimited argument parser like the shell, but minimal."""
+    parsed = []
+    accum = ''
+    escape = False
+    quote = False
+    for i, token in enumerate(line):
+        if quote:
+            if escape:
+                if token in '\\"':
+                    accum += token
+                    escape = False
+                else:
+                    print(f"ERROR: Cannot escape {token} at column {i}")
+            else:
+                if token == '\\':
+                    escape = True
+                elif token == '"':
+                    quote = False # Quote end
+                else:
+                    accum += token
+        else:
+            # Only append if accum is not empty
+            if token == ' ' and accum != '':
+                parsed.append(accum)
+                accum = '' # Reset
+            elif token == '"':
+                quote = True # Quote start
+            else:
+                accum += token
+    # Push the last argument
+    if accum:
+        parsed.append(accum)
+    return parsed
+async def extract_entries(z, entries, *, out_dir=None, concurrency=10):
+    async def extract_entry(entry, *, progress_cb):
+        final_path = f'{out_dir}/{entry.path}' if out_dir else entry.path
+        if not (output := sanitized_open(final_path, 'wb')):
+            return
+        # NOTE Multiple running async-for loops (i.e. async generators) are cumbersome,
+        # and thus to make matters manageable we use the good 'ol callbacks instead.
+        with output:
+            async for processed in z.extract(entry, output):
+                progress_cb(processed)
+    async with TaskPool(maxsize=concurrency) as pool:
+        visited_dirs = set()
+        total_tx = 0
+        coros = set()
+        bar = None
+        # Non-obvious control flow: by the time this function is called, `bar' would've been defined.
+        # This style of programming -- relying on global state -- is considered bad practice, but here
+        # this saves us the burden of not introducing one more asyncio.Task in the task pool.
+        def increment_done(n):
+            bar.next(n)
+        for info in entries:
+            if info.is_dir:
+                # Recursing into directories
+                if info.path not in visited_dirs:
+                    visited_dirs.add(info.path)
+                    for nested_info in z.entries:
+                        if (not nested_info.is_dir and
+                                nested_info.path.startswith(info.path)):
+                            coros.add(extract_entry(nested_info, progress_cb=increment_done))
+                            total_tx += nested_info.file_size
+            else: # Single files
+                if os.path.dirname(info.path) not in visited_dirs:
+                    coros.add(extract_entry(info, progress_cb=increment_done))
+                    total_tx += info.file_size
+        bar = Bar(max=total_tx, width=80, suffix='%(percent)d%%')
+        for coro in coros:
+            pool.create_task(coro)
+        bar.finish()
+def print_entries(pages):
+    def zipinfo_to_row(info):
+        size = numfmt_iec(info.file_size) \
+            if not info.is_dir else 'N/A'
+        timestamp = dostime_to_rfc3339(info.modified_date)
+        return info.path, size, timestamp
+    page = [(i, *zipinfo_to_row(info))
+            for i, info in enumerate(pages.current(),
+                                     start=pages.current_offset)]
+    print(tabulate(page, headers=['#', 'entry', 'size', 'modified date']))
+    print(f"(Page {pages.current_page + 1}/{pages.n_pages})")
+def int_safe(v, *args, **kwargs):
+    try:
+        iv = int(v, *args, **kwargs)
+    except ValueError:
+        iv = None
+        if v == '...':
+            print("ERROR: Ellipsis (...) is used in a wrong way", file=sys.stderr)
+        else:
+            print(f"ERROR: {v!r} is not an integer value", file=sys.stderr)
+    return iv
+async def app(url):
+    async with HTTPZipReader(url) as z:
+        pages = PaginatedCollection(z.entries)
+        while True:
+            try:
+                args = parse_repl_args(await ainput("> "))
+            except EOFError:
+                break
+            # Skip empty prompts.
+            if len(args) < 1:
+                continue
+            match args[0]:
+                case 'help':
+                    print(textwrap.dedent("""\
+                          This is the REPL, and the following commands are available.
+                          list                            List entries in the current page
+                          prev                            Go backward one page and show entries
+                          next                            Go forward one page and show entries
+                          extract <index> [dir]           Extract entry with index <index>
+                          extract <start>,...,<end> [dir] Extract entries from <start> to <end>
+                          extract <i0>,<i1>,...<in> [dir] Extract entries with specified indices
+                          NOTE: The extract command accepts an optinal path to the directory to extract into.
+                          If not provided, it extracts into the current working directory"""))
+                case 'list':
+                    print_entries(pages)
+                case 'prev':
+                    pages.previous()
+                    print_entries(pages)
+                case 'next':
+                    pages.next()
+                    print_entries(pages)
+                case 'extract':
+                    if len(args) < 2:
+                        print("ERROR: Nothing to extract, forgot an argument?", file=sys.stderr)
+                        continue
+                    indices = args[1].split(',')
+                    out_dir = None
+                    if len(args) > 2:
+                        out_dir = args[2]
+                    if len(indices) == 1:
+                        start = int_safe(indices[0])
+                        if start is None:
+                            continue
+                        if not 0 <= start < len(z.entries):
+                            print(f"ERROR: Index {start} is out of bounds", file=sys.stderr)
+                            continue
+                        await extract_entries(z, (z.entries[start],), out_dir=out_dir)
+                    else:
+                        if len(indices) == 3 and indices[1] == '...':
+                            start, end = int_safe(indices[0]), int_safe(indices[2])
+                            if start is None or end is None:
+                                continue
+                            if not(0 <= start < end < len(z.entries)):
+                                print(f"ERROR: Range {start},...,{end} is out of bounds", file=sys.stderr)
+                                continue
+                            await extract_entries(z, z.entries[start:end], out_dir=out_dir)
+                        else:
+                            # Filter out invalid and out-of-bounds indices
+                            entries = [z.entries[iv] for s in indices
+                                                     if (iv := int_safe(s)) is not None and 0 <= iv < len(z.entries)]
+                            await extract_entries(z, entries, out_dir=out_dir)
+                    # FIXME For some reason, Bar.finish() doesn't end with a newline.
+                    sys.stdout.write('\n\n')
+                case wrong_cmd:
+                    print(f"ERROR: Not a valid command {wrong_cmd}; try again.", file=sys.stderr)
+def main():
+    if len(sys.argv) < 2:
+        print("Forgot thy URL?", file=sys.stderr)
+    asyncio.run(app(sys.argv[1]), debug=True)

zipinspect/__main__.py ADDED Viewed

@@ -0,0 +1,4 @@
+# Executed when the package is run as a script (this file is zipinspect/__main__.py);
+# it simply delegates to the package-level main().
+from . import main
+if __name__ == "__main__":
+    main()

zipinspect/utils/__init__.py ADDED Viewed

File without changes

zipinspect/utils/asyncio.py ADDED Viewed

@@ -0,0 +1,19 @@
+from asyncio import TaskGroup, Semaphore
+class TaskPool(TaskGroup):
+    def __init__(self, *, maxsize):
+        self._semaphore = Semaphore(maxsize)
+        super().__init__()
+    def create_task(self, coro, **kwargs):
+        async def wrapper_coro():
+            await self._semaphore.acquire()
+            try:
+                result = await coro
+            finally:
+                self._semaphore.release()
+            return result
+        return super().create_task(wrapper_coro(), **kwargs)

zipinspect/utils/misc.py ADDED Viewed

@@ -0,0 +1,34 @@
+class PaginatedCollection:
+    def __init__(self, sequence, *, page_size = 25):
+        self.sequence = sequence
+        self.current_page = 0
+        self.n_pages = len(sequence) // page_size + 1
+        self.page_size = page_size
+    def previous(self):
+        if self.current_page == 0:
+            self.current_page = self.n_pages-1
+        else:
+            self.current_page -= 1
+    def next(self):
+        if self.current_page == self.n_pages-1:
+            self.current_page = 0
+        else:
+            self.current_page += 1
+    @property
+    def current_offset(self):
+        return self.current_page * self.page_size
+    def current(self):
+        return self.index(self.current_page)
+    def index(self, page: int):
+        if 0 <= page <= self.n_pages:
+            begin = page * self.page_size
+            end = (page + 1) * self.page_size
+            return self.sequence[begin:end]
+        else:
+            raise ValueError(f"Page index {page} out of bounds")

zipinspect/zipread/__init__.py ADDED Viewed

@@ -0,0 +1,293 @@
+import zlib
+import bz2
+import lzma
+import compression.zstd
+import httpx
+from enum import Enum
+from struct import Struct
+from dataclasses import dataclass
+from .stubs import (
+    _LFHStub,
+    _CDFHStub,
+    _EOCDStub,
+    _EOCD64Stub
+)
+# Precompiled little-endian layouts of the fixed-size Zip records parsed in this module.
+# Fixed part of a local file header: 30 bytes, unpacked into _LFHStub.
+_LFHStruct = Struct('<4sHHHHHIIIHH')
+# Fixed part of a central directory file header: 46 bytes, unpacked into _CDFHStub.
+_CDFHStruct = Struct('<4sHHHHHHIIIHHHHHII')
+# End of central directory record: the first 20 bytes only, unpacked into _EOCDStub
+# (the comment-length field that follows in the Zip format is not included).
+_EOCDStruct = Struct('<4sHHHHII')
+# Fixed part of the Zip64 end of central directory record: 56 bytes, ten values
+# matching the fields of _EOCD64Stub.
+_EOCD64Struct = Struct('<4sQHHIIQQQQ')
+# Zip64 end of central directory locator: 20 bytes
+# (signature, disk number, record offset, number of disks).
+_EOCD64LocatorStruct = Struct('<4sIQI')
+class ZipCompression(Enum):
+    """Compression method identifiers from Zip headers that this package recognises."""
+    NONE = 0  # stored: extract() copies the bytes through unchanged
+    DEFLATE = 8  # decoded by extract() with a raw zlib stream
+    BZIP2 = 12  # decoded by extract() with bz2
+    LZMA = 14  # decoded by extract() with lzma
+    ZSTANDARD = 93  # decoded by extract() with compression.zstd
+@dataclass
+class ZipEntryInfo:
+    """Metadata of one archive entry, built from its central directory record."""
+    # Entry path as decoded from the central directory; directories end with '/'.
+    path: str
+    # Offset of the entry's local file header within the archive.
+    raw_offset: int
+    # Uncompressed size in bytes.
+    file_size: int
+    # True when bit 0 of the general purpose flags is set.
+    encrypted: bool
+    # Checksum value stored in the central directory record.
+    checksum: int
+    # Compression method of the stored data.
+    compression: ZipCompression
+    # Size in bytes of the stored (compressed) data.
+    compressed_size: int
+    # (year, month, day, hour, minute, second) decoded from the MS-DOS date/time fields.
+    modified_date: tuple
+    # Raw internal and external attribute fields, stored as found.
+    internal_attrs: int
+    external_attrs: int
+    @property
+    def is_dir(self):
+        """Whether the entry is a directory, i.e. its path ends with a slash."""
+        return self.path.endswith('/')
+class ZipError(Exception):
+    """Raised when the archive's Zip structures are missing, invalid or unsupported."""
+    pass
+class HTTPError(Exception):
+    """Raised on an unexpected HTTP status, missing range support or missing Content-Length."""
+    pass
+class HTTPZipReader:
+    def __init__(self, url: str, *, httpx_args=None):
+        httpx_args = httpx_args or {}
+        self.url = url
+        self.entries = None
+        self.size = 0
+        self.client = httpx.AsyncClient(http2=True, **httpx_args)
+    async def _request(self, start, end=None, *, stream=False, httpx_args=None):
+        if httpx_args is None:
+            httpx_args = {}
+        if start < 0:
+            raise ValueError(f"Range can't beginning with {start}; clamping.")
+        if end is None:
+            end = self.size
+        if start >= end:
+            raise ValueError(f"Invalid range {start}-{end}")
+        httpx_args = httpx_args or {}
+        headers = httpx_args.setdefault('headers', {})
+        headers['Range'] = f'bytes={int(start)}-{int(end) - 1}'
+        request = self.client.build_request('GET', self.url, **httpx_args)
+        r = await self.client.send(request, stream=stream)
+        if r.status_code != 206:
+            raise HTTPError(f"Got status code {r.status_code} for {self.url}")
+        return r
+    async def _parse_eocd(self):
+        r = await self._request(max(0, self.size - 65557))
+        start_offset = r.content.rfind(b'\x50\x4B\x05\x06')
+        if start_offset == -1:
+            raise ZipError(f"EOCD Signature not found")
+        stub = _EOCDStub._make(_EOCDStruct.unpack(r.content[start_offset:start_offset + 20]))
+        if (stub.disk != stub.begin_disk or
+                stub.ents_on_disk != stub.ents_total):
+            raise ZipError("Multipart Zip files aren't supported")
+        return stub, (self.size - len(r.content) + start_offset)
+    async def _parse_eocd64(self, offset):
+        r = await self._request(offset, offset + 56)
+        stub = _EOCD64Stub._make(_EOCDStruct.unpack(r.content))
+        if stub.signature != b'\x50\x4B\x06\x06':
+            raise ZipError(f"Invalid EOCD signature: {stub.signature.hex()}")
+        if (stub.disk != stub.begin_disk or
+                stub.ents_on_disk != stub.ents_total):
+            raise ZipError("Multipart Zip files are not supported")
+    async def _parse_eocd64_locator(self, eocd_start):
+        r = await self._request(eocd_start - 20, eocd_start)
+        signature, disk, offset, n_disks = _EOCD64LocatorStruct.unpack(r.content)
+        if signature != b'\x50\x4B\x06\x07':
+            raise ZipError(f"Invalid EOCD64 signature: {signature.hex()}")
+        if disk != 0 or n_disks == 1:
+            raise ZipError("Multipart Zip files aren't supported")
+        return offset
+    @staticmethod
+    def _detect_zip64_from_eocd(stub: _EOCDStub):
+        if (stub.disk == 0xFFFF and stub.ents_total == 0xFFF and
+            stub.cd_size == 0xFFFFFFF and stub.cd_offset == 0xFFFFFFFF):
+            return True
+        else:
+            return False
+    @staticmethod
+    def _parse_extras(extras):
+        offset = 0
+        size = len(extras)
+        while offset < size:
+            eid = extras[offset:offset + 2]
+            size = int.from_bytes(extras[offset + 2:offset + 4], byteorder='little')
+            data = extras[offset + 4:offset + size]
+            yield eid, data
+    @staticmethod
+    def _parse_zip64_extra(data):
+        size = len(data)
+        uncompressed, compressed, offset = None, None, None
+        if size >= 8:
+            uncompressed = int.from_bytes(data[:8], byteorder='little')
+        if size >= 16:
+            compressed = int.from_bytes(data[8:16], byteorder='little')
+        if size >= 32:
+            offset = int.from_bytes(data[16:24], byteorder='little')
+        return uncompressed, compressed, offset
+    async def _parse_cd_ents(self, offset, size):
+        r = await self._request(offset, offset + size)
+        cd = r.content
+        offset = 0
+        while offset < size:
+            stub = _CDFHStub._make(_CDFHStruct.unpack(cd[offset:offset + 46]))
+            offset += 46
+            raw_path = cd[offset:offset + stub.path_size]
+            offset += stub.path_size
+            extras = dict(self._parse_extras(cd[offset:offset + stub.extra_size]))
+            offset += stub.extra_size
+            offset += stub.comment_size
+            # TODO Consider "UPath" extra field for completeness sake
+            if stub.bitflag & 0b10000000000:
+                path = raw_path.decode('utf-8', errors='replace')
+            else:
+                path = raw_path.decode('cp437')
+            if zip64_extra := extras.get(b'\x01\x00'):
+                uncompressed, compressed, offset = self._parse_zip64_extra(zip64_extra)
+                if stub.compressed_size == 0xFF:
+                    stub.compressed_size = compressed
+                if stub.uncompressed_size == 0xFF:
+                    stub.uncompressed_size = uncompressed
+                if stub.offset == 0xFF:
+                    stub.offset = offset
+            yield path, stub
+    async def _calc_data_offset(self, offset: int) -> int:
+        r = await self._request(offset, offset + 30)
+        lfh = _LFHStub._make(_LFHStruct.unpack(r.content))
+        # NOTE An encrypted file has encryption header following LFH.
+        return (offset
+                + 30  # LFH
+                + lfh.path_size
+                + lfh.extra_size)
+    @staticmethod
+    def _parse_msdos_date(date, time):
+        # See: https://learn.microsoft.com/en-us/windows/win32/api/winbase/nf-winbase-dosdatetimetofiletime
+        # Microsoft was retarded from its early days.
+        year = (date >> 9) + 1980
+        month = date >> 5 & 0xF
+        day = date & 0x1F
+        hour = time >> 11
+        minute = time >> 5 & 0x3F
+        second = time & 0x1F
+        return year, month, day, hour, minute, second * 2
+    async def load_entries(self):
+        if self.entries is not None:
+            return
+        r = await self.client.head(self.url)
+        if r.status_code != 200:
+            raise HTTPError(f"Got status code {r.status_code} for {self.url}")
+        if r.headers.get('Accept-Ranges') != 'bytes':
+            raise HTTPError(f"Range requests not supported on {self.url}")
+        if size := r.headers.get('Content-Length'):
+            self.size = int(size)
+        else:
+            raise HTTPError(f"Unable to determine length of zip file for {self.url}")
+        eocd, eocd_start = await self._parse_eocd()
+        # Load EOCD64 if Zip64 detected, and replace original EOCD.
+        if self._detect_zip64_from_eocd(eocd):
+            eocd64_start = await self._parse_eocd64_locator(eocd_start)
+            eocd = await self._parse_eocd64(eocd64_start)
+        self.entries = [ZipEntryInfo(path=path,
+                                     raw_offset=info.offset,
+                                     file_size=info.uncompressed_size,
+                                     checksum=info.checksum,
+                                     encrypted=bool(info.bitflag & 1),
+                                     compression=ZipCompression(info.compression_mode),
+                                     compressed_size=info.compressed_size,
+                                     modified_date=self._parse_msdos_date(info.file_mdate, info.file_mtime),
+                                     internal_attrs=info.internal_attrs,
+                                     external_attrs=info.external_attrs)
+                        async for path, info in self._parse_cd_ents(eocd.cd_offset, eocd.cd_size)]
+    async def extract(self, info, output):
+        offset = await self._calc_data_offset(info.raw_offset)
+        # Nothing to do, so exit.
+        if not info.compressed_size:
+            return
+        if info.encrypted:
+            raise ZipError("Encrypted files are not supported")
+        r = await self._request(offset, offset + info.compressed_size, stream=True)
+        match info.compression:
+            case ZipCompression.NONE:
+                decompressor = None
+            case ZipCompression.DEFLATE:
+                # Negative value for raw DEFLATE
+                decompressor = zlib.decompressobj(-15)
+            case ZipCompression.BZIP2:
+                decompressor = bz2.BZ2Decompressor()
+            case ZipCompression.LZMA:
+                decompressor = lzma.LZMADecompressor()
+            case ZipCompression.ZSTANDARD:
+                decompressor = compression.zstd.ZstdDecompressor()
+            case _:
+                raise NotImplementedError
+        if decompressor:
+            async for chunk in r.aiter_bytes():
+                decompressed = decompressor.decompress(chunk)
+                output.write(decompressed)
+                yield len(decompressed)
+            decompressed = decompressor.flush()
+            output.write(decompressed)
+            yield len(decompressed)
+        else:
+            async for chunk in r.aiter_bytes():
+                output.write(chunk)
+                yield len(chunk)
+    async def __aenter__(self):
+        await self.load_entries()
+        return self
+    async def __aexit__(self, *args):
+        pass

zipinspect/zipread/stubs.py ADDED Viewed

@@ -0,0 +1,54 @@
+from typing import NamedTuple
+class _CDFHStub(NamedTuple):
+    """Fixed-size part of one central directory file header.
+
+    The field order matches the 17 values unpacked with ``_CDFHStruct`` in
+    the package's ``__init__`` module.
+    """
+    signature: bytes
+    maker_version: int
+    version_needed: int
+    # General purpose flags; bit 0 is read as "encrypted" by the reader.
+    bitflag: int
+    compression_mode: int
+    # Packed MS-DOS time and date words of the last modification.
+    file_mtime: int
+    file_mdate: int
+    checksum: int
+    compressed_size: int
+    uncompressed_size: int
+    # Byte lengths of the variable-size path, extra field and comment that
+    # follow the fixed part of the record.
+    path_size: int
+    extra_size: int
+    comment_size: int
+    begin_disk: int
+    internal_attrs: int
+    external_attrs: int
+    # Offset of the entry's local file header within the archive.
+    offset: int
+class _EOCDStub(NamedTuple):
+    """End of central directory record, as unpacked with ``_EOCDStruct`` (7 values)."""
+    signature: bytes
+    # Disk numbers and per-disk/total entry counts; the reader requires each
+    # pair to be equal and otherwise reports a multipart archive.
+    disk: int
+    begin_disk: int
+    ents_on_disk: int
+    ents_total: int
+    # Size and archive offset of the central directory.
+    cd_size: int
+    cd_offset: int
+class _EOCD64Stub(NamedTuple):
+    """Zip64 end of central directory record; ten fields matching ``_EOCD64Struct``."""
+    signature: bytes
+    # Record size field, stored as found.
+    size: int
+    make_version: int
+    version_needed: int
+    # Disk numbers and per-disk/total entry counts; the reader requires each
+    # pair to be equal and otherwise reports a multipart archive.
+    disk: int
+    begin_disk: int
+    ents_on_disk: int
+    ents_total: int
+    # Size and archive offset of the central directory.
+    cd_size: int
+    cd_offset: int
+class _LFHStub(NamedTuple):
+    """Fixed-size part of a local file header, as unpacked with ``_LFHStruct``."""
+    signature: bytes
+    version: int
+    bitflag: int
+    compression_mode: int
+    # Packed MS-DOS time and date words of the last modification.
+    file_mtime: int
+    file_mdate: int
+    checksum: int
+    compressed_size: int
+    # NOTE(review): the field name is misspelled ("uncomprssed"); renaming it
+    # would change the tuple's interface, so it is only flagged here.
+    uncomprssed_size: int
+    # Byte lengths of the path and extra field that follow the fixed part;
+    # the reader adds both to locate the start of the entry's data.
+    path_size: int
+    extra_size: int