PyPI - hctef - Versions diffs - 0.1.0__py3-none-any.whl - Mend

hctef 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

hctef/__init__.py +13 -0
hctef/__version__.py +34 -0
hctef/aio/__init__.py +5 -0
hctef/aio/async_file_read_cache.py +223 -0
hctef/aio/async_http_file.py +429 -0
hctef/exceptions.py +10 -0
hctef/file_read_cache.py +155 -0
hctef/http_file.py +279 -0
hctef/interval_tree.py +83 -0
hctef/py.typed +0 -0
hctef-0.1.0.dist-info/METADATA +236 -0
hctef-0.1.0.dist-info/RECORD +13 -0
hctef-0.1.0.dist-info/WHEEL +4 -0

hctef/__init__.py ADDED Viewed

@@ -0,0 +1,13 @@
+from .http_file import HttpFile
+try:
+    from .__version__ import __version__, __version_tuple__
+except ImportError:
+    # The generated `__version__` module could not be imported; fall back
+    # to a placeholder version. The tuple holds ints, matching the shape of
+    # the generated `(0, 1, 0)` tuple, so the two stay comparable (a tuple
+    # of strings would raise TypeError when ordered against a tuple of ints).
+    __version__ = '0.0.0'
+    __version_tuple__ = (0, 0, 0)
+# Names re-exported as the package's public API.
+__all__: list[str] = [
+    'HttpFile',
+    '__version__',
+    '__version_tuple__',
+]

hctef/__version__.py ADDED Viewed

@@ -0,0 +1,34 @@
+# file generated by setuptools-scm
+# don't change, don't track in version control
+# Every public name is exposed twice: a dunder form and a plain alias.
+__all__ = [
+    "__version__",
+    "__version_tuple__",
+    "version",
+    "version_tuple",
+    "__commit_id__",
+    "commit_id",
+]
+# Hard-coded to False, so at runtime the `typing` imports below are skipped
+# and the two aliases fall back to plain `object`.
+# NOTE(review): presumably static type checkers treat a name spelled
+# TYPE_CHECKING as true and therefore see the precise aliases - confirm.
+TYPE_CHECKING = False
+if TYPE_CHECKING:
+    from typing import Tuple
+    from typing import Union
+    VERSION_TUPLE = Tuple[Union[int, str], ...]
+    COMMIT_ID = Union[str, None]
+else:
+    VERSION_TUPLE = object
+    COMMIT_ID = object
+# Bare annotations: declare the types of the module attributes assigned below.
+version: str
+__version__: str
+__version_tuple__: VERSION_TUPLE
+version_tuple: VERSION_TUPLE
+commit_id: COMMIT_ID
+__commit_id__: COMMIT_ID
+# Actual values; each chained assignment binds both spellings of a name.
+__version__ = version = '0.1.0'
+__version_tuple__ = version_tuple = (0, 1, 0)
+__commit_id__ = commit_id = None

hctef/aio/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+from .async_http_file import AsyncHttpFile
+# Names re-exported as the public API of this subpackage.
+__all__ = [
+    'AsyncHttpFile',
+]

hctef/aio/async_file_read_cache.py ADDED Viewed

@@ -0,0 +1,223 @@
+import asyncio
+import io
+import logging
+from collections.abc import Callable
+from hctef.interval_tree import BinningIntervalTree, Interval
+logger = logging.getLogger(__name__)
+class AsyncFileReadCache:
+    """
+    An async-safe cache for file chunks that supports concurrent reads.
+    Key Features:
+    - Stores bytes | asyncio.Task[bytes] in intervals
+    - Multiple reads of the same range share the same fetch task
+    - Lock-free: uses asyncio's single-threaded execution model
+    - A failed or cancelled fetch is evicted so that later reads can retry
+    All byte ranges are half-open: `start` is inclusive, `end` is exclusive.
+    """
+    def __init__(
+        self,
+        file_size: int,
+        fetch_range: Callable[[int, int], asyncio.Task[bytes]],
+        minimum_request_size: int = 8192,
+    ) -> None:
+        """
+        Initialize async file read cache.
+        Args:
+            file_size: Total size of the file being cached
+            fetch_range: Callable taking (start, end) and returning a Task
+                that resolves to the bytes of that range
+            minimum_request_size: Minimum bytes to fetch in a single request;
+                also passed to the interval tree as its bin size
+        Raises:
+            ValueError: If file_size is negative
+        """
+        if file_size < 0:
+            raise ValueError('File size cannot be less than zero')
+        self.file_size = file_size
+        self.cache = BinningIntervalTree[bytes | asyncio.Task[bytes]](
+            bin_size=minimum_request_size,
+        )
+        self._fetcher = fetch_range
+        self._minimum_request_size = minimum_request_size
+    def clear(self) -> None:
+        """
+        Resets the cache, clearing all stored data and intervals.
+        """
+        logger.debug('Clearing all cached data')
+        self.cache.clear()
+    def _store_and_merge(self, start: int, end: int, chunk: bytes) -> None:
+        """
+        Stores a new chunk of data and merges it with any adjacent or
+        overlapping chunks that contain completed bytes (not Tasks).
+        Args:
+            start: First byte offset covered by chunk (inclusive)
+            end: Offset just past the last byte covered by chunk (exclusive)
+            chunk: The fetched bytes for [start, end)
+        """
+        # Query one byte wider on each side so adjacent chunks are found as
+        # well; intervals still holding an in-flight Task are left alone.
+        overlapping = [
+            iv
+            for iv in self.cache.find_overlapping(start - 1, end + 1)
+            if isinstance(iv.data, bytes)
+        ]
+        min_start = start
+        max_end = end
+        parts = {start: chunk}
+        for iv in overlapping:
+            min_start = min(min_start, iv.begin)
+            max_end = max(max_end, iv.end)
+            parts[iv.begin] = iv.data
+            self.cache.remove(iv)
+        # Concatenate the pieces in offset order. This assumes the pieces
+        # touch but do not overlap one another.
+        merged = b''.join(parts[offset] for offset in sorted(parts))
+        self.cache.add(Interval(min_start, max_end, merged))
+        # The summary iterates and sorts the whole cache, so build it only
+        # when debug logging is actually enabled.
+        if logger.isEnabledFor(logging.DEBUG):
+            logger.debug(
+                '--- CACHE UPDATE: Cache now contains: %s',
+                [
+                    (iv.begin, iv.end, type(iv.data).__name__)
+                    for iv in sorted(self.cache)
+                ],
+            )
+    def _create_fetch_task(self, start: int, end: int) -> asyncio.Task[bytes] | None:
+        """
+        Creates a fetch task and stores it in the cache synchronously.
+        The requested range is first widened towards minimum_request_size,
+        to the right and then to the left, without running into intervals
+        that are already cached or past the file boundaries.
+        Args:
+            start: Start of the missing range (inclusive)
+            end: End of the missing range (exclusive)
+        Returns:
+            Task to await, or None if range is already covered
+        """
+        # Apply minimum request size logic (same as sync version)
+        more = self._minimum_request_size - (end - start)
+        if more > 0:
+            # Grow to the right, stopping at the next cached interval.
+            # NOTE(review): relies on find_overlapping returning intervals
+            # ordered by `begin` by default - confirm in interval_tree.
+            overlapping = self.cache.find_overlapping(
+                end,
+                min(self.file_size, end + self._minimum_request_size),
+            )
+            right_wall = overlapping[0].begin if overlapping else self.file_size
+            end = min(end + more, right_wall)
+            more = self._minimum_request_size - (end - start)
+        if more > 0:
+            # Still too small: grow to the left, stopping at the cached
+            # interval picked by the sort_by_end ordering.
+            overlapping = self.cache.find_overlapping(
+                max(0, start - self._minimum_request_size),
+                start,
+                sort_by_end=True,
+            )
+            left_wall = overlapping[0].end if overlapping else 0
+            start = max(start - more, left_wall)
+        # Check if already covered (bytes or task)
+        # Not needed in sync version due to no concurrency
+        existing = self.cache.find_overlapping(start, end)
+        if existing:
+            logger.debug(
+                'FETCH SKIPPED: bytes %s-%s (already in cache)',
+                start,
+                end - 1,
+            )
+            return None
+        # Create task for the actual fetch operation
+        async def do_fetch() -> bytes:
+            try:
+                chunk = await self._fetcher(start, end)
+            finally:
+                # Always drop the placeholder interval. On success it is
+                # replaced by the bytes below; on failure or cancellation
+                # removing it keeps a dead Task from poisoning the range,
+                # so a later read can issue a fresh request.
+                self.cache.remove(task_interval)
+            self._store_and_merge(start, end, chunk)
+            return chunk
+        # The coroutine does not start running until control returns to the
+        # event loop, so `task_interval` is bound before do_fetch uses it.
+        task = asyncio.create_task(do_fetch())
+        task_interval: Interval[bytes | asyncio.Task[bytes]] = Interval(
+            start,
+            end,
+            task,
+        )
+        self.cache.add(task_interval)
+        logger.debug(
+            'FETCH INITIATED: bytes %s-%s (task created)',
+            start,
+            end - 1,
+        )
+        return task
+    def _find_missing_ranges(self, start: int, end: int) -> list[Interval]:
+        """
+        Find 'holes' in the cache that need to be fetched.
+        Args:
+            start: Start of the range of interest (inclusive)
+            end: End of the range of interest (exclusive)
+        Returns:
+            Intervals (with None as data) covering every part of
+            [start, end) that no cached interval covers, in ascending order
+        """
+        missing: list[Interval] = []
+        # All intervals (bytes or tasks) are considered "covered"
+        relevant_intervals = sorted(self.cache.find_overlapping(start, end))
+        pos = start
+        for interval in relevant_intervals:
+            if pos < interval.begin:
+                missing.append(Interval(pos, interval.begin, None))
+            pos = max(pos, interval.end)
+        if pos < end:
+            missing.append(Interval(pos, end, None))
+        return missing
+    async def read(self, start: int, end: int) -> bytes:
+        """
+        Reads a range of bytes asynchronously, utilizing the cache and
+        fetching if necessary.
+        Args:
+            start: First byte to read (inclusive)
+            end: Offset just past the last byte to read (exclusive)
+        Returns:
+            The bytes of [start, end)
+        Raises:
+            ValueError: If end lies beyond the end of the file
+        """
+        if end > self.file_size:
+            raise ValueError('Read request extends beyond the end of the file.')
+        logger.debug(
+            'Read Request for bytes %s-%s',
+            start,
+            end - 1,
+        )
+        missing_intervals = self._find_missing_ranges(start, end)
+        if not missing_intervals:
+            logger.debug('CACHE HIT: All requested data already in cache.')
+        else:
+            logger.debug(
+                'CACHE MISS: Missing intervals are: %s',
+                [(iv.begin, iv.end) for iv in missing_intervals],
+            )
+            # Synchronously create all fetch tasks. The tasks register
+            # themselves in the cache, so they need not be collected here;
+            # the assembly loop below awaits them through their intervals.
+            for interval in missing_intervals:
+                # Check if the interval (or part of it) has been filled
+                # by a previous larger fetch in this same read() call.
+                still_missing = self._find_missing_ranges(
+                    interval.begin,
+                    interval.end,
+                )
+                for gap in still_missing:
+                    self._create_fetch_task(gap.begin, gap.end)
+        # Assemble result, awaiting any fetch tasks
+        result_buffer = io.BytesIO()
+        cached_chunks = sorted(self.cache.find_overlapping(start, end))
+        for interval in cached_chunks:
+            read_start = max(start, interval.begin)
+            read_end = min(end, interval.end)
+            # If this is still a task, await it to get bytes
+            if isinstance(interval.data, asyncio.Task):
+                data = await interval.data
+            else:
+                data = interval.data
+            # Translate absolute file offsets into offsets within this chunk.
+            slice_start = read_start - interval.begin
+            slice_end = read_end - interval.begin
+            result_buffer.write(data[slice_start:slice_end])
+        return result_buffer.getvalue()

hctef/aio/async_http_file.py ADDED Viewed

@@ -0,0 +1,429 @@
+from __future__ import annotations
+import asyncio
+from typing import Literal, Self
+try:
+    import aiohttp
+except ImportError:
+    raise ImportError(
+        'Must install hctef with `[async]` extra to get necessary dependencies',
+    ) from None
+from hctef.exceptions import HctefNetworkError, HctefUrlError
+from .async_file_read_cache import AsyncFileReadCache
+def _check_url(url: str) -> None:
+    """
+    Reject any URL that is not an HTTP or HTTPS URL.
+    Args:
+        url: Candidate URL
+    Raises:
+        HctefUrlError: If url begins with neither `http:` nor `https:`
+    """
+    accepted_prefixes = ('http:', 'https:')
+    if url.startswith(accepted_prefixes):
+        return
+    raise HctefUrlError("URL must start with 'http:' or 'https:'")
+class _OpenedAsyncHttpFile:
+    """
+    Internal class managing shared state for AsyncHttpFile.
+    Holds the aiohttp session, the file size and the read cache that all
+    cursors created for one opened file read through.
+    """
+    def __init__(
+        self,
+        http_file: AsyncHttpFile,
+        session: aiohttp.ClientSession,
+        size: int,
+    ) -> None:
+        """
+        Initialize opened HTTP file with pre-fetched async values.
+        Args:
+            http_file: The parent AsyncHttpFile instance
+            session: Pre-created aiohttp session
+            size: File size obtained via async HTTP request
+        """
+        self.http_file = http_file
+        self.session = session
+        self.size = size
+        # The minimum request size is capped at the file size so that small
+        # files are not asked for more bytes than they contain.
+        self.cache = AsyncFileReadCache(
+            self.size,
+            self._fetch_range,
+            minimum_request_size=min(
+                self.http_file._minimum_range_request_bytes,
+                self.size,
+            ),
+        )
+    @classmethod
+    async def create(cls, http_file: AsyncHttpFile) -> Self:
+        """
+        Async factory method to create _OpenedAsyncHttpFile.
+        Args:
+            http_file: The parent AsyncHttpFile instance
+        Returns:
+            Fully initialized _OpenedAsyncHttpFile instance
+        Raises:
+            HctefNetworkError: If the file size cannot be determined
+        """
+        session = aiohttp.ClientSession()
+        try:
+            size = await cls._get_file_size(session, http_file.url)
+            return cls(http_file, session, size)
+        except BaseException:
+            # Nothing else references the session yet, so it must be closed
+            # here or it would leak whenever opening fails.
+            await session.close()
+            raise
+    @staticmethod
+    async def _get_file_size(
+        session: aiohttp.ClientSession,
+        url: str,
+    ) -> int:
+        """
+        Get total file size using async HTTP range request.
+        Args:
+            session: aiohttp session to use for request
+            url: URL to fetch size for
+        Returns:
+            File size in bytes
+        Raises:
+            HctefNetworkError: If size cannot be determined
+        """
+        headers = {'Range': 'bytes=0-'}
+        try:
+            async with session.get(url, headers=headers) as response:
+                content_range = response.headers.get('Content-Range')
+            # The total size is whatever follows the last '/' of the header.
+            size = int(content_range.split('/')[-1]) if content_range else None
+        except Exception as e:
+            raise HctefNetworkError(
+                f'Cannot determine file size for {url}',
+            ) from e
+        if size is None:
+            # If no Content-Range header, server doesn't support ranges.
+            # Raised outside the try block so this specific message is not
+            # re-wrapped by the generic handler above.
+            raise HctefNetworkError(
+                f'Server does not support range requests for {url}',
+            )
+        return size
+    def _fetch_range(self, start: int, end: int) -> asyncio.Task[bytes]:
+        """
+        Create an async task to fetch byte range using HTTP request.
+        Args:
+            start: Start byte position (inclusive)
+            end: End byte position (exclusive)
+        Returns:
+            Asyncio task that will fetch the requested bytes
+        """
+        return asyncio.create_task(self._do_fetch_range(start, end))
+    async def _do_fetch_range(self, start: int, end: int) -> bytes:
+        """
+        Actually fetch byte range using async HTTP request.
+        Args:
+            start: Start byte position (inclusive)
+            end: End byte position (exclusive)
+        Returns:
+            Bytes fetched from the range
+        Raises:
+            HctefUrlError: If the byte range is empty, negative or extends
+                past the end of the file
+            HctefNetworkError: If the request fails, the server answers with
+                an error status, or the body length does not match the range
+            RuntimeError: Propagated unchanged from the HTTP client
+        """
+        if start >= end or start < 0 or end > self.size:
+            raise HctefUrlError(
+                f'Invalid byte range: {start}-{end} (file size: {self.size})',
+            )
+        try:
+            # HTTP byte ranges are inclusive on both ends, hence `end - 1`.
+            headers = {'Range': f'bytes={start}-{end - 1}'}
+            async with self.session.get(
+                self.http_file.url,
+                headers=headers,
+            ) as response:
+                # Turn 4xx/5xx answers into errors instead of handing an
+                # error page to the cache as if it were file content.
+                response.raise_for_status()
+                data = await response.read()
+        except RuntimeError:
+            # NOTE(review): presumably raised when the session has already
+            # been closed; deliberately passed through unwrapped.
+            raise
+        except Exception as e:
+            raise HctefNetworkError(
+                f'Failed to fetch bytes {start}-{end} from {self.http_file.url}',
+            ) from e
+        expected = end - start
+        if len(data) != expected:
+            # A server that ignores the Range header returns the whole file;
+            # caching that under [start, end) would corrupt later reads.
+            raise HctefNetworkError(
+                f'Expected {expected} bytes for range {start}-{end} from '
+                f'{self.http_file.url}, got {len(data)}',
+            )
+        return data
+    async def read(self, position: int, size: int | None = None, /) -> bytes:
+        """
+        Read bytes from a specific position without managing cursor state.
+        Args:
+            position: Starting byte position to read from
+            size: Number of bytes to read (None for all remaining)
+        Returns:
+            Bytes read from the file; fewer than size bytes when the request
+            runs past the end of the file
+        Raises:
+            ValueError: If the resulting number of bytes to read is negative
+        """
+        if size is None:
+            size = self.size - position
+        if size < 0:
+            raise ValueError(f'Cannot read negative number of bytes, got: {size}')
+        if size == 0:
+            return b''
+        start = position
+        # Clamp to the file size so the cache is never asked past the end.
+        end = min(start + size, self.size)
+        return await self.cache.read(start, end)
+    async def close(self) -> None:
+        """
+        Close the file and session.
+        """
+        await self.session.close()
+class AsyncHttpFileCursor:
+    """
+    Independent read position on top of a shared, already opened HTTP file.
+    """
+    def __init__(self, opened_file: _OpenedAsyncHttpFile) -> None:
+        """
+        Bind the cursor to shared file state, starting at offset 0.
+        Args:
+            opened_file: Shared opened-file state this cursor reads through
+        """
+        self.position = 0
+        self.ohf = opened_file
+    @property
+    def size(self) -> int:
+        # The size lives on the shared state; the cursor only forwards it.
+        return self.ohf.size
+    async def read(self, size: int | None = None, /) -> bytes:
+        """
+        Read from the cursor position and advance past the bytes returned.
+        Args:
+            size: How many bytes to read (None means everything remaining)
+        Returns:
+            The bytes that were read
+        """
+        chunk = await self.ohf.read(self.position, size)
+        self.position = self.position + len(chunk)
+        return chunk
+    def seek(self, offset: int, whence: int = 0, /) -> int:
+        """
+        Move the cursor (synchronous - no I/O).
+        The resulting position is clamped to the range 0..size.
+        Args:
+            offset: Byte offset
+            whence: Reference point (0=file start, 1=current position, 2=end)
+        Returns:
+            New absolute position
+        Raises:
+            ValueError: If whence is not 0, 1 or 2
+        """
+        match whence:
+            case 0:
+                target = offset
+            case 1:
+                target = self.position + offset
+            case 2:
+                target = self.size + offset
+            case _:
+                raise ValueError(f'Invalid whence value: {whence}')
+        # Clamp into the valid range instead of raising.
+        self.position = max(min(target, self.size), 0)
+        return self.position
+    def tell(self) -> int:
+        """
+        Report the cursor position (synchronous - no I/O).
+        Returns:
+            Current byte position in file
+        """
+        return self.position
+    def clone(self) -> AsyncHttpFileCursor:
+        """
+        Create a sibling cursor on the same opened file.
+        Returns:
+            New cursor sharing the opened-file state, positioned at offset 0
+        """
+        sibling = AsyncHttpFileCursor(self.ohf)
+        return sibling
+    def readable(self) -> bool:
+        return True
+    def writable(self) -> bool:
+        return False
+    def seekable(self) -> bool:
+        return True
+class AsyncHttpFile:
+    """
+    Async file-like wrapper for HTTP URLs with concurrent read support.
+    The object owns one primary cursor; `clone()` hands out additional
+    cursors with independent positions for concurrent reads.
+    """
+    def __init__(
+        self,
+        url: str,
+        minimum_range_request_bytes: int = 8192,
+        prefetch_bytes: int = 2**20,
+        prefetch_direction: Literal['START', 'END'] = 'END',
+    ) -> None:
+        """
+        Initialize async HTTP file wrapper.
+        Args:
+            url: HTTP/HTTPS URL for a file
+        Keyword Args:
+            minimum_range_request_bytes:
+                Least number of bytes to request,
+                except when filling cache gaps
+            prefetch_bytes:
+                How many bytes to request when opening the file.
+                Set to 0 or less to disable prefetch. Default 1 MiB.
+            prefetch_direction:
+                Whether to prefetch from file start or file end.
+                Possible values `START` or `END`.
+        Raises:
+            HctefUrlError: If URL is invalid
+        """
+        _check_url(url)
+        self.url = url
+        self._prefetch_bytes = prefetch_bytes
+        self._prefetch_direction = prefetch_direction
+        self._minimum_range_request_bytes = minimum_range_request_bytes
+        # None while the file is closed; set by open().
+        self._cursor: AsyncHttpFileCursor | None = None
+    @property
+    def cursor(self) -> AsyncHttpFileCursor:
+        """
+        The primary cursor of the opened file.
+        Raises:
+            ValueError: If the file is not open
+        """
+        if not self._cursor:
+            raise ValueError('I/O operation on closed file')
+        return self._cursor
+    @property
+    def size(self) -> int:
+        """
+        Total file size in bytes.
+        Raises:
+            ValueError: If the file is not open
+        """
+        return self.cursor.size
+    async def open(self) -> Self:
+        """
+        Open the file asynchronously and run the configured prefetch.
+        Returns:
+            Self for use in context manager
+        Raises:
+            HctefNetworkError: If the file size cannot be determined
+        """
+        self._cursor = AsyncHttpFileCursor(await _OpenedAsyncHttpFile.create(self))
+        try:
+            await self._prefetch()
+        except BaseException:
+            # When open() fails the caller never receives a usable object
+            # (and __aexit__ is not run), so release the session here.
+            await self.close()
+            raise
+        return self
+    async def _prefetch(self) -> None:
+        """
+        Warm the cache from the file start or end, then rewind to offset 0.
+        Does nothing except rewind when the prefetch size is zero or less.
+        """
+        prefetch_bytes = min(self._prefetch_bytes, self.size)
+        if prefetch_bytes > 0 and self._prefetch_direction == 'START':
+            await self.read(prefetch_bytes)
+        elif prefetch_bytes > 0 and self._prefetch_direction == 'END':
+            # whence=2 is relative to the end of the file, so the offset
+            # has to be negative to land `prefetch_bytes` before the end.
+            self.cursor.seek(-prefetch_bytes, 2)
+            await self.read(prefetch_bytes)
+        self.cursor.seek(0)
+    def clone(self) -> AsyncHttpFileCursor:
+        """
+        Create a new cursor for concurrent reads.
+        Returns:
+            New cursor sharing cache but with independent position
+        Raises:
+            ValueError: If file is not opened
+        """
+        return self.cursor.clone()
+    async def close(self) -> None:
+        """
+        Close the file and release resources.
+        Safe to call on a file that is not open.
+        """
+        if self._cursor:
+            await self._cursor.ohf.close()
+        self._cursor = None
+    async def __aenter__(self) -> Self:
+        """Async context manager entry."""
+        return await self.open()
+    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
+        """Async context manager exit."""
+        await self.close()
+    def __repr__(self) -> str:
+        if self._cursor:
+            return (
+                f'AsyncHttpFile(url={self.url!r}, opened=True, '
+                f'size={self.size}, pos={self._cursor.position})'
+            )
+        return f'AsyncHttpFile(url={self.url!r}, opened=False)'
+    async def read(self, size: int | None = None, /) -> bytes:
+        """
+        Read bytes from current position asynchronously.
+        Args:
+            size: Number of bytes to read (None for all remaining)
+        Returns:
+            Bytes read from the file
+        Raises:
+            ValueError: If the file is not open
+        """
+        return await self.cursor.read(size)
+    def seek(self, offset: int, whence: int = 0, /) -> int:
+        """
+        Change stream position (synchronous - no I/O).
+        Args:
+            offset: Byte offset
+            whence: How to interpret offset (0=absolute, 1=relative, 2=from end)
+        Returns:
+            New absolute position
+        Raises:
+            ValueError: If the file is not open
+        """
+        return self.cursor.seek(offset, whence)
+    def tell(self) -> int:
+        """
+        Get current stream position (synchronous - no I/O).
+        Returns:
+            Current byte position in file
+        Raises:
+            ValueError: If the file is not open
+        """
+        return self.cursor.tell()
+    def readable(self) -> bool:
+        return True
+    def writable(self) -> bool:
+        return False
+    def seekable(self) -> bool:
+        return True