hctef 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hctef/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ from .http_file import HttpFile
2
+
3
try:
    from .__version__ import __version__, __version_tuple__
except ImportError:
    # The setuptools-scm generated module is not importable; fall back to a
    # placeholder version. The tuple uses integers so that it has the same
    # element type as the generated ``__version_tuple__`` (e.g. ``(0, 1, 0)``).
    __version__ = '0.0.0'
    __version_tuple__ = (0, 0, 0)
8
+
9
+ __all__: list[str] = [
10
+ 'HttpFile',
11
+ '__version__',
12
+ '__version_tuple__',
13
+ ]
hctef/__version__.py ADDED
@@ -0,0 +1,34 @@
1
+ # file generated by setuptools-scm
2
+ # don't change, don't track in version control
3
+
4
+ __all__ = [
5
+ "__version__",
6
+ "__version_tuple__",
7
+ "version",
8
+ "version_tuple",
9
+ "__commit_id__",
10
+ "commit_id",
11
+ ]
12
+
13
+ TYPE_CHECKING = False
14
+ if TYPE_CHECKING:
15
+ from typing import Tuple
16
+ from typing import Union
17
+
18
+ VERSION_TUPLE = Tuple[Union[int, str], ...]
19
+ COMMIT_ID = Union[str, None]
20
+ else:
21
+ VERSION_TUPLE = object
22
+ COMMIT_ID = object
23
+
24
+ version: str
25
+ __version__: str
26
+ __version_tuple__: VERSION_TUPLE
27
+ version_tuple: VERSION_TUPLE
28
+ commit_id: COMMIT_ID
29
+ __commit_id__: COMMIT_ID
30
+
31
+ __version__ = version = '0.1.0'
32
+ __version_tuple__ = version_tuple = (0, 1, 0)
33
+
34
+ __commit_id__ = commit_id = None
hctef/aio/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from .async_http_file import AsyncHttpFile
2
+
3
+ __all__ = [
4
+ 'AsyncHttpFile',
5
+ ]
@@ -0,0 +1,223 @@
1
+ import asyncio
2
+ import io
3
+ import logging
4
+
5
+ from collections.abc import Callable
6
+
7
+ from hctef.interval_tree import BinningIntervalTree, Interval
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
class AsyncFileReadCache:
    """
    An async-safe cache for file chunks that supports concurrent reads.

    Key Features:
    - Stores bytes | asyncio.Task[bytes] in intervals
    - Multiple reads of the same range share the same fetch task
    - Lock-free: uses asyncio's single-threaded execution model
    - A fetch that fails or is cancelled removes its placeholder interval,
      so a later read of the same range triggers a fresh fetch

    Byte ranges are expressed as ``start`` (inclusive) / ``end`` (exclusive).
    """

    def __init__(
        self,
        file_size: int,
        fetch_range: Callable[[int, int], asyncio.Task[bytes]],
        minimum_request_size: int = 8192,
    ) -> None:
        """
        Initialize async file read cache.

        Args:
            file_size: Total size of the file being cached
            fetch_range: Callable returning a Task that fetches a byte range
            minimum_request_size: Minimum bytes to fetch in a single request;
                also used as the bin size of the interval tree

        Raises:
            ValueError: If file_size is negative
        """
        if file_size < 0:
            raise ValueError('File size cannot be less than zero')

        self.file_size = file_size
        self.cache = BinningIntervalTree[bytes | asyncio.Task[bytes]](
            bin_size=minimum_request_size,
        )
        self._fetcher = fetch_range
        self._minimum_request_size = minimum_request_size

    def clear(self) -> None:
        """
        Resets the cache, clearing all stored data and intervals.
        """
        logger.debug('Clearing all cached data')
        self.cache.clear()

    def _store_and_merge(self, start: int, end: int, chunk: bytes) -> None:
        """
        Stores a new chunk of data and merges it with any adjacent or
        overlapping chunks that contain completed bytes (not Tasks).

        Args:
            start: First byte offset covered by chunk (inclusive)
            end: Offset one past the last byte covered by chunk
            chunk: The fetched bytes for [start, end)
        """
        # Widen the query by one byte on each side so that intervals which
        # merely touch the new chunk are found too. Pending Task intervals
        # are left alone; only completed bytes can be concatenated.
        neighbours = [
            iv
            for iv in self.cache.find_overlapping(start - 1, end + 1)
            if isinstance(iv.data, bytes)
        ]

        min_start = start
        max_end = end
        parts = {start: chunk}

        for iv in neighbours:
            min_start = min(min_start, iv.begin)
            max_end = max(max_end, iv.end)
            parts[iv.begin] = iv.data
            self.cache.remove(iv)

        # Concatenate in offset order; this relies on the pieces being
        # contiguous and non-overlapping.
        merged = b''.join(parts[offset] for offset in sorted(parts))

        self.cache.add(Interval(min_start, max_end, merged))
        logger.debug(
            '--- CACHE UPDATE: Cache now contains: %s',
            [(iv.begin, iv.end, type(iv.data).__name__) for iv in sorted(self.cache)],
        )

    def _create_fetch_task(self, start: int, end: int) -> asyncio.Task[bytes] | None:
        """
        Creates a fetch task and stores it in the cache synchronously.

        The requested range is first widened towards ``minimum_request_size``:
        to the right up to the next cached interval (or end of file), then to
        the left down to the previous cached interval (or offset 0).

        Args:
            start: First byte offset to fetch (inclusive)
            end: Offset one past the last byte to fetch

        Returns:
            Task to await, or None if range is already covered
        """
        # Apply minimum request size logic (same as sync version)
        more = self._minimum_request_size - (end - start)

        if more > 0:
            overlapping = self.cache.find_overlapping(
                end,
                min(self.file_size, end + self._minimum_request_size),
            )
            # NOTE(review): assumes the first returned interval is the one
            # closest to `end` -- confirm against BinningIntervalTree.
            right_wall = overlapping[0].begin if overlapping else self.file_size
            end = min(end + more, right_wall)
            more = self._minimum_request_size - (end - start)

        if more > 0:
            overlapping = self.cache.find_overlapping(
                max(0, start - self._minimum_request_size),
                start,
                sort_by_end=True,
            )
            # NOTE(review): assumes sort_by_end puts the interval closest to
            # `start` first -- confirm against BinningIntervalTree.
            left_wall = overlapping[0].end if overlapping else 0
            start = max(start - more, left_wall)

        # Check if already covered (bytes or task)
        # Not needed in sync version due to no concurrency
        existing = self.cache.find_overlapping(start, end)
        if existing:
            logger.debug(
                'FETCH SKIPPED: bytes %s-%s (already in cache)',
                start,
                end - 1,
            )
            return None

        # Create task for the actual fetch operation
        async def do_fetch() -> bytes:
            try:
                chunk = await self._fetcher(start, end)
            finally:
                # Always drop the placeholder, including on failure or
                # cancellation; otherwise the dead task would stay in the
                # cache and every later read of this range would re-raise.
                self.cache.remove(task_interval)
            self._store_and_merge(start, end, chunk)
            return chunk

        # The task body does not start running until control returns to the
        # event loop, so `task_interval` is bound before `do_fetch` uses it.
        task = asyncio.create_task(do_fetch())
        task_interval: Interval[bytes | asyncio.Task[bytes]] = Interval(
            start,
            end,
            task,
        )
        self.cache.add(task_interval)

        logger.debug(
            'FETCH INITIATED: bytes %s-%s (task created)',
            start,
            end - 1,
        )

        return task

    def _find_missing_ranges(self, start: int, end: int) -> list[Interval]:
        """
        Find 'holes' in the cache that need to be fetched.

        Args:
            start: First byte offset of the range to inspect (inclusive)
            end: Offset one past the last byte of the range to inspect

        Returns:
            Intervals (with ``None`` data) for each uncovered sub-range,
            in ascending order
        """
        missing: list[Interval] = []
        # All intervals (bytes or tasks) are considered "covered"
        relevant_intervals = sorted(self.cache.find_overlapping(start, end))
        pos = start

        for interval in relevant_intervals:
            if pos < interval.begin:
                missing.append(Interval(pos, interval.begin, None))
            pos = max(pos, interval.end)

        if pos < end:
            missing.append(Interval(pos, end, None))

        return missing

    async def read(self, start: int, end: int) -> bytes:
        """
        Reads a range of bytes asynchronously, utilizing the cache and
        fetching if necessary.

        Args:
            start: First byte offset to read (inclusive)
            end: Offset one past the last byte to read

        Returns:
            The bytes for [start, end)

        Raises:
            ValueError: If end is past the end of the file
        """
        if end > self.file_size:
            raise ValueError('Read request extends beyond the end of the file.')

        logger.debug(
            'Read Request for bytes %s-%s',
            start,
            end - 1,
        )

        missing_intervals = self._find_missing_ranges(start, end)

        if not missing_intervals:
            logger.debug('CACHE HIT: All requested data already in cache.')
        else:
            logger.debug(
                'CACHE MISS: Missing intervals are: %s',
                [(iv.begin, iv.end) for iv in missing_intervals],
            )
            # Synchronously create all fetch tasks. Each task registers
            # itself in the cache, so it is picked up by the assembly loop
            # below and nothing needs to be collected here.
            for interval in missing_intervals:
                # Check if the interval (or part of it) has been filled
                # by a previous larger fetch in this same read() call.
                still_missing = self._find_missing_ranges(
                    interval.begin,
                    interval.end,
                )
                for gap in still_missing:
                    self._create_fetch_task(gap.begin, gap.end)

        # Assemble result, awaiting any fetch tasks. The interval list is a
        # snapshot taken before the first await; each snapshot entry keeps
        # its own bytes or task even if the cache is re-merged meanwhile.
        pieces: list[bytes] = []
        cached_chunks = sorted(self.cache.find_overlapping(start, end))

        for interval in cached_chunks:
            read_start = max(start, interval.begin)
            read_end = min(end, interval.end)

            # If this is still a task, await it to get bytes
            if isinstance(interval.data, asyncio.Task):
                data = await interval.data
            else:
                data = interval.data

            # Offsets are relative to the start of this interval's data
            slice_start = read_start - interval.begin
            slice_end = read_end - interval.begin

            pieces.append(data[slice_start:slice_end])

        return b''.join(pieces)
@@ -0,0 +1,429 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+
5
+ from typing import Literal, Self
6
+
7
+ try:
8
+ import aiohttp
9
+ except ImportError:
10
+ raise ImportError(
11
+ 'Must install hctef with `[async]` extra to get necessary dependencies',
12
+ ) from None
13
+
14
+ from hctef.exceptions import HctefNetworkError, HctefUrlError
15
+
16
+ from .async_file_read_cache import AsyncFileReadCache
17
+
18
+
19
+ def _check_url(url: str) -> None:
20
+ """
21
+ Validate that URL is a valid HTTP/HTTPS URL.
22
+
23
+ Args:
24
+ url: URL to validate
25
+
26
+ Raises:
27
+ HctefUrlError: If URL doesn't start with http: or https:
28
+ """
29
+ if not url.startswith(('http:', 'https:')):
30
+ raise HctefUrlError("URL must start with 'http:' or 'https:'")
31
+
32
+
33
+ class _OpenedAsyncHttpFile:
34
+ """
35
+ Internal class managing shared state for AsyncHttpFile.
36
+ """
37
+
38
+ def __init__(
39
+ self,
40
+ http_file: AsyncHttpFile,
41
+ session: aiohttp.ClientSession,
42
+ size: int,
43
+ ) -> None:
44
+ """
45
+ Initialize opened HTTP file with pre-fetched async values.
46
+
47
+ Args:
48
+ http_file: The parent AsyncHttpFile instance
49
+ session: Pre-created aiohttp session
50
+ size: File size obtained via async HTTP request
51
+ """
52
+ self.http_file = http_file
53
+ self.session = session
54
+ self.size = size
55
+ self.cache = AsyncFileReadCache(
56
+ self.size,
57
+ self._fetch_range,
58
+ minimum_request_size=min(
59
+ self.http_file._minimum_range_request_bytes,
60
+ self.size,
61
+ ),
62
+ )
63
+
64
+ @classmethod
65
+ async def create(cls, http_file: AsyncHttpFile) -> Self:
66
+ """
67
+ Async factory method to create _OpenedAsyncHttpFile.
68
+
69
+ Args:
70
+ http_file: The parent AsyncHttpFile instance
71
+
72
+ Returns:
73
+ Fully initialized _OpenedAsyncHttpFile instance
74
+ """
75
+ session = aiohttp.ClientSession()
76
+ size = await cls._get_file_size(session, http_file.url)
77
+ return cls(http_file, session, size)
78
+
79
+ @staticmethod
80
+ async def _get_file_size(
81
+ session: aiohttp.ClientSession,
82
+ url: str,
83
+ ) -> int:
84
+ """
85
+ Get total file size using async HTTP range request.
86
+
87
+ Args:
88
+ session: aiohttp session to use for request
89
+ url: URL to fetch size for
90
+
91
+ Returns:
92
+ File size in bytes
93
+
94
+ Raises:
95
+ HctefNetworkError: If size cannot be determined
96
+ """
97
+ try:
98
+ headers = {'Range': 'bytes=0-'}
99
+ async with session.get(url, headers=headers) as response:
100
+ content_range = response.headers.get('Content-Range')
101
+ if content_range:
102
+ return int(content_range.split('/')[-1])
103
+
104
+ # If no Content-Range header, server doesn't support ranges
105
+ raise HctefNetworkError(
106
+ f'Server does not support range requests for {url}',
107
+ )
108
+ except Exception as e:
109
+ raise HctefNetworkError(
110
+ f'Cannot determine file size for {url}',
111
+ ) from e
112
+
113
+ def _fetch_range(self, start: int, end: int) -> asyncio.Task[bytes]:
114
+ """
115
+ Create an async task to fetch byte range using HTTP request.
116
+
117
+ Args:
118
+ start: Start byte position (inclusive)
119
+ end: End byte position (exclusive)
120
+
121
+ Returns:
122
+ Asyncio task that will fetch the requested bytes
123
+ """
124
+ return asyncio.create_task(self._do_fetch_range(start, end))
125
+
126
+ async def _do_fetch_range(self, start: int, end: int) -> bytes:
127
+ """
128
+ Actually fetch byte range using async HTTP request.
129
+
130
+ Args:
131
+ start: Start byte position (inclusive)
132
+ end: End byte position (exclusive)
133
+
134
+ Returns:
135
+ Bytes fetched from the range
136
+
137
+ Raises:
138
+ HctefNetworkError: If range request fails
139
+ """
140
+ if start >= end or start < 0 or end > self.size:
141
+ raise HctefUrlError(
142
+ f'Invalid byte range: {start}-{end} (file size: {self.size})',
143
+ )
144
+
145
+ try:
146
+ headers = {'Range': f'bytes={start}-{end - 1}'}
147
+ async with self.session.get(
148
+ self.http_file.url,
149
+ headers=headers,
150
+ ) as response:
151
+ return await response.read()
152
+ except RuntimeError:
153
+ raise
154
+ except Exception as e:
155
+ raise HctefNetworkError(
156
+ f'Failed to fetch bytes {start}-{end} from {self.http_file.url}',
157
+ ) from e
158
+
159
+ async def read(self, position: int, size: int | None = None, /) -> bytes:
160
+ """
161
+ Read bytes from a specific position without managing cursor state.
162
+
163
+ Args:
164
+ position: Starting byte position to read from
165
+ size: Number of bytes to read (None for all remaining)
166
+
167
+ Returns:
168
+ Bytes read from the file
169
+ """
170
+ if size is None:
171
+ size = self.size - position
172
+
173
+ if size < 0:
174
+ raise ValueError(f'Cannot read negative number of bytes, got: {size}')
175
+
176
+ if size == 0:
177
+ return b''
178
+
179
+ start = position
180
+ end = min(start + size, self.size)
181
+
182
+ return await self.cache.read(start, end)
183
+
184
+ async def close(self) -> None:
185
+ """
186
+ Close the file and session.
187
+ """
188
+ await self.session.close()
189
+
190
+
191
+ class AsyncHttpFileCursor:
192
+ """
193
+ Lightweight cursor for reading from AsyncHttpFile with independent position.
194
+ """
195
+
196
+ def __init__(self, opened_file: _OpenedAsyncHttpFile) -> None:
197
+ """
198
+ Create a cursor for reading from an opened HTTP file.
199
+
200
+ Args:
201
+ opened_file: The shared opened file state
202
+ """
203
+ self.ohf = opened_file
204
+ self.position = 0
205
+
206
+ @property
207
+ def size(self) -> int:
208
+ return self.ohf.size
209
+
210
+ async def read(self, size: int | None = None, /) -> bytes:
211
+ """
212
+ Read bytes from current position asynchronously.
213
+
214
+ Args:
215
+ size: Number of bytes to read (None for all remaining)
216
+
217
+ Returns:
218
+ Bytes read from the file
219
+ """
220
+ data = await self.ohf.read(self.position, size)
221
+ self.position += len(data)
222
+ return data
223
+
224
+ def seek(self, offset: int, whence: int = 0, /) -> int:
225
+ """
226
+ Change stream position (synchronous - no I/O).
227
+
228
+ Args:
229
+ offset: Byte offset
230
+ whence: How to interpret offset (0=absolute, 1=relative, 2=from end)
231
+
232
+ Returns:
233
+ New absolute position
234
+ """
235
+ if whence == 0: # Absolute position
236
+ new_pos = offset
237
+ elif whence == 1: # Relative to current position
238
+ new_pos = self.position + offset
239
+ elif whence == 2: # Relative to end
240
+ new_pos = self.size + offset
241
+ else:
242
+ raise ValueError(f'Invalid whence value: {whence}')
243
+
244
+ if new_pos < 0:
245
+ new_pos = 0
246
+ elif new_pos > self.size:
247
+ new_pos = self.size
248
+
249
+ self.position = new_pos
250
+ return self.position
251
+
252
+ def tell(self) -> int:
253
+ """
254
+ Get current stream position (synchronous - no I/O).
255
+
256
+ Returns:
257
+ Current byte position in file
258
+ """
259
+ return self.position
260
+
261
+ def clone(self) -> AsyncHttpFileCursor:
262
+ """
263
+ Create a new sibling cursor with independent position.
264
+
265
+ Returns:
266
+ New cursor sharing cache and session but with independent position
267
+
268
+ Raises:
269
+ ValueError: If file is not opened
270
+ """
271
+ return AsyncHttpFileCursor(self.ohf)
272
+
273
+ def readable(self) -> bool:
274
+ return True
275
+
276
+ def writable(self) -> bool:
277
+ return False
278
+
279
+ def seekable(self) -> bool:
280
+ return True
281
+
282
+
283
+ class AsyncHttpFile:
284
+ """
285
+ Async file-like wrapper for HTTP URLs with concurrent read support.
286
+ """
287
+
288
+ def __init__(
289
+ self,
290
+ url: str,
291
+ minimum_range_request_bytes: int = 8192,
292
+ prefetch_bytes: int = 2**20,
293
+ prefetch_direction: Literal['START', 'END'] = 'END',
294
+ ) -> None:
295
+ """
296
+ Initialize async HTTP file wrapper.
297
+
298
+ Args:
299
+ url: HTTP/HTTPS URL for a file
300
+
301
+ Keyword Args:
302
+ minimum_range_request_bytes:
303
+ Least number of bytes to request,
304
+ except when filling cache gaps
305
+ prefetch_bytes:
306
+ How many bytes to request when opening the file.
307
+ Set to 0 or less to disable prefetch. Default 1 MiB.
308
+ prefetch_direction:
309
+ Whether to prefetch from file start or file end.
310
+ Possible values `START` or `END`.
311
+
312
+ Raises:
313
+ HctefUrlError: If URL is invalid
314
+ """
315
+ _check_url(url)
316
+ self.url = url
317
+ self._prefetch_bytes = prefetch_bytes
318
+ self._prefetch_direction = prefetch_direction
319
+ self._minimum_range_request_bytes = minimum_range_request_bytes
320
+ self._cursor: AsyncHttpFileCursor | None = None
321
+
322
+ @property
323
+ def cursor(self) -> AsyncHttpFileCursor:
324
+ if not self._cursor:
325
+ raise ValueError('I/O operation on closed file')
326
+ return self._cursor
327
+
328
+ @property
329
+ def size(self) -> int:
330
+ return self.cursor.size
331
+
332
+ async def open(self) -> Self:
333
+ """
334
+ Open the file asynchronously.
335
+
336
+ Returns:
337
+ Self for use in context manager
338
+ """
339
+ self._cursor = AsyncHttpFileCursor(await _OpenedAsyncHttpFile.create(self))
340
+
341
+ prefetch_bytes = min(self._prefetch_bytes, self.size)
342
+ if prefetch_bytes > 0 and self._prefetch_direction == 'START':
343
+ await self.read(prefetch_bytes)
344
+ elif prefetch_bytes > 0 and self._prefetch_direction == 'END':
345
+ self.cursor.seek(prefetch_bytes, 2)
346
+ await self.read(prefetch_bytes)
347
+
348
+ self.cursor.seek(0)
349
+
350
+ return self
351
+
352
+ def clone(self) -> AsyncHttpFileCursor:
353
+ """
354
+ Create a new cursor for concurrent reads.
355
+
356
+ Returns:
357
+ New cursor sharing cache but with independent position
358
+
359
+ Raises:
360
+ ValueError: If file is not opened
361
+ """
362
+ return self.cursor.clone()
363
+
364
+ async def close(self) -> None:
365
+ """
366
+ Close the file and release resources.
367
+ """
368
+ if self._cursor:
369
+ await self._cursor.ohf.close()
370
+ self._cursor = None
371
+
372
+ async def __aenter__(self) -> Self:
373
+ """Async context manager entry."""
374
+ return await self.open()
375
+
376
+ async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
377
+ """Async context manager exit."""
378
+ await self.close()
379
+
380
+ def __repr__(self) -> str:
381
+ if self._cursor:
382
+ return (
383
+ f'AsyncHttpFile(url={self.url!r}, opened=True, '
384
+ f'size={self.size}, pos={self._cursor.position})'
385
+ )
386
+ return f'AsyncHttpFile(url={self.url!r}, opened=False)'
387
+
388
+ async def read(self, size: int | None = None, /) -> bytes:
389
+ """
390
+ Read bytes from current position asynchronously.
391
+
392
+ Args:
393
+ size: Number of bytes to read (None for all remaining)
394
+
395
+ Returns:
396
+ Bytes read from the file
397
+ """
398
+ return await self.cursor.read(size)
399
+
400
+ def seek(self, offset: int, whence: int = 0, /) -> int:
401
+ """
402
+ Change stream position (synchronous - no I/O).
403
+
404
+ Args:
405
+ offset: Byte offset
406
+ whence: How to interpret offset (0=absolute, 1=relative, 2=from end)
407
+
408
+ Returns:
409
+ New absolute position
410
+ """
411
+ return self.cursor.seek(offset, whence)
412
+
413
+ def tell(self) -> int:
414
+ """
415
+ Get current stream position (synchronous - no I/O).
416
+
417
+ Returns:
418
+ Current byte position in file
419
+ """
420
+ return self.cursor.tell()
421
+
422
+ def readable(self) -> bool:
423
+ return True
424
+
425
+ def writable(self) -> bool:
426
+ return False
427
+
428
+ def seekable(self) -> bool:
429
+ return True