hctef 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hctef-0.1.0/.github/workflows/ci.yml +44 -0
- hctef-0.1.0/.github/workflows/release.yml +47 -0
- hctef-0.1.0/.gitignore +13 -0
- hctef-0.1.0/.pre-commit-config.yaml +51 -0
- hctef-0.1.0/PKG-INFO +236 -0
- hctef-0.1.0/README.md +224 -0
- hctef-0.1.0/pyproject.toml +99 -0
- hctef-0.1.0/src/hctef/__init__.py +13 -0
- hctef-0.1.0/src/hctef/__version__.py +34 -0
- hctef-0.1.0/src/hctef/aio/__init__.py +5 -0
- hctef-0.1.0/src/hctef/aio/async_file_read_cache.py +223 -0
- hctef-0.1.0/src/hctef/aio/async_http_file.py +429 -0
- hctef-0.1.0/src/hctef/exceptions.py +10 -0
- hctef-0.1.0/src/hctef/file_read_cache.py +155 -0
- hctef-0.1.0/src/hctef/http_file.py +279 -0
- hctef-0.1.0/src/hctef/interval_tree.py +83 -0
- hctef-0.1.0/src/hctef/py.typed +0 -0
- hctef-0.1.0/tests/__init__.py +0 -0
- hctef-0.1.0/tests/conftest.py +10 -0
- hctef-0.1.0/tests/test_async_http_file.py +229 -0
- hctef-0.1.0/tests/test_file_read_cache.py +65 -0
- hctef-0.1.0/tests/test_http_file.py +61 -0
- hctef-0.1.0/uv.lock +962 -0
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
name: "Continuous integration"
|
|
2
|
+
|
|
3
|
+
concurrency:
|
|
4
|
+
group: ${{ github.ref }}
|
|
5
|
+
cancel-in-progress: false
|
|
6
|
+
|
|
7
|
+
on:
|
|
8
|
+
push:
|
|
9
|
+
branches:
|
|
10
|
+
- main
|
|
11
|
+
pull_request:
|
|
12
|
+
|
|
13
|
+
jobs:
|
|
14
|
+
ci:
|
|
15
|
+
name: Continuous integration
|
|
16
|
+
runs-on: ubuntu-latest
|
|
17
|
+
strategy:
|
|
18
|
+
matrix:
|
|
19
|
+
python-version:
|
|
20
|
+
- "3.12"
|
|
21
|
+
- "3.13"
|
|
22
|
+
- "3.14"
|
|
23
|
+
steps:
|
|
24
|
+
- uses: actions/checkout@v4
|
|
25
|
+
- uses: astral-sh/setup-uv@v6
|
|
26
|
+
with:
|
|
27
|
+
python-version: ${{ matrix.python-version }}
|
|
28
|
+
- name: Sync
|
|
29
|
+
run: |
|
|
30
|
+
uv sync \
|
|
31
|
+
--locked \
|
|
32
|
+
--all-extras \
|
|
33
|
+
--no-editable
|
|
34
|
+
- name: Pre-Commit Hooks
|
|
35
|
+
run: uv run pre-commit run --all-files
|
|
36
|
+
- name: Test
|
|
37
|
+
run: uv run pytest
|
|
38
|
+
- name: "Upload coverage to Codecov"
|
|
39
|
+
uses: codecov/codecov-action@v4
|
|
40
|
+
env:
|
|
41
|
+
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
|
|
42
|
+
with:
|
|
43
|
+
fail_ci_if_error: false
|
|
44
|
+
verbose: true
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
name: Build and release
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches:
|
|
6
|
+
- main
|
|
7
|
+
pull_request:
|
|
8
|
+
branches:
|
|
9
|
+
- main
|
|
10
|
+
release:
|
|
11
|
+
types:
|
|
12
|
+
- published
|
|
13
|
+
|
|
14
|
+
jobs:
|
|
15
|
+
build-package:
|
|
16
|
+
runs-on: ubuntu-latest
|
|
17
|
+
steps:
|
|
18
|
+
- uses: actions/checkout@v4
|
|
19
|
+
- uses: astral-sh/setup-uv@v6
|
|
20
|
+
- name: Build
|
|
21
|
+
run: uv build
|
|
22
|
+
- name: Upload Artifact
|
|
23
|
+
uses: actions/upload-artifact@v4
|
|
24
|
+
if: startsWith(github.ref, 'refs/tags')
|
|
25
|
+
with:
|
|
26
|
+
name: dist-{github.ref}
|
|
27
|
+
path: dist/
|
|
28
|
+
overwrite: true
|
|
29
|
+
if-no-files-found: error
|
|
30
|
+
|
|
31
|
+
release-package:
|
|
32
|
+
if: startsWith(github.ref, 'refs/tags')
|
|
33
|
+
needs: build-package
|
|
34
|
+
runs-on: ubuntu-latest
|
|
35
|
+
environment:
|
|
36
|
+
name: pypi
|
|
37
|
+
url: https://pypi.org/p/hctef
|
|
38
|
+
permissions:
|
|
39
|
+
id-token: write
|
|
40
|
+
steps:
|
|
41
|
+
- name: Download a single artifact
|
|
42
|
+
uses: actions/download-artifact@v5
|
|
43
|
+
with:
|
|
44
|
+
name: dist-{github.ref}
|
|
45
|
+
path: dist/
|
|
46
|
+
- name: Upload release
|
|
47
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
hctef-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
exclude: "^tests/fixtures/"
|
|
2
|
+
repos:
|
|
3
|
+
- repo: local
|
|
4
|
+
hooks:
|
|
5
|
+
- id: ruff_check
|
|
6
|
+
name: ruff check
|
|
7
|
+
entry: ruff check --force-exclude
|
|
8
|
+
language: python
|
|
9
|
+
'types_or': [python, pyi]
|
|
10
|
+
args: [--fix, --exit-non-zero-on-fix]
|
|
11
|
+
require_serial: true
|
|
12
|
+
- id: ruff_format
|
|
13
|
+
name: ruff format
|
|
14
|
+
entry: ruff format --force-exclude
|
|
15
|
+
language: python
|
|
16
|
+
'types_or': [python, pyi]
|
|
17
|
+
args: []
|
|
18
|
+
require_serial: true
|
|
19
|
+
- id: check-added-large-files
|
|
20
|
+
name: Check for added large files
|
|
21
|
+
entry: check-added-large-files
|
|
22
|
+
language: system
|
|
23
|
+
- id: check-toml
|
|
24
|
+
name: Check Toml
|
|
25
|
+
entry: check-toml
|
|
26
|
+
language: system
|
|
27
|
+
types: [toml]
|
|
28
|
+
- id: check-yaml
|
|
29
|
+
name: Check Yaml
|
|
30
|
+
entry: check-yaml
|
|
31
|
+
language: system
|
|
32
|
+
types: [yaml]
|
|
33
|
+
- id: end-of-file-fixer
|
|
34
|
+
name: Fix End of Files
|
|
35
|
+
entry: end-of-file-fixer
|
|
36
|
+
language: system
|
|
37
|
+
types: [text]
|
|
38
|
+
stages: [pre-commit, pre-push, manual]
|
|
39
|
+
- id: trailing-whitespace
|
|
40
|
+
name: Trim Trailing Whitespace
|
|
41
|
+
entry: trailing-whitespace-fixer
|
|
42
|
+
language: system
|
|
43
|
+
types: [text]
|
|
44
|
+
stages: [pre-commit, pre-push, manual]
|
|
45
|
+
- id: mypy
|
|
46
|
+
name: mypy
|
|
47
|
+
entry: mypy
|
|
48
|
+
language: python
|
|
49
|
+
'types_or': [python, pyi]
|
|
50
|
+
args: []
|
|
51
|
+
require_serial: true
|
hctef-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hctef
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Helper classes to read files over HTTP using Range requests, with caching
|
|
5
|
+
Project-URL: Repository, https://github.com/jkeifer/hctef
|
|
6
|
+
Author-email: Jarrett Keifer <jkeifer0@gmail.com>
|
|
7
|
+
License: Apache License 2.0
|
|
8
|
+
Requires-Python: >=3.12
|
|
9
|
+
Provides-Extra: async
|
|
10
|
+
Requires-Dist: aiohttp>=3.13.0; extra == 'async'
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
|
|
13
|
+
[![Continuous integration](https://github.com/jkeifer/hctef/actions/workflows/ci.yml/badge.svg)](https://github.com/jkeifer/hctef/actions/workflows/ci.yml)
|
|
14
|
+
[![PyPI version](https://badge.fury.io/py/hctef.svg)](https://badge.fury.io/py/hctef)
|
|
15
|
+
|
|
16
|
+
# hctef
|
|
17
|
+
|
|
18
|
+
Python library with helper classes to read files over HTTP using Range
|
|
19
|
+
requests, with caching.
|
|
20
|
+
|
|
21
|
+
## Overview
|
|
22
|
+
|
|
23
|
+
`hctef` provides a file-like interface for reading files over HTTP/HTTPS, using
|
|
24
|
+
HTTP Range requests to fetch only the data you need. It includes intelligent
|
|
25
|
+
caching to minimize network requests and supports both synchronous and
|
|
26
|
+
asynchronous operations.
|
|
27
|
+
|
|
28
|
+
## Features
|
|
29
|
+
|
|
30
|
+
- **File-like API**: Works like a regular Python file object with `read()`,
|
|
31
|
+
`seek()`, and `tell()` methods
|
|
32
|
+
- **Efficient Range Requests**: Fetches only the data you need using HTTP Range
|
|
33
|
+
headers
|
|
34
|
+
- **Intelligent Caching**: Uses an interval tree to track cached byte ranges
|
|
35
|
+
and minimize redundant requests
|
|
36
|
+
- **Prefetching**: Optionally prefetch data from the start or end of the file
|
|
37
|
+
- **Sync and Async**: Both synchronous and asynchronous implementations
|
|
38
|
+
available
|
|
39
|
+
- **Context Manager Support**: Use with `with` statements for automatic cleanup
|
|
40
|
+
|
|
41
|
+
## Installation
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
pip install hctef
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
To include async support:
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
pip install hctef[async]
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Quick Start
|
|
54
|
+
|
|
55
|
+
### Synchronous Usage
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
from hctef import HttpFile
|
|
59
|
+
|
|
60
|
+
url = "https://example.com/large-file.bin"
|
|
61
|
+
|
|
62
|
+
with HttpFile(url) as f:
|
|
63
|
+
# Read first 100 bytes
|
|
64
|
+
data = f.read(100)
|
|
65
|
+
|
|
66
|
+
# Seek to a specific position
|
|
67
|
+
f.seek(1000)
|
|
68
|
+
|
|
69
|
+
# Read from current position
|
|
70
|
+
more_data = f.read(50)
|
|
71
|
+
|
|
72
|
+
# Get current position
|
|
73
|
+
position = f.tell()
|
|
74
|
+
|
|
75
|
+
# Seek relative to end of file
|
|
76
|
+
f.seek(-100, 2)
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### Asynchronous Usage
|
|
80
|
+
|
|
81
|
+
The async implementation supports independent cursors for concurrent reads:
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
import asyncio
|
|
85
|
+
from hctef.aio import AsyncHttpFile
|
|
86
|
+
|
|
87
|
+
url = "https://example.com/large-file.bin"
|
|
88
|
+
|
|
89
|
+
async with AsyncHttpFile(url) as f:
|
|
90
|
+
# Read first 100 bytes
|
|
91
|
+
data = await f.read(100)
|
|
92
|
+
|
|
93
|
+
# Seek to a specific position (synchronous - no I/O)
|
|
94
|
+
f.seek(1000)
|
|
95
|
+
|
|
96
|
+
# Read from current position
|
|
97
|
+
more_data = await f.read(50)
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
#### Parallel Reads with Multiple Cursors
|
|
101
|
+
|
|
102
|
+
Create independent cursors to read from different positions concurrently:
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
import asyncio
|
|
106
|
+
from hctef.aio import AsyncHttpFile
|
|
107
|
+
|
|
108
|
+
url = "https://example.com/large-file.bin"
|
|
109
|
+
|
|
110
|
+
async with AsyncHttpFile(url) as f:
|
|
111
|
+
# Create independent cursors for parallel reading
|
|
112
|
+
cursor1 = f.clone()
|
|
113
|
+
cursor2 = f.clone()
|
|
114
|
+
|
|
115
|
+
# Position each cursor at different locations
|
|
116
|
+
f.seek(0)
|
|
117
|
+
cursor1.seek(1000)
|
|
118
|
+
cursor2.seek(2000)
|
|
119
|
+
|
|
120
|
+
# Read from all three positions in parallel
|
|
121
|
+
# All cursors share the same cache and HTTP session
|
|
122
|
+
results = await asyncio.gather(
|
|
123
|
+
f.read(100), # Read bytes 0-100
|
|
124
|
+
cursor1.read(100), # Read bytes 1000-1100
|
|
125
|
+
cursor2.read(100), # Read bytes 2000-2100
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
# Each cursor maintains independent position
|
|
129
|
+
print(f.tell()) # 100
|
|
130
|
+
print(cursor1.tell()) # 1100
|
|
131
|
+
print(cursor2.tell()) # 2100
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
Cursors are lightweight and share:
|
|
135
|
+
|
|
136
|
+
- HTTP session (connection pooling)
|
|
137
|
+
- Byte range cache (deduplication of overlapping requests)
|
|
138
|
+
- File metadata
|
|
139
|
+
|
|
140
|
+
## Configuration Options
|
|
141
|
+
|
|
142
|
+
Both `HttpFile` and `AsyncHttpFile` accept the following parameters:
|
|
143
|
+
|
|
144
|
+
```python
|
|
145
|
+
HttpFile(
|
|
146
|
+
url,
|
|
147
|
+
minimum_range_request_bytes=8192, # Minimum bytes per request (default: 8KB)
|
|
148
|
+
prefetch_bytes=1048576, # Bytes to prefetch on open (default: 1MB)
|
|
149
|
+
prefetch_direction='END' # 'START' or 'END' (default: 'END')
|
|
150
|
+
)
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
- **`minimum_range_request_bytes`**: The minimum number of bytes to request in
|
|
154
|
+
a single HTTP Range request (except when filling small cache gaps)
|
|
155
|
+
- **`prefetch_bytes`**: How many bytes to fetch immediately when opening the
|
|
156
|
+
file. Set to 0 to disable prefetching
|
|
157
|
+
- **`prefetch_direction`**: Whether to prefetch from the start (`'START'`) or
|
|
158
|
+
end (`'END'`) of the file
|
|
159
|
+
|
|
160
|
+
## Requirements
|
|
161
|
+
|
|
162
|
+
- Python 3.12 or higher
|
|
163
|
+
- HTTP server must support Range requests
|
|
164
|
+
- For async: `aiohttp>=3.13.0`
|
|
165
|
+
|
|
166
|
+
## How It Works
|
|
167
|
+
|
|
168
|
+
When you open an HTTP file, `hctef`:
|
|
169
|
+
|
|
170
|
+
1. Sends an initial Range request to determine the file size and verify Range
|
|
171
|
+
support
|
|
172
|
+
1. Optionally prefetches data from the start or end of the file
|
|
173
|
+
1. Maintains an in-memory cache of fetched byte ranges (not suitable for
|
|
174
|
+
downloading complete large files)
|
|
175
|
+
1. On `read()`, checks the cache first and only fetches missing data from the
|
|
176
|
+
server
|
|
177
|
+
1. Combines multiple small requests into larger ones based on
|
|
178
|
+
`minimum_range_request_bytes`
|
|
179
|
+
|
|
180
|
+
This approach minimizes HTTP requests while providing efficient random access
|
|
181
|
+
to remote files.
|
|
182
|
+
|
|
183
|
+
## Error Handling
|
|
184
|
+
|
|
185
|
+
`hctef` defines custom exceptions:
|
|
186
|
+
|
|
187
|
+
- `HctefError`: Base exception class
|
|
188
|
+
- `HctefNetworkError`: Raised for network-related errors (inherits from
|
|
189
|
+
`IOError`)
|
|
190
|
+
- `HctefUrlError`: Raised for invalid URLs (inherits from `ValueError`)
|
|
191
|
+
|
|
192
|
+
```python
|
|
193
|
+
from hctef import HttpFile
|
|
194
|
+
from hctef.exceptions import HctefNetworkError, HctefUrlError
|
|
195
|
+
|
|
196
|
+
try:
|
|
197
|
+
with HttpFile("https://example.com/file.bin") as f:
|
|
198
|
+
data = f.read(100)
|
|
199
|
+
except HctefNetworkError as e:
|
|
200
|
+
print(f"Network error: {e}")
|
|
201
|
+
except HctefUrlError as e:
|
|
202
|
+
print(f"Invalid URL: {e}")
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
## Development
|
|
206
|
+
|
|
207
|
+
To set up for development:
|
|
208
|
+
|
|
209
|
+
```bash
|
|
210
|
+
# Clone the repository
|
|
211
|
+
git clone https://github.com/jkeifer/hctef
|
|
212
|
+
cd hctef
|
|
213
|
+
|
|
214
|
+
# Install dependencies
|
|
215
|
+
uv sync --all-extras --dev
|
|
216
|
+
|
|
217
|
+
# Setup pre-commit
|
|
218
|
+
pre-commit install
|
|
219
|
+
|
|
220
|
+
# Run tests
|
|
221
|
+
pytest
|
|
222
|
+
|
|
223
|
+
# Run all checks with pre-commit
|
|
224
|
+
pre-commit run --all-files
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
## Future Ideas
|
|
228
|
+
|
|
229
|
+
- Consolidate sync/async implementations
|
|
230
|
+
- Allow uncached "cursor" for reading a large file segment
|
|
231
|
+
- Cursors with separate caches (to allow clearing memory when done)
|
|
232
|
+
- would allow cursor-based access with non-async implementation
|
|
233
|
+
|
|
234
|
+
## License
|
|
235
|
+
|
|
236
|
+
Apache License 2.0
|
hctef-0.1.0/README.md
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
[![Continuous integration](https://github.com/jkeifer/hctef/actions/workflows/ci.yml/badge.svg)](https://github.com/jkeifer/hctef/actions/workflows/ci.yml)
|
|
2
|
+
[![PyPI version](https://badge.fury.io/py/hctef.svg)](https://badge.fury.io/py/hctef)
|
|
3
|
+
|
|
4
|
+
# hctef
|
|
5
|
+
|
|
6
|
+
Python library with helper classes to read files over HTTP using Range
|
|
7
|
+
requests, with caching.
|
|
8
|
+
|
|
9
|
+
## Overview
|
|
10
|
+
|
|
11
|
+
`hctef` provides a file-like interface for reading files over HTTP/HTTPS, using
|
|
12
|
+
HTTP Range requests to fetch only the data you need. It includes intelligent
|
|
13
|
+
caching to minimize network requests and supports both synchronous and
|
|
14
|
+
asynchronous operations.
|
|
15
|
+
|
|
16
|
+
## Features
|
|
17
|
+
|
|
18
|
+
- **File-like API**: Works like a regular Python file object with `read()`,
|
|
19
|
+
`seek()`, and `tell()` methods
|
|
20
|
+
- **Efficient Range Requests**: Fetches only the data you need using HTTP Range
|
|
21
|
+
headers
|
|
22
|
+
- **Intelligent Caching**: Uses an interval tree to track cached byte ranges
|
|
23
|
+
and minimize redundant requests
|
|
24
|
+
- **Prefetching**: Optionally prefetch data from the start or end of the file
|
|
25
|
+
- **Sync and Async**: Both synchronous and asynchronous implementations
|
|
26
|
+
available
|
|
27
|
+
- **Context Manager Support**: Use with `with` statements for automatic cleanup
|
|
28
|
+
|
|
29
|
+
## Installation
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pip install hctef
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
To include async support:
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install hctef[async]
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Quick Start
|
|
42
|
+
|
|
43
|
+
### Synchronous Usage
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
from hctef import HttpFile
|
|
47
|
+
|
|
48
|
+
url = "https://example.com/large-file.bin"
|
|
49
|
+
|
|
50
|
+
with HttpFile(url) as f:
|
|
51
|
+
# Read first 100 bytes
|
|
52
|
+
data = f.read(100)
|
|
53
|
+
|
|
54
|
+
# Seek to a specific position
|
|
55
|
+
f.seek(1000)
|
|
56
|
+
|
|
57
|
+
# Read from current position
|
|
58
|
+
more_data = f.read(50)
|
|
59
|
+
|
|
60
|
+
# Get current position
|
|
61
|
+
position = f.tell()
|
|
62
|
+
|
|
63
|
+
# Seek relative to end of file
|
|
64
|
+
f.seek(-100, 2)
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### Asynchronous Usage
|
|
68
|
+
|
|
69
|
+
The async implementation supports independent cursors for concurrent reads:
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
import asyncio
|
|
73
|
+
from hctef.aio import AsyncHttpFile
|
|
74
|
+
|
|
75
|
+
url = "https://example.com/large-file.bin"
|
|
76
|
+
|
|
77
|
+
async with AsyncHttpFile(url) as f:
|
|
78
|
+
# Read first 100 bytes
|
|
79
|
+
data = await f.read(100)
|
|
80
|
+
|
|
81
|
+
# Seek to a specific position (synchronous - no I/O)
|
|
82
|
+
f.seek(1000)
|
|
83
|
+
|
|
84
|
+
# Read from current position
|
|
85
|
+
more_data = await f.read(50)
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
#### Parallel Reads with Multiple Cursors
|
|
89
|
+
|
|
90
|
+
Create independent cursors to read from different positions concurrently:
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
import asyncio
|
|
94
|
+
from hctef.aio import AsyncHttpFile
|
|
95
|
+
|
|
96
|
+
url = "https://example.com/large-file.bin"
|
|
97
|
+
|
|
98
|
+
async with AsyncHttpFile(url) as f:
|
|
99
|
+
# Create independent cursors for parallel reading
|
|
100
|
+
cursor1 = f.clone()
|
|
101
|
+
cursor2 = f.clone()
|
|
102
|
+
|
|
103
|
+
# Position each cursor at different locations
|
|
104
|
+
f.seek(0)
|
|
105
|
+
cursor1.seek(1000)
|
|
106
|
+
cursor2.seek(2000)
|
|
107
|
+
|
|
108
|
+
# Read from all three positions in parallel
|
|
109
|
+
# All cursors share the same cache and HTTP session
|
|
110
|
+
results = await asyncio.gather(
|
|
111
|
+
f.read(100), # Read bytes 0-100
|
|
112
|
+
cursor1.read(100), # Read bytes 1000-1100
|
|
113
|
+
cursor2.read(100), # Read bytes 2000-2100
|
|
114
|
+
)
|
|
115
|
+
|
|
116
|
+
# Each cursor maintains independent position
|
|
117
|
+
print(f.tell()) # 100
|
|
118
|
+
print(cursor1.tell()) # 1100
|
|
119
|
+
print(cursor2.tell()) # 2100
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
Cursors are lightweight and share:
|
|
123
|
+
|
|
124
|
+
- HTTP session (connection pooling)
|
|
125
|
+
- Byte range cache (deduplication of overlapping requests)
|
|
126
|
+
- File metadata
|
|
127
|
+
|
|
128
|
+
## Configuration Options
|
|
129
|
+
|
|
130
|
+
Both `HttpFile` and `AsyncHttpFile` accept the following parameters:
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
HttpFile(
|
|
134
|
+
url,
|
|
135
|
+
minimum_range_request_bytes=8192, # Minimum bytes per request (default: 8KB)
|
|
136
|
+
prefetch_bytes=1048576, # Bytes to prefetch on open (default: 1MB)
|
|
137
|
+
prefetch_direction='END' # 'START' or 'END' (default: 'END')
|
|
138
|
+
)
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
- **`minimum_range_request_bytes`**: The minimum number of bytes to request in
|
|
142
|
+
a single HTTP Range request (except when filling small cache gaps)
|
|
143
|
+
- **`prefetch_bytes`**: How many bytes to fetch immediately when opening the
|
|
144
|
+
file. Set to 0 to disable prefetching
|
|
145
|
+
- **`prefetch_direction`**: Whether to prefetch from the start (`'START'`) or
|
|
146
|
+
end (`'END'`) of the file
|
|
147
|
+
|
|
148
|
+
## Requirements
|
|
149
|
+
|
|
150
|
+
- Python 3.12 or higher
|
|
151
|
+
- HTTP server must support Range requests
|
|
152
|
+
- For async: `aiohttp>=3.13.0`
|
|
153
|
+
|
|
154
|
+
## How It Works
|
|
155
|
+
|
|
156
|
+
When you open an HTTP file, `hctef`:
|
|
157
|
+
|
|
158
|
+
1. Sends an initial Range request to determine the file size and verify Range
|
|
159
|
+
support
|
|
160
|
+
1. Optionally prefetches data from the start or end of the file
|
|
161
|
+
1. Maintains an in-memory cache of fetched byte ranges (not suitable for
|
|
162
|
+
downloading complete large files)
|
|
163
|
+
1. On `read()`, checks the cache first and only fetches missing data from the
|
|
164
|
+
server
|
|
165
|
+
1. Combines multiple small requests into larger ones based on
|
|
166
|
+
`minimum_range_request_bytes`
|
|
167
|
+
|
|
168
|
+
This approach minimizes HTTP requests while providing efficient random access
|
|
169
|
+
to remote files.
|
|
170
|
+
|
|
171
|
+
## Error Handling
|
|
172
|
+
|
|
173
|
+
`hctef` defines custom exceptions:
|
|
174
|
+
|
|
175
|
+
- `HctefError`: Base exception class
|
|
176
|
+
- `HctefNetworkError`: Raised for network-related errors (inherits from
|
|
177
|
+
`IOError`)
|
|
178
|
+
- `HctefUrlError`: Raised for invalid URLs (inherits from `ValueError`)
|
|
179
|
+
|
|
180
|
+
```python
|
|
181
|
+
from hctef import HttpFile
|
|
182
|
+
from hctef.exceptions import HctefNetworkError, HctefUrlError
|
|
183
|
+
|
|
184
|
+
try:
|
|
185
|
+
with HttpFile("https://example.com/file.bin") as f:
|
|
186
|
+
data = f.read(100)
|
|
187
|
+
except HctefNetworkError as e:
|
|
188
|
+
print(f"Network error: {e}")
|
|
189
|
+
except HctefUrlError as e:
|
|
190
|
+
print(f"Invalid URL: {e}")
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
## Development
|
|
194
|
+
|
|
195
|
+
To set up for development:
|
|
196
|
+
|
|
197
|
+
```bash
|
|
198
|
+
# Clone the repository
|
|
199
|
+
git clone https://github.com/jkeifer/hctef
|
|
200
|
+
cd hctef
|
|
201
|
+
|
|
202
|
+
# Install dependencies
|
|
203
|
+
uv sync --all-extras --dev
|
|
204
|
+
|
|
205
|
+
# Setup pre-commit
|
|
206
|
+
pre-commit install
|
|
207
|
+
|
|
208
|
+
# Run tests
|
|
209
|
+
pytest
|
|
210
|
+
|
|
211
|
+
# Run all checks with pre-commit
|
|
212
|
+
pre-commit run --all-files
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
## Future Ideas
|
|
216
|
+
|
|
217
|
+
- Consolidate sync/async implementations
|
|
218
|
+
- Allow uncached "cursor" for reading a large file segment
|
|
219
|
+
- Cursors with separate caches (to allow clearing memory when done)
|
|
220
|
+
- would allow cursor-based access with non-async implementation
|
|
221
|
+
|
|
222
|
+
## License
|
|
223
|
+
|
|
224
|
+
Apache License 2.0
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling", "hatch-vcs"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "hctef"
|
|
7
|
+
description = "Helper classes to read files over HTTP using Range requests, with caching"
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
authors = [
|
|
10
|
+
{ name = "Jarrett Keifer", email = "jkeifer0@gmail.com" }
|
|
11
|
+
]
|
|
12
|
+
requires-python = ">=3.12"
|
|
13
|
+
license = {text = "Apache License 2.0"}
|
|
14
|
+
dependencies = []
|
|
15
|
+
dynamic = ["version"]
|
|
16
|
+
|
|
17
|
+
[project.urls]
|
|
18
|
+
Repository = 'https://github.com/jkeifer/hctef'
|
|
19
|
+
|
|
20
|
+
[project.optional-dependencies]
|
|
21
|
+
async = [
|
|
22
|
+
"aiohttp>=3.13.0",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
[dependency-groups]
|
|
26
|
+
dev = [
|
|
27
|
+
"mypy>=1.15.0",
|
|
28
|
+
"pre-commit>=4.2.0",
|
|
29
|
+
"pre-commit-hooks>=5.0.0",
|
|
30
|
+
"pytest>=8.3.5",
|
|
31
|
+
"pytest-asyncio>=1.2.0",
|
|
32
|
+
"pytest-cov>=6.1.1",
|
|
33
|
+
"ruff>=0.11.6",
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
[tool.hatch.version]
|
|
37
|
+
source = "vcs"
|
|
38
|
+
|
|
39
|
+
[tool.hatch.build.hooks.vcs]
|
|
40
|
+
version-file = "src/hctef/__version__.py"
|
|
41
|
+
|
|
42
|
+
[tool.hatch.build.targets.wheel]
|
|
43
|
+
packages = ["src/hctef"]
|
|
44
|
+
|
|
45
|
+
[tool.ruff.format]
|
|
46
|
+
quote-style = 'single'
|
|
47
|
+
|
|
48
|
+
[tool.ruff.lint]
|
|
49
|
+
exclude = [
|
|
50
|
+
'tests/fixtures/',
|
|
51
|
+
]
|
|
52
|
+
# https://docs.astral.sh/ruff/rules/
|
|
53
|
+
select = [
|
|
54
|
+
'B',
|
|
55
|
+
'BLE',
|
|
56
|
+
'C4',
|
|
57
|
+
'C90',
|
|
58
|
+
'COM',
|
|
59
|
+
'DTZ',
|
|
60
|
+
'E',
|
|
61
|
+
'ERA',
|
|
62
|
+
'F',
|
|
63
|
+
'FA',
|
|
64
|
+
'G',
|
|
65
|
+
'I',
|
|
66
|
+
'INP',
|
|
67
|
+
'N',
|
|
68
|
+
'NPY',
|
|
69
|
+
'PT',
|
|
70
|
+
'PTH',
|
|
71
|
+
'RET',
|
|
72
|
+
'RUF',
|
|
73
|
+
'S',
|
|
74
|
+
'SIM',
|
|
75
|
+
'T20',
|
|
76
|
+
'UP',
|
|
77
|
+
'W',
|
|
78
|
+
|
|
79
|
+
]
|
|
80
|
+
|
|
81
|
+
[tool.ruff.lint.per-file-ignores]
|
|
82
|
+
'__init__.py' = ['E402']
|
|
83
|
+
'tests/**/*' = ['T201', 'S101', 'S603']
|
|
84
|
+
|
|
85
|
+
[tool.ruff.lint.isort]
|
|
86
|
+
lines-between-types = 1
|
|
87
|
+
|
|
88
|
+
[tool.mypy]
|
|
89
|
+
ignore_missing_imports = true
|
|
90
|
+
scripts_are_modules = true
|
|
91
|
+
disable_error_code = 'prop-decorator'
|
|
92
|
+
|
|
93
|
+
[tool.pytest.ini_options]
|
|
94
|
+
addopts="--cov=hctef"
|
|
95
|
+
|
|
96
|
+
[tool.coverage.report]
|
|
97
|
+
show_missing = true
|
|
98
|
+
skip_empty = true
|
|
99
|
+
sort = "Cover"
|