hashserver 1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hashserver-1.0/LICENSE.txt +22 -0
- hashserver-1.0/PKG-INFO +174 -0
- hashserver-1.0/README.md +160 -0
- hashserver-1.0/hash_file_response.py +237 -0
- hashserver-1.0/hashserver.egg-info/PKG-INFO +174 -0
- hashserver-1.0/hashserver.egg-info/SOURCES.txt +23 -0
- hashserver-1.0/hashserver.egg-info/dependency_links.txt +1 -0
- hashserver-1.0/hashserver.egg-info/entry_points.txt +2 -0
- hashserver-1.0/hashserver.egg-info/requires.txt +3 -0
- hashserver-1.0/hashserver.egg-info/top_level.txt +2 -0
- hashserver-1.0/hashserver.py +865 -0
- hashserver-1.0/pyproject.toml +24 -0
- hashserver-1.0/setup.cfg +4 -0
- hashserver-1.0/tests/test_basic.py +85 -0
- hashserver-1.0/tests/test_basic_uvicorn.py +89 -0
- hashserver-1.0/tests/test_global_lock.py +96 -0
- hashserver-1.0/tests/test_has_buffers.py +92 -0
- hashserver-1.0/tests/test_lock.py +96 -0
- hashserver-1.0/tests/test_lock_uvicorn.py +99 -0
- hashserver-1.0/tests/test_port_range.py +54 -0
- hashserver-1.0/tests/test_promises.py +110 -0
- hashserver-1.0/tests/test_put_read.py +61 -0
- hashserver-1.0/tests/test_put_read_big.py +54 -0
- hashserver-1.0/tests/test_simple_put.py +87 -0
- hashserver-1.0/tests/test_timeout.py +31 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Author: Sjoerd de Vries, MBI platform.
|
|
4
|
+
Copyright (c) 2023-2026 CNRS.
|
|
5
|
+
|
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
8
|
+
in the Software without restriction, including without limitation the rights
|
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
11
|
+
furnished to do so, subject to the following conditions:
|
|
12
|
+
|
|
13
|
+
The above copyright notice and this permission notice shall be included in all
|
|
14
|
+
copies or substantial portions of the Software.
|
|
15
|
+
|
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
22
|
+
SOFTWARE.
|
hashserver-1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hashserver
|
|
3
|
+
Version: 1.0
|
|
4
|
+
Summary: Simple FastAPI-based hash server
|
|
5
|
+
Author: Sjoerd de Vries
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Requires-Python: >=3.9
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE.txt
|
|
10
|
+
Requires-Dist: fastapi
|
|
11
|
+
Requires-Dist: uvicorn[standard]
|
|
12
|
+
Requires-Dist: typing-extensions
|
|
13
|
+
Dynamic: license-file
|
|
14
|
+
|
|
15
|
+
# Hashserver
|
|
16
|
+
|
|
17
|
+
A lightweight, content-addressed file server over HTTP.
|
|
18
|
+
|
|
19
|
+
Hashserver stores and serves opaque binary buffers keyed by their cryptographic checksum. You PUT a buffer with its checksum in the URL; you GET it back by the same checksum. There are no filenames, no directories, no metadata — just content and its hash.
|
|
20
|
+
|
|
21
|
+
The hash algorithm is configurable: SHA-256 (default) or SHA3-256.
|
|
22
|
+
|
|
23
|
+
## Why content-addressed storage?
|
|
24
|
+
|
|
25
|
+
Content-addressed storage (CAS) is a well-established pattern used by Git, IPFS, Docker registries, and many other systems. Identifying data by its cryptographic hash gives you automatic deduplication, trivially verifiable integrity, and strong reproducibility guarantees.
|
|
26
|
+
|
|
27
|
+
Hashserver brings these benefits to any project that needs a simple HTTP-based buffer store. It is intentionally minimal: a single ASGI application backed by a directory of files, designed to be easy to deploy, easy to integrate, and easy to reason about.
|
|
28
|
+
|
|
29
|
+
## Relationship to Seamless
|
|
30
|
+
|
|
31
|
+
Hashserver was originally developed as the buffer-serving component of [Seamless](https://github.com/sjdv1982/seamless), a framework for reproducible, reactive computational workflows. In Seamless, all data — inputs, source code, and results — is represented as a tree of checksums, and hashserver provides the storage layer that maps those checksums back to actual data.
|
|
32
|
+
|
|
33
|
+
However, **hashserver has no dependency on Seamless** and no knowledge of it. It is a generic content-addressed file server that is useful in any context where you need to store and retrieve buffers by hash — caching layers, artifact stores, reproducible pipelines, or your own CAS-backed application. It is published as an independent PyPI package for exactly this reason.
|
|
34
|
+
|
|
35
|
+
## Features
|
|
36
|
+
|
|
37
|
+
- **Content-addressed**: buffers are stored and retrieved by their cryptographic checksum.
|
|
38
|
+
- **Configurable hash algorithm**: SHA-256 (default) or SHA3-256, selected at startup.
|
|
39
|
+
- **Integrity-verified reads**: every buffer is re-checksummed on GET to detect corruption.
|
|
40
|
+
- **Prefix directory layout**: by default, buffers are stored under a two-character prefix subdirectory (e.g. `ab/ab3f7c...`) to avoid filesystem performance problems with large flat directories. A flat layout is also supported.
|
|
41
|
+
- **Extra read-only directories**: additional buffer directories can be mounted as fallback read sources.
|
|
42
|
+
- **Promises**: a client can announce that a buffer will be uploaded soon via `PUT /promise/{checksum}`. Other clients reading that checksum will wait for the upload rather than getting a 404.
|
|
43
|
+
- **Concurrent-safe**: in-flight PUT requests are tracked so concurrent GETs and batch queries return consistent results. Lock files are respected for external writers.
|
|
44
|
+
- **Multiple instances**: several hashserver processes can safely share the same buffer directory.
|
|
45
|
+
- **Lightweight**: built on FastAPI/Starlette — no database, no external services.
|
|
46
|
+
- **Flexible deployment**: run as a CLI tool, under any ASGI server, or via Docker Compose.
|
|
47
|
+
|
|
48
|
+
## Installation
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
pip install hashserver
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
Or with conda:
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
mamba env create --file environment.yml
|
|
58
|
+
conda activate hashserver
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Quick start
|
|
62
|
+
|
|
63
|
+
Serve buffers from a local directory:
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
hashserver ./my-buffers
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
This starts the server under uvicorn on port 8000. Run `hashserver -h` for all options.
|
|
70
|
+
|
|
71
|
+
### Storing and retrieving a buffer
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
# Start a writable server
|
|
75
|
+
hashserver ./my-buffers --writable
|
|
76
|
+
|
|
77
|
+
# Compute the SHA-256 checksum and upload
|
|
78
|
+
CHECKSUM=$(python3 -c "
|
|
79
|
+
import hashlib, sys
|
|
80
|
+
print(hashlib.sha256(open(sys.argv[1],'rb').read()).hexdigest())
|
|
81
|
+
" myfile.bin)
|
|
82
|
+
curl -X PUT --data-binary @myfile.bin http://localhost:8000/$CHECKSUM
|
|
83
|
+
|
|
84
|
+
# Download
|
|
85
|
+
curl -O http://localhost:8000/$CHECKSUM
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
To use SHA3-256 instead, start the server with `--hash-algorithm sha3-256` and hash your files with `hashlib.sha3_256`.
|
|
89
|
+
|
|
90
|
+
## API
|
|
91
|
+
|
|
92
|
+
### Retrieving buffers
|
|
93
|
+
|
|
94
|
+
**`GET /{checksum}`** — Retrieve a buffer by its hex checksum. The server verifies the checksum before sending the response. Returns the raw buffer (200), or 404 if not found.
|
|
95
|
+
|
|
96
|
+
### Storing buffers
|
|
97
|
+
|
|
98
|
+
Requires `--writable`.
|
|
99
|
+
|
|
100
|
+
**`PUT /{checksum}`** — Upload a buffer. The request body is the raw data; the server verifies that its checksum matches the URL. Returns 200 on success, 201 if the buffer already existed, or 400 on checksum mismatch.
|
|
101
|
+
|
|
102
|
+
**`PUT /promise/{checksum}`** — Announce that a buffer will be uploaded soon. Returns 202 with the promise TTL. While a promise is active, GET requests for that checksum will wait rather than returning 404, and `/has` queries will report the checksum as present.
|
|
103
|
+
|
|
104
|
+
### Querying availability
|
|
105
|
+
|
|
106
|
+
**`GET /has`** — Batch existence check. Send a JSON list of checksums in the request body. Returns a JSON list of booleans. Includes both on-disk buffers and active promises.
|
|
107
|
+
|
|
108
|
+
**`GET /has-now`** — Same as `/has`, but excludes promises — only reports buffers that are already on disk.
|
|
109
|
+
|
|
110
|
+
**`GET /buffer-length`** — Batch size query. Send a JSON list of checksums in the request body. Returns a JSON list of integers: the buffer size in bytes, or 0 if not present. Promised checksums are reported as `true`.
|
|
111
|
+
|
|
112
|
+
### Health
|
|
113
|
+
|
|
114
|
+
**`GET /healthcheck`** — Returns "OK". Useful for load balancer probes.
|
|
115
|
+
|
|
116
|
+
## Configuration
|
|
117
|
+
|
|
118
|
+
### CLI flags
|
|
119
|
+
|
|
120
|
+
| Flag | Description | Default |
|
|
121
|
+
|------|-------------|---------|
|
|
122
|
+
| `directory` | Buffer storage directory (positional, required) | — |
|
|
123
|
+
| `--writable` | Enable PUT endpoints | off |
|
|
124
|
+
| `--hash-algorithm` | Hash algorithm: `sha3-256` or `sha-256` | `sha-256` |
|
|
125
|
+
| `--layout` | Directory layout: `prefix` or `flat` | `prefix` |
|
|
126
|
+
| `--extra-dirs` | Semicolon-separated list of extra read-only buffer directories | — |
|
|
127
|
+
| `--host` | Listen address | `127.0.0.1` |
|
|
128
|
+
| `--port` | Listen port | `8000` |
|
|
129
|
+
| `--port-range START END` | Pick a random free port in range (mutually exclusive with `--port`) | — |
|
|
130
|
+
| `--status-file` | JSON file for reporting server status | — |
|
|
131
|
+
| `--timeout` | Shut down after this many seconds of inactivity | — |
|
|
132
|
+
|
|
133
|
+
### Environment variables
|
|
134
|
+
|
|
135
|
+
When running under an external ASGI server (e.g. `uvicorn hashserver:app`), configure via environment variables instead:
|
|
136
|
+
|
|
137
|
+
| Variable | Equivalent flag |
|
|
138
|
+
|----------|----------------|
|
|
139
|
+
| `HASHSERVER_DIRECTORY` | `directory` |
|
|
140
|
+
| `HASHSERVER_WRITABLE` | `--writable` (set to `1` or `true`) |
|
|
141
|
+
| `HASHSERVER_HASH_ALGORITHM` | `--hash-algorithm` |
|
|
142
|
+
| `HASHSERVER_LAYOUT` | `--layout` |
|
|
143
|
+
| `HASHSERVER_EXTRA_DIRS` | `--extra-dirs` |
|
|
144
|
+
|
|
145
|
+
### Docker Compose
|
|
146
|
+
|
|
147
|
+
```bash
|
|
148
|
+
export HASHSERVER_PORT=8000
|
|
149
|
+
export HASHSERVER_HOST=0.0.0.0
|
|
150
|
+
export HASHSERVER_DIRECTORY=./buffers
|
|
151
|
+
export HASHSERVER_WRITABLE=1
|
|
152
|
+
docker compose up -d
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
Container user/group ID can be set with `HASHSERVER_USER_ID` and `HASHSERVER_GROUP_ID` (both default to 0).
|
|
156
|
+
|
|
157
|
+
## Directory layouts
|
|
158
|
+
|
|
159
|
+
In **prefix** layout (the default), a buffer with checksum `ab3f7c...` is stored as `<directory>/ab/ab3f7c...`. A sentinel file `.HASHSERVER_PREFIX` is written to the directory. This avoids performance issues when storing large numbers of buffers.
|
|
160
|
+
|
|
161
|
+
In **flat** layout, the same buffer is stored as `<directory>/ab3f7c...`.
|
|
162
|
+
|
|
163
|
+
Extra directories auto-detect their layout by checking for the `.HASHSERVER_PREFIX` sentinel.
|
|
164
|
+
|
|
165
|
+
## Running tests
|
|
166
|
+
|
|
167
|
+
```bash
|
|
168
|
+
pip install requests
|
|
169
|
+
pytest tests/
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
## License
|
|
173
|
+
|
|
174
|
+
See [LICENSE.txt](LICENSE.txt).
|
hashserver-1.0/README.md
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
# Hashserver
|
|
2
|
+
|
|
3
|
+
A lightweight, content-addressed file server over HTTP.
|
|
4
|
+
|
|
5
|
+
Hashserver stores and serves opaque binary buffers keyed by their cryptographic checksum. You PUT a buffer with its checksum in the URL; you GET it back by the same checksum. There are no filenames, no directories, no metadata — just content and its hash.
|
|
6
|
+
|
|
7
|
+
The hash algorithm is configurable: SHA-256 (default) or SHA3-256.
|
|
8
|
+
|
|
9
|
+
## Why content-addressed storage?
|
|
10
|
+
|
|
11
|
+
Content-addressed storage (CAS) is a well-established pattern used by Git, IPFS, Docker registries, and many other systems. Identifying data by its cryptographic hash gives you automatic deduplication, trivially verifiable integrity, and strong reproducibility guarantees.
|
|
12
|
+
|
|
13
|
+
Hashserver brings these benefits to any project that needs a simple HTTP-based buffer store. It is intentionally minimal: a single ASGI application backed by a directory of files, designed to be easy to deploy, easy to integrate, and easy to reason about.
|
|
14
|
+
|
|
15
|
+
## Relationship to Seamless
|
|
16
|
+
|
|
17
|
+
Hashserver was originally developed as the buffer-serving component of [Seamless](https://github.com/sjdv1982/seamless), a framework for reproducible, reactive computational workflows. In Seamless, all data — inputs, source code, and results — is represented as a tree of checksums, and hashserver provides the storage layer that maps those checksums back to actual data.
|
|
18
|
+
|
|
19
|
+
However, **hashserver has no dependency on Seamless** and no knowledge of it. It is a generic content-addressed file server that is useful in any context where you need to store and retrieve buffers by hash — caching layers, artifact stores, reproducible pipelines, or your own CAS-backed application. It is published as an independent PyPI package for exactly this reason.
|
|
20
|
+
|
|
21
|
+
## Features
|
|
22
|
+
|
|
23
|
+
- **Content-addressed**: buffers are stored and retrieved by their cryptographic checksum.
|
|
24
|
+
- **Configurable hash algorithm**: SHA-256 (default) or SHA3-256, selected at startup.
|
|
25
|
+
- **Integrity-verified reads**: every buffer is re-checksummed on GET to detect corruption.
|
|
26
|
+
- **Prefix directory layout**: by default, buffers are stored under a two-character prefix subdirectory (e.g. `ab/ab3f7c...`) to avoid filesystem performance problems with large flat directories. A flat layout is also supported.
|
|
27
|
+
- **Extra read-only directories**: additional buffer directories can be mounted as fallback read sources.
|
|
28
|
+
- **Promises**: a client can announce that a buffer will be uploaded soon via `PUT /promise/{checksum}`. Other clients reading that checksum will wait for the upload rather than getting a 404.
|
|
29
|
+
- **Concurrent-safe**: in-flight PUT requests are tracked so concurrent GETs and batch queries return consistent results. Lock files are respected for external writers.
|
|
30
|
+
- **Multiple instances**: several hashserver processes can safely share the same buffer directory.
|
|
31
|
+
- **Lightweight**: built on FastAPI/Starlette — no database, no external services.
|
|
32
|
+
- **Flexible deployment**: run as a CLI tool, under any ASGI server, or via Docker Compose.
|
|
33
|
+
|
|
34
|
+
## Installation
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
pip install hashserver
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
Or with conda:
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
mamba env create --file environment.yml
|
|
44
|
+
conda activate hashserver
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Quick start
|
|
48
|
+
|
|
49
|
+
Serve buffers from a local directory:
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
hashserver ./my-buffers
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
This starts the server under uvicorn on port 8000. Run `hashserver -h` for all options.
|
|
56
|
+
|
|
57
|
+
### Storing and retrieving a buffer
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
# Start a writable server
|
|
61
|
+
hashserver ./my-buffers --writable
|
|
62
|
+
|
|
63
|
+
# Compute the SHA-256 checksum and upload
|
|
64
|
+
CHECKSUM=$(python3 -c "
|
|
65
|
+
import hashlib, sys
|
|
66
|
+
print(hashlib.sha256(open(sys.argv[1],'rb').read()).hexdigest())
|
|
67
|
+
" myfile.bin)
|
|
68
|
+
curl -X PUT --data-binary @myfile.bin http://localhost:8000/$CHECKSUM
|
|
69
|
+
|
|
70
|
+
# Download
|
|
71
|
+
curl -O http://localhost:8000/$CHECKSUM
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
To use SHA3-256 instead, start the server with `--hash-algorithm sha3-256` and hash your files with `hashlib.sha3_256`.
|
|
75
|
+
|
|
76
|
+
## API
|
|
77
|
+
|
|
78
|
+
### Retrieving buffers
|
|
79
|
+
|
|
80
|
+
**`GET /{checksum}`** — Retrieve a buffer by its hex checksum. The server verifies the checksum before sending the response. Returns the raw buffer (200), or 404 if not found.
|
|
81
|
+
|
|
82
|
+
### Storing buffers
|
|
83
|
+
|
|
84
|
+
Requires `--writable`.
|
|
85
|
+
|
|
86
|
+
**`PUT /{checksum}`** — Upload a buffer. The request body is the raw data; the server verifies that its checksum matches the URL. Returns 200 on success, 201 if the buffer already existed, or 400 on checksum mismatch.
|
|
87
|
+
|
|
88
|
+
**`PUT /promise/{checksum}`** — Announce that a buffer will be uploaded soon. Returns 202 with the promise TTL. While a promise is active, GET requests for that checksum will wait rather than returning 404, and `/has` queries will report the checksum as present.
|
|
89
|
+
|
|
90
|
+
### Querying availability
|
|
91
|
+
|
|
92
|
+
**`GET /has`** — Batch existence check. Send a JSON list of checksums in the request body. Returns a JSON list of booleans. Includes both on-disk buffers and active promises.
|
|
93
|
+
|
|
94
|
+
**`GET /has-now`** — Same as `/has`, but excludes promises — only reports buffers that are already on disk.
|
|
95
|
+
|
|
96
|
+
**`GET /buffer-length`** — Batch size query. Send a JSON list of checksums in the request body. Returns a JSON list of integers: the buffer size in bytes, or 0 if not present. Promised checksums are reported as `true`.
|
|
97
|
+
|
|
98
|
+
### Health
|
|
99
|
+
|
|
100
|
+
**`GET /healthcheck`** — Returns "OK". Useful for load balancer probes.
|
|
101
|
+
|
|
102
|
+
## Configuration
|
|
103
|
+
|
|
104
|
+
### CLI flags
|
|
105
|
+
|
|
106
|
+
| Flag | Description | Default |
|
|
107
|
+
|------|-------------|---------|
|
|
108
|
+
| `directory` | Buffer storage directory (positional, required) | — |
|
|
109
|
+
| `--writable` | Enable PUT endpoints | off |
|
|
110
|
+
| `--hash-algorithm` | Hash algorithm: `sha3-256` or `sha-256` | `sha-256` |
|
|
111
|
+
| `--layout` | Directory layout: `prefix` or `flat` | `prefix` |
|
|
112
|
+
| `--extra-dirs` | Semicolon-separated list of extra read-only buffer directories | — |
|
|
113
|
+
| `--host` | Listen address | `127.0.0.1` |
|
|
114
|
+
| `--port` | Listen port | `8000` |
|
|
115
|
+
| `--port-range START END` | Pick a random free port in range (mutually exclusive with `--port`) | — |
|
|
116
|
+
| `--status-file` | JSON file for reporting server status | — |
|
|
117
|
+
| `--timeout` | Shut down after this many seconds of inactivity | — |
|
|
118
|
+
|
|
119
|
+
### Environment variables
|
|
120
|
+
|
|
121
|
+
When running under an external ASGI server (e.g. `uvicorn hashserver:app`), configure via environment variables instead:
|
|
122
|
+
|
|
123
|
+
| Variable | Equivalent flag |
|
|
124
|
+
|----------|----------------|
|
|
125
|
+
| `HASHSERVER_DIRECTORY` | `directory` |
|
|
126
|
+
| `HASHSERVER_WRITABLE` | `--writable` (set to `1` or `true`) |
|
|
127
|
+
| `HASHSERVER_HASH_ALGORITHM` | `--hash-algorithm` |
|
|
128
|
+
| `HASHSERVER_LAYOUT` | `--layout` |
|
|
129
|
+
| `HASHSERVER_EXTRA_DIRS` | `--extra-dirs` |
|
|
130
|
+
|
|
131
|
+
### Docker Compose
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
export HASHSERVER_PORT=8000
|
|
135
|
+
export HASHSERVER_HOST=0.0.0.0
|
|
136
|
+
export HASHSERVER_DIRECTORY=./buffers
|
|
137
|
+
export HASHSERVER_WRITABLE=1
|
|
138
|
+
docker compose up -d
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
Container user/group ID can be set with `HASHSERVER_USER_ID` and `HASHSERVER_GROUP_ID` (both default to 0).
|
|
142
|
+
|
|
143
|
+
## Directory layouts
|
|
144
|
+
|
|
145
|
+
In **prefix** layout (the default), a buffer with checksum `ab3f7c...` is stored as `<directory>/ab/ab3f7c...`. A sentinel file `.HASHSERVER_PREFIX` is written to the directory. This avoids performance issues when storing large numbers of buffers.
|
|
146
|
+
|
|
147
|
+
In **flat** layout, the same buffer is stored as `<directory>/ab3f7c...`.
|
|
148
|
+
|
|
149
|
+
Extra directories auto-detect their layout by checking for the `.HASHSERVER_PREFIX` sentinel.
|
|
150
|
+
|
|
151
|
+
## Running tests
|
|
152
|
+
|
|
153
|
+
```bash
|
|
154
|
+
pip install requests
|
|
155
|
+
pytest tests/
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
## License
|
|
159
|
+
|
|
160
|
+
See [LICENSE.txt](LICENSE.txt).
|
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import stat
|
|
3
|
+
import time
|
|
4
|
+
import typing
|
|
5
|
+
from hashlib import sha3_256, sha256
|
|
6
|
+
|
|
7
|
+
import anyio
|
|
8
|
+
|
|
9
|
+
from starlette.background import BackgroundTask
|
|
10
|
+
from starlette.types import Receive, Scope, Send
|
|
11
|
+
from starlette.responses import FileResponse
|
|
12
|
+
|
|
13
|
+
# Supported checksum algorithms, keyed by the name used on the CLI
# (--hash-algorithm) and in HASHSERVER_HASH_ALGORITHM.
HASH_ALGORITHMS = {
    "sha3-256": sha3_256,
    "sha-256": sha256,
}
DEFAULT_HASH_ALGORITHM = "sha-256"
# Module-level state: the active algorithm name and its hashlib constructor.
# Mutated only by set_hash_algorithm(); read by calculate_checksum() and
# error messages.
_current_hash_algorithm = DEFAULT_HASH_ALGORITHM
_hash_constructor = HASH_ALGORITHMS[DEFAULT_HASH_ALGORITHM]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def set_hash_algorithm(algorithm: str) -> None:
    """Select the hash algorithm used for all subsequent checksum work.

    *algorithm* must be a key of HASH_ALGORITHMS; otherwise a ValueError
    (chained from the underlying KeyError) is raised and the current
    algorithm is left unchanged.
    """
    global _current_hash_algorithm, _hash_constructor
    try:
        constructor = HASH_ALGORITHMS[algorithm]
    except KeyError as exc:
        options = ", ".join(HASH_ALGORITHMS)
        raise ValueError(
            f"Unsupported hash algorithm '{algorithm}'. "
            f"Choose one of: {options}"
        ) from exc
    # Only update module state once the algorithm is known to be valid.
    _hash_constructor = constructor
    _current_hash_algorithm = algorithm
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def get_hash_algorithm() -> str:
    """Return the name of the currently configured hash algorithm."""
    return _current_hash_algorithm
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def parse_checksum(checksum) -> str:
    """Normalize *checksum* to a lowercase 64-character hex string.

    Accepts a 32-byte value either as raw bytes or as a hex string
    (case-insensitive). Returns None for None input. Raises ValueError
    for a value of the wrong length and TypeError for any other type.

    Adapted from the Seamless source code (fair use)"""
    if isinstance(checksum, bytes):
        # Raw digest bytes: round-trip through hex for normalization.
        checksum = checksum.hex()
    if isinstance(checksum, str):
        if len(checksum) % 2:
            raise ValueError("Wrong length")
        raw = bytes.fromhex(checksum)  # also rejects non-hex characters
        if len(raw) != 32:
            raise ValueError("Wrong length")
        return raw.hex()
    if checksum is None:
        return None
    raise TypeError(type(checksum))
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class HashFileResponse(FileResponse):
    """FileResponse that validates files against their checksum-derived filename.

    The buffer named by *checksum* is looked up in *directory* (under a
    two-character prefix subdirectory when ``_PREFIX`` is True), with
    *extra_dirs* searched as read-only fallbacks. Before the response is
    streamed, the file is fully re-hashed with the configured algorithm and
    serving is refused if the content does not match the checksum. If the
    file is missing or fails verification, the response waits for ".LOCK"
    files (written by external writers) to clear and retries once.
    """

    # True in subclasses that store buffers as <dir>/<prefix>/<checksum>.
    _PREFIX = False

    # Seconds after which a .LOCK file is considered stale and ignored.
    lock_timeout = 120
    # Read size (bytes) used when re-hashing the file.
    chunk_size = 640 * 1024

    def __init__(
        self,
        checksum: str,
        directory: str,
        status_code: int = 200,
        headers: typing.Optional[typing.Mapping[str, str]] = None,
        media_type: typing.Optional[str] = None,
        background: typing.Optional[BackgroundTask] = None,
        stat_result: typing.Optional[os.stat_result] = None,
        method: typing.Optional[str] = None,
        content_disposition_type: str = "attachment",
        extra_dirs: typing.Optional[typing.List[str]] = None,
    ) -> None:
        filename = parse_checksum(checksum)
        self.prefix = filename[:2]
        # The stat result is always computed lazily in __call__, so that the
        # file may still appear (e.g. via a concurrent PUT) after this
        # response object has been constructed.
        stat_result = None
        if self._PREFIX:
            path = os.path.join(directory, self.prefix, filename)
        else:
            path = os.path.join(directory, filename)
        super().__init__(
            path=path,
            status_code=status_code,
            headers=headers,
            media_type=media_type,
            background=background,
            filename=filename,
            stat_result=stat_result,
            method=method,
            content_disposition_type=content_disposition_type,
        )
        self.directory = directory
        # BUGFIX: extra_dirs defaults to None, and the original code iterated
        # it unconditionally, raising TypeError whenever no extra dirs were
        # given. Normalize to a list first (refresh_stat_headers already
        # treats an empty/falsy value as "no fallbacks").
        self.extra_dirs = list(extra_dirs) if extra_dirs else []
        # Each extra directory auto-detects its layout via the sentinel file.
        extra_dirs_layout = {}
        for extra_dir in self.extra_dirs:
            prefix_file = os.path.join(extra_dir, ".HASHSERVER_PREFIX")
            extra_dirs_layout[extra_dir] = (
                "prefix" if os.path.exists(prefix_file) else "flat"
            )
        self.extra_dirs_layout = extra_dirs_layout

    async def refresh_stat_headers(self):
        """Stat self.path (falling back to extra_dirs) and refresh headers.

        Returns the os.stat_result. Raises FileNotFoundError if the file
        exists in no directory, RuntimeError if it is not a regular file.
        """
        if self.extra_dirs and not await anyio.Path(self.path).exists():
            # Fall back to the first extra directory that holds the buffer.
            for extra_dir in self.extra_dirs:
                layout = self.extra_dirs_layout[extra_dir]
                if layout == "prefix":
                    path0 = os.path.join(extra_dir, self.prefix, self.filename)
                else:
                    path0 = os.path.join(extra_dir, self.filename)
                if await anyio.Path(path0).exists():
                    self.path = path0
                    break

        try:
            stat_result = await anyio.to_thread.run_sync(os.stat, self.path)
            # Drop headers derived from any previous stat before re-deriving
            # them; set_stat_headers appends rather than replaces.
            del self.headers["content-length"]
            del self.headers["last-modified"]
            del self.headers["etag"]

            self.set_stat_headers(stat_result)
        except FileNotFoundError:
            raise FileNotFoundError(
                f"File at path {self.path} does not exist."
            ) from None
        else:
            mode = stat_result.st_mode
            if not stat.S_ISREG(mode):
                raise RuntimeError(f"File at path {self.path} is not a file.")
        return stat_result

    async def _until_no_lock(self, lockpaths):
        """Poll each path in *lockpaths* until it is gone or stale."""
        for lockpath in lockpaths:
            while True:
                try:
                    lock_stat_result = await anyio.to_thread.run_sync(
                        os.stat, lockpath
                    )
                except FileNotFoundError:
                    break  # lock released
                lock_mtime = lock_stat_result.st_mtime
                if time.time() - lock_mtime > self.lock_timeout:
                    break  # stale lock: ignore it
                await anyio.sleep(1)

    async def until_no_lock(self):
        """Wait until the directory-wide and per-file locks are released."""
        lockpaths = [os.path.join(self.directory, ".LOCK")]
        if self.path is not None:
            lockpaths.append(self.path + ".LOCK")
        return await self._until_no_lock(lockpaths)

    async def calculate_checksum(self):
        """Return the hex checksum of self.path for the configured algorithm."""
        checksum = _hash_constructor()
        async with await anyio.open_file(self.path, mode="rb") as file:
            more_body = True
            while more_body:
                chunk = await file.read(self.chunk_size)
                checksum.update(chunk)
                # A short read means EOF was reached.
                more_body = len(chunk) == self.chunk_size

        return checksum.digest().hex()

    async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None:
        """Verify the file against its checksum, then stream it."""
        if self.stat_result is None:
            try:
                stat_result = await self.refresh_stat_headers()
            except FileNotFoundError:
                # The file may be mid-write: wait for locks and retry once.
                await self.until_no_lock()
                stat_result = await self.refresh_stat_headers()
            self.stat_result = stat_result

        checksum = await self.calculate_checksum()
        if checksum != self.filename:
            # Possibly read during a concurrent write: wait for locks to
            # clear, re-stat and re-verify before declaring corruption.
            await self.until_no_lock()
            stat_result = await self.refresh_stat_headers()
            self.stat_result = stat_result
            checksum2 = await self.calculate_checksum()
            if checksum2 != self.filename:
                raise RuntimeError(
                    f"File corruption: file at path {self.path} does not have the correct {_current_hash_algorithm} checksum."
                )

        await super().__call__(scope=scope, receive=receive, send=send)
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
class PrefixHashFileResponse(HashFileResponse):
    """HashFileResponse variant storing files under a two-character prefix.

    The file is named after its checksum and kept at $PREFIX/$CHECKSUM,
    where $PREFIX is the first two characters of $CHECKSUM. Constructing
    this response also writes the ".HASHSERVER_PREFIX" sentinel so other
    readers can detect the layout.
    """

    _PREFIX = True

    def __init__(
        self,
        checksum: str,
        directory: str,
        status_code: int = 200,
        headers: typing.Optional[typing.Mapping[str, str]] = None,
        media_type: typing.Optional[str] = None,
        background: typing.Optional[BackgroundTask] = None,
        stat_result: typing.Optional[os.stat_result] = None,
        method: typing.Optional[str] = None,
        content_disposition_type: str = "attachment",
        extra_dirs: typing.Optional[typing.List[str]] = None,
    ) -> None:
        super().__init__(
            checksum=checksum,
            directory=directory,
            status_code=status_code,
            headers=headers,
            media_type=media_type,
            background=background,
            stat_result=stat_result,
            method=method,
            content_disposition_type=content_disposition_type,
            extra_dirs=extra_dirs,
        )
        # Mark the directory as prefix-layout for layout auto-detection.
        sentinel = os.path.join(directory, ".HASHSERVER_PREFIX")
        with open(sentinel, mode="wb") as handle:
            handle.write(b"1\n")

    async def until_no_lock(self):
        """Wait for the per-prefix directory lock and the per-file lock."""
        candidates = [os.path.join(self.directory, self.prefix, ".LOCK")]
        if self.path is not None:
            candidates.append(self.path + ".LOCK")
        return await self._until_no_lock(candidates)
|