hashserver 1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hash_file_response.py +237 -0
- hashserver-1.0.dist-info/METADATA +174 -0
- hashserver-1.0.dist-info/RECORD +8 -0
- hashserver-1.0.dist-info/WHEEL +5 -0
- hashserver-1.0.dist-info/entry_points.txt +2 -0
- hashserver-1.0.dist-info/licenses/LICENSE.txt +22 -0
- hashserver-1.0.dist-info/top_level.txt +2 -0
- hashserver.py +865 -0
hash_file_response.py
ADDED
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import stat
|
|
3
|
+
import time
|
|
4
|
+
import typing
|
|
5
|
+
from hashlib import sha3_256, sha256
|
|
6
|
+
|
|
7
|
+
import anyio
|
|
8
|
+
|
|
9
|
+
from starlette.background import BackgroundTask
|
|
10
|
+
from starlette.types import Receive, Scope, Send
|
|
11
|
+
from starlette.responses import FileResponse
|
|
12
|
+
|
|
13
|
+
# Registry of supported hash algorithms: maps the public algorithm name
# (as used in CLI flags / configuration) to its hashlib constructor.
HASH_ALGORITHMS = {
    "sha3-256": sha3_256,
    "sha-256": sha256,
}
# Algorithm used when no explicit choice is made.
DEFAULT_HASH_ALGORITHM = "sha-256"
# Module-level mutable state: the currently selected algorithm name and
# its constructor.  Both are updated together by set_hash_algorithm().
_current_hash_algorithm = DEFAULT_HASH_ALGORITHM
_hash_constructor = HASH_ALGORITHMS[DEFAULT_HASH_ALGORITHM]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def set_hash_algorithm(algorithm: str) -> None:
    """Select the module-wide checksum algorithm.

    Raises:
        ValueError: if *algorithm* is not one of HASH_ALGORITHMS.
    """
    global _current_hash_algorithm, _hash_constructor
    try:
        constructor = HASH_ALGORITHMS[algorithm]
    except KeyError as exc:
        raise ValueError(
            f"Unsupported hash algorithm '{algorithm}'. "
            f"Choose one of: {', '.join(HASH_ALGORITHMS)}"
        ) from exc
    # Only commit state once the lookup has succeeded.
    _hash_constructor = constructor
    _current_hash_algorithm = algorithm
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def get_hash_algorithm() -> str:
    """Report the name of the currently active hash algorithm."""
    return _current_hash_algorithm
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def parse_checksum(checksum: typing.Union[str, bytes, None]) -> typing.Optional[str]:
    """Normalize a checksum to a lowercase 64-character hex string.

    Accepts a 32-byte digest (``bytes``), its hexadecimal representation
    (``str``, case-insensitive), or ``None`` (passed through unchanged).

    Adapted from the Seamless source code (fair use).

    Raises:
        ValueError: if the checksum has the wrong length or contains
            non-hexadecimal characters.
        TypeError: for any other input type.
    """
    if isinstance(checksum, bytes):
        checksum = checksum.hex()
    if isinstance(checksum, str):
        if len(checksum) % 2:
            raise ValueError("Wrong length")
        # Round-trip through bytes: validates the hex digits and
        # normalizes the result to lowercase.
        checksum = bytes.fromhex(checksum)

    if isinstance(checksum, bytes):
        if len(checksum) != 32:
            raise ValueError("Wrong length")
        return checksum.hex()

    if checksum is None:
        # Fixed: explicit None return to match the Optional[str]
        # annotation (the original bare `return` contradicted `-> str`).
        return None
    raise TypeError(type(checksum))
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class HashFileResponse(FileResponse):
    """FileResponse that validates files against their checksum-derived filename.

    The file name *is* the hex checksum.  Before the response is
    streamed, the file content is re-hashed and compared against the
    name, so corrupted buffers are never served.  Optional fallback
    read-only directories (``extra_dirs``) are consulted when the file
    is absent from the primary directory.
    """

    # When True (see PrefixHashFileResponse), buffers live under a
    # two-character prefix subdirectory: <dir>/<ck[:2]>/<ck>.
    _PREFIX = False

    # Seconds after which an external .LOCK file is considered stale.
    lock_timeout = 120
    # Read granularity (bytes) used when re-hashing the file.
    chunk_size = 640 * 1024

    def __init__(
        self,
        checksum: str,
        directory: str,
        status_code: int = 200,
        headers: typing.Optional[typing.Mapping[str, str]] = None,
        media_type: typing.Optional[str] = None,
        background: typing.Optional[BackgroundTask] = None,
        stat_result: typing.Optional[os.stat_result] = None,
        method: typing.Optional[str] = None,
        content_disposition_type: str = "attachment",
        extra_dirs: typing.Optional[typing.List[str]] = None,
    ) -> None:
        filename = parse_checksum(checksum)
        self.prefix = filename[:2]
        # The stat result is always (re)computed lazily in __call__ so
        # that fallback directories can be consulted first; any caller-
        # supplied stat_result is deliberately discarded.
        stat_result = None
        if self._PREFIX:
            path = os.path.join(directory, self.prefix, filename)
        else:
            path = os.path.join(directory, filename)
        super().__init__(
            path=path,
            status_code=status_code,
            headers=headers,
            media_type=media_type,
            background=background,
            filename=filename,
            stat_result=stat_result,
            method=method,
            content_disposition_type=content_disposition_type,
        )
        self.directory = directory
        # BUGFIX: the original iterated over extra_dirs directly, which
        # raised TypeError when the documented default (None) was used.
        self.extra_dirs = extra_dirs if extra_dirs is not None else []
        extra_dirs_layout = {}
        for extra_dir in self.extra_dirs:
            # Each extra directory auto-detects its layout through the
            # .HASHSERVER_PREFIX sentinel file.
            prefix_file = os.path.join(extra_dir, ".HASHSERVER_PREFIX")
            if os.path.exists(prefix_file):
                layout = "prefix"
            else:
                layout = "flat"
            extra_dirs_layout[extra_dir] = layout
        self.extra_dirs_layout = extra_dirs_layout

    async def refresh_stat_headers(self):
        """Locate the file (primary dir first, then extra dirs) and refresh headers.

        Returns the fresh ``os.stat_result``.

        Raises:
            FileNotFoundError: if the file exists in no known directory.
            RuntimeError: if the resolved path is not a regular file.
        """
        if self.extra_dirs and not await anyio.Path(self.path).exists():
            for extra_dir in self.extra_dirs:
                layout = self.extra_dirs_layout[extra_dir]
                if layout == "prefix":
                    path0 = os.path.join(extra_dir, self.prefix, self.filename)
                else:
                    path0 = os.path.join(extra_dir, self.filename)
                if await anyio.Path(path0).exists():
                    self.path = path0
                    break

        try:
            stat_result = await anyio.to_thread.run_sync(os.stat, self.path)
            # Drop headers derived from any previous stat before
            # re-deriving them for the (possibly different) path.
            del self.headers["content-length"]
            del self.headers["last-modified"]
            del self.headers["etag"]

            self.set_stat_headers(stat_result)
        except FileNotFoundError:
            raise FileNotFoundError(
                f"File at path {self.path} does not exist."
            ) from None
        else:
            mode = stat_result.st_mode
            if not stat.S_ISREG(mode):
                raise RuntimeError(f"File at path {self.path} is not a file.")
        return stat_result

    async def _until_no_lock(self, lockpaths):
        """Block until each path in *lockpaths* is absent or stale."""
        for lockpath in lockpaths:
            while 1:
                try:
                    lock_stat_result = await anyio.to_thread.run_sync(os.stat, lockpath)
                except FileNotFoundError:
                    break
                lock_mtime = lock_stat_result.st_mtime
                # A lock untouched for longer than lock_timeout seconds
                # is treated as abandoned by its writer.
                if time.time() - lock_mtime > self.lock_timeout:
                    break
                await anyio.sleep(1)

    async def until_no_lock(self):
        """Wait for the directory-wide and per-file lock files to clear."""
        lockpaths = [os.path.join(self.directory, ".LOCK")]
        if self.path is not None:
            lockpaths.append(self.path + ".LOCK")
        return await self._until_no_lock(lockpaths)

    async def calculate_checksum(self):
        """Return checksum for the configured algorithm."""
        checksum = _hash_constructor()
        async with await anyio.open_file(self.path, mode="rb") as file:
            more_body = True
            while more_body:
                chunk = await file.read(self.chunk_size)
                checksum.update(chunk)
                # A short read signals end-of-file.
                more_body = len(chunk) == self.chunk_size

        checksum = checksum.digest().hex()
        return checksum

    async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None:
        if self.stat_result is None:
            try:
                stat_result = await self.refresh_stat_headers()
            except FileNotFoundError:
                # The file may be mid-write by an external writer:
                # wait for lock files to clear, then retry once.
                await self.until_no_lock()
                stat_result = await self.refresh_stat_headers()
            self.stat_result = stat_result

        checksum = await self.calculate_checksum()
        if checksum != self.filename:
            # Possibly raced with a writer: wait for locks, re-stat and
            # re-hash once before declaring corruption.
            await self.until_no_lock()
            stat_result = await self.refresh_stat_headers()
            self.stat_result = stat_result
            checksum2 = await self.calculate_checksum()
            if checksum2 != self.filename:
                raise RuntimeError(
                    f"File corruption: file at path {self.path} does not have the correct {_current_hash_algorithm} checksum."
                )

        await super().__call__(scope=scope, receive=receive, send=send)
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
class PrefixHashFileResponse(HashFileResponse):
    """Same as HashFileResponse but files are stored under a two-character prefix.

    File has the same name as checksum.
    File is stored as $PREFIX/$CHECKSUM, where $PREFIX is the first two
    characters of $CHECKSUM
    """

    # Enables the prefix-subdirectory path layout in HashFileResponse.__init__.
    _PREFIX = True

    def __init__(
        self,
        checksum: str,
        directory: str,
        status_code: int = 200,
        headers: typing.Optional[typing.Mapping[str, str]] = None,
        media_type: typing.Optional[str] = None,
        background: typing.Optional[BackgroundTask] = None,
        stat_result: typing.Optional[os.stat_result] = None,
        method: typing.Optional[str] = None,
        content_disposition_type: str = "attachment",
        extra_dirs: typing.Optional[typing.List[str]] = None,
    ) -> None:

        super().__init__(
            checksum=checksum,
            directory=directory,
            status_code=status_code,
            headers=headers,
            media_type=media_type,
            background=background,
            stat_result=stat_result,
            method=method,
            content_disposition_type=content_disposition_type,
            extra_dirs=extra_dirs,
        )
        # Write the layout sentinel so other processes (and extra-dir
        # auto-detection) recognize this directory as prefix-layout.
        # NOTE(review): this is blocking file I/O performed on every
        # response construction, i.e. on the request path — confirm this
        # is intentional and acceptable.
        prefix_file = os.path.join(directory, ".HASHSERVER_PREFIX")
        with open(prefix_file, mode="wb") as f:
            f.write(b"1\n")

    async def until_no_lock(self):
        # In prefix layout the directory-wide lock file lives inside the
        # two-character prefix subdirectory.
        lockpaths = [os.path.join(self.directory, self.prefix, ".LOCK")]
        if self.path is not None:
            lockpaths.append(self.path + ".LOCK")
        return await self._until_no_lock(lockpaths)
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hashserver
|
|
3
|
+
Version: 1.0
|
|
4
|
+
Summary: Simple FastAPI-based hash server
|
|
5
|
+
Author: Sjoerd de Vries
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Requires-Python: >=3.9
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE.txt
|
|
10
|
+
Requires-Dist: fastapi
|
|
11
|
+
Requires-Dist: uvicorn[standard]
|
|
12
|
+
Requires-Dist: typing-extensions
|
|
13
|
+
Dynamic: license-file
|
|
14
|
+
|
|
15
|
+
# Hashserver
|
|
16
|
+
|
|
17
|
+
A lightweight, content-addressed file server over HTTP.
|
|
18
|
+
|
|
19
|
+
Hashserver stores and serves opaque binary buffers keyed by their cryptographic checksum. You PUT a buffer with its checksum in the URL; you GET it back by the same checksum. There are no filenames, no directories, no metadata — just content and its hash.
|
|
20
|
+
|
|
21
|
+
The hash algorithm is configurable: SHA-256 (default) or SHA3-256.
|
|
22
|
+
|
|
23
|
+
## Why content-addressed storage?
|
|
24
|
+
|
|
25
|
+
Content-addressed storage (CAS) is a well-established pattern used by Git, IPFS, Docker registries, and many other systems. Identifying data by its cryptographic hash gives you automatic deduplication, trivially verifiable integrity, and strong reproducibility guarantees.
|
|
26
|
+
|
|
27
|
+
Hashserver brings these benefits to any project that needs a simple HTTP-based buffer store. It is intentionally minimal: a single ASGI application backed by a directory of files, designed to be easy to deploy, easy to integrate, and easy to reason about.
|
|
28
|
+
|
|
29
|
+
## Relationship to Seamless
|
|
30
|
+
|
|
31
|
+
Hashserver was originally developed as the buffer-serving component of [Seamless](https://github.com/sjdv1982/seamless), a framework for reproducible, reactive computational workflows. In Seamless, all data — inputs, source code, and results — is represented as a tree of checksums, and hashserver provides the storage layer that maps those checksums back to actual data.
|
|
32
|
+
|
|
33
|
+
However, **hashserver has no dependency on Seamless** and no knowledge of it. It is a generic content-addressed file server that is useful in any context where you need to store and retrieve buffers by hash — caching layers, artifact stores, reproducible pipelines, or your own CAS-backed application. It is published as an independent PyPI package for exactly this reason.
|
|
34
|
+
|
|
35
|
+
## Features
|
|
36
|
+
|
|
37
|
+
- **Content-addressed**: buffers are stored and retrieved by their cryptographic checksum.
|
|
38
|
+
- **Configurable hash algorithm**: SHA-256 (default) or SHA3-256, selected at startup.
|
|
39
|
+
- **Integrity-verified reads**: every buffer is re-checksummed on GET to detect corruption.
|
|
40
|
+
- **Prefix directory layout**: by default, buffers are stored under a two-character prefix subdirectory (e.g. `ab/ab3f7c...`) to avoid filesystem performance problems with large flat directories. A flat layout is also supported.
|
|
41
|
+
- **Extra read-only directories**: additional buffer directories can be mounted as fallback read sources.
|
|
42
|
+
- **Promises**: a client can announce that a buffer will be uploaded soon via `PUT /promise/{checksum}`. Other clients reading that checksum will wait for the upload rather than getting a 404.
|
|
43
|
+
- **Concurrent-safe**: in-flight PUT requests are tracked so concurrent GETs and batch queries return consistent results. Lock files are respected for external writers.
|
|
44
|
+
- **Multiple instances**: several hashserver processes can safely share the same buffer directory.
|
|
45
|
+
- **Lightweight**: built on FastAPI/Starlette — no database, no external services.
|
|
46
|
+
- **Flexible deployment**: run as a CLI tool, under any ASGI server, or via Docker Compose.
|
|
47
|
+
|
|
48
|
+
## Installation
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
pip install hashserver
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
Or with conda:
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
mamba env create --file environment.yml
|
|
58
|
+
conda activate hashserver
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Quick start
|
|
62
|
+
|
|
63
|
+
Serve buffers from a local directory:
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
hashserver ./my-buffers
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
This starts the server under uvicorn on port 8000. Run `hashserver -h` for all options.
|
|
70
|
+
|
|
71
|
+
### Storing and retrieving a buffer
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
# Start a writable server
|
|
75
|
+
hashserver ./my-buffers --writable
|
|
76
|
+
|
|
77
|
+
# Compute the SHA-256 checksum and upload
|
|
78
|
+
CHECKSUM=$(python3 -c "
|
|
79
|
+
import hashlib, sys
|
|
80
|
+
print(hashlib.sha256(open(sys.argv[1],'rb').read()).hexdigest())
|
|
81
|
+
" myfile.bin)
|
|
82
|
+
curl -X PUT --data-binary @myfile.bin http://localhost:8000/$CHECKSUM
|
|
83
|
+
|
|
84
|
+
# Download
|
|
85
|
+
curl -O http://localhost:8000/$CHECKSUM
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
To use SHA3-256 instead, start the server with `--hash-algorithm sha3-256` and hash your files with `hashlib.sha3_256`.
|
|
89
|
+
|
|
90
|
+
## API
|
|
91
|
+
|
|
92
|
+
### Retrieving buffers
|
|
93
|
+
|
|
94
|
+
**`GET /{checksum}`** — Retrieve a buffer by its hex checksum. The server verifies the checksum before sending the response. Returns the raw buffer (200), or 404 if not found.
|
|
95
|
+
|
|
96
|
+
### Storing buffers
|
|
97
|
+
|
|
98
|
+
Requires `--writable`.
|
|
99
|
+
|
|
100
|
+
**`PUT /{checksum}`** — Upload a buffer. The request body is the raw data; the server verifies that its checksum matches the URL. Returns 200 on success, 201 if the buffer already existed, or 400 on checksum mismatch.
|
|
101
|
+
|
|
102
|
+
**`PUT /promise/{checksum}`** — Announce that a buffer will be uploaded soon. Returns 202 with the promise TTL. While a promise is active, GET requests for that checksum will wait rather than returning 404, and `/has` queries will report the checksum as present.
|
|
103
|
+
|
|
104
|
+
### Querying availability
|
|
105
|
+
|
|
106
|
+
**`GET /has`** — Batch existence check. Send a JSON list of checksums in the request body. Returns a JSON list of booleans. Includes both on-disk buffers and active promises.
|
|
107
|
+
|
|
108
|
+
**`GET /has-now`** — Same as `/has`, but excludes promises — only reports buffers that are already on disk.
|
|
109
|
+
|
|
110
|
+
**`GET /buffer-length`** — Batch size query. Send a JSON list of checksums in the request body. Returns a JSON list with one entry per checksum: the buffer size in bytes, `0` if the buffer is not present, or `true` for a checksum that is promised but not yet on disk.
|
|
111
|
+
|
|
112
|
+
### Health
|
|
113
|
+
|
|
114
|
+
**`GET /healthcheck`** — Returns "OK". Useful for load balancer probes.
|
|
115
|
+
|
|
116
|
+
## Configuration
|
|
117
|
+
|
|
118
|
+
### CLI flags
|
|
119
|
+
|
|
120
|
+
| Flag | Description | Default |
|
|
121
|
+
|------|-------------|---------|
|
|
122
|
+
| `directory` | Buffer storage directory (positional, required) | — |
|
|
123
|
+
| `--writable` | Enable PUT endpoints | off |
|
|
124
|
+
| `--hash-algorithm` | Hash algorithm: `sha3-256` or `sha-256` | `sha-256` |
|
|
125
|
+
| `--layout` | Directory layout: `prefix` or `flat` | `prefix` |
|
|
126
|
+
| `--extra-dirs` | Semicolon-separated list of extra read-only buffer directories | — |
|
|
127
|
+
| `--host` | Listen address | `127.0.0.1` |
|
|
128
|
+
| `--port` | Listen port | `8000` |
|
|
129
|
+
| `--port-range START END` | Pick a random free port in range (mutually exclusive with `--port`) | — |
|
|
130
|
+
| `--status-file` | JSON file for reporting server status | — |
|
|
131
|
+
| `--timeout` | Shut down after this many seconds of inactivity | — |
|
|
132
|
+
|
|
133
|
+
### Environment variables
|
|
134
|
+
|
|
135
|
+
When running under an external ASGI server (e.g. `uvicorn hashserver:app`), configure via environment variables instead:
|
|
136
|
+
|
|
137
|
+
| Variable | Equivalent flag |
|
|
138
|
+
|----------|----------------|
|
|
139
|
+
| `HASHSERVER_DIRECTORY` | `directory` |
|
|
140
|
+
| `HASHSERVER_WRITABLE` | `--writable` (set to `1` or `true`) |
|
|
141
|
+
| `HASHSERVER_HASH_ALGORITHM` | `--hash-algorithm` |
|
|
142
|
+
| `HASHSERVER_LAYOUT` | `--layout` |
|
|
143
|
+
| `HASHSERVER_EXTRA_DIRS` | `--extra-dirs` |
|
|
144
|
+
|
|
145
|
+
### Docker Compose
|
|
146
|
+
|
|
147
|
+
```bash
|
|
148
|
+
export HASHSERVER_PORT=8000
|
|
149
|
+
export HASHSERVER_HOST=0.0.0.0
|
|
150
|
+
export HASHSERVER_DIRECTORY=./buffers
|
|
151
|
+
export HASHSERVER_WRITABLE=1
|
|
152
|
+
docker compose up -d
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
Container user/group ID can be set with `HASHSERVER_USER_ID` and `HASHSERVER_GROUP_ID` (both default to 0).
|
|
156
|
+
|
|
157
|
+
## Directory layouts
|
|
158
|
+
|
|
159
|
+
In **prefix** layout (the default), a buffer with checksum `ab3f7c...` is stored as `<directory>/ab/ab3f7c...`. A sentinel file `.HASHSERVER_PREFIX` is written to the directory. This avoids performance issues when storing large numbers of buffers.
|
|
160
|
+
|
|
161
|
+
In **flat** layout, the same buffer is stored as `<directory>/ab3f7c...`.
|
|
162
|
+
|
|
163
|
+
Extra directories auto-detect their layout by checking for the `.HASHSERVER_PREFIX` sentinel.
|
|
164
|
+
|
|
165
|
+
## Running tests
|
|
166
|
+
|
|
167
|
+
```bash
|
|
168
|
+
pip install requests
|
|
169
|
+
pytest tests/
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
## License
|
|
173
|
+
|
|
174
|
+
See [LICENSE.txt](LICENSE.txt).
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
hash_file_response.py,sha256=3H5snJYz8FWiIfIugI5i-zQPhgF0jUVjr2GDRrX7DX8,8247
|
|
2
|
+
hashserver.py,sha256=LDAYpZI7plcT6I5Lpe-nfFQwC3T54u-Z2-by2vPpBEY,26715
|
|
3
|
+
hashserver-1.0.dist-info/licenses/LICENSE.txt,sha256=aFUunT7WYX_fR7ryljRBKqoltt4dSYe-PB15Hz9GeyA,1117
|
|
4
|
+
hashserver-1.0.dist-info/METADATA,sha256=xc1WzqyX1Bb6Y7IlU9PIVUiLyz7_AYf4IPVtGVZTa3Y,7630
|
|
5
|
+
hashserver-1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
6
|
+
hashserver-1.0.dist-info/entry_points.txt,sha256=APGs23yr75suYAn6nQFAmAUL0GC1MARS6SFntYzhdvo,47
|
|
7
|
+
hashserver-1.0.dist-info/top_level.txt,sha256=oBgEDscAxsuQKVjeI30QOwRavz0iM5MLWh__ffn3mB0,30
|
|
8
|
+
hashserver-1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Author: Sjoerd de Vries, MBI platform.
|
|
4
|
+
Copyright (c) 2023-2026 CNRS.
|
|
5
|
+
|
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
8
|
+
in the Software without restriction, including without limitation the rights
|
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
11
|
+
furnished to do so, subject to the following conditions:
|
|
12
|
+
|
|
13
|
+
The above copyright notice and this permission notice shall be included in all
|
|
14
|
+
copies or substantial portions of the Software.
|
|
15
|
+
|
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
22
|
+
SOFTWARE.
|
hashserver.py
ADDED
|
@@ -0,0 +1,865 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import sys
|
|
3
|
+
import argparse
|
|
4
|
+
import random
|
|
5
|
+
import socket
|
|
6
|
+
import json
|
|
7
|
+
import asyncio
|
|
8
|
+
import logging
|
|
9
|
+
import aiofiles
|
|
10
|
+
import aiofiles.os
|
|
11
|
+
import aiofiles.tempfile
|
|
12
|
+
import contextlib
|
|
13
|
+
import copy
|
|
14
|
+
from dataclasses import dataclass
|
|
15
|
+
from typing import Iterable, List, Optional, Set, Union
|
|
16
|
+
from fastapi import FastAPI, Path, Body, Request
|
|
17
|
+
from fastapi.middleware.cors import CORSMiddleware
|
|
18
|
+
from fastapi.responses import Response, JSONResponse
|
|
19
|
+
from fastapi.exceptions import RequestValidationError
|
|
20
|
+
from fastapi.encoders import jsonable_encoder
|
|
21
|
+
from starlette.requests import ClientDisconnect
|
|
22
|
+
|
|
23
|
+
from functools import partial
|
|
24
|
+
|
|
25
|
+
from hash_file_response import (
|
|
26
|
+
parse_checksum,
|
|
27
|
+
HashFileResponse,
|
|
28
|
+
PrefixHashFileResponse,
|
|
29
|
+
HASH_ALGORITHMS,
|
|
30
|
+
DEFAULT_HASH_ALGORITHM,
|
|
31
|
+
set_hash_algorithm,
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
from typing_extensions import Annotated
|
|
35
|
+
from pydantic.functional_validators import BeforeValidator
|
|
36
|
+
|
|
37
|
+
import anyio
|
|
38
|
+
import pathlib
|
|
39
|
+
import time
|
|
40
|
+
|
|
41
|
+
# Pydantic-annotated checksum type: incoming path/body values are
# normalized via parse_checksum before validation.
Checksum = Annotated[str, BeforeValidator(parse_checksum)]

# Constructor for the configured hash algorithm; reassigned by
# configure_hash_algorithm().
checksum_constructor = HASH_ALGORITHMS[DEFAULT_HASH_ALGORITHM]

# How long (seconds) wait_for_status_file polls before giving up.
STATUS_FILE_WAIT_TIMEOUT = 20.0
# Poll interval (seconds) of the inactivity-shutdown monitor.
INACTIVITY_CHECK_INTERVAL = 1.0


# Shared mutable state for the optional inactivity-shutdown feature;
# populated by setup_inactivity_timeout() and reset on app shutdown.
INACTIVITY_STATE = {
    "timeout": None,
    "last_request": None,
    "task": None,
    "server": None,
}

aiofiles_chmod = aiofiles.os.wrap(os.chmod)  # aiofiles.os lacks chmod
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def calculate_checksum(buffer):
    """Return checksum in the configured hash algorithm."""
    hasher = checksum_constructor(buffer)
    return hasher.digest().hex()
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def calculate_checksum_stream():
    """Create a fresh incremental hasher for the configured algorithm."""
    return checksum_constructor()
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def wait_for_status_file(path: str, timeout: float = STATUS_FILE_WAIT_TIMEOUT):
    """Poll for the JSON status file at *path* and return its contents.

    Exits the process with status 1 when the file does not appear
    within *timeout* seconds, is not valid JSON, or does not contain a
    JSON object.
    """
    deadline = time.monotonic() + timeout
    _unread = object()  # sentinel: file not successfully read yet
    contents = _unread
    while contents is _unread:
        try:
            with open(path, "r", encoding="utf-8") as status_stream:
                contents = json.load(status_stream)
        except FileNotFoundError:
            if time.monotonic() >= deadline:
                print(
                    f"Status file '{path}' not found after {int(timeout)} seconds",
                    file=sys.stderr,
                )
                sys.exit(1)
            time.sleep(0.1)
        except json.JSONDecodeError as exc:
            print(
                f"Status file '{path}' is not valid JSON: {exc}",
                file=sys.stderr,
            )
            sys.exit(1)

    if not isinstance(contents, dict):
        print(
            f"Status file '{path}' must contain a JSON object",
            file=sys.stderr,
        )
        sys.exit(1)

    return contents
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class StatusFileTracker:
    """Maintains a JSON status file that reports this server's state.

    Writes are atomic (temporary file + os.replace), so readers never
    observe a partially written document.
    """

    def __init__(self, path: str, base_contents: dict, port: int):
        self.path = path
        self._base_contents = dict(base_contents)
        self.port = port
        # True once a "running" status has been written at least once.
        self.running_written = False

    def _write(self, payload: dict):
        # Dump to a sibling temp file, then atomically rename over the
        # real status file.
        tmp_path = f"{self.path}.tmp"
        with open(tmp_path, "w", encoding="utf-8") as status_stream:
            json.dump(payload, status_stream)
            status_stream.write("\n")
        os.replace(tmp_path, self.path)

    def write_running(self):
        """Record that the server is up, including its listening port."""
        payload = dict(self._base_contents, port=self.port, status="running")
        self._write(payload)
        self._base_contents = payload
        self.running_written = True

    def write_failed(self):
        """Record that the server failed to start."""
        payload = dict(self._base_contents, status="failed")
        self._write(payload)
|
|
128
|
+
|
|
129
|
+
def raise_startup_error(exc: BaseException):
    """Record startup failure in the status file (if any), then raise *exc*.

    The "failed" status is only written when a tracker exists and a
    "running" status has not yet been recorded.
    """
    tracker = status_tracker
    if tracker and not tracker.running_written:
        tracker.write_failed()
    raise exc
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def configure_hash_algorithm(algorithm: str):
    """Install *algorithm* as the server-wide checksum algorithm.

    Delegates the invalid-name path to raise_startup_error so the
    status file reflects the failed startup.
    """
    global checksum_constructor
    if algorithm not in HASH_ALGORITHMS:
        raise_startup_error(
            RuntimeError(
                f"--hash-algorithm must be one of: {', '.join(HASH_ALGORITHMS.keys())}"
            )
        )
    checksum_constructor = HASH_ALGORITHMS[algorithm]
    # Keep the hash_file_response module in sync with our choice.
    set_hash_algorithm(algorithm)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def configure_lock_timeout(lock_timeout_seconds: float):
    """Apply a positive lock-file timeout to both response classes."""
    if lock_timeout_seconds <= 0:
        raise_startup_error(RuntimeError("--lock-timeout must be a positive number"))
    for response_class in (HashFileResponse, PrefixHashFileResponse):
        response_class.lock_timeout = lock_timeout_seconds
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def setup_inactivity_timeout(timeout_seconds: float, server):
    """Install startup/shutdown hooks that stop *server* after inactivity.

    A background task polls every INACTIVITY_CHECK_INTERVAL seconds and,
    once the last recorded request is older than *timeout_seconds*, asks
    the server to exit by setting ``server.should_exit = True``.

    NOTE(review): assumes request handlers elsewhere in this module
    update INACTIVITY_STATE["last_request"] — confirm against the rest
    of the file.
    """
    INACTIVITY_STATE["timeout"] = timeout_seconds
    INACTIVITY_STATE["server"] = server

    async def monitor_inactivity():
        # Poll until the inactivity deadline passes, then signal exit.
        try:
            while True:
                await asyncio.sleep(INACTIVITY_CHECK_INTERVAL)
                last_request = INACTIVITY_STATE.get("last_request")
                if last_request is None:
                    continue
                if time.monotonic() - last_request >= timeout_seconds:
                    server.should_exit = True
                    break
        except asyncio.CancelledError:
            raise

    async def start_monitor():
        # Seed the activity clock so the server does not shut down
        # immediately after startup.
        INACTIVITY_STATE["last_request"] = time.monotonic()
        loop = asyncio.get_running_loop()
        INACTIVITY_STATE["task"] = loop.create_task(monitor_inactivity())

    async def stop_monitor():
        # Cancel the monitor task and reset all shared state.
        task = INACTIVITY_STATE.get("task")
        if task:
            task.cancel()
            with contextlib.suppress(asyncio.CancelledError):
                await task
        INACTIVITY_STATE["task"] = None
        INACTIVITY_STATE["last_request"] = None
        INACTIVITY_STATE["server"] = None
        INACTIVITY_STATE["timeout"] = None

    app.add_event_handler("startup", start_monitor)
    app.add_event_handler("shutdown", stop_monitor)
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def pick_random_free_port(host: str, start: int, end: int) -> int:
    """Return a randomly chosen TCP port in [start, end] that is free on *host*.

    A port is considered free when a listening socket can be bound to
    it.  Note that another process may claim the port between this
    check and actual use (inherent TOCTOU race).

    Raises:
        RuntimeError: if the range is invalid or no port in it is free.
    """
    if start < 0 or end > 65535:
        raise RuntimeError("--port-range values must be between 0 and 65535")
    if start > end:
        raise RuntimeError("--port-range START must be less than or equal to END")

    # Try each port exactly once, in uniformly random order.  (The
    # previous implementation re-drew random ports until the whole span
    # had been sampled, requiring O(n log n) expected draws.)
    candidates = list(range(start, end + 1))
    random.shuffle(candidates)
    for port in candidates:
        try:
            with socket.create_server((host, port), reuse_port=False):
                pass
        except OSError:
            continue
        return port

    raise RuntimeError(f"No free port available in range {start}-{end}")
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
DEFAULT_LOCK_TIMEOUT = 120.0
|
|
216
|
+
CHUNK_SIZE = 640 * 1024 # for now, hardcoded
|
|
217
|
+
PROMISE_TTL_SECONDS = 10 * 60.0
|
|
218
|
+
|
|
219
|
+
env = os.environ
|
|
220
|
+
as_commandline_tool = True
|
|
221
|
+
status_tracker = None
|
|
222
|
+
status_file_path = None
|
|
223
|
+
status_file_contents = None
|
|
224
|
+
timeout_seconds = None
|
|
225
|
+
|
|
226
|
+
if "HASHSERVER_DIRECTORY" in os.environ:
|
|
227
|
+
directory = os.environ["HASHSERVER_DIRECTORY"]
|
|
228
|
+
writable = False
|
|
229
|
+
if "HASHSERVER_WRITABLE" in os.environ:
|
|
230
|
+
env_writable = os.environ["HASHSERVER_WRITABLE"]
|
|
231
|
+
assert env_writable.lower() in ("true", "false", "0", "1", ""), env_writable
|
|
232
|
+
if env_writable.lower() in ("true", "1"):
|
|
233
|
+
writable = True
|
|
234
|
+
as_commandline_tool = False
|
|
235
|
+
|
|
236
|
+
extra_dirs: list[str] = []
extra_dirs0 = os.environ.get("HASHSERVER_EXTRA_DIRS")
if extra_dirs0:

    def _filt(d):
        """Strip whitespace; map quoted-empty placeholders ('""', '''') to ""."""
        d = d.strip()
        if d == '""' or d == "''":
            return ""
        # BUG FIX: the original function fell off the end here and returned
        # None for every normal entry, so the filter below discarded all of
        # HASHSERVER_EXTRA_DIRS.
        return d

    extra_dirs00 = [_filt(d) for d in extra_dirs0.split(";")]
    extra_dirs = [d for d in extra_dirs00 if d]
|
|
247
|
+
|
|
248
|
+
# Remaining environment-driven settings for the ASGI-launcher mode.
layout = os.environ.get("HASHSERVER_LAYOUT", "prefix")
status_file_path = None
status_file_contents = None
timeout_seconds = None
algorithm = os.environ.get("HASHSERVER_HASH_ALGORITHM", DEFAULT_HASH_ALGORITHM)
configure_hash_algorithm(algorithm)
lock_timeout = os.environ.get("HASHSERVER_LOCK_TIMEOUT")
if lock_timeout is not None:
    try:
        configure_lock_timeout(float(lock_timeout))
    except ValueError:
        # Non-numeric value: abort startup with a clear message.
        raise_startup_error(
            RuntimeError("HASHSERVER_LOCK_TIMEOUT must be a number")
        )
|
|
262
|
+
|
|
263
|
+
else:
    # Command-line configuration path (console-script mode).
    # Guard against being imported by the bare `uvicorn` CLI without any
    # configuration: argv[0] would be the uvicorn executable.
    if (
        len(sys.argv)
        and sys.argv[0].find("uvicorn") > -1
        and not os.path.isdir(sys.argv[0])
    ):
        print(
            "Running hashserver under uvicorn CLI requires at least HASHSERVER_DIRECTORY to be defined",
            file=sys.stderr,
        )
        # NOTE(review): uses the site builtin exit(); sys.exit(1) is the
        # conventional form.
        exit(1)
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "directory",
        help="""Directory where buffers are located.

Buffers have the same file name as their checksum (sha3-256 by default).""",
    )
    parser.add_argument(
        "--extra-dirs",
        help="""Extra directories where read-only buffers are located.

This must be a list of directories separated by semi-colons (;).
If not specified, this argument is read from HASHSERVER_EXTRA_DIRS, if present""",
    )

    parser.add_argument(
        "--writable",
        action="store_true",
        help="Allow HTTP PUT requests",
    )

    # --port and --port-range are mutually exclusive ways to pick the port.
    port_group = parser.add_mutually_exclusive_group()
    port_group.add_argument(
        "--port",
        type=int,
        help="Network port",
    )
    port_group.add_argument(
        "--port-range",
        type=int,
        nargs=2,
        metavar=("START", "END"),
        help="Inclusive port range to select a random free port from",
    )

    parser.add_argument(
        "--host",
        type=str,
        help="Network host",
        default="127.0.0.1",
    )

    parser.add_argument(
        "--layout",
        type=str,
        help="""Directory layout.
One of:
- "flat".
A buffer with checksum CS is stored as file "$DIRECTORY/$CS".

- "prefix".
A buffer with checksum CS is stored as file "$DIRECTORY/$PREFIX/$CS",
where PREFIX is the first two characters of CS.

""",
        default="prefix",
    )

    parser.add_argument(
        "--hash-algorithm",
        type=str,
        choices=tuple(HASH_ALGORITHMS.keys()),
        default=DEFAULT_HASH_ALGORITHM,
        help="Hash algorithm used for checksum calculations (default: %(default)s)",
    )
    parser.add_argument(
        "--lock-timeout",
        type=float,
        default=DEFAULT_LOCK_TIMEOUT,
        help="Wait this many seconds for stale lock files (default: %(default)s)",
    )

    parser.add_argument(
        "--status-file",
        type=str,
        help="JSON file used to report server status",
    )

    parser.add_argument(
        "--timeout",
        type=float,
        help="Stop the server after this many seconds of inactivity",
    )

    args = parser.parse_args()
    directory = args.directory
    writable = args.writable
    extra_dirs = args.extra_dirs
    configure_hash_algorithm(args.hash_algorithm)
    configure_lock_timeout(args.lock_timeout)
    status_file_path = args.status_file
    timeout_seconds = args.timeout
    if status_file_path:
        # presumably blocks until the status file exists — verify against
        # wait_for_status_file's definition (not visible here).
        status_file_contents = wait_for_status_file(status_file_path)
        status_tracker = StatusFileTracker(
            status_file_path, status_file_contents, args.port
        )
    if timeout_seconds is not None and timeout_seconds <= 0:
        raise_startup_error(RuntimeError("--timeout must be a positive number"))
|
|
373
|
+
# Normalize extra_dirs into a list. Both the CLI flag and the environment
# fallback supply a raw ";"-separated string.
# BUG FIX: the original only split the environment fallback; a CLI-provided
# --extra-dirs value stayed a plain string, so later `for extra_dir in
# extra_dirs` loops iterated it character by character.
if not extra_dirs:
    extra_dirs = os.environ.get("HASHSERVER_EXTRA_DIRS")
if extra_dirs:
    extra_dirs = [d.strip() for d in extra_dirs.split(";") if d.strip()]
else:
    extra_dirs = []
|
|
379
|
+
layout = args.layout
if args.port_range:
    # Pick a random currently-free port inside the requested range.
    start, end = args.port_range
    try:
        selected_port = pick_random_free_port(args.host, start, end)
    except BaseException as exc:
        raise_startup_error(exc)
else:
    # Explicit --port, or uvicorn's conventional default of 8000.
    selected_port = args.port if args.port is not None else 8000
args.port = selected_port
if status_tracker:
    # The tracker was created before the port was known; update it.
    status_tracker.port = selected_port
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
# Validate the buffer directory and layout before serving anything.
if not os.path.exists(directory):
    # BUG FIX: the original raised FileExistsError for a *missing* directory;
    # FileNotFoundError is the semantically correct type.
    raise_startup_error(FileNotFoundError(f"Directory '{directory}' does not exist"))
if not os.path.isdir(directory):
    # BUG FIX: likewise, NotADirectoryError instead of FileExistsError.
    raise_startup_error(NotADirectoryError(f"Directory '{directory}' is not a directory"))

# Best effort: group-writable + setgid/sticky so co-operating processes can
# add buffers; ignore failure (e.g. not the owner, read-only mount).
try:
    os.chmod(directory, 0o3775)
except Exception:
    pass

if layout not in ("flat", "prefix"):
    raise_startup_error(RuntimeError("Layout must be 'flat' or 'prefix'"))

if layout == "prefix":
    # Marker file so other tools can detect the prefix layout; best effort.
    prefix_file = os.path.join(directory, ".HASHSERVER_PREFIX")
    try:
        if not os.path.exists(prefix_file):
            with open(prefix_file, "wb") as f:
                f.write(b"1\n")
    except Exception:
        pass
|
|
414
|
+
|
|
415
|
+
# The ASGI application object served by uvicorn (or an external launcher).
app = FastAPI()
LOGGER = logging.getLogger("hashserver")
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(request, exc):
    """Convert request-validation failures into a compact 400 JSON body."""
    # Report only the first validation error, JSON-encodable form.
    first_error = jsonable_encoder(exc.args[0][0])
    # "ctx" may hold objects that do not serialize cleanly; drop it.
    first_error.pop("ctx", None)
    return JSONResponse(
        status_code=400,
        content={"message": "Invalid data", "exception": first_error},
    )
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
@app.exception_handler(FileNotFoundError)
async def filenotfound_exception_handler(request, exc):
    """Map a missing buffer file anywhere in a handler to a plain 404."""
    return Response(content="Not found", status_code=404)
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
@app.exception_handler(RuntimeError)
async def runtime_exception_handler(request, exc):
    """Report RuntimeErrors as 400 responses carrying the exception text."""
    body = {"message": f"{exc}"}
    return JSONResponse(status_code=400, content=body)
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
@app.middleware("http")
async def record_last_request(request: Request, call_next):
    """Stamp activity before and after each request for the inactivity timer."""

    def _touch():
        # Only track activity when an inactivity timeout is configured.
        if INACTIVITY_STATE["timeout"] is not None:
            INACTIVITY_STATE["last_request"] = time.monotonic()

    _touch()
    response = await call_next(request)
    _touch()
    return response
|
|
451
|
+
|
|
452
|
+
|
|
453
|
+
@app.get("/buffer-length")
async def buffer_length(checksums: Annotated[List[Checksum], Body()]) -> JSONResponse:
    """Return, per checksum, the buffer's size in bytes.

    0 means absent; True marks a buffer that is only promised (no size known
    yet). Checks the main directory first, then each extra (read-only, flat)
    directory for checksums still unresolved.
    """
    checksums2 = [parse_checksum(checksum) for checksum in checksums]
    # Don't stat half-written files: wait out any in-flight PUT of these.
    await _wait_for_current_put_requests(checksums2)
    curr_results = [0] * len(checksums)

    async def stat_all(paths):
        # Stat all candidate paths concurrently; a failed stat (missing file)
        # simply leaves the entry at 0.
        futures = [anyio.Path(path).stat() for _, path in paths]
        result0 = await asyncio.gather(*futures, return_exceptions=True)
        # `st` avoids shadowing the module-level `stat` import.
        for (nr, _), st in zip(paths, result0):
            if isinstance(st, Exception):
                continue
            curr_results[nr] = st.st_size

    paths = []
    for nr, checksum in enumerate(checksums2):
        assert isinstance(checksum, str)
        if layout == "prefix":
            path = os.path.join(directory, checksum[:2], checksum)
        else:
            path = os.path.join(directory, checksum)
        paths.append((nr, path))

    await stat_all(paths)

    for extra_dir in extra_dirs:
        # BUG FIX: rebuild the candidate list for each extra dir. The original
        # kept appending to the previous list, re-stat'ing paths that were
        # already resolved, and its `if not len(paths): break` could never
        # fire. This now mirrors _has().
        paths = []
        for nr, checksum in enumerate(checksums2):
            if curr_results[nr]:
                continue
            # Extra dirs always use the flat layout.
            paths.append((nr, os.path.join(extra_dir, checksum)))
        if not paths:
            break
        await stat_all(paths)

    # Promised-but-absent buffers report True rather than a byte count.
    promised = await _promise_registry.promised_indices(checksums2)
    for idx in promised:
        curr_results[idx] = True

    return curr_results
|
|
497
|
+
|
|
498
|
+
|
|
499
|
+
async def _has(checksums: List[Checksum], include_promises: bool) -> List[bool]:
    """Return a boolean per checksum: True if the buffer is on disk, currently
    being uploaded, or (when include_promises) promised and not yet expired.

    Directories are consulted in order: main directory (honoring `layout`),
    then each extra dir (always flat layout), skipping entries already found.
    """
    checksums2 = [parse_checksum(checksum) for checksum in checksums]
    curr_results = [False] * len(checksums)

    # Flag any in-flight uploads immediately.
    for idx, checksum in enumerate(checksums2):
        if checksum in _current_put_requests:
            curr_results[idx] = True

    async def exists_all(paths):
        # Concurrent existence checks; an exception counts as "absent".
        futures = []
        for _, path in paths:
            fut = anyio.Path(path).exists()
            futures.append(fut)
        result0 = await asyncio.gather(*futures, return_exceptions=True)
        for (nr, path), exists in zip(paths, result0):
            if isinstance(exists, Exception):
                continue
            if exists:
                curr_results[nr] = True

    # Candidate paths in the main directory, honoring the configured layout.
    paths = []
    for nr, checksum in enumerate(checksums2):
        assert isinstance(checksum, str)
        if curr_results[nr]:
            continue
        if layout == "prefix":
            prefix = checksum[:2]
            path = os.path.join(directory, prefix, checksum)
        else:
            path = os.path.join(directory, checksum)
        paths.append((nr, path))

    if paths:
        await exists_all(paths)

    # Extra dirs use the flat layout regardless of the main layout; only
    # still-unresolved checksums are re-checked, and the scan stops early
    # once everything is accounted for.
    for extra_dir in extra_dirs:
        paths = []
        for nr, checksum in enumerate(checksums2):
            if curr_results[nr]:
                continue
            path = os.path.join(extra_dir, checksum)
            paths.append((nr, path))
        if not paths:
            break
        await exists_all(paths)

    if include_promises:
        promised = await _promise_registry.promised_indices(checksums2)
        for idx in promised:
            curr_results[idx] = True

    return curr_results
|
|
552
|
+
|
|
553
|
+
|
|
554
|
+
@app.get("/has")
async def has(checksums: Annotated[List[Checksum], Body()]) -> JSONResponse:
    """Presence check that also counts live (unexpired) promises as present."""
    result = await _has(checksums, include_promises=True)
    return result
|
|
557
|
+
|
|
558
|
+
|
|
559
|
+
@app.get("/has-now")
async def has_now(checksums: Annotated[List[Checksum], Body()]) -> JSONResponse:
    """Like /has, but ignores promises: only buffers actually on disk (or
    mid-upload) count.

    BUG FIX: the function was previously also named `has`, silently
    redefining the /has handler's module-level name (flake8 F811). The route
    path is unchanged; only the Python identifier differs.
    """
    return await _has(checksums, include_promises=False)
|
|
562
|
+
|
|
563
|
+
|
|
564
|
+
class PromiseAwareResponseMixin:
    """Mixin that retries serving a file while an upload promise is pending.

    If the underlying response raises FileNotFoundError, wait on the promise
    registry: a resolved promise triggers a retry (the file should now
    exist); no live promise re-raises the original error (-> 404 handler).
    """

    def __init__(self, *, checksum: str, **kwargs):
        # Keep our own copy; the base response class also receives `checksum`.
        self._promise_checksum = checksum
        super().__init__(checksum=checksum, **kwargs)

    async def __call__(self, scope, receive, send):
        while True:
            try:
                await super().__call__(scope, receive, send)
                return
            except FileNotFoundError:
                # wait_for() returns True when the promise resolved (retry),
                # False when there is no live promise (give up).
                should_retry = await _promise_registry.wait_for(self._promise_checksum)
                if not should_retry:
                    raise
|
|
578
|
+
|
|
579
|
+
|
|
580
|
+
class PromiseAwareHashFileResponse(PromiseAwareResponseMixin, HashFileResponse):
    # Flat-layout file response with promise-aware retry (mixin first so its
    # __call__ wraps the base response).
    pass
|
|
582
|
+
|
|
583
|
+
|
|
584
|
+
class PromiseAwarePrefixHashFileResponse(
    PromiseAwareResponseMixin, PrefixHashFileResponse
):
    # Prefix-layout file response with promise-aware retry.
    pass
|
|
588
|
+
|
|
589
|
+
|
|
590
|
+
# Maps the configured directory layout to the response class used by GET.
_response_classes_get_file = {
    "flat": PromiseAwareHashFileResponse,
    "prefix": PromiseAwarePrefixHashFileResponse,
}
|
|
594
|
+
|
|
595
|
+
|
|
596
|
+
@app.get("/healthcheck")
async def healthcheck() -> Response:
    """Liveness probe: always answers 200 with a plain "OK" body."""
    reply = Response(content="OK")
    return reply
|
|
599
|
+
|
|
600
|
+
|
|
601
|
+
@app.get("/{checksum}")
async def get_file(checksum: Annotated[Checksum, Path()]) -> HashFileResponse:
    """Serve the buffer whose file name equals its checksum."""
    checksum2 = parse_checksum(checksum)
    LOGGER.info("GET %s", checksum2)
    # Never serve a half-written file: wait out an in-flight PUT first.
    await _wait_for_current_put_requests((checksum2,))
    # Layout-specific, promise-aware response class (retries on promised
    # files that are not on disk yet).
    ResponseClass = _response_classes_get_file[layout]
    response = ResponseClass(
        directory=directory, checksum=checksum2, extra_dirs=extra_dirs
    )
    return response
|
|
611
|
+
|
|
612
|
+
|
|
613
|
+
async def promise(checksum: Annotated[Checksum, Path()]) -> JSONResponse:
    """Register a promise that `checksum` will be uploaded soon.

    Responds 202 Accepted with the promise's time-to-live. Registered as a
    PUT route only when the server is writable (see below).
    """
    normalized = parse_checksum(checksum)
    await _promise_registry.add(normalized)
    payload = {"checksum": normalized, "expires_in": PROMISE_TTL_SECONDS}
    return JSONResponse(status_code=202, content=payload)
|
|
620
|
+
|
|
621
|
+
|
|
622
|
+
# Checksums with an upload currently in progress. Mutations and waits are
# coordinated through the condition below.
_current_put_requests: set[str] = set()
_current_put_condition = asyncio.Condition()
|
|
624
|
+
|
|
625
|
+
|
|
626
|
+
@dataclass
class _PromiseEntry:
    # Set when the promised buffer arrives (promise resolved).
    event: asyncio.Event
    # time.monotonic() deadline after which the promise counts as expired.
    expires_at: float
|
|
630
|
+
|
|
631
|
+
|
|
632
|
+
class PromiseRegistry:
    """Tracks checksums that clients have promised to upload soon.

    Each promise pairs an asyncio.Event (set on resolution) with a
    monotonic-clock expiry; expired entries are purged lazily under the
    registry lock.
    """

    def __init__(self, ttl_seconds: float = PROMISE_TTL_SECONDS):
        self._ttl_seconds = ttl_seconds
        self._promises: dict[str, _PromiseEntry] = {}
        self._lock = asyncio.Lock()

    def _cleanup_locked(self, now: Optional[float] = None) -> None:
        """Drop expired promises. Caller must already hold self._lock."""
        if now is None:
            now = time.monotonic()
        expired = [
            cs for cs, entry in self._promises.items() if entry.expires_at <= now
        ]
        for checksum in expired:
            self._promises.pop(checksum, None)

    async def add(self, checksum: str) -> float:
        """Register (or refresh) a promise; return its expiry timestamp."""
        now = time.monotonic()
        expires_at = now + self._ttl_seconds
        async with self._lock:
            self._cleanup_locked(now)
            entry = self._promises.get(checksum)
            if entry is None:
                entry = _PromiseEntry(asyncio.Event(), expires_at)
                self._promises[checksum] = entry
            else:
                # Re-promising extends the deadline but reuses the event, so
                # existing waiters are still woken on resolve().
                entry.expires_at = expires_at
        return expires_at

    async def resolve(self, checksum: str) -> None:
        """Mark a promise fulfilled: wake all waiters and forget the entry."""
        async with self._lock:
            entry = self._promises.pop(checksum, None)
            if entry:
                entry.event.set()

    async def promised_indices(self, checksums: List[str]) -> Set[int]:
        """Return indices of checksums that have a live (unexpired) promise."""
        async with self._lock:
            self._cleanup_locked()
            promised = {idx for idx, cs in enumerate(checksums) if cs in self._promises}
        return promised

    async def wait_for(self, checksum: str) -> bool:
        """Block until the promise for `checksum` resolves.

        Returns True once resolved; False when there is no live promise
        (absent, or it expired while waiting).
        """
        while True:
            async with self._lock:
                self._cleanup_locked()
                entry = self._promises.get(checksum)
                if entry is None:
                    return False
                timeout = entry.expires_at - time.monotonic()
                if timeout <= 0:
                    self._promises.pop(checksum, None)
                    return False
                # Grab the event reference while still under the lock; the
                # wait itself must happen outside it.
                event = entry.event
            try:
                await asyncio.wait_for(event.wait(), timeout)
                return True
            except asyncio.TimeoutError:
                # The deadline may have been extended by a concurrent add();
                # re-check under the lock before giving up.
                async with self._lock:
                    current = self._promises.get(checksum)
                    if current is not entry:
                        # Entry was replaced (resolved + re-promised); retry.
                        continue
                    remaining = current.expires_at - time.monotonic()
                    if remaining <= 0:
                        self._promises.pop(checksum, None)
                        return False
                continue
|
|
697
|
+
|
|
698
|
+
|
|
699
|
+
# Module-level singleton shared by all HTTP handlers.
_promise_registry = PromiseRegistry()
|
|
700
|
+
|
|
701
|
+
|
|
702
|
+
async def _wait_for_current_put_requests(checksums: Iterable[str]) -> None:
    """Block until none of the given checksums has a PUT upload in progress."""
    # Accept a single checksum string as well as an iterable of them.
    if isinstance(checksums, (str, bytes)):
        wanted = {checksums}
    else:
        wanted = set(checksums)
    async with _current_put_condition:
        # Woken by notify_all() whenever a PUT finishes.
        while _current_put_requests & wanted:
            await _current_put_condition.wait()
|
|
710
|
+
|
|
711
|
+
|
|
712
|
+
async def put_file(checksum: Annotated[Checksum, Path()], rq: Request) -> Response:
    """Store an uploaded buffer under its checksum.

    The body is streamed into a temp file in the target directory while the
    checksum is computed; on a match the temp file is atomically moved into
    place and made read-only. Responses: 201 (already existed), 202 (another
    upload of the same checksum in progress), 400 (checksum mismatch or
    client disconnect), 200 "OK" on success.
    """
    checksum_str = parse_checksum(checksum)
    LOGGER.info("PUT %s start", checksum_str)

    # Resolve the destination path according to the configured layout.
    if layout == "prefix":
        prefix = checksum_str[:2]
        target_dir = os.path.join(directory, prefix)
    else:
        target_dir = directory
    path = os.path.join(target_dir, checksum_str)

    if layout == "prefix":
        # Create the two-character prefix directory on demand (best effort
        # chmod so co-operating processes can write there too).
        target_directory = anyio.Path(target_dir)
        if not await target_directory.exists():
            await target_directory.mkdir(exist_ok=True)
            await aiofiles_chmod(target_dir, 0o3775)
    if await aiofiles.ospath.exists(path):
        # Content-addressed: an existing file is by definition identical.
        LOGGER.info("PUT %s already exists", checksum_str)
        await aiofiles_chmod(path, 0o444)
        await _promise_registry.resolve(checksum_str)
        return Response(status_code=201)

    ok = False
    added_to_put_requests = False
    cs_stream = calculate_checksum_stream()
    temp_path = None
    buffer_checksum = None
    try:
        # Claim this checksum so concurrent GET/PUT of it wait or back off.
        async with _current_put_condition:
            if checksum_str in _current_put_requests:
                LOGGER.info("PUT %s already in progress", checksum_str)
                return Response(status_code=202)
            _current_put_requests.add(checksum_str)
            added_to_put_requests = True
        # Stream into a temp file in the same directory so the final
        # os.replace below is an atomic same-filesystem rename.
        async with aiofiles.tempfile.NamedTemporaryFile(
            dir=target_dir,
            prefix=checksum_str + "-",
            delete=False,
        ) as file:
            async for chunk in rq.stream():
                cs_stream.update(chunk)
                await file.write(chunk)
            buffer_checksum = cs_stream.hexdigest()
            temp_path = file.name
        if buffer_checksum != checksum_str:
            LOGGER.warning("PUT %s incorrect checksum", checksum_str)
            return Response(status_code=400, content="Incorrect checksum")
        if not await aiofiles.ospath.exists(path):
            try:
                await aiofiles.os.replace(temp_path, path)
            except Exception:
                # If someone else put the file in place meanwhile, that's
                # fine (identical content); otherwise propagate.
                if not await aiofiles.ospath.exists(path):
                    raise
        ok = True
        # Buffers are immutable once stored; best-effort read-only chmod.
        try:
            await aiofiles_chmod(path, 0o444)
        except Exception:
            pass

    except ClientDisconnect:
        LOGGER.warning("PUT %s client disconnected", checksum_str)
        return Response(status_code=400)

    finally:
        # Release the claim and wake any waiters, whatever happened above.
        if added_to_put_requests:
            async with _current_put_condition:
                _current_put_requests.remove(checksum_str)
                _current_put_condition.notify_all()
        # Remove the temp file; after a successful replace it is already
        # gone, hence the tolerated FileNotFoundError.
        if temp_path is not None:
            try:
                await aiofiles.os.unlink(temp_path)
            except FileNotFoundError:
                pass
            except Exception:
                pass
        # NOTE(review): on failure this also unlinks the destination path
        # itself; since we held the claim, nobody else should have created
        # it meanwhile — confirm no external writer shares the directory.
        if added_to_put_requests and not ok:
            try:
                pathlib.Path(path).unlink()
            except FileNotFoundError:
                pass

    if ok:
        LOGGER.info("PUT %s completed", checksum_str)
        await _promise_registry.resolve(checksum_str)
    return Response(content="OK")
|
|
798
|
+
|
|
799
|
+
|
|
800
|
+
# PUT endpoints exist only when the server was configured as writable;
# read-only deployments simply never register them.
if writable:
    put_file = app.put("/{checksum}")(put_file)
    promise = app.put("/promise/{checksum}")(promise)
|
|
803
|
+
|
|
804
|
+
# Allow browser clients from any origin (content-addressed data is
# self-verifying, so permissive CORS is intentional here).
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
|
|
810
|
+
|
|
811
|
+
|
|
812
|
+
def main():
    """Entry point for the console script.

    The server is fully configured and launched as a side effect of module
    import, so this shim only signals success to the caller.
    """
    exit_code = 0
    return exit_code
|
|
815
|
+
|
|
816
|
+
|
|
817
|
+
def _timestamped_log_config():
|
|
818
|
+
try:
|
|
819
|
+
from uvicorn.config import LOGGING_CONFIG
|
|
820
|
+
except Exception: # pragma: no cover - uvicorn import guard
|
|
821
|
+
return None
|
|
822
|
+
|
|
823
|
+
log_config = copy.deepcopy(LOGGING_CONFIG)
|
|
824
|
+
formatters = log_config.get("formatters", {})
|
|
825
|
+
for name in ("default", "access"):
|
|
826
|
+
formatter = formatters.get(name)
|
|
827
|
+
if not formatter:
|
|
828
|
+
continue
|
|
829
|
+
fmt = formatter.get("fmt")
|
|
830
|
+
if fmt:
|
|
831
|
+
formatter["fmt"] = f"%(asctime)s {fmt}"
|
|
832
|
+
else:
|
|
833
|
+
formatter["fmt"] = "%(asctime)s %(message)s"
|
|
834
|
+
return log_config
|
|
835
|
+
|
|
836
|
+
|
|
837
|
+
# Console-script mode: this module runs uvicorn itself. In ASGI-launcher
# mode the external launcher serves `app` and this whole branch is skipped.
if as_commandline_tool:
    import uvicorn

    log_config = _timestamped_log_config()
    config_kwargs = dict(app=app, port=args.port, host=args.host)
    if log_config is not None:
        config_kwargs["log_config"] = log_config
    config = uvicorn.Config(**config_kwargs)
    server = uvicorn.Server(config)

    if status_tracker:

        # Report "running" in the status file once the event loop is up.
        @app.on_event("startup")
        async def _hashserver_status_file_running():
            # File I/O is pushed to a worker thread to keep the loop free.
            await anyio.to_thread.run_sync(status_tracker.write_running)

    if timeout_seconds is not None:
        setup_inactivity_timeout(timeout_seconds, server)

    print("OK")
    try:
        server.run()
    except BaseException:
        # If startup never completed, record the failure in the status file.
        if status_tracker and not status_tracker.running_written:
            status_tracker.write_failed()
        raise
else:
    # uvicorn (or some other ASGI launcher) will take care of it
    pass
|