lsst-resources 29.0.0rc7__py3-none-any.whl → 29.2025.4600__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lsst/resources/_resourceHandles/_davResourceHandle.py +197 -0
- lsst/resources/_resourceHandles/_fileResourceHandle.py +1 -1
- lsst/resources/_resourceHandles/_httpResourceHandle.py +16 -2
- lsst/resources/_resourceHandles/_s3ResourceHandle.py +3 -17
- lsst/resources/_resourcePath.py +448 -81
- lsst/resources/dav.py +912 -0
- lsst/resources/davutils.py +2659 -0
- lsst/resources/file.py +97 -57
- lsst/resources/gs.py +11 -4
- lsst/resources/http.py +229 -62
- lsst/resources/mem.py +7 -1
- lsst/resources/packageresource.py +13 -2
- lsst/resources/s3.py +174 -17
- lsst/resources/s3utils.py +8 -1
- lsst/resources/schemeless.py +6 -3
- lsst/resources/tests.py +140 -12
- lsst/resources/utils.py +74 -1
- lsst/resources/version.py +1 -1
- {lsst_resources-29.0.0rc7.dist-info → lsst_resources-29.2025.4600.dist-info}/METADATA +3 -3
- lsst_resources-29.2025.4600.dist-info/RECORD +31 -0
- {lsst_resources-29.0.0rc7.dist-info → lsst_resources-29.2025.4600.dist-info}/WHEEL +1 -1
- lsst_resources-29.0.0rc7.dist-info/RECORD +0 -28
- {lsst_resources-29.0.0rc7.dist-info → lsst_resources-29.2025.4600.dist-info}/licenses/COPYRIGHT +0 -0
- {lsst_resources-29.0.0rc7.dist-info → lsst_resources-29.2025.4600.dist-info}/licenses/LICENSE +0 -0
- {lsst_resources-29.0.0rc7.dist-info → lsst_resources-29.2025.4600.dist-info}/top_level.txt +0 -0
- {lsst_resources-29.0.0rc7.dist-info → lsst_resources-29.2025.4600.dist-info}/zip-safe +0 -0
lsst/resources/davutils.py
@@ -0,0 +1,2659 @@
# This file is part of lsst-resources.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (https://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# Use of this source code is governed by a 3-clause BSD-style
# license that can be found in the LICENSE file.

from __future__ import annotations

import base64
import enum
import io
import json
import logging
import os
import posixpath
import random
import re
import stat
import threading
import time
import xml.etree.ElementTree as eTree
from datetime import datetime
from http import HTTPStatus
from typing import Any, BinaryIO

try:
    import fsspec
    from fsspec.spec import AbstractFileSystem
except ImportError:
    fsspec = None
    AbstractFileSystem = type

import yaml
from astropy import units as u
from urllib3 import PoolManager
from urllib3.response import HTTPResponse
from urllib3.util import Retry, Timeout, Url, parse_url

from lsst.utils.timer import time_this

# Use the same logger as `dav.py`.
log = logging.getLogger(__name__.replace(".davutils", ".dav"))


def normalize_path(path: str | None) -> str:
    """Normalize a path intended to be part of a URL.

    A path of the form "///a/b/c///../d/e/" would be normalized as "/a/b/d/e".
    The returned path is always absolute, i.e. it starts with "/" and never
    ends with "/" except when the path is exactly "/". It contains neither
    "." nor ".." components, nor consecutive "/".

    Parameters
    ----------
    path : `str`, optional
        Path to normalize (e.g., '/path/to/..///normalize/').

    Returns
    -------
    url : `str`
        Normalized URL (e.g., '/path/normalize').
    """
    return "/" if not path else "/" + posixpath.normpath(path).lstrip("/")


def normalize_url(url: str, preserve_scheme: bool = False, preserve_path: bool = True) -> str:
    """Normalize a URL so that its scheme is 'http' or 'https' and its path
    is normalized.

    Parameters
    ----------
    url : `str`
        URL to normalize (e.g., 'davs://example.org:1234///path/to//../dir/').
    preserve_scheme : `bool`
        If True, the scheme of `url` is preserved. Otherwise the scheme
        of the returned normalized URL is 'http' or 'https'.
    preserve_path : `bool`
        If True, the path of `url` is preserved in the returned
        normalized URL; otherwise, the returned URL has '/' as its path.

    Returns
    -------
    url : `str`
        Normalized URL (e.g. 'https://example.org:1234/path/dir').
    """
    parsed = parse_url(url)
    if parsed.scheme is None:
        scheme = "http"
    else:
        scheme = parsed.scheme if preserve_scheme else parsed.scheme.replace("dav", "http")
    path = normalize_path(parsed.path) if preserve_path else "/"
    return Url(scheme=scheme, host=parsed.host, port=parsed.port, path=path).url
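
As a quick illustration (not part of the shipped file): both helpers are pure functions, so the behaviour documented above can be checked directly. The expected values below follow from the implementation.

    normalize_path("///a/b/c///../d/e/")  # -> "/a/b/d/e"
    normalize_path(None)                  # -> "/"
    normalize_url("davs://example.org:1234///path/to//../dir/")
    # -> "https://example.org:1234/path/dir"
    normalize_url("davs://example.org:1234/path/dir", preserve_path=False)
    # -> "https://example.org:1234/"
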
class DavConfig:
    """Configurable settings a webDAV client must use when interacting with a
    particular storage endpoint.

    Parameters
    ----------
    config : `dict[str, str]`
        Dictionary of configurable settings for the webdav endpoint whose
        base URL is `config["base_url"]`.

        For instance, if `config["base_url"]` is

        "davs://webdav.example.org:1234/"

        any object of class `DavResourcePath` like

        "davs://webdav.example.org:1234/path/to/any/file"

        will use the settings in this configuration to configure its client.
    """

    # Timeout in seconds to establish a network connection with the remote
    # server.
    DEFAULT_TIMEOUT_CONNECT: float = 10.0

    # Timeout in seconds to read the response to a request sent to a server.
    # This is the total time for reading both the headers and the response
    # body. It must be large enough to allow for upload and download of files
    # of the typical size the webdav client supports.
    DEFAULT_TIMEOUT_READ: float = 300.0

    # Maximum number of network connections to persist against each one of
    # the hosts in the frontend and backend server pools.
    # Servers in the frontend pool typically respond to requests such as
    # OPTIONS, PROPFIND, MKCOL, etc.
    #
    # Frontend servers redirect to backend servers to respond to GET and PUT
    # requests (e.g. dCache) but sometimes also for metadata requests such as
    # PROPFIND or HEAD (e.g. XRootD).
    DEFAULT_PERSISTENT_CONNECTIONS_FRONTEND: int = 50
    DEFAULT_PERSISTENT_CONNECTIONS_BACKEND: int = 100

    # Size of the buffer (in mebibytes, i.e. 1024*1024 bytes) the webdav
    # client of this endpoint will use when sending requests and receiving
    # responses.
    DEFAULT_BUFFER_SIZE: int = 5

    # Number of times to retry requests before failing. Retry happens only
    # under certain conditions.
    DEFAULT_RETRIES: int = 3

    # Minimal and maximal retry backoff (in seconds) for the client to compute
    # the wait time before retrying a request.
    # A value in this interval is randomly selected as the backoff factor
    # every time a request is retried.
    DEFAULT_RETRY_BACKOFF_MIN: float = 1.0
    DEFAULT_RETRY_BACKOFF_MAX: float = 3.0

    # Path to a directory or certificate bundle file where the certificates
    # of the trusted certificate authorities can be found.
    # Those certificates will be used by the client of the webdav endpoint
    # to verify the server's host certificate.
    # If None, the certificates trusted by the system are used.
    DEFAULT_TRUSTED_AUTHORITIES: str | None = None

    # Path to the client certificate and associated private key the webdav
    # client must present to the server for authentication purposes.
    # If None, no client certificate is presented.
    DEFAULT_USER_CERT: str | None = None
    DEFAULT_USER_KEY: str | None = None

    # Token the webdav client must send to the server for authentication
    # purposes. The token may be the value of the token itself or the path
    # to a file where the token can be found.
    DEFAULT_TOKEN: str | None = None

    # Default checksum algorithm to request the server to compute on every
    # file upload. Not all servers support this.
    # See RFC 3230 for details.
    DEFAULT_REQUEST_CHECKSUM: str | None = None

    # If this option is set to True, the webdav client can return objects
    # compliant with the fsspec specification.
    # See: https://filesystem-spec.readthedocs.io
    DEFAULT_ENABLE_FSSPEC: bool = True

    # If this option is set to True, memory usage is computed and reported
    # when executing in debug mode. Computing memory usage is costly, so only
    # set this when debugging.
    DEFAULT_COLLECT_MEMORY_USAGE: bool = False

    # Accepted checksum algorithms. Must be lowercase.
    ACCEPTED_CHECKSUMS: list[str] = ["adler32", "md5", "sha-256", "sha-512"]

    def __init__(self, config: dict | None = None) -> None:
        if config is None:
            config = {}

        if (base_url := config.get("base_url")) is None:
            self._base_url = "_default_"
        else:
            self._base_url = normalize_url(base_url, preserve_path=False)

        self._timeout_connect: float = float(config.get("timeout_connect", DavConfig.DEFAULT_TIMEOUT_CONNECT))
        self._timeout_read: float = float(config.get("timeout_read", DavConfig.DEFAULT_TIMEOUT_READ))
        self._persistent_connections_frontend: int = int(
            config.get(
                "persistent_connections_frontend",
                DavConfig.DEFAULT_PERSISTENT_CONNECTIONS_FRONTEND,
            )
        )
        self._persistent_connections_backend: int = int(
            config.get(
                "persistent_connections_backend",
                DavConfig.DEFAULT_PERSISTENT_CONNECTIONS_BACKEND,
            )
        )
        self._buffer_size: int = 1_048_576 * int(config.get("buffer_size", DavConfig.DEFAULT_BUFFER_SIZE))
        self._retries: int = int(config.get("retries", DavConfig.DEFAULT_RETRIES))
        self._retry_backoff_min: float = float(
            config.get("retry_backoff_min", DavConfig.DEFAULT_RETRY_BACKOFF_MIN)
        )
        self._retry_backoff_max: float = float(
            config.get("retry_backoff_max", DavConfig.DEFAULT_RETRY_BACKOFF_MAX)
        )
        self._trusted_authorities: str | None = expand_vars(
            config.get("trusted_authorities", DavConfig.DEFAULT_TRUSTED_AUTHORITIES)
        )
        self._user_cert: str | None = expand_vars(config.get("user_cert", DavConfig.DEFAULT_USER_CERT))
        self._user_key: str | None = expand_vars(config.get("user_key", DavConfig.DEFAULT_USER_KEY))
        self._token: str | None = expand_vars(config.get("token", DavConfig.DEFAULT_TOKEN))
        self._enable_fsspec: bool = config.get("enable_fsspec", DavConfig.DEFAULT_ENABLE_FSSPEC)
        self._collect_memory_usage: bool = config.get(
            "collect_memory_usage", DavConfig.DEFAULT_COLLECT_MEMORY_USAGE
        )
        self._request_checksum: str | None = config.get(
            "request_checksum", DavConfig.DEFAULT_REQUEST_CHECKSUM
        )
        if self._request_checksum is not None:
            self._request_checksum = self._request_checksum.lower()
            if self._request_checksum not in DavConfig.ACCEPTED_CHECKSUMS:
                raise ValueError(
                    f"Value for checksum algorithm {self._request_checksum} for storage endpoint "
                    f"{self._base_url} is not among the accepted values: {DavConfig.ACCEPTED_CHECKSUMS}"
                )

    @property
    def base_url(self) -> str:
        return self._base_url

    @property
    def timeout_connect(self) -> float:
        return self._timeout_connect

    @property
    def timeout_read(self) -> float:
        return self._timeout_read

    @property
    def persistent_connections_frontend(self) -> int:
        return self._persistent_connections_frontend

    @property
    def persistent_connections_backend(self) -> int:
        return self._persistent_connections_backend

    @property
    def buffer_size(self) -> int:
        return self._buffer_size

    @property
    def retries(self) -> int:
        return self._retries

    @property
    def retry_backoff_min(self) -> float:
        return self._retry_backoff_min

    @property
    def retry_backoff_max(self) -> float:
        return self._retry_backoff_max

    @property
    def trusted_authorities(self) -> str | None:
        return self._trusted_authorities

    @property
    def token(self) -> str | None:
        return self._token

    @property
    def request_checksum(self) -> str | None:
        return self._request_checksum

    @property
    def user_cert(self) -> str | None:
        return self._user_cert

    @property
    def user_key(self) -> str | None:
        # If no user certificate was specified in the configuration,
        # ignore the private key, even if it was provided.
        if self._user_cert is None:
            return None

        # If we have a user certificate but not a private key, assume the
        # private key is included in the same file as the user certificate.
        # That is typically the case when using an X.509 grid proxy as
        # client certificate.
        return self._user_cert if self._user_key is None else self._user_key

    @property
    def enable_fsspec(self) -> bool:
        return self._enable_fsspec

    @property
    def collect_memory_usage(self) -> bool:
        return self._collect_memory_usage
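
For illustration (a sketch, not part of the module; the endpoint URL is hypothetical), constructing a `DavConfig` directly from a settings dictionary exercises the normalization and validation performed in `__init__` above:

    config = DavConfig(
        {
            "base_url": "davs://webdav.example.org:1234/",
            "timeout_connect": 5.0,
            "request_checksum": "MD5",  # lowercased and validated on init
        }
    )
    config.base_url          # -> "https://webdav.example.org:1234/"
    config.request_checksum  # -> "md5"
    config.timeout_read      # -> 300.0, the class default
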
class DavConfigPool:
    """Registry of configurable settings for all known webDAV endpoints.

    Parameters
    ----------
    filename : `str`, optional
        Name of an environment variable whose value is the path of the
        configuration file to load. If the variable is set, the file it
        points to is read and the configuration settings for all webDAV
        endpoints are extracted from it.

        The value of that variable can itself include environment variables
        or "~", e.g. '$HOME/path/to/config.yaml'.

        The configuration file is a YAML file with the structure below:

        - base_url: "davs://webdav1.example.org:1234/"
          persistent_connections_frontend: 10
          persistent_connections_backend: 100
          timeout_connect: 20.0
          timeout_read: 120.0
          retries: 3
          retry_backoff_min: 1.0
          retry_backoff_max: 3.0
          user_cert: "${X509_USER_PROXY}"
          user_key: "${X509_USER_PROXY}"
          token: "/path/to/bearer/token/file"
          trusted_authorities: "/etc/grid-security/certificates"
          buffer_size: 5
          enable_fsspec: false
          request_checksum: "md5"
          collect_memory_usage: false

        - base_url: "davs://webdav2.example.org:1234/"
          persistent_connections_frontend: 5
          ...

        All settings are optional. If no settings are found in the
        configuration file for a particular webDAV endpoint, sensible
        defaults will be used.

    Notes
    -----
    There is only a single instance of this class. This thread-safe
    singleton is intended to be initialized when the module is imported
    the first time.
    """

    _instance = None
    _lock = threading.Lock()

    def __new__(cls, filename: str | None = None) -> DavConfigPool:
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    cls._instance = super().__new__(cls)

        return cls._instance

    def __init__(self, filename: str | None = None) -> None:
        # Create a default configuration. This configuration is
        # used when a URL does not match any of the endpoints in the
        # configuration.
        self._default_config: DavConfig = DavConfig()

        # The key of this dictionary is the URL of the webDAV endpoint,
        # e.g. "davs://host.example.org:1234/"
        self._configs: dict[str, DavConfig] = {}

        # Load the configuration from the file we have been provided with,
        # if any.
        if filename is None:
            return

        # filename is the name of an environment variable. Its value, the
        # path of the configuration file, can include environment variables
        # (e.g. "$HOME/path/to/config.yaml") or "~"
        # (e.g. "~/path/to/config.yaml")
        if (filename := os.getenv(filename)) is not None:
            # Expand environment variables and '~' in the file name, if any.
            filename = os.path.expandvars(filename)
            filename = os.path.expanduser(filename)
            with open(filename) as file:
                for config_item in yaml.safe_load(file):
                    config = DavConfig(config_item)
                    if config.base_url not in self._configs:
                        self._configs[config.base_url] = config
                    else:
                        # We already have a configuration for the same
                        # endpoint. That is likely a human error in
                        # the configuration file.
                        raise ValueError(
                            f"configuration file {filename} contains two configurations for "
                            f"endpoint {config.base_url}"
                        )

    def get_config_for_url(self, url: str) -> DavConfig:
        """Return the configuration a webDAV client must use when interacting
        with the server which hosts the resource at `url`.

        Parameters
        ----------
        url : `str`
            URL for which to obtain a configuration.
        """
        # Select the configuration for the endpoint of the provided URL.
        normalized_url: str = normalize_url(url, preserve_path=False)
        if (config := self._configs.get(normalized_url)) is not None:
            return config

        # No config was found for the specified URL. Use the default.
        return self._default_config

    def _destroy(self) -> None:
        """Destroy this class singleton instance.

        Helper method to be used in tests to reset the global configuration.
        """
        with DavConfigPool._lock:
            DavConfigPool._instance = None
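
Given the loading logic above, wiring the pool to a configuration file is a matter of exporting an environment variable before first use. A sketch, where the variable name DAV_CONFIG_FILE is purely illustrative (the real name is whatever the caller passes as `filename`):

    import os

    os.environ["DAV_CONFIG_FILE"] = "$HOME/dav-config.yaml"
    pool = DavConfigPool(filename="DAV_CONFIG_FILE")

    # Settings from the matching "base_url" entry, or the defaults if the
    # endpoint is absent from the file.
    config = pool.get_config_for_url("davs://webdav1.example.org:1234/any/path")
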
def make_retry(config: DavConfig) -> Retry:
    """Create a ``urllib3.util.Retry`` object from settings in `config`.

    Parameters
    ----------
    config : `DavConfig`
        Configurable settings for a webDAV storage endpoint.

    Returns
    -------
    retry : `urllib3.util.Retry`
        Retry object to be used when creating a ``urllib3.PoolManager``.
    """
    backoff_min: float = config.retry_backoff_min
    backoff_max: float = config.retry_backoff_max
    retry = Retry(
        # Total number of retries to allow. Takes precedence over other
        # counts.
        total=2 * config.retries,
        # How many connection-related errors to retry on.
        connect=config.retries,
        # How many times to retry on read errors.
        read=config.retries,
        # Backoff factor to apply between attempts after the second try
        # (seconds). Compute a random jitter to prevent all the clients which
        # started at the same time (even on different hosts) from overwhelming
        # the server by sending requests at the same time.
        backoff_factor=backoff_min + (backoff_max - backoff_min) * random.random(),
        # How many times to retry on bad status codes.
        status=config.retries,
        # Set of uppercased HTTP method verbs that we should retry on.
        # We only automatically retry idempotent requests.
        allowed_methods=frozenset(
            [
                "COPY",
                "DELETE",
                "GET",
                "HEAD",
                "MKCOL",
                "OPTIONS",
                "PROPFIND",
                "PUT",
            ]
        ),
        # HTTP status codes that we should force a retry on.
        status_forcelist=frozenset(
            [
                HTTPStatus.TOO_MANY_REQUESTS,  # 429
                HTTPStatus.INTERNAL_SERVER_ERROR,  # 500
                HTTPStatus.BAD_GATEWAY,  # 502
                HTTPStatus.SERVICE_UNAVAILABLE,  # 503
                HTTPStatus.GATEWAY_TIMEOUT,  # 504
            ]
        ),
        # Whether to respect the "Retry-After" header on the status codes
        # defined above.
        respect_retry_after_header=True,
    )
    return retry
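
Note that the jitter is drawn once per `Retry` object rather than per attempt: a single backoff factor in [retry_backoff_min, retry_backoff_max] is chosen, and urllib3 then grows the wait geometrically under its documented schedule (backoff_factor * 2 ** (n - 1) seconds before the n-th retry, with no sleep before the first). A worked example of the resulting schedule:

    # With retry_backoff_min=1.0, retry_backoff_max=3.0 and a drawn factor
    # of, say, 2.0:
    factor = 2.0
    waits = [0.0] + [factor * 2 ** (n - 1) for n in range(2, 5)]
    # waits == [0.0, 4.0, 8.0, 16.0]  seconds slept before retries 1 to 4
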
class DavClientPool:
    """Container of reusable webDAV clients, each one specifically configured
    to talk to a single storage endpoint.

    Parameters
    ----------
    config_pool : `DavConfigPool`
        Pool of all known webDAV client configurations.

    Notes
    -----
    There is a single instance of this class. This thread-safe singleton is
    intended to be initialized when the module is imported the first time.
    """

    _instance = None
    _lock = threading.Lock()

    def __new__(cls, config_pool: DavConfigPool) -> DavClientPool:
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    cls._instance = super().__new__(cls)

        return cls._instance

    def __init__(self, config_pool: DavConfigPool) -> None:
        self._config_pool: DavConfigPool = config_pool

        # The key of this dictionary is a path-stripped URL of the form
        # "davs://host.example.org:1234/". The value is a reusable
        # DavClient to interact with that endpoint.
        self._clients: dict[str, DavClient] = {}

    def get_client_for_url(self, url: str) -> DavClient:
        """Return a client for interacting with the endpoint where `url`
        is hosted.

        Parameters
        ----------
        url : `str`
            URL for which to obtain a client.

        Notes
        -----
        The returned client is thread-safe. If a client for that endpoint
        already exists it is reused; otherwise a new client is created
        with the appropriate configuration for interacting with the storage
        endpoint.
        """
        # If we already have a client for this endpoint, reuse it.
        url = normalize_url(url, preserve_path=False)
        if (client := self._clients.get(url)) is not None:
            return client

        # No client for this endpoint was found. Create a new one and save it
        # for serving subsequent requests.
        with DavClientPool._lock:
            # If another client was created in the meantime by another thread,
            # reuse it.
            if (client := self._clients.get(url)) is not None:
                return client

            config: DavConfig = self._config_pool.get_config_for_url(url)
            self._clients[url] = self._make_client(url, config)

        return self._clients[url]

    def _make_client(self, url: str, config: DavConfig) -> DavClient:
        """Make a webDAV client for interacting with the server at `url`."""
        # Check that the server implements the webDAV protocol and retrieve
        # its identity so that we can build a client for that specific
        # server implementation.
        client = DavClient(url, config)
        server_details = client.get_server_details(url)
        server_id = server_details.get("Server", None)
        accepts_ranges: bool | str | None = server_details.get("Accept-Ranges", None)
        if accepts_ranges is not None:
            accepts_ranges = accepts_ranges == "bytes"

        if server_id is None:
            # Create a generic webDAV client
            return DavClient(url, config, accepts_ranges)

        if server_id.startswith("dCache/"):
            # Create a client for a dCache webDAV server
            return DavClientDCache(url, config, accepts_ranges)
        elif server_id.startswith("XrootD/"):
            # Create a client for a XrootD webDAV server
            return DavClientXrootD(url, config, accepts_ranges)
        else:
            # Return a generic webDAV client
            return DavClient(url, config, accepts_ranges)

    def _destroy(self) -> None:
        """Destroy this class singleton instance.

        Helper method to be used in tests to reset the global configuration.
        """
        with DavClientPool._lock:
            DavClientPool._instance = None
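
How the pieces compose, as a sketch (the host is hypothetical, and this section does not show how `dav.py` actually wires the singletons):

    config_pool = DavConfigPool(filename="DAV_CONFIG_FILE")
    client_pool = DavClientPool(config_pool)

    # Both URLs normalize to the endpoint "https://host.example.org:1234/",
    # so the same client instance (generic, dCache- or XrootD-flavoured,
    # depending on the advertised "Server" header) serves both.
    c1 = client_pool.get_client_for_url("davs://host.example.org:1234/a/file")
    c2 = client_pool.get_client_for_url("davs://host.example.org:1234/b/file")
    assert c1 is c2
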
class DavClient:
    """WebDAV client, configured to talk to a single storage endpoint.

    Instances of this class are thread-safe.

    Parameters
    ----------
    url : `str`
        Root URL of the storage endpoint (e.g.
        "https://host.example.org:1234/").
    config : `DavConfig`
        Configuration to initialize this client.
    accepts_ranges : `bool` or `None`
        Indicates whether the remote server accepts the ``Range`` header in
        GET requests.
    """

    def __init__(self, url: str, config: DavConfig, accepts_ranges: bool | None = None) -> None:
        # Lock to protect this client's fields from concurrent modification.
        self._lock = threading.Lock()

        # Configuration for the storage endpoint.
        self._config: DavConfig = config

        # Prepare the trusted authorities certificates
        ca_certs, ca_cert_dir = None, None
        if self._config.trusted_authorities is not None:
            if os.path.isdir(self._config.trusted_authorities):
                ca_cert_dir = self._config.trusted_authorities
            elif os.path.isfile(self._config.trusted_authorities):
                ca_certs = self._config.trusted_authorities
            else:
                raise FileNotFoundError(
                    f"Trusted authorities file or directory {self._config.trusted_authorities} does not exist"
                )

        # If a token was specified for this endpoint, prefer it as the
        # authentication method, instead of a <user certificate, private key>
        # pair, even if they were also specified.
        self._authorizer: TokenAuthorizer | None = None
        if self._config.token is not None:
            self._authorizer = TokenAuthorizer(self._config.token)
            user_cert, user_key = None, None
        else:
            user_cert = self._config.user_cert
            user_key = self._config.user_key

        # We use this pool manager for sending requests that the frontend
        # server typically responds to directly without redirecting (e.g.
        # OPTIONS, HEAD, etc.)
        #
        # Connections in this pool are generally left open by the client but
        # the front-end server may choose to close them in some specific
        # situations (e.g. PUT request with "Expect: 100-continue" header).
        self._frontend = PoolManager(
            # Number of connection pools to cache before discarding the least
            # recently used pool. Each connection pool manages network
            # connections to a single host, so this is basically the number
            # of "host:port" we persist network connections to.
            num_pools=10,
            # Number of connections to the same "host:port" to persist for
            # later reuse. More than 1 is useful in multithreaded situations.
            # If more than this number of network connections are needed at
            # a particular moment, they will be created and used but not
            # persisted.
            maxsize=self._config.persistent_connections_frontend,
            # Retry configuration to use by default with requests sent to
            # hosts in the frontend.
            retries=make_retry(self._config),
            # Socket timeout in seconds for each individual connection.
            timeout=Timeout(
                connect=self._config.timeout_connect,
                read=self._config.timeout_read,
            ),
            # Size in bytes of the buffer for reading/writing data from/to
            # the underlying socket.
            blocksize=self._config.buffer_size,
            # Client certificate and private key for establishing TLS
            # connections. If None, no client certificate is sent to the
            # server. Only relevant for endpoints using secure HTTP protocol.
            cert_file=user_cert,
            key_file=user_key,
            # We require verification of the server certificate.
            cert_reqs="CERT_REQUIRED",
            # Directory where the certificates of the trusted certificate
            # authorities can be found. The contents of that directory
            # must be as expected by OpenSSL.
            ca_cert_dir=ca_cert_dir,
            # Path to a file of concatenated CA certificates in PEM format.
            ca_certs=ca_certs,
        )

        # We use this pool manager to send requests to the backend hosts.
        # Those requests are typically 'GET' and 'PUT'. The backend servers
        # typically leave the connection open after serving the request,
        # but we want the client to have the possibility to close them
        # when there is no benefit in persisting those connections.
        #
        # That is the case, for instance, when the backend servers use a
        # range of ports for listening for new connections. In that case
        # it is likely that a connection to the same pair
        # <backend server, port number>
        # is not going to be reused in a short interval of time.
        self._backend = PoolManager(
            num_pools=100,
            maxsize=self._config.persistent_connections_backend,
            retries=make_retry(self._config),
            timeout=Timeout(
                connect=self._config.timeout_connect,
                read=self._config.timeout_read,
            ),
            blocksize=self._config.buffer_size,
            cert_file=user_cert,
            key_file=user_key,
            cert_reqs="CERT_REQUIRED",
            ca_cert_dir=ca_cert_dir,
            ca_certs=ca_certs,
        )

        # Parser of PROPFIND responses.
        self._propfind_parser: DavPropfindParser = DavPropfindParser()

        # Does the remote server accept the "Range" header in GET requests?
        # This field is lazily initialized.
        self._accepts_ranges: bool | None = accepts_ranges

        # Base URL of the server this is a client for. It is of the form:
        # "davs://host.example.org:1234/"
        self._base_url: str = url

    def get_server_details(self, url: str) -> dict[str, str]:
        """Retrieve the details of the server and check that it advertises
        compliance with class 1 of the webDAV protocol.

        Parameters
        ----------
        url : `str`
            URL to check.

        Returns
        -------
        details : `dict[str, str]`
            The keys of the returned dictionary can be "Server" and
            "Accept-Ranges". Either key may be absent from the returned
            dictionary if the server did not include the corresponding
            header in its response.

            The values are the values of the corresponding
            headers found in the response to the OPTIONS request.
            Examples of values for the "Server" header are 'dCache/9.2.4' or
            'XrootD/v5.7.1'.
        """
        # Check that the value "1" is part of the value of the "DAV" header in
        # the response to an 'OPTIONS' request.
        #
        # We don't rely on webDAV locks, so a server complying with class 1 is
        # enough for our purposes. All webDAV servers must advertise at least
        # compliance class "1".
        #
        # Compliance classes are documented in
        # http://www.webdav.org/specs/rfc4918.html#dav.compliance.classes
        #
        # Examples of values for header DAV are:
        #    DAV: 1, 2
        #    DAV: 1, <http://apache.org/dav/propset/fs/1>
        resp = self._options(url)
        if "DAV" not in resp.headers:
            raise ValueError(f"Server of {resp.geturl()} does not implement webDAV protocol")

        if "1" not in resp.headers.get("DAV").replace(" ", "").split(","):
            raise ValueError(
                f"Server of {resp.geturl()} does not advertise required compliance to webDAV protocol class 1"
            )

        # The value of the 'Server' header is expected to be of the form
        # 'dCache/9.2.4' or 'XrootD/v5.7.1'. Not all servers include such a
        # header in their response to an OPTIONS request. If a header is
        # absent from the response, it is omitted from the returned
        # dictionary.
        details: dict[str, str] = {}
        for header in ("Server", "Accept-Ranges"):
            value = resp.headers.get(header, None)
            if value is not None:
                details[header] = value

        return details
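
The class 1 check above reduces to plain string handling on the DAV header. For instance, for the second example value quoted in the comments:

    dav_header = "1, <http://apache.org/dav/propset/fs/1>"
    classes = dav_header.replace(" ", "").split(",")
    # classes == ["1", "<http://apache.org/dav/propset/fs/1>"]
    assert "1" in classes
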
    def _options(self, url: str) -> HTTPResponse:
        """Send a HTTP OPTIONS request and return the response.

        Parameters
        ----------
        url : `str`
            Target URL.
        """
        resp = self._request("OPTIONS", url)
        if resp.status in (HTTPStatus.OK, HTTPStatus.CREATED):
            return resp
        else:
            raise ValueError(
                f"Unexpected response to OPTIONS request to {resp.geturl()}: status {resp.status} "
                f"{resp.reason}"
            )

    def _request(
        self,
        method: str,
        url: str,
        headers: dict[str, str] | None = None,
        body: BinaryIO | bytes | str | None = None,
        pool_manager: PoolManager | None = None,
        preload_content: bool = True,
        redirect: bool = True,
    ) -> HTTPResponse:
        """Send a generic HTTP request and return the response.

        Parameters
        ----------
        method : `str`
            Request method, e.g. 'GET', 'PUT', 'PROPFIND'.
        url : `str`
            Target URL.
        headers : `dict[str, str]`, optional
            Headers to send with the request.
        body : `BinaryIO` or `bytes` or `str` or `None`, optional
            Request body.
        pool_manager : `PoolManager`, optional
            Pool manager to use to send the request. By default, requests
            are sent to the frontend servers.
        preload_content : `bool`, optional
            If True, the response body is downloaded and can be retrieved
            via the returned response's `.data` property. If False, the
            caller needs to call `.read()` on the returned response object to
            download the body, either entirely in one call or in chunks.
        redirect : `bool`, optional
            If True, automatically handle redirects. If False, the returned
            response may contain a redirection to another location.

        Returns
        -------
        resp : `HTTPResponse`
            Response to the request as received from the server.
        """
        # If this client is configured to use a bearer token for
        # authentication, ensure we only add the token to requests over
        # secure HTTP to avoid leaking the token.
        headers = {} if headers is None else dict(headers)
        if self._authorizer is not None and url.startswith("https://"):
            self._authorizer.set_authorization(headers)

        # By default, send the request to a frontend server.
        if pool_manager is None:
            pool_manager = self._frontend

        log.debug("sending request %s %s", method, url)

        with time_this(
            log,
            msg="%s %s",
            args=(
                method,
                url,
            ),
            mem_usage=self._config.collect_memory_usage,
            mem_unit=u.mebibyte,
        ):
            resp = pool_manager.request(
                method,
                url,
                body=body,
                headers=headers,
                preload_content=preload_content,
                redirect=redirect,
            )

        return resp

    def _get(
        self, url: str, headers: dict[str, str] | None = None, preload_content: bool = True
    ) -> HTTPResponse:
        """Send a HTTP GET request.

        Parameters
        ----------
        url : `str`
            Target URL.
        headers : `dict[str, str]`, optional
            Headers to send with the request.
        preload_content : `bool`, optional
            If True, the response body is downloaded and can be retrieved
            via the returned response's `.data` property. If False, the
            caller needs to call `.read()` on the returned response
            object to download the body.

        Returns
        -------
        resp : `HTTPResponse`
            Response to the GET request as received from the server.
        """
        # Send the GET request to the frontend servers. We handle redirections
        # ourselves.
        headers = {} if headers is None else dict(headers)
        resp = self._request("GET", url, headers=headers, preload_content=preload_content, redirect=False)
        if resp.status in (HTTPStatus.OK, HTTPStatus.PARTIAL_CONTENT):
            return resp

        if resp.status == HTTPStatus.NOT_FOUND:
            raise FileNotFoundError(f"No file found at {resp.geturl()}")

        redirect_location = resp.get_redirect_location()
        if redirect_location is None or redirect_location is False:
            raise ValueError(
                f"Unexpected error in HTTP GET {resp.geturl()}: status {resp.status} {resp.reason}"
            )

        # We were redirected to a backend server so follow the redirection.
        # The response body is automatically downloaded when
        # `preload_content` is true and the underlying network connection
        # may be kept open for future reuse if the maximum number of
        # connections for the backend pool is not reached.
        url = redirect_location
        resp = self._request(
            "GET",
            url,
            headers=headers,
            pool_manager=self._backend,
            preload_content=preload_content,
        )
        if resp.status not in (HTTPStatus.OK, HTTPStatus.PARTIAL_CONTENT):
            raise ValueError(
                f"Unexpected error in HTTP GET {resp.geturl()}: status {resp.status} {resp.reason}"
            )

        # The caller will access the `resp.data` property or use
        # the `resp.read()` method to read the contents of the
        # response body. If the `preload_content` argument is True, the
        # response body is already downloaded; otherwise `resp.read()`
        # will download it.
        return resp

    def _put(
        self,
        url: str,
        data: BinaryIO | bytes,
    ) -> None:
        """Send a HTTP PUT request.

        Parameters
        ----------
        url : `str`
            Target URL.
        data : `BinaryIO` or `bytes`
            Request body.
        """
        # Send a PUT request with an empty body and handle redirection. This
        # is useful if the server redirects us; since we cannot rewind the
        # data we are uploading, we don't start uploading data until we
        # connect to the server that will actually serve our request.
        headers = {"Content-Length": "0"}
        resp = self._request("PUT", url, headers=headers, redirect=False)
        if redirect_location := resp.get_redirect_location():
            url = redirect_location
        elif resp.status not in (
            HTTPStatus.OK,
            HTTPStatus.CREATED,
            HTTPStatus.NO_CONTENT,
        ):
            raise ValueError(
                f"Unexpected response to HTTP request PUT {resp.geturl()}: status {resp.status} "
                f"{resp.reason} [{resp.data.decode('utf-8')}]"
            )

        # We may have been redirected. Upload the file contents to
        # their final destination.

        # Ask the server to compute and record a checksum of the uploaded
        # file contents, for later integrity checks. Since we don't compute
        # the digest ourselves while uploading the data, we cannot verify
        # after the request is complete that the data we uploaded is
        # identical to the data recorded by the server, but at least the
        # server has recorded a digest of the data it stored.
        #
        # See RFC 3230 for details and
        # https://www.iana.org/assignments/http-dig-alg/http-dig-alg.xhtml
        # for the list of supported digest algorithms.
        #
        # In addition, note that not all servers implement this RFC, so
        # the checksum request may be ignored by the server.
        headers = {}
        if (checksum := self._config.request_checksum) is not None:
            headers = {"Want-Digest": checksum}

        resp = self._request(
            "PUT",
            url,
            body=data,
            headers=headers,
            pool_manager=self._backend,
        )

        if resp.status not in (
            HTTPStatus.OK,
            HTTPStatus.CREATED,
            HTTPStatus.NO_CONTENT,
        ):
            raise ValueError(
                f"Unexpected response to HTTP request PUT {resp.geturl()}: status {resp.status} "
                f"{resp.reason} [{resp.data.decode('utf-8')}]"
            )

    def _head(self, url: str, headers: dict[str, str] | None = None) -> HTTPResponse:
        """Send a HTTP HEAD request and return the response.

        Parameters
        ----------
        url : `str`
            Target URL.
        headers : `dict[str, str]`, optional
            Headers to send with the request.
        """
        headers = {} if headers is None else dict(headers)
        resp = self._request("HEAD", url, headers=headers)
        match resp.status:
            case HTTPStatus.OK:
                return resp
            case HTTPStatus.NOT_FOUND:
                raise FileNotFoundError(f"No file found at {resp.geturl()}")
            case _:
                raise ValueError(
                    f"Unexpected response to HEAD request to {resp.geturl()}: status {resp.status} "
                    f"{resp.reason}"
                )

    def _propfind(self, url: str, body: str | None = None, depth: str = "0") -> HTTPResponse:
        """Send a HTTP PROPFIND request and return the response.

        Parameters
        ----------
        url : `str`
            Target URL.
        body : `str`, optional
            Request body.
        depth : `str`, optional
            Value of the "Depth" header, "0" by default.
        """
        if body is None:
            # Request only the DAV live properties we are explicitly
            # interested in, namely 'resourcetype', 'getcontentlength',
            # 'getlastmodified' and 'displayname'.
            body = (
                """<?xml version="1.0" encoding="utf-8"?>"""
                """<D:propfind xmlns:D="DAV:"><D:prop>"""
                """<D:resourcetype/><D:getcontentlength/><D:getlastmodified/><D:displayname/>"""
                """</D:prop></D:propfind>"""
            )

        headers = {
            "Depth": depth,
            "Content-Type": 'application/xml; charset="utf-8"',
            "Content-Length": str(len(body)),
        }
        resp = self._request("PROPFIND", url=url, headers=headers, body=body)
        if resp.status in (HTTPStatus.MULTI_STATUS, HTTPStatus.NOT_FOUND):
            return resp
        else:
            raise ValueError(
                f"Unexpected response to PROPFIND {resp.geturl()}: status {resp.status} {resp.reason}"
            )

    def stat(self, url: str) -> DavFileMetadata:
        """Return the properties of the file or directory located at `url`.

        Parameters
        ----------
        url : `str`
            Target URL.

        Returns
        -------
        result : `DavFileMetadata`
            Details of the resource at `url`. If no resource was found at
            that URL no exception is raised. Instead, the returned details
            allow for detecting that the resource does not exist.
        """
        resp = self._propfind(url)
        match resp.status:
            case HTTPStatus.NOT_FOUND:
                href = url.replace(self._base_url, "", 1)
                return DavFileMetadata(base_url=self._base_url, href=href)
            case HTTPStatus.MULTI_STATUS:
                property = self._propfind_parser.parse(resp.data)[0]
                return DavFileMetadata.from_property(base_url=self._base_url, property=property)
            case _:
                raise ValueError(
                    f"Unexpected response to HTTP PROPFIND request to {resp.geturl()}: status "
                    f"{resp.status} {resp.reason}"
                )

    def info(self, url: str, name: str | None = None) -> dict[str, Any]:
        """Return the details about the file or directory at `url`.

        Parameters
        ----------
        url : `str`
            Target URL.
        name : `str`, optional
            Name of the object to be included in the returned value. If None,
            `url` is used as the name.

        Returns
        -------
        result : `dict`
            For an existing file, the returned value has the form:

            .. code-block:: python

                {
                    "name": name,
                    "size": 1234,
                    "type": "file",
                    "last_modified":
                        datetime.datetime(2025, 4, 10, 15, 12, 51, 227854),
                    "checksums": {
                        "adler32": "0fc5f83f",
                        "md5": "1f57339acdec099c6c0a41f8e3d5fcd0",
                    },
                }

            For an existing directory, the returned value has the form:

            .. code-block:: python

                {
                    "name": name,
                    "size": 0,
                    "type": "directory",
                    "last_modified":
                        datetime.datetime(2025, 4, 10, 15, 12, 51, 227854),
                    "checksums": {},
                }

            For a non-existing file or directory, the returned value has the
            form:

            .. code-block:: python

                {
                    "name": name,
                    "size": None,
                    "type": None,
                    "last_modified":
                        datetime.datetime(1, 1, 1, 0, 0),
                    "checksums": {},
                }

        Notes
        -----
        The format of the returned dictionary is inspired by and compatible
        with `fsspec`.

        The size of existing directories is always zero. The `checksums`
        dictionary may be empty if the storage endpoint does not compute
        and store the checksums of the files it stores.
        """
        result: dict[str, Any] = {
            "name": name if name is not None else url,
            "type": None,
            "size": None,
            "last_modified": datetime.min,
            "checksums": {},
        }
        metadata = self.stat(url)
        if not metadata.exists:
            return result

        if metadata.is_dir:
            result.update({"type": "directory", "size": 0})
        else:
            result.update({"type": "file", "size": metadata.size, "checksums": metadata.checksums})

        result.update({"last_modified": metadata.last_modified})
        return result

    def read_dir(self, url: str) -> list[DavFileMetadata]:
        """Return the properties of the files or directories contained in
        the directory located at `url`.

        If `url` designates a file, only the details of that file are
        returned.

        Parameters
        ----------
        url : `str`
            Target URL.

        Returns
        -------
        result : `list[DavFileMetadata]`
            List of details of each file or directory within `url`.
        """
        resp = self._propfind(url, depth="1")
        if resp.status == HTTPStatus.NOT_FOUND:
            raise FileNotFoundError(f"No directory found at {resp.geturl()}")
        elif resp.status != HTTPStatus.MULTI_STATUS:
            raise ValueError(
                f"Unexpected response to HTTP PROPFIND request to {resp.geturl()}: status {resp.status} "
                f"{resp.reason}"
            )

        if (path := parse_url(url).path) is not None:
            this_dir_href = path.rstrip("/") + "/"
        else:
            this_dir_href = "/"

        result = []
        for property in self._propfind_parser.parse(resp.data):
            # Don't include in the results the metadata of the directory we
            # are traversing.
            # Some webDAV servers do not append a "/" to the href of a
            # directory in their response to PROPFIND, so we must take that
            # into account.
            if property.is_file:
                result.append(DavFileMetadata.from_property(base_url=self._base_url, property=property))
            elif property.is_dir and property.href != this_dir_href:
                result.append(DavFileMetadata.from_property(base_url=self._base_url, property=property))

        return result

    def read(self, url: str) -> bytes:
        """Download the contents of the file located at `url`.

        Parameters
        ----------
        url : `str`
            Target URL.

        Returns
        -------
        read : `bytes`
            Contents of the file.

        Notes
        -----
        The caller must ensure that the resource at `url` is a file, not
        a directory.
        """
        return self._get(url).data

    def read_range(
        self, url: str, start: int, end: int | None, headers: dict[str, str] | None = None
    ) -> bytes:
        """Download partial content of the file located at `url`.

        Parameters
        ----------
        url : `str`
            Target URL.
        start : `int`
            Starting byte offset of the range to download.
        end : `int` or `None`
            Ending byte offset of the range to download. If None, the range
            extends to the end of the file.
        headers : `dict[str, str]`, optional
            Specific headers to send with the GET request.

        Returns
        -------
        read : `bytes`
            Partial contents of the file.

        Notes
        -----
        The caller must ensure that the resource at `url` is a file, not
        a directory. This is important because some webDAV servers respond
        with an HTML document when asked to read a directory.
        """
        headers = {} if headers is None else dict(headers)
        if end is None:
            headers.update({"Range": f"bytes={start}-"})
        else:
            headers.update({"Range": f"bytes={start}-{end}"})

        return self._get(url, headers=headers).data
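
The Range headers built above follow RFC 7233, where both offsets are inclusive. A sketch, assuming a `client` obtained from the pool shown earlier and a hypothetical file URL:

    # First 16 bytes of the file: sends "Range: bytes=0-15".
    head = client.read_range("https://host.example.org:1234/path/to/file", start=0, end=15)

    # From offset 1024 to the end of the file: sends "Range: bytes=1024-".
    tail = client.read_range("https://host.example.org:1234/path/to/file", start=1024, end=None)
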
    def download(self, url: str, filename: str, chunk_size: int, close_connection: bool = False) -> int:
        """Download the content of a file and write it to a local file.

        Parameters
        ----------
        url : `str`
            Target URL.
        filename : `str`
            Local file to write the content to. If the file already exists,
            it will be rewritten.
        chunk_size : `int`
            Size of the chunks to write to `filename`.
        close_connection : `bool`
            Whether to close the connection after the download.

        Returns
        -------
        count : `int`
            Number of bytes written to `filename`.

        Notes
        -----
        The caller must ensure that the resource at `url` is a file, not
        a directory.
        """
        try:
            resp = self._get(url, preload_content=False)

            # If we were asked to close the connection to the server, disable
            # auto close so that we can explicitly close the connection.
            # By default, urllib3 releases the connection and keeps it open
            # for later reuse when it consumes the response body.
            if close_connection:
                resp.auto_close = False

            content_length = 0
            with open(filename, "wb", buffering=chunk_size) as file:
                for chunk in resp.stream(chunk_size):
                    file.write(chunk)
                    content_length += len(chunk)

            # Check that the expected and actual content lengths match.
            # Perform this check only when the content of the file was not
            # encoded by the server.
            expected_length: int = int(resp.headers.get("Content-Length", -1))
            if (
                "Content-Encoding" not in resp.headers
                and expected_length != -1
                and expected_length != content_length
            ):
                raise ValueError(
                    f"Size of downloaded file does not match value in Content-Length header for {self}: "
                    f"expecting {expected_length} and got {content_length} bytes"
                )

            return content_length
        finally:
            # Close this connection
            if close_connection:
                resp.close()
|
|
1346
|
+
    def write(self, url: str, data: BinaryIO | bytes) -> None:
        """Create or rewrite a remote file at `url` with `data` as its
        contents.

        Parameters
        ----------
        url : `str`
            Target URL.
        data : `BinaryIO` or `bytes`
            Sequence of bytes to upload.

        Notes
        -----
        If a file already exists at `url` it will be rewritten.
        """
        self._put(url, data)

    def checksums(self, url: str) -> dict[str, str]:
        """Return the checksums of the contents of file located at `url`.

        The checksums are retrieved from the storage endpoint. There may be
        none if the storage endpoint does not automatically expose the
        checksums it computes.

        Parameters
        ----------
        url : `str`
            Target URL.

        Returns
        -------
        checksums : `dict[str, str]`
            Checksums of the file at `url`.
            The key of the dictionary is the lowercased name of the checksum
            algorithm (e.g. "md5", "adler32"). The value is the lowercased
            checksum itself (e.g. "78441cec2479ec8b545c4d6699f542da").
        """
        stat = self.stat(url)
        if not stat.exists:
            raise FileNotFoundError(f"No file found at {url}")

        return stat.checksums if stat.is_file else {}

    def mkcol(self, url: str) -> None:
        """Create a directory at `url`.

        If a directory already exists at `url`, no error is returned and no
        exception is raised. An exception is raised if a file exists at
        `url`.

        Parameters
        ----------
        url : `str`
            Target URL.
        """
        resp = self._request("MKCOL", url)
        if resp.status not in (HTTPStatus.CREATED, HTTPStatus.METHOD_NOT_ALLOWED):
            raise ValueError(f"Can not create directory {resp.geturl()}: status {resp.status} {resp.reason}")

    def delete(self, url: str) -> None:
        """Delete the file or directory at `url`.

        The absence of a file or directory at `url` is not considered an
        error.

        Parameters
        ----------
        url : `str`
            Target URL.

        Notes
        -----
        If `url` designates a directory, some webDAV servers recursively
        remove the directory and its contents. Others only remove the
        directory if it is empty.

        For a consistent behavior, the caller must check what kind of object
        the target URL is and walk the hierarchy removing all objects, as
        sketched below.
        """
        resp = self._request("DELETE", url)
        if resp.status not in (
            HTTPStatus.OK,
            HTTPStatus.ACCEPTED,
            HTTPStatus.NO_CONTENT,
            HTTPStatus.NOT_FOUND,
        ):
            raise ValueError(f"Unable to delete resource {resp.geturl()}: status {resp.status} {resp.reason}")

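# Standalone sketch (not part of this module) of the consistent removal
# strategy the note above calls for: delete children before their parent so
# that servers which only delete empty directories behave like servers that
# delete recursively. The `client` object and its `read_dir()` listing
# helper are hypothetical stand-ins for this module's API.
def _example_remove_recursively(client, url: str) -> None:
    if client.stat(url).is_dir:
        for child in client.read_dir(url):
            _example_remove_recursively(client, child.url)
    client.delete(url)
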
    def accepts_ranges(self, url: str) -> bool:
        """Return `True` if the server supports a 'Range' header in
        GET requests against `url`.

        Parameters
        ----------
        url : `str`
            Target URL.
        """
        # If we have already determined that the server accepts "Range" for
        # another URL, we assume that it implements that feature for any
        # file it serves, so reuse that information.
        if self._accepts_ranges is not None:
            return self._accepts_ranges

        with self._lock:
            if self._accepts_ranges is None:
                self._accepts_ranges = self._head(url).headers.get("Accept-Ranges", "") == "bytes"

        return self._accepts_ranges

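# Standalone sketch (not part of this module) of the double-checked caching
# pattern used by accepts_ranges() above: read the cached value without
# locking on the fast path, and re-check under the lock before computing, so
# concurrent threads trigger at most one HEAD request. The class name is
# illustrative only.
import threading
from collections.abc import Callable

class _ExampleCachedFlag:
    def __init__(self) -> None:
        self._lock = threading.Lock()
        self._value: bool | None = None

    def get(self, compute: Callable[[], bool]) -> bool:
        if self._value is not None:
            return self._value
        with self._lock:
            if self._value is None:
                self._value = compute()
        return self._value
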
    def copy(self, source_url: str, destination_url: str, overwrite: bool = False) -> None:
        """Copy the file at `source_url` to `destination_url` in the same
        storage endpoint.

        Parameters
        ----------
        source_url : `str`
            URL of the source file.
        destination_url : `str`
            URL of the destination file. Its parent directory must exist.
        overwrite : `bool`
            If True and a file exists at `destination_url` it will be
            overwritten. Otherwise an exception is raised.
        """
        # Check the source is a file.
        if self.stat(source_url).is_dir:
            raise NotImplementedError(f"copy is not implemented for directory {source_url}")

        # Send a COPY request for this file.
        headers = {
            "Destination": destination_url,
            "Overwrite": "T" if overwrite else "F",
        }
        resp = self._request("COPY", source_url, headers=headers)
        if resp.status not in (HTTPStatus.CREATED, HTTPStatus.NO_CONTENT):
            raise ValueError(
                f"Could not copy {resp.geturl()} to {destination_url}: status {resp.status} {resp.reason}"
            )
        return

    def move(self, source_url: str, destination_url: str, overwrite: bool = False) -> None:
        """Move the file at `source_url` to `destination_url` in the same
        storage endpoint.

        Parameters
        ----------
        source_url : `str`
            URL of the source file.
        destination_url : `str`
            URL of the destination file. Its parent directory must exist.
        overwrite : `bool`
            If True and a file exists at `destination_url` it will be
            overwritten. Otherwise an exception is raised.
        """
        headers = {
            "Destination": destination_url,
            "Overwrite": "T" if overwrite else "F",
        }
        resp = self._request("MOVE", source_url, headers=headers)
        if resp.status not in (HTTPStatus.CREATED, HTTPStatus.NO_CONTENT):
            raise ValueError(
                f"""Could not move file {resp.geturl()} to {destination_url}: status {resp.status} """
                f"""{resp.reason}"""
            )

    def generate_presigned_get_url(self, url: str, expiration_time_seconds: int) -> str:
        """Return a pre-signed URL that can be used to retrieve this resource
        using an HTTP GET without supplying any access credentials.

        Parameters
        ----------
        url : `str`
            Target URL.
        expiration_time_seconds : `int`
            Number of seconds until the generated URL is no longer valid.

        Returns
        -------
        url : `str`
            HTTP URL signed for GET.
        """
        raise NotImplementedError(f"URL signing is not supported by server for {self}")

    def generate_presigned_put_url(self, url: str, expiration_time_seconds: int) -> str:
        """Return a pre-signed URL that can be used to upload a file to this
        path using an HTTP PUT without supplying any access credentials.

        Parameters
        ----------
        url : `str`
            Target URL.
        expiration_time_seconds : `int`
            Number of seconds until the generated URL is no longer valid.

        Returns
        -------
        url : `str`
            HTTP URL signed for PUT.
        """
        raise NotImplementedError(f"URL signing is not supported by server for {self}")


class ActivityCaveat(enum.Enum):
    """Helper class for enumerating accepted activity caveats for requesting
    macaroons from dCache or XRootD webDAV servers.
    """

    DOWNLOAD = 1
    UPLOAD = 2


class DavClientURLSigner(DavClient):
    """WebDAV client which supports signing of URLs for upload and download.

    Instances of this class are thread-safe.

    Parameters
    ----------
    url : `str`
        Root URL of the storage endpoint
        (e.g. "https://host.example.org:1234/").
    config : `DavConfig`
        Configuration to initialize this client.
    accepts_ranges : `bool` | `None`
        Indicate whether the remote server accepts the ``Range`` header in GET
        requests.
    """

    def __init__(self, url: str, config: DavConfig, accepts_ranges: bool | None = None) -> None:
        super().__init__(url=url, config=config, accepts_ranges=accepts_ranges)

    def generate_presigned_get_url(self, url: str, expiration_time_seconds: int) -> str:
        """Return a pre-signed URL that can be used to retrieve the resource
        at `url` using an HTTP GET without supplying any access credentials.

        Parameters
        ----------
        url : `str`
            URL of an existing file.
        expiration_time_seconds : `int`
            Number of seconds until the generated URL is no longer valid.

        Returns
        -------
        url : `str`
            HTTP URL signed for GET.

        Notes
        -----
        Although the returned URL allows for downloading the file at `url`
        without supplying credentials, the HTTP client must be configured
        to accept the certificate the server will present if the client
        wants to validate it. The server's certificate may be issued by a
        certificate authority unknown to the client.
        """
        macaroon: str = self._get_macaroon(url, ActivityCaveat.DOWNLOAD, expiration_time_seconds)
        return f"{url}?authz={macaroon}"

    def generate_presigned_put_url(self, url: str, expiration_time_seconds: int) -> str:
        """Return a pre-signed URL that can be used to upload a file to `url`
        using an HTTP PUT without supplying any access credentials.

        Parameters
        ----------
        url : `str`
            URL of an existing file.
        expiration_time_seconds : `int`
            Number of seconds until the generated URL is no longer valid.

        Returns
        -------
        url : `str`
            HTTP URL signed for PUT.

        Notes
        -----
        Although the returned URL allows for uploading a file to `url`
        without supplying credentials, the HTTP client must be configured
        to accept the certificate the server will present if the client
        wants to validate it. The server's certificate may be issued by a
        certificate authority unknown to the client.
        """
        macaroon: str = self._get_macaroon(url, ActivityCaveat.UPLOAD, expiration_time_seconds)
        return f"{url}?authz={macaroon}"

    def _get_macaroon(self, url: str, activity: ActivityCaveat, expiration_time_seconds: int) -> str:
        """Return a macaroon for uploading or downloading the file at `url`.

        Parameters
        ----------
        url : `str`
            URL of an existing file.
        activity : `ActivityCaveat`
            The activity the macaroon is requested for.
        expiration_time_seconds : `int`
            Requested duration of the macaroon, in seconds.

        Returns
        -------
        macaroon : `str`
            Macaroon to be used with `url` in a GET or PUT request.
        """
        # dCache and XRootD webDAV servers support delivery of macaroons.
        #
        # For details about dCache macaroons see:
        # https://www.dcache.org/manuals/UserGuide-9.2/macaroons.shtml
        match activity:
            case ActivityCaveat.DOWNLOAD:
                activity_caveat = "DOWNLOAD,LIST"
            case ActivityCaveat.UPLOAD:
                activity_caveat = "UPLOAD,LIST,DELETE,MANAGE"

        # Retrieve a macaroon for the requested activities and duration.
        headers = {"Content-Type": "application/macaroon-request"}
        body = {
            "caveats": [
                f"activity:{activity_caveat}",
            ],
            "validity": f"PT{expiration_time_seconds}S",
        }
        resp = self._request("POST", url, headers=headers, body=json.dumps(body))
        if resp.status != HTTPStatus.OK:
            raise ValueError(
                f"Could not retrieve a macaroon for URL {resp.geturl()}, status: {resp.status} {resp.reason}"
            )

        # We expect the body of the response to be formatted in JSON.
        # dCache sets the 'Content-Type' of the response to 'application/json'
        # but XRootD does not set any 'Content-Type' header 8-[
        #
        # An example of a response body returned by dCache is shown below:
        # {
        #    "macaroon": "MDA[...]Qo",
        #    "uri": {
        #        "targetWithMacaroon": "https://dcache.example.org/?authz=MD...",
        #        "baseWithMacaroon": "https://dcache.example.org/?authz=MD...",
        #        "target": "https://dcache.example.org/",
        #        "base": "https://dcache.example.org/"
        #    }
        # }
        #
        # An example of a response body returned by XRootD is shown below:
        # {
        #   "macaroon": "MDA[...]Qo",
        #   "expires_in": 86400
        # }
        try:
            response_body = json.loads(resp.data.decode("utf-8"))
        except json.JSONDecodeError:
            raise ValueError(f"Could not deserialize response to POST request for URL {resp.geturl()}")

        if "macaroon" in response_body:
            return response_body["macaroon"]

        raise ValueError(f"Could not retrieve macaroon for URL {resp.geturl()}")

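# Standalone sketch (not part of this module) of the macaroon request
# protocol implemented by _get_macaroon() above, issued directly with
# urllib3. The endpoint URL is hypothetical, and a real dCache or XRootD
# server would additionally require client authentication (for instance an
# X.509 proxy or a bearer token).
import json
from urllib3 import PoolManager

http = PoolManager()
resp = http.request(
    "POST",
    "https://dav.example.org/path/to/file",  # hypothetical endpoint
    headers={"Content-Type": "application/macaroon-request"},
    body=json.dumps({"caveats": ["activity:DOWNLOAD,LIST"], "validity": "PT300S"}),
)
if resp.status == 200:
    macaroon = json.loads(resp.data.decode("utf-8"))["macaroon"]
    signed_url = f"https://dav.example.org/path/to/file?authz={macaroon}"
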
    def copy(self, source_url: str, destination_url: str, overwrite: bool = False) -> None:
        """Copy the file at `source_url` to `destination_url` in the same
        storage endpoint.

        Parameters
        ----------
        source_url : `str`
            URL of the source file.
        destination_url : `str`
            URL of the destination file. Its parent directory must exist.
        overwrite : `bool`
            If True and a file exists at `destination_url` it will be
            overwritten. Otherwise an exception is raised.
        """
        # Check the source is a file.
        if self.stat(source_url).is_dir:
            raise NotImplementedError(f"copy is not implemented for directory {source_url}")

        # Neither dCache nor XrootD currently implement the COPY
        # webDAV method as documented in
        #
        # http://www.webdav.org/specs/rfc4918.html#METHOD_COPY
        #
        # (See issues DM-37603 and DM-37651 for details)
        # With those servers use third-party copy instead.
        return self._copy_via_third_party(source_url, destination_url, overwrite)

    def _copy_via_third_party(self, source_url: str, destination_url: str, overwrite: bool = False) -> None:
        """Copy the file at `source_url` to `destination_url` in the same
        storage endpoint using the third-party copy functionality
        implemented by dCache and XRootD servers.

        Parameters
        ----------
        source_url : `str`
            URL of the source file.
        destination_url : `str`
            URL of the destination file. Its parent directory must exist.
        overwrite : `bool`
            If True and a file exists at `destination_url` it will be
            overwritten. Otherwise an exception is raised.
        """
        # To implement COPY we use dCache's third-party copy mechanism
        # documented at:
        #
        # https://www.dcache.org/manuals/UserGuide-10.2/webdav.shtml#third-party-transfers
        #
        # The reason is that dCache does not correctly implement webDAV's COPY
        # method. See https://github.com/dCache/dcache/issues/6950

        # Retrieve a macaroon for downloading the source.
        download_macaroon = self._get_macaroon(source_url, ActivityCaveat.DOWNLOAD, 300)

        # Prepare and send the COPY request. Send the request before entering
        # the 'try' block so that 'resp' is guaranteed to be bound when the
        # 'finally' clause drains the connection.
        headers = {
            "Source": source_url,
            "TransferHeaderAuthorization": f"Bearer {download_macaroon}",
            "Credential": "none",
            "Depth": "0",
            "Overwrite": "T" if overwrite else "F",
            "RequireChecksumVerification": "false",
        }
        resp = self._request("COPY", destination_url, headers=headers, preload_content=False)
        try:
            if resp.status == HTTPStatus.CREATED:
                return

            if resp.status != HTTPStatus.ACCEPTED:
                raise ValueError(
                    f"Unable to copy resource {resp.geturl()}; status: {resp.status} {resp.reason}"
                )

            content_type = resp.headers.get("Content-Type")
            if content_type != "text/perf-marker-stream":
                raise ValueError(
                    f"""Unexpected Content-Type {content_type} in response to COPY request from """
                    f"""{source_url} to {destination_url}"""
                )

            # Read the performance markers in the response body.
            # Documentation:
            # https://dcache.org/manuals/UserGuide-10.2/webdav.shtml#third-party-transfers
            for marker in io.TextIOWrapper(resp):  # type: ignore
                marker = marker.rstrip("\n")
                if marker == "":  # EOF
                    raise ValueError(
                        f"""Copying file from {source_url} to {destination_url} failed: """
                        """could not get response from server"""
                    )
                elif marker.startswith("failure:"):
                    raise ValueError(
                        f"""Copying file from {source_url} to {destination_url} failed with error: """
                        f"""{marker}"""
                    )
                elif marker.startswith("success:"):
                    return
        finally:
            resp.drain_conn()


class DavClientDCache(DavClientURLSigner):
    """Client for interacting with a dCache webDAV server.

    Instances of this class are thread-safe.

    Parameters
    ----------
    url : `str`
        Root URL of the storage endpoint
        (e.g. "https://host.example.org:1234/").
    config : `DavConfig`
        Configuration to initialize this client.
    accepts_ranges : `bool` | `None`
        Indicate whether the remote server accepts the ``Range`` header in GET
        requests.
    """

    def __init__(self, url: str, config: DavConfig, accepts_ranges: bool | None = None) -> None:
        super().__init__(url=url, config=config, accepts_ranges=accepts_ranges)

    def _propfind(self, url: str, body: str | None = None, depth: str = "0") -> HTTPResponse:
        """Send a HTTP PROPFIND request and return the response.

        Parameters
        ----------
        url : `str`
            Target URL.
        body : `str`, optional
            Request body.
        depth : `str`, optional
            Value of the 'Depth' header sent with the request.
        """
        if body is None:
            # Request only the DAV live properties we are explicitly
            # interested in, namely 'resourcetype', 'getcontentlength',
            # 'getlastmodified' and 'displayname'. In addition, request
            # dCache-specific checksums.
            body = (
                """<?xml version="1.0" encoding="utf-8"?>"""
                """<D:propfind xmlns:D="DAV:" xmlns:dcache="http://www.dcache.org/2013/webdav"><D:prop>"""
                """<D:resourcetype/><D:getcontentlength/><D:getlastmodified/><D:displayname/>"""
                """<dcache:Checksums/>"""
                """</D:prop></D:propfind>"""
            )

        return super()._propfind(url=url, body=body, depth=depth)

    def _get(
        self, url: str, headers: dict[str, str] | None = None, preload_content: bool = True
    ) -> HTTPResponse:
        """Send a HTTP GET request to a dCache webDAV server.

        Parameters
        ----------
        url : `str`
            Target URL.
        headers : `dict[str, str]`, optional
            Headers to send with the request.
        preload_content : `bool`, optional
            If True, the response body is downloaded and can be retrieved
            via the returned response `.data` property. If False, the
            caller needs to call the `.read()` on the returned response
            object to download the body.

        Returns
        -------
        resp : `HTTPResponse`
            Response to the GET request as received from the server.
        """
        # Send the GET request to the frontend servers. We handle
        # redirections ourselves.
        headers = {} if headers is None else dict(headers)
        resp = self._request("GET", url, headers=headers, preload_content=preload_content, redirect=False)
        if resp.status in (HTTPStatus.OK, HTTPStatus.PARTIAL_CONTENT):
            return resp

        if resp.status == HTTPStatus.NOT_FOUND:
            raise FileNotFoundError(f"No file found at {resp.geturl()}")

        redirect_location = resp.get_redirect_location()
        if redirect_location is None or redirect_location is False:
            raise ValueError(
                f"Unexpected error in HTTP GET {resp.geturl()}: status {resp.status} {resp.reason}"
            )

        # We were redirected to a backend server so follow the redirection.
        # The response body will be automatically downloaded when
        # `preload_content` is true and the underlying network connection
        # may be kept open for future reuse if the maximum number of
        # connections for the backend pool is not reached.
        try:
            # Explicitly ask the backend server to close the connection after
            # serving this request.
            if preload_content:
                headers.update({"Connection": "close"})

            url = redirect_location
            resp = self._request(
                "GET",
                url,
                headers=headers,
                pool_manager=self._backend,
                preload_content=preload_content,
            )

            # Mark this connection so that it won't be automatically
            # returned to the reusable connection pool. We will close it
            # ourselves if appropriate.
            if preload_content:
                resp.auto_close = False

            if resp.status not in (HTTPStatus.OK, HTTPStatus.PARTIAL_CONTENT):
                raise ValueError(
                    f"Unexpected error in HTTP GET {resp.geturl()}: status {resp.status} {resp.reason}"
                )

            # The caller will access the `resp.data` property or use
            # the `resp.read()` method to read the contents of the
            # response body. If `preload_content` argument is True, the
            # response body is already downloaded, otherwise `resp.read()`
            # will download it.
            return resp
        finally:
            # Don't keep this connection to the backend server open. Given
            # that dCache pools may be configured to serve requests over a
            # range of ports, it is unlikely we will reuse this particular
            # connection again in the short term.
            if preload_content:
                resp.close()

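# Standalone sketch (not part of this module) of the manual redirect
# handling used in _get() above: ask urllib3 not to follow redirects, then
# inspect the redirect location and re-issue the request. The URL is
# hypothetical.
from urllib3 import PoolManager

http = PoolManager()
resp = http.request("GET", "https://dav.example.org/file", redirect=False)
location = resp.get_redirect_location()  # str, or None/False when not redirected
if location:
    resp = http.request("GET", location)
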
    def _put(
        self,
        url: str,
        data: BinaryIO | bytes,
    ) -> None:
        """Send a HTTP PUT request to a dCache webDAV server.

        Parameters
        ----------
        url : `str`
            Target URL.
        data : `BinaryIO` or `bytes`
            Request body.
        """
        # Send a PUT request with empty body to the dCache frontend server to
        # get redirected to the backend.
        #
        # Details:
        # https://www.dcache.org/manuals/UserGuide-10.2/webdav.shtml#redirection
        #
        # Note that we use the backend pool manager for PUT requests, since
        # the dCache webDAV door closes the connection when redirecting a
        # PUT request to the backend.
        #
        # We want to reuse the connections to the door as much as possible so
        # that metadata operations are faster; all metadata operations use the
        # frontend pool manager.
        headers = {"Content-Length": "0", "Expect": "100-continue"}
        resp = self._request("PUT", url, headers=headers, redirect=False, pool_manager=self._backend)
        if redirect_location := resp.get_redirect_location():
            url = redirect_location
        elif resp.status not in (
            HTTPStatus.OK,
            HTTPStatus.CREATED,
            HTTPStatus.NO_CONTENT,
        ):
            raise ValueError(
                f"""Unexpected response to HTTP request PUT {resp.geturl()}: status {resp.status} """
                f"""{resp.reason} [{resp.data.decode("utf-8")}]"""
            )

        # We were redirected to a backend server. Upload the file contents to
        # its final destination. Explicitly ask the server to close this
        # network connection after serving this PUT request to release
        # the associated dCache mover.

        # Ask dCache to compute and record a checksum of the uploaded
        # file contents, for later integrity checks. Since we don't compute
        # the digest ourselves while uploading the data, we cannot control
        # after the request is complete that the data we uploaded is
        # identical to the data recorded by the server, but at least the
        # server has recorded a digest of the data it stored.
        #
        # See RFC-3230 for details and
        # https://www.iana.org/assignments/http-dig-alg/http-dig-alg.xhtml
        # for the list of supported digest algorithms.
        headers = {"Connection": "close"}
        if (checksum := self._config.request_checksum) is not None:
            headers.update({"Want-Digest": checksum})

        try:
            resp = self._request(
                "PUT",
                url,
                body=data,
                headers=headers,
                pool_manager=self._backend,
                # Don't consume the response body, so that we can explicitly
                # close the connection.
                preload_content=False,
            )

            # Disable automatically returning the connection to the pool
            # to be reused later on, since we want that connection to be
            # closed. By default, when preload_content is True, the network
            # connection is returned to the connection pool once the response
            # body is completely consumed. Once this happens, we don't have a
            # mechanism to force closing the connection.
            resp.auto_close = False

            if resp.status not in (
                HTTPStatus.OK,
                HTTPStatus.CREATED,
                HTTPStatus.NO_CONTENT,
            ):
                raise ValueError(
                    f"""Unexpected response to HTTP request PUT {resp.geturl()}: status {resp.status} """
                    f"""{resp.reason} [{resp.data.decode("utf-8")}]"""
                )

        finally:
            # Explicitly close this connection to the dCache backend server.
            resp.close()

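# Standalone sketch (not part of this module) of the RFC 3230 checksum
# request used in _put() above: the 'Want-Digest' header asks the server to
# compute and record a digest of the uploaded bytes. Servers that implement
# the RFC reply with a 'Digest' header of the form "adler32=0e4709f2";
# servers that do not simply ignore the request. The URL is hypothetical.
from urllib3 import PoolManager

http = PoolManager()
resp = http.request(
    "PUT",
    "https://dav.example.org/file",  # hypothetical endpoint
    body=b"some bytes",
    headers={"Want-Digest": "adler32"},
)
digest = resp.headers.get("Digest")  # e.g. "adler32=..." when supported
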
    def download(self, url: str, filename: str, chunk_size: int, close_connection: bool = True) -> int:
        # Close the connection to the backend servers after downloading
        # the entire file content.
        return super().download(
            url=url, filename=filename, chunk_size=chunk_size, close_connection=close_connection
        )


class DavClientXrootD(DavClientURLSigner):
    """Client for interacting with an XRootD webDAV server.

    Instances of this class are thread-safe.

    Parameters
    ----------
    url : `str`
        Root URL of the storage endpoint
        (e.g. "https://host.example.org:1234/").
    config : `DavConfig`
        Configuration to initialize this client.
    accepts_ranges : `bool` | `None`
        Indicate whether the remote server accepts the ``Range`` header in GET
        requests.
    """

    def __init__(self, url: str, config: DavConfig, accepts_ranges: bool | None = None) -> None:
        super().__init__(url=url, config=config, accepts_ranges=accepts_ranges)

    def _get(
        self, url: str, headers: dict[str, str] | None = None, preload_content: bool = True
    ) -> HTTPResponse:
        """Send a HTTP GET request to an XRootD webDAV server.

        Parameters
        ----------
        url : `str`
            Target URL.
        headers : `dict[str, str]`, optional
            Headers to send with the request.
        preload_content : `bool`, optional
            If True, the response body is downloaded and can be retrieved
            via the returned response `.data` property. If False, the
            caller needs to call the `.read()` on the returned response
            object to download the body.

        Returns
        -------
        resp : `HTTPResponse`
            Response to the GET request as received from the server.
        """
        # Send the GET request to the frontend servers. We handle
        # redirections ourselves.
        headers = {} if headers is None else dict(headers)
        resp = self._request("GET", url, headers=headers, preload_content=preload_content, redirect=False)
        if resp.status in (HTTPStatus.OK, HTTPStatus.PARTIAL_CONTENT):
            return resp

        if resp.status == HTTPStatus.NOT_FOUND:
            raise FileNotFoundError(f"No file found at {resp.geturl()}")

        redirect_location = resp.get_redirect_location()
        if redirect_location is None or redirect_location is False:
            raise ValueError(
                f"Unexpected error in HTTP GET {resp.geturl()}: status {resp.status} {resp.reason}"
            )

        # We were redirected to a backend server so follow the redirection.
        # The response body will be automatically downloaded when
        # `preload_content` is true and the underlying network connection
        # may be kept open for future reuse if the maximum number of
        # connections for the backend pool is not reached.
        #
        # For XRootD endpoints, we always use the same pool manager, namely
        # the frontend pool manager, to increase the chance of reusing
        # network connections.
        url = redirect_location
        resp = self._request(
            "GET",
            url,
            headers=headers,
            pool_manager=self._frontend,
            preload_content=preload_content,
        )

        if resp.status not in (HTTPStatus.OK, HTTPStatus.PARTIAL_CONTENT):
            resp.close()
            raise ValueError(
                f"Unexpected error in HTTP GET {resp.geturl()}: status {resp.status} {resp.reason}"
            )

        # The caller will access the `resp.data` property or use
        # the `resp.read()` method to read the contents of the
        # response body. If `preload_content` argument is True, the
        # response body is already downloaded, otherwise `resp.read()`
        # will download it.
        return resp

    def _put(
        self,
        url: str,
        data: BinaryIO | bytes,
    ) -> None:
        """Send a HTTP PUT request to an XRootD webDAV server.

        Parameters
        ----------
        url : `str`
            Target URL.
        data : `BinaryIO` or `bytes`
            Request body.
        """
        # Send a PUT request with empty body to the XRootD frontend server to
        # get redirected to the backend.
        headers = {"Content-Length": "0", "Expect": "100-continue"}
        for attempt in range(max_attempts := 3):
            resp = self._request("PUT", url, headers=headers, redirect=False)
            if redirect_location := resp.get_redirect_location():
                url = redirect_location
                break
            elif resp.status == HTTPStatus.LOCKED:
                # Sometimes XRootD servers respond with status code LOCKED and
                # response body of the form:
                #
                #   "Output file /path/to/file is already opened by 1 writer;
                #    open denied."
                #
                # If we get such a response, try again, unless we reached
                # the maximum number of attempts.
                if attempt == max_attempts - 1:
                    raise ValueError(
                        f"""Unexpected response to HTTP request PUT {resp.geturl()}: status {resp.status} """
                        f"""{resp.reason} [{resp.data.decode("utf-8")}] after {max_attempts} attempts"""
                    )

                # Wait a bit and try again.
                log.warning(
                    f"""got unexpected response status {HTTPStatus.LOCKED} Locked for {url} """
                    f"""(attempt {attempt}/{max_attempts}), retrying..."""
                )
                time.sleep((attempt + 1) * 0.100)
                continue
            elif resp.status not in (
                HTTPStatus.OK,
                HTTPStatus.CREATED,
                HTTPStatus.NO_CONTENT,
            ):
                raise ValueError(
                    f"""Unexpected response to HTTP request PUT {resp.geturl()}: status {resp.status} """
                    f"""{resp.reason} [{resp.data.decode("utf-8")}]"""
                )

        # We were redirected to a backend server. Upload the file contents to
        # its final destination.

        # XRootD backend servers typically use a single port number for
        # accepting connections from clients. It is therefore beneficial
        # to keep those connections open, if the server allows.

        # Ask the server to compute and record a checksum of the uploaded
        # file contents, for later integrity checks. Since we don't compute
        # the digest ourselves while uploading the data, we cannot control
        # after the request is complete that the data we uploaded is
        # identical to the data recorded by the server, but at least the
        # server has recorded a digest of the data it stored.
        #
        # See RFC-3230 for details and
        # https://www.iana.org/assignments/http-dig-alg/http-dig-alg.xhtml
        # for the list of supported digest algorithms.
        #
        # In addition, note that not all servers implement this RFC so
        # the checksum request may be ignored by the server.
        headers = {}
        if (checksum := self._config.request_checksum) is not None:
            headers = {"Want-Digest": checksum}

        # For XRootD endpoints, we always use the same pool manager, namely
        # the frontend pool manager, to increase the chance of reusing
        # network connections.
        resp = self._request(
            "PUT",
            url,
            body=data,
            headers=headers,
            pool_manager=self._frontend,
        )

        if resp.status not in (
            HTTPStatus.OK,
            HTTPStatus.CREATED,
            HTTPStatus.NO_CONTENT,
        ):
            raise ValueError(
                f"""Unexpected response to HTTP request PUT {resp.geturl()}: status {resp.status} """
                f"""{resp.reason} [{resp.data.decode("utf-8")}]"""
            )

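# Standalone sketch (not part of this module) of the retry loop used in
# _put() above for HTTP 423 (Locked) responses: linear backoff of 100 ms,
# then 200 ms, giving up after three attempts. The request function is a
# hypothetical stand-in that pretends the file stays locked for the first
# two attempts.
import time

def _example_put_is_locked(attempt: int) -> bool:
    return attempt < 2  # True means the server answered "423 Locked"

max_attempts = 3
for attempt in range(max_attempts):
    if not _example_put_is_locked(attempt):
        break
    if attempt == max_attempts - 1:
        raise RuntimeError(f"still locked after {max_attempts} attempts")
    time.sleep((attempt + 1) * 0.100)
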
    def info(self, url: str, name: str | None = None) -> dict[str, Any]:
        # XRootD does not include checksums in the response to PROPFIND
        # requests. We need to send a specific HEAD request to retrieve
        # the ADLER32 checksum.
        #
        # If found, the checksum is included in the response header "Digest",
        # which is of the form:
        #
        #   Digest: adler32=0e4709f2
        result = super().info(url, name)
        if result["type"] == "file":
            headers: dict[str, str] = {"Want-Digest": "adler32"}
            resp = self._head(url=url, headers=headers)
            if (digest := resp.headers.get("Digest")) is not None:
                value = digest.split("=")[1]
                result["checksums"].update({"adler32": value})

        return result


class DavFileMetadata:
    """Container for attributes of interest of a webDAV file or directory.

    Parameters
    ----------
    base_url : `str`
        Base URL.
    href : `str`, optional
        Path component that can be added to the base URL.
    name : `str`, optional
        Name.
    exists : `bool`, optional
        Whether the file or directory exists.
    size : `int`, optional
        Size of file.
    is_dir : `bool`, optional
        Whether the URL points to a directory or file.
    last_modified : `datetime`, optional
        Last modified date.
    checksums : `dict` [ `str`, `str` ] | `None`, optional
        Checksums.
    """

    def __init__(
        self,
        base_url: str,
        href: str = "",
        name: str = "",
        exists: bool = False,
        size: int = -1,
        is_dir: bool = False,
        last_modified: datetime = datetime.min,
        checksums: dict[str, str] | None = None,
    ):
        self._url: str = base_url if not href else base_url.rstrip("/") + href
        self._href: str = href
        self._name: str = name
        self._exists: bool = exists
        self._size: int = size
        self._is_dir: bool = is_dir
        self._last_modified: datetime = last_modified
        self._checksums: dict[str, str] = {} if checksums is None else dict(checksums)

    @staticmethod
    def from_property(base_url: str, property: DavProperty) -> DavFileMetadata:
        """Create an instance from the values in `property`.

        Parameters
        ----------
        base_url : `str`
            Base URL.
        property : `DavProperty`
            Properties to associate with URL.
        """
        return DavFileMetadata(
            base_url=base_url,
            href=property.href,
            name=property.name,
            exists=property.exists,
            size=property.size,
            is_dir=property.is_dir,
            last_modified=property.last_modified,
            checksums=dict(property.checksums),
        )

    def __str__(self) -> str:
        return (
            f"""{self._url} {self._href} {self._name} {self._exists} {self._size} {self._is_dir} """
            f"""{self._checksums}"""
        )

    @property
    def url(self) -> str:
        return self._url

    @property
    def href(self) -> str:
        return self._href

    @property
    def name(self) -> str:
        return self._name

    @property
    def exists(self) -> bool:
        return self._exists

    @property
    def size(self) -> int:
        if not self._exists:
            return -1

        return 0 if self._is_dir else self._size

    @property
    def is_dir(self) -> bool:
        return self._exists and self._is_dir

    @property
    def is_file(self) -> bool:
        return self._exists and not self._is_dir

    @property
    def last_modified(self) -> datetime:
        return self._last_modified

    @property
    def checksums(self) -> dict[str, str]:
        return self._checksums


class DavProperty:
    """Helper class to encapsulate select live DAV properties of a single
    resource, as retrieved via a PROPFIND request.

    Parameters
    ----------
    response : `eTree.Element` or `None`
        The XML response defining the DAV property.
    """

    # Regular expression to compare against the 'status' element of a
    # PROPFIND response's 'propstat' element.
    _status_ok_rex = re.compile(r"^HTTP/.* 200 .*$", re.IGNORECASE)

    def __init__(self, response: eTree.Element | None):
        self._href: str = ""
        self._displayname: str = ""
        self._collection: bool = False
        self._getlastmodified: str = ""
        self._getcontentlength: int = -1
        self._checksums: dict[str, str] = {}

        if response is not None:
            self._parse(response)

    def _parse(self, response: eTree.Element) -> None:
        # Extract 'href'.
        if (element := response.find("./{DAV:}href")) is not None:
            # We need to use "str(element.text)" instead of "element.text" to
            # keep mypy happy.
            self._href = str(element.text).strip()
        else:
            raise ValueError(
                "Property 'href' expected but not found in PROPFIND response: "
                f"{eTree.tostring(response, encoding='unicode')}"
            )

        for propstat in response.findall("./{DAV:}propstat"):
            # Only extract properties of interest with status OK.
            status = propstat.find("./{DAV:}status")
            if status is None or not self._status_ok_rex.match(str(status.text)):
                continue

            for prop in propstat.findall("./{DAV:}prop"):
                # Parse "collection".
                if (element := prop.find("./{DAV:}resourcetype/{DAV:}collection")) is not None:
                    self._collection = True

                # Parse "getlastmodified".
                if (element := prop.find("./{DAV:}getlastmodified")) is not None:
                    self._getlastmodified = str(element.text)

                # Parse "getcontentlength".
                if (element := prop.find("./{DAV:}getcontentlength")) is not None:
                    self._getcontentlength = int(str(element.text))

                # Parse "displayname".
                if (element := prop.find("./{DAV:}displayname")) is not None:
                    self._displayname = str(element.text)

                # Parse "Checksums".
                if (element := prop.find("./{http://www.dcache.org/2013/webdav}Checksums")) is not None:
                    self._checksums = self._parse_checksums(element.text)

        # Some webDAV servers don't include the 'displayname' property in the
        # response so try to infer it from the value of the 'href' property.
        # Depending on the server the href value may end with '/'.
        if not self._displayname:
            self._displayname = os.path.basename(self._href.rstrip("/"))

        # Some webDAV servers do not append a "/" to the href of directories.
        # Ensure we include a single final "/" in our response.
        if self._collection:
            self._href = self._href.rstrip("/") + "/"

        # Force a size of 0 for collections.
        if self._collection:
            self._getcontentlength = 0

    def _parse_checksums(self, checksums: str | None) -> dict[str, str]:
        # The checksums argument is of the form
        #   md5=MyS/wljSzI9WYiyrsuyoxw==,adler32=23b104f2
        result: dict[str, str] = {}
        if checksums is not None:
            for checksum in checksums.split(","):
                if (pos := checksum.find("=")) != -1:
                    algorithm, value = checksum[:pos].lower(), checksum[pos + 1 :]
                    if algorithm == "md5":
                        # dCache documentation about how it encodes the
                        # MD5 checksum:
                        #
                        # https://www.dcache.org/manuals/UserGuide-10.2/webdav.shtml#checksums
                        result[algorithm] = bytes.hex(base64.standard_b64decode(value))
                    else:
                        result[algorithm] = value

        return result

    @property
    def exists(self) -> bool:
        # It is either a directory or a file with length of at least zero.
        return self._collection or self._getcontentlength >= 0

    @property
    def is_dir(self) -> bool:
        return self._collection

    @property
    def is_file(self) -> bool:
        return not self._collection

    @property
    def last_modified(self) -> datetime:
        if not self._getlastmodified:
            return datetime.min

        # Last modified timestamp is of the form:
        #   'Wed, 12 Mar 2025 10:11:13 GMT'
        return datetime.strptime(self._getlastmodified, "%a, %d %b %Y %H:%M:%S %Z")

    @property
    def size(self) -> int:
        return self._getcontentlength

    @property
    def name(self) -> str:
        return self._displayname

    @property
    def href(self) -> str:
        return self._href

    @property
    def checksums(self) -> dict[str, str]:
        return self._checksums


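# Standalone sketch (not part of this module) of decoding the dCache
# 'Checksums' property value handled by _parse_checksums() above; the input
# string is illustrative.
import base64

raw = "md5=MyS/wljSzI9WYiyrsuyoxw==,adler32=23b104f2"
checksums: dict[str, str] = {}
for item in raw.split(","):
    algorithm, _, value = item.partition("=")
    algorithm = algorithm.lower()
    if algorithm == "md5":
        # dCache base64-encodes the MD5 digest; convert it to the usual
        # 32-character lowercase hexadecimal form.
        checksums[algorithm] = base64.standard_b64decode(value).hex()
    else:
        checksums[algorithm] = value
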
class DavPropfindParser:
    """Helper class to parse the response body of a PROPFIND request."""

    def __init__(self) -> None:
        return

    def parse(self, body: bytes) -> list[DavProperty]:
        """Parse the XML-encoded contents of the response body to a webDAV
        PROPFIND request.

        Parameters
        ----------
        body : `bytes`
            XML-encoded response body to a PROPFIND request.

        Returns
        -------
        responses : `list` [ `DavProperty` ]
            Parsed content of the response.

        Notes
        -----
        It is expected that there is at least one response in `body`;
        otherwise this function raises.
        """
        # A response body to a PROPFIND request is of the form (indented for
        # readability):
        #
        # <?xml version="1.0" encoding="UTF-8"?>
        # <D:multistatus xmlns:D="DAV:">
        #     <D:response>
        #         <D:href>path/to/resource</D:href>
        #         <D:propstat>
        #             <D:prop>
        #                 <D:resourcetype>
        #                     <D:collection xmlns:D="DAV:"/>
        #                 </D:resourcetype>
        #                 <D:getlastmodified>
        #                     Fri, 27 Jan 2023 13:59:01 GMT
        #                 </D:getlastmodified>
        #                 <D:getcontentlength>
        #                     12345
        #                 </D:getcontentlength>
        #             </D:prop>
        #             <D:status>
        #                 HTTP/1.1 200 OK
        #             </D:status>
        #         </D:propstat>
        #     </D:response>
        #     <D:response>
        #         ...
        #     </D:response>
        #     <D:response>
        #         ...
        #     </D:response>
        # </D:multistatus>

        # Scan all the 'response' elements and extract the relevant
        # properties.
        decoded_body: str = body.decode("utf-8").strip()
        responses = []
        multistatus = eTree.fromstring(decoded_body)
        for response in multistatus.findall("./{DAV:}response"):
            responses.append(DavProperty(response))

        if responses:
            return responses
        else:
            # Could not parse the body.
            raise ValueError(f"Unable to parse response for PROPFIND request: {decoded_body}")


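# Standalone sketch (not part of this module's code) of feeding a
# multistatus body like the one pictured in the comment above to
# DavPropfindParser; the XML document is illustrative.
from lsst.resources.davutils import DavPropfindParser

body = b"""<?xml version="1.0" encoding="UTF-8"?>
<D:multistatus xmlns:D="DAV:">
  <D:response>
    <D:href>/path/to/resource/</D:href>
    <D:propstat>
      <D:prop>
        <D:resourcetype><D:collection/></D:resourcetype>
        <D:getlastmodified>Fri, 27 Jan 2023 13:59:01 GMT</D:getlastmodified>
        <D:getcontentlength>0</D:getcontentlength>
      </D:prop>
      <D:status>HTTP/1.1 200 OK</D:status>
    </D:propstat>
  </D:response>
</D:multistatus>"""

properties = DavPropfindParser().parse(body)
assert properties[0].is_dir and properties[0].href == "/path/to/resource/"
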
class TokenAuthorizer:
    """Attach a bearer token 'Authorization' header to each request.

    Parameters
    ----------
    token : `str`
        Can be either the path to a local file which contains the
        value of the token or the token itself. If `token` is a file
        it must be protected so that only the owner can read and write it.
    """

    def __init__(self, token: str | None = None) -> None:
        self._token: str | None = None
        self._path: str | None = None
        self._mtime: float = -1.0
        if token is None:
            return

        self._token = token
        if os.path.isfile(token):
            self._path = os.path.abspath(token)
            if not self._is_protected(self._path):
                raise PermissionError(
                    f"""Authorization token file at {self._path} must be protected for access only """
                    """by its owner"""
                )
            self._refresh()

    def _refresh(self) -> None:
        """Read the token file (if any) if its modification time is more
        recent than the last time we read it.
        """
        if self._path is None:
            return

        if (mtime := os.stat(self._path).st_mtime) > self._mtime:
            log.debug("Reading authorization token from file %s", self._path)
            self._mtime = mtime
            with open(self._path) as f:
                self._token = f.read().rstrip("\n")

    def _is_protected(self, filepath: str) -> bool:
        """Return `True` if the permissions of the file at `filepath` only
        allow for access by its owner.

        Parameters
        ----------
        filepath : `str`
            Path of a local file.
        """
        if not os.path.isfile(filepath):
            return False

        mode = stat.S_IMODE(os.stat(filepath).st_mode)
        owner_accessible = bool(mode & stat.S_IRWXU)
        group_accessible = bool(mode & stat.S_IRWXG)
        other_accessible = bool(mode & stat.S_IRWXO)
        return owner_accessible and not group_accessible and not other_accessible

    def set_authorization(self, headers: dict[str, str]) -> None:
        """Add the 'Authorization' header to `headers`.

        Parameters
        ----------
        headers : `dict` [ `str`, `str` ]
            Dict to augment with authorization information.
        """
        if self._token is None:
            return

        self._refresh()
        headers["Authorization"] = f"Bearer {self._token}"


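# Standalone sketch (not part of this module) of creating a token file that
# satisfies the owner-only permission check enforced by TokenAuthorizer
# above; the path and token value are illustrative.
import os
import stat

path = "/tmp/dav_token"
with open(path, "w") as f:
    f.write("my-bearer-token\n")
os.chmod(path, stat.S_IRUSR | stat.S_IWUSR)  # 0o600: owner read/write only

mode = stat.S_IMODE(os.stat(path).st_mode)
assert mode & (stat.S_IRWXG | stat.S_IRWXO) == 0  # no group/other access
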
def expand_vars(path: str | None) -> str | None:
    """Expand the environment variables in `path` and return the path with
    the values of the variables expanded.

    Parameters
    ----------
    path : `str` or `None`
        Absolute or relative path which may include an environment variable
        (e.g. '$HOME/path/to/my/file').

    Returns
    -------
    path : `str` or `None`
        The path with the values of the environment variables expanded.
    """
    return None if path is None else os.path.expandvars(path)


def dump_response(method: str, resp: HTTPResponse) -> None:
    """Dump response for debugging purposes.

    Parameters
    ----------
    method : `str`
        Method name to include in log output.
    resp : `HTTPResponse`
        Response to dump.
    """
    log.debug("%s %s", method, resp.geturl())
    for header, value in resp.headers.items():
        log.debug("  %s: %s", header, value)
    log.debug("  response body length: %d", len(resp.data.decode("utf-8")))