thds.adls 4.3.20251008013223__py3-none-any.whl → 4.3.20251009011718__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thds/adls/_upload.py +15 -10
- thds/adls/source.py +8 -4
- thds/adls/source_tree.py +1 -1
- thds/adls/upload.py +11 -14
- {thds_adls-4.3.20251008013223.dist-info → thds_adls-4.3.20251009011718.dist-info}/METADATA +1 -1
- {thds_adls-4.3.20251008013223.dist-info → thds_adls-4.3.20251009011718.dist-info}/RECORD +9 -9
- {thds_adls-4.3.20251008013223.dist-info → thds_adls-4.3.20251009011718.dist-info}/WHEEL +0 -0
- {thds_adls-4.3.20251008013223.dist-info → thds_adls-4.3.20251009011718.dist-info}/entry_points.txt +0 -0
- {thds_adls-4.3.20251008013223.dist-info → thds_adls-4.3.20251009011718.dist-info}/top_level.txt +0 -0
thds/adls/_upload.py
CHANGED

@@ -38,17 +38,22 @@ def _try_default_hash(data: hashes.AnyStrSrc) -> ty.Optional[hashing.Hash]:
     return None
 
 
+UploadSrc = ty.Union[Path, bytes, ty.IO[bytes], ty.Iterable[bytes]]
+
+
+def upload_src_len(upload_src: UploadSrc, default: int = 0) -> int:
+    if isinstance(upload_src, Path) and upload_src.exists():
+        return upload_src.stat().st_size
+    try:
+        return len(upload_src)  # type: ignore
+    except TypeError as te:
+        logger.debug(f"failed to get length? {repr(te)} for {upload_src!r}")
+        return default
+
+
 def _too_small_to_skip_upload(data: hashes.AnyStrSrc, min_size_for_remote_check: int) -> bool:
-    def _len() -> int:
-        if isinstance(data, Path) and data.exists():
-            return data.stat().st_size
-        try:
-            return len(data)  # type: ignore
-        except TypeError as te:
-            logger.debug(f"failed to get length? {repr(te)} for {data}")
-            return min_size_for_remote_check + 1
-
-    return _len() < min_size_for_remote_check
+    len_ = upload_src_len(data) or min_size_for_remote_check + 1
+    return len_ < min_size_for_remote_check
 
 
 class UploadDecision(ty.NamedTuple):
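This release centralizes the size probe in the new `upload_src_len` helper, which the other modules below now reuse. A minimal usage sketch, assuming only what the diff above shows (the file path is a hypothetical placeholder; the helper lives in the private `thds.adls._upload` module):

```python
# Sketch of the new helper's behavior, based on the diff above.
from pathlib import Path

from thds.adls._upload import upload_src_len

upload_src_len(b"hello")                     # 5 -> bytes objects report len()
upload_src_len(Path("some/local/file.bin"))  # stat().st_size, if the path exists
upload_src_len(iter([b"a", b"b"]))           # 0 -> no len() available, so the default is returned
```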
thds/adls/source.py
CHANGED

@@ -30,7 +30,9 @@ def _adls_uri_source_download_handler(uri: str) -> ty.Optional[source.Downloader
 source.register_download_handler("thds.adls", _adls_uri_source_download_handler)
 
 
-def from_adls(
+def from_adls(
+    uri_or_fqn: ty.Union[str, AdlsFqn], hash: ty.Optional[Hash] = None, size: int = 0
+) -> source.Source:
     """Flexible, public interface to creating Sources from any ADLS-like reference.
 
     Does NOT automatically fetch a checksumming hash from the ADLS URI if it's not
@@ -40,7 +42,7 @@ def from_adls(uri_or_fqn: ty.Union[str, AdlsFqn], hash: ty.Optional[Hash] = None
     r_fqn = resolve_any(uri_or_fqn)
     if not r_fqn:
         raise ValueError(f"Could not resolve {uri_or_fqn} to an ADLS FQN")
-    return source.Source(str(r_fqn), hash)
+    return source.Source(str(r_fqn), hash, size)
 
 
 source.register_from_uri_handler(
@@ -55,13 +57,15 @@ def get_with_hash(fqn_or_uri: ty.Union[AdlsFqn, str]) -> source.Source:
     """
     fqn = AdlsFqn.parse(fqn_or_uri) if isinstance(fqn_or_uri, str) else fqn_or_uri
     with blob_not_found_translation(fqn):
-        uri_hashes = hashes.extract_hashes_from_props(get_file_properties(fqn))
+        props = get_file_properties(fqn)
+        uri_hashes = hashes.extract_hashes_from_props(props)
         if not uri_hashes:
             raise ValueError(
                 f"ADLS file {fqn} must have a hash to use this function. "
                 "If you know the hash, use `from_adls` with the hash parameter."
             )
-        return from_adls(fqn, next(iter(uri_hashes.values())))
+        size = int(props.get("size")) or 0
+        return from_adls(fqn, next(iter(uri_hashes.values())), size)
 
 
 def with_md5b64(uri_or_fqn: ty.Union[str, AdlsFqn], *, md5b64: str = "") -> source.Source:
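`from_adls` now accepts and forwards a `size` to `source.Source`, and `get_with_hash` fills it in from the blob's properties alongside the hash. A hedged sketch of the widened call, where the URI form and the numbers are hypothetical placeholders:

```python
# Sketch of the updated from_adls signature, per the diff above.
from thds.adls.source import from_adls

src = from_adls(
    "adls://myaccount/mycontainer/path/to/blob.parquet",  # anything resolve_any can turn into an AdlsFqn
    hash=None,       # optionally a known Hash, as before
    size=1_234_567,  # new in this release: byte size carried on the resulting Source
)
```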
thds/adls/source_tree.py
CHANGED

@@ -60,7 +60,7 @@ def from_path(adls_path: uri.UriIsh, match_suffix: str = "") -> SourceTree:
     container_root = root_fqn.root()
     return SourceTree(
         sources=[
-            source.from_adls(container_root / blob_meta.path, hash=blob_meta.hash)
+            source.from_adls(container_root / blob_meta.path, hash=blob_meta.hash, size=blob_meta.size)
             for blob_meta in list_blob_meta(container_client, root_fqn.path, match_suffix=match_suffix)
         ],
         higher_logical_root=fqn.split(root_fqn)[-1],
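With this one-line change, every Source built by `source_tree.from_path` now carries the blob's size from `list_blob_meta`, not just its hash. A rough usage sketch, where the ADLS prefix and suffix are hypothetical placeholders:

```python
# Sketch only: prefix and match_suffix below are illustrative.
from thds.adls import source_tree

tree = source_tree.from_path("adls://myaccount/mycontainer/some/prefix/", match_suffix=".parquet")
for src in tree.sources:
    print(src)  # each entry is now built with hash=blob_meta.hash and size=blob_meta.size
```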
thds/adls/upload.py
CHANGED

@@ -15,7 +15,7 @@ from thds.core import files, fretry, link, log, scope, source, tmp
 
 from . import azcopy, hashes
 from ._progress import report_upload_progress
-from ._upload import upload_decision_and_metadata
+from ._upload import UploadSrc, upload_decision_and_metadata, upload_src_len
 from .conf import UPLOAD_FILE_MAX_CONCURRENCY
 from .file_lock import file_lock
 from .fqn import AdlsFqn
@@ -26,9 +26,6 @@ logger = log.getLogger(__name__)
 _SLOW_CONNECTION_WORKAROUND = 14400  # seconds
 
 
-UploadSrc = ty.Union[Path, bytes, ty.IO[ty.AnyStr], ty.Iterable[bytes]]
-
-
 def _write_through_local_cache(local_cache_path: Path, data: UploadSrc) -> ty.Optional[Path]:
     @scope.bound
     def _try_write_through() -> bool:
@@ -41,8 +38,8 @@ def _write_through_local_cache(local_cache_path: Path, data: UploadSrc) -> ty.Op
         out = scope.enter(tmp.temppath_same_fs(local_cache_path))
         if hasattr(data, "read") and hasattr(data, "seek"):
             with open(out, "wb") as f:
-                f.write(data.read())
-            data.seek(0)
+                f.write(data.read())
+            data.seek(0)
         link.link_or_copy(out, local_cache_path)
         return True
 
@@ -107,6 +104,7 @@ def upload(
     blob_container_client = get_global_blob_container_client(dest_.sa, dest_.container)
     blob_client = blob_container_client.get_blob_client(dest_.path)
     decision = upload_decision_and_metadata(blob_client.get_blob_properties, src)  # type: ignore [arg-type]
+    n_bytes = upload_src_len(src, default=0)
 
     def source_from_meta() -> source.Source:
         best_hash = next(iter(hashes.extract_hashes_from_metadata(decision.metadata)), None)
@@ -114,17 +112,14 @@ def upload(
             assert best_hash, "A hash should always be calculable for a local path."
             return source.from_file(src, hash=best_hash, uri=str(dest_))
 
-        return source.from_uri(str(dest_), hash=best_hash)
+        return source.from_uri(str(dest_), hash=best_hash, size=n_bytes)
 
     if decision.upload_required:
         # set up some bookkeeping
-        n_bytes = None  # if we pass 0 to upload_blob, it truncates the write now
         bytes_src: ty.Union[bytes, ty.IO, ty.Iterable[bytes]]
         if isinstance(src, Path):
-            n_bytes = src.stat().st_size
             bytes_src = scope.enter(open(src, "rb"))
         elif isinstance(src, bytes):
-            n_bytes = len(src)
            bytes_src = src
         else:
             bytes_src = src
@@ -132,7 +127,7 @@ def upload(
         if "metadata" in upload_data_kwargs:
             decision.metadata.update(upload_data_kwargs.pop("metadata"))
 
-        if azcopy.upload.should_use_azcopy(n_bytes
+        if azcopy.upload.should_use_azcopy(n_bytes) and isinstance(src, Path):
             logger.info("Using azcopy to upload %s to %s", src, dest_)
             try:
                 azcopy.upload.run(
@@ -140,7 +135,7 @@ def upload(
                         src, dest_, content_type=content_type, metadata=decision.metadata, overwrite=True
                     ),
                     dest_,
-                    n_bytes
+                    n_bytes,
                 )
                 return source_from_meta()
 
@@ -158,9 +153,11 @@ def upload(
             # This is both faster, as well as simpler to reason about, and
             # in fact was the behavior I had been assuming all along...
             blob_client.upload_blob(
-                report_upload_progress(ty.cast(ty.IO, bytes_src), str(dest_), n_bytes
+                report_upload_progress(ty.cast(ty.IO, bytes_src), str(dest_), n_bytes),
                 overwrite=True,
-                length=
+                length=(
+                    n_bytes if n_bytes > 0 else None
+                ),  # if we pass 0 to upload_blob, it truncates the write now
                 content_settings=upload_content_settings,
                 connection_timeout=_SLOW_CONNECTION_WORKAROUND,
                 max_concurrency=UPLOAD_FILE_MAX_CONCURRENCY(),
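The last hunk replaces the old `n_bytes = None` bookkeeping with an inline guard: a positive, known length is passed to `upload_blob`, while an unknown length (0 here) becomes `None` so the SDK reads the stream to its end instead of truncating the write. The pattern in isolation, as a sketch (the helper name is illustrative only):

```python
# Sketch of the length guard adopted in the diff above: 0 means "unknown",
# and passing 0 to upload_blob would truncate the write, so it becomes None.
import typing as ty


def blob_upload_length(n_bytes: int) -> ty.Optional[int]:
    return n_bytes if n_bytes > 0 else None


assert blob_upload_length(4853) == 4853
assert blob_upload_length(0) is None
```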
{thds_adls-4.3.20251008013223.dist-info → thds_adls-4.3.20251009011718.dist-info}/RECORD
CHANGED

@@ -1,7 +1,7 @@
 thds/adls/__init__.py,sha256=Z450uhM5BIxgve9qkz_Lw7biVWFKp3Uz_iKaSPYEf1Y,1073
 thds/adls/_fork_protector.py,sha256=WML1Ul-G_MCziBOyRp6GoNq1IGktn8-OQVjny1IZ1bs,950
 thds/adls/_progress.py,sha256=0cmFZ2sOEogQY25AkMAhcunXO69FdlbkD1rU_zmLAp0,6648
-thds/adls/_upload.py,sha256=
+thds/adls/_upload.py,sha256=rN-nuRZur9VvSUywaQezUi_qedj-xU30fTBG6rasuIA,4853
 thds/adls/abfss.py,sha256=nCFfcdwLiwtz_MdbrpLJ9WOhoz0zHIVxsf-tarorEwM,727
 thds/adls/cached.py,sha256=up1F5JOVXdmwdZ8RAB2UDgiy6ooLg8IMULohBh75VpQ,3034
 thds/adls/conf.py,sha256=nTw3X1ilC3A_905jZH-rWXFsESeHAKQn5IghvfX2VIo,1991
@@ -24,9 +24,9 @@ thds/adls/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 thds/adls/ro_cache.py,sha256=P-UVFZhqnE5wojqYmRVWZcqjl-pM1xVMm9VAm3nXlnA,4769
 thds/adls/sas_tokens.py,sha256=mArbB_GYohevOmArw_1gKqVUWpv6kG8Hsbvdrhbtnbg,1957
 thds/adls/shared_credential.py,sha256=-x42aXoIM001KW59oS8PpuXQd4-F2vg-1gB6OMHlpk4,4602
-thds/adls/source.py,sha256=
-thds/adls/source_tree.py,sha256=
-thds/adls/upload.py,sha256=
+thds/adls/source.py,sha256=5t9bc3tUxdh4QdbRvvVd9xF9_Rv4v7CDAqE09M3rE1g,2657
+thds/adls/source_tree.py,sha256=FqVXgvfYPiowrWhRsXBItjvB7t41JRI3sCFVAHxjwgI,2610
+thds/adls/upload.py,sha256=XJvygYzn1r6pZAl460s5j-kY9dgfQ6qeoddID7R6OQ0,6621
 thds/adls/uri.py,sha256=9MXuW_KfpPvzBc4ERxuTJ3vvi_6yr7e1kMAW9mx2zXM,1414
 thds/adls/azcopy/__init__.py,sha256=qn2dmT92EHcrtaQ8uwRoUgvtF6Fu3NQbhZItOBdIBmY,45
 thds/adls/azcopy/download.py,sha256=FOtYyYh7ZXNWNdkj04yTV26lxcKOVj-YhS2p_EclYxA,6526
@@ -38,8 +38,8 @@ thds/adls/tools/download.py,sha256=CW2cWbCRdUqisVVVoqqvqk5Ved7pPGTkwnZj3uV0jy4,1
 thds/adls/tools/ls.py,sha256=OgEaIfTK359twlZIj-A0AW_nv81Z6zi0b9Tw6OJJfWA,1083
 thds/adls/tools/ls_fast.py,sha256=Nowc-efAL_Y4ybPwZzKIeh7KGIjfecRzdWvJZcBzq_8,585
 thds/adls/tools/upload.py,sha256=5WyWkpuVp2PETZ3O3ODlq8LXszSHU73ZMnIDZXPJdC8,442
-thds_adls-4.3.
-thds_adls-4.3.
-thds_adls-4.3.
-thds_adls-4.3.
-thds_adls-4.3.
+thds_adls-4.3.20251009011718.dist-info/METADATA,sha256=H8nvvzUdcM2NU_NQVAn7zco92Buk26EqGkqj2O4NVjk,587
+thds_adls-4.3.20251009011718.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+thds_adls-4.3.20251009011718.dist-info/entry_points.txt,sha256=rtVF0A2MMTYUsBScF6b3AlOuk2Vm02QK7Tc2bDcDpk0,200
+thds_adls-4.3.20251009011718.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
+thds_adls-4.3.20251009011718.dist-info/RECORD,,

{thds_adls-4.3.20251008013223.dist-info → thds_adls-4.3.20251009011718.dist-info}/WHEEL
RENAMED
File without changes

{thds_adls-4.3.20251008013223.dist-info → thds_adls-4.3.20251009011718.dist-info}/entry_points.txt
RENAMED
File without changes

{thds_adls-4.3.20251008013223.dist-info → thds_adls-4.3.20251009011718.dist-info}/top_level.txt
RENAMED
File without changes