thds.adls 4.2.20250926202021__py3-none-any.whl → 4.4.20251117191451__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
thds/adls/__init__.py CHANGED
@@ -2,6 +2,7 @@ from thds import core
 
 from . import ( # noqa: F401
     abfss,
+    blob_meta,
     defaults,
     etag,
     fqn,
thds/adls/_upload.py CHANGED
@@ -38,17 +38,22 @@ def _try_default_hash(data: hashes.AnyStrSrc) -> ty.Optional[hashing.Hash]:
     return None
 
 
+UploadSrc = ty.Union[Path, bytes, ty.IO[bytes], ty.Iterable[bytes]]
+
+
+def upload_src_len(upload_src: UploadSrc, default: int = 0) -> int:
+    if isinstance(upload_src, Path) and upload_src.exists():
+        return upload_src.stat().st_size
+    try:
+        return len(upload_src) # type: ignore
+    except TypeError as te:
+        logger.debug(f"failed to get length? {repr(te)} for {upload_src!r}")
+        return default
+
+
 def _too_small_to_skip_upload(data: hashes.AnyStrSrc, min_size_for_remote_check: int) -> bool:
-    def _len() -> int:
-        if isinstance(data, Path) and data.exists():
-            return data.stat().st_size
-        try:
-            return len(data) # type: ignore
-        except TypeError as te:
-            logger.debug(f"failed to get length? {repr(te)} for {data}")
-            return min_size_for_remote_check + 1
-
-    return _len() < min_size_for_remote_check
+    len_ = upload_src_len(data) or min_size_for_remote_check + 1
+    return len_ < min_size_for_remote_check
 
 
 class UploadDecision(ty.NamedTuple):
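For orientation, here is a minimal standalone sketch of how the extracted `upload_src_len` helper behaves for the three flavors of `UploadSrc`; it mirrors the logic added above rather than importing the private `thds.adls._upload` module.

```python
import io
import typing as ty
from pathlib import Path

UploadSrc = ty.Union[Path, bytes, ty.IO[bytes], ty.Iterable[bytes]]


def upload_src_len(upload_src: UploadSrc, default: int = 0) -> int:
    # Paths are measured with stat(); sized objects (bytes, lists) with len();
    # unsized streams and generators fall back to `default`.
    if isinstance(upload_src, Path) and upload_src.exists():
        return upload_src.stat().st_size
    try:
        return len(upload_src)  # type: ignore
    except TypeError:
        return default


assert upload_src_len(b"abcde") == 5                            # bytes have a length
assert upload_src_len(io.BytesIO(b"abcde")) == 0                # streams do not
assert upload_src_len(Path("/no/such/file"), default=-1) == -1  # missing path -> default
```

`_too_small_to_skip_upload` then treats a result of 0 as "unknown length" and substitutes `min_size_for_remote_check + 1`, so sources of unknown length are never treated as too small.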
thds/adls/blob_meta.py ADDED
@@ -0,0 +1,38 @@
+import typing as ty
+from dataclasses import dataclass
+
+from azure.storage.blob import BlobProperties, ContainerClient
+
+from thds.core import hashing
+
+from . import hashes
+
+
+@dataclass
+class BlobMeta:
+    path: str
+    size: int
+    hash: ty.Optional[hashing.Hash]
+    metadata: dict[str, str]
+
+
+def to_blob_meta(blob_props: BlobProperties) -> BlobMeta:
+    return BlobMeta(
+        blob_props.name,
+        blob_props.size,
+        next(iter(hashes.extract_hashes_from_props(blob_props).values()), None),
+        blob_props.metadata or {},
+    )
+
+
+def is_dir(blob_meta: BlobMeta) -> bool:
+    return blob_meta.metadata.get("hdi_isfolder", "false") == "true"
+
+
+# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.containerclient?view=azure-python#azure-storage-blob-containerclient-list-blobs
+# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.blobproperties?view=azure-python
+# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.contentsettings?view=azure-python
+def yield_blob_meta(container_client: ContainerClient, root_dir: str) -> ty.Iterator[BlobMeta]:
+    for blob_props in container_client.list_blobs(name_starts_with=root_dir, include=["metadata"]):
+        # `list_blobs` does not include metadata by default, so we need to explicitly specify including it
+        yield to_blob_meta(blob_props)
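An illustrative sketch of using the new module directly; the account, container, and prefix below are placeholders, and authenticating with `DefaultAzureCredential` is an assumption (the package itself normally goes through its `global_client` helpers).

```python
from azure.identity import DefaultAzureCredential
from azure.storage.blob import ContainerClient

from thds.adls import blob_meta

# Placeholder account/container/prefix, for illustration only.
client = ContainerClient(
    "https://myaccount.blob.core.windows.net",
    "mycontainer",
    credential=DefaultAzureCredential(),
)

for meta in blob_meta.yield_blob_meta(client, "datasets/2025/"):
    if blob_meta.is_dir(meta):
        # directory placeholder blobs carry the hdi_isfolder metadata flag
        continue
    print(meta.path, meta.size, meta.hash)
```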
thds/adls/download.py CHANGED
@@ -16,7 +16,7 @@ from thds.core.types import StrOrPath
 
 from . import azcopy, errors, etag, hashes
 from ._progress import report_download_progress
-from .download_lock import download_lock
+from .file_lock import file_lock
 from .fqn import AdlsFqn
 from .ro_cache import Cache, from_cache_path_to_local, from_local_path_to_cache
 
@@ -240,12 +240,12 @@ def _download_or_use_verified_cached_coroutine( # noqa: C901
     # No cache hit, so its time to prepare to download. if a cache was provided, we will
     # _put_ the resulting file in it.
 
-    file_lock = str(cache.path(fqn) if cache else local_path)
+    file_lock_str = str(cache.path(fqn) if cache else local_path)
     # create lockfile name from the (shared) cache path if present, otherwise the final
     # destination. Non-cache users may then still incur multiple downloads in parallel,
     # but if you wanted to coordinate then you should probably have been using the global
     # cache in the first place.
-    _dl_scope.enter(download_lock(file_lock))
+    _dl_scope.enter(file_lock(file_lock_str))
 
     # re-attempt cache hit - we may have gotten the lock after somebody else downloaded
     if file_result := attempt_cache_hit():
thds/adls/download_lock.py → thds/adls/file_lock.py RENAMED
@@ -9,18 +9,18 @@ from thds.core import config, home, log
 
 from .md5 import hex_md5_str
 
-DOWNLOAD_LOCKS_DIR = config.item("dir", home.HOMEDIR() / ".thds/adls/download-locks", parse=Path)
+FILELOCKS_DIR = config.item("dir", home.HOMEDIR() / ".thds/adls/file-locks", parse=Path)
 _CLEAN_UP_LOCKFILES_AFTER_TIME = timedelta(hours=24)
 _CLEAN_UP_LOCKFILES_EVERY = timedelta(hours=1).total_seconds()
 _LAST_CLEANED_BY_THIS_PROCESS = time.monotonic() - _CLEAN_UP_LOCKFILES_EVERY
 logger = log.getLogger(__name__)
 
 
-def _clean_download_locks() -> int:
+def _clean_file_locks() -> int:
     deleted = 0
     deletion_threshold = time.time() - _CLEAN_UP_LOCKFILES_AFTER_TIME.total_seconds()
     try:
-        for f in DOWNLOAD_LOCKS_DIR().iterdir():
+        for f in FILELOCKS_DIR().rglob("*"):
             fstat = f.stat()
             if stat.S_ISREG(fstat.st_mode) and fstat.st_mtime < deletion_threshold:
                 f.unlink()
@@ -29,20 +29,20 @@ def _clean_download_locks() -> int:
         # this should be, hopefully, both very rare and completely inconsequential as to
         # program correctness. if you see this happen multiple times, you may have some
         # read-only files or something and want to manually clean up this directory.
-        logger.exception("Failed to clean download locks directory.")
+        logger.exception("Failed to clean file locks directory.")
     return deleted
 
 
-def _occasionally_clean_download_locks():
+def _occasionally_clean_file_locks():
     global _LAST_CLEANED_BY_THIS_PROCESS
     # do this about once an hour
     if time.monotonic() > _LAST_CLEANED_BY_THIS_PROCESS + _CLEAN_UP_LOCKFILES_EVERY:
         _LAST_CLEANED_BY_THIS_PROCESS = time.monotonic()
         # minor race condition with other threads but it doesn't really matter.
-        _clean_download_locks()
+        _clean_file_locks()
 
 
-def download_lock(download_unique_str: str) -> FileLock:
+def file_lock(lock_unique_str: str, locktype: str = "download") -> FileLock:
     """Note that the lockfiles will never be deleted automatically.
     https://py-filelock.readthedocs.io/en/latest/api.html#filelock.BaseFileLock.release
 
@@ -50,7 +50,7 @@ def download_lock(download_unique_str: str) -> FileLock:
     https://stackoverflow.com/questions/58098634/why-does-the-python-filelock-library-delete-lockfiles-on-windows-but-not-unix
 
     This means local developers would have a whole bunch of zero-byte files in their
-    download locks directory. So, we take a slightly idiosyncratic approach to cleaning
+    file locks directory. So, we take a slightly idiosyncratic approach to cleaning
     this up: not wanting to run this code on every download, but also not wanting
     developers to see an infinitely-growing mess. Since parallel downloads will
     (generally) not constitute a correctness issue, the 'safest' time to clean it up will
@@ -58,11 +58,11 @@ def download_lock(download_unique_str: str) -> FileLock:
     we can get rid of old lockfiles after they've existed for more than 24 hours, since
     it's quite rare that a download would last that long.
     """
-    DOWNLOAD_LOCKS_DIR().mkdir(parents=True, exist_ok=True)
-    _occasionally_clean_download_locks()
+    lock_type_dir = FILELOCKS_DIR() / locktype
+    lock_type_dir.mkdir(parents=True, exist_ok=True)
+    _occasionally_clean_file_locks()
     return FileLock(
-        DOWNLOAD_LOCKS_DIR()
-        / (download_unique_str.split("/")[-1][:50] + hex_md5_str(download_unique_str)),
+        lock_type_dir / (lock_unique_str.split("/")[-1][:50] + hex_md5_str(lock_unique_str)),
         # is_singleton=True,
         # critical for keeping this reentrant without passing the lock around.
         # see https://github.com/tox-dev/filelock/issues/315#issuecomment-2016797681
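A short sketch of the generalized helper, which now namespaces lockfiles by `locktype` under `~/.thds/adls/file-locks/`; the destination string below is a placeholder.

```python
from thds.adls.file_lock import file_lock

# The same lock machinery previously used only for downloads now also guards
# uploads; each locktype gets its own subdirectory under FILELOCKS_DIR, and stale
# lockfiles are still cleaned up opportunistically after 24 hours.
with file_lock("adls://myaccount/mycontainer/out/data.bin", locktype="upload"):
    ...  # only one process on this machine uploads to this destination at a time
```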
thds/adls/list_fast.py CHANGED
@@ -6,20 +6,29 @@ client instead of the file system client.
 
 import typing as ty
 
-from thds.core import parallel, thunks
+from thds.core import log, parallel, source, thunks
 
-from . import global_client
+from . import blob_meta, global_client
+from . import source as adls_source
 from .fqn import AdlsFqn
-from .source_tree import BlobMeta, to_blob_meta, yield_blob_meta
+from .uri import UriIsh, parse_any
 
 R = ty.TypeVar("R")
 
 
+logger = log.getLogger(__name__)
+
+
 def _failfast_parallel(thunks: ty.Iterable[ty.Callable[[], R]]) -> ty.Iterator[R]:
-    yield from (res for _, res in parallel.failfast(parallel.yield_all(parallel.create_keys(thunks))))
+    yield from (
+        res
+        for _, res in parallel.failfast(
+            parallel.yield_all(parallel.create_keys(thunks), progress_logger=logger.debug)
+        )
+    )
 
 
-def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[BlobMeta]:
+def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[blob_meta.BlobMeta]:
     """A fast way to find all blobs in a directory tree; we do this in parallel on
     subdirs, with as much nesting as necessary (though 1 level should be enough for most cases).
 
@@ -29,9 +38,9 @@ def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[Blo
     """
     if layers <= 0:
         # directly yield the blobs
-        yield from yield_blob_meta(
+        yield from blob_meta.yield_blob_meta(
             global_client.get_global_blob_container_client(fqn.sa, fqn.container),
-            fqn.path,
+            fqn.path.rstrip("/") + "/",
         )
         return
 
@@ -69,8 +78,10 @@ def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[Blo
 
     blob_container_client = global_client.get_global_blob_container_client(fqn.sa, fqn.container)
 
-    def _get_blob_meta(blob_name: str) -> BlobMeta:
-        return to_blob_meta(blob_container_client.get_blob_client(blob_name).get_blob_properties())
+    def _get_blob_meta(blob_name: str) -> blob_meta.BlobMeta:
+        return blob_meta.to_blob_meta(
+            blob_container_client.get_blob_client(blob_name).get_blob_properties()
+        )
 
     for blob_meta_iter in (
         _failfast_parallel((thunks.thunking(_get_blob_meta)(file) for file in files)),
@@ -86,10 +97,22 @@ def multilayer_yield_blob_meta(fqn: AdlsFqn, layers: int = 1) -> ty.Iterator[Blo
         yield from blob_meta_iter
 
 
-def is_dir(blob_meta: BlobMeta) -> bool: # TODO move to blob_meta.py once it exists
-    return blob_meta.metadata.get("hdi_isfolder", "false") == "true"
-
-
-def _list_blob_meta(fqn: AdlsFqn, layers: int = 1) -> list[BlobMeta]:
+def _list_blob_meta(fqn: AdlsFqn, layers: int = 1) -> list[blob_meta.BlobMeta]:
     """Only for use within multi_layer_yield_blobs."""
     return list(multilayer_yield_blob_meta(fqn, layers))
+
+
+def multilayer_yield_sources(
+    fqn_or_uri: UriIsh,
+    layers: int = 1,
+    filter_: ty.Callable[[blob_meta.BlobMeta], bool] = lambda _: True,
+) -> ty.Iterator[source.Source]:
+    """
+    if you want to list directories and files, use `multilayer_yield_blob_meta` instead
+    """
+    fqn = parse_any(fqn_or_uri)
+    root = fqn.root()
+    for blob in multilayer_yield_blob_meta(fqn, layers):
+        if not blob_meta.is_dir(blob) and filter_(blob):
+            # ^ a "dir" Source would not make sense
+            yield adls_source.from_adls(root / blob.path, hash=blob.hash, size=blob.size)
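A sketch of the new `multilayer_yield_sources` entry point; the URI is a placeholder, and the filter keeps only parquet blobs.

```python
from thds.adls import list_fast

# Yields a thds.core Source (carrying hash and size when available) for every
# non-directory blob under the prefix, listing one layer of subdirectories in parallel.
for src in list_fast.multilayer_yield_sources(
    "adls://myaccount/mycontainer/datasets/2025",  # placeholder URI
    layers=1,
    filter_=lambda blob: blob.path.endswith(".parquet"),
):
    print(src.uri)
```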
thds/adls/source.py CHANGED
@@ -30,7 +30,9 @@ def _adls_uri_source_download_handler(uri: str) -> ty.Optional[source.Downloader
 source.register_download_handler("thds.adls", _adls_uri_source_download_handler)
 
 
-def from_adls(uri_or_fqn: ty.Union[str, AdlsFqn], hash: ty.Optional[Hash] = None) -> source.Source:
+def from_adls(
+    uri_or_fqn: ty.Union[str, AdlsFqn], hash: ty.Optional[Hash] = None, size: int = 0
+) -> source.Source:
     """Flexible, public interface to creating Sources from any ADLS-like reference.
 
     Does NOT automatically fetch a checksumming hash from the ADLS URI if it's not
@@ -40,7 +42,7 @@ from_adls(uri_or_fqn: ty.Union[str, AdlsFqn], hash: ty.Optional[Hash] = None
     r_fqn = resolve_any(uri_or_fqn)
     if not r_fqn:
         raise ValueError(f"Could not resolve {uri_or_fqn} to an ADLS FQN")
-    return source.Source(str(r_fqn), hash)
+    return source.Source(str(r_fqn), hash, size)
 
 
 source.register_from_uri_handler(
@@ -55,13 +57,15 @@ def get_with_hash(fqn_or_uri: ty.Union[AdlsFqn, str]) -> source.Source:
     """
     fqn = AdlsFqn.parse(fqn_or_uri) if isinstance(fqn_or_uri, str) else fqn_or_uri
     with blob_not_found_translation(fqn):
-        uri_hashes = hashes.extract_hashes_from_props(get_file_properties(fqn))
+        props = get_file_properties(fqn)
+        uri_hashes = hashes.extract_hashes_from_props(props)
     if not uri_hashes:
         raise ValueError(
             f"ADLS file {fqn} must have a hash to use this function. "
             "If you know the hash, use `from_adls` with the hash parameter."
         )
-    return from_adls(fqn, next(iter(uri_hashes.values())))
+    size = int(props.get("size")) or 0
+    return from_adls(fqn, next(iter(uri_hashes.values())), size)
 
 
 def with_md5b64(uri_or_fqn: ty.Union[str, AdlsFqn], *, md5b64: str = "") -> source.Source:
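A sketch of the widened `from_adls` signature; the URI and byte count are illustrative.

```python
from thds.adls import source as adls_source

# The new `size` parameter is threaded through to the underlying thds.core Source
# alongside the (still optional) hash, so callers that already know the blob's
# length can record it without an extra round-trip for blob properties.
src = adls_source.from_adls(
    "adls://myaccount/mycontainer/data/file.bin",  # placeholder URI
    hash=None,
    size=1024,
)
print(src.uri)
```

`get_with_hash` now fills in the size the same way, reading it from the blob properties it already fetches to extract the hash.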
thds/adls/source_tree.py CHANGED
@@ -1,53 +1,6 @@
-import typing as ty
-from dataclasses import dataclass
-
-from azure.storage.blob import BlobProperties, ContainerClient
-
-from thds.core import hashing
 from thds.core.source.tree import SourceTree
 
-from . import fqn, global_client, hashes, source, uri
-
-# TODO refactor BlobMeta into its own module.
-
-
-@dataclass
-class BlobMeta:
-    path: str
-    size: int
-    hash: ty.Optional[hashing.Hash]
-    metadata: dict[str, str]
-
-
-def to_blob_meta(blob_props: BlobProperties) -> BlobMeta:
-    return BlobMeta(
-        blob_props.name,
-        blob_props.size,
-        next(iter(hashes.extract_hashes_from_props(blob_props).values()), None),
-        blob_props.metadata or {},
-    )
-
-
-def yield_blob_meta(container_client: ContainerClient, root_dir: str) -> ty.Iterator[BlobMeta]:
-    for blob_props in container_client.list_blobs(name_starts_with=root_dir, include=["metadata"]):
-        # `list_blobs` does not include metadata by default, so we need to explicitly specify including it
-        yield to_blob_meta(blob_props)
-
-
-# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.containerclient?view=azure-python#azure-storage-blob-containerclient-list-blobs
-# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.blobproperties?view=azure-python
-# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.contentsettings?view=azure-python
-def list_blob_meta(
-    container_client: ContainerClient, root_dir: str, match_suffix: str = ""
-) -> ty.List[BlobMeta]:
-    """Gets the path (relative to the SA/container root), size, and _a_ hash (if available) of all blobs in a directory."""
-    return [
-        blob_meta
-        for blob_meta in yield_blob_meta(container_client, root_dir)
-        if blob_meta.size > 0
-        # container client lists directories as blobs with size 0
-        and blob_meta.path.endswith(match_suffix)
-    ]
+from . import fqn, list_fast, uri
 
 
 def from_path(adls_path: uri.UriIsh, match_suffix: str = "") -> SourceTree:
@@ -56,12 +9,12 @@ def from_path(adls_path: uri.UriIsh, match_suffix: str = "") -> SourceTree:
     """
     root_fqn = uri.parse_any(adls_path)
 
-    container_client = global_client.get_global_blob_container_client(root_fqn.sa, root_fqn.container)
-    container_root = root_fqn.root()
     return SourceTree(
-        sources=[
-            source.from_adls(container_root / blob_meta.path, hash=blob_meta.hash)
-            for blob_meta in list_blob_meta(container_client, root_fqn.path, match_suffix=match_suffix)
-        ],
+        sources=sorted(
+            list_fast.multilayer_yield_sources(
+                root_fqn, layers=0, filter_=lambda blob: blob.path.endswith(match_suffix)
+            ),
+            key=lambda src: src.uri,
+        ),
         higher_logical_root=fqn.split(root_fqn)[-1],
     )
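A sketch of the simplified `from_path`, which now builds its sorted source list via `list_fast.multilayer_yield_sources`; the path is a placeholder.

```python
from thds.adls import source_tree

# Collects every .csv blob under the prefix into a SourceTree, sorted by URI;
# directory placeholder blobs are filtered out by multilayer_yield_sources.
tree = source_tree.from_path(
    "adls://myaccount/mycontainer/reference-data/",  # placeholder path
    match_suffix=".csv",
)
```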
thds/adls/upload.py CHANGED
@@ -15,8 +15,9 @@ from thds.core import files, fretry, link, log, scope, source, tmp
 
 from . import azcopy, hashes
 from ._progress import report_upload_progress
-from ._upload import upload_decision_and_metadata
+from ._upload import UploadSrc, upload_decision_and_metadata, upload_src_len
 from .conf import UPLOAD_FILE_MAX_CONCURRENCY
+from .file_lock import file_lock
 from .fqn import AdlsFqn
 from .global_client import get_global_blob_container_client
 from .ro_cache import Cache
@@ -25,9 +26,6 @@ logger = log.getLogger(__name__)
 _SLOW_CONNECTION_WORKAROUND = 14400 # seconds
 
 
-UploadSrc = ty.Union[Path, bytes, ty.IO[ty.AnyStr], ty.Iterable[bytes]]
-
-
 def _write_through_local_cache(local_cache_path: Path, data: UploadSrc) -> ty.Optional[Path]:
     @scope.bound
     def _try_write_through() -> bool:
@@ -40,8 +38,8 @@ def _write_through_local_cache(local_cache_path: Path, data: UploadSrc) -> ty.Op
         out = scope.enter(tmp.temppath_same_fs(local_cache_path))
         if hasattr(data, "read") and hasattr(data, "seek"):
             with open(out, "wb") as f:
-                f.write(data.read()) # type: ignore
-                data.seek(0) # type: ignore
+                f.write(data.read())
+                data.seek(0)
             link.link_or_copy(out, local_cache_path)
             return True
 
@@ -101,9 +99,12 @@ def upload(
     # we always use the original source file to upload, not the cached path,
     # because uploading from a shared location risks race conditions.
 
+    scope.enter(file_lock(str(dest_), locktype="upload"))
+
     blob_container_client = get_global_blob_container_client(dest_.sa, dest_.container)
     blob_client = blob_container_client.get_blob_client(dest_.path)
     decision = upload_decision_and_metadata(blob_client.get_blob_properties, src) # type: ignore [arg-type]
+    n_bytes = upload_src_len(src, default=0)
 
     def source_from_meta() -> source.Source:
         best_hash = next(iter(hashes.extract_hashes_from_metadata(decision.metadata)), None)
@@ -111,17 +112,14 @@ def upload(
             assert best_hash, "A hash should always be calculable for a local path."
             return source.from_file(src, hash=best_hash, uri=str(dest_))
 
-        return source.from_uri(str(dest_), hash=best_hash)
+        return source.from_uri(str(dest_), hash=best_hash, size=n_bytes)
 
     if decision.upload_required:
         # set up some bookkeeping
-        n_bytes = None # if we pass 0 to upload_blob, it truncates the write now
        bytes_src: ty.Union[bytes, ty.IO, ty.Iterable[bytes]]
        if isinstance(src, Path):
-            n_bytes = src.stat().st_size
            bytes_src = scope.enter(open(src, "rb"))
        elif isinstance(src, bytes):
-            n_bytes = len(src)
            bytes_src = src
        else:
            bytes_src = src
@@ -129,7 +127,7 @@ def upload(
         if "metadata" in upload_data_kwargs:
             decision.metadata.update(upload_data_kwargs.pop("metadata"))
 
-        if azcopy.upload.should_use_azcopy(n_bytes or 0) and isinstance(src, Path):
+        if azcopy.upload.should_use_azcopy(n_bytes) and isinstance(src, Path):
             logger.info("Using azcopy to upload %s to %s", src, dest_)
             try:
                 azcopy.upload.run(
@@ -137,7 +135,7 @@ def upload(
                         src, dest_, content_type=content_type, metadata=decision.metadata, overwrite=True
                     ),
                     dest_,
-                    n_bytes or 0,
+                    n_bytes,
                 )
                 return source_from_meta()
 
@@ -155,9 +153,11 @@ def upload(
         # This is both faster, as well as simpler to reason about, and
         # in fact was the behavior I had been assuming all along...
         blob_client.upload_blob(
-            report_upload_progress(ty.cast(ty.IO, bytes_src), str(dest_), n_bytes or 0),
+            report_upload_progress(ty.cast(ty.IO, bytes_src), str(dest_), n_bytes),
             overwrite=True,
-            length=n_bytes,
+            length=(
+                n_bytes if n_bytes > 0 else None
+            ), # if we pass 0 to upload_blob, it truncates the write now
             content_settings=upload_content_settings,
             connection_timeout=_SLOW_CONNECTION_WORKAROUND,
             max_concurrency=UPLOAD_FILE_MAX_CONCURRENCY(),
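A usage sketch of `upload` after these changes, following the call shape shown in the README of the new package METADATA below (a string destination plus a local `Path` or raw byte source); the destination and filenames are placeholders.

```python
from pathlib import Path

from thds.adls.upload import upload

dest = "adls://myaccount/mycontainer/out/data.bin"  # placeholder destination

# Both calls acquire a per-destination "upload" file_lock before talking to ADLS.

# Path source: upload_src_len knows the size up front, so azcopy can be considered
# for large files and progress reporting gets an exact byte count.
result = upload(dest, Path("local-data.bin"))

# Iterator source: the length comes back as 0 ("unknown"), so upload_blob is called
# with length=None rather than a truncating length=0.
streamed = upload(dest, iter([b"part-1", b"part-2"]))

print(result.uri, streamed.uri)
```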
thds_adls-4.4.20251117191451.dist-info/METADATA ADDED
@@ -0,0 +1,79 @@
+Metadata-Version: 2.4
+Name: thds.adls
+Version: 4.4.20251117191451
+Summary: ADLS tools
+Author-email: Trilliant Health <info@trillianthealth.com>
+License: MIT
+Project-URL: Repository, https://github.com/TrilliantHealth/trilliant-data-science
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+Requires-Dist: aiohttp>=3.8.1
+Requires-Dist: aiostream>=0.4.5
+Requires-Dist: azure-identity>=1.9
+Requires-Dist: azure-storage-file-datalake>=12.6
+Requires-Dist: blake3
+Requires-Dist: filelock>=3.0
+Requires-Dist: xxhash
+Requires-Dist: thds-core
+
+# thds.adls
+
+A high-performance Azure Data Lake Storage (ADLS Gen2) client for the THDS monorepo. It wraps the Azure
+SDK with hash-aware caching, azcopy acceleration, and shared client/credential plumbing so applications
+can transfer large blob datasets quickly and reliably.
+
+## Highlights
+
+- **Environment-aware paths first:** Almost every consumer starts by importing `fqn`, `AdlsFqn`, and
+  `defaults.env_root()` to build storage-account/container URIs that follow the current THDS environment.
+- **Cache-backed reads:** `download_to_cache` is the standard entry point for pulling blobs down with a
+  verified hash so local workflows, tests, and pipelines can operate on read-only copies.
+- **Bulk filesystem helpers:** `ADLSFileSystem` powers scripts and jobs that need to walk directories,
+  fetch batches of files, or mirror hive tables without re-implementing Azure SDK plumbing.
+- **Spark/Databricks bridges:** `abfss` and `uri` conversions keep analytics code agnostic to whether it
+  needs an `adls://`, `abfss://`, `https://`, or `dbfs://` view of the same path.
+- **Composable utilities:** Higher-level modules (cache, upload, copy, list) layer on top of those
+  imports so teams can opt into more advanced behavior without leaving the public API surface.
+
+## Key Modules
+
+| Component | Typical usage in the monorepo |
+| ------------------------------------- | ---------------------------------------------------------------------------------------------------------- |
+| `fqn` | Parse, validate, and join ADLS paths; used when materializing model datasets and configuring pipelines. |
+| `AdlsFqn` | Strongly typed value passed between tasks and tests to represent a single blob or directory. |
+| `defaults` / `named_roots` | Resolve environment-specific storage roots (`defaults.env_root()`, `named_roots.require(...)`). |
+| `download_to_cache` (`cached` module) | Bring a blob down to the shared read-only cache before analytics, feature builds, or test fixtures run. |
+| `ADLSFileSystem` (`impl` module) | Fetch or list entire directory trees and integrate with caching inside scripts and notebooks. |
+| `abfss` | Translate `AdlsFqn` objects into `abfss://` URIs for Spark/Databricks jobs. |
+| `uri` | Normalize `adls://`, `abfss://`, `https://`, and `dbfs://` strings into `AdlsFqn` values (and vice versa). |
+| `global_client` / `shared_credential` | Shared, fork-safe Azure clients and credentials backing the public helpers above. |
+
+## Example Usage
+
+1. Use the caching helpers and Source integration:
+
+```python
+from thds.adls import cached, upload, source
+
+cache_path = cached.download_to_cache("adls://acct/container/path/to/file")
+src = upload("adls://acct/container/path/out.parquet", cache_path)
+verified = source.get_with_hash(src.uri)
+```
+
+1. For CLI usage, run (from repo root):
+
+```bash
+uv run python -m thds.adls.tools.download adls://acct/container/path/file
+```
+
+## Operational Notes
+
+- **Hash metadata:** Uploads attach `hash_xxh3_128_b64` automatically when the bytes are known. Download
+  completion back-fills missing hashes when permissions allow.
+- **Locks and concurrency:** Large transfers acquire per-path file locks to keep azcopy instances
+  cooperative. Global HTTP connection pools default to 100 but are configurable via `thds.core.config`.
+- **Error handling:** `BlobNotFoundError` and other ADLS-specific exceptions translate into custom error
+  types to simplify retries and diagnostics.
+- **Extensibility:** Additional hash algorithms can be registered by importing dependent packages (e.g.,
+  `blake3`). Named roots can be populated dynamically via environment-specific modules
+  (`thds.adls._thds_defaults` hook).
thds_adls-4.2.20250926202021.dist-info/RECORD → thds_adls-4.4.20251117191451.dist-info/RECORD RENAMED
@@ -1,32 +1,33 @@
-thds/adls/__init__.py,sha256=Z450uhM5BIxgve9qkz_Lw7biVWFKp3Uz_iKaSPYEf1Y,1073
+thds/adls/__init__.py,sha256=CzNcmTdP5RHrD-9K1boDU4Z3pbJAvEAFfFhXGFwpEXw,1088
 thds/adls/_fork_protector.py,sha256=WML1Ul-G_MCziBOyRp6GoNq1IGktn8-OQVjny1IZ1bs,950
 thds/adls/_progress.py,sha256=0cmFZ2sOEogQY25AkMAhcunXO69FdlbkD1rU_zmLAp0,6648
-thds/adls/_upload.py,sha256=mhTdWiQroaugYuwQg7R8CEgdfCYF4xvJthlsqO0jlnE,4692
+thds/adls/_upload.py,sha256=rN-nuRZur9VvSUywaQezUi_qedj-xU30fTBG6rasuIA,4853
 thds/adls/abfss.py,sha256=nCFfcdwLiwtz_MdbrpLJ9WOhoz0zHIVxsf-tarorEwM,727
+thds/adls/blob_meta.py,sha256=DDhRK3pNeMaDUjiClXZ9g8nRD-ECZp21gLZGCKSxmfk,1404
 thds/adls/cached.py,sha256=up1F5JOVXdmwdZ8RAB2UDgiy6ooLg8IMULohBh75VpQ,3034
 thds/adls/conf.py,sha256=nTw3X1ilC3A_905jZH-rWXFsESeHAKQn5IghvfX2VIo,1991
 thds/adls/copy.py,sha256=aD6AquUR8r5W9SXd6Nm1qPrFH_fYpLC5dZk6HjPJnSQ,6611
 thds/adls/dbfs.py,sha256=pPAjbIZRKJsaXKQljDMUgqS_zy1yKeEZHGMueXbuv3g,2219
 thds/adls/defaults.py,sha256=GGq5Pn4r-8cX4bZItp4nnwWAAz7S07pzPoOegw0y5Fw,676
-thds/adls/download.py,sha256=z31w4Yuz4CqmU0iectcXmSoM2QJb1mSp9tGs0GHEhtY,19146
-thds/adls/download_lock.py,sha256=tgT48l4C5_qmArGeq05gl7VlxT22dZBH2Xwxx0itE9o,3176
+thds/adls/download.py,sha256=IPg5nz_sGE7dX8DUQyWjG2D9z54PXLScap-pZzTUFTk,19142
 thds/adls/errors.py,sha256=6NMLHtVNsWBRDXaes9yzHj9cwKOD9t1dwL4BltdtjhU,1895
 thds/adls/etag.py,sha256=ct7jpHhNFcKzbekn5rZ3m6DhjK48A7qOZGwDiHkc-pc,242
+thds/adls/file_lock.py,sha256=yLak5XDpnIYwfUNdpGFbIGG64uEs98-yVscNpJlqMxM,3176
 thds/adls/file_properties.py,sha256=dhRtbsMNOYfExkEiy76wrLfrJ6IMQeN1Z3LIxgKceqY,2042
 thds/adls/fqn.py,sha256=pIfEw25SjZNGmtzOLwrYCblciK35VQ35ZWb_mRaZKK0,5915
 thds/adls/global_client.py,sha256=q6oG1OOsfl4fbti81u8TE9d4Jx9-phlYgsGSc4032w8,3721
 thds/adls/hashes.py,sha256=2x1zcT_87en_vqFrLFs6F_EZCGpn7hk_81dYAwcypm8,5459
 thds/adls/impl.py,sha256=cNf1vmeS46X_wvyVdDJ8qFfowHn2QwtU5C80BmDtu5Y,43247
-thds/adls/list_fast.py,sha256=7jHnln4DMWYVLHhejj-fdWMBWflBiWfynegKxcUlNDY,4189
+thds/adls/list_fast.py,sha256=VTQ0C-78ucnywfCWRWSZ8Kfqhi2jt9VXArhpbiK0cII,4857
 thds/adls/md5.py,sha256=hGT8AIX32VUsnRCbm8cel9OlxAiRrgjwNWQTqRDHM_k,374
 thds/adls/named_roots.py,sha256=7SLbAoQQpV_mrFZaUPjYoS-F9dxQxN5Hg4M3YPirF_w,751
 thds/adls/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 thds/adls/ro_cache.py,sha256=P-UVFZhqnE5wojqYmRVWZcqjl-pM1xVMm9VAm3nXlnA,4769
 thds/adls/sas_tokens.py,sha256=mArbB_GYohevOmArw_1gKqVUWpv6kG8Hsbvdrhbtnbg,1957
 thds/adls/shared_credential.py,sha256=-x42aXoIM001KW59oS8PpuXQd4-F2vg-1gB6OMHlpk4,4602
-thds/adls/source.py,sha256=8HVMYuxDn1XYGwFFSBowMlvQ6r2Jm2CQlpu4h85JvsE,2559
-thds/adls/source_tree.py,sha256=gl2JLjxAduo4cGQBb8LqBnmRHHk2wqIC5yt-sqkXOEo,2589
-thds/adls/upload.py,sha256=MRHK9Am-x5FKBPh1SXLTbPC1r0Xk0bGWNU8CcNuUMLo,6602
+thds/adls/source.py,sha256=5t9bc3tUxdh4QdbRvvVd9xF9_Rv4v7CDAqE09M3rE1g,2657
+thds/adls/source_tree.py,sha256=gJVpFUI3pjXRM5VmppyAcsmhJwtTl8tGLhed0TvlDOM,622
+thds/adls/upload.py,sha256=XJvygYzn1r6pZAl460s5j-kY9dgfQ6qeoddID7R6OQ0,6621
 thds/adls/uri.py,sha256=9MXuW_KfpPvzBc4ERxuTJ3vvi_6yr7e1kMAW9mx2zXM,1414
 thds/adls/azcopy/__init__.py,sha256=qn2dmT92EHcrtaQ8uwRoUgvtF6Fu3NQbhZItOBdIBmY,45
 thds/adls/azcopy/download.py,sha256=FOtYyYh7ZXNWNdkj04yTV26lxcKOVj-YhS2p_EclYxA,6526
@@ -38,8 +39,8 @@ thds/adls/tools/download.py,sha256=CW2cWbCRdUqisVVVoqqvqk5Ved7pPGTkwnZj3uV0jy4,1
 thds/adls/tools/ls.py,sha256=OgEaIfTK359twlZIj-A0AW_nv81Z6zi0b9Tw6OJJfWA,1083
 thds/adls/tools/ls_fast.py,sha256=Nowc-efAL_Y4ybPwZzKIeh7KGIjfecRzdWvJZcBzq_8,585
 thds/adls/tools/upload.py,sha256=5WyWkpuVp2PETZ3O3ODlq8LXszSHU73ZMnIDZXPJdC8,442
-thds_adls-4.2.20250926202021.dist-info/METADATA,sha256=WTNhMwL8Xk0cJBvpuqUA4NjZDw55hC1ra4VbiJqFX7o,587
-thds_adls-4.2.20250926202021.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-thds_adls-4.2.20250926202021.dist-info/entry_points.txt,sha256=rtVF0A2MMTYUsBScF6b3AlOuk2Vm02QK7Tc2bDcDpk0,200
-thds_adls-4.2.20250926202021.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
-thds_adls-4.2.20250926202021.dist-info/RECORD,,
+thds_adls-4.4.20251117191451.dist-info/METADATA,sha256=phV7EH6lnptlnQYY5TSfyZZk0Wiv0wqZ6L6o7pcP4UM,4586
+thds_adls-4.4.20251117191451.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+thds_adls-4.4.20251117191451.dist-info/entry_points.txt,sha256=rtVF0A2MMTYUsBScF6b3AlOuk2Vm02QK7Tc2bDcDpk0,200
+thds_adls-4.4.20251117191451.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
+thds_adls-4.4.20251117191451.dist-info/RECORD,,
thds_adls-4.2.20250926202021.dist-info/METADATA REMOVED
@@ -1,21 +0,0 @@
-Metadata-Version: 2.4
-Name: thds.adls
-Version: 4.2.20250926202021
-Summary: ADLS tools
-Author-email: Trilliant Health <info@trillianthealth.com>
-License: MIT
-Project-URL: Repository, https://github.com/TrilliantHealth/trilliant-data-science
-Requires-Python: >=3.9
-Description-Content-Type: text/markdown
-Requires-Dist: aiohttp>=3.8.1
-Requires-Dist: aiostream>=0.4.5
-Requires-Dist: azure-identity>=1.9
-Requires-Dist: azure-storage-file-datalake>=12.6
-Requires-Dist: blake3
-Requires-Dist: filelock>=3.0
-Requires-Dist: xxhash
-Requires-Dist: thds-core
-
-# adls Library
-
-A port of `core.adls`.