thds.adls 4.2.20251007062717__py3-none-any.whl → 4.3.20251008224101__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


thds/adls/_upload.py CHANGED
@@ -38,17 +38,22 @@ def _try_default_hash(data: hashes.AnyStrSrc) -> ty.Optional[hashing.Hash]:
     return None
 
 
+UploadSrc = ty.Union[Path, bytes, ty.IO[bytes], ty.Iterable[bytes]]
+
+
+def upload_src_len(upload_src: UploadSrc, default: int = 0) -> int:
+    if isinstance(upload_src, Path) and upload_src.exists():
+        return upload_src.stat().st_size
+    try:
+        return len(upload_src)  # type: ignore
+    except TypeError as te:
+        logger.debug(f"failed to get length? {repr(te)} for {upload_src!r}")
+    return default
+
+
 def _too_small_to_skip_upload(data: hashes.AnyStrSrc, min_size_for_remote_check: int) -> bool:
-    def _len() -> int:
-        if isinstance(data, Path) and data.exists():
-            return data.stat().st_size
-        try:
-            return len(data)  # type: ignore
-        except TypeError as te:
-            logger.debug(f"failed to get length? {repr(te)} for {data}")
-        return min_size_for_remote_check + 1
-
-    return _len() < min_size_for_remote_check
+    len_ = upload_src_len(data) or min_size_for_remote_check + 1
+    return len_ < min_size_for_remote_check
 
 
 class UploadDecision(ty.NamedTuple):
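A minimal sketch of how the new upload_src_len helper behaves for each UploadSrc variant (illustrative values, not part of the diff; assumes the module imports as shown):

    import io
    from pathlib import Path

    from thds.adls._upload import upload_src_len

    upload_src_len(b"abc")                          # 3: bytes supports len()
    upload_src_len(Path("some-missing-file"))       # 0: nonexistent path falls through to default
    upload_src_len(io.BytesIO(b"abc"))              # 0: file objects have no len()
    upload_src_len(iter([b"a", b"b"]), default=-1)  # -1: iterators have no len() either

In _too_small_to_skip_upload, the `or` fallback maps a 0 result to min_size_for_remote_check + 1, so sources of unknown (or zero) length are never treated as too small to skip the remote check.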
thds/adls/download.py CHANGED
@@ -16,7 +16,7 @@ from thds.core.types import StrOrPath
 
 from . import azcopy, errors, etag, hashes
 from ._progress import report_download_progress
-from .download_lock import download_lock
+from .file_lock import file_lock
 from .fqn import AdlsFqn
 from .ro_cache import Cache, from_cache_path_to_local, from_local_path_to_cache
 
@@ -240,12 +240,12 @@ def _download_or_use_verified_cached_coroutine(  # noqa: C901
     # No cache hit, so its time to prepare to download. if a cache was provided, we will
     # _put_ the resulting file in it.
 
-    file_lock = str(cache.path(fqn) if cache else local_path)
+    file_lock_str = str(cache.path(fqn) if cache else local_path)
     # create lockfile name from the (shared) cache path if present, otherwise the final
     # destination. Non-cache users may then still incur multiple downloads in parallel,
     # but if you wanted to coordinate then you should probably have been using the global
     # cache in the first place.
-    _dl_scope.enter(download_lock(file_lock))
+    _dl_scope.enter(file_lock(file_lock_str))
 
     # re-attempt cache hit - we may have gotten the lock after somebody else downloaded
     if file_result := attempt_cache_hit():
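A standalone restatement of the lock-key choice described in the comment above (hypothetical helper name; the real code inlines this logic):

    from pathlib import Path
    from typing import Optional

    def lock_key(cache_path: Optional[Path], local_path: Path) -> str:
        # Prefer the shared cache path so every process downloading the same
        # blob contends on one lockfile; the plain destination path only
        # coordinates writers targeting that exact location.
        return str(cache_path if cache_path is not None else local_path)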
thds/adls/download_lock.py → thds/adls/file_lock.py RENAMED
@@ -9,18 +9,18 @@ from thds.core import config, home, log
 
 from .md5 import hex_md5_str
 
-DOWNLOAD_LOCKS_DIR = config.item("dir", home.HOMEDIR() / ".thds/adls/download-locks", parse=Path)
+FILELOCKS_DIR = config.item("dir", home.HOMEDIR() / ".thds/adls/file-locks", parse=Path)
 _CLEAN_UP_LOCKFILES_AFTER_TIME = timedelta(hours=24)
 _CLEAN_UP_LOCKFILES_EVERY = timedelta(hours=1).total_seconds()
 _LAST_CLEANED_BY_THIS_PROCESS = time.monotonic() - _CLEAN_UP_LOCKFILES_EVERY
 logger = log.getLogger(__name__)
 
 
-def _clean_download_locks() -> int:
+def _clean_file_locks() -> int:
     deleted = 0
     deletion_threshold = time.time() - _CLEAN_UP_LOCKFILES_AFTER_TIME.total_seconds()
     try:
-        for f in DOWNLOAD_LOCKS_DIR().iterdir():
+        for f in FILELOCKS_DIR().rglob("*"):
             fstat = f.stat()
             if stat.S_ISREG(fstat.st_mode) and fstat.st_mtime < deletion_threshold:
                 f.unlink()
@@ -29,20 +29,20 @@ def _clean_download_locks() -> int:
         # this should be, hopefully, both very rare and completely inconsequential as to
         # program correctness. if you see this happen multiple times, you may have some
         # read-only files or something and want to manually clean up this directory.
-        logger.exception("Failed to clean download locks directory.")
+        logger.exception("Failed to clean file locks directory.")
     return deleted
 
 
-def _occasionally_clean_download_locks():
+def _occasionally_clean_file_locks():
     global _LAST_CLEANED_BY_THIS_PROCESS
     # do this about once an hour
     if time.monotonic() > _LAST_CLEANED_BY_THIS_PROCESS + _CLEAN_UP_LOCKFILES_EVERY:
         _LAST_CLEANED_BY_THIS_PROCESS = time.monotonic()
         # minor race condition with other threads but it doesn't really matter.
-        _clean_download_locks()
+        _clean_file_locks()
 
 
-def download_lock(download_unique_str: str) -> FileLock:
+def file_lock(lock_unique_str: str, locktype: str = "download") -> FileLock:
     """Note that the lockfiles will never be deleted automatically.
     https://py-filelock.readthedocs.io/en/latest/api.html#filelock.BaseFileLock.release
 
@@ -50,7 +50,7 @@ def download_lock(download_unique_str: str) -> FileLock:
     https://stackoverflow.com/questions/58098634/why-does-the-python-filelock-library-delete-lockfiles-on-windows-but-not-unix
 
     This means local developers would have a whole bunch of zero-byte files in their
-    download locks directory. So, we take a slightly idiosyncratic approach to cleaning
+    file locks directory. So, we take a slightly idiosyncratic approach to cleaning
     this up: not wanting to run this code on every download, but also not wanting
     developers to see an infinitely-growing mess. Since parallel downloads will
     (generally) not constitute a correctness issue, the 'safest' time to clean it up will
@@ -58,11 +58,11 @@ def download_lock(download_unique_str: str) -> FileLock:
     we can get rid of old lockfiles after they've existed for more than 24 hours, since
     it's quite rare that a download would last that long.
     """
-    DOWNLOAD_LOCKS_DIR().mkdir(parents=True, exist_ok=True)
-    _occasionally_clean_download_locks()
+    lock_type_dir = FILELOCKS_DIR() / locktype
+    lock_type_dir.mkdir(parents=True, exist_ok=True)
+    _occasionally_clean_file_locks()
     return FileLock(
-        DOWNLOAD_LOCKS_DIR()
-        / (download_unique_str.split("/")[-1][:50] + hex_md5_str(download_unique_str)),
+        lock_type_dir / (lock_unique_str.split("/")[-1][:50] + hex_md5_str(lock_unique_str)),
         # is_singleton=True,
         # critical for keeping this reentrant without passing the lock around.
         # see https://github.com/tox-dev/filelock/issues/315#issuecomment-2016797681
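A usage sketch based on the new signature (hypothetical key string; FileLock from py-filelock is usable as a context manager):

    from thds.adls.file_lock import file_lock

    # Lockfiles now land under a per-locktype subdirectory of the file-locks
    # dir, named with the last path segment of the key (truncated to 50 chars)
    # plus an MD5 hex digest of the full key string.
    with file_lock("adls://sa/container/some/blob", locktype="upload"):
        ...  # critical section; the lock is reentrant within a single process

Since stale lockfiles are swept by mtime after 24 hours rather than deleted on release, a lock held longer than that could in principle have its file removed out from under it; per the docstring, this is considered acceptably rare.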
thds/adls/source.py CHANGED
@@ -30,7 +30,9 @@ def _adls_uri_source_download_handler(uri: str) -> ty.Optional[source.Downloader
 source.register_download_handler("thds.adls", _adls_uri_source_download_handler)
 
 
-def from_adls(uri_or_fqn: ty.Union[str, AdlsFqn], hash: ty.Optional[Hash] = None) -> source.Source:
+def from_adls(
+    uri_or_fqn: ty.Union[str, AdlsFqn], hash: ty.Optional[Hash] = None, size: int = 0
+) -> source.Source:
     """Flexible, public interface to creating Sources from any ADLS-like reference.
 
     Does NOT automatically fetch a checksumming hash from the ADLS URI if it's not
@@ -40,7 +42,7 @@ def from_adls(uri_or_fqn: ty.Union[str, AdlsFqn], hash: ty.Optional[Hash] = None
     r_fqn = resolve_any(uri_or_fqn)
     if not r_fqn:
         raise ValueError(f"Could not resolve {uri_or_fqn} to an ADLS FQN")
-    return source.Source(str(r_fqn), hash)
+    return source.Source(str(r_fqn), hash, size)
 
 
 source.register_from_uri_handler(
@@ -55,13 +57,15 @@ def get_with_hash(fqn_or_uri: ty.Union[AdlsFqn, str]) -> source.Source:
     """
     fqn = AdlsFqn.parse(fqn_or_uri) if isinstance(fqn_or_uri, str) else fqn_or_uri
     with blob_not_found_translation(fqn):
-        uri_hashes = hashes.extract_hashes_from_props(get_file_properties(fqn))
+        props = get_file_properties(fqn)
+        uri_hashes = hashes.extract_hashes_from_props(props)
     if not uri_hashes:
         raise ValueError(
             f"ADLS file {fqn} must have a hash to use this function. "
             "If you know the hash, use `from_adls` with the hash parameter."
         )
-    return from_adls(fqn, next(iter(uri_hashes.values())))
+    size = int(props.get("size")) or 0
+    return from_adls(fqn, next(iter(uri_hashes.values())), size)
 
 
 def with_md5b64(uri_or_fqn: ty.Union[str, AdlsFqn], *, md5b64: str = "") -> source.Source:
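A sketch of the widened from_adls interface (hypothetical FQN and size values; assumes the module imports as shown):

    from thds.adls import source

    # size defaults to 0, so existing two-argument callers are unaffected;
    # when known, the byte count is passed straight through to source.Source.
    src = source.from_adls(
        "adls://mysa/mycontainer/data/part-0.parquet", hash=None, size=2048
    )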
thds/adls/source_tree.py CHANGED
@@ -60,7 +60,7 @@ def from_path(adls_path: uri.UriIsh, match_suffix: str = "") -> SourceTree:
     container_root = root_fqn.root()
     return SourceTree(
         sources=[
-            source.from_adls(container_root / blob_meta.path, hash=blob_meta.hash)
+            source.from_adls(container_root / blob_meta.path, hash=blob_meta.hash, size=blob_meta.size)
             for blob_meta in list_blob_meta(container_client, root_fqn.path, match_suffix=match_suffix)
         ],
         higher_logical_root=fqn.split(root_fqn)[-1],
thds/adls/upload.py CHANGED
@@ -15,8 +15,9 @@ from thds.core import files, fretry, link, log, scope, source, tmp
 
 from . import azcopy, hashes
 from ._progress import report_upload_progress
-from ._upload import upload_decision_and_metadata
+from ._upload import UploadSrc, upload_decision_and_metadata, upload_src_len
 from .conf import UPLOAD_FILE_MAX_CONCURRENCY
+from .file_lock import file_lock
 from .fqn import AdlsFqn
 from .global_client import get_global_blob_container_client
 from .ro_cache import Cache
@@ -25,9 +26,6 @@ logger = log.getLogger(__name__)
 _SLOW_CONNECTION_WORKAROUND = 14400  # seconds
 
 
-UploadSrc = ty.Union[Path, bytes, ty.IO[ty.AnyStr], ty.Iterable[bytes]]
-
-
 def _write_through_local_cache(local_cache_path: Path, data: UploadSrc) -> ty.Optional[Path]:
     @scope.bound
     def _try_write_through() -> bool:
@@ -40,8 +38,8 @@ def _write_through_local_cache(local_cache_path: Path, data: UploadSrc) -> ty.Op
         out = scope.enter(tmp.temppath_same_fs(local_cache_path))
         if hasattr(data, "read") and hasattr(data, "seek"):
             with open(out, "wb") as f:
-                f.write(data.read())  # type: ignore
-                data.seek(0)  # type: ignore
+                f.write(data.read())
+                data.seek(0)
             link.link_or_copy(out, local_cache_path)
             return True
 
@@ -101,9 +99,12 @@ def upload(
     # we always use the original source file to upload, not the cached path,
     # because uploading from a shared location risks race conditions.
 
+    scope.enter(file_lock(str(dest_), locktype="upload"))
+
     blob_container_client = get_global_blob_container_client(dest_.sa, dest_.container)
     blob_client = blob_container_client.get_blob_client(dest_.path)
     decision = upload_decision_and_metadata(blob_client.get_blob_properties, src)  # type: ignore [arg-type]
+    n_bytes = upload_src_len(src, default=0)
 
     def source_from_meta() -> source.Source:
         best_hash = next(iter(hashes.extract_hashes_from_metadata(decision.metadata)), None)
@@ -111,17 +112,14 @@ def upload(
             assert best_hash, "A hash should always be calculable for a local path."
             return source.from_file(src, hash=best_hash, uri=str(dest_))
 
-        return source.from_uri(str(dest_), hash=best_hash)
+        return source.from_uri(str(dest_), hash=best_hash, size=n_bytes)
 
     if decision.upload_required:
         # set up some bookkeeping
-        n_bytes = None  # if we pass 0 to upload_blob, it truncates the write now
         bytes_src: ty.Union[bytes, ty.IO, ty.Iterable[bytes]]
         if isinstance(src, Path):
-            n_bytes = src.stat().st_size
            bytes_src = scope.enter(open(src, "rb"))
         elif isinstance(src, bytes):
-            n_bytes = len(src)
             bytes_src = src
         else:
             bytes_src = src
@@ -129,7 +127,7 @@ def upload(
         if "metadata" in upload_data_kwargs:
             decision.metadata.update(upload_data_kwargs.pop("metadata"))
 
-        if azcopy.upload.should_use_azcopy(n_bytes or 0) and isinstance(src, Path):
+        if azcopy.upload.should_use_azcopy(n_bytes) and isinstance(src, Path):
             logger.info("Using azcopy to upload %s to %s", src, dest_)
             try:
                 azcopy.upload.run(
@@ -137,7 +135,7 @@ def upload(
                     src, dest_, content_type=content_type, metadata=decision.metadata, overwrite=True
                 ),
                 dest_,
-                n_bytes or 0,
+                n_bytes,
             )
             return source_from_meta()
 
@@ -155,9 +153,11 @@ def upload(
             # This is both faster, as well as simpler to reason about, and
             # in fact was the behavior I had been assuming all along...
             blob_client.upload_blob(
-                report_upload_progress(ty.cast(ty.IO, bytes_src), str(dest_), n_bytes or 0),
+                report_upload_progress(ty.cast(ty.IO, bytes_src), str(dest_), n_bytes),
                 overwrite=True,
-                length=n_bytes,
+                length=(
+                    n_bytes if n_bytes > 0 else None
+                ),  # if we pass 0 to upload_blob, it truncates the write now
                 content_settings=upload_content_settings,
                 connection_timeout=_SLOW_CONNECTION_WORKAROUND,
                 max_concurrency=UPLOAD_FILE_MAX_CONCURRENCY(),
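Two behavioral notes on this section: the new scope.enter(file_lock(str(dest_), locktype="upload")) serializes concurrent uploads to the same destination on the same machine (the lock is a local lockfile), and the length guard preserves the old bookkeeping comment, since upload_blob treats length=0 as a zero-byte write and truncates the blob. Restated minimally (illustrative values, not part of the diff):

    n_bytes = 0                                # upload_src_len default: size unknown
    length = n_bytes if n_bytes > 0 else None  # None lets upload_blob read to EOF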
thds_adls-4.2.20251007062717.dist-info/METADATA → thds_adls-4.3.20251008224101.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: thds.adls
-Version: 4.2.20251007062717
+Version: 4.3.20251008224101
 Summary: ADLS tools
 Author-email: Trilliant Health <info@trillianthealth.com>
 License: MIT
thds_adls-4.2.20251007062717.dist-info/RECORD → thds_adls-4.3.20251008224101.dist-info/RECORD RENAMED
@@ -1,17 +1,17 @@
 thds/adls/__init__.py,sha256=Z450uhM5BIxgve9qkz_Lw7biVWFKp3Uz_iKaSPYEf1Y,1073
 thds/adls/_fork_protector.py,sha256=WML1Ul-G_MCziBOyRp6GoNq1IGktn8-OQVjny1IZ1bs,950
 thds/adls/_progress.py,sha256=0cmFZ2sOEogQY25AkMAhcunXO69FdlbkD1rU_zmLAp0,6648
-thds/adls/_upload.py,sha256=mhTdWiQroaugYuwQg7R8CEgdfCYF4xvJthlsqO0jlnE,4692
+thds/adls/_upload.py,sha256=rN-nuRZur9VvSUywaQezUi_qedj-xU30fTBG6rasuIA,4853
 thds/adls/abfss.py,sha256=nCFfcdwLiwtz_MdbrpLJ9WOhoz0zHIVxsf-tarorEwM,727
 thds/adls/cached.py,sha256=up1F5JOVXdmwdZ8RAB2UDgiy6ooLg8IMULohBh75VpQ,3034
 thds/adls/conf.py,sha256=nTw3X1ilC3A_905jZH-rWXFsESeHAKQn5IghvfX2VIo,1991
 thds/adls/copy.py,sha256=aD6AquUR8r5W9SXd6Nm1qPrFH_fYpLC5dZk6HjPJnSQ,6611
 thds/adls/dbfs.py,sha256=pPAjbIZRKJsaXKQljDMUgqS_zy1yKeEZHGMueXbuv3g,2219
 thds/adls/defaults.py,sha256=GGq5Pn4r-8cX4bZItp4nnwWAAz7S07pzPoOegw0y5Fw,676
-thds/adls/download.py,sha256=z31w4Yuz4CqmU0iectcXmSoM2QJb1mSp9tGs0GHEhtY,19146
-thds/adls/download_lock.py,sha256=tgT48l4C5_qmArGeq05gl7VlxT22dZBH2Xwxx0itE9o,3176
+thds/adls/download.py,sha256=IPg5nz_sGE7dX8DUQyWjG2D9z54PXLScap-pZzTUFTk,19142
 thds/adls/errors.py,sha256=6NMLHtVNsWBRDXaes9yzHj9cwKOD9t1dwL4BltdtjhU,1895
 thds/adls/etag.py,sha256=ct7jpHhNFcKzbekn5rZ3m6DhjK48A7qOZGwDiHkc-pc,242
+thds/adls/file_lock.py,sha256=yLak5XDpnIYwfUNdpGFbIGG64uEs98-yVscNpJlqMxM,3176
 thds/adls/file_properties.py,sha256=dhRtbsMNOYfExkEiy76wrLfrJ6IMQeN1Z3LIxgKceqY,2042
 thds/adls/fqn.py,sha256=pIfEw25SjZNGmtzOLwrYCblciK35VQ35ZWb_mRaZKK0,5915
 thds/adls/global_client.py,sha256=q6oG1OOsfl4fbti81u8TE9d4Jx9-phlYgsGSc4032w8,3721
@@ -24,9 +24,9 @@ thds/adls/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 thds/adls/ro_cache.py,sha256=P-UVFZhqnE5wojqYmRVWZcqjl-pM1xVMm9VAm3nXlnA,4769
 thds/adls/sas_tokens.py,sha256=mArbB_GYohevOmArw_1gKqVUWpv6kG8Hsbvdrhbtnbg,1957
 thds/adls/shared_credential.py,sha256=-x42aXoIM001KW59oS8PpuXQd4-F2vg-1gB6OMHlpk4,4602
-thds/adls/source.py,sha256=8HVMYuxDn1XYGwFFSBowMlvQ6r2Jm2CQlpu4h85JvsE,2559
-thds/adls/source_tree.py,sha256=gl2JLjxAduo4cGQBb8LqBnmRHHk2wqIC5yt-sqkXOEo,2589
-thds/adls/upload.py,sha256=MRHK9Am-x5FKBPh1SXLTbPC1r0Xk0bGWNU8CcNuUMLo,6602
+thds/adls/source.py,sha256=5t9bc3tUxdh4QdbRvvVd9xF9_Rv4v7CDAqE09M3rE1g,2657
+thds/adls/source_tree.py,sha256=FqVXgvfYPiowrWhRsXBItjvB7t41JRI3sCFVAHxjwgI,2610
+thds/adls/upload.py,sha256=XJvygYzn1r6pZAl460s5j-kY9dgfQ6qeoddID7R6OQ0,6621
 thds/adls/uri.py,sha256=9MXuW_KfpPvzBc4ERxuTJ3vvi_6yr7e1kMAW9mx2zXM,1414
 thds/adls/azcopy/__init__.py,sha256=qn2dmT92EHcrtaQ8uwRoUgvtF6Fu3NQbhZItOBdIBmY,45
 thds/adls/azcopy/download.py,sha256=FOtYyYh7ZXNWNdkj04yTV26lxcKOVj-YhS2p_EclYxA,6526
@@ -38,8 +38,8 @@ thds/adls/tools/download.py,sha256=CW2cWbCRdUqisVVVoqqvqk5Ved7pPGTkwnZj3uV0jy4,1
 thds/adls/tools/ls.py,sha256=OgEaIfTK359twlZIj-A0AW_nv81Z6zi0b9Tw6OJJfWA,1083
 thds/adls/tools/ls_fast.py,sha256=Nowc-efAL_Y4ybPwZzKIeh7KGIjfecRzdWvJZcBzq_8,585
 thds/adls/tools/upload.py,sha256=5WyWkpuVp2PETZ3O3ODlq8LXszSHU73ZMnIDZXPJdC8,442
-thds_adls-4.2.20251007062717.dist-info/METADATA,sha256=GA-pGzGPN-vNCMUWnC0bdxhqDv_6pPrVOZ4ayRhgsdI,587
-thds_adls-4.2.20251007062717.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-thds_adls-4.2.20251007062717.dist-info/entry_points.txt,sha256=rtVF0A2MMTYUsBScF6b3AlOuk2Vm02QK7Tc2bDcDpk0,200
-thds_adls-4.2.20251007062717.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
-thds_adls-4.2.20251007062717.dist-info/RECORD,,
+thds_adls-4.3.20251008224101.dist-info/METADATA,sha256=cz-SynGzdU2LP2ndI7NyyY5o0JpOHUGOqU0vpxt69tg,587
+thds_adls-4.3.20251008224101.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+thds_adls-4.3.20251008224101.dist-info/entry_points.txt,sha256=rtVF0A2MMTYUsBScF6b3AlOuk2Vm02QK7Tc2bDcDpk0,200
+thds_adls-4.3.20251008224101.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
+thds_adls-4.3.20251008224101.dist-info/RECORD,,