thds.adls 3.1.20250226213817__py3-none-any.whl → 3.1.20250227181712__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of thds.adls might be problematic. Click here for more details.

thds/adls/_progress.py CHANGED
@@ -138,11 +138,13 @@ class Tracker:
138
138
  self._reporter(list(self._progresses.values()))
139
139
  return self, key
140
140
 
141
- def __call__(self, key: str, written: int):
141
+ def __call__(self, key: str, written: int = 0, total_written: int = 0):
142
142
  assert written >= 0, "cannot write negative bytes: {written}"
143
143
  try:
144
144
  start, total, n = self._progresses[key]
145
- self._progresses[key] = ProgressState(start, total, n + written)
145
+ if not total_written:
146
+ total_written = n + written
147
+ self._progresses[key] = ProgressState(start, total, total_written)
146
148
  self._reporter(list(self._progresses.values()))
147
149
  if self._progresses[key].n >= total:
148
150
  del self._progresses[key]
@@ -155,6 +157,10 @@ _GLOBAL_UP_TRACKER = Tracker(TqdmReporter("thds.adls uploading"))
155
157
  T = ty.TypeVar("T", bound=ty.IO)
156
158
 
157
159
 
160
+ def get_global_download_tracker() -> Tracker:
161
+ return _GLOBAL_DN_TRACKER
162
+
163
+
158
164
  def _proxy_io(io_type: str, stream: T, key: str, total_len: int) -> T:
159
165
  assert io_type in ("read", "write"), io_type
160
166
 
@@ -0,0 +1 @@
1
+ from . import download # noqa
@@ -0,0 +1,185 @@
1
+ # azcopy is, for whatever reason, quite a bit faster than the Python SDK for downloads.
2
+ # This code allows us to have a thin "try azcopy if it's present, fall back to the Python
3
+ # SDK if it's not" layer.
4
+ #
5
+ # However, azcopy is also quite a bit 'dumber' when it comes to progress reporting (its
6
+ # reported progress totals seem to way underestimate progress and then rubber-band at the
7
+ # very end of the download), so for local users who don't have huge bandwidth, it's likely
8
+ # a better user experience to disable this globally.
9
+ import asyncio
10
+ import json
11
+ import os
12
+ import subprocess
13
+ import typing as ty
14
+ import urllib.parse
15
+ from contextlib import contextmanager
16
+ from pathlib import Path
17
+
18
+ from azure.storage.filedatalake import DataLakeFileClient
19
+
20
+ from thds.core import cache, config, log
21
+
22
+ from .. import _progress, conf, uri
23
+
24
+ DONT_USE_AZCOPY = config.item("dont_use", default=False, parse=config.tobool)
25
+
26
+ _AZCOPY_LOGIN_WORKLOAD_IDENTITY = "azcopy login --login-type workload".split()
27
+ _AZCOPY_LOGIN_LOCAL_STATUS = "azcopy login status".split()
28
+ # device login is an interactive process involving a web browser,
29
+ # which is not acceptable for large scale automation.
30
+ # So instead of logging in, we check to see if you _are_ logged in,
31
+ # and if you are, we try using azcopy in the future.
32
+
33
+ logger = log.getLogger(__name__)
34
+
35
+
36
+ class DownloadRequest(ty.NamedTuple):
37
+ """Use one or the other, but not both, to write the results."""
38
+
39
+ writer: ty.IO[bytes]
40
+ temp_path: Path
41
+
42
+
43
+ @cache.locking # only run this once per process.
44
+ def _good_azcopy_login() -> bool:
45
+ if DONT_USE_AZCOPY():
46
+ return False
47
+
48
+ try:
49
+ subprocess.run(_AZCOPY_LOGIN_WORKLOAD_IDENTITY, check=True, capture_output=True)
50
+ logger.info("Will use azcopy for downloads in this process...")
51
+ return True
52
+
53
+ except (subprocess.CalledProcessError, FileNotFoundError):
54
+ pass
55
+ try:
56
+ subprocess.run(_AZCOPY_LOGIN_LOCAL_STATUS, check=True)
57
+ logger.info("Will use azcopy for downloads in this process...", dl=None)
58
+ return True
59
+ except FileNotFoundError:
60
+ logger.info("azcopy is not installed or not on your PATH, so we cannot speed up downloads")
61
+ except subprocess.CalledProcessError as cpe:
62
+ logger.warning(
63
+ "You are not logged in with azcopy, so we cannot speed up downloads."
64
+ f" Run `azcopy login` to fix this. Return code was {cpe.returncode}"
65
+ )
66
+ return False
67
+
68
+
69
+ def _azcopy_download_command(dl_file_client: DataLakeFileClient, path: Path) -> ty.List[str]:
70
+ return ["azcopy", "copy", dl_file_client.url, str(path), "--output-type=json"]
71
+
72
+
73
+ class AzCopyMessage(ty.TypedDict):
74
+ TotalBytesEnumerated: str
75
+ TotalBytesTransferred: str
76
+
77
+
78
+ class AzCopyJsonLine(ty.TypedDict):
79
+ MessageType: str
80
+ MessageContent: AzCopyMessage
81
+
82
+
83
+ def _parse_azcopy_json_output(line: str) -> AzCopyJsonLine:
84
+ outer_msg = json.loads(line)
85
+ return AzCopyJsonLine(
86
+ MessageType=outer_msg["MessageType"],
87
+ MessageContent=json.loads(outer_msg["MessageContent"]),
88
+ )
89
+
90
+
91
+ @contextmanager
92
+ def _track_azcopy_progress(http_url: str) -> ty.Iterator[ty.Callable[[str], None]]:
93
+ """Context manager that tracks progress from AzCopy JSON lines. This works for both async and sync impls."""
94
+ tracker = _progress.get_global_download_tracker()
95
+ adls_uri = urllib.parse.unquote(str(uri.parse_uri(http_url)))
96
+
97
+ def track(line: str):
98
+ if not line:
99
+ return
100
+
101
+ try:
102
+ prog = _parse_azcopy_json_output(line)
103
+ if prog["MessageType"] == "Progress":
104
+ tracker(adls_uri, total_written=int(prog["MessageContent"]["TotalBytesTransferred"]))
105
+ except json.JSONDecodeError:
106
+ pass
107
+
108
+ yield track
109
+
110
+
111
+ def _restrict_mem() -> dict:
112
+ return dict(os.environ, AZCOPY_BUFFER_GB="0.3")
113
+
114
+
115
+ def sync_fastpath(
116
+ dl_file_client: DataLakeFileClient,
117
+ download_request: DownloadRequest,
118
+ ) -> None:
119
+ if _good_azcopy_login():
120
+ try:
121
+ # Run the copy
122
+ process = subprocess.Popen(
123
+ _azcopy_download_command(dl_file_client, download_request.temp_path),
124
+ stdout=subprocess.PIPE,
125
+ stderr=subprocess.PIPE,
126
+ text=True,
127
+ env=_restrict_mem(),
128
+ )
129
+ assert process.stdout
130
+ with _track_azcopy_progress(dl_file_client.url) as track:
131
+ for line in process.stdout:
132
+ track(line)
133
+ return # success
134
+
135
+ except (subprocess.SubprocessError, FileNotFoundError):
136
+ logger.warning("Falling back to Python SDK for download")
137
+
138
+ dl_file_client.download_file(
139
+ max_concurrency=conf.DOWNLOAD_FILE_MAX_CONCURRENCY(),
140
+ connection_timeout=conf.CONNECTION_TIMEOUT(),
141
+ ).readinto(download_request.writer)
142
+
143
+
144
+ async def async_fastpath(
145
+ dl_file_client: DataLakeFileClient,
146
+ download_request: DownloadRequest,
147
+ ) -> None:
148
+ # technically it would be 'better' to do this login in an async subprocess,
149
+ # but it involves a lot of boilerplate, _and_ we have no nice way to cache
150
+ # the value, which is going to be computed once per process and never again.
151
+ # So we'll just block the async loop for a couple of seconds one time...
152
+ if _good_azcopy_login():
153
+ try:
154
+ # Run the copy
155
+ copy_proc = await asyncio.create_subprocess_exec(
156
+ *_azcopy_download_command(dl_file_client, download_request.temp_path),
157
+ stdout=asyncio.subprocess.PIPE,
158
+ stderr=asyncio.subprocess.PIPE,
159
+ env=_restrict_mem(),
160
+ )
161
+ assert copy_proc.stdout
162
+
163
+ # Feed lines to the tracker asynchronously
164
+ with _track_azcopy_progress(dl_file_client.url) as track:
165
+ while True:
166
+ line = await copy_proc.stdout.readline()
167
+ if not line: # EOF
168
+ break
169
+ track(line.decode().strip())
170
+
171
+ # Wait for process completion
172
+ exit_code = await copy_proc.wait()
173
+ if exit_code != 0:
174
+ raise subprocess.SubprocessError()
175
+
176
+ return # success
177
+
178
+ except (subprocess.SubprocessError, FileNotFoundError):
179
+ logger.warning("Falling back to Python SDK for download")
180
+
181
+ reader = await dl_file_client.download_file(
182
+ max_concurrency=conf.DOWNLOAD_FILE_MAX_CONCURRENCY(),
183
+ connection_timeout=conf.CONNECTION_TIMEOUT(),
184
+ )
185
+ await reader.readinto(download_request.writer)
thds/adls/download.py CHANGED
@@ -17,8 +17,8 @@ from thds.core import log, scope, tmp
17
17
  from thds.core.hashing import b64
18
18
  from thds.core.types import StrOrPath
19
19
 
20
+ from . import azcopy
20
21
  from ._progress import report_download_progress
21
- from .conf import CONNECTION_TIMEOUT, DOWNLOAD_FILE_MAX_CONCURRENCY
22
22
  from .download_lock import download_lock
23
23
  from .errors import MD5MismatchError, translate_azure_error
24
24
  from .etag import match_etag
@@ -34,12 +34,14 @@ def _atomic_download_and_move(
34
34
  fqn: AdlsFqn,
35
35
  dest: StrOrPath,
36
36
  properties: ty.Optional[FileProperties] = None,
37
- ) -> ty.Iterator[ty.IO[bytes]]:
37
+ ) -> ty.Iterator[azcopy.download.DownloadRequest]:
38
38
  with tmp.temppath_same_fs(dest) as dpath:
39
39
  with open(dpath, "wb") as f:
40
40
  known_size = (properties.size or 0) if properties else 0
41
41
  logger.debug("Downloading %s", fqn)
42
- yield report_download_progress(f, str(dest), known_size)
42
+ yield azcopy.download.DownloadRequest(
43
+ report_download_progress(f, str(fqn), known_size), dpath
44
+ )
43
45
  try:
44
46
  os.rename(dpath, dest) # will succeed even if dest is read-only
45
47
  except OSError as oserr:
@@ -139,7 +141,7 @@ class _IoRequest(enum.Enum):
139
141
  FILE_PROPERTIES = "file_properties"
140
142
 
141
143
 
142
- IoRequest = ty.Union[_IoRequest, ty.IO[bytes]]
144
+ IoRequest = ty.Union[_IoRequest, azcopy.download.DownloadRequest]
143
145
  IoResponse = ty.Union[FileProperties, None]
144
146
 
145
147
 
@@ -332,12 +334,12 @@ def download_or_use_verified(
332
334
  # only fetch these if they haven't already been requested
333
335
  file_properties = dl_file_client.get_file_properties()
334
336
  co_request = co.send(file_properties)
335
- else: # needs file object
336
- dl_file_client.download_file(
337
- max_concurrency=DOWNLOAD_FILE_MAX_CONCURRENCY(),
338
- connection_timeout=CONNECTION_TIMEOUT(),
339
- ).readinto(co_request)
337
+ elif isinstance(co_request, azcopy.download.DownloadRequest):
338
+ # coroutine is requesting download
339
+ azcopy.download.sync_fastpath(dl_file_client, co_request)
340
340
  co_request = co.send(None)
341
+ else:
342
+ raise ValueError(f"Unexpected coroutine request: {co_request}")
341
343
  except StopIteration as si:
342
344
  if cs := _set_md5_if_missing(file_properties, si.value.md5b64):
343
345
  try:
@@ -370,13 +372,13 @@ async def async_download_or_use_verified(
370
372
  # only fetch these if they haven't already been requested
371
373
  file_properties = await dl_file_client.get_file_properties()
372
374
  co_request = co.send(file_properties)
373
- else: # needs file object
374
- reader = await dl_file_client.download_file(
375
- max_concurrency=DOWNLOAD_FILE_MAX_CONCURRENCY(),
376
- connection_timeout=CONNECTION_TIMEOUT(),
377
- )
378
- await reader.readinto(co_request)
375
+ elif isinstance(co_request, azcopy.download.DownloadRequest):
376
+ # coroutine is requesting download
377
+ await azcopy.download.async_fastpath(dl_file_client, co_request)
379
378
  co_request = co.send(None)
379
+ else:
380
+ raise ValueError(f"Unexpected coroutine request: {co_request}")
381
+
380
382
  except StopIteration as si:
381
383
  if cs := _set_md5_if_missing(file_properties, si.value.md5b64):
382
384
  try:
thds/adls/meta.json CHANGED
@@ -1,8 +1,8 @@
1
1
  {
2
- "git_commit": "923bef905389724895eddd2b2d00e87a533d4c52",
2
+ "git_commit": "f4211224c05a5de6426bfd3efe4d62e3c0a7d64d",
3
3
  "git_branch": "main",
4
4
  "git_is_clean": true,
5
- "pyproject_version": "3.1.20250226213817",
5
+ "pyproject_version": "3.1.20250227181712",
6
6
  "thds_user": "runner",
7
7
  "misc": {}
8
8
  }
@@ -3,26 +3,45 @@ from pathlib import Path
3
3
 
4
4
  from thds.adls.cached_up_down import download_directory, download_to_cache
5
5
  from thds.adls.file_properties import get_file_properties, is_directory
6
+ from thds.adls.impl import ADLSFileSystem
6
7
  from thds.adls.uri import resolve_uri
7
8
  from thds.core.link import link
8
9
 
9
10
 
10
11
  def main():
11
12
  parser = argparse.ArgumentParser()
12
- parser.add_argument("uri", type=resolve_uri, help="A fully qualified path to an ADLS location")
13
+ parser.add_argument(
14
+ "adls_fqn",
15
+ type=resolve_uri,
16
+ help="A fully qualified path to an ADLS location. Accepts adls://, https:// and abfss:// URIs.",
17
+ )
13
18
  parser.add_argument(
14
19
  "--copy-to",
15
20
  "-c",
16
21
  type=Path,
17
22
  help="This will create a link to the cached download at the specified location",
18
23
  )
24
+ parser.add_argument(
25
+ "--use-async",
26
+ action="store_true",
27
+ help="Only useful if you want to exercise the async code; does not make things faster or better.",
28
+ )
19
29
 
20
30
  args = parser.parse_args()
21
31
 
22
- if is_directory(get_file_properties(args.uri)):
23
- cache_path = download_directory(args.uri)
32
+ is_dir = is_directory(get_file_properties(args.adls_fqn))
33
+
34
+ if args.use_async:
35
+ fs = ADLSFileSystem(args.adls_fqn.sa, args.adls_fqn.container)
36
+ if is_dir:
37
+ cache_path = fs.fetch_directory(args.adls_fqn.path)[0]
38
+ else:
39
+ cache_path = fs.fetch_file(args.adls_fqn.path)
24
40
  else:
25
- cache_path = download_to_cache(args.uri)
41
+ if is_dir:
42
+ cache_path = download_directory(args.adls_fqn)
43
+ else:
44
+ cache_path = download_to_cache(args.adls_fqn)
26
45
 
27
46
  if args.copy_to:
28
47
  link(cache_path, args.copy_to)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: thds.adls
3
- Version: 3.1.20250226213817
3
+ Version: 3.1.20250227181712
4
4
  Summary: ADLS tools
5
5
  Author: Trilliant Health
6
6
  Description-Content-Type: text/markdown
@@ -1,5 +1,5 @@
1
1
  thds/adls/__init__.py,sha256=er14MoCC9PlJMxWVS4G1hAeMJItaJj4EAsrTZlvlb0M,797
2
- thds/adls/_progress.py,sha256=SVOiVRvfvOfUjp-y-UUgA6n0FhAnpKesxltaQ16kmOk,6394
2
+ thds/adls/_progress.py,sha256=ZzCHn_G7nHakioNFxdvoJZRr-jN6ymsp5JXf-iReROM,6580
3
3
  thds/adls/_upload.py,sha256=q6Sk0CRnNcAjUOUPiBj4CfO4tJD196SQY0lT25CTSE4,4364
4
4
  thds/adls/abfss.py,sha256=ZRJOLjDuXmS4bIbQAQpQxWWWeu74N9NKEKCNfXQek80,726
5
5
  thds/adls/cached_up_down.py,sha256=qTMrsHHrDq3YZ0gaLMFIPnGNraz0ctG9yoFd3HCVit4,1896
@@ -7,7 +7,7 @@ thds/adls/conf.py,sha256=q1SPrgb46NpobVzwt_Oyv71-BvsIbZLq9nRWS3LZjz0,1990
7
7
  thds/adls/copy.py,sha256=jUWbGvTpb4B3yRGS0nhGSbDzqRPzUqYgH0z1lFRJB3k,6365
8
8
  thds/adls/dbfs.py,sha256=pPAjbIZRKJsaXKQljDMUgqS_zy1yKeEZHGMueXbuv3g,2219
9
9
  thds/adls/defaults.py,sha256=VxyaGz1gCcz2rGTR9acNRybbxGOYhTldvD_-SdGLBh0,652
10
- thds/adls/download.py,sha256=liVfnW4PJRjvBgPKsMqoo_F-9NT8syKsFMDaVSL8A4E,16345
10
+ thds/adls/download.py,sha256=Vfd9DEHUmSKPKpKxf9WjkqAzZFZwIgjo4fgM5aIvW0g,16455
11
11
  thds/adls/download_lock.py,sha256=_JZj-kjCUfHk9FvrmEuYpJYknmbam5eReFhGNDgzdLQ,2520
12
12
  thds/adls/errors.py,sha256=B_rMsQvQnNmP_sf-x8kmGsv2vIeOh4G9kVbdNVyk350,1469
13
13
  thds/adls/etag.py,sha256=ct7jpHhNFcKzbekn5rZ3m6DhjK48A7qOZGwDiHkc-pc,242
@@ -16,7 +16,7 @@ thds/adls/fqn.py,sha256=0zHmHhBWN7GEfKRB3fBC1NVhaiIHHifBdCRanyT01X8,5822
16
16
  thds/adls/global_client.py,sha256=f4VJw5y_Yh__8gQUcdSYTh1aU6iEPlauMchVirSAwDQ,3716
17
17
  thds/adls/impl.py,sha256=x1nSqc8W4NeuX8-JGOp2MRkK8ff6GnelTWedGxPs-qY,42494
18
18
  thds/adls/md5.py,sha256=qOX4_7WUj1QkbH_IwREcQNHvvZccOj-HpHZBfsKn1gY,1846
19
- thds/adls/meta.json,sha256=lX2VX33yybCNVd5IFfQ04M5BMyXhW_eDcmT6V0UEFfQ,195
19
+ thds/adls/meta.json,sha256=MhZqsLGrHq1cqZB6OydUf5yDbfo_GySrRRtjdpImBQc,195
20
20
  thds/adls/named_roots.py,sha256=7SLbAoQQpV_mrFZaUPjYoS-F9dxQxN5Hg4M3YPirF_w,751
21
21
  thds/adls/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
22
  thds/adls/ro_cache.py,sha256=F0uXol0t95mcRuBukNg3A7wt7XXQxpD5Sy09d9sl8f0,4825
@@ -25,15 +25,17 @@ thds/adls/shared_credential.py,sha256=-x42aXoIM001KW59oS8PpuXQd4-F2vg-1gB6OMHlpk
25
25
  thds/adls/source.py,sha256=1JYliDqafqpLewOQ2XMKp4T2WD7b5Rl7XYViyLYtzX8,2437
26
26
  thds/adls/source_tree.py,sha256=CiyCwgItF186aOoYiICkWt2oMLGMD6v526HRi8XWHtM,2219
27
27
  thds/adls/uri.py,sha256=pDH956p_VEHnjLLUnjWY6sGgRqksp9gdpc9KOW4gEP0,1205
28
+ thds/adls/azcopy/__init__.py,sha256=nTNbgz2GcEiGeswYbAgy4oPhivnzl_5crF3HqCdWWiw,31
29
+ thds/adls/azcopy/download.py,sha256=_I0smR2f9pu0xIiQC9C97Xb-GHMGwn5dqbEhz41H-qo,6548
28
30
  thds/adls/resource/__init__.py,sha256=IZ7_aRf1b3jEp7wXOxqHop0gV2gUcf9SOLeEEjIWlCU,1669
29
31
  thds/adls/resource/core.py,sha256=BVM91xsZ_B_CoGTc9DDD3FnGy8g6X-9eFpa86ZCzuZI,2717
30
32
  thds/adls/resource/file_pointers.py,sha256=PLru_3lwut_ZvrX5Keu-wJkPOt5o7UGf-OOT4ixaXME,2049
31
33
  thds/adls/resource/up_down.py,sha256=3uNlTvm2gVhSyYdQTBwsGecOgwtINQfINckR-awwV0Y,9907
32
- thds/adls/tools/download.py,sha256=ZZ8t1g9MRRy-aENGjr10E4aAqyVkKq2OqpyH70pCD_8,965
34
+ thds/adls/tools/download.py,sha256=vvBO8lSDl9oPugv75qpCkoemT9pOM9BV6yeExlkyG08,1594
33
35
  thds/adls/tools/ls.py,sha256=OgEaIfTK359twlZIj-A0AW_nv81Z6zi0b9Tw6OJJfWA,1083
34
36
  thds/adls/tools/upload.py,sha256=eMk4pdug1aCMPDDWpIE3Zoq77i5APp9Uuh-sVCCDNJE,493
35
- thds_adls-3.1.20250226213817.dist-info/METADATA,sha256=g0fHOJuklbyOLusIxUSeAa2FIRcLAsEwxCmkzF-ChoY,397
36
- thds_adls-3.1.20250226213817.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
37
- thds_adls-3.1.20250226213817.dist-info/entry_points.txt,sha256=uTqreT1AIwqJboMfLv5w6sviM8mNbAkln765gIjzoA4,152
38
- thds_adls-3.1.20250226213817.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
39
- thds_adls-3.1.20250226213817.dist-info/RECORD,,
37
+ thds_adls-3.1.20250227181712.dist-info/METADATA,sha256=O8W1_KMHFvwQZ--B8xPvFDMLL774NvfbhpxvEKkACWE,397
38
+ thds_adls-3.1.20250227181712.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
39
+ thds_adls-3.1.20250227181712.dist-info/entry_points.txt,sha256=uTqreT1AIwqJboMfLv5w6sviM8mNbAkln765gIjzoA4,152
40
+ thds_adls-3.1.20250227181712.dist-info/top_level.txt,sha256=LTZaE5SkWJwv9bwOlMbIhiS-JWQEEIcjVYnJrt-CriY,5
41
+ thds_adls-3.1.20250227181712.dist-info/RECORD,,