thds.adls 3.0.20250116223841__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of thds.adls might be problematic.
- thds/adls/__init__.py +15 -0
- thds/adls/_progress.py +193 -0
- thds/adls/_upload.py +127 -0
- thds/adls/abfss.py +24 -0
- thds/adls/cached_up_down.py +48 -0
- thds/adls/conf.py +33 -0
- thds/adls/dbfs.py +60 -0
- thds/adls/defaults.py +26 -0
- thds/adls/download.py +394 -0
- thds/adls/download_lock.py +57 -0
- thds/adls/errors.py +44 -0
- thds/adls/etag.py +6 -0
- thds/adls/file_properties.py +13 -0
- thds/adls/fqn.py +169 -0
- thds/adls/global_client.py +78 -0
- thds/adls/impl.py +1111 -0
- thds/adls/md5.py +60 -0
- thds/adls/meta.json +8 -0
- thds/adls/named_roots.py +26 -0
- thds/adls/py.typed +0 -0
- thds/adls/resource/__init__.py +36 -0
- thds/adls/resource/core.py +79 -0
- thds/adls/resource/file_pointers.py +54 -0
- thds/adls/resource/up_down.py +245 -0
- thds/adls/ro_cache.py +126 -0
- thds/adls/shared_credential.py +107 -0
- thds/adls/source.py +66 -0
- thds/adls/tools/download.py +35 -0
- thds/adls/tools/ls.py +38 -0
- thds/adls/uri.py +38 -0
- thds.adls-3.0.20250116223841.dist-info/METADATA +16 -0
- thds.adls-3.0.20250116223841.dist-info/RECORD +35 -0
- thds.adls-3.0.20250116223841.dist-info/WHEEL +5 -0
- thds.adls-3.0.20250116223841.dist-info/entry_points.txt +3 -0
- thds.adls-3.0.20250116223841.dist-info/top_level.txt +1 -0
thds/adls/impl.py
ADDED
@@ -0,0 +1,1111 @@
import asyncio
import datetime
import itertools
import logging
import os
import shutil
from collections.abc import Mapping as MappingABC
from functools import cmp_to_key, wraps
from pathlib import Path
from typing import (
    IO,
    Any,
    AsyncIterable,
    AsyncIterator,
    Awaitable,
    Callable,
    Dict,
    Iterable,
    List,
    Mapping,
    Optional,
    TypeVar,
    Union,
)

import attr
import azure.core.exceptions
from aiostream import stream
from azure.identity.aio import DefaultAzureCredential
from azure.storage.filedatalake import FileProperties, PathProperties
from azure.storage.filedatalake.aio import DataLakeServiceClient, FileSystemClient

from thds.core import lazy, log

from ._upload import async_upload_decision_and_settings, metadata_for_upload
from .conf import CONNECTION_TIMEOUT, UPLOAD_CHUNK_SIZE
from .download import async_download_or_use_verified
from .errors import translate_azure_error
from .file_properties import is_directory
from .ro_cache import from_cache_path_to_local, global_cache
from .shared_credential import get_credential_kwargs

LOGGER = log.getLogger(__name__)
log.getLogger("azure.core").setLevel(logging.WARNING)
log.getLogger("azure.identity").setLevel(logging.WARNING)

DEFAULT_HIVE_PREFIX = os.getenv("CORE_HIVE_PREFIX", "")
WEST_HIVE_PREFIX = "hive/warehouse"  # For easy access while we may need backwards compatibility

T = TypeVar("T")


def async_run(func: Callable[..., Awaitable[T]]) -> Callable[..., T]:
    """Used to decorate the main runner function to avoid calling asyncio.run too many times

    :param func: any async function
    :return: a synchronous wrapper around func
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        return asyncio.run(func(*args, **kwargs))  # type: ignore

    return wrapper


def base_name(remote_path: str) -> str:
    return remote_path.rstrip("/").split("/")[-1]


def _true(_):
    return True


def batcher(it: Iterable[T], size: int = 1) -> Iterable[List[T]]:
    stream = iter(it)

    def _slice():
        return list(itertools.islice(stream, size))

    yield from iter(_slice, [])
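A quick illustration (not part of the package) of what `batcher` yields: `iter(_slice, [])` keeps taking fixed-size slices until the wrapped iterator is exhausted, using the empty list as its sentinel.

chunks = list(batcher(range(7), size=3))
assert chunks == [[0, 1, 2], [3, 4, 5], [6]]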
@attr.s(auto_attribs=True)
class ADLSFileSystemNotFound(ConnectionError):
    account_name: str
    file_system: str

    def __str__(self):
        return f"File system {self.file_system!r} not found under account {self.account_name!r}"


@attr.s(auto_attribs=True, frozen=True)
class PathPair:
    """Store the remote path and the corresponding local path of a file"""

    remote_path: str
    local_path: Path


@attr.s(auto_attribs=True)
class DeleteProperties:
    """Convenience class around dicts returned in file deletion."""

    path: str
    date: Optional[datetime.datetime] = None
    version: Optional[str] = None
    request_id: Optional[str] = None
    deletion_id: Optional[str] = None  # Inferring type based on request_id.
    continuation: Optional[Any] = None  # Cannot find details on this.
    exception: Optional[Exception] = None


class ADLSFileSystem:
    """A downloader that can be used to download a single file, all the files and subdirectories
    in a given directory, or all the files for a given hive table.
    """

    def __init__(
        self,
        account_name: str,
        file_system: str,
        default_batch_size: int = 64,
        cache_dir: Optional[Union[Path, str]] = None,
    ):
        self.account_name = account_name
        self.file_system = file_system
        self.default_batch_size = default_batch_size
        if not self.exists():
            raise ADLSFileSystemNotFound(account_name, file_system)

        self.cache = None if cache_dir is None else ADLSFileSystemCache(cache_dir)

    def exists(self) -> bool:
        return self._run(self._exists)

    @staticmethod
    async def _exists(file_system_client: FileSystemClient) -> bool:
        try:
            return await file_system_client.exists()
        except azure.core.exceptions.AzureError as err:
            translate_azure_error(file_system_client, "", err)

    def file_exists(self, path: str) -> bool:
        return self._run(self._path_exists, path, False)

    def dir_exists(self, path: str) -> bool:
        return self._run(self._path_exists, path, True)

    async def _path_exists(
        self, file_system_client: FileSystemClient, path: str, directory: bool
    ) -> bool:
        try:
            info = await self._get_file_info(file_system_client, path)
        except azure.core.exceptions.ResourceNotFoundError:
            return False
        except azure.core.exceptions.AzureError as err:
            translate_azure_error(file_system_client, path, err)
        return directory == is_directory(info)

    @async_run
    async def _run(self, func: Callable[..., Awaitable], *args, **kwargs):
        """Main async runner function that passes credential and account info
        to create a file system client, which can then be passed into other async functions.

        :param func: an async function
        :param args: additional args for func
        :param kwargs: additional kwargs for func
        """
        async with DefaultAzureCredential(**get_credential_kwargs()) as credential:
            service_client = DataLakeServiceClient(
                account_url="{}://{}.dfs.core.windows.net".format("https", self.account_name),
                credential=credential,
            )
            async with service_client:
                async with service_client.get_file_system_client(
                    file_system=self.file_system
                ) as file_system_client:
                    return await func(file_system_client, *args, **kwargs)
    def _local_path_for(self, remote_path: str, local_path: Optional[Union[Path, str]]) -> Path:
        if local_path is None:
            if self.cache is None:
                # use the current working directory as the default location
                return Path(base_name(remote_path)).absolute()
            else:
                # use the cache as the default location
                return self.cache.cache_path(remote_path)
        else:
            # use the fully qualified explicit path
            return Path(local_path).absolute()

    async def _fetch_file(
        self,
        file_system_client: FileSystemClient,
        remote_path: str,
        local_path: Optional[Union[Path, str]] = None,
    ) -> Path:
        """async function that downloads a file locally given its remote path

        :returns: a local path of the downloaded file
        """
        # the local file path we will return to the caller;
        # may download into another path if there is a cache
        return_path = self._local_path_for(remote_path, local_path)
        download_path: Path

        if self.cache is None:
            download_path = return_path
        else:
            download_path = self.cache.cache_path(remote_path)

        dir_path = return_path.parent
        dir_path.mkdir(exist_ok=True, parents=True)

        locally_cached = False
        if self.cache:
            async with file_system_client.get_file_client(remote_path) as file_client:
                file_properties = await file_client.get_file_properties()
                if self.cache.is_valid_for(file_properties):
                    # local timestamp cache is up-to-date for this file; skip download
                    LOGGER.debug(f"Skipping download of cached {remote_path}")
                    locally_cached = True
        if not locally_cached:
            await async_download_or_use_verified(
                file_system_client, remote_path, download_path, cache=global_cache()
            )

        assert download_path.exists(), "File should have been downloaded by this point"
        if download_path != return_path:
            from_cache_path_to_local(download_path, return_path, link_opts=("ref", "hard"))

        return return_path

    async def _fetch_directory(
        self,
        file_system_client: FileSystemClient,
        remote_path: str,
        local_path: Optional[Union[Path, str]] = None,
        batch_size: Optional[int] = None,
        recursive: bool = True,
        path_filter: Optional[Callable[[PathProperties], bool]] = None,
    ) -> List[Path]:
        """Async function that downloads all the files within a given directory,
        including the files in the subdirectories when recursive = True

        :return: a list of the paths of the files downloaded
        """
        # normalize remote path to a standard relative dir path -
        # this ensures correctness of strip_prefix() below
        remote_path = remote_path.strip("/") + "/"
        dir_path = self._local_path_for(remote_path, local_path)
        dir_path.mkdir(exist_ok=True, parents=True)
        path_filter_ = _true if path_filter is None else path_filter

        # remove the remote directory prefix to determine a relative path for creation under dir_path
        def strip_prefix(name):
            return name.lstrip("/")[len(remote_path) :]

        paths = (
            PathPair(remote_path=path.name, local_path=dir_path / strip_prefix(path.name))
            async for path in file_system_client.get_paths(remote_path, recursive=recursive)
            if not path.is_directory and path_filter_(path)
        )

        local_paths = []
        async for batch in self._async_batch(paths, batch_size):
            local_paths.extend(
                await asyncio.gather(
                    *[
                        self._fetch_file(
                            file_system_client,
                            path_pair.remote_path,
                            path_pair.local_path,
                        )
                        for path_pair in batch
                    ]
                )
            )

        return local_paths
    async def _fetch_files(
        self,
        file_system_client: FileSystemClient,
        remote_paths: Union[Iterable[str], Mapping[str, Union[Path, str]]],
        batch_size: Optional[int] = None,
    ):
        if isinstance(remote_paths, MappingABC):
            remote_local_pairs = (
                PathPair(remote_path, Path(local_path))
                for remote_path, local_path in remote_paths.items()
            )
        else:
            remote_local_pairs = (
                PathPair(remote_path, self._local_path_for(remote_path, None))
                for remote_path in remote_paths
            )

        if batch_size is None:
            batch_size = self.default_batch_size

        local_paths = []
        for batch in iter(lambda: list(itertools.islice(remote_local_pairs, batch_size)), []):
            local_paths.extend(
                await asyncio.gather(
                    *[
                        self._fetch_file(
                            file_system_client,
                            path_pair.remote_path,
                            path_pair.local_path,
                        )
                        for path_pair in batch
                    ]
                )
            )

        return local_paths

    @staticmethod
    async def _put_file(
        file_system_client: FileSystemClient,
        local_path: Union[str, Path],
        remote_path: str,
        metadata: Optional[Mapping[str, str]] = None,
    ) -> str:
        """async function that uploads a local file to a remote path

        :returns: remote path of uploaded file
        """

        async with file_system_client.get_file_client(remote_path) as file_client:
            with open(local_path, "rb") as fp:
                decision = await async_upload_decision_and_settings(file_client.get_file_properties, fp)
                if decision.upload_required:
                    await file_client.upload_data(
                        fp,
                        overwrite=True,
                        content_settings=decision.content_settings,
                        connection_timeout=CONNECTION_TIMEOUT(),
                        chunk_size=UPLOAD_CHUNK_SIZE(),
                        metadata={**metadata_for_upload(), **(metadata or {})},
                    )

        return remote_path

    async def _put_directory(
        self,
        file_system_client: FileSystemClient,
        local_path: Union[str, Path],
        remote_path: str,
        recursive: bool = False,
        batch_size: Optional[int] = None,
        metadata: Optional[Mapping[str, str]] = None,
    ) -> List[str]:
        """async function that uploads all the files in a local directory to a remote path

        :returns: list of remote paths
        """

        local_path = str(local_path).rstrip("/") + "/"
        remote_path = remote_path.rstrip("/") + "/"

        if batch_size is None:
            batch_size = self.default_batch_size

        paths = []
        if recursive:
            for root, _subdirs, files in os.walk(local_path):
                for filename in files:
                    paths.append(
                        PathPair(
                            os.path.join(root, filename).replace(local_path, remote_path),
                            Path(os.path.join(root, filename)),
                        )
                    )
        else:
            for filename in os.listdir(local_path):
                if os.path.isfile(os.path.join(local_path, filename)):
                    paths.append(
                        PathPair(
                            os.path.join(remote_path, filename),
                            Path(os.path.join(local_path, filename)),
                        )
                    )

        remote_paths = []

        for batch in batcher(paths, batch_size):
            remote_paths.extend(
                await asyncio.gather(
                    *[
                        self._put_file(
                            file_system_client,
                            str(path_pair.local_path),
                            path_pair.remote_path,
                            metadata,
                        )
                        for path_pair in batch
                    ]
                )
            )

        return remote_paths
    async def _put_files(
        self,
        file_system_client: FileSystemClient,
        local_paths: Iterable[Union[str, Path]],
        remote_path: str,
        batch_size: Optional[int] = None,
        metadata: Optional[Mapping[str, str]] = None,
    ) -> List[str]:
        remote_path = remote_path.rstrip("/") + "/"

        paths: List[PathPair] = []

        for local_path in local_paths:
            file_name = os.path.basename(local_path)
            paths.append(PathPair(os.path.join(remote_path, file_name), Path(local_path)))

        if batch_size is None:
            batch_size = self.default_batch_size

        remote_paths = []

        for batch in batcher(paths, batch_size):
            remote_paths.extend(
                await asyncio.gather(
                    *[
                        self._put_file(
                            file_system_client,
                            str(path_pair.local_path),
                            path_pair.remote_path,
                            metadata,
                        )
                        for path_pair in batch
                    ]
                )
            )

        return remote_paths

    @staticmethod
    async def _get_file_info(file_system_client: FileSystemClient, remote_path: str) -> FileProperties:
        """Returns `FileProperties` for remote files.

        See :meth:`~ADLSFileSystem.get_file_info` for more details.
        """
        async with file_system_client.get_file_client(remote_path) as file_client:
            return await file_client.get_file_properties()

    async def _get_directory_info(
        self,
        file_system_client: FileSystemClient,
        remote_path: str,
        incl_subdirs: bool = False,
        batch_size: Optional[int] = None,
        recursive: bool = True,
        path_filter: Optional[Callable[[FileProperties], bool]] = None,
    ) -> List[FileProperties]:
        """Returns a list of `FileProperties` for files in a remote directory.

        See :meth:`~ADLSFileSystem.get_directory_info` for more details.
        """

        def incl_subdirs_(path: PathProperties) -> bool:
            if incl_subdirs:
                return False
            else:
                return path.is_directory

        path_filter_ = _true if path_filter is None else path_filter

        paths = (
            path.name
            async for path in file_system_client.get_paths(remote_path, recursive=recursive)
            if not incl_subdirs_(path) and path_filter_(path)
        )

        info = []

        async for batch in self._async_batch(paths, batch_size):
            info.extend(
                await asyncio.gather(*[self._get_file_info(file_system_client, path) for path in batch])
            )

        return info

    async def _get_files_info(
        self,
        file_system_client: FileSystemClient,
        remote_paths: Iterable[str],
        batch_size: Optional[int] = None,
    ) -> List[Dict[str, Any]]:
        """Returns a list of `FileProperties` for each file in a list of remote file paths.

        See :meth:`~ADLSFileSystem.get_files_info` for more details.
        """
        if batch_size is None:
            batch_size = self.default_batch_size

        info = []
        for batch in batcher(remote_paths, batch_size):
            info.extend(
                await asyncio.gather(*[self._get_file_info(file_system_client, path) for path in batch])
            )

        return info
    @staticmethod
    async def _delete_file(
        file_system_client: FileSystemClient,
        remote_path: str,
        if_modified_since: Optional[datetime.datetime] = None,
        if_unmodified_since: Optional[datetime.datetime] = None,
    ) -> DeleteProperties:
        """Deletes a remote file and returns a response details dict.

        See :meth:`~ADLSFileSystem.delete_file` for more details.
        """
        async with file_system_client.get_file_client(remote_path) as file_client:
            try:
                return DeleteProperties(
                    path=remote_path,
                    **await file_client.delete_file(
                        if_modified_since=if_modified_since,
                        if_unmodified_since=if_unmodified_since,
                    ),
                )
            except Exception as e:
                return DeleteProperties(path=remote_path, exception=e)

    @staticmethod
    async def _delete_directory(
        file_system_client: FileSystemClient,
        remote_path: str,
        if_modified_since: Optional[datetime.datetime] = None,
        if_unmodified_since: Optional[datetime.datetime] = None,
    ) -> DeleteProperties:
        """Deletes a remote directory and returns a response details dict.

        Warning: If `remote_path` is a file path, it will be deleted without raising an error.

        See :meth:`~ADLSFileSystem.delete_directory` for more details.
        """
        async with file_system_client.get_directory_client(remote_path) as directory_client:
            try:
                return DeleteProperties(
                    path=remote_path,
                    **await directory_client.delete_directory(
                        if_modified_since=if_modified_since,
                        if_unmodified_since=if_unmodified_since,
                    ),
                )
            except Exception as e:
                return DeleteProperties(path=remote_path, exception=e)

    async def _delete_in_directory(
        self,
        file_system_client: FileSystemClient,
        remote_path: str,
        if_modified_since: Optional[datetime.datetime] = None,
        if_unmodified_since: Optional[datetime.datetime] = None,
        cleanup: bool = False,
        batch_size: Optional[int] = None,
        recursive: bool = True,
        path_filter: Optional[Callable[[PathProperties], bool]] = None,
    ) -> List[DeleteProperties]:
        """Deletes files in a remote directory and returns a list of response details dicts.

        See :meth:`~ADLSFileSystem.delete_in_directory` for more details.
        """

        def cmp_subpath_relation(path1: str, path2: str) -> int:
            if path1.startswith(path2):
                return -1
            elif path2.startswith(path1):
                return 1
            return 0

        path_filter_ = _true if path_filter is None else path_filter

        file_paths = (
            path.name
            async for path in file_system_client.get_paths(remote_path, recursive=recursive)
            if not path.is_directory and path_filter_(path)
        )

        del_props = []
        async for batch in self._async_batch(file_paths, batch_size):
            del_props.extend(
                await asyncio.gather(
                    *[
                        self._delete_file(
                            file_system_client,
                            path,
                            if_modified_since,
                            if_unmodified_since,
                        )
                        for path in batch
                    ]
                )
            )

        if cleanup:
            dir_paths = [
                path.name
                async for path in file_system_client.get_paths(remote_path, recursive=recursive)
                if path.is_directory
            ]
            dir_paths.sort(key=cmp_to_key(cmp_subpath_relation))

            # Synchronous because order of operations must be maintained.
            # Inner empty subdirs must be deleted before outer subdirs.
            for path in dir_paths:
                del_props.extend(await asyncio.gather(self._delete_file(file_system_client, path)))

        return del_props
    async def _delete_files(
        self,
        file_system_client: FileSystemClient,
        remote_paths: Iterable[str],
        batch_size: Optional[int] = None,
        if_modified_since: Optional[datetime.datetime] = None,
        if_unmodified_since: Optional[datetime.datetime] = None,
    ) -> List[DeleteProperties]:
        """Deletes each remote file in a list of remote file paths.

        See :meth:`~ADLSFileSystem.delete_files` for more details.
        """
        if batch_size is None:
            batch_size = self.default_batch_size

        del_props = []
        for batch in batcher(remote_paths, batch_size):
            del_props.extend(
                await asyncio.gather(
                    *[
                        self._delete_file(
                            file_system_client,
                            path,
                            if_modified_since,
                            if_unmodified_since,
                        )
                        for path in batch
                    ]
                )
            )

        return del_props

    async def _delete_directories(
        self,
        file_system_client: FileSystemClient,
        remote_paths: Iterable[str],
        batch_size: Optional[int] = None,
        if_modified_since: Optional[datetime.datetime] = None,
        if_unmodified_since: Optional[datetime.datetime] = None,
    ) -> List[DeleteProperties]:
        """Deletes each remote directory in a list of remote directory paths.

        Warning: If any `remote_paths` are file paths, they will also be deleted
        without raising an error.

        See :meth:`~ADLSFileSystem.delete_directories` for more details.
        """
        if batch_size is None:
            batch_size = self.default_batch_size

        del_props = []
        for batch in batcher(remote_paths, batch_size):
            del_props.extend(
                await asyncio.gather(
                    *[
                        self._delete_directory(
                            file_system_client,
                            path,
                            if_modified_since,
                            if_unmodified_since,
                        )
                        for path in batch
                    ]
                )
            )

        return del_props

    async def _async_batch(
        self, it: AsyncIterable[T], size: Optional[int] = None
    ) -> AsyncIterator[List[T]]:
        """Async batch generator"""
        batch_size = size if size is not None else self.default_batch_size
        async with stream.chunks(it, batch_size).stream() as streamer:
            async for chunk in streamer:
                yield chunk
    def fetch_files(self, remote_paths: Union[Iterable[str], Mapping[str, Union[Path, str]]]):
        return self._run(self._fetch_files, remote_paths)

    def fetch_file(self, remote_path: str, local_path: Optional[Union[Path, str]] = None) -> Path:
        """Download the given remote file and save it into a given file path (local_path).
        In case there is a cache directory, the file is downloaded to a matching path under it.
        In that case, when local_path is passed, a hard link is made to the cache copy at the
        local_path location. (a hard link ensures that clearing the cache later will not affect
        the view of the file at local_path)

        :param remote_path: path in ADLS to download
        :param local_path: path for local file; if not given, use the name from the remote path when
            there is no cache, otherwise use the path under the cache dir corresponding to remote_path
        :return: the local path where the file was downloaded
        """
        return self._run(self._fetch_file, remote_path, local_path)

    def fetch_directory(
        self,
        remote_path: str,
        local_path: Optional[Union[Path, str]] = None,
        batch_size: Optional[int] = None,
        recursive: bool = True,
        path_filter: Optional[Callable[[PathProperties], bool]] = None,
    ) -> List[Path]:
        """Download all the files in a given directory and save them in a given directory path.
        In case there is a cache directory, the remote directory is reflected in a subdirectory under it.
        The semantics of local_path are the same as for fetch_file(),
        except that it references a local directory, and the hard links are made for each downloaded
        file under it when there is a cache.

        :param remote_path: the remote directory to download from
        :param local_path: path for the local directory; if not given, use the name from the
            remote path when there is no cache, otherwise use the path under the cache dir
            corresponding to remote_path
        :param batch_size: the size of each batch
        :param recursive: recurse into subdirectories when downloading?
        :param path_filter: optional callable taking an `azure.storage.filedatalake.PathProperties`
            and returning a bool indicating whether to download the corresponding file
        :return: List of local paths that were downloaded to
        """
        return self._run(
            self._fetch_directory,
            remote_path,
            local_path,
            batch_size=batch_size,
            recursive=recursive,
            path_filter=path_filter,
        )

    def fetch_hive_table(
        self,
        table: str,
        local_path: Optional[Union[Path, str]] = None,
        batch_size: Optional[int] = None,
        hive_prefix: str = DEFAULT_HIVE_PREFIX,
    ) -> List[Path]:
        """Download all the files in the directory for a given hive table

        :param table: e.g. database.tablename
        :param local_path: if not given, the files will be saved to ./database/tablename/
        :param batch_size: the size of each batch
        :param hive_prefix: the path prefix from the container root to the Hive warehouse
        """
        database, tablename = table.split(".")
        remote_path = (
            f"{hive_prefix.strip('/')}/{database}.db/{tablename}"
            if hive_prefix
            else f"{database}.db/{tablename}"
        )
        local_path_resolved = local_path if local_path is not None else Path(f"{database}/{tablename}")

        return self.fetch_directory(remote_path, local_path_resolved, batch_size)
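A minimal download sketch against the public fetch API above; the account, container, and paths are illustrative placeholders, not values taken from the package.

from thds.adls.impl import ADLSFileSystem

fs = ADLSFileSystem("myaccount", "mycontainer", cache_dir="/tmp/adls-cache")  # placeholder names
report = fs.fetch_file("raw/reports/2025/summary.parquet")  # returns the local Path
table_files = fs.fetch_hive_table("analytics.events")  # resolves to <hive prefix>/analytics.db/events

With cache_dir set, repeated fetches reuse the cached copy whenever it is newer than the remote blob and the sizes match.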
    def put_file(
        self,
        local_path: Union[str, Path],
        remote_path: str,
        metadata: Optional[Mapping[str, str]] = None,
    ) -> str:
        """async function that uploads a local file to a remote location

        :param local_path: The local path of the file to upload.
        :param remote_path: The remote path to which the file will be uploaded.
        :param metadata: Metadata to add to the file.
        :returns: remote path of uploaded file
        """

        return self._run(self._put_file, local_path, remote_path, metadata)

    def put_directory(
        self,
        local_path: Union[str, Path],
        remote_path: str,
        recursive: bool = False,
        batch_size: Optional[int] = None,
        metadata: Optional[Mapping[str, str]] = None,
    ) -> List[str]:
        """async function that uploads all the files in a local directory to a remote directory

        :param local_path: The local path of the directory to upload.
        :param remote_path: The remote path to which the directory will be uploaded.
        :param recursive: Recurse into subdirectories when uploading?
        :param batch_size: The size of each batch.
        :param metadata: Metadata to add to each file uploaded.
        :returns: list of remote paths
        """

        return self._run(self._put_directory, local_path, remote_path, recursive, batch_size, metadata)

    def put_files(
        self,
        local_paths: Iterable[Union[str, Path]],
        remote_path: str,
        batch_size: Optional[int] = None,
        metadata: Optional[Mapping[str, str]] = None,
    ) -> List[str]:
        """async function that uploads each file in a list of files to a remote directory

        :param local_paths: The local paths of the files to upload.
        :param remote_path: The remote path to which the files will be uploaded.
        :param batch_size: The size of each batch.
        :param metadata: Metadata to add to each file uploaded.
        :returns: list of remote paths
        """

        return self._run(self._put_files, local_paths, remote_path, batch_size, metadata)
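The corresponding upload sketch, reusing the `fs` object from the download example (paths again are placeholders). Note that the actual transfer in `_put_file` only happens when `decision.upload_required` is true.

fs.put_file("out/summary.csv", "curated/reports/summary.csv")
fs.put_directory("model_artifacts", "models/v3", recursive=True)
fs.put_files(["a.parquet", "b.parquet"], "curated/tables/facts", metadata={"source": "etl"})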
    def delete_file(
        self,
        remote_path: str,
        if_modified_since: Optional[datetime.datetime] = None,
        if_unmodified_since: Optional[datetime.datetime] = None,
    ) -> DeleteProperties:
        """Async function that deletes a remote file.

        :param remote_path: Path to remote file location.
        :param if_modified_since: Only delete file if it has been modified since given datetime.
            Default is `None`.
        :param if_unmodified_since: Only delete file if it has been unmodified since given datetime.
            Default is `None`.
        :return: `DeleteProperties`.
        """
        return self._run(self._delete_file, remote_path, if_modified_since, if_unmodified_since)

    def delete_directory(
        self,
        remote_path: str,
        if_modified_since: Optional[datetime.datetime] = None,
        if_unmodified_since: Optional[datetime.datetime] = None,
    ) -> DeleteProperties:
        """Async function that deletes a remote directory.

        Warning: If `remote_path` is a file path, it will be deleted without raising an error.

        :param remote_path: Path to remote directory location.
        :param if_modified_since: Only delete directory if it has been modified since given datetime.
            Default is `None`.
        :param if_unmodified_since: Only delete directory if it has been unmodified since given datetime.
            Default is `None`.
        :return: `DeleteProperties`.
        """
        return self._run(self._delete_directory, remote_path, if_modified_since, if_unmodified_since)

    def delete_in_directory(
        self,
        remote_path: str,
        if_modified_since: Optional[datetime.datetime] = None,
        if_unmodified_since: Optional[datetime.datetime] = None,
        cleanup: bool = False,
        batch_size: Optional[int] = None,
        recursive: bool = True,
        path_filter: Optional[Callable[[PathProperties], bool]] = None,
    ) -> List[DeleteProperties]:
        """Async function that deletes all files in a remote directory, with the option to also delete
        subdirectories left empty afterwards.

        Note #1: The cleanup step is blocking due to the need to maintain the order in which empty
        subdirectories must be deleted. Inner empty directories have to be deleted before
        outer empty directories so that the outer directories are empty when their turn comes.

        Note #2: If cleanup is True, the function will attempt to delete all subdirectories;
        however, non-empty subdirectories will produce an exception that gets captured and written
        to their respective response details dicts.

        :param remote_path: Path to remote directory location.
        :param if_modified_since: Only delete files if they have been modified since given datetime.
            Default is `None`.
        :param if_unmodified_since: Only delete files if they have been unmodified since given datetime.
            Default is `None`.
        :param cleanup: Whether to delete subdirectories left empty after file deletion.
            Default is `False`.
        :param batch_size: Number of files to delete in each batch.
            Default is `None`.
        :param recursive: Whether to recurse into subdirectories when deleting.
            Default is `True`.
        :param path_filter: Optional callable taking a `PathProperties` and returning a bool
            indicating whether to delete the corresponding file.
        :return: List of `DeleteProperties`.
        """
        return self._run(
            self._delete_in_directory,
            remote_path,
            if_modified_since,
            if_unmodified_since,
            cleanup,
            batch_size,
            recursive,
            path_filter,
        )

    def delete_files(
        self,
        remote_paths: Iterable[str],
        batch_size: Optional[int] = None,
        if_modified_since: Optional[datetime.datetime] = None,
        if_unmodified_since: Optional[datetime.datetime] = None,
    ) -> List[DeleteProperties]:
        """Async function that deletes each file in a list of remote file paths.

        :param remote_paths: List of paths to remote file locations.
        :param batch_size: Number of files to delete in each batch.
            Default is `None`.
        :param if_modified_since: Only delete files if they have been modified since given datetime.
            Default is `None`.
        :param if_unmodified_since: Only delete files if they have been unmodified since given datetime.
            Default is `None`.
        :return: List of `DeleteProperties`.
        """
        return self._run(
            self._delete_files,
            remote_paths,
            batch_size,
            if_modified_since,
            if_unmodified_since,
        )

    def delete_directories(
        self,
        remote_paths: Iterable[str],
        batch_size: Optional[int] = None,
        if_modified_since: Optional[datetime.datetime] = None,
        if_unmodified_since: Optional[datetime.datetime] = None,
    ) -> List[DeleteProperties]:
        """Async function that deletes each directory in a list of remote directory paths.

        Warning: If any `remote_paths` are file paths, they will also be deleted
        without raising an error.

        :param remote_paths: List of paths to remote directory locations.
        :param batch_size: Number of directories to delete in each batch.
            Default is `None`.
        :param if_modified_since: Only delete directory if it has been modified since given datetime.
            Default is `None`.
        :param if_unmodified_since: Only delete directory if it has been unmodified since given datetime.
            Default is `None`.
        :return: List of `DeleteProperties`.
        """
        return self._run(
            self._delete_directories,
            remote_paths,
            batch_size,
            if_modified_since,
            if_unmodified_since,
        )
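Per-path deletion failures are captured rather than raised: each result is a `DeleteProperties` whose `exception` field is set when that delete failed. A short sketch of checking the results (placeholder path, `fs` as in the earlier sketches):

results = fs.delete_in_directory("scratch/tmp-exports", cleanup=True)
failed = [r.path for r in results if r.exception is not None]
if failed:
    print(f"failed to delete: {failed}")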
    def get_file_info(self, remote_path: str) -> FileProperties:
        """Async function that gets `FileProperties` for a remote file.

        :param remote_path: Path to remote file location.
        :return: `FileProperties`
        """
        return self._run(self._get_file_info, remote_path)

    def get_directory_info(
        self,
        remote_path: str,
        incl_subdirs: bool = False,
        batch_size: Optional[int] = None,
        recursive: bool = True,
        path_filter: Optional[Callable[[FileProperties], bool]] = None,
    ) -> List[FileProperties]:
        """Async function that gets `FileProperties` for all files in a remote directory.

        :param remote_path: Path to a remote directory location.
        :param incl_subdirs: Whether to include `FileProperties` for the subdirectories themselves.
            Default is `False`.
        :param batch_size: Number of `FileProperties` to get in each batch.
            Default is `None`.
        :param recursive: Whether to recurse into subdirectories when getting `FileProperties`.
        :param path_filter: Optional callable taking a `PathProperties` and returning a bool
            indicating whether to include the corresponding file.
        :return: List of `FileProperties`.
        """
        return self._run(
            self._get_directory_info,
            remote_path,
            incl_subdirs,
            batch_size,
            recursive,
            path_filter,
        )

    def get_files_info(
        self, remote_paths: Iterable[str], batch_size: Optional[int] = None
    ) -> List[FileProperties]:
        """Async function that gets `FileProperties` for each file in a list of remote file paths.

        :param remote_paths: List of paths to remote file locations.
        :param batch_size: Number of `FileProperties` to get in each batch.
            Default is `None`.
        :return: List of `FileProperties`.
        """
        return self._run(self._get_files_info, remote_paths, batch_size)


class ADLSFileSystemCache:
    def __init__(self, cache_dir: Union[Path, str]):
        self.cache_dir = Path(cache_dir).absolute()
        self._init_dir()

    def _init_dir(self):
        if self.cache_dir.exists() and not self.cache_dir.is_dir():
            raise FileExistsError(f"{self.cache_dir} exists but is not a directory; can't use as cache")
        else:
            self.cache_dir.mkdir(parents=True, exist_ok=True)

    def clear(self):
        if self.cache_dir.exists():
            shutil.rmtree(self.cache_dir)

        self._init_dir()

    def __contains__(self, path: str) -> bool:
        """Check for existence of a path in the cache for a *blob* (not for directories)"""
        return self.cache_path(path).is_file()

    def remove(self, path: str):
        """Remove a path from the cache. This is irrespective of type (files and dirs), i.e.
        the end result should be that the cache is ready to have new content written at `path`,
        either as a file or a directory. In case a cache path corresponding to relative path `path`
        doesn't exist locally, no action is taken."""
        cache_path = self.cache_path(path)

        if cache_path.is_dir():
            shutil.rmtree(cache_path)
        elif cache_path.exists():
            os.remove(cache_path)

    def cache_path(self, path: str):
        """Return the local path in the cache corresponding to the relative ADLS path `path`"""
        # ADLS paths are always forward-slash separated, hence we don't use os.path.split here
        parts = path.split("/")
        return self.cache_dir.joinpath(*parts)

    def file_handle(self, path: str, mode: str) -> IO:
        """Return a file handle to the local path in the cache corresponding to the relative ADLS
        path, `path`, opened in mode `mode`. Closing the handle is the responsibility of the caller.
        """
        file_path = self.cache_path(path)
        dir_path = Path(file_path).parent
        dir_path.mkdir(parents=True, exist_ok=True)
        return open(file_path, mode)

    def file_properties(self, path: str) -> FileProperties:
        """Return an `azure.storage.filedatalake.FileProperties` corresponding to the properties of the
        local file."""
        cache_path = self.cache_path(path)
        if not cache_path.is_file():
            raise FileNotFoundError(f"No file at {path} in cache at {self.cache_dir}")
        cache_stat = os.stat(cache_path)
        cache_mod_time = datetime.datetime.fromtimestamp(cache_stat.st_mtime).astimezone(
            datetime.timezone.utc
        )
        fp = FileProperties(name=path)
        fp.last_modified = cache_mod_time
        fp.size = cache_stat.st_size
        return fp

    def is_valid_for(self, adls_properties: FileProperties) -> bool:
        """Check if the cache has a valid copy of a blob at a given relative ADLS path.
        This is checked by comparison of the local file properties with an
        `azure.storage.filedatalake.FileProperties` detailing the properties of the ADLS blob.
        To be valid the local cache path should:
        - exist and be a proper file
        - have a newer last-modified time than that of the ADLS blob
        - have the same size as the ADLS blob
        """
        assert adls_properties.name
        if adls_properties.name not in self:
            return False

        cache_properties = self.file_properties(adls_properties.name)
        if not cache_properties.last_modified:
            return False
        return (cache_properties.last_modified > adls_properties.last_modified) and (
            cache_properties.size == adls_properties.size
        )
def make_adls_filesystem_getter(
    account_name: str,
    file_system: str,
    default_batch_size: int = 64,
    cache_dir: Optional[Union[Path, str]] = None,
) -> Callable[[], ADLSFileSystem]:
    """Wrapper for returning a :py:class:`core.adls.ADLSFileSystem` lazily."""

    @lazy.lazy
    def get_adls_filesystem() -> ADLSFileSystem:
        return ADLSFileSystem(account_name, file_system, default_batch_size, cache_dir)

    return get_adls_filesystem