thds.adls 3.0.20250116223841__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of thds.adls might be problematic.

thds/adls/impl.py ADDED
@@ -0,0 +1,1111 @@
+ import asyncio
+ import datetime
+ import itertools
+ import logging
+ import os
+ import shutil
+ from collections.abc import Mapping as MappingABC
+ from functools import cmp_to_key, wraps
+ from pathlib import Path
+ from typing import (
+     IO,
+     Any,
+     AsyncIterable,
+     AsyncIterator,
+     Awaitable,
+     Callable,
+     Dict,
+     Iterable,
+     List,
+     Mapping,
+     Optional,
+     TypeVar,
+     Union,
+ )
+
+ import attr
+ import azure.core.exceptions
+ from aiostream import stream
+ from azure.identity.aio import DefaultAzureCredential
+ from azure.storage.filedatalake import FileProperties, PathProperties
+ from azure.storage.filedatalake.aio import DataLakeServiceClient, FileSystemClient
+
+ from thds.core import lazy, log
+
+ from ._upload import async_upload_decision_and_settings, metadata_for_upload
+ from .conf import CONNECTION_TIMEOUT, UPLOAD_CHUNK_SIZE
+ from .download import async_download_or_use_verified
+ from .errors import translate_azure_error
+ from .file_properties import is_directory
+ from .ro_cache import from_cache_path_to_local, global_cache
+ from .shared_credential import get_credential_kwargs
+
+ LOGGER = log.getLogger(__name__)
+ log.getLogger("azure.core").setLevel(logging.WARNING)
+ log.getLogger("azure.identity").setLevel(logging.WARNING)
+
+ DEFAULT_HIVE_PREFIX = os.getenv("CORE_HIVE_PREFIX", "")
+ WEST_HIVE_PREFIX = "hive/warehouse"  # For easy access while we may need backwards compatibility
+
+ T = TypeVar("T")
+
+
+ def async_run(func: Callable[..., Awaitable[T]]) -> Callable[..., T]:
+     """Used to decorate the main runner function to avoid calling asyncio.run too many times
+
+     :param func: any async function
+     :return:
+     """
+
+     @wraps(func)
+     def wrapper(*args, **kwargs):
+         return asyncio.run(func(*args, **kwargs))  # type: ignore
+
+     return wrapper
+
+
+ def base_name(remote_path: str) -> str:
+     return remote_path.rstrip("/").split("/")[-1]
+
+
+ def _true(_):
+     return True
+
+
+ def batcher(it: Iterable[T], size: int = 1) -> Iterable[List[T]]:
+     stream = iter(it)
+
+     def _slice():
+         return list(itertools.islice(stream, size))
+
+     yield from iter(_slice, [])
+
+
+ @attr.s(auto_attribs=True)
+ class ADLSFileSystemNotFound(ConnectionError):
+     account_name: str
+     file_system: str
+
+     def __str__(self):
+         return f"File system {self.file_system!r} not found under account {self.account_name!r}"
+
+
+ @attr.s(auto_attribs=True, frozen=True)
+ class PathPair:
+     """Store the remote path and the corresponding local path of a file"""
+
+     remote_path: str
+     local_path: Path
+
+
+ @attr.s(auto_attribs=True)
+ class DeleteProperties:
+     """Convenience class around dicts returned in file deletion."""
+
+     path: str
+     date: Optional[datetime.datetime] = None
+     version: Optional[str] = None
+     request_id: Optional[str] = None
+     deletion_id: Optional[str] = None  # Inferring type based on request_id.
+     continuation: Optional[Any] = None  # Cannot find details on this.
+     exception: Optional[Exception] = None
+
+
+ class ADLSFileSystem:
+     """A downloader that can be used to download a single file, all the files and subdirectories
+     in a given directory, or all the files for a given hive table.
+     """
+
+     def __init__(
+         self,
+         account_name: str,
+         file_system: str,
+         default_batch_size: int = 64,
+         cache_dir: Optional[Union[Path, str]] = None,
+     ):
+         self.account_name = account_name
+         self.file_system = file_system
+         self.default_batch_size = default_batch_size
+         if not self.exists():
+             raise ADLSFileSystemNotFound(account_name, file_system)
+
+         self.cache = None if cache_dir is None else ADLSFileSystemCache(cache_dir)
+
+     def exists(self) -> bool:
+         return self._run(self._exists)
+
+     @staticmethod
+     async def _exists(file_system_client: FileSystemClient) -> bool:
+         try:
+             return await file_system_client.exists()
+         except azure.core.exceptions.AzureError as err:
+             translate_azure_error(file_system_client, "", err)
+
+     def file_exists(self, path: str) -> bool:
+         return self._run(self._path_exists, path, False)
+
+     def dir_exists(self, path: str) -> bool:
+         return self._run(self._path_exists, path, True)
+
+     async def _path_exists(
+         self, file_system_client: FileSystemClient, path: str, directory: bool
+     ) -> bool:
+         try:
+             info = await self._get_file_info(file_system_client, path)
+         except azure.core.exceptions.ResourceNotFoundError:
+             return False
+         except azure.core.exceptions.AzureError as err:
+             translate_azure_error(file_system_client, path, err)
+         return directory == is_directory(info)
+
+     @async_run
+     async def _run(self, func: Callable[..., Awaitable], *args, **kwargs):
+         """Main async runner function that passes credential and account info
+         to create a file system client, which can then be passed into other async functions.
+
+         :param func: an async function
+         :param args: additional args for func
+         :param kwargs: additional kwargs for func
+         """
+         async with DefaultAzureCredential(**get_credential_kwargs()) as credential:
+             service_client = DataLakeServiceClient(
+                 account_url="{}://{}.dfs.core.windows.net".format("https", self.account_name),
+                 credential=credential,
+             )
+             async with service_client:
+                 async with service_client.get_file_system_client(
+                     file_system=self.file_system
+                 ) as file_system_client:
+                     return await func(file_system_client, *args, **kwargs)
+
+     def _local_path_for(self, remote_path: str, local_path: Optional[Union[Path, str]]) -> Path:
+         if local_path is None:
+             if self.cache is None:
+                 # use the current working directory as the default location
+                 return Path(base_name(remote_path)).absolute()
+             else:
+                 # use the cache as the default location
+                 return self.cache.cache_path(remote_path)
+         else:
+             # use the fully qualified explicit path
+             return Path(local_path).absolute()
+
+     async def _fetch_file(
+         self,
+         file_system_client: FileSystemClient,
+         remote_path: str,
+         local_path: Optional[Union[Path, str]] = None,
+     ) -> Path:
+         """async function that downloads a file locally given its remote path
+
+         :returns: a local path of the downloaded file
+         """
+         # the local file path we will return to the caller;
+         # may download into another path if there is a cache
+         return_path = self._local_path_for(remote_path, local_path)
+         download_path: Path
+
+         if self.cache is None:
+             download_path = return_path
+         else:
+             download_path = self.cache.cache_path(remote_path)
+
+         dir_path = return_path.parent
+         dir_path.mkdir(exist_ok=True, parents=True)
+
+         locally_cached = False
+         if self.cache:
+             async with file_system_client.get_file_client(remote_path) as file_client:
+                 file_properties = await file_client.get_file_properties()
+                 if self.cache.is_valid_for(file_properties):
+                     # local timestamp cache is up-to-date for this file; skip download
+                     LOGGER.debug(f"Skipping download of cached {remote_path}")
+                     locally_cached = True
+         if not locally_cached:
+             await async_download_or_use_verified(
+                 file_system_client, remote_path, download_path, cache=global_cache()
+             )
+
+         assert download_path.exists(), "File should have been downloaded by this point"
+         if download_path != return_path:
+             from_cache_path_to_local(download_path, return_path, link_opts=("ref", "hard"))
+
+         return return_path
+
+     async def _fetch_directory(
+         self,
+         file_system_client: FileSystemClient,
+         remote_path: str,
+         local_path: Optional[Union[Path, str]] = None,
+         batch_size: Optional[int] = None,
+         recursive: bool = True,
+         path_filter: Optional[Callable[[PathProperties], bool]] = None,
+     ) -> List[Path]:
+         """Async function that downloads all the files within a given directory,
+         including the files in the subdirectories when recursive = True
+
+         :return: a list of the paths of the files downloaded
+         """
+         # normalize remote path to a standard relative dir path -
+         # this ensures correctness of strip_prefix() below
+         remote_path = remote_path.strip("/") + "/"
+         dir_path = self._local_path_for(remote_path, local_path)
+         dir_path.mkdir(exist_ok=True, parents=True)
+         path_filter_ = _true if path_filter is None else path_filter
+
+         # remove the remote directory prefix to determine a relative path for creation under dir_path
+         def strip_prefix(name):
+             return name.lstrip("/")[len(remote_path) :]
+
+         paths = (
+             PathPair(remote_path=path.name, local_path=dir_path / strip_prefix(path.name))
+             async for path in file_system_client.get_paths(remote_path, recursive=recursive)
+             if not path.is_directory and path_filter_(path)
+         )
+
+         local_paths = []
+         async for batch in self._async_batch(paths, batch_size):
+             local_paths.extend(
+                 await asyncio.gather(
+                     *[
+                         self._fetch_file(
+                             file_system_client,
+                             path_pair.remote_path,
+                             path_pair.local_path,
+                         )
+                         for path_pair in batch
+                     ]
+                 )
+             )
+
+         return local_paths
+
+     async def _fetch_files(
+         self,
+         file_system_client: FileSystemClient,
+         remote_paths: Union[Iterable[str], Mapping[str, Union[Path, str]]],
+         batch_size: Optional[int] = None,
+     ):
+         if isinstance(remote_paths, MappingABC):
+             remote_local_pairs = (
+                 PathPair(remote_path, Path(local_path))
+                 for remote_path, local_path in remote_paths.items()
+             )
+         else:
+             remote_local_pairs = (
+                 PathPair(remote_path, self._local_path_for(remote_path, None))
+                 for remote_path in remote_paths
+             )
+
+         if batch_size is None:
+             batch_size = self.default_batch_size
+
+         local_paths = []
+         for batch in iter(lambda: list(itertools.islice(remote_local_pairs, batch_size)), []):
+             local_paths.extend(
+                 await asyncio.gather(
+                     *[
+                         self._fetch_file(
+                             file_system_client,
+                             path_pair.remote_path,
+                             path_pair.local_path,
+                         )
+                         for path_pair in batch
+                     ]
+                 )
+             )
+
+         return local_paths
+
+     @staticmethod
+     async def _put_file(
+         file_system_client: FileSystemClient,
+         local_path: Union[str, Path],
+         remote_path: str,
+         metadata: Optional[Mapping[str, str]] = None,
+     ) -> str:
+         """async function that uploads a local file to a remote path
+
+         :returns: remote path of uploaded file
+         """
+
+         async with file_system_client.get_file_client(remote_path) as file_client:
+             with open(local_path, "rb") as fp:
+                 decision = await async_upload_decision_and_settings(file_client.get_file_properties, fp)
+                 if decision.upload_required:
+                     await file_client.upload_data(
+                         fp,
+                         overwrite=True,
+                         content_settings=decision.content_settings,
+                         connection_timeout=CONNECTION_TIMEOUT(),
+                         chunk_size=UPLOAD_CHUNK_SIZE(),
+                         metadata={**metadata_for_upload(), **(metadata or {})},
+                     )
+
+         return remote_path
+
+     async def _put_directory(
+         self,
+         file_system_client: FileSystemClient,
+         local_path: Union[str, Path],
+         remote_path: str,
+         recursive: bool = False,
+         batch_size: Optional[int] = None,
+         metadata: Optional[Mapping[str, str]] = None,
+     ) -> List[str]:
+         """async function that uploads all the files in a local directory to a remote path
+
+         :returns: list of remote paths
+         """
+
+         local_path = str(local_path).rstrip("/") + "/"
+         remote_path = remote_path.rstrip("/") + "/"
+
+         if batch_size is None:
+             batch_size = self.default_batch_size
+
+         paths = []
+         if recursive:
+             for root, _subdirs, files in os.walk(local_path):
+                 for filename in files:
+                     paths.append(
+                         PathPair(
+                             os.path.join(root, filename).replace(local_path, remote_path),
+                             Path(os.path.join(root, filename)),
+                         )
+                     )
+         else:
+             for filename in os.listdir(local_path):
+                 if os.path.isfile(os.path.join(local_path, filename)):
+                     paths.append(
+                         PathPair(
+                             os.path.join(remote_path, filename),
+                             Path(os.path.join(local_path, filename)),
+                         )
+                     )
+
+         remote_paths = []
+
+         for batch in batcher(paths, batch_size):
+             remote_paths.extend(
+                 await asyncio.gather(
+                     *[
+                         self._put_file(
+                             file_system_client,
+                             str(path_pair.local_path),
+                             path_pair.remote_path,
+                             metadata,
+                         )
+                         for path_pair in batch
+                     ]
+                 )
+             )
+
+         return remote_paths
+
+     async def _put_files(
+         self,
+         file_system_client: FileSystemClient,
+         local_paths: Iterable[Union[str, Path]],
+         remote_path: str,
+         batch_size: Optional[int] = None,
+         metadata: Optional[Mapping[str, str]] = None,
+     ) -> List[str]:
+         remote_path = remote_path.rstrip("/") + "/"
+
+         paths: List[PathPair] = []
+
+         for local_path in local_paths:
+             file_name = os.path.basename(local_path)
+             paths.append(PathPair(os.path.join(remote_path, file_name), Path(local_path)))
+
+         if batch_size is None:
+             batch_size = self.default_batch_size
+
+         remote_paths = []
+
+         for batch in batcher(paths, batch_size):
+             remote_paths.extend(
+                 await asyncio.gather(
+                     *[
+                         self._put_file(
+                             file_system_client,
+                             str(path_pair.local_path),
+                             path_pair.remote_path,
+                             metadata,
+                         )
+                         for path_pair in batch
+                     ]
+                 )
+             )
+
+         return remote_paths
+
+     @staticmethod
+     async def _get_file_info(file_system_client: FileSystemClient, remote_path: str) -> FileProperties:
+         """Returns `FileProperties` for remote files.
+
+         See :meth:`~ADLSFileSystem.get_file_info` for more details.
+         """
+         async with file_system_client.get_file_client(remote_path) as file_client:
+             return await file_client.get_file_properties()
+
+     async def _get_directory_info(
+         self,
+         file_system_client: FileSystemClient,
+         remote_path: str,
+         incl_subdirs: bool = False,
+         batch_size: Optional[int] = None,
+         recursive: bool = True,
+         path_filter: Optional[Callable[[FileProperties], bool]] = None,
+     ) -> List[FileProperties]:
+         """Returns a list of `FileProperties` for files in a remote directory.
+
+         See :meth:`~ADLSFileSystem.get_directory_info` for more details.
+         """
+
+         def incl_subdirs_(path: PathProperties) -> bool:
+             if incl_subdirs:
+                 return False
+             else:
+                 return path.is_directory
+
+         path_filter_ = _true if path_filter is None else path_filter
+
+         paths = (
+             path.name
+             async for path in file_system_client.get_paths(remote_path, recursive=recursive)
+             if not incl_subdirs_(path) and path_filter_(path)
+         )
+
+         info = []
+
+         async for batch in self._async_batch(paths, batch_size):
+             info.extend(
+                 await asyncio.gather(*[self._get_file_info(file_system_client, path) for path in batch])
+             )
+
+         return info
+
+     async def _get_files_info(
+         self,
+         file_system_client: FileSystemClient,
+         remote_paths: Iterable[str],
+         batch_size: Optional[int] = None,
+     ) -> List[Dict[str, Any]]:
+         """Returns a list of `FileProperties` for each file in a list of remote file paths.
+
+         See :meth:`~ADLSFileSystem.get_files_info` for more details.
+         """
+         if batch_size is None:
+             batch_size = self.default_batch_size
+
+         info = []
+         for batch in batcher(remote_paths, batch_size):
+             info.extend(
+                 await asyncio.gather(*[self._get_file_info(file_system_client, path) for path in batch])
+             )
+
+         return info
+
+     @staticmethod
+     async def _delete_file(
+         file_system_client: FileSystemClient,
+         remote_path: str,
+         if_modified_since: Optional[datetime.datetime] = None,
+         if_unmodified_since: Optional[datetime.datetime] = None,
+     ) -> DeleteProperties:
+         """Deletes a remote file and returns a response details dict.
+
+         See :meth:`~ADLSFileSystem.delete_file` for more details.
+         """
+         async with file_system_client.get_file_client(remote_path) as file_client:
+             try:
+                 return DeleteProperties(
+                     path=remote_path,
+                     **await file_client.delete_file(
+                         if_modified_since=if_modified_since,
+                         if_unmodified_since=if_unmodified_since,
+                     ),
+                 )
+             except Exception as e:
+                 return DeleteProperties(path=remote_path, exception=e)
+
+     @staticmethod
+     async def _delete_directory(
+         file_system_client: FileSystemClient,
+         remote_path: str,
+         if_modified_since: Optional[datetime.datetime] = None,
+         if_unmodified_since: Optional[datetime.datetime] = None,
+     ) -> DeleteProperties:
+         """Deletes a remote directory and returns a response details dict.
+
+         Warning: If `remote_path` is a file path, it will be deleted without raising an error.
+
+         See :meth:`~ADLSFileSystem.delete_directory` for more details.
+         """
+         async with file_system_client.get_directory_client(remote_path) as directory_client:
+             try:
+                 return DeleteProperties(
+                     path=remote_path,
+                     **await directory_client.delete_directory(
+                         if_modified_since=if_modified_since,
+                         if_unmodified_since=if_unmodified_since,
+                     ),
+                 )
+             except Exception as e:
+                 return DeleteProperties(path=remote_path, exception=e)
+
+     async def _delete_in_directory(
+         self,
+         file_system_client: FileSystemClient,
+         remote_path: str,
+         if_modified_since: Optional[datetime.datetime] = None,
+         if_unmodified_since: Optional[datetime.datetime] = None,
+         cleanup: bool = False,
+         batch_size: Optional[int] = None,
+         recursive: bool = True,
+         path_filter: Optional[Callable[[PathProperties], bool]] = None,
+     ) -> List[DeleteProperties]:
+         """Deletes files in a remote directory and returns a list of response details dicts.
+
+         See :meth:`~ADLSFileSystem.delete_in_directory` for more details.
+         """
+
+         def cmp_subpath_relation(path1: str, path2: str) -> int:
+             if path1.startswith(path2):
+                 return -1
+             elif path2.startswith(path1):
+                 return 1
+             return 0
+
+         path_filter_ = _true if path_filter is None else path_filter
+
+         file_paths = (
+             path.name
+             async for path in file_system_client.get_paths(remote_path, recursive=recursive)
+             if not path.is_directory and path_filter_(path)
+         )
+
+         del_props = []
+         async for batch in self._async_batch(file_paths, batch_size):
+             del_props.extend(
+                 await asyncio.gather(
+                     *[
+                         self._delete_file(
+                             file_system_client,
+                             path,
+                             if_modified_since,
+                             if_unmodified_since,
+                         )
+                         for path in batch
+                     ]
+                 )
+             )
+
+         if cleanup:
+             dir_paths = [
+                 path.name
+                 async for path in file_system_client.get_paths(remote_path, recursive=recursive)
+                 if path.is_directory
+             ]
+             dir_paths.sort(key=cmp_to_key(cmp_subpath_relation))
+
+             # Synchronous because order of operations must be maintained.
+             # Inner empty subdirs must be deleted before outer subdirs.
+             for path in dir_paths:
+                 del_props.extend(await asyncio.gather(self._delete_file(file_system_client, path)))
+
+         return del_props
+
+     async def _delete_files(
+         self,
+         file_system_client: FileSystemClient,
+         remote_paths: Iterable[str],
+         batch_size: Optional[int] = None,
+         if_modified_since: Optional[datetime.datetime] = None,
+         if_unmodified_since: Optional[datetime.datetime] = None,
+     ) -> List[DeleteProperties]:
+         """Deletes each remote file in a list of remote file paths.
+
+         See :meth:`~ADLSFileSystem.delete_files` for more details.
+         """
+         if batch_size is None:
+             batch_size = self.default_batch_size
+
+         del_props = []
+         for batch in batcher(remote_paths, batch_size):
+             del_props.extend(
+                 await asyncio.gather(
+                     *[
+                         self._delete_file(
+                             file_system_client,
+                             path,
+                             if_modified_since,
+                             if_unmodified_since,
+                         )
+                         for path in batch
+                     ]
+                 )
+             )
+
+         return del_props
+
+     async def _delete_directories(
+         self,
+         file_system_client: FileSystemClient,
+         remote_paths: Iterable[str],
+         batch_size: Optional[int] = None,
+         if_modified_since: Optional[datetime.datetime] = None,
+         if_unmodified_since: Optional[datetime.datetime] = None,
+     ) -> List[DeleteProperties]:
+         """Deletes each remote directory in a list of remote directory paths.
+
+         Warning: If any `remote_paths` are file paths, they will also be deleted
+         without raising an error.
+
+         See :meth:`~ADLSFileSystem.delete_directories` for more details.
+         """
+         if batch_size is None:
+             batch_size = self.default_batch_size
+
+         del_props = []
+         for batch in batcher(remote_paths, batch_size):
+             del_props.extend(
+                 await asyncio.gather(
+                     *[
+                         self._delete_directory(
+                             file_system_client,
+                             path,
+                             if_modified_since,
+                             if_unmodified_since,
+                         )
+                         for path in batch
+                     ]
+                 )
+             )
+
+         return del_props
+
+     async def _async_batch(
+         self, it: AsyncIterable[T], size: Optional[int] = None
+     ) -> AsyncIterator[List[T]]:
+         """Async batch generator"""
+         batch_size = size if size is not None else self.default_batch_size
+         async with stream.chunks(it, batch_size).stream() as streamer:
+             async for chunk in streamer:
+                 yield chunk
+
+     def fetch_files(self, remote_paths: Union[Iterable[str], Mapping[str, Union[Path, str]]]):
+         return self._run(self._fetch_files, remote_paths)
+
+     def fetch_file(self, remote_path: str, local_path: Optional[Union[Path, str]] = None) -> Path:
+         """Download the given remote file and save it into a given file path (local_path).
+         In case there is a cache directory, the file is downloaded to a matching path under it.
+         In that case, when local_path is passed, a hard link is made to the cache copy at the
+         local_path location. (a hard link ensures that clearing the cache later will not affect
+         the view of the file at local_path)
+
+         :param remote_path: path in ADLS to download
+         :param local_path: path for local file; if not given, use the name from the remote path when
+             there is no cache, otherwise use the path under the cache dir corresponding to remote_path
+         :return: the local path where the file was downloaded
+         """
+         return self._run(self._fetch_file, remote_path, local_path)
+
+     def fetch_directory(
+         self,
+         remote_path: str,
+         local_path: Optional[Union[Path, str]] = None,
+         batch_size: Optional[int] = None,
+         recursive: bool = True,
+         path_filter: Optional[Callable[[PathProperties], bool]] = None,
+     ) -> List[Path]:
+         """Download all the files in a given directory and save them in a given directory path.
+         In case there is a cache directory, the remote directory is reflected in a subdirectory under it.
+         The semantics of local_path are the same as for fetch_file(),
+         except that it references a local directory, and the hard links are made for each downloaded
+         file under it when there is a cache.
+
+         :param remote_path: the remote directory to download from
+         :param local_path: path for the local directory; if not given, use the name from the
+             remote path when there is no cache, otherwise use the path under the cache dir
+             corresponding to remote_path
+         :param batch_size: the size of each batch
+         :param recursive: recurse into subdirectories when downloading?
+         :param path_filter: optional callable taking an `azure.storage.filedatalake.PathProperties`
+             and returning a bool indicating whether to download the corresponding file
+         :return: List of local paths that were downloaded to
+         """
+         return self._run(
+             self._fetch_directory,
+             remote_path,
+             local_path,
+             batch_size=batch_size,
+             recursive=recursive,
+             path_filter=path_filter,
+         )
+
+     def fetch_hive_table(
+         self,
+         table: str,
+         local_path: Optional[Union[Path, str]] = None,
+         batch_size: Optional[int] = None,
+         hive_prefix: str = DEFAULT_HIVE_PREFIX,
+     ) -> List[Path]:
+         """Download all the files in the directory for a given hive table
+
+         :param table: e.g. database.tablename
+         :param local_path: if not given, the files will be saved to ./database/tablename/
+         :param batch_size: the size of each batch
+         :param hive_prefix: the path prefix from the container root to the Hive warehouse
+         """
+         database, tablename = table.split(".")
+         remote_path = (
+             f"{hive_prefix.strip('/')}/{database}.db/{tablename}"
+             if hive_prefix
+             else f"{database}.db/{tablename}"
+         )
+         local_path_resolved = local_path if local_path is not None else Path(f"{database}/{tablename}")
+
+         return self.fetch_directory(remote_path, local_path_resolved, batch_size)
+
+     def put_file(
+         self,
+         local_path: Union[str, Path],
+         remote_path: str,
+         metadata: Optional[Mapping[str, str]] = None,
+     ) -> str:
+         """async function that uploads a local file to a remote location
+
+         :param local_path: The local path of the file to upload.
+         :param remote_path: The remote path to which the file will be uploaded.
+         :param metadata: Metadata to add to the file.
+         :returns: remote path of uploaded file
+         """
+
+         return self._run(self._put_file, local_path, remote_path, metadata)
+
+     def put_directory(
+         self,
+         local_path: Union[str, Path],
+         remote_path: str,
+         recursive: bool = False,
+         batch_size: Optional[int] = None,
+         metadata: Optional[Mapping[str, str]] = None,
+     ) -> List[str]:
+         """async function that uploads all the files in a local directory to a remote directory
+
+         :param local_path: The local path of the directory to upload.
+         :param remote_path: The remote path to which the directory will be uploaded.
+         :param recursive: Recurse into subdirectories when uploading?
+         :param batch_size: The size of each batch.
+         :param metadata: Metadata to add to each file uploaded.
+         :returns: list of remote paths
+         """
+
+         return self._run(self._put_directory, local_path, remote_path, recursive, batch_size, metadata)
+
+     def put_files(
+         self,
+         local_paths: Iterable[Union[str, Path]],
+         remote_path: str,
+         batch_size: Optional[int] = None,
+         metadata: Optional[Mapping[str, str]] = None,
+     ) -> List[str]:
+         """async function that uploads each file in a list of files to a remote directory
+
+         :param local_paths: The local paths of the files to upload.
+         :param remote_path: The remote path to which the files will be uploaded.
+         :param batch_size: The size of each batch.
+         :param metadata: Metadata to add to each file uploaded.
+         :returns: list of remote paths
+         """
+
+         return self._run(self._put_files, local_paths, remote_path, batch_size, metadata)
+
+     def delete_file(
+         self,
+         remote_path: str,
+         if_modified_since: Optional[datetime.datetime] = None,
+         if_unmodified_since: Optional[datetime.datetime] = None,
+     ) -> DeleteProperties:
+         """Async function that deletes a remote file.
+
+         :param remote_path: Path to remote file location.
+         :param if_modified_since: Only delete file if it has been modified since given datetime.
+             Default is `None`.
+         :param if_unmodified_since: Only delete file if it has been unmodified since given datetime.
+             Default is `None`.
+         :return: `DeleteProperties`.
+         """
+         return self._run(self._delete_file, remote_path, if_modified_since, if_unmodified_since)
+
+     def delete_directory(
+         self,
+         remote_path: str,
+         if_modified_since: Optional[datetime.datetime] = None,
+         if_unmodified_since: Optional[datetime.datetime] = None,
+     ) -> DeleteProperties:
+         """Async function that deletes a remote directory.
+
+         Warning: If `remote_path` is a file path, it will be deleted without raising an error.
+
+         :param remote_path: Path to remote directory location.
+         :param if_modified_since: Only delete directory if it has been modified since given datetime.
+             Default is `None`.
+         :param if_unmodified_since: Only delete directory if it has been unmodified since given datetime.
+             Default is `None`.
+         :return: `DeleteProperties`.
+         """
+         return self._run(self._delete_directory, remote_path, if_modified_since, if_unmodified_since)
+
+     def delete_in_directory(
+         self,
+         remote_path: str,
+         if_modified_since: Optional[datetime.datetime] = None,
+         if_unmodified_since: Optional[datetime.datetime] = None,
+         cleanup: bool = False,
+         batch_size: Optional[int] = None,
+         recursive: bool = True,
+         path_filter: Optional[Callable[[PathProperties], bool]] = None,
+     ) -> List[DeleteProperties]:
+         """Async function that deletes all files in a remote directory, with the option to also delete
+         subdirectories left empty afterwards.
+
+         Note #1: Cleanup step is blocking due to the need to maintain the order in which empty
+             subdirectories must be deleted. Inner empty directories have to be deleted before
+             outer empty directories so the outer directories can be empty.
+
+         Note #2: if cleanup is true, the function will attempt to delete all subdirectories;
+             however, non-empty subdirectories will produce an exception that is caught and written to their
+             respective response details dicts.
+
+         :param remote_path: Path to remote directory location.
+         :param if_modified_since: Only delete files if they have been modified since given datetime.
+             Default is `None`.
+         :param if_unmodified_since: Only delete files if they have been unmodified since given datetime.
+             Default is `None`.
+         :param cleanup: Whether to delete subdirectories left empty after file deletion.
+             Default is `False`.
+         :param batch_size: Number of files to delete in each batch.
+             Default is `None`.
+         :param recursive: Whether to recurse into subdirectories when deleting.
+             Default is `True`.
+         :param path_filter: Optional callable taking a `PathProperties` and returning a bool
+             indicating whether to delete the corresponding file.
+         :return: List of `DeleteProperties`.
+         """
+         return self._run(
+             self._delete_in_directory,
+             remote_path,
+             if_modified_since,
+             if_unmodified_since,
+             cleanup,
+             batch_size,
+             recursive,
+             path_filter,
+         )
+
+     def delete_files(
+         self,
+         remote_paths: Iterable[str],
+         batch_size: Optional[int] = None,
+         if_modified_since: Optional[datetime.datetime] = None,
+         if_unmodified_since: Optional[datetime.datetime] = None,
+     ) -> List[DeleteProperties]:
+         """Async function that deletes each file in a list of remote file paths.
+
+         :param remote_paths: List of paths to remote file locations.
+         :param batch_size: Number of files to delete in each batch.
+             Default is `None`.
+         :param if_modified_since: Only delete files if they have been modified since given datetime.
+             Default is `None`.
+         :param if_unmodified_since: Only delete files if they have been unmodified since given datetime.
+             Default is `None`.
+         :return: List of `DeleteProperties`.
+         """
+         return self._run(
+             self._delete_files,
+             remote_paths,
+             batch_size,
+             if_modified_since,
+             if_unmodified_since,
+         )
+
+     def delete_directories(
+         self,
+         remote_paths: Iterable[str],
+         batch_size: Optional[int] = None,
+         if_modified_since: Optional[datetime.datetime] = None,
+         if_unmodified_since: Optional[datetime.datetime] = None,
+     ) -> List[DeleteProperties]:
+         """Async function that deletes each directory in a list of remote directory paths.
+
+         Warning: If any `remote_paths` are file paths, they will also be deleted
+             without raising an error.
+
+         :param remote_paths: List of paths to remote directory locations.
+         :param batch_size: Number of directories to delete in each batch.
+             Default is `None`.
+         :param if_modified_since: Only delete directories if they have been modified since given datetime.
+             Default is `None`.
+         :param if_unmodified_since: Only delete directories if they have been unmodified since given datetime.
+             Default is `None`.
+         :return: List of `DeleteProperties`.
+         """
+         return self._run(
+             self._delete_directories,
+             remote_paths,
+             batch_size,
+             if_modified_since,
+             if_unmodified_since,
+         )
+
+     def get_file_info(self, remote_path: str) -> FileProperties:
+         """Async function that gets `FileProperties` for a remote file.
+
+         :param remote_path: Path to remote file location.
+         :return: `FileProperties`
+         """
+         return self._run(self._get_file_info, remote_path)
+
+     def get_directory_info(
+         self,
+         remote_path: str,
+         incl_subdirs: bool = False,
+         batch_size: Optional[int] = None,
+         recursive: bool = True,
+         path_filter: Optional[Callable[[FileProperties], bool]] = None,
+     ) -> List[FileProperties]:
+         """Async function that gets `FileProperties` for all files in a remote directory.
+
+         :param remote_path: Path to a remote directory location.
+         :param incl_subdirs: Whether to include `FileProperties` for the subdirectories themselves.
+             Default is `False`.
+         :param batch_size: Number of `FileProperties` to get in each batch.
+             Default is `None`.
+         :param recursive: Whether to recurse into subdirectories when getting `FileProperties`.
+         :param path_filter: Optional callable taking a `PathProperties` and returning a bool
+             indicating whether to include the corresponding file.
+         :return: List of `FileProperties`.
+         """
+         return self._run(
+             self._get_directory_info,
+             remote_path,
+             incl_subdirs,
+             batch_size,
+             recursive,
+             path_filter,
+         )
+
+     def get_files_info(
+         self, remote_paths: Iterable[str], batch_size: Optional[int] = None
+     ) -> List[FileProperties]:
+         """Async function that gets `FileProperties` for each file in a list of remote file paths.
+
+         :param remote_paths: List of paths to remote file locations.
+         :param batch_size: Number of `FileProperties` to get in each batch.
+             Default is `None`.
+         :return: List of `FileProperties`.
+         """
+         return self._run(self._get_files_info, remote_paths, batch_size)
+
+
+ class ADLSFileSystemCache:
+     def __init__(self, cache_dir: Union[Path, str]):
+         self.cache_dir = Path(cache_dir).absolute()
+         self._init_dir()
+
+     def _init_dir(self):
+         if self.cache_dir.exists() and not self.cache_dir.is_dir():
+             raise FileExistsError(f"{self.cache_dir} exists but is not a directory; can't use as cache")
+         else:
+             self.cache_dir.mkdir(parents=True, exist_ok=True)
+
+     def clear(self):
+         if self.cache_dir.exists():
+             shutil.rmtree(self.cache_dir)
+
+         self._init_dir()
+
+     def __contains__(self, path: str) -> bool:
+         """Check for existence of a path in the cache for a *blob* (not for directories)"""
+         return self.cache_path(path).is_file()
+
+     def remove(self, path: str):
+         """Remove a path from the cache. This is irrespective of type (files and dirs), i.e.
+         the end result should be that the cache is ready to have new content written at `path`,
+         either as a file or a directory. In case a cache path corresponding to relative path `path`
+         doesn't exist locally, no action is taken."""
+         cache_path = self.cache_path(path)
+
+         if cache_path.is_dir():
+             shutil.rmtree(cache_path)
+         elif cache_path.exists():
+             os.remove(cache_path)
+
+     def cache_path(self, path: str):
+         """Return the local path in the cache corresponding to the relative ADLS path `path`"""
+         # ADLS paths are always forward-slash separated, hence we don't use os.path.split here
+         parts = path.split("/")
+         return self.cache_dir.joinpath(*parts)
+
+     def file_handle(self, path: str, mode: str) -> IO:
+         """Return a file handle to the local path in the cache corresponding to the relative ADLS
+         path, `path`, opened in mode `mode`. Closing the handle is the responsibility of the caller.
+         """
+         file_path = self.cache_path(path)
+         dir_path = Path(file_path).parent
+         dir_path.mkdir(parents=True, exist_ok=True)
+         return open(file_path, mode)
+
+     def file_properties(self, path: str) -> FileProperties:
+         """Return an `azure.storage.filedatalake.FileProperties` corresponding to the properties of the
+         local file."""
+         cache_path = self.cache_path(path)
+         if not cache_path.is_file():
+             raise FileNotFoundError(f"No file at {path} in cache at {self.cache_dir}")
+         cache_stat = os.stat(cache_path)
+         cache_mod_time = datetime.datetime.fromtimestamp(cache_stat.st_mtime).astimezone(
+             datetime.timezone.utc
+         )
+         fp = FileProperties(name=path)
+         fp.last_modified = cache_mod_time
+         fp.size = cache_stat.st_size
+         return fp
+
+     def is_valid_for(self, adls_properties: FileProperties) -> bool:
+         """Check if the cache has a valid copy of a blob at a given relative ADLS path.
+         This is checked by comparison of the local file properties with an
+         `azure.storage.filedatalake.FileProperties` detailing the properties of the ADLS blob.
+         To be valid the local cache path should:
+         - exist and be a proper file
+         - have a newer last-modified time than that of the ADLS blob
+         - have the same size as the ADLS blob
+         """
+         assert adls_properties.name
+         if adls_properties.name not in self:
+             return False
+
+         cache_properties = self.file_properties(adls_properties.name)
+         if not cache_properties.last_modified:
+             return False
+         return (cache_properties.last_modified > adls_properties.last_modified) and (
+             cache_properties.size == adls_properties.size
+         )
+
+
+ def make_adls_filesystem_getter(
+     account_name: str,
+     file_system: str,
+     default_batch_size: int = 64,
+     cache_dir: Optional[Union[Path, str]] = None,
+ ) -> Callable[[], ADLSFileSystem]:
+     """Wrapper for returning a :py:class:`core.adls.ADLSFileSystem` lazily."""
+
+     @lazy.lazy
+     def get_adls_filesystem() -> ADLSFileSystem:
+         return ADLSFileSystem(account_name, file_system, default_batch_size, cache_dir)
+
+     return get_adls_filesystem
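
For readers skimming the diff, the sketch below shows how the public ADLSFileSystem API added here might be exercised. It is a minimal, unverified example: the account name, container name, and all paths are placeholders, and the import assumes the module is addressed directly as thds.adls.impl (the package may re-export these names elsewhere).

from pathlib import Path

from thds.adls.impl import ADLSFileSystem

# Placeholder account/container names -- not real resources.
fs = ADLSFileSystem("myaccount", "mycontainer", cache_dir=Path.home() / ".adls-cache")

# Download a single blob; with cache_dir set, the bytes land in the cache and
# local_path receives a hard link to the cached copy.
local = fs.fetch_file("raw/data/example.parquet", local_path="example.parquet")

# Upload a file, attaching caller metadata alongside the module's own upload metadata.
remote = fs.put_file(local, "curated/example.parquet", metadata={"source": "demo"})

# Inspect, then delete; failures are reported on DeleteProperties rather than raised.
props = fs.get_file_info(remote)
result = fs.delete_file(remote)
if result.exception is not None:
    raise result.exception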
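
Because constructing ADLSFileSystem performs a network existence check, the module also exposes make_adls_filesystem_getter to defer that work. Another hedged sketch with placeholder names; it assumes thds.core.lazy memoizes the wrapped constructor, which is implied by the docstring but not shown in this diff.

from thds.adls.impl import WEST_HIVE_PREFIX, make_adls_filesystem_getter

# Nothing is constructed (and no exists() check runs) until the getter is first called.
get_fs = make_adls_filesystem_getter("myaccount", "mycontainer", cache_dir=".adls-cache")


def load_events_table():
    fs = get_fs()  # built on first call; assumed to be reused on subsequent calls
    # Hive layout used by fetch_hive_table: <hive_prefix>/<database>.db/<tablename>/...
    return fs.fetch_hive_table("analytics.events", hive_prefix=WEST_HIVE_PREFIX)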