thds.adls 3.0.20250116223841 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of thds.adls might be problematic.
- thds/adls/__init__.py +15 -0
- thds/adls/_progress.py +193 -0
- thds/adls/_upload.py +127 -0
- thds/adls/abfss.py +24 -0
- thds/adls/cached_up_down.py +48 -0
- thds/adls/conf.py +33 -0
- thds/adls/dbfs.py +60 -0
- thds/adls/defaults.py +26 -0
- thds/adls/download.py +394 -0
- thds/adls/download_lock.py +57 -0
- thds/adls/errors.py +44 -0
- thds/adls/etag.py +6 -0
- thds/adls/file_properties.py +13 -0
- thds/adls/fqn.py +169 -0
- thds/adls/global_client.py +78 -0
- thds/adls/impl.py +1111 -0
- thds/adls/md5.py +60 -0
- thds/adls/meta.json +8 -0
- thds/adls/named_roots.py +26 -0
- thds/adls/py.typed +0 -0
- thds/adls/resource/__init__.py +36 -0
- thds/adls/resource/core.py +79 -0
- thds/adls/resource/file_pointers.py +54 -0
- thds/adls/resource/up_down.py +245 -0
- thds/adls/ro_cache.py +126 -0
- thds/adls/shared_credential.py +107 -0
- thds/adls/source.py +66 -0
- thds/adls/tools/download.py +35 -0
- thds/adls/tools/ls.py +38 -0
- thds/adls/uri.py +38 -0
- thds.adls-3.0.20250116223841.dist-info/METADATA +16 -0
- thds.adls-3.0.20250116223841.dist-info/RECORD +35 -0
- thds.adls-3.0.20250116223841.dist-info/WHEEL +5 -0
- thds.adls-3.0.20250116223841.dist-info/entry_points.txt +3 -0
- thds.adls-3.0.20250116223841.dist-info/top_level.txt +1 -0
thds/adls/global_client.py
@@ -0,0 +1,78 @@
+import requests
+from azure.storage.blob import BlobServiceClient, ContainerClient
+from azure.storage.filedatalake import DataLakeServiceClient, FileSystemClient
+
+from thds.core import cache, config
+
+from . import conf
+from .shared_credential import SharedCredential
+
+DEFAULT_CONNECTION_POOL_SIZE = config.item("default_connection_pool_size", default=100, parse=int)
+# see docstring below about why we are setting this to be large.
+
+
+def _connpool_session(connection_pool_size: int) -> requests.Session:
+    """We do a lot of multithreaded use of connections to ADLS. For large runs,
+    having the default connection pool size of 10 is not enough, because each thread will create a
+    new connection if an unused one is not available from the pool, and we often have more than 10 threads
+    communicating at a time.
+
+    The connection pool will not start out this large, so don't fear that we're introducing new overhead here.
+    We will just throw away fewer connections that were used only once and replaced by a newer one.
+
+    The only reason you'd really want to keep this low is if the host you're talking to
+    can't handle this many simultaneous connections, but my belief is that ADLS should be
+    able to handle thousands (probably hundreds of thousands?) of connections at once.
+    """
+    session = requests.Session()
+    adapter = requests.adapters.HTTPAdapter(
+        pool_connections=connection_pool_size, pool_maxsize=connection_pool_size
+    )
+    session.mount("https://", adapter)
+    session.mount("http://", adapter)
+    return session
+
+
+def adls_fs_client(
+    storage_account: str, container: str, connpool_size: int = DEFAULT_CONNECTION_POOL_SIZE()
+) -> FileSystemClient:
+    """No context managers - is this better?"""
+    return DataLakeServiceClient(
+        account_url=f"https://{storage_account}.dfs.core.windows.net",
+        credential=SharedCredential(),
+        session=_connpool_session(connection_pool_size=connpool_size),
+        max_single_get_size=conf.MAX_SINGLE_GET_SIZE(),  # for downloads
+        max_chunk_get_size=conf.MAX_CHUNK_GET_SIZE(),  # for downloads
+        max_block_size=conf.MAX_BLOCK_SIZE(),  # for uploads
+    ).get_file_system_client(file_system=container)
+
+
+"""Singletons are scary, but in practice this appears to be the
+best approach for all applications.
+
+This avoids creating a client at a module level and is
+thread-safe.
+"""
+get_global_client = cache.locking(adls_fs_client)
+# deprecated name - prefer get_global_fs_client
+get_global_fs_client = get_global_client
+
+
+def adls_blob_container_client(
+    storage_account: str, container: str, connpool_size: int = DEFAULT_CONNECTION_POOL_SIZE()
+) -> ContainerClient:
+    """This seems to support an atomic write operation,
+    which is in theory going to be faster than two separate network requests.
+    """
+    return BlobServiceClient(
+        account_url=f"https://{storage_account}.blob.core.windows.net",
+        credential=SharedCredential(),
+        session=_connpool_session(connection_pool_size=connpool_size),
+        max_single_get_size=conf.MAX_SINGLE_GET_SIZE(),  # for downloads
+        max_chunk_get_size=conf.MAX_CHUNK_GET_SIZE(),  # for downloads
+        max_block_size=conf.MAX_BLOCK_SIZE(),  # for uploads
+        max_single_put_size=conf.MAX_SINGLE_PUT_SIZE(),  # for uploads
+    ).get_container_client(container)
+
+
+get_global_blob_container_client = cache.locking(adls_blob_container_client)
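Both factory functions above are wrapped in thds.core.cache.locking to get the singleton behavior the module's comments describe. As a rough sketch of the semantics that implies (thread-safe memoization keyed by the factory's arguments), something like the following would suffice; this is an illustrative assumption, not the actual thds.core implementation:

import threading
from typing import Any, Callable, Dict, Tuple, TypeVar

R = TypeVar("R")


def locking_memoize(factory: Callable[..., R]) -> Callable[..., R]:
    """Illustrative stand-in for thds.core.cache.locking (assumed semantics):
    construct at most one result per distinct argument tuple, even when many
    threads call the factory concurrently."""
    results: Dict[Tuple[Any, ...], R] = {}
    lock = threading.Lock()

    def wrapper(*args: Any, **kwargs: Any) -> R:
        key = (args, tuple(sorted(kwargs.items())))
        with lock:  # first caller builds the client; later callers reuse it
            if key not in results:
                results[key] = factory(*args, **kwargs)
            return results[key]

    return wrapper

Holding the lock across construction means concurrent first callers block until exactly one client exists, which is the thread-safety property the inline docstring is after.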
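A minimal usage sketch follows; the storage account and container names are hypothetical, and the download call uses the standard azure-storage-file-datalake FileSystemClient API:

from thds.adls.global_client import (
    get_global_blob_container_client,
    get_global_fs_client,
)

# Hypothetical account/container names, for illustration only.
fs = get_global_fs_client("myaccount", "mycontainer")
payload = fs.get_file_client("some/path/data.csv").download_file().readall()

# Assuming cache.locking memoizes by arguments, repeated calls return the
# same client, so its connection pool is shared across threads.
cc = get_global_blob_container_client("myaccount", "mycontainer")
assert cc is get_global_blob_container_client("myaccount", "mycontainer")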