thds.adls 3.0.20250116223841__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of thds.adls might be problematic.

@@ -0,0 +1,78 @@
+ import requests
+ from azure.storage.blob import BlobServiceClient, ContainerClient
+ from azure.storage.filedatalake import DataLakeServiceClient, FileSystemClient
+
+ from thds.core import cache, config
+
+ from . import conf
+ from .shared_credential import SharedCredential
+
+ DEFAULT_CONNECTION_POOL_SIZE = config.item("default_connection_pool_size", default=100, parse=int)
+ # see docstring below about why we are setting this to be large.
+
+
+ def _connpool_session(connection_pool_size: int) -> requests.Session:
+     """We do a lot of multithreaded use of connections to ADLS. For large runs,
+     the default connection pool size of 10 is not enough, because each thread will create a
+     new connection if an unused one is not available from the pool, and we often have more than 10 threads
+     communicating at a time.
+
+     The connection pool will not start out this large, so don't fear that we're introducing new overhead here.
+     We will just throw away fewer connections that were used only once and replaced by a newer one.
+
+     The only reason you'd really want to keep this low is if the host you're talking to
+     can't handle this many simultaneous connections, but my belief is that ADLS should be
+     able to handle thousands (probably hundreds of thousands?) of connections at once.
+     """
+     session = requests.Session()
+     adapter = requests.adapters.HTTPAdapter(
+         pool_connections=connection_pool_size, pool_maxsize=connection_pool_size
+     )
+     session.mount("https://", adapter)
+     session.mount("http://", adapter)
+     return session
+
+
+ def adls_fs_client(
+     storage_account: str, container: str, connpool_size: int = DEFAULT_CONNECTION_POOL_SIZE()
+ ) -> FileSystemClient:
+     """No context managers - is this better?"""
+     return DataLakeServiceClient(
+         account_url=f"https://{storage_account}.dfs.core.windows.net",
+         credential=SharedCredential(),
+         session=_connpool_session(connection_pool_size=connpool_size),
+         max_single_get_size=conf.MAX_SINGLE_GET_SIZE(),  # for downloads
+         max_chunk_get_size=conf.MAX_CHUNK_GET_SIZE(),  # for downloads
+         max_block_size=conf.MAX_BLOCK_SIZE(),  # for uploads
+     ).get_file_system_client(file_system=container)
+
+
+ """Singletons are scary, but in practice this appears to be the
+ best approach for all applications.
+
+ This avoids creating a client at a module level and is
+ thread-safe.
+ """
+ get_global_client = cache.locking(adls_fs_client)
+ # deprecated name - prefer get_global_fs_client
+ get_global_fs_client = get_global_client
+
+
+ def adls_blob_container_client(
+     storage_account: str, container: str, connpool_size: int = DEFAULT_CONNECTION_POOL_SIZE()
+ ) -> ContainerClient:
+     """This seems to support an atomic write operation,
+     which is in theory going to be faster than two separate network requests.
+     """
+     return BlobServiceClient(
+         account_url=f"https://{storage_account}.blob.core.windows.net",
+         credential=SharedCredential(),
+         session=_connpool_session(connection_pool_size=connpool_size),
+         max_single_get_size=conf.MAX_SINGLE_GET_SIZE(),  # for downloads
+         max_chunk_get_size=conf.MAX_CHUNK_GET_SIZE(),  # for downloads
+         max_block_size=conf.MAX_BLOCK_SIZE(),  # for uploads
+         max_single_put_size=conf.MAX_SINGLE_PUT_SIZE(),  # for uploads
+     ).get_container_client(container)
+
+
+ get_global_blob_container_client = cache.locking(adls_blob_container_client)
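
For orientation, the cached factories at the end of the file (get_global_fs_client and get_global_blob_container_client) are the natural entry points: cache.locking memoizes each factory behind a lock, so repeated calls with the same arguments hand back one shared client and connection pool. Below is a minimal usage sketch under stated assumptions: the account name, container name, and file path are hypothetical placeholders, the import path of this module inside thds.adls is not shown in the diff, and the calls on the returned clients are the standard azure-storage-file-datalake / azure-storage-blob APIs, not anything specific to this package.

# Usage sketch (illustrative only; not part of the packaged file above).
# Assumes get_global_fs_client / get_global_blob_container_client are imported
# from the module shown in the diff; "myaccount", "raw", and the file path are
# placeholders.

fs = get_global_fs_client("myaccount", "raw")              # FileSystemClient, created once and reused
file_client = fs.get_file_client("reports/output.parquet")
file_client.upload_data(b"some bytes", overwrite=True)     # upload via the Data Lake (dfs) endpoint
payload = file_client.download_file().readall()            # download the same path

# Blob-endpoint variant; upload_blob is a single request, which is the
# "atomic write operation" the docstring above alludes to.
container = get_global_blob_container_client("myaccount", "raw")
container.upload_blob("reports/output.parquet", b"some bytes", overwrite=True)

Because the factories are cached per argument set, calling either one from many threads returns the same underlying client, which is what makes the enlarged connection pool configured above pay off.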