odibi-2.5.0-py3-none-any.whl
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/connections/__init__.py
@@ -0,0 +1,9 @@
+"""Connection implementations for ODIBI."""
+
+from odibi.connections.azure_adls import AzureADLS
+from odibi.connections.azure_sql import AzureSQL
+from odibi.connections.base import BaseConnection
+from odibi.connections.local import LocalConnection
+from odibi.connections.local_dbfs import LocalDBFS
+
+__all__ = ["BaseConnection", "LocalConnection", "AzureADLS", "AzureSQL", "LocalDBFS"]
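With these re-exports, callers can import every connection class from odibi.connections directly rather than from the submodules. A minimal usage sketch (argument values are placeholders; the constructor signature comes from azure_adls.py below):

from odibi.connections import AzureADLS

# direct_key mode validates eagerly and needs no Azure SDK at import time.
conn = AzureADLS(
    account="mystorageaccount",
    container="raw",
    auth_mode="direct_key",
    account_key="***",
)
print(conn.uri("landing/orders.csv"))
# abfss://raw@mystorageaccount.dfs.core.windows.net/landing/orders.csv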
odibi/connections/azure_adls.py
@@ -0,0 +1,499 @@
+"""Azure Data Lake Storage Gen2 connection (Phase 2A: Multi-mode authentication)."""
+
+import os
+import posixpath
+import threading
+import warnings
+from typing import Any, Dict, Optional
+
+from odibi.utils.logging import logger
+from odibi.utils.logging_context import get_logging_context
+
+from .base import BaseConnection
+
+
+class AzureADLS(BaseConnection):
+    """Azure Data Lake Storage Gen2 connection.
+
+    Phase 2A: Multi-mode authentication + multi-account support
+    Supports key_vault (recommended), direct_key, sas_token, service_principal, and managed_identity.
+    """
+
+    def __init__(
+        self,
+        account: str,
+        container: str,
+        path_prefix: str = "",
+        auth_mode: str = "key_vault",
+        key_vault_name: Optional[str] = None,
+        secret_name: Optional[str] = None,
+        account_key: Optional[str] = None,
+        sas_token: Optional[str] = None,
+        tenant_id: Optional[str] = None,
+        client_id: Optional[str] = None,
+        client_secret: Optional[str] = None,
+        validate: bool = True,
+        **kwargs,
+    ):
+        """Initialize ADLS connection.
+
+        Args:
+            account: Storage account name (e.g., 'mystorageaccount')
+            container: Container/filesystem name
+            path_prefix: Optional prefix for all paths
+            auth_mode: Authentication mode
+                ('key_vault', 'direct_key', 'sas_token', 'service_principal', 'managed_identity')
+            key_vault_name: Azure Key Vault name (required for key_vault mode)
+            secret_name: Secret name in Key Vault (required for key_vault mode)
+            account_key: Storage account key (required for direct_key mode)
+            sas_token: Shared Access Signature token (required for sas_token mode)
+            tenant_id: Azure Tenant ID (required for service_principal)
+            client_id: Service Principal Client ID (required for service_principal)
+            client_secret: Service Principal Client Secret (required for service_principal)
+            validate: Validate configuration on init
+        """
+        ctx = get_logging_context()
+        ctx.log_connection(
+            connection_type="azure_adls",
+            connection_name=f"{account}/{container}",
+            action="init",
+            account=account,
+            container=container,
+            auth_mode=auth_mode,
+            path_prefix=path_prefix or "(none)",
+        )
+
+        self.account = account
+        self.container = container
+        self.path_prefix = path_prefix.strip("/") if path_prefix else ""
+        self.auth_mode = auth_mode
+        self.key_vault_name = key_vault_name
+        self.secret_name = secret_name
+        self.account_key = account_key
+        self.sas_token = sas_token
+        self.tenant_id = tenant_id
+        self.client_id = client_id
+        self.client_secret = client_secret
+
+        self._cached_key: Optional[str] = None
+        self._cache_lock = threading.Lock()
+
+        if validate:
+            self.validate()
+
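The constructor is fail-fast: validate() runs during __init__ unless validate=False, so a misconfigured connection surfaces at definition time rather than on first I/O. A sketch of both paths (account and vault names are placeholders):

from odibi.connections import AzureADLS

# Missing key_vault_name/secret_name in key_vault mode raises immediately.
try:
    AzureADLS(account="acct", container="data", auth_mode="key_vault")
except ValueError as err:
    print(err)

# validate=False defers the check, e.g. when loading config before secrets exist.
conn = AzureADLS(
    account="acct", container="data", auth_mode="key_vault", validate=False
)
conn.key_vault_name = "my-kv"
conn.secret_name = "storage-key"
conn.validate()  # now passes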
+    def validate(self) -> None:
+        """Validate ADLS connection configuration.
+
+        Raises:
+            ValueError: If required fields are missing for the selected auth_mode
+        """
+        ctx = get_logging_context()
+        ctx.debug(
+            "Validating AzureADLS connection",
+            account=self.account,
+            container=self.container,
+            auth_mode=self.auth_mode,
+        )
+
+        if not self.account:
+            ctx.error("ADLS connection validation failed: missing 'account'")
+            raise ValueError(
+                "ADLS connection requires 'account'. "
+                "Provide the storage account name (e.g., account: 'mystorageaccount')."
+            )
+        if not self.container:
+            ctx.error(
+                "ADLS connection validation failed: missing 'container'",
+                account=self.account,
+            )
+            raise ValueError(
+                f"ADLS connection requires 'container' for account '{self.account}'. "
+                "Provide the container/filesystem name."
+            )
+
+        if self.auth_mode == "key_vault":
+            if not self.key_vault_name or not self.secret_name:
+                ctx.error(
+                    "ADLS key_vault mode validation failed",
+                    account=self.account,
+                    container=self.container,
+                    key_vault_name=self.key_vault_name or "(missing)",
+                    secret_name=self.secret_name or "(missing)",
+                )
+                raise ValueError(
+                    f"key_vault mode requires 'key_vault_name' and 'secret_name' "
+                    f"for connection to {self.account}/{self.container}"
+                )
+        elif self.auth_mode == "direct_key":
+            if not self.account_key:
+                ctx.error(
+                    "ADLS direct_key mode validation failed: missing account_key",
+                    account=self.account,
+                    container=self.container,
+                )
+                raise ValueError(
+                    f"direct_key mode requires 'account_key' "
+                    f"for connection to {self.account}/{self.container}"
+                )
+
+            # Warn in production
+            if os.getenv("ODIBI_ENV") == "production":
+                ctx.warning(
+                    "Using direct_key in production is not recommended",
+                    account=self.account,
+                    container=self.container,
+                )
+                warnings.warn(
+                    f"⚠️ Using direct_key in production is not recommended. "
+                    f"Use auth_mode: key_vault. Connection: {self.account}/{self.container}",
+                    UserWarning,
+                )
+        elif self.auth_mode == "sas_token":
+            if not self.sas_token and not (self.key_vault_name and self.secret_name):
+                ctx.error(
+                    "ADLS sas_token mode validation failed",
+                    account=self.account,
+                    container=self.container,
+                )
+                raise ValueError(
+                    f"sas_token mode requires 'sas_token' (or key_vault_name/secret_name) "
+                    f"for connection to {self.account}/{self.container}"
+                )
+        elif self.auth_mode == "service_principal":
+            if not self.tenant_id or not self.client_id:
+                ctx.error(
+                    "ADLS service_principal mode validation failed",
+                    account=self.account,
+                    container=self.container,
+                    missing="tenant_id and/or client_id",
+                )
+                raise ValueError(
+                    f"service_principal mode requires 'tenant_id' and 'client_id' "
+                    f"for connection to {self.account}/{self.container}. "
+                    f"Got tenant_id={self.tenant_id or '(missing)'}, "
+                    f"client_id={self.client_id or '(missing)'}."
+                )
+
+            if not self.client_secret and not (self.key_vault_name and self.secret_name):
+                ctx.error(
+                    "ADLS service_principal mode validation failed: missing client_secret",
+                    account=self.account,
+                    container=self.container,
+                )
+                raise ValueError(
+                    f"service_principal mode requires 'client_secret' "
+                    f"(or key_vault_name/secret_name) for {self.account}/{self.container}"
+                )
+        elif self.auth_mode == "managed_identity":
+            # No specific config required, but we might check if environment supports it
+            ctx.debug(
+                "Using managed_identity auth mode",
+                account=self.account,
+                container=self.container,
+            )
+        else:
+            ctx.error(
+                "ADLS validation failed: unsupported auth_mode",
+                account=self.account,
+                container=self.container,
+                auth_mode=self.auth_mode,
+            )
+            raise ValueError(
+                f"Unsupported auth_mode: '{self.auth_mode}'. "
+                f"Use 'key_vault', 'direct_key', 'sas_token', 'service_principal', or 'managed_identity'."
+            )
+
+        ctx.info(
+            "AzureADLS connection validated successfully",
+            account=self.account,
+            container=self.container,
+            auth_mode=self.auth_mode,
+        )
+
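One behavior worth calling out: direct_key mode is accepted but actively discouraged outside development, and the check keys off the ODIBI_ENV environment variable. A sketch that makes the warning observable (values are placeholders):

import os
import warnings

from odibi.connections import AzureADLS

os.environ["ODIBI_ENV"] = "production"
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    AzureADLS(
        account="acct", container="data",
        auth_mode="direct_key", account_key="***",
    )
# validate() emitted the UserWarning during __init__.
assert any("not recommended" in str(w.message) for w in caught)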
+    def get_storage_key(self, timeout: float = 30.0) -> Optional[str]:
+        """Get storage account key (cached).
+
+        Only relevant for 'key_vault' and 'direct_key' modes.
+
+        Args:
+            timeout: Timeout for Key Vault operations in seconds (default: 30.0)
+
+        Returns:
+            Storage account key or None if not applicable for auth_mode
+
+        Raises:
+            ImportError: If azure libraries not installed (key_vault mode)
+            TimeoutError: If Key Vault fetch exceeds timeout
+            Exception: If Key Vault access fails
+        """
+        ctx = get_logging_context()
+
+        with self._cache_lock:
+            # Return cached key if available (double-check inside lock)
+            if self._cached_key:
+                ctx.debug(
+                    "Using cached storage key",
+                    account=self.account,
+                    container=self.container,
+                )
+                return self._cached_key
+
+            if self.auth_mode == "key_vault":
+                ctx.debug(
+                    "Fetching storage key from Key Vault",
+                    account=self.account,
+                    key_vault_name=self.key_vault_name,
+                    secret_name=self.secret_name,
+                    timeout=timeout,
+                )
+
+                try:
+                    import concurrent.futures
+
+                    from azure.identity import DefaultAzureCredential
+                    from azure.keyvault.secrets import SecretClient
+                except ImportError as e:
+                    ctx.error(
+                        "Key Vault authentication failed: missing azure libraries",
+                        account=self.account,
+                        error=str(e),
+                    )
+                    raise ImportError(
+                        "Key Vault authentication requires 'azure-identity' and "
+                        "'azure-keyvault-secrets'. Install with: pip install odibi[azure]"
+                    ) from e
+
+                # Create Key Vault client
+                credential = DefaultAzureCredential()
+                kv_uri = f"https://{self.key_vault_name}.vault.azure.net"
+                client = SecretClient(vault_url=kv_uri, credential=credential)
+
+                ctx.debug(
+                    "Connecting to Key Vault",
+                    key_vault_uri=kv_uri,
+                    secret_name=self.secret_name,
+                )
+
+                # Fetch secret with timeout protection
+                def _fetch():
+                    secret = client.get_secret(self.secret_name)
+                    return secret.value
+
+                with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+                    future = executor.submit(_fetch)
+                    try:
+                        self._cached_key = future.result(timeout=timeout)
+                        logger.register_secret(self._cached_key)
+                        ctx.info(
+                            "Successfully fetched storage key from Key Vault",
+                            account=self.account,
+                            key_vault_name=self.key_vault_name,
+                        )
+                        return self._cached_key
+                    except concurrent.futures.TimeoutError:
+                        ctx.error(
+                            "Key Vault fetch timed out",
+                            account=self.account,
+                            key_vault_name=self.key_vault_name,
+                            secret_name=self.secret_name,
+                            timeout=timeout,
+                        )
+                        raise TimeoutError(
+                            f"Key Vault fetch timed out after {timeout}s for "
+                            f"vault '{self.key_vault_name}', secret '{self.secret_name}'"
+                        )
+
+            elif self.auth_mode == "direct_key":
+                ctx.debug(
+                    "Using direct account key",
+                    account=self.account,
+                )
+                return self.account_key
+
+            elif self.auth_mode == "sas_token":
+                # Return cached key (fetched from KV) if available, else sas_token arg
+                ctx.debug(
+                    "Using SAS token",
+                    account=self.account,
+                    from_cache=bool(self._cached_key),
+                )
+                return self._cached_key or self.sas_token
+
+            # For other modes (SP, MI), we don't use an account key
+            ctx.debug(
+                "No storage key required for auth_mode",
+                account=self.account,
+                auth_mode=self.auth_mode,
+            )
+            return None
+
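The Key Vault fetch above bounds a blocking SDK call with ThreadPoolExecutor and future.result(timeout=...). The same pattern in isolation, with slow_fetch standing in for client.get_secret:

import concurrent.futures
import time

def slow_fetch() -> str:
    time.sleep(5)  # stand-in for a hung Key Vault call
    return "secret-value"

with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
    future = executor.submit(slow_fetch)
    try:
        value = future.result(timeout=1.0)  # bounds the wait, not the work
    except concurrent.futures.TimeoutError:
        print("fetch timed out")
# Caveat: result(timeout=...) only stops waiting; the worker thread keeps
# running, and exiting the with-block (shutdown(wait=True)) still joins it,
# so a truly hung SDK call would block here.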
+    def get_client_secret(self) -> Optional[str]:
+        """Get Service Principal client secret (cached or literal)."""
+        return self._cached_key or self.client_secret
+
+    def pandas_storage_options(self) -> Dict[str, Any]:
+        """Get storage options for pandas/fsspec.
+
+        Returns:
+            Dictionary with appropriate authentication parameters for fsspec
+        """
+        ctx = get_logging_context()
+        ctx.debug(
+            "Building pandas storage options",
+            account=self.account,
+            container=self.container,
+            auth_mode=self.auth_mode,
+        )
+
+        base_options = {"account_name": self.account}
+
+        if self.auth_mode in ["key_vault", "direct_key"]:
+            return {**base_options, "account_key": self.get_storage_key()}
+
+        elif self.auth_mode == "sas_token":
+            # Use get_storage_key() which handles KV fallback for SAS
+            return {**base_options, "sas_token": self.get_storage_key()}
+
+        elif self.auth_mode == "service_principal":
+            return {
+                **base_options,
+                "tenant_id": self.tenant_id,
+                "client_id": self.client_id,
+                "client_secret": self.get_client_secret(),
+            }
+
+        elif self.auth_mode == "managed_identity":
+            # adlfs supports using DefaultAzureCredential implicitly if anon=False
+            # and no other creds provided, assuming azure.identity is installed
+            return {**base_options, "anon": False}
+
+        return base_options
+
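pandas_storage_options() is shaped for fsspec, so it plugs straight into pandas' storage_options parameter. A sketch assuming adlfs is installed and a file exists at the illustrative path:

import pandas as pd

from odibi.connections import AzureADLS

conn = AzureADLS(
    account="acct", container="data",
    auth_mode="direct_key", account_key="***",
)

# fsspec routes the abfss:// URL to adlfs; the options dict supplies credentials.
df = pd.read_parquet(
    conn.get_path("curated/orders.parquet"),
    storage_options=conn.pandas_storage_options(),
)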
+    def configure_spark(self, spark: Any) -> None:
+        """Configure Spark session with storage credentials.
+
+        Args:
+            spark: SparkSession instance
+        """
+        ctx = get_logging_context()
+        ctx.info(
+            "Configuring Spark for AzureADLS",
+            account=self.account,
+            container=self.container,
+            auth_mode=self.auth_mode,
+        )
+
+        if self.auth_mode in ["key_vault", "direct_key"]:
+            config_key = f"fs.azure.account.key.{self.account}.dfs.core.windows.net"
+            spark.conf.set(config_key, self.get_storage_key())
+            ctx.debug(
+                "Set Spark config for account key",
+                config_key=config_key,
+            )
+
+        elif self.auth_mode == "sas_token":
+            # SAS Token Configuration
+            # fs.azure.sas.token.provider.type -> FixedSASTokenProvider
+            # fs.azure.sas.fixed.token -> <token>
+            provider_key = f"fs.azure.account.auth.type.{self.account}.dfs.core.windows.net"
+            spark.conf.set(provider_key, "SAS")
+
+            sas_provider_key = (
+                f"fs.azure.sas.token.provider.type.{self.account}.dfs.core.windows.net"
+            )
+            spark.conf.set(
+                sas_provider_key, "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider"
+            )
+
+            sas_token = self.get_storage_key()
+
+            sas_token_key = f"fs.azure.sas.fixed.token.{self.account}.dfs.core.windows.net"
+            spark.conf.set(sas_token_key, sas_token)
+
+            ctx.debug(
+                "Set Spark config for SAS token",
+                auth_type_key=provider_key,
+                provider_key=sas_provider_key,
+            )
+
+        elif self.auth_mode == "service_principal":
+            # Configure OAuth for ADLS Gen2
+            # Ref: https://hadoop.apache.org/docs/stable/hadoop-azure/abfs.html
+            prefix = f"fs.azure.account.auth.type.{self.account}.dfs.core.windows.net"
+            spark.conf.set(prefix, "OAuth")
+
+            prefix = f"fs.azure.account.oauth.provider.type.{self.account}.dfs.core.windows.net"
+            spark.conf.set(prefix, "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider")
+
+            prefix = f"fs.azure.account.oauth2.client.id.{self.account}.dfs.core.windows.net"
+            spark.conf.set(prefix, self.client_id)
+
+            prefix = f"fs.azure.account.oauth2.client.secret.{self.account}.dfs.core.windows.net"
+            spark.conf.set(prefix, self.get_client_secret())
+
+            prefix = f"fs.azure.account.oauth2.client.endpoint.{self.account}.dfs.core.windows.net"
+            endpoint = f"https://login.microsoftonline.com/{self.tenant_id}/oauth2/token"
+            spark.conf.set(prefix, endpoint)
+
+            ctx.debug(
+                "Set Spark config for service principal OAuth",
+                tenant_id=self.tenant_id,
+                client_id=self.client_id,
+            )
+
+        elif self.auth_mode == "managed_identity":
+            prefix = f"fs.azure.account.auth.type.{self.account}.dfs.core.windows.net"
+            spark.conf.set(prefix, "OAuth")
+
+            prefix = f"fs.azure.account.oauth.provider.type.{self.account}.dfs.core.windows.net"
+            spark.conf.set(prefix, "org.apache.hadoop.fs.azurebfs.oauth2.MsiTokenProvider")
+
+            ctx.debug(
+                "Set Spark config for managed identity",
+                account=self.account,
+            )
+
+        ctx.info(
+            "Spark configuration complete",
+            account=self.account,
+            auth_mode=self.auth_mode,
+        )
+
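configure_spark() mutates session configuration in place, keyed per storage account, so one session can hold credentials for several accounts at once. A sketch assuming a runtime with the hadoop-azure (abfss) filesystem available, e.g. Databricks (paths are illustrative):

from pyspark.sql import SparkSession

from odibi.connections import AzureADLS

spark = SparkSession.builder.getOrCreate()
conn = AzureADLS(account="acct", container="data", auth_mode="managed_identity")
conn.configure_spark(spark)  # sets fs.azure.* keys for this account only

df = spark.read.format("parquet").load(conn.get_path("curated/orders"))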
+    def uri(self, path: str) -> str:
+        """Build abfss:// URI for given path.
+
+        Args:
+            path: Relative path within container
+
+        Returns:
+            Full abfss:// URI
+
+        Example:
+            >>> conn = AzureADLS(
+            ...     account="myaccount", container="data",
+            ...     auth_mode="direct_key", account_key="key123"
+            ... )
+            >>> conn.uri("folder/file.csv")
+            'abfss://data@myaccount.dfs.core.windows.net/folder/file.csv'
+        """
+        if self.path_prefix:
+            full_path = posixpath.join(self.path_prefix, path.lstrip("/"))
+        else:
+            full_path = path.lstrip("/")
+
+        return f"abfss://{self.container}@{self.account}.dfs.core.windows.net/{full_path}"
+
+    def get_path(self, relative_path: str) -> str:
+        """Get full abfss:// URI for relative path."""
+        ctx = get_logging_context()
+        full_uri = self.uri(relative_path)
+
+        ctx.debug(
+            "Resolved ADLS path",
+            account=self.account,
+            container=self.container,
+            relative_path=relative_path,
+            full_uri=full_uri,
+        )
+
+        return full_uri
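Path handling is worth a note: path_prefix is stripped of surrounding slashes at construction and joined with posixpath, and the relative path's leading slash is removed, so absolute-looking inputs cannot escape the prefix. A sketch (names are placeholders):

from odibi.connections import AzureADLS

conn = AzureADLS(
    account="acct", container="data",
    auth_mode="direct_key", account_key="***",
    path_prefix="/teams/mfg/",
)
print(conn.uri("/daily/2024-01-01.csv"))
# abfss://data@acct.dfs.core.windows.net/teams/mfg/daily/2024-01-01.csv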