ygg 0.1.56__py3-none-any.whl → 0.1.60__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ygg-0.1.56.dist-info → ygg-0.1.60.dist-info}/METADATA +1 -1
- ygg-0.1.60.dist-info/RECORD +74 -0
- {ygg-0.1.56.dist-info → ygg-0.1.60.dist-info}/WHEEL +1 -1
- yggdrasil/ai/__init__.py +2 -0
- yggdrasil/ai/session.py +89 -0
- yggdrasil/ai/sql_session.py +310 -0
- yggdrasil/databricks/__init__.py +0 -3
- yggdrasil/databricks/compute/cluster.py +68 -113
- yggdrasil/databricks/compute/command_execution.py +674 -0
- yggdrasil/databricks/compute/exceptions.py +7 -2
- yggdrasil/databricks/compute/execution_context.py +465 -277
- yggdrasil/databricks/compute/remote.py +4 -14
- yggdrasil/databricks/exceptions.py +10 -0
- yggdrasil/databricks/sql/__init__.py +0 -4
- yggdrasil/databricks/sql/engine.py +161 -173
- yggdrasil/databricks/sql/exceptions.py +9 -1
- yggdrasil/databricks/sql/statement_result.py +108 -120
- yggdrasil/databricks/sql/warehouse.py +331 -92
- yggdrasil/databricks/workspaces/io.py +92 -9
- yggdrasil/databricks/workspaces/path.py +120 -74
- yggdrasil/databricks/workspaces/workspace.py +212 -68
- yggdrasil/libs/databrickslib.py +23 -18
- yggdrasil/libs/extensions/spark_extensions.py +1 -1
- yggdrasil/libs/pandaslib.py +15 -6
- yggdrasil/libs/polarslib.py +49 -13
- yggdrasil/pyutils/__init__.py +1 -0
- yggdrasil/pyutils/callable_serde.py +12 -19
- yggdrasil/pyutils/exceptions.py +16 -0
- yggdrasil/pyutils/mimetypes.py +0 -0
- yggdrasil/pyutils/python_env.py +13 -12
- yggdrasil/pyutils/waiting_config.py +171 -0
- yggdrasil/types/cast/arrow_cast.py +3 -0
- yggdrasil/types/cast/pandas_cast.py +157 -169
- yggdrasil/types/cast/polars_cast.py +11 -43
- yggdrasil/types/dummy_class.py +81 -0
- yggdrasil/version.py +1 -1
- ygg-0.1.56.dist-info/RECORD +0 -68
- yggdrasil/databricks/ai/__init__.py +0 -1
- yggdrasil/databricks/ai/loki.py +0 -374
- {ygg-0.1.56.dist-info → ygg-0.1.60.dist-info}/entry_points.txt +0 -0
- {ygg-0.1.56.dist-info → ygg-0.1.60.dist-info}/licenses/LICENSE +0 -0
- {ygg-0.1.56.dist-info → ygg-0.1.60.dist-info}/top_level.txt +0 -0
@@ -36,11 +36,8 @@ def databricks_remote_compute(
     cluster_name: Optional[str] = None,
     workspace: Optional[Union[Workspace, str]] = None,
     cluster: Optional["Cluster"] = None,
-    timeout: Optional[dt.timedelta] = None,
     env_keys: Optional[List[str]] = None,
     force_local: bool = False,
-    update_timeout: Optional[Union[float, dt.timedelta]] = None,
-    **options
 ) -> Callable[[Callable[..., ReturnType]], Callable[..., ReturnType]]:
     """Return a decorator that executes functions on a remote cluster.
 
@@ -50,11 +47,8 @@ def databricks_remote_compute(
         cluster_name: Optional cluster name to target.
         workspace: Workspace instance or host string for lookup.
         cluster: Pre-configured Cluster instance to reuse.
-        timeout: Optional execution timeout for remote calls.
         env_keys: Optional environment variable names to forward.
         force_local: Force local execution
-        update_timeout: creation or update wait timeout
-        **options: Extra options forwarded to the execution decorator.
 
     Returns:
         A decorator that runs functions on the resolved Databricks cluster.
@@ -85,14 +79,10 @@ def databricks_remote_compute(
         workspace=workspace,
         cluster_name=cluster_name,
         single_user_name=workspace.current_user.user_name,
-
+        wait_update=False
     )
 
-    cluster.
-
-
-        _func=_func,
-        env_keys=env_keys,
-        timeout=timeout,
-        **options
+    return cluster.system_context.decorate(
+        func=_func,
+        environ=env_keys,
     )
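The three hunks above (matching the `+4 -14` change to `yggdrasil/databricks/compute/remote.py`) drop the `timeout`, `update_timeout` and `**options` arguments and route execution through `cluster.system_context.decorate(func=..., environ=...)`. A hypothetical usage sketch of the slimmed-down decorator; the import path and argument values are assumptions, not taken from package documentation:

```python
# Sketch only: module path and values below are assumed for illustration.
from yggdrasil.databricks.compute.remote import databricks_remote_compute

@databricks_remote_compute(
    cluster_name="shared-dev-cluster",                        # assumed cluster name
    workspace="https://adb-1234567890.azuredatabricks.net",   # assumed workspace host
    env_keys=["FEATURE_FLAGS"],                               # environment variables to forward
    force_local=False,
)
def heavy_transform(n: int) -> int:
    # Body executes on the resolved Databricks cluster via its system execution context.
    return sum(range(n))
```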
@@ -16,16 +16,20 @@ import logging
 import random
 import string
 import time
+from threading import Thread
 from typing import Optional, Union, Any, Dict, List, Literal
 
 import pyarrow as pa
+import pyarrow.dataset as pds
 
 from .statement_result import StatementResult
 from .types import column_info_to_arrow_field
-from
-from ..workspaces import WorkspaceService
-from ...
+from .warehouse import SQLWarehouse
+from ..workspaces import WorkspaceService, DatabricksPath
+from ...ai.sql_session import SQLAISession, SQLFlavor
+from ...libs.databrickslib import databricks_sdk, DatabricksDummyClass
 from ...libs.sparklib import SparkSession, SparkDataFrame, pyspark
+from ...pyutils.waiting_config import WaitingConfigArg
 from ...types import is_arrow_type_string_like, is_arrow_type_binary_like
 from ...types.cast.cast_options import CastOptions
 from ...types.cast.registry import convert
@@ -43,13 +47,14 @@ except ImportError:
 
 if databricks_sdk is not None:
     from databricks.sdk.service.sql import (
-
+        Disposition, Format,
        ExecuteStatementRequestOnWaitTimeout, StatementParameterListItem
     )
-    StatementResponse = StatementResponse
 else:
-
-
+    Disposition = DatabricksDummyClass
+    Format = DatabricksDummyClass
+    ExecuteStatementRequestOnWaitTimeout = DatabricksDummyClass
+    StatementParameterListItem = DatabricksDummyClass
 
 
 logger = logging.getLogger(__name__)
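When `databricks_sdk` is missing, the SDK type names above are bound to `DatabricksDummyClass` (the new `yggdrasil/types/dummy_class.py` in the file list) so that module import and annotations keep working. A minimal, self-contained sketch of that placeholder pattern, using a generic stand-in rather than the package's actual implementation:

```python
# Generic optional-dependency placeholder; the real DatabricksDummyClass may behave differently.
class _MissingDependency:
    """Stand-in for symbols of an optional dependency that is not installed."""

    def __init__(self, *args, **kwargs):
        raise ImportError("databricks-sdk is required for this feature: pip install databricks-sdk")


try:
    from databricks.sdk.service.sql import Disposition, Format  # real names when the SDK is present
except ImportError:
    Disposition = _MissingDependency
    Format = _MissingDependency


def is_external(disposition) -> bool:
    # Annotations and comparisons work at import time; only instantiation fails loudly.
    return disposition == getattr(Disposition, "EXTERNAL_LINKS", None)
```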
@@ -57,7 +62,11 @@ logger = logging.getLogger(__name__)
 if pyspark is not None:
     import pyspark.sql.functions as F
 
-
+
+__all__ = [
+    "SQLEngine",
+    "StatementResult"
+]
 
 
 @dataclasses.dataclass
@@ -88,10 +97,12 @@ def _needs_column_mapping(col_name: str) -> bool:
 @dataclasses.dataclass
 class SQLEngine(WorkspaceService):
     """Execute SQL statements and manage tables via Databricks SQL / Spark."""
-    warehouse_id: Optional[str] = None
     catalog_name: Optional[str] = None
     schema_name: Optional[str] = None
 
+    _warehouse: Optional[SQLWarehouse] = dataclasses.field(default=None, repr=False, hash=False, compare=False)
+    _ai_session: Optional[SQLAISession] = dataclasses.field(default=None, repr=False, hash=False, compare=False)
+
     def table_full_name(
         self,
         catalog_name: Optional[str] = None,
@@ -147,68 +158,8 @@ class SQLEngine(WorkspaceService):
             return self.catalog_name, parts[0], parts[1]
 
         catalog_name, schema_name, table_name = parts[-3], parts[-2], parts[-1]
-        catalog_name = catalog_name or self.catalog_name
-        schema_name = schema_name or self.schema_name
-        return catalog_name, schema_name, table_name
-
-    def _default_warehouse(
-        self,
-        cluster_size: str = "Small"
-    ):
-        """Pick a default SQL warehouse (best-effort) matching the desired size.
-
-        Args:
-            cluster_size: Desired warehouse size (Databricks "cluster_size"), e.g. "Small".
-                If empty/None, returns the first warehouse encountered.
 
-
-            Warehouse object.
-
-        Raises:
-            ValueError: If no warehouses exist in the workspace.
-        """
-        wk = self.workspace.sdk()
-        existing = list(wk.warehouses.list())
-        first = None
-
-        for warehouse in existing:
-            if first is None:
-                first = warehouse
-
-            if cluster_size:
-                if getattr(warehouse, "cluster_size", None) == cluster_size:
-                    logger.debug("Default warehouse match found: id=%s cluster_size=%s", warehouse.id, warehouse.cluster_size)
-                    return warehouse
-            else:
-                logger.debug("Default warehouse selected (first): id=%s", warehouse.id)
-                return warehouse
-
-        if first is not None:
-            logger.info(
-                "No warehouse matched cluster_size=%s; falling back to first warehouse id=%s cluster_size=%s",
-                cluster_size,
-                getattr(first, "id", None),
-                getattr(first, "cluster_size", None),
-            )
-            return first
-
-        raise ValueError(f"No default warehouse found in {wk.config.host}")
-
-    def _get_or_default_warehouse_id(self, cluster_size: str = "Small") -> str:
-        """Return configured warehouse_id or resolve a default one.
-
-        Args:
-            cluster_size: Desired warehouse size filter used when resolving defaults.
-
-        Returns:
-            Warehouse id string.
-        """
-        if not self.warehouse_id:
-            dft = self._default_warehouse(cluster_size=cluster_size)
-            self.warehouse_id = dft.id
-            logger.info("Resolved default warehouse_id=%s (cluster_size=%s)", self.warehouse_id, cluster_size)
-
-        return self.warehouse_id
+        return catalog_name or self.catalog_name, schema_name or self.schema_name, table_name
 
     @staticmethod
     def _random_suffix(prefix: str = "") -> str:
@@ -217,12 +168,44 @@ class SQLEngine(WorkspaceService):
         timestamp = int(time.time() * 1000)
         return f"{prefix}{timestamp}_{unique}"
 
-
-
-
-
-
-
+    def warehouse(
+        self,
+        warehouse_id: Optional[str] = None,
+        warehouse_name: Optional[str] = None,
+    ) -> SQLWarehouse:
+        if self._warehouse is None:
+            wh = SQLWarehouse(
+                workspace=self.workspace,
+                warehouse_id=warehouse_id,
+                warehouse_name=warehouse_name
+            )
+
+            self._warehouse = wh.find_warehouse(
+                warehouse_id=warehouse_id,
+                warehouse_name=warehouse_name,
+                raise_error=False
+            )
+
+            if self._warehouse is None:
+                self._warehouse = wh.create_or_update()
+
+        return self._warehouse.find_warehouse(
+            warehouse_id=warehouse_id,
+            warehouse_name=warehouse_name,
+            raise_error=True
+        )
+
+    def ai_session(
+        self,
+        model: str = "databricks-gemini-2-5-pro",
+        flavor: SQLFlavor = SQLFlavor.DATABRICKS
+    ):
+        return SQLAISession(
+            model=model,
+            api_key=self.workspace.current_token(),
+            base_url="%s/serving-endpoints" % self.workspace.safe_host,
+            flavor=flavor
+        )
 
     def execute(
         self,
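The new `warehouse()` helper lazily resolves (or creates) a `SQLWarehouse` and caches it on the engine, while `ai_session()` wraps the workspace's serving endpoints in a `SQLAISession`. A hypothetical usage sketch based only on the signatures in this hunk; the import path, the `workspace` object and all argument values are assumptions:

```python
# Sketch only: SQLEngine/Workspace wiring is assumed, not taken from package docs.
from yggdrasil.databricks.sql.engine import SQLEngine

# `workspace` is an already-configured yggdrasil Workspace object (construction not shown here).
engine = SQLEngine(workspace=workspace, catalog_name="main", schema_name="analytics")

wh = engine.warehouse(warehouse_name="shared-endpoint")  # resolved once, then cached on the engine
ai = engine.ai_session()                                 # defaults to databricks-gemini-2-5-pro
```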
@@ -230,17 +213,17 @@ class SQLEngine(WorkspaceService):
         *,
         engine: Optional[Literal["spark", "api"]] = None,
         warehouse_id: Optional[str] = None,
+        warehouse_name: Optional[str] = None,
         byte_limit: Optional[int] = None,
-        disposition: Optional[
-        format: Optional[
-        on_wait_timeout: Optional[
-        parameters: Optional[List[
+        disposition: Optional[Disposition] = None,
+        format: Optional[Format] = None,
+        on_wait_timeout: Optional[ExecuteStatementRequestOnWaitTimeout] = None,
+        parameters: Optional[List[StatementParameterListItem]] = None,
         row_limit: Optional[int] = None,
         wait_timeout: Optional[str] = None,
         catalog_name: Optional[str] = None,
         schema_name: Optional[str] = None,
-
-        wait_result: bool = True,
+        wait: Optional[WaitingConfigArg] = True
     ) -> "StatementResult":
         """Execute a SQL statement via Spark or Databricks SQL Statement Execution API.
 
@@ -256,6 +239,7 @@ class SQLEngine(WorkspaceService):
             statement: SQL statement to execute. If None, a `SELECT *` is generated from the table params.
             engine: "spark" or "api".
             warehouse_id: Warehouse override (for API engine).
+            warehouse_name: Warehouse name override (for API engine).
             byte_limit: Optional byte limit for results.
             disposition: Result disposition mode (API engine).
             format: Result format (API engine).
@@ -265,8 +249,7 @@ class SQLEngine(WorkspaceService):
             wait_timeout: API wait timeout value.
             catalog_name: Optional catalog override for API engine.
             schema_name: Optional schema override for API engine.
-
-            wait_result: Whether to block until completion (API engine).
+            wait: Whether to block until completion (API engine).
 
         Returns:
             StatementResult.
@@ -284,72 +267,44 @@ class SQLEngine(WorkspaceService):
             if spark_session is None:
                 raise ValueError("No spark session found to run sql query")
 
-            df: SparkDataFrame = spark_session.sql(statement)
-
-            if row_limit:
-                df = df.limit(row_limit)
-
             logger.debug(
-                "SPARK SQL
+                "SPARK SQL executing query:\n%s",
                 statement
             )
 
-
-
+            df: SparkDataFrame = spark_session.sql(statement)
+
+            if row_limit:
+                df = df.limit(row_limit)
 
             return StatementResult(
-
-
-
+                workspace_client=self.workspace.sdk(),
+                warehouse_id="SparkSQL",
+                statement_id="SparkSQL",
+                disposition=Disposition.EXTERNAL_LINKS,
                 _spark_df=df,
             )
 
-
-
-
-
-        if (disposition is None or disposition == Disposition.INLINE) and format in [Format.CSV, Format.ARROW_STREAM]:
-            disposition = Disposition.EXTERNAL_LINKS
-
-        if not statement:
-            full_name = self.table_full_name(catalog_name=catalog_name, schema_name=schema_name, table_name=table_name)
-            statement = f"SELECT * FROM {full_name}"
-
-        if not warehouse_id:
-            warehouse_id = self._get_or_default_warehouse_id()
+        wh = self.warehouse(
+            warehouse_id=warehouse_id,
+            warehouse_name=warehouse_name,
+        )
 
-
+        return wh.execute(
             statement=statement,
             warehouse_id=warehouse_id,
+            warehouse_name=warehouse_name,
             byte_limit=byte_limit,
             disposition=disposition,
             format=format,
             on_wait_timeout=on_wait_timeout,
             parameters=parameters,
-            row_limit=row_limit,
             wait_timeout=wait_timeout,
-
-
-
-
-        execution = StatementResult(
-            engine=self,
-            statement_id=response.statement_id,
-            _response=response,
-            disposition=disposition,
-        )
-
-        logger.info(
-            "API SQL executed statement '%s'",
-            execution.statement_id
-        )
-        logger.debug(
-            "API SQL executed query:\n%s",
-            statement
+            catalog_name=catalog_name,
+            schema_name=schema_name,
+            wait=wait
         )
 
-        return execution.wait() if wait_result else execution
-
     def spark_table(
         self,
         full_name: Optional[str] = None,
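After this change `execute()` no longer resolves warehouse IDs or builds API `StatementResult` objects itself; the API path is delegated to `SQLWarehouse.execute()` and `wait` replaces `wait_result`. A hypothetical call sketch continuing the assumed `engine` object from the earlier sketch; only the parameter names come from the signature shown in this diff:

```python
# Assumed statement and warehouse name, for illustration only.
result = engine.execute(
    "SELECT COUNT(*) AS n FROM main.analytics.events",
    engine="api",                      # route through the SQL Statement Execution API
    warehouse_name="shared-endpoint",
    wait=True,                         # block until the statement completes
)

pending = engine.execute("OPTIMIZE main.analytics.events", wait=False)  # returns without waiting
```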
@@ -412,7 +367,7 @@ class SQLEngine(WorkspaceService):
             None (mutates the destination table).
         """
 
-        if pyspark is not None:
+        if pyspark is not None or spark_session is not None:
             spark_session = SparkSession.getActiveSession() if spark_session is None else spark_session
 
             if spark_session is not None or isinstance(data, SparkDataFrame):
@@ -502,6 +457,7 @@ class SQLEngine(WorkspaceService):
         if existing_schema is None:
             try:
                 existing_schema = connected.get_table_schema(
+                    location=location,
                     catalog_name=catalog_name,
                     schema_name=schema_name,
                     table_name=table_name,
@@ -511,8 +467,7 @@ class SQLEngine(WorkspaceService):
                 data_tbl = convert(data, pa.Table)
                 existing_schema = data_tbl.schema
                 logger.warning(
-                    "
-                    location,
+                    "%s, creating it from input schema (columns=%s)",
                     exc,
                     existing_schema.names,
                 )
@@ -544,13 +499,11 @@ class SQLEngine(WorkspaceService):
             except Exception:
                 logger.exception("Arrow insert failed after auto-creating %s; attempting cleanup (DROP TABLE)", location)
                 try:
-                    connected.drop_table(location=location)
+                    connected.drop_table(location=location, wait=True)
                 except Exception:
                     logger.exception("Failed to drop table %s after auto creation error", location)
                 raise
 
-        transaction_id = self._random_suffix()
-
         data_tbl = convert(
             data, pa.Table,
             options=cast_options, target_field=existing_schema
@@ -567,14 +520,15 @@ class SQLEngine(WorkspaceService):
         )
 
         # Write in temp volume
-        temp_volume_path =
-
-
+        temp_volume_path = self.workspace.tmp_path(
+            catalog_name=catalog_name,
+            schema_name=schema_name,
+            volume_name="tmp",
+            extension="parquet"
         ) if temp_volume_path is None else DatabricksPath.parse(obj=temp_volume_path, workspace=connected.workspace)
 
         logger.debug("Staging Parquet to temp volume: %s", temp_volume_path)
-        temp_volume_path.
-        temp_volume_path.write_arrow_table(data_tbl)
+        temp_volume_path.write_arrow_table(data_tbl, file_format=pds.ParquetFileFormat())
 
         columns = list(existing_schema.names)
         cols_quoted = ", ".join([f"`{c}`" for c in columns])
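The Arrow payload is staged as Parquet in a temporary Unity Catalog volume and then loaded into the target table with a single SQL statement that reads directly from the staged Parquet files. A standalone sketch of that pattern using plain `pyarrow`; the statement wording, path and table name are placeholders, not values from the package:

```python
import tempfile

import pyarrow as pa
import pyarrow.dataset as pds

data_tbl = pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})

# In the package this is a Unity Catalog volume path; a local temp dir keeps the sketch runnable.
temp_volume_path = tempfile.mkdtemp(prefix="stage_")
pds.write_dataset(data_tbl, temp_volume_path, format=pds.ParquetFileFormat())

cols_quoted = ", ".join(f"`{c}`" for c in data_tbl.schema.names)
stmt = f"""INSERT INTO `main`.`analytics`.`events` ({cols_quoted})
SELECT {cols_quoted}
FROM parquet.`{temp_volume_path}`"""
print(stmt)  # engine.execute(stmt) would run this against the warehouse
```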
@@ -620,7 +574,12 @@ FROM parquet.`{temp_volume_path}`"""
             connected.execute(stmt.strip())
         finally:
             try:
-
+                Thread(
+                    target=temp_volume_path.rmdir,
+                    kwargs={
+                        "recursive": True
+                    }
+                ).start()
             except Exception:
                 logger.exception("Failed cleaning temp volume: %s", temp_volume_path)
 
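Cleanup of the staging directory now runs on a background thread so the insert call does not wait for the volume deletion. A standalone sketch of the same fire-and-forget pattern, with `shutil.rmtree` standing in for `DatabricksPath.rmdir(recursive=True)`:

```python
import shutil
import tempfile
from threading import Thread

staging_dir = tempfile.mkdtemp(prefix="stage_")

# Start the removal and return immediately; errors inside the thread do not propagate here.
cleanup = Thread(target=shutil.rmtree, kwargs={"path": staging_dir, "ignore_errors": True})
cleanup.start()

cleanup.join()  # only so this sketch exits after cleanup; the package code does not join
```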
@@ -732,8 +691,6 @@ FROM parquet.`{temp_volume_path}`"""
         cast_options = CastOptions.check_arg(options=cast_options, target_field=existing_schema)
         data = cast_spark_dataframe(data, options=cast_options)
 
-        logger.debug("Incoming Spark columns: %s", data.columns)
-
         if match_by:
             notnull = None
             for k in match_by:
@@ -788,6 +745,7 @@ FROM parquet.`{temp_volume_path}`"""
 
     def get_table_schema(
         self,
+        location: Optional[str] = None,
         catalog_name: Optional[str] = None,
         schema_name: Optional[str] = None,
         table_name: Optional[str] = None,
@@ -796,6 +754,7 @@ FROM parquet.`{temp_volume_path}`"""
         """Fetch a table schema from Unity Catalog and convert it to Arrow types.
 
         Args:
+            location: Optional Fully qualified location name
             catalog_name: Optional catalog override.
             schema_name: Optional schema override.
             table_name: Optional table name override.
@@ -804,25 +763,44 @@ FROM parquet.`{temp_volume_path}`"""
         Returns:
             Arrow Schema or a STRUCT Field representing the table.
         """
-
+        location, catalog_name, schema_name, table_name = self._check_location_params(
+            location=location,
             catalog_name=catalog_name,
             schema_name=schema_name,
             table_name=table_name,
             safe_chars=False,
         )
 
-
+        client = self.workspace.sdk().tables
 
         try:
-            table =
+            table = client.get(location)
         except Exception as e:
-            raise ValueError(f"Table %s not found, {type(e)} {e}" %
+            raise ValueError(f"Table %s not found, {type(e)} {e}" % location)
 
-        fields = [
+        fields = [
+            column_info_to_arrow_field(_) for _ in table.columns
+        ]
+
+        metadata = {
+            b"engine": b"databricks",
+            b"full_name": location,
+            b"catalog_name": catalog_name,
+            b"schema_name": schema_name,
+            b"table_name": table_name,
+        }
 
         if to_arrow_schema:
-            return pa.schema(
-
+            return pa.schema(
+                fields,
+                metadata=metadata
+            )
+
+        return pa.field(
+            location,
+            pa.struct(fields),
+            metadata=metadata
+        )
 
     def drop_table(
         self,
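The rewritten `get_table_schema()` builds Arrow fields from the Unity Catalog column info and attaches the table coordinates as schema metadata, returning either a `pa.Schema` or a struct `pa.Field`. A standalone sketch of the resulting shape using plain `pyarrow`, with placeholder column types standing in for `column_info_to_arrow_field` output:

```python
import pyarrow as pa

location = "main.analytics.events"  # assumed fully qualified table name
fields = [pa.field("id", pa.int64()), pa.field("name", pa.string())]  # placeholder columns

metadata = {
    b"engine": b"databricks",
    b"full_name": location,
    b"catalog_name": "main",
    b"schema_name": "analytics",
    b"table_name": "events",
}

schema = pa.schema(fields, metadata=metadata)                            # to_arrow_schema=True branch
struct_field = pa.field(location, pa.struct(fields), metadata=metadata)  # default branch
print(schema.metadata[b"full_name"])
```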
@@ -830,6 +808,7 @@ FROM parquet.`{temp_volume_path}`"""
         catalog_name: Optional[str] = None,
         schema_name: Optional[str] = None,
         table_name: Optional[str] = None,
+        wait: Optional[WaitingConfigArg] = True
     ):
         """Drop a table if it exists."""
         location, _, _, _ = self._check_location_params(
@@ -839,13 +818,17 @@ FROM parquet.`{temp_volume_path}`"""
             table_name=table_name,
             safe_chars=True,
         )
-
-
+
+        logger.debug("Dropping table if exists: %s", location)
+
+        self.execute(f"DROP TABLE IF EXISTS {location}", wait=wait)
+
+        logger.info("Dropped table if exists: %s", location)
 
     def create_table(
         self,
         field: Union[pa.Field, pa.Schema],
-
+        full_name: Optional[str] = None,  # e.g. catalog.schema.table
         catalog_name: Optional[str] = None,
         schema_name: Optional[str] = None,
         table_name: Optional[str] = None,
@@ -853,7 +836,7 @@ FROM parquet.`{temp_volume_path}`"""
         partition_by: Optional[list[str]] = None,
         cluster_by: Optional[bool | list[str]] = True,
         comment: Optional[str] = None,
-
+        properties: Optional[dict[str, Any]] = None,
         if_not_exists: bool = True,
         or_replace: bool = False,
         using: str = "DELTA",
@@ -886,7 +869,7 @@ FROM parquet.`{temp_volume_path}`"""
             - If `pa.Schema`, all schema fields are used as columns.
             - If `pa.Field` with struct type, its children become columns.
             - If `pa.Field` non-struct, it becomes a single-column table.
-
+        full_name:
             Fully-qualified table name, e.g. `"catalog.schema.table"`.
             If provided, it takes precedence over `catalog_name`/`schema_name`/`table_name`.
             Parts are quoted as needed.
@@ -906,7 +889,7 @@ FROM parquet.`{temp_volume_path}`"""
             - list[str] -> emits `CLUSTER BY (<cols...>)` (all cols must exist in schema)
         comment:
             Optional table comment. If not provided and Arrow metadata contains `b"comment"`, that is used.
-
+        properties:
             Additional/override Delta table properties (final say).
             Example: `{"delta.enableChangeDataFeed": "true"}` or `{"delta.logRetentionDuration": "30 days"}`
         if_not_exists:
@@ -973,19 +956,22 @@ FROM parquet.`{temp_volume_path}`"""
         Examples
         --------
         Create a managed Delta table with auto clustering and auto column mapping:
-        >>> plan = client.create_table(schema,
+        >>> plan = client.create_table(schema, full_name="main.analytics.events", execute=False, return_plan=True)
         >>> print(plan.sql)
 
         External table with explicit partitioning and CDF:
         >>> client.create_table(
         ...     schema,
-        ...
+        ...     full_name="main.analytics.events",
         ...     storage_location="abfss://.../events",
         ...     partition_by=["event_date"],
         ...     enable_cdf=True,
         ... )
         """
 
+        if not isinstance(field, (pa.Field, pa.Schema)):
+            field = convert(field, pa.Field)
+
         # ---- Normalize Arrow input ----
         if isinstance(field, pa.Schema):
             arrow_fields = list(field)
@@ -998,16 +984,13 @@ FROM parquet.`{temp_volume_path}`"""
         else:
             arrow_fields = [field]
 
-
-
-
-
-
-
-
-        # If caller passes raw "cat.schema.table", quote each part safely
-        parts = table_fqn.split(".")
-        table_fqn = ".".join(_quote_ident(p) for p in parts)
+        full_name, catalog_name, schema_name, table_name = self._check_location_params(
+            location=full_name,
+            catalog_name=catalog_name,
+            schema_name=schema_name,
+            table_name=table_name,
+            safe_chars=True
+        )
 
         # ---- Comments ----
         if comment is None and schema_metadata:
@@ -1051,7 +1034,7 @@ FROM parquet.`{temp_volume_path}`"""
             create_kw = "CREATE TABLE IF NOT EXISTS"
 
         sql_parts: list[str] = [
-            f"{create_kw} {
+            f"{create_kw} {full_name} (",
            " " + ",\n ".join(column_definitions),
             ")",
             f"USING {using}",
@@ -1096,8 +1079,8 @@ FROM parquet.`{temp_volume_path}`"""
                 pass
 
         # Let caller override anything (final say)
-        if
-            props.update(
+        if properties:
+            props.update(properties)
 
         if any_invalid and column_mapping_mode == "none":
             warnings.append(
@@ -1105,6 +1088,11 @@ FROM parquet.`{temp_volume_path}`"""
                 "This will fail unless you rename/escape columns."
             )
 
+        default_tags = self.workspace.default_tags()
+
+        for k, v in default_tags.items():
+            props[f"tags.{k}"] = v
+
         if props:
             def fmt(k: str, v: Any) -> str:
                 if isinstance(v, str):
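Workspace default tags are now folded into the table properties as `tags.<key>` entries before the `TBLPROPERTIES` clause is rendered. A standalone sketch of that merge with placeholder tag values; the `fmt()` body below is only a plausible rendering, since the hunk shows just its signature:

```python
from typing import Any

props = {"delta.enableChangeDataFeed": "true"}
default_tags = {"owner": "data-platform", "env": "dev"}  # assumed output of workspace.default_tags()

for k, v in default_tags.items():
    props[f"tags.{k}"] = v

def fmt(k: str, v: Any) -> str:
    # Quote string values; render everything else as-is (assumed behaviour).
    return f"'{k}' = '{v}'" if isinstance(v, str) else f"'{k}' = {v}"

tblproperties = "TBLPROPERTIES (\n  " + ",\n  ".join(fmt(k, v) for k, v in props.items()) + "\n)"
print(tblproperties)
```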
@@ -1122,7 +1110,7 @@ FROM parquet.`{temp_volume_path}`"""
         if not execute:
             return plan if return_plan else statement
 
-        res = self.execute(statement,
+        res = self.execute(statement, wait=wait_result)
         plan.result = res
         return plan if return_plan else res
 