ygg 0.1.57__py3-none-any.whl → 0.1.64__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ygg-0.1.57.dist-info → ygg-0.1.64.dist-info}/METADATA +2 -2
- ygg-0.1.64.dist-info/RECORD +74 -0
- yggdrasil/ai/__init__.py +2 -0
- yggdrasil/ai/session.py +87 -0
- yggdrasil/ai/sql_session.py +310 -0
- yggdrasil/databricks/__init__.py +0 -3
- yggdrasil/databricks/compute/cluster.py +68 -113
- yggdrasil/databricks/compute/command_execution.py +674 -0
- yggdrasil/databricks/compute/exceptions.py +19 -0
- yggdrasil/databricks/compute/execution_context.py +491 -282
- yggdrasil/databricks/compute/remote.py +4 -14
- yggdrasil/databricks/exceptions.py +10 -0
- yggdrasil/databricks/sql/__init__.py +0 -4
- yggdrasil/databricks/sql/engine.py +178 -178
- yggdrasil/databricks/sql/exceptions.py +9 -1
- yggdrasil/databricks/sql/statement_result.py +108 -120
- yggdrasil/databricks/sql/warehouse.py +339 -92
- yggdrasil/databricks/workspaces/io.py +185 -40
- yggdrasil/databricks/workspaces/path.py +114 -100
- yggdrasil/databricks/workspaces/workspace.py +210 -61
- yggdrasil/exceptions.py +7 -0
- yggdrasil/libs/databrickslib.py +22 -18
- yggdrasil/libs/extensions/spark_extensions.py +1 -1
- yggdrasil/libs/pandaslib.py +15 -6
- yggdrasil/libs/polarslib.py +49 -13
- yggdrasil/pyutils/__init__.py +1 -2
- yggdrasil/pyutils/callable_serde.py +12 -19
- yggdrasil/pyutils/exceptions.py +16 -0
- yggdrasil/pyutils/modules.py +6 -7
- yggdrasil/pyutils/python_env.py +16 -21
- yggdrasil/pyutils/waiting_config.py +171 -0
- yggdrasil/requests/msal.py +9 -96
- yggdrasil/types/cast/arrow_cast.py +3 -0
- yggdrasil/types/cast/pandas_cast.py +157 -169
- yggdrasil/types/cast/polars_cast.py +11 -43
- yggdrasil/types/dummy_class.py +81 -0
- yggdrasil/types/file_format.py +6 -2
- yggdrasil/types/python_defaults.py +92 -76
- yggdrasil/version.py +1 -1
- ygg-0.1.57.dist-info/RECORD +0 -66
- yggdrasil/databricks/ai/loki.py +0 -53
- {ygg-0.1.57.dist-info → ygg-0.1.64.dist-info}/WHEEL +0 -0
- {ygg-0.1.57.dist-info → ygg-0.1.64.dist-info}/entry_points.txt +0 -0
- {ygg-0.1.57.dist-info → ygg-0.1.64.dist-info}/licenses/LICENSE +0 -0
- {ygg-0.1.57.dist-info → ygg-0.1.64.dist-info}/top_level.txt +0 -0
- /yggdrasil/{databricks/ai/__init__.py → pyutils/mimetypes.py} +0 -0
yggdrasil/databricks/sql/engine.py
@@ -16,17 +16,20 @@ import logging
 import random
 import string
 import time
-from
+from threading import Thread
+from typing import Optional, Union, Any, Dict, List, Literal, TYPE_CHECKING

 import pyarrow as pa
+import pyarrow.dataset as pds

 from .statement_result import StatementResult
 from .types import column_info_to_arrow_field
-from
-from ..workspaces import WorkspaceService
-from ...libs.databrickslib import databricks_sdk
+from .warehouse import SQLWarehouse
+from ..workspaces import WorkspaceService, DatabricksPath
+from ...libs.databrickslib import databricks_sdk, DatabricksDummyClass
 from ...libs.sparklib import SparkSession, SparkDataFrame, pyspark
-from ...
+from ...pyutils.waiting_config import WaitingConfigArg
+from ...types import is_arrow_type_string_like, is_arrow_type_binary_like, cast_arrow_tabular
 from ...types.cast.cast_options import CastOptions
 from ...types.cast.registry import convert
 from ...types.cast.spark_cast import cast_spark_dataframe
@@ -43,13 +46,14 @@ except ImportError:

 if databricks_sdk is not None:
     from databricks.sdk.service.sql import (
-
+        Disposition, Format,
         ExecuteStatementRequestOnWaitTimeout, StatementParameterListItem
     )
-    StatementResponse = StatementResponse
 else:
-
-
+    Disposition = DatabricksDummyClass
+    Format = DatabricksDummyClass
+    ExecuteStatementRequestOnWaitTimeout = DatabricksDummyClass
+    StatementParameterListItem = DatabricksDummyClass


 logger = logging.getLogger(__name__)
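Note on the fallback above: when `databricks-sdk` is absent, the SQL service symbols are rebound to `DatabricksDummyClass` so the module still imports cleanly. A minimal sketch of that optional-dependency pattern (the placeholder class here is illustrative, not the package's actual `DatabricksDummyClass`):

```python
# Sketch of the optional-dependency fallback used above (illustrative only).
# When databricks-sdk is missing, module-level names are bound to a placeholder
# so that importing the module never raises; callers are expected to check the
# `databricks_sdk is not None` guard before touching these names.
try:
    from databricks.sdk.service.sql import Disposition, Format
except ImportError:
    class _MissingSDKSymbol:
        """Stand-in bound to SDK names when databricks-sdk is not installed."""

    Disposition = _MissingSDKSymbol
    Format = _MissingSDKSymbol
```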
@@ -57,7 +61,15 @@ logger = logging.getLogger(__name__)
 if pyspark is not None:
     import pyspark.sql.functions as F

-
+
+if TYPE_CHECKING:
+    from ...ai.sql_session import SQLAISession, SQLFlavor
+
+
+__all__ = [
+    "SQLEngine",
+    "StatementResult"
+]


 @dataclasses.dataclass
@@ -88,10 +100,12 @@ def _needs_column_mapping(col_name: str) -> bool:
 @dataclasses.dataclass
 class SQLEngine(WorkspaceService):
     """Execute SQL statements and manage tables via Databricks SQL / Spark."""
-    warehouse_id: Optional[str] = None
     catalog_name: Optional[str] = None
     schema_name: Optional[str] = None

+    _warehouse: Optional[SQLWarehouse] = dataclasses.field(default=None, repr=False, hash=False, compare=False)
+    _ai_session: Optional["SQLAISession"] = dataclasses.field(default=None, repr=False, hash=False, compare=False)
+
     def table_full_name(
         self,
         catalog_name: Optional[str] = None,
@@ -147,82 +161,59 @@ class SQLEngine(WorkspaceService):
             return self.catalog_name, parts[0], parts[1]

         catalog_name, schema_name, table_name = parts[-3], parts[-2], parts[-1]
-        catalog_name = catalog_name or self.catalog_name
-        schema_name = schema_name or self.schema_name
-        return catalog_name, schema_name, table_name

-
-        self,
-        cluster_size: str = "Small"
-    ):
-        """Pick a default SQL warehouse (best-effort) matching the desired size.
-
-        Args:
-            cluster_size: Desired warehouse size (Databricks "cluster_size"), e.g. "Small".
-                If empty/None, returns the first warehouse encountered.
-
-        Returns:
-            Warehouse object.
+        return catalog_name or self.catalog_name, schema_name or self.schema_name, table_name

-
-
-        """
-
-
-
-
-        for warehouse in existing:
-            if first is None:
-                first = warehouse
-
-            if cluster_size:
-                if getattr(warehouse, "cluster_size", None) == cluster_size:
-                    logger.debug("Default warehouse match found: id=%s cluster_size=%s", warehouse.id, warehouse.cluster_size)
-                    return warehouse
-            else:
-                logger.debug("Default warehouse selected (first): id=%s", warehouse.id)
-                return warehouse
+    @staticmethod
+    def _random_suffix(prefix: str = "") -> str:
+        """Generate a unique suffix for temporary resources."""
+        unique = "".join(random.choice(string.ascii_lowercase + string.digits) for _ in range(8))
+        timestamp = int(time.time() * 1000)
+        return f"{prefix}{timestamp}_{unique}"

-
-
-
-
-
-
+    def warehouse(
+        self,
+        warehouse_id: Optional[str] = None,
+        warehouse_name: Optional[str] = None,
+    ) -> SQLWarehouse:
+        if self._warehouse is None:
+            wh = SQLWarehouse(
+                workspace=self.workspace,
+                warehouse_id=warehouse_id,
+                warehouse_name=warehouse_name
             )
-        return first
-
-        raise ValueError(f"No default warehouse found in {wk.config.host}")

-
-
+            self._warehouse = wh.find_warehouse(
+                warehouse_id=warehouse_id,
+                warehouse_name=warehouse_name,
+                raise_error=False
+            )

-
-
+            if self._warehouse is None:
+                self._warehouse = wh.create_or_update()

-
-
-
-
-
-        self.warehouse_id = dft.id
-        logger.info("Resolved default warehouse_id=%s (cluster_size=%s)", self.warehouse_id, cluster_size)
+        return self._warehouse.find_warehouse(
+            warehouse_id=warehouse_id,
+            warehouse_name=warehouse_name,
+            raise_error=True
+        )

-
+    def ai_session(
+        self,
+        model: str = "databricks-gemini-2-5-pro",
+        flavor: Optional["SQLFlavor"] = None
+    ):
+        from ...ai.sql_session import SQLAISession, SQLFlavor

-
-
-        """Generate a unique suffix for temporary resources."""
-        unique = "".join(random.choice(string.ascii_lowercase + string.digits) for _ in range(8))
-        timestamp = int(time.time() * 1000)
-        return f"{prefix}{timestamp}_{unique}"
+        if flavor is None:
+            flavor = SQLFlavor.DATABRICKS

-
-
-
-
-
-
+        return SQLAISession(
+            model=model,
+            api_key=self.workspace.current_token(),
+            base_url="%s/serving-endpoints" % self.workspace.safe_host,
+            flavor=flavor
+        )

     def execute(
         self,
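The hunk above replaces the old default-warehouse lookup with a cached `warehouse()` accessor and adds an `ai_session()` factory. A hedged usage sketch based only on the signatures visible in this diff; how `engine` is constructed, and the warehouse name, are illustrative:

```python
# Illustrative usage of the new SQLEngine helpers; only the method and parameter
# names (warehouse, ai_session, model, flavor) come from the diff above.
from yggdrasil.databricks.sql.engine import SQLEngine

engine: SQLEngine = ...  # in real code, built from a configured workspace

# Resolve (and cache) a SQL warehouse by id or name; created if none is found.
wh = engine.warehouse(warehouse_name="shared-small")  # assumed warehouse name

# Open an AI session served from the workspace's serving endpoints.
session = engine.ai_session(model="databricks-gemini-2-5-pro")
```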
@@ -230,18 +221,18 @@ class SQLEngine(WorkspaceService):
         *,
         engine: Optional[Literal["spark", "api"]] = None,
         warehouse_id: Optional[str] = None,
+        warehouse_name: Optional[str] = None,
         byte_limit: Optional[int] = None,
-        disposition: Optional[
-        format: Optional[
-        on_wait_timeout: Optional[
-        parameters: Optional[List[
+        disposition: Optional[Disposition] = None,
+        format: Optional[Format] = None,
+        on_wait_timeout: Optional[ExecuteStatementRequestOnWaitTimeout] = None,
+        parameters: Optional[List[StatementParameterListItem]] = None,
         row_limit: Optional[int] = None,
         wait_timeout: Optional[str] = None,
         catalog_name: Optional[str] = None,
         schema_name: Optional[str] = None,
-
-
-    ) -> "StatementResult":
+        wait: Optional[WaitingConfigArg] = True
+    ) -> StatementResult:
         """Execute a SQL statement via Spark or Databricks SQL Statement Execution API.

         Engine resolution:
@@ -256,6 +247,7 @@ class SQLEngine(WorkspaceService):
             statement: SQL statement to execute. If None, a `SELECT *` is generated from the table params.
             engine: "spark" or "api".
             warehouse_id: Warehouse override (for API engine).
+            warehouse_name: Warehouse name override (for API engine).
             byte_limit: Optional byte limit for results.
             disposition: Result disposition mode (API engine).
             format: Result format (API engine).
@@ -265,8 +257,7 @@ class SQLEngine(WorkspaceService):
             wait_timeout: API wait timeout value.
             catalog_name: Optional catalog override for API engine.
             schema_name: Optional schema override for API engine.
-
-            wait_result: Whether to block until completion (API engine).
+            wait: Whether to block until completion (API engine).

         Returns:
             StatementResult.
@@ -284,72 +275,44 @@ class SQLEngine(WorkspaceService):
             if spark_session is None:
                 raise ValueError("No spark session found to run sql query")

-            df: SparkDataFrame = spark_session.sql(statement)
-
-            if row_limit:
-                df = df.limit(row_limit)
-
             logger.debug(
-                "SPARK SQL
+                "SPARK SQL executing query:\n%s",
                 statement
             )

-
-
+            df: SparkDataFrame = spark_session.sql(statement)
+
+            if row_limit:
+                df = df.limit(row_limit)

             return StatementResult(
-
-
-
+                workspace_client=self.workspace.sdk(),
+                warehouse_id="SparkSQL",
+                statement_id="SparkSQL",
+                disposition=Disposition.EXTERNAL_LINKS,
                 _spark_df=df,
             )

-
-
-
-
-        if (disposition is None or disposition == Disposition.INLINE) and format in [Format.CSV, Format.ARROW_STREAM]:
-            disposition = Disposition.EXTERNAL_LINKS
-
-        if not statement:
-            full_name = self.table_full_name(catalog_name=catalog_name, schema_name=schema_name, table_name=table_name)
-            statement = f"SELECT * FROM {full_name}"
-
-        if not warehouse_id:
-            warehouse_id = self._get_or_default_warehouse_id()
+        wh = self.warehouse(
+            warehouse_id=warehouse_id,
+            warehouse_name=warehouse_name,
+        )

-
+        return wh.execute(
             statement=statement,
             warehouse_id=warehouse_id,
+            warehouse_name=warehouse_name,
             byte_limit=byte_limit,
             disposition=disposition,
             format=format,
             on_wait_timeout=on_wait_timeout,
             parameters=parameters,
-            row_limit=row_limit,
             wait_timeout=wait_timeout,
-
-
-
-
-        execution = StatementResult(
-            engine=self,
-            statement_id=response.statement_id,
-            _response=response,
-            disposition=disposition,
-        )
-
-        logger.info(
-            "API SQL executed statement '%s'",
-            execution.statement_id
-        )
-        logger.debug(
-            "API SQL executed query:\n%s",
-            statement
+            catalog_name=catalog_name,
+            schema_name=schema_name,
+            wait=wait
         )

-        return execution.wait() if wait_result else execution
-
     def spark_table(
         self,
         full_name: Optional[str] = None,
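With the rework above, `execute()` no longer builds the API request itself: the "api" path delegates to `SQLWarehouse.execute()` and forwards the new `warehouse_name` and `wait` arguments. A hedged sketch of a call against the new signature, reusing the `engine` from the earlier sketch (statement and warehouse name are placeholders):

```python
# Illustrative call against the new execute() signature shown above.
result = engine.execute(
    "SELECT 1 AS one",
    engine="api",                   # force the Statement Execution API path
    warehouse_name="shared-small",  # new in 0.1.64: resolve the warehouse by name
    wait=True,                      # WaitingConfigArg: block until completion
)
print(result.statement_id)          # StatementResult exposes the statement id
```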
@@ -412,7 +375,7 @@ class SQLEngine(WorkspaceService):
             None (mutates the destination table).
         """

-        if pyspark is not None:
+        if pyspark is not None or spark_session is not None:
             spark_session = SparkSession.getActiveSession() if spark_session is None else spark_session

         if spark_session is not None or isinstance(data, SparkDataFrame):
@@ -502,6 +465,7 @@ class SQLEngine(WorkspaceService):
         if existing_schema is None:
             try:
                 existing_schema = connected.get_table_schema(
+                    location=location,
                     catalog_name=catalog_name,
                     schema_name=schema_name,
                     table_name=table_name,
@@ -511,8 +475,7 @@ class SQLEngine(WorkspaceService):
                 data_tbl = convert(data, pa.Table)
                 existing_schema = data_tbl.schema
                 logger.warning(
-                    "
-                    location,
+                    "%s, creating it from input schema (columns=%s)",
                     exc,
                     existing_schema.names,
                 )
@@ -544,17 +507,18 @@ class SQLEngine(WorkspaceService):
                 except Exception:
                     logger.exception("Arrow insert failed after auto-creating %s; attempting cleanup (DROP TABLE)", location)
                     try:
-                        connected.drop_table(location=location)
+                        connected.drop_table(location=location, wait=True)
                     except Exception:
                         logger.exception("Failed to drop table %s after auto creation error", location)
                     raise

-
+        cast_options = CastOptions.check_arg(options=cast_options, target_field=existing_schema)
+
+        if isinstance(data, (pa.Table, pa.RecordBatch)):
+            data_tbl = cast_arrow_tabular(data, options=cast_options)
+        else:
+            data_tbl = convert(data, pa.Table, options=cast_options)

-        data_tbl = convert(
-            data, pa.Table,
-            options=cast_options, target_field=existing_schema
-        )
         num_rows = data_tbl.num_rows

         logger.debug(
@@ -567,14 +531,16 @@ class SQLEngine(WorkspaceService):
         )

         # Write in temp volume
-        temp_volume_path =
-
-
+        temp_volume_path = self.workspace.tmp_path(
+            catalog_name=catalog_name,
+            schema_name=schema_name,
+            volume_name="tmp",
+            extension="parquet",
+            max_lifetime=3600,
         ) if temp_volume_path is None else DatabricksPath.parse(obj=temp_volume_path, workspace=connected.workspace)

         logger.debug("Staging Parquet to temp volume: %s", temp_volume_path)
-        temp_volume_path.
-        temp_volume_path.write_arrow_table(data_tbl)
+        temp_volume_path.write_arrow_table(data_tbl, file_format=pds.ParquetFileFormat())

         columns = list(existing_schema.names)
         cols_quoted = ", ".join([f"`{c}`" for c in columns])
@@ -620,7 +586,12 @@ FROM parquet.`{temp_volume_path}`"""
|
|
|
620
586
|
connected.execute(stmt.strip())
|
|
621
587
|
finally:
|
|
622
588
|
try:
|
|
623
|
-
|
|
589
|
+
Thread(
|
|
590
|
+
target=temp_volume_path.remove,
|
|
591
|
+
kwargs={
|
|
592
|
+
"recursive": True
|
|
593
|
+
}
|
|
594
|
+
).start()
|
|
624
595
|
except Exception:
|
|
625
596
|
logger.exception("Failed cleaning temp volume: %s", temp_volume_path)
|
|
626
597
|
|
|
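The `finally` block above now removes the staging path on a background `Thread`, so the insert does not block on volume deletion. A generic sketch of that fire-and-forget cleanup pattern (the `remove(recursive=True)` callable is a placeholder for any idempotent delete):

```python
# Generic fire-and-forget cleanup, mirroring the pattern in the diff above.
from threading import Thread

def cleanup_async(path_like) -> None:
    """Start a background removal and return immediately."""
    Thread(
        target=path_like.remove,      # placeholder delete callable
        kwargs={"recursive": True},
        daemon=True,                  # assumption: do not block interpreter exit
    ).start()
```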
@@ -732,8 +703,6 @@ FROM parquet.`{temp_volume_path}`"""
|
|
|
732
703
|
cast_options = CastOptions.check_arg(options=cast_options, target_field=existing_schema)
|
|
733
704
|
data = cast_spark_dataframe(data, options=cast_options)
|
|
734
705
|
|
|
735
|
-
logger.debug("Incoming Spark columns: %s", data.columns)
|
|
736
|
-
|
|
737
706
|
if match_by:
|
|
738
707
|
notnull = None
|
|
739
708
|
for k in match_by:
|
|
@@ -788,6 +757,7 @@ FROM parquet.`{temp_volume_path}`"""
|
|
|
788
757
|
|
|
789
758
|
def get_table_schema(
|
|
790
759
|
self,
|
|
760
|
+
location: Optional[str] = None,
|
|
791
761
|
catalog_name: Optional[str] = None,
|
|
792
762
|
schema_name: Optional[str] = None,
|
|
793
763
|
table_name: Optional[str] = None,
|
|
@@ -796,6 +766,7 @@ FROM parquet.`{temp_volume_path}`"""
|
|
|
796
766
|
"""Fetch a table schema from Unity Catalog and convert it to Arrow types.
|
|
797
767
|
|
|
798
768
|
Args:
|
|
769
|
+
location: Optional Fully qualified location name
|
|
799
770
|
catalog_name: Optional catalog override.
|
|
800
771
|
schema_name: Optional schema override.
|
|
801
772
|
table_name: Optional table name override.
|
|
@@ -804,25 +775,44 @@ FROM parquet.`{temp_volume_path}`"""
|
|
|
804
775
|
Returns:
|
|
805
776
|
Arrow Schema or a STRUCT Field representing the table.
|
|
806
777
|
"""
|
|
807
|
-
|
|
778
|
+
location, catalog_name, schema_name, table_name = self._check_location_params(
|
|
779
|
+
location=location,
|
|
808
780
|
catalog_name=catalog_name,
|
|
809
781
|
schema_name=schema_name,
|
|
810
782
|
table_name=table_name,
|
|
811
783
|
safe_chars=False,
|
|
812
784
|
)
|
|
813
785
|
|
|
814
|
-
|
|
786
|
+
client = self.workspace.sdk().tables
|
|
815
787
|
|
|
816
788
|
try:
|
|
817
|
-
table =
|
|
789
|
+
table = client.get(location)
|
|
818
790
|
except Exception as e:
|
|
819
|
-
raise ValueError(f"Table %s not found, {type(e)} {e}" %
|
|
791
|
+
raise ValueError(f"Table %s not found, {type(e)} {e}" % location)
|
|
792
|
+
|
|
793
|
+
fields = [
|
|
794
|
+
column_info_to_arrow_field(_) for _ in table.columns
|
|
795
|
+
]
|
|
820
796
|
|
|
821
|
-
|
|
797
|
+
metadata = {
|
|
798
|
+
b"engine": b"databricks",
|
|
799
|
+
b"full_name": location,
|
|
800
|
+
b"catalog_name": catalog_name,
|
|
801
|
+
b"schema_name": schema_name,
|
|
802
|
+
b"table_name": table_name,
|
|
803
|
+
}
|
|
822
804
|
|
|
823
805
|
if to_arrow_schema:
|
|
824
|
-
return pa.schema(
|
|
825
|
-
|
|
806
|
+
return pa.schema(
|
|
807
|
+
fields,
|
|
808
|
+
metadata=metadata
|
|
809
|
+
)
|
|
810
|
+
|
|
811
|
+
return pa.field(
|
|
812
|
+
location,
|
|
813
|
+
pa.struct(fields),
|
|
814
|
+
metadata=metadata
|
|
815
|
+
)
|
|
826
816
|
|
|
827
817
|
def drop_table(
|
|
828
818
|
self,
|
|
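The rewritten `get_table_schema()` above accepts a fully qualified `location`, reads column metadata through the SDK tables client, and attaches the catalog/schema/table names as Arrow schema metadata. A hedged usage sketch, reusing the `engine` from the earlier sketch (the table name is illustrative):

```python
# Illustrative call to the extended get_table_schema() shown above.
schema = engine.get_table_schema(
    location="main.analytics.events",  # assumed table; any catalog.schema.table
    to_arrow_schema=True,              # return a pa.Schema instead of a struct field
)
print(schema.metadata[b"full_name"])   # e.g. b"main.analytics.events"
```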
@@ -830,6 +820,7 @@ FROM parquet.`{temp_volume_path}`"""
|
|
|
830
820
|
catalog_name: Optional[str] = None,
|
|
831
821
|
schema_name: Optional[str] = None,
|
|
832
822
|
table_name: Optional[str] = None,
|
|
823
|
+
wait: Optional[WaitingConfigArg] = True
|
|
833
824
|
):
|
|
834
825
|
"""Drop a table if it exists."""
|
|
835
826
|
location, _, _, _ = self._check_location_params(
|
|
@@ -839,13 +830,17 @@ FROM parquet.`{temp_volume_path}`"""
|
|
|
839
830
|
table_name=table_name,
|
|
840
831
|
safe_chars=True,
|
|
841
832
|
)
|
|
842
|
-
|
|
843
|
-
|
|
833
|
+
|
|
834
|
+
logger.debug("Dropping table if exists: %s", location)
|
|
835
|
+
|
|
836
|
+
self.execute(f"DROP TABLE IF EXISTS {location}", wait=wait)
|
|
837
|
+
|
|
838
|
+
logger.info("Dropped table if exists: %s", location)
|
|
844
839
|
|
|
845
840
|
def create_table(
|
|
846
841
|
self,
|
|
847
842
|
field: Union[pa.Field, pa.Schema],
|
|
848
|
-
|
|
843
|
+
full_name: Optional[str] = None, # e.g. catalog.schema.table
|
|
849
844
|
catalog_name: Optional[str] = None,
|
|
850
845
|
schema_name: Optional[str] = None,
|
|
851
846
|
table_name: Optional[str] = None,
|
|
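`drop_table()` above gains a `wait` argument that is forwarded to `execute()` along with the generated `DROP TABLE IF EXISTS` statement. A hedged example call, reusing the `engine` from the earlier sketch (table names are illustrative):

```python
# Illustrative drop_table() call using the new wait flag from the diff above.
engine.drop_table(
    catalog_name="main",
    schema_name="analytics",
    table_name="events_tmp",  # assumed temporary table
    wait=True,                # block until the DROP statement completes
)
```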
@@ -853,7 +848,7 @@ FROM parquet.`{temp_volume_path}`"""
|
|
|
853
848
|
partition_by: Optional[list[str]] = None,
|
|
854
849
|
cluster_by: Optional[bool | list[str]] = True,
|
|
855
850
|
comment: Optional[str] = None,
|
|
856
|
-
|
|
851
|
+
properties: Optional[dict[str, Any]] = None,
|
|
857
852
|
if_not_exists: bool = True,
|
|
858
853
|
or_replace: bool = False,
|
|
859
854
|
using: str = "DELTA",
|
|
@@ -886,7 +881,7 @@ FROM parquet.`{temp_volume_path}`"""
|
|
|
886
881
|
- If `pa.Schema`, all schema fields are used as columns.
|
|
887
882
|
- If `pa.Field` with struct type, its children become columns.
|
|
888
883
|
- If `pa.Field` non-struct, it becomes a single-column table.
|
|
889
|
-
|
|
884
|
+
full_name:
|
|
890
885
|
Fully-qualified table name, e.g. `"catalog.schema.table"`.
|
|
891
886
|
If provided, it takes precedence over `catalog_name`/`schema_name`/`table_name`.
|
|
892
887
|
Parts are quoted as needed.
|
|
@@ -906,7 +901,7 @@ FROM parquet.`{temp_volume_path}`"""
|
|
|
906
901
|
- list[str] -> emits `CLUSTER BY (<cols...>)` (all cols must exist in schema)
|
|
907
902
|
comment:
|
|
908
903
|
Optional table comment. If not provided and Arrow metadata contains `b"comment"`, that is used.
|
|
909
|
-
|
|
904
|
+
properties:
|
|
910
905
|
Additional/override Delta table properties (final say).
|
|
911
906
|
Example: `{"delta.enableChangeDataFeed": "true"}` or `{"delta.logRetentionDuration": "30 days"}`
|
|
912
907
|
if_not_exists:
|
|
@@ -973,19 +968,22 @@ FROM parquet.`{temp_volume_path}`"""
|
|
|
973
968
|
Examples
|
|
974
969
|
--------
|
|
975
970
|
Create a managed Delta table with auto clustering and auto column mapping:
|
|
976
|
-
>>> plan = client.create_table(schema,
|
|
971
|
+
>>> plan = client.create_table(schema, full_name="main.analytics.events", execute=False, return_plan=True)
|
|
977
972
|
>>> print(plan.sql)
|
|
978
973
|
|
|
979
974
|
External table with explicit partitioning and CDF:
|
|
980
975
|
>>> client.create_table(
|
|
981
976
|
... schema,
|
|
982
|
-
...
|
|
977
|
+
... full_name="main.analytics.events",
|
|
983
978
|
... storage_location="abfss://.../events",
|
|
984
979
|
... partition_by=["event_date"],
|
|
985
980
|
... enable_cdf=True,
|
|
986
981
|
... )
|
|
987
982
|
"""
|
|
988
983
|
|
|
984
|
+
if not isinstance(field, (pa.Field, pa.Schema)):
|
|
985
|
+
field = convert(field, pa.Field)
|
|
986
|
+
|
|
989
987
|
# ---- Normalize Arrow input ----
|
|
990
988
|
if isinstance(field, pa.Schema):
|
|
991
989
|
arrow_fields = list(field)
|
|
@@ -998,16 +996,13 @@ FROM parquet.`{temp_volume_path}`"""
|
|
|
998
996
|
else:
|
|
999
997
|
arrow_fields = [field]
|
|
1000
998
|
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
# If caller passes raw "cat.schema.table", quote each part safely
|
|
1009
|
-
parts = table_fqn.split(".")
|
|
1010
|
-
table_fqn = ".".join(_quote_ident(p) for p in parts)
|
|
999
|
+
full_name, catalog_name, schema_name, table_name = self._check_location_params(
|
|
1000
|
+
location=full_name,
|
|
1001
|
+
catalog_name=catalog_name,
|
|
1002
|
+
schema_name=schema_name,
|
|
1003
|
+
table_name=table_name,
|
|
1004
|
+
safe_chars=True
|
|
1005
|
+
)
|
|
1011
1006
|
|
|
1012
1007
|
# ---- Comments ----
|
|
1013
1008
|
if comment is None and schema_metadata:
|
|
@@ -1051,7 +1046,7 @@ FROM parquet.`{temp_volume_path}`"""
|
|
|
1051
1046
|
create_kw = "CREATE TABLE IF NOT EXISTS"
|
|
1052
1047
|
|
|
1053
1048
|
sql_parts: list[str] = [
|
|
1054
|
-
f"{create_kw} {
|
|
1049
|
+
f"{create_kw} {full_name} (",
|
|
1055
1050
|
" " + ",\n ".join(column_definitions),
|
|
1056
1051
|
")",
|
|
1057
1052
|
f"USING {using}",
|
|
@@ -1096,8 +1091,8 @@ FROM parquet.`{temp_volume_path}`"""
|
|
|
1096
1091
|
pass
|
|
1097
1092
|
|
|
1098
1093
|
# Let caller override anything (final say)
|
|
1099
|
-
if
|
|
1100
|
-
props.update(
|
|
1094
|
+
if properties:
|
|
1095
|
+
props.update(properties)
|
|
1101
1096
|
|
|
1102
1097
|
if any_invalid and column_mapping_mode == "none":
|
|
1103
1098
|
warnings.append(
|
|
@@ -1105,6 +1100,11 @@ FROM parquet.`{temp_volume_path}`"""
|
|
|
1105
1100
|
"This will fail unless you rename/escape columns."
|
|
1106
1101
|
)
|
|
1107
1102
|
|
|
1103
|
+
default_tags = self.workspace.default_tags()
|
|
1104
|
+
|
|
1105
|
+
for k, v in default_tags.items():
|
|
1106
|
+
props[f"tags.{k}"] = v
|
|
1107
|
+
|
|
1108
1108
|
if props:
|
|
1109
1109
|
def fmt(k: str, v: Any) -> str:
|
|
1110
1110
|
if isinstance(v, str):
|
|
@@ -1122,7 +1122,7 @@ FROM parquet.`{temp_volume_path}`"""
|
|
|
1122
1122
|
if not execute:
|
|
1123
1123
|
return plan if return_plan else statement
|
|
1124
1124
|
|
|
1125
|
-
res = self.execute(statement,
|
|
1125
|
+
res = self.execute(statement, wait=wait_result)
|
|
1126
1126
|
plan.result = res
|
|
1127
1127
|
return plan if return_plan else res
|
|
1128
1128
|
|