ygg 0.1.33__py3-none-any.whl → 0.1.35__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ygg-0.1.33.dist-info → ygg-0.1.35.dist-info}/METADATA +1 -1
- {ygg-0.1.33.dist-info → ygg-0.1.35.dist-info}/RECORD +15 -13
- yggdrasil/databricks/compute/cluster.py +134 -64
- yggdrasil/databricks/compute/execution_context.py +7 -4
- yggdrasil/databricks/compute/remote.py +31 -13
- yggdrasil/databricks/sql/engine.py +314 -324
- yggdrasil/databricks/workspaces/workspace.py +12 -1
- yggdrasil/pyutils/callable_serde.py +27 -2
- yggdrasil/pyutils/equality.py +107 -0
- yggdrasil/pyutils/expiring_dict.py +176 -0
- yggdrasil/version.py +1 -1
- {ygg-0.1.33.dist-info → ygg-0.1.35.dist-info}/WHEEL +0 -0
- {ygg-0.1.33.dist-info → ygg-0.1.35.dist-info}/entry_points.txt +0 -0
- {ygg-0.1.33.dist-info → ygg-0.1.35.dist-info}/licenses/LICENSE +0 -0
- {ygg-0.1.33.dist-info → ygg-0.1.35.dist-info}/top_level.txt +0 -0
yggdrasil/databricks/sql/engine.py

@@ -1,4 +1,15 @@
-"""Databricks SQL engine utilities and helpers.
+"""Databricks SQL engine utilities and helpers.
+
+This module provides a thin “do the right thing” layer over:
+- Databricks SQL Statement Execution API (warehouse)
+- Spark SQL / Delta Lake (when running inside a Spark-enabled context)
+
+It includes helpers to:
+- Build fully-qualified table names
+- Execute SQL via Spark or Databricks SQL API
+- Insert Arrow/Spark data into Delta tables (append/overwrite/merge)
+- Generate DDL from Arrow schemas
+"""

 import dataclasses
 import logging
@@ -8,7 +19,6 @@ import time
 from typing import Optional, Union, Any, Dict, List, Literal

 import pyarrow as pa
-import pyarrow.parquet as pq

 from .statement_result import StatementResult
 from .types import column_info_to_arrow_field
@@ -28,7 +38,6 @@ except ImportError:
         @classmethod
         def forName(cls, *args, **kwargs):
             from delta.tables import DeltaTable
-
             return DeltaTable.forName(*args, **kwargs)


@@ -37,23 +46,18 @@ if databricks_sdk is not None:
         StatementResponse, Disposition, Format,
         ExecuteStatementRequestOnWaitTimeout, StatementParameterListItem
     )
-
     StatementResponse = StatementResponse
 else:
-    class StatementResponse:
+    class StatementResponse: # pragma: no cover
         pass


 logger = logging.getLogger(__name__)

-
 if pyspark is not None:
     import pyspark.sql.functions as F

-__all__ = [
-    "SQLEngine",
-    "StatementResult"
-]
+__all__ = ["SQLEngine", "StatementResult"]


 class SqlExecutionError(RuntimeError):
@@ -62,7 +66,7 @@ class SqlExecutionError(RuntimeError):

 @dataclasses.dataclass
 class SQLEngine(WorkspaceService):
-    """Execute SQL statements and manage tables via Databricks."""
+    """Execute SQL statements and manage tables via Databricks SQL / Spark."""
     warehouse_id: Optional[str] = None
     catalog_name: Optional[str] = None
     schema_name: Optional[str] = None
@@ -72,18 +76,18 @@ class SQLEngine(WorkspaceService):
         catalog_name: Optional[str] = None,
         schema_name: Optional[str] = None,
         table_name: Optional[str] = None,
-        safe_chars: bool = True
-    ):
-        """Build a fully qualified table name
+        safe_chars: bool = True,
+    ) -> str:
+        """Build a fully qualified table name (catalog.schema.table).

         Args:
-            catalog_name: Optional catalog override.
-            schema_name: Optional schema override.
+            catalog_name: Optional catalog override (defaults to engine.catalog_name).
+            schema_name: Optional schema override (defaults to engine.schema_name).
             table_name: Table name to qualify.
-            safe_chars: Whether to wrap
+            safe_chars: Whether to wrap each identifier in backticks.

         Returns:
-
+            Fully qualified table name string.
         """
         catalog_name = catalog_name or self.catalog_name
         schema_name = schema_name or self.schema_name
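For reference, the qualification rule described above can be restated as a minimal standalone sketch (this is not the package's code; catalog, schema, and table names are placeholders):

    # Illustrative sketch mirroring the table_full_name behavior shown in the hunk above.
    def full_table_name(catalog_name: str, schema_name: str, table_name: str, safe_chars: bool = True) -> str:
        # With safe_chars, each identifier is wrapped in backticks.
        if safe_chars:
            return f"`{catalog_name}`.`{schema_name}`.`{table_name}`"
        return f"{catalog_name}.{schema_name}.{table_name}"

    print(full_table_name("my_catalog", "my_schema", "events"))
    # `my_catalog`.`my_schema`.`events`
    print(full_table_name("my_catalog", "my_schema", "events", safe_chars=False))
    # my_catalog.my_schema.events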
@@ -96,21 +100,23 @@ class SQLEngine(WorkspaceService):
             return f"`{catalog_name}`.`{schema_name}`.`{table_name}`"
         return f"{catalog_name}.{schema_name}.{table_name}"

-    def _catalog_schema_table_names(
-        self,
-        full_name: str,
-    ):
+    def _catalog_schema_table_names(self, full_name: str) -> tuple[Optional[str], Optional[str], Optional[str]]:
         """Parse a catalog.schema.table string into components.

+        Supports partial names:
+        - table
+        - schema.table
+        - catalog.schema.table
+
+        Backticks are stripped.
+
         Args:
-            full_name:
+            full_name: Fully qualified or partial table name.

         Returns:
-
+            Tuple of (catalog_name, schema_name, table_name).
         """
-        parts = [
-            _.strip("`") for _ in full_name.split(".")
-        ]
+        parts = [_.strip("`") for _ in full_name.split(".")]

         if len(parts) == 0:
             return self.catalog_name, self.schema_name, None
@@ -122,20 +128,20 @@ class SQLEngine(WorkspaceService):
         catalog_name, schema_name, table_name = parts[-3], parts[-2], parts[-1]
         catalog_name = catalog_name or self.catalog_name
         schema_name = schema_name or self.schema_name
-
         return catalog_name, schema_name, table_name

-    def _default_warehouse(
-        self,
-        cluster_size: str = "Small"
-    ):
-        """Return a default SQL warehouse matching the desired size.
+    def _default_warehouse(self, cluster_size: str = "Small"):
+        """Pick a default SQL warehouse (best-effort) matching the desired size.

         Args:
-            cluster_size: Desired warehouse size
+            cluster_size: Desired warehouse size (Databricks "cluster_size"), e.g. "Small".
+                If empty/None, returns the first warehouse encountered.

         Returns:
-
+            Warehouse object.
+
+        Raises:
+            ValueError: If no warehouses exist in the workspace.
         """
         wk = self.workspace.sdk()
         existing = list(wk.warehouses.list())
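To illustrate the name parsing described above, here is a standalone sketch (again, not the package's code). The one- and two-part branches are not visible in this diff; the sketch assumes they fall back to the engine's configured defaults, as the new docstring suggests, with placeholder defaults standing in for those values:

    # Illustrative sketch of the split/strip rules; DEFAULT_CATALOG and DEFAULT_SCHEMA are placeholders.
    from typing import Optional, Tuple

    DEFAULT_CATALOG = "main"
    DEFAULT_SCHEMA = "default"

    def split_table_name(full_name: str) -> Tuple[Optional[str], Optional[str], Optional[str]]:
        parts = [p.strip("`") for p in full_name.split(".")]
        if len(parts) == 1:
            # table only -> assumed fallback to configured catalog and schema
            return DEFAULT_CATALOG, DEFAULT_SCHEMA, parts[0]
        if len(parts) == 2:
            # schema.table -> assumed fallback to configured catalog
            return DEFAULT_CATALOG, parts[0], parts[1]
        # catalog.schema.table: the last three parts win, as in the hunk above
        return parts[-3], parts[-2], parts[-1]

    print(split_table_name("`my_catalog`.`my_schema`.`events`"))
    # ('my_catalog', 'my_schema', 'events')
    print(split_table_name("events"))
    # ('main', 'default', 'events')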
@@ -146,48 +152,54 @@ class SQLEngine(WorkspaceService):
                 first = warehouse

             if cluster_size:
-                if warehouse
+                if getattr(warehouse, "cluster_size", None) == cluster_size:
+                    logger.debug("Default warehouse match found: id=%s cluster_size=%s", warehouse.id, warehouse.cluster_size)
                     return warehouse
             else:
+                logger.debug("Default warehouse selected (first): id=%s", warehouse.id)
                 return warehouse

         if first is not None:
+            logger.info(
+                "No warehouse matched cluster_size=%s; falling back to first warehouse id=%s cluster_size=%s",
+                cluster_size,
+                getattr(first, "id", None),
+                getattr(first, "cluster_size", None),
+            )
             return first

         raise ValueError(f"No default warehouse found in {wk.config.host}")

-    def _get_or_default_warehouse_id(
-        self,
-        cluster_size = "Small"
-    ):
-        """Return the configured warehouse id or a default one.
+    def _get_or_default_warehouse_id(self, cluster_size: str = "Small") -> str:
+        """Return configured warehouse_id or resolve a default one.

         Args:
-            cluster_size: Desired warehouse size filter.
+            cluster_size: Desired warehouse size filter used when resolving defaults.

         Returns:
-
+            Warehouse id string.
         """
         if not self.warehouse_id:
             dft = self._default_warehouse(cluster_size=cluster_size)
-
             self.warehouse_id = dft.id
+            logger.info("Resolved default warehouse_id=%s (cluster_size=%s)", self.warehouse_id, cluster_size)
+
         return self.warehouse_id

     @staticmethod
     def _random_suffix(prefix: str = "") -> str:
-        """Generate a unique suffix for temporary resources.
-
-        Args:
-            prefix: Optional prefix to prepend.
-
-        Returns:
-            A unique suffix string.
-        """
-        unique = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(8))
+        """Generate a unique suffix for temporary resources."""
+        unique = "".join(random.choice(string.ascii_lowercase + string.digits) for _ in range(8))
         timestamp = int(time.time() * 1000)
         return f"{prefix}{timestamp}_{unique}"

+    @staticmethod
+    def _sql_preview(sql: str, limit: int = 220) -> str:
+        """Short, single-line preview for logs (avoids spewing giant SQL)."""
+        if not sql:
+            return ""
+        return sql[:limit] + ("…" if len(sql) > limit else "")
+
     def execute(
         self,
         statement: Optional[str] = None,
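The new `_sql_preview` helper above is small enough to restate as a self-contained sketch, which makes its truncation behavior easy to check:

    # Standalone restatement of the _sql_preview logic shown above.
    def sql_preview(sql: str, limit: int = 220) -> str:
        if not sql:
            return ""
        return sql[:limit] + ("…" if len(sql) > limit else "")

    short_sql = "SELECT 1"
    long_sql = "SELECT " + ", ".join(f"col_{i}" for i in range(200))

    assert sql_preview(short_sql) == "SELECT 1"   # short statements pass through untouched
    assert sql_preview(long_sql).endswith("…")    # long ones are cut at `limit` characters
    assert len(sql_preview(long_sql)) == 221      # 220 characters plus the ellipsis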
@@ -205,56 +217,67 @@ class SQLEngine(WorkspaceService):
         schema_name: Optional[str] = None,
         table_name: Optional[str] = None,
         wait_result: bool = True,
-        **kwargs,
     ) -> "StatementResult":
-        """
-        Execute a SQL statement on a SQL warehouse.
+        """Execute a SQL statement via Spark or Databricks SQL Statement Execution API.

-
-
-
-
+        Engine resolution:
+        - If `engine` is not provided and a Spark session is active -> uses Spark.
+        - Otherwise uses Databricks SQL API (warehouse).
+
+        Waiting behavior (`wait_result`):
+        - If True (default): returns a StatementResult in terminal state (SUCCEEDED/FAILED/CANCELED).
+        - If False: returns immediately with the initial handle (caller can `.wait()` later).

         Args:
-            statement: SQL statement to execute. If
-            engine:
-            warehouse_id:
+            statement: SQL statement to execute. If None, a `SELECT *` is generated from the table params.
+            engine: "spark" or "api".
+            warehouse_id: Warehouse override (for API engine).
             byte_limit: Optional byte limit for results.
-            disposition: Result disposition mode.
-            format: Result format
-            on_wait_timeout: Timeout behavior for waiting.
-            parameters: Optional statement parameters.
-            row_limit: Optional row limit.
-            wait_timeout:
-            catalog_name: Optional catalog override.
-            schema_name: Optional schema override.
-            table_name: Optional table
-            wait_result: Whether to block until completion.
-            **kwargs: Additional API parameters.
+            disposition: Result disposition mode (API engine).
+            format: Result format (API engine).
+            on_wait_timeout: Timeout behavior for waiting (API engine).
+            parameters: Optional statement parameters (API engine).
+            row_limit: Optional row limit for results (API engine).
+            wait_timeout: API wait timeout value.
+            catalog_name: Optional catalog override for API engine.
+            schema_name: Optional schema override for API engine.
+            table_name: Optional table override used when `statement` is None.
+            wait_result: Whether to block until completion (API engine).

         Returns:
-
+            StatementResult.
         """
+        # --- Engine auto-detection ---
         if not engine:
             if pyspark is not None:
                 spark_session = SparkSession.getActiveSession()
-
                 if spark_session is not None:
                     engine = "spark"

+        # --- Spark path ---
         if engine == "spark":
             spark_session = SparkSession.getActiveSession()
-
             if spark_session is None:
                 raise ValueError("No spark session found to run sql query")

+            df: SparkDataFrame = spark_session.sql(statement)
+
+            if row_limit:
+                df = df.limit(row_limit)
+
+            logger.info("Spark SQL executed: %s", self._sql_preview(statement))
+
+            # Avoid Disposition dependency if SDK imports are absent
+            spark_disp = disposition if disposition is not None else getattr(globals().get("Disposition", object), "EXTERNAL_LINKS", None)
+
             return StatementResult(
                 engine=self,
                 statement_id="sparksql",
-                disposition=
-                _spark_df=
+                disposition=spark_disp,
+                _spark_df=df,
             )

+        # --- API path defaults ---
         if format is None:
             format = Format.ARROW_STREAM

@@ -280,7 +303,6 @@ class SQLEngine(WorkspaceService):
             wait_timeout=wait_timeout,
             catalog=catalog_name or self.catalog_name,
             schema=schema_name or self.schema_name,
-            **kwargs,
         )

         execution = StatementResult(
@@ -288,10 +310,15 @@ class SQLEngine(WorkspaceService):
             statement_id=response.statement_id,
             _response=response,
             _response_refresh_time=time.time(),
-            disposition=disposition
+            disposition=disposition,
+        )
+
+        logger.info(
+            "API SQL executed: %s",
+            self._sql_preview(statement)
         )

-        return execution.wait() if wait_result else
+        return execution.wait() if wait_result else execution

     def spark_table(
         self,
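As a usage sketch of the behavior described above (assuming an already-configured `SQLEngine` instance named `engine`; how the engine and workspace are constructed is outside this diff), the blocking and non-blocking paths might be driven like this:

    # Hypothetical usage of SQLEngine.execute; `engine` is an assumed, pre-configured instance.

    # Blocking call: returns a StatementResult in a terminal state.
    result = engine.execute("SELECT current_date() AS today")

    # Non-blocking call: returns immediately with the initial handle,
    # which the caller can wait on later.
    pending = engine.execute(
        "SELECT * FROM `my_catalog`.`my_schema`.`events`",
        row_limit=1_000,
        wait_result=False,
    )
    finished = pending.wait()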
@@ -300,35 +327,21 @@ class SQLEngine(WorkspaceService):
         schema_name: Optional[str] = None,
         table_name: Optional[str] = None,
     ):
-        """Return a DeltaTable handle for a given table name.
-
-        Args:
-            full_name: Fully qualified table name.
-            catalog_name: Optional catalog override.
-            schema_name: Optional schema override.
-            table_name: Optional table name override.
-
-        Returns:
-            A Spark DeltaTable handle.
-        """
+        """Return a DeltaTable handle for a given table name (Spark context required)."""
         if not full_name:
             full_name = self.table_full_name(
                 catalog_name=catalog_name,
                 schema_name=schema_name,
-                table_name=table_name
+                table_name=table_name,
             )
-
         return SparkDeltaTable.forName(
             sparkSession=SparkSession.getActiveSession(),
-            tableOrViewName=full_name
+            tableOrViewName=full_name,
         )

     def insert_into(
         self,
-        data: Union[
-            pa.Table, pa.RecordBatch, pa.RecordBatchReader,
-            SparkDataFrame
-        ],
+        data: Union[pa.Table, pa.RecordBatch, pa.RecordBatchReader, SparkDataFrame],
         location: Optional[str] = None,
         catalog_name: Optional[str] = None,
         schema_name: Optional[str] = None,
@@ -336,14 +349,18 @@ class SQLEngine(WorkspaceService):
         mode: str = "auto",
         cast_options: Optional[CastOptions] = None,
         overwrite_schema: bool | None = None,
-        match_by: list[str] = None,
-        zorder_by: list[str] = None,
+        match_by: Optional[list[str]] = None,
+        zorder_by: Optional[list[str]] = None,
         optimize_after_merge: bool = False,
-        vacuum_hours: int | None = None,
+        vacuum_hours: int | None = None,
         spark_session: Optional[SparkSession] = None,
-        spark_options: Optional[Dict[str, Any]] = None
+        spark_options: Optional[Dict[str, Any]] = None,
     ):
-        """Insert data into a table using Spark
+        """Insert data into a Delta table using Spark when available; otherwise stage Arrow.
+
+        Strategy:
+        - If Spark is available and we have an active session (or Spark DF input) -> use `spark_insert_into`.
+        - Otherwise -> use `arrow_insert_into` (stages Parquet to a temp volume + runs SQL INSERT/MERGE).

         Args:
             data: Arrow or Spark data to insert.
@@ -353,18 +370,18 @@ class SQLEngine(WorkspaceService):
             table_name: Optional table name override.
             mode: Insert mode ("auto", "append", "overwrite").
             cast_options: Optional casting options.
-            overwrite_schema: Whether to overwrite schema (Spark).
-            match_by:
-            zorder_by:
-            optimize_after_merge: Whether to run OPTIMIZE after merge.
+            overwrite_schema: Whether to overwrite schema (Spark path).
+            match_by: Merge keys for upserts (MERGE semantics). When set, mode affects behavior.
+            zorder_by: Z-ORDER columns (SQL path uses OPTIMIZE ZORDER; Spark path uses Delta optimize API).
+            optimize_after_merge: Whether to run OPTIMIZE after a merge (SQL path) / after merge+zorder (Spark path).
             vacuum_hours: Optional VACUUM retention window.
             spark_session: Optional SparkSession override.
             spark_options: Optional Spark write options.

         Returns:
-            None
+            None (mutates the destination table).
         """
-
+
         if pyspark is not None:
             spark_session = SparkSession.getActiveSession() if spark_session is None else spark_session

@@ -382,7 +399,7 @@ class SQLEngine(WorkspaceService):
             zorder_by=zorder_by,
             optimize_after_merge=optimize_after_merge,
             vacuum_hours=vacuum_hours,
-            spark_options=spark_options
+            spark_options=spark_options,
         )

         return self.arrow_insert_into(
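A hypothetical call illustrating the dispatch described above (again assuming a configured `engine`; table and column names are placeholders):

    import pyarrow as pa

    # Placeholder batch to upsert; in a real pipeline this comes from upstream.
    batch = pa.table({
        "id": pa.array([1, 2, 3], type=pa.int64()),
        "status": pa.array(["new", "new", "done"]),
    })

    # With no active Spark session this should take the Arrow path
    # (stage Parquet to a temp volume, then MERGE because match_by is set);
    # inside a Spark-enabled context the same call goes through spark_insert_into.
    engine.insert_into(
        batch,
        catalog_name="my_catalog",
        schema_name="my_schema",
        table_name="events",
        match_by=["id"],      # upsert on the id key
        zorder_by=["id"],     # optimize layout for id lookups afterwards
        vacuum_hours=168,     # optional VACUUM retention window
    )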
@@ -402,9 +419,7 @@ class SQLEngine(WorkspaceService):

     def arrow_insert_into(
         self,
-        data: Union[
-            pa.Table, pa.RecordBatch, pa.RecordBatchReader,
-        ],
+        data: Union[pa.Table, pa.RecordBatch, pa.RecordBatchReader],
         location: Optional[str] = None,
         catalog_name: Optional[str] = None,
         schema_name: Optional[str] = None,
@@ -412,14 +427,19 @@ class SQLEngine(WorkspaceService):
         mode: str = "auto",
         cast_options: Optional[CastOptions] = None,
         overwrite_schema: bool | None = None,
-        match_by: list[str] = None,
-        zorder_by: list[str] = None,
+        match_by: Optional[list[str]] = None,
+        zorder_by: Optional[list[str]] = None,
         optimize_after_merge: bool = False,
-        vacuum_hours: int | None = None,
+        vacuum_hours: int | None = None,
         existing_schema: pa.Schema | None = None,
-        temp_volume_path: Optional[Union[str, DatabricksPath]] = None
+        temp_volume_path: Optional[Union[str, DatabricksPath]] = None,
     ):
-        """Insert Arrow data by staging to a temp volume and running SQL.
+        """Insert Arrow data by staging Parquet to a temp volume and running Databricks SQL.
+
+        Notes:
+        - If the table does not exist, it is created from the input Arrow schema (best-effort).
+        - If `match_by` is provided, uses MERGE INTO (upsert).
+        - Otherwise uses INSERT INTO / INSERT OVERWRITE depending on mode.

         Args:
             data: Arrow table/batch data to insert.
@@ -427,14 +447,14 @@ class SQLEngine(WorkspaceService):
             catalog_name: Optional catalog override.
             schema_name: Optional schema override.
             table_name: Optional table name override.
-            mode: Insert mode ("auto", "append", "overwrite").
+            mode: Insert mode ("auto", "append", "overwrite"). ("auto" behaves like append here.)
             cast_options: Optional casting options.
-            overwrite_schema:
-            match_by:
-            zorder_by:
-            optimize_after_merge:
-            vacuum_hours: Optional VACUUM retention window.
-            existing_schema: Optional pre-fetched schema.
+            overwrite_schema: Reserved for parity with Spark path (unused here).
+            match_by: Merge keys for MERGE INTO upserts.
+            zorder_by: Columns for OPTIMIZE ZORDER BY.
+            optimize_after_merge: Run OPTIMIZE after MERGE (in addition to ZORDER optimization).
+            vacuum_hours: Optional VACUUM retention window in hours.
+            existing_schema: Optional pre-fetched destination schema (Arrow).
             temp_volume_path: Optional temp volume path override.

         Returns:
@@ -445,26 +465,26 @@ class SQLEngine(WorkspaceService):
             catalog_name=catalog_name,
             schema_name=schema_name,
             table_name=table_name,
-            safe_chars=True
+            safe_chars=True,
         )

-        with self as connected:
+        with self.connect() as connected:
             if existing_schema is None:
                 try:
                     existing_schema = connected.get_table_schema(
                         catalog_name=catalog_name,
                         schema_name=schema_name,
                         table_name=table_name,
-                        to_arrow_schema=True
+                        to_arrow_schema=True,
                     )
                 except ValueError as exc:
-
-                    existing_schema =
+                    data_tbl = convert(data, pa.Table)
+                    existing_schema = data_tbl.schema
                     logger.warning(
-                        "Table %s not found
+                        "Table %s not found (%s). Creating it from input schema (columns=%s)",
                         location,
                         exc,
-                        existing_schema.names
+                        existing_schema.names,
                     )

                     connected.create_table(
@@ -472,12 +492,12 @@ class SQLEngine(WorkspaceService):
                         catalog_name=catalog_name,
                         schema_name=schema_name,
                         table_name=table_name,
-                        if_not_exists=True
+                        if_not_exists=True,
                     )

                     try:
                         return connected.arrow_insert_into(
-                            data=
+                            data=data_tbl,
                             location=location,
                             catalog_name=catalog_name,
                             schema_name=schema_name,
@@ -489,54 +509,62 @@ class SQLEngine(WorkspaceService):
                             zorder_by=zorder_by,
                             optimize_after_merge=optimize_after_merge,
                             vacuum_hours=vacuum_hours,
-                            existing_schema=existing_schema
+                            existing_schema=existing_schema,
                         )
-                    except:
+                    except Exception:
+                        logger.exception("Arrow insert failed after auto-creating %s; attempting cleanup (DROP TABLE)", location)
                         try:
                             connected.drop_table(location=location)
-                        except Exception
-                            logger.
+                        except Exception:
+                            logger.exception("Failed to drop table %s after auto creation error", location)
                         raise

             transaction_id = self._random_suffix()

-
+            data_tbl = convert(
+                data, pa.Table,
+                options=cast_options, target_field=existing_schema
+            )
+            num_rows = data_tbl.num_rows
+
+            logger.debug(
+                "Arrow inserting %s rows into %s (mode=%s, match_by=%s, zorder_by=%s)",
+                num_rows,
+                location,
+                mode,
+                match_by,
+                zorder_by,
+            )

             # Write in temp volume
             temp_volume_path = connected.dbfs_path(
                 kind=DatabricksPathKind.VOLUME,
-                parts=[catalog_name, schema_name, "tmp", "sql", transaction_id]
+                parts=[catalog_name, schema_name, "tmp", "sql", transaction_id],
             ) if temp_volume_path is None else DatabricksPath.parse(obj=temp_volume_path, workspace=connected.workspace)

+            logger.debug("Staging Parquet to temp volume: %s", temp_volume_path)
             temp_volume_path.mkdir()
+            temp_volume_path.write_arrow_table(data_tbl)

-
-
-            # get column list from arrow schema
-            columns = [c for c in existing_schema.names]
+            columns = list(existing_schema.names)
             cols_quoted = ", ".join([f"`{c}`" for c in columns])

-            statements = []
+            statements: list[str] = []

-            # Decide how to ingest
-            # If merge keys provided -> use MERGE
             if match_by:
-
-                on_clauses = []
-                for k in match_by:
-                    on_clauses.append(f"T.`{k}` = S.`{k}`")
-                on_condition = " AND ".join(on_clauses)
+                on_condition = " AND ".join([f"T.`{k}` = S.`{k}`" for k in match_by])

-                # build UPDATE set (all columns except match_by)
                 update_cols = [c for c in columns if c not in match_by]
                 if update_cols:
                     update_set = ", ".join([f"T.`{c}` = S.`{c}`" for c in update_cols])
                     update_clause = f"WHEN MATCHED THEN UPDATE SET {update_set}"
                 else:
-                    update_clause = ""
+                    update_clause = ""

-
-
+                insert_clause = (
+                    f"WHEN NOT MATCHED THEN INSERT ({cols_quoted}) "
+                    f"VALUES ({', '.join([f'S.`{c}`' for c in columns])})"
+                )

                 merge_sql = f"""MERGE INTO {location} AS T
 USING (
@@ -546,41 +574,47 @@ ON {on_condition}
 {update_clause}
 {insert_clause}"""
                 statements.append(merge_sql)
-
             else:
-                # No match_by -> plain insert
                 if mode.lower() in ("overwrite",):
                     insert_sql = f"""INSERT OVERWRITE {location}
 SELECT {cols_quoted}
 FROM parquet.`{temp_volume_path}`"""
                 else:
-                    # default: append
                     insert_sql = f"""INSERT INTO {location} ({cols_quoted})
 SELECT {cols_quoted}
 FROM parquet.`{temp_volume_path}`"""
                 statements.append(insert_sql)

-            # Execute statements (use your existing execute helper)
             try:
                 for stmt in statements:
-                    # trim and run
                     connected.execute(stmt.strip())
             finally:
                 try:
                     temp_volume_path.rmdir(recursive=True)
-                except Exception
-                    logger.
+                except Exception:
+                    logger.exception("Failed cleaning temp volume: %s", temp_volume_path)
+
+            logger.info(
+                "Arrow inserted %s rows into %s (mode=%s, match_by=%s, zorder_by=%s)",
+                num_rows,
+                location,
+                mode,
+                match_by,
+                zorder_by,
+            )

-            # Optionally run OPTIMIZE / ZORDER / VACUUM if requested (Databricks SQL)
             if zorder_by:
                 zcols = ", ".join([f"`{c}`" for c in zorder_by])
                 optimize_sql = f"OPTIMIZE {location} ZORDER BY ({zcols})"
+                logger.info("Running OPTIMIZE ZORDER BY: %s", zorder_by)
                 connected.execute(optimize_sql)

             if optimize_after_merge and match_by:
+                logger.info("Running OPTIMIZE after MERGE")
                 connected.execute(f"OPTIMIZE {location}")

             if vacuum_hours is not None:
+                logger.info("Running VACUUM retain=%s hours", vacuum_hours)
                 connected.execute(f"VACUUM {location} RETAIN {vacuum_hours} HOURS")

             return None
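To make the generated statements above concrete, here is a standalone sketch that assembles the ON / WHEN MATCHED / WHEN NOT MATCHED clauses the same way the hunk shows (the USING subquery that selects from the staged Parquet files is elided here, as it is in the hunk; columns and keys are placeholders):

    # Illustrative sketch of the MERGE clause assembly shown above.
    columns = ["id", "status", "updated_at"]
    match_by = ["id"]

    cols_quoted = ", ".join(f"`{c}`" for c in columns)
    on_condition = " AND ".join(f"T.`{k}` = S.`{k}`" for k in match_by)

    update_cols = [c for c in columns if c not in match_by]
    update_set = ", ".join(f"T.`{c}` = S.`{c}`" for c in update_cols)
    update_clause = f"WHEN MATCHED THEN UPDATE SET {update_set}" if update_cols else ""

    insert_clause = (
        f"WHEN NOT MATCHED THEN INSERT ({cols_quoted}) "
        f"VALUES ({', '.join(f'S.`{c}`' for c in columns)})"
    )

    print(on_condition)
    # T.`id` = S.`id`
    print(update_clause)
    # WHEN MATCHED THEN UPDATE SET T.`status` = S.`status`, T.`updated_at` = S.`updated_at`
    print(insert_clause)
    # WHEN NOT MATCHED THEN INSERT (`id`, `status`, `updated_at`) VALUES (S.`id`, S.`status`, S.`updated_at`)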
@@ -596,13 +630,20 @@ FROM parquet.`{temp_volume_path}`"""
         mode: str = "auto",
         cast_options: Optional[CastOptions] = None,
         overwrite_schema: bool | None = None,
-        match_by: list[str] = None,
-        zorder_by: list[str] = None,
+        match_by: Optional[list[str]] = None,
+        zorder_by: Optional[list[str]] = None,
         optimize_after_merge: bool = False,
-        vacuum_hours: int | None = None,
+        vacuum_hours: int | None = None,
         spark_options: Optional[Dict[str, Any]] = None,
     ):
-        """Insert a Spark DataFrame into a Delta table
+        """Insert a Spark DataFrame into a Delta table (append/overwrite/merge).
+
+        Behavior:
+        - If the table does not exist: creates it via `saveAsTable(location)` (overwrite).
+        - If `match_by` is provided: uses Delta MERGE for upserts.
+          - If mode == "overwrite": deletes matching keys first, then appends the batch (fast-ish overwrite-by-key).
+          - Else: updates matching rows + inserts new ones.
+        - Else: uses `DataFrameWriter.saveAsTable` with mode.

         Args:
             data: Spark DataFrame to insert.
@@ -611,12 +652,12 @@ FROM parquet.`{temp_volume_path}`"""
             schema_name: Optional schema override.
             table_name: Optional table name override.
             mode: Insert mode ("auto", "append", "overwrite").
-            cast_options: Optional casting options.
-            overwrite_schema: Whether to overwrite schema.
-            match_by:
-            zorder_by:
-            optimize_after_merge: Whether to run
-            vacuum_hours: Optional VACUUM retention window.
+            cast_options: Optional casting options (align to destination schema).
+            overwrite_schema: Whether to overwrite schema on write (when supported).
+            match_by: Merge keys for upserts.
+            zorder_by: Z-ORDER columns (used only if `optimize_after_merge` is True).
+            optimize_after_merge: Whether to run Delta optimize (and z-order) after merge.
+            vacuum_hours: Optional VACUUM retention window in hours.
             spark_options: Optional Spark write options.

         Returns:
@@ -627,7 +668,15 @@ FROM parquet.`{temp_volume_path}`"""
             catalog_name=catalog_name,
             schema_name=schema_name,
             table_name=table_name,
-            safe_chars=True
+            safe_chars=True,
+        )
+
+        logger.info(
+            "Spark insert into %s (mode=%s, match_by=%s, overwrite_schema=%s)",
+            location,
+            mode,
+            match_by,
+            overwrite_schema,
         )

         spark_options = spark_options if spark_options else {}
@@ -636,11 +685,13 @@ FROM parquet.`{temp_volume_path}`"""

         try:
             existing_schema = self.get_table_schema(
-                catalog_name=catalog_name,
+                catalog_name=catalog_name,
+                schema_name=schema_name,
                 table_name=table_name,
-                to_arrow_schema=False
+                to_arrow_schema=False,
             )
         except ValueError:
+            logger.warning("Destination table missing; creating table %s via overwrite write", location)
             data = convert(data, pyspark.sql.DataFrame)
             data.write.mode("overwrite").options(**spark_options).saveAsTable(location)
             return
@@ -651,29 +702,25 @@ FROM parquet.`{temp_volume_path}`"""
         cast_options = CastOptions.check_arg(options=cast_options, target_field=existing_schema)
         data = cast_spark_dataframe(data, options=cast_options)

-
-        if match_by:
-            notnull: pyspark.sql.Column = None
+        logger.debug("Incoming Spark columns: %s", data.columns)

+        if match_by:
+            notnull = None
             for k in match_by:
                 if k not in data.columns:
                     raise ValueError(f"Missing match key '{k}' in DataFrame columns: {data.columns}")
-
-                notnull = data[k].isNotNull() if notnull is None else notnull & (data[k].isNotNull())
+                notnull = data[k].isNotNull() if notnull is None else notnull & data[k].isNotNull()

             data = data.filter(notnull)
+            logger.debug("Filtered null keys for match_by=%s", match_by)

-        # --- Merge (upsert) ---
         target = self.spark_table(full_name=location)

         if match_by:
-            # Build merge condition on the composite key
             cond = " AND ".join([f"t.`{k}` <=> s.`{k}`" for k in match_by])

             if mode.casefold() == "overwrite":
                 data = data.cache()
-
-                # Step 1: get unique key combos from source
                 distinct_keys = data.select([f"`{k}`" for k in match_by]).distinct()

                 (
@@ -683,35 +730,30 @@ FROM parquet.`{temp_volume_path}`"""
                     .execute()
                 )

-
-                data.write.format("delta").mode("append").saveAsTable(location)
+                data.write.format("delta").mode("append").options(**spark_options).saveAsTable(location)
             else:
                 update_cols = [c for c in data.columns if c not in match_by]
-                set_expr = {
-                    c: F.expr(f"s.`{c}`") for c in update_cols
-                }
+                set_expr = {c: F.expr(f"s.`{c}`") for c in update_cols}

-                # Execute MERGE - update matching records first, then insert new ones
                 (
                     target.alias("t")
                     .merge(data.alias("s"), cond)
-                    .whenMatchedUpdate(set=set_expr)
-                    .whenNotMatchedInsertAll()
+                    .whenMatchedUpdate(set=set_expr)
+                    .whenNotMatchedInsertAll()
                     .execute()
                 )
         else:
             if mode == "auto":
                 mode = "append"
+            logger.info("Spark write saveAsTable mode=%s", mode)
             data.write.mode(mode).options(**spark_options).saveAsTable(location)

-        # --- Optimize: Z-ORDER for faster lookups by composite key (Databricks) ---
         if optimize_after_merge and zorder_by:
-
+            logger.info("Delta optimize + zorder (%s)", zorder_by)
             target.optimize().executeZOrderBy(*zorder_by)

-        # --- Optional VACUUM ---
         if vacuum_hours is not None:
-
+            logger.info("Delta vacuum retain=%s hours", vacuum_hours)
             target.vacuum(vacuum_hours)

     def get_table_schema(
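For readers less familiar with the Delta Lake Python API used above, a minimal sketch of the same merge pattern (null-safe key condition, update matched rows, insert new ones) looks roughly like this; it assumes an active Spark session with the delta-spark package available, and table/column names are placeholders:

    # Minimal sketch of the Delta merge pattern used above; not the package's code.
    import pyspark.sql.functions as F
    from delta.tables import DeltaTable
    from pyspark.sql import SparkSession

    spark = SparkSession.getActiveSession()
    source = spark.createDataFrame(
        [(1, "done"), (4, "new")], schema="id LONG, status STRING"
    )

    target = DeltaTable.forName(spark, "my_catalog.my_schema.events")
    match_by = ["id"]

    # Null-safe equality (<=>) on every merge key, as in the hunk above.
    cond = " AND ".join(f"t.`{k}` <=> s.`{k}`" for k in match_by)
    update_cols = [c for c in source.columns if c not in match_by]

    (
        target.alias("t")
        .merge(source.alias("s"), cond)
        .whenMatchedUpdate(set={c: F.expr(f"s.`{c}`") for c in update_cols})
        .whenNotMatchedInsertAll()
        .execute()
    )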
@@ -719,24 +761,24 @@ FROM parquet.`{temp_volume_path}`"""
         catalog_name: Optional[str] = None,
         schema_name: Optional[str] = None,
         table_name: Optional[str] = None,
-        to_arrow_schema: bool = True
+        to_arrow_schema: bool = True,
     ) -> Union[pa.Field, pa.Schema]:
-        """Fetch a table schema from Unity Catalog
+        """Fetch a table schema from Unity Catalog and convert it to Arrow types.

         Args:
             catalog_name: Optional catalog override.
             schema_name: Optional schema override.
             table_name: Optional table name override.
-            to_arrow_schema:
+            to_arrow_schema: If True returns pa.Schema; else returns a pa.Field(STRUCT<...>).

         Returns:
-            Arrow Schema or Field representing the table.
+            Arrow Schema or a STRUCT Field representing the table.
         """
         full_name = self.table_full_name(
             catalog_name=catalog_name,
             schema_name=schema_name,
             table_name=table_name,
-            safe_chars=False
+            safe_chars=False,
         )

         wk = self.workspace.sdk()
@@ -746,10 +788,7 @@ FROM parquet.`{temp_volume_path}`"""
         except Exception as e:
             raise ValueError(f"Table %s not found, {type(e)} {e}" % full_name)

-        fields = [
-            column_info_to_arrow_field(_)
-            for _ in table.columns
-        ]
+        fields = [column_info_to_arrow_field(_) for _ in table.columns]

         if to_arrow_schema:
             return pa.schema(fields, metadata={b"name": table_name})
@@ -762,25 +801,15 @@ FROM parquet.`{temp_volume_path}`"""
         schema_name: Optional[str] = None,
         table_name: Optional[str] = None,
     ):
-        """Drop a table if it exists.
-
-        Args:
-            location: Fully qualified table name override.
-            catalog_name: Optional catalog override.
-            schema_name: Optional schema override.
-            table_name: Optional table name override.
-
-        Returns:
-            The StatementResult from executing the drop statement.
-        """
+        """Drop a table if it exists."""
         location, _, _, _ = self._check_location_params(
             location=location,
             catalog_name=catalog_name,
             schema_name=schema_name,
             table_name=table_name,
-            safe_chars=True
+            safe_chars=True,
         )
-
+        logger.info("Dropping table if exists: %s", location)
         return self.execute(f"DROP TABLE IF EXISTS {location}")

     def create_table(
@@ -797,23 +826,29 @@ FROM parquet.`{temp_volume_path}`"""
         if_not_exists: bool = True,
         optimize_write: bool = True,
         auto_compact: bool = True,
-        execute: bool = True
-
-
-        Generate
+        execute: bool = True,
+        wait_result: bool = True
+    ) -> Union[str, "StatementResult"]:
+        """Generate (and optionally execute) CREATE TABLE DDL from an Arrow schema/field.

         Args:
-            field:
-
-
-
-
-
-
-
+            field: Arrow Field or Schema describing the table. If `field` is a schema, it's converted.
+            location: Fully qualified table name override.
+            table_name: Table name override (used if location not provided).
+            catalog_name: Catalog override.
+            schema_name: Schema override.
+            partition_by: Optional partition columns.
+            cluster_by: If True -> CLUSTER BY AUTO. If list[str] -> CLUSTER BY (..). If False -> no clustering.
+            comment: Optional table comment (falls back to field metadata b"comment" when present).
+            options: Extra table properties.
+            if_not_exists: Add IF NOT EXISTS clause.
+            optimize_write: Sets delta.autoOptimize.optimizeWrite table property.
+            auto_compact: Sets delta.autoOptimize.autoCompact table property.
+            execute: If True, executes DDL and returns StatementResult; otherwise returns SQL string.
+            wait_result: Waits execution to complete

         Returns:
-
+            StatementResult if execute=True, else the DDL SQL string.
         """
         if not isinstance(field, pa.Field):
             field = convert(field, pa.Field)
@@ -823,7 +858,7 @@ FROM parquet.`{temp_volume_path}`"""
             catalog_name=catalog_name,
             schema_name=schema_name,
             table_name=table_name,
-            safe_chars=True
+            safe_chars=True,
         )

         if pa.types.is_struct(field.type):
@@ -831,28 +866,22 @@ FROM parquet.`{temp_volume_path}`"""
         else:
             children = [field]

-
-        column_definitions = [
-            self._field_to_ddl(child)
-            for child in children
-        ]
+        column_definitions = [self._field_to_ddl(child) for child in children]

         sql = [
             f"CREATE TABLE {'IF NOT EXISTS ' if if_not_exists else ''}{location} (",
             ",\n ".join(column_definitions),
-            ")"
+            ")",
         ]

-
-        if partition_by and len(partition_by) > 0:
+        if partition_by:
             sql.append(f"\nPARTITIONED BY ({', '.join(partition_by)})")
         elif cluster_by:
             if isinstance(cluster_by, bool):
-                sql.append(
+                sql.append("\nCLUSTER BY AUTO")
             else:
                 sql.append(f"\nCLUSTER BY ({', '.join(cluster_by)})")

-        # Add comment if provided
         if not comment and field.metadata:
             comment = field.metadata.get(b"comment")

@@ -862,32 +891,33 @@ FROM parquet.`{temp_volume_path}`"""
         if comment:
             sql.append(f"\nCOMMENT '{comment}'")

-        # Add options if provided
         options = {} if options is None else options
         options.update({
             "delta.autoOptimize.optimizeWrite": optimize_write,
-            "delta.autoOptimize.autoCompact": auto_compact
+            "delta.autoOptimize.autoCompact": auto_compact,
         })

         option_strs = []
-
-
-
-
-
-
-
-                option_strs.append(f"'{key}' = '{b_value}'")
-            else:
-                option_strs.append(f"'{key}' = {value}")
+        for key, value in (options or {}).items():
+            if isinstance(value, str):
+                option_strs.append(f"'{key}' = '{value}'")
+            elif isinstance(value, bool):
+                option_strs.append(f"'{key}' = '{'true' if value else 'false'}'")
+            else:
+                option_strs.append(f"'{key}' = {value}")

         if option_strs:
             sql.append(f"\nTBLPROPERTIES ({', '.join(option_strs)})")

         statement = "\n".join(sql)

+        logger.debug(
+            "Generated CREATE TABLE DDL for %s:\n%s",
+            location, statement
+        )
+
         if execute:
-            return self.execute(statement)
+            return self.execute(statement, wait_result=wait_result)
         return statement

     def _check_location_params(
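Putting the pieces above together, the DDL produced for a small two-column table looks roughly like the following hand-rolled sketch (placeholder names and comments; the exact whitespace depends on how the `sql` fragments are joined, which is only partially visible in this diff):

    # Hand-rolled sketch of the CREATE TABLE assembly shown above.
    column_definitions = [
        "id BIGINT NOT NULL COMMENT 'primary key'",
        "status STRING",
    ]
    location = "`my_catalog`.`my_schema`.`events`"

    sql = [
        f"CREATE TABLE IF NOT EXISTS {location} (",
        ",\n    ".join(column_definitions),
        ")",
        "\nCLUSTER BY AUTO",
        "\nCOMMENT 'example table'",
        "\nTBLPROPERTIES ('delta.autoOptimize.optimizeWrite' = 'true', "
        "'delta.autoOptimize.autoCompact' = 'true')",
    ]
    print("\n".join(sql))
    # Produces a CREATE TABLE IF NOT EXISTS ... statement with the column list,
    # a CLUSTER BY AUTO clause, a COMMENT, and the delta.autoOptimize TBLPROPERTIES.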
@@ -896,28 +926,18 @@ FROM parquet.`{temp_volume_path}`"""
         catalog_name: Optional[str] = None,
         schema_name: Optional[str] = None,
         table_name: Optional[str] = None,
-        safe_chars: bool = True
-    ):
-        """Resolve location
-
-        Args:
-            location: Fully qualified table name override.
-            catalog_name: Optional catalog override.
-            schema_name: Optional schema override.
-            table_name: Optional table name override.
-            safe_chars: Whether to wrap identifiers in backticks.
-
-        Returns:
-            A tuple of (location, catalog_name, schema_name, table_name).
-        """
+        safe_chars: bool = True,
+    ) -> tuple[str, Optional[str], Optional[str], Optional[str]]:
+        """Resolve (location OR catalog/schema/table) into a fully-qualified name."""
         if location:
             c, s, t = self._catalog_schema_table_names(location)
             catalog_name, schema_name, table_name = catalog_name or c, schema_name or s, table_name or t

         location = self.table_full_name(
-            catalog_name=catalog_name,
+            catalog_name=catalog_name,
+            schema_name=schema_name,
             table_name=table_name,
-            safe_chars=safe_chars
+            safe_chars=safe_chars,
         )

         return location, catalog_name or self.catalog_name, schema_name or self.schema_name, table_name
@@ -927,98 +947,68 @@ FROM parquet.`{temp_volume_path}`"""
         field: pa.Field,
         put_name: bool = True,
         put_not_null: bool = True,
-        put_comment: bool = True
+        put_comment: bool = True,
     ) -> str:
-        """
-        Convert a PyField to a DDL column definition.
-
-        Args:
-            field: The PyField to convert
-
-        Returns:
-            A string containing the column definition in DDL format
-        """
+        """Convert an Arrow Field to a Databricks SQL column DDL fragment."""
         name = field.name
         nullable_str = " NOT NULL" if put_not_null and not field.nullable else ""
         name_str = f"{name} " if put_name else ""

-        # Get comment if available
         comment_str = ""
         if put_comment and field.metadata and b"comment" in field.metadata:
             comment = field.metadata[b"comment"].decode("utf-8")
             comment_str = f" COMMENT '{comment}'"

-        # Handle primitive types
         if not pa.types.is_nested(field.type):
             sql_type = SQLEngine._arrow_to_sql_type(field.type)
             return f"{name_str}{sql_type}{nullable_str}{comment_str}"

-        # Handle struct type
         if pa.types.is_struct(field.type):
             child_defs = [SQLEngine._field_to_ddl(child) for child in field.type]
             struct_body = ", ".join(child_defs)
             return f"{name_str}STRUCT<{struct_body}>{nullable_str}{comment_str}"

-        # Handle map type
         if pa.types.is_map(field.type):
             map_type: pa.MapType = field.type
             key_type = SQLEngine._field_to_ddl(map_type.key_field, put_name=False, put_comment=False, put_not_null=False)
             val_type = SQLEngine._field_to_ddl(map_type.item_field, put_name=False, put_comment=False, put_not_null=False)
             return f"{name_str}MAP<{key_type}, {val_type}>{nullable_str}{comment_str}"

-        # Handle list type after map
         if pa.types.is_list(field.type) or pa.types.is_large_list(field.type):
             list_type: pa.ListType = field.type
             elem_type = SQLEngine._field_to_ddl(list_type.value_field, put_name=False, put_comment=False, put_not_null=False)
             return f"{name_str}ARRAY<{elem_type}>{nullable_str}{comment_str}"

-        # Default fallback to string for unknown types
         raise TypeError(f"Cannot make ddl field from {field}")

     @staticmethod
-    def _arrow_to_sql_type(
-
-    ) -> str:
-        """
-        Convert an Arrow data type to SQL data type.
-
-        Args:
-            arrow_type: The Arrow data type
-
-        Returns:
-            A string containing the SQL data type
-        """
+    def _arrow_to_sql_type(arrow_type: Union[pa.DataType, pa.Decimal128Type]) -> str:
+        """Convert an Arrow data type to a Databricks SQL type string."""
         if pa.types.is_boolean(arrow_type):
             return "BOOLEAN"
-
+        if pa.types.is_int8(arrow_type):
             return "TINYINT"
-
+        if pa.types.is_int16(arrow_type):
             return "SMALLINT"
-
+        if pa.types.is_int32(arrow_type):
             return "INT"
-
+        if pa.types.is_int64(arrow_type):
             return "BIGINT"
-
+        if pa.types.is_float32(arrow_type):
             return "FLOAT"
-
+        if pa.types.is_float64(arrow_type):
             return "DOUBLE"
-
+        if is_arrow_type_string_like(arrow_type):
             return "STRING"
-
+        if is_arrow_type_binary_like(arrow_type):
             return "BINARY"
-
+        if pa.types.is_timestamp(arrow_type):
             tz = getattr(arrow_type, "tz", None)
-
-
-            return "TIMESTAMP"
-            return "TIMESTAMP_NTZ"
-        elif pa.types.is_date(arrow_type):
+            return "TIMESTAMP" if tz else "TIMESTAMP_NTZ"
+        if pa.types.is_date(arrow_type):
             return "DATE"
-
-            precision
-
-            return f"DECIMAL({precision}, {scale})"
-        elif pa.types.is_null(arrow_type):
+        if pa.types.is_decimal(arrow_type):
+            return f"DECIMAL({arrow_type.precision}, {arrow_type.scale})"
+        if pa.types.is_null(arrow_type):
             return "STRING"
-
-        raise ValueError(f"Cannot make ddl type for {arrow_type}")
+        raise ValueError(f"Cannot make ddl type for {arrow_type}")