ygg 0.1.33__py3-none-any.whl → 0.1.34__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in the public registry. It is provided for informational purposes only.
- {ygg-0.1.33.dist-info → ygg-0.1.34.dist-info}/METADATA +1 -1
- {ygg-0.1.33.dist-info → ygg-0.1.34.dist-info}/RECORD +14 -13
- yggdrasil/databricks/compute/cluster.py +106 -57
- yggdrasil/databricks/compute/execution_context.py +5 -2
- yggdrasil/databricks/compute/remote.py +6 -5
- yggdrasil/databricks/sql/engine.py +295 -321
- yggdrasil/databricks/workspaces/workspace.py +12 -1
- yggdrasil/pyutils/callable_serde.py +27 -2
- yggdrasil/pyutils/expiring_dict.py +176 -0
- yggdrasil/version.py +1 -1
- {ygg-0.1.33.dist-info → ygg-0.1.34.dist-info}/WHEEL +0 -0
- {ygg-0.1.33.dist-info → ygg-0.1.34.dist-info}/entry_points.txt +0 -0
- {ygg-0.1.33.dist-info → ygg-0.1.34.dist-info}/licenses/LICENSE +0 -0
- {ygg-0.1.33.dist-info → ygg-0.1.34.dist-info}/top_level.txt +0 -0
--- yggdrasil/databricks/sql/engine.py (ygg 0.1.33)
+++ yggdrasil/databricks/sql/engine.py (ygg 0.1.34)
@@ -1,4 +1,15 @@
-"""Databricks SQL engine utilities and helpers.
+"""Databricks SQL engine utilities and helpers.
+
+This module provides a thin “do the right thing” layer over:
+- Databricks SQL Statement Execution API (warehouse)
+- Spark SQL / Delta Lake (when running inside a Spark-enabled context)
+
+It includes helpers to:
+- Build fully-qualified table names
+- Execute SQL via Spark or Databricks SQL API
+- Insert Arrow/Spark data into Delta tables (append/overwrite/merge)
+- Generate DDL from Arrow schemas
+"""

 import dataclasses
 import logging
@@ -8,7 +19,6 @@ import time
 from typing import Optional, Union, Any, Dict, List, Literal

 import pyarrow as pa
-import pyarrow.parquet as pq

 from .statement_result import StatementResult
 from .types import column_info_to_arrow_field
@@ -28,7 +38,6 @@ except ImportError:
         @classmethod
         def forName(cls, *args, **kwargs):
             from delta.tables import DeltaTable
-
             return DeltaTable.forName(*args, **kwargs)


@@ -37,23 +46,18 @@ if databricks_sdk is not None:
         StatementResponse, Disposition, Format,
         ExecuteStatementRequestOnWaitTimeout, StatementParameterListItem
     )
-
     StatementResponse = StatementResponse
 else:
-    class StatementResponse:
+    class StatementResponse:  # pragma: no cover
         pass


 logger = logging.getLogger(__name__)

-
 if pyspark is not None:
     import pyspark.sql.functions as F

-__all__ = [
-    "SQLEngine",
-    "StatementResult"
-]
+__all__ = ["SQLEngine", "StatementResult"]


 class SqlExecutionError(RuntimeError):
@@ -62,7 +66,7 @@ class SqlExecutionError(RuntimeError):

 @dataclasses.dataclass
 class SQLEngine(WorkspaceService):
-    """Execute SQL statements and manage tables via Databricks."""
+    """Execute SQL statements and manage tables via Databricks SQL / Spark."""
     warehouse_id: Optional[str] = None
     catalog_name: Optional[str] = None
     schema_name: Optional[str] = None
@@ -72,18 +76,18 @@ class SQLEngine(WorkspaceService):
         catalog_name: Optional[str] = None,
         schema_name: Optional[str] = None,
         table_name: Optional[str] = None,
-        safe_chars: bool = True
-    ):
-        """Build a fully qualified table name
+        safe_chars: bool = True,
+    ) -> str:
+        """Build a fully qualified table name (catalog.schema.table).

         Args:
-            catalog_name: Optional catalog override.
-            schema_name: Optional schema override.
+            catalog_name: Optional catalog override (defaults to engine.catalog_name).
+            schema_name: Optional schema override (defaults to engine.schema_name).
             table_name: Table name to qualify.
-            safe_chars: Whether to wrap
+            safe_chars: Whether to wrap each identifier in backticks.

         Returns:
-
+            Fully qualified table name string.
         """
         catalog_name = catalog_name or self.catalog_name
         schema_name = schema_name or self.schema_name
@@ -96,21 +100,23 @@ class SQLEngine(WorkspaceService):
             return f"`{catalog_name}`.`{schema_name}`.`{table_name}`"
         return f"{catalog_name}.{schema_name}.{table_name}"

-    def _catalog_schema_table_names(
-        self,
-        full_name: str,
-    ):
+    def _catalog_schema_table_names(self, full_name: str) -> tuple[Optional[str], Optional[str], Optional[str]]:
         """Parse a catalog.schema.table string into components.

+        Supports partial names:
+        - table
+        - schema.table
+        - catalog.schema.table
+
+        Backticks are stripped.
+
         Args:
-            full_name:
+            full_name: Fully qualified or partial table name.

         Returns:
-
+            Tuple of (catalog_name, schema_name, table_name).
         """
-        parts = [
-            _.strip("`") for _ in full_name.split(".")
-        ]
+        parts = [_.strip("`") for _ in full_name.split(".")]

         if len(parts) == 0:
             return self.catalog_name, self.schema_name, None
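For context, a minimal standalone sketch of the backtick quoting and splitting behaviour that `table_full_name` and `_catalog_schema_table_names` implement in the hunks above. The helper names and sample identifiers below are illustrative only, not part of the package:

```python
# Illustrative sketch only: mirrors the quoting/splitting logic shown above.
from typing import Optional, Tuple


def full_name(catalog: str, schema: str, table: str, safe_chars: bool = True) -> str:
    # safe_chars=True wraps each identifier in backticks.
    if safe_chars:
        return f"`{catalog}`.`{schema}`.`{table}`"
    return f"{catalog}.{schema}.{table}"


def split_name(name: str) -> Tuple[Optional[str], Optional[str], str]:
    # Backticks are stripped; partial names ("table", "schema.table") are allowed.
    parts = [p.strip("`") for p in name.split(".")]
    parts = [None] * (3 - len(parts)) + parts
    return parts[0], parts[1], parts[2]


print(full_name("main", "analytics", "events"))    # `main`.`analytics`.`events`
print(split_name("`main`.`analytics`.`events`"))   # ('main', 'analytics', 'events')
print(split_name("events"))                        # (None, None, 'events')
```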
@@ -122,20 +128,20 @@ class SQLEngine(WorkspaceService):
         catalog_name, schema_name, table_name = parts[-3], parts[-2], parts[-1]
         catalog_name = catalog_name or self.catalog_name
         schema_name = schema_name or self.schema_name
-
         return catalog_name, schema_name, table_name

-    def _default_warehouse(
-
-        cluster_size: str = "Small"
-    ):
-        """Return a default SQL warehouse matching the desired size.
+    def _default_warehouse(self, cluster_size: str = "Small"):
+        """Pick a default SQL warehouse (best-effort) matching the desired size.

         Args:
-            cluster_size: Desired warehouse size
+            cluster_size: Desired warehouse size (Databricks "cluster_size"), e.g. "Small".
+                If empty/None, returns the first warehouse encountered.

         Returns:
-
+            Warehouse object.
+
+        Raises:
+            ValueError: If no warehouses exist in the workspace.
         """
         wk = self.workspace.sdk()
         existing = list(wk.warehouses.list())
@@ -146,48 +152,55 @@ class SQLEngine(WorkspaceService):
                 first = warehouse

             if cluster_size:
-                if warehouse
+                if getattr(warehouse, "cluster_size", None) == cluster_size:
+                    logger.debug("Default warehouse match found: id=%s cluster_size=%s", warehouse.id, warehouse.cluster_size)
                     return warehouse
             else:
+                logger.debug("Default warehouse selected (first): id=%s", warehouse.id)
                 return warehouse

         if first is not None:
+            logger.info(
+                "No warehouse matched cluster_size=%s; falling back to first warehouse id=%s cluster_size=%s",
+                cluster_size,
+                getattr(first, "id", None),
+                getattr(first, "cluster_size", None),
+            )
             return first

         raise ValueError(f"No default warehouse found in {wk.config.host}")

-    def _get_or_default_warehouse_id(
-
-        cluster_size = "Small"
-    ):
-        """Return the configured warehouse id or a default one.
+    def _get_or_default_warehouse_id(self, cluster_size: str = "Small") -> str:
+        """Return configured warehouse_id or resolve a default one.

         Args:
-            cluster_size: Desired warehouse size filter.
+            cluster_size: Desired warehouse size filter used when resolving defaults.

         Returns:
-
+            Warehouse id string.
         """
         if not self.warehouse_id:
             dft = self._default_warehouse(cluster_size=cluster_size)
-
             self.warehouse_id = dft.id
+            logger.info("Resolved default warehouse_id=%s (cluster_size=%s)", self.warehouse_id, cluster_size)
+
         return self.warehouse_id

     @staticmethod
     def _random_suffix(prefix: str = "") -> str:
-        """Generate a unique suffix for temporary resources.
-
-        Args:
-            prefix: Optional prefix to prepend.
-
-        Returns:
-            A unique suffix string.
-        """
-        unique = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(8))
+        """Generate a unique suffix for temporary resources."""
+        unique = "".join(random.choice(string.ascii_lowercase + string.digits) for _ in range(8))
         timestamp = int(time.time() * 1000)
         return f"{prefix}{timestamp}_{unique}"

+    @staticmethod
+    def _sql_preview(sql: str, limit: int = 220) -> str:
+        """Short, single-line preview for logs (avoids spewing giant SQL)."""
+        if not sql:
+            return ""
+        one_line = " ".join(sql.split())
+        return one_line[:limit] + ("…" if len(one_line) > limit else "")
+
     def execute(
         self,
         statement: Optional[str] = None,
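The new `_sql_preview` helper added above keeps log lines short by collapsing whitespace and truncating. A standalone sketch of the same logic, reproduced outside the class for illustration:

```python
# Standalone sketch of the _sql_preview logic added above.
def sql_preview(sql: str, limit: int = 220) -> str:
    if not sql:
        return ""
    one_line = " ".join(sql.split())
    return one_line[:limit] + ("…" if len(one_line) > limit else "")


print(sql_preview("SELECT *\n  FROM `main`.`analytics`.`events`\n  WHERE 1 = 1"))
# -> SELECT * FROM `main`.`analytics`.`events` WHERE 1 = 1
print(len(sql_preview("SELECT " + "x, " * 500)))   # 221: capped at limit plus the ellipsis
```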
@@ -207,54 +220,64 @@ class SQLEngine(WorkspaceService):
         wait_result: bool = True,
         **kwargs,
     ) -> "StatementResult":
-        """
-
+        """Execute a SQL statement via Spark or Databricks SQL Statement Execution API.
+
+        Engine resolution:
+        - If `engine` is not provided and a Spark session is active -> uses Spark.
+        - Otherwise uses Databricks SQL API (warehouse).

-
-
-
-        - If wait=False: return initial execution handle without polling.
+        Waiting behavior (`wait_result`):
+        - If True (default): returns a StatementResult in terminal state (SUCCEEDED/FAILED/CANCELED).
+        - If False: returns immediately with the initial handle (caller can `.wait()` later).

         Args:
-            statement: SQL statement to execute. If
-            engine:
-            warehouse_id:
+            statement: SQL statement to execute. If None, a `SELECT *` is generated from the table params.
+            engine: "spark" or "api".
+            warehouse_id: Warehouse override (for API engine).
             byte_limit: Optional byte limit for results.
-            disposition: Result disposition mode.
-            format: Result format
-            on_wait_timeout: Timeout behavior for waiting.
-            parameters: Optional statement parameters.
-            row_limit: Optional row limit.
-            wait_timeout:
-            catalog_name: Optional catalog override.
-            schema_name: Optional schema override.
-            table_name: Optional table
-            wait_result: Whether to block until completion.
-            **kwargs:
+            disposition: Result disposition mode (API engine).
+            format: Result format (API engine).
+            on_wait_timeout: Timeout behavior for waiting (API engine).
+            parameters: Optional statement parameters (API engine).
+            row_limit: Optional row limit for results (API engine).
+            wait_timeout: API wait timeout value.
+            catalog_name: Optional catalog override for API engine.
+            schema_name: Optional schema override for API engine.
+            table_name: Optional table override used when `statement` is None.
+            wait_result: Whether to block until completion (API engine).
+            **kwargs: Extra params forwarded to Databricks SDK execute_statement.

         Returns:
-
+            StatementResult.
         """
+        # --- Engine auto-detection ---
         if not engine:
             if pyspark is not None:
                 spark_session = SparkSession.getActiveSession()
-
                 if spark_session is not None:
                     engine = "spark"

+        # --- Spark path ---
         if engine == "spark":
             spark_session = SparkSession.getActiveSession()
-
             if spark_session is None:
                 raise ValueError("No spark session found to run sql query")

+            t0 = time.time()
+            df = spark_session.sql(statement)
+            logger.info("Spark SQL executed in %.3fs: %s", time.time() - t0, self._sql_preview(statement))
+
+            # Avoid Disposition dependency if SDK imports are absent
+            spark_disp = disposition if disposition is not None else getattr(globals().get("Disposition", object), "EXTERNAL_LINKS", None)
+
             return StatementResult(
                 engine=self,
                 statement_id="sparksql",
-                disposition=
-                _spark_df=
+                disposition=spark_disp,
+                _spark_df=df,
             )

+        # --- API path defaults ---
         if format is None:
             format = Format.ARROW_STREAM

@@ -264,6 +287,7 @@ class SQLEngine(WorkspaceService):
         if not statement:
             full_name = self.table_full_name(catalog_name=catalog_name, schema_name=schema_name, table_name=table_name)
             statement = f"SELECT * FROM {full_name}"
+            logger.debug("Autogenerated statement: %s", self._sql_preview(statement))

         if not warehouse_id:
             warehouse_id = self._get_or_default_warehouse_id()
@@ -280,7 +304,6 @@ class SQLEngine(WorkspaceService):
             wait_timeout=wait_timeout,
             catalog=catalog_name or self.catalog_name,
             schema=schema_name or self.schema_name,
-            **kwargs,
         )

         execution = StatementResult(
@@ -288,10 +311,11 @@ class SQLEngine(WorkspaceService):
             statement_id=response.statement_id,
             _response=response,
             _response_refresh_time=time.time(),
-            disposition=disposition
+            disposition=disposition,
         )

-
+        # BUGFIX: previously returned `wait_result` (a bool) on wait_result=False 🤦
+        return execution.wait() if wait_result else execution

     def spark_table(
         self,
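The added return line above is the behavioural fix called out in the hunk: with `wait_result=False` the method now returns the un-waited execution handle instead of the boolean itself. A toy sketch of before/after behaviour, in which `FakeResult` is a stand-in and not the real `StatementResult`:

```python
# Toy illustration of the wait_result fix; FakeResult stands in for StatementResult.
class FakeResult:
    def __init__(self) -> None:
        self.waited = False

    def wait(self) -> "FakeResult":
        self.waited = True
        return self


def execute(wait_result: bool = True) -> FakeResult:
    execution = FakeResult()
    # Per the BUGFIX comment above, the old code handed back the bool wait_result here.
    return execution.wait() if wait_result else execution


print(type(execute(wait_result=False)).__name__)   # FakeResult (previously: bool)
print(execute(wait_result=True).waited)            # True
```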
@@ -300,35 +324,21 @@ class SQLEngine(WorkspaceService):
         schema_name: Optional[str] = None,
         table_name: Optional[str] = None,
     ):
-        """Return a DeltaTable handle for a given table name.
-
-        Args:
-            full_name: Fully qualified table name.
-            catalog_name: Optional catalog override.
-            schema_name: Optional schema override.
-            table_name: Optional table name override.
-
-        Returns:
-            A Spark DeltaTable handle.
-        """
+        """Return a DeltaTable handle for a given table name (Spark context required)."""
         if not full_name:
             full_name = self.table_full_name(
                 catalog_name=catalog_name,
                 schema_name=schema_name,
-                table_name=table_name
+                table_name=table_name,
             )
-
         return SparkDeltaTable.forName(
             sparkSession=SparkSession.getActiveSession(),
-            tableOrViewName=full_name
+            tableOrViewName=full_name,
         )

     def insert_into(
         self,
-        data: Union[
-            pa.Table, pa.RecordBatch, pa.RecordBatchReader,
-            SparkDataFrame
-        ],
+        data: Union[pa.Table, pa.RecordBatch, pa.RecordBatchReader, SparkDataFrame],
         location: Optional[str] = None,
         catalog_name: Optional[str] = None,
         schema_name: Optional[str] = None,
@@ -336,14 +346,18 @@ class SQLEngine(WorkspaceService):
         mode: str = "auto",
         cast_options: Optional[CastOptions] = None,
         overwrite_schema: bool | None = None,
-        match_by: list[str] = None,
-        zorder_by: list[str] = None,
+        match_by: Optional[list[str]] = None,
+        zorder_by: Optional[list[str]] = None,
         optimize_after_merge: bool = False,
-        vacuum_hours: int | None = None,
+        vacuum_hours: int | None = None,
         spark_session: Optional[SparkSession] = None,
-        spark_options: Optional[Dict[str, Any]] = None
+        spark_options: Optional[Dict[str, Any]] = None,
     ):
-        """Insert data into a table using Spark
+        """Insert data into a Delta table using Spark when available; otherwise stage Arrow.
+
+        Strategy:
+        - If Spark is available and we have an active session (or Spark DF input) -> use `spark_insert_into`.
+        - Otherwise -> use `arrow_insert_into` (stages Parquet to a temp volume + runs SQL INSERT/MERGE).

         Args:
             data: Arrow or Spark data to insert.
@@ -353,18 +367,18 @@ class SQLEngine(WorkspaceService):
             table_name: Optional table name override.
             mode: Insert mode ("auto", "append", "overwrite").
             cast_options: Optional casting options.
-            overwrite_schema: Whether to overwrite schema (Spark).
-            match_by:
-            zorder_by:
-            optimize_after_merge: Whether to run OPTIMIZE after merge.
+            overwrite_schema: Whether to overwrite schema (Spark path).
+            match_by: Merge keys for upserts (MERGE semantics). When set, mode affects behavior.
+            zorder_by: Z-ORDER columns (SQL path uses OPTIMIZE ZORDER; Spark path uses Delta optimize API).
+            optimize_after_merge: Whether to run OPTIMIZE after a merge (SQL path) / after merge+zorder (Spark path).
             vacuum_hours: Optional VACUUM retention window.
             spark_session: Optional SparkSession override.
             spark_options: Optional Spark write options.

         Returns:
-            None
+            None (mutates the destination table).
         """
-
+
         if pyspark is not None:
             spark_session = SparkSession.getActiveSession() if spark_session is None else spark_session

@@ -382,7 +396,7 @@ class SQLEngine(WorkspaceService):
                 zorder_by=zorder_by,
                 optimize_after_merge=optimize_after_merge,
                 vacuum_hours=vacuum_hours,
-                spark_options=spark_options
+                spark_options=spark_options,
             )

         return self.arrow_insert_into(
@@ -402,9 +416,7 @@ class SQLEngine(WorkspaceService):

     def arrow_insert_into(
         self,
-        data: Union[
-            pa.Table, pa.RecordBatch, pa.RecordBatchReader,
-        ],
+        data: Union[pa.Table, pa.RecordBatch, pa.RecordBatchReader],
         location: Optional[str] = None,
         catalog_name: Optional[str] = None,
         schema_name: Optional[str] = None,
@@ -412,14 +424,19 @@ class SQLEngine(WorkspaceService):
         mode: str = "auto",
         cast_options: Optional[CastOptions] = None,
         overwrite_schema: bool | None = None,
-        match_by: list[str] = None,
-        zorder_by: list[str] = None,
+        match_by: Optional[list[str]] = None,
+        zorder_by: Optional[list[str]] = None,
         optimize_after_merge: bool = False,
-        vacuum_hours: int | None = None,
+        vacuum_hours: int | None = None,
         existing_schema: pa.Schema | None = None,
-        temp_volume_path: Optional[Union[str, DatabricksPath]] = None
+        temp_volume_path: Optional[Union[str, DatabricksPath]] = None,
     ):
-        """Insert Arrow data by staging to a temp volume and running SQL.
+        """Insert Arrow data by staging Parquet to a temp volume and running Databricks SQL.
+
+        Notes:
+        - If the table does not exist, it is created from the input Arrow schema (best-effort).
+        - If `match_by` is provided, uses MERGE INTO (upsert).
+        - Otherwise uses INSERT INTO / INSERT OVERWRITE depending on mode.

         Args:
             data: Arrow table/batch data to insert.
@@ -427,14 +444,14 @@ class SQLEngine(WorkspaceService):
             catalog_name: Optional catalog override.
             schema_name: Optional schema override.
             table_name: Optional table name override.
-            mode: Insert mode ("auto", "append", "overwrite").
+            mode: Insert mode ("auto", "append", "overwrite"). ("auto" behaves like append here.)
             cast_options: Optional casting options.
-            overwrite_schema:
-            match_by:
-            zorder_by:
-            optimize_after_merge:
-            vacuum_hours: Optional VACUUM retention window.
-            existing_schema: Optional pre-fetched schema.
+            overwrite_schema: Reserved for parity with Spark path (unused here).
+            match_by: Merge keys for MERGE INTO upserts.
+            zorder_by: Columns for OPTIMIZE ZORDER BY.
+            optimize_after_merge: Run OPTIMIZE after MERGE (in addition to ZORDER optimization).
+            vacuum_hours: Optional VACUUM retention window in hours.
+            existing_schema: Optional pre-fetched destination schema (Arrow).
             temp_volume_path: Optional temp volume path override.

         Returns:
@@ -445,7 +462,15 @@ class SQLEngine(WorkspaceService):
             catalog_name=catalog_name,
             schema_name=schema_name,
             table_name=table_name,
-            safe_chars=True
+            safe_chars=True,
+        )
+
+        logger.info(
+            "Arrow insert into %s (mode=%s, match_by=%s, zorder_by=%s)",
+            location,
+            mode,
+            match_by,
+            zorder_by,
         )

         with self as connected:
@@ -455,16 +480,17 @@ class SQLEngine(WorkspaceService):
                     catalog_name=catalog_name,
                     schema_name=schema_name,
                     table_name=table_name,
-                    to_arrow_schema=True
+                    to_arrow_schema=True,
                 )
+                logger.debug("Fetched existing schema for %s (columns=%d)", location, len(existing_schema.names))
             except ValueError as exc:
-
-                existing_schema =
+                data_tbl = convert(data, pa.Table)
+                existing_schema = data_tbl.schema
                 logger.warning(
-                    "Table %s not found
+                    "Table %s not found (%s). Creating it from input schema (columns=%s)",
                     location,
                     exc,
-                    existing_schema.names
+                    existing_schema.names,
                 )

                 connected.create_table(
@@ -472,12 +498,12 @@ class SQLEngine(WorkspaceService):
                     catalog_name=catalog_name,
                     schema_name=schema_name,
                     table_name=table_name,
-                    if_not_exists=True
+                    if_not_exists=True,
                 )

                 try:
                     return connected.arrow_insert_into(
-                        data=
+                        data=data_tbl,
                         location=location,
                         catalog_name=catalog_name,
                         schema_name=schema_name,
@@ -489,54 +515,50 @@ class SQLEngine(WorkspaceService):
                         zorder_by=zorder_by,
                         optimize_after_merge=optimize_after_merge,
                         vacuum_hours=vacuum_hours,
-                        existing_schema=existing_schema
+                        existing_schema=existing_schema,
                     )
-                except:
+                except Exception:
+                    logger.exception("Arrow insert failed after auto-creating %s; attempting cleanup (DROP TABLE)", location)
                     try:
                         connected.drop_table(location=location)
-                    except Exception
-                        logger.
+                    except Exception:
+                        logger.exception("Failed to drop table %s after auto creation error", location)
                     raise

             transaction_id = self._random_suffix()

-
+            data_tbl = convert(data, pa.Table, options=cast_options, target_field=existing_schema)

             # Write in temp volume
             temp_volume_path = connected.dbfs_path(
                 kind=DatabricksPathKind.VOLUME,
-                parts=[catalog_name, schema_name, "tmp", "sql", transaction_id]
+                parts=[catalog_name, schema_name, "tmp", "sql", transaction_id],
             ) if temp_volume_path is None else DatabricksPath.parse(obj=temp_volume_path, workspace=connected.workspace)

+            logger.debug("Staging Parquet to temp volume: %s", temp_volume_path)
             temp_volume_path.mkdir()
+            temp_volume_path.write_arrow_table(data_tbl)

-
-
-            # get column list from arrow schema
-            columns = [c for c in existing_schema.names]
+            columns = list(existing_schema.names)
             cols_quoted = ", ".join([f"`{c}`" for c in columns])

-            statements = []
+            statements: list[str] = []

-            # Decide how to ingest
-            # If merge keys provided -> use MERGE
             if match_by:
-
-
-                for k in match_by:
-                    on_clauses.append(f"T.`{k}` = S.`{k}`")
-                on_condition = " AND ".join(on_clauses)
+                logger.info("Using MERGE INTO (match_by=%s)", match_by)
+                on_condition = " AND ".join([f"T.`{k}` = S.`{k}`" for k in match_by])

-                # build UPDATE set (all columns except match_by)
                 update_cols = [c for c in columns if c not in match_by]
                 if update_cols:
                     update_set = ", ".join([f"T.`{c}` = S.`{c}`" for c in update_cols])
                     update_clause = f"WHEN MATCHED THEN UPDATE SET {update_set}"
                 else:
-                    update_clause = ""
+                    update_clause = ""

-
-
+                insert_clause = (
+                    f"WHEN NOT MATCHED THEN INSERT ({cols_quoted}) "
+                    f"VALUES ({', '.join([f'S.`{c}`' for c in columns])})"
+                )

                 merge_sql = f"""MERGE INTO {location} AS T
 USING (
@@ -546,41 +568,38 @@ ON {on_condition}
 {update_clause}
 {insert_clause}"""
                 statements.append(merge_sql)
-
             else:
-                # No match_by -> plain insert
                 if mode.lower() in ("overwrite",):
                     insert_sql = f"""INSERT OVERWRITE {location}
 SELECT {cols_quoted}
 FROM parquet.`{temp_volume_path}`"""
                 else:
-                    # default: append
                     insert_sql = f"""INSERT INTO {location} ({cols_quoted})
 SELECT {cols_quoted}
 FROM parquet.`{temp_volume_path}`"""
                 statements.append(insert_sql)

-            # Execute statements (use your existing execute helper)
             try:
                 for stmt in statements:
-                    # trim and run
                     connected.execute(stmt.strip())
             finally:
                 try:
                     temp_volume_path.rmdir(recursive=True)
-                except Exception
-                    logger.
+                except Exception:
+                    logger.exception("Failed cleaning temp volume: %s", temp_volume_path)

-            # Optionally run OPTIMIZE / ZORDER / VACUUM if requested (Databricks SQL)
             if zorder_by:
                 zcols = ", ".join([f"`{c}`" for c in zorder_by])
                 optimize_sql = f"OPTIMIZE {location} ZORDER BY ({zcols})"
+                logger.info("Running OPTIMIZE ZORDER BY: %s", zorder_by)
                 connected.execute(optimize_sql)

             if optimize_after_merge and match_by:
+                logger.info("Running OPTIMIZE after MERGE")
                 connected.execute(f"OPTIMIZE {location}")

             if vacuum_hours is not None:
+                logger.info("Running VACUUM retain=%s hours", vacuum_hours)
                 connected.execute(f"VACUUM {location} RETAIN {vacuum_hours} HOURS")

         return None
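For context, the clauses `arrow_insert_into` now assembles for the MERGE path can be previewed standalone. The column and key names below are made up, and the full statement additionally wraps the staged Parquet files in a `USING (...)` subquery, elided here:

```python
# Illustrative preview of the MERGE clauses built above; names are invented.
columns = ["id", "name", "value"]
match_by = ["id"]

cols_quoted = ", ".join(f"`{c}`" for c in columns)
on_condition = " AND ".join(f"T.`{k}` = S.`{k}`" for k in match_by)

update_cols = [c for c in columns if c not in match_by]
update_set = ", ".join(f"T.`{c}` = S.`{c}`" for c in update_cols)
update_clause = f"WHEN MATCHED THEN UPDATE SET {update_set}" if update_cols else ""
insert_clause = (
    f"WHEN NOT MATCHED THEN INSERT ({cols_quoted}) "
    f"VALUES ({', '.join(f'S.`{c}`' for c in columns)})"
)

print(on_condition)   # T.`id` = S.`id`
print(update_clause)  # WHEN MATCHED THEN UPDATE SET T.`name` = S.`name`, T.`value` = S.`value`
print(insert_clause)  # WHEN NOT MATCHED THEN INSERT (`id`, `name`, `value`) VALUES (S.`id`, S.`name`, S.`value`)
```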
@@ -596,13 +615,20 @@ FROM parquet.`{temp_volume_path}`"""
         mode: str = "auto",
         cast_options: Optional[CastOptions] = None,
         overwrite_schema: bool | None = None,
-        match_by: list[str] = None,
-        zorder_by: list[str] = None,
+        match_by: Optional[list[str]] = None,
+        zorder_by: Optional[list[str]] = None,
         optimize_after_merge: bool = False,
-        vacuum_hours: int | None = None,
+        vacuum_hours: int | None = None,
         spark_options: Optional[Dict[str, Any]] = None,
     ):
-        """Insert a Spark DataFrame into a Delta table
+        """Insert a Spark DataFrame into a Delta table (append/overwrite/merge).
+
+        Behavior:
+        - If the table does not exist: creates it via `saveAsTable(location)` (overwrite).
+        - If `match_by` is provided: uses Delta MERGE for upserts.
+          - If mode == "overwrite": deletes matching keys first, then appends the batch (fast-ish overwrite-by-key).
+          - Else: updates matching rows + inserts new ones.
+        - Else: uses `DataFrameWriter.saveAsTable` with mode.

         Args:
             data: Spark DataFrame to insert.
@@ -611,12 +637,12 @@ FROM parquet.`{temp_volume_path}`"""
             schema_name: Optional schema override.
             table_name: Optional table name override.
             mode: Insert mode ("auto", "append", "overwrite").
-            cast_options: Optional casting options.
-            overwrite_schema: Whether to overwrite schema.
-            match_by:
-            zorder_by:
-            optimize_after_merge: Whether to run
-            vacuum_hours: Optional VACUUM retention window.
+            cast_options: Optional casting options (align to destination schema).
+            overwrite_schema: Whether to overwrite schema on write (when supported).
+            match_by: Merge keys for upserts.
+            zorder_by: Z-ORDER columns (used only if `optimize_after_merge` is True).
+            optimize_after_merge: Whether to run Delta optimize (and z-order) after merge.
+            vacuum_hours: Optional VACUUM retention window in hours.
             spark_options: Optional Spark write options.

         Returns:
@@ -627,7 +653,15 @@ FROM parquet.`{temp_volume_path}`"""
             catalog_name=catalog_name,
             schema_name=schema_name,
             table_name=table_name,
-            safe_chars=True
+            safe_chars=True,
+        )
+
+        logger.info(
+            "Spark insert into %s (mode=%s, match_by=%s, overwrite_schema=%s)",
+            location,
+            mode,
+            match_by,
+            overwrite_schema,
         )

         spark_options = spark_options if spark_options else {}
@@ -636,11 +670,14 @@ FROM parquet.`{temp_volume_path}`"""

         try:
             existing_schema = self.get_table_schema(
-                catalog_name=catalog_name,
+                catalog_name=catalog_name,
+                schema_name=schema_name,
                 table_name=table_name,
-                to_arrow_schema=False
+                to_arrow_schema=False,
             )
+            logger.debug("Fetched destination Spark schema for %s", location)
         except ValueError:
+            logger.warning("Destination table missing; creating table %s via overwrite write", location)
             data = convert(data, pyspark.sql.DataFrame)
             data.write.mode("overwrite").options(**spark_options).saveAsTable(location)
             return
@@ -651,29 +688,27 @@ FROM parquet.`{temp_volume_path}`"""
         cast_options = CastOptions.check_arg(options=cast_options, target_field=existing_schema)
         data = cast_spark_dataframe(data, options=cast_options)

-
-        if match_by:
-            notnull: pyspark.sql.Column = None
+        logger.debug("Incoming Spark columns: %s", data.columns)

+        if match_by:
+            notnull = None
             for k in match_by:
                 if k not in data.columns:
                     raise ValueError(f"Missing match key '{k}' in DataFrame columns: {data.columns}")
-
-                notnull = data[k].isNotNull() if notnull is None else notnull & (data[k].isNotNull())
+                notnull = data[k].isNotNull() if notnull is None else notnull & data[k].isNotNull()

             data = data.filter(notnull)
+            logger.debug("Filtered null keys for match_by=%s", match_by)

-        # --- Merge (upsert) ---
         target = self.spark_table(full_name=location)

         if match_by:
-            # Build merge condition on the composite key
             cond = " AND ".join([f"t.`{k}` <=> s.`{k}`" for k in match_by])
+            logger.info("Running Delta MERGE (cond=%s)", cond)

             if mode.casefold() == "overwrite":
+                logger.info("Overwrite-by-key mode: delete matching keys then append")
                 data = data.cache()
-
-                # Step 1: get unique key combos from source
                 distinct_keys = data.select([f"`{k}`" for k in match_by]).distinct()

                 (
@@ -683,35 +718,30 @@ FROM parquet.`{temp_volume_path}`"""
                     .execute()
                 )

-
-                data.write.format("delta").mode("append").saveAsTable(location)
+                data.write.format("delta").mode("append").options(**spark_options).saveAsTable(location)
             else:
                 update_cols = [c for c in data.columns if c not in match_by]
-                set_expr = {
-                    c: F.expr(f"s.`{c}`") for c in update_cols
-                }
+                set_expr = {c: F.expr(f"s.`{c}`") for c in update_cols}

-                # Execute MERGE - update matching records first, then insert new ones
                 (
                     target.alias("t")
                     .merge(data.alias("s"), cond)
-                    .whenMatchedUpdate(set=set_expr)
-                    .whenNotMatchedInsertAll()
+                    .whenMatchedUpdate(set=set_expr)
+                    .whenNotMatchedInsertAll()
                     .execute()
                 )
         else:
             if mode == "auto":
                 mode = "append"
+            logger.info("Spark write saveAsTable mode=%s", mode)
             data.write.mode(mode).options(**spark_options).saveAsTable(location)

-        # --- Optimize: Z-ORDER for faster lookups by composite key (Databricks) ---
         if optimize_after_merge and zorder_by:
-
+            logger.info("Delta optimize + zorder (%s)", zorder_by)
             target.optimize().executeZOrderBy(*zorder_by)

-        # --- Optional VACUUM ---
         if vacuum_hours is not None:
-
+            logger.info("Delta vacuum retain=%s hours", vacuum_hours)
             target.vacuum(vacuum_hours)

     def get_table_schema(
@@ -719,24 +749,24 @@ FROM parquet.`{temp_volume_path}`"""
         catalog_name: Optional[str] = None,
         schema_name: Optional[str] = None,
         table_name: Optional[str] = None,
-        to_arrow_schema: bool = True
+        to_arrow_schema: bool = True,
     ) -> Union[pa.Field, pa.Schema]:
-        """Fetch a table schema from Unity Catalog
+        """Fetch a table schema from Unity Catalog and convert it to Arrow types.

         Args:
             catalog_name: Optional catalog override.
             schema_name: Optional schema override.
             table_name: Optional table name override.
-            to_arrow_schema:
+            to_arrow_schema: If True returns pa.Schema; else returns a pa.Field(STRUCT<...>).

         Returns:
-            Arrow Schema or Field representing the table.
+            Arrow Schema or a STRUCT Field representing the table.
         """
         full_name = self.table_full_name(
             catalog_name=catalog_name,
             schema_name=schema_name,
             table_name=table_name,
-            safe_chars=False
+            safe_chars=False,
         )

         wk = self.workspace.sdk()
@@ -746,10 +776,7 @@ FROM parquet.`{temp_volume_path}`"""
         except Exception as e:
             raise ValueError(f"Table %s not found, {type(e)} {e}" % full_name)

-        fields = [
-            column_info_to_arrow_field(_)
-            for _ in table.columns
-        ]
+        fields = [column_info_to_arrow_field(_) for _ in table.columns]

         if to_arrow_schema:
             return pa.schema(fields, metadata={b"name": table_name})
@@ -762,25 +789,15 @@ FROM parquet.`{temp_volume_path}`"""
         schema_name: Optional[str] = None,
         table_name: Optional[str] = None,
     ):
-        """Drop a table if it exists.
-
-        Args:
-            location: Fully qualified table name override.
-            catalog_name: Optional catalog override.
-            schema_name: Optional schema override.
-            table_name: Optional table name override.
-
-        Returns:
-            The StatementResult from executing the drop statement.
-        """
+        """Drop a table if it exists."""
         location, _, _, _ = self._check_location_params(
             location=location,
             catalog_name=catalog_name,
             schema_name=schema_name,
             table_name=table_name,
-            safe_chars=True
+            safe_chars=True,
         )
-
+        logger.info("Dropping table if exists: %s", location)
         return self.execute(f"DROP TABLE IF EXISTS {location}")

     def create_table(
@@ -797,23 +814,27 @@ FROM parquet.`{temp_volume_path}`"""
         if_not_exists: bool = True,
         optimize_write: bool = True,
         auto_compact: bool = True,
-        execute: bool = True
-    ) -> str:
-        """
-        Generate DDL (Data Definition Language) SQL for creating a table from a PyField schema.
+        execute: bool = True,
+    ) -> Union[str, "StatementResult"]:
+        """Generate (and optionally execute) CREATE TABLE DDL from an Arrow schema/field.

         Args:
-            field:
-
-
-
-
-
-
-
+            field: Arrow Field or Schema describing the table. If `field` is a schema, it's converted.
+            location: Fully qualified table name override.
+            table_name: Table name override (used if location not provided).
+            catalog_name: Catalog override.
+            schema_name: Schema override.
+            partition_by: Optional partition columns.
+            cluster_by: If True -> CLUSTER BY AUTO. If list[str] -> CLUSTER BY (..). If False -> no clustering.
+            comment: Optional table comment (falls back to field metadata b"comment" when present).
+            options: Extra table properties.
+            if_not_exists: Add IF NOT EXISTS clause.
+            optimize_write: Sets delta.autoOptimize.optimizeWrite table property.
+            auto_compact: Sets delta.autoOptimize.autoCompact table property.
+            execute: If True, executes DDL and returns StatementResult; otherwise returns SQL string.

         Returns:
-
+            StatementResult if execute=True, else the DDL SQL string.
         """
         if not isinstance(field, pa.Field):
             field = convert(field, pa.Field)
@@ -823,7 +844,7 @@ FROM parquet.`{temp_volume_path}`"""
             catalog_name=catalog_name,
             schema_name=schema_name,
             table_name=table_name,
-            safe_chars=True
+            safe_chars=True,
         )

         if pa.types.is_struct(field.type):
@@ -831,28 +852,22 @@ FROM parquet.`{temp_volume_path}`"""
         else:
             children = [field]

-
-        column_definitions = [
-            self._field_to_ddl(child)
-            for child in children
-        ]
+        column_definitions = [self._field_to_ddl(child) for child in children]

         sql = [
             f"CREATE TABLE {'IF NOT EXISTS ' if if_not_exists else ''}{location} (",
             ",\n ".join(column_definitions),
-            ")"
+            ")",
         ]

-
-        if partition_by and len(partition_by) > 0:
+        if partition_by:
             sql.append(f"\nPARTITIONED BY ({', '.join(partition_by)})")
         elif cluster_by:
             if isinstance(cluster_by, bool):
-                sql.append(
+                sql.append("\nCLUSTER BY AUTO")
             else:
                 sql.append(f"\nCLUSTER BY ({', '.join(cluster_by)})")

-        # Add comment if provided
         if not comment and field.metadata:
             comment = field.metadata.get(b"comment")

@@ -862,30 +877,29 @@ FROM parquet.`{temp_volume_path}`"""
         if comment:
             sql.append(f"\nCOMMENT '{comment}'")

-        # Add options if provided
         options = {} if options is None else options
         options.update({
             "delta.autoOptimize.optimizeWrite": optimize_write,
-            "delta.autoOptimize.autoCompact": auto_compact
+            "delta.autoOptimize.autoCompact": auto_compact,
         })

         option_strs = []
-
-
-
-
-
-
-
-                option_strs.append(f"'{key}' = '{b_value}'")
-            else:
-                option_strs.append(f"'{key}' = {value}")
+        for key, value in (options or {}).items():
+            if isinstance(value, str):
+                option_strs.append(f"'{key}' = '{value}'")
+            elif isinstance(value, bool):
+                option_strs.append(f"'{key}' = '{'true' if value else 'false'}'")
+            else:
+                option_strs.append(f"'{key}' = {value}")

         if option_strs:
             sql.append(f"\nTBLPROPERTIES ({', '.join(option_strs)})")

         statement = "\n".join(sql)

+        logger.info("Generated CREATE TABLE DDL for %s", location)
+        logger.debug("DDL:\n%s", statement)
+
         if execute:
             return self.execute(statement)
         return statement
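The rewritten options loop above serialises TBLPROPERTIES values by type: strings are quoted, booleans are lowered to 'true'/'false', and everything else is left as-is. A standalone sketch with made-up property values:

```python
# Standalone sketch of the TBLPROPERTIES serialisation shown above.
options = {
    "delta.autoOptimize.optimizeWrite": True,
    "delta.autoOptimize.autoCompact": False,
    "owner": "data-eng",
    "retention_days": 30,
}

option_strs = []
for key, value in options.items():
    if isinstance(value, str):
        option_strs.append(f"'{key}' = '{value}'")
    elif isinstance(value, bool):
        option_strs.append(f"'{key}' = '{'true' if value else 'false'}'")
    else:
        option_strs.append(f"'{key}' = {value}")

print(f"TBLPROPERTIES ({', '.join(option_strs)})")
# TBLPROPERTIES ('delta.autoOptimize.optimizeWrite' = 'true', ..., 'retention_days' = 30)
```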
@@ -896,28 +910,18 @@ FROM parquet.`{temp_volume_path}`"""
         catalog_name: Optional[str] = None,
         schema_name: Optional[str] = None,
         table_name: Optional[str] = None,
-        safe_chars: bool = True
-    ):
-        """Resolve location
-
-        Args:
-            location: Fully qualified table name override.
-            catalog_name: Optional catalog override.
-            schema_name: Optional schema override.
-            table_name: Optional table name override.
-            safe_chars: Whether to wrap identifiers in backticks.
-
-        Returns:
-            A tuple of (location, catalog_name, schema_name, table_name).
-        """
+        safe_chars: bool = True,
+    ) -> tuple[str, Optional[str], Optional[str], Optional[str]]:
+        """Resolve (location OR catalog/schema/table) into a fully-qualified name."""
         if location:
             c, s, t = self._catalog_schema_table_names(location)
             catalog_name, schema_name, table_name = catalog_name or c, schema_name or s, table_name or t

         location = self.table_full_name(
-            catalog_name=catalog_name,
+            catalog_name=catalog_name,
+            schema_name=schema_name,
             table_name=table_name,
-            safe_chars=safe_chars
+            safe_chars=safe_chars,
         )

         return location, catalog_name or self.catalog_name, schema_name or self.schema_name, table_name
@@ -927,98 +931,68 @@ FROM parquet.`{temp_volume_path}`"""
         field: pa.Field,
         put_name: bool = True,
         put_not_null: bool = True,
-        put_comment: bool = True
+        put_comment: bool = True,
     ) -> str:
-        """
-        Convert a PyField to a DDL column definition.
-
-        Args:
-            field: The PyField to convert
-
-        Returns:
-            A string containing the column definition in DDL format
-        """
+        """Convert an Arrow Field to a Databricks SQL column DDL fragment."""
         name = field.name
         nullable_str = " NOT NULL" if put_not_null and not field.nullable else ""
         name_str = f"{name} " if put_name else ""

-        # Get comment if available
         comment_str = ""
         if put_comment and field.metadata and b"comment" in field.metadata:
             comment = field.metadata[b"comment"].decode("utf-8")
             comment_str = f" COMMENT '{comment}'"

-        # Handle primitive types
         if not pa.types.is_nested(field.type):
             sql_type = SQLEngine._arrow_to_sql_type(field.type)
             return f"{name_str}{sql_type}{nullable_str}{comment_str}"

-        # Handle struct type
         if pa.types.is_struct(field.type):
             child_defs = [SQLEngine._field_to_ddl(child) for child in field.type]
             struct_body = ", ".join(child_defs)
             return f"{name_str}STRUCT<{struct_body}>{nullable_str}{comment_str}"

-        # Handle map type
         if pa.types.is_map(field.type):
             map_type: pa.MapType = field.type
             key_type = SQLEngine._field_to_ddl(map_type.key_field, put_name=False, put_comment=False, put_not_null=False)
             val_type = SQLEngine._field_to_ddl(map_type.item_field, put_name=False, put_comment=False, put_not_null=False)
             return f"{name_str}MAP<{key_type}, {val_type}>{nullable_str}{comment_str}"

-        # Handle list type after map
         if pa.types.is_list(field.type) or pa.types.is_large_list(field.type):
             list_type: pa.ListType = field.type
             elem_type = SQLEngine._field_to_ddl(list_type.value_field, put_name=False, put_comment=False, put_not_null=False)
             return f"{name_str}ARRAY<{elem_type}>{nullable_str}{comment_str}"

-        # Default fallback to string for unknown types
         raise TypeError(f"Cannot make ddl field from {field}")

     @staticmethod
-    def _arrow_to_sql_type(
-
-    ) -> str:
-        """
-        Convert an Arrow data type to SQL data type.
-
-        Args:
-            arrow_type: The Arrow data type
-
-        Returns:
-            A string containing the SQL data type
-        """
+    def _arrow_to_sql_type(arrow_type: Union[pa.DataType, pa.Decimal128Type]) -> str:
+        """Convert an Arrow data type to a Databricks SQL type string."""
         if pa.types.is_boolean(arrow_type):
             return "BOOLEAN"
-
+        if pa.types.is_int8(arrow_type):
             return "TINYINT"
-
+        if pa.types.is_int16(arrow_type):
             return "SMALLINT"
-
+        if pa.types.is_int32(arrow_type):
             return "INT"
-
+        if pa.types.is_int64(arrow_type):
             return "BIGINT"
-
+        if pa.types.is_float32(arrow_type):
             return "FLOAT"
-
+        if pa.types.is_float64(arrow_type):
             return "DOUBLE"
-
+        if is_arrow_type_string_like(arrow_type):
             return "STRING"
-
+        if is_arrow_type_binary_like(arrow_type):
             return "BINARY"
-
+        if pa.types.is_timestamp(arrow_type):
             tz = getattr(arrow_type, "tz", None)
-
-
-            return "TIMESTAMP"
-            return "TIMESTAMP_NTZ"
-        elif pa.types.is_date(arrow_type):
+            return "TIMESTAMP" if tz else "TIMESTAMP_NTZ"
+        if pa.types.is_date(arrow_type):
             return "DATE"
-
-        precision
-
-            return f"DECIMAL({precision}, {scale})"
-        elif pa.types.is_null(arrow_type):
+        if pa.types.is_decimal(arrow_type):
+            return f"DECIMAL({arrow_type.precision}, {arrow_type.scale})"
+        if pa.types.is_null(arrow_type):
             return "STRING"
-
-        raise ValueError(f"Cannot make ddl type for {arrow_type}")
+        raise ValueError(f"Cannot make ddl type for {arrow_type}")