ygg 0.1.33__py3-none-any.whl → 0.1.34__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,15 @@
1
- """Databricks SQL engine utilities and helpers."""
1
+ """Databricks SQL engine utilities and helpers.
2
+
3
+ This module provides a thin “do the right thing” layer over:
4
+ - Databricks SQL Statement Execution API (warehouse)
5
+ - Spark SQL / Delta Lake (when running inside a Spark-enabled context)
6
+
7
+ It includes helpers to:
8
+ - Build fully-qualified table names
9
+ - Execute SQL via Spark or Databricks SQL API
10
+ - Insert Arrow/Spark data into Delta tables (append/overwrite/merge)
11
+ - Generate DDL from Arrow schemas
12
+ """
2
13
 
3
14
  import dataclasses
4
15
  import logging
@@ -8,7 +19,6 @@ import time
8
19
  from typing import Optional, Union, Any, Dict, List, Literal
9
20
 
10
21
  import pyarrow as pa
11
- import pyarrow.parquet as pq
12
22
 
13
23
  from .statement_result import StatementResult
14
24
  from .types import column_info_to_arrow_field
@@ -28,7 +38,6 @@ except ImportError:
28
38
  @classmethod
29
39
  def forName(cls, *args, **kwargs):
30
40
  from delta.tables import DeltaTable
31
-
32
41
  return DeltaTable.forName(*args, **kwargs)
33
42
 
34
43
 
@@ -37,23 +46,18 @@ if databricks_sdk is not None:
37
46
  StatementResponse, Disposition, Format,
38
47
  ExecuteStatementRequestOnWaitTimeout, StatementParameterListItem
39
48
  )
40
-
41
49
  StatementResponse = StatementResponse
42
50
  else:
43
- class StatementResponse:
51
+ class StatementResponse: # pragma: no cover
44
52
  pass
45
53
 
46
54
 
47
55
  logger = logging.getLogger(__name__)
48
56
 
49
-
50
57
  if pyspark is not None:
51
58
  import pyspark.sql.functions as F
52
59
 
53
- __all__ = [
54
- "SQLEngine",
55
- "StatementResult"
56
- ]
60
+ __all__ = ["SQLEngine", "StatementResult"]
57
61
 
58
62
 
59
63
  class SqlExecutionError(RuntimeError):
@@ -62,7 +66,7 @@ class SqlExecutionError(RuntimeError):
62
66
 
63
67
  @dataclasses.dataclass
64
68
  class SQLEngine(WorkspaceService):
65
- """Execute SQL statements and manage tables via Databricks."""
69
+ """Execute SQL statements and manage tables via Databricks SQL / Spark."""
66
70
  warehouse_id: Optional[str] = None
67
71
  catalog_name: Optional[str] = None
68
72
  schema_name: Optional[str] = None
@@ -72,18 +76,18 @@ class SQLEngine(WorkspaceService):
72
76
  catalog_name: Optional[str] = None,
73
77
  schema_name: Optional[str] = None,
74
78
  table_name: Optional[str] = None,
75
- safe_chars: bool = True
76
- ):
77
- """Build a fully qualified table name for the current catalog/schema.
79
+ safe_chars: bool = True,
80
+ ) -> str:
81
+ """Build a fully qualified table name (catalog.schema.table).
78
82
 
79
83
  Args:
80
- catalog_name: Optional catalog override.
81
- schema_name: Optional schema override.
84
+ catalog_name: Optional catalog override (defaults to engine.catalog_name).
85
+ schema_name: Optional schema override (defaults to engine.schema_name).
82
86
  table_name: Table name to qualify.
83
- safe_chars: Whether to wrap identifiers in backticks.
87
+ safe_chars: Whether to wrap each identifier in backticks.
84
88
 
85
89
  Returns:
86
- The fully qualified table name.
90
+ Fully qualified table name string.
87
91
  """
88
92
  catalog_name = catalog_name or self.catalog_name
89
93
  schema_name = schema_name or self.schema_name
@@ -96,21 +100,23 @@ class SQLEngine(WorkspaceService):
96
100
  return f"`{catalog_name}`.`{schema_name}`.`{table_name}`"
97
101
  return f"{catalog_name}.{schema_name}.{table_name}"
98
102
 
99
- def _catalog_schema_table_names(
100
- self,
101
- full_name: str,
102
- ):
103
+ def _catalog_schema_table_names(self, full_name: str) -> tuple[Optional[str], Optional[str], Optional[str]]:
103
104
  """Parse a catalog.schema.table string into components.
104
105
 
106
+ Supports partial names:
107
+ - table
108
+ - schema.table
109
+ - catalog.schema.table
110
+
111
+ Backticks are stripped.
112
+
105
113
  Args:
106
- full_name: A fully qualified name or partial name.
114
+ full_name: Fully qualified or partial table name.
107
115
 
108
116
  Returns:
109
- A tuple of (catalog_name, schema_name, table_name).
117
+ Tuple of (catalog_name, schema_name, table_name).
110
118
  """
111
- parts = [
112
- _.strip("`") for _ in full_name.split(".")
113
- ]
119
+ parts = [_.strip("`") for _ in full_name.split(".")]
114
120
 
115
121
  if len(parts) == 0:
116
122
  return self.catalog_name, self.schema_name, None
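The two helpers changed in this hunk and the next define how table names are qualified and parsed. A minimal standalone sketch of the documented rules (illustration only, not the package's code; the real methods additionally fall back to the engine's configured catalog_name/schema_name):

```python
# Standalone re-creation of the naming rules described above: backtick quoting
# and partial-name parsing ("table", "schema.table", "catalog.schema.table").
from typing import Optional, Tuple

def full_name(catalog: str, schema: str, table: str, safe_chars: bool = True) -> str:
    # safe_chars=True wraps each identifier in backticks, as table_full_name does.
    if safe_chars:
        return f"`{catalog}`.`{schema}`.`{table}`"
    return f"{catalog}.{schema}.{table}"

def split_name(name: str) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    # Backticks are stripped; missing parts are returned as None.
    parts = [p.strip("`") for p in name.split(".")]
    if len(parts) == 1:
        return None, None, parts[0]
    if len(parts) == 2:
        return None, parts[0], parts[1]
    return parts[-3], parts[-2], parts[-1]

assert full_name("main", "analytics", "events") == "`main`.`analytics`.`events`"
assert split_name("`main`.`analytics`.`events`") == ("main", "analytics", "events")
assert split_name("analytics.events") == (None, "analytics", "events")
```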
@@ -122,20 +128,20 @@ class SQLEngine(WorkspaceService):
122
128
  catalog_name, schema_name, table_name = parts[-3], parts[-2], parts[-1]
123
129
  catalog_name = catalog_name or self.catalog_name
124
130
  schema_name = schema_name or self.schema_name
125
-
126
131
  return catalog_name, schema_name, table_name
127
132
 
128
- def _default_warehouse(
129
- self,
130
- cluster_size: str = "Small"
131
- ):
132
- """Return a default SQL warehouse matching the desired size.
133
+ def _default_warehouse(self, cluster_size: str = "Small"):
134
+ """Pick a default SQL warehouse (best-effort) matching the desired size.
133
135
 
134
136
  Args:
135
- cluster_size: Desired warehouse size filter.
137
+ cluster_size: Desired warehouse size (Databricks "cluster_size"), e.g. "Small".
138
+ If empty/None, returns the first warehouse encountered.
136
139
 
137
140
  Returns:
138
- The matched warehouse object.
141
+ Warehouse object.
142
+
143
+ Raises:
144
+ ValueError: If no warehouses exist in the workspace.
139
145
  """
140
146
  wk = self.workspace.sdk()
141
147
  existing = list(wk.warehouses.list())
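The fallback described in the `_default_warehouse` docstring (first warehouse whose `cluster_size` matches, otherwise the first one seen) can be sketched standalone; `Warehouse` below is a stand-in for the SDK warehouse object, not the real type:

```python
# Illustration of the best-effort selection loop used above.
from dataclasses import dataclass
from typing import Optional

@dataclass
class Warehouse:
    id: str
    cluster_size: Optional[str] = None

def pick_default(warehouses: list[Warehouse], cluster_size: str = "Small") -> Warehouse:
    first = None
    for wh in warehouses:
        if first is None:
            first = wh
        if cluster_size:
            if wh.cluster_size == cluster_size:
                return wh          # exact size match wins
        else:
            return wh              # no filter: take the first warehouse
    if first is not None:
        return first               # no match: fall back to the first one seen
    raise ValueError("No default warehouse found")

pool = [Warehouse("a", "2X-Large"), Warehouse("b", "Small")]
assert pick_default(pool).id == "b"            # size match
assert pick_default(pool, "Medium").id == "a"  # fallback to first
```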
@@ -146,48 +152,55 @@ class SQLEngine(WorkspaceService):
146
152
  first = warehouse
147
153
 
148
154
  if cluster_size:
149
- if warehouse.cluster_size == cluster_size:
155
+ if getattr(warehouse, "cluster_size", None) == cluster_size:
156
+ logger.debug("Default warehouse match found: id=%s cluster_size=%s", warehouse.id, warehouse.cluster_size)
150
157
  return warehouse
151
158
  else:
159
+ logger.debug("Default warehouse selected (first): id=%s", warehouse.id)
152
160
  return warehouse
153
161
 
154
162
  if first is not None:
163
+ logger.info(
164
+ "No warehouse matched cluster_size=%s; falling back to first warehouse id=%s cluster_size=%s",
165
+ cluster_size,
166
+ getattr(first, "id", None),
167
+ getattr(first, "cluster_size", None),
168
+ )
155
169
  return first
156
170
 
157
171
  raise ValueError(f"No default warehouse found in {wk.config.host}")
158
172
 
159
- def _get_or_default_warehouse_id(
160
- self,
161
- cluster_size = "Small"
162
- ):
163
- """Return the configured warehouse id or a default one.
173
+ def _get_or_default_warehouse_id(self, cluster_size: str = "Small") -> str:
174
+ """Return configured warehouse_id or resolve a default one.
164
175
 
165
176
  Args:
166
- cluster_size: Desired warehouse size filter.
177
+ cluster_size: Desired warehouse size filter used when resolving defaults.
167
178
 
168
179
  Returns:
169
- The warehouse id string.
180
+ Warehouse id string.
170
181
  """
171
182
  if not self.warehouse_id:
172
183
  dft = self._default_warehouse(cluster_size=cluster_size)
173
-
174
184
  self.warehouse_id = dft.id
185
+ logger.info("Resolved default warehouse_id=%s (cluster_size=%s)", self.warehouse_id, cluster_size)
186
+
175
187
  return self.warehouse_id
176
188
 
177
189
  @staticmethod
178
190
  def _random_suffix(prefix: str = "") -> str:
179
- """Generate a unique suffix for temporary resources.
180
-
181
- Args:
182
- prefix: Optional prefix to prepend.
183
-
184
- Returns:
185
- A unique suffix string.
186
- """
187
- unique = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(8))
191
+ """Generate a unique suffix for temporary resources."""
192
+ unique = "".join(random.choice(string.ascii_lowercase + string.digits) for _ in range(8))
188
193
  timestamp = int(time.time() * 1000)
189
194
  return f"{prefix}{timestamp}_{unique}"
190
195
 
196
+ @staticmethod
197
+ def _sql_preview(sql: str, limit: int = 220) -> str:
198
+ """Short, single-line preview for logs (avoids spewing giant SQL)."""
199
+ if not sql:
200
+ return ""
201
+ one_line = " ".join(sql.split())
202
+ return one_line[:limit] + ("…" if len(one_line) > limit else "")
203
+
191
204
  def execute(
192
205
  self,
193
206
  statement: Optional[str] = None,
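`_random_suffix` and the new `_sql_preview` helper are pure functions, so their behavior is easy to check outside the class; a standalone copy for illustration:

```python
# Standalone copies of the two static helpers shown above.
import random
import string
import time

def random_suffix(prefix: str = "") -> str:
    unique = "".join(random.choice(string.ascii_lowercase + string.digits) for _ in range(8))
    return f"{prefix}{int(time.time() * 1000)}_{unique}"

def sql_preview(sql: str, limit: int = 220) -> str:
    if not sql:
        return ""
    one_line = " ".join(sql.split())   # collapse newlines and runs of whitespace
    return one_line[:limit] + ("…" if len(one_line) > limit else "")

print(random_suffix("tmp_"))                    # e.g. tmp_1717171717171_ab12cd34
print(sql_preview("SELECT *\n  FROM   t", 10))  # "SELECT * F…"
```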
@@ -207,54 +220,64 @@ class SQLEngine(WorkspaceService):
207
220
  wait_result: bool = True,
208
221
  **kwargs,
209
222
  ) -> "StatementResult":
210
- """
211
- Execute a SQL statement on a SQL warehouse.
223
+ """Execute a SQL statement via Spark or Databricks SQL Statement Execution API.
224
+
225
+ Engine resolution:
226
+ - If `engine` is not provided and a Spark session is active -> uses Spark.
227
+ - Otherwise uses Databricks SQL API (warehouse).
212
228
 
213
- - If wait=True (default): poll until terminal state.
214
- - On SUCCEEDED: return final statement object
215
- - On FAILED / CANCELED: raise SqlExecutionError
216
- - If wait=False: return initial execution handle without polling.
229
+ Waiting behavior (`wait_result`):
230
+ - If True (default): returns a StatementResult in terminal state (SUCCEEDED/FAILED/CANCELED).
231
+ - If False: returns immediately with the initial handle (caller can `.wait()` later).
217
232
 
218
233
  Args:
219
- statement: SQL statement to execute. If omitted, selects from the table.
220
- engine: Execution engine ("spark" or "api").
221
- warehouse_id: Optional warehouse id override.
234
+ statement: SQL statement to execute. If None, a `SELECT *` is generated from the table params.
235
+ engine: "spark" or "api".
236
+ warehouse_id: Warehouse override (for API engine).
222
237
  byte_limit: Optional byte limit for results.
223
- disposition: Result disposition mode.
224
- format: Result format for Databricks SQL API.
225
- on_wait_timeout: Timeout behavior for waiting.
226
- parameters: Optional statement parameters.
227
- row_limit: Optional row limit.
228
- wait_timeout: Optional API wait timeout.
229
- catalog_name: Optional catalog override.
230
- schema_name: Optional schema override.
231
- table_name: Optional table name override.
232
- wait_result: Whether to block until completion.
233
- **kwargs: Additional API parameters.
238
+ disposition: Result disposition mode (API engine).
239
+ format: Result format (API engine).
240
+ on_wait_timeout: Timeout behavior for waiting (API engine).
241
+ parameters: Optional statement parameters (API engine).
242
+ row_limit: Optional row limit for results (API engine).
243
+ wait_timeout: API wait timeout value.
244
+ catalog_name: Optional catalog override for API engine.
245
+ schema_name: Optional schema override for API engine.
246
+ table_name: Optional table override used when `statement` is None.
247
+ wait_result: Whether to block until completion (API engine).
248
+ **kwargs: Extra keyword arguments; note these are no longer forwarded to execute_statement (the forwarding was removed in this release).
234
249
 
235
250
  Returns:
236
- A StatementResult wrapper for the execution.
251
+ StatementResult.
237
252
  """
253
+ # --- Engine auto-detection ---
238
254
  if not engine:
239
255
  if pyspark is not None:
240
256
  spark_session = SparkSession.getActiveSession()
241
-
242
257
  if spark_session is not None:
243
258
  engine = "spark"
244
259
 
260
+ # --- Spark path ---
245
261
  if engine == "spark":
246
262
  spark_session = SparkSession.getActiveSession()
247
-
248
263
  if spark_session is None:
249
264
  raise ValueError("No spark session found to run sql query")
250
265
 
266
+ t0 = time.time()
267
+ df = spark_session.sql(statement)
268
+ logger.info("Spark SQL executed in %.3fs: %s", time.time() - t0, self._sql_preview(statement))
269
+
270
+ # Avoid Disposition dependency if SDK imports are absent
271
+ spark_disp = disposition if disposition is not None else getattr(globals().get("Disposition", object), "EXTERNAL_LINKS", None)
272
+
251
273
  return StatementResult(
252
274
  engine=self,
253
275
  statement_id="sparksql",
254
- disposition=Disposition.EXTERNAL_LINKS,
255
- _spark_df=spark_session.sql(statement)
276
+ disposition=spark_disp,
277
+ _spark_df=df,
256
278
  )
257
279
 
280
+ # --- API path defaults ---
258
281
  if format is None:
259
282
  format = Format.ARROW_STREAM
260
283
 
@@ -264,6 +287,7 @@ class SQLEngine(WorkspaceService):
264
287
  if not statement:
265
288
  full_name = self.table_full_name(catalog_name=catalog_name, schema_name=schema_name, table_name=table_name)
266
289
  statement = f"SELECT * FROM {full_name}"
290
+ logger.debug("Autogenerated statement: %s", self._sql_preview(statement))
267
291
 
268
292
  if not warehouse_id:
269
293
  warehouse_id = self._get_or_default_warehouse_id()
@@ -280,7 +304,6 @@ class SQLEngine(WorkspaceService):
280
304
  wait_timeout=wait_timeout,
281
305
  catalog=catalog_name or self.catalog_name,
282
306
  schema=schema_name or self.schema_name,
283
- **kwargs,
284
307
  )
285
308
 
286
309
  execution = StatementResult(
@@ -288,10 +311,11 @@ class SQLEngine(WorkspaceService):
288
311
  statement_id=response.statement_id,
289
312
  _response=response,
290
313
  _response_refresh_time=time.time(),
291
- disposition=disposition
314
+ disposition=disposition,
292
315
  )
293
316
 
294
- return execution.wait() if wait_result else wait_result
317
+ # BUGFIX: previously returned `wait_result` (a bool) on wait_result=False 🤦
318
+ return execution.wait() if wait_result else execution
295
319
 
296
320
  def spark_table(
297
321
  self,
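The return-value fix above changes what callers get when they opt out of blocking. A hypothetical call pattern (SQLEngine construction and import path are not shown in this diff, so the engine is passed in as a parameter):

```python
# Hypothetical sketch; assumes an already-configured SQLEngine instance.
def fire_and_collect_later(engine: "SQLEngine"):
    # With wait_result=False, 0.1.34 returns the StatementResult handle
    # (0.1.33 returned the boolean False by mistake).
    handle = engine.execute("SELECT 1", engine="api", wait_result=False)
    # ... do other work ...
    return handle.wait()  # block only when the result is actually needed
```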
@@ -300,35 +324,21 @@ class SQLEngine(WorkspaceService):
300
324
  schema_name: Optional[str] = None,
301
325
  table_name: Optional[str] = None,
302
326
  ):
303
- """Return a DeltaTable handle for a given table name.
304
-
305
- Args:
306
- full_name: Fully qualified table name.
307
- catalog_name: Optional catalog override.
308
- schema_name: Optional schema override.
309
- table_name: Optional table name override.
310
-
311
- Returns:
312
- A Spark DeltaTable handle.
313
- """
327
+ """Return a DeltaTable handle for a given table name (Spark context required)."""
314
328
  if not full_name:
315
329
  full_name = self.table_full_name(
316
330
  catalog_name=catalog_name,
317
331
  schema_name=schema_name,
318
- table_name=table_name
332
+ table_name=table_name,
319
333
  )
320
-
321
334
  return SparkDeltaTable.forName(
322
335
  sparkSession=SparkSession.getActiveSession(),
323
- tableOrViewName=full_name
336
+ tableOrViewName=full_name,
324
337
  )
325
338
 
326
339
  def insert_into(
327
340
  self,
328
- data: Union[
329
- pa.Table, pa.RecordBatch, pa.RecordBatchReader,
330
- SparkDataFrame
331
- ],
341
+ data: Union[pa.Table, pa.RecordBatch, pa.RecordBatchReader, SparkDataFrame],
332
342
  location: Optional[str] = None,
333
343
  catalog_name: Optional[str] = None,
334
344
  schema_name: Optional[str] = None,
@@ -336,14 +346,18 @@ class SQLEngine(WorkspaceService):
336
346
  mode: str = "auto",
337
347
  cast_options: Optional[CastOptions] = None,
338
348
  overwrite_schema: bool | None = None,
339
- match_by: list[str] = None,
340
- zorder_by: list[str] = None,
349
+ match_by: Optional[list[str]] = None,
350
+ zorder_by: Optional[list[str]] = None,
341
351
  optimize_after_merge: bool = False,
342
- vacuum_hours: int | None = None, # e.g., 168 for 7 days
352
+ vacuum_hours: int | None = None,
343
353
  spark_session: Optional[SparkSession] = None,
344
- spark_options: Optional[Dict[str, Any]] = None
354
+ spark_options: Optional[Dict[str, Any]] = None,
345
355
  ):
346
- """Insert data into a table using Spark or Arrow paths.
356
+ """Insert data into a Delta table using Spark when available; otherwise stage Arrow.
357
+
358
+ Strategy:
359
+ - If Spark is available and we have an active session (or Spark DF input) -> use `spark_insert_into`.
360
+ - Otherwise -> use `arrow_insert_into` (stages Parquet to a temp volume + runs SQL INSERT/MERGE).
347
361
 
348
362
  Args:
349
363
  data: Arrow or Spark data to insert.
@@ -353,18 +367,18 @@ class SQLEngine(WorkspaceService):
353
367
  table_name: Optional table name override.
354
368
  mode: Insert mode ("auto", "append", "overwrite").
355
369
  cast_options: Optional casting options.
356
- overwrite_schema: Whether to overwrite schema (Spark).
357
- match_by: Optional merge keys for upserts.
358
- zorder_by: Optional Z-ORDER columns.
359
- optimize_after_merge: Whether to run OPTIMIZE after merge.
370
+ overwrite_schema: Whether to overwrite schema (Spark path).
371
+ match_by: Merge keys for upserts (MERGE semantics). When set, mode affects behavior.
372
+ zorder_by: Z-ORDER columns (SQL path uses OPTIMIZE ZORDER; Spark path uses Delta optimize API).
373
+ optimize_after_merge: Whether to run OPTIMIZE after a merge (SQL path) / after merge+zorder (Spark path).
360
374
  vacuum_hours: Optional VACUUM retention window.
361
375
  spark_session: Optional SparkSession override.
362
376
  spark_options: Optional Spark write options.
363
377
 
364
378
  Returns:
365
- None for Arrow inserts, or the Spark insert result.
379
+ None (mutates the destination table).
366
380
  """
367
- # -------- existing logic you provided (kept intact) ----------
381
+
368
382
  if pyspark is not None:
369
383
  spark_session = SparkSession.getActiveSession() if spark_session is None else spark_session
370
384
 
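A hypothetical caller of the dispatch described above (assumes an already-configured SQLEngine; outside a Spark context the call takes the Arrow staging path):

```python
# Hypothetical usage sketch; table and column names are illustrative.
import pyarrow as pa

def upsert_events(engine: "SQLEngine") -> None:
    batch = pa.table({"id": [1, 2], "value": ["a", "b"]})
    # Without an active Spark session this goes through arrow_insert_into:
    # the batch is staged as Parquet in a temp volume and MERGEd on `id`.
    engine.insert_into(batch, table_name="events", match_by=["id"])
```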
@@ -382,7 +396,7 @@ class SQLEngine(WorkspaceService):
382
396
  zorder_by=zorder_by,
383
397
  optimize_after_merge=optimize_after_merge,
384
398
  vacuum_hours=vacuum_hours,
385
- spark_options=spark_options
399
+ spark_options=spark_options,
386
400
  )
387
401
 
388
402
  return self.arrow_insert_into(
@@ -402,9 +416,7 @@ class SQLEngine(WorkspaceService):
402
416
 
403
417
  def arrow_insert_into(
404
418
  self,
405
- data: Union[
406
- pa.Table, pa.RecordBatch, pa.RecordBatchReader,
407
- ],
419
+ data: Union[pa.Table, pa.RecordBatch, pa.RecordBatchReader],
408
420
  location: Optional[str] = None,
409
421
  catalog_name: Optional[str] = None,
410
422
  schema_name: Optional[str] = None,
@@ -412,14 +424,19 @@ class SQLEngine(WorkspaceService):
412
424
  mode: str = "auto",
413
425
  cast_options: Optional[CastOptions] = None,
414
426
  overwrite_schema: bool | None = None,
415
- match_by: list[str] = None,
416
- zorder_by: list[str] = None,
427
+ match_by: Optional[list[str]] = None,
428
+ zorder_by: Optional[list[str]] = None,
417
429
  optimize_after_merge: bool = False,
418
- vacuum_hours: int | None = None, # e.g., 168 for 7 days
430
+ vacuum_hours: int | None = None,
419
431
  existing_schema: pa.Schema | None = None,
420
- temp_volume_path: Optional[Union[str, DatabricksPath]] = None
432
+ temp_volume_path: Optional[Union[str, DatabricksPath]] = None,
421
433
  ):
422
- """Insert Arrow data by staging to a temp volume and running SQL.
434
+ """Insert Arrow data by staging Parquet to a temp volume and running Databricks SQL.
435
+
436
+ Notes:
437
+ - If the table does not exist, it is created from the input Arrow schema (best-effort).
438
+ - If `match_by` is provided, uses MERGE INTO (upsert).
439
+ - Otherwise uses INSERT INTO / INSERT OVERWRITE depending on mode.
423
440
 
424
441
  Args:
425
442
  data: Arrow table/batch data to insert.
@@ -427,14 +444,14 @@ class SQLEngine(WorkspaceService):
427
444
  catalog_name: Optional catalog override.
428
445
  schema_name: Optional schema override.
429
446
  table_name: Optional table name override.
430
- mode: Insert mode ("auto", "append", "overwrite").
447
+ mode: Insert mode ("auto", "append", "overwrite"). ("auto" behaves like append here.)
431
448
  cast_options: Optional casting options.
432
- overwrite_schema: Whether to overwrite schema.
433
- match_by: Optional merge keys for upserts.
434
- zorder_by: Optional Z-ORDER columns.
435
- optimize_after_merge: Whether to run OPTIMIZE after merge.
436
- vacuum_hours: Optional VACUUM retention window.
437
- existing_schema: Optional pre-fetched schema.
449
+ overwrite_schema: Reserved for parity with Spark path (unused here).
450
+ match_by: Merge keys for MERGE INTO upserts.
451
+ zorder_by: Columns for OPTIMIZE ZORDER BY.
452
+ optimize_after_merge: Run OPTIMIZE after MERGE (in addition to ZORDER optimization).
453
+ vacuum_hours: Optional VACUUM retention window in hours.
454
+ existing_schema: Optional pre-fetched destination schema (Arrow).
438
455
  temp_volume_path: Optional temp volume path override.
439
456
 
440
457
  Returns:
@@ -445,7 +462,15 @@ class SQLEngine(WorkspaceService):
445
462
  catalog_name=catalog_name,
446
463
  schema_name=schema_name,
447
464
  table_name=table_name,
448
- safe_chars=True
465
+ safe_chars=True,
466
+ )
467
+
468
+ logger.info(
469
+ "Arrow insert into %s (mode=%s, match_by=%s, zorder_by=%s)",
470
+ location,
471
+ mode,
472
+ match_by,
473
+ zorder_by,
449
474
  )
450
475
 
451
476
  with self as connected:
@@ -455,16 +480,17 @@ class SQLEngine(WorkspaceService):
455
480
  catalog_name=catalog_name,
456
481
  schema_name=schema_name,
457
482
  table_name=table_name,
458
- to_arrow_schema=True
483
+ to_arrow_schema=True,
459
484
  )
485
+ logger.debug("Fetched existing schema for %s (columns=%d)", location, len(existing_schema.names))
460
486
  except ValueError as exc:
461
- data = convert(data, pa.Table)
462
- existing_schema = data.schema
487
+ data_tbl = convert(data, pa.Table)
488
+ existing_schema = data_tbl.schema
463
489
  logger.warning(
464
- "Table %s not found, %s, creating it based on input data %s",
490
+ "Table %s not found (%s). Creating it from input schema (columns=%s)",
465
491
  location,
466
492
  exc,
467
- existing_schema.names
493
+ existing_schema.names,
468
494
  )
469
495
 
470
496
  connected.create_table(
@@ -472,12 +498,12 @@ class SQLEngine(WorkspaceService):
472
498
  catalog_name=catalog_name,
473
499
  schema_name=schema_name,
474
500
  table_name=table_name,
475
- if_not_exists=True
501
+ if_not_exists=True,
476
502
  )
477
503
 
478
504
  try:
479
505
  return connected.arrow_insert_into(
480
- data=data,
506
+ data=data_tbl,
481
507
  location=location,
482
508
  catalog_name=catalog_name,
483
509
  schema_name=schema_name,
@@ -489,54 +515,50 @@ class SQLEngine(WorkspaceService):
489
515
  zorder_by=zorder_by,
490
516
  optimize_after_merge=optimize_after_merge,
491
517
  vacuum_hours=vacuum_hours,
492
- existing_schema=existing_schema
518
+ existing_schema=existing_schema,
493
519
  )
494
- except:
520
+ except Exception:
521
+ logger.exception("Arrow insert failed after auto-creating %s; attempting cleanup (DROP TABLE)", location)
495
522
  try:
496
523
  connected.drop_table(location=location)
497
- except Exception as e:
498
- logger.warning("Failed to drop table %s after auto creation on error: %s", location, e)
524
+ except Exception:
525
+ logger.exception("Failed to drop table %s after auto creation error", location)
499
526
  raise
500
527
 
501
528
  transaction_id = self._random_suffix()
502
529
 
503
- data = convert(data, pa.Table, options=cast_options, target_field=existing_schema)
530
+ data_tbl = convert(data, pa.Table, options=cast_options, target_field=existing_schema)
504
531
 
505
532
  # Write in temp volume
506
533
  temp_volume_path = connected.dbfs_path(
507
534
  kind=DatabricksPathKind.VOLUME,
508
- parts=[catalog_name, schema_name, "tmp", "sql", transaction_id]
535
+ parts=[catalog_name, schema_name, "tmp", "sql", transaction_id],
509
536
  ) if temp_volume_path is None else DatabricksPath.parse(obj=temp_volume_path, workspace=connected.workspace)
510
537
 
538
+ logger.debug("Staging Parquet to temp volume: %s", temp_volume_path)
511
539
  temp_volume_path.mkdir()
540
+ temp_volume_path.write_arrow_table(data_tbl)
512
541
 
513
- temp_volume_path.write_arrow_table(data)
514
-
515
- # get column list from arrow schema
516
- columns = [c for c in existing_schema.names]
542
+ columns = list(existing_schema.names)
517
543
  cols_quoted = ", ".join([f"`{c}`" for c in columns])
518
544
 
519
- statements = []
545
+ statements: list[str] = []
520
546
 
521
- # Decide how to ingest
522
- # If merge keys provided -> use MERGE
523
547
  if match_by:
524
- # build ON condition using match_by
525
- on_clauses = []
526
- for k in match_by:
527
- on_clauses.append(f"T.`{k}` = S.`{k}`")
528
- on_condition = " AND ".join(on_clauses)
548
+ logger.info("Using MERGE INTO (match_by=%s)", match_by)
549
+ on_condition = " AND ".join([f"T.`{k}` = S.`{k}`" for k in match_by])
529
550
 
530
- # build UPDATE set (all columns except match_by)
531
551
  update_cols = [c for c in columns if c not in match_by]
532
552
  if update_cols:
533
553
  update_set = ", ".join([f"T.`{c}` = S.`{c}`" for c in update_cols])
534
554
  update_clause = f"WHEN MATCHED THEN UPDATE SET {update_set}"
535
555
  else:
536
- update_clause = "" # nothing to update
556
+ update_clause = ""
537
557
 
538
- # build INSERT clause
539
- insert_clause = f"WHEN NOT MATCHED THEN INSERT ({cols_quoted}) VALUES ({', '.join([f'S.`{c}`' for c in columns])})"
558
+ insert_clause = (
559
+ f"WHEN NOT MATCHED THEN INSERT ({cols_quoted}) "
560
+ f"VALUES ({', '.join([f'S.`{c}`' for c in columns])})"
561
+ )
540
562
 
541
563
  merge_sql = f"""MERGE INTO {location} AS T
542
564
  USING (
@@ -546,41 +568,38 @@ ON {on_condition}
546
568
  {update_clause}
547
569
  {insert_clause}"""
548
570
  statements.append(merge_sql)
549
-
550
571
  else:
551
- # No match_by -> plain insert
552
572
  if mode.lower() in ("overwrite",):
553
573
  insert_sql = f"""INSERT OVERWRITE {location}
554
574
  SELECT {cols_quoted}
555
575
  FROM parquet.`{temp_volume_path}`"""
556
576
  else:
557
- # default: append
558
577
  insert_sql = f"""INSERT INTO {location} ({cols_quoted})
559
578
  SELECT {cols_quoted}
560
579
  FROM parquet.`{temp_volume_path}`"""
561
580
  statements.append(insert_sql)
562
581
 
563
- # Execute statements (use your existing execute helper)
564
582
  try:
565
583
  for stmt in statements:
566
- # trim and run
567
584
  connected.execute(stmt.strip())
568
585
  finally:
569
586
  try:
570
587
  temp_volume_path.rmdir(recursive=True)
571
- except Exception as e:
572
- logger.warning(e)
588
+ except Exception:
589
+ logger.exception("Failed cleaning temp volume: %s", temp_volume_path)
573
590
 
574
- # Optionally run OPTIMIZE / ZORDER / VACUUM if requested (Databricks SQL)
575
591
  if zorder_by:
576
592
  zcols = ", ".join([f"`{c}`" for c in zorder_by])
577
593
  optimize_sql = f"OPTIMIZE {location} ZORDER BY ({zcols})"
594
+ logger.info("Running OPTIMIZE ZORDER BY: %s", zorder_by)
578
595
  connected.execute(optimize_sql)
579
596
 
580
597
  if optimize_after_merge and match_by:
598
+ logger.info("Running OPTIMIZE after MERGE")
581
599
  connected.execute(f"OPTIMIZE {location}")
582
600
 
583
601
  if vacuum_hours is not None:
602
+ logger.info("Running VACUUM retain=%s hours", vacuum_hours)
584
603
  connected.execute(f"VACUUM {location} RETAIN {vacuum_hours} HOURS")
585
604
 
586
605
  return None
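The statements assembled above can be previewed standalone. This sketch rebuilds the MERGE text from a column list and merge keys; the USING body is reconstructed from the surrounding INSERT pattern, and the location and staging paths are illustrative:

```python
# Illustration of the MERGE INTO statement arrow_insert_into builds from staged Parquet.
def build_merge(location: str, staged: str, columns: list[str], match_by: list[str]) -> str:
    cols_quoted = ", ".join(f"`{c}`" for c in columns)
    on_condition = " AND ".join(f"T.`{k}` = S.`{k}`" for k in match_by)
    update_cols = [c for c in columns if c not in match_by]
    update_clause = (
        "WHEN MATCHED THEN UPDATE SET " + ", ".join(f"T.`{c}` = S.`{c}`" for c in update_cols)
        if update_cols
        else ""  # nothing to update when every column is a key
    )
    insert_clause = (
        f"WHEN NOT MATCHED THEN INSERT ({cols_quoted}) "
        f"VALUES ({', '.join(f'S.`{c}`' for c in columns)})"
    )
    return (
        f"MERGE INTO {location} AS T\n"
        f"USING (\n  SELECT {cols_quoted} FROM parquet.`{staged}`\n) AS S\n"
        f"ON {on_condition}\n"
        f"{update_clause}\n"
        f"{insert_clause}"
    )

print(build_merge("`main`.`analytics`.`events`",
                  "/Volumes/main/analytics/tmp/sql/123_abc",
                  columns=["id", "value"], match_by=["id"]))
```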
@@ -596,13 +615,20 @@ FROM parquet.`{temp_volume_path}`"""
596
615
  mode: str = "auto",
597
616
  cast_options: Optional[CastOptions] = None,
598
617
  overwrite_schema: bool | None = None,
599
- match_by: list[str] = None,
600
- zorder_by: list[str] = None,
618
+ match_by: Optional[list[str]] = None,
619
+ zorder_by: Optional[list[str]] = None,
601
620
  optimize_after_merge: bool = False,
602
- vacuum_hours: int | None = None, # e.g., 168 for 7 days
621
+ vacuum_hours: int | None = None,
603
622
  spark_options: Optional[Dict[str, Any]] = None,
604
623
  ):
605
- """Insert a Spark DataFrame into a Delta table with optional merge semantics.
624
+ """Insert a Spark DataFrame into a Delta table (append/overwrite/merge).
625
+
626
+ Behavior:
627
+ - If the table does not exist: creates it via `saveAsTable(location)` (overwrite).
628
+ - If `match_by` is provided: uses Delta MERGE for upserts.
629
+ - If mode == "overwrite": deletes matching keys first, then appends the batch (fast-ish overwrite-by-key).
630
+ - Else: updates matching rows + inserts new ones.
631
+ - Else: uses `DataFrameWriter.saveAsTable` with mode.
606
632
 
607
633
  Args:
608
634
  data: Spark DataFrame to insert.
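For the Spark path, the overwrite-by-key behavior listed above amounts to a delete-then-append against the Delta table. A hedged sketch using the open-source Delta Lake API (table and key names are illustrative; requires an active Spark session with Delta enabled):

```python
from delta.tables import DeltaTable
from pyspark.sql import DataFrame, SparkSession

def overwrite_by_key(spark: SparkSession, batch: DataFrame, table: str, keys: list[str]) -> None:
    target = DeltaTable.forName(spark, table)
    cond = " AND ".join(f"t.`{k}` <=> s.`{k}`" for k in keys)  # null-safe key equality
    batch = batch.cache()
    # 1) delete every row whose key combination appears in the incoming batch
    distinct_keys = batch.select(*[f"`{k}`" for k in keys]).distinct()
    target.alias("t").merge(distinct_keys.alias("s"), cond).whenMatchedDelete().execute()
    # 2) append the clean batch
    batch.write.format("delta").mode("append").saveAsTable(table)
```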
@@ -611,12 +637,12 @@ FROM parquet.`{temp_volume_path}`"""
611
637
  schema_name: Optional schema override.
612
638
  table_name: Optional table name override.
613
639
  mode: Insert mode ("auto", "append", "overwrite").
614
- cast_options: Optional casting options.
615
- overwrite_schema: Whether to overwrite schema.
616
- match_by: Optional merge keys for upserts.
617
- zorder_by: Optional Z-ORDER columns.
618
- optimize_after_merge: Whether to run OPTIMIZE after merge.
619
- vacuum_hours: Optional VACUUM retention window.
640
+ cast_options: Optional casting options (align to destination schema).
641
+ overwrite_schema: Whether to overwrite schema on write (when supported).
642
+ match_by: Merge keys for upserts.
643
+ zorder_by: Z-ORDER columns (used only if `optimize_after_merge` is True).
644
+ optimize_after_merge: Whether to run Delta optimize (and z-order) after merge.
645
+ vacuum_hours: Optional VACUUM retention window in hours.
620
646
  spark_options: Optional Spark write options.
621
647
 
622
648
  Returns:
@@ -627,7 +653,15 @@ FROM parquet.`{temp_volume_path}`"""
627
653
  catalog_name=catalog_name,
628
654
  schema_name=schema_name,
629
655
  table_name=table_name,
630
- safe_chars=True
656
+ safe_chars=True,
657
+ )
658
+
659
+ logger.info(
660
+ "Spark insert into %s (mode=%s, match_by=%s, overwrite_schema=%s)",
661
+ location,
662
+ mode,
663
+ match_by,
664
+ overwrite_schema,
631
665
  )
632
666
 
633
667
  spark_options = spark_options if spark_options else {}
@@ -636,11 +670,14 @@ FROM parquet.`{temp_volume_path}`"""
636
670
 
637
671
  try:
638
672
  existing_schema = self.get_table_schema(
639
- catalog_name=catalog_name, schema_name=schema_name,
673
+ catalog_name=catalog_name,
674
+ schema_name=schema_name,
640
675
  table_name=table_name,
641
- to_arrow_schema=False
676
+ to_arrow_schema=False,
642
677
  )
678
+ logger.debug("Fetched destination Spark schema for %s", location)
643
679
  except ValueError:
680
+ logger.warning("Destination table missing; creating table %s via overwrite write", location)
644
681
  data = convert(data, pyspark.sql.DataFrame)
645
682
  data.write.mode("overwrite").options(**spark_options).saveAsTable(location)
646
683
  return
@@ -651,29 +688,27 @@ FROM parquet.`{temp_volume_path}`"""
651
688
  cast_options = CastOptions.check_arg(options=cast_options, target_field=existing_schema)
652
689
  data = cast_spark_dataframe(data, options=cast_options)
653
690
 
654
- # --- Sanity checks & pre-cleaning (avoid nulls in keys) ---
655
- if match_by:
656
- notnull: pyspark.sql.Column = None
691
+ logger.debug("Incoming Spark columns: %s", data.columns)
657
692
 
693
+ if match_by:
694
+ notnull = None
658
695
  for k in match_by:
659
696
  if k not in data.columns:
660
697
  raise ValueError(f"Missing match key '{k}' in DataFrame columns: {data.columns}")
661
-
662
- notnull = data[k].isNotNull() if notnull is None else notnull & (data[k].isNotNull())
698
+ notnull = data[k].isNotNull() if notnull is None else notnull & data[k].isNotNull()
663
699
 
664
700
  data = data.filter(notnull)
701
+ logger.debug("Filtered null keys for match_by=%s", match_by)
665
702
 
666
- # --- Merge (upsert) ---
667
703
  target = self.spark_table(full_name=location)
668
704
 
669
705
  if match_by:
670
- # Build merge condition on the composite key
671
706
  cond = " AND ".join([f"t.`{k}` <=> s.`{k}`" for k in match_by])
707
+ logger.info("Running Delta MERGE (cond=%s)", cond)
672
708
 
673
709
  if mode.casefold() == "overwrite":
710
+ logger.info("Overwrite-by-key mode: delete matching keys then append")
674
711
  data = data.cache()
675
-
676
- # Step 1: get unique key combos from source
677
712
  distinct_keys = data.select([f"`{k}`" for k in match_by]).distinct()
678
713
 
679
714
  (
@@ -683,35 +718,30 @@ FROM parquet.`{temp_volume_path}`"""
683
718
  .execute()
684
719
  )
685
720
 
686
- # Step 3: append the clean batch
687
- data.write.format("delta").mode("append").saveAsTable(location)
721
+ data.write.format("delta").mode("append").options(**spark_options).saveAsTable(location)
688
722
  else:
689
723
  update_cols = [c for c in data.columns if c not in match_by]
690
- set_expr = {
691
- c: F.expr(f"s.`{c}`") for c in update_cols
692
- }
724
+ set_expr = {c: F.expr(f"s.`{c}`") for c in update_cols}
693
725
 
694
- # Execute MERGE - update matching records first, then insert new ones
695
726
  (
696
727
  target.alias("t")
697
728
  .merge(data.alias("s"), cond)
698
- .whenMatchedUpdate(set=set_expr) # update matched rows
699
- .whenNotMatchedInsertAll() # insert new rows
729
+ .whenMatchedUpdate(set=set_expr)
730
+ .whenNotMatchedInsertAll()
700
731
  .execute()
701
732
  )
702
733
  else:
703
734
  if mode == "auto":
704
735
  mode = "append"
736
+ logger.info("Spark write saveAsTable mode=%s", mode)
705
737
  data.write.mode(mode).options(**spark_options).saveAsTable(location)
706
738
 
707
- # --- Optimize: Z-ORDER for faster lookups by composite key (Databricks) ---
708
739
  if optimize_after_merge and zorder_by:
709
- # pass columns as varargs
740
+ logger.info("Delta optimize + zorder (%s)", zorder_by)
710
741
  target.optimize().executeZOrderBy(*zorder_by)
711
742
 
712
- # --- Optional VACUUM ---
713
743
  if vacuum_hours is not None:
714
- # Beware data retention policies; set to a safe value or use default 7 days
744
+ logger.info("Delta vacuum retain=%s hours", vacuum_hours)
715
745
  target.vacuum(vacuum_hours)
716
746
 
717
747
  def get_table_schema(
@@ -719,24 +749,24 @@ FROM parquet.`{temp_volume_path}`"""
719
749
  catalog_name: Optional[str] = None,
720
750
  schema_name: Optional[str] = None,
721
751
  table_name: Optional[str] = None,
722
- to_arrow_schema: bool = True
752
+ to_arrow_schema: bool = True,
723
753
  ) -> Union[pa.Field, pa.Schema]:
724
- """Fetch a table schema from Unity Catalog as Arrow types.
754
+ """Fetch a table schema from Unity Catalog and convert it to Arrow types.
725
755
 
726
756
  Args:
727
757
  catalog_name: Optional catalog override.
728
758
  schema_name: Optional schema override.
729
759
  table_name: Optional table name override.
730
- to_arrow_schema: Whether to return an Arrow schema or field.
760
+ to_arrow_schema: If True returns pa.Schema; else returns a pa.Field(STRUCT<...>).
731
761
 
732
762
  Returns:
733
- Arrow Schema or Field representing the table.
763
+ Arrow Schema or a STRUCT Field representing the table.
734
764
  """
735
765
  full_name = self.table_full_name(
736
766
  catalog_name=catalog_name,
737
767
  schema_name=schema_name,
738
768
  table_name=table_name,
739
- safe_chars=False
769
+ safe_chars=False,
740
770
  )
741
771
 
742
772
  wk = self.workspace.sdk()
@@ -746,10 +776,7 @@ FROM parquet.`{temp_volume_path}`"""
746
776
  except Exception as e:
747
777
  raise ValueError(f"Table %s not found, {type(e)} {e}" % full_name)
748
778
 
749
- fields = [
750
- column_info_to_arrow_field(_)
751
- for _ in table.columns
752
- ]
779
+ fields = [column_info_to_arrow_field(_) for _ in table.columns]
753
780
 
754
781
  if to_arrow_schema:
755
782
  return pa.schema(fields, metadata={b"name": table_name})
@@ -762,25 +789,15 @@ FROM parquet.`{temp_volume_path}`"""
762
789
  schema_name: Optional[str] = None,
763
790
  table_name: Optional[str] = None,
764
791
  ):
765
- """Drop a table if it exists.
766
-
767
- Args:
768
- location: Fully qualified table name override.
769
- catalog_name: Optional catalog override.
770
- schema_name: Optional schema override.
771
- table_name: Optional table name override.
772
-
773
- Returns:
774
- The StatementResult from executing the drop statement.
775
- """
792
+ """Drop a table if it exists."""
776
793
  location, _, _, _ = self._check_location_params(
777
794
  location=location,
778
795
  catalog_name=catalog_name,
779
796
  schema_name=schema_name,
780
797
  table_name=table_name,
781
- safe_chars=True
798
+ safe_chars=True,
782
799
  )
783
-
800
+ logger.info("Dropping table if exists: %s", location)
784
801
  return self.execute(f"DROP TABLE IF EXISTS {location}")
785
802
 
786
803
  def create_table(
@@ -797,23 +814,27 @@ FROM parquet.`{temp_volume_path}`"""
797
814
  if_not_exists: bool = True,
798
815
  optimize_write: bool = True,
799
816
  auto_compact: bool = True,
800
- execute: bool = True
801
- ) -> str:
802
- """
803
- Generate DDL (Data Definition Language) SQL for creating a table from a PyField schema.
817
+ execute: bool = True,
818
+ ) -> Union[str, "StatementResult"]:
819
+ """Generate (and optionally execute) CREATE TABLE DDL from an Arrow schema/field.
804
820
 
805
821
  Args:
806
- field: PyField schema that defines the table structure
807
- table_name: Name of the table to create (defaults to schema.name)
808
- catalog_name: Optional catalog name (defaults to "hive_metastore")
809
- schema_name: Optional schema name (defaults to "default")
810
- partition_by: Optional list of column names to partition the table by
811
- comment: Optional table comment
812
- options: Optional table properties
813
- if_not_exists: Whether to add IF NOT EXISTS clause
822
+ field: Arrow Field or Schema describing the table. If `field` is a schema, it's converted.
823
+ location: Fully qualified table name override.
824
+ table_name: Table name override (used if location not provided).
825
+ catalog_name: Catalog override.
826
+ schema_name: Schema override.
827
+ partition_by: Optional partition columns.
828
+ cluster_by: If True -> CLUSTER BY AUTO. If list[str] -> CLUSTER BY (..). If False -> no clustering.
829
+ comment: Optional table comment (falls back to field metadata b"comment" when present).
830
+ options: Extra table properties.
831
+ if_not_exists: Add IF NOT EXISTS clause.
832
+ optimize_write: Sets delta.autoOptimize.optimizeWrite table property.
833
+ auto_compact: Sets delta.autoOptimize.autoCompact table property.
834
+ execute: If True, executes DDL and returns StatementResult; otherwise returns SQL string.
814
835
 
815
836
  Returns:
816
- A SQL string for creating the table
837
+ StatementResult if execute=True, else the DDL SQL string.
817
838
  """
818
839
  if not isinstance(field, pa.Field):
819
840
  field = convert(field, pa.Field)
@@ -823,7 +844,7 @@ FROM parquet.`{temp_volume_path}`"""
823
844
  catalog_name=catalog_name,
824
845
  schema_name=schema_name,
825
846
  table_name=table_name,
826
- safe_chars=True
847
+ safe_chars=True,
827
848
  )
828
849
 
829
850
  if pa.types.is_struct(field.type):
@@ -831,28 +852,22 @@ FROM parquet.`{temp_volume_path}`"""
831
852
  else:
832
853
  children = [field]
833
854
 
834
- # Create the DDL statement
835
- column_definitions = [
836
- self._field_to_ddl(child)
837
- for child in children
838
- ]
855
+ column_definitions = [self._field_to_ddl(child) for child in children]
839
856
 
840
857
  sql = [
841
858
  f"CREATE TABLE {'IF NOT EXISTS ' if if_not_exists else ''}{location} (",
842
859
  ",\n ".join(column_definitions),
843
- ")"
860
+ ")",
844
861
  ]
845
862
 
846
- # Add partition by clause if provided
847
- if partition_by and len(partition_by) > 0:
863
+ if partition_by:
848
864
  sql.append(f"\nPARTITIONED BY ({', '.join(partition_by)})")
849
865
  elif cluster_by:
850
866
  if isinstance(cluster_by, bool):
851
- sql.append(f"\nCLUSTER BY AUTO")
867
+ sql.append("\nCLUSTER BY AUTO")
852
868
  else:
853
869
  sql.append(f"\nCLUSTER BY ({', '.join(cluster_by)})")
854
870
 
855
- # Add comment if provided
856
871
  if not comment and field.metadata:
857
872
  comment = field.metadata.get(b"comment")
858
873
 
@@ -862,30 +877,29 @@ FROM parquet.`{temp_volume_path}`"""
862
877
  if comment:
863
878
  sql.append(f"\nCOMMENT '{comment}'")
864
879
 
865
- # Add options if provided
866
880
  options = {} if options is None else options
867
881
  options.update({
868
882
  "delta.autoOptimize.optimizeWrite": optimize_write,
869
- "delta.autoOptimize.autoCompact": auto_compact
883
+ "delta.autoOptimize.autoCompact": auto_compact,
870
884
  })
871
885
 
872
886
  option_strs = []
873
-
874
- if options:
875
- for key, value in options.items():
876
- if isinstance(value, str):
877
- option_strs.append(f"'{key}' = '{value}'")
878
- elif isinstance(value, bool):
879
- b_value = "true" if value else "false"
880
- option_strs.append(f"'{key}' = '{b_value}'")
881
- else:
882
- option_strs.append(f"'{key}' = {value}")
887
+ for key, value in (options or {}).items():
888
+ if isinstance(value, str):
889
+ option_strs.append(f"'{key}' = '{value}'")
890
+ elif isinstance(value, bool):
891
+ option_strs.append(f"'{key}' = '{'true' if value else 'false'}'")
892
+ else:
893
+ option_strs.append(f"'{key}' = {value}")
883
894
 
884
895
  if option_strs:
885
896
  sql.append(f"\nTBLPROPERTIES ({', '.join(option_strs)})")
886
897
 
887
898
  statement = "\n".join(sql)
888
899
 
900
+ logger.info("Generated CREATE TABLE DDL for %s", location)
901
+ logger.debug("DDL:\n%s", statement)
902
+
889
903
  if execute:
890
904
  return self.execute(statement)
891
905
  return statement
@@ -896,28 +910,18 @@ FROM parquet.`{temp_volume_path}`"""
896
910
  catalog_name: Optional[str] = None,
897
911
  schema_name: Optional[str] = None,
898
912
  table_name: Optional[str] = None,
899
- safe_chars: bool = True
900
- ):
901
- """Resolve location/catalog/schema/table parameters to a full name.
902
-
903
- Args:
904
- location: Fully qualified table name override.
905
- catalog_name: Optional catalog override.
906
- schema_name: Optional schema override.
907
- table_name: Optional table name override.
908
- safe_chars: Whether to wrap identifiers in backticks.
909
-
910
- Returns:
911
- A tuple of (location, catalog_name, schema_name, table_name).
912
- """
913
+ safe_chars: bool = True,
914
+ ) -> tuple[str, Optional[str], Optional[str], Optional[str]]:
915
+ """Resolve (location OR catalog/schema/table) into a fully-qualified name."""
913
916
  if location:
914
917
  c, s, t = self._catalog_schema_table_names(location)
915
918
  catalog_name, schema_name, table_name = catalog_name or c, schema_name or s, table_name or t
916
919
 
917
920
  location = self.table_full_name(
918
- catalog_name=catalog_name, schema_name=schema_name,
921
+ catalog_name=catalog_name,
922
+ schema_name=schema_name,
919
923
  table_name=table_name,
920
- safe_chars=safe_chars
924
+ safe_chars=safe_chars,
921
925
  )
922
926
 
923
927
  return location, catalog_name or self.catalog_name, schema_name or self.schema_name, table_name
@@ -927,98 +931,68 @@ FROM parquet.`{temp_volume_path}`"""
927
931
  field: pa.Field,
928
932
  put_name: bool = True,
929
933
  put_not_null: bool = True,
930
- put_comment: bool = True
934
+ put_comment: bool = True,
931
935
  ) -> str:
932
- """
933
- Convert a PyField to a DDL column definition.
934
-
935
- Args:
936
- field: The PyField to convert
937
-
938
- Returns:
939
- A string containing the column definition in DDL format
940
- """
936
+ """Convert an Arrow Field to a Databricks SQL column DDL fragment."""
941
937
  name = field.name
942
938
  nullable_str = " NOT NULL" if put_not_null and not field.nullable else ""
943
939
  name_str = f"{name} " if put_name else ""
944
940
 
945
- # Get comment if available
946
941
  comment_str = ""
947
942
  if put_comment and field.metadata and b"comment" in field.metadata:
948
943
  comment = field.metadata[b"comment"].decode("utf-8")
949
944
  comment_str = f" COMMENT '{comment}'"
950
945
 
951
- # Handle primitive types
952
946
  if not pa.types.is_nested(field.type):
953
947
  sql_type = SQLEngine._arrow_to_sql_type(field.type)
954
948
  return f"{name_str}{sql_type}{nullable_str}{comment_str}"
955
949
 
956
- # Handle struct type
957
950
  if pa.types.is_struct(field.type):
958
951
  child_defs = [SQLEngine._field_to_ddl(child) for child in field.type]
959
952
  struct_body = ", ".join(child_defs)
960
953
  return f"{name_str}STRUCT<{struct_body}>{nullable_str}{comment_str}"
961
954
 
962
- # Handle map type
963
955
  if pa.types.is_map(field.type):
964
956
  map_type: pa.MapType = field.type
965
957
  key_type = SQLEngine._field_to_ddl(map_type.key_field, put_name=False, put_comment=False, put_not_null=False)
966
958
  val_type = SQLEngine._field_to_ddl(map_type.item_field, put_name=False, put_comment=False, put_not_null=False)
967
959
  return f"{name_str}MAP<{key_type}, {val_type}>{nullable_str}{comment_str}"
968
960
 
969
- # Handle list type after map
970
961
  if pa.types.is_list(field.type) or pa.types.is_large_list(field.type):
971
962
  list_type: pa.ListType = field.type
972
963
  elem_type = SQLEngine._field_to_ddl(list_type.value_field, put_name=False, put_comment=False, put_not_null=False)
973
964
  return f"{name_str}ARRAY<{elem_type}>{nullable_str}{comment_str}"
974
965
 
975
- # Default fallback to string for unknown types
976
966
  raise TypeError(f"Cannot make ddl field from {field}")
977
967
 
978
968
  @staticmethod
979
- def _arrow_to_sql_type(
980
- arrow_type: Union[pa.DataType, pa.Decimal128Type]
981
- ) -> str:
982
- """
983
- Convert an Arrow data type to SQL data type.
984
-
985
- Args:
986
- arrow_type: The Arrow data type
987
-
988
- Returns:
989
- A string containing the SQL data type
990
- """
969
+ def _arrow_to_sql_type(arrow_type: Union[pa.DataType, pa.Decimal128Type]) -> str:
970
+ """Convert an Arrow data type to a Databricks SQL type string."""
991
971
  if pa.types.is_boolean(arrow_type):
992
972
  return "BOOLEAN"
993
- elif pa.types.is_int8(arrow_type):
973
+ if pa.types.is_int8(arrow_type):
994
974
  return "TINYINT"
995
- elif pa.types.is_int16(arrow_type):
975
+ if pa.types.is_int16(arrow_type):
996
976
  return "SMALLINT"
997
- elif pa.types.is_int32(arrow_type):
977
+ if pa.types.is_int32(arrow_type):
998
978
  return "INT"
999
- elif pa.types.is_int64(arrow_type):
979
+ if pa.types.is_int64(arrow_type):
1000
980
  return "BIGINT"
1001
- elif pa.types.is_float32(arrow_type):
981
+ if pa.types.is_float32(arrow_type):
1002
982
  return "FLOAT"
1003
- elif pa.types.is_float64(arrow_type):
983
+ if pa.types.is_float64(arrow_type):
1004
984
  return "DOUBLE"
1005
- elif is_arrow_type_string_like(arrow_type):
985
+ if is_arrow_type_string_like(arrow_type):
1006
986
  return "STRING"
1007
- elif is_arrow_type_binary_like(arrow_type):
987
+ if is_arrow_type_binary_like(arrow_type):
1008
988
  return "BINARY"
1009
- elif pa.types.is_timestamp(arrow_type):
989
+ if pa.types.is_timestamp(arrow_type):
1010
990
  tz = getattr(arrow_type, "tz", None)
1011
-
1012
- if tz:
1013
- return "TIMESTAMP"
1014
- return "TIMESTAMP_NTZ"
1015
- elif pa.types.is_date(arrow_type):
991
+ return "TIMESTAMP" if tz else "TIMESTAMP_NTZ"
992
+ if pa.types.is_date(arrow_type):
1016
993
  return "DATE"
1017
- elif pa.types.is_decimal(arrow_type):
1018
- precision = arrow_type.precision
1019
- scale = arrow_type.scale
1020
- return f"DECIMAL({precision}, {scale})"
1021
- elif pa.types.is_null(arrow_type):
994
+ if pa.types.is_decimal(arrow_type):
995
+ return f"DECIMAL({arrow_type.precision}, {arrow_type.scale})"
996
+ if pa.types.is_null(arrow_type):
1022
997
  return "STRING"
1023
- else:
1024
- raise ValueError(f"Cannot make ddl type for {arrow_type}")
998
+ raise ValueError(f"Cannot make ddl type for {arrow_type}")
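A quick standalone check of the Arrow-to-SQL mapping above, trimmed to a few representative branches (the real helper covers more types):

```python
import pyarrow as pa

def arrow_to_sql(t: pa.DataType) -> str:
    # Subset of the mapping implemented by _arrow_to_sql_type.
    if pa.types.is_int32(t):
        return "INT"
    if pa.types.is_int64(t):
        return "BIGINT"
    if pa.types.is_string(t) or pa.types.is_large_string(t):
        return "STRING"
    if pa.types.is_timestamp(t):
        return "TIMESTAMP" if t.tz else "TIMESTAMP_NTZ"
    if pa.types.is_decimal(t):
        return f"DECIMAL({t.precision}, {t.scale})"
    raise ValueError(f"Cannot make ddl type for {t}")

assert arrow_to_sql(pa.int64()) == "BIGINT"
assert arrow_to_sql(pa.timestamp("us", tz="UTC")) == "TIMESTAMP"
assert arrow_to_sql(pa.timestamp("us")) == "TIMESTAMP_NTZ"
assert arrow_to_sql(pa.decimal128(10, 2)) == "DECIMAL(10, 2)"
```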