ygg 0.1.33__py3-none-any.whl → 0.1.35__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,15 @@
1
- """Databricks SQL engine utilities and helpers."""
1
+ """Databricks SQL engine utilities and helpers.
2
+
3
+ This module provides a thin “do the right thing” layer over:
4
+ - Databricks SQL Statement Execution API (warehouse)
5
+ - Spark SQL / Delta Lake (when running inside a Spark-enabled context)
6
+
7
+ It includes helpers to:
8
+ - Build fully-qualified table names
9
+ - Execute SQL via Spark or Databricks SQL API
10
+ - Insert Arrow/Spark data into Delta tables (append/overwrite/merge)
11
+ - Generate DDL from Arrow schemas
12
+ """
2
13
 
3
14
  import dataclasses
4
15
  import logging
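As a quick orientation for the helpers listed in the new docstring, a hedged usage sketch. The import path and constructor arguments are hypothetical (they are not shown in this diff); only the method names (`insert_into`, `execute`, `table_full_name`) come from this module.

import pyarrow as pa

# Hypothetical import path and constructor -- adjust to the real package layout:
# from ygg... import SQLEngine
# engine = SQLEngine(workspace=..., catalog_name="main", schema_name="default")

def load_events(engine, rows: list[dict]) -> None:
    """Append a small Arrow batch, then count rows in the destination table."""
    table = pa.Table.from_pylist(rows)
    engine.insert_into(table, table_name="events", mode="append")
    full_name = engine.table_full_name(table_name="events")
    result = engine.execute(f"SELECT count(*) AS n FROM {full_name}")
    # `result` is a StatementResult; how rows are materialised from it is not shown in this hunk.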
@@ -8,7 +19,6 @@ import time
8
19
  from typing import Optional, Union, Any, Dict, List, Literal
9
20
 
10
21
  import pyarrow as pa
11
- import pyarrow.parquet as pq
12
22
 
13
23
  from .statement_result import StatementResult
14
24
  from .types import column_info_to_arrow_field
@@ -28,7 +38,6 @@ except ImportError:
28
38
  @classmethod
29
39
  def forName(cls, *args, **kwargs):
30
40
  from delta.tables import DeltaTable
31
-
32
41
  return DeltaTable.forName(*args, **kwargs)
33
42
 
34
43
 
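The stub above defers the `delta.tables` import until `forName` is actually called, so the module can be imported on machines without delta-spark. A minimal standalone sketch of the same lazy-import proxy pattern; the explicit error message is a variation (the code above simply lets the ImportError surface).

class SparkDeltaTableStub:
    """Import-light stand-in: resolves delta.tables.DeltaTable only when first used."""

    @classmethod
    def forName(cls, *args, **kwargs):
        try:
            from delta.tables import DeltaTable  # only available on a Spark/Delta runtime
        except ImportError as exc:
            raise RuntimeError("delta-spark is required to resolve Delta tables") from exc
        return DeltaTable.forName(*args, **kwargs)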
@@ -37,23 +46,18 @@ if databricks_sdk is not None:
37
46
  StatementResponse, Disposition, Format,
38
47
  ExecuteStatementRequestOnWaitTimeout, StatementParameterListItem
39
48
  )
40
-
41
49
  StatementResponse = StatementResponse
42
50
  else:
43
- class StatementResponse:
51
+ class StatementResponse: # pragma: no cover
44
52
  pass
45
53
 
46
54
 
47
55
  logger = logging.getLogger(__name__)
48
56
 
49
-
50
57
  if pyspark is not None:
51
58
  import pyspark.sql.functions as F
52
59
 
53
- __all__ = [
54
- "SQLEngine",
55
- "StatementResult"
56
- ]
60
+ __all__ = ["SQLEngine", "StatementResult"]
57
61
 
58
62
 
59
63
  class SqlExecutionError(RuntimeError):
@@ -62,7 +66,7 @@ class SqlExecutionError(RuntimeError):
62
66
 
63
67
  @dataclasses.dataclass
64
68
  class SQLEngine(WorkspaceService):
65
- """Execute SQL statements and manage tables via Databricks."""
69
+ """Execute SQL statements and manage tables via Databricks SQL / Spark."""
66
70
  warehouse_id: Optional[str] = None
67
71
  catalog_name: Optional[str] = None
68
72
  schema_name: Optional[str] = None
@@ -72,18 +76,18 @@ class SQLEngine(WorkspaceService):
72
76
  catalog_name: Optional[str] = None,
73
77
  schema_name: Optional[str] = None,
74
78
  table_name: Optional[str] = None,
75
- safe_chars: bool = True
76
- ):
77
- """Build a fully qualified table name for the current catalog/schema.
79
+ safe_chars: bool = True,
80
+ ) -> str:
81
+ """Build a fully qualified table name (catalog.schema.table).
78
82
 
79
83
  Args:
80
- catalog_name: Optional catalog override.
81
- schema_name: Optional schema override.
84
+ catalog_name: Optional catalog override (defaults to engine.catalog_name).
85
+ schema_name: Optional schema override (defaults to engine.schema_name).
82
86
  table_name: Table name to qualify.
83
- safe_chars: Whether to wrap identifiers in backticks.
87
+ safe_chars: Whether to wrap each identifier in backticks.
84
88
 
85
89
  Returns:
86
- The fully qualified table name.
90
+ Fully qualified table name string.
87
91
  """
88
92
  catalog_name = catalog_name or self.catalog_name
89
93
  schema_name = schema_name or self.schema_name
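A standalone sketch of the quoting behaviour documented above: with `safe_chars=True` each identifier is wrapped in backticks, otherwise the parts are joined as-is. The default catalog and schema here are illustrative stand-ins for the engine's configured values.

from typing import Optional

def full_table_name(
    table_name: str,
    catalog_name: Optional[str] = "main",     # stand-in for engine.catalog_name
    schema_name: Optional[str] = "default",   # stand-in for engine.schema_name
    safe_chars: bool = True,
) -> str:
    """Join catalog.schema.table, optionally backtick-quoting each part."""
    if safe_chars:
        return f"`{catalog_name}`.`{schema_name}`.`{table_name}`"
    return f"{catalog_name}.{schema_name}.{table_name}"

print(full_table_name("events"))                    # `main`.`default`.`events`
print(full_table_name("events", safe_chars=False))  # main.default.events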
@@ -96,21 +100,23 @@ class SQLEngine(WorkspaceService):
96
100
  return f"`{catalog_name}`.`{schema_name}`.`{table_name}`"
97
101
  return f"{catalog_name}.{schema_name}.{table_name}"
98
102
 
99
- def _catalog_schema_table_names(
100
- self,
101
- full_name: str,
102
- ):
103
+ def _catalog_schema_table_names(self, full_name: str) -> tuple[Optional[str], Optional[str], Optional[str]]:
103
104
  """Parse a catalog.schema.table string into components.
104
105
 
106
+ Supports partial names:
107
+ - table
108
+ - schema.table
109
+ - catalog.schema.table
110
+
111
+ Backticks are stripped.
112
+
105
113
  Args:
106
- full_name: A fully qualified name or partial name.
114
+ full_name: Fully qualified or partial table name.
107
115
 
108
116
  Returns:
109
- A tuple of (catalog_name, schema_name, table_name).
117
+ Tuple of (catalog_name, schema_name, table_name).
110
118
  """
111
- parts = [
112
- _.strip("`") for _ in full_name.split(".")
113
- ]
119
+ parts = [_.strip("`") for _ in full_name.split(".")]
114
120
 
115
121
  if len(parts) == 0:
116
122
  return self.catalog_name, self.schema_name, None
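A standalone sketch of the parsing rules listed in the docstring above: backticks are stripped and missing parts fall back to the engine defaults (the defaults below are illustrative).

from typing import Optional, Tuple

DEFAULT_CATALOG = "main"     # stand-in for engine.catalog_name
DEFAULT_SCHEMA = "default"   # stand-in for engine.schema_name

def split_full_name(full_name: str) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """Split 'catalog.schema.table', 'schema.table' or 'table' into components."""
    parts = [p.strip("`") for p in full_name.split(".")]
    if len(parts) == 1:
        return DEFAULT_CATALOG, DEFAULT_SCHEMA, parts[0]
    if len(parts) == 2:
        return DEFAULT_CATALOG, parts[0], parts[1]
    return parts[-3], parts[-2], parts[-1]

print(split_full_name("`main`.`sales`.`orders`"))  # ('main', 'sales', 'orders')
print(split_full_name("sales.orders"))             # ('main', 'sales', 'orders')
print(split_full_name("orders"))                   # ('main', 'default', 'orders')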
@@ -122,20 +128,20 @@ class SQLEngine(WorkspaceService):
122
128
  catalog_name, schema_name, table_name = parts[-3], parts[-2], parts[-1]
123
129
  catalog_name = catalog_name or self.catalog_name
124
130
  schema_name = schema_name or self.schema_name
125
-
126
131
  return catalog_name, schema_name, table_name
127
132
 
128
- def _default_warehouse(
129
- self,
130
- cluster_size: str = "Small"
131
- ):
132
- """Return a default SQL warehouse matching the desired size.
133
+ def _default_warehouse(self, cluster_size: str = "Small"):
134
+ """Pick a default SQL warehouse (best-effort) matching the desired size.
133
135
 
134
136
  Args:
135
- cluster_size: Desired warehouse size filter.
137
+ cluster_size: Desired warehouse size (Databricks "cluster_size"), e.g. "Small".
138
+ If empty/None, returns the first warehouse encountered.
136
139
 
137
140
  Returns:
138
- The matched warehouse object.
141
+ Warehouse object.
142
+
143
+ Raises:
144
+ ValueError: If no warehouses exist in the workspace.
139
145
  """
140
146
  wk = self.workspace.sdk()
141
147
  existing = list(wk.warehouses.list())
@@ -146,48 +152,54 @@ class SQLEngine(WorkspaceService):
146
152
  first = warehouse
147
153
 
148
154
  if cluster_size:
149
- if warehouse.cluster_size == cluster_size:
155
+ if getattr(warehouse, "cluster_size", None) == cluster_size:
156
+ logger.debug("Default warehouse match found: id=%s cluster_size=%s", warehouse.id, warehouse.cluster_size)
150
157
  return warehouse
151
158
  else:
159
+ logger.debug("Default warehouse selected (first): id=%s", warehouse.id)
152
160
  return warehouse
153
161
 
154
162
  if first is not None:
163
+ logger.info(
164
+ "No warehouse matched cluster_size=%s; falling back to first warehouse id=%s cluster_size=%s",
165
+ cluster_size,
166
+ getattr(first, "id", None),
167
+ getattr(first, "cluster_size", None),
168
+ )
155
169
  return first
156
170
 
157
171
  raise ValueError(f"No default warehouse found in {wk.config.host}")
158
172
 
159
- def _get_or_default_warehouse_id(
160
- self,
161
- cluster_size = "Small"
162
- ):
163
- """Return the configured warehouse id or a default one.
173
+ def _get_or_default_warehouse_id(self, cluster_size: str = "Small") -> str:
174
+ """Return configured warehouse_id or resolve a default one.
164
175
 
165
176
  Args:
166
- cluster_size: Desired warehouse size filter.
177
+ cluster_size: Desired warehouse size filter used when resolving defaults.
167
178
 
168
179
  Returns:
169
- The warehouse id string.
180
+ Warehouse id string.
170
181
  """
171
182
  if not self.warehouse_id:
172
183
  dft = self._default_warehouse(cluster_size=cluster_size)
173
-
174
184
  self.warehouse_id = dft.id
185
+ logger.info("Resolved default warehouse_id=%s (cluster_size=%s)", self.warehouse_id, cluster_size)
186
+
175
187
  return self.warehouse_id
176
188
 
177
189
  @staticmethod
178
190
  def _random_suffix(prefix: str = "") -> str:
179
- """Generate a unique suffix for temporary resources.
180
-
181
- Args:
182
- prefix: Optional prefix to prepend.
183
-
184
- Returns:
185
- A unique suffix string.
186
- """
187
- unique = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(8))
191
+ """Generate a unique suffix for temporary resources."""
192
+ unique = "".join(random.choice(string.ascii_lowercase + string.digits) for _ in range(8))
188
193
  timestamp = int(time.time() * 1000)
189
194
  return f"{prefix}{timestamp}_{unique}"
190
195
 
196
+ @staticmethod
197
+ def _sql_preview(sql: str, limit: int = 220) -> str:
198
+ """Short, single-line preview for logs (avoids spewing giant SQL)."""
199
+ if not sql:
200
+ return ""
201
+ return sql[:limit] + ("…" if len(sql) > limit else "")
202
+
191
203
  def execute(
192
204
  self,
193
205
  statement: Optional[str] = None,
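The two small helpers added above are straightforward to reproduce standalone; this sketch mirrors their behaviour (millisecond timestamp plus eight random characters for temp resource names, and a single-line truncated preview for log messages).

import random
import string
import time

def random_suffix(prefix: str = "") -> str:
    """Unique-ish name for temp resources: <prefix><millis>_<8 random chars>."""
    unique = "".join(random.choice(string.ascii_lowercase + string.digits) for _ in range(8))
    return f"{prefix}{int(time.time() * 1000)}_{unique}"

def sql_preview(sql: str, limit: int = 220) -> str:
    """Truncate long SQL to a short, single-line preview for logs."""
    if not sql:
        return ""
    return sql[:limit] + ("…" if len(sql) > limit else "")

print(random_suffix("tmp_"))                          # e.g. tmp_1718000000000_k3v9x0ab
print(sql_preview("SELECT " + "x, " * 200, limit=40))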
@@ -205,56 +217,67 @@ class SQLEngine(WorkspaceService):
205
217
  schema_name: Optional[str] = None,
206
218
  table_name: Optional[str] = None,
207
219
  wait_result: bool = True,
208
- **kwargs,
209
220
  ) -> "StatementResult":
210
- """
211
- Execute a SQL statement on a SQL warehouse.
221
+ """Execute a SQL statement via Spark or Databricks SQL Statement Execution API.
212
222
 
213
- - If wait=True (default): poll until terminal state.
214
- - On SUCCEEDED: return final statement object
215
- - On FAILED / CANCELED: raise SqlExecutionError
216
- - If wait=False: return initial execution handle without polling.
223
+ Engine resolution:
224
+ - If `engine` is not provided and a Spark session is active -> uses Spark.
225
+ - Otherwise uses Databricks SQL API (warehouse).
226
+
227
+ Waiting behavior (`wait_result`):
228
+ - If True (default): returns a StatementResult in terminal state (SUCCEEDED/FAILED/CANCELED).
229
+ - If False: returns immediately with the initial handle (caller can `.wait()` later).
217
230
 
218
231
  Args:
219
- statement: SQL statement to execute. If omitted, selects from the table.
220
- engine: Execution engine ("spark" or "api").
221
- warehouse_id: Optional warehouse id override.
232
+ statement: SQL statement to execute. If None, a `SELECT *` is generated from the table params.
233
+ engine: "spark" or "api".
234
+ warehouse_id: Warehouse override (for API engine).
222
235
  byte_limit: Optional byte limit for results.
223
- disposition: Result disposition mode.
224
- format: Result format for Databricks SQL API.
225
- on_wait_timeout: Timeout behavior for waiting.
226
- parameters: Optional statement parameters.
227
- row_limit: Optional row limit.
228
- wait_timeout: Optional API wait timeout.
229
- catalog_name: Optional catalog override.
230
- schema_name: Optional schema override.
231
- table_name: Optional table name override.
232
- wait_result: Whether to block until completion.
233
- **kwargs: Additional API parameters.
236
+ disposition: Result disposition mode (API engine).
237
+ format: Result format (API engine).
238
+ on_wait_timeout: Timeout behavior for waiting (API engine).
239
+ parameters: Optional statement parameters (API engine).
240
+ row_limit: Optional row limit for results (API engine).
241
+ wait_timeout: API wait timeout value.
242
+ catalog_name: Optional catalog override for API engine.
243
+ schema_name: Optional schema override for API engine.
244
+ table_name: Optional table override used when `statement` is None.
245
+ wait_result: Whether to block until completion (API engine).
234
246
 
235
247
  Returns:
236
- A StatementResult wrapper for the execution.
248
+ StatementResult.
237
249
  """
250
+ # --- Engine auto-detection ---
238
251
  if not engine:
239
252
  if pyspark is not None:
240
253
  spark_session = SparkSession.getActiveSession()
241
-
242
254
  if spark_session is not None:
243
255
  engine = "spark"
244
256
 
257
+ # --- Spark path ---
245
258
  if engine == "spark":
246
259
  spark_session = SparkSession.getActiveSession()
247
-
248
260
  if spark_session is None:
249
261
  raise ValueError("No spark session found to run sql query")
250
262
 
263
+ df: SparkDataFrame = spark_session.sql(statement)
264
+
265
+ if row_limit:
266
+ df = df.limit(row_limit)
267
+
268
+ logger.info("Spark SQL executed: %s", self._sql_preview(statement))
269
+
270
+ # Avoid Disposition dependency if SDK imports are absent
271
+ spark_disp = disposition if disposition is not None else getattr(globals().get("Disposition", object), "EXTERNAL_LINKS", None)
272
+
251
273
  return StatementResult(
252
274
  engine=self,
253
275
  statement_id="sparksql",
254
- disposition=Disposition.EXTERNAL_LINKS,
255
- _spark_df=spark_session.sql(statement)
276
+ disposition=spark_disp,
277
+ _spark_df=df,
256
278
  )
257
279
 
280
+ # --- API path defaults ---
258
281
  if format is None:
259
282
  format = Format.ARROW_STREAM
260
283
 
@@ -280,7 +303,6 @@ class SQLEngine(WorkspaceService):
280
303
  wait_timeout=wait_timeout,
281
304
  catalog=catalog_name or self.catalog_name,
282
305
  schema=schema_name or self.schema_name,
283
- **kwargs,
284
306
  )
285
307
 
286
308
  execution = StatementResult(
@@ -288,10 +310,15 @@ class SQLEngine(WorkspaceService):
288
310
  statement_id=response.statement_id,
289
311
  _response=response,
290
312
  _response_refresh_time=time.time(),
291
- disposition=disposition
313
+ disposition=disposition,
314
+ )
315
+
316
+ logger.info(
317
+ "API SQL executed: %s",
318
+ self._sql_preview(statement)
292
319
  )
293
320
 
294
- return execution.wait() if wait_result else wait_result
321
+ return execution.wait() if wait_result else execution
295
322
 
296
323
  def spark_table(
297
324
  self,
@@ -300,35 +327,21 @@ class SQLEngine(WorkspaceService):
300
327
  schema_name: Optional[str] = None,
301
328
  table_name: Optional[str] = None,
302
329
  ):
303
- """Return a DeltaTable handle for a given table name.
304
-
305
- Args:
306
- full_name: Fully qualified table name.
307
- catalog_name: Optional catalog override.
308
- schema_name: Optional schema override.
309
- table_name: Optional table name override.
310
-
311
- Returns:
312
- A Spark DeltaTable handle.
313
- """
330
+ """Return a DeltaTable handle for a given table name (Spark context required)."""
314
331
  if not full_name:
315
332
  full_name = self.table_full_name(
316
333
  catalog_name=catalog_name,
317
334
  schema_name=schema_name,
318
- table_name=table_name
335
+ table_name=table_name,
319
336
  )
320
-
321
337
  return SparkDeltaTable.forName(
322
338
  sparkSession=SparkSession.getActiveSession(),
323
- tableOrViewName=full_name
339
+ tableOrViewName=full_name,
324
340
  )
325
341
 
326
342
  def insert_into(
327
343
  self,
328
- data: Union[
329
- pa.Table, pa.RecordBatch, pa.RecordBatchReader,
330
- SparkDataFrame
331
- ],
344
+ data: Union[pa.Table, pa.RecordBatch, pa.RecordBatchReader, SparkDataFrame],
332
345
  location: Optional[str] = None,
333
346
  catalog_name: Optional[str] = None,
334
347
  schema_name: Optional[str] = None,
@@ -336,14 +349,18 @@ class SQLEngine(WorkspaceService):
336
349
  mode: str = "auto",
337
350
  cast_options: Optional[CastOptions] = None,
338
351
  overwrite_schema: bool | None = None,
339
- match_by: list[str] = None,
340
- zorder_by: list[str] = None,
352
+ match_by: Optional[list[str]] = None,
353
+ zorder_by: Optional[list[str]] = None,
341
354
  optimize_after_merge: bool = False,
342
- vacuum_hours: int | None = None, # e.g., 168 for 7 days
355
+ vacuum_hours: int | None = None,
343
356
  spark_session: Optional[SparkSession] = None,
344
- spark_options: Optional[Dict[str, Any]] = None
357
+ spark_options: Optional[Dict[str, Any]] = None,
345
358
  ):
346
- """Insert data into a table using Spark or Arrow paths.
359
+ """Insert data into a Delta table using Spark when available; otherwise stage Arrow.
360
+
361
+ Strategy:
362
+ - If Spark is available and we have an active session (or Spark DF input) -> use `spark_insert_into`.
363
+ - Otherwise -> use `arrow_insert_into` (stages Parquet to a temp volume + runs SQL INSERT/MERGE).
347
364
 
348
365
  Args:
349
366
  data: Arrow or Spark data to insert.
@@ -353,18 +370,18 @@ class SQLEngine(WorkspaceService):
353
370
  table_name: Optional table name override.
354
371
  mode: Insert mode ("auto", "append", "overwrite").
355
372
  cast_options: Optional casting options.
356
- overwrite_schema: Whether to overwrite schema (Spark).
357
- match_by: Optional merge keys for upserts.
358
- zorder_by: Optional Z-ORDER columns.
359
- optimize_after_merge: Whether to run OPTIMIZE after merge.
373
+ overwrite_schema: Whether to overwrite schema (Spark path).
374
+ match_by: Merge keys for upserts (MERGE semantics). When set, mode affects behavior.
375
+ zorder_by: Z-ORDER columns (SQL path uses OPTIMIZE ZORDER; Spark path uses Delta optimize API).
376
+ optimize_after_merge: Whether to run OPTIMIZE after a merge (SQL path) / after merge+zorder (Spark path).
360
377
  vacuum_hours: Optional VACUUM retention window.
361
378
  spark_session: Optional SparkSession override.
362
379
  spark_options: Optional Spark write options.
363
380
 
364
381
  Returns:
365
- None for Arrow inserts, or the Spark insert result.
382
+ None (mutates the destination table).
366
383
  """
367
- # -------- existing logic you provided (kept intact) ----------
384
+
368
385
  if pyspark is not None:
369
386
  spark_session = SparkSession.getActiveSession() if spark_session is None else spark_session
370
387
 
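A simplified sketch of the dispatch rule documented above: take the Spark path when pyspark is importable and either the input is already a Spark DataFrame or an active session exists; otherwise stage Arrow data and go through SQL.

try:
    import pyspark
    from pyspark.sql import DataFrame as SparkDataFrame, SparkSession
except ImportError:
    pyspark = None
    SparkDataFrame = None
    SparkSession = None

def choose_insert_path(data) -> str:
    """Return 'spark' or 'arrow' for an insert_into-style dispatch."""
    if pyspark is not None:
        if SparkDataFrame is not None and isinstance(data, SparkDataFrame):
            return "spark"
        if SparkSession.getActiveSession() is not None:
            return "spark"
    return "arrow"

print(choose_insert_path([{"id": 1}]))  # 'arrow' outside a Spark runtime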
@@ -382,7 +399,7 @@ class SQLEngine(WorkspaceService):
382
399
  zorder_by=zorder_by,
383
400
  optimize_after_merge=optimize_after_merge,
384
401
  vacuum_hours=vacuum_hours,
385
- spark_options=spark_options
402
+ spark_options=spark_options,
386
403
  )
387
404
 
388
405
  return self.arrow_insert_into(
@@ -402,9 +419,7 @@ class SQLEngine(WorkspaceService):
402
419
 
403
420
  def arrow_insert_into(
404
421
  self,
405
- data: Union[
406
- pa.Table, pa.RecordBatch, pa.RecordBatchReader,
407
- ],
422
+ data: Union[pa.Table, pa.RecordBatch, pa.RecordBatchReader],
408
423
  location: Optional[str] = None,
409
424
  catalog_name: Optional[str] = None,
410
425
  schema_name: Optional[str] = None,
@@ -412,14 +427,19 @@ class SQLEngine(WorkspaceService):
412
427
  mode: str = "auto",
413
428
  cast_options: Optional[CastOptions] = None,
414
429
  overwrite_schema: bool | None = None,
415
- match_by: list[str] = None,
416
- zorder_by: list[str] = None,
430
+ match_by: Optional[list[str]] = None,
431
+ zorder_by: Optional[list[str]] = None,
417
432
  optimize_after_merge: bool = False,
418
- vacuum_hours: int | None = None, # e.g., 168 for 7 days
433
+ vacuum_hours: int | None = None,
419
434
  existing_schema: pa.Schema | None = None,
420
- temp_volume_path: Optional[Union[str, DatabricksPath]] = None
435
+ temp_volume_path: Optional[Union[str, DatabricksPath]] = None,
421
436
  ):
422
- """Insert Arrow data by staging to a temp volume and running SQL.
437
+ """Insert Arrow data by staging Parquet to a temp volume and running Databricks SQL.
438
+
439
+ Notes:
440
+ - If the table does not exist, it is created from the input Arrow schema (best-effort).
441
+ - If `match_by` is provided, uses MERGE INTO (upsert).
442
+ - Otherwise uses INSERT INTO / INSERT OVERWRITE depending on mode.
423
443
 
424
444
  Args:
425
445
  data: Arrow table/batch data to insert.
@@ -427,14 +447,14 @@ class SQLEngine(WorkspaceService):
427
447
  catalog_name: Optional catalog override.
428
448
  schema_name: Optional schema override.
429
449
  table_name: Optional table name override.
430
- mode: Insert mode ("auto", "append", "overwrite").
450
+ mode: Insert mode ("auto", "append", "overwrite"). ("auto" behaves like append here.)
431
451
  cast_options: Optional casting options.
432
- overwrite_schema: Whether to overwrite schema.
433
- match_by: Optional merge keys for upserts.
434
- zorder_by: Optional Z-ORDER columns.
435
- optimize_after_merge: Whether to run OPTIMIZE after merge.
436
- vacuum_hours: Optional VACUUM retention window.
437
- existing_schema: Optional pre-fetched schema.
452
+ overwrite_schema: Reserved for parity with Spark path (unused here).
453
+ match_by: Merge keys for MERGE INTO upserts.
454
+ zorder_by: Columns for OPTIMIZE ZORDER BY.
455
+ optimize_after_merge: Run OPTIMIZE after MERGE (in addition to ZORDER optimization).
456
+ vacuum_hours: Optional VACUUM retention window in hours.
457
+ existing_schema: Optional pre-fetched destination schema (Arrow).
438
458
  temp_volume_path: Optional temp volume path override.
439
459
 
440
460
  Returns:
@@ -445,26 +465,26 @@ class SQLEngine(WorkspaceService):
445
465
  catalog_name=catalog_name,
446
466
  schema_name=schema_name,
447
467
  table_name=table_name,
448
- safe_chars=True
468
+ safe_chars=True,
449
469
  )
450
470
 
451
- with self as connected:
471
+ with self.connect() as connected:
452
472
  if existing_schema is None:
453
473
  try:
454
474
  existing_schema = connected.get_table_schema(
455
475
  catalog_name=catalog_name,
456
476
  schema_name=schema_name,
457
477
  table_name=table_name,
458
- to_arrow_schema=True
478
+ to_arrow_schema=True,
459
479
  )
460
480
  except ValueError as exc:
461
- data = convert(data, pa.Table)
462
- existing_schema = data.schema
481
+ data_tbl = convert(data, pa.Table)
482
+ existing_schema = data_tbl.schema
463
483
  logger.warning(
464
- "Table %s not found, %s, creating it based on input data %s",
484
+ "Table %s not found (%s). Creating it from input schema (columns=%s)",
465
485
  location,
466
486
  exc,
467
- existing_schema.names
487
+ existing_schema.names,
468
488
  )
469
489
 
470
490
  connected.create_table(
@@ -472,12 +492,12 @@ class SQLEngine(WorkspaceService):
472
492
  catalog_name=catalog_name,
473
493
  schema_name=schema_name,
474
494
  table_name=table_name,
475
- if_not_exists=True
495
+ if_not_exists=True,
476
496
  )
477
497
 
478
498
  try:
479
499
  return connected.arrow_insert_into(
480
- data=data,
500
+ data=data_tbl,
481
501
  location=location,
482
502
  catalog_name=catalog_name,
483
503
  schema_name=schema_name,
@@ -489,54 +509,62 @@ class SQLEngine(WorkspaceService):
489
509
  zorder_by=zorder_by,
490
510
  optimize_after_merge=optimize_after_merge,
491
511
  vacuum_hours=vacuum_hours,
492
- existing_schema=existing_schema
512
+ existing_schema=existing_schema,
493
513
  )
494
- except:
514
+ except Exception:
515
+ logger.exception("Arrow insert failed after auto-creating %s; attempting cleanup (DROP TABLE)", location)
495
516
  try:
496
517
  connected.drop_table(location=location)
497
- except Exception as e:
498
- logger.warning("Failed to drop table %s after auto creation on error: %s", location, e)
518
+ except Exception:
519
+ logger.exception("Failed to drop table %s after auto creation error", location)
499
520
  raise
500
521
 
501
522
  transaction_id = self._random_suffix()
502
523
 
503
- data = convert(data, pa.Table, options=cast_options, target_field=existing_schema)
524
+ data_tbl = convert(
525
+ data, pa.Table,
526
+ options=cast_options, target_field=existing_schema
527
+ )
528
+ num_rows = data_tbl.num_rows
529
+
530
+ logger.debug(
531
+ "Arrow inserting %s rows into %s (mode=%s, match_by=%s, zorder_by=%s)",
532
+ num_rows,
533
+ location,
534
+ mode,
535
+ match_by,
536
+ zorder_by,
537
+ )
504
538
 
505
539
  # Write in temp volume
506
540
  temp_volume_path = connected.dbfs_path(
507
541
  kind=DatabricksPathKind.VOLUME,
508
- parts=[catalog_name, schema_name, "tmp", "sql", transaction_id]
542
+ parts=[catalog_name, schema_name, "tmp", "sql", transaction_id],
509
543
  ) if temp_volume_path is None else DatabricksPath.parse(obj=temp_volume_path, workspace=connected.workspace)
510
544
 
545
+ logger.debug("Staging Parquet to temp volume: %s", temp_volume_path)
511
546
  temp_volume_path.mkdir()
547
+ temp_volume_path.write_arrow_table(data_tbl)
512
548
 
513
- temp_volume_path.write_arrow_table(data)
514
-
515
- # get column list from arrow schema
516
- columns = [c for c in existing_schema.names]
549
+ columns = list(existing_schema.names)
517
550
  cols_quoted = ", ".join([f"`{c}`" for c in columns])
518
551
 
519
- statements = []
552
+ statements: list[str] = []
520
553
 
521
- # Decide how to ingest
522
- # If merge keys provided -> use MERGE
523
554
  if match_by:
524
- # build ON condition using match_by
525
- on_clauses = []
526
- for k in match_by:
527
- on_clauses.append(f"T.`{k}` = S.`{k}`")
528
- on_condition = " AND ".join(on_clauses)
555
+ on_condition = " AND ".join([f"T.`{k}` = S.`{k}`" for k in match_by])
529
556
 
530
- # build UPDATE set (all columns except match_by)
531
557
  update_cols = [c for c in columns if c not in match_by]
532
558
  if update_cols:
533
559
  update_set = ", ".join([f"T.`{c}` = S.`{c}`" for c in update_cols])
534
560
  update_clause = f"WHEN MATCHED THEN UPDATE SET {update_set}"
535
561
  else:
536
- update_clause = "" # nothing to update
562
+ update_clause = ""
537
563
 
538
- # build INSERT clause
539
- insert_clause = f"WHEN NOT MATCHED THEN INSERT ({cols_quoted}) VALUES ({', '.join([f'S.`{c}`' for c in columns])})"
564
+ insert_clause = (
565
+ f"WHEN NOT MATCHED THEN INSERT ({cols_quoted}) "
566
+ f"VALUES ({', '.join([f'S.`{c}`' for c in columns])})"
567
+ )
540
568
 
541
569
  merge_sql = f"""MERGE INTO {location} AS T
542
570
  USING (
@@ -546,41 +574,47 @@ ON {on_condition}
546
574
  {update_clause}
547
575
  {insert_clause}"""
548
576
  statements.append(merge_sql)
549
-
550
577
  else:
551
- # No match_by -> plain insert
552
578
  if mode.lower() in ("overwrite",):
553
579
  insert_sql = f"""INSERT OVERWRITE {location}
554
580
  SELECT {cols_quoted}
555
581
  FROM parquet.`{temp_volume_path}`"""
556
582
  else:
557
- # default: append
558
583
  insert_sql = f"""INSERT INTO {location} ({cols_quoted})
559
584
  SELECT {cols_quoted}
560
585
  FROM parquet.`{temp_volume_path}`"""
561
586
  statements.append(insert_sql)
562
587
 
563
- # Execute statements (use your existing execute helper)
564
588
  try:
565
589
  for stmt in statements:
566
- # trim and run
567
590
  connected.execute(stmt.strip())
568
591
  finally:
569
592
  try:
570
593
  temp_volume_path.rmdir(recursive=True)
571
- except Exception as e:
572
- logger.warning(e)
594
+ except Exception:
595
+ logger.exception("Failed cleaning temp volume: %s", temp_volume_path)
596
+
597
+ logger.info(
598
+ "Arrow inserted %s rows into %s (mode=%s, match_by=%s, zorder_by=%s)",
599
+ num_rows,
600
+ location,
601
+ mode,
602
+ match_by,
603
+ zorder_by,
604
+ )
573
605
 
574
- # Optionally run OPTIMIZE / ZORDER / VACUUM if requested (Databricks SQL)
575
606
  if zorder_by:
576
607
  zcols = ", ".join([f"`{c}`" for c in zorder_by])
577
608
  optimize_sql = f"OPTIMIZE {location} ZORDER BY ({zcols})"
609
+ logger.info("Running OPTIMIZE ZORDER BY: %s", zorder_by)
578
610
  connected.execute(optimize_sql)
579
611
 
580
612
  if optimize_after_merge and match_by:
613
+ logger.info("Running OPTIMIZE after MERGE")
581
614
  connected.execute(f"OPTIMIZE {location}")
582
615
 
583
616
  if vacuum_hours is not None:
617
+ logger.info("Running VACUUM retain=%s hours", vacuum_hours)
584
618
  connected.execute(f"VACUUM {location} RETAIN {vacuum_hours} HOURS")
585
619
 
586
620
  return None
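A standalone sketch of the statement construction above: given the destination, the staged Parquet directory and the column list, build either a MERGE (when match keys are supplied) or a plain INSERT / INSERT OVERWRITE. The table name and volume path below are illustrative, and the body of the MERGE's USING clause is assumed to be a SELECT over the staged files (it is elided by the hunk boundary above).

from typing import Optional

def build_ingest_sql(
    location: str,
    staged_path: str,
    columns: list[str],
    match_by: Optional[list[str]] = None,
    mode: str = "append",
) -> str:
    """Build the MERGE or INSERT statement used to load staged Parquet files."""
    cols_quoted = ", ".join(f"`{c}`" for c in columns)

    if match_by:
        on_condition = " AND ".join(f"T.`{k}` = S.`{k}`" for k in match_by)
        update_cols = [c for c in columns if c not in match_by]
        update_clause = (
            "WHEN MATCHED THEN UPDATE SET "
            + ", ".join(f"T.`{c}` = S.`{c}`" for c in update_cols)
            if update_cols else ""
        )
        insert_values = ", ".join(f"S.`{c}`" for c in columns)
        return (
            f"MERGE INTO {location} AS T\n"
            f"USING (SELECT {cols_quoted} FROM parquet.`{staged_path}`) AS S\n"
            f"ON {on_condition}\n"
            f"{update_clause}\n"
            f"WHEN NOT MATCHED THEN INSERT ({cols_quoted}) VALUES ({insert_values})"
        )

    if mode.lower() == "overwrite":
        return f"INSERT OVERWRITE {location}\nSELECT {cols_quoted}\nFROM parquet.`{staged_path}`"
    return f"INSERT INTO {location} ({cols_quoted})\nSELECT {cols_quoted}\nFROM parquet.`{staged_path}`"

print(build_ingest_sql("`main`.`default`.`events`", "/Volumes/main/default/tmp/sql/tx_1",
                       columns=["id", "ts", "payload"], match_by=["id"]))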
@@ -596,13 +630,20 @@ FROM parquet.`{temp_volume_path}`"""
596
630
  mode: str = "auto",
597
631
  cast_options: Optional[CastOptions] = None,
598
632
  overwrite_schema: bool | None = None,
599
- match_by: list[str] = None,
600
- zorder_by: list[str] = None,
633
+ match_by: Optional[list[str]] = None,
634
+ zorder_by: Optional[list[str]] = None,
601
635
  optimize_after_merge: bool = False,
602
- vacuum_hours: int | None = None, # e.g., 168 for 7 days
636
+ vacuum_hours: int | None = None,
603
637
  spark_options: Optional[Dict[str, Any]] = None,
604
638
  ):
605
- """Insert a Spark DataFrame into a Delta table with optional merge semantics.
639
+ """Insert a Spark DataFrame into a Delta table (append/overwrite/merge).
640
+
641
+ Behavior:
642
+ - If the table does not exist: creates it via `saveAsTable(location)` (overwrite).
643
+ - If `match_by` is provided: uses Delta MERGE for upserts.
644
+ - If mode == "overwrite": deletes matching keys first, then appends the batch (fast-ish overwrite-by-key).
645
+ - Else: updates matching rows + inserts new ones.
646
+ - Else: uses `DataFrameWriter.saveAsTable` with mode.
606
647
 
607
648
  Args:
608
649
  data: Spark DataFrame to insert.
@@ -611,12 +652,12 @@ FROM parquet.`{temp_volume_path}`"""
611
652
  schema_name: Optional schema override.
612
653
  table_name: Optional table name override.
613
654
  mode: Insert mode ("auto", "append", "overwrite").
614
- cast_options: Optional casting options.
615
- overwrite_schema: Whether to overwrite schema.
616
- match_by: Optional merge keys for upserts.
617
- zorder_by: Optional Z-ORDER columns.
618
- optimize_after_merge: Whether to run OPTIMIZE after merge.
619
- vacuum_hours: Optional VACUUM retention window.
655
+ cast_options: Optional casting options (align to destination schema).
656
+ overwrite_schema: Whether to overwrite schema on write (when supported).
657
+ match_by: Merge keys for upserts.
658
+ zorder_by: Z-ORDER columns (used only if `optimize_after_merge` is True).
659
+ optimize_after_merge: Whether to run Delta optimize (and z-order) after merge.
660
+ vacuum_hours: Optional VACUUM retention window in hours.
620
661
  spark_options: Optional Spark write options.
621
662
 
622
663
  Returns:
@@ -627,7 +668,15 @@ FROM parquet.`{temp_volume_path}`"""
627
668
  catalog_name=catalog_name,
628
669
  schema_name=schema_name,
629
670
  table_name=table_name,
630
- safe_chars=True
671
+ safe_chars=True,
672
+ )
673
+
674
+ logger.info(
675
+ "Spark insert into %s (mode=%s, match_by=%s, overwrite_schema=%s)",
676
+ location,
677
+ mode,
678
+ match_by,
679
+ overwrite_schema,
631
680
  )
632
681
 
633
682
  spark_options = spark_options if spark_options else {}
@@ -636,11 +685,13 @@ FROM parquet.`{temp_volume_path}`"""
636
685
 
637
686
  try:
638
687
  existing_schema = self.get_table_schema(
639
- catalog_name=catalog_name, schema_name=schema_name,
688
+ catalog_name=catalog_name,
689
+ schema_name=schema_name,
640
690
  table_name=table_name,
641
- to_arrow_schema=False
691
+ to_arrow_schema=False,
642
692
  )
643
693
  except ValueError:
694
+ logger.warning("Destination table missing; creating table %s via overwrite write", location)
644
695
  data = convert(data, pyspark.sql.DataFrame)
645
696
  data.write.mode("overwrite").options(**spark_options).saveAsTable(location)
646
697
  return
@@ -651,29 +702,25 @@ FROM parquet.`{temp_volume_path}`"""
651
702
  cast_options = CastOptions.check_arg(options=cast_options, target_field=existing_schema)
652
703
  data = cast_spark_dataframe(data, options=cast_options)
653
704
 
654
- # --- Sanity checks & pre-cleaning (avoid nulls in keys) ---
655
- if match_by:
656
- notnull: pyspark.sql.Column = None
705
+ logger.debug("Incoming Spark columns: %s", data.columns)
657
706
 
707
+ if match_by:
708
+ notnull = None
658
709
  for k in match_by:
659
710
  if k not in data.columns:
660
711
  raise ValueError(f"Missing match key '{k}' in DataFrame columns: {data.columns}")
661
-
662
- notnull = data[k].isNotNull() if notnull is None else notnull & (data[k].isNotNull())
712
+ notnull = data[k].isNotNull() if notnull is None else notnull & data[k].isNotNull()
663
713
 
664
714
  data = data.filter(notnull)
715
+ logger.debug("Filtered null keys for match_by=%s", match_by)
665
716
 
666
- # --- Merge (upsert) ---
667
717
  target = self.spark_table(full_name=location)
668
718
 
669
719
  if match_by:
670
- # Build merge condition on the composite key
671
720
  cond = " AND ".join([f"t.`{k}` <=> s.`{k}`" for k in match_by])
672
721
 
673
722
  if mode.casefold() == "overwrite":
674
723
  data = data.cache()
675
-
676
- # Step 1: get unique key combos from source
677
724
  distinct_keys = data.select([f"`{k}`" for k in match_by]).distinct()
678
725
 
679
726
  (
@@ -683,35 +730,30 @@ FROM parquet.`{temp_volume_path}`"""
683
730
  .execute()
684
731
  )
685
732
 
686
- # Step 3: append the clean batch
687
- data.write.format("delta").mode("append").saveAsTable(location)
733
+ data.write.format("delta").mode("append").options(**spark_options).saveAsTable(location)
688
734
  else:
689
735
  update_cols = [c for c in data.columns if c not in match_by]
690
- set_expr = {
691
- c: F.expr(f"s.`{c}`") for c in update_cols
692
- }
736
+ set_expr = {c: F.expr(f"s.`{c}`") for c in update_cols}
693
737
 
694
- # Execute MERGE - update matching records first, then insert new ones
695
738
  (
696
739
  target.alias("t")
697
740
  .merge(data.alias("s"), cond)
698
- .whenMatchedUpdate(set=set_expr) # update matched rows
699
- .whenNotMatchedInsertAll() # insert new rows
741
+ .whenMatchedUpdate(set=set_expr)
742
+ .whenNotMatchedInsertAll()
700
743
  .execute()
701
744
  )
702
745
  else:
703
746
  if mode == "auto":
704
747
  mode = "append"
748
+ logger.info("Spark write saveAsTable mode=%s", mode)
705
749
  data.write.mode(mode).options(**spark_options).saveAsTable(location)
706
750
 
707
- # --- Optimize: Z-ORDER for faster lookups by composite key (Databricks) ---
708
751
  if optimize_after_merge and zorder_by:
709
- # pass columns as varargs
752
+ logger.info("Delta optimize + zorder (%s)", zorder_by)
710
753
  target.optimize().executeZOrderBy(*zorder_by)
711
754
 
712
- # --- Optional VACUUM ---
713
755
  if vacuum_hours is not None:
714
- # Beware data retention policies; set to a safe value or use default 7 days
756
+ logger.info("Delta vacuum retain=%s hours", vacuum_hours)
715
757
  target.vacuum(vacuum_hours)
716
758
 
717
759
  def get_table_schema(
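For the Spark path above, the upsert is expressed through the delta-spark DeltaTable builder API rather than SQL. A condensed sketch of that merge, assuming a Databricks/Spark runtime with delta-spark installed; `full_name` and the key list are illustrative.

from delta.tables import DeltaTable
from pyspark.sql import DataFrame, SparkSession
import pyspark.sql.functions as F

def upsert_dataframe(spark: SparkSession, df: DataFrame, full_name: str, match_by: list[str]) -> None:
    """MERGE df into the Delta table: update matched rows, insert new ones."""
    target = DeltaTable.forName(sparkSession=spark, tableOrViewName=full_name)
    cond = " AND ".join(f"t.`{k}` <=> s.`{k}`" for k in match_by)  # null-safe key equality
    update_cols = [c for c in df.columns if c not in match_by]
    (
        target.alias("t")
        .merge(df.alias("s"), cond)
        .whenMatchedUpdate(set={c: F.expr(f"s.`{c}`") for c in update_cols})
        .whenNotMatchedInsertAll()
        .execute()
    )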
@@ -719,24 +761,24 @@ FROM parquet.`{temp_volume_path}`"""
719
761
  catalog_name: Optional[str] = None,
720
762
  schema_name: Optional[str] = None,
721
763
  table_name: Optional[str] = None,
722
- to_arrow_schema: bool = True
764
+ to_arrow_schema: bool = True,
723
765
  ) -> Union[pa.Field, pa.Schema]:
724
- """Fetch a table schema from Unity Catalog as Arrow types.
766
+ """Fetch a table schema from Unity Catalog and convert it to Arrow types.
725
767
 
726
768
  Args:
727
769
  catalog_name: Optional catalog override.
728
770
  schema_name: Optional schema override.
729
771
  table_name: Optional table name override.
730
- to_arrow_schema: Whether to return an Arrow schema or field.
772
+ to_arrow_schema: If True returns pa.Schema; else returns a pa.Field(STRUCT<...>).
731
773
 
732
774
  Returns:
733
- Arrow Schema or Field representing the table.
775
+ Arrow Schema or a STRUCT Field representing the table.
734
776
  """
735
777
  full_name = self.table_full_name(
736
778
  catalog_name=catalog_name,
737
779
  schema_name=schema_name,
738
780
  table_name=table_name,
739
- safe_chars=False
781
+ safe_chars=False,
740
782
  )
741
783
 
742
784
  wk = self.workspace.sdk()
@@ -746,10 +788,7 @@ FROM parquet.`{temp_volume_path}`"""
746
788
  except Exception as e:
747
789
  raise ValueError(f"Table %s not found, {type(e)} {e}" % full_name)
748
790
 
749
- fields = [
750
- column_info_to_arrow_field(_)
751
- for _ in table.columns
752
- ]
791
+ fields = [column_info_to_arrow_field(_) for _ in table.columns]
753
792
 
754
793
  if to_arrow_schema:
755
794
  return pa.schema(fields, metadata={b"name": table_name})
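The `to_arrow_schema` switch above decides between a flat pa.Schema and a single STRUCT-typed pa.Field wrapping the same columns (the struct branch is not visible in this hunk, so its exact shape is an assumption). A small standalone illustration:

import pyarrow as pa

fields = [pa.field("id", pa.int64(), nullable=False), pa.field("name", pa.string())]

as_schema = pa.schema(fields, metadata={b"name": b"users"})  # to_arrow_schema=True
as_struct = pa.field("users", pa.struct(fields))             # to_arrow_schema=False (assumed shape)

print(as_schema)
print(as_struct)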
@@ -762,25 +801,15 @@ FROM parquet.`{temp_volume_path}`"""
762
801
  schema_name: Optional[str] = None,
763
802
  table_name: Optional[str] = None,
764
803
  ):
765
- """Drop a table if it exists.
766
-
767
- Args:
768
- location: Fully qualified table name override.
769
- catalog_name: Optional catalog override.
770
- schema_name: Optional schema override.
771
- table_name: Optional table name override.
772
-
773
- Returns:
774
- The StatementResult from executing the drop statement.
775
- """
804
+ """Drop a table if it exists."""
776
805
  location, _, _, _ = self._check_location_params(
777
806
  location=location,
778
807
  catalog_name=catalog_name,
779
808
  schema_name=schema_name,
780
809
  table_name=table_name,
781
- safe_chars=True
810
+ safe_chars=True,
782
811
  )
783
-
812
+ logger.info("Dropping table if exists: %s", location)
784
813
  return self.execute(f"DROP TABLE IF EXISTS {location}")
785
814
 
786
815
  def create_table(
@@ -797,23 +826,29 @@ FROM parquet.`{temp_volume_path}`"""
797
826
  if_not_exists: bool = True,
798
827
  optimize_write: bool = True,
799
828
  auto_compact: bool = True,
800
- execute: bool = True
801
- ) -> str:
802
- """
803
- Generate DDL (Data Definition Language) SQL for creating a table from a PyField schema.
829
+ execute: bool = True,
830
+ wait_result: bool = True
831
+ ) -> Union[str, "StatementResult"]:
832
+ """Generate (and optionally execute) CREATE TABLE DDL from an Arrow schema/field.
804
833
 
805
834
  Args:
806
- field: PyField schema that defines the table structure
807
- table_name: Name of the table to create (defaults to schema.name)
808
- catalog_name: Optional catalog name (defaults to "hive_metastore")
809
- schema_name: Optional schema name (defaults to "default")
810
- partition_by: Optional list of column names to partition the table by
811
- comment: Optional table comment
812
- options: Optional table properties
813
- if_not_exists: Whether to add IF NOT EXISTS clause
835
+ field: Arrow Field or Schema describing the table. If `field` is a schema, it's converted.
836
+ location: Fully qualified table name override.
837
+ table_name: Table name override (used if location not provided).
838
+ catalog_name: Catalog override.
839
+ schema_name: Schema override.
840
+ partition_by: Optional partition columns.
841
+ cluster_by: If True -> CLUSTER BY AUTO. If list[str] -> CLUSTER BY (..). If False -> no clustering.
842
+ comment: Optional table comment (falls back to field metadata b"comment" when present).
843
+ options: Extra table properties.
844
+ if_not_exists: Add IF NOT EXISTS clause.
845
+ optimize_write: Sets delta.autoOptimize.optimizeWrite table property.
846
+ auto_compact: Sets delta.autoOptimize.autoCompact table property.
847
+ execute: If True, executes DDL and returns StatementResult; otherwise returns SQL string.
848
+ wait_result: Waits execution to complete
814
849
 
815
850
  Returns:
816
- A SQL string for creating the table
851
+ StatementResult if execute=True, else the DDL SQL string.
817
852
  """
818
853
  if not isinstance(field, pa.Field):
819
854
  field = convert(field, pa.Field)
@@ -823,7 +858,7 @@ FROM parquet.`{temp_volume_path}`"""
823
858
  catalog_name=catalog_name,
824
859
  schema_name=schema_name,
825
860
  table_name=table_name,
826
- safe_chars=True
861
+ safe_chars=True,
827
862
  )
828
863
 
829
864
  if pa.types.is_struct(field.type):
@@ -831,28 +866,22 @@ FROM parquet.`{temp_volume_path}`"""
831
866
  else:
832
867
  children = [field]
833
868
 
834
- # Create the DDL statement
835
- column_definitions = [
836
- self._field_to_ddl(child)
837
- for child in children
838
- ]
869
+ column_definitions = [self._field_to_ddl(child) for child in children]
839
870
 
840
871
  sql = [
841
872
  f"CREATE TABLE {'IF NOT EXISTS ' if if_not_exists else ''}{location} (",
842
873
  ",\n ".join(column_definitions),
843
- ")"
874
+ ")",
844
875
  ]
845
876
 
846
- # Add partition by clause if provided
847
- if partition_by and len(partition_by) > 0:
877
+ if partition_by:
848
878
  sql.append(f"\nPARTITIONED BY ({', '.join(partition_by)})")
849
879
  elif cluster_by:
850
880
  if isinstance(cluster_by, bool):
851
- sql.append(f"\nCLUSTER BY AUTO")
881
+ sql.append("\nCLUSTER BY AUTO")
852
882
  else:
853
883
  sql.append(f"\nCLUSTER BY ({', '.join(cluster_by)})")
854
884
 
855
- # Add comment if provided
856
885
  if not comment and field.metadata:
857
886
  comment = field.metadata.get(b"comment")
858
887
 
@@ -862,32 +891,33 @@ FROM parquet.`{temp_volume_path}`"""
862
891
  if comment:
863
892
  sql.append(f"\nCOMMENT '{comment}'")
864
893
 
865
- # Add options if provided
866
894
  options = {} if options is None else options
867
895
  options.update({
868
896
  "delta.autoOptimize.optimizeWrite": optimize_write,
869
- "delta.autoOptimize.autoCompact": auto_compact
897
+ "delta.autoOptimize.autoCompact": auto_compact,
870
898
  })
871
899
 
872
900
  option_strs = []
873
-
874
- if options:
875
- for key, value in options.items():
876
- if isinstance(value, str):
877
- option_strs.append(f"'{key}' = '{value}'")
878
- elif isinstance(value, bool):
879
- b_value = "true" if value else "false"
880
- option_strs.append(f"'{key}' = '{b_value}'")
881
- else:
882
- option_strs.append(f"'{key}' = {value}")
901
+ for key, value in (options or {}).items():
902
+ if isinstance(value, str):
903
+ option_strs.append(f"'{key}' = '{value}'")
904
+ elif isinstance(value, bool):
905
+ option_strs.append(f"'{key}' = '{'true' if value else 'false'}'")
906
+ else:
907
+ option_strs.append(f"'{key}' = {value}")
883
908
 
884
909
  if option_strs:
885
910
  sql.append(f"\nTBLPROPERTIES ({', '.join(option_strs)})")
886
911
 
887
912
  statement = "\n".join(sql)
888
913
 
914
+ logger.debug(
915
+ "Generated CREATE TABLE DDL for %s:\n%s",
916
+ location, statement
917
+ )
918
+
889
919
  if execute:
890
- return self.execute(statement)
920
+ return self.execute(statement, wait_result=wait_result)
891
921
  return statement
892
922
 
893
923
  def _check_location_params(
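To make the generated DDL concrete, here is the rough shape produced for a small Arrow schema under the defaults above (IF NOT EXISTS, CLUSTER BY AUTO, auto-optimize table properties). The resolved table name is illustrative and exact whitespace may differ from the real output.

import pyarrow as pa

schema = pa.schema([
    pa.field("id", pa.int64(), nullable=False),
    pa.field("ts", pa.timestamp("us", tz="UTC")),
    pa.field("payload", pa.string()),
])

# Approximate result of create_table(field=schema, cluster_by=True, execute=False):
expected_ddl = """CREATE TABLE IF NOT EXISTS `main`.`default`.`events` (
  id BIGINT NOT NULL,
  ts TIMESTAMP,
  payload STRING
)
CLUSTER BY AUTO
TBLPROPERTIES ('delta.autoOptimize.optimizeWrite' = 'true', 'delta.autoOptimize.autoCompact' = 'true')"""
print(expected_ddl)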
@@ -896,28 +926,18 @@ FROM parquet.`{temp_volume_path}`"""
896
926
  catalog_name: Optional[str] = None,
897
927
  schema_name: Optional[str] = None,
898
928
  table_name: Optional[str] = None,
899
- safe_chars: bool = True
900
- ):
901
- """Resolve location/catalog/schema/table parameters to a full name.
902
-
903
- Args:
904
- location: Fully qualified table name override.
905
- catalog_name: Optional catalog override.
906
- schema_name: Optional schema override.
907
- table_name: Optional table name override.
908
- safe_chars: Whether to wrap identifiers in backticks.
909
-
910
- Returns:
911
- A tuple of (location, catalog_name, schema_name, table_name).
912
- """
929
+ safe_chars: bool = True,
930
+ ) -> tuple[str, Optional[str], Optional[str], Optional[str]]:
931
+ """Resolve (location OR catalog/schema/table) into a fully-qualified name."""
913
932
  if location:
914
933
  c, s, t = self._catalog_schema_table_names(location)
915
934
  catalog_name, schema_name, table_name = catalog_name or c, schema_name or s, table_name or t
916
935
 
917
936
  location = self.table_full_name(
918
- catalog_name=catalog_name, schema_name=schema_name,
937
+ catalog_name=catalog_name,
938
+ schema_name=schema_name,
919
939
  table_name=table_name,
920
- safe_chars=safe_chars
940
+ safe_chars=safe_chars,
921
941
  )
922
942
 
923
943
  return location, catalog_name or self.catalog_name, schema_name or self.schema_name, table_name
@@ -927,98 +947,68 @@ FROM parquet.`{temp_volume_path}`"""
927
947
  field: pa.Field,
928
948
  put_name: bool = True,
929
949
  put_not_null: bool = True,
930
- put_comment: bool = True
950
+ put_comment: bool = True,
931
951
  ) -> str:
932
- """
933
- Convert a PyField to a DDL column definition.
934
-
935
- Args:
936
- field: The PyField to convert
937
-
938
- Returns:
939
- A string containing the column definition in DDL format
940
- """
952
+ """Convert an Arrow Field to a Databricks SQL column DDL fragment."""
941
953
  name = field.name
942
954
  nullable_str = " NOT NULL" if put_not_null and not field.nullable else ""
943
955
  name_str = f"{name} " if put_name else ""
944
956
 
945
- # Get comment if available
946
957
  comment_str = ""
947
958
  if put_comment and field.metadata and b"comment" in field.metadata:
948
959
  comment = field.metadata[b"comment"].decode("utf-8")
949
960
  comment_str = f" COMMENT '{comment}'"
950
961
 
951
- # Handle primitive types
952
962
  if not pa.types.is_nested(field.type):
953
963
  sql_type = SQLEngine._arrow_to_sql_type(field.type)
954
964
  return f"{name_str}{sql_type}{nullable_str}{comment_str}"
955
965
 
956
- # Handle struct type
957
966
  if pa.types.is_struct(field.type):
958
967
  child_defs = [SQLEngine._field_to_ddl(child) for child in field.type]
959
968
  struct_body = ", ".join(child_defs)
960
969
  return f"{name_str}STRUCT<{struct_body}>{nullable_str}{comment_str}"
961
970
 
962
- # Handle map type
963
971
  if pa.types.is_map(field.type):
964
972
  map_type: pa.MapType = field.type
965
973
  key_type = SQLEngine._field_to_ddl(map_type.key_field, put_name=False, put_comment=False, put_not_null=False)
966
974
  val_type = SQLEngine._field_to_ddl(map_type.item_field, put_name=False, put_comment=False, put_not_null=False)
967
975
  return f"{name_str}MAP<{key_type}, {val_type}>{nullable_str}{comment_str}"
968
976
 
969
- # Handle list type after map
970
977
  if pa.types.is_list(field.type) or pa.types.is_large_list(field.type):
971
978
  list_type: pa.ListType = field.type
972
979
  elem_type = SQLEngine._field_to_ddl(list_type.value_field, put_name=False, put_comment=False, put_not_null=False)
973
980
  return f"{name_str}ARRAY<{elem_type}>{nullable_str}{comment_str}"
974
981
 
975
- # Default fallback to string for unknown types
976
982
  raise TypeError(f"Cannot make ddl field from {field}")
977
983
 
978
984
  @staticmethod
979
- def _arrow_to_sql_type(
980
- arrow_type: Union[pa.DataType, pa.Decimal128Type]
981
- ) -> str:
982
- """
983
- Convert an Arrow data type to SQL data type.
984
-
985
- Args:
986
- arrow_type: The Arrow data type
987
-
988
- Returns:
989
- A string containing the SQL data type
990
- """
985
+ def _arrow_to_sql_type(arrow_type: Union[pa.DataType, pa.Decimal128Type]) -> str:
986
+ """Convert an Arrow data type to a Databricks SQL type string."""
991
987
  if pa.types.is_boolean(arrow_type):
992
988
  return "BOOLEAN"
993
- elif pa.types.is_int8(arrow_type):
989
+ if pa.types.is_int8(arrow_type):
994
990
  return "TINYINT"
995
- elif pa.types.is_int16(arrow_type):
991
+ if pa.types.is_int16(arrow_type):
996
992
  return "SMALLINT"
997
- elif pa.types.is_int32(arrow_type):
993
+ if pa.types.is_int32(arrow_type):
998
994
  return "INT"
999
- elif pa.types.is_int64(arrow_type):
995
+ if pa.types.is_int64(arrow_type):
1000
996
  return "BIGINT"
1001
- elif pa.types.is_float32(arrow_type):
997
+ if pa.types.is_float32(arrow_type):
1002
998
  return "FLOAT"
1003
- elif pa.types.is_float64(arrow_type):
999
+ if pa.types.is_float64(arrow_type):
1004
1000
  return "DOUBLE"
1005
- elif is_arrow_type_string_like(arrow_type):
1001
+ if is_arrow_type_string_like(arrow_type):
1006
1002
  return "STRING"
1007
- elif is_arrow_type_binary_like(arrow_type):
1003
+ if is_arrow_type_binary_like(arrow_type):
1008
1004
  return "BINARY"
1009
- elif pa.types.is_timestamp(arrow_type):
1005
+ if pa.types.is_timestamp(arrow_type):
1010
1006
  tz = getattr(arrow_type, "tz", None)
1011
-
1012
- if tz:
1013
- return "TIMESTAMP"
1014
- return "TIMESTAMP_NTZ"
1015
- elif pa.types.is_date(arrow_type):
1007
+ return "TIMESTAMP" if tz else "TIMESTAMP_NTZ"
1008
+ if pa.types.is_date(arrow_type):
1016
1009
  return "DATE"
1017
- elif pa.types.is_decimal(arrow_type):
1018
- precision = arrow_type.precision
1019
- scale = arrow_type.scale
1020
- return f"DECIMAL({precision}, {scale})"
1021
- elif pa.types.is_null(arrow_type):
1010
+ if pa.types.is_decimal(arrow_type):
1011
+ return f"DECIMAL({arrow_type.precision}, {arrow_type.scale})"
1012
+ if pa.types.is_null(arrow_type):
1022
1013
  return "STRING"
1023
- else:
1024
- raise ValueError(f"Cannot make ddl type for {arrow_type}")
1014
+ raise ValueError(f"Cannot make ddl type for {arrow_type}")
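The type mapping above is easy to exercise directly; a standalone sketch reproducing a few representative cases, including the tz-based split between TIMESTAMP and TIMESTAMP_NTZ.

import pyarrow as pa

def arrow_to_sql_type(t: pa.DataType) -> str:
    """Subset of the Arrow -> Databricks SQL type mapping shown above."""
    if pa.types.is_boolean(t):
        return "BOOLEAN"
    if pa.types.is_int32(t):
        return "INT"
    if pa.types.is_int64(t):
        return "BIGINT"
    if pa.types.is_float64(t):
        return "DOUBLE"
    if pa.types.is_string(t) or pa.types.is_large_string(t):
        return "STRING"
    if pa.types.is_timestamp(t):
        return "TIMESTAMP" if t.tz else "TIMESTAMP_NTZ"
    if pa.types.is_decimal(t):
        return f"DECIMAL({t.precision}, {t.scale})"
    raise ValueError(f"unmapped type: {t}")

for t in (pa.int64(), pa.timestamp("us"), pa.timestamp("us", tz="UTC"), pa.decimal128(18, 2)):
    print(t, "->", arrow_to_sql_type(t))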