ygg 0.1.50__tar.gz → 0.1.52__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. {ygg-0.1.50 → ygg-0.1.52}/PKG-INFO +1 -1
  2. {ygg-0.1.50 → ygg-0.1.52}/pyproject.toml +1 -1
  3. {ygg-0.1.50 → ygg-0.1.52}/src/ygg.egg-info/PKG-INFO +1 -1
  4. {ygg-0.1.50 → ygg-0.1.52}/src/ygg.egg-info/SOURCES.txt +2 -0
  5. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/databricks/sql/engine.py +288 -84
  6. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/databricks/sql/exceptions.py +3 -1
  7. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/databricks/workspaces/io.py +80 -71
  8. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/databricks/workspaces/path.py +369 -168
  9. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/databricks/workspaces/path_kind.py +3 -3
  10. ygg-0.1.52/src/yggdrasil/databricks/workspaces/volumes_path.py +85 -0
  11. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/libs/databrickslib.py +5 -0
  12. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/pyutils/callable_serde.py +10 -10
  13. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/pyutils/retry.py +2 -2
  14. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/types/cast/registry.py +0 -14
  15. ygg-0.1.52/src/yggdrasil/types/file_format.py +10 -0
  16. ygg-0.1.52/src/yggdrasil/version.py +1 -0
  17. ygg-0.1.50/src/yggdrasil/version.py +0 -1
  18. {ygg-0.1.50 → ygg-0.1.52}/LICENSE +0 -0
  19. {ygg-0.1.50 → ygg-0.1.52}/README.md +0 -0
  20. {ygg-0.1.50 → ygg-0.1.52}/setup.cfg +0 -0
  21. {ygg-0.1.50 → ygg-0.1.52}/src/ygg.egg-info/dependency_links.txt +0 -0
  22. {ygg-0.1.50 → ygg-0.1.52}/src/ygg.egg-info/entry_points.txt +0 -0
  23. {ygg-0.1.50 → ygg-0.1.52}/src/ygg.egg-info/requires.txt +0 -0
  24. {ygg-0.1.50 → ygg-0.1.52}/src/ygg.egg-info/top_level.txt +0 -0
  25. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/__init__.py +0 -0
  26. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/databricks/__init__.py +0 -0
  27. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/databricks/compute/__init__.py +0 -0
  28. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/databricks/compute/cluster.py +0 -0
  29. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/databricks/compute/execution_context.py +0 -0
  30. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/databricks/compute/remote.py +0 -0
  31. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/databricks/jobs/__init__.py +0 -0
  32. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/databricks/jobs/config.py +0 -0
  33. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/databricks/sql/__init__.py +0 -0
  34. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/databricks/sql/statement_result.py +0 -0
  35. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/databricks/sql/types.py +0 -0
  36. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/databricks/sql/warehouse.py +0 -0
  37. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/databricks/workspaces/__init__.py +0 -0
  38. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/databricks/workspaces/filesytem.py +0 -0
  39. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/databricks/workspaces/workspace.py +0 -0
  40. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/dataclasses/__init__.py +0 -0
  41. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/dataclasses/dataclass.py +0 -0
  42. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/libs/__init__.py +0 -0
  43. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/libs/extensions/__init__.py +0 -0
  44. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/libs/extensions/polars_extensions.py +0 -0
  45. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/libs/extensions/spark_extensions.py +0 -0
  46. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/libs/pandaslib.py +0 -0
  47. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/libs/polarslib.py +0 -0
  48. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/libs/sparklib.py +0 -0
  49. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/pyutils/__init__.py +0 -0
  50. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/pyutils/equality.py +0 -0
  51. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/pyutils/exceptions.py +0 -0
  52. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/pyutils/expiring_dict.py +0 -0
  53. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/pyutils/modules.py +0 -0
  54. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/pyutils/parallel.py +0 -0
  55. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/pyutils/python_env.py +0 -0
  56. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/requests/__init__.py +0 -0
  57. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/requests/msal.py +0 -0
  58. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/requests/session.py +0 -0
  59. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/types/__init__.py +0 -0
  60. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/types/cast/__init__.py +0 -0
  61. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/types/cast/arrow_cast.py +0 -0
  62. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/types/cast/cast_options.py +0 -0
  63. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/types/cast/pandas_cast.py +0 -0
  64. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/types/cast/polars_cast.py +0 -0
  65. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/types/cast/polars_pandas_cast.py +0 -0
  66. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/types/cast/spark_cast.py +0 -0
  67. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/types/cast/spark_pandas_cast.py +0 -0
  68. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/types/cast/spark_polars_cast.py +0 -0
  69. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/types/python_arrow.py +0 -0
  70. {ygg-0.1.50 → ygg-0.1.52}/src/yggdrasil/types/python_defaults.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ygg
- Version: 0.1.50
+ Version: 0.1.52
  Summary: Type-friendly utilities for moving data between Python objects, Arrow, Polars, Pandas, Spark, and Databricks
  Author: Yggdrasil contributors
  License: Apache License
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "ygg"
- version = "0.1.50"
+ version = "0.1.52"
  description = "Type-friendly utilities for moving data between Python objects, Arrow, Polars, Pandas, Spark, and Databricks"
  readme = { file = "README.md", content-type = "text/markdown" }
  license = { file = "LICENSE" }
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: ygg
- Version: 0.1.50
+ Version: 0.1.52
  Summary: Type-friendly utilities for moving data between Python objects, Arrow, Polars, Pandas, Spark, and Databricks
  Author: Yggdrasil contributors
  License: Apache License
@@ -27,6 +27,7 @@ src/yggdrasil/databricks/workspaces/filesytem.py
  src/yggdrasil/databricks/workspaces/io.py
  src/yggdrasil/databricks/workspaces/path.py
  src/yggdrasil/databricks/workspaces/path_kind.py
+ src/yggdrasil/databricks/workspaces/volumes_path.py
  src/yggdrasil/databricks/workspaces/workspace.py
  src/yggdrasil/dataclasses/__init__.py
  src/yggdrasil/dataclasses/dataclass.py
@@ -51,6 +52,7 @@ src/yggdrasil/requests/__init__.py
  src/yggdrasil/requests/msal.py
  src/yggdrasil/requests/session.py
  src/yggdrasil/types/__init__.py
+ src/yggdrasil/types/file_format.py
  src/yggdrasil/types/python_arrow.py
  src/yggdrasil/types/python_defaults.py
  src/yggdrasil/types/cast/__init__.py
@@ -60,8 +60,29 @@ if pyspark is not None:
  __all__ = ["SQLEngine", "StatementResult"]


- class SqlExecutionError(RuntimeError):
-     """Raised when a SQL statement execution fails."""
+ @dataclasses.dataclass
+ class CreateTablePlan:
+     sql: str
+     properties: dict[str, Any]
+     warnings: list[str]
+     result: Any = None  # StatementResult when executed
+
+
+ _INVALID_COL_CHARS = set(" ,;{}()\n\t=")
+
+
+ def _escape_sql_string(s: str) -> str:
+     return s.replace("'", "''")
+
+
+ def _quote_ident(ident: str) -> str:
+     # Always quote to be safe; also allows reserved keywords
+     escaped = ident.replace("`", "``")
+     return f"`{escaped}`"
+
+
+ def _needs_column_mapping(col_name: str) -> bool:
+     return any(ch in _INVALID_COL_CHARS for ch in col_name)


  @dataclasses.dataclass
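As a quick illustration of how the new module-level helpers behave, here is a standalone sketch (not part of the package diff; the definitions are copied from the hunk above):

_INVALID_COL_CHARS = set(" ,;{}()\n\t=")

def _escape_sql_string(s: str) -> str:
    # Double single quotes so the value is safe inside a SQL string literal
    return s.replace("'", "''")

def _quote_ident(ident: str) -> str:
    # Backtick-quote identifiers, escaping embedded backticks
    escaped = ident.replace("`", "``")
    return f"`{escaped}`"

def _needs_column_mapping(col_name: str) -> bool:
    # True when the name contains characters that Delta only accepts with column mapping
    return any(ch in _INVALID_COL_CHARS for ch in col_name)

print(_quote_ident("order"))                # `order` (reserved keywords are safe once quoted)
print(_escape_sql_string("it's"))           # it''s
print(_needs_column_mapping("event date"))  # True, the space triggers column mapping
print(_needs_column_mapping("event_date"))  # False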
@@ -268,7 +289,10 @@ class SQLEngine(WorkspaceService):
          if row_limit:
              df = df.limit(row_limit)

-         logger.info("Spark SQL executed: %s", self._sql_preview(statement))
+         logger.debug(
+             "SPARK SQL executed query:\n%s",
+             statement
+         )

          # Avoid Disposition dependency if SDK imports are absent
          spark_disp = disposition if disposition is not None else getattr(globals().get("Disposition", object), "EXTERNAL_LINKS", None)
@@ -316,8 +340,12 @@ class SQLEngine(WorkspaceService):
          )

          logger.info(
-             "API SQL executed: %s",
-             self._sql_preview(statement)
+             "API SQL executed statement '%s'",
+             execution.statement_id
+         )
+         logger.debug(
+             "API SQL executed query:\n%s",
+             statement
          )

          return execution.wait() if wait_result else execution
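With these two hunks, both the Spark and the API execution paths now log only the statement id at INFO and move the full SQL text to DEBUG. A minimal sketch (not part of the diff) of how a caller could surface the full query again, assuming the engine module uses the conventional `logging.getLogger(__name__)` logger name:

import logging

# Keep INFO globally, but show the full SQL text (DEBUG) for the engine module only.
# The logger name below is an assumption based on the module path in this diff.
logging.basicConfig(level=logging.INFO)
logging.getLogger("yggdrasil.databricks.sql.engine").setLevel(logging.DEBUG)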
@@ -816,111 +844,287 @@ FROM parquet.`{temp_volume_path}`"""

      def create_table(
          self,
-         field: pa.Field,
-         location: Optional[str] = None,
-         table_name: Optional[str] = None,
+         field: Union[pa.Field, pa.Schema],
+         table_fqn: Optional[str] = None,  # e.g. catalog.schema.table
          catalog_name: Optional[str] = None,
          schema_name: Optional[str] = None,
+         table_name: Optional[str] = None,
+         storage_location: Optional[str] = None,  # external location path
          partition_by: Optional[list[str]] = None,
          cluster_by: Optional[bool | list[str]] = True,
          comment: Optional[str] = None,
-         options: Optional[dict] = None,
+         tblproperties: Optional[dict[str, Any]] = None,
          if_not_exists: bool = True,
+         or_replace: bool = False,
+         using: str = "DELTA",
          optimize_write: bool = True,
          auto_compact: bool = True,
+         # perf-ish optional knobs (don’t hard-force)
+         enable_cdf: Optional[bool] = None,
+         enable_deletion_vectors: Optional[bool] = None,
+         target_file_size: Optional[int] = None,  # bytes
+         # column mapping: None=auto, "none"/"name"/"id" explicit
+         column_mapping_mode: Optional[str] = None,
          execute: bool = True,
-         wait_result: bool = True
-     ) -> Union[str, "StatementResult"]:
-         """Generate (and optionally execute) CREATE TABLE DDL from an Arrow schema/field.
-
-         Args:
-             field: Arrow Field or Schema describing the table. If `field` is a schema, it's converted.
-             location: Fully qualified table name override.
-             table_name: Table name override (used if location not provided).
-             catalog_name: Catalog override.
-             schema_name: Schema override.
-             partition_by: Optional partition columns.
-             cluster_by: If True -> CLUSTER BY AUTO. If list[str] -> CLUSTER BY (..). If False -> no clustering.
-             comment: Optional table comment (falls back to field metadata b"comment" when present).
-             options: Extra table properties.
-             if_not_exists: Add IF NOT EXISTS clause.
-             optimize_write: Sets delta.autoOptimize.optimizeWrite table property.
-             auto_compact: Sets delta.autoOptimize.autoCompact table property.
-             execute: If True, executes DDL and returns StatementResult; otherwise returns SQL string.
-             wait_result: Waits execution to complete
-
-         Returns:
-             StatementResult if execute=True, else the DDL SQL string.
+         wait_result: bool = True,
+         return_plan: bool = False,
+     ) -> Union[str, CreateTablePlan, "StatementResult"]:
+         """
+         Generate (and optionally execute) a Databricks/Delta `CREATE TABLE` statement from an Apache Arrow
+         schema/field, with integration-friendly safety checks and performance-oriented defaults.
+
+         This helper is meant to be "team-safe":
+         - Quotes identifiers (catalog/schema/table/columns) to avoid SQL keyword/name edge cases.
+         - Validates `partition_by` / `cluster_by` columns exist in the Arrow schema before generating SQL.
+         - Supports managed or external tables via `storage_location`.
+         - Optionally enables Delta Column Mapping (name/id) and applies the required protocol upgrades.
+
+         Parameters
+         ----------
+         field:
+             Arrow schema or field describing the table.
+             - If `pa.Schema`, all schema fields are used as columns.
+             - If `pa.Field` with struct type, its children become columns.
+             - If `pa.Field` non-struct, it becomes a single-column table.
+         table_fqn:
+             Fully-qualified table name, e.g. `"catalog.schema.table"`.
+             If provided, it takes precedence over `catalog_name`/`schema_name`/`table_name`.
+             Parts are quoted as needed.
+         catalog_name, schema_name, table_name:
+             Used to build the table identifier when `table_fqn` is not provided.
+             All three must be provided together.
+         storage_location:
+             If set, emits `LOCATION '<path>'` to create an external Delta table at the given path.
+             (Path string is SQL-escaped.)
+         partition_by:
+             List of partition column names. Must exist in the schema.
+             Note: Partitioning is a physical layout choice; only use for low-cardinality columns.
+         cluster_by:
+             Controls clustering / liquid clustering:
+             - True -> emits `CLUSTER BY AUTO`
+             - False -> emits no clustering clause
+             - list[str] -> emits `CLUSTER BY (<cols...>)` (all cols must exist in schema)
+         comment:
+             Optional table comment. If not provided and Arrow metadata contains `b"comment"`, that is used.
+         tblproperties:
+             Additional/override Delta table properties (final say).
+             Example: `{"delta.enableChangeDataFeed": "true"}` or `{"delta.logRetentionDuration": "30 days"}`
+         if_not_exists:
+             If True, generates `CREATE TABLE IF NOT EXISTS ...`.
+             Mutually exclusive with `or_replace`.
+         or_replace:
+             If True, generates `CREATE OR REPLACE TABLE ...`.
+             Mutually exclusive with `if_not_exists`.
+         using:
+             Storage format keyword. Defaults to `"DELTA"`.
+         optimize_write:
+             Sets `delta.autoOptimize.optimizeWrite` table property.
+         auto_compact:
+             Sets `delta.autoOptimize.autoCompact` table property.
+         enable_cdf:
+             If set, adds `delta.enableChangeDataFeed` property.
+             Useful for CDC pipelines; avoid enabling by default if you don't need it.
+         enable_deletion_vectors:
+             If set, adds `delta.enableDeletionVectors` property.
+             Can improve performance for updates/deletes in some workloads (subject to platform support).
+         target_file_size:
+             If set, adds `delta.targetFileSize` (bytes). Helps guide file sizing and reduce small files.
+         column_mapping_mode:
+             Delta column mapping mode:
+             - None -> auto-detect: enables `"name"` only if invalid column names are present, else `"none"`
+             - "none" -> do not enable column mapping (max compatibility)
+             - "name" -> enable name-based column mapping
+             - "id" -> enable id-based column mapping
+
+             When enabled (name/id), this method also sets the required protocol properties:
+             `delta.minReaderVersion=2` and `delta.minWriterVersion=5`.
+         execute:
+             If True, executes the generated SQL via `self.execute(...)`.
+             If False, returns the SQL (or plan) without executing.
+         wait_result:
+             Passed to `self.execute(...)`. If True, blocks until the statement finishes.
+         return_plan:
+             If True, returns a `CreateTablePlan` containing SQL + applied properties + warnings (+ result if executed).
+             If False:
+             - returns SQL string when `execute=False`
+             - returns `StatementResult` when `execute=True`
+
+         Returns
+         -------
+         Union[str, CreateTablePlan, StatementResult]
+             - If `execute=False` and `return_plan=False`: the SQL string.
+             - If `execute=False` and `return_plan=True`: `CreateTablePlan(sql=..., properties=..., warnings=...)`.
+             - If `execute=True` and `return_plan=False`: `StatementResult`.
+             - If `execute=True` and `return_plan=True`: `CreateTablePlan` with `result` populated.
+
+         Raises
+         ------
+         ValueError
+             If required naming params are missing, if `or_replace` and `if_not_exists` conflict,
+             if `column_mapping_mode` is invalid, or if partition/cluster columns are not present.
+
+         Notes
+         -----
+         - Column mapping is primarily a metadata feature; performance impact is usually negligible vs IO,
+           but enabling it affects compatibility with older readers.
+         - Partitioning and clustering are workload-dependent: partition for selective pruning on low-cardinality
+           columns; cluster for speeding up common filter/join patterns.
+
+         Examples
+         --------
+         Create a managed Delta table with auto clustering and auto column mapping:
+         >>> plan = client.create_table(schema, table_fqn="main.analytics.events", execute=False, return_plan=True)
+         >>> print(plan.sql)
+
+         External table with explicit partitioning and CDF:
+         >>> client.create_table(
+         ...     schema,
+         ...     table_fqn="main.analytics.events",
+         ...     storage_location="abfss://.../events",
+         ...     partition_by=["event_date"],
+         ...     enable_cdf=True,
+         ... )
          """
-         if not isinstance(field, pa.Field):
-             field = convert(field, pa.Field)
-
-         location, catalog_name, schema_name, table_name = self._check_location_params(
-             location=location,
-             catalog_name=catalog_name,
-             schema_name=schema_name,
-             table_name=table_name,
-             safe_chars=True,
-         )

-         if pa.types.is_struct(field.type):
-             children = list(field.type)
+         # ---- Normalize Arrow input ----
+         if isinstance(field, pa.Schema):
+             arrow_fields = list(field)
+             schema_metadata = field.metadata or {}
          else:
-             children = [field]
-
-         column_definitions = [self._field_to_ddl(child) for child in children]
-
-         sql = [
-             f"CREATE TABLE {'IF NOT EXISTS ' if if_not_exists else ''}{location} (",
-             ",\n ".join(column_definitions),
+             # pa.Field
+             schema_metadata = field.metadata or {}
+             if pa.types.is_struct(field.type):
+                 arrow_fields = list(field.type)
+             else:
+                 arrow_fields = [field]
+
+         # ---- Resolve table FQN ----
+         # Prefer explicit table_fqn. Else build from catalog/schema/table_name.
+         if table_fqn is None:
+             if not (catalog_name and schema_name and table_name):
+                 raise ValueError("Provide table_fqn or (catalog_name, schema_name, table_name).")
+             table_fqn = ".".join(map(_quote_ident, [catalog_name, schema_name, table_name]))
+         else:
+             # If caller passes raw "cat.schema.table", quote each part safely
+             parts = table_fqn.split(".")
+             table_fqn = ".".join(_quote_ident(p) for p in parts)
+
+         # ---- Comments ----
+         if comment is None and schema_metadata:
+             c = schema_metadata.get(b"comment")
+             if isinstance(c, bytes):
+                 comment = c.decode("utf-8")
+
+         # ---- Detect invalid column names -> column mapping auto ----
+         any_invalid = any(_needs_column_mapping(f.name) for f in arrow_fields)
+         warnings: list[str] = []
+         if column_mapping_mode is None:
+             column_mapping_mode = "name" if any_invalid else "none"
+
+         if column_mapping_mode not in ("none", "name", "id"):
+             raise ValueError("column_mapping_mode must be one of: None, 'none', 'name', 'id'.")
+
+         # ---- Validate partition/cluster columns exist ----
+         col_names = {f.name for f in arrow_fields}
+         for cols, label in ((partition_by, "partition_by"),):
+             if cols:
+                 missing = [c for c in cols if c not in col_names]
+                 if missing:
+                     raise ValueError(f"{label} contains unknown columns: {missing}")
+
+         if isinstance(cluster_by, list):
+             missing = [c for c in cluster_by if c not in col_names]
+             if missing:
+                 raise ValueError(f"cluster_by contains unknown columns: {missing}")
+
+         # ---- Column DDL ----
+         # IMPORTANT: your _field_to_ddl should quote names with backticks if needed.
+         # I’d recommend it ALWAYS quotes via _quote_ident internally.
+         column_definitions = [self._field_to_ddl(child) for child in arrow_fields]
+
+         # ---- Build CREATE TABLE ----
+         if or_replace and if_not_exists:
+             raise ValueError("Use either or_replace or if_not_exists, not both.")
+
+         create_kw = "CREATE OR REPLACE TABLE" if or_replace else "CREATE TABLE"
+         if if_not_exists and not or_replace:
+             create_kw = "CREATE TABLE IF NOT EXISTS"
+
+         sql_parts: list[str] = [
+             f"{create_kw} {table_fqn} (",
+             " " + ",\n ".join(column_definitions),
              ")",
+             f"USING {using}",
          ]

          if partition_by:
-             sql.append(f"\nPARTITIONED BY ({', '.join(partition_by)})")
+             sql_parts.append("PARTITIONED BY (" + ", ".join(_quote_ident(c) for c in partition_by) + ")")
          elif cluster_by:
              if isinstance(cluster_by, bool):
-                 sql.append("\nCLUSTER BY AUTO")
+                 sql_parts.append("CLUSTER BY AUTO")
              else:
-                 sql.append(f"\nCLUSTER BY ({', '.join(cluster_by)})")
+                 sql_parts.append("CLUSTER BY (" + ", ".join(_quote_ident(c) for c in cluster_by) + ")")

-         if not comment and field.metadata:
-             comment = field.metadata.get(b"comment")
+         if comment:
+             sql_parts.append(f"COMMENT '{_escape_sql_string(comment)}'")

-         if isinstance(comment, bytes):
-             comment = comment.decode("utf-8")
+         if storage_location:
+             sql_parts.append(f"LOCATION '{_escape_sql_string(storage_location)}'")

-         if comment:
-             sql.append(f"\nCOMMENT '{comment}'")
-
-         options = {} if options is None else options
-         options.update({
-             "delta.autoOptimize.optimizeWrite": optimize_write,
-             "delta.autoOptimize.autoCompact": auto_compact,
-         })
-
-         option_strs = []
-         for key, value in (options or {}).items():
-             if isinstance(value, str):
-                 option_strs.append(f"'{key}' = '{value}'")
-             elif isinstance(value, bool):
-                 option_strs.append(f"'{key}' = '{'true' if value else 'false'}'")
-             else:
-                 option_strs.append(f"'{key}' = {value}")
+         # ---- Table properties (defaults + overrides) ----
+         props: dict[str, Any] = {
+             "delta.autoOptimize.optimizeWrite": bool(optimize_write),
+             "delta.autoOptimize.autoCompact": bool(auto_compact)
+         }

-         if option_strs:
-             sql.append(f"\nTBLPROPERTIES ({', '.join(option_strs)})")
+         if enable_cdf is not None:
+             props["delta.enableChangeDataFeed"] = bool(enable_cdf)

-         statement = "\n".join(sql)
+         if enable_deletion_vectors is not None:
+             props["delta.enableDeletionVectors"] = bool(enable_deletion_vectors)

-         logger.debug(
-             "Generated CREATE TABLE DDL for %s:\n%s",
-             location, statement
-         )
+         if target_file_size is not None:
+             props["delta.targetFileSize"] = int(target_file_size)
+
+         # Column mapping + required protocol bumps
+         if column_mapping_mode != "none":
+             props["delta.columnMapping.mode"] = column_mapping_mode
+             props["delta.minReaderVersion"] = 2
+             props["delta.minWriterVersion"] = 5
+         else:
+             # only set explicitly if user wants; otherwise leave unset for max compatibility
+             pass
+
+         # Let caller override anything (final say)
+         if tblproperties:
+             props.update(tblproperties)
+
+         if any_invalid and column_mapping_mode == "none":
+             warnings.append(
+                 "Schema has invalid column names but column_mapping_mode='none'. "
+                 "This will fail unless you rename/escape columns."
+             )
+
+         if props:
+             def fmt(k: str, v: Any) -> str:
+                 if isinstance(v, str):
+                     return f"'{k}' = '{_escape_sql_string(v)}'"
+                 if isinstance(v, bool):
+                     return f"'{k}' = '{'true' if v else 'false'}'"
+                 return f"'{k}' = {v}"
+
+             sql_parts.append("TBLPROPERTIES (" + ", ".join(fmt(k, v) for k, v in props.items()) + ")")
+
+         statement = "\n".join(sql_parts)
+
+         plan = CreateTablePlan(sql=statement, properties=props, warnings=warnings)
+
+         if not execute:
+             return plan if return_plan else statement

-         if execute:
-             return self.execute(statement, wait_result=wait_result)
-         return statement
+         res = self.execute(statement, wait_result=wait_result)
+         plan.result = res
+         return plan if return_plan else res

      def _check_location_params(
          self,
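Taken together, the rewritten method now returns either the raw SQL, a `StatementResult`, or a `CreateTablePlan`. A usage sketch (not part of the diff), assuming an already-constructed `SQLEngine` instance named `engine`:

import pyarrow as pa

schema = pa.schema([
    pa.field("event_id", pa.string()),
    pa.field("event date", pa.timestamp("us")),  # space in the name triggers column-mapping auto-detection
])

# Generate the DDL without executing it and inspect the plan
plan = engine.create_table(
    schema,
    table_fqn="main.analytics.events",
    cluster_by=True,      # emits CLUSTER BY AUTO
    execute=False,
    return_plan=True,
)
print(plan.sql)           # CREATE TABLE IF NOT EXISTS `main`.`analytics`.`events` (...) USING DELTA ...
print(plan.properties)    # includes delta.columnMapping.mode='name' because of "event date"
print(plan.warnings)      # empty here, since column mapping was auto-enabled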
@@ -17,13 +17,15 @@ class SqlStatementError(RuntimeError):

      def __str__(self) -> str:
          meta = []
+
          if self.error_code:
              meta.append(f"code={self.error_code}")
          if self.sql_state:
              meta.append(f"state={self.sql_state}")

          meta_str = f" ({', '.join(meta)})" if meta else ""
-         return f"SQL statement {self.statement_id} failed [{self.state}]: {self.message}{meta_str}"
+
+         return f"SQL statement {self.statement_id!r} failed [{self.state}]: {self.message}{meta_str}"

      @classmethod
      def from_statement(cls, stmt: Any) -> "SqlStatementError":
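The only behavioral change in this last hunk is that the statement id is now rendered with `!r`. A small sketch of the resulting difference (attribute names taken from `__str__` above; construction of the exception itself is not shown in this diff):

statement_id = "01ef-1234"
state = "FAILED"
message = "Table or view not found"
meta_str = ""

# Before: id interpolated as-is; after: repr() adds quotes, which makes empty or odd ids visible
old = f"SQL statement {statement_id} failed [{state}]: {message}{meta_str}"
new = f"SQL statement {statement_id!r} failed [{state}]: {message}{meta_str}"
print(old)  # SQL statement 01ef-1234 failed [FAILED]: Table or view not found
print(new)  # SQL statement '01ef-1234' failed [FAILED]: Table or view not found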