ygg 0.1.51__tar.gz → 0.1.53__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ygg-0.1.51 → ygg-0.1.53}/PKG-INFO +1 -1
- {ygg-0.1.51 → ygg-0.1.53}/pyproject.toml +1 -1
- {ygg-0.1.51 → ygg-0.1.53}/src/ygg.egg-info/PKG-INFO +1 -1
- {ygg-0.1.51 → ygg-0.1.53}/src/ygg.egg-info/SOURCES.txt +2 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/databricks/sql/engine.py +288 -84
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/databricks/sql/exceptions.py +3 -1
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/databricks/workspaces/io.py +78 -69
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/databricks/workspaces/path.py +367 -166
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/databricks/workspaces/path_kind.py +3 -3
- ygg-0.1.53/src/yggdrasil/databricks/workspaces/volumes_path.py +85 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/libs/databrickslib.py +5 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/pyutils/callable_serde.py +10 -10
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/pyutils/retry.py +2 -2
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/types/cast/registry.py +0 -14
- ygg-0.1.53/src/yggdrasil/types/file_format.py +10 -0
- ygg-0.1.53/src/yggdrasil/version.py +1 -0
- ygg-0.1.51/src/yggdrasil/version.py +0 -1
- {ygg-0.1.51 → ygg-0.1.53}/LICENSE +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/README.md +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/setup.cfg +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/ygg.egg-info/dependency_links.txt +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/ygg.egg-info/entry_points.txt +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/ygg.egg-info/requires.txt +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/ygg.egg-info/top_level.txt +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/__init__.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/databricks/__init__.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/databricks/compute/__init__.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/databricks/compute/cluster.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/databricks/compute/execution_context.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/databricks/compute/remote.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/databricks/jobs/__init__.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/databricks/jobs/config.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/databricks/sql/__init__.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/databricks/sql/statement_result.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/databricks/sql/types.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/databricks/sql/warehouse.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/databricks/workspaces/__init__.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/databricks/workspaces/filesytem.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/databricks/workspaces/workspace.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/dataclasses/__init__.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/dataclasses/dataclass.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/libs/__init__.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/libs/extensions/__init__.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/libs/extensions/polars_extensions.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/libs/extensions/spark_extensions.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/libs/pandaslib.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/libs/polarslib.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/libs/sparklib.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/pyutils/__init__.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/pyutils/equality.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/pyutils/exceptions.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/pyutils/expiring_dict.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/pyutils/modules.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/pyutils/parallel.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/pyutils/python_env.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/requests/__init__.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/requests/msal.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/requests/session.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/types/__init__.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/types/cast/__init__.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/types/cast/arrow_cast.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/types/cast/cast_options.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/types/cast/pandas_cast.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/types/cast/polars_cast.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/types/cast/polars_pandas_cast.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/types/cast/spark_cast.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/types/cast/spark_pandas_cast.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/types/cast/spark_polars_cast.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/types/python_arrow.py +0 -0
- {ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/types/python_defaults.py +0 -0
{ygg-0.1.51 → ygg-0.1.53}/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "ygg"
-version = "0.1.51"
+version = "0.1.53"
 description = "Type-friendly utilities for moving data between Python objects, Arrow, Polars, Pandas, Spark, and Databricks"
 readme = { file = "README.md", content-type = "text/markdown" }
 license = { file = "LICENSE" }
{ygg-0.1.51 → ygg-0.1.53}/src/ygg.egg-info/SOURCES.txt
@@ -27,6 +27,7 @@ src/yggdrasil/databricks/workspaces/filesytem.py
 src/yggdrasil/databricks/workspaces/io.py
 src/yggdrasil/databricks/workspaces/path.py
 src/yggdrasil/databricks/workspaces/path_kind.py
+src/yggdrasil/databricks/workspaces/volumes_path.py
 src/yggdrasil/databricks/workspaces/workspace.py
 src/yggdrasil/dataclasses/__init__.py
 src/yggdrasil/dataclasses/dataclass.py
@@ -51,6 +52,7 @@ src/yggdrasil/requests/__init__.py
 src/yggdrasil/requests/msal.py
 src/yggdrasil/requests/session.py
 src/yggdrasil/types/__init__.py
+src/yggdrasil/types/file_format.py
 src/yggdrasil/types/python_arrow.py
 src/yggdrasil/types/python_defaults.py
 src/yggdrasil/types/cast/__init__.py
{ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/databricks/sql/engine.py
@@ -60,8 +60,29 @@ if pyspark is not None:
 __all__ = ["SQLEngine", "StatementResult"]
 
 
-
-
+@dataclasses.dataclass
+class CreateTablePlan:
+    sql: str
+    properties: dict[str, Any]
+    warnings: list[str]
+    result: Any = None # StatementResult when executed
+
+
+_INVALID_COL_CHARS = set(" ,;{}()\n\t=")
+
+
+def _escape_sql_string(s: str) -> str:
+    return s.replace("'", "''")
+
+
+def _quote_ident(ident: str) -> str:
+    # Always quote to be safe; also allows reserved keywords
+    escaped = ident.replace("`", "``")
+    return f"`{escaped}`"
+
+
+def _needs_column_mapping(col_name: str) -> bool:
+    return any(ch in _INVALID_COL_CHARS for ch in col_name)
 
 
 @dataclasses.dataclass
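The new module-level helpers above drive the identifier handling in the rewritten `create_table` path. As a minimal, self-contained sketch of what they produce (the definitions are copied from the hunk above; the printed calls are illustrative only and not part of the package):

```python
# Copies of the helpers added in this hunk, plus illustrative calls (not package code).
_INVALID_COL_CHARS = set(" ,;{}()\n\t=")


def _escape_sql_string(s: str) -> str:
    # Double single quotes so the value is safe inside a SQL string literal
    return s.replace("'", "''")


def _quote_ident(ident: str) -> str:
    # Backtick-quote and double embedded backticks, so reserved words and odd names are safe
    escaped = ident.replace("`", "``")
    return f"`{escaped}`"


def _needs_column_mapping(col_name: str) -> bool:
    # True when the name contains characters Delta cannot store without column mapping
    return any(ch in _INVALID_COL_CHARS for ch in col_name)


print(_quote_ident("order"))                # `order`  (reserved keyword, safe once quoted)
print(_escape_sql_string("it's"))           # it''s    (quotes doubled for SQL literals)
print(_needs_column_mapping("event date"))  # True     (space triggers column mapping "name" mode)
```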
{ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/databricks/sql/engine.py
@@ -268,7 +289,10 @@ class SQLEngine(WorkspaceService):
         if row_limit:
             df = df.limit(row_limit)
 
-        logger.
+        logger.debug(
+            "SPARK SQL executed query:\n%s",
+            statement
+        )
 
         # Avoid Disposition dependency if SDK imports are absent
         spark_disp = disposition if disposition is not None else getattr(globals().get("Disposition", object), "EXTERNAL_LINKS", None)
{ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/databricks/sql/engine.py
@@ -316,8 +340,12 @@ class SQLEngine(WorkspaceService):
         )
 
         logger.info(
-            "API SQL executed
-
+            "API SQL executed statement '%s'",
+            execution.statement_id
+        )
+        logger.debug(
+            "API SQL executed query:\n%s",
+            statement
         )
 
         return execution.wait() if wait_result else execution
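The net effect of the two logging hunks is a level split: the statement identifier is logged at INFO while the full SQL text only appears at DEBUG, both using lazy `%s` formatting. A standalone sketch of that behavior (the logger name and statement id below are made up for illustration):

```python
# Illustration of the new logging split; names and ids here are hypothetical.
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("yggdrasil.databricks.sql")

statement = "SELECT 1"
statement_id = "01ef-example"  # would come from the Databricks statement execution

logger.info("API SQL executed statement '%s'", statement_id)  # emitted at INFO
logger.debug("API SQL executed query:\n%s", statement)        # only emitted when level <= DEBUG
```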
{ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/databricks/sql/engine.py
@@ -816,111 +844,287 @@ FROM parquet.`{temp_volume_path}`"""
 
     def create_table(
         self,
-        field: pa.Field,
-
-        table_name: Optional[str] = None,
+        field: Union[pa.Field, pa.Schema],
+        table_fqn: Optional[str] = None, # e.g. catalog.schema.table
         catalog_name: Optional[str] = None,
         schema_name: Optional[str] = None,
+        table_name: Optional[str] = None,
+        storage_location: Optional[str] = None, # external location path
         partition_by: Optional[list[str]] = None,
         cluster_by: Optional[bool | list[str]] = True,
         comment: Optional[str] = None,
-
+        tblproperties: Optional[dict[str, Any]] = None,
         if_not_exists: bool = True,
+        or_replace: bool = False,
+        using: str = "DELTA",
         optimize_write: bool = True,
         auto_compact: bool = True,
+        # perf-ish optional knobs (don’t hard-force)
+        enable_cdf: Optional[bool] = None,
+        enable_deletion_vectors: Optional[bool] = None,
+        target_file_size: Optional[int] = None, # bytes
+        # column mapping: None=auto, "none"/"name"/"id" explicit
+        column_mapping_mode: Optional[str] = None,
         execute: bool = True,
-        wait_result: bool = True
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        wait_result: bool = True,
+        return_plan: bool = False,
+    ) -> Union[str, CreateTablePlan, "StatementResult"]:
+        """
+        Generate (and optionally execute) a Databricks/Delta `CREATE TABLE` statement from an Apache Arrow
+        schema/field, with integration-friendly safety checks and performance-oriented defaults.
+
+        This helper is meant to be "team-safe":
+        - Quotes identifiers (catalog/schema/table/columns) to avoid SQL keyword/name edge cases.
+        - Validates `partition_by` / `cluster_by` columns exist in the Arrow schema before generating SQL.
+        - Supports managed or external tables via `storage_location`.
+        - Optionally enables Delta Column Mapping (name/id) and applies the required protocol upgrades.
+
+        Parameters
+        ----------
+        field:
+            Arrow schema or field describing the table.
+            - If `pa.Schema`, all schema fields are used as columns.
+            - If `pa.Field` with struct type, its children become columns.
+            - If `pa.Field` non-struct, it becomes a single-column table.
+        table_fqn:
+            Fully-qualified table name, e.g. `"catalog.schema.table"`.
+            If provided, it takes precedence over `catalog_name`/`schema_name`/`table_name`.
+            Parts are quoted as needed.
+        catalog_name, schema_name, table_name:
+            Used to build the table identifier when `table_fqn` is not provided.
+            All three must be provided together.
+        storage_location:
+            If set, emits `LOCATION '<path>'` to create an external Delta table at the given path.
+            (Path string is SQL-escaped.)
+        partition_by:
+            List of partition column names. Must exist in the schema.
+            Note: Partitioning is a physical layout choice; only use for low-cardinality columns.
+        cluster_by:
+            Controls clustering / liquid clustering:
+            - True -> emits `CLUSTER BY AUTO`
+            - False -> emits no clustering clause
+            - list[str] -> emits `CLUSTER BY (<cols...>)` (all cols must exist in schema)
+        comment:
+            Optional table comment. If not provided and Arrow metadata contains `b"comment"`, that is used.
+        tblproperties:
+            Additional/override Delta table properties (final say).
+            Example: `{"delta.enableChangeDataFeed": "true"}` or `{"delta.logRetentionDuration": "30 days"}`
+        if_not_exists:
+            If True, generates `CREATE TABLE IF NOT EXISTS ...`.
+            Mutually exclusive with `or_replace`.
+        or_replace:
+            If True, generates `CREATE OR REPLACE TABLE ...`.
+            Mutually exclusive with `if_not_exists`.
+        using:
+            Storage format keyword. Defaults to `"DELTA"`.
+        optimize_write:
+            Sets `delta.autoOptimize.optimizeWrite` table property.
+        auto_compact:
+            Sets `delta.autoOptimize.autoCompact` table property.
+        enable_cdf:
+            If set, adds `delta.enableChangeDataFeed` property.
+            Useful for CDC pipelines; avoid enabling by default if you don't need it.
+        enable_deletion_vectors:
+            If set, adds `delta.enableDeletionVectors` property.
+            Can improve performance for updates/deletes in some workloads (subject to platform support).
+        target_file_size:
+            If set, adds `delta.targetFileSize` (bytes). Helps guide file sizing and reduce small files.
+        column_mapping_mode:
+            Delta column mapping mode:
+            - None -> auto-detect: enables `"name"` only if invalid column names are present, else `"none"`
+            - "none" -> do not enable column mapping (max compatibility)
+            - "name" -> enable name-based column mapping
+            - "id" -> enable id-based column mapping
+
+            When enabled (name/id), this method also sets the required protocol properties:
+            `delta.minReaderVersion=2` and `delta.minWriterVersion=5`.
+        execute:
+            If True, executes the generated SQL via `self.execute(...)`.
+            If False, returns the SQL (or plan) without executing.
+        wait_result:
+            Passed to `self.execute(...)`. If True, blocks until the statement finishes.
+        return_plan:
+            If True, returns a `CreateTablePlan` containing SQL + applied properties + warnings (+ result if executed).
+            If False:
+            - returns SQL string when `execute=False`
+            - returns `StatementResult` when `execute=True`
+
+        Returns
+        -------
+        Union[str, CreateTablePlan, StatementResult]
+            - If `execute=False` and `return_plan=False`: the SQL string.
+            - If `execute=False` and `return_plan=True`: `CreateTablePlan(sql=..., properties=..., warnings=...)`.
+            - If `execute=True` and `return_plan=False`: `StatementResult`.
+            - If `execute=True` and `return_plan=True`: `CreateTablePlan` with `result` populated.
+
+        Raises
+        ------
+        ValueError
+            If required naming params are missing, if `or_replace` and `if_not_exists` conflict,
+            if `column_mapping_mode` is invalid, or if partition/cluster columns are not present.
+
+        Notes
+        -----
+        - Column mapping is primarily a metadata feature; performance impact is usually negligible vs IO,
+          but enabling it affects compatibility with older readers.
+        - Partitioning and clustering are workload-dependent: partition for selective pruning on low-cardinality
+          columns; cluster for speeding up common filter/join patterns.
+
+        Examples
+        --------
+        Create a managed Delta table with auto clustering and auto column mapping:
+        >>> plan = client.create_table(schema, table_fqn="main.analytics.events", execute=False, return_plan=True)
+        >>> print(plan.sql)
+
+        External table with explicit partitioning and CDF:
+        >>> client.create_table(
+        ...     schema,
+        ...     table_fqn="main.analytics.events",
+        ...     storage_location="abfss://.../events",
+        ...     partition_by=["event_date"],
+        ...     enable_cdf=True,
+        ... )
         """
-        if not isinstance(field, pa.Field):
-            field = convert(field, pa.Field)
-
-        location, catalog_name, schema_name, table_name = self._check_location_params(
-            location=location,
-            catalog_name=catalog_name,
-            schema_name=schema_name,
-            table_name=table_name,
-            safe_chars=True,
-        )
 
-
-
+        # ---- Normalize Arrow input ----
+        if isinstance(field, pa.Schema):
+            arrow_fields = list(field)
+            schema_metadata = field.metadata or {}
         else:
-
-
-
-
-
-
-
+            # pa.Field
+            schema_metadata = field.metadata or {}
+            if pa.types.is_struct(field.type):
+                arrow_fields = list(field.type)
+            else:
+                arrow_fields = [field]
+
+        # ---- Resolve table FQN ----
+        # Prefer explicit table_fqn. Else build from catalog/schema/table_name.
+        if table_fqn is None:
+            if not (catalog_name and schema_name and table_name):
+                raise ValueError("Provide table_fqn or (catalog_name, schema_name, table_name).")
+            table_fqn = ".".join(map(_quote_ident, [catalog_name, schema_name, table_name]))
+        else:
+            # If caller passes raw "cat.schema.table", quote each part safely
+            parts = table_fqn.split(".")
+            table_fqn = ".".join(_quote_ident(p) for p in parts)
+
+        # ---- Comments ----
+        if comment is None and schema_metadata:
+            c = schema_metadata.get(b"comment")
+            if isinstance(c, bytes):
+                comment = c.decode("utf-8")
+
+        # ---- Detect invalid column names -> column mapping auto ----
+        any_invalid = any(_needs_column_mapping(f.name) for f in arrow_fields)
+        warnings: list[str] = []
+        if column_mapping_mode is None:
+            column_mapping_mode = "name" if any_invalid else "none"
+
+        if column_mapping_mode not in ("none", "name", "id"):
+            raise ValueError("column_mapping_mode must be one of: None, 'none', 'name', 'id'.")
+
+        # ---- Validate partition/cluster columns exist ----
+        col_names = {f.name for f in arrow_fields}
+        for cols, label in ((partition_by, "partition_by"),):
+            if cols:
+                missing = [c for c in cols if c not in col_names]
+                if missing:
+                    raise ValueError(f"{label} contains unknown columns: {missing}")
+
+        if isinstance(cluster_by, list):
+            missing = [c for c in cluster_by if c not in col_names]
+            if missing:
+                raise ValueError(f"cluster_by contains unknown columns: {missing}")
+
+        # ---- Column DDL ----
+        # IMPORTANT: your _field_to_ddl should quote names with backticks if needed.
+        # I’d recommend it ALWAYS quotes via _quote_ident internally.
+        column_definitions = [self._field_to_ddl(child) for child in arrow_fields]
+
+        # ---- Build CREATE TABLE ----
+        if or_replace and if_not_exists:
+            raise ValueError("Use either or_replace or if_not_exists, not both.")
+
+        create_kw = "CREATE OR REPLACE TABLE" if or_replace else "CREATE TABLE"
+        if if_not_exists and not or_replace:
+            create_kw = "CREATE TABLE IF NOT EXISTS"
+
+        sql_parts: list[str] = [
+            f"{create_kw} {table_fqn} (",
+            " " + ",\n ".join(column_definitions),
             ")",
+            f"USING {using}",
         ]
 
         if partition_by:
-
+            sql_parts.append("PARTITIONED BY (" + ", ".join(_quote_ident(c) for c in partition_by) + ")")
         elif cluster_by:
             if isinstance(cluster_by, bool):
-
+                sql_parts.append("CLUSTER BY AUTO")
             else:
-
+                sql_parts.append("CLUSTER BY (" + ", ".join(_quote_ident(c) for c in cluster_by) + ")")
 
-        if
-
+        if comment:
+            sql_parts.append(f"COMMENT '{_escape_sql_string(comment)}'")
 
-        if
-
+        if storage_location:
+            sql_parts.append(f"LOCATION '{_escape_sql_string(storage_location)}'")
 
-
-
-
-
-
-            "delta.autoOptimize.optimizeWrite": optimize_write,
-            "delta.autoOptimize.autoCompact": auto_compact,
-        })
-
-        option_strs = []
-        for key, value in (options or {}).items():
-            if isinstance(value, str):
-                option_strs.append(f"'{key}' = '{value}'")
-            elif isinstance(value, bool):
-                option_strs.append(f"'{key}' = '{'true' if value else 'false'}'")
-            else:
-                option_strs.append(f"'{key}' = {value}")
+        # ---- Table properties (defaults + overrides) ----
+        props: dict[str, Any] = {
+            "delta.autoOptimize.optimizeWrite": bool(optimize_write),
+            "delta.autoOptimize.autoCompact": bool(auto_compact)
+        }
 
-        if
-
+        if enable_cdf is not None:
+            props["delta.enableChangeDataFeed"] = bool(enable_cdf)
 
-
+        if enable_deletion_vectors is not None:
+            props["delta.enableDeletionVectors"] = bool(enable_deletion_vectors)
 
-
-        "
-
-
+        if target_file_size is not None:
+            props["delta.targetFileSize"] = int(target_file_size)
+
+        # Column mapping + required protocol bumps
+        if column_mapping_mode != "none":
+            props["delta.columnMapping.mode"] = column_mapping_mode
+            props["delta.minReaderVersion"] = 2
+            props["delta.minWriterVersion"] = 5
+        else:
+            # only set explicitly if user wants; otherwise leave unset for max compatibility
+            pass
+
+        # Let caller override anything (final say)
+        if tblproperties:
+            props.update(tblproperties)
+
+        if any_invalid and column_mapping_mode == "none":
+            warnings.append(
+                "Schema has invalid column names but column_mapping_mode='none'. "
+                "This will fail unless you rename/escape columns."
+            )
+
+        if props:
+            def fmt(k: str, v: Any) -> str:
+                if isinstance(v, str):
+                    return f"'{k}' = '{_escape_sql_string(v)}'"
+                if isinstance(v, bool):
+                    return f"'{k}' = '{'true' if v else 'false'}'"
+                return f"'{k}' = {v}"
+
+            sql_parts.append("TBLPROPERTIES (" + ", ".join(fmt(k, v) for k, v in props.items()) + ")")
+
+        statement = "\n".join(sql_parts)
+
+        plan = CreateTablePlan(sql=statement, properties=props, warnings=warnings)
+
+        if not execute:
+            return plan if return_plan else statement
 
-
-
-        return
+        res = self.execute(statement, wait_result=wait_result)
+        plan.result = res
+        return plan if return_plan else res
 
     def _check_location_params(
         self,
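For a concrete sense of what the rewritten `create_table` assembles, here is a hedged, standalone sketch: a small Arrow schema and the rough shape of the DDL the added code would produce with default options (managed table, `CREATE TABLE IF NOT EXISTS`, `CLUSTER BY AUTO`, auto-optimize properties, no column mapping). The `engine` object and the column rendering done by `_field_to_ddl` are assumptions not shown in this diff; clause order and the TBLPROPERTIES defaults follow the added code above.

```python
# Illustrative only: the engine object and _field_to_ddl's Arrow-to-SQL type mapping are assumed.
import pyarrow as pa

schema = pa.schema([
    pa.field("event_id", pa.string()),
    pa.field("event_ts", pa.timestamp("us")),
])

# plan = engine.create_table(schema, table_fqn="main.analytics.events",
#                            execute=False, return_plan=True)
# print(plan.sql)  # would print roughly:

expected_sql = """\
CREATE TABLE IF NOT EXISTS `main`.`analytics`.`events` (
 event_id STRING,
 event_ts TIMESTAMP
)
USING DELTA
CLUSTER BY AUTO
TBLPROPERTIES ('delta.autoOptimize.optimizeWrite' = 'true', 'delta.autoOptimize.autoCompact' = 'true')"""
print(expected_sql)
```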
{ygg-0.1.51 → ygg-0.1.53}/src/yggdrasil/databricks/sql/exceptions.py
@@ -17,13 +17,15 @@ class SqlStatementError(RuntimeError):
 
     def __str__(self) -> str:
         meta = []
+
         if self.error_code:
            meta.append(f"code={self.error_code}")
         if self.sql_state:
            meta.append(f"state={self.sql_state}")
 
         meta_str = f" ({', '.join(meta)})" if meta else ""
-
+
+        return f"SQL statement {self.statement_id!r} failed [{self.state}]: {self.message}{meta_str}"
 
     @classmethod
     def from_statement(cls, stmt: Any) -> "SqlStatementError":