deltacat 2.0.0b11__py3-none-any.whl → 2.0.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194)
  1. deltacat/__init__.py +78 -3
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/conftest.py +0 -18
  6. deltacat/catalog/__init__.py +2 -0
  7. deltacat/catalog/delegate.py +445 -63
  8. deltacat/catalog/interface.py +188 -62
  9. deltacat/catalog/main/impl.py +2417 -271
  10. deltacat/catalog/model/catalog.py +49 -10
  11. deltacat/catalog/model/properties.py +38 -0
  12. deltacat/compute/compactor/compaction_session.py +97 -75
  13. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  14. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  15. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  16. deltacat/compute/compactor/repartition_session.py +8 -21
  17. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  18. deltacat/compute/compactor/steps/materialize.py +9 -7
  19. deltacat/compute/compactor/steps/repartition.py +12 -11
  20. deltacat/compute/compactor/utils/io.py +6 -5
  21. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  22. deltacat/compute/compactor/utils/system_columns.py +3 -1
  23. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  24. deltacat/compute/compactor_v2/constants.py +30 -1
  25. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  26. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  27. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  28. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  29. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  30. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  31. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  32. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  33. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  34. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  35. deltacat/compute/compactor_v2/utils/io.py +11 -4
  36. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  37. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  38. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  39. deltacat/compute/converter/converter_session.py +145 -32
  40. deltacat/compute/converter/model/convert_input.py +26 -19
  41. deltacat/compute/converter/model/convert_input_files.py +33 -16
  42. deltacat/compute/converter/model/convert_result.py +35 -16
  43. deltacat/compute/converter/model/converter_session_params.py +24 -21
  44. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  45. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  46. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  47. deltacat/compute/converter/steps/convert.py +157 -50
  48. deltacat/compute/converter/steps/dedupe.py +24 -11
  49. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  50. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  51. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  52. deltacat/compute/converter/utils/io.py +101 -12
  53. deltacat/compute/converter/utils/s3u.py +33 -27
  54. deltacat/compute/janitor.py +205 -0
  55. deltacat/compute/jobs/client.py +19 -8
  56. deltacat/compute/resource_estimation/delta.py +38 -6
  57. deltacat/compute/resource_estimation/model.py +8 -0
  58. deltacat/constants.py +44 -0
  59. deltacat/docs/autogen/schema/__init__.py +0 -0
  60. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/examples/compactor/__init__.py +0 -0
  64. deltacat/examples/compactor/aws/__init__.py +1 -0
  65. deltacat/examples/compactor/bootstrap.py +863 -0
  66. deltacat/examples/compactor/compactor.py +373 -0
  67. deltacat/examples/compactor/explorer.py +473 -0
  68. deltacat/examples/compactor/gcp/__init__.py +1 -0
  69. deltacat/examples/compactor/job_runner.py +439 -0
  70. deltacat/examples/compactor/utils/__init__.py +1 -0
  71. deltacat/examples/compactor/utils/common.py +261 -0
  72. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  80. deltacat/exceptions.py +66 -4
  81. deltacat/experimental/catalog/iceberg/impl.py +2 -2
  82. deltacat/experimental/compatibility/__init__.py +0 -0
  83. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  84. deltacat/experimental/converter_agent/__init__.py +0 -0
  85. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  86. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  87. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  88. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
  89. deltacat/experimental/storage/iceberg/impl.py +5 -3
  90. deltacat/experimental/storage/iceberg/model.py +7 -3
  91. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  92. deltacat/experimental/storage/rivulet/dataset.py +0 -3
  93. deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
  94. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
  95. deltacat/io/datasource/deltacat_datasource.py +0 -1
  96. deltacat/storage/__init__.py +20 -2
  97. deltacat/storage/interface.py +54 -32
  98. deltacat/storage/main/impl.py +1494 -541
  99. deltacat/storage/model/delta.py +27 -3
  100. deltacat/storage/model/locator.py +6 -12
  101. deltacat/storage/model/manifest.py +182 -6
  102. deltacat/storage/model/metafile.py +151 -78
  103. deltacat/storage/model/namespace.py +8 -1
  104. deltacat/storage/model/partition.py +117 -42
  105. deltacat/storage/model/schema.py +2427 -159
  106. deltacat/storage/model/sort_key.py +40 -0
  107. deltacat/storage/model/stream.py +9 -2
  108. deltacat/storage/model/table.py +12 -1
  109. deltacat/storage/model/table_version.py +11 -0
  110. deltacat/storage/model/transaction.py +1184 -208
  111. deltacat/storage/model/transform.py +81 -2
  112. deltacat/storage/model/types.py +48 -26
  113. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  114. deltacat/tests/aws/test_s3u.py +2 -31
  115. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
  116. deltacat/tests/catalog/test_catalogs.py +54 -11
  117. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
  118. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  119. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  120. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  121. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  122. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  123. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  124. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  125. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  126. deltacat/tests/compute/conftest.py +8 -44
  127. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  128. deltacat/tests/compute/converter/utils.py +15 -6
  129. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  130. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  131. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  132. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  133. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  134. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  135. deltacat/tests/compute/test_janitor.py +236 -0
  136. deltacat/tests/compute/test_util_common.py +716 -43
  137. deltacat/tests/compute/test_util_constant.py +0 -1
  138. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  139. deltacat/tests/experimental/__init__.py +1 -0
  140. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  141. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  142. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  143. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  144. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  145. deltacat/tests/storage/model/test_schema.py +171 -0
  146. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  147. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  148. deltacat/tests/storage/model/test_transaction.py +393 -48
  149. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  150. deltacat/tests/test_deltacat_api.py +988 -4
  151. deltacat/tests/test_exceptions.py +9 -5
  152. deltacat/tests/test_utils/pyarrow.py +52 -21
  153. deltacat/tests/test_utils/storage.py +23 -34
  154. deltacat/tests/types/__init__.py +0 -0
  155. deltacat/tests/types/test_tables.py +104 -0
  156. deltacat/tests/utils/exceptions.py +22 -0
  157. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  158. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  159. deltacat/tests/utils/test_daft.py +121 -31
  160. deltacat/tests/utils/test_numpy.py +1193 -0
  161. deltacat/tests/utils/test_pandas.py +1106 -0
  162. deltacat/tests/utils/test_polars.py +1040 -0
  163. deltacat/tests/utils/test_pyarrow.py +1370 -89
  164. deltacat/types/media.py +221 -11
  165. deltacat/types/tables.py +2329 -59
  166. deltacat/utils/arguments.py +33 -1
  167. deltacat/utils/daft.py +411 -150
  168. deltacat/utils/filesystem.py +100 -0
  169. deltacat/utils/metafile_locator.py +2 -1
  170. deltacat/utils/numpy.py +118 -26
  171. deltacat/utils/pandas.py +577 -48
  172. deltacat/utils/polars.py +658 -27
  173. deltacat/utils/pyarrow.py +1258 -213
  174. deltacat/utils/ray_utils/dataset.py +101 -10
  175. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  176. deltacat/utils/url.py +56 -15
  177. deltacat-2.0.0.post1.dist-info/METADATA +1163 -0
  178. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/RECORD +183 -145
  179. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/WHEEL +1 -1
  180. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  181. deltacat/compute/merge_on_read/__init__.py +0 -4
  182. deltacat/compute/merge_on_read/daft.py +0 -40
  183. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  184. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  185. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  186. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  187. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  188. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  189. deltacat/utils/s3fs.py +0 -21
  190. deltacat-2.0.0b11.dist-info/METADATA +0 -67
  191. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  192. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  193. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info/licenses}/LICENSE +0 -0
  194. {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/top_level.txt +0 -0
deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py ADDED
@@ -0,0 +1,263 @@
+ """
+ Spark SQL utilities for Iceberg table operations.
+
+ This module provides Beam DoFn classes that use Spark SQL to work with Iceberg tables,
+ """
+
+ import os
+ import apache_beam as beam
+ from apache_beam import Row
+
+
+ class SparkSQLIcebergRead(beam.DoFn):
+     """
+     Custom Beam DoFn that uses Spark SQL to read Iceberg tables.
+     """
+
+     def __init__(
+         self,
+         table_name: str,
+         catalog_uri: str = "http://localhost:8181",
+         warehouse: str = "warehouse/",
+     ):
+         """
+         Initialize the Spark SQL reader.
+
+         Args:
+             table_name: Name of the Iceberg table
+             catalog_uri: URI of the Iceberg REST catalog
+             warehouse: Warehouse path
+         """
+         self.table_name = table_name
+         self.catalog_uri = catalog_uri
+         self.warehouse = warehouse
+         self.spark = None
+
+     def setup(self):
+         """Set up Spark session (called once per worker)."""
+         try:
+             from pyspark.sql import SparkSession
+             import importlib.metadata
+
+             # Get Spark version for dependency resolution
+             try:
+                 spark_version = ".".join(
+                     importlib.metadata.version("pyspark").split(".")[:2]
+                 )
+             except Exception:
+                 spark_version = "3.5"  # Default fallback
+
+             scala_version = "2.12"
+             iceberg_version = "1.6.0"
+
+             print(f"🔧 Setting up Spark session for reading {self.table_name}")
+             print(f"   - Spark version: {spark_version}")
+             print(f"   - Iceberg version: {iceberg_version}")
+
+             # Set Spark packages for Iceberg runtime
+             os.environ["PYSPARK_SUBMIT_ARGS"] = (
+                 f"--packages org.apache.iceberg:iceberg-spark-runtime-{spark_version}_{scala_version}:{iceberg_version} "
+                 f"pyspark-shell"
+             )
+
+             # Create Spark session with Iceberg REST catalog configuration
+             self.spark = (
+                 SparkSession.builder.appName(f"DeltaCAT Read - {self.table_name}")
+                 .config("spark.sql.session.timeZone", "UTC")
+                 .config(
+                     "spark.serializer", "org.apache.spark.serializer.KryoSerializer"
+                 )
+                 .config(
+                     "spark.sql.extensions",
+                     "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
+                 )
+                 # Configure REST catalog
+                 .config(
+                     "spark.sql.catalog.rest", "org.apache.iceberg.spark.SparkCatalog"
+                 )
+                 .config("spark.sql.catalog.rest.type", "rest")
+                 .config("spark.sql.catalog.rest.uri", self.catalog_uri)
+                 .config("spark.sql.catalog.rest.warehouse", self.warehouse)
+                 # Set REST as default catalog
+                 .config("spark.sql.defaultCatalog", "rest")
+                 # Local mode configuration (within Beam workers)
+                 .config("spark.master", "local[1]")  # Single thread per worker
+                 .config("spark.sql.adaptive.enabled", "true")
+                 # Networking binding
+                 .config("spark.driver.bindAddress", "127.0.0.1")
+                 .config("spark.driver.host", "127.0.0.1")
+                 .config("spark.ui.enabled", "false")
+                 .config("spark.sql.adaptive.coalescePartitions.enabled", "false")
+                 .getOrCreate()
+             )
+
+             print(f"✅ Spark session created successfully")
+
+         except Exception as e:
+             print(f"❌ Failed to set up Spark session: {e}")
+             raise
+
+     def teardown(self):
+         """Clean up Spark session (called once per worker)."""
+         if self.spark:
+             try:
+                 self.spark.stop()
+                 print("✅ Spark session stopped")
+             except Exception as e:
+                 print(f"⚠️ Error stopping Spark session: {e}")
+
+     def process(self, element):
+         """
+         Process element (read from Iceberg table using Spark SQL).
+
+         Args:
+             element: Input element (not used, just triggers the read)
+
+         Yields:
+             Records from the Iceberg table
+         """
+         try:
+             if not self.spark:
+                 raise RuntimeError("Spark session not initialized")
+
+             print(f"📖 Reading table {self.table_name} using Spark SQL")
+
+             # Read from Iceberg table using Spark SQL
+             df = self.spark.sql(f"SELECT * FROM {self.table_name}")
+
+             # Collect all records
+             records = df.collect()
+
+             print(f"📊 Successfully read {len(records)} records from {self.table_name}")
+
+             # Convert Spark rows to Beam Row objects and yield
+             for row in records:
+                 row_dict = row.asDict()
+                 # Convert to Beam Row for consistency with write mode
+                 beam_row = Row(**row_dict)
+                 yield beam_row
+
+         except Exception as e:
+             print(f"❌ Failed to read from table {self.table_name}: {e}")
+             raise
+
+
+ class SparkSQLIcebergRewrite(beam.DoFn):
+     """
+     Custom Beam DoFn that uses Spark SQL to rewrite Iceberg table data files.
+
+     This uses Spark's rewrite_data_files procedure to materialize positional deletes
+     by rewriting data files. The result is a "clean" table without positional deletes.
+     """
+
+     def __init__(self, catalog_uri, warehouse_path, table_name):
+         self.catalog_uri = catalog_uri
+         self.warehouse_path = warehouse_path
+         self.table_name = table_name
+
+     def setup(self):
+         """Initialize Spark session for rewrite operations."""
+         try:
+             from pyspark.sql import SparkSession
+             import importlib.metadata
+
+             print(f"🔧 Setting up Spark session for rewriting {self.table_name}")
+
+             # Detect Spark version for appropriate Iceberg runtime
+             spark_version = importlib.metadata.version("pyspark")
+             major_minor = ".".join(spark_version.split(".")[:2])
+             print(f"   - Spark version: {major_minor}")
+             print(f"   - Iceberg version: 1.6.0")
+
+             # Configure Spark with Iceberg
+             self.spark = (
+                 SparkSession.builder.appName("IcebergRewrite")
+                 .config(
+                     "spark.jars.packages",
+                     f"org.apache.iceberg:iceberg-spark-runtime-{major_minor}_2.12:1.6.0",
+                 )
+                 .config(
+                     "spark.sql.extensions",
+                     "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
+                 )
+                 .config(
+                     "spark.sql.catalog.spark_catalog",
+                     "org.apache.iceberg.spark.SparkSessionCatalog",
+                 )
+                 .config("spark.sql.catalog.spark_catalog.type", "rest")
+                 .config("spark.sql.catalog.spark_catalog.uri", self.catalog_uri)
+                 .config(
+                     "spark.sql.catalog.spark_catalog.warehouse", self.warehouse_path
+                 )
+                 .config("spark.driver.bindAddress", "127.0.0.1")
+                 .config("spark.driver.host", "127.0.0.1")
+                 .config("spark.ui.enabled", "false")
+                 .getOrCreate()
+             )
+
+             print("✅ Spark session created successfully")
+
+         except ImportError as e:
+             raise RuntimeError(
+                 f"PySpark is required for rewrite mode. Install with: pip install pyspark"
+             ) from e
+         except Exception as e:
+             raise RuntimeError(f"Failed to create Spark session: {e}") from e
+
+     def process(self, element):
+         """Rewrite table data files to materialize positional deletes."""
+         try:
+             print(
+                 f"📋 Rewriting table {self.table_name} to materialize positional deletes"
+             )
+
+             # Use Spark's rewrite_data_files procedure with delete_file_threshold=1
+             # This forces rewrite even when there's only 1 positional delete file
+             rewrite_sql = f"""
+                 CALL spark_catalog.system.rewrite_data_files(
+                     table => '{self.table_name}',
+                     options => map('delete-file-threshold', '1')
+                 )
+             """
+
+             print(f"🔄 Executing rewrite procedure with delete_file_threshold=1...")
+             print(f"   SQL: {rewrite_sql.strip()}")
+             print(
+                 f"   Rationale: Forces rewrite even with single positional delete file"
+             )
+
+             result = self.spark.sql(rewrite_sql)
+
+             # Collect results to see what was rewritten
+             rewrite_result = result.collect()[0]
+             print(f"📊 Rewrite result: {rewrite_result}")
+
+             # Check if we actually rewrote anything
+             if rewrite_result.rewritten_data_files_count > 0:
+                 print(
+                     f"✅ Successfully rewrote {rewrite_result.rewritten_data_files_count} data files"
+                 )
+                 print(
+                     f"   - Added {rewrite_result.added_data_files_count} new data files"
+                 )
+                 print(f"   - Rewrote {rewrite_result.rewritten_bytes_count} bytes")
+                 print(f"   - Positional deletes have been materialized!")
+             else:
+                 print(f"⚠️ No files were rewritten (rewritten_data_files_count=0)")
+                 print(f"   - This may indicate no positional deletes exist")
+                 print(f"   - Or the table may already be in optimal state")
+
+             yield f"Rewrite completed for {self.table_name}"
+
+         except Exception as e:
+             print(f"❌ Error during rewrite: {e}")
+             import traceback
+
+             traceback.print_exc()
+             yield f"Rewrite failed for {self.table_name}: {e}"
+
+     def teardown(self):
+         """Clean up Spark session."""
+         if hasattr(self, "spark"):
+             print("✅ Spark session stopped")
+             self.spark.stop()
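
A minimal usage sketch (not part of the package diff) showing how the new DoFns above could be driven from a Beam pipeline, assuming the module lands at deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py (file 79 in the list above); the table name, catalog URI, and warehouse path are placeholders:

import apache_beam as beam

from deltacat.examples.experimental.iceberg.converter.beam.utils.spark import (
    SparkSQLIcebergRead,
)

TABLE = "default.my_table"  # placeholder: namespace.table resolved via the "rest" catalog
CATALOG_URI = "http://localhost:8181"  # placeholder REST catalog endpoint
WAREHOUSE = "warehouse/"  # placeholder warehouse path

with beam.Pipeline() as p:
    (
        p
        | "Trigger read" >> beam.Create([None])  # a single element just triggers the read
        | "Read Iceberg table" >> beam.ParDo(
            SparkSQLIcebergRead(TABLE, catalog_uri=CATALOG_URI, warehouse=WAREHOUSE)
        )
        | "Print rows" >> beam.Map(print)
    )

SparkSQLIcebergRewrite is driven the same way: create a single trigger element and apply beam.ParDo(SparkSQLIcebergRewrite(CATALOG_URI, WAREHOUSE, TABLE)) to materialize positional deletes before reading.
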
deltacat/exceptions.py CHANGED
@@ -1,6 +1,6 @@
  from __future__ import annotations
  from enum import Enum
- from typing import Callable
+ from typing import Callable, Optional, TYPE_CHECKING
  import logging

  import tenacity
@@ -28,6 +28,9 @@ from deltacat.utils.ray_utils.runtime import (
      get_current_ray_task_id,
  )

+ if TYPE_CHECKING:
+     from deltacat.storage.model.schema import FieldLocator
+
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

  DELTACAT_STORAGE_PARAM = "deltacat_storage"
@@ -74,9 +77,18 @@ class DeltaCatErrorNames(str, Enum):
      TABLE_NOT_FOUND_ERROR = "TableNotFoundError"
      TABLE_VERSION_NOT_FOUND_ERROR = "TableVersionNotFoundError"
      STREAM_NOT_FOUND_ERROR = "StreamNotFoundError"
+     PARTITION_NOT_FOUND_ERROR = "PartitionNotFoundError"
      DELTA_NOT_FOUND_ERROR = "DeltaNotFoundError"
      TABLE_ALREADY_EXISTS_ERROR = "TableAlreadyExistsError"
+     TABLE_VERSION_ALREADY_EXISTS_ERROR = "TableVersionAlreadyExistsError"
      NAMESPACE_ALREADY_EXISTS_ERROR = "NamespaceAlreadyExistsError"
+     SCHEMA_COMPATIBILITY_ERROR = "SchemaCompatibilityError"
+     SCHEMA_VALIDATION_ERROR = "SchemaValidationError"
+     TABLE_VALIDATION_ERROR = "TableValidationError"
+     CONCURRENT_MODIFICATION_ERROR = "ConcurrentModificationError"
+     OBJECT_NOT_FOUND_ERROR = "ObjectNotFoundError"
+     OBJECT_DELETED_ERROR = "ObjectDeletedError"
+     OBJECT_ALREADY_EXISTS_ERROR = "ObjectAlreadyExistsError"


  class DeltaCatError(Exception):
@@ -87,9 +99,12 @@ class DeltaCatError(Exception):
          super().__init__(*args, **kwargs)

      def _get_ray_task_id_and_node_ip(self):
-         task_id = get_current_ray_task_id()
-         node_ip = ray.util.get_node_ip_address()
-         return task_id, node_ip
+         if ray.is_initialized():
+             task_id = get_current_ray_task_id()
+             node_ip = ray.util.get_node_ip_address()
+             return task_id, node_ip
+         else:
+             return None, None


  class NonRetryableError(DeltaCatError):
@@ -232,6 +247,10 @@ class TableVersionNotFoundError(NonRetryableError):
      error_name = DeltaCatErrorNames.TABLE_VERSION_NOT_FOUND_ERROR.value


+ class PartitionNotFoundError(NonRetryableError):
+     error_name = DeltaCatErrorNames.PARTITION_NOT_FOUND_ERROR.value
+
+
  class StreamNotFoundError(NonRetryableError):
      error_name = DeltaCatErrorNames.STREAM_NOT_FOUND_ERROR.value

@@ -244,10 +263,53 @@ class TableAlreadyExistsError(NonRetryableError):
      error_name = DeltaCatErrorNames.TABLE_ALREADY_EXISTS_ERROR.value


+ class TableVersionAlreadyExistsError(NonRetryableError):
+     error_name = DeltaCatErrorNames.TABLE_VERSION_ALREADY_EXISTS_ERROR.value
+
+
  class NamespaceAlreadyExistsError(NonRetryableError):
      error_name = DeltaCatErrorNames.TABLE_ALREADY_EXISTS_ERROR.value


+ class ObjectNotFoundError(NonRetryableError):
+     error_name = DeltaCatErrorNames.OBJECT_NOT_FOUND_ERROR.value
+
+
+ class ObjectDeletedError(NonRetryableError):
+     error_name = DeltaCatErrorNames.OBJECT_DELETED_ERROR.value
+
+
+ class ObjectAlreadyExistsError(NonRetryableError):
+     error_name = DeltaCatErrorNames.OBJECT_ALREADY_EXISTS_ERROR.value
+
+
+ class ConcurrentModificationError(NonRetryableError):
+     error_name = DeltaCatErrorNames.CONCURRENT_MODIFICATION_ERROR.value
+
+
+ class SchemaValidationError(NonRetryableError):
+     error_name = DeltaCatErrorNames.SCHEMA_VALIDATION_ERROR.value
+
+
+ class TableValidationError(NonRetryableError):
+     error_name = DeltaCatErrorNames.TABLE_VALIDATION_ERROR.value
+
+
+ class SchemaCompatibilityError(NonRetryableError):
+     error_name = DeltaCatErrorNames.SCHEMA_COMPATIBILITY_ERROR.value
+     """Raised when a schema update would break backward compatibility."""
+
+     def __init__(
+         self,
+         message: str,
+         field_locator: Optional[FieldLocator] = None,
+         *args,
+         **kwargs,
+     ):
+         super().__init__(message, *args, **kwargs)
+         self.field_locator = field_locator
+
+
  def categorize_errors(func: Callable):
      def wrapper(*args, **kwargs):
          try:
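
The new exception types above are ordinary NonRetryableError subclasses importable from deltacat.exceptions, and SchemaCompatibilityError additionally carries a locator identifying the offending field. A minimal handling sketch (the raising helper, message, and field name are hypothetical, not taken from the package):

from deltacat.exceptions import SchemaCompatibilityError


def apply_schema_update():
    # Hypothetical stand-in for a catalog call that rejects an incompatible schema change.
    raise SchemaCompatibilityError(
        "narrowing the field type would break existing readers",
        field_locator="event_time",  # illustrative FieldLocator value
    )


try:
    apply_schema_update()
except SchemaCompatibilityError as e:
    print(f"Incompatible schema change on field: {e.field_locator}")

Because field_locator defaults to None, call sites that raise SchemaCompatibilityError with only a message keep working.
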
deltacat/experimental/catalog/iceberg/impl.py CHANGED
@@ -198,7 +198,7 @@ def create_table(
      name: str,
      *args,
      namespace: Optional[str] = None,
-     version: Optional[str] = None,
+     table_version: Optional[str] = None,
      lifecycle_state: Optional[LifecycleState] = None,
      schema: Optional[Schema] = None,
      partition_scheme: Optional[PartitionScheme] = None,
@@ -242,7 +242,7 @@ def create_table(
      IcebergStorage.create_table_version(
          namespace=namespace,
          table_name=name,
-         table_version=version,
+         table_version=table_version,
          schema=schema,
          partition_scheme=partition_scheme,
          sort_keys=sort_keys,
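
For callers of the experimental Iceberg catalog, the change above is a keyword rename: the keyword-only parameter version is now table_version. A hypothetical before/after call site (any additional required setup, such as catalog configuration passed through *args and **kwargs, is omitted):

from deltacat.experimental.catalog.iceberg import impl as iceberg_catalog

# deltacat 2.0.0b11:
#   iceberg_catalog.create_table("events", namespace="analytics", version="1")
# deltacat 2.0.0.post1:
iceberg_catalog.create_table(
    "events",
    namespace="analytics",
    table_version="1",
)
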
File without changes
deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py ADDED
@@ -0,0 +1,201 @@
+ #!/usr/bin/env python3
+ """
+ Backfill script for backwards compatibility with canonical_string changes.
+
+ This script migrates existing DeltaCAT catalogs from the old global canonical string
+ format (with parent hexdigest) to the new hierarchical format (without parent hexdigest).
+
+ The old format was: {parent_hexdigest}|{name_parts}
+ The new format is: {name_parts}
+
+ Strategy:
+ 1. Patch canonical_string method to use old format for reading existing name mappings
+ 2. Use dc.list() to recursively discover all objects with old canonical_string
+ 3. Copy each object's name mappings using new canonical_string format for writing
+ 4. Works with any PyArrow-supported filesystem (local, S3, GCS, etc.)
+
+ Usage:
+     python deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py --catalog-root /path/to/catalog
+ """
+
+ import argparse
+ import logging
+ import contextlib
+
+ import deltacat as dc
+ from deltacat.utils.url import DeltaCatUrl
+ from deltacat.storage.model.locator import Locator
+ from deltacat.api import _copy_objects_in_order
+
+
+ logger = logging.getLogger(__name__)
+
+
+ def canonical_string_old(locator, separator: str = "|") -> str:
+     """
+     Old implementation of canonical_string that included parent hexdigest.
+     This is used to read existing name resolution directories.
+     """
+     parts = []
+     parent_hexdigest = locator.parent.hexdigest() if locator.parent else None
+     if parent_hexdigest:
+         parts.append(parent_hexdigest)
+     parts.extend(locator.name.parts())
+     return separator.join([str(part) for part in parts])
+
+
+ @contextlib.contextmanager
+ def patched_canonical_string(use_old_format: bool = True):
+     """
+     Context manager that temporarily patches the canonical_string method.
+
+     Args:
+         use_old_format: If True, use old format; if False, use new format
+     """
+     # Store original method
+     original_method = Locator.canonical_string
+
+     try:
+         if use_old_format:
+             # Patch with old implementation
+             Locator.canonical_string = canonical_string_old
+         # If use_old_format is False, keep the current (new) implementation
+
+         yield
+
+     finally:
+         # Always restore original method
+         Locator.canonical_string = original_method
+
+
+ def migrate_catalog(
+     source_url: str, destination_url: str, dry_run: bool = False
+ ) -> bool:
+     """
+     Migrate a catalog from old to new canonical string format.
+
+     Args:
+         source_url: Source catalog URL (e.g., 'dc://catalog_root/')
+         destination_url: Destination catalog URL (e.g., 'dc://new_catalog_root/')
+         dry_run: If True, just show what would be migrated
+
+     Returns:
+         True if migration successful, False otherwise
+     """
+     try:
+         src_url = DeltaCatUrl(source_url)
+         dst_url = DeltaCatUrl(destination_url)
+
+         logger.info(f"Starting migration from {source_url} to {destination_url}")
+
+         if dry_run:
+             logger.info("DRY RUN - No actual changes will be made")
+
+         if dry_run:
+             # Step 1: List all objects using old canonical_string format for dry run
+             logger.info(
+                 "DRY RUN - Discovering objects using old canonical string format..."
+             )
+             with patched_canonical_string(use_old_format=True):
+                 src_objects = dc.list(src_url, recursive=True)
+
+             if hasattr(src_objects, "__len__"):
+                 logger.info(f"DRY RUN - Found {len(src_objects)} objects to migrate")
+             else:
+                 logger.info("DRY RUN - Found objects to migrate (count unknown)")
+
+             logger.info(
+                 "DRY RUN - Would copy objects using new canonical string format"
+             )
+             return True
+
+         # Step 2: Read objects with old format, then write with new format
+         logger.info("Step 1: Reading all objects using old canonical string format...")
+         with patched_canonical_string(use_old_format=True):
+             src_objects = dc.list(src_url, recursive=True)
+
+         if hasattr(src_objects, "__len__"):
+             logger.info(f"Found {len(src_objects)} objects to migrate")
+         else:
+             logger.info("Found objects to migrate (count unknown)")
+
+         logger.info("Step 2: Writing objects using new canonical string format...")
+         with patched_canonical_string(use_old_format=False):
+             _copy_objects_in_order(src_objects, dst_url)
+
+         logger.info("Migration completed successfully!")
+         return True
+
+     except Exception as e:
+         logger.error(f"Migration failed: {e}")
+         import traceback
+
+         traceback.print_exc()
+         return False
+
+
+ def main():
+     parser = argparse.ArgumentParser(
+         description="Backfill locator-to-ID mappings for DeltaCAT canonical string changes"
+     )
+     parser.add_argument(
+         "--catalog-root",
+         required=True,
+         help="Path to the source DeltaCAT catalog root directory",
+     )
+     parser.add_argument(
+         "--destination",
+         required=True,
+         help="Path to the destination DeltaCAT catalog root directory",
+     )
+     parser.add_argument(
+         "--dry-run",
+         action="store_true",
+         help="Show what would be migrated without making changes",
+     )
+     parser.add_argument(
+         "--verbose",
+         "-v",
+         action="store_true",
+         help="Enable verbose logging. Writes logs to /tmp/deltacat/ by default.",
+     )
+
+     args = parser.parse_args()
+
+     # Configure logging
+     level = logging.DEBUG if args.verbose else logging.INFO
+     logging.basicConfig(level=level, format="%(asctime)s - %(levelname)s - %(message)s")
+
+     # Initialize DeltaCAT with the catalog
+     catalog_config = {
+         "local": {
+             "root": args.catalog_root,
+         }
+     }
+     dc.init(catalogs=catalog_config)
+
+     try:
+         # Migrate to different location
+         source_url = f"dc://{args.catalog_root}/"
+         dest_url = f"dc://{args.destination}/"
+
+         if not args.dry_run:
+             # Initialize destination catalog
+             dest_config = {
+                 "dest": {
+                     "root": args.destination,
+                 }
+             }
+             dc.init(catalogs=dest_config)
+
+         success = migrate_catalog(source_url, dest_url, args.dry_run)
+
+         return int(success)
+
+     except Exception as e:
+         logger.error(f"Migration failed: {e}")
+         return 1
+
+
+ if __name__ == "__main__":
+     exit(main())
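
A hypothetical dry-run of the migration helper above, mirroring the script's own main() flow; the catalog root paths are placeholders:

import deltacat as dc

from deltacat.experimental.compatibility.backfill_locator_to_id_mappings import (
    migrate_catalog,
)

old_root = "/path/to/old_catalog"  # placeholder source catalog root
new_root = "/path/to/new_catalog"  # placeholder destination catalog root

# Register the source catalog first, as main() does before migrating.
dc.init(catalogs={"local": {"root": old_root}})

# dry_run=True only discovers and counts the objects that would be copied;
# rerun with dry_run=False (after initializing the destination) to migrate.
ok = migrate_catalog(f"dc://{old_root}/", f"dc://{new_root}/", dry_run=True)
print("dry run succeeded" if ok else "dry run failed")
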
File without changes
File without changes