deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (298)
  1. deltacat/__init__.py +96 -17
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +0 -18
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2435 -279
  12. deltacat/catalog/model/catalog.py +154 -77
  13. deltacat/catalog/model/properties.py +63 -22
  14. deltacat/compute/compactor/compaction_session.py +97 -75
  15. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  16. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  17. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  18. deltacat/compute/compactor/repartition_session.py +8 -21
  19. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  20. deltacat/compute/compactor/steps/materialize.py +9 -7
  21. deltacat/compute/compactor/steps/repartition.py +12 -11
  22. deltacat/compute/compactor/utils/io.py +6 -5
  23. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  24. deltacat/compute/compactor/utils/system_columns.py +3 -1
  25. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  26. deltacat/compute/compactor_v2/constants.py +30 -1
  27. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  28. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  29. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  30. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  31. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  32. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  33. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  34. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  35. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  36. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  37. deltacat/compute/compactor_v2/utils/io.py +11 -4
  38. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  39. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  40. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  41. deltacat/compute/converter/converter_session.py +145 -32
  42. deltacat/compute/converter/model/convert_input.py +26 -19
  43. deltacat/compute/converter/model/convert_input_files.py +33 -16
  44. deltacat/compute/converter/model/convert_result.py +35 -16
  45. deltacat/compute/converter/model/converter_session_params.py +24 -21
  46. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  47. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  48. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  49. deltacat/compute/converter/steps/convert.py +157 -50
  50. deltacat/compute/converter/steps/dedupe.py +24 -11
  51. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  52. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  53. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  54. deltacat/compute/converter/utils/io.py +101 -12
  55. deltacat/compute/converter/utils/s3u.py +33 -27
  56. deltacat/compute/janitor.py +205 -0
  57. deltacat/compute/jobs/client.py +25 -12
  58. deltacat/compute/resource_estimation/delta.py +38 -6
  59. deltacat/compute/resource_estimation/model.py +8 -0
  60. deltacat/constants.py +45 -2
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/env.py +10 -0
  64. deltacat/examples/basic_logging.py +1 -3
  65. deltacat/examples/compactor/aws/__init__.py +1 -0
  66. deltacat/examples/compactor/bootstrap.py +863 -0
  67. deltacat/examples/compactor/compactor.py +373 -0
  68. deltacat/examples/compactor/explorer.py +473 -0
  69. deltacat/examples/compactor/gcp/__init__.py +1 -0
  70. deltacat/examples/compactor/job_runner.py +439 -0
  71. deltacat/examples/compactor/utils/__init__.py +1 -0
  72. deltacat/examples/compactor/utils/common.py +261 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  79. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  80. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  81. deltacat/examples/indexer/indexer.py +2 -2
  82. deltacat/examples/indexer/job_runner.py +1 -2
  83. deltacat/exceptions.py +66 -4
  84. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  85. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  86. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
  87. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  88. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  89. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  90. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  91. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  92. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  93. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  94. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  95. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  96. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  97. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  98. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  99. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  100. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  101. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  102. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  103. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  104. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  105. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  107. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  108. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  109. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  110. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  111. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  112. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  113. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  114. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  115. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  116. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  117. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  118. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  119. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  120. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  121. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  122. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  123. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  124. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  125. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  126. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  127. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  128. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  129. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  130. deltacat/io/datasource/deltacat_datasource.py +0 -1
  131. deltacat/io/reader/deltacat_read_api.py +1 -1
  132. deltacat/storage/__init__.py +20 -2
  133. deltacat/storage/interface.py +54 -32
  134. deltacat/storage/main/impl.py +1494 -541
  135. deltacat/storage/model/delta.py +27 -3
  136. deltacat/storage/model/locator.py +6 -12
  137. deltacat/storage/model/manifest.py +182 -6
  138. deltacat/storage/model/metafile.py +151 -78
  139. deltacat/storage/model/namespace.py +8 -1
  140. deltacat/storage/model/partition.py +117 -42
  141. deltacat/storage/model/schema.py +2427 -159
  142. deltacat/storage/model/shard.py +6 -2
  143. deltacat/storage/model/sort_key.py +40 -0
  144. deltacat/storage/model/stream.py +9 -2
  145. deltacat/storage/model/table.py +12 -1
  146. deltacat/storage/model/table_version.py +11 -0
  147. deltacat/storage/model/transaction.py +1184 -208
  148. deltacat/storage/model/transform.py +81 -2
  149. deltacat/storage/model/types.py +48 -26
  150. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  151. deltacat/tests/aws/test_s3u.py +2 -31
  152. deltacat/tests/catalog/data/__init__.py +0 -0
  153. deltacat/tests/catalog/main/__init__.py +0 -0
  154. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  155. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  156. deltacat/tests/catalog/model/__init__.py +0 -0
  157. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  158. deltacat/tests/catalog/test_catalogs.py +103 -106
  159. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  160. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  161. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  162. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  163. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  164. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  165. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  166. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  167. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  168. deltacat/tests/compute/conftest.py +8 -44
  169. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  170. deltacat/tests/compute/converter/utils.py +15 -6
  171. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  172. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  173. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  174. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  175. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  176. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  177. deltacat/tests/compute/test_janitor.py +236 -0
  178. deltacat/tests/compute/test_util_common.py +716 -43
  179. deltacat/tests/compute/test_util_constant.py +0 -1
  180. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  181. deltacat/tests/daft/__init__.py +0 -0
  182. deltacat/tests/daft/test_model.py +97 -0
  183. deltacat/tests/experimental/__init__.py +1 -0
  184. deltacat/tests/experimental/catalog/__init__.py +0 -0
  185. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  186. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  187. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  188. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  189. deltacat/tests/experimental/daft/__init__.py +0 -0
  190. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  191. deltacat/tests/experimental/storage/__init__.py +0 -0
  192. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  193. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  194. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  195. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
  196. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  197. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  198. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  199. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  200. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  201. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  202. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  203. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  204. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
  205. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  206. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  207. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  208. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  209. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  210. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  211. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  212. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  213. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  214. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  215. deltacat/tests/storage/model/test_schema.py +171 -0
  216. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  217. deltacat/tests/storage/model/test_shard.py +3 -1
  218. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  219. deltacat/tests/storage/model/test_transaction.py +393 -48
  220. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  221. deltacat/tests/test_deltacat_api.py +988 -4
  222. deltacat/tests/test_exceptions.py +9 -5
  223. deltacat/tests/test_utils/pyarrow.py +52 -21
  224. deltacat/tests/test_utils/storage.py +23 -34
  225. deltacat/tests/types/__init__.py +0 -0
  226. deltacat/tests/types/test_tables.py +104 -0
  227. deltacat/tests/utils/exceptions.py +22 -0
  228. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  229. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  230. deltacat/tests/utils/test_daft.py +121 -31
  231. deltacat/tests/utils/test_numpy.py +1193 -0
  232. deltacat/tests/utils/test_pandas.py +1106 -0
  233. deltacat/tests/utils/test_polars.py +1040 -0
  234. deltacat/tests/utils/test_pyarrow.py +1370 -89
  235. deltacat/types/media.py +224 -14
  236. deltacat/types/tables.py +2329 -59
  237. deltacat/utils/arguments.py +33 -1
  238. deltacat/utils/daft.py +823 -36
  239. deltacat/utils/export.py +3 -1
  240. deltacat/utils/filesystem.py +100 -0
  241. deltacat/utils/metafile_locator.py +2 -1
  242. deltacat/utils/numpy.py +118 -26
  243. deltacat/utils/pandas.py +577 -48
  244. deltacat/utils/polars.py +658 -27
  245. deltacat/utils/pyarrow.py +1258 -213
  246. deltacat/utils/ray_utils/dataset.py +101 -10
  247. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  248. deltacat/utils/url.py +57 -16
  249. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  250. deltacat-2.0.0b12.dist-info/RECORD +439 -0
  251. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  252. deltacat/catalog/iceberg/__init__.py +0 -4
  253. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  254. deltacat/compute/merge_on_read/__init__.py +0 -4
  255. deltacat/compute/merge_on_read/daft.py +0 -40
  256. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  257. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  258. deltacat/daft/daft_scan.py +0 -115
  259. deltacat/daft/model.py +0 -258
  260. deltacat/daft/translator.py +0 -126
  261. deltacat/examples/common/fixtures.py +0 -15
  262. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  263. deltacat/storage/rivulet/__init__.py +0 -11
  264. deltacat/storage/rivulet/feather/__init__.py +0 -5
  265. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  266. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  267. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  268. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  269. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  270. deltacat/utils/s3fs.py +0 -21
  271. deltacat-2.0.0b10.dist-info/METADATA +0 -68
  272. deltacat-2.0.0b10.dist-info/RECORD +0 -381
  273. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  274. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  275. /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
  276. /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
  277. /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
  278. /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
  279. /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
  280. /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
  281. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  282. /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
  283. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  284. /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
  285. /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
  286. /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
  287. /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
  288. /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
  289. /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  290. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
  291. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  292. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  293. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  294. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  295. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  296. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  297. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  298. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py ADDED
@@ -0,0 +1,263 @@
+ """
+ Spark SQL utilities for Iceberg table operations.
+
+ This module provides Beam DoFn classes that use Spark SQL to work with Iceberg tables,
+ """
+
+ import os
+ import apache_beam as beam
+ from apache_beam import Row
+
+
+ class SparkSQLIcebergRead(beam.DoFn):
+     """
+     Custom Beam DoFn that uses Spark SQL to read Iceberg tables.
+     """
+
+     def __init__(
+         self,
+         table_name: str,
+         catalog_uri: str = "http://localhost:8181",
+         warehouse: str = "warehouse/",
+     ):
+         """
+         Initialize the Spark SQL reader.
+
+         Args:
+             table_name: Name of the Iceberg table
+             catalog_uri: URI of the Iceberg REST catalog
+             warehouse: Warehouse path
+         """
+         self.table_name = table_name
+         self.catalog_uri = catalog_uri
+         self.warehouse = warehouse
+         self.spark = None
+
+     def setup(self):
+         """Set up Spark session (called once per worker)."""
+         try:
+             from pyspark.sql import SparkSession
+             import importlib.metadata
+
+             # Get Spark version for dependency resolution
+             try:
+                 spark_version = ".".join(
+                     importlib.metadata.version("pyspark").split(".")[:2]
+                 )
+             except Exception:
+                 spark_version = "3.5"  # Default fallback
+
+             scala_version = "2.12"
+             iceberg_version = "1.6.0"
+
+             print(f"🔧 Setting up Spark session for reading {self.table_name}")
+             print(f" - Spark version: {spark_version}")
+             print(f" - Iceberg version: {iceberg_version}")
+
+             # Set Spark packages for Iceberg runtime
+             os.environ["PYSPARK_SUBMIT_ARGS"] = (
+                 f"--packages org.apache.iceberg:iceberg-spark-runtime-{spark_version}_{scala_version}:{iceberg_version} "
+                 f"pyspark-shell"
+             )
+
+             # Create Spark session with Iceberg REST catalog configuration
+             self.spark = (
+                 SparkSession.builder.appName(f"DeltaCAT Read - {self.table_name}")
+                 .config("spark.sql.session.timeZone", "UTC")
+                 .config(
+                     "spark.serializer", "org.apache.spark.serializer.KryoSerializer"
+                 )
+                 .config(
+                     "spark.sql.extensions",
+                     "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
+                 )
+                 # Configure REST catalog
+                 .config(
+                     "spark.sql.catalog.rest", "org.apache.iceberg.spark.SparkCatalog"
+                 )
+                 .config("spark.sql.catalog.rest.type", "rest")
+                 .config("spark.sql.catalog.rest.uri", self.catalog_uri)
+                 .config("spark.sql.catalog.rest.warehouse", self.warehouse)
+                 # Set REST as default catalog
+                 .config("spark.sql.defaultCatalog", "rest")
+                 # Local mode configuration (within Beam workers)
+                 .config("spark.master", "local[1]")  # Single thread per worker
+                 .config("spark.sql.adaptive.enabled", "true")
+                 # Networking binding
+                 .config("spark.driver.bindAddress", "127.0.0.1")
+                 .config("spark.driver.host", "127.0.0.1")
+                 .config("spark.ui.enabled", "false")
+                 .config("spark.sql.adaptive.coalescePartitions.enabled", "false")
+                 .getOrCreate()
+             )
+
+             print(f"✅ Spark session created successfully")
+
+         except Exception as e:
+             print(f"❌ Failed to set up Spark session: {e}")
+             raise
+
+     def teardown(self):
+         """Clean up Spark session (called once per worker)."""
+         if self.spark:
+             try:
+                 self.spark.stop()
+                 print("✅ Spark session stopped")
+             except Exception as e:
+                 print(f"⚠️ Error stopping Spark session: {e}")
+
+     def process(self, element):
+         """
+         Process element (read from Iceberg table using Spark SQL).
+
+         Args:
+             element: Input element (not used, just triggers the read)
+
+         Yields:
+             Records from the Iceberg table
+         """
+         try:
+             if not self.spark:
+                 raise RuntimeError("Spark session not initialized")
+
+             print(f"📖 Reading table {self.table_name} using Spark SQL")
+
+             # Read from Iceberg table using Spark SQL
+             df = self.spark.sql(f"SELECT * FROM {self.table_name}")
+
+             # Collect all records
+             records = df.collect()
+
+             print(f"📊 Successfully read {len(records)} records from {self.table_name}")
+
+             # Convert Spark rows to Beam Row objects and yield
+             for row in records:
+                 row_dict = row.asDict()
+                 # Convert to Beam Row for consistency with write mode
+                 beam_row = Row(**row_dict)
+                 yield beam_row
+
+         except Exception as e:
+             print(f"❌ Failed to read from table {self.table_name}: {e}")
+             raise
+
+
+ class SparkSQLIcebergRewrite(beam.DoFn):
+     """
+     Custom Beam DoFn that uses Spark SQL to rewrite Iceberg table data files.
+
+     This uses Spark's rewrite_data_files procedure to materialize positional deletes
+     by rewriting data files. The result is a "clean" table without positional deletes.
+     """
+
+     def __init__(self, catalog_uri, warehouse_path, table_name):
+         self.catalog_uri = catalog_uri
+         self.warehouse_path = warehouse_path
+         self.table_name = table_name
+
+     def setup(self):
+         """Initialize Spark session for rewrite operations."""
+         try:
+             from pyspark.sql import SparkSession
+             import importlib.metadata
+
+             print(f"🔧 Setting up Spark session for rewriting {self.table_name}")
+
+             # Detect Spark version for appropriate Iceberg runtime
+             spark_version = importlib.metadata.version("pyspark")
+             major_minor = ".".join(spark_version.split(".")[:2])
+             print(f" - Spark version: {major_minor}")
+             print(f" - Iceberg version: 1.6.0")
+
+             # Configure Spark with Iceberg
+             self.spark = (
+                 SparkSession.builder.appName("IcebergRewrite")
+                 .config(
+                     "spark.jars.packages",
+                     f"org.apache.iceberg:iceberg-spark-runtime-{major_minor}_2.12:1.6.0",
+                 )
+                 .config(
+                     "spark.sql.extensions",
+                     "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
+                 )
+                 .config(
+                     "spark.sql.catalog.spark_catalog",
+                     "org.apache.iceberg.spark.SparkSessionCatalog",
+                 )
+                 .config("spark.sql.catalog.spark_catalog.type", "rest")
+                 .config("spark.sql.catalog.spark_catalog.uri", self.catalog_uri)
+                 .config(
+                     "spark.sql.catalog.spark_catalog.warehouse", self.warehouse_path
+                 )
+                 .config("spark.driver.bindAddress", "127.0.0.1")
+                 .config("spark.driver.host", "127.0.0.1")
+                 .config("spark.ui.enabled", "false")
+                 .getOrCreate()
+             )
+
+             print("✅ Spark session created successfully")
+
+         except ImportError as e:
+             raise RuntimeError(
+                 f"PySpark is required for rewrite mode. Install with: pip install pyspark"
+             ) from e
+         except Exception as e:
+             raise RuntimeError(f"Failed to create Spark session: {e}") from e
+
+     def process(self, element):
+         """Rewrite table data files to materialize positional deletes."""
+         try:
+             print(
+                 f"📋 Rewriting table {self.table_name} to materialize positional deletes"
+             )
+
+             # Use Spark's rewrite_data_files procedure with delete_file_threshold=1
+             # This forces rewrite even when there's only 1 positional delete file
+             rewrite_sql = f"""
+                 CALL spark_catalog.system.rewrite_data_files(
+                     table => '{self.table_name}',
+                     options => map('delete-file-threshold', '1')
+                 )
+             """
+
+             print(f"🔄 Executing rewrite procedure with delete_file_threshold=1...")
+             print(f" SQL: {rewrite_sql.strip()}")
+             print(
+                 f" Rationale: Forces rewrite even with single positional delete file"
+             )
+
+             result = self.spark.sql(rewrite_sql)
+
+             # Collect results to see what was rewritten
+             rewrite_result = result.collect()[0]
+             print(f"📊 Rewrite result: {rewrite_result}")
+
+             # Check if we actually rewrote anything
+             if rewrite_result.rewritten_data_files_count > 0:
+                 print(
+                     f"✅ Successfully rewrote {rewrite_result.rewritten_data_files_count} data files"
+                 )
+                 print(
+                     f" - Added {rewrite_result.added_data_files_count} new data files"
+                 )
+                 print(f" - Rewrote {rewrite_result.rewritten_bytes_count} bytes")
+                 print(f" - Positional deletes have been materialized!")
+             else:
+                 print(f"⚠️ No files were rewritten (rewritten_data_files_count=0)")
+                 print(f" - This may indicate no positional deletes exist")
+                 print(f" - Or the table may already be in optimal state")
+
+             yield f"Rewrite completed for {self.table_name}"
+
+         except Exception as e:
+             print(f"❌ Error during rewrite: {e}")
+             import traceback
+
+             traceback.print_exc()
+             yield f"Rewrite failed for {self.table_name}: {e}"
+
+     def teardown(self):
+         """Clean up Spark session."""
+         if hasattr(self, "spark"):
+             print("✅ Spark session stopped")
+             self.spark.stop()
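
For orientation, here is a minimal sketch of how these DoFns might be wired into a Beam pipeline; the table name, catalog URI, and single-element trigger are illustrative assumptions, not part of this diff:

```python
# Hypothetical usage of the SparkSQLIcebergRead DoFn added above.
# Assumes an Iceberg REST catalog is running at http://localhost:8181.
import apache_beam as beam

from deltacat.examples.experimental.iceberg.converter.beam.utils.spark import (
    SparkSQLIcebergRead,
)


def main():
    with beam.Pipeline() as pipeline:
        (
            pipeline
            | "Trigger" >> beam.Create([None])  # one element just to trigger the read
            | "ReadIceberg"
            >> beam.ParDo(
                SparkSQLIcebergRead(
                    table_name="rest.default.events",  # hypothetical table
                    catalog_uri="http://localhost:8181",
                    warehouse="warehouse/",
                )
            )
            | "Print" >> beam.Map(print)
        )


if __name__ == "__main__":
    main()
```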
deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py RENAMED
@@ -9,10 +9,8 @@ import deltacat as dc
  
  from deltacat import logs
  from deltacat import IcebergCatalog
- from deltacat.catalog.iceberg import IcebergCatalogConfig
- from deltacat.examples.common.fixtures import (
-     store_cli_args_in_os_environ,
- )
+ from deltacat.experimental.catalog.iceberg import IcebergCatalogConfig
+ from env import store_cli_args_in_os_environ
  
  from pyiceberg.schema import (
      Schema,
@@ -23,7 +21,7 @@ from pyiceberg.schema import (
  from pyiceberg.partitioning import PartitionSpec, PartitionField
  from pyiceberg.transforms import BucketTransform
  
- from deltacat.storage.iceberg.model import (
+ from deltacat.experimental.storage.iceberg.model import (
      SchemaMapper,
      PartitionSchemeMapper,
  )
deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py RENAMED
@@ -4,9 +4,7 @@ import deltacat as dc
  
  from deltacat import logs
  from deltacat import IcebergCatalog
- from deltacat.examples.common.fixtures import (
-     store_cli_args_in_os_environ,
- )
+ from env import store_cli_args_in_os_environ
  
  from pyiceberg.schema import (
      Schema,
@@ -22,7 +20,7 @@ from pyiceberg.transforms import DayTransform, IdentityTransform
  from pyiceberg.table.sorting import SortField, SortOrder
  
  from deltacat.exceptions import TableAlreadyExistsError
- from deltacat.storage.iceberg.model import (
+ from deltacat.experimental.storage.iceberg.model import (
      SchemaMapper,
      PartitionSchemeMapper,
      SortSchemeMapper,
deltacat/examples/indexer/indexer.py CHANGED
@@ -59,8 +59,8 @@ def run(
          "use_pyarrow": True,  # use the native pyarrow reader
      },
      # writer arguments to pass to the default writer (polars)
-     # for the given parquet-based datasink, it accepts the same
-     # arguments as polars.DataFrame.write_parquet except for `file`
+     # for the given parquet-based datasink, it generally accepts the same
+     # arguments as polars.DataFrame.write_{dest-type} except for `file`
      writer_args={
          "compression": "lz4",  # faster compression & decompression
          # "compression": "zstd",  # better compression ratio
deltacat/examples/indexer/job_runner.py CHANGED
@@ -64,8 +64,7 @@ def run_sync(
      cluster_cfg_file_path = working_dir.joinpath(cloud).joinpath("deltacat.yaml")
      client = job_client(cluster_cfg_file_path, restart_ray=restart_ray)
      job_number = 0
-     while jobs_to_submit > 0:
-         jobs_to_submit -= 1
+     while job_number < jobs_to_submit:
          job_dest = dest + f".{job_number}"
          job_run_result = client.run_job(
              # Entrypoint shell command to execute
deltacat/exceptions.py CHANGED
@@ -1,6 +1,6 @@
  from __future__ import annotations
  from enum import Enum
- from typing import Callable
+ from typing import Callable, Optional, TYPE_CHECKING
  import logging
  
  import tenacity
@@ -28,6 +28,9 @@ from deltacat.utils.ray_utils.runtime import (
      get_current_ray_task_id,
  )
  
+ if TYPE_CHECKING:
+     from deltacat.storage.model.schema import FieldLocator
+
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
  
  DELTACAT_STORAGE_PARAM = "deltacat_storage"
@@ -74,9 +77,18 @@ class DeltaCatErrorNames(str, Enum):
      TABLE_NOT_FOUND_ERROR = "TableNotFoundError"
      TABLE_VERSION_NOT_FOUND_ERROR = "TableVersionNotFoundError"
      STREAM_NOT_FOUND_ERROR = "StreamNotFoundError"
+     PARTITION_NOT_FOUND_ERROR = "PartitionNotFoundError"
      DELTA_NOT_FOUND_ERROR = "DeltaNotFoundError"
      TABLE_ALREADY_EXISTS_ERROR = "TableAlreadyExistsError"
+     TABLE_VERSION_ALREADY_EXISTS_ERROR = "TableVersionAlreadyExistsError"
      NAMESPACE_ALREADY_EXISTS_ERROR = "NamespaceAlreadyExistsError"
+     SCHEMA_COMPATIBILITY_ERROR = "SchemaCompatibilityError"
+     SCHEMA_VALIDATION_ERROR = "SchemaValidationError"
+     TABLE_VALIDATION_ERROR = "TableValidationError"
+     CONCURRENT_MODIFICATION_ERROR = "ConcurrentModificationError"
+     OBJECT_NOT_FOUND_ERROR = "ObjectNotFoundError"
+     OBJECT_DELETED_ERROR = "ObjectDeletedError"
+     OBJECT_ALREADY_EXISTS_ERROR = "ObjectAlreadyExistsError"
  
  
  class DeltaCatError(Exception):
@@ -87,9 +99,12 @@ class DeltaCatError(Exception):
          super().__init__(*args, **kwargs)
  
      def _get_ray_task_id_and_node_ip(self):
-         task_id = get_current_ray_task_id()
-         node_ip = ray.util.get_node_ip_address()
-         return task_id, node_ip
+         if ray.is_initialized():
+             task_id = get_current_ray_task_id()
+             node_ip = ray.util.get_node_ip_address()
+             return task_id, node_ip
+         else:
+             return None, None
  
  
  class NonRetryableError(DeltaCatError):
@@ -232,6 +247,10 @@ class TableVersionNotFoundError(NonRetryableError):
      error_name = DeltaCatErrorNames.TABLE_VERSION_NOT_FOUND_ERROR.value
  
  
+ class PartitionNotFoundError(NonRetryableError):
+     error_name = DeltaCatErrorNames.PARTITION_NOT_FOUND_ERROR.value
+
+
  class StreamNotFoundError(NonRetryableError):
      error_name = DeltaCatErrorNames.STREAM_NOT_FOUND_ERROR.value
  
@@ -244,10 +263,53 @@ class TableAlreadyExistsError(NonRetryableError):
      error_name = DeltaCatErrorNames.TABLE_ALREADY_EXISTS_ERROR.value
  
  
+ class TableVersionAlreadyExistsError(NonRetryableError):
+     error_name = DeltaCatErrorNames.TABLE_VERSION_ALREADY_EXISTS_ERROR.value
+
+
  class NamespaceAlreadyExistsError(NonRetryableError):
      error_name = DeltaCatErrorNames.TABLE_ALREADY_EXISTS_ERROR.value
  
  
+ class ObjectNotFoundError(NonRetryableError):
+     error_name = DeltaCatErrorNames.OBJECT_NOT_FOUND_ERROR.value
+
+
+ class ObjectDeletedError(NonRetryableError):
+     error_name = DeltaCatErrorNames.OBJECT_DELETED_ERROR.value
+
+
+ class ObjectAlreadyExistsError(NonRetryableError):
+     error_name = DeltaCatErrorNames.OBJECT_ALREADY_EXISTS_ERROR.value
+
+
+ class ConcurrentModificationError(NonRetryableError):
+     error_name = DeltaCatErrorNames.CONCURRENT_MODIFICATION_ERROR.value
+
+
+ class SchemaValidationError(NonRetryableError):
+     error_name = DeltaCatErrorNames.SCHEMA_VALIDATION_ERROR.value
+
+
+ class TableValidationError(NonRetryableError):
+     error_name = DeltaCatErrorNames.TABLE_VALIDATION_ERROR.value
+
+
+ class SchemaCompatibilityError(NonRetryableError):
+     error_name = DeltaCatErrorNames.SCHEMA_COMPATIBILITY_ERROR.value
+     """Raised when a schema update would break backward compatibility."""
+
+     def __init__(
+         self,
+         message: str,
+         field_locator: Optional[FieldLocator] = None,
+         *args,
+         **kwargs,
+     ):
+         super().__init__(message, *args, **kwargs)
+         self.field_locator = field_locator
+
+
  def categorize_errors(func: Callable):
      def wrapper(*args, **kwargs):
          try:
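
A short sketch of how callers might handle the new exception types; the update callable is a hypothetical stand-in, and only the exception classes and the field_locator attribute come from this diff:

```python
# Hypothetical error handling against the new exception classes above.
from deltacat.exceptions import (
    ObjectAlreadyExistsError,
    SchemaCompatibilityError,
)


def apply_schema_update(update_fn) -> bool:
    """Run a schema-updating callable, reporting the new error types."""
    try:
        update_fn()
        return True
    except SchemaCompatibilityError as e:
        # field_locator (added in this diff) identifies the offending field
        print(f"Backward-incompatible schema change at {e.field_locator}: {e}")
        return False
    except ObjectAlreadyExistsError as e:
        print(f"Object already exists: {e}")
        return False
```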
deltacat/experimental/catalog/iceberg/__init__.py ADDED
@@ -0,0 +1,6 @@
+ from deltacat.experimental.catalog.iceberg.iceberg_catalog_config import (
+     IcebergCatalogConfig,
+ )
+ import deltacat.experimental.catalog.iceberg.impl as IcebergCatalog
+
+ __all__ = ["IcebergCatalogConfig", "IcebergCatalog"]
deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py RENAMED
@@ -15,7 +15,7 @@ class IcebergCatalogConfig:
  
      This configuration is passed through to PyIceberg by invoking load_catalog.
      The Properties provided must match properties accepted by PyIceberg for each catalog type
-     See: :func:`deltacat.catalog.iceberg.initialize`
+     See: :func:`deltacat.experimental.catalog.iceberg.initialize`
  
      Attributes:
          type: The PyIceberg Catalog instance
deltacat/{catalog → experimental/catalog}/iceberg/impl.py RENAMED
@@ -1,4 +1,5 @@
  import logging
+ import sys
  
  from typing import Any, Dict, List, Optional, Union
  
@@ -7,13 +8,19 @@ from daft.daft import ScanOperatorHandle, StorageConfig
  from daft.logical.builder import LogicalPlanBuilder
  
  from deltacat import logs
+ from deltacat.catalog.model.catalog import Catalog
  from deltacat.catalog.model.table_definition import TableDefinition
- from deltacat.daft.daft_scan import DeltaCatScanOperator
+ from deltacat.utils.daft import DeltaCatScanOperator
  from deltacat.exceptions import TableAlreadyExistsError
- from deltacat.storage.iceberg.iceberg_scan_planner import IcebergScanPlanner
- from deltacat.storage.iceberg.model import PartitionSchemeMapper, SchemaMapper
+ from deltacat.experimental.storage.iceberg.iceberg_scan_planner import (
+     IcebergScanPlanner,
+ )
+ from deltacat.experimental.storage.iceberg.model import (
+     PartitionSchemeMapper,
+     SchemaMapper,
+ )
  from deltacat.storage.model.partition import PartitionScheme
- from deltacat.storage.iceberg.impl import _get_native_catalog
+ from deltacat.experimental.storage.iceberg.impl import _get_native_catalog
  from deltacat.storage.model.sort_key import SortScheme
  from deltacat.storage.model.list_result import ListResult
  from deltacat.storage.model.namespace import Namespace, NamespaceProperties
@@ -26,20 +33,31 @@ from deltacat.storage.model.types import (
      LocalTable,
      StreamFormat,
  )
- from deltacat.storage.iceberg import impl as IcebergStorage
+ from deltacat.experimental.storage.iceberg import impl as IcebergStorage
  from deltacat.types.media import ContentType
  from deltacat.types.tables import TableWriteMode
  from deltacat.constants import DEFAULT_NAMESPACE
- from deltacat.catalog.iceberg.iceberg_catalog_config import IcebergCatalogConfig
+ from deltacat.experimental.catalog.iceberg.iceberg_catalog_config import (
+     IcebergCatalogConfig,
+ )
  
- from pyiceberg.catalog import Catalog, load_catalog
+ from pyiceberg.catalog import Catalog as PyIcebergCatalog, load_catalog
  from pyiceberg.transforms import BucketTransform
  
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
  
+ IcebergCatalog = sys.modules[__name__]
+
+
+ def from_config(config: IcebergCatalogConfig, *args, **kwargs) -> Catalog:
+     """
+     Factory method to construct a catalog from Iceberg catalog configuration.
+     """
+     return Catalog(config, impl=IcebergCatalog, *args, **kwargs)
+
  
  # catalog functions
- def initialize(*args, config: IcebergCatalogConfig, **kwargs) -> Catalog:
+ def initialize(config: IcebergCatalogConfig, **kwargs) -> PyIcebergCatalog:
      """
      Initializes an Iceberg catalog with the given config.
  
@@ -123,7 +141,7 @@ def write_to_table(
      )
      # TODO(pdames): only append s3:// to output file paths when writing to S3!
      out_file_paths = [f"s3://{val}" for val in out_df.to_arrow()[0]]
-     from deltacat.catalog.iceberg import overrides
+     from deltacat.experimental.catalog.iceberg import overrides
  
      overrides.append(
          table_definition.table.native_object,
@@ -180,7 +198,7 @@ def create_table(
      name: str,
      *args,
      namespace: Optional[str] = None,
-     version: Optional[str] = None,
+     table_version: Optional[str] = None,
      lifecycle_state: Optional[LifecycleState] = None,
      schema: Optional[Schema] = None,
      partition_scheme: Optional[PartitionScheme] = None,
@@ -224,7 +242,7 @@ def create_table(
      IcebergStorage.create_table_version(
          namespace=namespace,
          table_name=name,
-         table_version=version,
+         table_version=table_version,
          schema=schema,
          partition_scheme=partition_scheme,
          sort_keys=sort_keys,
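
Taken together with the new __init__.py above, the from_config factory suggests usage along these lines; the REST catalog type and properties are assumptions for illustration, not values from this diff:

```python
# Hypothetical sketch: building a DeltaCAT Catalog handle from the new
# experimental Iceberg module via the from_config factory shown above.
from pyiceberg.catalog import CatalogType

from deltacat.experimental.catalog.iceberg import (
    IcebergCatalog,
    IcebergCatalogConfig,
)

config = IcebergCatalogConfig(
    type=CatalogType.REST,  # assumed catalog type
    properties={
        "uri": "http://localhost:8181",  # assumed REST endpoint
        "warehouse": "warehouse/",  # assumed warehouse path
    },
)
catalog = IcebergCatalog.from_config(config)
```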