deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324)
  1. deltacat/__init__.py +117 -18
  2. deltacat/api.py +536 -126
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +1 -19
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2444 -282
  12. deltacat/catalog/model/catalog.py +208 -113
  13. deltacat/catalog/model/properties.py +63 -24
  14. deltacat/compute/__init__.py +14 -0
  15. deltacat/compute/compactor/compaction_session.py +97 -75
  16. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  17. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  18. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  19. deltacat/compute/compactor/repartition_session.py +8 -21
  20. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  21. deltacat/compute/compactor/steps/materialize.py +9 -7
  22. deltacat/compute/compactor/steps/repartition.py +12 -11
  23. deltacat/compute/compactor/utils/io.py +6 -5
  24. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  25. deltacat/compute/compactor/utils/system_columns.py +3 -1
  26. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  27. deltacat/compute/compactor_v2/constants.py +30 -1
  28. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  29. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  30. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  31. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  32. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  33. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  34. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  35. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  36. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  37. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  38. deltacat/compute/compactor_v2/utils/io.py +11 -4
  39. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  40. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  41. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  42. deltacat/compute/converter/constants.py +5 -0
  43. deltacat/compute/converter/converter_session.py +207 -52
  44. deltacat/compute/converter/model/convert_input.py +43 -16
  45. deltacat/compute/converter/model/convert_input_files.py +33 -16
  46. deltacat/compute/converter/model/convert_result.py +80 -0
  47. deltacat/compute/converter/model/converter_session_params.py +64 -19
  48. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  49. deltacat/compute/converter/pyiceberg/overrides.py +193 -65
  50. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  51. deltacat/compute/converter/steps/convert.py +230 -75
  52. deltacat/compute/converter/steps/dedupe.py +46 -12
  53. deltacat/compute/converter/utils/convert_task_options.py +66 -22
  54. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  55. deltacat/compute/converter/utils/iceberg_columns.py +13 -8
  56. deltacat/compute/converter/utils/io.py +173 -13
  57. deltacat/compute/converter/utils/s3u.py +42 -27
  58. deltacat/compute/janitor.py +205 -0
  59. deltacat/compute/jobs/client.py +417 -0
  60. deltacat/compute/resource_estimation/delta.py +38 -6
  61. deltacat/compute/resource_estimation/model.py +8 -0
  62. deltacat/constants.py +49 -6
  63. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  64. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  65. deltacat/env.py +10 -0
  66. deltacat/examples/basic_logging.py +6 -6
  67. deltacat/examples/compactor/aws/__init__.py +1 -0
  68. deltacat/examples/compactor/bootstrap.py +863 -0
  69. deltacat/examples/compactor/compactor.py +373 -0
  70. deltacat/examples/compactor/explorer.py +473 -0
  71. deltacat/examples/compactor/gcp/__init__.py +1 -0
  72. deltacat/examples/compactor/job_runner.py +439 -0
  73. deltacat/examples/compactor/utils/__init__.py +1 -0
  74. deltacat/examples/compactor/utils/common.py +261 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  80. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  81. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
  82. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  83. deltacat/examples/hello_world.py +4 -2
  84. deltacat/examples/indexer/indexer.py +163 -0
  85. deltacat/examples/indexer/job_runner.py +198 -0
  86. deltacat/exceptions.py +66 -4
  87. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  88. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  89. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
  90. deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
  91. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  92. deltacat/experimental/converter_agent/__init__.py +0 -0
  93. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  94. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  95. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  96. deltacat/experimental/daft/__init__.py +4 -0
  97. deltacat/experimental/daft/daft_catalog.py +229 -0
  98. deltacat/experimental/storage/__init__.py +0 -0
  99. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  100. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  101. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  102. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  103. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  104. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  105. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  107. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  108. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  109. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  110. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  111. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  112. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  113. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  114. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  115. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  116. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  117. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  118. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  119. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  120. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  121. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  122. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  123. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  124. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  125. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  126. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  127. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  128. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  129. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  130. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  131. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  132. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  133. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  134. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  135. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  136. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  137. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  138. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  139. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  140. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  141. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  142. deltacat/io/__init__.py +13 -0
  143. deltacat/io/dataset/__init__.py +0 -0
  144. deltacat/io/dataset/deltacat_dataset.py +91 -0
  145. deltacat/io/datasink/__init__.py +0 -0
  146. deltacat/io/datasink/deltacat_datasink.py +207 -0
  147. deltacat/io/datasource/__init__.py +0 -0
  148. deltacat/io/datasource/deltacat_datasource.py +579 -0
  149. deltacat/io/reader/__init__.py +0 -0
  150. deltacat/io/reader/deltacat_read_api.py +172 -0
  151. deltacat/storage/__init__.py +22 -2
  152. deltacat/storage/interface.py +54 -32
  153. deltacat/storage/main/impl.py +1494 -541
  154. deltacat/storage/model/delta.py +27 -3
  155. deltacat/storage/model/expression/__init__.py +47 -0
  156. deltacat/storage/model/expression/expression.py +656 -0
  157. deltacat/storage/model/expression/visitor.py +248 -0
  158. deltacat/storage/model/locator.py +6 -12
  159. deltacat/storage/model/manifest.py +231 -6
  160. deltacat/storage/model/metafile.py +224 -119
  161. deltacat/storage/model/namespace.py +8 -1
  162. deltacat/storage/model/partition.py +117 -42
  163. deltacat/storage/model/scan/push_down.py +32 -5
  164. deltacat/storage/model/schema.py +2427 -159
  165. deltacat/storage/model/shard.py +6 -2
  166. deltacat/storage/model/sort_key.py +40 -0
  167. deltacat/storage/model/stream.py +9 -2
  168. deltacat/storage/model/table.py +12 -1
  169. deltacat/storage/model/table_version.py +11 -0
  170. deltacat/storage/model/transaction.py +1184 -208
  171. deltacat/storage/model/transform.py +81 -2
  172. deltacat/storage/model/types.py +53 -29
  173. deltacat/storage/util/__init__.py +0 -0
  174. deltacat/storage/util/scan_planner.py +26 -0
  175. deltacat/tests/_io/reader/__init__.py +0 -0
  176. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  177. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  178. deltacat/tests/aws/test_s3u.py +2 -31
  179. deltacat/tests/catalog/data/__init__.py +0 -0
  180. deltacat/tests/catalog/main/__init__.py +0 -0
  181. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  182. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  183. deltacat/tests/catalog/model/__init__.py +0 -0
  184. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  185. deltacat/tests/catalog/test_catalogs.py +103 -106
  186. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  187. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  188. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  189. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  190. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  191. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  192. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  193. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  194. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  195. deltacat/tests/compute/conftest.py +8 -44
  196. deltacat/tests/compute/converter/test_convert_session.py +697 -349
  197. deltacat/tests/compute/converter/utils.py +15 -6
  198. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  199. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  200. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  201. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  202. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  203. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  204. deltacat/tests/compute/test_janitor.py +236 -0
  205. deltacat/tests/compute/test_util_common.py +716 -43
  206. deltacat/tests/compute/test_util_constant.py +0 -1
  207. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  208. deltacat/tests/daft/__init__.py +0 -0
  209. deltacat/tests/daft/test_model.py +97 -0
  210. deltacat/tests/experimental/__init__.py +1 -0
  211. deltacat/tests/experimental/catalog/__init__.py +0 -0
  212. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  213. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  214. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  215. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  216. deltacat/tests/experimental/daft/__init__.py +0 -0
  217. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  218. deltacat/tests/experimental/storage/__init__.py +0 -0
  219. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  220. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  221. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  222. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  223. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  224. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  225. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  226. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  227. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  228. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  229. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  230. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  231. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  232. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  233. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  234. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  235. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  236. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  237. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  238. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  239. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  240. deltacat/tests/storage/model/test_expression.py +327 -0
  241. deltacat/tests/storage/model/test_manifest.py +129 -0
  242. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  243. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  244. deltacat/tests/storage/model/test_schema.py +171 -0
  245. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  246. deltacat/tests/storage/model/test_shard.py +3 -1
  247. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  248. deltacat/tests/storage/model/test_transaction.py +393 -48
  249. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  250. deltacat/tests/test_deltacat_api.py +1036 -11
  251. deltacat/tests/test_exceptions.py +9 -5
  252. deltacat/tests/test_utils/pyarrow.py +52 -21
  253. deltacat/tests/test_utils/storage.py +23 -34
  254. deltacat/tests/types/__init__.py +0 -0
  255. deltacat/tests/types/test_tables.py +104 -0
  256. deltacat/tests/utils/exceptions.py +22 -0
  257. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  258. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  259. deltacat/tests/utils/test_daft.py +121 -31
  260. deltacat/tests/utils/test_numpy.py +1193 -0
  261. deltacat/tests/utils/test_pandas.py +1106 -0
  262. deltacat/tests/utils/test_polars.py +1040 -0
  263. deltacat/tests/utils/test_pyarrow.py +1370 -89
  264. deltacat/types/media.py +345 -37
  265. deltacat/types/tables.py +2344 -46
  266. deltacat/utils/arguments.py +33 -1
  267. deltacat/utils/daft.py +824 -40
  268. deltacat/utils/export.py +3 -1
  269. deltacat/utils/filesystem.py +139 -9
  270. deltacat/utils/metafile_locator.py +2 -1
  271. deltacat/utils/numpy.py +118 -26
  272. deltacat/utils/pandas.py +577 -48
  273. deltacat/utils/polars.py +759 -0
  274. deltacat/utils/pyarrow.py +1373 -192
  275. deltacat/utils/ray_utils/concurrency.py +1 -1
  276. deltacat/utils/ray_utils/dataset.py +101 -10
  277. deltacat/utils/ray_utils/runtime.py +56 -4
  278. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  279. deltacat/utils/url.py +1325 -0
  280. deltacat-2.0.0.dist-info/METADATA +1163 -0
  281. deltacat-2.0.0.dist-info/RECORD +439 -0
  282. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  283. deltacat/catalog/iceberg/__init__.py +0 -4
  284. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  285. deltacat/compute/merge_on_read/__init__.py +0 -4
  286. deltacat/compute/merge_on_read/daft.py +0 -40
  287. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  288. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  289. deltacat/examples/common/fixtures.py +0 -15
  290. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  291. deltacat/storage/rivulet/__init__.py +0 -11
  292. deltacat/storage/rivulet/feather/__init__.py +0 -5
  293. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  294. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  295. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  296. deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
  297. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  298. deltacat/utils/s3fs.py +0 -21
  299. deltacat-2.0.dist-info/METADATA +0 -65
  300. deltacat-2.0.dist-info/RECORD +0 -347
  301. /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
  302. /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
  303. /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
  304. /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
  305. /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
  306. /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
  307. /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
  308. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
  309. /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
  310. /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  311. /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
  312. /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
  313. /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  314. /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
  315. /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
  316. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
  317. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  318. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  319. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  320. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  321. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  322. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  323. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  324. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,8 @@
+ from collections import defaultdict
  import pytest
  import ray
- from typing import List
+ from typing import List, Dict, Any, Tuple
  from pyiceberg.catalog.rest import RestCatalog
- from pyiceberg.expressions import EqualTo
  from pyiceberg.schema import Schema
  from pyiceberg.types import (
  NestedField,
@@ -12,158 +12,98 @@ from pyiceberg.types import (
  from pyiceberg.partitioning import PartitionSpec, PartitionField
  from pyiceberg.transforms import IdentityTransform
  import pyarrow as pa
+ import daft

  from deltacat.compute.converter.steps.convert import convert
  from deltacat.compute.converter.model.convert_input import ConvertInput
  from deltacat.compute.converter.pyiceberg.overrides import (
  fetch_all_bucket_files,
- parquet_files_dict_to_iceberg_data_files,
  )
- from collections import defaultdict
  from deltacat.compute.converter.utils.converter_session_utils import (
  group_all_files_to_each_bucket,
  )
  from deltacat.tests.compute.converter.utils import (
  get_s3_file_system,
  drop_table_if_exists,
+ commit_equality_delete_to_table,
  )
  from deltacat.compute.converter.pyiceberg.update_snapshot_overrides import (
  commit_append_snapshot,
+ commit_replace_snapshot,
  )

+ from pyiceberg.typedef import Record
+ from deltacat.compute.converter.utils.convert_task_options import BASE_MEMORY_BUFFER
+ from deltacat.tests.test_utils.filesystem import temp_dir_autocleanup
+ from deltacat.compute.converter.converter_session import converter_session
+ from deltacat.compute.converter.model.converter_session_params import (
+ ConverterSessionParams,
+ )
+ from pyiceberg.catalog import load_catalog
+ import os
+ import pyarrow.parquet as pq
+ from pyiceberg.manifest import DataFile, DataFileContent, FileFormat
+ from pyiceberg.io.pyarrow import (
+ data_file_statistics_from_parquet_metadata,
+ compute_statistics_plan,
+ parquet_path_to_id_mapping,
+ )
+ from pyiceberg.io.pyarrow import _check_pyarrow_schema_compatible
+ from pyiceberg.exceptions import NamespaceAlreadyExistsError, NoSuchTableError
+ from pyiceberg.io.pyarrow import schema_to_pyarrow

- def run_spark_commands(spark, sqls: List[str]) -> None:
- for sql in sqls:
- spark.sql(sql)
+ # Task memory in bytes for testing
+ TASK_MEMORY_BYTES = BASE_MEMORY_BUFFER


- @pytest.mark.integration
- def test_pyiceberg_spark_setup_sanity(spark, session_catalog: RestCatalog) -> None:
- """
- This Test was copied over from Pyiceberg integ test: https://github.com/apache/iceberg-python/blob/main/tests/integration/test_deletes.py#L62
- First sanity check to ensure all integration with Pyiceberg and Spark are working as expected.
- """
- identifier = "default.table_partitioned_delete"
-
- run_spark_commands(
- spark,
- [
- f"DROP TABLE IF EXISTS {identifier}",
- f"""
- CREATE TABLE {identifier} (
- number_partitioned int,
- number int
- )
- USING iceberg
- PARTITIONED BY (number_partitioned)
- TBLPROPERTIES('format-version' = 2)
- """,
- f"""
- INSERT INTO {identifier} VALUES (10, 20), (10, 30)
- """,
- f"""
- INSERT INTO {identifier} VALUES (11, 20), (11, 30)
- """,
- ],
+ # Test data fixtures
+ @pytest.fixture
+ def base_schema():
+ return Schema(
+ NestedField(
+ field_id=1, name="number_partitioned", field_type=LongType(), required=False
+ ),
+ NestedField(
+ field_id=2, name="primary_key", field_type=StringType(), required=False
+ ),
+ NestedField(
+ field_id=2147483546,
+ name="file_path",
+ field_type=StringType(),
+ required=False,
+ ),
+ NestedField(
+ field_id=2147483545, name="pos", field_type=LongType(), required=False
+ ),
+ schema_id=0,
  )

- tbl = session_catalog.load_table(identifier)
- tbl.delete(EqualTo("number_partitioned", 10))
-
- # No overwrite operation
- assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == [
- "append",
- "append",
- "delete",
- ]
- assert tbl.scan().to_arrow().to_pydict() == {
- "number_partitioned": [11, 11],
- "number": [20, 30],
- }
-
-
- @pytest.mark.integration
- def test_spark_position_delete_production_sanity(
- spark, session_catalog: RestCatalog
- ) -> None:
- """
- Sanity test to ensure Spark position delete production is successful with `merge-on-read` spec V2.
- Table has two partition levels. 1. BucketTransform on primary key
- """
- identifier = "default.table_spark_position_delete_production_sanity"
-
- run_spark_commands(
- spark,
- [
- f"DROP TABLE IF EXISTS {identifier}",
- f"""
- CREATE TABLE {identifier} (
- number_partitioned INT,
- primary_key STRING
- )
- USING iceberg
- PARTITIONED BY (bucket(3, primary_key), number_partitioned)
- TBLPROPERTIES(
- 'format-version' = 2,
- 'write.delete.mode'='merge-on-read',
- 'write.update.mode'='merge-on-read',
- 'write.merge.mode'='merge-on-read'
- )
- """,
- f"""
- INSERT INTO {identifier} VALUES (0, 'pk1'), (0, 'pk2'), (0, 'pk3')
- """,
- f"""
- INSERT INTO {identifier} VALUES (1, 'pk1'), (1, 'pk2'), (1, 'pk3')
- """,
- ],
- )

- run_spark_commands(
- spark,
- [
- f"""
- DELETE FROM {identifier} WHERE primary_key in ("pk1")
- """,
- ],
+ @pytest.fixture
+ def base_schema_without_metadata():
+ return Schema(
+ NestedField(
+ field_id=1, name="number_partitioned", field_type=LongType(), required=False
+ ),
+ NestedField(
+ field_id=2, name="primary_key", field_type=StringType(), required=False
+ ),
+ schema_id=0,
  )

- tbl = session_catalog.load_table(identifier)
- tbl.refresh()
-
- assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == [
- "append",
- "append",
- "delete",
- ]
-
- assert tbl.scan().to_arrow().to_pydict() == {
- "number_partitioned": [1, 1, 0, 0],
- "primary_key": ["pk2", "pk3", "pk2", "pk3"],
- }
-
-
- @pytest.mark.integration
- def test_converter_drop_duplicates_success(
- spark, session_catalog: RestCatalog, setup_ray_cluster, mocker
- ) -> None:
- """
- Test for convert compute remote function happy case. Download file results are mocked.
- """

- # 1. Create Iceberg table
- namespace = "default"
- table_name = "table_converter_ray_pos_delete_drop_duplicates_compute"
- identifier = f"{namespace}.{table_name}"
-
- schema = Schema(
+ @pytest.fixture
+ def multi_key_schema():
+ return Schema(
  NestedField(
  field_id=1, name="number_partitioned", field_type=LongType(), required=False
  ),
  NestedField(
- field_id=2, name="primary_key", field_type=StringType(), required=False
+ field_id=2, name="primary_key1", field_type=StringType(), required=False
+ ),
+ NestedField(
+ field_id=3, name="primary_key2", field_type=LongType(), required=False
  ),
- # Explicitly define "file_path" and "pos" for assertion of deterministic record after dedupe
  NestedField(
  field_id=2147483546,
  name="file_path",
@@ -176,21 +116,55 @@ def test_converter_drop_duplicates_success(
  schema_id=0,
  )

+
+ @pytest.fixture
+ def multi_key_schema_without_file_path():
+ return Schema(
+ NestedField(
+ field_id=1, name="number_partitioned", field_type=LongType(), required=False
+ ),
+ NestedField(
+ field_id=2, name="primary_key1", field_type=StringType(), required=False
+ ),
+ NestedField(
+ field_id=3, name="primary_key2", field_type=LongType(), required=False
+ ),
+ schema_id=0,
+ )
+
+
+ @pytest.fixture
+ def base_partition_spec():
  partition_field_identity = PartitionField(
  source_id=1,
  field_id=101,
  transform=IdentityTransform(),
  name="number_partitioned",
  )
- partition_spec = PartitionSpec(partition_field_identity)
+ return PartitionSpec(partition_field_identity)

- properties = dict()
- properties["write.format.default"] = "parquet"
- properties["write.delete.mode"] = "merge-on-read"
- properties["write.update.mode"] = "merge-on-read"
- properties["write.merge.mode"] = "merge-on-read"
- properties["format-version"] = "2"

+ @pytest.fixture
+ def table_properties():
+ return {
+ "write.format.default": "parquet",
+ "write.delete.mode": "merge-on-read",
+ "write.update.mode": "merge-on-read",
+ "write.merge.mode": "merge-on-read",
+ "format-version": "2",
+ }
+
+
+ def create_test_table(
+ session_catalog: RestCatalog,
+ namespace: str,
+ table_name: str,
+ schema: Schema,
+ partition_spec: PartitionSpec,
+ properties: Dict[str, str],
+ ) -> str:
+ """Helper function to create a test table"""
+ identifier = f"{namespace}.{table_name}"
  drop_table_if_exists(identifier, session_catalog)
  session_catalog.create_table(
  identifier,
@@ -198,281 +172,655 @@ def test_converter_drop_duplicates_success(
  partition_spec=partition_spec,
  properties=properties,
  )
+ return identifier
+
+
+ def create_mock_data_tables(test_case: Dict[str, Any]) -> Tuple[daft.DataFrame, ...]:
+ """Helper function to create mock data tables based on test case"""
+ tables = []
+ for data in test_case["mock_data"]:
+ if "primary_key2" in data: # Multi-key case
+ names = ["primary_key1", "primary_key2"]
+ table = pa.Table.from_arrays(
+ [pa.array(data["primary_key1"]), pa.array(data["primary_key2"])],
+ names=names,
+ )
+ else: # Single key case
+ names = ["primary_key"]
+ table = pa.Table.from_arrays([pa.array(data["primary_key"])], names=names)
+ tables.append(daft.from_arrow(table))
+ if "equality_delete_data_mock" in test_case:
+ for data in test_case["equality_delete_data_mock"]:
+ if "primary_key2" in data: # Multi-key case
+ names = ["primary_key1", "primary_key2"]
+ table = pa.Table.from_arrays(
+ [pa.array(data["primary_key1"]), pa.array(data["primary_key2"])],
+ names=names,
+ )
+ else: # Single key case
+ names = ["primary_key"]
+ table = pa.Table.from_arrays(
+ [pa.array(data["primary_key"])], names=names
+ )
+ tables.append(daft.from_arrow(table))
+ return tuple(tables)

- # 2. Use Spark to generate initial data files
- tbl = session_catalog.load_table(identifier)
- tbl.refresh()
- run_spark_commands(
- spark,
- [
- f"""
- INSERT INTO {identifier} VALUES (0, "pk1", "path1", 1), (0, "pk2", "path2", 2), (0, "pk3", "path3", 3)
- """
- ],
- )
- run_spark_commands(
- spark,
- [
- f"""
- INSERT INTO {identifier} VALUES (0, "pk1", "path1", 4), (0, "pk2", "path2", 5), (0, "pk3", "path3", 6)
- """
- ],
- )
- run_spark_commands(
- spark,
- [
- f"""
- INSERT INTO {identifier} VALUES (0, "pk4", "path4", 7), (0, "pk2", "path2", 8), (0, "pk3", "path3", 9)
- """
- ],
- )

- tbl = session_catalog.load_table(identifier)
- # 3. Use convert.remote() function to compute position deletes
- data_file_dict, equality_delete_dict, pos_delete_dict = fetch_all_bucket_files(tbl)
+ def run_spark_commands(spark, sqls: List[str]) -> None:
+ """Helper function to run Spark SQL commands"""
+ for sql in sqls:
+ spark.sql(sql)

- convert_input_files_for_all_buckets = group_all_files_to_each_bucket(
- data_file_dict=data_file_dict,
- equality_delete_dict=equality_delete_dict,
- pos_delete_dict=pos_delete_dict,
- )

- s3_file_system = get_s3_file_system()
+ def insert_test_data(spark, identifier: str, test_case: Dict[str, Any]) -> None:
+ """Helper function to insert test data into the table"""
+ if "primary_key2" in test_case["mock_data"][0]:
+ # Multi-key case
+ for data in test_case["mock_data"]:
+ values = ", ".join(
+ f"(0, '{pk1}', {pk2})"
+ for pk1, pk2 in zip(data["primary_key1"], data["primary_key2"])
+ )
+ run_spark_commands(spark, [f"INSERT INTO {identifier} VALUES {values}"])
+ else:
+ # Single key case
+ if test_case["schema"] == "base_schema":
+ # For drop duplicates test, use file_path and pos from mock_data
+ for data in test_case["mock_data"]:
+ values = ", ".join(
+ f"(0, '{pk}', '{path}', {pos})"
+ for pk, path, pos in zip(
+ data["primary_key"], data["file_path"], data["pos"]
+ )
+ )
+ run_spark_commands(spark, [f"INSERT INTO {identifier} VALUES {values}"])
+ else:
+ # For other tests, just include the basic columns
+ for data in test_case["mock_data"]:
+ values = ", ".join(f"(0, '{pk}')" for pk in data["primary_key"])
+ run_spark_commands(spark, [f"INSERT INTO {identifier} VALUES {values}"])
+
+
+ def create_convert_input(
+ tbl,
+ convert_input_files_for_all_buckets: List[Any],
+ test_case: Dict[str, Any],
+ s3_file_system: Any,
+ ) -> List[ConvertInput]:
+ """Helper function to create convert inputs"""
+ convert_inputs = []
  for i, one_bucket_files in enumerate(convert_input_files_for_all_buckets):
  convert_input = ConvertInput.of(
  convert_input_files=one_bucket_files,
  convert_task_index=i,
  iceberg_table_warehouse_prefix="warehouse/default",
- identifier_fields=["primary_key"],
- compact_small_files=False,
+ identifier_fields=test_case["identifier_fields"],
+ table_io=tbl.io,
+ table_metadata=tbl.metadata,
+ compact_previous_position_delete_files=False,
  enforce_primary_key_uniqueness=True,
  position_delete_for_multiple_data_files=True,
  max_parallel_data_file_download=10,
- s3_file_system=s3_file_system,
+ filesystem=s3_file_system,
+ s3_client_kwargs={},
+ task_memory=TASK_MEMORY_BYTES,
  )
+ convert_inputs.append(convert_input)
+ return convert_inputs

- number_partitioned_array_1 = pa.array([0, 0, 0], type=pa.int32())
- primary_key_array_1 = pa.array(["pk1", "pk2", "pk3"])
- names = ["number_partitioned", "primary_key"]
- data_table_1 = pa.Table.from_arrays(
- [number_partitioned_array_1, primary_key_array_1], names=names
- )
-
- number_partitioned_array_2 = pa.array([0, 0, 0], type=pa.int32())
- primary_key_array_2 = pa.array(["pk1", "pk2", "pk3"])
- names = ["number_partitioned", "primary_key"]
- data_table_2 = pa.Table.from_arrays(
- [number_partitioned_array_2, primary_key_array_2], names=names
- )

- number_partitioned_array_3 = pa.array([0, 0, 0], type=pa.int32())
- primary_key_array_3 = pa.array(["pk4", "pk2", "pk3"])
- names = ["number_partitioned", "primary_key"]
- data_table_3 = pa.Table.from_arrays(
- [number_partitioned_array_3, primary_key_array_3], names=names
- )
+ def process_convert_result(convert_result: Any) -> Tuple[List[Any], List[Any]]:
+ """Helper function to process convert results

- download_data_mock = mocker.patch(
- "deltacat.compute.converter.utils.io.download_parquet_with_daft_hash_applied"
- )
- download_data_mock.side_effect = (data_table_1, data_table_2, data_table_3)
-
- convert_ref = convert.remote(convert_input)
+ Args:
+ convert_result: The result from convert_session

+ Returns:
+ Tuple[List[Any], List[Any]]: Lists of files to be deleted and added
+ """
  to_be_deleted_files_list = []
- to_be_added_files_dict_list = []
- convert_result = ray.get(convert_ref)
-
- partition_value = convert_input.convert_input_files.partition_value
+ to_be_added_files_list = []
+ if convert_result.to_be_deleted_files:
+ to_be_deleted_files_list.extend(convert_result.to_be_deleted_files.values())
+ if convert_result.to_be_added_files:
+ to_be_added_files_list.extend(convert_result.to_be_added_files)
+ return to_be_deleted_files_list, to_be_added_files_list

- if convert_result[0]:
- to_be_deleted_files_list.extend(convert_result[0].values())

- file_location = convert_result[1][partition_value][0]
- to_be_added_files = f"s3://{file_location}"
+ def verify_result(result, expected_result, verify_pos_index=False):
+ """Verify the result matches the expected result.

- to_be_added_files_dict = defaultdict()
- to_be_added_files_dict[partition_value] = [to_be_added_files]
- to_be_added_files_dict_list.append(to_be_added_files_dict)
+ Args:
+ result: The result to verify
+ expected_result: The expected result
+ verify_pos_index: Whether to verify position values for primary keys
+ """
+ if "primary_keys" in expected_result and "primary_key" in result:
+ # Single key case
+ assert set(result["primary_key"]) == set(expected_result["primary_keys"])
+ if verify_pos_index and "pk_to_pos" in expected_result:
+ for index in range(len(result["primary_key"])):
+ assert (
+ result["pos"][index]
+ == expected_result["pk_to_pos"][result["primary_key"][index]]
+ )
+ elif "pk_tuples" in expected_result:
+ pk_combined_res = []
+ for pk1, pk2 in zip(
+ result["primary_key1"],
+ result["primary_key2"],
+ ):
+ pk_combined_res.append((pk1, pk2))
+
+ # Multi-key case
+ assert set(pk_combined_res) == set(expected_result["pk_tuples"])
+ else:
+ assert set(result) == set(expected_result["primary_keys"])
+
+
+ def verify_spark_read_results(spark, identifier, expected_result):
+ spark_read_pos_delete = spark.sql(f"select * from {identifier}").collect()
+ all_pk = [
+ spark_read_pos_delete[row_idx][1]
+ for row_idx in range(len(spark_read_pos_delete))
+ ]
+ verify_result(all_pk, expected_result, verify_pos_index=False)

- # 4. Commit position delete, delete equality deletes from table
- new_position_delete_files = parquet_files_dict_to_iceberg_data_files(
- io=tbl.io,
- table_metadata=tbl.metadata,
- files_dict_list=to_be_added_files_dict_list,
- )
- commit_append_snapshot(
- iceberg_table=tbl,
- new_position_delete_files=new_position_delete_files,
- )
- tbl.refresh()

- # 5. Only primary key 2 and 3 should exist in table, as primary key 1 is deleted.
- pyiceberg_scan_table_rows = tbl.scan().to_arrow().to_pydict()
+ def get_file_prefix(tbl):
+ """Get the file prefix from a table's data files.

- # Only one unique record for each pk exists
- all_pk = sorted(pyiceberg_scan_table_rows["primary_key"])
- assert all_pk == ["pk1", "pk2", "pk3", "pk4"]
+ Args:
+ tbl: The table to get the file prefix from

- # Expected unique record to keep for each pk
- expected_pk_to_pos_mapping = {"pk1": 4, "pk2": 8, "pk3": 9, "pk4": 7}
- for pk, pos in zip(
- pyiceberg_scan_table_rows["primary_key"], pyiceberg_scan_table_rows["pos"]
- ):
- assert pos == expected_pk_to_pos_mapping[pk]
+ Returns:
+ str: The file prefix
+ """
+ df = tbl.inspect.entries()
+ data_files = df.to_pydict()["data_file"]
+ file_link = data_files[0]["file_path"]
+ file_prefix = "/".join(file_link.split("/")[:-1])
+ return file_prefix.split("//")[1]
+
+
+ # Test cases configuration
+ TEST_CASES = [
+ {
+ "name": "single_key_drop_duplicates",
+ "table_name": "table_converter_ray_drop_duplicates_success",
+ "schema": "base_schema",
+ "identifier_fields": ["primary_key"],
+ "mock_data": [
+ {
+ "primary_key": ["pk1", "pk2", "pk3"],
+ "file_path": ["path1", "path2", "path3"],
+ "pos": [1, 2, 3],
+ },
+ {
+ "primary_key": ["pk1", "pk2", "pk3"],
+ "file_path": ["path1", "path2", "path3"],
+ "pos": [4, 5, 6],
+ },
+ {
+ "primary_key": ["pk4", "pk2", "pk3"],
+ "file_path": ["path4", "path2", "path3"],
+ "pos": [7, 8, 9],
+ },
+ ],
+ "expected_result": {
+ "primary_keys": ["pk1", "pk2", "pk3", "pk4"],
+ "pk_to_pos": {"pk1": 4, "pk2": 8, "pk3": 9, "pk4": 7},
+ },
+ },
+ {
+ "name": "multi_key_drop_duplicates",
+ "table_name": "table_converter_ray_pos_delete_multiple_identifier_fields",
+ "schema": "multi_key_schema_without_file_path",
+ "identifier_fields": ["primary_key1", "primary_key2"],
+ "mock_data": [
+ {"primary_key1": ["pk1", "pk2", "pk3"], "primary_key2": [1, 2, 3]},
+ {"primary_key1": ["pk1", "pk2", "pk3"], "primary_key2": [1, 2, 3]},
+ {"primary_key1": ["pk4", "pk2", "pk3"], "primary_key2": [1, 3, 4]},
+ ],
+ "expected_result": {
+ "pk_tuples": [
+ ("pk1", 1),
+ ("pk2", 2),
+ ("pk2", 3),
+ ("pk3", 3),
+ ("pk3", 4),
+ ("pk4", 1),
+ ]
+ },
+ },
+ {
+ "name": "equality_delete",
+ "table_name": "table_converter_ray_equality_delete_success",
+ "schema": "base_schema_without_metadata",
+ "identifier_fields": ["primary_key"],
+ "mock_data": [
+ {"primary_key": ["pk1", "pk2", "pk3"]},
+ {"primary_key": ["pk1", "pk2", "pk3"]},
+ {"primary_key": ["pk4", "pk2", "pk3"]},
+ ],
+ "equality_delete_data_mock": [{"primary_key": ["pk1"]}],
+ "equality_delete_data": pa.Table.from_arrays(["pk1"], names=["primary_key"]),
+ "verify_spark_read": True,
+ "expected_result": {"primary_keys": ["pk2", "pk3", "pk4"]},
+ },
+ {
+ "name": "position_delete",
+ "table_name": "table_converter_ray_position_delete_success",
+ "schema": "base_schema_without_metadata",
+ "identifier_fields": ["primary_key"],
+ "mock_data": [
+ {"primary_key": ["pk1", "pk2", "pk3"]},
+ {"primary_key": ["pk1", "pk2", "pk3"]},
+ {"primary_key": ["pk4", "pk2", "pk3"]},
+ ],
+ "expected_result": {"primary_keys": ["pk1", "pk2", "pk3", "pk4"]},
+ },
+ {
+ "name": "position_delete_read_by_spark",
+ "table_name": "table_converter_ray_pos_delete_read_by_spark_success",
+ "schema": "base_schema_without_metadata",
+ "identifier_fields": ["primary_key"],
+ "mock_data": [
+ {"primary_key": ["pk1", "pk2", "pk3"]},
+ {"primary_key": ["pk1", "pk2", "pk3"]},
+ {"primary_key": ["pk4", "pk2", "pk3"]},
+ ],
+ "expected_result": {"primary_keys": ["pk1", "pk2", "pk3", "pk4"]},
+ "verify_spark_read": True,
+ "expected_spark_count": 4,
+ },
+ ]


+ @pytest.mark.parametrize("test_case", TEST_CASES)
  @pytest.mark.integration
- def test_converter_pos_delete_read_by_spark_success(
- spark, session_catalog: RestCatalog, setup_ray_cluster, mocker
+ def test_converter(
+ test_case: Dict[str, Any],
+ spark,
+ session_catalog: RestCatalog,
+ setup_ray_cluster,
+ mocker,
+ request,
  ) -> None:
  """
- Test for convert compute remote function happy case. Download file results are mocked.
+ Parameterized test for converter functionality.
+ Tests drop duplicates, equality delete, and position delete scenarios.
  """
+ # Get schema fixture based on test case
+ schema = request.getfixturevalue(test_case["schema"])
+
+ # Create test table
+ identifier = create_test_table(
+ session_catalog=session_catalog,
+ namespace="default",
+ table_name=test_case["table_name"],
+ schema=schema,
+ partition_spec=request.getfixturevalue("base_partition_spec"),
+ properties=request.getfixturevalue("table_properties"),
+ )

- # 1. Create Iceberg table
- namespace = "default"
- table_name = "table_converter_ray_pos_delete_read_by_spark_success"
- identifier = f"{namespace}.{table_name}"
+ # Insert test data
+ insert_test_data(spark, identifier, test_case)

- schema = Schema(
- NestedField(
- field_id=1, name="number_partitioned", field_type=LongType(), required=False
- ),
- NestedField(
- field_id=2, name="primary_key", field_type=StringType(), required=False
- ),
- schema_id=0,
- )
+ # Get files and create convert input
+ tbl = session_catalog.load_table(identifier)
+ data_file_dict, equality_delete_dict, pos_delete_dict = fetch_all_bucket_files(tbl)

- partition_field_identity = PartitionField(
- source_id=1,
- field_id=101,
- transform=IdentityTransform(),
- name="number_partitioned",
+ # Handle equality delete if present
+ if "equality_delete_data" in test_case:
+ tbl = session_catalog.load_table(identifier)
+ file_prefix = get_file_prefix(tbl)
+ partition_value = Record(number_partitioned=0)
+
+ # Note: Just upload to S3 to mock input data here.
+ # NOT committing to Iceberg metadata as equality delete write path not implemented in Pyiceberg/Spark.
+ equality_file_list = commit_equality_delete_to_table(
+ table=tbl,
+ partition_value=partition_value,
+ equality_delete_table=test_case["equality_delete_data"],
+ file_link_prefix=file_prefix,
+ )
+ # Mock equality delete input to converter with latest file sequence, so equality delete can be applied to all data before
+ equality_delete_dict = defaultdict()
+ equality_delete_dict[partition_value] = [(4, equality_file_list[0])]
+
+ convert_input_files_for_all_buckets = group_all_files_to_each_bucket(
+ data_file_dict=data_file_dict,
+ equality_delete_dict=equality_delete_dict,
+ pos_delete_dict=pos_delete_dict,
  )
- partition_spec = PartitionSpec(partition_field_identity)

- properties = dict()
- properties["write.format.default"] = "parquet"
- properties["write.delete.mode"] = "merge-on-read"
- properties["write.update.mode"] = "merge-on-read"
- properties["write.merge.mode"] = "merge-on-read"
- properties["format-version"] = "2"
+ s3_file_system = get_s3_file_system()
+ convert_inputs = create_convert_input(
+ tbl, convert_input_files_for_all_buckets, test_case, s3_file_system
+ )

- drop_table_if_exists(identifier, session_catalog)
- session_catalog.create_table(
- identifier,
- schema=schema,
- partition_spec=partition_spec,
- properties=properties,
+ # Create and set up mock data
+ mock_data_tables = create_mock_data_tables(test_case)
+ download_data_mock = mocker.patch(
+ "deltacat.compute.converter.utils.io.daft_read_parquet"
  )

- # 2. Use Spark to generate initial data files
- tbl = session_catalog.load_table(identifier)
+ download_data_mock.side_effect = mock_data_tables

- run_spark_commands(
- spark,
- [
- f"""
- INSERT INTO {identifier} VALUES (0, "pk1"), (0, "pk2"), (0, "pk3")
- """
- ],
- )
- run_spark_commands(
- spark,
- [
- f"""
- INSERT INTO {identifier} VALUES (0, "pk1"), (0, "pk2"), (0, "pk3")
- """
- ],
- )
- run_spark_commands(
- spark,
- [
- f"""
- INSERT INTO {identifier} VALUES (0, "pk4"), (0, "pk2"), (0, "pk3")
- """
- ],
+ # Run conversion
+ convert_ref = convert.remote(convert_inputs[0])
+ convert_result = ray.get(convert_ref)
+
+ # Process results
+ to_be_deleted_files_list, to_be_added_files_list = process_convert_result(
+ convert_result
  )
+
+ if not to_be_deleted_files_list:
+ # Commit changes
+ commit_append_snapshot(
+ iceberg_table=tbl,
+ new_position_delete_files=to_be_added_files_list,
+ )
+ else:
+ commit_replace_snapshot(
+ iceberg_table=tbl,
+ to_be_deleted_files=to_be_deleted_files_list[0],
+ new_position_delete_files=to_be_added_files_list,
+ )
  tbl.refresh()

- # 3. Use convert.remote() function to compute position deletes
- data_file_dict, equality_delete_dict, pos_delete_dict = fetch_all_bucket_files(tbl)
+ # Verify results
+ pyiceberg_scan_table_rows = tbl.scan().to_arrow().to_pydict()

- convert_input_files_for_all_buckets = group_all_files_to_each_bucket(
- data_file_dict=data_file_dict,
- equality_delete_dict=equality_delete_dict,
- pos_delete_dict=pos_delete_dict,
- )
+ # Verify Spark read if required
+ if test_case.get("verify_spark_read", False):
+ verify_spark_read_results(spark, identifier, test_case["expected_result"])
+ else:
+ verify_result(
+ pyiceberg_scan_table_rows,
+ test_case["expected_result"],
+ verify_pos_index=test_case.get("verify_pos_index", False),
+ )

- s3_file_system = get_s3_file_system()
- for i, one_bucket_files in enumerate(convert_input_files_for_all_buckets):
- convert_input = ConvertInput.of(
- convert_input_files=one_bucket_files,
- convert_task_index=i,
- iceberg_table_warehouse_prefix="warehouse/default",
- identifier_fields=["primary_key"],
- compact_small_files=False,
- enforce_primary_key_uniqueness=True,
- position_delete_for_multiple_data_files=True,
- max_parallel_data_file_download=10,
- s3_file_system=s3_file_system,
+
+ def test_converter_session_with_local_filesystem_and_duplicate_ids(
+ setup_ray_cluster,
+ ) -> None:
+ """
+ Test converter_session functionality with local PyArrow filesystem using duplicate IDs.
+ This test simulates the pattern where duplicate IDs represent updates to existing records.
+ The converter should merge these updates by creating position delete files.
+ """
+ with temp_dir_autocleanup() as temp_catalog_dir:
+ # Create warehouse directory
+ warehouse_path = os.path.join(temp_catalog_dir, "iceberg_warehouse")
+ os.makedirs(warehouse_path, exist_ok=True)
+
+ # Set up local in-memory catalog
+ local_catalog = load_catalog(
+ "local_sql_catalog",
+ **{
+ "type": "in-memory",
+ "warehouse": warehouse_path,
+ },
  )

- primary_key_array_1 = pa.array(["pk1", "pk2", "pk3"])
- names = ["primary_key"]
- data_table_1 = pa.Table.from_arrays([primary_key_array_1], names=names)
+ # Create local PyArrow filesystem
+ import pyarrow.fs as pafs
+
+ local_filesystem = pafs.LocalFileSystem()
+
+ # Define schema (id, name, value, version)
+ schema = Schema(
+ NestedField(field_id=1, name="id", field_type=LongType(), required=True),
+ NestedField(
+ field_id=2, name="name", field_type=StringType(), required=False
+ ),
+ NestedField(
+ field_id=3, name="value", field_type=LongType(), required=False
+ ),
+ NestedField(
+ field_id=4, name="version", field_type=LongType(), required=False
+ ),
+ schema_id=0,
+ )

- primary_key_array_2 = pa.array(["pk1", "pk2", "pk3"])
- names = ["primary_key"]
- data_table_2 = pa.Table.from_arrays([primary_key_array_2], names=names)
+ # Create table properties for merge-on-read
+ properties = {
+ "write.format.default": "parquet",
+ "write.delete.mode": "merge-on-read",
+ "write.update.mode": "merge-on-read",
+ "write.merge.mode": "merge-on-read",
+ "format-version": "2",
+ }
+
+ # Create the table
+ table_identifier = "default.test_duplicate_ids"
+ try:
+ local_catalog.create_namespace("default")
+ except NamespaceAlreadyExistsError:
+ pass # Namespace may already exist
+ try:
+ local_catalog.drop_table(table_identifier)
+ except NoSuchTableError:
+ pass # Table may not exist
+
+ local_catalog.create_table(
+ table_identifier,
+ schema=schema,
+ properties=properties,
+ )
+ tbl = local_catalog.load_table(table_identifier)

- primary_key_array_3 = pa.array(["pk4", "pk2", "pk3"])
- names = ["primary_key"]
- data_table_3 = pa.Table.from_arrays([primary_key_array_3], names=names)
+ # Set the name mapping property so Iceberg can read parquet files without field IDs
+ with tbl.transaction() as tx:
+ tx.set_properties(
+ **{"schema.name-mapping.default": schema.name_mapping.model_dump_json()}
+ )

- download_data_mock = mocker.patch(
- "deltacat.compute.converter.utils.io.download_parquet_with_daft_hash_applied"
- )
- download_data_mock.side_effect = (data_table_1, data_table_2, data_table_3)
+ # Step 1: Write initial data
+ # Create PyArrow table with explicit schema to match Iceberg schema
+ arrow_schema = schema_to_pyarrow(schema)
+
+ initial_data = pa.table(
+ {
+ "id": [1, 2, 3, 4],
+ "name": ["Alice", "Bob", "Charlie", "David"],
+ "value": [100, 200, 300, 400],
+ "version": [1, 1, 1, 1],
+ },
+ schema=arrow_schema,
+ )

- convert_ref = convert.remote(convert_input)
+ # Step 2: Write additional data
+ additional_data = pa.table(
+ {
+ "id": [5, 6, 7, 8],
+ "name": ["Eve", "Frank", "Grace", "Henry"],
+ "value": [500, 600, 700, 800],
+ "version": [1, 1, 1, 1],
+ },
+ schema=arrow_schema,
+ )

- to_be_deleted_files_list = []
- to_be_added_files_dict_list = []
- convert_result = ray.get(convert_ref)
+ # Step 3: Write updates to existing records (this creates duplicates by ID)
+ # These should overwrite the original records with same IDs
+ updated_data = pa.table(
+ {
+ "id": [2, 3, 9], # IDs 2 and 3 are duplicates, 9 is new
+ "name": [
+ "Robert",
+ "Charles",
+ "Ivan",
+ ], # Updated names for Bob and Charlie
+ "value": [201, 301, 900], # Updated values
+ "version": [2, 2, 1], # Higher version numbers for updates
+ },
+ schema=arrow_schema,
+ )

- partition_value = convert_input.convert_input_files.partition_value
+ # Write all data to separate parquet files to simulate multiple writes
+ data_files_to_commit = []

- if convert_result[0]:
- to_be_deleted_files_list.extend(convert_result[0].values())
+ for i, data in enumerate([initial_data, additional_data, updated_data]):
+ data_file_path = os.path.join(warehouse_path, f"data_{i}.parquet")
+ pq.write_table(data, data_file_path)

- file_location = convert_result[1][partition_value][0]
- to_be_added_files = f"s3://{file_location}"
+ # Create DataFile objects for Iceberg
+ parquet_metadata = pq.read_metadata(data_file_path)
+ file_size = os.path.getsize(data_file_path)

- to_be_added_files_dict = defaultdict()
- to_be_added_files_dict[partition_value] = [to_be_added_files]
- to_be_added_files_dict_list.append(to_be_added_files_dict)
+ # Check schema compatibility
+ _check_pyarrow_schema_compatible(
+ schema, parquet_metadata.schema.to_arrow_schema()
+ )

- # 4. Commit position delete, delete equality deletes from table
- new_position_delete_files = parquet_files_dict_to_iceberg_data_files(
- io=tbl.io,
- table_metadata=tbl.metadata,
- files_dict_list=to_be_added_files_dict_list,
- )
+ # Calculate statistics
+ statistics = data_file_statistics_from_parquet_metadata(
+ parquet_metadata=parquet_metadata,
+ stats_columns=compute_statistics_plan(schema, tbl.metadata.properties),
+ parquet_column_mapping=parquet_path_to_id_mapping(schema),
+ )

- commit_append_snapshot(
- iceberg_table=tbl,
- new_position_delete_files=new_position_delete_files,
- )
- tbl.refresh()
+ data_file = DataFile(
+ content=DataFileContent.DATA,
+ file_path=data_file_path,
+ file_format=FileFormat.PARQUET,
+ partition={}, # No partitioning
+ file_size_in_bytes=file_size,
+ sort_order_id=None,
+ spec_id=tbl.metadata.default_spec_id,
+ key_metadata=None,
+ equality_ids=None,
+ **statistics.to_serialized_dict(),
+ )
+ data_files_to_commit.append(data_file)
+
+ # Commit all data files to the table
+ with tbl.transaction() as tx:
+ with tx.update_snapshot().fast_append() as update_snapshot:
+ for data_file in data_files_to_commit:
+ update_snapshot.append_data_file(data_file)
+
+ tbl.refresh()
+
+ # Verify we have duplicate IDs before conversion
+ initial_scan = tbl.scan().to_arrow().to_pydict()
+ print(f"Before conversion - Records with IDs: {sorted(initial_scan['id'])}")
+
+ # There should be duplicates: [1, 2, 2, 3, 3, 4, 5, 6, 7, 8, 9]
+ expected_duplicate_ids = [1, 2, 2, 3, 3, 4, 5, 6, 7, 8, 9]
+ assert (
+ sorted(initial_scan["id"]) == expected_duplicate_ids
+ ), f"Expected duplicate IDs {expected_duplicate_ids}, got {sorted(initial_scan['id'])}"
+
+ # Now call converter_session to convert equality deletes to position deletes
+ converter_params = ConverterSessionParams.of(
+ {
+ "catalog": local_catalog,
+ "iceberg_table_name": table_identifier,
+ "iceberg_warehouse_bucket_name": warehouse_path, # Local warehouse path
+ "merge_keys": ["id"], # Use ID as the merge key
+ "enforce_primary_key_uniqueness": True,
+ "task_max_parallelism": 1, # Single task for local testing
+ "filesystem": local_filesystem,
+ "location_provider_prefix_override": None, # Use local filesystem
+ "location_provider_prefix_override": None, # Let the system auto-generate the prefix
+ }
+ )

- # 5. Result assertion: Spark read table contains unique primary key
- spark_read_pos_delete = spark.sql(f"select * from {identifier}").collect()
- all_pk = [
- spark_read_pos_delete[row_idx][1]
- for row_idx in range(len(spark_read_pos_delete))
- ]
- all_pk_sorted = sorted(all_pk)
- assert all_pk_sorted == ["pk1", "pk2", "pk3", "pk4"]
+ print(f"Running converter_session with local filesystem...")
+ print(f"Warehouse path: {warehouse_path}")
+ print(f"Merge keys: ['id']")
+ print(f"Enforce uniqueness: True")
+
+ # Run the converter
+ converter_session(params=converter_params)
+
+ # Refresh table and scan again
+ tbl.refresh()
+ final_scan = tbl.scan().to_arrow().to_pydict()
+
+ print(f"After conversion - Records with IDs: {sorted(final_scan['id'])}")
+ print(f"Final data: {final_scan}")
+
+ # Verify position delete files were created by checking table metadata
+ latest_snapshot = tbl.metadata.current_snapshot()
+ if latest_snapshot:
+ manifests = latest_snapshot.manifests(tbl.io)
+ position_delete_files = []
+
+ for manifest in manifests:
+ entries = manifest.fetch_manifest_entry(tbl.io)
+ for entry in entries:
+ if entry.data_file.content == DataFileContent.POSITION_DELETES:
+ position_delete_files.append(entry.data_file.file_path)
+
+ print(f"Position delete files found: {position_delete_files}")
+ assert (
+ len(position_delete_files) > 0
+ ), "No position delete files were created by converter_session"
+
+ # Verify the final result has unique IDs (duplicates should be resolved)
+ # Expected: Latest values for each ID based on the updates
+ expected_unique_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9] # All unique IDs
+ actual_ids = sorted(final_scan["id"])
+
+ print(f"Expected unique IDs: {expected_unique_ids}")
+ print(f"Actual IDs after conversion: {actual_ids}")
+
+ assert (
+ actual_ids == expected_unique_ids
+ ), f"Expected unique IDs {expected_unique_ids}, got {actual_ids}"
+
+ # Verify the updated values are present (higher version should win)
+ final_data_by_id = {}
+ for i, id_val in enumerate(final_scan["id"]):
+ final_data_by_id[id_val] = {
+ "name": final_scan["name"][i],
+ "value": final_scan["value"][i],
+ "version": final_scan["version"][i],
+ }
+
+ # Check that ID 2 has updated value (Robert, 201, version 2)
+ assert (
+ final_data_by_id[2]["name"] == "Robert"
+ ), f"ID 2 should have updated name 'Robert', got '{final_data_by_id[2]['name']}'"
+ assert (
+ final_data_by_id[2]["value"] == 201
+ ), f"ID 2 should have updated value 201, got {final_data_by_id[2]['value']}"
+ assert (
+ final_data_by_id[2]["version"] == 2
+ ), f"ID 2 should have version 2, got {final_data_by_id[2]['version']}"
+
+ # Check that ID 3 has updated value (Charles, 301, version 2)
+ assert (
+ final_data_by_id[3]["name"] == "Charles"
+ ), f"ID 3 should have updated name 'Charles', got '{final_data_by_id[3]['name']}'"
+ assert (
+ final_data_by_id[3]["value"] == 301
+ ), f"ID 3 should have updated value 301, got {final_data_by_id[3]['value']}"
+ assert (
+ final_data_by_id[3]["version"] == 2
+ ), f"ID 3 should have version 2, got {final_data_by_id[3]['version']}"
+
+ # Check that new ID 9 is present
+ assert (
+ final_data_by_id[9]["name"] == "Ivan"
+ ), f"ID 9 should have name 'Ivan', got '{final_data_by_id[9]['name']}'"
+ assert (
+ final_data_by_id[9]["value"] == 900
+ ), f"ID 9 should have value 900, got {final_data_by_id[9]['value']}"
+
+ print(f"✅ Test completed successfully!")
+ print(
+ f"✅ Position delete files were created: {len(position_delete_files)} files"
+ )
+ print(f"✅ Duplicate IDs were resolved correctly")
+ print(
+ f"✅ Updated values were applied (ID 2: Bob->Robert, ID 3: Charlie->Charles)"
+ )
+ print(f"✅ Final table has {len(actual_ids)} unique records")
+ print(f"✅ Temporary warehouse cleaned up at: {temp_catalog_dir}")