deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367)
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/tests/compute/converter/test_convert_session.py (added)
@@ -0,0 +1,826 @@
+from collections import defaultdict
+import pytest
+import ray
+from typing import List, Dict, Any, Tuple
+from pyiceberg.catalog.rest import RestCatalog
+from pyiceberg.schema import Schema
+from pyiceberg.types import (
+    NestedField,
+    StringType,
+    LongType,
+)
+from pyiceberg.partitioning import PartitionSpec, PartitionField
+from pyiceberg.transforms import IdentityTransform
+import pyarrow as pa
+import daft
+
+from deltacat.compute.converter.steps.convert import convert
+from deltacat.compute.converter.model.convert_input import ConvertInput
+from deltacat.compute.converter.pyiceberg.overrides import (
+    fetch_all_bucket_files,
+)
+from deltacat.compute.converter.utils.converter_session_utils import (
+    group_all_files_to_each_bucket,
+)
+from deltacat.tests.compute.converter.utils import (
+    get_s3_file_system,
+    drop_table_if_exists,
+    commit_equality_delete_to_table,
+)
+from deltacat.compute.converter.pyiceberg.update_snapshot_overrides import (
+    commit_append_snapshot,
+    commit_replace_snapshot,
+)
+
+from pyiceberg.typedef import Record
+from deltacat.compute.converter.utils.convert_task_options import BASE_MEMORY_BUFFER
+from deltacat.tests.test_utils.filesystem import temp_dir_autocleanup
+from deltacat.compute.converter.converter_session import converter_session
+from deltacat.compute.converter.model.converter_session_params import (
+    ConverterSessionParams,
+)
+from pyiceberg.catalog import load_catalog
+import os
+import pyarrow.parquet as pq
+from pyiceberg.manifest import DataFile, DataFileContent, FileFormat
+from pyiceberg.io.pyarrow import (
+    data_file_statistics_from_parquet_metadata,
+    compute_statistics_plan,
+    parquet_path_to_id_mapping,
+)
+from pyiceberg.io.pyarrow import _check_pyarrow_schema_compatible
+from pyiceberg.exceptions import NamespaceAlreadyExistsError, NoSuchTableError
+from pyiceberg.io.pyarrow import schema_to_pyarrow
+
+# Task memory in bytes for testing
+TASK_MEMORY_BYTES = BASE_MEMORY_BUFFER
+
+
+# Test data fixtures
+@pytest.fixture
+def base_schema():
+    return Schema(
+        NestedField(
+            field_id=1, name="number_partitioned", field_type=LongType(), required=False
+        ),
+        NestedField(
+            field_id=2, name="primary_key", field_type=StringType(), required=False
+        ),
+        NestedField(
+            field_id=2147483546,
+            name="file_path",
+            field_type=StringType(),
+            required=False,
+        ),
+        NestedField(
+            field_id=2147483545, name="pos", field_type=LongType(), required=False
+        ),
+        schema_id=0,
+    )
+
+
+@pytest.fixture
+def base_schema_without_metadata():
+    return Schema(
+        NestedField(
+            field_id=1, name="number_partitioned", field_type=LongType(), required=False
+        ),
+        NestedField(
+            field_id=2, name="primary_key", field_type=StringType(), required=False
+        ),
+        schema_id=0,
+    )
+
+
+@pytest.fixture
+def multi_key_schema():
+    return Schema(
+        NestedField(
+            field_id=1, name="number_partitioned", field_type=LongType(), required=False
+        ),
+        NestedField(
+            field_id=2, name="primary_key1", field_type=StringType(), required=False
+        ),
+        NestedField(
+            field_id=3, name="primary_key2", field_type=LongType(), required=False
+        ),
+        NestedField(
+            field_id=2147483546,
+            name="file_path",
+            field_type=StringType(),
+            required=False,
+        ),
+        NestedField(
+            field_id=2147483545, name="pos", field_type=LongType(), required=False
+        ),
+        schema_id=0,
+    )
+
+
+@pytest.fixture
+def multi_key_schema_without_file_path():
+    return Schema(
+        NestedField(
+            field_id=1, name="number_partitioned", field_type=LongType(), required=False
+        ),
+        NestedField(
+            field_id=2, name="primary_key1", field_type=StringType(), required=False
+        ),
+        NestedField(
+            field_id=3, name="primary_key2", field_type=LongType(), required=False
+        ),
+        schema_id=0,
+    )
+
+
+@pytest.fixture
+def base_partition_spec():
+    partition_field_identity = PartitionField(
+        source_id=1,
+        field_id=101,
+        transform=IdentityTransform(),
+        name="number_partitioned",
+    )
+    return PartitionSpec(partition_field_identity)
+
+
+@pytest.fixture
+def table_properties():
+    return {
+        "write.format.default": "parquet",
+        "write.delete.mode": "merge-on-read",
+        "write.update.mode": "merge-on-read",
+        "write.merge.mode": "merge-on-read",
+        "format-version": "2",
+    }
+
+
+def create_test_table(
+    session_catalog: RestCatalog,
+    namespace: str,
+    table_name: str,
+    schema: Schema,
+    partition_spec: PartitionSpec,
+    properties: Dict[str, str],
+) -> str:
+    """Helper function to create a test table"""
+    identifier = f"{namespace}.{table_name}"
+    drop_table_if_exists(identifier, session_catalog)
+    session_catalog.create_table(
+        identifier,
+        schema=schema,
+        partition_spec=partition_spec,
+        properties=properties,
+    )
+    return identifier
+
+
+def create_mock_data_tables(test_case: Dict[str, Any]) -> Tuple[daft.DataFrame, ...]:
+    """Helper function to create mock data tables based on test case"""
+    tables = []
+    for data in test_case["mock_data"]:
+        if "primary_key2" in data:  # Multi-key case
+            names = ["primary_key1", "primary_key2"]
+            table = pa.Table.from_arrays(
+                [pa.array(data["primary_key1"]), pa.array(data["primary_key2"])],
+                names=names,
+            )
+        else:  # Single key case
+            names = ["primary_key"]
+            table = pa.Table.from_arrays([pa.array(data["primary_key"])], names=names)
+        tables.append(daft.from_arrow(table))
+    if "equality_delete_data_mock" in test_case:
+        for data in test_case["equality_delete_data_mock"]:
+            if "primary_key2" in data:  # Multi-key case
+                names = ["primary_key1", "primary_key2"]
+                table = pa.Table.from_arrays(
+                    [pa.array(data["primary_key1"]), pa.array(data["primary_key2"])],
+                    names=names,
+                )
+            else:  # Single key case
+                names = ["primary_key"]
+                table = pa.Table.from_arrays(
+                    [pa.array(data["primary_key"])], names=names
+                )
+            tables.append(daft.from_arrow(table))
+    return tuple(tables)
+
+
+def run_spark_commands(spark, sqls: List[str]) -> None:
+    """Helper function to run Spark SQL commands"""
+    for sql in sqls:
+        spark.sql(sql)
+
+
+def insert_test_data(spark, identifier: str, test_case: Dict[str, Any]) -> None:
+    """Helper function to insert test data into the table"""
+    if "primary_key2" in test_case["mock_data"][0]:
+        # Multi-key case
+        for data in test_case["mock_data"]:
+            values = ", ".join(
+                f"(0, '{pk1}', {pk2})"
+                for pk1, pk2 in zip(data["primary_key1"], data["primary_key2"])
+            )
+            run_spark_commands(spark, [f"INSERT INTO {identifier} VALUES {values}"])
+    else:
+        # Single key case
+        if test_case["schema"] == "base_schema":
+            # For drop duplicates test, use file_path and pos from mock_data
+            for data in test_case["mock_data"]:
+                values = ", ".join(
+                    f"(0, '{pk}', '{path}', {pos})"
+                    for pk, path, pos in zip(
+                        data["primary_key"], data["file_path"], data["pos"]
+                    )
+                )
+                run_spark_commands(spark, [f"INSERT INTO {identifier} VALUES {values}"])
+        else:
+            # For other tests, just include the basic columns
+            for data in test_case["mock_data"]:
+                values = ", ".join(f"(0, '{pk}')" for pk in data["primary_key"])
+                run_spark_commands(spark, [f"INSERT INTO {identifier} VALUES {values}"])
+
+
+def create_convert_input(
+    tbl,
+    convert_input_files_for_all_buckets: List[Any],
+    test_case: Dict[str, Any],
+    s3_file_system: Any,
+) -> List[ConvertInput]:
+    """Helper function to create convert inputs"""
+    convert_inputs = []
+    for i, one_bucket_files in enumerate(convert_input_files_for_all_buckets):
+        convert_input = ConvertInput.of(
+            convert_input_files=one_bucket_files,
+            convert_task_index=i,
+            iceberg_table_warehouse_prefix="warehouse/default",
+            identifier_fields=test_case["identifier_fields"],
+            table_io=tbl.io,
+            table_metadata=tbl.metadata,
+            compact_previous_position_delete_files=False,
+            enforce_primary_key_uniqueness=True,
+            position_delete_for_multiple_data_files=True,
+            max_parallel_data_file_download=10,
+            filesystem=s3_file_system,
+            s3_client_kwargs={},
+            task_memory=TASK_MEMORY_BYTES,
+        )
+        convert_inputs.append(convert_input)
+    return convert_inputs
+
+
+def process_convert_result(convert_result: Any) -> Tuple[List[Any], List[Any]]:
+    """Helper function to process convert results
+
+    Args:
+        convert_result: The result from convert_session
+
+    Returns:
+        Tuple[List[Any], List[Any]]: Lists of files to be deleted and added
+    """
+    to_be_deleted_files_list = []
+    to_be_added_files_list = []
+    if convert_result.to_be_deleted_files:
+        to_be_deleted_files_list.extend(convert_result.to_be_deleted_files.values())
+    if convert_result.to_be_added_files:
+        to_be_added_files_list.extend(convert_result.to_be_added_files)
+    return to_be_deleted_files_list, to_be_added_files_list
+
+
+def verify_result(result, expected_result, verify_pos_index=False):
+    """Verify the result matches the expected result.
+
+    Args:
+        result: The result to verify
+        expected_result: The expected result
+        verify_pos_index: Whether to verify position values for primary keys
+    """
+    if "primary_keys" in expected_result and "primary_key" in result:
+        # Single key case
+        assert set(result["primary_key"]) == set(expected_result["primary_keys"])
+        if verify_pos_index and "pk_to_pos" in expected_result:
+            for index in range(len(result["primary_key"])):
+                assert (
+                    result["pos"][index]
+                    == expected_result["pk_to_pos"][result["primary_key"][index]]
+                )
+    elif "pk_tuples" in expected_result:
+        pk_combined_res = []
+        for pk1, pk2 in zip(
+            result["primary_key1"],
+            result["primary_key2"],
+        ):
+            pk_combined_res.append((pk1, pk2))
+
+        # Multi-key case
+        assert set(pk_combined_res) == set(expected_result["pk_tuples"])
+    else:
+        assert set(result) == set(expected_result["primary_keys"])
+
+
+def verify_spark_read_results(spark, identifier, expected_result):
+    spark_read_pos_delete = spark.sql(f"select * from {identifier}").collect()
+    all_pk = [
+        spark_read_pos_delete[row_idx][1]
+        for row_idx in range(len(spark_read_pos_delete))
+    ]
+    verify_result(all_pk, expected_result, verify_pos_index=False)
+
+
+def get_file_prefix(tbl):
+    """Get the file prefix from a table's data files.
+
+    Args:
+        tbl: The table to get the file prefix from
+
+    Returns:
+        str: The file prefix
+    """
+    df = tbl.inspect.entries()
+    data_files = df.to_pydict()["data_file"]
+    file_link = data_files[0]["file_path"]
+    file_prefix = "/".join(file_link.split("/")[:-1])
+    return file_prefix.split("//")[1]
+
+
+# Test cases configuration
+TEST_CASES = [
+    {
+        "name": "single_key_drop_duplicates",
+        "table_name": "table_converter_ray_drop_duplicates_success",
+        "schema": "base_schema",
+        "identifier_fields": ["primary_key"],
+        "mock_data": [
+            {
+                "primary_key": ["pk1", "pk2", "pk3"],
+                "file_path": ["path1", "path2", "path3"],
+                "pos": [1, 2, 3],
+            },
+            {
+                "primary_key": ["pk1", "pk2", "pk3"],
+                "file_path": ["path1", "path2", "path3"],
+                "pos": [4, 5, 6],
+            },
+            {
+                "primary_key": ["pk4", "pk2", "pk3"],
+                "file_path": ["path4", "path2", "path3"],
+                "pos": [7, 8, 9],
+            },
+        ],
+        "expected_result": {
+            "primary_keys": ["pk1", "pk2", "pk3", "pk4"],
+            "pk_to_pos": {"pk1": 4, "pk2": 8, "pk3": 9, "pk4": 7},
+        },
+    },
+    {
+        "name": "multi_key_drop_duplicates",
+        "table_name": "table_converter_ray_pos_delete_multiple_identifier_fields",
+        "schema": "multi_key_schema_without_file_path",
+        "identifier_fields": ["primary_key1", "primary_key2"],
+        "mock_data": [
+            {"primary_key1": ["pk1", "pk2", "pk3"], "primary_key2": [1, 2, 3]},
+            {"primary_key1": ["pk1", "pk2", "pk3"], "primary_key2": [1, 2, 3]},
+            {"primary_key1": ["pk4", "pk2", "pk3"], "primary_key2": [1, 3, 4]},
+        ],
+        "expected_result": {
+            "pk_tuples": [
+                ("pk1", 1),
+                ("pk2", 2),
+                ("pk2", 3),
+                ("pk3", 3),
+                ("pk3", 4),
+                ("pk4", 1),
+            ]
+        },
+    },
+    {
+        "name": "equality_delete",
+        "table_name": "table_converter_ray_equality_delete_success",
+        "schema": "base_schema_without_metadata",
+        "identifier_fields": ["primary_key"],
+        "mock_data": [
+            {"primary_key": ["pk1", "pk2", "pk3"]},
+            {"primary_key": ["pk1", "pk2", "pk3"]},
+            {"primary_key": ["pk4", "pk2", "pk3"]},
+        ],
+        "equality_delete_data_mock": [{"primary_key": ["pk1"]}],
+        "equality_delete_data": pa.Table.from_arrays(["pk1"], names=["primary_key"]),
+        "verify_spark_read": True,
+        "expected_result": {"primary_keys": ["pk2", "pk3", "pk4"]},
+    },
+    {
+        "name": "position_delete",
+        "table_name": "table_converter_ray_position_delete_success",
+        "schema": "base_schema_without_metadata",
+        "identifier_fields": ["primary_key"],
+        "mock_data": [
+            {"primary_key": ["pk1", "pk2", "pk3"]},
+            {"primary_key": ["pk1", "pk2", "pk3"]},
+            {"primary_key": ["pk4", "pk2", "pk3"]},
+        ],
+        "expected_result": {"primary_keys": ["pk1", "pk2", "pk3", "pk4"]},
+    },
+    {
+        "name": "position_delete_read_by_spark",
+        "table_name": "table_converter_ray_pos_delete_read_by_spark_success",
+        "schema": "base_schema_without_metadata",
+        "identifier_fields": ["primary_key"],
+        "mock_data": [
+            {"primary_key": ["pk1", "pk2", "pk3"]},
+            {"primary_key": ["pk1", "pk2", "pk3"]},
+            {"primary_key": ["pk4", "pk2", "pk3"]},
+        ],
+        "expected_result": {"primary_keys": ["pk1", "pk2", "pk3", "pk4"]},
+        "verify_spark_read": True,
+        "expected_spark_count": 4,
+    },
+]
+
+
+@pytest.mark.parametrize("test_case", TEST_CASES)
+@pytest.mark.integration
+def test_converter(
+    test_case: Dict[str, Any],
+    spark,
+    session_catalog: RestCatalog,
+    setup_ray_cluster,
+    mocker,
+    request,
+) -> None:
+    """
+    Parameterized test for converter functionality.
+    Tests drop duplicates, equality delete, and position delete scenarios.
+    """
+    # Get schema fixture based on test case
+    schema = request.getfixturevalue(test_case["schema"])
+
+    # Create test table
+    identifier = create_test_table(
+        session_catalog=session_catalog,
+        namespace="default",
+        table_name=test_case["table_name"],
+        schema=schema,
+        partition_spec=request.getfixturevalue("base_partition_spec"),
+        properties=request.getfixturevalue("table_properties"),
+    )
+
+    # Insert test data
+    insert_test_data(spark, identifier, test_case)
+
+    # Get files and create convert input
+    tbl = session_catalog.load_table(identifier)
+    data_file_dict, equality_delete_dict, pos_delete_dict = fetch_all_bucket_files(tbl)
+
+    # Handle equality delete if present
+    if "equality_delete_data" in test_case:
+        tbl = session_catalog.load_table(identifier)
+        file_prefix = get_file_prefix(tbl)
+        partition_value = Record(number_partitioned=0)
+
+        # Note: Just upload to S3 to mock input data here.
+        # NOT committing to Iceberg metadata as equality delete write path not implemented in Pyiceberg/Spark.
+        equality_file_list = commit_equality_delete_to_table(
+            table=tbl,
+            partition_value=partition_value,
+            equality_delete_table=test_case["equality_delete_data"],
+            file_link_prefix=file_prefix,
+        )
+        # Mock equality delete input to converter with latest file sequence, so equality delete can be applied to all data before
+        equality_delete_dict = defaultdict()
+        equality_delete_dict[partition_value] = [(4, equality_file_list[0])]
+
+    convert_input_files_for_all_buckets = group_all_files_to_each_bucket(
+        data_file_dict=data_file_dict,
+        equality_delete_dict=equality_delete_dict,
+        pos_delete_dict=pos_delete_dict,
+    )
+
+    s3_file_system = get_s3_file_system()
+    convert_inputs = create_convert_input(
+        tbl, convert_input_files_for_all_buckets, test_case, s3_file_system
+    )
+
+    # Create and set up mock data
+    mock_data_tables = create_mock_data_tables(test_case)
+    download_data_mock = mocker.patch(
+        "deltacat.compute.converter.utils.io.daft_read_parquet"
+    )
+
+    download_data_mock.side_effect = mock_data_tables
+
+    # Run conversion
+    convert_ref = convert.remote(convert_inputs[0])
+    convert_result = ray.get(convert_ref)
+
+    # Process results
+    to_be_deleted_files_list, to_be_added_files_list = process_convert_result(
+        convert_result
+    )
+
+    if not to_be_deleted_files_list:
+        # Commit changes
+        commit_append_snapshot(
+            iceberg_table=tbl,
+            new_position_delete_files=to_be_added_files_list,
+        )
+    else:
+        commit_replace_snapshot(
+            iceberg_table=tbl,
+            to_be_deleted_files=to_be_deleted_files_list[0],
+            new_position_delete_files=to_be_added_files_list,
+        )
+    tbl.refresh()
+
+    # Verify results
+    pyiceberg_scan_table_rows = tbl.scan().to_arrow().to_pydict()
+
+    # Verify Spark read if required
+    if test_case.get("verify_spark_read", False):
+        verify_spark_read_results(spark, identifier, test_case["expected_result"])
+    else:
+        verify_result(
+            pyiceberg_scan_table_rows,
+            test_case["expected_result"],
+            verify_pos_index=test_case.get("verify_pos_index", False),
+        )
+
+
+def test_converter_session_with_local_filesystem_and_duplicate_ids(
+    setup_ray_cluster,
+) -> None:
+    """
+    Test converter_session functionality with local PyArrow filesystem using duplicate IDs.
+    This test simulates the pattern where duplicate IDs represent updates to existing records.
+    The converter should merge these updates by creating position delete files.
+    """
+    with temp_dir_autocleanup() as temp_catalog_dir:
+        # Create warehouse directory
+        warehouse_path = os.path.join(temp_catalog_dir, "iceberg_warehouse")
+        os.makedirs(warehouse_path, exist_ok=True)
+
+        # Set up local in-memory catalog
+        local_catalog = load_catalog(
+            "local_sql_catalog",
+            **{
+                "type": "in-memory",
+                "warehouse": warehouse_path,
+            },
+        )
+
+        # Create local PyArrow filesystem
+        import pyarrow.fs as pafs
+
+        local_filesystem = pafs.LocalFileSystem()
+
+        # Define schema (id, name, value, version)
+        schema = Schema(
+            NestedField(field_id=1, name="id", field_type=LongType(), required=True),
+            NestedField(
+                field_id=2, name="name", field_type=StringType(), required=False
+            ),
+            NestedField(
+                field_id=3, name="value", field_type=LongType(), required=False
+            ),
+            NestedField(
+                field_id=4, name="version", field_type=LongType(), required=False
+            ),
+            schema_id=0,
+        )
+
+        # Create table properties for merge-on-read
+        properties = {
+            "write.format.default": "parquet",
+            "write.delete.mode": "merge-on-read",
+            "write.update.mode": "merge-on-read",
+            "write.merge.mode": "merge-on-read",
+            "format-version": "2",
+        }
+
+        # Create the table
+        table_identifier = "default.test_duplicate_ids"
+        try:
+            local_catalog.create_namespace("default")
+        except NamespaceAlreadyExistsError:
+            pass  # Namespace may already exist
+        try:
+            local_catalog.drop_table(table_identifier)
+        except NoSuchTableError:
+            pass  # Table may not exist
+
+        local_catalog.create_table(
+            table_identifier,
+            schema=schema,
+            properties=properties,
+        )
+        tbl = local_catalog.load_table(table_identifier)
+
+        # Set the name mapping property so Iceberg can read parquet files without field IDs
+        with tbl.transaction() as tx:
+            tx.set_properties(
+                **{"schema.name-mapping.default": schema.name_mapping.model_dump_json()}
+            )
+
+        # Step 1: Write initial data
+        # Create PyArrow table with explicit schema to match Iceberg schema
+        arrow_schema = schema_to_pyarrow(schema)
+
+        initial_data = pa.table(
+            {
+                "id": [1, 2, 3, 4],
+                "name": ["Alice", "Bob", "Charlie", "David"],
+                "value": [100, 200, 300, 400],
+                "version": [1, 1, 1, 1],
+            },
+            schema=arrow_schema,
+        )
+
+        # Step 2: Write additional data
+        additional_data = pa.table(
+            {
+                "id": [5, 6, 7, 8],
+                "name": ["Eve", "Frank", "Grace", "Henry"],
+                "value": [500, 600, 700, 800],
+                "version": [1, 1, 1, 1],
+            },
+            schema=arrow_schema,
+        )
+
+        # Step 3: Write updates to existing records (this creates duplicates by ID)
+        # These should overwrite the original records with same IDs
+        updated_data = pa.table(
+            {
+                "id": [2, 3, 9],  # IDs 2 and 3 are duplicates, 9 is new
+                "name": [
+                    "Robert",
+                    "Charles",
+                    "Ivan",
+                ],  # Updated names for Bob and Charlie
+                "value": [201, 301, 900],  # Updated values
+                "version": [2, 2, 1],  # Higher version numbers for updates
+            },
+            schema=arrow_schema,
+        )
+
+        # Write all data to separate parquet files to simulate multiple writes
+        data_files_to_commit = []
+
+        for i, data in enumerate([initial_data, additional_data, updated_data]):
+            data_file_path = os.path.join(warehouse_path, f"data_{i}.parquet")
+            pq.write_table(data, data_file_path)
+
+            # Create DataFile objects for Iceberg
+            parquet_metadata = pq.read_metadata(data_file_path)
+            file_size = os.path.getsize(data_file_path)
+
+            # Check schema compatibility
+            _check_pyarrow_schema_compatible(
+                schema, parquet_metadata.schema.to_arrow_schema()
+            )
+
+            # Calculate statistics
+            statistics = data_file_statistics_from_parquet_metadata(
+                parquet_metadata=parquet_metadata,
+                stats_columns=compute_statistics_plan(schema, tbl.metadata.properties),
+                parquet_column_mapping=parquet_path_to_id_mapping(schema),
+            )
+
+            data_file = DataFile(
+                content=DataFileContent.DATA,
+                file_path=data_file_path,
+                file_format=FileFormat.PARQUET,
+                partition={},  # No partitioning
+                file_size_in_bytes=file_size,
+                sort_order_id=None,
+                spec_id=tbl.metadata.default_spec_id,
+                key_metadata=None,
+                equality_ids=None,
+                **statistics.to_serialized_dict(),
+            )
+            data_files_to_commit.append(data_file)
+
+        # Commit all data files to the table
+        with tbl.transaction() as tx:
+            with tx.update_snapshot().fast_append() as update_snapshot:
+                for data_file in data_files_to_commit:
+                    update_snapshot.append_data_file(data_file)
+
+        tbl.refresh()
+
+        # Verify we have duplicate IDs before conversion
+        initial_scan = tbl.scan().to_arrow().to_pydict()
+        print(f"Before conversion - Records with IDs: {sorted(initial_scan['id'])}")
+
+        # There should be duplicates: [1, 2, 2, 3, 3, 4, 5, 6, 7, 8, 9]
+        expected_duplicate_ids = [1, 2, 2, 3, 3, 4, 5, 6, 7, 8, 9]
+        assert (
+            sorted(initial_scan["id"]) == expected_duplicate_ids
+        ), f"Expected duplicate IDs {expected_duplicate_ids}, got {sorted(initial_scan['id'])}"
+
+        # Now call converter_session to convert equality deletes to position deletes
+        converter_params = ConverterSessionParams.of(
+            {
+                "catalog": local_catalog,
+                "iceberg_table_name": table_identifier,
+                "iceberg_warehouse_bucket_name": warehouse_path,  # Local warehouse path
+                "merge_keys": ["id"],  # Use ID as the merge key
+                "enforce_primary_key_uniqueness": True,
+                "task_max_parallelism": 1,  # Single task for local testing
+                "filesystem": local_filesystem,
+                "location_provider_prefix_override": None,  # Let the system auto-generate the prefix
+            }
+        )
+
+        print(f"Running converter_session with local filesystem...")
+        print(f"Warehouse path: {warehouse_path}")
+        print(f"Merge keys: ['id']")
+        print(f"Enforce uniqueness: True")
+
+        # Run the converter
+        converter_session(params=converter_params)
+
+        # Refresh table and scan again
+        tbl.refresh()
+        final_scan = tbl.scan().to_arrow().to_pydict()
+
+        print(f"After conversion - Records with IDs: {sorted(final_scan['id'])}")
+        print(f"Final data: {final_scan}")
+
+        # Verify position delete files were created by checking table metadata
+        latest_snapshot = tbl.metadata.current_snapshot()
+        if latest_snapshot:
+            manifests = latest_snapshot.manifests(tbl.io)
+            position_delete_files = []
+
+            for manifest in manifests:
+                entries = manifest.fetch_manifest_entry(tbl.io)
+                for entry in entries:
+                    if entry.data_file.content == DataFileContent.POSITION_DELETES:
+                        position_delete_files.append(entry.data_file.file_path)
+
+            print(f"Position delete files found: {position_delete_files}")
+            assert (
+                len(position_delete_files) > 0
+            ), "No position delete files were created by converter_session"
+
+        # Verify the final result has unique IDs (duplicates should be resolved)
+        # Expected: Latest values for each ID based on the updates
+        expected_unique_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9]  # All unique IDs
+        actual_ids = sorted(final_scan["id"])
+
+        print(f"Expected unique IDs: {expected_unique_ids}")
+        print(f"Actual IDs after conversion: {actual_ids}")
+
+        assert (
+            actual_ids == expected_unique_ids
+        ), f"Expected unique IDs {expected_unique_ids}, got {actual_ids}"
+
+        # Verify the updated values are present (higher version should win)
+        final_data_by_id = {}
+        for i, id_val in enumerate(final_scan["id"]):
+            final_data_by_id[id_val] = {
+                "name": final_scan["name"][i],
+                "value": final_scan["value"][i],
+                "version": final_scan["version"][i],
+            }
+
+        # Check that ID 2 has updated value (Robert, 201, version 2)
+        assert (
+            final_data_by_id[2]["name"] == "Robert"
+        ), f"ID 2 should have updated name 'Robert', got '{final_data_by_id[2]['name']}'"
+        assert (
+            final_data_by_id[2]["value"] == 201
+        ), f"ID 2 should have updated value 201, got {final_data_by_id[2]['value']}"
+        assert (
+            final_data_by_id[2]["version"] == 2
+        ), f"ID 2 should have version 2, got {final_data_by_id[2]['version']}"
+
+        # Check that ID 3 has updated value (Charles, 301, version 2)
+        assert (
+            final_data_by_id[3]["name"] == "Charles"
+        ), f"ID 3 should have updated name 'Charles', got '{final_data_by_id[3]['name']}'"
+        assert (
+            final_data_by_id[3]["value"] == 301
+        ), f"ID 3 should have updated value 301, got {final_data_by_id[3]['value']}"
+        assert (
+            final_data_by_id[3]["version"] == 2
+        ), f"ID 3 should have version 2, got {final_data_by_id[3]['version']}"
+
+        # Check that new ID 9 is present
+        assert (
+            final_data_by_id[9]["name"] == "Ivan"
+        ), f"ID 9 should have name 'Ivan', got '{final_data_by_id[9]['name']}'"
+        assert (
+            final_data_by_id[9]["value"] == 900
+        ), f"ID 9 should have value 900, got {final_data_by_id[9]['value']}"
+
+        print(f"✅ Test completed successfully!")
+        print(
+            f"✅ Position delete files were created: {len(position_delete_files)} files"
+        )
+        print(f"✅ Duplicate IDs were resolved correctly")
+        print(
+            f"✅ Updated values were applied (ID 2: Bob->Robert, ID 3: Charlie->Charles)"
+        )
+        print(f"✅ Final table has {len(actual_ids)} unique records")
+        print(f"✅ Temporary warehouse cleaned up at: {temp_catalog_dir}")