deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367) hide show
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,299 @@
1
+ from typing import List, Dict
2
+ from collections import defaultdict
3
+ import uuid
4
+ from pyiceberg.table import Table
5
+ from pyiceberg.table.metadata import TableMetadata
6
+ from pyiceberg.table.snapshots import (
7
+ Operation,
8
+ )
9
+ from pyiceberg.manifest import (
10
+ DataFile,
11
+ DataFileContent,
12
+ ManifestContent,
13
+ ManifestEntry,
14
+ ManifestEntryStatus,
15
+ ManifestFile,
16
+ write_manifest,
17
+ )
18
+ import itertools
19
+ from pyiceberg.utils.concurrent import ExecutorFactory
20
+ from pyiceberg.table.update.snapshot import _SnapshotProducer, UpdateSnapshot
21
+
22
+
23
def replace_delete_files_override(
    update_snapshot: UpdateSnapshot,
) -> "_ReplaceDeleteFilesOverride":
    """Create the overwrite-style producer used to replace delete files.

    The producer borrows the transaction, file IO and snapshot properties
    carried by ``update_snapshot`` and is tagged with a newly generated
    commit UUID.

    Args:
        update_snapshot: Snapshot update whose private state is reused.

    Returns:
        A ``_ReplaceDeleteFilesOverride`` configured with
        ``Operation.OVERWRITE``.
    """
    producer_kwargs = dict(
        commit_uuid=uuid.uuid4(),
        operation=Operation.OVERWRITE,
        transaction=update_snapshot._transaction,
        io=update_snapshot._io,
        snapshot_properties=update_snapshot._snapshot_properties,
    )
    return _ReplaceDeleteFilesOverride(**producer_kwargs)
34
+
35
+
36
class _ReplaceDeleteFilesOverride(_SnapshotProducer):
    """Snapshot producer that adds delete files and drops selected equality deletes.

    Manifests written for newly added files have their ``content`` callable
    replaced by ``writer_content``, which returns ``ManifestContent.DELETES``.
    """

    def _manifests(self) -> List[ManifestFile]:
        """Build the manifests for the snapshot being produced.

        Three tasks are submitted to the shared executor: writing a manifest
        for added files, collecting carried-over manifests, and writing
        manifests for deleted entries. Their results are concatenated in the
        order added + existing + deleted and passed to ``_process_manifests``.
        """

        def _write_added_manifest() -> List[ManifestFile]:
            # Nothing was appended: no manifest to write.
            if not self._added_data_files:
                return []

            metadata = self._transaction.table_metadata
            with write_manifest(
                format_version=metadata.format_version,
                spec=metadata.spec(),
                schema=metadata.schema(),
                output_file=self.new_manifest_output(),
                snapshot_id=self._snapshot_id,
            ) as writer:
                for added_file in self._added_data_files:
                    added_entry = ManifestEntry(
                        status=ManifestEntryStatus.ADDED,
                        snapshot_id=self._snapshot_id,
                        sequence_number=None,
                        file_sequence_number=None,
                        data_file=added_file,
                    )
                    writer.add(added_entry)
                # Shadow the writer's ``content`` attribute with our own
                # callable (returns DELETES). NOTE(review): presumably read by
                # ``to_manifest_file()`` below - confirm against pyiceberg.
                writer.content = self.writer_content
            return [writer.to_manifest_file()]

        def _write_delete_manifest() -> List[ManifestFile]:
            # Check if we need to mark the files as deleted
            removed_entries = self._deleted_entries()
            if len(removed_entries) == 0:
                return []

            # One manifest is written per partition spec id.
            entries_by_spec: Dict[int, List[ManifestEntry]] = defaultdict(list)
            for removed_entry in removed_entries:
                entries_by_spec[removed_entry.data_file.spec_id].append(removed_entry)

            written_manifests = []
            for spec_id, spec_entries in entries_by_spec.items():
                metadata = self._transaction.table_metadata
                with write_manifest(
                    format_version=metadata.format_version,
                    spec=metadata.specs()[spec_id],
                    schema=metadata.schema(),
                    output_file=self.new_manifest_output(),
                    snapshot_id=self._snapshot_id,
                ) as writer:
                    for spec_entry in spec_entries:
                        writer.add_entry(spec_entry)
                written_manifests.append(writer.to_manifest_file())
            return written_manifests

        pool = ExecutorFactory.get_or_create()

        added_future = pool.submit(_write_added_manifest)
        existing_future = pool.submit(self._existing_manifests)
        deleted_future = pool.submit(_write_delete_manifest)

        combined = (
            added_future.result() + existing_future.result() + deleted_future.result()
        )
        return self._process_manifests(combined)

    def writer_content(self) -> ManifestContent:
        """Return the manifest content type used for added-file manifests."""
        return ManifestContent.DELETES

    def _existing_manifests(self) -> List[ManifestFile]:
        """Collect the parent snapshot's manifests that are carried over.

        A manifest is kept when it reports added files, reports existing
        files, or was added by the snapshot currently being produced. When
        there is no parent snapshot the result is empty.

        Raises:
            ValueError: If the parent snapshot id is set but cannot be
                resolved in the table metadata.
        """
        if self._parent_snapshot_id is None:
            return []

        parent_snapshot = self._transaction.table_metadata.snapshot_by_id(
            self._parent_snapshot_id
        )
        if parent_snapshot is None:
            raise ValueError(
                f"Snapshot could not be found: {self._parent_snapshot_id}"
            )

        return [
            manifest
            for manifest in parent_snapshot.manifests(io=self._io)
            if manifest.has_added_files()
            or manifest.has_existing_files()
            or manifest.added_snapshot_id == self._snapshot_id
        ]

    def _deleted_entries(self) -> List[ManifestEntry]:
        """Return DELETED-status entries for the equality deletes being removed.

        Every manifest of the parent snapshot is scanned on the shared
        executor. An entry is selected when its data file has
        ``DataFileContent.EQUALITY_DELETES`` content and is contained in
        ``self._deleted_data_files``; it is re-emitted with status DELETED
        while keeping its snapshot id and sequence numbers. When there is no
        parent snapshot the result is empty.

        Raises:
            ValueError: If the parent snapshot id is set but cannot be
                resolved in the table metadata.
        """
        if self._parent_snapshot_id is None:
            return []

        parent_snapshot = self._transaction.table_metadata.snapshot_by_id(
            self._parent_snapshot_id
        )
        if parent_snapshot is None:
            # This should never happen since you cannot overwrite an empty table
            raise ValueError(
                f"Could not find the previous snapshot: {self._parent_snapshot_id}"
            )

        def _get_entries(manifest: ManifestFile) -> List[ManifestEntry]:
            marked_deleted = []
            for entry in manifest.fetch_manifest_entry(self._io, discard_deleted=True):
                if (
                    entry.data_file.content == DataFileContent.EQUALITY_DELETES
                    and entry.data_file in self._deleted_data_files
                ):
                    marked_deleted.append(
                        ManifestEntry(
                            status=ManifestEntryStatus.DELETED,
                            snapshot_id=entry.snapshot_id,
                            sequence_number=entry.sequence_number,
                            file_sequence_number=entry.file_sequence_number,
                            data_file=entry.data_file,
                        )
                    )
            return marked_deleted

        pool = ExecutorFactory.get_or_create()
        per_manifest_entries = pool.map(
            _get_entries, parent_snapshot.manifests(self._io)
        )
        return list(itertools.chain.from_iterable(per_manifest_entries))
164
+
165
+
166
def commit_append_snapshot(
    iceberg_table: Table, new_position_delete_files: List[DataFile]
) -> TableMetadata:
    """Append delete files to ``iceberg_table`` and commit the transaction.

    If the table metadata has no name mapping, the current schema's name
    mapping is stored under ``schema.name-mapping.default`` in the same
    transaction before the files are appended.

    Args:
        iceberg_table: Table to open the transaction on.
        new_position_delete_files: Files to append. A falsy value (empty list
            or ``None``) appends nothing; the snapshot producer is still
            entered and exited.

    Returns:
        The ``metadata`` attribute of the value returned by
        ``commit_transaction()``.

    Raises:
        Exception: Any error raised while building or committing the
            transaction propagates unchanged.
    """
    tx = iceberg_table.transaction()

    if iceberg_table.metadata.name_mapping() is None:
        tx.set_properties(
            **{
                "schema.name-mapping.default": tx.table_metadata.schema().name_mapping.model_dump_json()
            }
        )

    # The previous ``except Exception as e: raise e`` wrapper re-raised every
    # error untouched, so it is dropped; failures still skip the commit below.
    with append_delete_files_override(tx.update_snapshot()) as append_snapshot:
        for data_file in new_position_delete_files or []:
            append_snapshot.append_data_file(data_file)

    return tx.commit_transaction().metadata
185
+
186
+
187
def append_delete_files_override(
    update_snapshot: UpdateSnapshot,
) -> "_AppendDeleteFilesOverride":
    """Create the append-style producer used to add delete files.

    The producer borrows the transaction, file IO and snapshot properties
    carried by ``update_snapshot`` and is tagged with a newly generated
    commit UUID.

    Args:
        update_snapshot: Snapshot update whose private state is reused.

    Returns:
        An ``_AppendDeleteFilesOverride`` configured with
        ``Operation.APPEND``.
    """
    producer_kwargs = dict(
        commit_uuid=uuid.uuid4(),
        operation=Operation.APPEND,
        transaction=update_snapshot._transaction,
        io=update_snapshot._io,
        snapshot_properties=update_snapshot._snapshot_properties,
    )
    return _AppendDeleteFilesOverride(**producer_kwargs)
198
+
199
+
200
class _AppendDeleteFilesOverride(_SnapshotProducer):
    """Snapshot producer that appends delete files without removing anything.

    Manifests written for newly added files have their ``content`` callable
    replaced by ``writer_content``, which returns ``ManifestContent.DELETES``.
    """

    def _manifests(self) -> List[ManifestFile]:
        """Build the manifests for the snapshot being produced.

        Writing the added-file manifest and collecting the carried-over
        manifests are submitted to the shared executor; their results are
        concatenated (added first) and passed to ``_process_manifests``.
        """

        def _write_added_manifest() -> List[ManifestFile]:
            # Nothing was appended: no manifest to write.
            if not self._added_data_files:
                return []

            metadata = self._transaction.table_metadata
            with write_manifest(
                format_version=metadata.format_version,
                spec=metadata.spec(),
                schema=metadata.schema(),
                output_file=self.new_manifest_output(),
                snapshot_id=self._snapshot_id,
            ) as writer:
                for added_file in self._added_data_files:
                    added_entry = ManifestEntry(
                        status=ManifestEntryStatus.ADDED,
                        snapshot_id=self._snapshot_id,
                        sequence_number=None,
                        file_sequence_number=None,
                        data_file=added_file,
                    )
                    writer.add(added_entry)
                # Shadow the writer's ``content`` attribute with our own
                # callable (returns DELETES). NOTE(review): presumably read by
                # ``to_manifest_file()`` below - confirm against pyiceberg.
                writer.content = self.writer_content
            return [writer.to_manifest_file()]

        pool = ExecutorFactory.get_or_create()

        added_future = pool.submit(_write_added_manifest)
        existing_future = pool.submit(self._existing_manifests)

        combined = added_future.result() + existing_future.result()
        return self._process_manifests(combined)

    def writer_content(self) -> ManifestContent:
        """Return the manifest content type used for added-file manifests."""
        return ManifestContent.DELETES

    def _existing_manifests(self) -> List[ManifestFile]:
        """Collect the parent snapshot's manifests that are carried over.

        A manifest is kept when it reports added files, reports existing
        files, or was added by the snapshot currently being produced. When
        there is no parent snapshot the result is empty.

        Raises:
            ValueError: If the parent snapshot id is set but cannot be
                resolved in the table metadata.
        """
        if self._parent_snapshot_id is None:
            return []

        parent_snapshot = self._transaction.table_metadata.snapshot_by_id(
            self._parent_snapshot_id
        )
        if parent_snapshot is None:
            raise ValueError(
                f"Snapshot could not be found: {self._parent_snapshot_id}"
            )

        return [
            manifest
            for manifest in parent_snapshot.manifests(io=self._io)
            if manifest.has_added_files()
            or manifest.has_existing_files()
            or manifest.added_snapshot_id == self._snapshot_id
        ]

    def _deleted_entries(self) -> List[ManifestEntry]:
        """Report the manifest entries removed by this snapshot.

        An append never deletes anything, so the list is always empty.
        """
        return []
272
+
273
+
274
def commit_replace_snapshot(
    iceberg_table: Table,
    new_position_delete_files: List[DataFile],
    to_be_deleted_files: List[DataFile],
) -> TableMetadata:
    """Add new delete files, drop old ones, and commit the transaction.

    If the table metadata has no name mapping, the current schema's name
    mapping is stored under ``schema.name-mapping.default`` in the same
    transaction before the snapshot is updated.

    Args:
        iceberg_table: Table to open the transaction on.
        new_position_delete_files: Files to append. A falsy value (empty list
            or ``None``) appends nothing.
        to_be_deleted_files: Files to mark for deletion. A falsy value
            deletes nothing.

    Returns:
        The ``metadata`` attribute of the value returned by
        ``commit_transaction()``.

    Raises:
        Exception: Any error raised while building or committing the
            transaction propagates unchanged.
    """
    tx = iceberg_table.transaction()

    if iceberg_table.metadata.name_mapping() is None:
        tx.set_properties(
            **{
                "schema.name-mapping.default": tx.table_metadata.schema().name_mapping.model_dump_json()
            }
        )

    # The previous ``except Exception as e: raise e`` wrapper re-raised every
    # error untouched, so it is dropped; failures still skip the commit below.
    with replace_delete_files_override(
        tx.update_snapshot()
    ) as replace_delete_snapshot:
        # Appends are registered before deletions, as in the original order.
        for data_file in new_position_delete_files or []:
            replace_delete_snapshot.append_data_file(data_file)
        for delete_file in to_be_deleted_files or []:
            replace_delete_snapshot.delete_data_file(delete_file)

    return tx.commit_transaction().metadata
@@ -0,0 +1,366 @@
1
+ import pyarrow.compute as pc
2
+
3
+ import deltacat.compute.converter.utils.iceberg_columns as sc
4
+ import pyarrow as pa
5
+
6
+ from collections import defaultdict
7
+ import ray
8
+ import logging
9
+ from deltacat.compute.converter.model.convert_input import ConvertInput
10
+ from deltacat.compute.converter.steps.dedupe import dedupe_data_files
11
+ from deltacat.compute.converter.utils.io import write_sliced_table
12
+ from deltacat.compute.converter.utils.io import (
13
+ download_data_table_and_append_iceberg_columns,
14
+ )
15
+ from deltacat.compute.converter.utils.converter_session_utils import (
16
+ partition_value_record_to_partition_value_string,
17
+ sort_data_files_maintaining_order,
18
+ )
19
+ from deltacat.compute.converter.pyiceberg.overrides import (
20
+ parquet_files_dict_to_iceberg_data_files,
21
+ )
22
+ from deltacat.compute.converter.model.convert_result import ConvertResult
23
+ from pyiceberg.manifest import DataFileContent
24
+ from deltacat import logs
25
+ from fsspec import AbstractFileSystem
26
+ from typing import List, Dict, Tuple, Optional, Any
27
+ from deltacat.utils.resources import get_current_process_peak_memory_usage_in_bytes
28
+ from deltacat.compute.converter.model.convert_input_files import (
29
+ DataFileList,
30
+ DataFileListGroup,
31
+ )
32
+
33
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
34
+
35
+
36
@ray.remote
def convert(convert_input: ConvertInput) -> ConvertResult:
    """Ray task that produces position delete files for one group of files.

    Steps performed, in order:

    1. If equality delete files apply, convert them into position delete
       records via ``compute_pos_delete_with_limited_parallelism``.
    2. If ``enforce_primary_key_uniqueness`` is set, dedupe the data files
       not already downloaded in step 1 (plus the rows left over from
       step 1) and collect the resulting position delete records.
    3. Write all position delete records as parquet under the (optionally
       partition-suffixed) warehouse prefix and wrap them as Iceberg files.
    4. Report the files to add, the equality delete files to remove, and
       size/memory statistics in a ``ConvertResult``.

    Args:
        convert_input: Bundle of all task parameters (input files, table
            metadata/IO, identifier fields, filesystem, memory budget, ...).

    Returns:
        The ``ConvertResult`` describing the outcome of this task.

    Raises:
        NotImplementedError: If ``position_delete_for_multiple_data_files``
            is false or ``compact_previous_position_delete_files`` is true.
    """
    convert_input_files = convert_input.convert_input_files
    convert_task_index = convert_input.convert_task_index
    iceberg_table_warehouse_prefix = convert_input.iceberg_table_warehouse_prefix
    identifier_fields = convert_input.identifier_fields
    table_io = convert_input.table_io
    table_metadata = convert_input.table_metadata
    compact_previous_position_delete_files = (
        convert_input.compact_previous_position_delete_files
    )
    position_delete_for_multiple_data_files = (
        convert_input.position_delete_for_multiple_data_files
    )
    max_parallel_data_file_download = convert_input.max_parallel_data_file_download
    filesystem = convert_input.filesystem
    s3_client_kwargs = convert_input.s3_client_kwargs
    task_memory = convert_input.task_memory

    if not position_delete_for_multiple_data_files:
        raise NotImplementedError(
            "Distributed file level position delete compute is not supported yet"
        )
    if compact_previous_position_delete_files:
        raise NotImplementedError("Compact previous position delete not supported yet")

    logger.info(f"Starting convert task index: {convert_task_index}")

    applicable_data_files = convert_input_files.applicable_data_files
    applicable_equality_delete_files = (
        convert_input_files.applicable_equality_delete_files
    )

    all_data_files_for_this_bucket = convert_input_files.all_data_files_for_dedupe

    partition_value_str = partition_value_record_to_partition_value_string(
        convert_input_files.partition_value
    )
    partition_value = convert_input_files.partition_value

    if partition_value_str:
        iceberg_table_warehouse_prefix_with_partition = (
            f"{iceberg_table_warehouse_prefix}/{partition_value_str}"
        )
    else:
        iceberg_table_warehouse_prefix_with_partition = (
            f"{iceberg_table_warehouse_prefix}"
        )

    enforce_primary_key_uniqueness = convert_input.enforce_primary_key_uniqueness
    total_pos_delete_table = []
    data_table_after_converting_equality_delete = []
    if applicable_equality_delete_files:
        (
            pos_delete_after_converting_equality_delete,
            data_table_after_converting_equality_delete,
        ) = compute_pos_delete_with_limited_parallelism(
            data_files_list=applicable_data_files,
            identifier_columns=identifier_fields,
            equality_delete_files_list=applicable_equality_delete_files,
            iceberg_table_warehouse_prefix_with_partition=iceberg_table_warehouse_prefix_with_partition,
            convert_task_index=convert_task_index,
            max_parallel_data_file_download=max_parallel_data_file_download,
            s3_file_system=filesystem,
            s3_client_kwargs=s3_client_kwargs,
        )
        if pos_delete_after_converting_equality_delete:
            total_pos_delete_table.append(pos_delete_after_converting_equality_delete)

    # Dedupe statistics default to zero so the result can always be built,
    # including when primary key uniqueness is not enforced.
    data_file_to_dedupe_record_count = 0
    data_file_to_dedupe_size = 0
    dedupe_file_size_bytes = 0

    if enforce_primary_key_uniqueness:
        # Flatten the grouped data files that the equality delete conversion
        # already handled so they are not downloaded a second time.
        data_files_downloaded_during_convert = [
            file
            for file_list in (applicable_data_files or [])
            for file in file_list
        ]

        data_files_to_dedupe = get_additional_applicable_data_files(
            all_data_files=all_data_files_for_this_bucket,
            data_files_downloaded=data_files_downloaded_during_convert,
        )

        dedupe_file_size_bytes = sum(
            data_file.file_size_in_bytes for _, data_file in data_files_to_dedupe
        )
        logger.info(
            f"Total on-disk size of files to dedupe: {dedupe_file_size_bytes} bytes"
        )

        logger.info(
            f"[Convert task {convert_task_index}]: Got {len(data_files_to_dedupe)} files to dedupe."
        )

        (
            pos_delete_after_dedupe,
            data_file_to_dedupe_record_count,
            data_file_to_dedupe_size,
        ) = dedupe_data_files(
            data_file_to_dedupe=data_files_to_dedupe,
            identifier_columns=identifier_fields,
            remaining_data_table_after_convert=data_table_after_converting_equality_delete,
            merge_sort_column=sc._ORDERED_RECORD_IDX_COLUMN_NAME,
            s3_client_kwargs=s3_client_kwargs,
        )
        logger.info(
            f"[Convert task {convert_task_index}]: Dedupe produced {len(pos_delete_after_dedupe)} position delete records."
        )
        total_pos_delete_table.append(pos_delete_after_dedupe)

    # pa.concat_tables rejects an empty list, so only concatenate when at
    # least one position delete table was produced.
    if total_pos_delete_table:
        total_pos_delete = pa.concat_tables(total_pos_delete_table)
        position_delete_record_count = len(total_pos_delete)
        position_delete_in_memory_size = int(total_pos_delete.nbytes)
    else:
        total_pos_delete = None
        position_delete_record_count = 0
        position_delete_in_memory_size = 0

    logger.info(
        f"[Convert task {convert_task_index}]: Total position delete produced:{position_delete_record_count}"
    )

    to_be_added_files_list = []
    if position_delete_record_count:
        to_be_added_files_list_parquet = write_sliced_table(
            table=total_pos_delete,
            base_path=iceberg_table_warehouse_prefix_with_partition,
            table_writer_kwargs={},
            filesystem=filesystem,
        )

        to_be_added_files_dict = defaultdict()
        to_be_added_files_dict[partition_value] = to_be_added_files_list_parquet

        logger.info(
            f"[Convert task {convert_task_index}]: Produced {len(to_be_added_files_list_parquet)} position delete files."
        )
        file_content_type = DataFileContent.POSITION_DELETES
        to_be_added_files_list = parquet_files_dict_to_iceberg_data_files(
            io=table_io,
            table_metadata=table_metadata,
            files_dict=to_be_added_files_dict,
            file_content_type=file_content_type,
        )

    to_be_delete_files_dict = defaultdict()

    if applicable_equality_delete_files:
        # Each entry is indexed with [1] to pick out the file object that
        # is reported for deletion.
        to_be_delete_files_dict[partition_value] = [
            equality_delete_file[1]
            for equality_delete_list in applicable_equality_delete_files
            for equality_delete_file in equality_delete_list
        ]

    # The helper's name indicates the value is already expressed in bytes.
    peak_memory_usage_bytes = get_current_process_peak_memory_usage_in_bytes()
    memory_usage_percentage = (peak_memory_usage_bytes / task_memory) * 100

    logger.info(
        f"[Convert task {convert_task_index}]: Memory usage stats - "
        f"Peak memory usage: {peak_memory_usage_bytes} bytes, "
        f"Allocated task memory: {convert_input.task_memory} bytes, "
        f"Usage percentage: {memory_usage_percentage:.2f}%"
    )

    convert_res = ConvertResult.of(
        convert_task_index=convert_task_index,
        to_be_added_files=to_be_added_files_list,
        to_be_deleted_files=to_be_delete_files_dict,
        position_delete_record_count=position_delete_record_count,
        input_data_files_record_count=data_file_to_dedupe_record_count,
        input_data_files_hash_columns_in_memory_sizes=data_file_to_dedupe_size,
        position_delete_in_memory_sizes=position_delete_in_memory_size,
        position_delete_on_disk_sizes=sum(
            file.file_size_in_bytes for file in to_be_added_files_list
        ),
        input_data_files_on_disk_size=dedupe_file_size_bytes,
        peak_memory_usage_bytes=peak_memory_usage_bytes,
        memory_usage_percentage=memory_usage_percentage,
    )
    return convert_res
214
+
215
+
216
def get_additional_applicable_data_files(
    all_data_files: DataFileList,
    data_files_downloaded: DataFileList,
) -> DataFileList:
    """Return the data files that still need to be considered for dedupe.

    These are the entries of ``all_data_files`` that are not present in
    ``data_files_downloaded``. The relative order of ``all_data_files`` is
    preserved and repeated entries are returned once.

    Args:
        all_data_files: Every candidate data file entry (must be hashable).
        data_files_downloaded: Entries that were already downloaded and must
            be excluded from the result.

    Returns:
        A new list with the entries of ``all_data_files`` that were not
        downloaded; all of them when nothing was downloaded.

    Raises:
        AssertionError: If there are more distinct downloaded entries than
            distinct entries in ``all_data_files``.
    """
    already_downloaded = set(data_files_downloaded)
    distinct_total = len(set(all_data_files))
    assert distinct_total >= len(already_downloaded), (
        f"Length of all data files ({distinct_total}) should never be less than "
        f"the length of candidate equality delete data files ({len(already_downloaded)})"
    )
    if not already_downloaded:
        return list(all_data_files)
    # The difference must start from all_data_files. dict.fromkeys drops
    # repeats while keeping first-seen order, so the output is deterministic.
    return list(
        dict.fromkeys(
            data_file
            for data_file in all_data_files
            if data_file not in already_downloaded
        )
    )
233
+
234
+
235
def filter_rows_to_be_deleted(
    equality_delete_table: Optional[pa.Table],
    data_file_table: Optional[pa.Table],
    identifier_columns: List[str],
) -> Tuple[Optional[pa.Table], Optional[pa.Table]]:
    """Split data rows into those hit by equality deletes and the rest.

    Rows are matched on the identifier hash column
    (``sc._IDENTIFIER_COLUMNS_HASH_COLUMN_NAME``) present in both tables.

    Args:
        equality_delete_table: Equality delete records, or None/empty.
        data_file_table: Data file records, or None/empty.
        identifier_columns: Accepted for interface compatibility; not read
            here because matching uses the precomputed hash column.

    Returns:
        ``(position_delete_table, remaining_data_table)``. The position
        delete table has the hash column dropped; the remaining table keeps
        it. ``(None, None)`` when there is no data, and
        ``(None, data_file_table)`` when there are no equality deletes.
    """
    # Without data rows there is nothing to delete and nothing left over.
    if not data_file_table:
        return None, None
    # Without equality deletes every data row survives untouched.
    if not equality_delete_table:
        return None, data_file_table

    identifier_column = sc._IDENTIFIER_COLUMNS_HASH_COLUMN_NAME
    # Compute the membership mask once; the survivors are its complement.
    is_deleted = pc.is_in(
        data_file_table[identifier_column],
        equality_delete_table[identifier_column],
    )
    position_delete_table = data_file_table.filter(is_deleted)
    remaining_data_table = data_file_table.filter(pc.invert(is_deleted))

    position_delete_table = position_delete_table.drop([identifier_column])
    assert len(position_delete_table) + len(remaining_data_table) == len(
        data_file_table
    ), (
        f"Expected undeleted data file record count plus length of pos deletes to match original data file record count of {len(data_file_table)}, "
        f"but found {len(position_delete_table)} pos deletes + {len(remaining_data_table)} remaining data records."
    )

    return position_delete_table, remaining_data_table
266
+
267
+
268
def compute_pos_delete_converting_equality_deletes(
    equality_delete_table: Optional[pa.Table],
    data_file_table: Optional[pa.Table],
    identifier_columns: List[str],
    iceberg_table_warehouse_prefix_with_partition: str,
    s3_file_system: Optional[AbstractFileSystem],
) -> Tuple[Optional[pa.Table], Optional[pa.Table]]:
    """Convert equality deletes into position deletes for one table pair.

    Delegates the row split to ``filter_rows_to_be_deleted`` and normalizes
    empty results to ``None``.

    Args:
        equality_delete_table: Equality delete records.
        data_file_table: Data file records the deletes are applied to.
        identifier_columns: Forwarded to ``filter_rows_to_be_deleted``.
        iceberg_table_warehouse_prefix_with_partition: Not used in this
            function body.
        s3_file_system: Not used in this function body.

    Returns:
        ``(position_deletes, remaining_data)``; either element is ``None``
        when the corresponding table is empty or missing.
    """
    pos_deletes, survivors = filter_rows_to_be_deleted(
        equality_delete_table=equality_delete_table,
        data_file_table=data_file_table,
        identifier_columns=identifier_columns,
    )

    if not pos_deletes:
        # Nothing was deleted: report leftover data only when there is some.
        return (None, survivors) if survivors else (None, None)

    logger.info(
        f"Length of position delete table after converting from equality deletes:{len(pos_deletes)}"
    )
    return pos_deletes, survivors
289
+
290
+
291
def compute_pos_delete_with_limited_parallelism(
    data_files_list: DataFileListGroup,
    identifier_columns: List[str],
    equality_delete_files_list: DataFileListGroup,
    iceberg_table_warehouse_prefix_with_partition: str,
    convert_task_index: int,
    max_parallel_data_file_download: int,
    s3_file_system: Optional[AbstractFileSystem],
    s3_client_kwargs: Optional[Dict[str, Any]],
) -> Tuple[Optional[pa.Table], Optional[pa.Table]]:
    """Convert grouped equality delete files into position delete records.

    ``data_files_list`` and ``equality_delete_files_list`` are parallel
    lists: group *i* of data files is matched against group *i* of equality
    delete files. Each group is downloaded, concatenated and passed to
    ``compute_pos_delete_converting_equality_deletes``. The per-group results
    are then combined across all groups.

    Args:
        data_files_list: Groups of data file entries; element ``[1]`` of
            each entry is the file that gets downloaded.
        identifier_columns: Columns downloaded from every file.
        equality_delete_files_list: Groups of equality delete file entries,
            aligned with ``data_files_list``.
        iceberg_table_warehouse_prefix_with_partition: Forwarded to the
            per-group conversion.
        convert_task_index: Task index, used in log messages only.
        max_parallel_data_file_download: Not read in this function body.
        s3_file_system: Forwarded to the per-group conversion.
        s3_client_kwargs: Forwarded to the download helper.

    Returns:
        ``(position_deletes, remaining_data)`` combined over all groups;
        either element is ``None`` when no group produced such rows.

    Raises:
        AssertionError: If the two group lists differ in length.
    """
    assert len(data_files_list) == len(equality_delete_files_list), (
        f"Number of lists of data files should equal to number of list of equality delete files, "
        f"But got {len(data_files_list)} data files lists vs {len(equality_delete_files_list)}."
    )

    pos_delete_tables = []
    remaining_data_tables = []
    data_record_count = 0
    equality_delete_record_count = 0

    for data_files, equality_delete_files in zip(
        data_files_list, equality_delete_files_list
    ):
        # Sort data files by file sequence number first, then file path to
        # make sure files having same sequence number are deterministically sorted
        data_files = sort_data_files_maintaining_order(data_files=data_files)

        data_table_total = pa.concat_tables(
            [
                download_data_table_and_append_iceberg_columns(
                    file=data_file[1],
                    columns_to_download=identifier_columns,
                    additional_columns_to_append=[
                        sc._FILE_PATH_COLUMN_NAME,
                        sc._ORDERED_RECORD_IDX_COLUMN_NAME,
                    ],
                    s3_client_kwargs=s3_client_kwargs,
                )
                for data_file in data_files
            ]
        )

        equality_delete_table_total = pa.concat_tables(
            [
                download_data_table_and_append_iceberg_columns(
                    file=equality_delete[1],
                    columns_to_download=identifier_columns,
                    s3_client_kwargs=s3_client_kwargs,
                )
                for equality_delete in equality_delete_files
            ]
        )

        data_record_count += len(data_table_total)
        equality_delete_record_count += len(equality_delete_table_total)

        (
            new_pos_delete_table,
            remaining_data_table,
        ) = compute_pos_delete_converting_equality_deletes(
            equality_delete_table=equality_delete_table_total,
            data_file_table=data_table_total,
            iceberg_table_warehouse_prefix_with_partition=iceberg_table_warehouse_prefix_with_partition,
            identifier_columns=identifier_columns,
            s3_file_system=s3_file_system,
        )
        # The conversion returns None for "no rows"; pa.concat_tables cannot
        # take None entries, so only real tables are collected.
        if new_pos_delete_table is not None:
            pos_delete_tables.append(new_pos_delete_table)
        # Keep the leftovers of every group, not only the last one, so no
        # surviving rows are lost to the caller.
        if remaining_data_table is not None:
            remaining_data_tables.append(remaining_data_table)

    new_pos_delete_table_total = (
        pa.concat_tables(pos_delete_tables) if pos_delete_tables else None
    )
    remaining_data_table = (
        pa.concat_tables(remaining_data_tables) if remaining_data_tables else None
    )
    pos_delete_record_count = (
        len(new_pos_delete_table_total)
        if new_pos_delete_table_total is not None
        else 0
    )

    logger.info(
        f"[Convert task {convert_task_index}]: Find deletes got {data_record_count} data table records, "
        f"{equality_delete_record_count} equality deletes as input, "
        f"Produced {pos_delete_record_count} position deletes based off find deletes input."
    )

    if not new_pos_delete_table_total:
        logger.info("No records deleted based on equality delete convertion")

    if not remaining_data_table:
        logger.info("No data table remaining after converting equality deletes")

    return new_pos_delete_table_total, remaining_data_table