deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324)
  1. deltacat/__init__.py +117 -18
  2. deltacat/api.py +536 -126
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +1 -19
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2444 -282
  12. deltacat/catalog/model/catalog.py +208 -113
  13. deltacat/catalog/model/properties.py +63 -24
  14. deltacat/compute/__init__.py +14 -0
  15. deltacat/compute/compactor/compaction_session.py +97 -75
  16. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  17. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  18. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  19. deltacat/compute/compactor/repartition_session.py +8 -21
  20. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  21. deltacat/compute/compactor/steps/materialize.py +9 -7
  22. deltacat/compute/compactor/steps/repartition.py +12 -11
  23. deltacat/compute/compactor/utils/io.py +6 -5
  24. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  25. deltacat/compute/compactor/utils/system_columns.py +3 -1
  26. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  27. deltacat/compute/compactor_v2/constants.py +30 -1
  28. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  29. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  30. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  31. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  32. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  33. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  34. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  35. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  36. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  37. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  38. deltacat/compute/compactor_v2/utils/io.py +11 -4
  39. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  40. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  41. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  42. deltacat/compute/converter/constants.py +5 -0
  43. deltacat/compute/converter/converter_session.py +207 -52
  44. deltacat/compute/converter/model/convert_input.py +43 -16
  45. deltacat/compute/converter/model/convert_input_files.py +33 -16
  46. deltacat/compute/converter/model/convert_result.py +80 -0
  47. deltacat/compute/converter/model/converter_session_params.py +64 -19
  48. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  49. deltacat/compute/converter/pyiceberg/overrides.py +193 -65
  50. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  51. deltacat/compute/converter/steps/convert.py +230 -75
  52. deltacat/compute/converter/steps/dedupe.py +46 -12
  53. deltacat/compute/converter/utils/convert_task_options.py +66 -22
  54. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  55. deltacat/compute/converter/utils/iceberg_columns.py +13 -8
  56. deltacat/compute/converter/utils/io.py +173 -13
  57. deltacat/compute/converter/utils/s3u.py +42 -27
  58. deltacat/compute/janitor.py +205 -0
  59. deltacat/compute/jobs/client.py +417 -0
  60. deltacat/compute/resource_estimation/delta.py +38 -6
  61. deltacat/compute/resource_estimation/model.py +8 -0
  62. deltacat/constants.py +49 -6
  63. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  64. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  65. deltacat/env.py +10 -0
  66. deltacat/examples/basic_logging.py +6 -6
  67. deltacat/examples/compactor/aws/__init__.py +1 -0
  68. deltacat/examples/compactor/bootstrap.py +863 -0
  69. deltacat/examples/compactor/compactor.py +373 -0
  70. deltacat/examples/compactor/explorer.py +473 -0
  71. deltacat/examples/compactor/gcp/__init__.py +1 -0
  72. deltacat/examples/compactor/job_runner.py +439 -0
  73. deltacat/examples/compactor/utils/__init__.py +1 -0
  74. deltacat/examples/compactor/utils/common.py +261 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  80. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  81. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
  82. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  83. deltacat/examples/hello_world.py +4 -2
  84. deltacat/examples/indexer/indexer.py +163 -0
  85. deltacat/examples/indexer/job_runner.py +198 -0
  86. deltacat/exceptions.py +66 -4
  87. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  88. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  89. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
  90. deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
  91. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  92. deltacat/experimental/converter_agent/__init__.py +0 -0
  93. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  94. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  95. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  96. deltacat/experimental/daft/__init__.py +4 -0
  97. deltacat/experimental/daft/daft_catalog.py +229 -0
  98. deltacat/experimental/storage/__init__.py +0 -0
  99. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  100. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  101. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  102. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  103. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  104. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  105. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  107. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  108. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  109. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  110. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  111. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  112. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  113. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  114. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  115. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  116. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  117. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  118. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  119. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  120. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  121. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  122. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  123. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  124. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  125. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  126. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  127. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  128. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  129. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  130. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  131. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  132. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  133. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  134. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  135. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  136. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  137. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  138. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  139. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  140. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  141. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  142. deltacat/io/__init__.py +13 -0
  143. deltacat/io/dataset/__init__.py +0 -0
  144. deltacat/io/dataset/deltacat_dataset.py +91 -0
  145. deltacat/io/datasink/__init__.py +0 -0
  146. deltacat/io/datasink/deltacat_datasink.py +207 -0
  147. deltacat/io/datasource/__init__.py +0 -0
  148. deltacat/io/datasource/deltacat_datasource.py +579 -0
  149. deltacat/io/reader/__init__.py +0 -0
  150. deltacat/io/reader/deltacat_read_api.py +172 -0
  151. deltacat/storage/__init__.py +22 -2
  152. deltacat/storage/interface.py +54 -32
  153. deltacat/storage/main/impl.py +1494 -541
  154. deltacat/storage/model/delta.py +27 -3
  155. deltacat/storage/model/expression/__init__.py +47 -0
  156. deltacat/storage/model/expression/expression.py +656 -0
  157. deltacat/storage/model/expression/visitor.py +248 -0
  158. deltacat/storage/model/locator.py +6 -12
  159. deltacat/storage/model/manifest.py +231 -6
  160. deltacat/storage/model/metafile.py +224 -119
  161. deltacat/storage/model/namespace.py +8 -1
  162. deltacat/storage/model/partition.py +117 -42
  163. deltacat/storage/model/scan/push_down.py +32 -5
  164. deltacat/storage/model/schema.py +2427 -159
  165. deltacat/storage/model/shard.py +6 -2
  166. deltacat/storage/model/sort_key.py +40 -0
  167. deltacat/storage/model/stream.py +9 -2
  168. deltacat/storage/model/table.py +12 -1
  169. deltacat/storage/model/table_version.py +11 -0
  170. deltacat/storage/model/transaction.py +1184 -208
  171. deltacat/storage/model/transform.py +81 -2
  172. deltacat/storage/model/types.py +53 -29
  173. deltacat/storage/util/__init__.py +0 -0
  174. deltacat/storage/util/scan_planner.py +26 -0
  175. deltacat/tests/_io/reader/__init__.py +0 -0
  176. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  177. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  178. deltacat/tests/aws/test_s3u.py +2 -31
  179. deltacat/tests/catalog/data/__init__.py +0 -0
  180. deltacat/tests/catalog/main/__init__.py +0 -0
  181. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  182. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  183. deltacat/tests/catalog/model/__init__.py +0 -0
  184. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  185. deltacat/tests/catalog/test_catalogs.py +103 -106
  186. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  187. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  188. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  189. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  190. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  191. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  192. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  193. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  194. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  195. deltacat/tests/compute/conftest.py +8 -44
  196. deltacat/tests/compute/converter/test_convert_session.py +697 -349
  197. deltacat/tests/compute/converter/utils.py +15 -6
  198. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  199. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  200. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  201. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  202. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  203. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  204. deltacat/tests/compute/test_janitor.py +236 -0
  205. deltacat/tests/compute/test_util_common.py +716 -43
  206. deltacat/tests/compute/test_util_constant.py +0 -1
  207. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  208. deltacat/tests/daft/__init__.py +0 -0
  209. deltacat/tests/daft/test_model.py +97 -0
  210. deltacat/tests/experimental/__init__.py +1 -0
  211. deltacat/tests/experimental/catalog/__init__.py +0 -0
  212. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  213. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  214. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  215. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  216. deltacat/tests/experimental/daft/__init__.py +0 -0
  217. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  218. deltacat/tests/experimental/storage/__init__.py +0 -0
  219. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  220. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  221. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  222. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  223. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  224. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  225. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  226. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  227. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  228. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  229. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  230. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  231. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  232. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  233. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  234. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  235. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  236. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  237. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  238. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  239. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  240. deltacat/tests/storage/model/test_expression.py +327 -0
  241. deltacat/tests/storage/model/test_manifest.py +129 -0
  242. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  243. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  244. deltacat/tests/storage/model/test_schema.py +171 -0
  245. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  246. deltacat/tests/storage/model/test_shard.py +3 -1
  247. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  248. deltacat/tests/storage/model/test_transaction.py +393 -48
  249. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  250. deltacat/tests/test_deltacat_api.py +1036 -11
  251. deltacat/tests/test_exceptions.py +9 -5
  252. deltacat/tests/test_utils/pyarrow.py +52 -21
  253. deltacat/tests/test_utils/storage.py +23 -34
  254. deltacat/tests/types/__init__.py +0 -0
  255. deltacat/tests/types/test_tables.py +104 -0
  256. deltacat/tests/utils/exceptions.py +22 -0
  257. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  258. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  259. deltacat/tests/utils/test_daft.py +121 -31
  260. deltacat/tests/utils/test_numpy.py +1193 -0
  261. deltacat/tests/utils/test_pandas.py +1106 -0
  262. deltacat/tests/utils/test_polars.py +1040 -0
  263. deltacat/tests/utils/test_pyarrow.py +1370 -89
  264. deltacat/types/media.py +345 -37
  265. deltacat/types/tables.py +2344 -46
  266. deltacat/utils/arguments.py +33 -1
  267. deltacat/utils/daft.py +824 -40
  268. deltacat/utils/export.py +3 -1
  269. deltacat/utils/filesystem.py +139 -9
  270. deltacat/utils/metafile_locator.py +2 -1
  271. deltacat/utils/numpy.py +118 -26
  272. deltacat/utils/pandas.py +577 -48
  273. deltacat/utils/polars.py +759 -0
  274. deltacat/utils/pyarrow.py +1373 -192
  275. deltacat/utils/ray_utils/concurrency.py +1 -1
  276. deltacat/utils/ray_utils/dataset.py +101 -10
  277. deltacat/utils/ray_utils/runtime.py +56 -4
  278. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  279. deltacat/utils/url.py +1325 -0
  280. deltacat-2.0.0.dist-info/METADATA +1163 -0
  281. deltacat-2.0.0.dist-info/RECORD +439 -0
  282. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  283. deltacat/catalog/iceberg/__init__.py +0 -4
  284. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  285. deltacat/compute/merge_on_read/__init__.py +0 -4
  286. deltacat/compute/merge_on_read/daft.py +0 -40
  287. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  288. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  289. deltacat/examples/common/fixtures.py +0 -15
  290. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  291. deltacat/storage/rivulet/__init__.py +0 -11
  292. deltacat/storage/rivulet/feather/__init__.py +0 -5
  293. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  294. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  295. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  296. deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
  297. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  298. deltacat/utils/s3fs.py +0 -21
  299. deltacat-2.0.dist-info/METADATA +0 -65
  300. deltacat-2.0.dist-info/RECORD +0 -347
  301. /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
  302. /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
  303. /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
  304. /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
  305. /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
  306. /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
  307. /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
  308. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
  309. /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
  310. /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  311. /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
  312. /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
  313. /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  314. /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
  315. /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
  316. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
  317. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  318. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  319. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  320. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  321. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  322. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  323. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  324. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/compute/converter/steps/convert.py

@@ -8,36 +8,55 @@ import ray
 import logging
 from deltacat.compute.converter.model.convert_input import ConvertInput
 from deltacat.compute.converter.steps.dedupe import dedupe_data_files
-from deltacat.compute.converter.utils.s3u import upload_table_with_retry
+from deltacat.compute.converter.utils.io import write_sliced_table
 from deltacat.compute.converter.utils.io import (
     download_data_table_and_append_iceberg_columns,
 )
 from deltacat.compute.converter.utils.converter_session_utils import (
     partition_value_record_to_partition_value_string,
+    sort_data_files_maintaining_order,
 )
-
+from deltacat.compute.converter.pyiceberg.overrides import (
+    parquet_files_dict_to_iceberg_data_files,
+)
+from deltacat.compute.converter.model.convert_result import ConvertResult
+from pyiceberg.manifest import DataFileContent
 from deltacat import logs
+from fsspec import AbstractFileSystem
+from typing import List, Dict, Tuple, Optional, Any
+from deltacat.utils.resources import get_current_process_peak_memory_usage_in_bytes
+from deltacat.compute.converter.model.convert_input_files import (
+    DataFileList,
+    DataFileListGroup,
+)
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
 @ray.remote
-def convert(convert_input: ConvertInput):
+def convert(convert_input: ConvertInput) -> ConvertResult:
     convert_input_files = convert_input.convert_input_files
     convert_task_index = convert_input.convert_task_index
     iceberg_table_warehouse_prefix = convert_input.iceberg_table_warehouse_prefix
     identifier_fields = convert_input.identifier_fields
-    compact_small_files = convert_input.compact_small_files
+    table_io = convert_input.table_io
+    table_metadata = convert_input.table_metadata
+    compact_previous_position_delete_files = (
+        convert_input.compact_previous_position_delete_files
+    )
     position_delete_for_multiple_data_files = (
         convert_input.position_delete_for_multiple_data_files
     )
     max_parallel_data_file_download = convert_input.max_parallel_data_file_download
-    s3_file_system = convert_input.s3_file_system
+    filesystem = convert_input.filesystem
+    s3_client_kwargs = convert_input.s3_client_kwargs
+    task_memory = convert_input.task_memory
+
     if not position_delete_for_multiple_data_files:
         raise NotImplementedError(
             f"Distributed file level position delete compute is not supported yet"
         )
-    if compact_small_files:
+    if compact_previous_position_delete_files:
         raise NotImplementedError(f"Compact previous position delete not supported yet")
 
     logger.info(f"Starting convert task index: {convert_task_index}")
@@ -46,96 +65,214 @@ def convert(convert_input: ConvertInput):
     applicable_equality_delete_files = (
         convert_input_files.applicable_equality_delete_files
     )
+
     all_data_files_for_this_bucket = convert_input_files.all_data_files_for_dedupe
 
     partition_value_str = partition_value_record_to_partition_value_string(
         convert_input_files.partition_value
     )
     partition_value = convert_input_files.partition_value
-    iceberg_table_warehouse_prefix_with_partition = (
-        f"{iceberg_table_warehouse_prefix}/{partition_value_str}"
-    )
+
+    if partition_value_str:
+        iceberg_table_warehouse_prefix_with_partition = (
+            f"{iceberg_table_warehouse_prefix}/{partition_value_str}"
+        )
+    else:
+        iceberg_table_warehouse_prefix_with_partition = (
+            f"{iceberg_table_warehouse_prefix}"
+        )
+
     enforce_primary_key_uniqueness = convert_input.enforce_primary_key_uniqueness
     total_pos_delete_table = []
+    data_table_after_converting_equality_delete = []
     if applicable_equality_delete_files:
         (
-            pos_delete_after_converting_equality_delete
+            pos_delete_after_converting_equality_delete,
+            data_table_after_converting_equality_delete,
        ) = compute_pos_delete_with_limited_parallelism(
             data_files_list=applicable_data_files,
             identifier_columns=identifier_fields,
             equality_delete_files_list=applicable_equality_delete_files,
             iceberg_table_warehouse_prefix_with_partition=iceberg_table_warehouse_prefix_with_partition,
+            convert_task_index=convert_task_index,
             max_parallel_data_file_download=max_parallel_data_file_download,
-            s3_file_system=s3_file_system,
+            s3_file_system=filesystem,
+            s3_client_kwargs=s3_client_kwargs,
         )
         if pos_delete_after_converting_equality_delete:
             total_pos_delete_table.append(pos_delete_after_converting_equality_delete)
 
     if enforce_primary_key_uniqueness:
+        data_files_downloaded_during_convert = []
+        if applicable_data_files:
+            for file_list in applicable_data_files:
+                for file in file_list:
+                    data_files_downloaded_during_convert.append(file)
+
         data_files_to_dedupe = get_additional_applicable_data_files(
             all_data_files=all_data_files_for_this_bucket,
-            data_files_downloaded=applicable_data_files,
+            data_files_downloaded=data_files_downloaded_during_convert,
+        )
+
+        dedupe_file_size_bytes = sum(
+            data_file.file_size_in_bytes for _, data_file in data_files_to_dedupe
+        )
+        logger.info(
+            f"Total on-disk size of files to dedupe: {dedupe_file_size_bytes} bytes"
        )
-        pos_delete_after_dedupe = dedupe_data_files(
+
+        logger.info(
+            f"[Convert task {convert_task_index}]: Got {len(data_files_to_dedupe)} files to dedupe."
+        )
+
+        (
+            pos_delete_after_dedupe,
+            data_file_to_dedupe_record_count,
+            data_file_to_dedupe_size,
+        ) = dedupe_data_files(
             data_file_to_dedupe=data_files_to_dedupe,
-            identify_column_name_concatenated=identifier_fields[0],
             identifier_columns=identifier_fields,
+            remaining_data_table_after_convert=data_table_after_converting_equality_delete,
             merge_sort_column=sc._ORDERED_RECORD_IDX_COLUMN_NAME,
+            s3_client_kwargs=s3_client_kwargs,
+        )
+        logger.info(
+            f"[Convert task {convert_task_index}]: Dedupe produced {len(pos_delete_after_dedupe)} position delete records."
        )
         total_pos_delete_table.append(pos_delete_after_dedupe)
 
     total_pos_delete = pa.concat_tables(total_pos_delete_table)
-    to_be_added_files_list = upload_table_with_retry(
-        table=total_pos_delete,
-        s3_url_prefix=iceberg_table_warehouse_prefix_with_partition,
-        s3_table_writer_kwargs={},
-        s3_file_system=s3_file_system,
+
+    logger.info(
+        f"[Convert task {convert_task_index}]: Total position delete produced:{len(total_pos_delete)}"
     )
 
+    to_be_added_files_list = []
+    if total_pos_delete:
+        to_be_added_files_list_parquet = write_sliced_table(
+            table=total_pos_delete,
+            base_path=iceberg_table_warehouse_prefix_with_partition,
+            table_writer_kwargs={},
+            filesystem=filesystem,
+        )
+
+        to_be_added_files_dict = defaultdict()
+        to_be_added_files_dict[partition_value] = to_be_added_files_list_parquet
+
+        logger.info(
+            f"[Convert task {convert_task_index}]: Produced {len(to_be_added_files_list_parquet)} position delete files."
+        )
+        file_content_type = DataFileContent.POSITION_DELETES
+        to_be_added_files_list = parquet_files_dict_to_iceberg_data_files(
+            io=table_io,
+            table_metadata=table_metadata,
+            files_dict=to_be_added_files_dict,
+            file_content_type=file_content_type,
+        )
+
     to_be_delete_files_dict = defaultdict()
+
     if applicable_equality_delete_files:
         to_be_delete_files_dict[partition_value] = [
             equality_delete_file[1]
-            for equality_delete_file in applicable_equality_delete_files
+            for equality_delete_list in applicable_equality_delete_files
+            for equality_delete_file in equality_delete_list
         ]
-    to_be_added_files_dict = defaultdict()
-    to_be_added_files_dict[partition_value] = to_be_added_files_list
-    return (to_be_delete_files_dict, to_be_added_files_dict)
 
+    if not enforce_primary_key_uniqueness:
+        data_file_to_dedupe_record_count = 0
+        data_file_to_dedupe_size = 0
+
+    peak_memory_usage_bytes = (
+        get_current_process_peak_memory_usage_in_bytes()
+    )  # Convert KB to bytes
+    memory_usage_percentage = (peak_memory_usage_bytes / task_memory) * 100
+
+    logger.info(
+        f"[Convert task {convert_task_index}]: Memory usage stats - "
+        f"Peak memory usage: {peak_memory_usage_bytes} bytes, "
+        f"Allocated task memory: {convert_input.task_memory} bytes, "
+        f"Usage percentage: {memory_usage_percentage:.2f}%"
+    )
 
-def get_additional_applicable_data_files(all_data_files, data_files_downloaded):
-    data_file_to_dedupe = all_data_files
+    convert_res = ConvertResult.of(
+        convert_task_index=convert_task_index,
+        to_be_added_files=to_be_added_files_list,
+        to_be_deleted_files=to_be_delete_files_dict,
+        position_delete_record_count=len(total_pos_delete),
+        input_data_files_record_count=data_file_to_dedupe_record_count,
+        input_data_files_hash_columns_in_memory_sizes=data_file_to_dedupe_size,
+        position_delete_in_memory_sizes=int(total_pos_delete.nbytes),
+        position_delete_on_disk_sizes=sum(
+            file.file_size_in_bytes for file in to_be_added_files_list
+        ),
+        input_data_files_on_disk_size=dedupe_file_size_bytes,
+        peak_memory_usage_bytes=peak_memory_usage_bytes,
+        memory_usage_percentage=memory_usage_percentage,
+    )
+    return convert_res
+
+
+def get_additional_applicable_data_files(
+    all_data_files: DataFileList,
+    data_files_downloaded: DataFileList,
+) -> DataFileList:
+    data_file_to_dedupe = []
+    assert len(set(all_data_files)) >= len(set(data_files_downloaded)), (
+        f"Length of all data files ({len(set(all_data_files))}) should never be less than "
+        f"the length of candidate equality delete data files ({len(set(data_files_downloaded))})"
+    )
     if data_files_downloaded:
-        data_file_to_dedupe = list(set(all_data_files) - set(data_files_downloaded))
+        # set1.difference(set2) returns elements in set1 but not in set2
+        data_file_to_dedupe.extend(
+            list(set(data_file_to_dedupe).difference(set(data_files_downloaded)))
+        )
+    else:
+        data_file_to_dedupe = all_data_files
     return data_file_to_dedupe
 
 
 def filter_rows_to_be_deleted(
-    equality_delete_table, data_file_table, identifier_columns
-):
-    identifier_column = identifier_columns[0]
+    equality_delete_table: Optional[pa.Table],
+    data_file_table: Optional[pa.Table],
+    identifier_columns: List[str],
+) -> Tuple[Optional[pa.Table], Optional[pa.Table]]:
+    identifier_column = sc._IDENTIFIER_COLUMNS_HASH_COLUMN_NAME
     if equality_delete_table and data_file_table:
         equality_deletes = pc.is_in(
             data_file_table[identifier_column],
             equality_delete_table[identifier_column],
         )
+        data_file_record_remaining = pc.invert(
+            pc.is_in(
+                data_file_table[identifier_column],
+                equality_delete_table[identifier_column],
+            )
+        )
         position_delete_table = data_file_table.filter(equality_deletes)
-        logger.info(f"positional_delete_table:{position_delete_table.to_pydict()}")
-        logger.info(f"data_file_table:{data_file_table.to_pydict()}")
-        logger.info(
-            f"length_pos_delete_table, {len(position_delete_table)}, length_data_table:{len(data_file_table)}"
+        remaining_data_table = data_file_table.filter(data_file_record_remaining)
+
+        position_delete_table = position_delete_table.drop(
+            [sc._IDENTIFIER_COLUMNS_HASH_COLUMN_NAME]
        )
-        return position_delete_table
+        assert len(position_delete_table) + len(remaining_data_table) == len(
+            data_file_table
+        ), (
+            f"Expected undeleted data file record count plus length of pos deletes to match original data file record count of {len(data_file_table)}, "
+            f"but found {len(position_delete_table)} pos deletes + {len(remaining_data_table)} equality deletes."
+        )
+
+    return position_delete_table, remaining_data_table
 
 
 def compute_pos_delete_converting_equality_deletes(
-    equality_delete_table,
-    data_file_table,
-    identifier_columns,
-    iceberg_table_warehouse_prefix_with_partition,
-    s3_file_system,
-):
-    new_position_delete_table = filter_rows_to_be_deleted(
+    equality_delete_table: Optional[pa.Table],
+    data_file_table: Optional[pa.Table],
+    identifier_columns: List[str],
+    iceberg_table_warehouse_prefix_with_partition: str,
+    s3_file_system: Optional[AbstractFileSystem],
+) -> Tuple[Optional[pa.Table], Optional[pa.Table]]:
+    new_position_delete_table, remaining_data_table = filter_rows_to_be_deleted(
         data_file_table=data_file_table,
         equality_delete_table=equality_delete_table,
         identifier_columns=identifier_columns,
@@ -144,44 +281,47 @@ def compute_pos_delete_converting_equality_deletes(
         logger.info(
             f"Length of position delete table after converting from equality deletes:{len(new_position_delete_table)}"
         )
+        return new_position_delete_table, remaining_data_table
+    elif not remaining_data_table:
+        return None, None
     else:
-        return None
-    return new_position_delete_table
+        return None, remaining_data_table
 
 
-def download_bucketed_table(data_files, equality_delete_files):
-    from deltacat.utils.pyarrow import s3_file_to_table
-
-    compacted_table = s3_file_to_table(
-        [data_file.file_path for data_file in data_files]
-    )
-    equality_delete_table = s3_file_to_table(
-        [eq_file.file_path for eq_file in equality_delete_files]
+def compute_pos_delete_with_limited_parallelism(
+    data_files_list: DataFileListGroup,
+    identifier_columns: List[str],
+    equality_delete_files_list: DataFileListGroup,
+    iceberg_table_warehouse_prefix_with_partition: str,
+    convert_task_index: int,
+    max_parallel_data_file_download: int,
+    s3_file_system: Optional[AbstractFileSystem],
+    s3_client_kwargs: Optional[Dict[str, Any]],
+) -> Tuple[Optional[pa.Table], Optional[pa.Table]]:
+    assert len(data_files_list) == len(equality_delete_files_list), (
+        f"Number of lists of data files should equal to number of list of equality delete files, "
+        f"But got {len(data_files_list)} data files lists vs {len(equality_delete_files_list)}."
     )
-    return compacted_table, equality_delete_table
 
-
-def compute_pos_delete_with_limited_parallelism(
-    data_files_list,
-    identifier_columns,
-    equality_delete_files_list,
-    iceberg_table_warehouse_prefix_with_partition,
-    max_parallel_data_file_download,
-    s3_file_system,
-):
+    new_pos_delete_table_total = []
     for data_files, equality_delete_files in zip(
         data_files_list, equality_delete_files_list
     ):
         data_table_total = []
+
+        # Sort data files by file sequence number first, then file path to
+        # make sure files having same sequence number are deterministically sorted
+        data_files = sort_data_files_maintaining_order(data_files=data_files)
+
         for data_file in data_files:
             data_table = download_data_table_and_append_iceberg_columns(
-                data_files=data_file[1],
+                file=data_file[1],
                 columns_to_download=identifier_columns,
                 additional_columns_to_append=[
                     sc._FILE_PATH_COLUMN_NAME,
                     sc._ORDERED_RECORD_IDX_COLUMN_NAME,
                 ],
-                sequence_number=data_file[0],
+                s3_client_kwargs=s3_client_kwargs,
             )
             data_table_total.append(data_table)
         data_table_total = pa.concat_tables(data_table_total)
@@ -189,23 +329,38 @@ def compute_pos_delete_with_limited_parallelism(
         equality_delete_table_total = []
         for equality_delete in equality_delete_files:
             equality_delete_table = download_data_table_and_append_iceberg_columns(
-                data_files=equality_delete[1],
+                file=equality_delete[1],
                 columns_to_download=identifier_columns,
+                s3_client_kwargs=s3_client_kwargs,
             )
             equality_delete_table_total.append(equality_delete_table)
         equality_delete_table_total = pa.concat_tables(equality_delete_table_total)
 
-        new_pos_delete_table = compute_pos_delete_converting_equality_deletes(
-            equality_delete_table=equality_delete_table_total,
-            data_file_table=data_table_total,
-            iceberg_table_warehouse_prefix_with_partition=iceberg_table_warehouse_prefix_with_partition,
-            identifier_columns=identifier_columns,
-            s3_file_system=s3_file_system,
-        )
-        if not new_pos_delete_table:
-            logger.info("No records deleted based on equality delete converstion")
+        (
+            new_pos_delete_table,
+            remaining_data_table,
+        ) = compute_pos_delete_converting_equality_deletes(
+            equality_delete_table=equality_delete_table_total,
+            data_file_table=data_table_total,
+            iceberg_table_warehouse_prefix_with_partition=iceberg_table_warehouse_prefix_with_partition,
+            identifier_columns=identifier_columns,
+            s3_file_system=s3_file_system,
+        )
+        new_pos_delete_table_total.append(new_pos_delete_table)
+
+    if new_pos_delete_table_total:
+        new_pos_delete_table_total = pa.concat_tables(new_pos_delete_table_total)
 
         logger.info(
-            f"Number of records to delete based on equality delete convertion:{len(new_pos_delete_table)}"
+            f"[Convert task {convert_task_index}]: Find deletes got {len(data_table_total)} data table records, "
+            f"{len(equality_delete_table_total)} equality deletes as input, "
+            f"Produced {len(new_pos_delete_table_total)} position deletes based off find deletes input."
        )
-    return new_pos_delete_table
+
+    if not new_pos_delete_table_total:
+        logger.info("No records deleted based on equality delete convertion")
+
+    if not remaining_data_table:
+        logger.info("No data table remaining after converting equality deletes")
+
+    return new_pos_delete_table_total, remaining_data_table
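The reworked filter_rows_to_be_deleted above now returns both the position deletes and the rows that survive into dedupe. A minimal standalone sketch of that pc.is_in split follows; the tables and the "pk_hash" column are fabricated stand-ins for the converter's sc._IDENTIFIER_COLUMNS_HASH_COLUMN_NAME, not DeltaCAT's actual data:

import pyarrow as pa
import pyarrow.compute as pc

# Toy inputs: "pk_hash" plays the role of the identifier-hash column.
data_file_table = pa.table(
    {
        "pk_hash": ["a", "b", "c", "d"],
        "file_path": ["f1", "f1", "f2", "f2"],
        "pos": [0, 1, 0, 1],
    }
)
equality_delete_table = pa.table({"pk_hash": ["b", "d"]})

# Rows whose identifier hash appears in the equality deletes become
# position deletes; the inverted mask yields the surviving rows.
delete_mask = pc.is_in(
    data_file_table["pk_hash"], value_set=equality_delete_table["pk_hash"]
)
position_delete_table = data_file_table.filter(delete_mask)
remaining_data_table = data_file_table.filter(pc.invert(delete_mask))

# Mirrors the assertion added in this diff: the split covers every
# input row exactly once.
assert len(position_delete_table) + len(remaining_data_table) == len(data_file_table)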
deltacat/compute/converter/steps/dedupe.py

@@ -4,20 +4,33 @@ import deltacat.compute.converter.utils.iceberg_columns as sc
 from deltacat.compute.converter.utils.io import (
     download_data_table_and_append_iceberg_columns,
 )
+from deltacat.compute.converter.utils.converter_session_utils import (
+    sort_data_files_maintaining_order,
+)
+import logging
+from deltacat import logs
+from typing import List, Dict, Tuple, Optional, Any
+from pyiceberg.manifest import DataFile
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
 def dedupe_data_files(
-    data_file_to_dedupe,
-    identify_column_name_concatenated,
-    identifier_columns,
-    merge_sort_column,
-):
+    data_file_to_dedupe: List[Tuple[int, DataFile]],
+    identifier_columns: List[str],
+    remaining_data_table_after_convert: Optional[pa.Table],
+    merge_sort_column: str,
+    s3_client_kwargs: Optional[Dict[str, Any]],
+) -> Tuple[pa.Table, int, int]:
     data_file_table = []
+    if remaining_data_table_after_convert:
+        data_file_table.append(remaining_data_table_after_convert)
 
-    # Sort data files by file sequence number first
-    data_file_to_dedupe = sorted(data_file_to_dedupe, key=lambda f: f[0])
+    data_file_to_dedupe = sort_data_files_maintaining_order(
+        data_files=data_file_to_dedupe
+    )
+    downloaded_data_file_record_count = 0
     for file_tuple in data_file_to_dedupe:
-        sequence_number = file_tuple[0]
         data_file = file_tuple[1]
         data_file_to_dedupe_table = download_data_table_and_append_iceberg_columns(
             file=data_file,
@@ -26,12 +39,26 @@
                 sc._FILE_PATH_COLUMN_NAME,
                 sc._ORDERED_RECORD_IDX_COLUMN_NAME,
             ],
-            sequence_number=sequence_number,
+            s3_client_kwargs=s3_client_kwargs,
         )
+        logger.info(
+            f"Length of downloaded data file table: {len(data_file_to_dedupe_table)}"
+        )
+        downloaded_data_file_record_count += len(data_file_to_dedupe_table)
         data_file_table.append(data_file_to_dedupe_table)
 
     final_data_to_dedupe = pa.concat_tables(data_file_table)
 
+    dedupe_input_record_count = downloaded_data_file_record_count
+    if remaining_data_table_after_convert:
+        dedupe_input_record_count += len(remaining_data_table_after_convert)
+    assert len(final_data_to_dedupe) == dedupe_input_record_count, (
+        f"Mismatch record count while performing table concat, Got {len(final_data_to_dedupe)} in final table, "
+        f"while input table length is: {dedupe_input_record_count}"
+    )
+
+    logger.info(f"Length of pyarrow table to dedupe:{len(final_data_to_dedupe)}")
+
     record_idx_iterator = iter(range(len(final_data_to_dedupe)))
 
     # Append global record index to used as aggregate column
@@ -40,7 +67,7 @@
     )
 
     final_data_table_indices = final_data_to_dedupe.group_by(
-        identify_column_name_concatenated, use_threads=False
+        sc._IDENTIFIER_COLUMNS_HASH_COLUMN_NAME, use_threads=False
    ).aggregate([(sc._GLOBAL_RECORD_IDX_COLUMN_NAME, "max")])
 
     pos_delete_indices = pc.invert(
@@ -55,6 +82,13 @@
     final_data_table_to_delete = final_data_to_dedupe.filter(pos_delete_indices)
 
     final_data_table_to_delete = final_data_table_to_delete.drop(
-        [identify_column_name_concatenated, sc._GLOBAL_RECORD_IDX_COLUMN_NAME]
+        [sc._IDENTIFIER_COLUMNS_HASH_COLUMN_NAME, sc._GLOBAL_RECORD_IDX_COLUMN_NAME]
+    )
+    logger.info(
+        f"Deduped {len(final_data_table_to_delete)} Records based off identifier columns."
+    )
+    return (
+        final_data_table_to_delete,
+        len(final_data_to_dedupe),
+        int(final_data_to_dedupe.nbytes),
     )
-    return final_data_table_to_delete
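The dedupe step above keeps only the newest copy of each primary key by appending a monotonically increasing global record index and taking the per-key max with a PyArrow group_by aggregate. A self-contained sketch of that pattern, assuming toy column names in place of the sc.* constants:

import pyarrow as pa
import pyarrow.compute as pc

# Rows are concatenated in file-sequence order, so for a duplicated key
# the highest global index is the latest write and must survive.
table = pa.table(
    {
        "pk_hash": ["a", "b", "a", "c", "b"],
        "file_path": ["f1", "f1", "f2", "f2", "f3"],
        "pos": [0, 1, 0, 1, 0],
    }
)
table = table.append_column("global_idx", pa.array(range(len(table)), pa.int64()))

# Keep the max (latest) index per key; every other row becomes a position delete.
latest = table.group_by("pk_hash", use_threads=False).aggregate([("global_idx", "max")])
pos_delete_mask = pc.invert(
    pc.is_in(table["global_idx"], value_set=latest["global_idx_max"])
)
position_deletes = table.filter(pos_delete_mask).drop(["pk_hash", "global_idx"])

print(position_deletes.to_pydict())
# {'file_path': ['f1', 'f1'], 'pos': [0, 1]} -> the earlier copies of keys "a" and "b"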
deltacat/compute/converter/utils/convert_task_options.py

@@ -1,26 +1,36 @@
-from typing import Optional, Dict
+from typing import Optional, Dict, List, Tuple, Any
 from deltacat.exceptions import RetryableError
+from pyiceberg.manifest import DataFile
+from deltacat.compute.converter.model.convert_input_files import ConvertInputFiles
 
-AVERAGE_FILE_PATH_COLUMN_SIZE_BYTES = 80
+AVERAGE_FILE_PATH_COLUMN_SIZE_BYTES = 160
 AVERAGE_POS_COLUMN_SIZE_BYTES = 4
 XXHASH_BYTE_PER_RECORD = 8
-MEMORY_BUFFER_RATE = 1.2
+MEMORY_BUFFER_RATE = 2
+# Worst case 2 as no duplicates exists across all pk
+PYARROW_AGGREGATE_MEMORY_MULTIPLIER = 2
+# Observed base memory usage at the beginning of each worker process
+BASE_MEMORY_BUFFER = 0.3 * 1024 * 1024 * 1024
 
 
-def estimate_fixed_hash_columns(hash_value_size_bytes_per_record, total_record_count):
+def estimate_fixed_hash_columns(
+    hash_value_size_bytes_per_record: int, total_record_count: int
+) -> int:
     return hash_value_size_bytes_per_record * total_record_count
 
 
-def get_total_record_from_iceberg_files(iceberg_files_list):
+def get_total_record_from_iceberg_files(
+    iceberg_files_list: List[Tuple[int, DataFile]]
+) -> int:
     total_record_count = 0
-    for iceberg_files in iceberg_files_list:
-        total_record_count += sum(file.record_count for file in iceberg_files)
+    # file are in form of tuple (sequence_number, DataFile)
+    total_record_count += sum(file[1].record_count for file in iceberg_files_list)
     return total_record_count
 
 
 def estimate_iceberg_pos_delete_additional_columns(
-    include_columns, num_of_record_count
-):
+    include_columns: List[str], num_of_record_count: int
+) -> int:
     total_additional_columns_sizes = 0
     if "file_path" in include_columns:
         total_additional_columns_sizes += (
@@ -33,7 +43,10 @@ def estimate_iceberg_pos_delete_additional_columns(
     return total_additional_columns_sizes
 
 
-def estimate_convert_remote_option_resources(data_files, equality_delete_files):
+def estimate_convert_remote_option_resources(
+    data_files: List[Tuple[int, DataFile]],
+    equality_delete_files: List[Tuple[int, DataFile]],
+) -> float:
     data_file_record_count = get_total_record_from_iceberg_files(data_files)
     equality_delete_record_count = get_total_record_from_iceberg_files(
         equality_delete_files
@@ -50,9 +63,9 @@ def estimate_convert_remote_option_resources(data_files, equality_delete_files):
 
 def _get_task_options(
     memory: float,
-    ray_custom_resources: Optional[Dict] = None,
+    ray_custom_resources: Optional[Dict[str, Any]] = None,
     scheduling_strategy: str = "SPREAD",
-) -> Dict:
+) -> Dict[str, Any]:
 
     # NOTE: With DEFAULT scheduling strategy in Ray 2.20.0, autoscaler does
     # not spin up enough nodes fast and hence we see only approximately
@@ -68,7 +81,8 @@ def _get_task_options(
         task_opts["resources"] = ray_custom_resources
 
     task_opts["max_retries"] = 3
-
+    task_opts["num_cpus"] = 1
+    task_opts["resources"] = {"convert_task": 1}
     # List of possible botocore exceptions are available at
     # https://github.com/boto/botocore/blob/develop/botocore/exceptions.py
     task_opts["retry_exceptions"] = [RetryableError]
@@ -76,13 +90,43 @@ def _get_task_options(
     return task_opts
 
 
-def convert_resource_options_provider(index, files_for_each_bucket):
-    (
-        data_files_list,
-        equality_delete_files_list,
-        position_delete_files_list,
-    ) = files_for_each_bucket[1]
-    memory_requirement = estimate_convert_remote_option_resources(
-        data_files_list, equality_delete_files_list
+def estimate_dedupe_memory(
+    all_data_files_for_dedupe: List[Tuple[int, DataFile]]
+) -> float:
+    dedupe_record_count = get_total_record_from_iceberg_files(all_data_files_for_dedupe)
+    produced_pos_memory_required = estimate_iceberg_pos_delete_additional_columns(
+        ["file_path", "pos"], dedupe_record_count
+    )
+    download_pk_memory_required = estimate_fixed_hash_columns(
+        XXHASH_BYTE_PER_RECORD, dedupe_record_count
+    )
+    memory_required_by_dedupe = (
+        produced_pos_memory_required + download_pk_memory_required
+    ) * PYARROW_AGGREGATE_MEMORY_MULTIPLIER
+    memory_with_buffer = memory_required_by_dedupe * MEMORY_BUFFER_RATE
+    return memory_with_buffer
+
+
+def convert_resource_options_provider(
+    index: int, convert_input_files: ConvertInputFiles
+) -> Dict[str, Any]:
+    applicable_data_files = convert_input_files.applicable_data_files
+    applicable_equality_delete_files = (
+        convert_input_files.applicable_equality_delete_files
     )
-    return _get_task_options(memory=memory_requirement)
+    all_data_files_for_dedupe = convert_input_files.all_data_files_for_dedupe
+    total_memory_required = 0
+    total_memory_required += BASE_MEMORY_BUFFER
+    if applicable_data_files and applicable_equality_delete_files:
+        memory_requirement_for_convert_equality_deletes = (
+            estimate_convert_remote_option_resources(
+                applicable_data_files, applicable_equality_delete_files
+            )
+        )
+        total_memory_required += memory_requirement_for_convert_equality_deletes
+    if all_data_files_for_dedupe:
+        memory_requirement_for_dedupe = estimate_dedupe_memory(
+            all_data_files_for_dedupe
+        )
+        total_memory_required += memory_requirement_for_dedupe
+    return _get_task_options(memory=total_memory_required)
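With the constants introduced in this diff, the provider's memory request can be sanity-checked by hand. A worked example for a hypothetical dedupe bucket of 10M records (the record count is illustrative, not from the source):

# Back-of-envelope for estimate_dedupe_memory(), sizes in bytes.
records = 10_000_000  # hypothetical record count across all files to dedupe

pos_delete_bytes = (160 + 4) * records        # file_path + pos columns per record
pk_hash_bytes = 8 * records                   # XXHASH_BYTE_PER_RECORD
dedupe_bytes = (pos_delete_bytes + pk_hash_bytes) * 2  # PYARROW_AGGREGATE_MEMORY_MULTIPLIER
with_buffer = dedupe_bytes * 2                # MEMORY_BUFFER_RATE

total = 0.3 * 1024**3 + with_buffer           # BASE_MEMORY_BUFFER added by the provider
print(f"{total / 1024**3:.2f} GiB")           # ~6.71 GiB requested from Ray for this task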