deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367)
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/compute/converter/steps/dedupe.py
@@ -0,0 +1,94 @@
+ import pyarrow as pa
+ import pyarrow.compute as pc
+ import deltacat.compute.converter.utils.iceberg_columns as sc
+ from deltacat.compute.converter.utils.io import (
+     download_data_table_and_append_iceberg_columns,
+ )
+ from deltacat.compute.converter.utils.converter_session_utils import (
+     sort_data_files_maintaining_order,
+ )
+ import logging
+ from deltacat import logs
+ from typing import List, Dict, Tuple, Optional, Any
+ from pyiceberg.manifest import DataFile
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ def dedupe_data_files(
+     data_file_to_dedupe: List[Tuple[int, DataFile]],
+     identifier_columns: List[str],
+     remaining_data_table_after_convert: Optional[pa.Table],
+     merge_sort_column: str,
+     s3_client_kwargs: Optional[Dict[str, Any]],
+ ) -> Tuple[pa.Table, int, int]:
+     data_file_table = []
+     if remaining_data_table_after_convert:
+         data_file_table.append(remaining_data_table_after_convert)
+
+     # Sort data files deterministically before downloading
+     data_file_to_dedupe = sort_data_files_maintaining_order(
+         data_files=data_file_to_dedupe
+     )
+     downloaded_data_file_record_count = 0
+     for file_tuple in data_file_to_dedupe:
+         data_file = file_tuple[1]
+         data_file_to_dedupe_table = download_data_table_and_append_iceberg_columns(
+             file=data_file,
+             columns_to_download=identifier_columns,
+             additional_columns_to_append=[
+                 sc._FILE_PATH_COLUMN_NAME,
+                 sc._ORDERED_RECORD_IDX_COLUMN_NAME,
+             ],
+             s3_client_kwargs=s3_client_kwargs,
+         )
+         logger.info(
+             f"Length of downloaded data file table: {len(data_file_to_dedupe_table)}"
+         )
+         downloaded_data_file_record_count += len(data_file_to_dedupe_table)
+         data_file_table.append(data_file_to_dedupe_table)
+
+     final_data_to_dedupe = pa.concat_tables(data_file_table)
+
+     dedupe_input_record_count = downloaded_data_file_record_count
+     if remaining_data_table_after_convert:
+         dedupe_input_record_count += len(remaining_data_table_after_convert)
+     assert len(final_data_to_dedupe) == dedupe_input_record_count, (
+         f"Record count mismatch while performing table concat: got {len(final_data_to_dedupe)} records "
+         f"in the final table, but the input table length is {dedupe_input_record_count}"
+     )
+
+     logger.info(f"Length of pyarrow table to dedupe: {len(final_data_to_dedupe)}")
+
+     record_idx_iterator = iter(range(len(final_data_to_dedupe)))
+
+     # Append a global record index to use as the aggregation column
+     final_data_to_dedupe = sc.append_global_record_idx_column(
+         final_data_to_dedupe, record_idx_iterator
+     )
+
+     # Keep only the latest (max global record index) row for each identifier hash
+     final_data_table_indices = final_data_to_dedupe.group_by(
+         sc._IDENTIFIER_COLUMNS_HASH_COLUMN_NAME, use_threads=False
+     ).aggregate([(sc._GLOBAL_RECORD_IDX_COLUMN_NAME, "max")])
+
+     pos_delete_indices = pc.invert(
+         pc.is_in(
+             final_data_to_dedupe[sc._GLOBAL_RECORD_IDX_COLUMN_NAME],
+             value_set=final_data_table_indices[
+                 f"{sc._GLOBAL_RECORD_IDX_COLUMN_NAME}_max"
+             ],
+         )
+     )
+
+     final_data_table_to_delete = final_data_to_dedupe.filter(pos_delete_indices)
+
+     final_data_table_to_delete = final_data_table_to_delete.drop(
+         [sc._IDENTIFIER_COLUMNS_HASH_COLUMN_NAME, sc._GLOBAL_RECORD_IDX_COLUMN_NAME]
+     )
+     logger.info(
+         f"Deduped {len(final_data_table_to_delete)} records based on identifier columns."
+     )
+     return (
+         final_data_table_to_delete,
+         len(final_data_to_dedupe),
+         int(final_data_to_dedupe.nbytes),
+     )
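The group-by/max step above is the core of the dedupe: the newest copy of each identifier hash survives, and every other row becomes a positional delete candidate. A minimal standalone sketch of the same trick, using a hypothetical pk_hash column in place of the module's internal column names:

import pyarrow as pa
import pyarrow.compute as pc

# Rows are ordered oldest -> newest; duplicate pk_hash values are dupes.
table = pa.table({
    "pk_hash": ["a", "b", "a", "c", "b"],
    "global_idx": [0, 1, 2, 3, 4],
})
# Keep the max (newest) global_idx per key ...
latest = table.group_by("pk_hash", use_threads=False).aggregate(
    [("global_idx", "max")]
)
# ... and every row NOT in that set is a stale duplicate to delete.
to_delete = table.filter(
    pc.invert(pc.is_in(table["global_idx"], value_set=latest["global_idx_max"]))
)
print(to_delete.to_pydict())  # {'pk_hash': ['a', 'b'], 'global_idx': [0, 1]}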
deltacat/compute/converter/utils/__init__.py (file without changes)
deltacat/compute/converter/utils/convert_task_options.py
@@ -0,0 +1,132 @@
+ from typing import Optional, Dict, List, Tuple, Any
+ from deltacat.exceptions import RetryableError
+ from pyiceberg.manifest import DataFile
+ from deltacat.compute.converter.model.convert_input_files import ConvertInputFiles
+
+ AVERAGE_FILE_PATH_COLUMN_SIZE_BYTES = 160
+ AVERAGE_POS_COLUMN_SIZE_BYTES = 4
+ XXHASH_BYTE_PER_RECORD = 8
+ MEMORY_BUFFER_RATE = 2
+ # Worst case is 2x, reached when no duplicates exist across any primary key
+ PYARROW_AGGREGATE_MEMORY_MULTIPLIER = 2
+ # Observed base memory usage at the beginning of each worker process
+ BASE_MEMORY_BUFFER = 0.3 * 1024 * 1024 * 1024
+
+
+ def estimate_fixed_hash_columns(
+     hash_value_size_bytes_per_record: int, total_record_count: int
+ ) -> int:
+     return hash_value_size_bytes_per_record * total_record_count
+
+
+ def get_total_record_from_iceberg_files(
+     iceberg_files_list: List[Tuple[int, DataFile]]
+ ) -> int:
+     total_record_count = 0
+     # Files are given as (sequence_number, DataFile) tuples
+     total_record_count += sum(file[1].record_count for file in iceberg_files_list)
+     return total_record_count
+
+
+ def estimate_iceberg_pos_delete_additional_columns(
+     include_columns: List[str], num_of_record_count: int
+ ) -> int:
+     total_additional_columns_sizes = 0
+     if "file_path" in include_columns:
+         total_additional_columns_sizes += (
+             AVERAGE_FILE_PATH_COLUMN_SIZE_BYTES * num_of_record_count
+         )
+     elif "pos" in include_columns:
+         total_additional_columns_sizes += (
+             AVERAGE_POS_COLUMN_SIZE_BYTES * num_of_record_count
+         )
+     return total_additional_columns_sizes
+
+
+ def estimate_convert_remote_option_resources(
+     data_files: List[Tuple[int, DataFile]],
+     equality_delete_files: List[Tuple[int, DataFile]],
+ ) -> float:
+     data_file_record_count = get_total_record_from_iceberg_files(data_files)
+     equality_delete_record_count = get_total_record_from_iceberg_files(
+         equality_delete_files
+     )
+     hash_column_sizes = estimate_fixed_hash_columns(
+         XXHASH_BYTE_PER_RECORD, data_file_record_count + equality_delete_record_count
+     )
+     pos_delete_sizes = estimate_iceberg_pos_delete_additional_columns(
+         ["file_path", "pos"], data_file_record_count + equality_delete_record_count
+     )
+     total_memory_required = hash_column_sizes + pos_delete_sizes
+     return total_memory_required * MEMORY_BUFFER_RATE
+
+
+ def _get_task_options(
+     memory: float,
+     ray_custom_resources: Optional[Dict[str, Any]] = None,
+     scheduling_strategy: str = "SPREAD",
+ ) -> Dict[str, Any]:
+
+     # NOTE: With the DEFAULT scheduling strategy in Ray 2.20.0, the autoscaler
+     # does not spin up enough nodes fast enough, so we see only approximately
+     # 20 out of 100 queued tasks get scheduled. Hence, we use SPREAD, which is
+     # also ideal for merge and hash bucket tasks.
+     # https://docs.ray.io/en/latest/ray-core/scheduling/index.html
+     task_opts = {
+         "memory": memory,
+         "scheduling_strategy": scheduling_strategy,
+     }
+
+     if ray_custom_resources:
+         task_opts["resources"] = ray_custom_resources
+
+     task_opts["max_retries"] = 3
+     task_opts["num_cpus"] = 1
+     # Reserve a dedicated convert_task resource without clobbering any
+     # custom resources supplied above
+     task_opts["resources"] = {**task_opts.get("resources", {}), "convert_task": 1}
+     # The list of possible botocore exceptions is available at
+     # https://github.com/boto/botocore/blob/develop/botocore/exceptions.py
+     task_opts["retry_exceptions"] = [RetryableError]
+
+     return task_opts
+
+
+ def estimate_dedupe_memory(
+     all_data_files_for_dedupe: List[Tuple[int, DataFile]]
+ ) -> float:
+     dedupe_record_count = get_total_record_from_iceberg_files(all_data_files_for_dedupe)
+     produced_pos_memory_required = estimate_iceberg_pos_delete_additional_columns(
+         ["file_path", "pos"], dedupe_record_count
+     )
+     download_pk_memory_required = estimate_fixed_hash_columns(
+         XXHASH_BYTE_PER_RECORD, dedupe_record_count
+     )
+     memory_required_by_dedupe = (
+         produced_pos_memory_required + download_pk_memory_required
+     ) * PYARROW_AGGREGATE_MEMORY_MULTIPLIER
+     memory_with_buffer = memory_required_by_dedupe * MEMORY_BUFFER_RATE
+     return memory_with_buffer
+
+
+ def convert_resource_options_provider(
+     index: int, convert_input_files: ConvertInputFiles
+ ) -> Dict[str, Any]:
+     applicable_data_files = convert_input_files.applicable_data_files
+     applicable_equality_delete_files = (
+         convert_input_files.applicable_equality_delete_files
+     )
+     all_data_files_for_dedupe = convert_input_files.all_data_files_for_dedupe
+     total_memory_required = 0
+     total_memory_required += BASE_MEMORY_BUFFER
+     if applicable_data_files and applicable_equality_delete_files:
+         memory_requirement_for_convert_equality_deletes = (
+             estimate_convert_remote_option_resources(
+                 applicable_data_files, applicable_equality_delete_files
+             )
+         )
+         total_memory_required += memory_requirement_for_convert_equality_deletes
+     if all_data_files_for_dedupe:
+         memory_requirement_for_dedupe = estimate_dedupe_memory(
+             all_data_files_for_dedupe
+         )
+         total_memory_required += memory_requirement_for_dedupe
+     return _get_task_options(memory=total_memory_required)
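To make the sizing concrete, here is a worked example of estimate_dedupe_memory for a hypothetical bucket of 10 million records (the record count is illustrative; the constants are the ones defined above, and note that the if/elif in estimate_iceberg_pos_delete_additional_columns counts only the file_path column when both columns are requested):

records = 10_000_000  # hypothetical dedupe bucket size
produced_pos = 160 * records               # produced file_path column (if/elif skips pos)
download_pk = 8 * records                  # downloaded xxhash identifier column
dedupe = (produced_pos + download_pk) * 2  # PYARROW_AGGREGATE_MEMORY_MULTIPLIER
total = dedupe * 2                         # MEMORY_BUFFER_RATE
print(f"{total / 1024 ** 3:.2f} GiB")      # ~6.26 GiB of Ray task memory requested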
deltacat/compute/converter/utils/converter_session_utils.py
@@ -0,0 +1,175 @@
+ from collections import defaultdict
+ import logging
+ from deltacat import logs
+ from deltacat.compute.converter.model.convert_input_files import (
+     ConvertInputFiles,
+     DataFileList,
+     DataFileListGroup,
+ )
+ from typing import List, Dict, Tuple, Any
+ from enum import Enum
+ from pyiceberg.manifest import DataFile
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ def check_data_files_sequence_number(
+     data_files_list: DataFileList,
+     equality_delete_files_list: DataFileList,
+ ) -> Tuple[DataFileListGroup, DataFileListGroup]:
+     # Sort both lists by file sequence number
+     data_files_list.sort(key=lambda file_tuple: file_tuple[0])
+     equality_delete_files_list.sort(key=lambda file_tuple: file_tuple[0])
+     data_file_delete_applicable = []
+     result_eq_files_list = []
+
+     # Loop through each data file
+     for data_file_tuple in data_files_list:
+
+         # Find all equality delete files with a larger sequence number than
+         # the current data file; only those deletes apply to it. Since the
+         # list is sorted ascending, skip past the files with a smaller or
+         # equal sequence number and take the remainder.
+         eq_file_pointer = 0
+         while (
+             eq_file_pointer < len(equality_delete_files_list)
+             and equality_delete_files_list[eq_file_pointer][0] <= data_file_tuple[0]
+         ):
+             eq_file_pointer += 1
+         valid_values_eq = equality_delete_files_list[eq_file_pointer:]
+
+         if valid_values_eq:
+             # Record the data file along with its applicable equality delete files
+             data_file_delete_applicable.append(data_file_tuple)
+             result_eq_files_list.append(valid_values_eq)
+
+     # Group data files that share the same set of applicable equality delete files
+     res_data_file_list = []
+     res_equality_delete_file_list = []
+     merged_file_dict = defaultdict(list)
+     for data_file_sublist, eq_delete_sublist in zip(
+         data_file_delete_applicable, result_eq_files_list
+     ):
+         merged_file_dict[tuple(eq_delete_sublist)].append(data_file_sublist)
+     for eq_file_list, data_file_list in merged_file_dict.items():
+         res_data_file_list.append(list(set(data_file_list)))
+         res_equality_delete_file_list.append(list(set(eq_file_list)))
+
+     assert len(res_data_file_list) == len(res_equality_delete_file_list), (
+         f"Length of the applicable data files list ({len(res_data_file_list)}) "
+         f"should equal the length of the equality delete files list ({len(res_equality_delete_file_list)})"
+     )
+
+     return res_equality_delete_file_list, res_data_file_list
+
+
+ def construct_iceberg_table_prefix(
+     iceberg_warehouse_bucket_name: str, table_name: str, iceberg_namespace: str
+ ) -> str:
+     return f"{iceberg_warehouse_bucket_name}/{iceberg_namespace}/{table_name}/data"
+
+
+ def partition_value_record_to_partition_value_string(partition: Any) -> str:
+     # Extract the string representation of the partition value from Record[partition_value]
+     partition_value_str = repr(partition).split("[", 1)[1].split("]")[0]
+     return partition_value_str
+
+
+ def group_all_files_to_each_bucket(
+     data_file_dict: Dict[Any, DataFileList],
+     equality_delete_dict: Dict[Any, DataFileList],
+     pos_delete_dict: Dict[Any, DataFileList],
+ ) -> List[ConvertInputFiles]:
+     convert_input_files_for_all_buckets = []
+     files_for_each_bucket_for_deletes = defaultdict(tuple)
+     if equality_delete_dict:
+         for partition_value, equality_delete_file_list in equality_delete_dict.items():
+             if partition_value in data_file_dict:
+                 (
+                     result_equality_delete_file,
+                     result_data_file,
+                 ) = check_data_files_sequence_number(
+                     data_files_list=data_file_dict[partition_value],
+                     equality_delete_files_list=equality_delete_dict[partition_value],
+                 )
+                 files_for_each_bucket_for_deletes[partition_value] = (
+                     result_data_file,
+                     result_equality_delete_file,
+                     [],
+                 )
+
+     for partition_value, all_data_files_for_each_bucket in data_file_dict.items():
+         convert_input_file = ConvertInputFiles.of(
+             partition_value=partition_value,
+             all_data_files_for_dedupe=all_data_files_for_each_bucket,
+         )
+         if partition_value in files_for_each_bucket_for_deletes:
+             convert_input_file.applicable_data_files = (
+                 files_for_each_bucket_for_deletes[partition_value][0]
+             )
+             convert_input_file.applicable_equality_delete_files = (
+                 files_for_each_bucket_for_deletes[partition_value][1]
+             )
+         convert_input_files_for_all_buckets.append(convert_input_file)
+     return convert_input_files_for_all_buckets
+
+
+ def sort_data_files_maintaining_order(data_files: DataFileList) -> DataFileList:
+     """
+     Sort data files deterministically based on two criteria:
+     1. Sequence number: newly added files have a higher sequence number.
+     2. File path: if sequence numbers are equal, files are still returned in
+        a deterministic order since file paths are unique.
+     """
+     if data_files:
+         data_files = sorted(data_files, key=lambda f: (f[0], f[1].file_path))
+     return data_files
+
+
+ class SnapshotType(Enum):
+     """Enumeration of possible snapshot types."""
+
+     NONE = "none"
+     APPEND = "append"
+     REPLACE = "replace"
+     DELETE = "delete"
+
+
+ def _get_snapshot_action_description(
+     snapshot_type: SnapshotType,
+     files_to_delete: List[List[DataFile]],
+     files_to_add: List[DataFile],
+ ) -> str:
+     """Get a human-readable description of the snapshot action."""
+     descriptions = {
+         SnapshotType.NONE: "No changes needed",
+         SnapshotType.APPEND: f"Adding {len(files_to_add)} new files",
+         SnapshotType.REPLACE: f"Replacing {sum(len(files) for files in files_to_delete)} files with {len(files_to_add)} new files",
+         SnapshotType.DELETE: f"Deleting {sum(len(files) for files in files_to_delete)} files",
+     }
+     return descriptions[snapshot_type]
+
+
+ def _determine_snapshot_type(
+     to_be_deleted_files: List[List[DataFile]], to_be_added_files: List[DataFile]
+ ) -> SnapshotType:
+     """
+     Determine the snapshot type based on file changes.
+
+     Args:
+         to_be_deleted_files: List of files to be deleted
+         to_be_added_files: List of files to be added
+
+     Returns:
+         SnapshotType indicating what kind of snapshot to commit
+     """
+     has_files_to_delete = bool(to_be_deleted_files)
+     has_files_to_add = bool(to_be_added_files)
+
+     if not has_files_to_delete and not has_files_to_add:
+         return SnapshotType.NONE
+     elif not has_files_to_delete and has_files_to_add:
+         return SnapshotType.APPEND
+     elif has_files_to_delete and has_files_to_add:
+         return SnapshotType.REPLACE
+     else:  # has_files_to_delete and not has_files_to_add
+         return SnapshotType.DELETE
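The sequence-number check above encodes the Iceberg rule that an equality delete only applies to data files committed earlier, i.e. with a strictly smaller sequence number. A toy run of that rule, using bare (sequence_number, name) tuples in place of real DataFile objects:

# Toy stand-ins for (sequence_number, DataFile) tuples
data_files = [(1, "data-a"), (3, "data-b")]
eq_deletes = [(2, "del-x"), (4, "del-y")]

# Deletes with seq 2 and 4 both apply to data-a (seq 1);
# only the delete with seq 4 applies to data-b (seq 3).
for seq, name in data_files:
    applicable = [d for d in eq_deletes if d[0] > seq]
    print(name, "->", [d[1] for d in applicable])
# data-a -> ['del-x', 'del-y']
# data-b -> ['del-y']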
deltacat/compute/converter/utils/iceberg_columns.py
@@ -0,0 +1,87 @@
+ import pyarrow as pa
+ from typing import Union, Iterator, Any
+ import numpy as np
+
+ # Refer to: https://iceberg.apache.org/spec/#reserved-field-ids for reserved field ids
+ ICEBERG_RESERVED_FIELD_ID_FOR_FILE_PATH_COLUMN = 2147483546
+
+ # Refer to: https://iceberg.apache.org/spec/#reserved-field-ids for reserved field ids
+ ICEBERG_RESERVED_FIELD_ID_FOR_POS_COLUMN = 2147483545
+
+
+ def _get_iceberg_col_name(suffix: str) -> str:
+     return suffix
+
+
+ _ORDERED_RECORD_IDX_COLUMN_NAME = _get_iceberg_col_name("pos")
+ _ORDERED_RECORD_IDX_COLUMN_TYPE = pa.int64()
+ _ORDERED_RECORD_IDX_FIELD_METADATA = {
+     b"PARQUET:field_id": f"{ICEBERG_RESERVED_FIELD_ID_FOR_POS_COLUMN}"
+ }
+ _ORDERED_RECORD_IDX_COLUMN_FIELD = pa.field(
+     _ORDERED_RECORD_IDX_COLUMN_NAME,
+     _ORDERED_RECORD_IDX_COLUMN_TYPE,
+     metadata=_ORDERED_RECORD_IDX_FIELD_METADATA,
+     nullable=False,
+ )
+
+
+ def get_record_index_column_array(obj: Any) -> Union[pa.Array, pa.ChunkedArray]:
+     return pa.array(
+         obj,
+         _ORDERED_RECORD_IDX_COLUMN_TYPE,
+     )
+
+
+ def append_record_idx_col(
+     table: pa.Table, ordered_record_indices: Iterator[int]
+ ) -> pa.Table:
+     table = table.append_column(
+         _ORDERED_RECORD_IDX_COLUMN_FIELD,
+         get_record_index_column_array(ordered_record_indices),
+     )
+     return table
+
+
+ _FILE_PATH_COLUMN_NAME = _get_iceberg_col_name("file_path")
+ _FILE_PATH_COLUMN_TYPE = pa.string()
+ _FILE_PATH_FIELD_METADATA = {
+     b"PARQUET:field_id": f"{ICEBERG_RESERVED_FIELD_ID_FOR_FILE_PATH_COLUMN}"
+ }
+ _FILE_PATH_COLUMN_FIELD = pa.field(
+     _FILE_PATH_COLUMN_NAME,
+     _FILE_PATH_COLUMN_TYPE,
+     metadata=_FILE_PATH_FIELD_METADATA,
+     nullable=False,
+ )
+
+
+ def append_file_path_column(table: pa.Table, file_path: str) -> pa.Table:
+     table = table.append_column(
+         _FILE_PATH_COLUMN_FIELD,
+         pa.array(np.repeat(file_path, len(table)), _FILE_PATH_COLUMN_TYPE),
+     )
+     return table
+
+
+ _GLOBAL_RECORD_IDX_COLUMN_NAME = _get_iceberg_col_name("global_record_index")
+ _GLOBAL_RECORD_IDX_COLUMN_TYPE = pa.int64()
+ _GLOBAL_RECORD_IDX_COLUMN_FIELD = pa.field(
+     _GLOBAL_RECORD_IDX_COLUMN_NAME,
+     _GLOBAL_RECORD_IDX_COLUMN_TYPE,
+ )
+
+
+ def append_global_record_idx_column(
+     table: pa.Table, ordered_record_indices: Iterator[int]
+ ) -> pa.Table:
+     table = table.append_column(
+         _GLOBAL_RECORD_IDX_COLUMN_NAME,
+         pa.array(ordered_record_indices, _GLOBAL_RECORD_IDX_COLUMN_TYPE),
+     )
+     return table
+
+
+ _IDENTIFIER_COLUMNS_HASH_COLUMN_NAME = _get_iceberg_col_name(
+     "identifier_columns_hashed"
+ )
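These helpers exist so the appended columns land in Parquet with Iceberg's reserved field ids (2147483546 for file_path, 2147483545 for pos), matching the positional delete file schema. A small sketch of how a pos-delete-shaped table could be assembled with them, assuming the module is importable at the path shown in its imports above:

import pyarrow as pa
import deltacat.compute.converter.utils.iceberg_columns as sc

# Start from a bare table of identifier values ...
table = pa.table({"id": [10, 11, 12]})
# ... then tag every row with its source file and its position within that file.
table = sc.append_file_path_column(table, "s3://bucket/data/00000.parquet")
table = sc.append_record_idx_col(table, iter(range(len(table))))
print(table.schema)  # id, file_path (field id 2147483546), pos (field id 2147483545)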
deltacat/compute/converter/utils/io.py
@@ -0,0 +1,203 @@
+ import logging
+
+ from fsspec import AbstractFileSystem
+ from deltacat import logs
+ import deltacat.compute.converter.utils.iceberg_columns as sc
+ import daft
+ from deltacat.utils.daft import _get_s3_io_config
+ from daft import TimeUnit, DataFrame
+ import pyarrow as pa
+ import pyarrow.fs  # load the fs submodule so the pa.fs references below resolve
+ from typing import Callable, Optional, List, Dict, Any
+ from deltacat.utils.pyarrow import sliced_string_cast
+ from deltacat.compute.converter.constants import IDENTIFIER_FIELD_DELIMITER
+ from deltacat.compute.converter.utils.s3u import upload_table_with_retry
+ from pyiceberg.manifest import DataFile
+ import pyarrow.compute as pc
+ from deltacat.types.media import ContentType
+ from deltacat.types.tables import (
+     get_table_writer,
+     get_table_slicer,
+     write_sliced_table as types_write_sliced_table,
+ )
+ from deltacat.storage import LocalTable, DistributedDataset
+ from typing import Union
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ def download_data_table_and_append_iceberg_columns(
+     file: DataFile,
+     columns_to_download: List[str],
+     additional_columns_to_append: Optional[List[str]] = [],
+     s3_client_kwargs: Optional[Dict[str, Any]] = None,
+ ) -> pa.Table:
+     table = download_parquet_with_daft_hash_applied(
+         identifier_columns=columns_to_download,
+         file=file,
+         s3_client_kwargs=s3_client_kwargs,
+     )
+
+     if sc._FILE_PATH_COLUMN_NAME in additional_columns_to_append:
+         table = sc.append_file_path_column(table, file.file_path)
+     if sc._ORDERED_RECORD_IDX_COLUMN_NAME in additional_columns_to_append:
+         record_idx_iterator = iter(range(len(table)))
+         table = sc.append_record_idx_col(table, record_idx_iterator)
+
+     return table
+
+
+ def download_parquet_with_daft_hash_applied(
+     identifier_columns: List[str],
+     file: DataFile,
+     s3_client_kwargs: Optional[Dict[str, Any]],
+     **kwargs: Any,
+ ) -> pa.Table:
+
+     # TODO: Add correct read kwargs as in:
+     # https://github.com/ray-project/deltacat/blob/383855a4044e4dfe03cf36d7738359d512a517b4/deltacat/utils/daft.py#L97
+
+     coerce_int96_timestamp_unit = TimeUnit.from_str(
+         kwargs.get("coerce_int96_timestamp_unit", "ms")
+     )
+
+     # TODO: Use a Daft SHA1 hash instead to minimize the probability of data corruption
+     io_config = _get_s3_io_config(s3_client_kwargs=s3_client_kwargs)
+     df = daft_read_parquet(
+         path=file.file_path,
+         io_config=io_config,
+         coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
+     )
+
+     hash_column = concatenate_hashed_identifier_columns(
+         df=df, identifier_columns=identifier_columns
+     )
+
+     table = pa.Table.from_arrays(
+         [hash_column], names=[sc._IDENTIFIER_COLUMNS_HASH_COLUMN_NAME]
+     )
+
+     return table
+
+
+ def daft_read_parquet(
+     path: str, io_config: Dict[str, Any], coerce_int96_timestamp_unit: TimeUnit
+ ) -> DataFrame:
+     df = daft.read_parquet(
+         path=path,
+         io_config=io_config,
+         coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
+     )
+     return df
+
+
+ def concatenate_hashed_identifier_columns(
+     df: DataFrame, identifier_columns: List[str]
+ ) -> pa.Array:
+     pk_hash_columns = []
+     previous_hash_column_length = None
+     for i in range(len(identifier_columns)):
+         pk_hash_column = df.select(daft.col(identifier_columns[i]).hash())
+         pk_hash_column_arrow = pk_hash_column.to_arrow()
+
+         # Assert that each downloaded hash column has the same length to ensure
+         # we don't create a mismatch between columns.
+         if not previous_hash_column_length:
+             previous_hash_column_length = len(pk_hash_column_arrow)
+         else:
+             assert previous_hash_column_length == len(pk_hash_column_arrow), (
+                 f"Identifier column length mismatch: {identifier_columns[i]} has length {len(pk_hash_column_arrow)} "
+                 f"but expected {previous_hash_column_length}."
+             )
+             previous_hash_column_length = len(pk_hash_column_arrow)
+
+         # Convert identifiers of different datatypes to strings here
+         pk_hash_column_str = sliced_string_cast(
+             pk_hash_column_arrow[identifier_columns[i]]
+         )
+         assert len(pk_hash_column_str) == previous_hash_column_length, (
+             f"Casting column length mismatch: {identifier_columns[i]} has length {len(pk_hash_column_str)} after casting, "
+             f"length before casting: {previous_hash_column_length}."
+         )
+
+         pk_hash_columns.append(pk_hash_column_str)
+
+     pk_hash_columns.append(IDENTIFIER_FIELD_DELIMITER)
+     pk_hash_columns_concatenated = pc.binary_join_element_wise(
+         *pk_hash_columns, null_handling="replace"
+     )
+     assert len(pk_hash_columns_concatenated) == previous_hash_column_length, (
+         f"Concatenated column length mismatch: the final concatenated identifier column has length {len(pk_hash_columns_concatenated)}, "
+         f"length before concatenating: {previous_hash_column_length}."
+     )
+
+     return pk_hash_columns_concatenated
+
+
+ def write_sliced_table(
+     table: Union[LocalTable, DistributedDataset],
+     base_path: str,
+     table_writer_kwargs: Optional[Dict[str, Any]],
+     content_type: ContentType = ContentType.PARQUET,
+     max_records_per_file: Optional[int] = 4000000,
+     filesystem: Optional[Union[AbstractFileSystem, pa.fs.FileSystem]] = None,
+     **kwargs,
+ ) -> List[str]:
+     """
+     Write the given table to one or more files and return the paths
+     of the files written.
+     """
+     if isinstance(filesystem, pa.fs.FileSystem):
+         table_writer_fn = get_table_writer(table)
+         table_slicer_fn = get_table_slicer(table)
+
+         # Create a wrapper around the table writer that ensures directory creation
+         def table_writer_with_dir_creation(
+             dataframe: Any,
+             base_path: str,
+             filesystem: Optional[Union[AbstractFileSystem, pa.fs.FileSystem]],
+             block_path_provider: Callable,
+             content_type: str = ContentType.PARQUET.value,
+             **kwargs,
+         ):
+             try:
+                 # Ensure the base path directory exists
+                 if isinstance(base_path, str):
+                     # Normalize the base path and ensure it's treated as a directory path
+                     base_dir = base_path.rstrip("/")
+                     filesystem.create_dir(base_dir, recursive=True)
+             except Exception:
+                 # The directory might already exist or there might be permission
+                 # issues; let the original write attempt proceed
+                 pass
+             return table_writer_fn(
+                 dataframe,
+                 base_path,
+                 filesystem,
+                 block_path_provider,
+                 content_type,
+                 **kwargs,
+             )
+
+         # TODO(pdames): Disable the redundant file info fetch currently
+         # used to construct unused manifest entry metadata.
+         manifest_entry_list = types_write_sliced_table(
+             table=table,
+             base_path=base_path,
+             filesystem=filesystem,
+             max_records_per_entry=max_records_per_file,
+             table_writer_fn=table_writer_with_dir_creation,
+             table_slicer_fn=table_slicer_fn,
+             table_writer_kwargs=table_writer_kwargs,
+             content_type=content_type,
+         )
+         paths = [entry.uri for entry in manifest_entry_list]
+         return paths
+     else:
+         return upload_table_with_retry(
+             table=table,
+             s3_url_prefix=base_path,
+             s3_table_writer_kwargs=table_writer_kwargs,
+             content_type=content_type,
+             max_records_per_file=max_records_per_file,
+             s3_file_system=filesystem,
+             **kwargs,
+         )
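The per-column hashes above are concatenated element-wise into a single identifier key; the delimiter appended as the last positional argument to pc.binary_join_element_wise acts as the separator. A standalone illustration of that join, with a hypothetical "|" standing in for IDENTIFIER_FIELD_DELIMITER:

import pyarrow as pa
import pyarrow.compute as pc

# Per-column hash values cast to strings, as in concatenate_hashed_identifier_columns
col_a = pa.array(["h1", "h2", None])
col_b = pa.array(["x9", "y7", "z5"])

# The final argument is the separator; null_handling="replace" substitutes null
# components with null_replacement instead of nulling out the whole row.
joined = pc.binary_join_element_wise(
    col_a, col_b, "|", null_handling="replace", null_replacement=""
)
print(joined.to_pylist())  # ['h1|x9', 'h2|y7', '|z5']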