deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367)
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -5,13 +5,15 @@ import ray
5
5
  import time
6
6
  import json
7
7
  from math import ceil
8
+ from urllib.parse import urlparse
9
+ import pyarrow
8
10
 
9
11
  from deltacat.compute.compactor import (
10
12
  PyArrowWriteResult,
11
13
  HighWatermark,
12
14
  RoundCompletionInfo,
13
15
  )
14
- from deltacat.aws import s3u as s3_utils
16
+ from deltacat.utils.filesystem import resolve_path_and_filesystem
15
17
  from deltacat.compute.compactor.model.compactor_version import CompactorVersion
16
18
  from deltacat.compute.compactor_v2.model.evaluate_compaction_result import (
17
19
  ExecutionCompactionResult,
@@ -32,7 +34,7 @@ from deltacat.compute.compactor_v2.utils.merge import (
32
34
  from deltacat.compute.compactor_v2.utils.task_options import (
33
35
  hash_bucket_resource_options_provider,
34
36
  )
35
- from deltacat.compute.compactor.utils import round_completion_file as rcf
37
+ from deltacat.compute.compactor.utils import round_completion_reader as rci
36
38
  from deltacat.compute.compactor import DeltaAnnotated
37
39
  from deltacat.compute.compactor_v2.utils.delta import contains_delete_deltas
38
40
  from deltacat.compute.compactor_v2.deletes.delete_strategy import (
@@ -48,6 +50,7 @@ from deltacat.storage import (
48
50
  DeltaType,
49
51
  DeltaLocator,
50
52
  Partition,
53
+ PartitionLocator,
51
54
  Manifest,
52
55
  Stream,
53
56
  StreamLocator,
@@ -63,7 +66,7 @@ from deltacat.compute.compactor_v2.steps import merge as mg
63
66
  from deltacat.compute.compactor_v2.steps import hash_bucket as hb
64
67
  from deltacat.compute.compactor_v2.utils import io
65
68
 
66
- from typing import List, Optional
69
+ from typing import List, Optional, Union
67
70
  from collections import defaultdict
68
71
  from deltacat.compute.compactor.model.compaction_session_audit_info import (
69
72
  CompactionSessionAuditInfo,
@@ -77,21 +80,39 @@ from deltacat.compute.compactor_v2.utils.task_options import (
77
80
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
78
81
 
79
82
 
83
+ def _get_rci_source_partition_locator(
84
+ params: CompactPartitionParams,
85
+ ) -> PartitionLocator:
86
+ return params.rebase_source_partition_locator or params.source_partition_locator
87
+
88
+
89
+ def _is_inplace_compacted(
90
+ rci_source_partition_locator: PartitionLocator,
91
+ destination_partition_locator: PartitionLocator,
92
+ ) -> bool:
93
+ return (
94
+ rci_source_partition_locator.partition_values
95
+ == destination_partition_locator.partition_values
96
+ and rci_source_partition_locator.stream_id
97
+ == destination_partition_locator.stream_id
98
+ )
99
+
100
+
80
101
  def _fetch_compaction_metadata(
81
102
  params: CompactPartitionParams,
82
103
  ) -> tuple[Optional[Manifest], Optional[RoundCompletionInfo]]:
83
104
 
84
105
  # read the results from any previously completed compaction round
85
106
  round_completion_info: Optional[RoundCompletionInfo] = None
86
- high_watermark: Optional[HighWatermark] = None
107
+ high_watermark: Optional[Union[HighWatermark, int]] = None
87
108
  previous_compacted_delta_manifest: Optional[Manifest] = None
88
109
 
89
110
  if not params.rebase_source_partition_locator:
90
- round_completion_info = rcf.read_round_completion_file(
91
- params.compaction_artifact_s3_bucket,
92
- params.source_partition_locator,
93
- params.destination_partition_locator,
94
- **params.s3_client_kwargs,
111
+ round_completion_info = rci.read_round_completion_info(
112
+ source_partition_locator=params.source_partition_locator,
113
+ destination_partition_locator=params.destination_partition_locator,
114
+ deltacat_storage=params.deltacat_storage,
115
+ deltacat_storage_kwargs=params.deltacat_storage_kwargs,
95
116
  )
96
117
  if not round_completion_info:
97
118
  logger.info(
@@ -111,10 +132,10 @@ def _fetch_compaction_metadata(
111
132
  assert (
112
133
  params.hash_bucket_count == round_completion_info.hash_bucket_count
113
134
  ), (
114
- "The hash bucket count has changed. "
115
- "Kindly run rebase compaction and trigger incremental again. "
116
- f"Hash Bucket count in RCF={round_completion_info.hash_bucket_count} "
117
- f"not equal to Hash bucket count in args={params.hash_bucket_count}."
135
+ "Partition hash bucket count for compaction has changed. "
136
+ "Rebase compaction with the desired hash bucket count before running another incremental compaction. "
137
+ f"Hash bucket count in RCI={round_completion_info.hash_bucket_count} "
138
+ f"!= hash bucket count in params={params.hash_bucket_count}."
118
139
  )
119
140
 
120
141
  logger.info(f"Round completion file: {round_completion_info}")
@@ -129,7 +150,7 @@ def _build_uniform_deltas(
129
150
  mutable_compaction_audit: CompactionSessionAuditInfo,
130
151
  input_deltas: List[Delta],
131
152
  delta_discovery_start: float,
132
- ) -> tuple[List[DeltaAnnotated], DeleteStrategy, List[DeleteFileEnvelope], Partition]:
153
+ ) -> tuple[List[DeltaAnnotated], DeleteStrategy, List[DeleteFileEnvelope]]:
133
154
 
134
155
  delete_strategy: Optional[DeleteStrategy] = None
135
156
  delete_file_envelopes: Optional[List[DeleteFileEnvelope]] = None
@@ -149,6 +170,7 @@ def _build_uniform_deltas(
149
170
  hash_bucket_count=params.hash_bucket_count,
150
171
  compaction_audit=mutable_compaction_audit,
151
172
  compact_partition_params=params,
173
+ all_column_names=params.all_column_names,
152
174
  deltacat_storage=params.deltacat_storage,
153
175
  deltacat_storage_kwargs=params.deltacat_storage_kwargs,
154
176
  )
@@ -159,10 +181,9 @@ def _build_uniform_deltas(
159
181
  delta_discovery_end - delta_discovery_start
160
182
  )
161
183
 
162
- s3_utils.upload(
163
- mutable_compaction_audit.audit_url,
164
- str(json.dumps(mutable_compaction_audit)),
165
- **params.s3_client_kwargs,
184
+ _upload_audit_data(
185
+ params,
186
+ mutable_compaction_audit,
166
187
  )
167
188
 
168
189
  return (
@@ -222,7 +243,7 @@ def _run_hash_and_merge(
222
243
  uniform_deltas: List[DeltaAnnotated],
223
244
  round_completion_info: RoundCompletionInfo,
224
245
  delete_strategy: Optional[DeleteStrategy],
225
- delete_file_envelopes: Optional[DeleteFileEnvelope],
246
+ delete_file_envelopes: Optional[List[DeleteFileEnvelope]],
226
247
  mutable_compaction_audit: CompactionSessionAuditInfo,
227
248
  previous_compacted_delta_manifest: Optional[Manifest],
228
249
  compacted_partition: Partition,
@@ -267,10 +288,9 @@ def _run_hash_and_merge(
267
288
  hb_end - hb_start,
268
289
  )
269
290
 
270
- s3_utils.upload(
271
- mutable_compaction_audit.audit_url,
272
- str(json.dumps(mutable_compaction_audit)),
273
- **params.s3_client_kwargs,
291
+ _upload_audit_data(
292
+ params,
293
+ mutable_compaction_audit,
274
294
  )
275
295
 
276
296
  hb_data_processed_size_bytes = np.int64(0)
@@ -389,7 +409,7 @@ def _merge(
389
409
  all_hash_group_idx_to_obj_id: dict,
390
410
  compacted_partition: Partition,
391
411
  delete_strategy: DeleteStrategy,
392
- delete_file_envelopes: DeleteFileEnvelope,
412
+ delete_file_envelopes: List[DeleteFileEnvelope],
393
413
  ) -> tuple[List[MergeResult], float]:
394
414
  merge_options_provider = functools.partial(
395
415
  task_resource_options_provider,
@@ -402,13 +422,24 @@ def _merge(
402
422
  round_completion_info=round_completion_info,
403
423
  compacted_delta_manifest=previous_compacted_delta_manifest,
404
424
  primary_keys=params.primary_keys,
405
- deltacat_storage=params.deltacat_storage,
406
- deltacat_storage_kwargs=params.deltacat_storage_kwargs,
407
425
  ray_custom_resources=params.ray_custom_resources,
408
426
  memory_logs_enabled=params.memory_logs_enabled,
409
427
  estimate_resources_params=params.estimate_resources_params,
410
428
  )
411
429
 
430
+ # set previous compacted delta manifest on input so that we don't need a transaction to retrieve it
431
+ if round_completion_info:
432
+ previous_compacted_delta_manifest = params.deltacat_storage.get_delta_manifest(
433
+ round_completion_info.compacted_delta_locator,
434
+ **params.deltacat_storage_kwargs,
435
+ )
436
+
437
+ # create a copy of deltacat storage kwargs without any parent transaction context
438
+ # (can't be serialized by Ray, and we're only downloading already-resolved manifest entries)
439
+ deltacat_storage_kwargs_copy = {
440
+ k: v for k, v in params.deltacat_storage_kwargs.items() if k != "transaction"
441
+ }
442
+
412
443
  def merge_input_provider(index, item) -> dict[str, MergeInput]:
413
444
  return {
414
445
  "input": MergeInput.of(
@@ -422,23 +453,26 @@ def _merge(
422
453
  write_to_partition=compacted_partition,
423
454
  compacted_file_content_type=params.compacted_file_content_type,
424
455
  primary_keys=params.primary_keys,
456
+ all_column_names=params.all_column_names,
425
457
  sort_keys=params.sort_keys,
426
458
  merge_task_index=index,
427
459
  drop_duplicates=params.drop_duplicates,
428
460
  max_records_per_output_file=params.records_per_compacted_file,
429
461
  enable_profiler=params.enable_profiler,
430
462
  metrics_config=params.metrics_config,
431
- s3_table_writer_kwargs=params.s3_table_writer_kwargs,
463
+ table_writer_kwargs=params.table_writer_kwargs,
432
464
  read_kwargs_provider=params.read_kwargs_provider,
433
465
  round_completion_info=round_completion_info,
434
466
  object_store=params.object_store,
435
467
  deltacat_storage=params.deltacat_storage,
436
- deltacat_storage_kwargs=params.deltacat_storage_kwargs,
468
+ deltacat_storage_kwargs=deltacat_storage_kwargs_copy,
437
469
  delete_strategy=delete_strategy,
438
470
  delete_file_envelopes=delete_file_envelopes,
439
471
  memory_logs_enabled=params.memory_logs_enabled,
440
472
  disable_copy_by_reference=params.disable_copy_by_reference,
441
473
  hash_bucket_count=params.hash_bucket_count,
474
+ original_fields=params.original_fields,
475
+ compacted_manifest=previous_compacted_delta_manifest,
442
476
  )
443
477
  }
444
478
 
@@ -474,6 +508,12 @@ def _hash_bucket(
474
508
  estimate_resources_params=params.estimate_resources_params,
475
509
  )
476
510
 
511
+ # create a copy of deltacat storage kwargs without any parent transaction context
512
+ # (can't be serialized by Ray, and we're only downloading already-resolved manifest entries)
513
+ deltacat_storage_kwargs_copy = {
514
+ k: v for k, v in params.deltacat_storage_kwargs.items() if k != "transaction"
515
+ }
516
+
477
517
  def hash_bucket_input_provider(index, item) -> dict[str, HashBucketInput]:
478
518
  return {
479
519
  "input": HashBucketInput.of(
@@ -482,12 +522,13 @@ def _hash_bucket(
482
522
  hb_task_index=index,
483
523
  num_hash_buckets=params.hash_bucket_count,
484
524
  num_hash_groups=params.hash_group_count,
525
+ all_column_names=params.all_column_names,
485
526
  enable_profiler=params.enable_profiler,
486
527
  metrics_config=params.metrics_config,
487
528
  read_kwargs_provider=params.read_kwargs_provider,
488
529
  object_store=params.object_store,
489
530
  deltacat_storage=params.deltacat_storage,
490
- deltacat_storage_kwargs=params.deltacat_storage_kwargs,
531
+ deltacat_storage_kwargs=deltacat_storage_kwargs_copy,
491
532
  memory_logs_enabled=params.memory_logs_enabled,
492
533
  )
493
534
  }
@@ -596,10 +637,9 @@ def _process_merge_results(
596
637
  file_index += mat_result.pyarrow_write_result.files
597
638
  previous_task_index = mat_result.task_index
598
639
 
599
- s3_utils.upload(
600
- mutable_compaction_audit.audit_url,
601
- str(json.dumps(mutable_compaction_audit)),
602
- **params.s3_client_kwargs,
640
+ _upload_audit_data(
641
+ params,
642
+ mutable_compaction_audit,
603
643
  )
604
644
  deltas: List[Delta] = [m.delta for m in mat_results]
605
645
  # Note: An appropriate last stream position must be set
@@ -634,21 +674,20 @@ def _update_and_upload_compaction_audit(
634
674
  + round_completion_info.compacted_pyarrow_write_result.records
635
675
  )
636
676
 
637
- s3_utils.upload(
638
- mutable_compaction_audit.audit_url,
639
- str(json.dumps(mutable_compaction_audit)),
640
- **params.s3_client_kwargs,
677
+ _upload_audit_data(
678
+ params,
679
+ mutable_compaction_audit,
641
680
  )
642
681
  return
643
682
 
644
683
 
645
- def _write_new_round_completion_file(
684
+ def _create_round_completion_info(
646
685
  params: CompactPartitionParams,
647
686
  mutable_compaction_audit: CompactionSessionAuditInfo,
648
687
  compacted_partition: Partition,
649
688
  audit_url: str,
650
689
  hb_id_to_entry_indices_range: dict,
651
- rcf_source_partition_locator: rcf.PartitionLocator,
690
+ rci_source_partition_locator: PartitionLocator,
652
691
  new_compacted_delta_locator: DeltaLocator,
653
692
  pyarrow_write_result: PyArrowWriteResult,
654
693
  prev_round_completion_info: Optional[RoundCompletionInfo] = None,
@@ -690,6 +729,27 @@ def _write_new_round_completion_file(
690
729
  prev_round_completion_info,
691
730
  )
692
731
 
732
+ # Check if this is an in-place compaction before creating RoundCompletionInfo
733
+ logger.info(
734
+ f"Checking if partition {rci_source_partition_locator} is inplace compacted against {params.destination_partition_locator}..."
735
+ )
736
+ is_inplace_compacted: bool = _is_inplace_compacted(
737
+ rci_source_partition_locator, params.destination_partition_locator
738
+ )
739
+
740
+ # Determine the prev_source_partition_locator based on compaction type
741
+ if is_inplace_compacted:
742
+ logger.info(
743
+ "In-place compaction detected. Using compacted partition locator as prev_source_partition_locator. "
744
+ + f"Got compacted partition partition_id of {compacted_partition.locator.partition_id} "
745
+ f"and rci source partition_id of {rci_source_partition_locator.partition_id}."
746
+ )
747
+ prev_source_partition_locator = compacted_partition.locator
748
+ # Update rci_source_partition_locator for backward compatibility
749
+ rci_source_partition_locator = compacted_partition.locator
750
+ else:
751
+ prev_source_partition_locator = rci_source_partition_locator
752
+
693
753
  new_round_completion_info = RoundCompletionInfo.of(
694
754
  high_watermark=params.last_stream_position_to_compact,
695
755
  compacted_delta_locator=new_compacted_delta_locator,
@@ -702,41 +762,17 @@ def _write_new_round_completion_file(
702
762
  compactor_version=CompactorVersion.V2.value,
703
763
  input_inflation=input_inflation,
704
764
  input_average_record_size_bytes=input_average_record_size_bytes,
765
+ prev_source_partition_locator=prev_source_partition_locator,
705
766
  )
706
767
 
707
768
  logger.info(
708
769
  f"Partition-{params.source_partition_locator.partition_values},"
709
770
  f"compacted at: {params.last_stream_position_to_compact},"
710
771
  )
711
- logger.info(
712
- f"Checking if partition {rcf_source_partition_locator} is inplace compacted against {params.destination_partition_locator}..."
713
- )
714
- is_inplace_compacted: bool = (
715
- rcf_source_partition_locator.partition_values
716
- == params.destination_partition_locator.partition_values
717
- and rcf_source_partition_locator.stream_id
718
- == params.destination_partition_locator.stream_id
719
- )
720
- if is_inplace_compacted:
721
- logger.info(
722
- "Overriding round completion file source partition locator as in-place compacted. "
723
- + f"Got compacted partition partition_id of {compacted_partition.locator.partition_id} "
724
- f"and rcf source partition_id of {rcf_source_partition_locator.partition_id}."
725
- )
726
- rcf_source_partition_locator = compacted_partition.locator
727
-
728
- round_completion_file_s3_url = rcf.write_round_completion_file(
729
- params.compaction_artifact_s3_bucket,
730
- rcf_source_partition_locator,
731
- compacted_partition.locator,
732
- new_round_completion_info,
733
- **params.s3_client_kwargs,
734
- )
735
772
 
736
773
  return ExecutionCompactionResult(
737
774
  compacted_partition,
738
775
  new_round_completion_info,
739
- round_completion_file_s3_url,
740
776
  is_inplace_compacted,
741
777
  )
742
778
 
@@ -752,21 +788,29 @@ def _commit_compaction_result(
752
788
  f"Partition-{params.source_partition_locator} -> "
753
789
  f"{compaction_session_type} Compaction session data processing completed"
754
790
  )
791
+ # TODO(pdames): Uncomment this once we support concurrent writes to the same
792
+ # partition (via write_to_table). This requires updating the commit_partition
793
+ # method to support previous partition as input. Right now, a concurrent write
794
+ # to the same partition will cause the commit_partition method to fail.
755
795
  if execute_compaction_result.new_compacted_partition:
756
796
  previous_partition: Optional[Partition] = None
757
- if execute_compaction_result.is_inplace_compacted:
758
- previous_partition: Optional[
759
- Partition
760
- ] = params.deltacat_storage.get_partition(
761
- params.source_partition_locator.stream_locator,
762
- params.source_partition_locator.partition_values,
763
- **params.deltacat_storage_kwargs,
764
- )
765
- # NOTE: Retrieving the previous partition again as the partition_id may have changed by the time commit_partition is called.
797
+ # if execute_compaction_result.is_inplace_compacted:
798
+ # previous_partition: Optional[
799
+ # Partition
800
+ # ] = params.deltacat_storage.get_partition(
801
+ # params.source_partition_locator.stream_locator,
802
+ # params.source_partition_locator.partition_values,
803
+ # **params.deltacat_storage_kwargs,
804
+ # )
805
+ # # NOTE: Retrieving the previous partition again as the partition_id may have changed by the time commit_partition is called.
766
806
  logger.info(
767
807
  f"Committing compacted partition to: {execute_compaction_result.new_compacted_partition.locator} "
768
808
  f"using previous partition: {previous_partition.locator if previous_partition else None}"
769
809
  )
810
+ # Set the round completion info on the partition before committing
811
+ execute_compaction_result.new_compacted_partition.compaction_round_completion_info = (
812
+ execute_compaction_result.new_round_completion_info
813
+ )
770
814
  committed_partition: Partition = params.deltacat_storage.commit_partition(
771
815
  execute_compaction_result.new_compacted_partition,
772
816
  previous_partition,
@@ -777,3 +821,57 @@ def _commit_compaction_result(
777
821
  logger.warning("No new partition was committed during compaction.")
778
822
 
779
823
  logger.info(f"Completed compaction session for: {params.source_partition_locator}")
824
+
825
+
826
def _upload_audit_data(
    params: CompactPartitionParams,
    audit_info: CompactionSessionAuditInfo,
) -> None:
    """
    Serialize the compaction audit info to JSON and write it to its audit URL.

    The catalog's filesystem is used when the catalog provides one; otherwise
    the path and filesystem are resolved from the audit URL itself. The parent
    directory of the target path is created (best effort) before writing.

    Args:
        params: Compaction parameters; ``params.catalog`` supplies the root
            passed to ``to_serializable`` and, when set, the filesystem.
        audit_info: Audit info to upload; its ``audit_url`` is the target.

    Raises:
        Any exception raised while serializing the audit info or while
        opening/writing the output stream. Failures to create the parent
        directory are logged as warnings and not re-raised.
    """
    import os

    audit_url = audit_info.audit_url
    catalog = params.catalog
    # Guard the catalog dereference so that a missing catalog reaches the
    # URL-based fallback below instead of failing with an AttributeError.
    # NOTE(review): assumes to_serializable() tolerates a None root when no
    # catalog is configured -- confirm against CompactionSessionAuditInfo.
    catalog_root = catalog.root if catalog else None
    audit_data = json.dumps(audit_info.to_serializable(catalog_root))

    if catalog and catalog.filesystem:
        # Use the filesystem from catalog properties.
        filesystem = catalog.filesystem
        parsed_url = urlparse(audit_url)
        # For URLs with a scheme, only the path component is used.
        # NOTE(review): this drops the netloc (e.g. a bucket name) -- confirm
        # that audit URLs handed to a catalog filesystem never rely on it.
        path = parsed_url.path if parsed_url.scheme else audit_url
    else:
        # Fallback: resolve both the path and the filesystem from the URL.
        path, filesystem = resolve_path_and_filesystem(audit_url)

    # Ensure the parent directory exists. This is best effort: a failure is
    # only logged, and the write below surfaces any real problem.
    parent_dir = os.path.dirname(path)
    if (
        parent_dir
        and filesystem.get_file_info(parent_dir).type
        != pyarrow.fs.FileType.Directory
    ):
        try:
            filesystem.create_dir(parent_dir, recursive=True)
        except Exception as e:
            logger.warning(f"Failed to create directory {parent_dir}: {e}")

    with filesystem.open_output_stream(path) as output_stream:
        output_stream.write(audit_data.encode("utf-8"))
@@ -18,7 +18,7 @@ from deltacat.compute.compactor_v2.utils.primary_key_index import (
18
18
  group_hash_bucket_indices,
19
19
  group_by_pk_hash_bucket,
20
20
  )
21
- from deltacat.storage import interface as unimplemented_deltacat_storage
21
+ from deltacat.storage import metastore
22
22
  from deltacat.utils.ray_utils.runtime import (
23
23
  get_current_ray_task_id,
24
24
  get_current_ray_worker_id,
@@ -50,8 +50,9 @@ def _group_file_records_by_pk_hash_bucket(
50
50
  annotated_delta: DeltaAnnotated,
51
51
  num_hash_buckets: int,
52
52
  primary_keys: List[str],
53
+ all_column_names: List[str],
53
54
  read_kwargs_provider: Optional[ReadKwargsProvider] = None,
54
- deltacat_storage=unimplemented_deltacat_storage,
55
+ deltacat_storage=metastore,
55
56
  deltacat_storage_kwargs: Optional[dict] = None,
56
57
  ) -> Tuple[Optional[DeltaFileEnvelopeGroups], int, int]:
57
58
  # read input parquet s3 objects into a list of delta file envelopes
@@ -61,6 +62,7 @@ def _group_file_records_by_pk_hash_bucket(
61
62
  total_size_bytes,
62
63
  ) = read_delta_file_envelopes(
63
64
  annotated_delta,
65
+ all_column_names,
64
66
  read_kwargs_provider,
65
67
  deltacat_storage,
66
68
  deltacat_storage_kwargs,
@@ -116,6 +118,7 @@ def _timed_hash_bucket(input: HashBucketInput):
116
118
  annotated_delta=input.annotated_delta,
117
119
  num_hash_buckets=input.num_hash_buckets,
118
120
  primary_keys=input.primary_keys,
121
+ all_column_names=input.all_column_names,
119
122
  read_kwargs_provider=input.read_kwargs_provider,
120
123
  deltacat_storage=input.deltacat_storage,
121
124
  deltacat_storage_kwargs=input.deltacat_storage_kwargs,