deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (367):
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -37,7 +37,9 @@ class DeltaFileEnvelope(dict):
37
37
  pointing to a file from the uncompacted source table, False if
38
38
  this Locator is pointing to a file in the compacted destination
39
39
  table.
40
- table_storage_strategy: The way the table object is stored in the delta file envelope. If None just stores the table normally
40
+ file_record_count: Record count in the delta file table.
41
+ table_storage_strategy: The way the table object is stored in the
42
+ delta file envelope. If None just stores the table normally
41
43
  Returns:
42
44
  A delta file envelope.
43
45
 
@@ -31,9 +31,11 @@ class DeltaFileLocator(Locator, tuple):
31
31
 
32
32
  file_index: Index of the file in the Delta Manifest.
33
33
 
34
+ file_record_count: Count of records in the Delta File.
35
+
34
36
  Returns:
35
37
  delta_file_locator: The Delta File Locator Tuple as
36
- (is_source_delta, stream_position, file_index).
38
+ (is_src_delta, stream_position, file_index, file_record_count).
37
39
  """
38
40
  return DeltaFileLocator(
39
41
  (is_src_delta, stream_position, file_index, file_record_count)
@@ -1,7 +1,7 @@
1
1
  # Allow classes to use self-referencing Type hints in Python 3.7.
2
2
  from __future__ import annotations
3
3
 
4
- from typing import Tuple
4
+ from typing import Tuple, Union
5
5
  from deltacat.storage import DeltaLocator, PartitionLocator
6
6
  from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
7
7
  from typing import Any, Dict, Optional
@@ -10,7 +10,7 @@ from typing import Any, Dict, Optional
10
10
  class HighWatermark(dict):
11
11
  """
12
12
  Inherit from dict to make it easy for serialization/deserialization.
13
- Keep both partition locator and high watermark as a tuple to be persisted in the rcf
13
+ Keep both partition locator and high watermark as a tuple to be persisted in the rci
14
14
  """
15
15
 
16
16
  def set(self, partition_locator: PartitionLocator, delta_stream_position: int):
@@ -34,7 +34,7 @@ class RoundCompletionInfo(dict):
34
34
 
35
35
  @staticmethod
36
36
  def of(
37
- high_watermark: HighWatermark,
37
+ high_watermark: Union[HighWatermark, int],
38
38
  compacted_delta_locator: DeltaLocator,
39
39
  compacted_pyarrow_write_result: PyArrowWriteResult,
40
40
  sort_keys_bit_width: int,
@@ -46,6 +46,7 @@ class RoundCompletionInfo(dict):
46
46
  compactor_version: Optional[str] = None,
47
47
  input_inflation: Optional[float] = None,
48
48
  input_average_record_size_bytes: Optional[float] = None,
49
+ prev_source_partition_locator: Optional[PartitionLocator] = None,
49
50
  ) -> RoundCompletionInfo:
50
51
 
51
52
  rci = RoundCompletionInfo()
@@ -63,10 +64,11 @@ class RoundCompletionInfo(dict):
63
64
  rci["compactorVersion"] = compactor_version
64
65
  rci["inputInflation"] = input_inflation
65
66
  rci["inputAverageRecordSizeBytes"] = input_average_record_size_bytes
67
+ rci["prevSourcePartitionLocator"] = prev_source_partition_locator
66
68
  return rci
67
69
 
68
70
  @property
69
- def high_watermark(self) -> HighWatermark:
71
+ def high_watermark(self) -> Union[HighWatermark, int]:
70
72
  val: Dict[str, Any] = self.get("highWatermark")
71
73
  if (
72
74
  val is not None
@@ -100,7 +102,11 @@ class RoundCompletionInfo(dict):
100
102
 
101
103
  @property
102
104
  def rebase_source_partition_locator(self) -> Optional[PartitionLocator]:
103
- return self.get("rebaseSourcePartitionLocator")
105
+ val = self.get("rebaseSourcePartitionLocator")
106
+ if val is not None and not isinstance(val, PartitionLocator):
107
+ val = PartitionLocator(val)
108
+ self["rebaseSourcePartitionLocator"] = val # Cache the converted value
109
+ return val
104
110
 
105
111
  @property
106
112
  def manifest_entry_copied_by_reference_ratio(self) -> Optional[float]:
@@ -111,7 +117,7 @@ class RoundCompletionInfo(dict):
111
117
  return self["hashBucketCount"]
112
118
 
113
119
  @property
114
- def hb_index_to_entry_range(self) -> Optional[Dict[int, Tuple[int, int]]]:
120
+ def hb_index_to_entry_range(self) -> Optional[Dict[str, Tuple[int, int]]]:
115
121
  """
116
122
  The start index is inclusive and end index is exclusive by default.
117
123
  """
@@ -129,6 +135,10 @@ class RoundCompletionInfo(dict):
129
135
  def input_average_record_size_bytes(self) -> Optional[float]:
130
136
  return self.get("inputAverageRecordSizeBytes")
131
137
 
132
- @staticmethod
133
- def get_audit_bucket_name_and_key(compaction_audit_url: str) -> Tuple[str, str]:
134
- return compaction_audit_url.replace("s3://", "").split("/", 1)
138
+ @property
139
+ def prev_source_partition_locator(self) -> Optional[PartitionLocator]:
140
+ val = self.get("prevSourcePartitionLocator")
141
+ if val is not None and not isinstance(val, PartitionLocator):
142
+ val = PartitionLocator(val)
143
+ self["prevSourcePartitionLocator"] = val # Cache the converted value
144
+ return val
@@ -4,7 +4,7 @@ from ray.types import ObjectRef
4
4
 
5
5
  from typing import Any, Union
6
6
 
7
- from abc import ABC, abstractmethod, abstractproperty
7
+ from abc import ABC, abstractmethod
8
8
  from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
9
9
  from deltacat.storage import (
10
10
  LocalTable,
@@ -15,7 +15,8 @@ LocalTableReference = Union[ObjectRef, LocalTable]
15
15
 
16
16
 
17
17
  class LocalTableStorageStrategy(ABC):
18
- @abstractproperty
18
+ @property
19
+ @abstractmethod
19
20
  def object_store(cls) -> IObjectStore:
20
21
  pass
21
22
 
@@ -21,14 +21,13 @@ from deltacat.utils.placement import PlacementGroupConfig
21
21
  from typing import List, Optional, Dict, Any
22
22
  from deltacat.utils.ray_utils.runtime import live_node_resource_keys
23
23
  from deltacat.compute.compactor.utils import io
24
- from deltacat.compute.compactor.utils import round_completion_file as rcf
25
24
  from deltacat.compute.compactor.steps import repartition as repar
26
25
  from deltacat.compute.compactor.steps.repartition import RepartitionType
27
26
  from deltacat.storage import (
28
27
  Delta,
29
28
  DeltaLocator,
30
29
  PartitionLocator,
31
- interface as unimplemented_deltacat_storage,
30
+ metastore,
32
31
  )
33
32
  from deltacat.utils.metrics import MetricsConfig
34
33
  from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
@@ -41,7 +40,6 @@ def repartition(
41
40
  source_partition_locator: PartitionLocator,
42
41
  destination_partition_locator: PartitionLocator,
43
42
  repartition_args: Any,
44
- repartition_completion_file_s3_url: str,
45
43
  last_stream_position_to_compact: int,
46
44
  repartition_type: RepartitionType = RepartitionType.RANGE,
47
45
  sort_keys: List[SortKey] = None,
@@ -54,9 +52,8 @@ def repartition(
54
52
  pg_config: Optional[PlacementGroupConfig] = None,
55
53
  list_deltas_kwargs: Optional[Dict[str, Any]] = None,
56
54
  read_kwargs_provider: Optional[ReadKwargsProvider] = None,
57
- s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
58
- s3_client_kwargs: Optional[Dict[str, Any]] = None,
59
- deltacat_storage=unimplemented_deltacat_storage,
55
+ table_writer_kwargs: Optional[Dict[str, Any]] = None,
56
+ deltacat_storage=metastore,
60
57
  **kwargs,
61
58
  ) -> Optional[str]:
62
59
 
@@ -132,13 +129,13 @@ def repartition(
132
129
  enable_profiler=enable_profiler,
133
130
  metrics_config=metrics_config,
134
131
  read_kwargs_provider=read_kwargs_provider,
135
- s3_table_writer_kwargs=s3_table_writer_kwargs,
132
+ table_writer_kwargs=table_writer_kwargs,
136
133
  repartitioned_file_content_type=repartitioned_file_content_type,
137
134
  deltacat_storage=deltacat_storage,
138
135
  )
139
136
  logger.info(f"Getting {len(repar_tasks_pending)} task results...")
140
137
  repar_results: List[RepartitionResult] = ray.get(repar_tasks_pending)
141
- repar_results: List[Delta] = [rp.range_deltas for rp in repar_results]
138
+ repar_results: List[List[Delta]] = [rp.range_deltas for rp in repar_results]
142
139
  transposed = list(itertools.zip_longest(*repar_results, fillvalue=None))
143
140
  ordered_deltas: List[Delta] = [
144
141
  i for sublist in transposed for i in sublist if i is not None
@@ -153,9 +150,6 @@ def repartition(
153
150
  compacted_delta = deltacat_storage.commit_delta(
154
151
  merged_delta, properties=kwargs.get("properties", {})
155
152
  )
156
- deltacat_storage.commit_partition(partition)
157
- logger.info(f"Committed final delta: {compacted_delta}")
158
- logger.info(f"Job run completed successfully!")
159
153
  new_compacted_delta_locator = DeltaLocator.of(
160
154
  new_compacted_partition_locator,
161
155
  compacted_delta.stream_position,
@@ -173,14 +167,7 @@ def repartition(
173
167
  bit_width_of_sort_keys,
174
168
  None,
175
169
  )
176
- if s3_client_kwargs is None:
177
- s3_client_kwargs = {}
178
-
179
- return rcf.write_round_completion_file(
180
- None,
181
- None,
182
- None,
183
- repartition_completion_info,
184
- repartition_completion_file_s3_url,
185
- **s3_client_kwargs,
186
- )
170
+ partition.compaction_round_completion_info = repartition_completion_info
171
+ deltacat_storage.commit_partition(partition)
172
+ logger.info(f"Committed final delta: {compacted_delta}")
173
+ logger.info(f"Job run completed successfully!")
@@ -15,7 +15,8 @@ from deltacat.compute.compactor import (
15
15
  DeltaFileEnvelope,
16
16
  DeltaFileLocator,
17
17
  )
18
- from deltacat.storage.model.sort_key import SortKey, SortOrder
18
+ from deltacat.storage.model.sort_key import SortKey
19
+ from deltacat.storage import SortOrder
19
20
  from deltacat.compute.compactor.model.dedupe_result import DedupeResult
20
21
  from deltacat.compute.compactor.utils import system_columns as sc
21
22
  from deltacat.utils.ray_utils.runtime import (
@@ -155,15 +156,21 @@ def _timed_dedupe(
155
156
  sort_keys.extend(
156
157
  [
157
158
  SortKey.of(
158
- sc._PARTITION_STREAM_POSITION_COLUMN_NAME,
159
+ [sc._PARTITION_STREAM_POSITION_COLUMN_NAME],
159
160
  SortOrder.ASCENDING,
160
161
  ),
161
162
  SortKey.of(
162
- sc._ORDERED_FILE_IDX_COLUMN_NAME, SortOrder.ASCENDING
163
+ [sc._ORDERED_FILE_IDX_COLUMN_NAME],
164
+ SortOrder.ASCENDING,
163
165
  ),
164
166
  ]
165
167
  )
166
- table = table.take(pc.sort_indices(table, sort_keys=sort_keys))
168
+ table = table.take(
169
+ pc.sort_indices(
170
+ table,
171
+ sort_keys=[pa_key for key in sort_keys for pa_key in key.arrow],
172
+ )
173
+ )
167
174
 
168
175
  # drop duplicates by primary key hash column
169
176
  logger.info(
@@ -21,7 +21,7 @@ from deltacat.compute.compactor.utils.primary_key_index import (
21
21
  group_hash_bucket_indices,
22
22
  group_record_indices_by_hash_bucket,
23
23
  )
24
- from deltacat.storage import interface as unimplemented_deltacat_storage
24
+ from deltacat.storage import metastore
25
25
  from deltacat.types.media import StorageType
26
26
  from deltacat.utils.common import sha1_digest
27
27
  from deltacat.utils.ray_utils.runtime import (
@@ -90,7 +90,7 @@ def _group_file_records_by_pk_hash_bucket(
90
90
  sort_key_names: List[str],
91
91
  is_src_delta: np.bool_ = True,
92
92
  read_kwargs_provider: Optional[ReadKwargsProvider] = None,
93
- deltacat_storage=unimplemented_deltacat_storage,
93
+ deltacat_storage=metastore,
94
94
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
95
95
  **kwargs,
96
96
  ) -> Tuple[Optional[DeltaFileEnvelopeGroups], int]:
@@ -139,7 +139,7 @@ def _read_delta_file_envelopes(
139
139
  primary_keys: List[str],
140
140
  sort_key_names: List[str],
141
141
  read_kwargs_provider: Optional[ReadKwargsProvider],
142
- deltacat_storage=unimplemented_deltacat_storage,
142
+ deltacat_storage=metastore,
143
143
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
144
144
  **kwargs,
145
145
  ) -> Tuple[Optional[List[DeltaFileEnvelope]], int]:
@@ -190,7 +190,7 @@ def _timed_hash_bucket(
190
190
  enable_profiler: bool,
191
191
  read_kwargs_provider: Optional[ReadKwargsProvider] = None,
192
192
  object_store: Optional[IObjectStore] = None,
193
- deltacat_storage=unimplemented_deltacat_storage,
193
+ deltacat_storage=metastore,
194
194
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
195
195
  **kwargs,
196
196
  ):
@@ -201,7 +201,7 @@ def _timed_hash_bucket(
201
201
  with memray.Tracker(
202
202
  f"hash_bucket_{worker_id}_{task_id}.bin"
203
203
  ) if enable_profiler else nullcontext():
204
- sort_key_names = [key.key_name for key in sort_keys]
204
+ sort_key_names = list(chain.from_iterable([key.key for key in sort_keys]))
205
205
  if not round_completion_info:
206
206
  is_src_delta = True
207
207
  else:
@@ -249,7 +249,7 @@ def hash_bucket(
249
249
  metrics_config: MetricsConfig,
250
250
  read_kwargs_provider: Optional[ReadKwargsProvider],
251
251
  object_store: Optional[IObjectStore],
252
- deltacat_storage=unimplemented_deltacat_storage,
252
+ deltacat_storage=metastore,
253
253
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
254
254
  **kwargs,
255
255
  ) -> HashBucketResult:
@@ -25,10 +25,11 @@ from deltacat.storage import (
25
25
  DeltaType,
26
26
  Partition,
27
27
  PartitionLocator,
28
- Manifest,
29
28
  ManifestEntry,
29
+ ManifestEntryList,
30
30
  )
31
- from deltacat.storage import interface as unimplemented_deltacat_storage
31
+ from deltacat.storage.model.manifest import Manifest
32
+
32
33
  from deltacat.utils.common import ReadKwargsProvider
33
34
  from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES, ContentType
34
35
  from deltacat.types.tables import TABLE_CLASS_TO_SIZE_FUNC
@@ -45,6 +46,7 @@ from deltacat.utils.ray_utils.runtime import (
45
46
  )
46
47
  from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
47
48
  from deltacat.utils.resources import get_current_process_peak_memory_usage_in_bytes
49
+ from deltacat.storage import metastore
48
50
 
49
51
  if importlib.util.find_spec("memray"):
50
52
  import memray
@@ -66,9 +68,9 @@ def materialize(
66
68
  metrics_config: MetricsConfig,
67
69
  schema: Optional[pa.Schema] = None,
68
70
  read_kwargs_provider: Optional[ReadKwargsProvider] = None,
69
- s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
71
+ table_writer_kwargs: Optional[Dict[str, Any]] = None,
70
72
  object_store: Optional[IObjectStore] = None,
71
- deltacat_storage=unimplemented_deltacat_storage,
73
+ deltacat_storage=metastore,
72
74
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
73
75
  ):
74
76
  if deltacat_storage_kwargs is None:
@@ -77,12 +79,15 @@ def materialize(
77
79
  def _stage_delta_from_manifest_entry_reference_list(
78
80
  manifest_entry_list_reference: List[ManifestEntry],
79
81
  partition: Partition,
80
- delta_type: DeltaType = DeltaType.UPSERT,
82
+ delta_type: DeltaType = DeltaType.APPEND,
81
83
  ) -> Delta:
82
84
  assert (
83
- delta_type == DeltaType.UPSERT
84
- ), "Stage delta with existing manifest entries only supports UPSERT delta type!"
85
- manifest = Manifest.of(entries=manifest_entry_list_reference, uuid=str(uuid4()))
85
+ delta_type == DeltaType.APPEND
86
+ ), "Compaction should always produce APPEND deltas for consistent read operations!"
87
+ manifest = Manifest.of(
88
+ entries=ManifestEntryList.of(manifest_entry_list_reference),
89
+ uuid=str(uuid4()),
90
+ )
86
91
  delta = Delta.of(
87
92
  locator=DeltaLocator.of(partition.locator),
88
93
  delta_type=delta_type,
@@ -106,9 +111,10 @@ def materialize(
106
111
  deltacat_storage.stage_delta,
107
112
  compacted_table,
108
113
  partition,
114
+ delta_type=DeltaType.APPEND, # Compaction always produces APPEND deltas
109
115
  max_records_per_entry=max_records_per_output_file,
110
116
  content_type=compacted_file_content_type,
111
- s3_table_writer_kwargs=s3_table_writer_kwargs,
117
+ table_writer_kwargs=table_writer_kwargs,
112
118
  **deltacat_storage_kwargs,
113
119
  )
114
120
  compacted_table_size = TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](
@@ -10,7 +10,7 @@ import ray
10
10
  from deltacat import logs
11
11
  from deltacat.compute.compactor import DeltaAnnotated
12
12
  from deltacat.compute.compactor.model.repartition_result import RepartitionResult
13
- from deltacat.storage import interface as unimplemented_deltacat_storage
13
+ from deltacat.storage import metastore
14
14
  from deltacat.storage import Partition
15
15
  from deltacat.utils.ray_utils.runtime import (
16
16
  get_current_ray_task_id,
@@ -19,7 +19,7 @@ from deltacat.utils.ray_utils.runtime import (
19
19
  from deltacat.utils.common import ReadKwargsProvider
20
20
  from deltacat.utils.performance import timed_invocation
21
21
  from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
22
- from deltacat.storage import Delta
22
+ from deltacat.storage import Delta, DeltaType
23
23
  from enum import Enum
24
24
 
25
25
  if importlib.util.find_spec("memray"):
@@ -56,9 +56,9 @@ def repartition_range(
56
56
  destination_partition: Partition,
57
57
  repartition_args: dict,
58
58
  max_records_per_output_file: int,
59
- s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
59
+ table_writer_kwargs: Optional[Dict[str, Any]] = None,
60
60
  repartitioned_file_content_type: ContentType = ContentType.PARQUET,
61
- deltacat_storage=unimplemented_deltacat_storage,
61
+ deltacat_storage=metastore,
62
62
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
63
63
  **kwargs,
64
64
  ):
@@ -144,9 +144,10 @@ def repartition_range(
144
144
  partition_delta: Delta = deltacat_storage.stage_delta(
145
145
  partition_table,
146
146
  destination_partition,
147
+ delta_type=DeltaType.APPEND, # Repartition always produces APPEND deltas
147
148
  max_records_per_entry=max_records_per_output_file,
148
149
  content_type=repartitioned_file_content_type,
149
- s3_table_writer_kwargs=s3_table_writer_kwargs,
150
+ table_writer_kwargs=table_writer_kwargs,
150
151
  **deltacat_storage_kwargs,
151
152
  )
152
153
  partition_deltas.append(partition_delta)
@@ -168,9 +169,9 @@ def _timed_repartition(
168
169
  max_records_per_output_file: int,
169
170
  enable_profiler: bool,
170
171
  read_kwargs_provider: Optional[ReadKwargsProvider],
171
- s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
172
+ table_writer_kwargs: Optional[Dict[str, Any]] = None,
172
173
  repartitioned_file_content_type: ContentType = ContentType.PARQUET,
173
- deltacat_storage=unimplemented_deltacat_storage,
174
+ deltacat_storage=metastore,
174
175
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
175
176
  **kwargs,
176
177
  ) -> RepartitionResult:
@@ -192,7 +193,7 @@ def _timed_repartition(
192
193
  destination_partition=destination_partition,
193
194
  repartition_args=repartition_args,
194
195
  max_records_per_output_file=max_records_per_output_file,
195
- s3_table_writer_kwargs=s3_table_writer_kwargs,
196
+ table_writer_kwargs=table_writer_kwargs,
196
197
  repartitioned_file_content_type=repartitioned_file_content_type,
197
198
  deltacat_storage=deltacat_storage,
198
199
  deltacat_storage_kwargs=deltacat_storage_kwargs,
@@ -213,9 +214,9 @@ def repartition(
213
214
  enable_profiler: bool,
214
215
  metrics_config: Optional[MetricsConfig],
215
216
  read_kwargs_provider: Optional[ReadKwargsProvider],
216
- s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
217
+ table_writer_kwargs: Optional[Dict[str, Any]] = None,
217
218
  repartitioned_file_content_type: ContentType = ContentType.PARQUET,
218
- deltacat_storage=unimplemented_deltacat_storage,
219
+ deltacat_storage=metastore,
219
220
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
220
221
  **kwargs,
221
222
  ) -> RepartitionResult:
@@ -231,7 +232,7 @@ def repartition(
231
232
  max_records_per_output_file=max_records_per_output_file,
232
233
  enable_profiler=enable_profiler,
233
234
  read_kwargs_provider=read_kwargs_provider,
234
- s3_table_writer_kwargs=s3_table_writer_kwargs,
235
+ table_writer_kwargs=table_writer_kwargs,
235
236
  repartitioned_file_content_type=repartitioned_file_content_type,
236
237
  deltacat_storage=deltacat_storage,
237
238
  deltacat_storage_kwargs=deltacat_storage_kwargs,
@@ -11,7 +11,7 @@ from deltacat.storage import (
11
11
  PartitionLocator,
12
12
  Delta,
13
13
  ManifestEntry,
14
- interface as unimplemented_deltacat_storage,
14
+ metastore,
15
15
  )
16
16
  from deltacat import logs
17
17
  from deltacat.compute.compactor import DeltaAnnotated
@@ -31,12 +31,13 @@ def discover_deltas(
31
31
  compacted_partition_locator: Optional[PartitionLocator],
32
32
  rebase_source_partition_locator: Optional[PartitionLocator],
33
33
  rebase_source_partition_high_watermark: Optional[int],
34
- deltacat_storage=unimplemented_deltacat_storage,
34
+ deltacat_storage=metastore,
35
35
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
36
36
  list_deltas_kwargs: Optional[Dict[str, Any]] = {},
37
37
  ) -> Tuple[List[Delta], int]:
38
38
  if deltacat_storage_kwargs is None:
39
39
  deltacat_storage_kwargs = {}
40
+
40
41
  # Source One: new deltas from uncompacted table for incremental compaction or deltas from compacted table for rebase
41
42
  start_position_exclusive = (
42
43
  high_watermark.get(source_partition_locator)
@@ -109,7 +110,7 @@ def limit_input_deltas(
109
110
  user_hash_bucket_chunk_size: int,
110
111
  input_deltas_stats: Dict[int, DeltaStats],
111
112
  compaction_audit: CompactionSessionAuditInfo,
112
- deltacat_storage=unimplemented_deltacat_storage,
113
+ deltacat_storage=metastore,
113
114
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
114
115
  **kwargs,
115
116
  ) -> Tuple[List[DeltaAnnotated], int, HighWatermark, bool]:
@@ -272,7 +273,7 @@ def fit_input_deltas(
272
273
  cluster_resources: Dict[str, float],
273
274
  compaction_audit: CompactionSessionAuditInfo,
274
275
  hash_bucket_count: Optional[int],
275
- deltacat_storage=unimplemented_deltacat_storage,
276
+ deltacat_storage=metastore,
276
277
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
277
278
  **kwargs,
278
279
  ) -> Tuple[List[DeltaAnnotated], int, HighWatermark, bool]:
@@ -358,8 +359,8 @@ def fit_input_deltas(
358
359
  def _discover_deltas(
359
360
  source_partition_locator: PartitionLocator,
360
361
  start_position_exclusive: Optional[int],
361
- end_position_inclusive: int,
362
- deltacat_storage=unimplemented_deltacat_storage,
362
+ end_position_inclusive: Optional[int],
363
+ deltacat_storage=metastore,
363
364
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
364
365
  list_deltas_kwargs: Optional[Dict[str, Any]] = {},
365
366
  ) -> List[Delta]:
@@ -0,0 +1,117 @@
1
+ import logging
2
+ from typing import Optional
3
+ from deltacat import logs
4
+ from deltacat.compute.compactor import RoundCompletionInfo
5
+ from deltacat.storage import PartitionLocator
6
+ from deltacat.storage.model.partition import Partition
7
+ from deltacat.utils.metrics import metrics
8
+ from deltacat.exceptions import PartitionNotFoundError
9
+
10
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
11
+
12
+
13
def _resolve_round_completion_partition(
    destination_partition_locator: PartitionLocator,
    deltacat_storage,
    deltacat_storage_kwargs: dict,
) -> Optional[Partition]:
    """
    Look up the partition whose metafile should carry round completion info.

    Args:
        destination_partition_locator: Locator of the destination partition
        deltacat_storage: Storage implementation providing get_partition and
            get_partition_by_id
        deltacat_storage_kwargs: Keyword arguments forwarded to every storage call

    Returns:
        The current destination partition when it already has round completion
        info or has no previous partition ID, the previous partition when the
        current one has no round completion info, or None when storage returns
        no current partition.

    Raises:
        PartitionNotFoundError: If the current partition references a previous
            partition ID that storage cannot resolve.
    """
    current_partition: Partition = deltacat_storage.get_partition(
        destination_partition_locator.stream_locator,
        destination_partition_locator.partition_values,
        **deltacat_storage_kwargs,
    )
    # NOTE(review): assumes storage may return None for a partition that does
    # not exist yet - confirm against the storage interface. Checking here
    # avoids relying on an AttributeError being swallowed by the caller.
    if current_partition is None:
        logger.info(
            f"Destination partition {destination_partition_locator} not found, "
            f"no round completion info to read."
        )
        return None

    # If current partition has round completion info, use it
    if current_partition.compaction_round_completion_info:
        return current_partition

    previous_partition_id = current_partition.previous_partition_id
    if previous_partition_id is None:
        logger.info("No previous partition ID found, using current partition")
        return current_partition

    # For incremental compaction, the previous committed partition is where
    # the round completion info is expected to be.
    logger.info(
        f"Current partition {destination_partition_locator} does not have round completion info, "
        f"getting previous partition with ID: {previous_partition_id}"
    )
    previous_partition = deltacat_storage.get_partition_by_id(
        destination_partition_locator.stream_locator,
        previous_partition_id,
        **deltacat_storage_kwargs,
    )
    if previous_partition is None:
        raise PartitionNotFoundError(
            f"Previous partition with ID {previous_partition_id} not found"
        )
    logger.info(f"Found previous partition: {previous_partition.locator}")
    return previous_partition


@metrics
def read_round_completion_info(
    source_partition_locator: PartitionLocator,
    destination_partition_locator: PartitionLocator,
    deltacat_storage,
    deltacat_storage_kwargs: Optional[dict] = None,
    destination_partition: Optional[Partition] = None,
) -> Optional[RoundCompletionInfo]:
    """
    Read round completion info from the partition metafile.

    This is a best-effort read: any exception raised while resolving the
    partition or validating its round completion info is logged and None is
    returned, so callers never see an exception from this function.

    Args:
        source_partition_locator: Source partition locator for validation
        destination_partition_locator: Destination partition locator
        deltacat_storage: Storage implementation
        deltacat_storage_kwargs: Optional storage kwargs
        destination_partition: Optional destination partition to avoid redundant get_partition calls

    Returns:
        RoundCompletionInfo if found in partition and its
        prev_source_partition_locator matches source_partition_locator,
        None otherwise
    """
    if not destination_partition_locator:
        return None

    if deltacat_storage_kwargs is None:
        deltacat_storage_kwargs = {}

    try:
        # Use provided partition or get it from storage
        if destination_partition:
            partition = destination_partition
        else:
            partition = _resolve_round_completion_partition(
                destination_partition_locator,
                deltacat_storage,
                deltacat_storage_kwargs,
            )
        if not partition:
            return None

        round_completion_info = partition.compaction_round_completion_info
        if not round_completion_info:
            return None

        # Validate that prev_source_partition_locator matches current source
        prev_source_locator = round_completion_info.prev_source_partition_locator
        if not source_partition_locator or not prev_source_locator:
            raise ValueError(
                f"Source partition locator ({source_partition_locator}) and "
                f"prev_source_partition_locator ({prev_source_locator}) "
                f"must both be provided."
            )

        expected_source = source_partition_locator.canonical_string()
        recorded_source = prev_source_locator.canonical_string()
        if recorded_source != expected_source:
            logger.warning(
                f"Previous source partition locator mismatch: "
                f"expected {expected_source}, "
                f"but found {recorded_source} "
                f"in round completion info. Ignoring cached round completion info."
            )
            return None

        logger.info(
            f"Read round completion info from partition metafile: {round_completion_info}"
        )
        return round_completion_info

    except Exception as e:
        # Deliberately best-effort, but surface the failure (with traceback)
        # instead of hiding it at DEBUG: the errors raised above signal
        # inconsistent metadata that an operator needs to see.
        logger.warning(
            f"Failed to read round completion info from partition metafile: {e}",
            exc_info=True,
        )

    return None
@@ -1,6 +1,7 @@
1
1
  import pyarrow as pa
2
2
  from typing import List
3
- from deltacat.storage import PartitionLocator, SortKey
3
+ from itertools import chain
4
+ from deltacat.storage import PartitionLocator, SortKey, TransformName
4
5
 
5
6
  MAX_SORT_KEYS_BIT_WIDTH = 256
6
7
 
@@ -22,7 +23,13 @@ def validate_sort_keys(
22
23
  deltacat_storage_kwargs = {}
23
24
  total_sort_keys_bit_width = 0
24
25
  if sort_keys:
25
- sort_key_names = [key.key_name for key in sort_keys]
26
+ sort_key_names = list(chain.from_iterable([key.key for key in sort_keys]))
27
+ assert all(
28
+ [
29
+ key.transform is None or key.transform.name == TransformName.IDENTITY
30
+ for key in sort_keys
31
+ ]
32
+ ), f"Sort key transforms are not supported: {sort_keys}"
26
33
  assert len(sort_key_names) == len(
27
34
  set(sort_key_names)
28
35
  ), f"Sort key names must be unique: {sort_key_names}"
@@ -294,7 +294,9 @@ def append_dedupe_task_idx_col(table: pa.Table, dedupe_task_indices) -> pa.Table
294
294
 
295
295
 
296
296
  def delta_type_to_field(delta_type: DeltaType) -> bool:
297
- return True if delta_type is DeltaType.UPSERT else False
297
+ # For deduplication purposes, treat both UPSERT and APPEND as UPSERT (True)
298
+ # Only DELETE should be treated as DELETE (False)
299
+ return delta_type is not DeltaType.DELETE
298
300
 
299
301
 
300
302
  def delta_type_from_field(delta_type_field: bool) -> DeltaType: