deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367)
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -1,45 +1,42 @@
1
- import ray
2
- import os
3
- from moto import mock_s3
1
+ import tempfile
2
+ from typing import Any, Dict, List, Optional, Set, Tuple, Callable
3
+ import uuid
4
4
  import pytest
5
- import boto3
6
- from boto3.resources.base import ServiceResource
5
+
7
6
  import pyarrow as pa
8
- from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
7
+ import ray
8
+ import pandas as pd
9
+
10
+ from deltacat.io.file_object_store import FileObjectStore
9
11
  from pytest_benchmark.fixture import BenchmarkFixture
10
12
 
11
13
  from deltacat.tests.compute.test_util_constant import (
12
14
  BASE_TEST_SOURCE_NAMESPACE,
13
15
  BASE_TEST_SOURCE_TABLE_NAME,
14
16
  BASE_TEST_SOURCE_TABLE_VERSION,
15
- TEST_S3_RCF_BUCKET_NAME,
16
17
  DEFAULT_NUM_WORKERS,
17
18
  DEFAULT_WORKER_INSTANCE_CPUS,
18
19
  )
19
20
  from deltacat.compute.compactor.model.compactor_version import CompactorVersion
20
21
  from deltacat.tests.compute.test_util_common import (
21
- get_rcf,
22
- )
23
- from deltacat.tests.test_utils.utils import read_s3_contents
24
- from deltacat.tests.compute.test_util_common import (
25
- get_compacted_delta_locator_from_rcf,
26
- )
27
- from deltacat.tests.compute.test_util_create_table_deltas_repo import (
28
- create_incremental_deltas_on_source_table,
29
- )
30
- from deltacat.tests.compute.test_util_create_table_deltas_repo import (
31
- create_src_w_deltas_destination_rebase_w_deltas_strategy,
22
+ create_src_w_deltas_destination_rebase_w_deltas_strategy_main,
23
+ create_incremental_deltas_on_source_table_main,
24
+ get_rci_from_partition,
25
+ read_audit_file,
26
+ PartitionKey,
27
+ get_compacted_delta_locator_from_partition,
32
28
  )
33
29
  from deltacat.tests.compute.compact_partition_rebase_then_incremental_test_cases import (
34
30
  REBASE_THEN_INCREMENTAL_TEST_CASES,
35
31
  )
36
- from typing import Any, Callable, Dict, List, Optional, Set, Tuple
32
+
37
33
  from deltacat.types.media import StorageType
38
34
  from deltacat.storage import (
39
35
  DeltaType,
40
36
  DeltaLocator,
41
37
  Partition,
42
38
  PartitionLocator,
39
+ metastore,
43
40
  )
44
41
  from deltacat.types.media import ContentType
45
42
  from deltacat.compute.compactor.model.compact_partition_params import (
@@ -52,12 +49,6 @@ from deltacat.compute.compactor.model.compaction_session_audit_info import (
52
49
  CompactionSessionAuditInfo,
53
50
  )
54
51
 
55
- DATABASE_FILE_PATH_KEY, DATABASE_FILE_PATH_VALUE = (
56
- "db_file_path",
57
- "deltacat/tests/local_deltacat_storage/db_test.sqlite",
58
- )
59
-
60
-
61
52
  """
62
53
  MODULE scoped fixtures
63
54
  """
@@ -70,54 +61,11 @@ def setup_ray_cluster():
70
61
  ray.shutdown()
71
62
 
72
63
 
73
- @pytest.fixture(autouse=True, scope="module")
74
- def mock_aws_credential():
75
- os.environ["AWS_ACCESS_KEY_ID"] = "testing"
76
- os.environ["AWS_SECRET_ACCESS_ID"] = "testing"
77
- os.environ["AWS_SECURITY_TOKEN"] = "testing"
78
- os.environ["AWS_SESSION_TOKEN"] = "testing"
79
- os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
80
- yield
81
-
82
-
83
- @pytest.fixture(autouse=True, scope="module")
84
- def cleanup_the_database_file_after_all_compaction_session_package_tests_complete():
85
- # make sure the database file is deleted after all the compactor package tests are completed
86
- if os.path.exists(DATABASE_FILE_PATH_VALUE):
87
- os.remove(DATABASE_FILE_PATH_VALUE)
88
-
89
-
90
- @pytest.fixture(scope="module")
91
- def s3_resource(mock_aws_credential):
92
- with mock_s3():
93
- yield boto3.resource("s3")
94
-
95
-
96
- @pytest.fixture(autouse=True, scope="module")
97
- def setup_compaction_artifacts_s3_bucket(s3_resource: ServiceResource):
98
- s3_resource.create_bucket(
99
- ACL="authenticated-read",
100
- Bucket=TEST_S3_RCF_BUCKET_NAME,
101
- )
102
- yield
103
-
104
-
105
64
  """
106
65
  FUNCTION scoped fixtures
107
66
  """
108
67
 
109
68
 
110
- @pytest.fixture(scope="function")
111
- def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
112
- # see deltacat/tests/local_deltacat_storage/README.md for documentation
113
- kwargs_for_local_deltacat_storage: Dict[str, Any] = {
114
- DATABASE_FILE_PATH_KEY: DATABASE_FILE_PATH_VALUE,
115
- }
116
- yield kwargs_for_local_deltacat_storage
117
- if os.path.exists(DATABASE_FILE_PATH_VALUE):
118
- os.remove(DATABASE_FILE_PATH_VALUE)
119
-
120
-
121
69
  @pytest.fixture(autouse=True, scope="function")
122
70
  def enable_bucketing_spec_validation(monkeypatch):
123
71
  """
@@ -206,13 +154,12 @@ def enable_bucketing_spec_validation(monkeypatch):
206
154
  ],
207
155
  ids=[test_name for test_name in REBASE_THEN_INCREMENTAL_TEST_CASES],
208
156
  )
209
- def test_compact_partition_rebase_then_incremental(
210
- s3_resource: ServiceResource,
211
- local_deltacat_storage_kwargs: Dict[str, Any],
157
+ def test_compact_partition_rebase_then_incremental_main(
158
+ main_deltacat_storage_kwargs: Dict[str, Any],
212
159
  test_name: str,
213
160
  primary_keys: Set[str],
214
161
  sort_keys: List[Optional[Any]],
215
- partition_keys_param: Optional[List[Any]],
162
+ partition_keys_param: Optional[List[PartitionKey]],
216
163
  partition_values_param: List[Optional[str]],
217
164
  input_deltas_param: List[pa.Array],
218
165
  input_deltas_delta_type: str,
@@ -232,9 +179,15 @@ def test_compact_partition_rebase_then_incremental(
232
179
  compact_partition_func: Callable,
233
180
  benchmark: BenchmarkFixture,
234
181
  ):
235
- import deltacat.tests.local_deltacat_storage as ds
182
+ ds_mock_kwargs = main_deltacat_storage_kwargs
183
+ """
184
+ This test performs rebase compaction first, then incremental compaction on the same data.
185
+ This tests the scenario where we first do a rebase (with different source/destination partitions)
186
+ and then follow up with incremental compaction using the result of the rebase.
187
+
188
+ This version uses the main metastore implementation instead of local storage.
189
+ """
236
190
 
237
- ds_mock_kwargs = local_deltacat_storage_kwargs
238
191
  """
239
192
  REBASE
240
193
  """
@@ -243,8 +196,7 @@ def test_compact_partition_rebase_then_incremental(
243
196
  source_table_stream,
244
197
  destination_table_stream,
245
198
  rebased_table_stream,
246
- ) = create_src_w_deltas_destination_rebase_w_deltas_strategy(
247
- primary_keys,
199
+ ) = create_src_w_deltas_destination_rebase_w_deltas_strategy_main(
248
200
  sort_keys,
249
201
  partition_keys,
250
202
  input_deltas_param,
@@ -252,19 +204,48 @@ def test_compact_partition_rebase_then_incremental(
252
204
  partition_values_param,
253
205
  ds_mock_kwargs,
254
206
  )
255
- source_partition: Partition = ds.get_partition(
207
+
208
+ # Convert partition values for partition lookup (same as in other helper functions)
209
+ converted_partition_values_for_lookup = partition_values_param
210
+ if partition_values_param and partition_keys:
211
+ converted_partition_values_for_lookup = []
212
+ for i, (value, pk) in enumerate(zip(partition_values_param, partition_keys)):
213
+ if pk.key_type.value == "int": # Use .value to get string representation
214
+ converted_partition_values_for_lookup.append(int(value))
215
+ elif pk.key_type.value == "timestamp":
216
+ # Handle timestamp partition values
217
+ if isinstance(value, str) and "T" in value and value.endswith("Z"):
218
+ ts = pd.to_datetime(value)
219
+ # Convert to microseconds since epoch for PyArrow timestamp[us]
220
+ converted_partition_values_for_lookup.append(
221
+ int(ts.timestamp() * 1_000_000)
222
+ )
223
+ else:
224
+ converted_partition_values_for_lookup.append(value)
225
+ else:
226
+ converted_partition_values_for_lookup.append(value)
227
+
228
+ source_partition: Partition = metastore.get_partition(
256
229
  source_table_stream.locator,
257
- partition_values_param,
230
+ converted_partition_values_for_lookup,
258
231
  **ds_mock_kwargs,
259
232
  )
233
+ # Generate a destination partition ID based on the source partition
234
+ destination_partition_id = str(uuid.uuid4())
260
235
  destination_partition_locator: PartitionLocator = PartitionLocator.of(
261
236
  destination_table_stream.locator,
262
- partition_values_param,
263
- None,
237
+ converted_partition_values_for_lookup,
238
+ destination_partition_id,
264
239
  )
265
- rebased_partition: Partition = ds.get_partition(
240
+ all_column_names = metastore.get_table_version_column_names(
241
+ destination_partition_locator.namespace,
242
+ destination_partition_locator.table_name,
243
+ destination_partition_locator.table_version,
244
+ **ds_mock_kwargs,
245
+ )
246
+ rebased_partition: Partition = metastore.get_partition(
266
247
  rebased_table_stream.locator,
267
- partition_values_param,
248
+ converted_partition_values_for_lookup,
268
249
  **ds_mock_kwargs,
269
250
  )
270
251
  num_workers, worker_instance_cpu = DEFAULT_NUM_WORKERS, DEFAULT_WORKER_INSTANCE_CPUS
@@ -275,147 +256,195 @@ def test_compact_partition_rebase_then_incremental(
275
256
  pgm = PlacementGroupManager(
276
257
  1, total_cpus, worker_instance_cpu, memory_per_bundle=4000000
277
258
  ).pgs[0]
278
- compact_partition_params = CompactPartitionParams.of(
279
- {
280
- "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
281
- "compacted_file_content_type": ContentType.PARQUET,
282
- "dd_max_parallelism_ratio": 1.0,
283
- "deltacat_storage": ds,
284
- "deltacat_storage_kwargs": ds_mock_kwargs,
285
- "destination_partition_locator": destination_partition_locator,
286
- "hash_bucket_count": hash_bucket_count_param,
287
- "last_stream_position_to_compact": source_partition.stream_position,
288
- "list_deltas_kwargs": {**ds_mock_kwargs, **{"equivalent_table_types": []}},
289
- "object_store": RayPlasmaObjectStore(),
290
- "pg_config": pgm,
291
- "primary_keys": primary_keys,
292
- "read_kwargs_provider": read_kwargs_provider_param,
293
- "rebase_source_partition_locator": source_partition.locator,
294
- "records_per_compacted_file": records_per_compacted_file_param,
295
- "s3_client_kwargs": {},
296
- "source_partition_locator": rebased_partition.locator,
297
- "sort_keys": sort_keys if sort_keys else None,
298
- }
299
- )
300
- # execute
301
- rcf_file_s3_uri = benchmark(compact_partition_func, compact_partition_params)
302
- compacted_delta_locator: DeltaLocator = get_compacted_delta_locator_from_rcf(
303
- s3_resource, rcf_file_s3_uri
304
- )
305
- tables = ds.download_delta(
306
- compacted_delta_locator, storage_type=StorageType.LOCAL, **ds_mock_kwargs
307
- )
308
- actual_rebase_compacted_table = pa.concat_tables(tables)
309
- # if no primary key is specified then sort by sort_key for consistent assertion
310
- sorting_cols: List[Any] = (
311
- [(val, "ascending") for val in primary_keys] if primary_keys else sort_keys
312
- )
313
- rebase_expected_compact_partition_result = (
314
- rebase_expected_compact_partition_result.combine_chunks().sort_by(sorting_cols)
315
- )
316
- actual_rebase_compacted_table = (
317
- actual_rebase_compacted_table.combine_chunks().sort_by(sorting_cols)
318
- )
319
- assert actual_rebase_compacted_table.equals(
320
- rebase_expected_compact_partition_result
321
- ), f"{actual_rebase_compacted_table} does not match {rebase_expected_compact_partition_result}"
322
- """
323
- INCREMENTAL
324
- """
325
- (
326
- source_partition_locator_w_deltas,
327
- new_delta,
328
- incremental_delta_length,
329
- has_delete_deltas,
330
- ) = create_incremental_deltas_on_source_table(
331
- BASE_TEST_SOURCE_NAMESPACE,
332
- BASE_TEST_SOURCE_TABLE_NAME,
333
- BASE_TEST_SOURCE_TABLE_VERSION,
334
- source_table_stream,
335
- partition_values_param,
336
- incremental_deltas,
337
- ds_mock_kwargs,
338
- )
339
- compact_partition_params = CompactPartitionParams.of(
340
- {
341
- "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
342
- "compacted_file_content_type": ContentType.PARQUET,
343
- "dd_max_parallelism_ratio": 1.0,
344
- "deltacat_storage": ds,
345
- "deltacat_storage_kwargs": ds_mock_kwargs,
346
- "destination_partition_locator": compacted_delta_locator.partition_locator,
347
- "drop_duplicates": drop_duplicates_param,
348
- "hash_bucket_count": hash_bucket_count_param,
349
- "last_stream_position_to_compact": new_delta.stream_position,
350
- "list_deltas_kwargs": {**ds_mock_kwargs, **{"equivalent_table_types": []}},
351
- "object_store": RayPlasmaObjectStore(),
352
- "pg_config": pgm,
353
- "primary_keys": primary_keys,
354
- "read_kwargs_provider": read_kwargs_provider_param,
355
- "rebase_source_partition_locator": None,
356
- "rebase_source_partition_high_watermark": None,
357
- "records_per_compacted_file": records_per_compacted_file_param,
358
- "s3_client_kwargs": {},
359
- "source_partition_locator": source_partition_locator_w_deltas,
360
- "sort_keys": sort_keys if sort_keys else None,
361
- }
362
- )
363
- if expected_terminal_exception:
364
- with pytest.raises(expected_terminal_exception) as exc_info:
365
- compact_partition_func(compact_partition_params)
366
- assert expected_terminal_exception_message in str(exc_info.value)
367
- return
368
- rcf_file_s3_uri = compact_partition_func(compact_partition_params)
369
- round_completion_info = get_rcf(s3_resource, rcf_file_s3_uri)
370
- compacted_delta_locator_incremental: DeltaLocator = (
371
- round_completion_info.compacted_delta_locator
372
- )
373
- # assert if RCF covers all files
374
- if compactor_version != CompactorVersion.V1.value:
375
- previous_end = None
376
- for start, end in round_completion_info.hb_index_to_entry_range.values():
377
- assert (previous_end is None and start == 0) or start == previous_end
378
- previous_end = end
379
- assert (
380
- previous_end == round_completion_info.compacted_pyarrow_write_result.files
381
- )
382
259
 
383
- audit_bucket, audit_key = round_completion_info.compaction_audit_url.replace(
384
- "s3://", ""
385
- ).split("/", 1)
386
- compaction_audit_obj: dict = read_s3_contents(s3_resource, audit_bucket, audit_key)
387
- compaction_audit: CompactionSessionAuditInfo = CompactionSessionAuditInfo(
388
- **compaction_audit_obj
389
- )
260
+ with tempfile.TemporaryDirectory() as test_dir:
261
+ # Extract catalog from storage kwargs
262
+ catalog = ds_mock_kwargs.get("inner")
390
263
 
391
- tables = ds.download_delta(
392
- compacted_delta_locator_incremental,
393
- storage_type=StorageType.LOCAL,
394
- **ds_mock_kwargs,
395
- )
396
- actual_compacted_table = pa.concat_tables(tables)
397
- expected_terminal_compact_partition_result = (
398
- expected_terminal_compact_partition_result.combine_chunks().sort_by(
399
- sorting_cols
264
+ compact_partition_params = CompactPartitionParams.of(
265
+ {
266
+ "catalog": catalog,
267
+ "compacted_file_content_type": ContentType.PARQUET,
268
+ "dd_max_parallelism_ratio": 1.0,
269
+ "deltacat_storage": metastore,
270
+ "deltacat_storage_kwargs": ds_mock_kwargs,
271
+ "destination_partition_locator": destination_partition_locator,
272
+ "hash_bucket_count": hash_bucket_count_param,
273
+ "last_stream_position_to_compact": source_partition.stream_position,
274
+ "list_deltas_kwargs": {
275
+ **ds_mock_kwargs,
276
+ **{"equivalent_table_types": []},
277
+ },
278
+ "object_store": FileObjectStore(test_dir),
279
+ "original_fields": {
280
+ "pk_col_1",
281
+ "pk_col_2",
282
+ "sk_col_1",
283
+ "sk_col_2",
284
+ "col_1",
285
+ "col_2",
286
+ "region_id",
287
+ },
288
+ "pg_config": pgm,
289
+ "primary_keys": primary_keys,
290
+ "all_column_names": all_column_names,
291
+ "read_kwargs_provider": read_kwargs_provider_param,
292
+ "rebase_source_partition_locator": source_partition.locator,
293
+ "records_per_compacted_file": records_per_compacted_file_param,
294
+ "source_partition_locator": rebased_partition.locator,
295
+ "sort_keys": sort_keys if sort_keys else None,
296
+ }
400
297
  )
401
- )
402
- actual_compacted_table = actual_compacted_table.combine_chunks().sort_by(
403
- sorting_cols
404
- )
405
- # NOTE: if delete type-deltas are present this relationship no longer holds true
406
- if not has_delete_deltas:
407
- assert compaction_audit.input_records == (
408
- incremental_delta_length if incremental_deltas else 0
409
- ) + len(actual_rebase_compacted_table), (
410
- " Total input records must be equal to incremental deltas"
411
- " + previous compacted table size"
298
+ # execute
299
+ benchmark(compact_partition_func, compact_partition_params)
300
+ compacted_delta_locator: DeltaLocator = (
301
+ get_compacted_delta_locator_from_partition(
302
+ destination_partition_locator,
303
+ metastore,
304
+ catalog=catalog,
305
+ )
306
+ )
307
+ tables = metastore.download_delta(
308
+ compacted_delta_locator,
309
+ storage_type=StorageType.LOCAL,
310
+ **ds_mock_kwargs,
311
+ )
312
+ actual_rebase_compacted_table = pa.concat_tables(tables)
313
+ all_column_names = metastore.get_table_version_column_names(
314
+ destination_partition_locator.namespace,
315
+ destination_partition_locator.table_name,
316
+ destination_partition_locator.table_version,
317
+ **ds_mock_kwargs,
318
+ )
319
+ # if no primary key is specified then sort by sort_key for consistent assertion
320
+ sorting_cols: List[Any] = (
321
+ [(val, "ascending") for val in primary_keys]
322
+ if primary_keys
323
+ else [pa_key for key in sort_keys for pa_key in key.arrow]
324
+ if sort_keys
325
+ else []
326
+ )
327
+ rebase_expected_compact_partition_result = (
328
+ rebase_expected_compact_partition_result.combine_chunks().sort_by(
329
+ sorting_cols
330
+ )
412
331
  )
332
+ actual_rebase_compacted_table = (
333
+ actual_rebase_compacted_table.combine_chunks().sort_by(sorting_cols)
334
+ )
335
+ assert actual_rebase_compacted_table.equals(
336
+ rebase_expected_compact_partition_result
337
+ ), f"{actual_rebase_compacted_table} does not match {rebase_expected_compact_partition_result}"
413
338
 
414
- assert actual_compacted_table.equals(
415
- expected_terminal_compact_partition_result
416
- ), f"{actual_compacted_table} does not match {expected_terminal_compact_partition_result}"
339
+ """
340
+ INCREMENTAL
341
+ """
342
+ (
343
+ source_partition_locator_w_deltas,
344
+ new_delta,
345
+ incremental_delta_length,
346
+ has_delete_deltas,
347
+ ) = create_incremental_deltas_on_source_table_main(
348
+ BASE_TEST_SOURCE_NAMESPACE,
349
+ BASE_TEST_SOURCE_TABLE_NAME,
350
+ BASE_TEST_SOURCE_TABLE_VERSION,
351
+ source_table_stream,
352
+ partition_values_param,
353
+ incremental_deltas,
354
+ ds_mock_kwargs,
355
+ )
417
356
 
418
- if assert_compaction_audit is not None:
419
- if not assert_compaction_audit(compactor_version, compaction_audit):
420
- assert False, "Compaction audit assertion failed"
421
- return
357
+ # Handle empty incremental deltas case
358
+ if new_delta is None:
359
+ # For empty incremental deltas, the expected result should be the same as rebase result
360
+ # Skip incremental compaction and just verify the rebase result
361
+ actual_compact_partition_result = actual_rebase_compacted_table
362
+ compaction_audit = None
363
+ else:
364
+ # Perform incremental compaction when there are actual deltas
365
+ last_stream_position = new_delta.stream_position
366
+
367
+ compact_partition_params = CompactPartitionParams.of(
368
+ {
369
+ "catalog": catalog,
370
+ "compacted_file_content_type": ContentType.PARQUET,
371
+ "dd_max_parallelism_ratio": 1.0,
372
+ "deltacat_storage": metastore,
373
+ "deltacat_storage_kwargs": ds_mock_kwargs,
374
+ "destination_partition_locator": compacted_delta_locator.partition_locator,
375
+ "drop_duplicates": drop_duplicates_param,
376
+ "hash_bucket_count": hash_bucket_count_param,
377
+ "last_stream_position_to_compact": last_stream_position,
378
+ "list_deltas_kwargs": {
379
+ **ds_mock_kwargs,
380
+ **{"equivalent_table_types": []},
381
+ },
382
+ "object_store": FileObjectStore(test_dir),
383
+ "original_fields": {
384
+ "pk_col_1",
385
+ "pk_col_2",
386
+ "sk_col_1",
387
+ "sk_col_2",
388
+ "col_1",
389
+ "col_2",
390
+ "region_id",
391
+ },
392
+ "pg_config": pgm,
393
+ "primary_keys": primary_keys,
394
+ "all_column_names": all_column_names,
395
+ "read_kwargs_provider": read_kwargs_provider_param,
396
+ "rebase_source_partition_locator": None,
397
+ "rebase_source_partition_high_watermark": None,
398
+ "records_per_compacted_file": records_per_compacted_file_param,
399
+ "source_partition_locator": source_partition_locator_w_deltas,
400
+ "sort_keys": sort_keys if sort_keys else None,
401
+ }
402
+ )
403
+ if expected_terminal_exception:
404
+ with pytest.raises(expected_terminal_exception) as exc_info:
405
+ compact_partition_func(compact_partition_params)
406
+ assert expected_terminal_exception_message in str(exc_info.value)
407
+ return
408
+ compact_partition_func(compact_partition_params)
409
+ # assert
410
+ compacted_delta_locator: DeltaLocator = (
411
+ get_compacted_delta_locator_from_partition(
412
+ destination_partition_locator, metastore, catalog=catalog
413
+ )
414
+ )
415
+ tables = metastore.download_delta(
416
+ compacted_delta_locator,
417
+ storage_type=StorageType.LOCAL,
418
+ **ds_mock_kwargs,
419
+ )
420
+ actual_compact_partition_result = pa.concat_tables(tables)
421
+
422
+ # Get compaction audit for verification if needed
423
+ round_completion_info = get_rci_from_partition(
424
+ destination_partition_locator, metastore, catalog=catalog
425
+ )
426
+ # Get catalog root for audit file resolution
427
+ catalog_root = catalog.root
428
+
429
+ compaction_audit_obj: dict = read_audit_file(
430
+ round_completion_info.compaction_audit_url, catalog_root
431
+ )
432
+ compaction_audit = CompactionSessionAuditInfo(**compaction_audit_obj)
433
+
434
+ # Verify the final result
435
+ actual_compact_partition_result = (
436
+ actual_compact_partition_result.combine_chunks().sort_by(sorting_cols)
437
+ )
438
+ expected_terminal_compact_partition_result = (
439
+ expected_terminal_compact_partition_result.combine_chunks().sort_by(
440
+ sorting_cols
441
+ )
442
+ )
443
+ assert actual_compact_partition_result.equals(
444
+ expected_terminal_compact_partition_result
445
+ ), f"{actual_compact_partition_result} does not match {expected_terminal_compact_partition_result}"
446
+
447
+ if assert_compaction_audit is not None and compaction_audit is not None:
448
+ if not assert_compaction_audit(compactor_version, compaction_audit):
449
+ pytest.fail("Compaction audit assertion failed")
450
+ return