deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367) hide show
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -1,29 +1,29 @@
1
- import ray
2
- from moto import mock_s3
3
- import pytest
4
- import os
5
1
  import logging
6
- import boto3
7
- from typing import Any, Callable, Dict, List, Optional, Set, Tuple
8
- from boto3.resources.base import ServiceResource
2
+ from typing import Any, Dict, List, Optional, Set, Tuple, Callable
3
+ import uuid
4
+ import pytest
5
+
9
6
  import pyarrow as pa
7
+ import ray
8
+
10
9
  from pytest_benchmark.fixture import BenchmarkFixture
11
10
  from deltacat.types.media import StorageType
12
11
 
13
12
  from deltacat.tests.compute.test_util_common import (
14
- get_rcf,
13
+ get_rci_from_partition,
14
+ read_audit_file,
15
+ PartitionKeyType,
15
16
  )
16
- from deltacat.compute.compactor.model.compactor_version import CompactorVersion
17
- from deltacat.tests.test_utils.utils import read_s3_contents
18
- from deltacat.tests.compute.test_util_create_table_deltas_repo import (
19
- create_src_w_deltas_destination_plus_destination,
20
- add_late_deltas_to_partition,
17
+ from deltacat.tests.compute.test_util_common import (
18
+ add_late_deltas_to_partition_main,
19
+ create_src_w_deltas_destination_plus_destination_main,
21
20
  )
21
+ from deltacat.compute.compactor.model.compactor_version import CompactorVersion
22
+
22
23
  from deltacat.tests.compute.compact_partition_test_cases import (
23
24
  INCREMENTAL_TEST_CASES,
24
25
  )
25
26
  from deltacat.tests.compute.test_util_constant import (
26
- TEST_S3_RCF_BUCKET_NAME,
27
27
  DEFAULT_NUM_WORKERS,
28
28
  DEFAULT_WORKER_INSTANCE_CPUS,
29
29
  )
@@ -37,6 +37,7 @@ from deltacat.storage import (
37
37
  DeltaLocator,
38
38
  Partition,
39
39
  PartitionLocator,
40
+ metastore,
40
41
  )
41
42
  from deltacat.types.media import ContentType
42
43
  from deltacat.compute.compactor.model.compaction_session_audit_info import (
@@ -50,11 +51,6 @@ from deltacat.utils.placement import (
50
51
  )
51
52
  from deltacat import logs
52
53
 
53
- DATABASE_FILE_PATH_KEY, DATABASE_FILE_PATH_VALUE = (
54
- "db_file_path",
55
- "deltacat/tests/local_deltacat_storage/db_test.sqlite",
56
- )
57
-
58
54
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
59
55
 
60
56
 
@@ -70,55 +66,11 @@ def setup_ray_cluster():
70
66
  ray.shutdown()
71
67
 
72
68
 
73
- @pytest.fixture(autouse=True, scope="module")
74
- def mock_aws_credential():
75
- os.environ["AWS_ACCESS_KEY_ID"] = "testing"
76
- os.environ["AWS_SECRET_ACCESS_ID"] = "testing"
77
- os.environ["AWS_SECURITY_TOKEN"] = "testing"
78
- os.environ["AWS_SESSION_TOKEN"] = "testing"
79
- os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
80
- yield
81
-
82
-
83
- @pytest.fixture(autouse=True, scope="module")
84
- def cleanup_the_database_file_after_all_compaction_session_package_tests_complete():
85
- # make sure the database file is deleted after all the compactor package tests are completed
86
- yield
87
- if os.path.exists(DATABASE_FILE_PATH_VALUE):
88
- os.remove(DATABASE_FILE_PATH_VALUE)
89
-
90
-
91
- @pytest.fixture(scope="module")
92
- def s3_resource():
93
- with mock_s3():
94
- yield boto3.resource("s3")
95
-
96
-
97
- @pytest.fixture(autouse=True, scope="module")
98
- def setup_compaction_artifacts_s3_bucket(s3_resource: ServiceResource):
99
- s3_resource.create_bucket(
100
- ACL="authenticated-read",
101
- Bucket=TEST_S3_RCF_BUCKET_NAME,
102
- )
103
- yield
104
-
105
-
106
69
  """
107
70
  FUNCTION scoped fixtures
108
71
  """
109
72
 
110
73
 
111
- @pytest.fixture(scope="function")
112
- def offer_local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
113
- # see deltacat/tests/local_deltacat_storage/README.md for documentation
114
- kwargs_for_local_deltacat_storage: Dict[str, Any] = {
115
- DATABASE_FILE_PATH_KEY: DATABASE_FILE_PATH_VALUE,
116
- }
117
- yield kwargs_for_local_deltacat_storage
118
- if os.path.exists(DATABASE_FILE_PATH_VALUE):
119
- os.remove(DATABASE_FILE_PATH_VALUE)
120
-
121
-
122
74
  @pytest.fixture(autouse=True, scope="function")
123
75
  def enable_bucketing_spec_validation(monkeypatch):
124
76
  """
@@ -134,6 +86,11 @@ def enable_bucketing_spec_validation(monkeypatch):
134
86
  )
135
87
 
136
88
 
89
+ @pytest.fixture(scope="function")
90
+ def temp_dir(tmp_path):
91
+ return str(tmp_path)
92
+
93
+
137
94
  @pytest.mark.parametrize(
138
95
  [
139
96
  "test_name",
@@ -207,9 +164,8 @@ def enable_bucketing_spec_validation(monkeypatch):
207
164
  ],
208
165
  ids=[test_name for test_name in INCREMENTAL_TEST_CASES],
209
166
  )
210
- def test_compact_partition_incremental(
211
- s3_resource: ServiceResource,
212
- offer_local_deltacat_storage_kwargs: Dict[str, Any],
167
+ def test_compact_partition_incremental_main(
168
+ main_deltacat_storage_kwargs: Dict[str, Any],
213
169
  test_name: str,
214
170
  primary_keys: Set[str],
215
171
  sort_keys: Dict[str, str],
@@ -233,9 +189,16 @@ def test_compact_partition_incremental(
233
189
  compact_partition_func: Callable,
234
190
  benchmark: BenchmarkFixture,
235
191
  ):
236
- import deltacat.tests.local_deltacat_storage as ds
192
+ # Skip in-place compaction tests for main storage as it's not yet implemented
193
+ if is_inplace:
194
+ pytest.skip(
195
+ "In-place compaction not yet implemented in main storage (delta prepending limitation)"
196
+ )
237
197
 
238
- ds_mock_kwargs: Dict[str, Any] = offer_local_deltacat_storage_kwargs
198
+ ds_mock_kwargs: Dict[str, Any] = main_deltacat_storage_kwargs
199
+
200
+ # Extract catalog from storage kwargs
201
+ catalog = ds_mock_kwargs.get("inner")
239
202
 
240
203
  # setup
241
204
  partition_keys = partition_keys_param
@@ -246,8 +209,7 @@ def test_compact_partition_incremental(
246
209
  source_table_namespace,
247
210
  source_table_name,
248
211
  source_table_version,
249
- ) = create_src_w_deltas_destination_plus_destination(
250
- primary_keys,
212
+ ) = create_src_w_deltas_destination_plus_destination_main(
251
213
  sort_keys,
252
214
  partition_keys,
253
215
  input_deltas,
@@ -256,15 +218,38 @@ def test_compact_partition_incremental(
256
218
  ds_mock_kwargs,
257
219
  is_inplace,
258
220
  )
259
- source_partition: Partition = ds.get_partition(
221
+
222
+ # Convert partition values to correct types for get_partition call
223
+ converted_partition_values = []
224
+ if partition_values_param and partition_keys:
225
+ # partition_values_param is a single string, but we need to handle it as a list
226
+ partition_values_list = (
227
+ [partition_values_param]
228
+ if isinstance(partition_values_param, str)
229
+ else partition_values_param
230
+ )
231
+ for i, (value, pk) in enumerate(zip(partition_values_list, partition_keys)):
232
+ if pk.key_type == PartitionKeyType.INT:
233
+ converted_partition_values.append(int(value))
234
+ else:
235
+ converted_partition_values.append(value)
236
+ else:
237
+ converted_partition_values = (
238
+ [partition_values_param] if partition_values_param else []
239
+ )
240
+
241
+ source_partition: Partition = metastore.get_partition(
260
242
  source_table_stream.locator,
261
- partition_values_param,
243
+ converted_partition_values,
244
+ partition_scheme_id="default_partition_scheme" if partition_keys else None,
262
245
  **ds_mock_kwargs,
263
246
  )
247
+ # Generate a destination partition ID based on the source partition
248
+ destination_partition_id = str(uuid.uuid4())
264
249
  destination_partition_locator: PartitionLocator = PartitionLocator.of(
265
250
  destination_table_stream.locator,
266
- partition_values_param,
267
- None,
251
+ converted_partition_values,
252
+ destination_partition_id,
268
253
  )
269
254
  num_workers, worker_instance_cpu = DEFAULT_NUM_WORKERS, DEFAULT_WORKER_INSTANCE_CPUS
270
255
  total_cpus: int = num_workers * worker_instance_cpu
@@ -275,12 +260,18 @@ def test_compact_partition_incremental(
275
260
  if create_placement_group_param
276
261
  else None
277
262
  )
263
+ all_column_names = metastore.get_table_version_column_names(
264
+ destination_table_stream.locator.table_locator.namespace,
265
+ destination_table_stream.locator.table_locator.table_name,
266
+ destination_table_stream.locator.table_version_locator.table_version,
267
+ catalog=catalog,
268
+ )
278
269
  compact_partition_params = CompactPartitionParams.of(
279
270
  {
280
- "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
271
+ "catalog": catalog,
281
272
  "compacted_file_content_type": ContentType.PARQUET,
282
273
  "dd_max_parallelism_ratio": 1.0,
283
- "deltacat_storage": ds,
274
+ "deltacat_storage": metastore,
284
275
  "deltacat_storage_kwargs": ds_mock_kwargs,
285
276
  "destination_partition_locator": destination_partition_locator,
286
277
  "drop_duplicates": drop_duplicates_param,
@@ -289,11 +280,11 @@ def test_compact_partition_incremental(
289
280
  "list_deltas_kwargs": {**ds_mock_kwargs, **{"equivalent_table_types": []}},
290
281
  "pg_config": pgm,
291
282
  "primary_keys": primary_keys,
283
+ "all_column_names": all_column_names,
292
284
  "read_kwargs_provider": read_kwargs_provider_param,
293
285
  "rebase_source_partition_locator": None,
294
286
  "rebase_source_partition_high_watermark": None,
295
287
  "records_per_compacted_file": records_per_compacted_file_param,
296
- "s3_client_kwargs": {},
297
288
  "source_partition_locator": source_partition.locator,
298
289
  "sort_keys": sort_keys if sort_keys else None,
299
290
  }
@@ -304,18 +295,17 @@ def test_compact_partition_incremental(
304
295
  """
305
296
  This callable runs right before invoking the benchmark target function (compaction).
306
297
  This is needed as the benchmark module will invoke the target function multiple times
307
- in a single test run, which can lead to non-idempotent behavior if RCFs are generated.
298
+ in a single test run, which can lead to non-idempotent behavior if RCIs are generated.
308
299
 
309
300
  Returns: args, kwargs
310
301
  """
311
- s3_resource.Bucket(TEST_S3_RCF_BUCKET_NAME).objects.all().delete()
312
302
  return (compact_partition_params,), {}
313
303
 
314
304
  if add_late_deltas:
315
305
  # NOTE: In the case of in-place compaction it is plausible that new deltas may be added to the source partition during compaction
316
306
  # (so that the source_partitition.stream_position > last_stream_position_to_compact).
317
307
  # This parameter helps simulate the case to check that no late deltas are dropped even when the compacted partition is created.
318
- latest_delta, _ = add_late_deltas_to_partition(
308
+ latest_delta, _ = add_late_deltas_to_partition_main(
319
309
  add_late_deltas, source_partition, ds_mock_kwargs
320
310
  )
321
311
  if expected_terminal_exception:
@@ -323,27 +313,28 @@ def test_compact_partition_incremental(
323
313
  compact_partition_func(compact_partition_params)
324
314
  assert expected_terminal_exception_message in str(exc_info.value)
325
315
  return
326
- rcf_file_s3_uri = benchmark.pedantic(
327
- compact_partition_func, setup=_incremental_compaction_setup
328
- )
316
+ benchmark.pedantic(compact_partition_func, setup=_incremental_compaction_setup)
329
317
 
330
- # validate
331
- round_completion_info: RoundCompletionInfo = get_rcf(s3_resource, rcf_file_s3_uri)
318
+ # validate - get RoundCompletionInfo from the compacted partition
319
+ round_completion_info: RoundCompletionInfo = get_rci_from_partition(
320
+ destination_partition_locator, metastore, catalog=catalog
321
+ )
332
322
  compacted_delta_locator: DeltaLocator = (
333
323
  round_completion_info.compacted_delta_locator
334
324
  )
335
- audit_bucket, audit_key = RoundCompletionInfo.get_audit_bucket_name_and_key(
336
- round_completion_info.compaction_audit_url
337
- )
338
325
 
339
- compaction_audit_obj: Dict[str, Any] = read_s3_contents(
340
- s3_resource, audit_bucket, audit_key
326
+ # Get catalog root for audit file resolution
327
+ catalog_root = catalog.root
328
+
329
+ compaction_audit_obj: Dict[str, Any] = read_audit_file(
330
+ round_completion_info.compaction_audit_url, catalog_root
341
331
  )
332
+
342
333
  compaction_audit: CompactionSessionAuditInfo = CompactionSessionAuditInfo(
343
334
  **compaction_audit_obj
344
335
  )
345
336
 
346
- # assert if RCF covers all files
337
+ # assert if RCI covers all files
347
338
  if compactor_version != CompactorVersion.V1.value:
348
339
  previous_end = None
349
340
  for start, end in round_completion_info.hb_index_to_entry_range.values():
@@ -353,7 +344,7 @@ def test_compact_partition_incremental(
353
344
  previous_end == round_completion_info.compacted_pyarrow_write_result.files
354
345
  )
355
346
 
356
- tables = ds.download_delta(
347
+ tables = metastore.download_delta(
357
348
  compacted_delta_locator, storage_type=StorageType.LOCAL, **ds_mock_kwargs
358
349
  )
359
350
  actual_compacted_table = pa.concat_tables(tables)
@@ -387,25 +378,27 @@ def test_compact_partition_incremental(
387
378
  == destination_partition_locator.partition_values
388
379
  and source_partition.locator.stream_id
389
380
  == destination_partition_locator.stream_id
390
- ), f"The source partition: {source_partition.locator.canonical_string} should match the destination partition: {destination_partition_locator.canonical_string}"
381
+ ), f"The source partition: {source_partition.locator} should match the destination partition: {destination_partition_locator}"
391
382
  assert (
392
383
  compacted_delta_locator.stream_id == source_partition.locator.stream_id
393
384
  ), "The compacted delta should be in the same stream as the source"
394
- source_partition: Partition = ds.get_partition(
385
+ source_partition: Partition = metastore.get_partition(
395
386
  source_table_stream.locator,
396
- partition_values_param,
387
+ converted_partition_values,
388
+ partition_scheme_id="default_partition_scheme" if partition_keys else None,
397
389
  **ds_mock_kwargs,
398
390
  )
399
- compacted_partition: Optional[Partition] = ds.get_partition(
391
+ compacted_partition: Optional[Partition] = metastore.get_partition(
400
392
  compacted_delta_locator.stream_locator,
401
- partition_values_param,
393
+ converted_partition_values,
394
+ partition_scheme_id="default_partition_scheme" if partition_keys else None,
402
395
  **ds_mock_kwargs,
403
396
  )
404
397
  assert (
405
398
  compacted_partition.state == source_partition.state == CommitState.COMMITTED
406
399
  ), f"The compacted/source table partition should be in {CommitState.COMMITTED} state and not {CommitState.DEPRECATED}"
407
400
  if add_late_deltas:
408
- compacted_partition_deltas: List[Delta] = ds.list_partition_deltas(
401
+ compacted_partition_deltas: List[Delta] = metastore.list_partition_deltas(
409
402
  partition_like=compacted_partition,
410
403
  ascending_order=False,
411
404
  **ds_mock_kwargs,
@@ -1,43 +1,38 @@
1
- import ray
2
- import os
3
- from moto import mock_s3
1
+ import tempfile
2
+ from typing import Any, Dict, List, Optional, Set, Callable
4
3
  import pytest
5
- import boto3
6
- from boto3.resources.base import ServiceResource
7
4
  import pyarrow as pa
5
+ import ray
6
+
8
7
  from deltacat.io.file_object_store import FileObjectStore
9
8
  from pytest_benchmark.fixture import BenchmarkFixture
10
- import tempfile
11
9
 
12
10
  from deltacat.tests.compute.test_util_constant import (
13
- TEST_S3_RCF_BUCKET_NAME,
14
11
  DEFAULT_NUM_WORKERS,
15
12
  DEFAULT_WORKER_INSTANCE_CPUS,
16
13
  )
17
14
  from deltacat.tests.compute.test_util_common import (
18
- get_rcf,
15
+ get_rci_from_partition,
16
+ read_audit_file,
17
+ PartitionKey,
18
+ get_compacted_delta_locator_from_partition,
19
19
  )
20
- from deltacat.tests.test_utils.utils import read_s3_contents
21
- from deltacat.compute.compactor.model.compactor_version import CompactorVersion
22
20
  from deltacat.tests.compute.test_util_common import (
23
- get_compacted_delta_locator_from_rcf,
21
+ multiple_rounds_create_src_w_deltas_destination_rebase_w_deltas_strategy_main,
24
22
  )
23
+
24
+ from deltacat.compute.compactor.model.compactor_version import CompactorVersion
25
25
  from deltacat.compute.compactor.model.compaction_session_audit_info import (
26
26
  CompactionSessionAuditInfo,
27
27
  )
28
- from deltacat.tests.compute.test_util_create_table_deltas_repo import (
29
- multiple_rounds_create_src_w_deltas_destination_rebase_w_deltas_strategy,
30
- )
31
28
  from deltacat.tests.compute.compact_partition_multiple_rounds_test_cases import (
32
29
  MULTIPLE_ROUNDS_TEST_CASES,
33
30
  )
34
- from typing import Any, Callable, Dict, List, Optional, Set
35
- from deltacat.types.media import StorageType
31
+ from deltacat.types.media import StorageType, ContentType
36
32
  from deltacat.storage import (
37
33
  DeltaLocator,
38
34
  Partition,
39
35
  )
40
- from deltacat.types.media import ContentType
41
36
  from deltacat.compute.compactor.model.compact_partition_params import (
42
37
  CompactPartitionParams,
43
38
  )
@@ -47,11 +42,7 @@ from deltacat.compute.compactor import (
47
42
  from deltacat.utils.placement import (
48
43
  PlacementGroupManager,
49
44
  )
50
-
51
- DATABASE_FILE_PATH_KEY, DATABASE_FILE_PATH_VALUE = (
52
- "db_file_path",
53
- "deltacat/tests/local_deltacat_storage/db_test.sqlite",
54
- )
45
+ from deltacat.storage import metastore
55
46
 
56
47
 
57
48
  """
@@ -66,54 +57,11 @@ def setup_ray_cluster():
66
57
  ray.shutdown()
67
58
 
68
59
 
69
- @pytest.fixture(autouse=True, scope="module")
70
- def mock_aws_credential():
71
- os.environ["AWS_ACCESS_KEY_ID"] = "testing"
72
- os.environ["AWS_SECRET_ACCESS_ID"] = "testing"
73
- os.environ["AWS_SECURITY_TOKEN"] = "testing"
74
- os.environ["AWS_SESSION_TOKEN"] = "testing"
75
- os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
76
- yield
77
-
78
-
79
- @pytest.fixture(autouse=True, scope="module")
80
- def cleanup_the_database_file_after_all_compaction_session_package_tests_complete():
81
- # make sure the database file is deleted after all the compactor package tests are completed
82
- if os.path.exists(DATABASE_FILE_PATH_VALUE):
83
- os.remove(DATABASE_FILE_PATH_VALUE)
84
-
85
-
86
- @pytest.fixture(scope="module")
87
- def s3_resource(mock_aws_credential):
88
- with mock_s3():
89
- yield boto3.resource("s3")
90
-
91
-
92
- @pytest.fixture(autouse=True, scope="module")
93
- def setup_compaction_artifacts_s3_bucket(s3_resource: ServiceResource):
94
- s3_resource.create_bucket(
95
- ACL="authenticated-read",
96
- Bucket=TEST_S3_RCF_BUCKET_NAME,
97
- )
98
- yield
99
-
100
-
101
60
  """
102
61
  FUNCTION scoped fixtures
103
62
  """
104
63
 
105
64
 
106
- @pytest.fixture(scope="function")
107
- def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
108
- # see deltacat/tests/local_deltacat_storage/README.md for documentation
109
- kwargs_for_local_deltacat_storage: Dict[str, Any] = {
110
- DATABASE_FILE_PATH_KEY: DATABASE_FILE_PATH_VALUE,
111
- }
112
- yield kwargs_for_local_deltacat_storage
113
- if os.path.exists(DATABASE_FILE_PATH_VALUE):
114
- os.remove(DATABASE_FILE_PATH_VALUE)
115
-
116
-
117
65
  @pytest.fixture(autouse=True, scope="function")
118
66
  def enable_bucketing_spec_validation(monkeypatch):
119
67
  """
@@ -199,14 +147,13 @@ def enable_bucketing_spec_validation(monkeypatch):
199
147
  ],
200
148
  ids=[test_name for test_name in MULTIPLE_ROUNDS_TEST_CASES],
201
149
  )
202
- def test_compact_partition_rebase_multiple_rounds_same_source_and_destination(
150
+ def test_compact_partition_rebase_multiple_rounds_same_source_and_destination_main(
203
151
  mocker,
204
- s3_resource: ServiceResource,
205
- local_deltacat_storage_kwargs: Dict[str, Any],
152
+ main_deltacat_storage_kwargs: Dict[str, Any],
206
153
  test_name: str,
207
154
  primary_keys: Set[str],
208
155
  sort_keys: List[Optional[Any]],
209
- partition_keys_param: Optional[List[Any]],
156
+ partition_keys_param: Optional[List[PartitionKey]],
210
157
  partition_values_param: List[Optional[str]],
211
158
  input_deltas_param: List[pa.Array],
212
159
  expected_terminal_compact_partition_result: pa.Table,
@@ -225,38 +172,63 @@ def test_compact_partition_rebase_multiple_rounds_same_source_and_destination(
225
172
  num_rounds_param: int,
226
173
  benchmark: BenchmarkFixture,
227
174
  ):
228
- import deltacat.tests.local_deltacat_storage as ds
229
-
230
- ds_mock_kwargs = local_deltacat_storage_kwargs
175
+ ds_mock_kwargs = main_deltacat_storage_kwargs
231
176
  """
232
177
  This test tests different multi-round compaction rebase configurations,
233
- as specified in compact_partition_multiple_rounds_test_cases.py
178
+ as specified in compact_partition_multiple_rounds_test_cases.py.
234
179
  These tests do not test multi-round compaction backfill, which is
235
180
  currently unsupported.
181
+
182
+ This version uses the main metastore implementation instead of local storage.
236
183
  """
237
184
  (
238
185
  source_table_stream,
239
186
  _,
240
187
  rebased_table_stream,
241
188
  _,
242
- ) = multiple_rounds_create_src_w_deltas_destination_rebase_w_deltas_strategy(
243
- primary_keys,
189
+ ) = multiple_rounds_create_src_w_deltas_destination_rebase_w_deltas_strategy_main(
244
190
  sort_keys,
245
191
  partition_keys_param,
246
192
  input_deltas_param,
247
193
  partition_values_param,
248
194
  ds_mock_kwargs,
249
195
  )
250
- source_partition: Partition = ds.get_partition(
251
- source_table_stream.locator,
252
- partition_values_param,
196
+ # Convert partition values for partition lookup (same as in the helper function)
197
+ converted_partition_values_for_lookup = partition_values_param
198
+ if partition_values_param and partition_keys_param:
199
+ converted_partition_values_for_lookup = []
200
+ for i, (value, key) in enumerate(
201
+ zip(partition_values_param, partition_keys_param)
202
+ ):
203
+ if key.key_type == "int":
204
+ converted_partition_values_for_lookup.append(int(value))
205
+ elif key.key_type == "string":
206
+ converted_partition_values_for_lookup.append(str(value))
207
+ elif key.key_type == "timestamp":
208
+ converted_partition_values_for_lookup.append(
209
+ value
210
+ ) # Keep as is for now
211
+ else:
212
+ converted_partition_values_for_lookup.append(value)
213
+
214
+ source_partition: Partition = metastore.get_partition(
215
+ stream_locator=source_table_stream.locator,
216
+ partition_values=converted_partition_values_for_lookup,
217
+ partition_scheme_id=source_table_stream.partition_scheme.id,
253
218
  **ds_mock_kwargs,
254
219
  )
255
- rebased_partition: Partition = ds.get_partition(
256
- rebased_table_stream.locator,
257
- partition_values_param,
220
+ rebased_partition: Partition = metastore.get_partition(
221
+ stream_locator=rebased_table_stream.locator,
222
+ partition_values=converted_partition_values_for_lookup,
223
+ partition_scheme_id=rebased_table_stream.partition_scheme.id,
258
224
  **ds_mock_kwargs,
259
225
  )
226
+ all_column_names = metastore.get_table_version_column_names(
227
+ rebased_table_stream.locator.table_locator.namespace,
228
+ rebased_table_stream.locator.table_locator.table_name,
229
+ rebased_table_stream.locator.table_version_locator.table_version,
230
+ catalog=ds_mock_kwargs.get("inner"),
231
+ )
260
232
  total_cpus = DEFAULT_NUM_WORKERS * DEFAULT_WORKER_INSTANCE_CPUS
261
233
  pgm = None
262
234
  if create_placement_group_param:
@@ -266,10 +238,10 @@ def test_compact_partition_rebase_multiple_rounds_same_source_and_destination(
266
238
  with tempfile.TemporaryDirectory() as test_dir:
267
239
  compact_partition_params = CompactPartitionParams.of(
268
240
  {
269
- "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
241
+ "catalog": ds_mock_kwargs.get("inner"),
270
242
  "compacted_file_content_type": ContentType.PARQUET,
271
243
  "dd_max_parallelism_ratio": 1.0,
272
- "deltacat_storage": ds,
244
+ "deltacat_storage": metastore,
273
245
  "deltacat_storage_kwargs": ds_mock_kwargs,
274
246
  "destination_partition_locator": rebased_partition.locator,
275
247
  "hash_bucket_count": hash_bucket_count_param,
@@ -281,11 +253,11 @@ def test_compact_partition_rebase_multiple_rounds_same_source_and_destination(
281
253
  "object_store": FileObjectStore(test_dir),
282
254
  "pg_config": pgm,
283
255
  "primary_keys": primary_keys,
256
+ "all_column_names": all_column_names,
284
257
  "read_kwargs_provider": read_kwargs_provider_param,
285
258
  "rebase_source_partition_locator": source_partition.locator,
286
259
  "rebase_source_partition_high_watermark": rebased_partition.stream_position,
287
260
  "records_per_compacted_file": records_per_compacted_file_param,
288
- "s3_client_kwargs": {},
289
261
  "source_partition_locator": rebased_partition.locator,
290
262
  "sort_keys": sort_keys if sort_keys else None,
291
263
  "num_rounds": num_rounds_param,
@@ -308,23 +280,25 @@ def test_compact_partition_rebase_multiple_rounds_same_source_and_destination(
308
280
  object_store_clear_spy = mocker.spy(FileObjectStore, "clear")
309
281
 
310
282
  # execute
311
- rcf_file_s3_uri = benchmark(compact_partition_func, compact_partition_params)
283
+ benchmark(compact_partition_func, compact_partition_params)
312
284
 
313
- round_completion_info: RoundCompletionInfo = get_rcf(
314
- s3_resource, rcf_file_s3_uri
315
- )
316
- audit_bucket, audit_key = RoundCompletionInfo.get_audit_bucket_name_and_key(
317
- round_completion_info.compaction_audit_url
285
+ # Get RoundCompletionInfo from the compacted partition
286
+ round_completion_info: RoundCompletionInfo = get_rci_from_partition(
287
+ rebased_partition.locator, metastore, catalog=ds_mock_kwargs.get("inner")
318
288
  )
319
289
 
320
- compaction_audit_obj: Dict[str, Any] = read_s3_contents(
321
- s3_resource, audit_bucket, audit_key
290
+ # Get catalog root for audit file resolution
291
+ catalog = ds_mock_kwargs.get("inner")
292
+ catalog_root = catalog.root
293
+
294
+ compaction_audit_obj: Dict[str, Any] = read_audit_file(
295
+ round_completion_info.compaction_audit_url, catalog_root
322
296
  )
323
297
  compaction_audit: CompactionSessionAuditInfo = CompactionSessionAuditInfo(
324
298
  **compaction_audit_obj
325
299
  )
326
300
 
327
- # assert if RCF covers all files
301
+ # assert if RCI covers all files
328
302
  # multiple rounds feature is only supported in V2 compactor
329
303
  previous_end = None
330
304
  for start, end in round_completion_info.hb_index_to_entry_range.values():
@@ -338,16 +312,24 @@ def test_compact_partition_rebase_multiple_rounds_same_source_and_destination(
338
312
  assert (
339
313
  execute_compaction_result_spy.call_args.args[-1] is False
340
314
  ), "Table version erroneously marked as in-place compacted!"
341
- compacted_delta_locator: DeltaLocator = get_compacted_delta_locator_from_rcf(
342
- s3_resource, rcf_file_s3_uri
315
+ compacted_delta_locator: DeltaLocator = (
316
+ get_compacted_delta_locator_from_partition(
317
+ rebased_partition.locator,
318
+ metastore,
319
+ catalog=ds_mock_kwargs.get("inner"),
320
+ )
343
321
  )
344
- tables = ds.download_delta(
322
+ tables = metastore.download_delta(
345
323
  compacted_delta_locator, storage_type=StorageType.LOCAL, **ds_mock_kwargs
346
324
  )
347
325
  actual_rebase_compacted_table = pa.concat_tables(tables)
348
326
  # if no primary key is specified then sort by sort_key for consistent assertion
349
327
  sorting_cols: List[Any] = (
350
- [(val, "ascending") for val in primary_keys] if primary_keys else sort_keys
328
+ [(val, "ascending") for val in primary_keys]
329
+ if primary_keys
330
+ else [pa_key for key in sort_keys for pa_key in key.arrow]
331
+ if sort_keys
332
+ else []
351
333
  )
352
334
  rebase_expected_compact_partition_result = (
353
335
  rebase_expected_compact_partition_result.combine_chunks().sort_by(