deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324) hide show
  1. deltacat/__init__.py +117 -18
  2. deltacat/api.py +536 -126
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +1 -19
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2444 -282
  12. deltacat/catalog/model/catalog.py +208 -113
  13. deltacat/catalog/model/properties.py +63 -24
  14. deltacat/compute/__init__.py +14 -0
  15. deltacat/compute/compactor/compaction_session.py +97 -75
  16. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  17. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  18. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  19. deltacat/compute/compactor/repartition_session.py +8 -21
  20. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  21. deltacat/compute/compactor/steps/materialize.py +9 -7
  22. deltacat/compute/compactor/steps/repartition.py +12 -11
  23. deltacat/compute/compactor/utils/io.py +6 -5
  24. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  25. deltacat/compute/compactor/utils/system_columns.py +3 -1
  26. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  27. deltacat/compute/compactor_v2/constants.py +30 -1
  28. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  29. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  30. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  31. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  32. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  33. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  34. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  35. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  36. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  37. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  38. deltacat/compute/compactor_v2/utils/io.py +11 -4
  39. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  40. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  41. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  42. deltacat/compute/converter/constants.py +5 -0
  43. deltacat/compute/converter/converter_session.py +207 -52
  44. deltacat/compute/converter/model/convert_input.py +43 -16
  45. deltacat/compute/converter/model/convert_input_files.py +33 -16
  46. deltacat/compute/converter/model/convert_result.py +80 -0
  47. deltacat/compute/converter/model/converter_session_params.py +64 -19
  48. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  49. deltacat/compute/converter/pyiceberg/overrides.py +193 -65
  50. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  51. deltacat/compute/converter/steps/convert.py +230 -75
  52. deltacat/compute/converter/steps/dedupe.py +46 -12
  53. deltacat/compute/converter/utils/convert_task_options.py +66 -22
  54. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  55. deltacat/compute/converter/utils/iceberg_columns.py +13 -8
  56. deltacat/compute/converter/utils/io.py +173 -13
  57. deltacat/compute/converter/utils/s3u.py +42 -27
  58. deltacat/compute/janitor.py +205 -0
  59. deltacat/compute/jobs/client.py +417 -0
  60. deltacat/compute/resource_estimation/delta.py +38 -6
  61. deltacat/compute/resource_estimation/model.py +8 -0
  62. deltacat/constants.py +49 -6
  63. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  64. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  65. deltacat/env.py +10 -0
  66. deltacat/examples/basic_logging.py +6 -6
  67. deltacat/examples/compactor/aws/__init__.py +1 -0
  68. deltacat/examples/compactor/bootstrap.py +863 -0
  69. deltacat/examples/compactor/compactor.py +373 -0
  70. deltacat/examples/compactor/explorer.py +473 -0
  71. deltacat/examples/compactor/gcp/__init__.py +1 -0
  72. deltacat/examples/compactor/job_runner.py +439 -0
  73. deltacat/examples/compactor/utils/__init__.py +1 -0
  74. deltacat/examples/compactor/utils/common.py +261 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  80. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  81. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
  82. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  83. deltacat/examples/hello_world.py +4 -2
  84. deltacat/examples/indexer/indexer.py +163 -0
  85. deltacat/examples/indexer/job_runner.py +198 -0
  86. deltacat/exceptions.py +66 -4
  87. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  88. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  89. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
  90. deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
  91. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  92. deltacat/experimental/converter_agent/__init__.py +0 -0
  93. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  94. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  95. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  96. deltacat/experimental/daft/__init__.py +4 -0
  97. deltacat/experimental/daft/daft_catalog.py +229 -0
  98. deltacat/experimental/storage/__init__.py +0 -0
  99. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  100. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  101. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  102. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  103. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  104. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  105. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  107. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  108. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  109. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  110. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  111. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  112. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  113. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  114. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  115. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  116. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  117. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  118. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  119. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  120. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  121. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  122. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  123. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  124. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  125. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  126. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  127. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  128. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  129. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  130. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  131. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  132. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  133. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  134. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  135. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  136. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  137. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  138. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  139. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  140. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  141. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  142. deltacat/io/__init__.py +13 -0
  143. deltacat/io/dataset/__init__.py +0 -0
  144. deltacat/io/dataset/deltacat_dataset.py +91 -0
  145. deltacat/io/datasink/__init__.py +0 -0
  146. deltacat/io/datasink/deltacat_datasink.py +207 -0
  147. deltacat/io/datasource/__init__.py +0 -0
  148. deltacat/io/datasource/deltacat_datasource.py +579 -0
  149. deltacat/io/reader/__init__.py +0 -0
  150. deltacat/io/reader/deltacat_read_api.py +172 -0
  151. deltacat/storage/__init__.py +22 -2
  152. deltacat/storage/interface.py +54 -32
  153. deltacat/storage/main/impl.py +1494 -541
  154. deltacat/storage/model/delta.py +27 -3
  155. deltacat/storage/model/expression/__init__.py +47 -0
  156. deltacat/storage/model/expression/expression.py +656 -0
  157. deltacat/storage/model/expression/visitor.py +248 -0
  158. deltacat/storage/model/locator.py +6 -12
  159. deltacat/storage/model/manifest.py +231 -6
  160. deltacat/storage/model/metafile.py +224 -119
  161. deltacat/storage/model/namespace.py +8 -1
  162. deltacat/storage/model/partition.py +117 -42
  163. deltacat/storage/model/scan/push_down.py +32 -5
  164. deltacat/storage/model/schema.py +2427 -159
  165. deltacat/storage/model/shard.py +6 -2
  166. deltacat/storage/model/sort_key.py +40 -0
  167. deltacat/storage/model/stream.py +9 -2
  168. deltacat/storage/model/table.py +12 -1
  169. deltacat/storage/model/table_version.py +11 -0
  170. deltacat/storage/model/transaction.py +1184 -208
  171. deltacat/storage/model/transform.py +81 -2
  172. deltacat/storage/model/types.py +53 -29
  173. deltacat/storage/util/__init__.py +0 -0
  174. deltacat/storage/util/scan_planner.py +26 -0
  175. deltacat/tests/_io/reader/__init__.py +0 -0
  176. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  177. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  178. deltacat/tests/aws/test_s3u.py +2 -31
  179. deltacat/tests/catalog/data/__init__.py +0 -0
  180. deltacat/tests/catalog/main/__init__.py +0 -0
  181. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  182. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  183. deltacat/tests/catalog/model/__init__.py +0 -0
  184. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  185. deltacat/tests/catalog/test_catalogs.py +103 -106
  186. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  187. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  188. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  189. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  190. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  191. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  192. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  193. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  194. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  195. deltacat/tests/compute/conftest.py +8 -44
  196. deltacat/tests/compute/converter/test_convert_session.py +697 -349
  197. deltacat/tests/compute/converter/utils.py +15 -6
  198. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  199. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  200. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  201. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  202. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  203. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  204. deltacat/tests/compute/test_janitor.py +236 -0
  205. deltacat/tests/compute/test_util_common.py +716 -43
  206. deltacat/tests/compute/test_util_constant.py +0 -1
  207. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  208. deltacat/tests/daft/__init__.py +0 -0
  209. deltacat/tests/daft/test_model.py +97 -0
  210. deltacat/tests/experimental/__init__.py +1 -0
  211. deltacat/tests/experimental/catalog/__init__.py +0 -0
  212. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  213. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  214. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  215. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  216. deltacat/tests/experimental/daft/__init__.py +0 -0
  217. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  218. deltacat/tests/experimental/storage/__init__.py +0 -0
  219. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  220. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  221. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  222. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  223. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  224. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  225. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  226. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  227. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  228. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  229. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  230. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  231. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  232. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  233. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  234. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  235. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  236. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  237. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  238. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  239. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  240. deltacat/tests/storage/model/test_expression.py +327 -0
  241. deltacat/tests/storage/model/test_manifest.py +129 -0
  242. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  243. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  244. deltacat/tests/storage/model/test_schema.py +171 -0
  245. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  246. deltacat/tests/storage/model/test_shard.py +3 -1
  247. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  248. deltacat/tests/storage/model/test_transaction.py +393 -48
  249. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  250. deltacat/tests/test_deltacat_api.py +1036 -11
  251. deltacat/tests/test_exceptions.py +9 -5
  252. deltacat/tests/test_utils/pyarrow.py +52 -21
  253. deltacat/tests/test_utils/storage.py +23 -34
  254. deltacat/tests/types/__init__.py +0 -0
  255. deltacat/tests/types/test_tables.py +104 -0
  256. deltacat/tests/utils/exceptions.py +22 -0
  257. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  258. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  259. deltacat/tests/utils/test_daft.py +121 -31
  260. deltacat/tests/utils/test_numpy.py +1193 -0
  261. deltacat/tests/utils/test_pandas.py +1106 -0
  262. deltacat/tests/utils/test_polars.py +1040 -0
  263. deltacat/tests/utils/test_pyarrow.py +1370 -89
  264. deltacat/types/media.py +345 -37
  265. deltacat/types/tables.py +2344 -46
  266. deltacat/utils/arguments.py +33 -1
  267. deltacat/utils/daft.py +824 -40
  268. deltacat/utils/export.py +3 -1
  269. deltacat/utils/filesystem.py +139 -9
  270. deltacat/utils/metafile_locator.py +2 -1
  271. deltacat/utils/numpy.py +118 -26
  272. deltacat/utils/pandas.py +577 -48
  273. deltacat/utils/polars.py +759 -0
  274. deltacat/utils/pyarrow.py +1373 -192
  275. deltacat/utils/ray_utils/concurrency.py +1 -1
  276. deltacat/utils/ray_utils/dataset.py +101 -10
  277. deltacat/utils/ray_utils/runtime.py +56 -4
  278. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  279. deltacat/utils/url.py +1325 -0
  280. deltacat-2.0.0.dist-info/METADATA +1163 -0
  281. deltacat-2.0.0.dist-info/RECORD +439 -0
  282. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  283. deltacat/catalog/iceberg/__init__.py +0 -4
  284. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  285. deltacat/compute/merge_on_read/__init__.py +0 -4
  286. deltacat/compute/merge_on_read/daft.py +0 -40
  287. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  288. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  289. deltacat/examples/common/fixtures.py +0 -15
  290. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  291. deltacat/storage/rivulet/__init__.py +0 -11
  292. deltacat/storage/rivulet/feather/__init__.py +0 -5
  293. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  294. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  295. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  296. deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
  297. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  298. deltacat/utils/s3fs.py +0 -21
  299. deltacat-2.0.dist-info/METADATA +0 -65
  300. deltacat-2.0.dist-info/RECORD +0 -347
  301. /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
  302. /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
  303. /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
  304. /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
  305. /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
  306. /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
  307. /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
  308. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
  309. /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
  310. /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  311. /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
  312. /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
  313. /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  314. /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
  315. /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
  316. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
  317. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  318. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  319. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  320. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  321. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  322. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  323. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  324. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -1,29 +1,29 @@
1
- import ray
2
- from moto import mock_s3
3
- import pytest
4
- import os
5
1
  import logging
6
- import boto3
7
- from typing import Any, Callable, Dict, List, Optional, Set, Tuple
8
- from boto3.resources.base import ServiceResource
2
+ from typing import Any, Dict, List, Optional, Set, Tuple, Callable
3
+ import uuid
4
+ import pytest
5
+
9
6
  import pyarrow as pa
7
+ import ray
8
+
10
9
  from pytest_benchmark.fixture import BenchmarkFixture
11
10
  from deltacat.types.media import StorageType
12
11
 
13
12
  from deltacat.tests.compute.test_util_common import (
14
- get_rcf,
13
+ get_rci_from_partition,
14
+ read_audit_file,
15
+ PartitionKeyType,
15
16
  )
16
- from deltacat.compute.compactor.model.compactor_version import CompactorVersion
17
- from deltacat.tests.test_utils.utils import read_s3_contents
18
- from deltacat.tests.compute.test_util_create_table_deltas_repo import (
19
- create_src_w_deltas_destination_plus_destination,
20
- add_late_deltas_to_partition,
17
+ from deltacat.tests.compute.test_util_common import (
18
+ add_late_deltas_to_partition_main,
19
+ create_src_w_deltas_destination_plus_destination_main,
21
20
  )
21
+ from deltacat.compute.compactor.model.compactor_version import CompactorVersion
22
+
22
23
  from deltacat.tests.compute.compact_partition_test_cases import (
23
24
  INCREMENTAL_TEST_CASES,
24
25
  )
25
26
  from deltacat.tests.compute.test_util_constant import (
26
- TEST_S3_RCF_BUCKET_NAME,
27
27
  DEFAULT_NUM_WORKERS,
28
28
  DEFAULT_WORKER_INSTANCE_CPUS,
29
29
  )
@@ -37,6 +37,7 @@ from deltacat.storage import (
37
37
  DeltaLocator,
38
38
  Partition,
39
39
  PartitionLocator,
40
+ metastore,
40
41
  )
41
42
  from deltacat.types.media import ContentType
42
43
  from deltacat.compute.compactor.model.compaction_session_audit_info import (
@@ -65,34 +66,29 @@ def setup_ray_cluster():
65
66
  ray.shutdown()
66
67
 
67
68
 
68
- @pytest.fixture(autouse=True, scope="module")
69
- def mock_aws_credential():
70
- os.environ["AWS_ACCESS_KEY_ID"] = "testing"
71
- os.environ["AWS_SECRET_ACCESS_ID"] = "testing"
72
- os.environ["AWS_SECURITY_TOKEN"] = "testing"
73
- os.environ["AWS_SESSION_TOKEN"] = "testing"
74
- os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
75
- yield
76
-
69
+ """
70
+ FUNCTION scoped fixtures
71
+ """
77
72
 
78
- @pytest.fixture(scope="module")
79
- def s3_resource():
80
- with mock_s3():
81
- yield boto3.resource("s3")
82
73
 
74
+ @pytest.fixture(autouse=True, scope="function")
75
+ def enable_bucketing_spec_validation(monkeypatch):
76
+ """
77
+ Enable the bucketing spec validation for all tests.
78
+ This will help catch hash bucket drift in testing.
79
+ """
80
+ import deltacat.compute.compactor_v2.steps.merge
83
81
 
84
- @pytest.fixture(autouse=True, scope="module")
85
- def setup_compaction_artifacts_s3_bucket(s3_resource: ServiceResource):
86
- s3_resource.create_bucket(
87
- ACL="authenticated-read",
88
- Bucket=TEST_S3_RCF_BUCKET_NAME,
82
+ monkeypatch.setattr(
83
+ deltacat.compute.compactor_v2.steps.merge,
84
+ "BUCKETING_SPEC_COMPLIANCE_PROFILE",
85
+ "ASSERT",
89
86
  )
90
- yield
91
87
 
92
88
 
93
- """
94
- FUNCTION scoped fixtures
95
- """
89
+ @pytest.fixture(scope="function")
90
+ def temp_dir(tmp_path):
91
+ return str(tmp_path)
96
92
 
97
93
 
98
94
  @pytest.mark.parametrize(
@@ -168,9 +164,8 @@ FUNCTION scoped fixtures
168
164
  ],
169
165
  ids=[test_name for test_name in INCREMENTAL_TEST_CASES],
170
166
  )
171
- def test_compact_partition_incremental(
172
- s3_resource: ServiceResource,
173
- local_deltacat_storage_kwargs: Dict[str, Any],
167
+ def test_compact_partition_incremental_main(
168
+ main_deltacat_storage_kwargs: Dict[str, Any],
174
169
  test_name: str,
175
170
  primary_keys: Set[str],
176
171
  sort_keys: Dict[str, str],
@@ -194,9 +189,16 @@ def test_compact_partition_incremental(
194
189
  compact_partition_func: Callable,
195
190
  benchmark: BenchmarkFixture,
196
191
  ):
197
- import deltacat.tests.local_deltacat_storage as ds
192
+ # Skip in-place compaction tests for main storage as it's not yet implemented
193
+ if is_inplace:
194
+ pytest.skip(
195
+ "In-place compaction not yet implemented in main storage (delta prepending limitation)"
196
+ )
197
+
198
+ ds_mock_kwargs: Dict[str, Any] = main_deltacat_storage_kwargs
198
199
 
199
- ds_mock_kwargs: Dict[str, Any] = local_deltacat_storage_kwargs
200
+ # Extract catalog from storage kwargs
201
+ catalog = ds_mock_kwargs.get("inner")
200
202
 
201
203
  # setup
202
204
  partition_keys = partition_keys_param
@@ -207,7 +209,7 @@ def test_compact_partition_incremental(
207
209
  source_table_namespace,
208
210
  source_table_name,
209
211
  source_table_version,
210
- ) = create_src_w_deltas_destination_plus_destination(
212
+ ) = create_src_w_deltas_destination_plus_destination_main(
211
213
  sort_keys,
212
214
  partition_keys,
213
215
  input_deltas,
@@ -216,15 +218,38 @@ def test_compact_partition_incremental(
216
218
  ds_mock_kwargs,
217
219
  is_inplace,
218
220
  )
219
- source_partition: Partition = ds.get_partition(
221
+
222
+ # Convert partition values to correct types for get_partition call
223
+ converted_partition_values = []
224
+ if partition_values_param and partition_keys:
225
+ # partition_values_param is a single string, but we need to handle it as a list
226
+ partition_values_list = (
227
+ [partition_values_param]
228
+ if isinstance(partition_values_param, str)
229
+ else partition_values_param
230
+ )
231
+ for i, (value, pk) in enumerate(zip(partition_values_list, partition_keys)):
232
+ if pk.key_type == PartitionKeyType.INT:
233
+ converted_partition_values.append(int(value))
234
+ else:
235
+ converted_partition_values.append(value)
236
+ else:
237
+ converted_partition_values = (
238
+ [partition_values_param] if partition_values_param else []
239
+ )
240
+
241
+ source_partition: Partition = metastore.get_partition(
220
242
  source_table_stream.locator,
221
- partition_values_param,
243
+ converted_partition_values,
244
+ partition_scheme_id="default_partition_scheme" if partition_keys else None,
222
245
  **ds_mock_kwargs,
223
246
  )
247
+ # Generate a destination partition ID based on the source partition
248
+ destination_partition_id = str(uuid.uuid4())
224
249
  destination_partition_locator: PartitionLocator = PartitionLocator.of(
225
250
  destination_table_stream.locator,
226
- partition_values_param,
227
- None,
251
+ converted_partition_values,
252
+ destination_partition_id,
228
253
  )
229
254
  num_workers, worker_instance_cpu = DEFAULT_NUM_WORKERS, DEFAULT_WORKER_INSTANCE_CPUS
230
255
  total_cpus: int = num_workers * worker_instance_cpu
@@ -235,12 +260,18 @@ def test_compact_partition_incremental(
235
260
  if create_placement_group_param
236
261
  else None
237
262
  )
263
+ all_column_names = metastore.get_table_version_column_names(
264
+ destination_table_stream.locator.table_locator.namespace,
265
+ destination_table_stream.locator.table_locator.table_name,
266
+ destination_table_stream.locator.table_version_locator.table_version,
267
+ catalog=catalog,
268
+ )
238
269
  compact_partition_params = CompactPartitionParams.of(
239
270
  {
240
- "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
271
+ "catalog": catalog,
241
272
  "compacted_file_content_type": ContentType.PARQUET,
242
273
  "dd_max_parallelism_ratio": 1.0,
243
- "deltacat_storage": ds,
274
+ "deltacat_storage": metastore,
244
275
  "deltacat_storage_kwargs": ds_mock_kwargs,
245
276
  "destination_partition_locator": destination_partition_locator,
246
277
  "drop_duplicates": drop_duplicates_param,
@@ -249,11 +280,11 @@ def test_compact_partition_incremental(
249
280
  "list_deltas_kwargs": {**ds_mock_kwargs, **{"equivalent_table_types": []}},
250
281
  "pg_config": pgm,
251
282
  "primary_keys": primary_keys,
283
+ "all_column_names": all_column_names,
252
284
  "read_kwargs_provider": read_kwargs_provider_param,
253
285
  "rebase_source_partition_locator": None,
254
286
  "rebase_source_partition_high_watermark": None,
255
287
  "records_per_compacted_file": records_per_compacted_file_param,
256
- "s3_client_kwargs": {},
257
288
  "source_partition_locator": source_partition.locator,
258
289
  "sort_keys": sort_keys if sort_keys else None,
259
290
  }
@@ -264,18 +295,17 @@ def test_compact_partition_incremental(
264
295
  """
265
296
  This callable runs right before invoking the benchmark target function (compaction).
266
297
  This is needed as the benchmark module will invoke the target function multiple times
267
- in a single test run, which can lead to non-idempotent behavior if RCFs are generated.
298
+ in a single test run, which can lead to non-idempotent behavior if RCIs are generated.
268
299
 
269
300
  Returns: args, kwargs
270
301
  """
271
- s3_resource.Bucket(TEST_S3_RCF_BUCKET_NAME).objects.all().delete()
272
302
  return (compact_partition_params,), {}
273
303
 
274
304
  if add_late_deltas:
275
305
  # NOTE: In the case of in-place compaction it is plausible that new deltas may be added to the source partition during compaction
276
306
  # (so that the source_partition.stream_position > last_stream_position_to_compact).
277
307
  # This parameter helps simulate the case to check that no late deltas are dropped even when the compacted partition is created.
278
- latest_delta, _ = add_late_deltas_to_partition(
308
+ latest_delta, _ = add_late_deltas_to_partition_main(
279
309
  add_late_deltas, source_partition, ds_mock_kwargs
280
310
  )
281
311
  if expected_terminal_exception:
@@ -283,27 +313,28 @@ def test_compact_partition_incremental(
283
313
  compact_partition_func(compact_partition_params)
284
314
  assert expected_terminal_exception_message in str(exc_info.value)
285
315
  return
286
- rcf_file_s3_uri = benchmark.pedantic(
287
- compact_partition_func, setup=_incremental_compaction_setup
288
- )
316
+ benchmark.pedantic(compact_partition_func, setup=_incremental_compaction_setup)
289
317
 
290
- # validate
291
- round_completion_info: RoundCompletionInfo = get_rcf(s3_resource, rcf_file_s3_uri)
318
+ # validate - get RoundCompletionInfo from the compacted partition
319
+ round_completion_info: RoundCompletionInfo = get_rci_from_partition(
320
+ destination_partition_locator, metastore, catalog=catalog
321
+ )
292
322
  compacted_delta_locator: DeltaLocator = (
293
323
  round_completion_info.compacted_delta_locator
294
324
  )
295
- audit_bucket, audit_key = RoundCompletionInfo.get_audit_bucket_name_and_key(
296
- round_completion_info.compaction_audit_url
297
- )
298
325
 
299
- compaction_audit_obj: Dict[str, Any] = read_s3_contents(
300
- s3_resource, audit_bucket, audit_key
326
+ # Get catalog root for audit file resolution
327
+ catalog_root = catalog.root
328
+
329
+ compaction_audit_obj: Dict[str, Any] = read_audit_file(
330
+ round_completion_info.compaction_audit_url, catalog_root
301
331
  )
332
+
302
333
  compaction_audit: CompactionSessionAuditInfo = CompactionSessionAuditInfo(
303
334
  **compaction_audit_obj
304
335
  )
305
336
 
306
- # assert if RCF covers all files
337
+ # assert if RCI covers all files
307
338
  if compactor_version != CompactorVersion.V1.value:
308
339
  previous_end = None
309
340
  for start, end in round_completion_info.hb_index_to_entry_range.values():
@@ -313,7 +344,7 @@ def test_compact_partition_incremental(
313
344
  previous_end == round_completion_info.compacted_pyarrow_write_result.files
314
345
  )
315
346
 
316
- tables = ds.download_delta(
347
+ tables = metastore.download_delta(
317
348
  compacted_delta_locator, storage_type=StorageType.LOCAL, **ds_mock_kwargs
318
349
  )
319
350
  actual_compacted_table = pa.concat_tables(tables)
@@ -347,25 +378,27 @@ def test_compact_partition_incremental(
347
378
  == destination_partition_locator.partition_values
348
379
  and source_partition.locator.stream_id
349
380
  == destination_partition_locator.stream_id
350
- ), f"The source partition: {source_partition.locator.canonical_string} should match the destination partition: {destination_partition_locator.canonical_string}"
381
+ ), f"The source partition: {source_partition.locator} should match the destination partition: {destination_partition_locator}"
351
382
  assert (
352
383
  compacted_delta_locator.stream_id == source_partition.locator.stream_id
353
384
  ), "The compacted delta should be in the same stream as the source"
354
- source_partition: Partition = ds.get_partition(
385
+ source_partition: Partition = metastore.get_partition(
355
386
  source_table_stream.locator,
356
- partition_values_param,
387
+ converted_partition_values,
388
+ partition_scheme_id="default_partition_scheme" if partition_keys else None,
357
389
  **ds_mock_kwargs,
358
390
  )
359
- compacted_partition: Optional[Partition] = ds.get_partition(
391
+ compacted_partition: Optional[Partition] = metastore.get_partition(
360
392
  compacted_delta_locator.stream_locator,
361
- partition_values_param,
393
+ converted_partition_values,
394
+ partition_scheme_id="default_partition_scheme" if partition_keys else None,
362
395
  **ds_mock_kwargs,
363
396
  )
364
397
  assert (
365
398
  compacted_partition.state == source_partition.state == CommitState.COMMITTED
366
399
  ), f"The compacted/source table partition should be in {CommitState.COMMITTED} state and not {CommitState.DEPRECATED}"
367
400
  if add_late_deltas:
368
- compacted_partition_deltas: List[Delta] = ds.list_partition_deltas(
401
+ compacted_partition_deltas: List[Delta] = metastore.list_partition_deltas(
369
402
  partition_like=compacted_partition,
370
403
  ascending_order=False,
371
404
  **ds_mock_kwargs,
@@ -1,43 +1,38 @@
1
- import ray
2
- import os
3
- from moto import mock_s3
1
+ import tempfile
2
+ from typing import Any, Dict, List, Optional, Set, Callable
4
3
  import pytest
5
- import boto3
6
- from boto3.resources.base import ServiceResource
7
4
  import pyarrow as pa
5
+ import ray
6
+
8
7
  from deltacat.io.file_object_store import FileObjectStore
9
8
  from pytest_benchmark.fixture import BenchmarkFixture
10
- import tempfile
11
9
 
12
10
  from deltacat.tests.compute.test_util_constant import (
13
- TEST_S3_RCF_BUCKET_NAME,
14
11
  DEFAULT_NUM_WORKERS,
15
12
  DEFAULT_WORKER_INSTANCE_CPUS,
16
13
  )
17
14
  from deltacat.tests.compute.test_util_common import (
18
- get_rcf,
15
+ get_rci_from_partition,
16
+ read_audit_file,
17
+ PartitionKey,
18
+ get_compacted_delta_locator_from_partition,
19
19
  )
20
- from deltacat.tests.test_utils.utils import read_s3_contents
21
- from deltacat.compute.compactor.model.compactor_version import CompactorVersion
22
20
  from deltacat.tests.compute.test_util_common import (
23
- get_compacted_delta_locator_from_rcf,
21
+ multiple_rounds_create_src_w_deltas_destination_rebase_w_deltas_strategy_main,
24
22
  )
23
+
24
+ from deltacat.compute.compactor.model.compactor_version import CompactorVersion
25
25
  from deltacat.compute.compactor.model.compaction_session_audit_info import (
26
26
  CompactionSessionAuditInfo,
27
27
  )
28
- from deltacat.tests.compute.test_util_create_table_deltas_repo import (
29
- multiple_rounds_create_src_w_deltas_destination_rebase_w_deltas_strategy,
30
- )
31
28
  from deltacat.tests.compute.compact_partition_multiple_rounds_test_cases import (
32
29
  MULTIPLE_ROUNDS_TEST_CASES,
33
30
  )
34
- from typing import Any, Callable, Dict, List, Optional, Set
35
- from deltacat.types.media import StorageType
31
+ from deltacat.types.media import StorageType, ContentType
36
32
  from deltacat.storage import (
37
33
  DeltaLocator,
38
34
  Partition,
39
35
  )
40
- from deltacat.types.media import ContentType
41
36
  from deltacat.compute.compactor.model.compact_partition_params import (
42
37
  CompactPartitionParams,
43
38
  )
@@ -47,6 +42,8 @@ from deltacat.compute.compactor import (
47
42
  from deltacat.utils.placement import (
48
43
  PlacementGroupManager,
49
44
  )
45
+ from deltacat.storage import metastore
46
+
50
47
 
51
48
  """
52
49
  MODULE scoped fixtures
@@ -60,29 +57,24 @@ def setup_ray_cluster():
60
57
  ray.shutdown()
61
58
 
62
59
 
63
- @pytest.fixture(autouse=True, scope="module")
64
- def mock_aws_credential():
65
- os.environ["AWS_ACCESS_KEY_ID"] = "testing"
66
- os.environ["AWS_SECRET_ACCESS_ID"] = "testing"
67
- os.environ["AWS_SECURITY_TOKEN"] = "testing"
68
- os.environ["AWS_SESSION_TOKEN"] = "testing"
69
- os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
70
- yield
71
-
60
+ """
61
+ FUNCTION scoped fixtures
62
+ """
72
63
 
73
- @pytest.fixture(scope="module")
74
- def s3_resource(mock_aws_credential):
75
- with mock_s3():
76
- yield boto3.resource("s3")
77
64
 
65
+ @pytest.fixture(autouse=True, scope="function")
66
+ def enable_bucketing_spec_validation(monkeypatch):
67
+ """
68
+ Enable the bucketing spec validation for all tests.
69
+ This will help catch hash bucket drift in testing.
70
+ """
71
+ import deltacat.compute.compactor_v2.steps.merge
78
72
 
79
- @pytest.fixture(autouse=True, scope="module")
80
- def setup_compaction_artifacts_s3_bucket(s3_resource: ServiceResource):
81
- s3_resource.create_bucket(
82
- ACL="authenticated-read",
83
- Bucket=TEST_S3_RCF_BUCKET_NAME,
73
+ monkeypatch.setattr(
74
+ deltacat.compute.compactor_v2.steps.merge,
75
+ "BUCKETING_SPEC_COMPLIANCE_PROFILE",
76
+ "ASSERT",
84
77
  )
85
- yield
86
78
 
87
79
 
88
80
  @pytest.mark.parametrize(
@@ -155,14 +147,13 @@ def setup_compaction_artifacts_s3_bucket(s3_resource: ServiceResource):
155
147
  ],
156
148
  ids=[test_name for test_name in MULTIPLE_ROUNDS_TEST_CASES],
157
149
  )
158
- def test_compact_partition_rebase_multiple_rounds_same_source_and_destination(
150
+ def test_compact_partition_rebase_multiple_rounds_same_source_and_destination_main(
159
151
  mocker,
160
- s3_resource: ServiceResource,
161
- local_deltacat_storage_kwargs: Dict[str, Any],
152
+ main_deltacat_storage_kwargs: Dict[str, Any],
162
153
  test_name: str,
163
154
  primary_keys: Set[str],
164
155
  sort_keys: List[Optional[Any]],
165
- partition_keys_param: Optional[List[Any]],
156
+ partition_keys_param: Optional[List[PartitionKey]],
166
157
  partition_values_param: List[Optional[str]],
167
158
  input_deltas_param: List[pa.Array],
168
159
  expected_terminal_compact_partition_result: pa.Table,
@@ -181,37 +172,63 @@ def test_compact_partition_rebase_multiple_rounds_same_source_and_destination(
181
172
  num_rounds_param: int,
182
173
  benchmark: BenchmarkFixture,
183
174
  ):
184
- import deltacat.tests.local_deltacat_storage as ds
185
-
186
- ds_mock_kwargs = local_deltacat_storage_kwargs
175
+ ds_mock_kwargs = main_deltacat_storage_kwargs
187
176
  """
188
177
  This test tests different multi-round compaction rebase configurations,
189
- as specified in compact_partition_multiple_rounds_test_cases.py
178
+ as specified in compact_partition_multiple_rounds_test_cases.py.
190
179
  These tests do not test multi-round compaction backfill, which is
191
180
  currently unsupported.
181
+
182
+ This version uses the main metastore implementation instead of local storage.
192
183
  """
193
184
  (
194
185
  source_table_stream,
195
186
  _,
196
187
  rebased_table_stream,
197
188
  _,
198
- ) = multiple_rounds_create_src_w_deltas_destination_rebase_w_deltas_strategy(
189
+ ) = multiple_rounds_create_src_w_deltas_destination_rebase_w_deltas_strategy_main(
199
190
  sort_keys,
200
191
  partition_keys_param,
201
192
  input_deltas_param,
202
193
  partition_values_param,
203
194
  ds_mock_kwargs,
204
195
  )
205
- source_partition: Partition = ds.get_partition(
206
- source_table_stream.locator,
207
- partition_values_param,
196
+ # Convert partition values for partition lookup (same as in the helper function)
197
+ converted_partition_values_for_lookup = partition_values_param
198
+ if partition_values_param and partition_keys_param:
199
+ converted_partition_values_for_lookup = []
200
+ for i, (value, key) in enumerate(
201
+ zip(partition_values_param, partition_keys_param)
202
+ ):
203
+ if key.key_type == "int":
204
+ converted_partition_values_for_lookup.append(int(value))
205
+ elif key.key_type == "string":
206
+ converted_partition_values_for_lookup.append(str(value))
207
+ elif key.key_type == "timestamp":
208
+ converted_partition_values_for_lookup.append(
209
+ value
210
+ ) # Keep as is for now
211
+ else:
212
+ converted_partition_values_for_lookup.append(value)
213
+
214
+ source_partition: Partition = metastore.get_partition(
215
+ stream_locator=source_table_stream.locator,
216
+ partition_values=converted_partition_values_for_lookup,
217
+ partition_scheme_id=source_table_stream.partition_scheme.id,
208
218
  **ds_mock_kwargs,
209
219
  )
210
- rebased_partition: Partition = ds.get_partition(
211
- rebased_table_stream.locator,
212
- partition_values_param,
220
+ rebased_partition: Partition = metastore.get_partition(
221
+ stream_locator=rebased_table_stream.locator,
222
+ partition_values=converted_partition_values_for_lookup,
223
+ partition_scheme_id=rebased_table_stream.partition_scheme.id,
213
224
  **ds_mock_kwargs,
214
225
  )
226
+ all_column_names = metastore.get_table_version_column_names(
227
+ rebased_table_stream.locator.table_locator.namespace,
228
+ rebased_table_stream.locator.table_locator.table_name,
229
+ rebased_table_stream.locator.table_version_locator.table_version,
230
+ catalog=ds_mock_kwargs.get("inner"),
231
+ )
215
232
  total_cpus = DEFAULT_NUM_WORKERS * DEFAULT_WORKER_INSTANCE_CPUS
216
233
  pgm = None
217
234
  if create_placement_group_param:
@@ -221,10 +238,10 @@ def test_compact_partition_rebase_multiple_rounds_same_source_and_destination(
221
238
  with tempfile.TemporaryDirectory() as test_dir:
222
239
  compact_partition_params = CompactPartitionParams.of(
223
240
  {
224
- "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
241
+ "catalog": ds_mock_kwargs.get("inner"),
225
242
  "compacted_file_content_type": ContentType.PARQUET,
226
243
  "dd_max_parallelism_ratio": 1.0,
227
- "deltacat_storage": ds,
244
+ "deltacat_storage": metastore,
228
245
  "deltacat_storage_kwargs": ds_mock_kwargs,
229
246
  "destination_partition_locator": rebased_partition.locator,
230
247
  "hash_bucket_count": hash_bucket_count_param,
@@ -236,11 +253,11 @@ def test_compact_partition_rebase_multiple_rounds_same_source_and_destination(
236
253
  "object_store": FileObjectStore(test_dir),
237
254
  "pg_config": pgm,
238
255
  "primary_keys": primary_keys,
256
+ "all_column_names": all_column_names,
239
257
  "read_kwargs_provider": read_kwargs_provider_param,
240
258
  "rebase_source_partition_locator": source_partition.locator,
241
259
  "rebase_source_partition_high_watermark": rebased_partition.stream_position,
242
260
  "records_per_compacted_file": records_per_compacted_file_param,
243
- "s3_client_kwargs": {},
244
261
  "source_partition_locator": rebased_partition.locator,
245
262
  "sort_keys": sort_keys if sort_keys else None,
246
263
  "num_rounds": num_rounds_param,
@@ -263,23 +280,25 @@ def test_compact_partition_rebase_multiple_rounds_same_source_and_destination(
263
280
  object_store_clear_spy = mocker.spy(FileObjectStore, "clear")
264
281
 
265
282
  # execute
266
- rcf_file_s3_uri = benchmark(compact_partition_func, compact_partition_params)
283
+ benchmark(compact_partition_func, compact_partition_params)
267
284
 
268
- round_completion_info: RoundCompletionInfo = get_rcf(
269
- s3_resource, rcf_file_s3_uri
270
- )
271
- audit_bucket, audit_key = RoundCompletionInfo.get_audit_bucket_name_and_key(
272
- round_completion_info.compaction_audit_url
285
+ # Get RoundCompletionInfo from the compacted partition
286
+ round_completion_info: RoundCompletionInfo = get_rci_from_partition(
287
+ rebased_partition.locator, metastore, catalog=ds_mock_kwargs.get("inner")
273
288
  )
274
289
 
275
- compaction_audit_obj: Dict[str, Any] = read_s3_contents(
276
- s3_resource, audit_bucket, audit_key
290
+ # Get catalog root for audit file resolution
291
+ catalog = ds_mock_kwargs.get("inner")
292
+ catalog_root = catalog.root
293
+
294
+ compaction_audit_obj: Dict[str, Any] = read_audit_file(
295
+ round_completion_info.compaction_audit_url, catalog_root
277
296
  )
278
297
  compaction_audit: CompactionSessionAuditInfo = CompactionSessionAuditInfo(
279
298
  **compaction_audit_obj
280
299
  )
281
300
 
282
- # assert if RCF covers all files
301
+ # assert if RCI covers all files
283
302
  # multiple rounds feature is only supported in V2 compactor
284
303
  previous_end = None
285
304
  for start, end in round_completion_info.hb_index_to_entry_range.values():
@@ -293,10 +312,14 @@ def test_compact_partition_rebase_multiple_rounds_same_source_and_destination(
293
312
  assert (
294
313
  execute_compaction_result_spy.call_args.args[-1] is False
295
314
  ), "Table version erroneously marked as in-place compacted!"
296
- compacted_delta_locator: DeltaLocator = get_compacted_delta_locator_from_rcf(
297
- s3_resource, rcf_file_s3_uri
315
+ compacted_delta_locator: DeltaLocator = (
316
+ get_compacted_delta_locator_from_partition(
317
+ rebased_partition.locator,
318
+ metastore,
319
+ catalog=ds_mock_kwargs.get("inner"),
320
+ )
298
321
  )
299
- tables = ds.download_delta(
322
+ tables = metastore.download_delta(
300
323
  compacted_delta_locator, storage_type=StorageType.LOCAL, **ds_mock_kwargs
301
324
  )
302
325
  actual_rebase_compacted_table = pa.concat_tables(tables)
@@ -1,6 +1,7 @@
1
1
  import json
2
-
2
+ import tempfile
3
3
  import unittest
4
+ import uuid
4
5
 
5
6
 
6
7
  class TestCompactPartitionParams(unittest.TestCase):
@@ -8,9 +9,14 @@ class TestCompactPartitionParams(unittest.TestCase):
8
9
  def setUpClass(cls):
9
10
  from deltacat.types.media import ContentType
10
11
  from deltacat.utils.metrics import MetricsConfig, MetricsTarget
12
+ from deltacat.catalog import CatalogProperties
13
+
14
+ # Create a temporary catalog for testing
15
+ tmpdir = tempfile.mkdtemp()
16
+ cls.test_catalog = CatalogProperties(root=tmpdir)
11
17
 
12
18
  cls.VALID_COMPACT_PARTITION_PARAMS = {
13
- "compaction_artifact_s3_bucket": "foobar",
19
+ "catalog": cls.test_catalog,
14
20
  "compacted_file_content_type": ContentType.PARQUET,
15
21
  "deltacat_storage": "foobar",
16
22
  "destination_partition_locator": {
@@ -26,12 +32,13 @@ class TestCompactPartitionParams(unittest.TestCase):
26
32
  "format": "fooType",
27
33
  },
28
34
  "partitionValues": [],
29
- "partitionId": None,
35
+ "partitionId": str(uuid.uuid4()),
30
36
  },
31
37
  "hash_bucket_count": 200,
32
38
  "last_stream_position_to_compact": 168000000000,
33
39
  "list_deltas_kwargs": {"equivalent_table_types": []},
34
40
  "primary_keys": {"id"},
41
+ "all_column_names": ["id", "foo", "bar", "baz"],
35
42
  "properties": {
36
43
  "parent_stream_position": "1688000000000",
37
44
  },
@@ -52,7 +59,7 @@ class TestCompactPartitionParams(unittest.TestCase):
52
59
  "partitionValues": [],
53
60
  "partitionId": "79612ea39ac5493eae925abe60767d42",
54
61
  },
55
- "s3_table_writer_kwargs": {
62
+ "table_writer_kwargs": {
56
63
  "version": "1.0",
57
64
  "flavor": "foobar",
58
65
  "coerce_timestamps": "ms",
@@ -103,10 +110,8 @@ class TestCompactPartitionParams(unittest.TestCase):
103
110
  json.loads(serialized_params)["compacted_file_content_type"]
104
111
  == params.compacted_file_content_type
105
112
  )
106
- assert (
107
- json.loads(serialized_params)["compaction_artifact_s3_bucket"]
108
- == params.compaction_artifact_s3_bucket
109
- )
113
+ catalog_json = json.loads(serialized_params)["catalog"]
114
+ assert catalog_json["_root"] == params.catalog.root
110
115
  assert (
111
116
  json.loads(serialized_params)["hash_bucket_count"]
112
117
  == params.hash_bucket_count