deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367)
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -5,7 +5,8 @@ import logging
5
5
  import ray
6
6
  import time
7
7
  import json
8
- from deltacat.aws import s3u as s3_utils
8
+ import posixpath
9
+ from deltacat.utils.filesystem import resolve_path_and_filesystem
9
10
  import deltacat
10
11
  from deltacat import logs
11
12
  import pyarrow as pa
@@ -25,7 +26,7 @@ from deltacat.storage import (
25
26
  DeltaLocator,
26
27
  Partition,
27
28
  PartitionLocator,
28
- interface as unimplemented_deltacat_storage,
29
+ metastore,
29
30
  )
30
31
  from deltacat.compute.compactor.model.compact_partition_params import (
31
32
  CompactPartitionParams,
@@ -40,7 +41,7 @@ from deltacat.compute.compactor.steps import dedupe as dd
40
41
  from deltacat.compute.compactor.steps import hash_bucket as hb
41
42
  from deltacat.compute.compactor.steps import materialize as mat
42
43
  from deltacat.compute.compactor.utils import io
43
- from deltacat.compute.compactor.utils import round_completion_file as rcf
44
+ from deltacat.compute.compactor.utils import round_completion_reader as rci
44
45
 
45
46
  from deltacat.types.media import ContentType
46
47
  from deltacat.utils.placement import PlacementGroupConfig
@@ -65,13 +66,37 @@ DEFAULT_DEDUPE_MAX_PARALLELISM_RATIO_ARG: int = 1
65
66
  DEFAULT_PROPERTIES_ARG: Dict[str, Any] = {}
66
67
 
67
68
 
69
+ def _upload_audit_data(url: str, content: str, **kwargs) -> None:
70
+ """
71
+ Upload audit data to the specified URL using filesystem-agnostic operations.
72
+ """
73
+ try:
74
+ path, filesystem = resolve_path_and_filesystem(url)
75
+
76
+ # Create parent directories if they don't exist
77
+ parent_dir = posixpath.dirname(path)
78
+ if parent_dir:
79
+ try:
80
+ filesystem.create_dir(parent_dir, recursive=True)
81
+ except Exception as dir_error:
82
+ # Directory might already exist, which is fine
83
+ logger.debug(
84
+ f"Directory creation warning for {parent_dir}: {dir_error}"
85
+ )
86
+
87
+ with filesystem.open_output_stream(path) as stream:
88
+ stream.write(content.encode("utf-8"))
89
+ except Exception as e:
90
+ logger.warning(f"Failed to upload audit data to {url}: {e}")
91
+
92
+
68
93
  def check_preconditions(
69
94
  source_partition_locator: PartitionLocator,
70
95
  destination_partition_locator: PartitionLocator,
71
96
  sort_keys: List[SortKey],
72
97
  max_records_per_output_file: int,
73
98
  new_hash_bucket_count: Optional[int],
74
- deltacat_storage=unimplemented_deltacat_storage,
99
+ deltacat_storage=metastore,
75
100
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
76
101
  **kwargs,
77
102
  ) -> int:
@@ -104,7 +129,7 @@ def compact_partition(
104
129
  source_partition_locator: PartitionLocator,
105
130
  destination_partition_locator: PartitionLocator,
106
131
  primary_keys: Set[str],
107
- compaction_artifact_s3_bucket: str,
132
+ compaction_artifact_path: str,
108
133
  last_stream_position_to_compact: int,
109
134
  *,
110
135
  hash_bucket_count: Optional[int] = None,
@@ -123,37 +148,29 @@ def compact_partition(
123
148
  metrics_config: Optional[MetricsConfig] = None,
124
149
  list_deltas_kwargs: Optional[Dict[str, Any]] = None,
125
150
  read_kwargs_provider: Optional[ReadKwargsProvider] = None,
126
- s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
151
+ table_writer_kwargs: Optional[Dict[str, Any]] = None,
127
152
  object_store: Optional[IObjectStore] = RayPlasmaObjectStore(),
128
- s3_client_kwargs: Optional[Dict[str, Any]] = None,
129
- deltacat_storage=unimplemented_deltacat_storage,
153
+ deltacat_storage=metastore,
130
154
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
131
155
  **kwargs,
132
- ) -> Optional[str]:
156
+ ) -> None:
133
157
  if deltacat_storage_kwargs is None:
134
158
  deltacat_storage_kwargs = {}
135
159
  if not importlib.util.find_spec("memray"):
136
160
  logger.info(f"memray profiler not available, disabling all profiling")
137
161
  enable_profiler = False
138
162
 
139
- if s3_client_kwargs is None:
140
- s3_client_kwargs = {}
141
-
142
163
  # memray official documentation link:
143
164
  # https://bloomberg.github.io/memray/getting_started.html
144
165
  with memray.Tracker(
145
166
  f"compaction_partition.bin"
146
167
  ) if enable_profiler else nullcontext():
147
168
  partition = None
148
- (
149
- new_partition,
150
- new_rci,
151
- new_rcf_partition_locator,
152
- ) = _execute_compaction_round(
169
+ (new_partition, new_rci,) = _execute_compaction_round(
153
170
  source_partition_locator,
154
171
  destination_partition_locator,
155
172
  primary_keys,
156
- compaction_artifact_s3_bucket,
173
+ compaction_artifact_path,
157
174
  last_stream_position_to_compact,
158
175
  hash_bucket_count,
159
176
  sort_keys,
@@ -169,9 +186,8 @@ def compact_partition(
169
186
  metrics_config,
170
187
  list_deltas_kwargs,
171
188
  read_kwargs_provider,
172
- s3_table_writer_kwargs,
189
+ table_writer_kwargs,
173
190
  object_store,
174
- s3_client_kwargs,
175
191
  deltacat_storage,
176
192
  deltacat_storage_kwargs,
177
193
  **kwargs,
@@ -182,30 +198,23 @@ def compact_partition(
182
198
  logger.info(
183
199
  f"Partition-{source_partition_locator.partition_values}-> Compaction session data processing completed"
184
200
  )
185
- round_completion_file_s3_url = None
186
201
  if partition:
187
202
  logger.info(f"Committing compacted partition to: {partition.locator}")
203
+ # Set the round completion info on the partition before committing
204
+ partition.compaction_round_completion_info = new_rci
188
205
  partition = deltacat_storage.commit_partition(
189
- partition, **deltacat_storage_kwargs
206
+ partition,
207
+ **deltacat_storage_kwargs,
190
208
  )
191
209
  logger.info(f"Committed compacted partition: {partition}")
192
-
193
- round_completion_file_s3_url = rcf.write_round_completion_file(
194
- compaction_artifact_s3_bucket,
195
- new_rcf_partition_locator,
196
- partition.locator,
197
- new_rci,
198
- **s3_client_kwargs,
199
- )
200
210
  logger.info(f"Completed compaction session for: {source_partition_locator}")
201
- return round_completion_file_s3_url
202
211
 
203
212
 
204
213
  def _execute_compaction_round(
205
214
  source_partition_locator: PartitionLocator,
206
215
  destination_partition_locator: PartitionLocator,
207
216
  primary_keys: Set[str],
208
- compaction_artifact_s3_bucket: str,
217
+ compaction_artifact_path: str,
209
218
  last_stream_position_to_compact: int,
210
219
  hash_bucket_count: Optional[int],
211
220
  sort_keys: List[SortKey],
@@ -221,24 +230,25 @@ def _execute_compaction_round(
221
230
  metrics_config: Optional[MetricsConfig],
222
231
  list_deltas_kwargs: Optional[Dict[str, Any]],
223
232
  read_kwargs_provider: Optional[ReadKwargsProvider],
224
- s3_table_writer_kwargs: Optional[Dict[str, Any]],
233
+ table_writer_kwargs: Optional[Dict[str, Any]],
225
234
  object_store: Optional[IObjectStore],
226
- s3_client_kwargs: Optional[Dict[str, Any]],
227
- deltacat_storage=unimplemented_deltacat_storage,
235
+ deltacat_storage=metastore,
228
236
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
229
237
  **kwargs,
230
- ) -> Tuple[Optional[Partition], Optional[RoundCompletionInfo], Optional[str]]:
238
+ ) -> Tuple[Optional[Partition], Optional[RoundCompletionInfo]]:
231
239
  if deltacat_storage_kwargs is None:
232
240
  deltacat_storage_kwargs = {}
233
- rcf_source_partition_locator = (
241
+ rci_source_partition_locator = (
234
242
  rebase_source_partition_locator
235
243
  if rebase_source_partition_locator
236
244
  else source_partition_locator
237
245
  )
238
- base_audit_url = rcf_source_partition_locator.path(
239
- f"s3://{compaction_artifact_s3_bucket}/compaction-audit"
246
+ # Construct audit URL using filesystem-agnostic path joining
247
+ audit_url = posixpath.join(
248
+ compaction_artifact_path,
249
+ "compaction-audit.json",
250
+ f"{rci_source_partition_locator.hexdigest()}.json",
240
251
  )
241
- audit_url = f"{base_audit_url}.json"
242
252
 
243
253
  logger.info(f"Compaction audit will be written to {audit_url}")
244
254
 
@@ -312,11 +322,11 @@ def _execute_compaction_round(
312
322
  # read the results from any previously completed compaction round
313
323
  round_completion_info = None
314
324
  if not rebase_source_partition_locator:
315
- round_completion_info = rcf.read_round_completion_file(
316
- compaction_artifact_s3_bucket,
317
- source_partition_locator,
318
- destination_partition_locator,
319
- **s3_client_kwargs,
325
+ round_completion_info = rci.read_round_completion_info(
326
+ source_partition_locator=source_partition_locator,
327
+ destination_partition_locator=destination_partition_locator,
328
+ deltacat_storage=deltacat_storage,
329
+ deltacat_storage_kwargs=deltacat_storage_kwargs,
320
330
  )
321
331
  if not round_completion_info:
322
332
  logger.info(
@@ -363,15 +373,11 @@ def _execute_compaction_round(
363
373
  delta_discovery_end - delta_discovery_start
364
374
  )
365
375
 
366
- s3_utils.upload(
367
- compaction_audit.audit_url,
368
- str(json.dumps(compaction_audit)),
369
- **s3_client_kwargs,
370
- )
376
+ _upload_audit_data(audit_url, json.dumps(compaction_audit))
371
377
 
372
378
  if not input_deltas:
373
379
  logger.info("No input deltas found to compact.")
374
- return None, None, None
380
+ return None, None
375
381
 
376
382
  # limit the input deltas to fit on this cluster and convert them to
377
383
  # annotated deltas of equivalent size for easy parallel distribution
@@ -464,11 +470,7 @@ def _execute_compaction_round(
464
470
  hb_end - hb_start,
465
471
  )
466
472
 
467
- s3_utils.upload(
468
- compaction_audit.audit_url,
469
- str(json.dumps(compaction_audit)),
470
- **s3_client_kwargs,
471
- )
473
+ _upload_audit_data(audit_url, json.dumps(compaction_audit))
472
474
 
473
475
  all_hash_group_idx_to_obj_id = defaultdict(list)
474
476
  for hb_result in hb_results:
@@ -485,9 +487,9 @@ def _execute_compaction_round(
485
487
  )
486
488
 
487
489
  compaction_audit.set_input_records(total_hb_record_count.item())
488
- # TODO (pdames): when resources are freed during the last round of hash
489
- # bucketing, start running dedupe tasks that read existing dedupe
490
- # output from S3 then wait for hash bucketing to finish before continuing
490
+ # TODO(pdames): when resources are freed during the last round of hash bucketing,
491
+ # start running dedupe tasks that read hash bucket output from storage then
492
+ # wait for hash bucketing to finish before continuing
491
493
 
492
494
  # create a new stream for this round
493
495
  compacted_stream_locator = destination_partition_locator.stream_locator
@@ -497,6 +499,7 @@ def _execute_compaction_round(
497
499
  compacted_stream_locator.table_version,
498
500
  **deltacat_storage_kwargs,
499
501
  )
502
+
500
503
  partition = deltacat_storage.stage_partition(
501
504
  stream,
502
505
  destination_partition_locator.partition_values,
@@ -571,9 +574,9 @@ def _execute_compaction_round(
571
574
  logger.info(f"Materialize buckets created: " f"{len(all_mat_buckets_to_obj_id)}")
572
575
 
573
576
  compaction_audit.set_materialize_buckets(len(all_mat_buckets_to_obj_id))
574
- # TODO(pdames): when resources are freed during the last round of deduping
577
+ # TODO(pdames): when resources are freed during the last round of deduping,
575
578
  # start running materialize tasks that read materialization source file
576
- # tables from S3 then wait for deduping to finish before continuing
579
+ # tables from storage then wait for deduping to finish before continuing
577
580
 
578
581
  # TODO(pdames): balance inputs to materialization tasks to ensure that each
579
582
  # task has an approximately equal amount of input to materialize
@@ -584,11 +587,7 @@ def _execute_compaction_round(
584
587
  # parallel step 3:
585
588
  # materialize records to keep by index
586
589
 
587
- s3_utils.upload(
588
- compaction_audit.audit_url,
589
- str(json.dumps(compaction_audit)),
590
- **s3_client_kwargs,
591
- )
590
+ _upload_audit_data(audit_url, json.dumps(compaction_audit))
592
591
 
593
592
  materialize_start = time.monotonic()
594
593
  mat_tasks_pending = invoke_parallel(
@@ -610,7 +609,7 @@ def _execute_compaction_round(
610
609
  enable_profiler=enable_profiler,
611
610
  metrics_config=metrics_config,
612
611
  read_kwargs_provider=read_kwargs_provider,
613
- s3_table_writer_kwargs=s3_table_writer_kwargs,
612
+ table_writer_kwargs=table_writer_kwargs,
614
613
  object_store=object_store,
615
614
  deltacat_storage=deltacat_storage,
616
615
  deltacat_storage_kwargs=deltacat_storage_kwargs,
@@ -693,11 +692,7 @@ def _execute_compaction_round(
693
692
  telemetry_time_hb + telemetry_time_dd + telemetry_time_materialize
694
693
  )
695
694
 
696
- s3_utils.upload(
697
- compaction_audit.audit_url,
698
- str(json.dumps(compaction_audit)),
699
- **s3_client_kwargs,
700
- )
695
+ _upload_audit_data(audit_url, json.dumps(compaction_audit))
701
696
 
702
697
  new_round_completion_info = RoundCompletionInfo.of(
703
698
  last_stream_position_compacted,
@@ -710,6 +705,7 @@ def _execute_compaction_round(
710
705
  hash_bucket_count,
711
706
  None,
712
707
  CompactorVersion.V1.value,
708
+ prev_source_partition_locator=rci_source_partition_locator,
713
709
  )
714
710
 
715
711
  logger.info(
@@ -721,17 +717,43 @@ def _execute_compaction_round(
721
717
  return (
722
718
  partition,
723
719
  new_round_completion_info,
724
- rcf_source_partition_locator,
725
720
  )
726
721
 
727
722
 
728
723
  def compact_partition_from_request(
729
724
  compact_partition_params: CompactPartitionParams,
730
725
  *compact_partition_pos_args,
731
- ) -> Optional[str]:
726
+ ) -> None:
732
727
  """
733
728
  Wrapper for compact_partition that allows for the compact_partition parameters to be
734
729
  passed in as a custom dictionary-like CompactPartitionParams object along with any compact_partition positional arguments.
735
730
  :param compact_partition_params:
736
731
  """
737
- return compact_partition(*compact_partition_pos_args, **compact_partition_params)
732
+ # Extract required positional arguments
733
+ source_partition_locator = compact_partition_params.source_partition_locator
734
+ destination_partition_locator = (
735
+ compact_partition_params.destination_partition_locator
736
+ )
737
+ primary_keys = compact_partition_params.primary_keys
738
+ compaction_artifact_path = compact_partition_params.compaction_artifact_path
739
+ last_stream_position_to_compact = (
740
+ compact_partition_params.last_stream_position_to_compact
741
+ )
742
+
743
+ # Create a copy of params without the positional arguments
744
+ kwargs_params = dict(compact_partition_params)
745
+ kwargs_params.pop("source_partition_locator", None)
746
+ kwargs_params.pop("destination_partition_locator", None)
747
+ kwargs_params.pop("primary_keys", None)
748
+ kwargs_params.pop("last_stream_position_to_compact", None)
749
+ # Don't pop compaction_artifact_path as it's a computed property, not stored in the dict
750
+
751
+ compact_partition(
752
+ source_partition_locator,
753
+ destination_partition_locator,
754
+ primary_keys,
755
+ compaction_artifact_path,
756
+ last_stream_position_to_compact,
757
+ *compact_partition_pos_args,
758
+ **kwargs_params,
759
+ )
@@ -2,17 +2,19 @@ from __future__ import annotations
2
2
  import importlib
3
3
  import copy
4
4
  import json
5
- from typing import Any, Dict, List, Optional
5
+ import posixpath
6
+ from typing import Any, Dict, List, Optional, Set
6
7
  from deltacat.io.object_store import IObjectStore
7
8
  from deltacat.utils.common import ReadKwargsProvider
8
9
  from deltacat.types.media import ContentType
9
10
  from deltacat.utils.placement import PlacementGroupConfig
10
11
  from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
11
12
  from deltacat.storage import (
12
- interface as unimplemented_deltacat_storage,
13
+ metastore,
13
14
  PartitionLocator,
14
15
  SortKey,
15
16
  )
17
+ from deltacat.catalog.model.properties import CatalogProperties
16
18
  from deltacat.compute.resource_estimation import (
17
19
  ResourceEstimationMethod,
18
20
  EstimateResourcesParams,
@@ -52,11 +54,22 @@ class CompactPartitionParams(dict):
52
54
  assert (
53
55
  params.get("source_partition_locator") is not None
54
56
  ), "source_partition_locator is a required arg"
57
+ assert params.get("catalog") is not None, "catalog is a required arg"
55
58
  assert (
56
- params.get("compaction_artifact_s3_bucket") is not None
57
- ), "compaction_artifact_s3_bucket is a required arg"
59
+ params.get("all_column_names") is not None
60
+ ), "all_column_names is a required arg"
58
61
 
59
62
  result = CompactPartitionParams(params)
63
+ assert (
64
+ result.destination_partition_locator.partition_id
65
+ ), "destination_partition_locator must have a globally unique partition_id"
66
+ assert (
67
+ result.source_partition_locator.partition_id
68
+ ), "source_partition_locator must have a globally unique partition_id"
69
+ if result.rebase_source_partition_locator:
70
+ assert (
71
+ result.rebase_source_partition_locator.partition_id
72
+ ), "rebase_source_partition_locator must have a globally unique partition_id"
60
73
 
61
74
  result.records_per_compacted_file = params.get(
62
75
  "records_per_compacted_file", MAX_RECORDS_PER_COMPACTED_FILE
@@ -65,15 +78,18 @@ class CompactPartitionParams(dict):
65
78
  "compacted_file_content_type", ContentType.PARQUET
66
79
  )
67
80
  result.object_store = params.get("object_store", RayPlasmaObjectStore())
81
+ result.table_writer_kwargs = params.get("table_writer_kwargs", {})
68
82
 
69
83
  result.enable_profiler = params.get("enable_profiler", False)
70
- result.deltacat_storage = params.get(
71
- "deltacat_storage", unimplemented_deltacat_storage
72
- )
73
- result.s3_client_kwargs = params.get("s3_client_kwargs", {})
84
+ result.deltacat_storage = params.get("deltacat_storage", metastore)
85
+ result.catalog = params.get("catalog")
74
86
  result.deltacat_storage_kwargs = params.get("deltacat_storage_kwargs", {})
75
87
  result.list_deltas_kwargs = params.get("list_deltas_kwargs", {})
76
- result.s3_table_writer_kwargs = params.get("s3_table_writer_kwargs", {})
88
+ result.all_column_names = params.get("all_column_names")
89
+
90
+ # Add catalog to deltacat_storage_kwargs
91
+ result.deltacat_storage_kwargs["catalog"] = result.catalog
92
+
77
93
  result.bit_width_of_sort_keys = validate_sort_keys(
78
94
  result.source_partition_locator,
79
95
  result.sort_keys,
@@ -133,6 +149,8 @@ class CompactPartitionParams(dict):
133
149
  if result.primary_keys:
134
150
  result.primary_keys = sorted(result.primary_keys)
135
151
 
152
+ result.original_fields = params.get("original_fields")
153
+
136
154
  # assertions
137
155
  assert (
138
156
  result.source_partition_locator.partition_values
@@ -177,21 +195,32 @@ class CompactPartitionParams(dict):
177
195
  self["source_partition_locator"] = locator
178
196
 
179
197
  @property
180
- def compaction_artifact_s3_bucket(self) -> str:
181
- return self["compaction_artifact_s3_bucket"]
182
-
183
- @compaction_artifact_s3_bucket.setter
184
- def compaction_artifact_s3_bucket(self, s3_bucket: str) -> None:
185
- self["compaction_artifact_s3_bucket"] = s3_bucket
198
+ def compaction_artifact_path(self) -> str:
199
+ """
200
+ Returns the compaction artifact path based on catalog root.
201
+ """
202
+ return posixpath.join(self.catalog.root, "compute", "compactor")
186
203
 
187
204
  @property
188
- def deltacat_storage(self) -> unimplemented_deltacat_storage:
205
+ def deltacat_storage(self) -> metastore:
189
206
  return self["deltacat_storage"]
190
207
 
191
208
  @deltacat_storage.setter
192
- def deltacat_storage(self, storage: unimplemented_deltacat_storage) -> None:
209
+ def deltacat_storage(self, storage: metastore) -> None:
193
210
  self["deltacat_storage"] = storage
194
211
 
212
+ @property
213
+ def catalog(self) -> CatalogProperties:
214
+ return self["catalog"]
215
+
216
+ @catalog.setter
217
+ def catalog(self, catalog: CatalogProperties) -> None:
218
+ self["catalog"] = catalog
219
+ # Update deltacat_storage_kwargs when catalog is set
220
+ if "deltacat_storage_kwargs" not in self:
221
+ self["deltacat_storage_kwargs"] = {}
222
+ self["deltacat_storage_kwargs"]["catalog"] = catalog
223
+
195
224
  @property
196
225
  def object_store(self) -> IObjectStore:
197
226
  return self["object_store"]
@@ -286,14 +315,6 @@ class CompactPartitionParams(dict):
286
315
  def list_deltas_kwargs(self, kwargs: dict) -> None:
287
316
  self["list_deltas_kwargs"] = kwargs
288
317
 
289
- @property
290
- def s3_table_writer_kwargs(self) -> dict:
291
- return self["s3_table_writer_kwargs"]
292
-
293
- @s3_table_writer_kwargs.setter
294
- def s3_table_writer_kwargs(self, kwargs: dict) -> None:
295
- self["s3_table_writer_kwargs"] = kwargs
296
-
297
318
  @property
298
319
  def deltacat_storage_kwargs(self) -> dict:
299
320
  return self["deltacat_storage_kwargs"]
@@ -303,12 +324,12 @@ class CompactPartitionParams(dict):
303
324
  self["deltacat_storage_kwargs"] = kwargs
304
325
 
305
326
  @property
306
- def s3_client_kwargs(self) -> dict:
307
- return self["s3_client_kwargs"]
327
+ def all_column_names(self) -> List[str]:
328
+ return self.get("all_column_names")
308
329
 
309
- @s3_client_kwargs.setter
310
- def s3_client_kwargs(self, kwargs: dict) -> None:
311
- self["s3_client_kwargs"] = kwargs
330
+ @all_column_names.setter
331
+ def all_column_names(self, column_names: List[str]) -> None:
332
+ self["all_column_names"] = column_names
312
333
 
313
334
  @property
314
335
  def records_per_compacted_file(self) -> int:
@@ -489,6 +510,30 @@ class CompactPartitionParams(dict):
489
510
  average_record_size_bytes=self.average_record_size_bytes,
490
511
  )
491
512
 
513
+ @property
514
+ def table_writer_kwargs(self) -> dict:
515
+ return self["table_writer_kwargs"]
516
+
517
+ @table_writer_kwargs.setter
518
+ def table_writer_kwargs(self, kwargs: dict) -> None:
519
+ self["table_writer_kwargs"] = kwargs
520
+
521
+ @property
522
+ def expected_previous_partition_id(self) -> Optional[str]:
523
+ return self.get("expected_previous_partition_id")
524
+
525
+ @expected_previous_partition_id.setter
526
+ def expected_previous_partition_id(self, partition_id: Optional[str]) -> None:
527
+ self["expected_previous_partition_id"] = partition_id
528
+
529
+ @property
530
+ def original_fields(self) -> Optional[Set[str]]:
531
+ return self.get("original_fields")
532
+
533
+ @original_fields.setter
534
+ def original_fields(self, fields: Optional[Set[str]]) -> None:
535
+ self["original_fields"] = fields
536
+
492
537
  @staticmethod
493
538
  def json_handler_for_compact_partition_params(obj):
494
539
  """
@@ -3,11 +3,13 @@ from __future__ import annotations
3
3
  from typing import Optional
4
4
  import pyarrow as pa
5
5
  import logging
6
+ from pathlib import PosixPath
6
7
  from deltacat import logs
7
8
  from typing import List, Union
8
9
  from deltacat.compute.compactor.model.hash_bucket_result import HashBucketResult
9
10
  from deltacat.compute.compactor.model.dedupe_result import DedupeResult
10
11
  from deltacat.compute.compactor.model.materialize_result import MaterializeResult
12
+ from deltacat.compute.compactor_v2.model.merge_result import MergeResult
11
13
  from deltacat.utils.performance import timed_invocation
12
14
  from deltacat.utils.resources import ClusterUtilization, get_size_of_object_in_bytes
13
15
  from deltacat.compute.compactor import PyArrowWriteResult
@@ -322,17 +324,6 @@ class CompactionSessionAuditInfo(dict):
322
324
  """
323
325
  return self.get("outputSizePyarrowBytes")
324
326
 
325
- @property
326
- def output_record_count(self) -> int:
327
- """
328
- The total number of records in the compacted output (includes untouched records).
329
-
330
- Represents the final record count after compaction, including:
331
- - Records that were processed and materialized
332
- - Records that were untouched and copied by reference
333
- """
334
- return self.get("outputRecordCount")
335
-
336
327
  @property
337
328
  def total_cluster_memory_bytes(self) -> float:
338
329
  """
@@ -681,26 +672,13 @@ class CompactionSessionAuditInfo(dict):
681
672
  self, output_size_bytes: float
682
673
  ) -> CompactionSessionAuditInfo:
683
674
  self["outputSizeBytes"] = output_size_bytes
684
- return output_size_bytes
685
-
686
- def set_output_record_count(
687
- self, output_records: int
688
- ) -> CompactionSessionAuditInfo:
689
- """
690
- This includes both processed records and untouched records copied by reference.
691
- """
692
- if output_records < 0:
693
- raise ValueError(
694
- f"Output record count cannot be negative: {output_records}"
695
- )
696
- self["outputRecordCount"] = output_records
697
675
  return self
698
676
 
699
677
  def set_output_size_pyarrow_bytes(
700
678
  self, output_size_pyarrow_bytes: float
701
679
  ) -> CompactionSessionAuditInfo:
702
680
  self["outputSizePyarrowBytes"] = output_size_pyarrow_bytes
703
- return output_size_pyarrow_bytes
681
+ return self
704
682
 
705
683
  def set_total_cluster_memory_bytes(
706
684
  self, total_cluster_memory_bytes: float
@@ -811,7 +789,10 @@ class CompactionSessionAuditInfo(dict):
811
789
  self,
812
790
  step_name: str,
813
791
  task_results: Union[
814
- List[HashBucketResult], List[DedupeResult], List[MaterializeResult]
792
+ List[HashBucketResult],
793
+ List[DedupeResult],
794
+ List[MaterializeResult],
795
+ List[MergeResult],
815
796
  ],
816
797
  task_results_retrieved_at: float,
817
798
  invoke_time_in_seconds: float,
@@ -926,10 +907,6 @@ class CompactionSessionAuditInfo(dict):
926
907
  self.set_output_file_count(pyarrow_write_result.files)
927
908
  self.set_output_size_bytes(pyarrow_write_result.file_bytes)
928
909
  self.set_output_size_pyarrow_bytes(pyarrow_write_result.pyarrow_bytes)
929
- # NOTE: Aggregating untouched_record_count with records to get a total of record count in the compacted table
930
- self.set_output_record_count(
931
- pyarrow_write_result.records + untouched_file_record_count
932
- )
933
910
 
934
911
  self.set_peak_memory_used_bytes_per_task(
935
912
  max(
@@ -943,3 +920,19 @@ class CompactionSessionAuditInfo(dict):
943
920
  )
944
921
 
945
922
  self.set_pyarrow_version(pa.__version__)
923
+
924
+ def to_serializable(self, catalog_root: str) -> CompactionSessionAuditInfo:
925
+ root_path = PosixPath(catalog_root)
926
+ target_path = PosixPath(self.audit_url)
927
+ if root_path == target_path:
928
+ raise ValueError(
929
+ "Target and root are identical, but expected target to be a child of root."
930
+ )
931
+ try:
932
+ relative_path = target_path.relative_to(root_path)
933
+ # Create a copy of the audit info with the relative path
934
+ audit_copy = CompactionSessionAuditInfo(**dict(self))
935
+ audit_copy["auditUrl"] = str(relative_path)
936
+ return audit_copy
937
+ except ValueError:
938
+ raise ValueError("Expected target to be a child of root.")
@@ -11,10 +11,10 @@ from deltacat import logs
11
11
  from deltacat.storage import (
12
12
  Delta,
13
13
  DeltaType,
14
- Manifest,
15
14
  ManifestEntry,
16
15
  ManifestEntryList,
17
16
  )
17
+ from deltacat.storage.model.manifest import Manifest
18
18
 
19
19
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
20
20
 
@@ -107,7 +107,7 @@ class DeltaAnnotated(Delta):
107
107
  assert len(src_da_annotations) == len(src_da_entries), (
108
108
  f"Unexpected Error: Length of delta annotations "
109
109
  f"({len(src_da_annotations)}) doesn't mach the length of "
110
- f"delta manifest entries ({len(src_da_entries)}).",
110
+ f"delta manifest entries ({len(src_da_entries)})."
111
111
  )
112
112
  for i, src_entry in enumerate(src_da_entries):
113
113
  # create a new da group if src and dest has different delta locator
@@ -161,7 +161,7 @@ class DeltaAnnotated(Delta):
161
161
  assert len(src_da_annotations) == len(src_da_entries), (
162
162
  f"Unexpected Error: Length of delta annotations "
163
163
  f"({len(src_da_annotations)}) doesn't mach the length of "
164
- f"delta manifest entries ({len(src_da_entries)}).",
164
+ f"delta manifest entries ({len(src_da_entries)})."
165
165
  )
166
166
  src_da_entries_length = len(src_da_entries)
167
167
  equal_length = src_da_entries_length // pieces