deltacat-2.0-py3-none-any.whl → deltacat-2.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324)
  1. deltacat/__init__.py +117 -18
  2. deltacat/api.py +536 -126
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +1 -19
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2444 -282
  12. deltacat/catalog/model/catalog.py +208 -113
  13. deltacat/catalog/model/properties.py +63 -24
  14. deltacat/compute/__init__.py +14 -0
  15. deltacat/compute/compactor/compaction_session.py +97 -75
  16. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  17. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  18. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  19. deltacat/compute/compactor/repartition_session.py +8 -21
  20. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  21. deltacat/compute/compactor/steps/materialize.py +9 -7
  22. deltacat/compute/compactor/steps/repartition.py +12 -11
  23. deltacat/compute/compactor/utils/io.py +6 -5
  24. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  25. deltacat/compute/compactor/utils/system_columns.py +3 -1
  26. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  27. deltacat/compute/compactor_v2/constants.py +30 -1
  28. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  29. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  30. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  31. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  32. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  33. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  34. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  35. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  36. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  37. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  38. deltacat/compute/compactor_v2/utils/io.py +11 -4
  39. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  40. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  41. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  42. deltacat/compute/converter/constants.py +5 -0
  43. deltacat/compute/converter/converter_session.py +207 -52
  44. deltacat/compute/converter/model/convert_input.py +43 -16
  45. deltacat/compute/converter/model/convert_input_files.py +33 -16
  46. deltacat/compute/converter/model/convert_result.py +80 -0
  47. deltacat/compute/converter/model/converter_session_params.py +64 -19
  48. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  49. deltacat/compute/converter/pyiceberg/overrides.py +193 -65
  50. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  51. deltacat/compute/converter/steps/convert.py +230 -75
  52. deltacat/compute/converter/steps/dedupe.py +46 -12
  53. deltacat/compute/converter/utils/convert_task_options.py +66 -22
  54. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  55. deltacat/compute/converter/utils/iceberg_columns.py +13 -8
  56. deltacat/compute/converter/utils/io.py +173 -13
  57. deltacat/compute/converter/utils/s3u.py +42 -27
  58. deltacat/compute/janitor.py +205 -0
  59. deltacat/compute/jobs/client.py +417 -0
  60. deltacat/compute/resource_estimation/delta.py +38 -6
  61. deltacat/compute/resource_estimation/model.py +8 -0
  62. deltacat/constants.py +49 -6
  63. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  64. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  65. deltacat/env.py +10 -0
  66. deltacat/examples/basic_logging.py +6 -6
  67. deltacat/examples/compactor/aws/__init__.py +1 -0
  68. deltacat/examples/compactor/bootstrap.py +863 -0
  69. deltacat/examples/compactor/compactor.py +373 -0
  70. deltacat/examples/compactor/explorer.py +473 -0
  71. deltacat/examples/compactor/gcp/__init__.py +1 -0
  72. deltacat/examples/compactor/job_runner.py +439 -0
  73. deltacat/examples/compactor/utils/__init__.py +1 -0
  74. deltacat/examples/compactor/utils/common.py +261 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  80. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  81. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
  82. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  83. deltacat/examples/hello_world.py +4 -2
  84. deltacat/examples/indexer/indexer.py +163 -0
  85. deltacat/examples/indexer/job_runner.py +198 -0
  86. deltacat/exceptions.py +66 -4
  87. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  88. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  89. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
  90. deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
  91. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  92. deltacat/experimental/converter_agent/__init__.py +0 -0
  93. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  94. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  95. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  96. deltacat/experimental/daft/__init__.py +4 -0
  97. deltacat/experimental/daft/daft_catalog.py +229 -0
  98. deltacat/experimental/storage/__init__.py +0 -0
  99. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  100. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  101. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  102. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  103. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  104. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  105. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  107. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  108. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  109. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  110. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  111. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  112. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  113. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  114. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  115. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  116. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  117. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  118. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  119. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  120. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  121. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  122. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  123. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  124. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  125. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  126. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  127. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  128. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  129. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  130. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  131. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  132. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  133. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  134. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  135. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  136. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  137. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  138. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  139. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  140. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  141. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  142. deltacat/io/__init__.py +13 -0
  143. deltacat/io/dataset/__init__.py +0 -0
  144. deltacat/io/dataset/deltacat_dataset.py +91 -0
  145. deltacat/io/datasink/__init__.py +0 -0
  146. deltacat/io/datasink/deltacat_datasink.py +207 -0
  147. deltacat/io/datasource/__init__.py +0 -0
  148. deltacat/io/datasource/deltacat_datasource.py +579 -0
  149. deltacat/io/reader/__init__.py +0 -0
  150. deltacat/io/reader/deltacat_read_api.py +172 -0
  151. deltacat/storage/__init__.py +22 -2
  152. deltacat/storage/interface.py +54 -32
  153. deltacat/storage/main/impl.py +1494 -541
  154. deltacat/storage/model/delta.py +27 -3
  155. deltacat/storage/model/expression/__init__.py +47 -0
  156. deltacat/storage/model/expression/expression.py +656 -0
  157. deltacat/storage/model/expression/visitor.py +248 -0
  158. deltacat/storage/model/locator.py +6 -12
  159. deltacat/storage/model/manifest.py +231 -6
  160. deltacat/storage/model/metafile.py +224 -119
  161. deltacat/storage/model/namespace.py +8 -1
  162. deltacat/storage/model/partition.py +117 -42
  163. deltacat/storage/model/scan/push_down.py +32 -5
  164. deltacat/storage/model/schema.py +2427 -159
  165. deltacat/storage/model/shard.py +6 -2
  166. deltacat/storage/model/sort_key.py +40 -0
  167. deltacat/storage/model/stream.py +9 -2
  168. deltacat/storage/model/table.py +12 -1
  169. deltacat/storage/model/table_version.py +11 -0
  170. deltacat/storage/model/transaction.py +1184 -208
  171. deltacat/storage/model/transform.py +81 -2
  172. deltacat/storage/model/types.py +53 -29
  173. deltacat/storage/util/__init__.py +0 -0
  174. deltacat/storage/util/scan_planner.py +26 -0
  175. deltacat/tests/_io/reader/__init__.py +0 -0
  176. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  177. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  178. deltacat/tests/aws/test_s3u.py +2 -31
  179. deltacat/tests/catalog/data/__init__.py +0 -0
  180. deltacat/tests/catalog/main/__init__.py +0 -0
  181. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  182. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  183. deltacat/tests/catalog/model/__init__.py +0 -0
  184. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  185. deltacat/tests/catalog/test_catalogs.py +103 -106
  186. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  187. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  188. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  189. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  190. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  191. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  192. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  193. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  194. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  195. deltacat/tests/compute/conftest.py +8 -44
  196. deltacat/tests/compute/converter/test_convert_session.py +697 -349
  197. deltacat/tests/compute/converter/utils.py +15 -6
  198. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  199. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  200. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  201. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  202. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  203. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  204. deltacat/tests/compute/test_janitor.py +236 -0
  205. deltacat/tests/compute/test_util_common.py +716 -43
  206. deltacat/tests/compute/test_util_constant.py +0 -1
  207. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  208. deltacat/tests/daft/__init__.py +0 -0
  209. deltacat/tests/daft/test_model.py +97 -0
  210. deltacat/tests/experimental/__init__.py +1 -0
  211. deltacat/tests/experimental/catalog/__init__.py +0 -0
  212. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  213. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  214. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  215. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  216. deltacat/tests/experimental/daft/__init__.py +0 -0
  217. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  218. deltacat/tests/experimental/storage/__init__.py +0 -0
  219. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  220. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  221. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  222. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  223. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  224. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  225. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  226. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  227. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  228. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  229. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  230. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  231. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  232. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  233. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  234. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  235. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  236. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  237. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  238. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  239. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  240. deltacat/tests/storage/model/test_expression.py +327 -0
  241. deltacat/tests/storage/model/test_manifest.py +129 -0
  242. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  243. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  244. deltacat/tests/storage/model/test_schema.py +171 -0
  245. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  246. deltacat/tests/storage/model/test_shard.py +3 -1
  247. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  248. deltacat/tests/storage/model/test_transaction.py +393 -48
  249. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  250. deltacat/tests/test_deltacat_api.py +1036 -11
  251. deltacat/tests/test_exceptions.py +9 -5
  252. deltacat/tests/test_utils/pyarrow.py +52 -21
  253. deltacat/tests/test_utils/storage.py +23 -34
  254. deltacat/tests/types/__init__.py +0 -0
  255. deltacat/tests/types/test_tables.py +104 -0
  256. deltacat/tests/utils/exceptions.py +22 -0
  257. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  258. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  259. deltacat/tests/utils/test_daft.py +121 -31
  260. deltacat/tests/utils/test_numpy.py +1193 -0
  261. deltacat/tests/utils/test_pandas.py +1106 -0
  262. deltacat/tests/utils/test_polars.py +1040 -0
  263. deltacat/tests/utils/test_pyarrow.py +1370 -89
  264. deltacat/types/media.py +345 -37
  265. deltacat/types/tables.py +2344 -46
  266. deltacat/utils/arguments.py +33 -1
  267. deltacat/utils/daft.py +824 -40
  268. deltacat/utils/export.py +3 -1
  269. deltacat/utils/filesystem.py +139 -9
  270. deltacat/utils/metafile_locator.py +2 -1
  271. deltacat/utils/numpy.py +118 -26
  272. deltacat/utils/pandas.py +577 -48
  273. deltacat/utils/polars.py +759 -0
  274. deltacat/utils/pyarrow.py +1373 -192
  275. deltacat/utils/ray_utils/concurrency.py +1 -1
  276. deltacat/utils/ray_utils/dataset.py +101 -10
  277. deltacat/utils/ray_utils/runtime.py +56 -4
  278. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  279. deltacat/utils/url.py +1325 -0
  280. deltacat-2.0.0.dist-info/METADATA +1163 -0
  281. deltacat-2.0.0.dist-info/RECORD +439 -0
  282. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  283. deltacat/catalog/iceberg/__init__.py +0 -4
  284. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  285. deltacat/compute/merge_on_read/__init__.py +0 -4
  286. deltacat/compute/merge_on_read/daft.py +0 -40
  287. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  288. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  289. deltacat/examples/common/fixtures.py +0 -15
  290. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  291. deltacat/storage/rivulet/__init__.py +0 -11
  292. deltacat/storage/rivulet/feather/__init__.py +0 -5
  293. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  294. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  295. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  296. deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
  297. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  298. deltacat/utils/s3fs.py +0 -21
  299. deltacat-2.0.dist-info/METADATA +0 -65
  300. deltacat-2.0.dist-info/RECORD +0 -347
  301. /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
  302. /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
  303. /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
  304. /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
  305. /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
  306. /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
  307. /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
  308. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
  309. /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
  310. /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  311. /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
  312. /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
  313. /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  314. /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
  315. /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
  316. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
  317. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  318. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  319. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  320. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  321. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  322. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  323. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  324. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -5,7 +5,8 @@ import logging
5
5
  import ray
6
6
  import time
7
7
  import json
8
- from deltacat.aws import s3u as s3_utils
8
+ import posixpath
9
+ from deltacat.utils.filesystem import resolve_path_and_filesystem
9
10
  import deltacat
10
11
  from deltacat import logs
11
12
  import pyarrow as pa
@@ -25,7 +26,7 @@ from deltacat.storage import (
25
26
  DeltaLocator,
26
27
  Partition,
27
28
  PartitionLocator,
28
- interface as unimplemented_deltacat_storage,
29
+ metastore,
29
30
  )
30
31
  from deltacat.compute.compactor.model.compact_partition_params import (
31
32
  CompactPartitionParams,
@@ -40,7 +41,7 @@ from deltacat.compute.compactor.steps import dedupe as dd
40
41
  from deltacat.compute.compactor.steps import hash_bucket as hb
41
42
  from deltacat.compute.compactor.steps import materialize as mat
42
43
  from deltacat.compute.compactor.utils import io
43
- from deltacat.compute.compactor.utils import round_completion_file as rcf
44
+ from deltacat.compute.compactor.utils import round_completion_reader as rci
44
45
 
45
46
  from deltacat.types.media import ContentType
46
47
  from deltacat.utils.placement import PlacementGroupConfig
@@ -65,13 +66,37 @@ DEFAULT_DEDUPE_MAX_PARALLELISM_RATIO_ARG: int = 1
65
66
  DEFAULT_PROPERTIES_ARG: Dict[str, Any] = {}
66
67
 
67
68
 
69
+ def _upload_audit_data(url: str, content: str, **kwargs) -> None:
70
+ """
71
+ Upload audit data to the specified URL using filesystem-agnostic operations.
72
+ """
73
+ try:
74
+ path, filesystem = resolve_path_and_filesystem(url)
75
+
76
+ # Create parent directories if they don't exist
77
+ parent_dir = posixpath.dirname(path)
78
+ if parent_dir:
79
+ try:
80
+ filesystem.create_dir(parent_dir, recursive=True)
81
+ except Exception as dir_error:
82
+ # Directory might already exist, which is fine
83
+ logger.debug(
84
+ f"Directory creation warning for {parent_dir}: {dir_error}"
85
+ )
86
+
87
+ with filesystem.open_output_stream(path) as stream:
88
+ stream.write(content.encode("utf-8"))
89
+ except Exception as e:
90
+ logger.warning(f"Failed to upload audit data to {url}: {e}")
91
+
92
+
68
93
  def check_preconditions(
69
94
  source_partition_locator: PartitionLocator,
70
95
  destination_partition_locator: PartitionLocator,
71
96
  sort_keys: List[SortKey],
72
97
  max_records_per_output_file: int,
73
98
  new_hash_bucket_count: Optional[int],
74
- deltacat_storage=unimplemented_deltacat_storage,
99
+ deltacat_storage=metastore,
75
100
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
76
101
  **kwargs,
77
102
  ) -> int:
@@ -104,7 +129,7 @@ def compact_partition(
104
129
  source_partition_locator: PartitionLocator,
105
130
  destination_partition_locator: PartitionLocator,
106
131
  primary_keys: Set[str],
107
- compaction_artifact_s3_bucket: str,
132
+ compaction_artifact_path: str,
108
133
  last_stream_position_to_compact: int,
109
134
  *,
110
135
  hash_bucket_count: Optional[int] = None,
@@ -123,37 +148,29 @@ def compact_partition(
123
148
  metrics_config: Optional[MetricsConfig] = None,
124
149
  list_deltas_kwargs: Optional[Dict[str, Any]] = None,
125
150
  read_kwargs_provider: Optional[ReadKwargsProvider] = None,
126
- s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
151
+ table_writer_kwargs: Optional[Dict[str, Any]] = None,
127
152
  object_store: Optional[IObjectStore] = RayPlasmaObjectStore(),
128
- s3_client_kwargs: Optional[Dict[str, Any]] = None,
129
- deltacat_storage=unimplemented_deltacat_storage,
153
+ deltacat_storage=metastore,
130
154
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
131
155
  **kwargs,
132
- ) -> Optional[str]:
156
+ ) -> None:
133
157
  if deltacat_storage_kwargs is None:
134
158
  deltacat_storage_kwargs = {}
135
159
  if not importlib.util.find_spec("memray"):
136
160
  logger.info(f"memray profiler not available, disabling all profiling")
137
161
  enable_profiler = False
138
162
 
139
- if s3_client_kwargs is None:
140
- s3_client_kwargs = {}
141
-
142
163
  # memray official documentation link:
143
164
  # https://bloomberg.github.io/memray/getting_started.html
144
165
  with memray.Tracker(
145
166
  f"compaction_partition.bin"
146
167
  ) if enable_profiler else nullcontext():
147
168
  partition = None
148
- (
149
- new_partition,
150
- new_rci,
151
- new_rcf_partition_locator,
152
- ) = _execute_compaction_round(
169
+ (new_partition, new_rci,) = _execute_compaction_round(
153
170
  source_partition_locator,
154
171
  destination_partition_locator,
155
172
  primary_keys,
156
- compaction_artifact_s3_bucket,
173
+ compaction_artifact_path,
157
174
  last_stream_position_to_compact,
158
175
  hash_bucket_count,
159
176
  sort_keys,
@@ -169,9 +186,8 @@ def compact_partition(
169
186
  metrics_config,
170
187
  list_deltas_kwargs,
171
188
  read_kwargs_provider,
172
- s3_table_writer_kwargs,
189
+ table_writer_kwargs,
173
190
  object_store,
174
- s3_client_kwargs,
175
191
  deltacat_storage,
176
192
  deltacat_storage_kwargs,
177
193
  **kwargs,
@@ -182,30 +198,23 @@ def compact_partition(
182
198
  logger.info(
183
199
  f"Partition-{source_partition_locator.partition_values}-> Compaction session data processing completed"
184
200
  )
185
- round_completion_file_s3_url = None
186
201
  if partition:
187
202
  logger.info(f"Committing compacted partition to: {partition.locator}")
203
+ # Set the round completion info on the partition before committing
204
+ partition.compaction_round_completion_info = new_rci
188
205
  partition = deltacat_storage.commit_partition(
189
- partition, **deltacat_storage_kwargs
206
+ partition,
207
+ **deltacat_storage_kwargs,
190
208
  )
191
209
  logger.info(f"Committed compacted partition: {partition}")
192
-
193
- round_completion_file_s3_url = rcf.write_round_completion_file(
194
- compaction_artifact_s3_bucket,
195
- new_rcf_partition_locator,
196
- partition.locator,
197
- new_rci,
198
- **s3_client_kwargs,
199
- )
200
210
  logger.info(f"Completed compaction session for: {source_partition_locator}")
201
- return round_completion_file_s3_url
202
211
 
203
212
 
204
213
  def _execute_compaction_round(
205
214
  source_partition_locator: PartitionLocator,
206
215
  destination_partition_locator: PartitionLocator,
207
216
  primary_keys: Set[str],
208
- compaction_artifact_s3_bucket: str,
217
+ compaction_artifact_path: str,
209
218
  last_stream_position_to_compact: int,
210
219
  hash_bucket_count: Optional[int],
211
220
  sort_keys: List[SortKey],
@@ -221,24 +230,25 @@ def _execute_compaction_round(
221
230
  metrics_config: Optional[MetricsConfig],
222
231
  list_deltas_kwargs: Optional[Dict[str, Any]],
223
232
  read_kwargs_provider: Optional[ReadKwargsProvider],
224
- s3_table_writer_kwargs: Optional[Dict[str, Any]],
233
+ table_writer_kwargs: Optional[Dict[str, Any]],
225
234
  object_store: Optional[IObjectStore],
226
- s3_client_kwargs: Optional[Dict[str, Any]],
227
- deltacat_storage=unimplemented_deltacat_storage,
235
+ deltacat_storage=metastore,
228
236
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
229
237
  **kwargs,
230
- ) -> Tuple[Optional[Partition], Optional[RoundCompletionInfo], Optional[str]]:
238
+ ) -> Tuple[Optional[Partition], Optional[RoundCompletionInfo]]:
231
239
  if deltacat_storage_kwargs is None:
232
240
  deltacat_storage_kwargs = {}
233
- rcf_source_partition_locator = (
241
+ rci_source_partition_locator = (
234
242
  rebase_source_partition_locator
235
243
  if rebase_source_partition_locator
236
244
  else source_partition_locator
237
245
  )
238
- base_audit_url = rcf_source_partition_locator.path(
239
- f"s3://{compaction_artifact_s3_bucket}/compaction-audit"
246
+ # Construct audit URL using filesystem-agnostic path joining
247
+ audit_url = posixpath.join(
248
+ compaction_artifact_path,
249
+ "compaction-audit.json",
250
+ f"{rci_source_partition_locator.hexdigest()}.json",
240
251
  )
241
- audit_url = f"{base_audit_url}.json"
242
252
 
243
253
  logger.info(f"Compaction audit will be written to {audit_url}")
244
254
 
@@ -312,11 +322,11 @@ def _execute_compaction_round(
312
322
  # read the results from any previously completed compaction round
313
323
  round_completion_info = None
314
324
  if not rebase_source_partition_locator:
315
- round_completion_info = rcf.read_round_completion_file(
316
- compaction_artifact_s3_bucket,
317
- source_partition_locator,
318
- destination_partition_locator,
319
- **s3_client_kwargs,
325
+ round_completion_info = rci.read_round_completion_info(
326
+ source_partition_locator=source_partition_locator,
327
+ destination_partition_locator=destination_partition_locator,
328
+ deltacat_storage=deltacat_storage,
329
+ deltacat_storage_kwargs=deltacat_storage_kwargs,
320
330
  )
321
331
  if not round_completion_info:
322
332
  logger.info(
@@ -363,15 +373,11 @@ def _execute_compaction_round(
363
373
  delta_discovery_end - delta_discovery_start
364
374
  )
365
375
 
366
- s3_utils.upload(
367
- compaction_audit.audit_url,
368
- str(json.dumps(compaction_audit)),
369
- **s3_client_kwargs,
370
- )
376
+ _upload_audit_data(audit_url, json.dumps(compaction_audit))
371
377
 
372
378
  if not input_deltas:
373
379
  logger.info("No input deltas found to compact.")
374
- return None, None, None
380
+ return None, None
375
381
 
376
382
  # limit the input deltas to fit on this cluster and convert them to
377
383
  # annotated deltas of equivalent size for easy parallel distribution
@@ -464,11 +470,7 @@ def _execute_compaction_round(
464
470
  hb_end - hb_start,
465
471
  )
466
472
 
467
- s3_utils.upload(
468
- compaction_audit.audit_url,
469
- str(json.dumps(compaction_audit)),
470
- **s3_client_kwargs,
471
- )
473
+ _upload_audit_data(audit_url, json.dumps(compaction_audit))
472
474
 
473
475
  all_hash_group_idx_to_obj_id = defaultdict(list)
474
476
  for hb_result in hb_results:
@@ -485,9 +487,9 @@ def _execute_compaction_round(
485
487
  )
486
488
 
487
489
  compaction_audit.set_input_records(total_hb_record_count.item())
488
- # TODO (pdames): when resources are freed during the last round of hash
489
- # bucketing, start running dedupe tasks that read existing dedupe
490
- # output from S3 then wait for hash bucketing to finish before continuing
490
+ # TODO(pdames): when resources are freed during the last round of hash bucketing,
491
+ # start running dedupe tasks that read hash bucket output from storage then
492
+ # wait for hash bucketing to finish before continuing
491
493
 
492
494
  # create a new stream for this round
493
495
  compacted_stream_locator = destination_partition_locator.stream_locator
@@ -497,6 +499,7 @@ def _execute_compaction_round(
497
499
  compacted_stream_locator.table_version,
498
500
  **deltacat_storage_kwargs,
499
501
  )
502
+
500
503
  partition = deltacat_storage.stage_partition(
501
504
  stream,
502
505
  destination_partition_locator.partition_values,
@@ -571,9 +574,9 @@ def _execute_compaction_round(
571
574
  logger.info(f"Materialize buckets created: " f"{len(all_mat_buckets_to_obj_id)}")
572
575
 
573
576
  compaction_audit.set_materialize_buckets(len(all_mat_buckets_to_obj_id))
574
- # TODO(pdames): when resources are freed during the last round of deduping
577
+ # TODO(pdames): when resources are freed during the last round of deduping,
575
578
  # start running materialize tasks that read materialization source file
576
- # tables from S3 then wait for deduping to finish before continuing
579
+ # tables from storage then wait for deduping to finish before continuing
577
580
 
578
581
  # TODO(pdames): balance inputs to materialization tasks to ensure that each
579
582
  # task has an approximately equal amount of input to materialize
@@ -584,11 +587,7 @@ def _execute_compaction_round(
584
587
  # parallel step 3:
585
588
  # materialize records to keep by index
586
589
 
587
- s3_utils.upload(
588
- compaction_audit.audit_url,
589
- str(json.dumps(compaction_audit)),
590
- **s3_client_kwargs,
591
- )
590
+ _upload_audit_data(audit_url, json.dumps(compaction_audit))
592
591
 
593
592
  materialize_start = time.monotonic()
594
593
  mat_tasks_pending = invoke_parallel(
@@ -610,7 +609,7 @@ def _execute_compaction_round(
610
609
  enable_profiler=enable_profiler,
611
610
  metrics_config=metrics_config,
612
611
  read_kwargs_provider=read_kwargs_provider,
613
- s3_table_writer_kwargs=s3_table_writer_kwargs,
612
+ table_writer_kwargs=table_writer_kwargs,
614
613
  object_store=object_store,
615
614
  deltacat_storage=deltacat_storage,
616
615
  deltacat_storage_kwargs=deltacat_storage_kwargs,
@@ -693,11 +692,7 @@ def _execute_compaction_round(
693
692
  telemetry_time_hb + telemetry_time_dd + telemetry_time_materialize
694
693
  )
695
694
 
696
- s3_utils.upload(
697
- compaction_audit.audit_url,
698
- str(json.dumps(compaction_audit)),
699
- **s3_client_kwargs,
700
- )
695
+ _upload_audit_data(audit_url, json.dumps(compaction_audit))
701
696
 
702
697
  new_round_completion_info = RoundCompletionInfo.of(
703
698
  last_stream_position_compacted,
@@ -710,6 +705,7 @@ def _execute_compaction_round(
710
705
  hash_bucket_count,
711
706
  None,
712
707
  CompactorVersion.V1.value,
708
+ prev_source_partition_locator=rci_source_partition_locator,
713
709
  )
714
710
 
715
711
  logger.info(
@@ -721,17 +717,43 @@ def _execute_compaction_round(
721
717
  return (
722
718
  partition,
723
719
  new_round_completion_info,
724
- rcf_source_partition_locator,
725
720
  )
726
721
 
727
722
 
728
723
  def compact_partition_from_request(
729
724
  compact_partition_params: CompactPartitionParams,
730
725
  *compact_partition_pos_args,
731
- ) -> Optional[str]:
726
+ ) -> None:
732
727
  """
733
728
  Wrapper for compact_partition that allows for the compact_partition parameters to be
734
729
  passed in as a custom dictionary-like CompactPartitionParams object along with any compact_partition positional arguments.
735
730
  :param compact_partition_params:
736
731
  """
737
- return compact_partition(*compact_partition_pos_args, **compact_partition_params)
732
+ # Extract required positional arguments
733
+ source_partition_locator = compact_partition_params.source_partition_locator
734
+ destination_partition_locator = (
735
+ compact_partition_params.destination_partition_locator
736
+ )
737
+ primary_keys = compact_partition_params.primary_keys
738
+ compaction_artifact_path = compact_partition_params.compaction_artifact_path
739
+ last_stream_position_to_compact = (
740
+ compact_partition_params.last_stream_position_to_compact
741
+ )
742
+
743
+ # Create a copy of params without the positional arguments
744
+ kwargs_params = dict(compact_partition_params)
745
+ kwargs_params.pop("source_partition_locator", None)
746
+ kwargs_params.pop("destination_partition_locator", None)
747
+ kwargs_params.pop("primary_keys", None)
748
+ kwargs_params.pop("last_stream_position_to_compact", None)
749
+ # Don't pop compaction_artifact_path as it's a computed property, not stored in the dict
750
+
751
+ compact_partition(
752
+ source_partition_locator,
753
+ destination_partition_locator,
754
+ primary_keys,
755
+ compaction_artifact_path,
756
+ last_stream_position_to_compact,
757
+ *compact_partition_pos_args,
758
+ **kwargs_params,
759
+ )
@@ -2,17 +2,19 @@ from __future__ import annotations
2
2
  import importlib
3
3
  import copy
4
4
  import json
5
- from typing import Any, Dict, List, Optional
5
+ import posixpath
6
+ from typing import Any, Dict, List, Optional, Set
6
7
  from deltacat.io.object_store import IObjectStore
7
8
  from deltacat.utils.common import ReadKwargsProvider
8
9
  from deltacat.types.media import ContentType
9
10
  from deltacat.utils.placement import PlacementGroupConfig
10
11
  from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
11
12
  from deltacat.storage import (
12
- interface as unimplemented_deltacat_storage,
13
+ metastore,
13
14
  PartitionLocator,
14
15
  SortKey,
15
16
  )
17
+ from deltacat.catalog.model.properties import CatalogProperties
16
18
  from deltacat.compute.resource_estimation import (
17
19
  ResourceEstimationMethod,
18
20
  EstimateResourcesParams,
@@ -52,11 +54,22 @@ class CompactPartitionParams(dict):
52
54
  assert (
53
55
  params.get("source_partition_locator") is not None
54
56
  ), "source_partition_locator is a required arg"
57
+ assert params.get("catalog") is not None, "catalog is a required arg"
55
58
  assert (
56
- params.get("compaction_artifact_s3_bucket") is not None
57
- ), "compaction_artifact_s3_bucket is a required arg"
59
+ params.get("all_column_names") is not None
60
+ ), "all_column_names is a required arg"
58
61
 
59
62
  result = CompactPartitionParams(params)
63
+ assert (
64
+ result.destination_partition_locator.partition_id
65
+ ), "destination_partition_locator must have a globally unique partition_id"
66
+ assert (
67
+ result.source_partition_locator.partition_id
68
+ ), "source_partition_locator must have a globally unique partition_id"
69
+ if result.rebase_source_partition_locator:
70
+ assert (
71
+ result.rebase_source_partition_locator.partition_id
72
+ ), "rebase_source_partition_locator must have a globally unique partition_id"
60
73
 
61
74
  result.records_per_compacted_file = params.get(
62
75
  "records_per_compacted_file", MAX_RECORDS_PER_COMPACTED_FILE
@@ -65,15 +78,18 @@ class CompactPartitionParams(dict):
65
78
  "compacted_file_content_type", ContentType.PARQUET
66
79
  )
67
80
  result.object_store = params.get("object_store", RayPlasmaObjectStore())
81
+ result.table_writer_kwargs = params.get("table_writer_kwargs", {})
68
82
 
69
83
  result.enable_profiler = params.get("enable_profiler", False)
70
- result.deltacat_storage = params.get(
71
- "deltacat_storage", unimplemented_deltacat_storage
72
- )
73
- result.s3_client_kwargs = params.get("s3_client_kwargs", {})
84
+ result.deltacat_storage = params.get("deltacat_storage", metastore)
85
+ result.catalog = params.get("catalog")
74
86
  result.deltacat_storage_kwargs = params.get("deltacat_storage_kwargs", {})
75
87
  result.list_deltas_kwargs = params.get("list_deltas_kwargs", {})
76
- result.s3_table_writer_kwargs = params.get("s3_table_writer_kwargs", {})
88
+ result.all_column_names = params.get("all_column_names")
89
+
90
+ # Add catalog to deltacat_storage_kwargs
91
+ result.deltacat_storage_kwargs["catalog"] = result.catalog
92
+
77
93
  result.bit_width_of_sort_keys = validate_sort_keys(
78
94
  result.source_partition_locator,
79
95
  result.sort_keys,
@@ -133,6 +149,8 @@ class CompactPartitionParams(dict):
133
149
  if result.primary_keys:
134
150
  result.primary_keys = sorted(result.primary_keys)
135
151
 
152
+ result.original_fields = params.get("original_fields")
153
+
136
154
  # assertions
137
155
  assert (
138
156
  result.source_partition_locator.partition_values
@@ -177,21 +195,32 @@ class CompactPartitionParams(dict):
177
195
  self["source_partition_locator"] = locator
178
196
 
179
197
  @property
180
- def compaction_artifact_s3_bucket(self) -> str:
181
- return self["compaction_artifact_s3_bucket"]
182
-
183
- @compaction_artifact_s3_bucket.setter
184
- def compaction_artifact_s3_bucket(self, s3_bucket: str) -> None:
185
- self["compaction_artifact_s3_bucket"] = s3_bucket
198
+ def compaction_artifact_path(self) -> str:
199
+ """
200
+ Returns the compaction artifact path based on catalog root.
201
+ """
202
+ return posixpath.join(self.catalog.root, "compute", "compactor")
186
203
 
187
204
  @property
188
- def deltacat_storage(self) -> unimplemented_deltacat_storage:
205
+ def deltacat_storage(self) -> metastore:
189
206
  return self["deltacat_storage"]
190
207
 
191
208
  @deltacat_storage.setter
192
- def deltacat_storage(self, storage: unimplemented_deltacat_storage) -> None:
209
+ def deltacat_storage(self, storage: metastore) -> None:
193
210
  self["deltacat_storage"] = storage
194
211
 
212
+ @property
213
+ def catalog(self) -> CatalogProperties:
214
+ return self["catalog"]
215
+
216
+ @catalog.setter
217
+ def catalog(self, catalog: CatalogProperties) -> None:
218
+ self["catalog"] = catalog
219
+ # Update deltacat_storage_kwargs when catalog is set
220
+ if "deltacat_storage_kwargs" not in self:
221
+ self["deltacat_storage_kwargs"] = {}
222
+ self["deltacat_storage_kwargs"]["catalog"] = catalog
223
+
195
224
  @property
196
225
  def object_store(self) -> IObjectStore:
197
226
  return self["object_store"]
@@ -286,14 +315,6 @@ class CompactPartitionParams(dict):
286
315
  def list_deltas_kwargs(self, kwargs: dict) -> None:
287
316
  self["list_deltas_kwargs"] = kwargs
288
317
 
289
- @property
290
- def s3_table_writer_kwargs(self) -> dict:
291
- return self["s3_table_writer_kwargs"]
292
-
293
- @s3_table_writer_kwargs.setter
294
- def s3_table_writer_kwargs(self, kwargs: dict) -> None:
295
- self["s3_table_writer_kwargs"] = kwargs
296
-
297
318
  @property
298
319
  def deltacat_storage_kwargs(self) -> dict:
299
320
  return self["deltacat_storage_kwargs"]
@@ -303,12 +324,12 @@ class CompactPartitionParams(dict):
303
324
  self["deltacat_storage_kwargs"] = kwargs
304
325
 
305
326
  @property
306
- def s3_client_kwargs(self) -> dict:
307
- return self["s3_client_kwargs"]
327
+ def all_column_names(self) -> List[str]:
328
+ return self.get("all_column_names")
308
329
 
309
- @s3_client_kwargs.setter
310
- def s3_client_kwargs(self, kwargs: dict) -> None:
311
- self["s3_client_kwargs"] = kwargs
330
+ @all_column_names.setter
331
+ def all_column_names(self, column_names: List[str]) -> None:
332
+ self["all_column_names"] = column_names
312
333
 
313
334
  @property
314
335
  def records_per_compacted_file(self) -> int:
@@ -489,6 +510,30 @@ class CompactPartitionParams(dict):
489
510
  average_record_size_bytes=self.average_record_size_bytes,
490
511
  )
491
512
 
513
+ @property
514
+ def table_writer_kwargs(self) -> dict:
515
+ return self["table_writer_kwargs"]
516
+
517
+ @table_writer_kwargs.setter
518
+ def table_writer_kwargs(self, kwargs: dict) -> None:
519
+ self["table_writer_kwargs"] = kwargs
520
+
521
+ @property
522
+ def expected_previous_partition_id(self) -> Optional[str]:
523
+ return self.get("expected_previous_partition_id")
524
+
525
+ @expected_previous_partition_id.setter
526
+ def expected_previous_partition_id(self, partition_id: Optional[str]) -> None:
527
+ self["expected_previous_partition_id"] = partition_id
528
+
529
+ @property
530
+ def original_fields(self) -> Optional[Set[str]]:
531
+ return self.get("original_fields")
532
+
533
+ @original_fields.setter
534
+ def original_fields(self, fields: Optional[Set[str]]) -> None:
535
+ self["original_fields"] = fields
536
+
492
537
  @staticmethod
493
538
  def json_handler_for_compact_partition_params(obj):
494
539
  """
@@ -3,6 +3,7 @@ from __future__ import annotations
3
3
  from typing import Optional
4
4
  import pyarrow as pa
5
5
  import logging
6
+ from pathlib import PosixPath
6
7
  from deltacat import logs
7
8
  from typing import List, Union
8
9
  from deltacat.compute.compactor.model.hash_bucket_result import HashBucketResult
@@ -919,3 +920,19 @@ class CompactionSessionAuditInfo(dict):
919
920
  )
920
921
 
921
922
  self.set_pyarrow_version(pa.__version__)
923
+
924
+ def to_serializable(self, catalog_root: str) -> CompactionSessionAuditInfo:
925
+ root_path = PosixPath(catalog_root)
926
+ target_path = PosixPath(self.audit_url)
927
+ if root_path == target_path:
928
+ raise ValueError(
929
+ "Target and root are identical, but expected target to be a child of root."
930
+ )
931
+ try:
932
+ relative_path = target_path.relative_to(root_path)
933
+ # Create a copy of the audit info with the relative path
934
+ audit_copy = CompactionSessionAuditInfo(**dict(self))
935
+ audit_copy["auditUrl"] = str(relative_path)
936
+ return audit_copy
937
+ except ValueError:
938
+ raise ValueError("Expected target to be a child of root.")
@@ -1,7 +1,7 @@
1
1
  # Allow classes to use self-referencing Type hints in Python 3.7.
2
2
  from __future__ import annotations
3
3
 
4
- from typing import List, Tuple, Union
4
+ from typing import Tuple, Union
5
5
  from deltacat.storage import DeltaLocator, PartitionLocator
6
6
  from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
7
7
  from typing import Any, Dict, Optional
@@ -10,7 +10,7 @@ from typing import Any, Dict, Optional
10
10
  class HighWatermark(dict):
11
11
  """
12
12
  Inherit from dict to make it easy for serialization/deserialization.
13
- Keep both partition locator and high watermark as a tuple to be persisted in the rcf
13
+ Keep both partition locator and high watermark as a tuple to be persisted in the rci
14
14
  """
15
15
 
16
16
  def set(self, partition_locator: PartitionLocator, delta_stream_position: int):
@@ -46,6 +46,7 @@ class RoundCompletionInfo(dict):
46
46
  compactor_version: Optional[str] = None,
47
47
  input_inflation: Optional[float] = None,
48
48
  input_average_record_size_bytes: Optional[float] = None,
49
+ prev_source_partition_locator: Optional[PartitionLocator] = None,
49
50
  ) -> RoundCompletionInfo:
50
51
 
51
52
  rci = RoundCompletionInfo()
@@ -63,6 +64,7 @@ class RoundCompletionInfo(dict):
63
64
  rci["compactorVersion"] = compactor_version
64
65
  rci["inputInflation"] = input_inflation
65
66
  rci["inputAverageRecordSizeBytes"] = input_average_record_size_bytes
67
+ rci["prevSourcePartitionLocator"] = prev_source_partition_locator
66
68
  return rci
67
69
 
68
70
  @property
@@ -100,7 +102,11 @@ class RoundCompletionInfo(dict):
100
102
 
101
103
  @property
102
104
  def rebase_source_partition_locator(self) -> Optional[PartitionLocator]:
103
- return self.get("rebaseSourcePartitionLocator")
105
+ val = self.get("rebaseSourcePartitionLocator")
106
+ if val is not None and not isinstance(val, PartitionLocator):
107
+ val = PartitionLocator(val)
108
+ self["rebaseSourcePartitionLocator"] = val # Cache the converted value
109
+ return val
104
110
 
105
111
  @property
106
112
  def manifest_entry_copied_by_reference_ratio(self) -> Optional[float]:
@@ -129,6 +135,10 @@ class RoundCompletionInfo(dict):
129
135
  def input_average_record_size_bytes(self) -> Optional[float]:
130
136
  return self.get("inputAverageRecordSizeBytes")
131
137
 
132
- @staticmethod
133
- def get_audit_bucket_name_and_key(compaction_audit_url: str) -> List[str]:
134
- return compaction_audit_url.replace("s3://", "").split("/", 1)
138
+ @property
139
+ def prev_source_partition_locator(self) -> Optional[PartitionLocator]:
140
+ val = self.get("prevSourcePartitionLocator")
141
+ if val is not None and not isinstance(val, PartitionLocator):
142
+ val = PartitionLocator(val)
143
+ self["prevSourcePartitionLocator"] = val # Cache the converted value
144
+ return val