deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (298) hide show
  1. deltacat/__init__.py +96 -17
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +0 -18
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2435 -279
  12. deltacat/catalog/model/catalog.py +154 -77
  13. deltacat/catalog/model/properties.py +63 -22
  14. deltacat/compute/compactor/compaction_session.py +97 -75
  15. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  16. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  17. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  18. deltacat/compute/compactor/repartition_session.py +8 -21
  19. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  20. deltacat/compute/compactor/steps/materialize.py +9 -7
  21. deltacat/compute/compactor/steps/repartition.py +12 -11
  22. deltacat/compute/compactor/utils/io.py +6 -5
  23. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  24. deltacat/compute/compactor/utils/system_columns.py +3 -1
  25. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  26. deltacat/compute/compactor_v2/constants.py +30 -1
  27. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  28. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  29. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  30. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  31. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  32. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  33. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  34. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  35. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  36. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  37. deltacat/compute/compactor_v2/utils/io.py +11 -4
  38. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  39. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  40. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  41. deltacat/compute/converter/converter_session.py +145 -32
  42. deltacat/compute/converter/model/convert_input.py +26 -19
  43. deltacat/compute/converter/model/convert_input_files.py +33 -16
  44. deltacat/compute/converter/model/convert_result.py +35 -16
  45. deltacat/compute/converter/model/converter_session_params.py +24 -21
  46. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  47. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  48. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  49. deltacat/compute/converter/steps/convert.py +157 -50
  50. deltacat/compute/converter/steps/dedupe.py +24 -11
  51. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  52. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  53. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  54. deltacat/compute/converter/utils/io.py +101 -12
  55. deltacat/compute/converter/utils/s3u.py +33 -27
  56. deltacat/compute/janitor.py +205 -0
  57. deltacat/compute/jobs/client.py +25 -12
  58. deltacat/compute/resource_estimation/delta.py +38 -6
  59. deltacat/compute/resource_estimation/model.py +8 -0
  60. deltacat/constants.py +45 -2
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/env.py +10 -0
  64. deltacat/examples/basic_logging.py +1 -3
  65. deltacat/examples/compactor/aws/__init__.py +1 -0
  66. deltacat/examples/compactor/bootstrap.py +863 -0
  67. deltacat/examples/compactor/compactor.py +373 -0
  68. deltacat/examples/compactor/explorer.py +473 -0
  69. deltacat/examples/compactor/gcp/__init__.py +1 -0
  70. deltacat/examples/compactor/job_runner.py +439 -0
  71. deltacat/examples/compactor/utils/__init__.py +1 -0
  72. deltacat/examples/compactor/utils/common.py +261 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  79. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  80. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  81. deltacat/examples/indexer/indexer.py +2 -2
  82. deltacat/examples/indexer/job_runner.py +1 -2
  83. deltacat/exceptions.py +66 -4
  84. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  85. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  86. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
  87. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  88. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  89. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  90. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  91. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  92. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  93. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  94. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  95. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  96. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  97. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  98. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  99. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  100. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  101. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  102. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  103. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  104. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  105. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  107. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  108. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  109. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  110. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  111. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  112. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  113. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  114. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  115. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  116. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  117. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  118. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  119. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  120. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  121. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  122. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  123. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  124. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  125. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  126. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  127. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  128. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  129. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  130. deltacat/io/datasource/deltacat_datasource.py +0 -1
  131. deltacat/io/reader/deltacat_read_api.py +1 -1
  132. deltacat/storage/__init__.py +20 -2
  133. deltacat/storage/interface.py +54 -32
  134. deltacat/storage/main/impl.py +1494 -541
  135. deltacat/storage/model/delta.py +27 -3
  136. deltacat/storage/model/locator.py +6 -12
  137. deltacat/storage/model/manifest.py +182 -6
  138. deltacat/storage/model/metafile.py +151 -78
  139. deltacat/storage/model/namespace.py +8 -1
  140. deltacat/storage/model/partition.py +117 -42
  141. deltacat/storage/model/schema.py +2427 -159
  142. deltacat/storage/model/shard.py +6 -2
  143. deltacat/storage/model/sort_key.py +40 -0
  144. deltacat/storage/model/stream.py +9 -2
  145. deltacat/storage/model/table.py +12 -1
  146. deltacat/storage/model/table_version.py +11 -0
  147. deltacat/storage/model/transaction.py +1184 -208
  148. deltacat/storage/model/transform.py +81 -2
  149. deltacat/storage/model/types.py +48 -26
  150. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  151. deltacat/tests/aws/test_s3u.py +2 -31
  152. deltacat/tests/catalog/data/__init__.py +0 -0
  153. deltacat/tests/catalog/main/__init__.py +0 -0
  154. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  155. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  156. deltacat/tests/catalog/model/__init__.py +0 -0
  157. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  158. deltacat/tests/catalog/test_catalogs.py +103 -106
  159. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  160. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  161. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  162. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  163. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  164. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  165. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  166. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  167. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  168. deltacat/tests/compute/conftest.py +8 -44
  169. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  170. deltacat/tests/compute/converter/utils.py +15 -6
  171. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  172. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  173. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  174. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  175. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  176. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  177. deltacat/tests/compute/test_janitor.py +236 -0
  178. deltacat/tests/compute/test_util_common.py +716 -43
  179. deltacat/tests/compute/test_util_constant.py +0 -1
  180. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  181. deltacat/tests/daft/__init__.py +0 -0
  182. deltacat/tests/daft/test_model.py +97 -0
  183. deltacat/tests/experimental/__init__.py +1 -0
  184. deltacat/tests/experimental/catalog/__init__.py +0 -0
  185. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  186. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  187. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  188. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  189. deltacat/tests/experimental/daft/__init__.py +0 -0
  190. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  191. deltacat/tests/experimental/storage/__init__.py +0 -0
  192. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  193. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  194. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  195. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
  196. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  197. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  198. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  199. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  200. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  201. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  202. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  203. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  204. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
  205. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  206. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  207. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  208. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  209. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  210. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  211. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  212. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  213. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  214. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  215. deltacat/tests/storage/model/test_schema.py +171 -0
  216. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  217. deltacat/tests/storage/model/test_shard.py +3 -1
  218. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  219. deltacat/tests/storage/model/test_transaction.py +393 -48
  220. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  221. deltacat/tests/test_deltacat_api.py +988 -4
  222. deltacat/tests/test_exceptions.py +9 -5
  223. deltacat/tests/test_utils/pyarrow.py +52 -21
  224. deltacat/tests/test_utils/storage.py +23 -34
  225. deltacat/tests/types/__init__.py +0 -0
  226. deltacat/tests/types/test_tables.py +104 -0
  227. deltacat/tests/utils/exceptions.py +22 -0
  228. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  229. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  230. deltacat/tests/utils/test_daft.py +121 -31
  231. deltacat/tests/utils/test_numpy.py +1193 -0
  232. deltacat/tests/utils/test_pandas.py +1106 -0
  233. deltacat/tests/utils/test_polars.py +1040 -0
  234. deltacat/tests/utils/test_pyarrow.py +1370 -89
  235. deltacat/types/media.py +224 -14
  236. deltacat/types/tables.py +2329 -59
  237. deltacat/utils/arguments.py +33 -1
  238. deltacat/utils/daft.py +823 -36
  239. deltacat/utils/export.py +3 -1
  240. deltacat/utils/filesystem.py +100 -0
  241. deltacat/utils/metafile_locator.py +2 -1
  242. deltacat/utils/numpy.py +118 -26
  243. deltacat/utils/pandas.py +577 -48
  244. deltacat/utils/polars.py +658 -27
  245. deltacat/utils/pyarrow.py +1258 -213
  246. deltacat/utils/ray_utils/dataset.py +101 -10
  247. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  248. deltacat/utils/url.py +57 -16
  249. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  250. deltacat-2.0.0b12.dist-info/RECORD +439 -0
  251. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  252. deltacat/catalog/iceberg/__init__.py +0 -4
  253. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  254. deltacat/compute/merge_on_read/__init__.py +0 -4
  255. deltacat/compute/merge_on_read/daft.py +0 -40
  256. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  257. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  258. deltacat/daft/daft_scan.py +0 -115
  259. deltacat/daft/model.py +0 -258
  260. deltacat/daft/translator.py +0 -126
  261. deltacat/examples/common/fixtures.py +0 -15
  262. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  263. deltacat/storage/rivulet/__init__.py +0 -11
  264. deltacat/storage/rivulet/feather/__init__.py +0 -5
  265. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  266. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  267. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  268. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  269. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  270. deltacat/utils/s3fs.py +0 -21
  271. deltacat-2.0.0b10.dist-info/METADATA +0 -68
  272. deltacat-2.0.0b10.dist-info/RECORD +0 -381
  273. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  274. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  275. /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
  276. /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
  277. /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
  278. /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
  279. /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
  280. /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
  281. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  282. /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
  283. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  284. /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
  285. /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
  286. /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
  287. /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
  288. /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
  289. /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  290. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
  291. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  292. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  293. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  294. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  295. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  296. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  297. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  298. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -16,7 +16,7 @@ from deltacat.compute.compactor_v2.utils.primary_key_index import (
16
16
  hash_group_index_to_hash_bucket_indices,
17
17
  )
18
18
 
19
- from deltacat.storage import interface as unimplemented_deltacat_storage
19
+ from deltacat.storage import metastore
20
20
 
21
21
  from deltacat.io.object_store import IObjectStore
22
22
 
@@ -87,11 +87,13 @@ class LocalMergeFileGroupsProvider(MergeFileGroupsProvider):
87
87
  def __init__(
88
88
  self,
89
89
  uniform_deltas: List[DeltaAnnotated],
90
+ all_column_names: List[str],
90
91
  read_kwargs_provider: Optional[ReadKwargsProvider],
91
- deltacat_storage=unimplemented_deltacat_storage,
92
+ deltacat_storage=metastore,
92
93
  deltacat_storage_kwargs: Optional[dict] = None,
93
94
  ):
94
95
  self._deltas = uniform_deltas
96
+ self._all_column_names = all_column_names
95
97
  self._read_kwargs_provider = read_kwargs_provider
96
98
  self._deltacat_storage = deltacat_storage
97
99
  self._deltacat_storage_kwargs = deltacat_storage_kwargs
@@ -110,6 +112,7 @@ class LocalMergeFileGroupsProvider(MergeFileGroupsProvider):
110
112
  total_size_bytes,
111
113
  ) = read_delta_file_envelopes(
112
114
  annotated_delta,
115
+ self._all_column_names,
113
116
  self._read_kwargs_provider,
114
117
  self._deltacat_storage,
115
118
  self._deltacat_storage_kwargs,
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Dict, List, Optional, Any
3
+ from typing import Dict, List, Optional, Any, Set
4
4
 
5
5
  from deltacat.compute.compactor_v2.model.merge_file_group import (
6
6
  MergeFileGroupsProvider,
@@ -12,9 +12,10 @@ from deltacat.utils.metrics import MetricsConfig
12
12
  from deltacat.utils.common import ReadKwargsProvider
13
13
  from deltacat.io.object_store import IObjectStore
14
14
  from deltacat.storage import (
15
+ Manifest,
15
16
  Partition,
16
17
  SortKey,
17
- interface as unimplemented_deltacat_storage,
18
+ metastore,
18
19
  )
19
20
  from deltacat.compute.compactor_v2.constants import (
20
21
  DROP_DUPLICATES,
@@ -32,22 +33,26 @@ class MergeInput(Dict):
32
33
  write_to_partition: Partition,
33
34
  compacted_file_content_type: ContentType,
34
35
  primary_keys: List[str],
36
+ all_column_names: List[str],
35
37
  drop_duplicates: Optional[bool] = DROP_DUPLICATES,
36
38
  sort_keys: Optional[List[SortKey]] = None,
37
39
  merge_task_index: Optional[int] = 0,
38
40
  max_records_per_output_file: Optional[int] = MAX_RECORDS_PER_COMPACTED_FILE,
39
41
  enable_profiler: Optional[bool] = False,
40
42
  metrics_config: Optional[MetricsConfig] = None,
41
- s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
43
+ table_writer_kwargs: Optional[Dict[str, Any]] = None,
42
44
  read_kwargs_provider: Optional[ReadKwargsProvider] = None,
43
45
  round_completion_info: Optional[RoundCompletionInfo] = None,
44
46
  object_store: Optional[IObjectStore] = None,
45
47
  delete_strategy: Optional[DeleteStrategy] = None,
46
48
  delete_file_envelopes: Optional[List[DeleteFileEnvelope]] = None,
47
- deltacat_storage=unimplemented_deltacat_storage,
49
+ deltacat_storage=metastore,
48
50
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
49
51
  memory_logs_enabled: Optional[bool] = None,
50
52
  disable_copy_by_reference: Optional[bool] = None,
53
+ hash_bucket_count: Optional[int] = None,
54
+ original_fields: Optional[Set[str]] = None,
55
+ compacted_manifest: Optional[Manifest] = None,
51
56
  ) -> MergeInput:
52
57
 
53
58
  result = MergeInput()
@@ -55,13 +60,14 @@ class MergeInput(Dict):
55
60
  result["write_to_partition"] = write_to_partition
56
61
  result["compacted_file_content_type"] = compacted_file_content_type
57
62
  result["primary_keys"] = primary_keys
63
+ result["all_column_names"] = all_column_names
58
64
  result["drop_duplicates"] = drop_duplicates
59
65
  result["sort_keys"] = sort_keys
60
66
  result["merge_task_index"] = merge_task_index
61
67
  result["max_records_per_output_file"] = max_records_per_output_file
62
68
  result["enable_profiler"] = enable_profiler
63
69
  result["metrics_config"] = metrics_config
64
- result["s3_table_writer_kwargs"] = s3_table_writer_kwargs or {}
70
+ result["table_writer_kwargs"] = table_writer_kwargs or {}
65
71
  result["read_kwargs_provider"] = read_kwargs_provider
66
72
  result["round_completion_info"] = round_completion_info
67
73
  result["object_store"] = object_store
@@ -71,6 +77,9 @@ class MergeInput(Dict):
71
77
  result["deltacat_storage_kwargs"] = deltacat_storage_kwargs or {}
72
78
  result["memory_logs_enabled"] = memory_logs_enabled
73
79
  result["disable_copy_by_reference"] = disable_copy_by_reference
80
+ result["hash_bucket_count"] = hash_bucket_count
81
+ result["original_fields"] = original_fields
82
+ result["compacted_manifest"] = compacted_manifest
74
83
  return result
75
84
 
76
85
  @property
@@ -89,6 +98,10 @@ class MergeInput(Dict):
89
98
  def primary_keys(self) -> List[str]:
90
99
  return self["primary_keys"]
91
100
 
101
+ @property
102
+ def all_column_names(self) -> List[str]:
103
+ return self["all_column_names"]
104
+
92
105
  @property
93
106
  def drop_duplicates(self) -> int:
94
107
  return self["drop_duplicates"]
@@ -114,8 +127,8 @@ class MergeInput(Dict):
114
127
  return self.get("metrics_config")
115
128
 
116
129
  @property
117
- def s3_table_writer_kwargs(self) -> Optional[Dict[str, Any]]:
118
- return self.get("s3_table_writer_kwargs")
130
+ def table_writer_kwargs(self) -> Optional[Dict[str, Any]]:
131
+ return self.get("table_writer_kwargs")
119
132
 
120
133
  @property
121
134
  def read_kwargs_provider(self) -> Optional[ReadKwargsProvider]:
@@ -130,7 +143,7 @@ class MergeInput(Dict):
130
143
  return self.get("object_store")
131
144
 
132
145
  @property
133
- def deltacat_storage(self) -> unimplemented_deltacat_storage:
146
+ def deltacat_storage(self) -> metastore:
134
147
  return self["deltacat_storage"]
135
148
 
136
149
  @property
@@ -154,3 +167,15 @@ class MergeInput(Dict):
154
167
  @property
155
168
  def disable_copy_by_reference(self) -> bool:
156
169
  return self["disable_copy_by_reference"]
170
+
171
+ @property
172
+ def hash_bucket_count(self) -> int:
173
+ return self["hash_bucket_count"]
174
+
175
+ @property
176
+ def original_fields(self) -> Optional[Set[str]]:
177
+ return self.get("original_fields")
178
+
179
+ @property
180
+ def compacted_manifest(self) -> Optional[Manifest]:
181
+ return self.get("compacted_manifest")
@@ -5,13 +5,15 @@ import ray
5
5
  import time
6
6
  import json
7
7
  from math import ceil
8
+ from urllib.parse import urlparse
9
+ import pyarrow
8
10
 
9
11
  from deltacat.compute.compactor import (
10
12
  PyArrowWriteResult,
11
13
  HighWatermark,
12
14
  RoundCompletionInfo,
13
15
  )
14
- from deltacat.aws import s3u as s3_utils
16
+ from deltacat.utils.filesystem import resolve_path_and_filesystem
15
17
  from deltacat.compute.compactor.model.compactor_version import CompactorVersion
16
18
  from deltacat.compute.compactor_v2.model.evaluate_compaction_result import (
17
19
  ExecutionCompactionResult,
@@ -32,7 +34,7 @@ from deltacat.compute.compactor_v2.utils.merge import (
32
34
  from deltacat.compute.compactor_v2.utils.task_options import (
33
35
  hash_bucket_resource_options_provider,
34
36
  )
35
- from deltacat.compute.compactor.utils import round_completion_file as rcf
37
+ from deltacat.compute.compactor.utils import round_completion_reader as rci
36
38
  from deltacat.compute.compactor import DeltaAnnotated
37
39
  from deltacat.compute.compactor_v2.utils.delta import contains_delete_deltas
38
40
  from deltacat.compute.compactor_v2.deletes.delete_strategy import (
@@ -48,6 +50,7 @@ from deltacat.storage import (
48
50
  DeltaType,
49
51
  DeltaLocator,
50
52
  Partition,
53
+ PartitionLocator,
51
54
  Manifest,
52
55
  Stream,
53
56
  StreamLocator,
@@ -77,6 +80,24 @@ from deltacat.compute.compactor_v2.utils.task_options import (
77
80
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
78
81
 
79
82
 
83
+ def _get_rci_source_partition_locator(
84
+ params: CompactPartitionParams,
85
+ ) -> PartitionLocator:
86
+ return params.rebase_source_partition_locator or params.source_partition_locator
87
+
88
+
89
+ def _is_inplace_compacted(
90
+ rci_source_partition_locator: PartitionLocator,
91
+ destination_partition_locator: PartitionLocator,
92
+ ) -> bool:
93
+ return (
94
+ rci_source_partition_locator.partition_values
95
+ == destination_partition_locator.partition_values
96
+ and rci_source_partition_locator.stream_id
97
+ == destination_partition_locator.stream_id
98
+ )
99
+
100
+
80
101
  def _fetch_compaction_metadata(
81
102
  params: CompactPartitionParams,
82
103
  ) -> tuple[Optional[Manifest], Optional[RoundCompletionInfo]]:
@@ -87,11 +108,11 @@ def _fetch_compaction_metadata(
87
108
  previous_compacted_delta_manifest: Optional[Manifest] = None
88
109
 
89
110
  if not params.rebase_source_partition_locator:
90
- round_completion_info = rcf.read_round_completion_file(
91
- params.compaction_artifact_s3_bucket,
92
- params.source_partition_locator,
93
- params.destination_partition_locator,
94
- **params.s3_client_kwargs,
111
+ round_completion_info = rci.read_round_completion_info(
112
+ source_partition_locator=params.source_partition_locator,
113
+ destination_partition_locator=params.destination_partition_locator,
114
+ deltacat_storage=params.deltacat_storage,
115
+ deltacat_storage_kwargs=params.deltacat_storage_kwargs,
95
116
  )
96
117
  if not round_completion_info:
97
118
  logger.info(
@@ -111,10 +132,10 @@ def _fetch_compaction_metadata(
111
132
  assert (
112
133
  params.hash_bucket_count == round_completion_info.hash_bucket_count
113
134
  ), (
114
- "The hash bucket count has changed. "
115
- "Kindly run rebase compaction and trigger incremental again. "
116
- f"Hash Bucket count in RCF={round_completion_info.hash_bucket_count} "
117
- f"not equal to Hash bucket count in args={params.hash_bucket_count}."
135
+ "Partition hash bucket count for compaction has changed. "
136
+ "Rebase compaction with the desired hash bucket count before running another incremental compaction. "
137
+ f"Hash bucket count in RCI={round_completion_info.hash_bucket_count} "
138
+ f"!= hash bucket count in params={params.hash_bucket_count}."
118
139
  )
119
140
 
120
141
  logger.info(f"Round completion file: {round_completion_info}")
@@ -149,6 +170,7 @@ def _build_uniform_deltas(
149
170
  hash_bucket_count=params.hash_bucket_count,
150
171
  compaction_audit=mutable_compaction_audit,
151
172
  compact_partition_params=params,
173
+ all_column_names=params.all_column_names,
152
174
  deltacat_storage=params.deltacat_storage,
153
175
  deltacat_storage_kwargs=params.deltacat_storage_kwargs,
154
176
  )
@@ -159,10 +181,9 @@ def _build_uniform_deltas(
159
181
  delta_discovery_end - delta_discovery_start
160
182
  )
161
183
 
162
- s3_utils.upload(
163
- mutable_compaction_audit.audit_url,
164
- str(json.dumps(mutable_compaction_audit)),
165
- **params.s3_client_kwargs,
184
+ _upload_audit_data(
185
+ params,
186
+ mutable_compaction_audit,
166
187
  )
167
188
 
168
189
  return (
@@ -267,10 +288,9 @@ def _run_hash_and_merge(
267
288
  hb_end - hb_start,
268
289
  )
269
290
 
270
- s3_utils.upload(
271
- mutable_compaction_audit.audit_url,
272
- str(json.dumps(mutable_compaction_audit)),
273
- **params.s3_client_kwargs,
291
+ _upload_audit_data(
292
+ params,
293
+ mutable_compaction_audit,
274
294
  )
275
295
 
276
296
  hb_data_processed_size_bytes = np.int64(0)
@@ -402,13 +422,24 @@ def _merge(
402
422
  round_completion_info=round_completion_info,
403
423
  compacted_delta_manifest=previous_compacted_delta_manifest,
404
424
  primary_keys=params.primary_keys,
405
- deltacat_storage=params.deltacat_storage,
406
- deltacat_storage_kwargs=params.deltacat_storage_kwargs,
407
425
  ray_custom_resources=params.ray_custom_resources,
408
426
  memory_logs_enabled=params.memory_logs_enabled,
409
427
  estimate_resources_params=params.estimate_resources_params,
410
428
  )
411
429
 
430
+ # set previous compacted delta manifest on input so that we don't need a transaction to retrieve it
431
+ if round_completion_info:
432
+ previous_compacted_delta_manifest = params.deltacat_storage.get_delta_manifest(
433
+ round_completion_info.compacted_delta_locator,
434
+ **params.deltacat_storage_kwargs,
435
+ )
436
+
437
+ # create a copy of deltacat storage kwargs without any parent transaction context
438
+ # (can't be serialized by Ray, and we're only downloading already-resolved manifest entries)
439
+ deltacat_storage_kwargs_copy = {
440
+ k: v for k, v in params.deltacat_storage_kwargs.items() if k != "transaction"
441
+ }
442
+
412
443
  def merge_input_provider(index, item) -> dict[str, MergeInput]:
413
444
  return {
414
445
  "input": MergeInput.of(
@@ -422,22 +453,26 @@ def _merge(
422
453
  write_to_partition=compacted_partition,
423
454
  compacted_file_content_type=params.compacted_file_content_type,
424
455
  primary_keys=params.primary_keys,
456
+ all_column_names=params.all_column_names,
425
457
  sort_keys=params.sort_keys,
426
458
  merge_task_index=index,
427
459
  drop_duplicates=params.drop_duplicates,
428
460
  max_records_per_output_file=params.records_per_compacted_file,
429
461
  enable_profiler=params.enable_profiler,
430
462
  metrics_config=params.metrics_config,
431
- s3_table_writer_kwargs=params.s3_table_writer_kwargs,
463
+ table_writer_kwargs=params.table_writer_kwargs,
432
464
  read_kwargs_provider=params.read_kwargs_provider,
433
465
  round_completion_info=round_completion_info,
434
466
  object_store=params.object_store,
435
467
  deltacat_storage=params.deltacat_storage,
436
- deltacat_storage_kwargs=params.deltacat_storage_kwargs,
468
+ deltacat_storage_kwargs=deltacat_storage_kwargs_copy,
437
469
  delete_strategy=delete_strategy,
438
470
  delete_file_envelopes=delete_file_envelopes,
439
471
  memory_logs_enabled=params.memory_logs_enabled,
440
472
  disable_copy_by_reference=params.disable_copy_by_reference,
473
+ hash_bucket_count=params.hash_bucket_count,
474
+ original_fields=params.original_fields,
475
+ compacted_manifest=previous_compacted_delta_manifest,
441
476
  )
442
477
  }
443
478
 
@@ -473,6 +508,12 @@ def _hash_bucket(
473
508
  estimate_resources_params=params.estimate_resources_params,
474
509
  )
475
510
 
511
+ # create a copy of deltacat storage kwargs without any parent transaction context
512
+ # (can't be serialized by Ray, and we're only downloading already-resolved manifest entries)
513
+ deltacat_storage_kwargs_copy = {
514
+ k: v for k, v in params.deltacat_storage_kwargs.items() if k != "transaction"
515
+ }
516
+
476
517
  def hash_bucket_input_provider(index, item) -> dict[str, HashBucketInput]:
477
518
  return {
478
519
  "input": HashBucketInput.of(
@@ -481,12 +522,13 @@ def _hash_bucket(
481
522
  hb_task_index=index,
482
523
  num_hash_buckets=params.hash_bucket_count,
483
524
  num_hash_groups=params.hash_group_count,
525
+ all_column_names=params.all_column_names,
484
526
  enable_profiler=params.enable_profiler,
485
527
  metrics_config=params.metrics_config,
486
528
  read_kwargs_provider=params.read_kwargs_provider,
487
529
  object_store=params.object_store,
488
530
  deltacat_storage=params.deltacat_storage,
489
- deltacat_storage_kwargs=params.deltacat_storage_kwargs,
531
+ deltacat_storage_kwargs=deltacat_storage_kwargs_copy,
490
532
  memory_logs_enabled=params.memory_logs_enabled,
491
533
  )
492
534
  }
@@ -595,10 +637,9 @@ def _process_merge_results(
595
637
  file_index += mat_result.pyarrow_write_result.files
596
638
  previous_task_index = mat_result.task_index
597
639
 
598
- s3_utils.upload(
599
- mutable_compaction_audit.audit_url,
600
- str(json.dumps(mutable_compaction_audit)),
601
- **params.s3_client_kwargs,
640
+ _upload_audit_data(
641
+ params,
642
+ mutable_compaction_audit,
602
643
  )
603
644
  deltas: List[Delta] = [m.delta for m in mat_results]
604
645
  # Note: An appropriate last stream position must be set
@@ -633,21 +674,20 @@ def _update_and_upload_compaction_audit(
633
674
  + round_completion_info.compacted_pyarrow_write_result.records
634
675
  )
635
676
 
636
- s3_utils.upload(
637
- mutable_compaction_audit.audit_url,
638
- str(json.dumps(mutable_compaction_audit)),
639
- **params.s3_client_kwargs,
677
+ _upload_audit_data(
678
+ params,
679
+ mutable_compaction_audit,
640
680
  )
641
681
  return
642
682
 
643
683
 
644
- def _write_new_round_completion_file(
684
+ def _create_round_completion_info(
645
685
  params: CompactPartitionParams,
646
686
  mutable_compaction_audit: CompactionSessionAuditInfo,
647
687
  compacted_partition: Partition,
648
688
  audit_url: str,
649
689
  hb_id_to_entry_indices_range: dict,
650
- rcf_source_partition_locator: rcf.PartitionLocator,
690
+ rci_source_partition_locator: PartitionLocator,
651
691
  new_compacted_delta_locator: DeltaLocator,
652
692
  pyarrow_write_result: PyArrowWriteResult,
653
693
  prev_round_completion_info: Optional[RoundCompletionInfo] = None,
@@ -689,6 +729,27 @@ def _write_new_round_completion_file(
689
729
  prev_round_completion_info,
690
730
  )
691
731
 
732
+ # Check if this is an in-place compaction before creating RoundCompletionInfo
733
+ logger.info(
734
+ f"Checking if partition {rci_source_partition_locator} is inplace compacted against {params.destination_partition_locator}..."
735
+ )
736
+ is_inplace_compacted: bool = _is_inplace_compacted(
737
+ rci_source_partition_locator, params.destination_partition_locator
738
+ )
739
+
740
+ # Determine the prev_source_partition_locator based on compaction type
741
+ if is_inplace_compacted:
742
+ logger.info(
743
+ "In-place compaction detected. Using compacted partition locator as prev_source_partition_locator. "
744
+ + f"Got compacted partition partition_id of {compacted_partition.locator.partition_id} "
745
+ f"and rci source partition_id of {rci_source_partition_locator.partition_id}."
746
+ )
747
+ prev_source_partition_locator = compacted_partition.locator
748
+ # Update rci_source_partition_locator for backward compatibility
749
+ rci_source_partition_locator = compacted_partition.locator
750
+ else:
751
+ prev_source_partition_locator = rci_source_partition_locator
752
+
692
753
  new_round_completion_info = RoundCompletionInfo.of(
693
754
  high_watermark=params.last_stream_position_to_compact,
694
755
  compacted_delta_locator=new_compacted_delta_locator,
@@ -701,41 +762,17 @@ def _write_new_round_completion_file(
701
762
  compactor_version=CompactorVersion.V2.value,
702
763
  input_inflation=input_inflation,
703
764
  input_average_record_size_bytes=input_average_record_size_bytes,
765
+ prev_source_partition_locator=prev_source_partition_locator,
704
766
  )
705
767
 
706
768
  logger.info(
707
769
  f"Partition-{params.source_partition_locator.partition_values},"
708
770
  f"compacted at: {params.last_stream_position_to_compact},"
709
771
  )
710
- logger.info(
711
- f"Checking if partition {rcf_source_partition_locator} is inplace compacted against {params.destination_partition_locator}..."
712
- )
713
- is_inplace_compacted: bool = (
714
- rcf_source_partition_locator.partition_values
715
- == params.destination_partition_locator.partition_values
716
- and rcf_source_partition_locator.stream_id
717
- == params.destination_partition_locator.stream_id
718
- )
719
- if is_inplace_compacted:
720
- logger.info(
721
- "Overriding round completion file source partition locator as in-place compacted. "
722
- + f"Got compacted partition partition_id of {compacted_partition.locator.partition_id} "
723
- f"and rcf source partition_id of {rcf_source_partition_locator.partition_id}."
724
- )
725
- rcf_source_partition_locator = compacted_partition.locator
726
-
727
- round_completion_file_s3_url = rcf.write_round_completion_file(
728
- params.compaction_artifact_s3_bucket,
729
- rcf_source_partition_locator,
730
- compacted_partition.locator,
731
- new_round_completion_info,
732
- **params.s3_client_kwargs,
733
- )
734
772
 
735
773
  return ExecutionCompactionResult(
736
774
  compacted_partition,
737
775
  new_round_completion_info,
738
- round_completion_file_s3_url,
739
776
  is_inplace_compacted,
740
777
  )
741
778
 
@@ -751,21 +788,29 @@ def _commit_compaction_result(
751
788
  f"Partition-{params.source_partition_locator} -> "
752
789
  f"{compaction_session_type} Compaction session data processing completed"
753
790
  )
791
+ # TODO(pdames): Uncomment this once we support concurrent writes to the same
792
+ # partition (via write_to_table). This requires updating the commit_partition
793
+ # method to support previous partition as input. Right now, a concurrent write
794
+ # to the same partition will cause the commit_partition method to fail.
754
795
  if execute_compaction_result.new_compacted_partition:
755
796
  previous_partition: Optional[Partition] = None
756
- if execute_compaction_result.is_inplace_compacted:
757
- previous_partition: Optional[
758
- Partition
759
- ] = params.deltacat_storage.get_partition(
760
- params.source_partition_locator.stream_locator,
761
- params.source_partition_locator.partition_values,
762
- **params.deltacat_storage_kwargs,
763
- )
764
- # NOTE: Retrieving the previous partition again as the partition_id may have changed by the time commit_partition is called.
797
+ # if execute_compaction_result.is_inplace_compacted:
798
+ # previous_partition: Optional[
799
+ # Partition
800
+ # ] = params.deltacat_storage.get_partition(
801
+ # params.source_partition_locator.stream_locator,
802
+ # params.source_partition_locator.partition_values,
803
+ # **params.deltacat_storage_kwargs,
804
+ # )
805
+ # # NOTE: Retrieving the previous partition again as the partition_id may have changed by the time commit_partition is called.
765
806
  logger.info(
766
807
  f"Committing compacted partition to: {execute_compaction_result.new_compacted_partition.locator} "
767
808
  f"using previous partition: {previous_partition.locator if previous_partition else None}"
768
809
  )
810
+ # Set the round completion info on the partition before committing
811
+ execute_compaction_result.new_compacted_partition.compaction_round_completion_info = (
812
+ execute_compaction_result.new_round_completion_info
813
+ )
769
814
  committed_partition: Partition = params.deltacat_storage.commit_partition(
770
815
  execute_compaction_result.new_compacted_partition,
771
816
  previous_partition,
@@ -776,3 +821,57 @@ def _commit_compaction_result(
776
821
  logger.warning("No new partition was committed during compaction.")
777
822
 
778
823
  logger.info(f"Completed compaction session for: {params.source_partition_locator}")
824
+
825
+
826
def _upload_audit_data(
    params: CompactPartitionParams,
    audit_info: CompactionSessionAuditInfo,
) -> None:
    """
    Serialize the compaction audit info to JSON and write it to its audit URL.

    The catalog's filesystem is used when the catalog provides one; otherwise
    the path and filesystem are both resolved from the audit URL itself.

    Args:
        params: Compaction parameters. ``params.catalog`` (may be falsy)
            supplies the root passed to ``audit_info.to_serializable`` and,
            when set, the filesystem to write through.
        audit_info: Audit info to serialize; its ``audit_url`` is the
            write destination.
    """
    # Function-scope import kept from the original; the file's top-level
    # imports are not guaranteed to include `os`.
    import os

    audit_url = audit_info.audit_url
    catalog = params.catalog
    # Guard the catalog dereference: the original read `params.catalog.root`
    # unconditionally, which raised AttributeError for a missing catalog and
    # made the URL-resolved fallback below unreachable in that case.
    # NOTE(review): assumes to_serializable() tolerates a root of None when no
    # catalog is configured -- confirm against CompactionSessionAuditInfo.
    catalog_root = catalog.root if catalog else None
    audit_data = json.dumps(audit_info.to_serializable(catalog_root))

    if catalog and catalog.filesystem:
        # Use the filesystem from catalog properties.
        filesystem = catalog.filesystem
        parsed_url = urlparse(audit_url)
        # For URLs with a scheme only the path component is kept (the netloc
        # is dropped); scheme-less values are used as-is.
        # NOTE(review): presumably the catalog filesystem is already rooted so
        # that the bare path resolves correctly -- verify for bucket-style URLs.
        path = parsed_url.path if parsed_url.scheme else audit_url
    else:
        # Fallback: resolve both the path and the filesystem from the URL.
        path, filesystem = resolve_path_and_filesystem(audit_url)

    # Ensure the parent directory exists before opening the output stream.
    parent_dir = os.path.dirname(path)
    if (
        parent_dir
        and filesystem.get_file_info(parent_dir).type
        != pyarrow.fs.FileType.Directory
    ):
        try:
            filesystem.create_dir(parent_dir, recursive=True)
        except Exception as e:
            # Deliberately best-effort: log and continue, so that any real
            # failure surfaces from the write itself.
            logger.warning(f"Failed to create directory {parent_dir}: {e}")

    with filesystem.open_output_stream(path) as output_stream:
        output_stream.write(audit_data.encode("utf-8"))
@@ -18,7 +18,7 @@ from deltacat.compute.compactor_v2.utils.primary_key_index import (
18
18
  group_hash_bucket_indices,
19
19
  group_by_pk_hash_bucket,
20
20
  )
21
- from deltacat.storage import interface as unimplemented_deltacat_storage
21
+ from deltacat.storage import metastore
22
22
  from deltacat.utils.ray_utils.runtime import (
23
23
  get_current_ray_task_id,
24
24
  get_current_ray_worker_id,
@@ -50,8 +50,9 @@ def _group_file_records_by_pk_hash_bucket(
50
50
  annotated_delta: DeltaAnnotated,
51
51
  num_hash_buckets: int,
52
52
  primary_keys: List[str],
53
+ all_column_names: List[str],
53
54
  read_kwargs_provider: Optional[ReadKwargsProvider] = None,
54
- deltacat_storage=unimplemented_deltacat_storage,
55
+ deltacat_storage=metastore,
55
56
  deltacat_storage_kwargs: Optional[dict] = None,
56
57
  ) -> Tuple[Optional[DeltaFileEnvelopeGroups], int, int]:
57
58
  # read input parquet s3 objects into a list of delta file envelopes
@@ -61,6 +62,7 @@ def _group_file_records_by_pk_hash_bucket(
61
62
  total_size_bytes,
62
63
  ) = read_delta_file_envelopes(
63
64
  annotated_delta,
65
+ all_column_names,
64
66
  read_kwargs_provider,
65
67
  deltacat_storage,
66
68
  deltacat_storage_kwargs,
@@ -116,6 +118,7 @@ def _timed_hash_bucket(input: HashBucketInput):
116
118
  annotated_delta=input.annotated_delta,
117
119
  num_hash_buckets=input.num_hash_buckets,
118
120
  primary_keys=input.primary_keys,
121
+ all_column_names=input.all_column_names,
119
122
  read_kwargs_provider=input.read_kwargs_provider,
120
123
  deltacat_storage=input.deltacat_storage,
121
124
  deltacat_storage_kwargs=input.deltacat_storage_kwargs,