deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324)
  1. deltacat/__init__.py +117 -18
  2. deltacat/api.py +536 -126
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +1 -19
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2444 -282
  12. deltacat/catalog/model/catalog.py +208 -113
  13. deltacat/catalog/model/properties.py +63 -24
  14. deltacat/compute/__init__.py +14 -0
  15. deltacat/compute/compactor/compaction_session.py +97 -75
  16. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  17. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  18. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  19. deltacat/compute/compactor/repartition_session.py +8 -21
  20. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  21. deltacat/compute/compactor/steps/materialize.py +9 -7
  22. deltacat/compute/compactor/steps/repartition.py +12 -11
  23. deltacat/compute/compactor/utils/io.py +6 -5
  24. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  25. deltacat/compute/compactor/utils/system_columns.py +3 -1
  26. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  27. deltacat/compute/compactor_v2/constants.py +30 -1
  28. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  29. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  30. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  31. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  32. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  33. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  34. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  35. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  36. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  37. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  38. deltacat/compute/compactor_v2/utils/io.py +11 -4
  39. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  40. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  41. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  42. deltacat/compute/converter/constants.py +5 -0
  43. deltacat/compute/converter/converter_session.py +207 -52
  44. deltacat/compute/converter/model/convert_input.py +43 -16
  45. deltacat/compute/converter/model/convert_input_files.py +33 -16
  46. deltacat/compute/converter/model/convert_result.py +80 -0
  47. deltacat/compute/converter/model/converter_session_params.py +64 -19
  48. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  49. deltacat/compute/converter/pyiceberg/overrides.py +193 -65
  50. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  51. deltacat/compute/converter/steps/convert.py +230 -75
  52. deltacat/compute/converter/steps/dedupe.py +46 -12
  53. deltacat/compute/converter/utils/convert_task_options.py +66 -22
  54. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  55. deltacat/compute/converter/utils/iceberg_columns.py +13 -8
  56. deltacat/compute/converter/utils/io.py +173 -13
  57. deltacat/compute/converter/utils/s3u.py +42 -27
  58. deltacat/compute/janitor.py +205 -0
  59. deltacat/compute/jobs/client.py +417 -0
  60. deltacat/compute/resource_estimation/delta.py +38 -6
  61. deltacat/compute/resource_estimation/model.py +8 -0
  62. deltacat/constants.py +49 -6
  63. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  64. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  65. deltacat/env.py +10 -0
  66. deltacat/examples/basic_logging.py +6 -6
  67. deltacat/examples/compactor/aws/__init__.py +1 -0
  68. deltacat/examples/compactor/bootstrap.py +863 -0
  69. deltacat/examples/compactor/compactor.py +373 -0
  70. deltacat/examples/compactor/explorer.py +473 -0
  71. deltacat/examples/compactor/gcp/__init__.py +1 -0
  72. deltacat/examples/compactor/job_runner.py +439 -0
  73. deltacat/examples/compactor/utils/__init__.py +1 -0
  74. deltacat/examples/compactor/utils/common.py +261 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  80. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  81. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
  82. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  83. deltacat/examples/hello_world.py +4 -2
  84. deltacat/examples/indexer/indexer.py +163 -0
  85. deltacat/examples/indexer/job_runner.py +198 -0
  86. deltacat/exceptions.py +66 -4
  87. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  88. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  89. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
  90. deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
  91. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  92. deltacat/experimental/converter_agent/__init__.py +0 -0
  93. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  94. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  95. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  96. deltacat/experimental/daft/__init__.py +4 -0
  97. deltacat/experimental/daft/daft_catalog.py +229 -0
  98. deltacat/experimental/storage/__init__.py +0 -0
  99. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  100. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  101. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  102. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  103. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  104. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  105. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  107. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  108. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  109. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  110. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  111. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  112. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  113. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  114. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  115. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  116. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  117. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  118. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  119. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  120. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  121. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  122. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  123. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  124. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  125. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  126. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  127. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  128. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  129. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  130. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  131. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  132. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  133. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  134. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  135. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  136. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  137. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  138. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  139. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  140. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  141. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  142. deltacat/io/__init__.py +13 -0
  143. deltacat/io/dataset/__init__.py +0 -0
  144. deltacat/io/dataset/deltacat_dataset.py +91 -0
  145. deltacat/io/datasink/__init__.py +0 -0
  146. deltacat/io/datasink/deltacat_datasink.py +207 -0
  147. deltacat/io/datasource/__init__.py +0 -0
  148. deltacat/io/datasource/deltacat_datasource.py +579 -0
  149. deltacat/io/reader/__init__.py +0 -0
  150. deltacat/io/reader/deltacat_read_api.py +172 -0
  151. deltacat/storage/__init__.py +22 -2
  152. deltacat/storage/interface.py +54 -32
  153. deltacat/storage/main/impl.py +1494 -541
  154. deltacat/storage/model/delta.py +27 -3
  155. deltacat/storage/model/expression/__init__.py +47 -0
  156. deltacat/storage/model/expression/expression.py +656 -0
  157. deltacat/storage/model/expression/visitor.py +248 -0
  158. deltacat/storage/model/locator.py +6 -12
  159. deltacat/storage/model/manifest.py +231 -6
  160. deltacat/storage/model/metafile.py +224 -119
  161. deltacat/storage/model/namespace.py +8 -1
  162. deltacat/storage/model/partition.py +117 -42
  163. deltacat/storage/model/scan/push_down.py +32 -5
  164. deltacat/storage/model/schema.py +2427 -159
  165. deltacat/storage/model/shard.py +6 -2
  166. deltacat/storage/model/sort_key.py +40 -0
  167. deltacat/storage/model/stream.py +9 -2
  168. deltacat/storage/model/table.py +12 -1
  169. deltacat/storage/model/table_version.py +11 -0
  170. deltacat/storage/model/transaction.py +1184 -208
  171. deltacat/storage/model/transform.py +81 -2
  172. deltacat/storage/model/types.py +53 -29
  173. deltacat/storage/util/__init__.py +0 -0
  174. deltacat/storage/util/scan_planner.py +26 -0
  175. deltacat/tests/_io/reader/__init__.py +0 -0
  176. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  177. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  178. deltacat/tests/aws/test_s3u.py +2 -31
  179. deltacat/tests/catalog/data/__init__.py +0 -0
  180. deltacat/tests/catalog/main/__init__.py +0 -0
  181. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  182. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  183. deltacat/tests/catalog/model/__init__.py +0 -0
  184. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  185. deltacat/tests/catalog/test_catalogs.py +103 -106
  186. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  187. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  188. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  189. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  190. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  191. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  192. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  193. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  194. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  195. deltacat/tests/compute/conftest.py +8 -44
  196. deltacat/tests/compute/converter/test_convert_session.py +697 -349
  197. deltacat/tests/compute/converter/utils.py +15 -6
  198. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  199. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  200. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  201. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  202. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  203. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  204. deltacat/tests/compute/test_janitor.py +236 -0
  205. deltacat/tests/compute/test_util_common.py +716 -43
  206. deltacat/tests/compute/test_util_constant.py +0 -1
  207. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  208. deltacat/tests/daft/__init__.py +0 -0
  209. deltacat/tests/daft/test_model.py +97 -0
  210. deltacat/tests/experimental/__init__.py +1 -0
  211. deltacat/tests/experimental/catalog/__init__.py +0 -0
  212. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  213. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  214. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  215. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  216. deltacat/tests/experimental/daft/__init__.py +0 -0
  217. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  218. deltacat/tests/experimental/storage/__init__.py +0 -0
  219. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  220. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  221. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  222. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  223. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  224. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  225. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  226. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  227. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  228. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  229. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  230. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  231. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  232. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  233. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  234. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  235. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  236. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  237. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  238. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  239. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  240. deltacat/tests/storage/model/test_expression.py +327 -0
  241. deltacat/tests/storage/model/test_manifest.py +129 -0
  242. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  243. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  244. deltacat/tests/storage/model/test_schema.py +171 -0
  245. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  246. deltacat/tests/storage/model/test_shard.py +3 -1
  247. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  248. deltacat/tests/storage/model/test_transaction.py +393 -48
  249. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  250. deltacat/tests/test_deltacat_api.py +1036 -11
  251. deltacat/tests/test_exceptions.py +9 -5
  252. deltacat/tests/test_utils/pyarrow.py +52 -21
  253. deltacat/tests/test_utils/storage.py +23 -34
  254. deltacat/tests/types/__init__.py +0 -0
  255. deltacat/tests/types/test_tables.py +104 -0
  256. deltacat/tests/utils/exceptions.py +22 -0
  257. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  258. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  259. deltacat/tests/utils/test_daft.py +121 -31
  260. deltacat/tests/utils/test_numpy.py +1193 -0
  261. deltacat/tests/utils/test_pandas.py +1106 -0
  262. deltacat/tests/utils/test_polars.py +1040 -0
  263. deltacat/tests/utils/test_pyarrow.py +1370 -89
  264. deltacat/types/media.py +345 -37
  265. deltacat/types/tables.py +2344 -46
  266. deltacat/utils/arguments.py +33 -1
  267. deltacat/utils/daft.py +824 -40
  268. deltacat/utils/export.py +3 -1
  269. deltacat/utils/filesystem.py +139 -9
  270. deltacat/utils/metafile_locator.py +2 -1
  271. deltacat/utils/numpy.py +118 -26
  272. deltacat/utils/pandas.py +577 -48
  273. deltacat/utils/polars.py +759 -0
  274. deltacat/utils/pyarrow.py +1373 -192
  275. deltacat/utils/ray_utils/concurrency.py +1 -1
  276. deltacat/utils/ray_utils/dataset.py +101 -10
  277. deltacat/utils/ray_utils/runtime.py +56 -4
  278. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  279. deltacat/utils/url.py +1325 -0
  280. deltacat-2.0.0.dist-info/METADATA +1163 -0
  281. deltacat-2.0.0.dist-info/RECORD +439 -0
  282. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  283. deltacat/catalog/iceberg/__init__.py +0 -4
  284. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  285. deltacat/compute/merge_on_read/__init__.py +0 -4
  286. deltacat/compute/merge_on_read/daft.py +0 -40
  287. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  288. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  289. deltacat/examples/common/fixtures.py +0 -15
  290. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  291. deltacat/storage/rivulet/__init__.py +0 -11
  292. deltacat/storage/rivulet/feather/__init__.py +0 -5
  293. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  294. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  295. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  296. deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
  297. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  298. deltacat/utils/s3fs.py +0 -21
  299. deltacat-2.0.dist-info/METADATA +0 -65
  300. deltacat-2.0.dist-info/RECORD +0 -347
  301. /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
  302. /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
  303. /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
  304. /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
  305. /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
  306. /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
  307. /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
  308. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
  309. /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
  310. /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  311. /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
  312. /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
  313. /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  314. /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
  315. /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
  316. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
  317. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  318. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  319. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  320. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  321. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  322. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  323. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  324. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor_v2/steps/merge.py

@@ -7,10 +7,11 @@ import ray
 import itertools
 import time
 import pyarrow.compute as pc
+from deltacat.utils.pyarrow import MAX_INT_BYTES
 import deltacat.compute.compactor_v2.utils.merge as merge_utils
 from uuid import uuid4
 from deltacat import logs
-from typing import Callable, Iterator, List, Optional, Tuple
+from typing import Callable, Iterator, List, Optional, Tuple, Set
 from deltacat.compute.compactor_v2.model.merge_result import MergeResult
 from deltacat.compute.compactor_v2.model.merge_file_group import MergeFileGroup
 from deltacat.compute.compactor.model.materialize_result import MaterializeResult
@@ -31,13 +32,14 @@ from deltacat.utils.resources import (
 )
 from deltacat.compute.compactor_v2.utils.primary_key_index import (
     generate_pk_hash_column,
+    pk_digest_to_hash_bucket_index,
 )
 from deltacat.storage import (
     Delta,
     DeltaLocator,
     DeltaType,
     Partition,
-    interface as unimplemented_deltacat_storage,
+    metastore,
 )
 from deltacat.storage.model.manifest import Manifest
 from deltacat.compute.compactor_v2.utils.dedupe import drop_duplicates
@@ -46,6 +48,9 @@ from deltacat.compute.compactor_v2.constants import (
     MERGE_TIME_IN_SECONDS,
     MERGE_SUCCESS_COUNT,
     MERGE_FAILURE_COUNT,
+    BUCKETING_SPEC_COMPLIANCE_PROFILE,
+    BUCKETING_SPEC_COMPLIANCE_ASSERT,
+    BUCKETING_SPEC_COMPLIANCE_PRINT_LOG,
 )
 from deltacat.exceptions import (
     categorize_errors,
@@ -57,6 +62,10 @@ if importlib.util.find_spec("memray"):
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


+_EXISTING_VARIANT_LOG_PREFIX = "Existing variant "
+_INCREMENTAL_TABLE_LOG_PREFIX = "Incremental table "
+
+
 def _append_delta_type_column(table: pa.Table, value: np.bool_):
     return table.append_column(
         sc._DELTA_TYPE_COLUMN_FIELD,
@@ -85,9 +94,12 @@ def _build_incremental_table(
     # sort by delta file stream position now instead of sorting every row later
     is_delete = False
     for df_envelope in df_envelopes:
-        assert (
-            df_envelope.delta_type != DeltaType.APPEND
-        ), "APPEND type deltas are not supported. Kindly use UPSERT or DELETE"
+        # Allow APPEND, UPSERT, and DELETE delta types
+        assert df_envelope.delta_type in (
+            DeltaType.APPEND,
+            DeltaType.UPSERT,
+            DeltaType.DELETE,
+        ), "Only APPEND, UPSERT, and DELETE delta types are supported"
         if df_envelope.delta_type == DeltaType.DELETE:
             is_delete = True

@@ -99,14 +111,35 @@
         )

         hb_tables.append(table)
-    result = pa.concat_tables(hb_tables)
+    result = _concat_or_coerce_tables(hb_tables)
     return result


+def _concat_or_coerce_tables(all_tables: List[pa.Table]) -> pa.Table:
+    try:
+        return pa.concat_tables(all_tables)
+    except pa.ArrowInvalid:
+        # Fallback path: schema evolution needed - try PyArrow's built-in unification
+        if all_tables:
+            try:
+                return pa.concat_tables(
+                    all_tables, promote_options="permissive", unify_schemas=True
+                )
+            except (pa.ArrowInvalid, TypeError, pa.ArrowNotImplementedError):
+                # If PyArrow unification fails, re-raise the original error
+                raise
+        else:
+            # Empty table list - should not happen but handle gracefully
+            raise RuntimeError("Expected at least one table to merge, but found none.")
+
+
 def _merge_tables(
     table: pa.Table,
     primary_keys: List[str],
     can_drop_duplicates: bool,
+    hb_index: int,
+    num_buckets: int,
+    original_fields: Set[str],
     compacted_table: Optional[pa.Table] = None,
 ) -> pa.Table:
     """
@@ -125,6 +158,20 @@ def _merge_tables(

     all_tables.append(table)

+    check_bucketing_spec = BUCKETING_SPEC_COMPLIANCE_PROFILE in [
+        BUCKETING_SPEC_COMPLIANCE_PRINT_LOG,
+        BUCKETING_SPEC_COMPLIANCE_ASSERT,
+    ]
+
+    if primary_keys and check_bucketing_spec:
+        _validate_bucketing_spec_compliance(
+            table=all_tables[incremental_idx],
+            num_buckets=num_buckets,
+            primary_keys=primary_keys,
+            hb_index=hb_index,
+            log_prefix=_INCREMENTAL_TABLE_LOG_PREFIX,
+        )
+
     if not primary_keys or not can_drop_duplicates:
         logger.info(
             f"Not dropping duplicates for primary keys={primary_keys} "
@@ -134,7 +181,7 @@ def _merge_tables(
             all_tables[incremental_idx], DeltaType.DELETE
         )
         # we need not drop duplicates
-        return pa.concat_tables(all_tables)
+        return _concat_or_coerce_tables(all_tables)

     all_tables = generate_pk_hash_column(all_tables, primary_keys=primary_keys)

@@ -144,36 +191,170 @@ def _merge_tables(
         all_tables[incremental_idx], on=sc._PK_HASH_STRING_COLUMN_NAME
     )

+    # Always drop DELETE rows from incremental table
+    incremental_table = _drop_delta_type_rows(incremental_table, DeltaType.DELETE)
+
+    # Default to using incremental records as-is, override only if merging is needed
+    incremental_data = incremental_table
+
     if compacted_table:
         compacted_table = all_tables[0]

-        records_to_keep = pc.invert(
-            pc.is_in(
-                compacted_table[sc._PK_HASH_STRING_COLUMN_NAME],
-                incremental_table[sc._PK_HASH_STRING_COLUMN_NAME],
+        compacted_pk_hash_str = compacted_table[sc._PK_HASH_STRING_COLUMN_NAME]
+        incremental_pk_hash_str = incremental_table[sc._PK_HASH_STRING_COLUMN_NAME]
+
+        logger.info(
+            f"Size of compacted pk hash={compacted_pk_hash_str.nbytes} "
+            f"and incremental pk hash={incremental_pk_hash_str.nbytes}."
+        )
+
+        if (
+            compacted_table[sc._PK_HASH_STRING_COLUMN_NAME].nbytes >= MAX_INT_BYTES
+            or incremental_table[sc._PK_HASH_STRING_COLUMN_NAME].nbytes >= MAX_INT_BYTES
+        ):
+            logger.info("Casting compacted and incremental pk hash to large_string...")
+            # is_in combines the chunks of the chunked array passed which can cause
+            # ArrowCapacityError if the total size of string array is over 2GB.
+            # Using a large_string would resolve this issue.
+            # The cast here should be zero-copy in most cases.
+            compacted_pk_hash_str = pc.cast(compacted_pk_hash_str, pa.large_string())
+            incremental_pk_hash_str = pc.cast(
+                incremental_pk_hash_str, pa.large_string()
             )
+
+        records_to_update = pc.is_in(
+            compacted_pk_hash_str,
+            incremental_pk_hash_str,
         )

+        records_to_keep = pc.invert(records_to_update)
+
+        # Keep records that don't have updates
         result_table_list.append(compacted_table.filter(records_to_keep))

-        incremental_table = _drop_delta_type_rows(incremental_table, DeltaType.DELETE)
-        result_table_list.append(incremental_table)
+        # Override default if merging is needed
+        if pc.sum(records_to_update).as_py() > 0:  # There are records to update
+            old_records_to_update = compacted_table.filter(records_to_update)
+            # Perform partial UPSERT: merge old and new records field by field
+            incremental_data = _merge_records_partially(
+                old_records=old_records_to_update,
+                new_records=incremental_table,
+                original_fields=original_fields,
+            )
+
+        # Add the determined incremental data
+        result_table_list.append(incremental_data)

-    final_table = pa.concat_tables(result_table_list)
+    final_table = _concat_or_coerce_tables(result_table_list)
     final_table = final_table.drop([sc._PK_HASH_STRING_COLUMN_NAME])

     return final_table


+def _merge_records_partially(
+    old_records: pa.Table, new_records: pa.Table, original_fields: Set[str]
+) -> pa.Table:
+    """
+    Merge records field by field for partial UPSERT behavior. Fills missing
+    fields in new_records with values from old_records.
+
+    Args:
+        old_records: Records from the compacted table that need updates
+        new_records: New records with potential partial field updates
+
+    Returns:
+        Table with merged records where missing fields preserve old values
+    """
+    # Get field sets (excluding hash column which is used for joining)
+    old_fields = set(old_records.column_names) - {sc._PK_HASH_STRING_COLUMN_NAME}
+    new_fields = set(new_records.column_names) - {sc._PK_HASH_STRING_COLUMN_NAME}
+
+    # Find fields that are missing from new_records but exist in old_records
+    missing_fields = old_fields - new_fields
+
+    # Find fields that were auto-added by schema coercion (missing from original user data)
+    # These should be treated as missing fields and filled from old_records
+    auto_added_null_fields = set()
+
+    # Use definitive information about which fields were originally provided
+    # Any field that exists in both tables but was NOT in the original user data
+    # should be treated as auto-added by schema coercion
+    for field_name in old_fields & new_fields:  # Fields that exist in both
+        if field_name not in original_fields:
+            auto_added_null_fields.add(field_name)
+
+    # Combine missing fields with auto-added null fields
+    fields_to_fill = missing_fields | auto_added_null_fields
+
+    # Start with new_records and add missing fields from old_records
+    result_columns = {}
+
+    # Copy all existing columns from new_records
+    for column_name in new_records.column_names:
+        result_columns[column_name] = new_records[column_name]
+
+    # Fill in missing/auto-added null fields with values from old_records
+    for field_name in fields_to_fill:
+        # For missing fields, use the old values entirely
+        result_columns[field_name] = old_records[field_name]
+
+    # Create the enhanced new_records table with all fields filled
+    enhanced_new_records = pa.table(result_columns)
+
+    # Now we can return the enhanced table - it has all the fields with proper values
+    # Missing fields are filled with old values, explicitly null fields remain null
+    return enhanced_new_records
+
+
+def _validate_bucketing_spec_compliance(
+    table: pa.Table,
+    num_buckets: int,
+    hb_index: int,
+    primary_keys: List[str],
+    rci: Optional[RoundCompletionInfo] = None,
+    log_prefix=None,
+) -> None:
+    if rci is not None:
+        message_prefix = f"{log_prefix}{rci.compacted_delta_locator.namespace}.{rci.compacted_delta_locator.table_name}.{rci.compacted_delta_locator.table_version}.{rci.compacted_delta_locator.partition_id}.{rci.compacted_delta_locator.partition_values}"
+    else:
+        message_prefix = f"{log_prefix}"
+    pki_table = generate_pk_hash_column(
+        [table], primary_keys=primary_keys, requires_hash=True
+    )[0]
+    is_not_compliant: bool = False
+    for index, hash_value in enumerate(sc.pk_hash_string_column_np(pki_table)):
+        hash_bucket: int = pk_digest_to_hash_bucket_index(hash_value, num_buckets)
+        if hash_bucket != hb_index:
+            is_not_compliant = True
+            logger.info(
+                f"{message_prefix} has non-compliant bucketing spec at index: {index} "
+                f"Expected hash bucket is {hb_index} but found {hash_bucket}."
+            )
+            if BUCKETING_SPEC_COMPLIANCE_PROFILE == BUCKETING_SPEC_COMPLIANCE_ASSERT:
+                raise AssertionError(
+                    f"Hash bucket drift detected at index: {index}. Expected hash bucket index"
+                    f" to be {hb_index} but found {hash_bucket}"
+                )
+            # No further checks necessary
+            break
+    if not is_not_compliant:
+        logger.debug(
+            f"{message_prefix} has compliant bucketing spec for hb_index: {hb_index}"
+        )
+
+
 def _download_compacted_table(
     hb_index: int,
-    rcf: RoundCompletionInfo,
+    rci: RoundCompletionInfo,
+    primary_keys: List[str],
+    all_column_names: List[str],
+    compacted_delta_manifest: Optional[Manifest] = None,
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
-    deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage: metastore = metastore,
     deltacat_storage_kwargs: Optional[dict] = None,
 ) -> pa.Table:
     tables = []
-    hb_index_to_indices = rcf.hb_index_to_entry_range
+    hb_index_to_indices = rci.hb_index_to_entry_range

     if str(hb_index) not in hb_index_to_indices:
         return None
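The `MAX_INT_BYTES` guard above exists because `pc.is_in` combines the chunks of a chunked string array, and 32-bit string offsets cap the combined data at 2 GiB. A small-scale sketch of the cast-then-filter flow (toy data; the real code operates on the pk hash columns):

```python
import pyarrow as pa
import pyarrow.compute as pc

compacted = pa.chunked_array([["a", "b"], ["c"]])
incremental = pa.chunked_array([["b", "d"]])

# large_string uses 64-bit offsets, so combining chunks can exceed 2 GiB
# without raising ArrowCapacityError; the cast is usually zero-copy.
compacted = pc.cast(compacted, pa.large_string())
incremental = pc.cast(incremental, pa.large_string())

records_to_update = pc.is_in(compacted, value_set=incremental)
records_to_keep = pc.invert(records_to_update)
print(records_to_keep)  # [true, false, true]: rows untouched by the update
```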
@@ -183,30 +364,52 @@ def _download_compacted_table(
     ), "indices should not be none and contains exactly two elements"
     for offset in range(indices[1] - indices[0]):
         table = deltacat_storage.download_delta_manifest_entry(
-            rcf.compacted_delta_locator,
+            Delta.of(
+                rci.compacted_delta_locator,
+                DeltaType.APPEND,
+                compacted_delta_manifest.meta,
+                None,
+                compacted_delta_manifest,
+            ),
             entry_index=(indices[0] + offset),
             file_reader_kwargs_provider=read_kwargs_provider,
+            all_column_names=all_column_names,
             **deltacat_storage_kwargs,
         )

         tables.append(table)

-    return pa.concat_tables(tables)
+    compacted_table = pa.concat_tables(tables)
+    check_bucketing_spec = BUCKETING_SPEC_COMPLIANCE_PROFILE in [
+        BUCKETING_SPEC_COMPLIANCE_PRINT_LOG,
+        BUCKETING_SPEC_COMPLIANCE_ASSERT,
+    ]
+
+    logger.debug(
+        f"Value of BUCKETING_SPEC_COMPLIANCE_PROFILE, check_bucketing_spec:"
+        f" {BUCKETING_SPEC_COMPLIANCE_PROFILE}, {check_bucketing_spec}"
+    )
+
+    # Bucketing spec compliance isn't required without primary keys
+    if primary_keys and check_bucketing_spec:
+        _validate_bucketing_spec_compliance(
+            compacted_table,
+            rci.hash_bucket_count,
+            hb_index,
+            primary_keys,
+            rci=rci,
+            log_prefix=_EXISTING_VARIANT_LOG_PREFIX,
+        )
+    return compacted_table


 def _copy_all_manifest_files_from_old_hash_buckets(
     hb_index_copy_by_reference: List[int],
     round_completion_info: RoundCompletionInfo,
     write_to_partition: Partition,
-    deltacat_storage=unimplemented_deltacat_storage,
-    deltacat_storage_kwargs: Optional[dict] = None,
+    compacted_manifest: Optional[Manifest] = None,
 ) -> List[MaterializeResult]:

-    compacted_delta_locator = round_completion_info.compacted_delta_locator
-    manifest = deltacat_storage.get_delta_manifest(
-        compacted_delta_locator, **deltacat_storage_kwargs
-    )
-
     manifest_entry_referenced_list = []
     materialize_result_list = []
     hb_index_to_indices = round_completion_info.hb_index_to_entry_range
223
426
  for offset in range(indices[1] - indices[0]):
224
427
  entry_index = indices[0] + offset
225
428
  assert entry_index < len(
226
- manifest.entries
227
- ), f"entry index: {entry_index} >= {len(manifest.entries)}"
228
- manifest_entry = manifest.entries[entry_index]
429
+ compacted_manifest.entries
430
+ ), f"entry index: {entry_index} >= {len(compacted_manifest.entries)}"
431
+ manifest_entry = compacted_manifest.entries[entry_index]
229
432
  manifest_entry_referenced_list.append(manifest_entry)
230
433
 
231
- manifest = Manifest.of(
434
+ compacted_manifest = Manifest.of(
232
435
  entries=manifest_entry_referenced_list, uuid=str(uuid4())
233
436
  )
234
437
  delta = Delta.of(
235
438
  locator=DeltaLocator.of(write_to_partition.locator),
236
- delta_type=DeltaType.UPSERT,
237
- meta=manifest.meta,
238
- manifest=manifest,
439
+ delta_type=DeltaType.APPEND, # Compaction always produces APPEND deltas
440
+ meta=compacted_manifest.meta,
441
+ manifest=compacted_manifest,
239
442
  previous_stream_position=write_to_partition.stream_position,
240
443
  properties={},
241
444
  )
242
445
  referenced_pyarrow_write_result = PyArrowWriteResult.of(
243
446
  len(manifest_entry_referenced_list),
244
- manifest.meta.source_content_length,
245
- manifest.meta.content_length,
246
- manifest.meta.record_count,
447
+ compacted_manifest.meta.source_content_length,
448
+ compacted_manifest.meta.content_length,
449
+ compacted_manifest.meta.record_count,
247
450
  )
248
451
  materialize_result = MaterializeResult.of(
249
452
  delta=delta,
@@ -268,6 +471,7 @@ def _has_previous_compacted_table(input: MergeInput, hb_idx: int) -> bool:
     """
     return (
         input.round_completion_info
+        and input.compacted_manifest is not None
         and input.round_completion_info.hb_index_to_entry_range
        and input.round_completion_info.hb_index_to_entry_range.get(str(hb_idx))
         is not None
@@ -285,6 +489,7 @@ def _can_copy_by_reference(
         not has_delete
         and not merge_file_group.dfe_groups
         and input.round_completion_info is not None
+        and input.compacted_manifest is not None
     )

     if input.disable_copy_by_reference:
@@ -383,9 +588,9 @@ def _compact_tables(
         delete_file_envelopes + df_envelopes
     )
     assert all(
-        dfe.delta_type in (DeltaType.UPSERT, DeltaType.DELETE)
+        dfe.delta_type in (DeltaType.APPEND, DeltaType.UPSERT, DeltaType.DELETE)
         for dfe in reordered_all_dfes
-    ), "All reordered delta file envelopes must be of the UPSERT or DELETE"
+    ), "All reordered delta file envelopes must be of the APPEND, UPSERT or DELETE"
     table = compacted_table
     aggregated_incremental_len = 0
     aggregated_deduped_records = 0
@@ -393,13 +598,13 @@ def _compact_tables(
     for i, (delta_type, delta_type_sequence) in enumerate(
         _group_sequence_by_delta_type(reordered_all_dfes)
     ):
-        if delta_type is DeltaType.UPSERT:
-            (
-                table,
-                incremental_len,
-                deduped_records,
-                merge_time,
-            ) = _apply_upserts(input, delta_type_sequence, hb_idx, table)
+        if delta_type is DeltaType.UPSERT or delta_type is DeltaType.APPEND:
+            (table, incremental_len, deduped_records, merge_time,) = _apply_upserts(
+                input=input,
+                dfe_list=delta_type_sequence,
+                hb_idx=hb_idx,
+                prev_table=table,
+            )
             logger.info(
                 f" [Merge task index {input.merge_task_index}] Merged"
                 f" record count: {len(table)}, size={table.nbytes} took: {merge_time}s"
@@ -434,8 +639,9 @@ def _apply_upserts(
     prev_table=None,
 ) -> Tuple[pa.Table, int, int, int]:
     assert all(
-        dfe.delta_type is DeltaType.UPSERT for dfe in dfe_list
-    ), "All incoming delta file envelopes must of the DeltaType.UPSERT"
+        dfe.delta_type is DeltaType.UPSERT or dfe.delta_type is DeltaType.APPEND
+        for dfe in dfe_list
+    ), "All incoming delta file envelopes must of the DeltaType.UPSERT or DeltaType.APPEND"
     logger.info(
         f"[Hash bucket index {hb_idx}] Reading dedupe input for "
         f"{len(dfe_list)} delta file envelope lists..."
@@ -459,6 +665,9 @@ def _apply_upserts(
         table=table,
         primary_keys=input.primary_keys,
         can_drop_duplicates=input.drop_duplicates,
+        hb_index=hb_idx,
+        num_buckets=input.hash_bucket_count,
+        original_fields=input.original_fields,
         compacted_table=prev_table,
     )
     deduped_records = hb_table_record_count - len(table)
@@ -476,8 +685,7 @@ def _copy_manifests_from_hash_bucketing(
             hb_index_copy_by_reference_ids,
             input.round_completion_info,
             input.write_to_partition,
-            input.deltacat_storage,
-            input.deltacat_storage_kwargs,
+            input.compacted_manifest,
         )
     )
     logger.info(
@@ -494,9 +702,11 @@
 def _timed_merge(input: MergeInput) -> MergeResult:
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
-    with memray.Tracker(
-        f"merge_{worker_id}_{task_id}.bin"
-    ) if input.enable_profiler else nullcontext():
+    with (
+        memray.Tracker(f"merge_{worker_id}_{task_id}.bin")
+        if input.enable_profiler
+        else nullcontext()
+    ):
         total_input_records, total_deduped_records = 0, 0
         total_dropped_records = 0
         materialized_results: List[MaterializeResult] = []
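The rewritten `with` statement above is only a formatting change: parentheses around the single conditional expression. The underlying pattern, entering a profiler conditionally and substituting `contextlib.nullcontext()` when profiling is off, is sketched below with a stand-in tracker class (memray itself is not required here):

```python
from contextlib import nullcontext

class Tracker:
    # Stand-in for memray.Tracker; writes nothing, just marks entry/exit.
    def __init__(self, path: str):
        self.path = path

    def __enter__(self):
        print(f"profiling to {self.path}")
        return self

    def __exit__(self, *exc):
        print("profiling done")
        return False

def timed_work(enable_profiler: bool) -> int:
    # nullcontext() is a no-op context manager, so the body runs unchanged
    # when profiling is disabled.
    with Tracker("merge.bin") if enable_profiler else nullcontext():
        return sum(range(10))

print(timed_work(enable_profiler=False))  # 45, with no profiling output
```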
@@ -515,11 +725,13 @@ def _timed_merge(input: MergeInput) -> MergeResult:
         ):
             hb_index_copy_by_ref_ids.append(merge_file_group.hb_index)
             continue
-
         if _has_previous_compacted_table(input, merge_file_group.hb_index):
             compacted_table = _download_compacted_table(
                 hb_index=merge_file_group.hb_index,
-                rcf=input.round_completion_info,
+                rci=input.round_completion_info,
+                primary_keys=input.primary_keys,
+                all_column_names=input.all_column_names,
+                compacted_delta_manifest=input.compacted_manifest,
                 read_kwargs_provider=input.read_kwargs_provider,
                 deltacat_storage=input.deltacat_storage,
                 deltacat_storage_kwargs=input.deltacat_storage_kwargs,
@@ -604,5 +816,5 @@ def merge(input: MergeInput) -> MergeResult:
         merge_result[3],
         merge_result[4],
         np.double(emit_metrics_time),
-        merge_result[4],
+        merge_result[6],
     )
deltacat/compute/compactor_v2/utils/content_type_params.py

@@ -1,19 +1,21 @@
 import logging
 import ray
 import functools
+from typing import List
 from deltacat.compute.compactor_v2.constants import (
     TASK_MAX_PARALLELISM,
     MAX_PARQUET_METADATA_SIZE,
 )
+from deltacat.utils.common import ReadKwargsProvider
 from deltacat.utils.ray_utils.concurrency import invoke_parallel
 from deltacat import logs
 from deltacat.storage import (
     Delta,
     ManifestEntry,
-    interface as unimplemented_deltacat_storage,
+    metastore,
 )
 from typing import Dict, Optional, Any
-from deltacat.types.media import TableType
+from deltacat.types.media import DatasetType
 from deltacat.types.media import ContentType
 from deltacat.types.partial_download import PartialParquetParameters
 from deltacat.exceptions import RetryableError
@@ -73,13 +75,26 @@ class AppendContentTypeParamsCache:
 def _download_parquet_metadata_for_manifest_entry(
     delta: Delta,
     entry_index: int,
-    deltacat_storage: unimplemented_deltacat_storage,
+    all_column_names: List[str],
+    deltacat_storage: metastore,
     deltacat_storage_kwargs: Optional[Dict[Any, Any]] = {},
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
 ) -> Dict[str, Any]:
+    logger.info(
+        f"Downloading the parquet metadata for Delta with locator {delta.locator} and entry_index: {entry_index}"
+    )
+    if "file_reader_kwargs_provider" in deltacat_storage_kwargs:
+        logger.info(
+            "'file_reader_kwargs_provider' is also present in deltacat_storage_kwargs. Removing to prevent multiple values for keyword argument"
+        )
+        deltacat_storage_kwargs.pop("file_reader_kwargs_provider")
+
     pq_file = deltacat_storage.download_delta_manifest_entry(
         delta,
         entry_index=entry_index,
-        table_type=TableType.PYARROW_PARQUET,
+        table_type=DatasetType.PYARROW_PARQUET,
+        file_reader_kwargs_provider=file_reader_kwargs_provider,
+        all_column_names=all_column_names,
         **deltacat_storage_kwargs,
     )

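The pop of `file_reader_kwargs_provider` above guards against a plain Python failure mode: passing the same keyword both explicitly and through `**kwargs` raises `TypeError`. A minimal reproduction with a hypothetical `download` stub (not the real storage API):

```python
def download(entry_index, file_reader_kwargs_provider=None, **kwargs):
    # Hypothetical stand-in for download_delta_manifest_entry.
    return entry_index

storage_kwargs = {"file_reader_kwargs_provider": "stale value"}
try:
    download(0, file_reader_kwargs_provider=None, **storage_kwargs)
except TypeError as e:
    print(e)  # got multiple values for keyword argument

# Removing the key first, as the diff does, avoids the collision.
storage_kwargs.pop("file_reader_kwargs_provider")
print(download(0, file_reader_kwargs_provider=None, **storage_kwargs))  # 0
```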
@@ -93,15 +108,20 @@ def _download_parquet_metadata_for_manifest_entry(

 def append_content_type_params(
     delta: Delta,
+    all_column_names: List[str],
     task_max_parallelism: int = TASK_MAX_PARALLELISM,
     max_parquet_meta_size_bytes: Optional[int] = MAX_PARQUET_METADATA_SIZE,
-    deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage: metastore = metastore,
     deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
 ) -> bool:
     """
     This operation appends content type params into the delta entry. Note
     that this operation can be time consuming, hence we cache it in a Ray actor.
     """
+    logger.info(
+        f"Appending the content type params for Delta with locator {delta.locator}..."
+    )

     if not delta.meta:
         logger.warning(f"Delta with locator {delta.locator} doesn't contain meta.")
@@ -157,17 +177,25 @@ def append_content_type_params(
         max_parquet_meta_size_bytes=max_parquet_meta_size_bytes,
     )

+    # create a copy of deltacat_storage_kwargs without transaction key
+    deltacat_storage_kwargs_copy = {
+        k: v for k, v in deltacat_storage_kwargs.items() if k != "transaction"
+    }
+
     def input_provider(index, item) -> Dict:
         return {
-            "deltacat_storage_kwargs": deltacat_storage_kwargs,
+            "file_reader_kwargs_provider": file_reader_kwargs_provider,
+            "deltacat_storage_kwargs": deltacat_storage_kwargs_copy,
             "deltacat_storage": deltacat_storage,
             "delta": delta,
             "entry_index": item,
+            "all_column_names": all_column_names,
         }

     logger.info(
         f"Downloading parquet meta for {len(entry_indices_to_download)} manifest entries..."
     )
+
     pq_files_promise = invoke_parallel(
         entry_indices_to_download,
         ray_task=_download_parquet_metadata_for_manifest_entry,
deltacat/compute/compactor_v2/utils/primary_key_index.py

@@ -25,7 +25,7 @@ def _create_chunked_index_array(array: pa.Array) -> pa.Array:
         result[index] = np.arange(cl, dtype="int32")

     chunk_lengths = ([0] + chunk_lengths)[:-1]
-    result = pa.chunked_array(result + np.cumsum(chunk_lengths))
+    result = pa.chunked_array(result + np.cumsum(chunk_lengths), type=pa.int32())
     return result


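The explicit `type=pa.int32()` above pins down a dtype that could otherwise widen silently: `np.cumsum` on a Python int list yields int64, and adding that offset to an int32 chunk can promote the chunk to int64 (NumPy 2.0's NEP 50 rules do; older NumPy's value-based casting may not), changing the inferred chunked-array type. A small sketch with toy chunk lengths:

```python
import numpy as np
import pyarrow as pa

chunks = [np.arange(3, dtype="int32"), np.arange(2, dtype="int32")]
offsets = np.cumsum([0, 3])  # dtype is int64

# int32 chunk + int64 offset may promote each shifted chunk to int64,
# depending on the NumPy version's promotion rules.
shifted = [chunk + offset for chunk, offset in zip(chunks, offsets)]
print(shifted[0].dtype)

# An explicit type keeps the index column at int32 regardless.
arr = pa.chunked_array(shifted, type=pa.int32())
print(arr.type)  # int32
```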
deltacat/compute/compactor_v2/utils/delta.py

@@ -9,7 +9,7 @@ from deltacat.storage import (
     Delta,
 )
 from deltacat.storage.model.delta import DeltaType
-from deltacat.storage import interface as unimplemented_deltacat_storage
+from deltacat.storage import metastore
 from deltacat.types.media import StorageType
 from deltacat.utils.common import ReadKwargsProvider
 from deltacat import logs
@@ -30,8 +30,9 @@ def contains_delete_deltas(deltas: List[Delta]) -> bool:

 def read_delta_file_envelopes(
     annotated_delta: DeltaAnnotated,
+    all_column_names: List[str],
     read_kwargs_provider: Optional[ReadKwargsProvider],
-    deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage: metastore = metastore,
     deltacat_storage_kwargs: Optional[dict] = None,
 ) -> Tuple[Optional[List[DeltaFileEnvelope]], int, int]:
     tables = deltacat_storage.download_delta(
@@ -39,6 +40,7 @@ def read_delta_file_envelopes(
         max_parallelism=1,
         file_reader_kwargs_provider=read_kwargs_provider,
         storage_type=StorageType.LOCAL,
+        all_column_names=all_column_names,
         **deltacat_storage_kwargs,
     )
     annotations = annotated_delta.annotations
@@ -80,7 +82,7 @@ def read_delta_file_envelopes(
 def get_local_delta_file_envelopes(
     uniform_deltas: List[DeltaAnnotated],
     read_kwargs_provider: Optional[ReadKwargsProvider],
-    deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage=metastore,
     deltacat_storage_kwargs: Optional[dict] = None,
 ) -> Tuple[List[DeltaFileEnvelope], int]:
     local_dfe_list = []