deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367)
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -11,7 +11,7 @@ from deltacat.utils.pyarrow import MAX_INT_BYTES
11
11
  import deltacat.compute.compactor_v2.utils.merge as merge_utils
12
12
  from uuid import uuid4
13
13
  from deltacat import logs
14
- from typing import Callable, Iterator, List, Optional, Tuple
14
+ from typing import Callable, Iterator, List, Optional, Tuple, Set
15
15
  from deltacat.compute.compactor_v2.model.merge_result import MergeResult
16
16
  from deltacat.compute.compactor_v2.model.merge_file_group import MergeFileGroup
17
17
  from deltacat.compute.compactor.model.materialize_result import MaterializeResult
@@ -38,10 +38,10 @@ from deltacat.storage import (
38
38
  Delta,
39
39
  DeltaLocator,
40
40
  DeltaType,
41
- Manifest,
42
41
  Partition,
43
- interface as unimplemented_deltacat_storage,
42
+ metastore,
44
43
  )
44
+ from deltacat.storage.model.manifest import Manifest
45
45
  from deltacat.compute.compactor_v2.utils.dedupe import drop_duplicates
46
46
  from deltacat.constants import BYTES_PER_GIBIBYTE
47
47
  from deltacat.compute.compactor_v2.constants import (
@@ -94,9 +94,12 @@ def _build_incremental_table(
94
94
  # sort by delta file stream position now instead of sorting every row later
95
95
  is_delete = False
96
96
  for df_envelope in df_envelopes:
97
- assert (
98
- df_envelope.delta_type != DeltaType.APPEND
99
- ), "APPEND type deltas are not supported. Kindly use UPSERT or DELETE"
97
+ # Allow APPEND, UPSERT, and DELETE delta types
98
+ assert df_envelope.delta_type in (
99
+ DeltaType.APPEND,
100
+ DeltaType.UPSERT,
101
+ DeltaType.DELETE,
102
+ ), "Only APPEND, UPSERT, and DELETE delta types are supported"
100
103
  if df_envelope.delta_type == DeltaType.DELETE:
101
104
  is_delete = True
102
105
 
@@ -108,16 +111,35 @@ def _build_incremental_table(
108
111
  )
109
112
 
110
113
  hb_tables.append(table)
111
- result = pa.concat_tables(hb_tables)
114
+ result = _concat_or_coerce_tables(hb_tables)
112
115
  return result
113
116
 
114
117
 
118
+ def _concat_or_coerce_tables(all_tables: List[pa.Table]) -> pa.Table:
119
+ try:
120
+ return pa.concat_tables(all_tables)
121
+ except pa.ArrowInvalid:
122
+ # Fallback path: schema evolution needed - try PyArrow's built-in unification
123
+ if all_tables:
124
+ try:
125
+ return pa.concat_tables(
126
+ all_tables, promote_options="permissive", unify_schemas=True
127
+ )
128
+ except (pa.ArrowInvalid, TypeError, pa.ArrowNotImplementedError):
129
+ # If PyArrow unification fails, re-raise the original error
130
+ raise
131
+ else:
132
+ # Empty table list - should not happen but handle gracefully
133
+ raise RuntimeError("Expected at least one table to merge, but found none.")
134
+
135
+
115
136
  def _merge_tables(
116
137
  table: pa.Table,
117
138
  primary_keys: List[str],
118
139
  can_drop_duplicates: bool,
119
140
  hb_index: int,
120
141
  num_buckets: int,
142
+ original_fields: Set[str],
121
143
  compacted_table: Optional[pa.Table] = None,
122
144
  ) -> pa.Table:
123
145
  """
@@ -159,7 +181,7 @@ def _merge_tables(
159
181
  all_tables[incremental_idx], DeltaType.DELETE
160
182
  )
161
183
  # we need not drop duplicates
162
- return pa.concat_tables(all_tables)
184
+ return _concat_or_coerce_tables(all_tables)
163
185
 
164
186
  all_tables = generate_pk_hash_column(all_tables, primary_keys=primary_keys)
165
187
 
@@ -169,6 +191,12 @@ def _merge_tables(
169
191
  all_tables[incremental_idx], on=sc._PK_HASH_STRING_COLUMN_NAME
170
192
  )
171
193
 
194
+ # Always drop DELETE rows from incremental table
195
+ incremental_table = _drop_delta_type_rows(incremental_table, DeltaType.DELETE)
196
+
197
+ # Default to using incremental records as-is, override only if merging is needed
198
+ incremental_data = incremental_table
199
+
172
200
  if compacted_table:
173
201
  compacted_table = all_tables[0]
174
202
 
@@ -194,34 +222,100 @@ def _merge_tables(
194
222
  incremental_pk_hash_str, pa.large_string()
195
223
  )
196
224
 
197
- records_to_keep = pc.invert(
198
- pc.is_in(
199
- compacted_pk_hash_str,
200
- incremental_pk_hash_str,
201
- )
225
+ records_to_update = pc.is_in(
226
+ compacted_pk_hash_str,
227
+ incremental_pk_hash_str,
202
228
  )
203
229
 
230
+ records_to_keep = pc.invert(records_to_update)
231
+
232
+ # Keep records that don't have updates
204
233
  result_table_list.append(compacted_table.filter(records_to_keep))
205
234
 
206
- incremental_table = _drop_delta_type_rows(incremental_table, DeltaType.DELETE)
207
- result_table_list.append(incremental_table)
235
+ # Override default if merging is needed
236
+ if pc.sum(records_to_update).as_py() > 0: # There are records to update
237
+ old_records_to_update = compacted_table.filter(records_to_update)
238
+ # Perform partial UPSERT: merge old and new records field by field
239
+ incremental_data = _merge_records_partially(
240
+ old_records=old_records_to_update,
241
+ new_records=incremental_table,
242
+ original_fields=original_fields,
243
+ )
244
+
245
+ # Add the determined incremental data
246
+ result_table_list.append(incremental_data)
208
247
 
209
- final_table = pa.concat_tables(result_table_list)
248
+ final_table = _concat_or_coerce_tables(result_table_list)
210
249
  final_table = final_table.drop([sc._PK_HASH_STRING_COLUMN_NAME])
211
250
 
212
251
  return final_table
213
252
 
214
253
 
254
+ def _merge_records_partially(
255
+ old_records: pa.Table, new_records: pa.Table, original_fields: Set[str]
256
+ ) -> pa.Table:
257
+ """
258
+ Merge records field by field for partial UPSERT behavior. Fills missing
259
+ fields in new_records with values from old_records.
260
+
261
+ Args:
262
+ old_records: Records from the compacted table that need updates
263
+ new_records: New records with potential partial field updates
264
+
265
+ Returns:
266
+ Table with merged records where missing fields preserve old values
267
+ """
268
+ # Get field sets (excluding hash column which is used for joining)
269
+ old_fields = set(old_records.column_names) - {sc._PK_HASH_STRING_COLUMN_NAME}
270
+ new_fields = set(new_records.column_names) - {sc._PK_HASH_STRING_COLUMN_NAME}
271
+
272
+ # Find fields that are missing from new_records but exist in old_records
273
+ missing_fields = old_fields - new_fields
274
+
275
+ # Find fields that were auto-added by schema coercion (missing from original user data)
276
+ # These should be treated as missing fields and filled from old_records
277
+ auto_added_null_fields = set()
278
+
279
+ # Use definitive information about which fields were originally provided
280
+ # Any field that exists in both tables but was NOT in the original user data
281
+ # should be treated as auto-added by schema coercion
282
+ for field_name in old_fields & new_fields: # Fields that exist in both
283
+ if field_name not in original_fields:
284
+ auto_added_null_fields.add(field_name)
285
+
286
+ # Combine missing fields with auto-added null fields
287
+ fields_to_fill = missing_fields | auto_added_null_fields
288
+
289
+ # Start with new_records and add missing fields from old_records
290
+ result_columns = {}
291
+
292
+ # Copy all existing columns from new_records
293
+ for column_name in new_records.column_names:
294
+ result_columns[column_name] = new_records[column_name]
295
+
296
+ # Fill in missing/auto-added null fields with values from old_records
297
+ for field_name in fields_to_fill:
298
+ # For missing fields, use the old values entirely
299
+ result_columns[field_name] = old_records[field_name]
300
+
301
+ # Create the enhanced new_records table with all fields filled
302
+ enhanced_new_records = pa.table(result_columns)
303
+
304
+ # Now we can return the enhanced table - it has all the fields with proper values
305
+ # Missing fields are filled with old values, explicitly null fields remain null
306
+ return enhanced_new_records
307
+
308
+
215
309
  def _validate_bucketing_spec_compliance(
216
310
  table: pa.Table,
217
311
  num_buckets: int,
218
312
  hb_index: int,
219
313
  primary_keys: List[str],
220
- rcf: RoundCompletionInfo = None,
314
+ rci: Optional[RoundCompletionInfo] = None,
221
315
  log_prefix=None,
222
316
  ) -> None:
223
- if rcf is not None:
224
- message_prefix = f"{log_prefix}{rcf.compacted_delta_locator.namespace}.{rcf.compacted_delta_locator.table_name}.{rcf.compacted_delta_locator.table_version}.{rcf.compacted_delta_locator.partition_id}.{rcf.compacted_delta_locator.partition_values}"
317
+ if rci is not None:
318
+ message_prefix = f"{log_prefix}{rci.compacted_delta_locator.namespace}.{rci.compacted_delta_locator.table_name}.{rci.compacted_delta_locator.table_version}.{rci.compacted_delta_locator.partition_id}.{rci.compacted_delta_locator.partition_values}"
225
319
  else:
226
320
  message_prefix = f"{log_prefix}"
227
321
  pki_table = generate_pk_hash_column(
@@ -251,14 +345,16 @@ def _validate_bucketing_spec_compliance(
251
345
 
252
346
  def _download_compacted_table(
253
347
  hb_index: int,
254
- rcf: RoundCompletionInfo,
348
+ rci: RoundCompletionInfo,
255
349
  primary_keys: List[str],
350
+ all_column_names: List[str],
351
+ compacted_delta_manifest: Optional[Manifest] = None,
256
352
  read_kwargs_provider: Optional[ReadKwargsProvider] = None,
257
- deltacat_storage=unimplemented_deltacat_storage,
353
+ deltacat_storage: metastore = metastore,
258
354
  deltacat_storage_kwargs: Optional[dict] = None,
259
355
  ) -> pa.Table:
260
356
  tables = []
261
- hb_index_to_indices = rcf.hb_index_to_entry_range
357
+ hb_index_to_indices = rci.hb_index_to_entry_range
262
358
 
263
359
  if str(hb_index) not in hb_index_to_indices:
264
360
  return None
@@ -268,9 +364,16 @@ def _download_compacted_table(
268
364
  ), "indices should not be none and contains exactly two elements"
269
365
  for offset in range(indices[1] - indices[0]):
270
366
  table = deltacat_storage.download_delta_manifest_entry(
271
- rcf.compacted_delta_locator,
367
+ Delta.of(
368
+ rci.compacted_delta_locator,
369
+ DeltaType.APPEND,
370
+ compacted_delta_manifest.meta,
371
+ None,
372
+ compacted_delta_manifest,
373
+ ),
272
374
  entry_index=(indices[0] + offset),
273
375
  file_reader_kwargs_provider=read_kwargs_provider,
376
+ all_column_names=all_column_names,
274
377
  **deltacat_storage_kwargs,
275
378
  )
276
379
 
@@ -291,10 +394,10 @@ def _download_compacted_table(
291
394
  if primary_keys and check_bucketing_spec:
292
395
  _validate_bucketing_spec_compliance(
293
396
  compacted_table,
294
- rcf.hash_bucket_count,
397
+ rci.hash_bucket_count,
295
398
  hb_index,
296
399
  primary_keys,
297
- rcf=rcf,
400
+ rci=rci,
298
401
  log_prefix=_EXISTING_VARIANT_LOG_PREFIX,
299
402
  )
300
403
  return compacted_table
@@ -304,15 +407,9 @@ def _copy_all_manifest_files_from_old_hash_buckets(
304
407
  hb_index_copy_by_reference: List[int],
305
408
  round_completion_info: RoundCompletionInfo,
306
409
  write_to_partition: Partition,
307
- deltacat_storage=unimplemented_deltacat_storage,
308
- deltacat_storage_kwargs: Optional[dict] = None,
410
+ compacted_manifest: Optional[Manifest] = None,
309
411
  ) -> List[MaterializeResult]:
310
412
 
311
- compacted_delta_locator = round_completion_info.compacted_delta_locator
312
- manifest = deltacat_storage.get_delta_manifest(
313
- compacted_delta_locator, **deltacat_storage_kwargs
314
- )
315
-
316
413
  manifest_entry_referenced_list = []
317
414
  materialize_result_list = []
318
415
  hb_index_to_indices = round_completion_info.hb_index_to_entry_range
@@ -329,27 +426,27 @@ def _copy_all_manifest_files_from_old_hash_buckets(
329
426
  for offset in range(indices[1] - indices[0]):
330
427
  entry_index = indices[0] + offset
331
428
  assert entry_index < len(
332
- manifest.entries
333
- ), f"entry index: {entry_index} >= {len(manifest.entries)}"
334
- manifest_entry = manifest.entries[entry_index]
429
+ compacted_manifest.entries
430
+ ), f"entry index: {entry_index} >= {len(compacted_manifest.entries)}"
431
+ manifest_entry = compacted_manifest.entries[entry_index]
335
432
  manifest_entry_referenced_list.append(manifest_entry)
336
433
 
337
- manifest = Manifest.of(
434
+ compacted_manifest = Manifest.of(
338
435
  entries=manifest_entry_referenced_list, uuid=str(uuid4())
339
436
  )
340
437
  delta = Delta.of(
341
438
  locator=DeltaLocator.of(write_to_partition.locator),
342
- delta_type=DeltaType.UPSERT,
343
- meta=manifest.meta,
344
- manifest=manifest,
439
+ delta_type=DeltaType.APPEND, # Compaction always produces APPEND deltas
440
+ meta=compacted_manifest.meta,
441
+ manifest=compacted_manifest,
345
442
  previous_stream_position=write_to_partition.stream_position,
346
443
  properties={},
347
444
  )
348
445
  referenced_pyarrow_write_result = PyArrowWriteResult.of(
349
446
  len(manifest_entry_referenced_list),
350
- manifest.meta.source_content_length,
351
- manifest.meta.content_length,
352
- manifest.meta.record_count,
447
+ compacted_manifest.meta.source_content_length,
448
+ compacted_manifest.meta.content_length,
449
+ compacted_manifest.meta.record_count,
353
450
  )
354
451
  materialize_result = MaterializeResult.of(
355
452
  delta=delta,
@@ -374,6 +471,7 @@ def _has_previous_compacted_table(input: MergeInput, hb_idx: int) -> bool:
374
471
  """
375
472
  return (
376
473
  input.round_completion_info
474
+ and input.compacted_manifest is not None
377
475
  and input.round_completion_info.hb_index_to_entry_range
378
476
  and input.round_completion_info.hb_index_to_entry_range.get(str(hb_idx))
379
477
  is not None
@@ -391,6 +489,7 @@ def _can_copy_by_reference(
391
489
  not has_delete
392
490
  and not merge_file_group.dfe_groups
393
491
  and input.round_completion_info is not None
492
+ and input.compacted_manifest is not None
394
493
  )
395
494
 
396
495
  if input.disable_copy_by_reference:
@@ -489,9 +588,9 @@ def _compact_tables(
489
588
  delete_file_envelopes + df_envelopes
490
589
  )
491
590
  assert all(
492
- dfe.delta_type in (DeltaType.UPSERT, DeltaType.DELETE)
591
+ dfe.delta_type in (DeltaType.APPEND, DeltaType.UPSERT, DeltaType.DELETE)
493
592
  for dfe in reordered_all_dfes
494
- ), "All reordered delta file envelopes must be of the UPSERT or DELETE"
593
+ ), "All reordered delta file envelopes must be of the APPEND, UPSERT or DELETE"
495
594
  table = compacted_table
496
595
  aggregated_incremental_len = 0
497
596
  aggregated_deduped_records = 0
@@ -499,7 +598,7 @@ def _compact_tables(
499
598
  for i, (delta_type, delta_type_sequence) in enumerate(
500
599
  _group_sequence_by_delta_type(reordered_all_dfes)
501
600
  ):
502
- if delta_type is DeltaType.UPSERT:
601
+ if delta_type is DeltaType.UPSERT or delta_type is DeltaType.APPEND:
503
602
  (table, incremental_len, deduped_records, merge_time,) = _apply_upserts(
504
603
  input=input,
505
604
  dfe_list=delta_type_sequence,
@@ -540,8 +639,9 @@ def _apply_upserts(
540
639
  prev_table=None,
541
640
  ) -> Tuple[pa.Table, int, int, int]:
542
641
  assert all(
543
- dfe.delta_type is DeltaType.UPSERT for dfe in dfe_list
544
- ), "All incoming delta file envelopes must of the DeltaType.UPSERT"
642
+ dfe.delta_type is DeltaType.UPSERT or dfe.delta_type is DeltaType.APPEND
643
+ for dfe in dfe_list
644
+ ), "All incoming delta file envelopes must of the DeltaType.UPSERT or DeltaType.APPEND"
545
645
  logger.info(
546
646
  f"[Hash bucket index {hb_idx}] Reading dedupe input for "
547
647
  f"{len(dfe_list)} delta file envelope lists..."
@@ -556,16 +656,19 @@ def _apply_upserts(
556
656
  # on non event based sort key does not produce consistent
557
657
  # compaction results. E.g., compaction(delta1, delta2, delta3)
558
658
  # will not be equal to compaction(compaction(delta1, delta2), delta3).
559
- table = table.sort_by(input.sort_keys)
659
+ table = table.sort_by(
660
+ [pa_key for key in input.sort_keys for pa_key in key.arrow]
661
+ )
560
662
  hb_table_record_count = len(table) + (len(prev_table) if prev_table else 0)
561
663
  table, merge_time = timed_invocation(
562
664
  func=_merge_tables,
563
665
  table=table,
564
666
  primary_keys=input.primary_keys,
565
667
  can_drop_duplicates=input.drop_duplicates,
566
- compacted_table=prev_table,
567
668
  hb_index=hb_idx,
568
669
  num_buckets=input.hash_bucket_count,
670
+ original_fields=input.original_fields,
671
+ compacted_table=prev_table,
569
672
  )
570
673
  deduped_records = hb_table_record_count - len(table)
571
674
  return table, incremental_len, deduped_records, merge_time
@@ -582,8 +685,7 @@ def _copy_manifests_from_hash_bucketing(
582
685
  hb_index_copy_by_reference_ids,
583
686
  input.round_completion_info,
584
687
  input.write_to_partition,
585
- input.deltacat_storage,
586
- input.deltacat_storage_kwargs,
688
+ input.compacted_manifest,
587
689
  )
588
690
  )
589
691
  logger.info(
@@ -623,12 +725,13 @@ def _timed_merge(input: MergeInput) -> MergeResult:
623
725
  ):
624
726
  hb_index_copy_by_ref_ids.append(merge_file_group.hb_index)
625
727
  continue
626
-
627
728
  if _has_previous_compacted_table(input, merge_file_group.hb_index):
628
729
  compacted_table = _download_compacted_table(
629
730
  hb_index=merge_file_group.hb_index,
630
- rcf=input.round_completion_info,
731
+ rci=input.round_completion_info,
631
732
  primary_keys=input.primary_keys,
733
+ all_column_names=input.all_column_names,
734
+ compacted_delta_manifest=input.compacted_manifest,
632
735
  read_kwargs_provider=input.read_kwargs_provider,
633
736
  deltacat_storage=input.deltacat_storage,
634
737
  deltacat_storage_kwargs=input.deltacat_storage_kwargs,
@@ -1,6 +1,7 @@
1
1
  import logging
2
2
  import ray
3
3
  import functools
4
+ from typing import List
4
5
  from deltacat.compute.compactor_v2.constants import (
5
6
  TASK_MAX_PARALLELISM,
6
7
  MAX_PARQUET_METADATA_SIZE,
@@ -11,10 +12,10 @@ from deltacat import logs
11
12
  from deltacat.storage import (
12
13
  Delta,
13
14
  ManifestEntry,
14
- interface as unimplemented_deltacat_storage,
15
+ metastore,
15
16
  )
16
17
  from typing import Dict, Optional, Any
17
- from deltacat.types.media import TableType
18
+ from deltacat.types.media import DatasetType
18
19
  from deltacat.types.media import ContentType
19
20
  from deltacat.types.partial_download import PartialParquetParameters
20
21
  from deltacat.exceptions import RetryableError
@@ -74,7 +75,8 @@ class AppendContentTypeParamsCache:
74
75
  def _download_parquet_metadata_for_manifest_entry(
75
76
  delta: Delta,
76
77
  entry_index: int,
77
- deltacat_storage: unimplemented_deltacat_storage,
78
+ all_column_names: List[str],
79
+ deltacat_storage: metastore,
78
80
  deltacat_storage_kwargs: Optional[Dict[Any, Any]] = {},
79
81
  file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
80
82
  ) -> Dict[str, Any]:
@@ -86,11 +88,13 @@ def _download_parquet_metadata_for_manifest_entry(
86
88
  "'file_reader_kwargs_provider' is also present in deltacat_storage_kwargs. Removing to prevent multiple values for keyword argument"
87
89
  )
88
90
  deltacat_storage_kwargs.pop("file_reader_kwargs_provider")
91
+
89
92
  pq_file = deltacat_storage.download_delta_manifest_entry(
90
93
  delta,
91
94
  entry_index=entry_index,
92
- table_type=TableType.PYARROW_PARQUET,
95
+ table_type=DatasetType.PYARROW_PARQUET,
93
96
  file_reader_kwargs_provider=file_reader_kwargs_provider,
97
+ all_column_names=all_column_names,
94
98
  **deltacat_storage_kwargs,
95
99
  )
96
100
 
@@ -104,9 +108,10 @@ def _download_parquet_metadata_for_manifest_entry(
104
108
 
105
109
  def append_content_type_params(
106
110
  delta: Delta,
111
+ all_column_names: List[str],
107
112
  task_max_parallelism: int = TASK_MAX_PARALLELISM,
108
113
  max_parquet_meta_size_bytes: Optional[int] = MAX_PARQUET_METADATA_SIZE,
109
- deltacat_storage=unimplemented_deltacat_storage,
114
+ deltacat_storage: metastore = metastore,
110
115
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
111
116
  file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
112
117
  ) -> bool:
@@ -172,13 +177,19 @@ def append_content_type_params(
172
177
  max_parquet_meta_size_bytes=max_parquet_meta_size_bytes,
173
178
  )
174
179
 
180
+ # create a copy of deltacat_storage_kwargs without transaction key
181
+ deltacat_storage_kwargs_copy = {
182
+ k: v for k, v in deltacat_storage_kwargs.items() if k != "transaction"
183
+ }
184
+
175
185
  def input_provider(index, item) -> Dict:
176
186
  return {
177
187
  "file_reader_kwargs_provider": file_reader_kwargs_provider,
178
- "deltacat_storage_kwargs": deltacat_storage_kwargs,
188
+ "deltacat_storage_kwargs": deltacat_storage_kwargs_copy,
179
189
  "deltacat_storage": deltacat_storage,
180
190
  "delta": delta,
181
191
  "entry_index": item,
192
+ "all_column_names": all_column_names,
182
193
  }
183
194
 
184
195
  logger.info(
@@ -9,7 +9,7 @@ from deltacat.storage import (
9
9
  Delta,
10
10
  )
11
11
  from deltacat.storage.model.delta import DeltaType
12
- from deltacat.storage import interface as unimplemented_deltacat_storage
12
+ from deltacat.storage import metastore
13
13
  from deltacat.types.media import StorageType
14
14
  from deltacat.utils.common import ReadKwargsProvider
15
15
  from deltacat import logs
@@ -30,8 +30,9 @@ def contains_delete_deltas(deltas: List[Delta]) -> bool:
30
30
 
31
31
  def read_delta_file_envelopes(
32
32
  annotated_delta: DeltaAnnotated,
33
+ all_column_names: List[str],
33
34
  read_kwargs_provider: Optional[ReadKwargsProvider],
34
- deltacat_storage=unimplemented_deltacat_storage,
35
+ deltacat_storage: metastore = metastore,
35
36
  deltacat_storage_kwargs: Optional[dict] = None,
36
37
  ) -> Tuple[Optional[List[DeltaFileEnvelope]], int, int]:
37
38
  tables = deltacat_storage.download_delta(
@@ -39,6 +40,7 @@ def read_delta_file_envelopes(
39
40
  max_parallelism=1,
40
41
  file_reader_kwargs_provider=read_kwargs_provider,
41
42
  storage_type=StorageType.LOCAL,
43
+ all_column_names=all_column_names,
42
44
  **deltacat_storage_kwargs,
43
45
  )
44
46
  annotations = annotated_delta.annotations
@@ -80,7 +82,7 @@ def read_delta_file_envelopes(
80
82
  def get_local_delta_file_envelopes(
81
83
  uniform_deltas: List[DeltaAnnotated],
82
84
  read_kwargs_provider: Optional[ReadKwargsProvider],
83
- deltacat_storage=unimplemented_deltacat_storage,
85
+ deltacat_storage=metastore,
84
86
  deltacat_storage_kwargs: Optional[dict] = None,
85
87
  ) -> Tuple[List[DeltaFileEnvelope], int]:
86
88
  local_dfe_list = []
@@ -3,7 +3,7 @@ import functools
3
3
  from deltacat.storage import (
4
4
  PartitionLocator,
5
5
  Delta,
6
- interface as unimplemented_deltacat_storage,
6
+ metastore,
7
7
  )
8
8
  from deltacat import logs
9
9
  from deltacat.compute.compactor.utils import io as io_v1
@@ -38,7 +38,7 @@ def discover_deltas(
38
38
  rebase_source_partition_locator: Optional[PartitionLocator] = None,
39
39
  rebase_source_partition_high_watermark: Optional[int] = None,
40
40
  rcf_high_watermark: Optional[int] = None,
41
- deltacat_storage=unimplemented_deltacat_storage,
41
+ deltacat_storage=metastore,
42
42
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
43
43
  list_deltas_kwargs: Optional[Dict[str, Any]] = {},
44
44
  ) -> List[Delta]:
@@ -67,6 +67,11 @@ def discover_deltas(
67
67
  f"Length of input deltas from delta source table is {len(delta_source_incremental_deltas)}"
68
68
  f" from ({previous_compacted_high_watermark}, {last_stream_position_to_compact}]"
69
69
  )
70
+ logger.info(f"DEBUG: source_partition_locator = {source_partition_locator}")
71
+ logger.info(
72
+ f"DEBUG: source_partition_locator.partition_id = {getattr(source_partition_locator, 'partition_id', 'NO_PARTITION_ID')}"
73
+ )
74
+ logger.info(f"DEBUG: total input deltas found = {len(result)}")
70
75
 
71
76
  if rebase_source_partition_locator:
72
77
  previous_compacted_deltas = io_v1._discover_deltas(
@@ -93,7 +98,8 @@ def create_uniform_input_deltas(
93
98
  hash_bucket_count: int,
94
99
  compaction_audit: CompactionSessionAuditInfo,
95
100
  compact_partition_params: CompactPartitionParams,
96
- deltacat_storage=unimplemented_deltacat_storage,
101
+ all_column_names: List[str],
102
+ deltacat_storage=metastore,
97
103
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
98
104
  ) -> List[DeltaAnnotated]:
99
105
 
@@ -113,6 +119,7 @@ def create_uniform_input_deltas(
113
119
  )
114
120
  append_content_type_params(
115
121
  delta=delta,
122
+ all_column_names=all_column_names,
116
123
  deltacat_storage=deltacat_storage,
117
124
  deltacat_storage_kwargs=deltacat_storage_kwargs,
118
125
  task_max_parallelism=compact_partition_params.task_max_parallelism,
@@ -23,6 +23,7 @@ from deltacat.types.tables import TABLE_CLASS_TO_SIZE_FUNC
23
23
 
24
24
  from deltacat.utils.performance import timed_invocation
25
25
  from deltacat.storage import (
26
+ DeltaType,
26
27
  Partition,
27
28
  )
28
29
  from deltacat.compute.compactor_v2.deletes.delete_strategy import (
@@ -47,13 +48,21 @@ def materialize(
47
48
  # TODO (pdames): compare performance to pandas-native materialize path
48
49
  df = compacted_table.to_pandas(split_blocks=True, self_destruct=True)
49
50
  compacted_table = df
51
+ # Extract schema from table_writer_kwargs to pass as direct parameter
52
+ # This ensures schema_id is properly set in the manifest
53
+ schema = None
54
+ if input.table_writer_kwargs and "schema" in input.table_writer_kwargs:
55
+ schema = input.table_writer_kwargs["schema"]
56
+
50
57
  delta, stage_delta_time = timed_invocation(
51
58
  input.deltacat_storage.stage_delta,
52
59
  compacted_table,
53
60
  input.write_to_partition,
61
+ delta_type=DeltaType.APPEND, # Compaction always produces APPEND deltas
54
62
  max_records_per_entry=input.max_records_per_output_file,
55
63
  content_type=input.compacted_file_content_type,
56
- s3_table_writer_kwargs=input.s3_table_writer_kwargs,
64
+ schema=schema, # Pass schema as direct parameter for schema_id extraction
65
+ table_writer_kwargs=input.table_writer_kwargs,
57
66
  **input.deltacat_storage_kwargs,
58
67
  )
59
68
  compacted_table_size = TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](
@@ -112,6 +121,7 @@ def generate_local_merge_input(
112
121
  return MergeInput.of(
113
122
  merge_file_groups_provider=LocalMergeFileGroupsProvider(
114
123
  annotated_deltas,
124
+ all_column_names=params.all_column_names,
115
125
  read_kwargs_provider=params.read_kwargs_provider,
116
126
  deltacat_storage=params.deltacat_storage,
117
127
  deltacat_storage_kwargs=params.deltacat_storage_kwargs,
@@ -119,12 +129,13 @@ def generate_local_merge_input(
119
129
  write_to_partition=compacted_partition,
120
130
  compacted_file_content_type=params.compacted_file_content_type,
121
131
  primary_keys=params.primary_keys,
132
+ all_column_names=params.all_column_names,
122
133
  sort_keys=params.sort_keys,
123
134
  drop_duplicates=params.drop_duplicates,
124
135
  max_records_per_output_file=params.records_per_compacted_file,
125
136
  enable_profiler=params.enable_profiler,
126
137
  metrics_config=params.metrics_config,
127
- s3_table_writer_kwargs=params.s3_table_writer_kwargs,
138
+ table_writer_kwargs=params.table_writer_kwargs,
128
139
  read_kwargs_provider=params.read_kwargs_provider,
129
140
  round_completion_info=round_completion_info,
130
141
  object_store=params.object_store,
@@ -134,4 +145,5 @@ def generate_local_merge_input(
134
145
  delete_file_envelopes=delete_file_envelopes,
135
146
  disable_copy_by_reference=params.disable_copy_by_reference,
136
147
  hash_bucket_count=params.hash_bucket_count,
148
+ original_fields=params.original_fields,
137
149
  )
@@ -11,7 +11,7 @@ from deltacat.compute.compactor_v2.model.merge_file_group import (
11
11
  from deltacat.storage import (
12
12
  Manifest,
13
13
  ManifestEntry,
14
- interface as unimplemented_deltacat_storage,
14
+ metastore,
15
15
  )
16
16
  from deltacat.compute.compactor.model.delta_annotated import DeltaAnnotated
17
17
  from deltacat.compute.compactor.model.round_completion_info import RoundCompletionInfo
@@ -77,8 +77,6 @@ def _get_merge_task_options(
77
77
  round_completion_info: Optional[RoundCompletionInfo] = None,
78
78
  compacted_delta_manifest: Optional[Manifest] = None,
79
79
  primary_keys: Optional[List[str]] = None,
80
- deltacat_storage=unimplemented_deltacat_storage,
81
- deltacat_storage_kwargs: Optional[Dict] = {},
82
80
  memory_logs_enabled: Optional[bool] = None,
83
81
  ) -> Dict[str, Any]:
84
82
  if (
@@ -275,8 +273,6 @@ def merge_resource_options_provider(
275
273
  compacted_delta_manifest: Optional[Manifest] = None,
276
274
  ray_custom_resources: Optional[Dict] = None,
277
275
  primary_keys: Optional[List[str]] = None,
278
- deltacat_storage=unimplemented_deltacat_storage,
279
- deltacat_storage_kwargs: Optional[Dict] = {},
280
276
  memory_logs_enabled: Optional[bool] = None,
281
277
  **kwargs,
282
278
  ) -> Dict:
@@ -306,8 +302,6 @@ def merge_resource_options_provider(
306
302
  round_completion_info=round_completion_info,
307
303
  compacted_delta_manifest=compacted_delta_manifest,
308
304
  primary_keys=primary_keys,
309
- deltacat_storage=deltacat_storage,
310
- deltacat_storage_kwargs=deltacat_storage_kwargs,
311
305
  memory_logs_enabled=memory_logs_enabled,
312
306
  estimate_resources_params=estimate_resources_params,
313
307
  )
@@ -322,7 +316,7 @@ def local_merge_resource_options_provider(
322
316
  compacted_delta_manifest: Optional[Manifest] = None,
323
317
  ray_custom_resources: Optional[Dict] = None,
324
318
  primary_keys: Optional[List[str]] = None,
325
- deltacat_storage=unimplemented_deltacat_storage,
319
+ deltacat_storage=metastore,
326
320
  deltacat_storage_kwargs: Optional[Dict] = {},
327
321
  memory_logs_enabled: Optional[bool] = None,
328
322
  **kwargs,
@@ -348,8 +342,6 @@ def local_merge_resource_options_provider(
348
342
  round_completion_info=round_completion_info,
349
343
  compacted_delta_manifest=compacted_delta_manifest,
350
344
  primary_keys=primary_keys,
351
- deltacat_storage=deltacat_storage,
352
- deltacat_storage_kwargs=deltacat_storage_kwargs,
353
345
  memory_logs_enabled=memory_logs_enabled,
354
346
  estimate_resources_params=estimate_resources_params,
355
347
  )