deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324)
  1. deltacat/__init__.py +117 -18
  2. deltacat/api.py +536 -126
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +1 -19
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2444 -282
  12. deltacat/catalog/model/catalog.py +208 -113
  13. deltacat/catalog/model/properties.py +63 -24
  14. deltacat/compute/__init__.py +14 -0
  15. deltacat/compute/compactor/compaction_session.py +97 -75
  16. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  17. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  18. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  19. deltacat/compute/compactor/repartition_session.py +8 -21
  20. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  21. deltacat/compute/compactor/steps/materialize.py +9 -7
  22. deltacat/compute/compactor/steps/repartition.py +12 -11
  23. deltacat/compute/compactor/utils/io.py +6 -5
  24. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  25. deltacat/compute/compactor/utils/system_columns.py +3 -1
  26. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  27. deltacat/compute/compactor_v2/constants.py +30 -1
  28. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  29. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  30. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  31. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  32. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  33. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  34. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  35. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  36. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  37. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  38. deltacat/compute/compactor_v2/utils/io.py +11 -4
  39. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  40. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  41. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  42. deltacat/compute/converter/constants.py +5 -0
  43. deltacat/compute/converter/converter_session.py +207 -52
  44. deltacat/compute/converter/model/convert_input.py +43 -16
  45. deltacat/compute/converter/model/convert_input_files.py +33 -16
  46. deltacat/compute/converter/model/convert_result.py +80 -0
  47. deltacat/compute/converter/model/converter_session_params.py +64 -19
  48. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  49. deltacat/compute/converter/pyiceberg/overrides.py +193 -65
  50. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  51. deltacat/compute/converter/steps/convert.py +230 -75
  52. deltacat/compute/converter/steps/dedupe.py +46 -12
  53. deltacat/compute/converter/utils/convert_task_options.py +66 -22
  54. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  55. deltacat/compute/converter/utils/iceberg_columns.py +13 -8
  56. deltacat/compute/converter/utils/io.py +173 -13
  57. deltacat/compute/converter/utils/s3u.py +42 -27
  58. deltacat/compute/janitor.py +205 -0
  59. deltacat/compute/jobs/client.py +417 -0
  60. deltacat/compute/resource_estimation/delta.py +38 -6
  61. deltacat/compute/resource_estimation/model.py +8 -0
  62. deltacat/constants.py +49 -6
  63. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  64. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  65. deltacat/env.py +10 -0
  66. deltacat/examples/basic_logging.py +6 -6
  67. deltacat/examples/compactor/aws/__init__.py +1 -0
  68. deltacat/examples/compactor/bootstrap.py +863 -0
  69. deltacat/examples/compactor/compactor.py +373 -0
  70. deltacat/examples/compactor/explorer.py +473 -0
  71. deltacat/examples/compactor/gcp/__init__.py +1 -0
  72. deltacat/examples/compactor/job_runner.py +439 -0
  73. deltacat/examples/compactor/utils/__init__.py +1 -0
  74. deltacat/examples/compactor/utils/common.py +261 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  80. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  81. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
  82. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  83. deltacat/examples/hello_world.py +4 -2
  84. deltacat/examples/indexer/indexer.py +163 -0
  85. deltacat/examples/indexer/job_runner.py +198 -0
  86. deltacat/exceptions.py +66 -4
  87. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  88. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  89. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
  90. deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
  91. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  92. deltacat/experimental/converter_agent/__init__.py +0 -0
  93. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  94. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  95. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  96. deltacat/experimental/daft/__init__.py +4 -0
  97. deltacat/experimental/daft/daft_catalog.py +229 -0
  98. deltacat/experimental/storage/__init__.py +0 -0
  99. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  100. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  101. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  102. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  103. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  104. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  105. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  107. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  108. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  109. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  110. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  111. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  112. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  113. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  114. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  115. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  116. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  117. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  118. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  119. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  120. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  121. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  122. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  123. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  124. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  125. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  126. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  127. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  128. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  129. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  130. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  131. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  132. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  133. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  134. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  135. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  136. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  137. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  138. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  139. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  140. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  141. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  142. deltacat/io/__init__.py +13 -0
  143. deltacat/io/dataset/__init__.py +0 -0
  144. deltacat/io/dataset/deltacat_dataset.py +91 -0
  145. deltacat/io/datasink/__init__.py +0 -0
  146. deltacat/io/datasink/deltacat_datasink.py +207 -0
  147. deltacat/io/datasource/__init__.py +0 -0
  148. deltacat/io/datasource/deltacat_datasource.py +579 -0
  149. deltacat/io/reader/__init__.py +0 -0
  150. deltacat/io/reader/deltacat_read_api.py +172 -0
  151. deltacat/storage/__init__.py +22 -2
  152. deltacat/storage/interface.py +54 -32
  153. deltacat/storage/main/impl.py +1494 -541
  154. deltacat/storage/model/delta.py +27 -3
  155. deltacat/storage/model/expression/__init__.py +47 -0
  156. deltacat/storage/model/expression/expression.py +656 -0
  157. deltacat/storage/model/expression/visitor.py +248 -0
  158. deltacat/storage/model/locator.py +6 -12
  159. deltacat/storage/model/manifest.py +231 -6
  160. deltacat/storage/model/metafile.py +224 -119
  161. deltacat/storage/model/namespace.py +8 -1
  162. deltacat/storage/model/partition.py +117 -42
  163. deltacat/storage/model/scan/push_down.py +32 -5
  164. deltacat/storage/model/schema.py +2427 -159
  165. deltacat/storage/model/shard.py +6 -2
  166. deltacat/storage/model/sort_key.py +40 -0
  167. deltacat/storage/model/stream.py +9 -2
  168. deltacat/storage/model/table.py +12 -1
  169. deltacat/storage/model/table_version.py +11 -0
  170. deltacat/storage/model/transaction.py +1184 -208
  171. deltacat/storage/model/transform.py +81 -2
  172. deltacat/storage/model/types.py +53 -29
  173. deltacat/storage/util/__init__.py +0 -0
  174. deltacat/storage/util/scan_planner.py +26 -0
  175. deltacat/tests/_io/reader/__init__.py +0 -0
  176. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  177. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  178. deltacat/tests/aws/test_s3u.py +2 -31
  179. deltacat/tests/catalog/data/__init__.py +0 -0
  180. deltacat/tests/catalog/main/__init__.py +0 -0
  181. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  182. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  183. deltacat/tests/catalog/model/__init__.py +0 -0
  184. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  185. deltacat/tests/catalog/test_catalogs.py +103 -106
  186. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  187. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  188. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  189. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  190. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  191. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  192. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  193. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  194. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  195. deltacat/tests/compute/conftest.py +8 -44
  196. deltacat/tests/compute/converter/test_convert_session.py +697 -349
  197. deltacat/tests/compute/converter/utils.py +15 -6
  198. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  199. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  200. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  201. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  202. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  203. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  204. deltacat/tests/compute/test_janitor.py +236 -0
  205. deltacat/tests/compute/test_util_common.py +716 -43
  206. deltacat/tests/compute/test_util_constant.py +0 -1
  207. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  208. deltacat/tests/daft/__init__.py +0 -0
  209. deltacat/tests/daft/test_model.py +97 -0
  210. deltacat/tests/experimental/__init__.py +1 -0
  211. deltacat/tests/experimental/catalog/__init__.py +0 -0
  212. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  213. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  214. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  215. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  216. deltacat/tests/experimental/daft/__init__.py +0 -0
  217. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  218. deltacat/tests/experimental/storage/__init__.py +0 -0
  219. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  220. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  221. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  222. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  223. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  224. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  225. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  226. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  227. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  228. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  229. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  230. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  231. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  232. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  233. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  234. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  235. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  236. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  237. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  238. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  239. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  240. deltacat/tests/storage/model/test_expression.py +327 -0
  241. deltacat/tests/storage/model/test_manifest.py +129 -0
  242. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  243. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  244. deltacat/tests/storage/model/test_schema.py +171 -0
  245. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  246. deltacat/tests/storage/model/test_shard.py +3 -1
  247. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  248. deltacat/tests/storage/model/test_transaction.py +393 -48
  249. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  250. deltacat/tests/test_deltacat_api.py +1036 -11
  251. deltacat/tests/test_exceptions.py +9 -5
  252. deltacat/tests/test_utils/pyarrow.py +52 -21
  253. deltacat/tests/test_utils/storage.py +23 -34
  254. deltacat/tests/types/__init__.py +0 -0
  255. deltacat/tests/types/test_tables.py +104 -0
  256. deltacat/tests/utils/exceptions.py +22 -0
  257. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  258. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  259. deltacat/tests/utils/test_daft.py +121 -31
  260. deltacat/tests/utils/test_numpy.py +1193 -0
  261. deltacat/tests/utils/test_pandas.py +1106 -0
  262. deltacat/tests/utils/test_polars.py +1040 -0
  263. deltacat/tests/utils/test_pyarrow.py +1370 -89
  264. deltacat/types/media.py +345 -37
  265. deltacat/types/tables.py +2344 -46
  266. deltacat/utils/arguments.py +33 -1
  267. deltacat/utils/daft.py +824 -40
  268. deltacat/utils/export.py +3 -1
  269. deltacat/utils/filesystem.py +139 -9
  270. deltacat/utils/metafile_locator.py +2 -1
  271. deltacat/utils/numpy.py +118 -26
  272. deltacat/utils/pandas.py +577 -48
  273. deltacat/utils/polars.py +759 -0
  274. deltacat/utils/pyarrow.py +1373 -192
  275. deltacat/utils/ray_utils/concurrency.py +1 -1
  276. deltacat/utils/ray_utils/dataset.py +101 -10
  277. deltacat/utils/ray_utils/runtime.py +56 -4
  278. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  279. deltacat/utils/url.py +1325 -0
  280. deltacat-2.0.0.dist-info/METADATA +1163 -0
  281. deltacat-2.0.0.dist-info/RECORD +439 -0
  282. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  283. deltacat/catalog/iceberg/__init__.py +0 -4
  284. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  285. deltacat/compute/merge_on_read/__init__.py +0 -4
  286. deltacat/compute/merge_on_read/daft.py +0 -40
  287. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  288. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  289. deltacat/examples/common/fixtures.py +0 -15
  290. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  291. deltacat/storage/rivulet/__init__.py +0 -11
  292. deltacat/storage/rivulet/feather/__init__.py +0 -5
  293. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  294. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  295. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  296. deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
  297. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  298. deltacat/utils/s3fs.py +0 -21
  299. deltacat-2.0.dist-info/METADATA +0 -65
  300. deltacat-2.0.dist-info/RECORD +0 -347
  301. /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
  302. /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
  303. /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
  304. /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
  305. /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
  306. /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
  307. /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
  308. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
  309. /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
  310. /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  311. /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
  312. /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
  313. /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  314. /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
  315. /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
  316. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
  317. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  318. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  319. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  320. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  321. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  322. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  323. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  324. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -21,14 +21,13 @@ from deltacat.utils.placement import PlacementGroupConfig
21
21
  from typing import List, Optional, Dict, Any
22
22
  from deltacat.utils.ray_utils.runtime import live_node_resource_keys
23
23
  from deltacat.compute.compactor.utils import io
24
- from deltacat.compute.compactor.utils import round_completion_file as rcf
25
24
  from deltacat.compute.compactor.steps import repartition as repar
26
25
  from deltacat.compute.compactor.steps.repartition import RepartitionType
27
26
  from deltacat.storage import (
28
27
  Delta,
29
28
  DeltaLocator,
30
29
  PartitionLocator,
31
- interface as unimplemented_deltacat_storage,
30
+ metastore,
32
31
  )
33
32
  from deltacat.utils.metrics import MetricsConfig
34
33
  from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
@@ -41,7 +40,6 @@ def repartition(
41
40
  source_partition_locator: PartitionLocator,
42
41
  destination_partition_locator: PartitionLocator,
43
42
  repartition_args: Any,
44
- repartition_completion_file_s3_url: str,
45
43
  last_stream_position_to_compact: int,
46
44
  repartition_type: RepartitionType = RepartitionType.RANGE,
47
45
  sort_keys: List[SortKey] = None,
@@ -54,9 +52,8 @@ def repartition(
54
52
  pg_config: Optional[PlacementGroupConfig] = None,
55
53
  list_deltas_kwargs: Optional[Dict[str, Any]] = None,
56
54
  read_kwargs_provider: Optional[ReadKwargsProvider] = None,
57
- s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
58
- s3_client_kwargs: Optional[Dict[str, Any]] = None,
59
- deltacat_storage=unimplemented_deltacat_storage,
55
+ table_writer_kwargs: Optional[Dict[str, Any]] = None,
56
+ deltacat_storage=metastore,
60
57
  **kwargs,
61
58
  ) -> Optional[str]:
62
59
 
@@ -132,7 +129,7 @@ def repartition(
132
129
  enable_profiler=enable_profiler,
133
130
  metrics_config=metrics_config,
134
131
  read_kwargs_provider=read_kwargs_provider,
135
- s3_table_writer_kwargs=s3_table_writer_kwargs,
132
+ table_writer_kwargs=table_writer_kwargs,
136
133
  repartitioned_file_content_type=repartitioned_file_content_type,
137
134
  deltacat_storage=deltacat_storage,
138
135
  )
@@ -153,9 +150,6 @@ def repartition(
153
150
  compacted_delta = deltacat_storage.commit_delta(
154
151
  merged_delta, properties=kwargs.get("properties", {})
155
152
  )
156
- deltacat_storage.commit_partition(partition)
157
- logger.info(f"Committed final delta: {compacted_delta}")
158
- logger.info(f"Job run completed successfully!")
159
153
  new_compacted_delta_locator = DeltaLocator.of(
160
154
  new_compacted_partition_locator,
161
155
  compacted_delta.stream_position,
@@ -173,14 +167,7 @@ def repartition(
173
167
  bit_width_of_sort_keys,
174
168
  None,
175
169
  )
176
- if s3_client_kwargs is None:
177
- s3_client_kwargs = {}
178
-
179
- return rcf.write_round_completion_file(
180
- None,
181
- None,
182
- None,
183
- repartition_completion_info,
184
- repartition_completion_file_s3_url,
185
- **s3_client_kwargs,
186
- )
170
+ partition.compaction_round_completion_info = repartition_completion_info
171
+ deltacat_storage.commit_partition(partition)
172
+ logger.info(f"Committed final delta: {compacted_delta}")
173
+ logger.info(f"Job run completed successfully!")
@@ -21,7 +21,7 @@ from deltacat.compute.compactor.utils.primary_key_index import (
21
21
  group_hash_bucket_indices,
22
22
  group_record_indices_by_hash_bucket,
23
23
  )
24
- from deltacat.storage import interface as unimplemented_deltacat_storage
24
+ from deltacat.storage import metastore
25
25
  from deltacat.types.media import StorageType
26
26
  from deltacat.utils.common import sha1_digest
27
27
  from deltacat.utils.ray_utils.runtime import (
@@ -90,7 +90,7 @@ def _group_file_records_by_pk_hash_bucket(
90
90
  sort_key_names: List[str],
91
91
  is_src_delta: np.bool_ = True,
92
92
  read_kwargs_provider: Optional[ReadKwargsProvider] = None,
93
- deltacat_storage=unimplemented_deltacat_storage,
93
+ deltacat_storage=metastore,
94
94
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
95
95
  **kwargs,
96
96
  ) -> Tuple[Optional[DeltaFileEnvelopeGroups], int]:
@@ -139,7 +139,7 @@ def _read_delta_file_envelopes(
139
139
  primary_keys: List[str],
140
140
  sort_key_names: List[str],
141
141
  read_kwargs_provider: Optional[ReadKwargsProvider],
142
- deltacat_storage=unimplemented_deltacat_storage,
142
+ deltacat_storage=metastore,
143
143
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
144
144
  **kwargs,
145
145
  ) -> Tuple[Optional[List[DeltaFileEnvelope]], int]:
@@ -190,7 +190,7 @@ def _timed_hash_bucket(
190
190
  enable_profiler: bool,
191
191
  read_kwargs_provider: Optional[ReadKwargsProvider] = None,
192
192
  object_store: Optional[IObjectStore] = None,
193
- deltacat_storage=unimplemented_deltacat_storage,
193
+ deltacat_storage=metastore,
194
194
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
195
195
  **kwargs,
196
196
  ):
@@ -249,7 +249,7 @@ def hash_bucket(
249
249
  metrics_config: MetricsConfig,
250
250
  read_kwargs_provider: Optional[ReadKwargsProvider],
251
251
  object_store: Optional[IObjectStore],
252
- deltacat_storage=unimplemented_deltacat_storage,
252
+ deltacat_storage=metastore,
253
253
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
254
254
  **kwargs,
255
255
  ) -> HashBucketResult:
@@ -29,7 +29,7 @@ from deltacat.storage import (
29
29
  ManifestEntryList,
30
30
  )
31
31
  from deltacat.storage.model.manifest import Manifest
32
- from deltacat.storage import interface as unimplemented_deltacat_storage
32
+
33
33
  from deltacat.utils.common import ReadKwargsProvider
34
34
  from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES, ContentType
35
35
  from deltacat.types.tables import TABLE_CLASS_TO_SIZE_FUNC
@@ -46,6 +46,7 @@ from deltacat.utils.ray_utils.runtime import (
46
46
  )
47
47
  from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
48
48
  from deltacat.utils.resources import get_current_process_peak_memory_usage_in_bytes
49
+ from deltacat.storage import metastore
49
50
 
50
51
  if importlib.util.find_spec("memray"):
51
52
  import memray
@@ -67,9 +68,9 @@ def materialize(
67
68
  metrics_config: MetricsConfig,
68
69
  schema: Optional[pa.Schema] = None,
69
70
  read_kwargs_provider: Optional[ReadKwargsProvider] = None,
70
- s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
71
+ table_writer_kwargs: Optional[Dict[str, Any]] = None,
71
72
  object_store: Optional[IObjectStore] = None,
72
- deltacat_storage=unimplemented_deltacat_storage,
73
+ deltacat_storage=metastore,
73
74
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
74
75
  ):
75
76
  if deltacat_storage_kwargs is None:
@@ -78,11 +79,11 @@ def materialize(
78
79
  def _stage_delta_from_manifest_entry_reference_list(
79
80
  manifest_entry_list_reference: List[ManifestEntry],
80
81
  partition: Partition,
81
- delta_type: DeltaType = DeltaType.UPSERT,
82
+ delta_type: DeltaType = DeltaType.APPEND,
82
83
  ) -> Delta:
83
84
  assert (
84
- delta_type == DeltaType.UPSERT
85
- ), "Stage delta with existing manifest entries only supports UPSERT delta type!"
85
+ delta_type == DeltaType.APPEND
86
+ ), "Compaction should always produce APPEND deltas for consistent read operations!"
86
87
  manifest = Manifest.of(
87
88
  entries=ManifestEntryList.of(manifest_entry_list_reference),
88
89
  uuid=str(uuid4()),
@@ -110,9 +111,10 @@ def materialize(
110
111
  deltacat_storage.stage_delta,
111
112
  compacted_table,
112
113
  partition,
114
+ delta_type=DeltaType.APPEND, # Compaction always produces APPEND deltas
113
115
  max_records_per_entry=max_records_per_output_file,
114
116
  content_type=compacted_file_content_type,
115
- s3_table_writer_kwargs=s3_table_writer_kwargs,
117
+ table_writer_kwargs=table_writer_kwargs,
116
118
  **deltacat_storage_kwargs,
117
119
  )
118
120
  compacted_table_size = TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](
@@ -10,7 +10,7 @@ import ray
10
10
  from deltacat import logs
11
11
  from deltacat.compute.compactor import DeltaAnnotated
12
12
  from deltacat.compute.compactor.model.repartition_result import RepartitionResult
13
- from deltacat.storage import interface as unimplemented_deltacat_storage
13
+ from deltacat.storage import metastore
14
14
  from deltacat.storage import Partition
15
15
  from deltacat.utils.ray_utils.runtime import (
16
16
  get_current_ray_task_id,
@@ -19,7 +19,7 @@ from deltacat.utils.ray_utils.runtime import (
19
19
  from deltacat.utils.common import ReadKwargsProvider
20
20
  from deltacat.utils.performance import timed_invocation
21
21
  from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
22
- from deltacat.storage import Delta
22
+ from deltacat.storage import Delta, DeltaType
23
23
  from enum import Enum
24
24
 
25
25
  if importlib.util.find_spec("memray"):
@@ -56,9 +56,9 @@ def repartition_range(
56
56
  destination_partition: Partition,
57
57
  repartition_args: dict,
58
58
  max_records_per_output_file: int,
59
- s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
59
+ table_writer_kwargs: Optional[Dict[str, Any]] = None,
60
60
  repartitioned_file_content_type: ContentType = ContentType.PARQUET,
61
- deltacat_storage=unimplemented_deltacat_storage,
61
+ deltacat_storage=metastore,
62
62
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
63
63
  **kwargs,
64
64
  ):
@@ -144,9 +144,10 @@ def repartition_range(
144
144
  partition_delta: Delta = deltacat_storage.stage_delta(
145
145
  partition_table,
146
146
  destination_partition,
147
+ delta_type=DeltaType.APPEND, # Repartition always produces APPEND deltas
147
148
  max_records_per_entry=max_records_per_output_file,
148
149
  content_type=repartitioned_file_content_type,
149
- s3_table_writer_kwargs=s3_table_writer_kwargs,
150
+ table_writer_kwargs=table_writer_kwargs,
150
151
  **deltacat_storage_kwargs,
151
152
  )
152
153
  partition_deltas.append(partition_delta)
@@ -168,9 +169,9 @@ def _timed_repartition(
168
169
  max_records_per_output_file: int,
169
170
  enable_profiler: bool,
170
171
  read_kwargs_provider: Optional[ReadKwargsProvider],
171
- s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
172
+ table_writer_kwargs: Optional[Dict[str, Any]] = None,
172
173
  repartitioned_file_content_type: ContentType = ContentType.PARQUET,
173
- deltacat_storage=unimplemented_deltacat_storage,
174
+ deltacat_storage=metastore,
174
175
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
175
176
  **kwargs,
176
177
  ) -> RepartitionResult:
@@ -192,7 +193,7 @@ def _timed_repartition(
192
193
  destination_partition=destination_partition,
193
194
  repartition_args=repartition_args,
194
195
  max_records_per_output_file=max_records_per_output_file,
195
- s3_table_writer_kwargs=s3_table_writer_kwargs,
196
+ table_writer_kwargs=table_writer_kwargs,
196
197
  repartitioned_file_content_type=repartitioned_file_content_type,
197
198
  deltacat_storage=deltacat_storage,
198
199
  deltacat_storage_kwargs=deltacat_storage_kwargs,
@@ -213,9 +214,9 @@ def repartition(
213
214
  enable_profiler: bool,
214
215
  metrics_config: Optional[MetricsConfig],
215
216
  read_kwargs_provider: Optional[ReadKwargsProvider],
216
- s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
217
+ table_writer_kwargs: Optional[Dict[str, Any]] = None,
217
218
  repartitioned_file_content_type: ContentType = ContentType.PARQUET,
218
- deltacat_storage=unimplemented_deltacat_storage,
219
+ deltacat_storage=metastore,
219
220
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
220
221
  **kwargs,
221
222
  ) -> RepartitionResult:
@@ -231,7 +232,7 @@ def repartition(
231
232
  max_records_per_output_file=max_records_per_output_file,
232
233
  enable_profiler=enable_profiler,
233
234
  read_kwargs_provider=read_kwargs_provider,
234
- s3_table_writer_kwargs=s3_table_writer_kwargs,
235
+ table_writer_kwargs=table_writer_kwargs,
235
236
  repartitioned_file_content_type=repartitioned_file_content_type,
236
237
  deltacat_storage=deltacat_storage,
237
238
  deltacat_storage_kwargs=deltacat_storage_kwargs,
@@ -11,7 +11,7 @@ from deltacat.storage import (
11
11
  PartitionLocator,
12
12
  Delta,
13
13
  ManifestEntry,
14
- interface as unimplemented_deltacat_storage,
14
+ metastore,
15
15
  )
16
16
  from deltacat import logs
17
17
  from deltacat.compute.compactor import DeltaAnnotated
@@ -31,12 +31,13 @@ def discover_deltas(
31
31
  compacted_partition_locator: Optional[PartitionLocator],
32
32
  rebase_source_partition_locator: Optional[PartitionLocator],
33
33
  rebase_source_partition_high_watermark: Optional[int],
34
- deltacat_storage=unimplemented_deltacat_storage,
34
+ deltacat_storage=metastore,
35
35
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
36
36
  list_deltas_kwargs: Optional[Dict[str, Any]] = {},
37
37
  ) -> Tuple[List[Delta], int]:
38
38
  if deltacat_storage_kwargs is None:
39
39
  deltacat_storage_kwargs = {}
40
+
40
41
  # Source One: new deltas from uncompacted table for incremental compaction or deltas from compacted table for rebase
41
42
  start_position_exclusive = (
42
43
  high_watermark.get(source_partition_locator)
@@ -109,7 +110,7 @@ def limit_input_deltas(
109
110
  user_hash_bucket_chunk_size: int,
110
111
  input_deltas_stats: Dict[int, DeltaStats],
111
112
  compaction_audit: CompactionSessionAuditInfo,
112
- deltacat_storage=unimplemented_deltacat_storage,
113
+ deltacat_storage=metastore,
113
114
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
114
115
  **kwargs,
115
116
  ) -> Tuple[List[DeltaAnnotated], int, HighWatermark, bool]:
@@ -272,7 +273,7 @@ def fit_input_deltas(
272
273
  cluster_resources: Dict[str, float],
273
274
  compaction_audit: CompactionSessionAuditInfo,
274
275
  hash_bucket_count: Optional[int],
275
- deltacat_storage=unimplemented_deltacat_storage,
276
+ deltacat_storage=metastore,
276
277
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
277
278
  **kwargs,
278
279
  ) -> Tuple[List[DeltaAnnotated], int, HighWatermark, bool]:
@@ -359,7 +360,7 @@ def _discover_deltas(
359
360
  source_partition_locator: PartitionLocator,
360
361
  start_position_exclusive: Optional[int],
361
362
  end_position_inclusive: Optional[int],
362
- deltacat_storage=unimplemented_deltacat_storage,
363
+ deltacat_storage=metastore,
363
364
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
364
365
  list_deltas_kwargs: Optional[Dict[str, Any]] = {},
365
366
  ) -> List[Delta]:
@@ -0,0 +1,117 @@
1
+ import logging
2
+ from typing import Optional
3
+ from deltacat import logs
4
+ from deltacat.compute.compactor import RoundCompletionInfo
5
+ from deltacat.storage import PartitionLocator
6
+ from deltacat.storage.model.partition import Partition
7
+ from deltacat.utils.metrics import metrics
8
+ from deltacat.exceptions import PartitionNotFoundError
9
+
10
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
11
+
12
+
13
+ @metrics
14
+ def read_round_completion_info(
15
+ source_partition_locator: PartitionLocator,
16
+ destination_partition_locator: PartitionLocator,
17
+ deltacat_storage,
18
+ deltacat_storage_kwargs: Optional[dict] = None,
19
+ destination_partition: Optional[Partition] = None,
20
+ ) -> Optional[RoundCompletionInfo]:
21
+ """
22
+ Read round completion info from the partition metafile.
23
+
24
+ Args:
25
+ source_partition_locator: Source partition locator for validation
26
+ destination_partition_locator: Destination partition locator
27
+ deltacat_storage: Storage implementation
28
+ deltacat_storage_kwargs: Optional storage kwargs
29
+ destination_partition: Optional destination partition to avoid redundant get_partition calls
30
+
31
+ Returns:
32
+ RoundCompletionInfo if found in partition, None otherwise
33
+ """
34
+ if not destination_partition_locator:
35
+ return None
36
+
37
+ if deltacat_storage_kwargs is None:
38
+ deltacat_storage_kwargs = {}
39
+
40
+ try:
41
+ # Use provided partition or get it from storage
42
+ if destination_partition:
43
+ partition = destination_partition
44
+ else:
45
+ # First get the current partition to access its previous_partition_id
46
+ current_partition: Partition = deltacat_storage.get_partition(
47
+ destination_partition_locator.stream_locator,
48
+ destination_partition_locator.partition_values,
49
+ **deltacat_storage_kwargs,
50
+ )
51
+
52
+ # If current partition has round completion info, use it
53
+ if current_partition.compaction_round_completion_info:
54
+ partition = current_partition
55
+ elif current_partition.previous_partition_id is not None:
56
+ # For incremental compaction, we need to get the previous committed partition
57
+ # that contains the round completion info.
58
+ # Get the previous partition by ID - this is where the round completion info should be
59
+ logger.info(
60
+ f"Current partition {destination_partition_locator} does not have round completion info, "
61
+ f"getting previous partition with ID: {current_partition.previous_partition_id}"
62
+ )
63
+ previous_partition = deltacat_storage.get_partition_by_id(
64
+ destination_partition_locator.stream_locator,
65
+ current_partition.previous_partition_id,
66
+ **deltacat_storage_kwargs,
67
+ )
68
+ if previous_partition is not None:
69
+ logger.info(
70
+ f"Found previous partition: {previous_partition.locator}"
71
+ )
72
+ partition = previous_partition
73
+ else:
74
+ raise PartitionNotFoundError(
75
+ f"Previous partition with ID {current_partition.previous_partition_id} not found"
76
+ )
77
+ else:
78
+ logger.info(f"No previous partition ID found, using current partition")
79
+ partition = current_partition
80
+
81
+ if partition:
82
+ round_completion_info = partition.compaction_round_completion_info
83
+ if round_completion_info:
84
+ # Validate that prev_source_partition_locator matches current source
85
+ if (
86
+ not source_partition_locator
87
+ or not round_completion_info.prev_source_partition_locator
88
+ ):
89
+ raise ValueError(
90
+ f"Source partition locator ({source_partition_locator}) and "
91
+ f"prev_source_partition_locator ({round_completion_info.prev_source_partition_locator}) "
92
+ f"must both be provided."
93
+ )
94
+
95
+ if (
96
+ round_completion_info.prev_source_partition_locator.canonical_string()
97
+ != source_partition_locator.canonical_string()
98
+ ):
99
+ logger.warning(
100
+ f"Previous source partition locator mismatch: "
101
+ f"expected {source_partition_locator.canonical_string()}, "
102
+ f"but found {round_completion_info.prev_source_partition_locator.canonical_string()} "
103
+ f"in round completion info. Ignoring cached round completion info."
104
+ )
105
+ return None
106
+
107
+ logger.info(
108
+ f"Read round completion info from partition metafile: {round_completion_info}"
109
+ )
110
+ return round_completion_info
111
+
112
+ except Exception as e:
113
+ logger.debug(
114
+ f"Failed to read round completion info from partition metafile: {e}"
115
+ )
116
+
117
+ return None
@@ -294,7 +294,9 @@ def append_dedupe_task_idx_col(table: pa.Table, dedupe_task_indices) -> pa.Table
294
294
 
295
295
 
296
296
  def delta_type_to_field(delta_type: DeltaType) -> bool:
297
- return True if delta_type is DeltaType.UPSERT else False
297
+ # For deduplication purposes, treat both UPSERT and APPEND as UPSERT (True)
298
+ # Only DELETE should be treated as DELETE (False)
299
+ return delta_type is not DeltaType.DELETE
298
300
 
299
301
 
300
302
  def delta_type_from_field(delta_type_field: bool) -> DeltaType:
@@ -14,7 +14,6 @@ from deltacat.compute.compactor_v2.model.evaluate_compaction_result import (
14
14
  ExecutionCompactionResult,
15
15
  )
16
16
  from deltacat.compute.compactor.model.compactor_version import CompactorVersion
17
- from deltacat.compute.compactor.utils import round_completion_file as rcf
18
17
  from deltacat.compute.compactor import DeltaAnnotated
19
18
  from deltacat.compute.compactor_v2.deletes.delete_strategy import (
20
19
  DeleteStrategy,
@@ -27,6 +26,7 @@ from deltacat.compute.compactor_v2.deletes.delete_file_envelope import (
27
26
  from deltacat.storage import (
28
27
  Delta,
29
28
  DeltaLocator,
29
+ PartitionLocator,
30
30
  )
31
31
  from deltacat.storage.model.manifest import Manifest
32
32
  from deltacat.compute.compactor.model.compact_partition_params import (
@@ -36,13 +36,14 @@ from deltacat.utils.resources import (
36
36
  get_current_process_peak_memory_usage_in_bytes,
37
37
  )
38
38
  from deltacat.compute.compactor_v2.private.compaction_utils import (
39
+ _get_rci_source_partition_locator,
39
40
  _fetch_compaction_metadata,
40
41
  _build_uniform_deltas,
41
42
  _group_uniform_deltas,
42
43
  _stage_new_partition,
43
44
  _run_hash_and_merge,
44
45
  _process_merge_results,
45
- _write_new_round_completion_file,
46
+ _create_round_completion_info,
46
47
  _commit_compaction_result,
47
48
  )
48
49
  from deltacat.utils.metrics import metrics
@@ -64,24 +65,26 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
64
65
 
65
66
  @metrics(prefix=COMPACT_PARTITION_METRIC_PREFIX)
66
67
  @categorize_errors
67
- def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]:
68
+ def compact_partition(params: CompactPartitionParams, **kwargs) -> None:
68
69
  assert (
69
70
  params.hash_bucket_count is not None and params.hash_bucket_count >= 1
70
71
  ), "hash_bucket_count is a required arg for compactor v2"
72
+ assert type(params.hash_bucket_count) is int, "Hash bucket count must be an integer"
71
73
  if params.num_rounds > 1:
72
74
  assert (
73
75
  not params.drop_duplicates
74
76
  ), "num_rounds > 1, drop_duplicates must be False but is True"
75
77
 
76
- with memray.Tracker(
77
- "compaction_partition.bin"
78
- ) if params.enable_profiler else nullcontext():
78
+ with (
79
+ memray.Tracker("compaction_partition.bin")
80
+ if params.enable_profiler
81
+ else nullcontext()
82
+ ):
79
83
  execute_compaction_result: ExecutionCompactionResult = _execute_compaction(
80
84
  params,
81
85
  **kwargs,
82
86
  )
83
87
  _commit_compaction_result(params, execute_compaction_result)
84
- return execute_compaction_result.round_completion_file_s3_url
85
88
 
86
89
 
87
90
  def _execute_compaction(
@@ -96,12 +99,12 @@ def _execute_compaction(
96
99
  previous_compacted_delta_manifest,
97
100
  round_completion_info,
98
101
  ) = fetch_compaction_metadata_result
99
- rcf_source_partition_locator: rcf.PartitionLocator = (
100
- params.rebase_source_partition_locator or params.source_partition_locator
102
+ rci_source_partition_locator: PartitionLocator = _get_rci_source_partition_locator(
103
+ params
101
104
  )
102
105
 
103
- base_audit_url: str = rcf_source_partition_locator.path(
104
- f"s3://{params.compaction_artifact_s3_bucket}/compaction-audit"
106
+ base_audit_url: str = rci_source_partition_locator.path(
107
+ f"{params.compaction_artifact_path}/compaction-audit"
105
108
  )
106
109
  audit_url: str = f"{base_audit_url}.json"
107
110
  logger.info(f"Compaction audit will be written to {audit_url}")
@@ -136,7 +139,7 @@ def _execute_compaction(
136
139
  )
137
140
  if not input_deltas:
138
141
  logger.info("No input deltas found to compact.")
139
- return ExecutionCompactionResult(None, None, None, False)
142
+ return ExecutionCompactionResult(None, None, False)
140
143
  build_uniform_deltas_result: tuple[
141
144
  List[DeltaAnnotated], DeleteStrategy, List[DeleteFileEnvelope]
142
145
  ] = _build_uniform_deltas(
@@ -199,13 +202,13 @@ def _execute_compaction(
199
202
 
200
203
  compaction_audit.save_round_completion_stats(mat_results)
201
204
 
202
- compaction_result: ExecutionCompactionResult = _write_new_round_completion_file(
205
+ compaction_result: ExecutionCompactionResult = _create_round_completion_info(
203
206
  params,
204
207
  compaction_audit,
205
208
  compacted_partition,
206
209
  audit_url,
207
210
  hb_id_to_entry_indices_range,
208
- rcf_source_partition_locator,
211
+ rci_source_partition_locator,
209
212
  new_compacted_delta_locator,
210
213
  pyarrow_write_result,
211
214
  round_completion_info,
@@ -1,3 +1,5 @@
1
+ from deltacat.utils.common import env_bool, env_integer, env_string
2
+
1
3
  TOTAL_BYTES_IN_SHA1_HASH = 20
2
4
 
3
5
  PK_DELIMITER = "L6kl7u5f"
@@ -31,7 +33,9 @@ TOTAL_MEMORY_BUFFER_PERCENTAGE = 30
31
33
  # The total size of records that will be hash bucketed at once
32
34
  # Since, sorting is nlogn, we ensure that is not performed
33
35
  # on a very large dataset for best performance.
34
- MAX_SIZE_OF_RECORD_BATCH_IN_GIB = 2 * 1024 * 1024 * 1024
36
+ MAX_SIZE_OF_RECORD_BATCH_IN_GIB = env_integer(
37
+ "MAX_SIZE_OF_RECORD_BATCH_IN_GIB", 2 * 1024 * 1024 * 1024
38
+ )
35
39
 
36
40
  # Whether to drop duplicates during merge.
37
41
  DROP_DUPLICATES = True
@@ -78,3 +82,28 @@ COMPACT_PARTITION_METRIC_PREFIX = "compact_partition"
78
82
  # Number of rounds to run hash/merge for a single
79
83
  # partition. (For large table support)
80
84
  DEFAULT_NUM_ROUNDS = 1
85
+
86
+ # Whether to perform sha1 hashing when required to
87
+ # optimize memory. For example, hashing is always
88
+ # required for bucketing where it's not mandatory
89
+ # when dropping duplicates. Setting this to True
90
+ # will disable sha1 hashing in cases where it isn't
91
+ # mandatory. This flag is False by default.
92
+ SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED = env_bool(
93
+ "SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED", False
94
+ )
95
+
96
+ # This env variable specifies whether to check bucketing spec
97
+ # compliance of the existing compacted table.
98
+ # PRINT_LOG: Enable logging if any partition is found
99
+ # to be non-compliant with the bucketing spec.
100
+ # ASSERT: Fail the job with ValidationError if the
101
+ # current compacted partition is found to be non-compliant
102
+ # with bucketing spec. Note, logging is implicitly enabled
103
+ # in this case.
104
+ BUCKETING_SPEC_COMPLIANCE_PROFILE = env_string(
105
+ "BUCKETING_SPEC_COMPLIANCE_PROFILE", None
106
+ )
107
+
108
+ BUCKETING_SPEC_COMPLIANCE_PRINT_LOG = "PRINT_LOG"
109
+ BUCKETING_SPEC_COMPLIANCE_ASSERT = "ASSERT"
@@ -13,7 +13,6 @@ from typing import Optional
13
13
  class ExecutionCompactionResult:
14
14
  new_compacted_partition: Optional[Partition]
15
15
  new_round_completion_info: Optional[RoundCompletionInfo]
16
- round_completion_file_s3_url: Optional[str]
17
16
  is_inplace_compacted: bool
18
17
 
19
18
  def __iter__(self):
@@ -4,7 +4,7 @@ from typing import Dict, List, Optional, Any
4
4
  from deltacat.utils.metrics import MetricsConfig
5
5
  from deltacat.utils.common import ReadKwargsProvider
6
6
  from deltacat.io.object_store import IObjectStore
7
- from deltacat.storage import interface as unimplemented_deltacat_storage
7
+ from deltacat.storage import metastore
8
8
  from deltacat.compute.compactor import DeltaAnnotated
9
9
 
10
10
 
@@ -15,12 +15,13 @@ class HashBucketInput(Dict):
15
15
  primary_keys: List[str],
16
16
  num_hash_buckets: int,
17
17
  num_hash_groups: int,
18
+ all_column_names: List[str],
18
19
  hb_task_index: Optional[int] = 0,
19
20
  enable_profiler: Optional[bool] = False,
20
21
  metrics_config: Optional[MetricsConfig] = None,
21
22
  read_kwargs_provider: Optional[ReadKwargsProvider] = None,
22
23
  object_store: Optional[IObjectStore] = None,
23
- deltacat_storage=unimplemented_deltacat_storage,
24
+ deltacat_storage=metastore,
24
25
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
25
26
  memory_logs_enabled: Optional[bool] = None,
26
27
  ) -> HashBucketInput:
@@ -31,6 +32,7 @@ class HashBucketInput(Dict):
31
32
  result["hb_task_index"] = hb_task_index
32
33
  result["num_hash_buckets"] = num_hash_buckets
33
34
  result["num_hash_groups"] = num_hash_groups
35
+ result["all_column_names"] = all_column_names
34
36
  result["enable_profiler"] = enable_profiler
35
37
  result["metrics_config"] = metrics_config
36
38
  result["read_kwargs_provider"] = read_kwargs_provider
@@ -61,6 +63,10 @@ class HashBucketInput(Dict):
61
63
  def num_hash_groups(self) -> int:
62
64
  return self["num_hash_groups"]
63
65
 
66
+ @property
67
+ def all_column_names(self) -> List[str]:
68
+ return self["all_column_names"]
69
+
64
70
  @property
65
71
  def enable_profiler(self) -> Optional[bool]:
66
72
  return self.get("enable_profiler")
@@ -78,7 +84,7 @@ class HashBucketInput(Dict):
78
84
  return self.get("object_store")
79
85
 
80
86
  @property
81
- def deltacat_storage(self) -> unimplemented_deltacat_storage:
87
+ def deltacat_storage(self) -> metastore:
82
88
  return self.get("deltacat_storage")
83
89
 
84
90
  @property