deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324) hide show
  1. deltacat/__init__.py +117 -18
  2. deltacat/api.py +536 -126
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +1 -19
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2444 -282
  12. deltacat/catalog/model/catalog.py +208 -113
  13. deltacat/catalog/model/properties.py +63 -24
  14. deltacat/compute/__init__.py +14 -0
  15. deltacat/compute/compactor/compaction_session.py +97 -75
  16. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  17. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  18. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  19. deltacat/compute/compactor/repartition_session.py +8 -21
  20. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  21. deltacat/compute/compactor/steps/materialize.py +9 -7
  22. deltacat/compute/compactor/steps/repartition.py +12 -11
  23. deltacat/compute/compactor/utils/io.py +6 -5
  24. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  25. deltacat/compute/compactor/utils/system_columns.py +3 -1
  26. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  27. deltacat/compute/compactor_v2/constants.py +30 -1
  28. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  29. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  30. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  31. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  32. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  33. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  34. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  35. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  36. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  37. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  38. deltacat/compute/compactor_v2/utils/io.py +11 -4
  39. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  40. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  41. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  42. deltacat/compute/converter/constants.py +5 -0
  43. deltacat/compute/converter/converter_session.py +207 -52
  44. deltacat/compute/converter/model/convert_input.py +43 -16
  45. deltacat/compute/converter/model/convert_input_files.py +33 -16
  46. deltacat/compute/converter/model/convert_result.py +80 -0
  47. deltacat/compute/converter/model/converter_session_params.py +64 -19
  48. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  49. deltacat/compute/converter/pyiceberg/overrides.py +193 -65
  50. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  51. deltacat/compute/converter/steps/convert.py +230 -75
  52. deltacat/compute/converter/steps/dedupe.py +46 -12
  53. deltacat/compute/converter/utils/convert_task_options.py +66 -22
  54. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  55. deltacat/compute/converter/utils/iceberg_columns.py +13 -8
  56. deltacat/compute/converter/utils/io.py +173 -13
  57. deltacat/compute/converter/utils/s3u.py +42 -27
  58. deltacat/compute/janitor.py +205 -0
  59. deltacat/compute/jobs/client.py +417 -0
  60. deltacat/compute/resource_estimation/delta.py +38 -6
  61. deltacat/compute/resource_estimation/model.py +8 -0
  62. deltacat/constants.py +49 -6
  63. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  64. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  65. deltacat/env.py +10 -0
  66. deltacat/examples/basic_logging.py +6 -6
  67. deltacat/examples/compactor/aws/__init__.py +1 -0
  68. deltacat/examples/compactor/bootstrap.py +863 -0
  69. deltacat/examples/compactor/compactor.py +373 -0
  70. deltacat/examples/compactor/explorer.py +473 -0
  71. deltacat/examples/compactor/gcp/__init__.py +1 -0
  72. deltacat/examples/compactor/job_runner.py +439 -0
  73. deltacat/examples/compactor/utils/__init__.py +1 -0
  74. deltacat/examples/compactor/utils/common.py +261 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  79. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  80. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  81. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
  82. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  83. deltacat/examples/hello_world.py +4 -2
  84. deltacat/examples/indexer/indexer.py +163 -0
  85. deltacat/examples/indexer/job_runner.py +198 -0
  86. deltacat/exceptions.py +66 -4
  87. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  88. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  89. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
  90. deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
  91. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  92. deltacat/experimental/converter_agent/__init__.py +0 -0
  93. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  94. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  95. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  96. deltacat/experimental/daft/__init__.py +4 -0
  97. deltacat/experimental/daft/daft_catalog.py +229 -0
  98. deltacat/experimental/storage/__init__.py +0 -0
  99. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  100. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  101. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  102. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  103. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  104. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  105. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  107. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  108. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  109. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  110. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  111. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  112. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  113. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  114. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  115. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  116. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  117. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  118. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  119. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  120. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  121. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  122. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  123. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  124. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  125. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  126. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  127. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  128. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  129. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  130. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  131. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  132. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  133. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  134. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  135. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  136. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  137. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  138. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  139. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  140. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  141. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  142. deltacat/io/__init__.py +13 -0
  143. deltacat/io/dataset/__init__.py +0 -0
  144. deltacat/io/dataset/deltacat_dataset.py +91 -0
  145. deltacat/io/datasink/__init__.py +0 -0
  146. deltacat/io/datasink/deltacat_datasink.py +207 -0
  147. deltacat/io/datasource/__init__.py +0 -0
  148. deltacat/io/datasource/deltacat_datasource.py +579 -0
  149. deltacat/io/reader/__init__.py +0 -0
  150. deltacat/io/reader/deltacat_read_api.py +172 -0
  151. deltacat/storage/__init__.py +22 -2
  152. deltacat/storage/interface.py +54 -32
  153. deltacat/storage/main/impl.py +1494 -541
  154. deltacat/storage/model/delta.py +27 -3
  155. deltacat/storage/model/expression/__init__.py +47 -0
  156. deltacat/storage/model/expression/expression.py +656 -0
  157. deltacat/storage/model/expression/visitor.py +248 -0
  158. deltacat/storage/model/locator.py +6 -12
  159. deltacat/storage/model/manifest.py +231 -6
  160. deltacat/storage/model/metafile.py +224 -119
  161. deltacat/storage/model/namespace.py +8 -1
  162. deltacat/storage/model/partition.py +117 -42
  163. deltacat/storage/model/scan/push_down.py +32 -5
  164. deltacat/storage/model/schema.py +2427 -159
  165. deltacat/storage/model/shard.py +6 -2
  166. deltacat/storage/model/sort_key.py +40 -0
  167. deltacat/storage/model/stream.py +9 -2
  168. deltacat/storage/model/table.py +12 -1
  169. deltacat/storage/model/table_version.py +11 -0
  170. deltacat/storage/model/transaction.py +1184 -208
  171. deltacat/storage/model/transform.py +81 -2
  172. deltacat/storage/model/types.py +53 -29
  173. deltacat/storage/util/__init__.py +0 -0
  174. deltacat/storage/util/scan_planner.py +26 -0
  175. deltacat/tests/_io/reader/__init__.py +0 -0
  176. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  177. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  178. deltacat/tests/aws/test_s3u.py +2 -31
  179. deltacat/tests/catalog/data/__init__.py +0 -0
  180. deltacat/tests/catalog/main/__init__.py +0 -0
  181. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  182. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  183. deltacat/tests/catalog/model/__init__.py +0 -0
  184. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  185. deltacat/tests/catalog/test_catalogs.py +103 -106
  186. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  187. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  188. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  189. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  190. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  191. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  192. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  193. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  194. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  195. deltacat/tests/compute/conftest.py +8 -44
  196. deltacat/tests/compute/converter/test_convert_session.py +697 -349
  197. deltacat/tests/compute/converter/utils.py +15 -6
  198. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  199. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  200. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  201. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  202. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  203. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  204. deltacat/tests/compute/test_janitor.py +236 -0
  205. deltacat/tests/compute/test_util_common.py +716 -43
  206. deltacat/tests/compute/test_util_constant.py +0 -1
  207. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  208. deltacat/tests/daft/__init__.py +0 -0
  209. deltacat/tests/daft/test_model.py +97 -0
  210. deltacat/tests/experimental/__init__.py +1 -0
  211. deltacat/tests/experimental/catalog/__init__.py +0 -0
  212. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  213. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  214. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  215. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  216. deltacat/tests/experimental/daft/__init__.py +0 -0
  217. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  218. deltacat/tests/experimental/storage/__init__.py +0 -0
  219. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  220. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  221. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  222. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
  223. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  224. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  225. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  226. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  227. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  228. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  229. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  230. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  231. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
  232. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  233. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  234. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  235. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  236. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  237. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  238. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  239. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  240. deltacat/tests/storage/model/test_expression.py +327 -0
  241. deltacat/tests/storage/model/test_manifest.py +129 -0
  242. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  243. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  244. deltacat/tests/storage/model/test_schema.py +171 -0
  245. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  246. deltacat/tests/storage/model/test_shard.py +3 -1
  247. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  248. deltacat/tests/storage/model/test_transaction.py +393 -48
  249. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  250. deltacat/tests/test_deltacat_api.py +1036 -11
  251. deltacat/tests/test_exceptions.py +9 -5
  252. deltacat/tests/test_utils/pyarrow.py +52 -21
  253. deltacat/tests/test_utils/storage.py +23 -34
  254. deltacat/tests/types/__init__.py +0 -0
  255. deltacat/tests/types/test_tables.py +104 -0
  256. deltacat/tests/utils/exceptions.py +22 -0
  257. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  258. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  259. deltacat/tests/utils/test_daft.py +121 -31
  260. deltacat/tests/utils/test_numpy.py +1193 -0
  261. deltacat/tests/utils/test_pandas.py +1106 -0
  262. deltacat/tests/utils/test_polars.py +1040 -0
  263. deltacat/tests/utils/test_pyarrow.py +1370 -89
  264. deltacat/types/media.py +345 -37
  265. deltacat/types/tables.py +2344 -46
  266. deltacat/utils/arguments.py +33 -1
  267. deltacat/utils/daft.py +824 -40
  268. deltacat/utils/export.py +3 -1
  269. deltacat/utils/filesystem.py +139 -9
  270. deltacat/utils/metafile_locator.py +2 -1
  271. deltacat/utils/numpy.py +118 -26
  272. deltacat/utils/pandas.py +577 -48
  273. deltacat/utils/polars.py +759 -0
  274. deltacat/utils/pyarrow.py +1373 -192
  275. deltacat/utils/ray_utils/concurrency.py +1 -1
  276. deltacat/utils/ray_utils/dataset.py +101 -10
  277. deltacat/utils/ray_utils/runtime.py +56 -4
  278. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  279. deltacat/utils/url.py +1325 -0
  280. deltacat-2.0.0.dist-info/METADATA +1163 -0
  281. deltacat-2.0.0.dist-info/RECORD +439 -0
  282. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  283. deltacat/catalog/iceberg/__init__.py +0 -4
  284. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  285. deltacat/compute/merge_on_read/__init__.py +0 -4
  286. deltacat/compute/merge_on_read/daft.py +0 -40
  287. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  288. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  289. deltacat/examples/common/fixtures.py +0 -15
  290. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  291. deltacat/storage/rivulet/__init__.py +0 -11
  292. deltacat/storage/rivulet/feather/__init__.py +0 -5
  293. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  294. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  295. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  296. deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
  297. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  298. deltacat/utils/s3fs.py +0 -21
  299. deltacat-2.0.dist-info/METADATA +0 -65
  300. deltacat-2.0.dist-info/RECORD +0 -347
  301. /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
  302. /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
  303. /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
  304. /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
  305. /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
  306. /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
  307. /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
  308. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
  309. /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
  310. /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  311. /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
  312. /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
  313. /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
  314. /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
  315. /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
  316. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
  317. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  318. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  319. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  320. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  321. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  322. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  323. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  324. {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -3,7 +3,7 @@ import functools
3
3
  from deltacat.storage import (
4
4
  PartitionLocator,
5
5
  Delta,
6
- interface as unimplemented_deltacat_storage,
6
+ metastore,
7
7
  )
8
8
  from deltacat import logs
9
9
  from deltacat.compute.compactor.utils import io as io_v1
@@ -38,7 +38,7 @@ def discover_deltas(
38
38
  rebase_source_partition_locator: Optional[PartitionLocator] = None,
39
39
  rebase_source_partition_high_watermark: Optional[int] = None,
40
40
  rcf_high_watermark: Optional[int] = None,
41
- deltacat_storage=unimplemented_deltacat_storage,
41
+ deltacat_storage=metastore,
42
42
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
43
43
  list_deltas_kwargs: Optional[Dict[str, Any]] = {},
44
44
  ) -> List[Delta]:
@@ -67,6 +67,11 @@ def discover_deltas(
67
67
  f"Length of input deltas from delta source table is {len(delta_source_incremental_deltas)}"
68
68
  f" from ({previous_compacted_high_watermark}, {last_stream_position_to_compact}]"
69
69
  )
70
+ logger.info(f"DEBUG: source_partition_locator = {source_partition_locator}")
71
+ logger.info(
72
+ f"DEBUG: source_partition_locator.partition_id = {getattr(source_partition_locator, 'partition_id', 'NO_PARTITION_ID')}"
73
+ )
74
+ logger.info(f"DEBUG: total input deltas found = {len(result)}")
70
75
 
71
76
  if rebase_source_partition_locator:
72
77
  previous_compacted_deltas = io_v1._discover_deltas(
@@ -93,7 +98,8 @@ def create_uniform_input_deltas(
93
98
  hash_bucket_count: int,
94
99
  compaction_audit: CompactionSessionAuditInfo,
95
100
  compact_partition_params: CompactPartitionParams,
96
- deltacat_storage=unimplemented_deltacat_storage,
101
+ all_column_names: List[str],
102
+ deltacat_storage=metastore,
97
103
  deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
98
104
  ) -> List[DeltaAnnotated]:
99
105
 
@@ -101,7 +107,6 @@ def create_uniform_input_deltas(
101
107
  delta_manifest_entries_count = 0
102
108
  estimated_da_bytes = 0
103
109
  input_da_list = []
104
-
105
110
  for delta in input_deltas:
106
111
  if (
107
112
  compact_partition_params.enable_input_split
@@ -114,10 +119,12 @@ def create_uniform_input_deltas(
114
119
  )
115
120
  append_content_type_params(
116
121
  delta=delta,
122
+ all_column_names=all_column_names,
117
123
  deltacat_storage=deltacat_storage,
118
124
  deltacat_storage_kwargs=deltacat_storage_kwargs,
119
125
  task_max_parallelism=compact_partition_params.task_max_parallelism,
120
126
  max_parquet_meta_size_bytes=compact_partition_params.max_parquet_meta_size_bytes,
127
+ file_reader_kwargs_provider=compact_partition_params.read_kwargs_provider,
121
128
  )
122
129
 
123
130
  manifest_entries = delta.manifest.entries
@@ -23,6 +23,7 @@ from deltacat.types.tables import TABLE_CLASS_TO_SIZE_FUNC
23
23
 
24
24
  from deltacat.utils.performance import timed_invocation
25
25
  from deltacat.storage import (
26
+ DeltaType,
26
27
  Partition,
27
28
  )
28
29
  from deltacat.compute.compactor_v2.deletes.delete_strategy import (
@@ -47,13 +48,21 @@ def materialize(
47
48
  # TODO (pdames): compare performance to pandas-native materialize path
48
49
  df = compacted_table.to_pandas(split_blocks=True, self_destruct=True)
49
50
  compacted_table = df
51
+ # Extract schema from table_writer_kwargs to pass as direct parameter
52
+ # This ensures schema_id is properly set in the manifest
53
+ schema = None
54
+ if input.table_writer_kwargs and "schema" in input.table_writer_kwargs:
55
+ schema = input.table_writer_kwargs["schema"]
56
+
50
57
  delta, stage_delta_time = timed_invocation(
51
58
  input.deltacat_storage.stage_delta,
52
59
  compacted_table,
53
60
  input.write_to_partition,
61
+ delta_type=DeltaType.APPEND, # Compaction always produces APPEND deltas
54
62
  max_records_per_entry=input.max_records_per_output_file,
55
63
  content_type=input.compacted_file_content_type,
56
- s3_table_writer_kwargs=input.s3_table_writer_kwargs,
64
+ schema=schema, # Pass schema as direct parameter for schema_id extraction
65
+ table_writer_kwargs=input.table_writer_kwargs,
57
66
  **input.deltacat_storage_kwargs,
58
67
  )
59
68
  compacted_table_size = TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](
@@ -112,6 +121,7 @@ def generate_local_merge_input(
112
121
  return MergeInput.of(
113
122
  merge_file_groups_provider=LocalMergeFileGroupsProvider(
114
123
  annotated_deltas,
124
+ all_column_names=params.all_column_names,
115
125
  read_kwargs_provider=params.read_kwargs_provider,
116
126
  deltacat_storage=params.deltacat_storage,
117
127
  deltacat_storage_kwargs=params.deltacat_storage_kwargs,
@@ -119,12 +129,13 @@ def generate_local_merge_input(
119
129
  write_to_partition=compacted_partition,
120
130
  compacted_file_content_type=params.compacted_file_content_type,
121
131
  primary_keys=params.primary_keys,
132
+ all_column_names=params.all_column_names,
122
133
  sort_keys=params.sort_keys,
123
134
  drop_duplicates=params.drop_duplicates,
124
135
  max_records_per_output_file=params.records_per_compacted_file,
125
136
  enable_profiler=params.enable_profiler,
126
137
  metrics_config=params.metrics_config,
127
- s3_table_writer_kwargs=params.s3_table_writer_kwargs,
138
+ table_writer_kwargs=params.table_writer_kwargs,
128
139
  read_kwargs_provider=params.read_kwargs_provider,
129
140
  round_completion_info=round_completion_info,
130
141
  object_store=params.object_store,
@@ -133,4 +144,6 @@ def generate_local_merge_input(
133
144
  delete_strategy=delete_strategy,
134
145
  delete_file_envelopes=delete_file_envelopes,
135
146
  disable_copy_by_reference=params.disable_copy_by_reference,
147
+ hash_bucket_count=params.hash_bucket_count,
148
+ original_fields=params.original_fields,
136
149
  )
@@ -10,6 +10,7 @@ from deltacat.compute.compactor_v2.constants import (
10
10
  TOTAL_BYTES_IN_SHA1_HASH,
11
11
  PK_DELIMITER,
12
12
  MAX_SIZE_OF_RECORD_BATCH_IN_GIB,
13
+ SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED,
13
14
  )
14
15
  import time
15
16
  from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelope
@@ -48,6 +49,13 @@ def _is_sha1_desired(hash_columns: List[pa.Array]) -> bool:
48
49
  f"Found total length of hash column={total_len} and total_size={total_size}"
49
50
  )
50
51
 
52
+ if SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED:
53
+ logger.info(
54
+ f"SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED is True. "
55
+ f"Returning False for is_sha1_desired"
56
+ )
57
+ return False
58
+
51
59
  return total_size > TOTAL_BYTES_IN_SHA1_HASH * total_len
52
60
 
53
61
 
@@ -70,13 +78,25 @@ def _append_table_by_hash_bucket(
70
78
  f"Grouping a pki table of length {len(pki_table)} took {groupby_latency}s"
71
79
  )
72
80
 
81
+ hb_pk_grouped_by = hb_pk_grouped_by.sort_by(sc._HASH_BUCKET_IDX_COLUMN_NAME)
73
82
  group_count_array = hb_pk_grouped_by[f"{sc._HASH_BUCKET_IDX_COLUMN_NAME}_count"]
74
83
  hb_group_array = hb_pk_grouped_by[sc._HASH_BUCKET_IDX_COLUMN_NAME]
75
84
 
76
85
  result_len = 0
77
86
  for i, group_count in enumerate(group_count_array):
78
87
  hb_idx = hb_group_array[i].as_py()
79
- pyarrow_table = hb_pk_table.slice(offset=result_len, length=group_count.as_py())
88
+ group_count_py = group_count.as_py()
89
+ pyarrow_table = hb_pk_table.slice(offset=result_len, length=group_count_py)
90
+ assert group_count_py == len(
91
+ pyarrow_table
92
+ ), f"Group count {group_count_py} not equal to {len(pyarrow_table)}"
93
+ all_buckets = pc.unique(pyarrow_table[sc._HASH_BUCKET_IDX_COLUMN_NAME])
94
+ assert (
95
+ len(all_buckets) == 1
96
+ ), f"Only one hash bucket is allowed but found {len(all_buckets)}"
97
+ assert (
98
+ all_buckets[0].as_py() == hb_idx
99
+ ), f"Hash bucket not equal, {all_buckets[0]} and {hb_idx}"
80
100
  pyarrow_table = pyarrow_table.drop(
81
101
  [sc._HASH_BUCKET_IDX_COLUMN_NAME, sc._PK_HASH_STRING_COLUMN_NAME]
82
102
  )
@@ -108,9 +128,10 @@ def _optimized_group_record_batches_by_hash_bucket(
108
128
  record_batches = []
109
129
  result_len = 0
110
130
  for record_batch in table_batches:
111
- current_bytes += record_batch.nbytes
112
- record_batches.append(record_batch)
113
- if current_bytes >= MAX_SIZE_OF_RECORD_BATCH_IN_GIB:
131
+ if (
132
+ record_batches
133
+ and current_bytes + record_batch.nbytes >= MAX_SIZE_OF_RECORD_BATCH_IN_GIB
134
+ ):
114
135
  logger.info(
115
136
  f"Total number of record batches without exceeding {MAX_SIZE_OF_RECORD_BATCH_IN_GIB} "
116
137
  f"is {len(record_batches)} and size {current_bytes}"
@@ -128,6 +149,9 @@ def _optimized_group_record_batches_by_hash_bucket(
128
149
  current_bytes = 0
129
150
  record_batches.clear()
130
151
 
152
+ current_bytes += record_batch.nbytes
153
+ record_batches.append(record_batch)
154
+
131
155
  if record_batches:
132
156
  appended_len, append_latency = timed_invocation(
133
157
  _append_table_by_hash_bucket,
@@ -1,12 +1,17 @@
1
1
  import logging
2
2
  from typing import Dict, Optional, List, Tuple, Any
3
3
  from deltacat import logs
4
+ from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
5
+ from deltacat.compute.compactor_v2.constants import (
6
+ AVERAGE_RECORD_SIZE_BYTES as DEFAULT_AVERAGE_RECORD_SIZE_BYTES,
7
+ )
4
8
  from deltacat.compute.compactor_v2.model.merge_file_group import (
5
9
  LocalMergeFileGroupsProvider,
6
10
  )
7
11
  from deltacat.storage import (
8
12
  Manifest,
9
- interface as unimplemented_deltacat_storage,
13
+ ManifestEntry,
14
+ metastore,
10
15
  )
11
16
  from deltacat.compute.compactor.model.delta_annotated import DeltaAnnotated
12
17
  from deltacat.compute.compactor.model.round_completion_info import RoundCompletionInfo
@@ -72,8 +77,6 @@ def _get_merge_task_options(
72
77
  round_completion_info: Optional[RoundCompletionInfo] = None,
73
78
  compacted_delta_manifest: Optional[Manifest] = None,
74
79
  primary_keys: Optional[List[str]] = None,
75
- deltacat_storage=unimplemented_deltacat_storage,
76
- deltacat_storage_kwargs: Optional[Dict] = {},
77
80
  memory_logs_enabled: Optional[bool] = None,
78
81
  ) -> Dict[str, Any]:
79
82
  if (
@@ -81,16 +84,27 @@ def _get_merge_task_options(
81
84
  and compacted_delta_manifest
82
85
  and round_completion_info.hb_index_to_entry_range
83
86
  ):
84
-
85
- previous_inflation = (
86
- round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
87
- / round_completion_info.compacted_pyarrow_write_result.file_bytes
87
+ logger.debug_conditional(
88
+ f"[Merge task {index}]: Using previous compaction rounds to calculate merge memory: {round_completion_info.compacted_pyarrow_write_result}",
89
+ memory_logs_enabled,
90
+ )
91
+ previous_inflation: float = (
92
+ (
93
+ round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
94
+ / round_completion_info.compacted_pyarrow_write_result.file_bytes
95
+ )
96
+ if round_completion_info.compacted_pyarrow_write_result.file_bytes
97
+ else PYARROW_INFLATION_MULTIPLIER
88
98
  )
89
99
  debug_memory_params["previous_inflation"] = previous_inflation
90
100
 
91
- average_record_size = (
92
- round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
93
- / round_completion_info.compacted_pyarrow_write_result.records
101
+ average_record_size: float = (
102
+ (
103
+ round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
104
+ / round_completion_info.compacted_pyarrow_write_result.records
105
+ )
106
+ if round_completion_info.compacted_pyarrow_write_result.records
107
+ else DEFAULT_AVERAGE_RECORD_SIZE_BYTES
94
108
  )
95
109
  debug_memory_params["average_record_size"] = average_record_size
96
110
 
@@ -106,31 +120,36 @@ def _get_merge_task_options(
106
120
  str(hb_idx)
107
121
  ]
108
122
  for entry_index in range(entry_start, entry_end):
109
- entry = compacted_delta_manifest.entries[entry_index]
110
-
111
- current_entry_size = estimate_manifest_entry_size_bytes(
112
- entry=entry,
113
- operation_type=OperationType.PYARROW_DOWNLOAD,
114
- estimate_resources_params=estimate_resources_params,
123
+ entry: ManifestEntry = compacted_delta_manifest.entries[entry_index]
124
+ current_entry_size: float = (
125
+ estimate_manifest_entry_size_bytes(
126
+ entry=entry,
127
+ operation_type=OperationType.PYARROW_DOWNLOAD,
128
+ estimate_resources_params=estimate_resources_params,
129
+ )
130
+ or 0.0
115
131
  )
116
- current_entry_rows = estimate_manifest_entry_num_rows(
117
- entry=entry,
118
- operation_type=OperationType.PYARROW_DOWNLOAD,
119
- estimate_resources_params=estimate_resources_params,
132
+ current_entry_rows: int = (
133
+ estimate_manifest_entry_num_rows(
134
+ entry=entry,
135
+ operation_type=OperationType.PYARROW_DOWNLOAD,
136
+ estimate_resources_params=estimate_resources_params,
137
+ )
138
+ or 0
120
139
  )
121
-
140
+ # NOTE: current_entry_size and current_entry_rows fall back to 0 when the estimate is None, since a None estimated entry size implies a value of 0.
122
141
  data_size += current_entry_size
123
142
  num_rows += current_entry_rows
124
-
125
143
  if primary_keys:
126
- pk_size = estimate_manifest_entry_column_size_bytes(
144
+ pk_size: Optional[
145
+ float
146
+ ] = estimate_manifest_entry_column_size_bytes(
127
147
  entry=entry,
128
148
  columns=primary_keys,
129
149
  operation_type=OperationType.PYARROW_DOWNLOAD,
130
150
  estimate_resources_params=estimate_resources_params,
131
151
  )
132
-
133
- if pk_size is None:
152
+ if not pk_size:
134
153
  pk_size_bytes += current_entry_size
135
154
  else:
136
155
  pk_size_bytes += pk_size
@@ -159,7 +178,6 @@ def _get_merge_task_options(
159
178
  f"[Merge task {index}]: Params used for calculating merge memory: {debug_memory_params}",
160
179
  memory_logs_enabled,
161
180
  )
162
-
163
181
  return _get_task_options(0.01, total_memory, ray_custom_resources)
164
182
 
165
183
 
@@ -255,8 +273,6 @@ def merge_resource_options_provider(
255
273
  compacted_delta_manifest: Optional[Manifest] = None,
256
274
  ray_custom_resources: Optional[Dict] = None,
257
275
  primary_keys: Optional[List[str]] = None,
258
- deltacat_storage=unimplemented_deltacat_storage,
259
- deltacat_storage_kwargs: Optional[Dict] = {},
260
276
  memory_logs_enabled: Optional[bool] = None,
261
277
  **kwargs,
262
278
  ) -> Dict:
@@ -286,8 +302,6 @@ def merge_resource_options_provider(
286
302
  round_completion_info=round_completion_info,
287
303
  compacted_delta_manifest=compacted_delta_manifest,
288
304
  primary_keys=primary_keys,
289
- deltacat_storage=deltacat_storage,
290
- deltacat_storage_kwargs=deltacat_storage_kwargs,
291
305
  memory_logs_enabled=memory_logs_enabled,
292
306
  estimate_resources_params=estimate_resources_params,
293
307
  )
@@ -302,7 +316,7 @@ def local_merge_resource_options_provider(
302
316
  compacted_delta_manifest: Optional[Manifest] = None,
303
317
  ray_custom_resources: Optional[Dict] = None,
304
318
  primary_keys: Optional[List[str]] = None,
305
- deltacat_storage=unimplemented_deltacat_storage,
319
+ deltacat_storage=metastore,
306
320
  deltacat_storage_kwargs: Optional[Dict] = {},
307
321
  memory_logs_enabled: Optional[bool] = None,
308
322
  **kwargs,
@@ -328,8 +342,6 @@ def local_merge_resource_options_provider(
328
342
  round_completion_info=round_completion_info,
329
343
  compacted_delta_manifest=compacted_delta_manifest,
330
344
  primary_keys=primary_keys,
331
- deltacat_storage=deltacat_storage,
332
- deltacat_storage_kwargs=deltacat_storage_kwargs,
333
345
  memory_logs_enabled=memory_logs_enabled,
334
346
  estimate_resources_params=estimate_resources_params,
335
347
  )
@@ -2,3 +2,8 @@ DEFAULT_CONVERTER_TASK_MAX_PARALLELISM = 4096
2
2
 
3
3
  # Safe limit ONLY considering CPU limit, typically 32 for an 8x-large worker
4
4
  DEFAULT_MAX_PARALLEL_DATA_FILE_DOWNLOAD = 30
5
+
6
+
7
+ # Unique identifier delimiter that ensures different primary keys don't end up with the same hash when concatenated.
8
+ # e.g.: a row with pk column a = 1 and pk column b = 21, and a row with a = 12 and b = 1; without a delimiter both concatenate to "121".
9
+ IDENTIFIER_FIELD_DELIMITER = "c303282d"