deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367) hide show
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -1,14 +1,22 @@
1
+ """
2
+ Common utility functions for main storage compaction tests.
3
+
4
+ These functions are shared between incremental and multiple rounds compaction tests.
5
+ """
1
6
  # Allow classes to use self-referencing Type hints in Python 3.7.
2
7
  from __future__ import annotations
3
8
  from enum import Enum
4
- from typing import Any, Dict, List, Optional, Set
9
+ from typing import Any, Dict, List, Optional, Tuple
5
10
  import datetime as dt
6
- from boto3.resources.base import ServiceResource
7
11
  from datetime import timezone
8
12
 
9
- from deltacat.tests.compute.test_util_constant import (
10
- TEST_S3_RCF_BUCKET_NAME,
11
- )
13
+ import tempfile
14
+ import os
15
+ import shutil
16
+
17
+ import pyarrow as pa
18
+
19
+
12
20
  from deltacat.tests.compute.test_util_constant import (
13
21
  BASE_TEST_SOURCE_NAMESPACE,
14
22
  BASE_TEST_SOURCE_TABLE_NAME,
@@ -26,14 +34,34 @@ from deltacat.compute.compactor import (
26
34
  from deltacat.compute.compactor.model.compaction_session_audit_info import (
27
35
  CompactionSessionAuditInfo,
28
36
  )
29
-
30
- from deltacat.storage.model.partition import PartitionLocator
37
+ from deltacat.storage.model.partition import (
38
+ PartitionLocator,
39
+ PartitionScheme,
40
+ PartitionKey as StoragePartitionKey,
41
+ )
31
42
  from deltacat.storage.model.stream import StreamLocator
32
43
  from deltacat.storage.model.table_version import TableVersionLocator
33
44
  from deltacat.storage.model.table import TableLocator
34
45
  from deltacat.storage.model.namespace import NamespaceLocator
46
+ from deltacat.storage.model.sort_key import (
47
+ SortScheme,
48
+ )
49
+ from deltacat.storage.model.delta import (
50
+ Delta,
51
+ DeltaType,
52
+ )
53
+ from deltacat.storage.model.partition import (
54
+ Partition,
55
+ PartitionKeyList,
56
+ )
57
+ from deltacat.storage.model.stream import Stream
58
+ from deltacat.storage.model.transform import IdentityTransform
59
+ from deltacat.storage.model.schema import Schema
35
60
  from deltacat.compute.compactor.model.compactor_version import CompactorVersion
36
61
 
62
+ from deltacat.storage import metastore
63
+ from deltacat.catalog.model.properties import CatalogProperties
64
+
37
65
 
38
66
  class PartitionKeyType(str, Enum):
39
67
  INT = "int"
@@ -73,113 +101,741 @@ def get_test_partition_locator(partition_id):
73
101
  return partition_locator
74
102
 
75
103
 
76
- def _create_table(
104
+ def create_main_deltacat_storage_kwargs() -> Dict[str, Any]:
105
+ """
106
+ Helper function to create main deltacat storage kwargs
107
+
108
+ Returns: kwargs to use for main deltacat storage, i.e. {"catalog": CatalogProperties(...)}
109
+ """
110
+ temp_dir = tempfile.mkdtemp()
111
+ catalog = CatalogProperties(root=temp_dir)
112
+ return {"catalog": catalog}
113
+
114
+
115
+ def clean_up_main_deltacat_storage_kwargs(storage_kwargs: Dict[str, Any]):
116
+ """
117
+ Cleans up directory created by create_main_deltacat_storage_kwargs
118
+ """
119
+ catalog = storage_kwargs["catalog"]
120
+ if hasattr(catalog, "root") and os.path.exists(catalog.root):
121
+ shutil.rmtree(catalog.root)
122
+
123
+
124
+ def _create_table_main(
77
125
  namespace: str,
78
126
  table_name: str,
79
127
  table_version: str,
80
- primary_keys: Set[str],
81
128
  sort_keys: Optional[List[Any]],
82
129
  partition_keys: Optional[List[PartitionKey]],
130
+ input_deltas: Optional[pa.Table],
83
131
  ds_mock_kwargs: Optional[Dict[str, Any]],
84
132
  ):
85
- import deltacat.tests.local_deltacat_storage as ds
86
- from deltacat.types.media import ContentType
87
-
88
- ds.create_namespace(namespace, {}, **ds_mock_kwargs)
89
- ds.create_table_version(
90
- namespace,
91
- table_name,
92
- table_version,
93
- primary_key_column_names=list(primary_keys),
94
- sort_keys=sort_keys,
95
- partition_keys=partition_keys,
96
- supported_content_types=[ContentType.PARQUET],
133
+ """
134
+ Main storage version of _create_table that works for both incremental and multiple rounds tests.
135
+
136
+ For incremental tests, input_deltas is provided to extract schema.
137
+ For multiple rounds tests, input_deltas can be None and we use a simpler approach.
138
+ """
139
+ # Create namespace first
140
+ metastore.create_namespace(namespace=namespace, **ds_mock_kwargs)
141
+
142
+ # Handle schema creation
143
+ if input_deltas is not None:
144
+ # Incremental test approach - extract schema from input deltas
145
+ schema = input_deltas.schema
146
+
147
+ # Add partition key fields to schema if they're not already present
148
+ if partition_keys:
149
+ for pk in partition_keys:
150
+ field_name = pk.key_name
151
+ if field_name not in schema.names:
152
+ # Add partition key field with appropriate type
153
+ if pk.key_type == PartitionKeyType.INT:
154
+ field_type = pa.int32()
155
+ elif pk.key_type == PartitionKeyType.STRING:
156
+ field_type = pa.string()
157
+ elif (
158
+ pk.key_type == PartitionKeyType.TIMESTAMP
159
+ ): # Handle timestamp type properly
160
+ field_type = pa.timestamp("us")
161
+ else:
162
+ field_type = pa.string() # Default to string
163
+
164
+ schema = schema.append(pa.field(field_name, field_type))
165
+
166
+ schema_obj = Schema.of(schema=schema)
167
+ else:
168
+ # Multiple rounds test approach - use None for schema (will be set later)
169
+ schema_obj = None
170
+
171
+ sort_scheme = SortScheme.of(sort_keys) if sort_keys else None
172
+
173
+ # Convert test partition keys to storage partition keys
174
+ storage_partition_keys = []
175
+ if partition_keys:
176
+ for pk in partition_keys:
177
+ storage_partition_key = StoragePartitionKey.of(
178
+ key=[pk.key_name],
179
+ name=pk.key_name,
180
+ transform=IdentityTransform.of(),
181
+ )
182
+ storage_partition_keys.append(storage_partition_key)
183
+
184
+ # Create partition scheme
185
+ partition_scheme = None
186
+ if storage_partition_keys:
187
+ partition_scheme = PartitionScheme.of(
188
+ keys=PartitionKeyList.of(storage_partition_keys),
189
+ scheme_id="default_partition_scheme",
190
+ )
191
+
192
+ # Create table version (which creates table and stream automatically)
193
+ metastore.create_table_version(
194
+ namespace=namespace,
195
+ table_name=table_name,
196
+ table_version=table_version,
197
+ schema=schema_obj,
198
+ partition_scheme=partition_scheme,
199
+ sort_keys=sort_scheme,
97
200
  **ds_mock_kwargs,
98
201
  )
202
+
99
203
  return namespace, table_name, table_version
100
204
 
101
205
 
102
- def create_src_table(
103
- primary_keys: Set[str],
206
+ def create_src_table_main(
104
207
  sort_keys: Optional[List[Any]],
105
208
  partition_keys: Optional[List[PartitionKey]],
209
+ input_deltas: Optional[pa.Table],
106
210
  ds_mock_kwargs: Optional[Dict[str, Any]],
107
211
  ):
212
+ """
213
+ Main storage version of create_src_table
214
+ """
108
215
  source_namespace: str = BASE_TEST_SOURCE_NAMESPACE
109
216
  source_table_name: str = BASE_TEST_SOURCE_TABLE_NAME
110
217
  source_table_version: str = BASE_TEST_SOURCE_TABLE_VERSION
111
- return _create_table(
218
+ return _create_table_main(
112
219
  source_namespace,
113
220
  source_table_name,
114
221
  source_table_version,
115
- primary_keys,
116
222
  sort_keys,
117
223
  partition_keys,
224
+ input_deltas,
118
225
  ds_mock_kwargs,
119
226
  )
120
227
 
121
228
 
122
- def create_destination_table(
123
- primary_keys: Set[str],
229
+ def create_destination_table_main(
124
230
  sort_keys: Optional[List[Any]],
125
231
  partition_keys: Optional[List[PartitionKey]],
232
+ input_deltas: Optional[pa.Table],
126
233
  ds_mock_kwargs: Optional[Dict[str, Any]],
127
234
  ):
235
+ """
236
+ Main storage version of create_destination_table
237
+ """
128
238
  destination_namespace: str = BASE_TEST_DESTINATION_NAMESPACE
129
239
  destination_table_name: str = BASE_TEST_DESTINATION_TABLE_NAME
130
240
  destination_table_version: str = BASE_TEST_DESTINATION_TABLE_VERSION
131
- return _create_table(
241
+ return _create_table_main(
132
242
  destination_namespace,
133
243
  destination_table_name,
134
244
  destination_table_version,
135
- primary_keys,
136
245
  sort_keys,
137
246
  partition_keys,
247
+ input_deltas,
138
248
  ds_mock_kwargs,
139
249
  )
140
250
 
141
251
 
142
- def create_rebase_table(
143
- primary_keys: Set[str],
252
def create_rebase_table_main(
    sort_keys: Optional[List[Any]],
    partition_keys: Optional[List[PartitionKey]],
    input_deltas: Optional[pa.Table],
    ds_mock_kwargs: Optional[Dict[str, Any]],
):
    """
    Main storage version of create_rebase_table

    Delegates to _create_table_main with the fixed REBASING_NAMESPACE,
    REBASING_TABLE_NAME and REBASING_TABLE_VERSION, forwarding every other
    argument unchanged.

    Args:
        sort_keys: Forwarded to _create_table_main.
        partition_keys: Forwarded to _create_table_main.
        input_deltas: Forwarded to _create_table_main.
        ds_mock_kwargs: Forwarded to _create_table_main.

    Returns:
        Whatever _create_table_main returns; callers in this module unpack it
        as (namespace, table_name, table_version).
    """
    rebase_table_identity = (
        REBASING_NAMESPACE,
        REBASING_TABLE_NAME,
        REBASING_TABLE_VERSION,
    )
    return _create_table_main(
        *rebase_table_identity,
        sort_keys,
        partition_keys,
        input_deltas,
        ds_mock_kwargs,
    )
160
273
 
161
274
 
162
- def get_rcf(s3_resource, rcf_file_s3_uri: str) -> RoundCompletionInfo:
163
- from deltacat.tests.test_utils.utils import read_s3_contents
275
def get_rci_from_partition(
    partition_locator: PartitionLocator, deltacat_storage=None, **kwargs
) -> Optional[RoundCompletionInfo]:
    """
    Read RoundCompletionInfo from a partition metafile.

    Args:
        partition_locator: Locator of the partition containing the RoundCompletionInfo
        deltacat_storage: Storage implementation (defaults to metastore)
        **kwargs: Additional arguments to pass to deltacat_storage.get_partition (e.g., catalog)

    Returns:
        RoundCompletionInfo object from the partition, or None if the partition
        is not found or carries no (truthy) round completion info
    """
    if deltacat_storage is None:
        # Only import the default backend when the caller did not supply one,
        # so a custom storage implementation never pays for (or depends on)
        # importing deltacat.storage.
        from deltacat.storage import metastore

        deltacat_storage = metastore

    partition = deltacat_storage.get_partition(
        partition_locator.stream_locator, partition_locator.partition_values, **kwargs
    )
    if not partition:
        return None

    # Any falsy round completion info is reported as None.
    return partition.compaction_round_completion_info or None
302
+
303
+
304
def _add_deltas_to_partition_main(
    deltas_ingredients: List[Tuple[pa.Table, DeltaType, Optional[Dict[str, str]]]],
    partition: Optional[Partition],
    ds_mock_kwargs: Optional[Dict[str, Any]],
) -> Tuple[Optional[Delta], int]:
    """
    Add deltas to a partition using main storage

    Each (table, delta type, entry params) tuple is staged against the given
    partition and committed immediately, in the order given.

    Args:
        deltas_ingredients: (table, delta type, entry params) tuples to commit.
        partition: Partition the deltas are staged against.
        ds_mock_kwargs: Keyword arguments forwarded to every metastore call.
            None is treated as "no extra arguments".

    Returns:
        (last committed delta or None when nothing was committed,
        total length of all non-None delta tables)
    """
    # `**None` raises TypeError, so normalise the Optional argument once.
    storage_kwargs = ds_mock_kwargs or {}

    all_deltas_length = 0
    incremental_delta = None
    for delta_data, delta_type, delete_parameters in deltas_ingredients:
        staged_delta: Delta = metastore.stage_delta(
            delta_data,
            partition,
            delta_type,
            entry_params=delete_parameters,
            **storage_kwargs,
        )
        incremental_delta = metastore.commit_delta(staged_delta, **storage_kwargs)
        # Compare against None explicitly rather than relying on the
        # truthiness of the table object.
        if delta_data is not None:
            all_deltas_length += len(delta_data)
    return incremental_delta, all_deltas_length
328
+
329
+
330
def add_late_deltas_to_partition_main(
    late_deltas: List[Tuple[pa.Table, DeltaType, Optional[Dict[str, str]]]],
    source_partition: Optional[Partition],
    ds_mock_kwargs: Optional[Dict[str, Any]],
) -> Tuple[Optional[Delta], int]:
    """
    Add late deltas to a partition using main storage

    Thin public wrapper around _add_deltas_to_partition_main.

    Args:
        late_deltas: (table, delta type, entry params) tuples to commit.
        source_partition: Partition the deltas are staged against.
        ds_mock_kwargs: Forwarded to _add_deltas_to_partition_main.

    Returns:
        The (delta, total length) pair produced by _add_deltas_to_partition_main.
    """
    last_delta_and_length = _add_deltas_to_partition_main(
        deltas_ingredients=late_deltas,
        partition=source_partition,
        ds_mock_kwargs=ds_mock_kwargs,
    )
    return last_delta_and_length
339
+
340
+
341
def multiple_rounds_create_src_w_deltas_destination_rebase_w_deltas_strategy_main(
    sort_keys: Optional[List[Any]],
    partition_keys: Optional[List[PartitionKey]],
    input_deltas: List[Tuple[pa.Table, DeltaType, Optional[Dict[str, str]]]],
    partition_values: Optional[List[Any]],
    ds_mock_kwargs: Optional[Dict[str, Any]],
) -> Tuple[Stream, Stream, Optional[Stream], bool]:
    """
    Main storage version of multiple_rounds_create_src_w_deltas_destination_rebase_w_deltas_strategy

    Creates the source, destination and rebase test tables, then commits every
    input delta to one partition of the source table and to one partition of
    the rebase table. The destination table is created but receives no deltas.

    Args:
        sort_keys: Forwarded to the table creation helpers.
        partition_keys: Forwarded to the table creation helpers; also used to
            coerce partition_values and to choose the partition scheme id.
        input_deltas: (table, delta type, entry params) tuples committed in order.
            The first table is handed to the table creation helpers.
        partition_values: Raw partition values; a value paired with an INT
            partition key is converted with int().
        ds_mock_kwargs: Keyword arguments forwarded to every metastore call.
            None is treated as "no extra arguments".

    Returns:
        (source stream, destination stream, rebase stream, is_delete), where
        is_delete is True when any input delta has type DeltaType.DELETE. Each
        stream is the one fetched right after its table was created, i.e.
        before the partitions below were committed.
    """
    # `**None` raises TypeError, so normalise the Optional argument once.
    storage_kwargs = ds_mock_kwargs or {}
    partition_scheme_id = "default_partition_scheme" if partition_keys else None

    # For multiple rounds, we need to extract the first delta to get schema
    first_delta_table = input_deltas[0][0] if input_deltas else None
    source_namespace, source_table_name, source_table_version = create_src_table_main(
        sort_keys, partition_keys, first_delta_table, storage_kwargs
    )
    source_table_stream: Stream = metastore.get_stream(
        namespace=source_namespace,
        table_name=source_table_name,
        table_version=source_table_version,
        **storage_kwargs,
    )

    # Convert partition values to correct types
    if partition_values and partition_keys:
        converted_partition_values = [
            int(value) if pk.key_type == PartitionKeyType.INT else value
            for value, pk in zip(partition_values, partition_keys)
        ]
    else:
        converted_partition_values = partition_values

    def _commit_input_deltas(stream: Stream) -> None:
        # Stage one partition on `stream`, commit every input delta to it in
        # order, then commit the partition itself.
        staged_partition: Partition = metastore.stage_partition(
            stream,
            converted_partition_values,
            partition_scheme_id=partition_scheme_id,
            **storage_kwargs,
        )
        for delta_table, delta_type, delta_parameters in input_deltas:
            staged_delta = metastore.stage_delta(
                delta_table,
                staged_partition,
                delta_type,
                entry_params=delta_parameters,
                **storage_kwargs,
            )
            metastore.commit_delta(staged_delta, **storage_kwargs)
        metastore.commit_partition(staged_partition, **storage_kwargs)

    is_delete = any(
        delta_type is DeltaType.DELETE for _, delta_type, _ in input_deltas
    )
    _commit_input_deltas(source_table_stream)

    (
        destination_table_namespace,
        destination_table_name,
        destination_table_version,
    ) = create_destination_table_main(
        sort_keys, partition_keys, first_delta_table, storage_kwargs
    )
    destination_table_stream: Stream = metastore.get_stream(
        namespace=destination_table_namespace,
        table_name=destination_table_name,
        table_version=destination_table_version,
        **storage_kwargs,
    )

    # Always create rebase table for multiple rounds tests
    (
        rebasing_table_namespace,
        rebasing_table_name,
        rebasing_table_version,
    ) = create_rebase_table_main(
        sort_keys, partition_keys, first_delta_table, storage_kwargs
    )
    rebasing_table_stream: Stream = metastore.get_stream(
        namespace=rebasing_table_namespace,
        table_name=rebasing_table_name,
        table_version=rebasing_table_version,
        **storage_kwargs,
    )

    # Stage partition and add deltas to rebase table
    _commit_input_deltas(rebasing_table_stream)

    return (
        source_table_stream,
        destination_table_stream,
        rebasing_table_stream,
        is_delete,
    )
460
+
461
+
462
def create_src_w_deltas_destination_plus_destination_main(
    sort_keys: Optional[List[Any]],
    partition_keys: Optional[List[PartitionKey]],
    input_deltas: pa.Table,
    input_delta_type: DeltaType,
    partition_values: Optional[List[Any]],
    ds_mock_kwargs: Optional[Dict[str, Any]],
    simulate_is_inplace: bool = False,
) -> Tuple[Stream, Stream, Optional[Stream], str, str, str]:
    """
    Create source with deltas and destination tables for incremental compaction testing

    Creates the source table, commits input_deltas as a single delta to one
    source partition, and resolves the destination stream.

    Args:
        sort_keys: Forwarded to the table creation helpers.
        partition_keys: Forwarded to the table creation helpers; also used to
            coerce partition_values and to choose the partition scheme id.
        input_deltas: Table committed as one delta to the source partition.
        input_delta_type: Delta type used when staging that delta.
        partition_values: Raw partition values; a value paired with an INT
            partition key is converted with int().
        ds_mock_kwargs: Keyword arguments forwarded to every metastore call.
            None is treated as "no extra arguments".
        simulate_is_inplace: When True no destination table is created and the
            destination stream is looked up under the source table's identity.

    Returns:
        (source stream re-read after the commit, destination stream, None,
        source namespace, source table name, source table version)
    """
    # `**None` raises TypeError, so normalise the Optional argument once.
    storage_kwargs = ds_mock_kwargs or {}

    source_namespace, source_table_name, source_table_version = create_src_table_main(
        sort_keys, partition_keys, input_deltas, storage_kwargs
    )
    source_table_stream: Stream = metastore.get_stream(
        namespace=source_namespace,
        table_name=source_table_name,
        table_version=source_table_version,
        **storage_kwargs,
    )

    # Convert partition values to correct types
    if partition_values and partition_keys:
        converted_partition_values = [
            int(value) if pk.key_type == PartitionKeyType.INT else value
            for value, pk in zip(partition_values, partition_keys)
        ]
    else:
        converted_partition_values = partition_values

    staged_partition: Partition = metastore.stage_partition(
        source_table_stream,
        converted_partition_values,
        partition_scheme_id="default_partition_scheme" if partition_keys else None,
        **storage_kwargs,
    )
    staged_delta = metastore.stage_delta(
        input_deltas, staged_partition, input_delta_type, **storage_kwargs
    )
    metastore.commit_delta(staged_delta, **storage_kwargs)
    metastore.commit_partition(staged_partition, **storage_kwargs)

    # Re-read the source stream now that the partition has been committed.
    source_table_stream_after_committed: Stream = metastore.get_stream(
        namespace=source_namespace,
        table_name=source_table_name,
        table_version=source_table_version,
        **storage_kwargs,
    )

    if simulate_is_inplace:
        # In-place simulation: the destination is the source table itself.
        destination_table_namespace = source_namespace
        destination_table_name = source_table_name
        destination_table_version = source_table_version
    else:
        (
            destination_table_namespace,
            destination_table_name,
            destination_table_version,
        ) = create_destination_table_main(
            sort_keys, partition_keys, input_deltas, storage_kwargs
        )

    destination_table_stream: Stream = metastore.get_stream(
        namespace=destination_table_namespace,
        table_name=destination_table_name,
        table_version=destination_table_version,
        **storage_kwargs,
    )

    return (
        source_table_stream_after_committed,
        destination_table_stream,
        None,
        source_namespace,
        source_table_name,
        source_table_version,
    )
547
+
548
+
549
def create_src_w_deltas_destination_rebase_w_deltas_strategy_main(
    sort_keys: Optional[List[Any]],
    partition_keys: Optional[List[PartitionKey]],
    input_deltas: pa.Table,
    input_delta_type: DeltaType,
    partition_values: Optional[List[Any]],
    ds_mock_kwargs: Optional[Dict[str, Any]],
) -> Tuple[Stream, Stream, Optional[Stream]]:
    """
    Main storage version of create_src_w_deltas_destination_rebase_w_deltas_strategy

    Creates source table with deltas, destination table, and rebase table for rebase testing.
    This test scenario sets up different source and rebase partition locators to simulate
    scenarios like hash bucket count changes.

    Args:
        sort_keys: Forwarded to the table creation helpers.
        partition_keys: Forwarded to the table creation helpers; also used to
            coerce partition_values and to choose the partition scheme id.
        input_deltas: Table committed as one delta to both the source and the
            rebase partition.
        input_delta_type: Delta type used when staging the source delta.
        partition_values: Raw partition values; INT keys are converted with
            int(), and ISO-like "...T...Z" strings paired with TIMESTAMP keys
            are converted to integer microseconds since the epoch.
        ds_mock_kwargs: Keyword arguments forwarded to every metastore call.
            None is treated as "no extra arguments".

    Returns:
        (source stream re-read after its commit, destination stream,
        rebase stream re-read after its commit)
    """
    from deltacat.utils.common import current_time_ms

    # `**None` raises TypeError, so normalise the Optional argument once.
    storage_kwargs = ds_mock_kwargs or {}
    partition_scheme_id = "default_partition_scheme" if partition_keys else None

    def _convert_partition_value(value, pk):
        # Coerce one raw partition value according to its partition key type.
        if pk.key_type == PartitionKeyType.INT:
            return int(value)
        if (
            pk.key_type == PartitionKeyType.TIMESTAMP
            and isinstance(value, str)
            and "T" in value
            and value.endswith("Z")
        ):
            import pandas as pd

            # Convert to microseconds since epoch for PyArrow timestamp[us]
            return int(pd.to_datetime(value).timestamp() * 1_000_000)
        return value

    # Captured before any table exists; the same position is stamped on both
    # the source delta and the rebase delta below.
    last_stream_position = current_time_ms()
    source_namespace, source_table_name, source_table_version = create_src_table_main(
        sort_keys, partition_keys, input_deltas, storage_kwargs
    )
    source_table_stream: Stream = metastore.get_stream(
        namespace=source_namespace,
        table_name=source_table_name,
        table_version=source_table_version,
        **storage_kwargs,
    )

    # Convert partition values to correct types, including timestamp handling
    if partition_values and partition_keys:
        converted_partition_values = [
            _convert_partition_value(value, pk)
            for value, pk in zip(partition_values, partition_keys)
        ]
    else:
        converted_partition_values = partition_values

    source_staged_partition: Partition = metastore.stage_partition(
        source_table_stream,
        converted_partition_values,
        partition_scheme_id=partition_scheme_id,
        **storage_kwargs,
    )
    source_staged_delta: Delta = metastore.stage_delta(
        input_deltas, source_staged_partition, input_delta_type, **storage_kwargs
    )
    source_staged_delta.locator.stream_position = last_stream_position
    metastore.commit_delta(source_staged_delta, **storage_kwargs)
    metastore.commit_partition(source_staged_partition, **storage_kwargs)

    source_table_stream_after_committed: Stream = metastore.get_stream(
        namespace=source_namespace,
        table_name=source_table_name,
        table_version=source_table_version,
        **storage_kwargs,
    )

    # Create the destination table
    (
        destination_table_namespace,
        destination_table_name,
        destination_table_version,
    ) = create_destination_table_main(
        sort_keys, partition_keys, input_deltas, storage_kwargs
    )

    # Create the rebase table
    (
        rebase_table_namespace,
        rebase_table_name,
        rebase_table_version,
    ) = create_rebase_table_main(
        sort_keys, partition_keys, input_deltas, storage_kwargs
    )
    rebasing_table_stream: Stream = metastore.get_stream(
        namespace=rebase_table_namespace,
        table_name=rebase_table_name,
        table_version=rebase_table_version,
        **storage_kwargs,
    )

    rebase_staged_partition: Partition = metastore.stage_partition(
        rebasing_table_stream,
        converted_partition_values,
        partition_scheme_id=partition_scheme_id,
        **storage_kwargs,
    )
    # NOTE(review): unlike the source delta above, input_delta_type is not
    # passed here, so the rebase delta uses stage_delta's default delta type.
    # Kept as-is; confirm whether this asymmetry is intentional.
    rebase_staged_delta: Delta = metastore.stage_delta(
        input_deltas, rebase_staged_partition, **storage_kwargs
    )
    rebase_staged_delta.locator.stream_position = last_stream_position
    metastore.commit_delta(rebase_staged_delta, **storage_kwargs)
    metastore.commit_partition(rebase_staged_partition, **storage_kwargs)

    # Get destination stream
    destination_table_stream: Stream = metastore.get_stream(
        namespace=destination_table_namespace,
        table_name=destination_table_name,
        table_version=destination_table_version,
        **storage_kwargs,
    )

    rebased_stream_after_committed: Stream = metastore.get_stream(
        namespace=rebase_table_namespace,
        table_name=rebase_table_name,
        table_version=rebase_table_version,
        **storage_kwargs,
    )

    return (
        source_table_stream_after_committed,
        destination_table_stream,
        rebased_stream_after_committed,
    )
677
+
678
+
679
def create_incremental_deltas_on_source_table_main(
    source_namespace: str,
    source_table_name: str,
    source_table_version: str,
    source_table_stream: Stream,
    partition_values_param,
    incremental_deltas: List[Tuple[pa.Table, DeltaType, Optional[Dict[str, str]]]],
    ds_mock_kwargs: Optional[Dict[str, Any]] = None,
) -> Tuple[Optional[PartitionLocator], Optional[Delta], int, bool]:
    """
    Main storage version of create_incremental_deltas_on_source_table

    Looks up (or, failing that, creates) the source partition for the given
    partition values and commits every non-None incremental delta to it.

    Args:
        source_namespace: Namespace used to re-read the source stream afterwards.
        source_table_name: Table name used to re-read the source stream afterwards.
        source_table_version: Table version used to re-read the source stream afterwards.
        source_table_stream: Stream whose partition receives the deltas.
        partition_values_param: Raw partition values. When the stream has a
            partition scheme with keys, "...T...Z" strings become integer
            microseconds since the epoch and all-digit strings become ints.
        incremental_deltas: (table, delta type, entry params) tuples; entries
            whose table is None are skipped.
        ds_mock_kwargs: Keyword arguments forwarded to every metastore call.
            None (the default) is treated as "no extra arguments".

    Returns:
        (locator of the partition re-read after the commits, last committed
        delta, total record count, whether any DELETE delta was seen), or
        (None, None, total record count, delete flag) when no delta was committed.

    Raises:
        ValueError: If the partition can neither be retrieved nor created.
    """
    # The default is None and `**None` raises TypeError, so normalise once.
    storage_kwargs = ds_mock_kwargs or {}

    def _coerce_partition_value(value):
        # Guess the stored representation from the value's textual format.
        if isinstance(value, str) and "T" in value and value.endswith("Z"):
            # This looks like a timestamp string - convert it
            import pandas as pd

            # Convert to microseconds since epoch for PyArrow timestamp[us]
            return int(pd.to_datetime(value).timestamp() * 1_000_000)
        if isinstance(value, str) and value.isdigit():
            # This looks like an integer string
            return int(value)
        return value

    # Convert partition values for partition lookup (same as in other helper functions)
    partition_scheme = source_table_stream.partition_scheme
    converted_partition_values_for_lookup = partition_values_param
    if partition_values_param and partition_scheme and partition_scheme.keys:
        # Only values that line up with a partition key are coerced; any extra
        # trailing values are passed through untouched.
        partition_key_count = len(list(partition_scheme.keys))
        converted_partition_values_for_lookup = [
            _coerce_partition_value(value) if i < partition_key_count else value
            for i, value in enumerate(partition_values_param)
        ]

    # Get the current partition to stage deltas against
    try:
        source_partition: Optional[Partition] = metastore.get_partition(
            source_table_stream.locator,
            converted_partition_values_for_lookup,
            **storage_kwargs,
        )
    except Exception:
        # Deliberate best effort: a failed lookup is treated like a missing
        # partition and handled by the creation fallback below.
        source_partition = None

    if source_partition is None:
        # The partition might not exist yet: stage and commit an empty one,
        # then look it up again. This also covers a lookup that reports
        # "not found" by returning None instead of raising.
        staged_partition: Partition = metastore.stage_partition(
            source_table_stream,
            converted_partition_values_for_lookup,
            partition_scheme_id="default_partition_scheme"
            if partition_scheme
            else None,
            **storage_kwargs,
        )
        metastore.commit_partition(staged_partition, **storage_kwargs)
        source_partition = metastore.get_partition(
            source_table_stream.locator,
            converted_partition_values_for_lookup,
            **storage_kwargs,
        )

    if source_partition is None:
        raise ValueError(
            f"Could not create or retrieve partition for values: {converted_partition_values_for_lookup}"
        )

    total_records = 0
    has_delete_deltas = False
    new_delta = None
    for delta_table, delta_type, properties_dict in incremental_deltas:
        # Skip None deltas (empty incremental deltas)
        if delta_table is None:
            continue

        total_records += len(delta_table)
        if delta_type == DeltaType.DELETE:
            has_delete_deltas = True

        # Stage and commit the delta
        staged_delta: Delta = metastore.stage_delta(
            delta_table,
            source_partition,
            delta_type,
            entry_params=properties_dict,
            **storage_kwargs,
        )
        new_delta = metastore.commit_delta(staged_delta, **storage_kwargs)

    # If all deltas were None, return None for new_delta
    if new_delta is None:
        return None, None, total_records, has_delete_deltas

    # Get updated stream after deltas were committed
    source_table_stream_after_committed: Stream = metastore.get_stream(
        source_namespace,
        source_table_name,
        source_table_version,
        **storage_kwargs,
    )

    # Get updated partition after deltas were committed
    source_partition_after_committed: Partition = metastore.get_partition(
        source_table_stream_after_committed.locator,
        converted_partition_values_for_lookup,
        **storage_kwargs,
    )

    return (
        source_partition_after_committed.locator,
        new_delta,
        total_records,
        has_delete_deltas,
    )
816
+
817
+
818
def get_compacted_delta_locator_from_partition(
    partition_locator: PartitionLocator, deltacat_storage=None, **kwargs
):
    """
    Get compacted delta locator from partition RoundCompletionInfo.

    Args:
        partition_locator: Locator of the partition containing the RoundCompletionInfo
        deltacat_storage: Storage implementation, forwarded to get_rci_from_partition
        **kwargs: Additional arguments to pass to get_rci_from_partition (e.g., catalog)

    Returns:
        The compacted_delta_locator of the partition's RoundCompletionInfo, or
        None when get_rci_from_partition yields nothing truthy
    """
    rci: RoundCompletionInfo = get_rci_from_partition(
        partition_locator, deltacat_storage, **kwargs
    )
    return rci.compacted_delta_locator if rci else None
183
839
 
184
840
 
185
841
  def offer_iso8601_timestamp_list(
@@ -318,3 +974,27 @@ def assert_compaction_audit_no_hash_bucket(
318
974
  for entry in audit_entries:
319
975
  assert entry is not None
320
976
  return True
977
+
978
+
979
def read_audit_file(audit_file_path: str, catalog_root: str) -> Dict[str, Any]:
    """
    Load a JSON audit file located under a catalog root.

    The two path arguments are joined with POSIX separators, the result is
    resolved to a (path, filesystem) pair via resolve_path_and_filesystem, and
    the file's bytes are decoded as UTF-8 and parsed as JSON.

    Args:
        audit_file_path: Relative path to the audit file from catalog root
        catalog_root: Absolute path to the catalog root directory

    Returns:
        Dictionary containing audit data
    """
    import json
    import posixpath

    from deltacat.utils.filesystem import resolve_path_and_filesystem

    # Resolve absolute path from relative audit path
    resolved_path, audit_filesystem = resolve_path_and_filesystem(
        posixpath.join(catalog_root, audit_file_path)
    )
    with audit_filesystem.open_input_stream(resolved_path) as audit_stream:
        raw_bytes = audit_stream.read()
    return json.loads(raw_bytes.decode("utf-8"))