deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367) hide show
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,439 @@
1
+ import argparse
2
+ from typing import Optional
3
+
4
+
5
def run_compactor_local(
    namespace: str,
    table_name: str,
    table_version: str,
    partition_values: str,
    dest_namespace: str,
    dest_table_name: str,
    dest_table_version: str,
    dest_partition_values: str,
    last_stream_position: int,
    primary_keys: str,
    catalog_root: Optional[str] = None,
    compactor_version: str = "V2",
    sort_keys: Optional[str] = None,
    hash_bucket_count: Optional[int] = None,
    records_per_file: int = 1000000,
    table_writer_compression: str = "lz4",
) -> None:
    """
    Run the compactor directly in the current Python process.

    Prints the command-line arguments that correspond to the given
    parameters (for reference only), then imports the sibling ``compactor``
    module and calls ``compactor.run`` with every parameter forwarded
    unchanged as a keyword argument.

    Args:
        namespace: Source table namespace.
        table_name: Source table name.
        table_version: Source table version.
        partition_values: Partition values for the source.
        dest_namespace: Destination table namespace.
        dest_table_name: Destination table name.
        dest_table_version: Destination table version.
        dest_partition_values: Partition values for the destination.
        last_stream_position: Last stream position to compact.
        primary_keys: Primary keys.
        catalog_root: Catalog root; forwarded as-is (may be None).
        compactor_version: Compactor version to use.
        sort_keys: Sort keys; forwarded as-is (may be None).
        hash_bucket_count: Hash bucket count; forwarded as-is (may be None).
        records_per_file: Records per compacted file.
        table_writer_compression: Compression type for the table writer.
    """
    # Equivalent CLI arguments. These are only printed; the compactor itself
    # is invoked through a regular function call below.
    cmd_args = [
        f"--namespace '{namespace}'",
        f"--table-name '{table_name}'",
        f"--table-version '{table_version}'",
        f"--partition-values '{partition_values}'",
        f"--dest-namespace '{dest_namespace}'",
        f"--dest-table-name '{dest_table_name}'",
        f"--dest-table-version '{dest_table_version}'",
        f"--dest-partition-values '{dest_partition_values}'",
        f"--last-stream-position {last_stream_position}",
        f"--primary-keys '{primary_keys}'",
        f"--compactor-version '{compactor_version}'",
    ]

    # Optional arguments are only shown when they differ from "not set" or
    # from the default value.
    if catalog_root:
        cmd_args.append(f"--catalog-root '{catalog_root}'")
    if sort_keys:
        cmd_args.append(f"--sort-keys '{sort_keys}'")
    if hash_bucket_count is not None:
        cmd_args.append(f"--hash-bucket-count {hash_bucket_count}")
    if records_per_file != 1000000:
        cmd_args.append(f"--records-per-file {records_per_file}")
    if table_writer_compression != "lz4":
        cmd_args.append(f"--table-writer-compression '{table_writer_compression}'")

    cmd_str = " ".join(cmd_args)
    print(f"Running compactor with arguments: {cmd_str}")

    # A relative import only works when this module was loaded as part of a
    # package. When the file is executed as a plain script
    # (``python job_runner.py``) ``__package__`` is empty and
    # ``from . import compactor`` raises ImportError, so fall back to
    # importing the sibling module by its top-level name.
    if __package__:
        from . import compactor
    else:
        import compactor

    compactor.run(
        namespace=namespace,
        table_name=table_name,
        table_version=table_version,
        partition_values=partition_values,
        dest_namespace=dest_namespace,
        dest_table_name=dest_table_name,
        dest_table_version=dest_table_version,
        dest_partition_values=dest_partition_values,
        last_stream_position=last_stream_position,
        primary_keys=primary_keys,
        catalog_root=catalog_root,
        compactor_version=compactor_version,
        sort_keys=sort_keys,
        hash_bucket_count=hash_bucket_count,
        records_per_file=records_per_file,
        table_writer_compression=table_writer_compression,
    )
82
+
83
+
84
def run_compactor_local_job(
    namespace: str,
    table_name: str,
    table_version: str,
    partition_values: str,
    dest_namespace: str,
    dest_table_name: str,
    dest_table_version: str,
    dest_partition_values: str,
    last_stream_position: int,
    primary_keys: str,
    catalog_root: Optional[str] = None,
    compactor_version: str = "V2",
    sort_keys: Optional[str] = None,
    hash_bucket_count: Optional[int] = None,
    records_per_file: int = 1000000,
    table_writer_compression: str = "lz4",
):
    """
    Submit the compactor as a local Ray job using a local job client.

    Builds a ``python compactor.py ...`` shell command from the given
    parameters and runs it through ``deltacat.local_job_client()`` with
    ``./deltacat/examples/compactor/`` as the job's working directory.

    Every value is escaped with ``shlex.quote`` so that values containing
    quotes, whitespace or shell metacharacters reach ``compactor.py`` as a
    single, unmodified argument.

    Args:
        namespace: Source table namespace.
        table_name: Source table name.
        table_version: Source table version.
        partition_values: Partition values for the source.
        dest_namespace: Destination table namespace.
        dest_table_name: Destination table name.
        dest_table_version: Destination table version.
        dest_partition_values: Partition values for the destination.
        last_stream_position: Last stream position to compact.
        primary_keys: Primary keys.
        catalog_root: Catalog root; the flag is omitted when empty or None.
        compactor_version: Compactor version to use.
        sort_keys: Sort keys; the flag is omitted when empty or None.
        hash_bucket_count: Hash bucket count; the flag is omitted when None.
        records_per_file: Records per compacted file; the flag is omitted
            when equal to 1000000.
        table_writer_compression: Compression type for the table writer; the
            flag is omitted when equal to "lz4".

    Returns:
        The object returned by ``client.run_job``. This function reads its
        ``job_id``, ``job_status`` and ``job_logs`` attributes.
    """
    import shlex

    from deltacat import local_job_client

    def quote(value) -> str:
        # str() first so that integer parameters can be quoted as well.
        return shlex.quote(str(value))

    # Build command arguments
    cmd_args = [
        "python compactor.py",
        f"--namespace {quote(namespace)}",
        f"--table-name {quote(table_name)}",
        f"--table-version {quote(table_version)}",
        f"--partition-values {quote(partition_values)}",
        f"--dest-namespace {quote(dest_namespace)}",
        f"--dest-table-name {quote(dest_table_name)}",
        f"--dest-table-version {quote(dest_table_version)}",
        f"--dest-partition-values {quote(dest_partition_values)}",
        f"--last-stream-position {quote(last_stream_position)}",
        f"--primary-keys {quote(primary_keys)}",
        f"--compactor-version {quote(compactor_version)}",
    ]

    # Add optional arguments only when they are set / differ from the default
    if catalog_root:
        cmd_args.append(f"--catalog-root {quote(catalog_root)}")
    if sort_keys:
        cmd_args.append(f"--sort-keys {quote(sort_keys)}")
    if hash_bucket_count is not None:
        cmd_args.append(f"--hash-bucket-count {quote(hash_bucket_count)}")
    if records_per_file != 1000000:
        cmd_args.append(f"--records-per-file {quote(records_per_file)}")
    if table_writer_compression != "lz4":
        cmd_args.append(
            f"--table-writer-compression {quote(table_writer_compression)}"
        )

    # Join all arguments into a single shell command
    entrypoint = " ".join(cmd_args)
    print(f"Submitting local Ray job with entrypoint: {entrypoint}")

    # Submit the job
    client = local_job_client()
    job_run_result = client.run_job(
        entrypoint=entrypoint,
        runtime_env={"working_dir": "./deltacat/examples/compactor/"},
    )

    print(f"Job ID {job_run_result.job_id} terminal state: {job_run_result.job_status}")
    print(f"Job logs: {job_run_result.job_logs}")

    return job_run_result
153
+
154
+
155
def run_compactor_remote_job(
    namespace: str,
    table_name: str,
    table_version: str,
    partition_values: str,
    dest_namespace: str,
    dest_table_name: str,
    dest_table_version: str,
    dest_partition_values: str,
    last_stream_position: int,
    primary_keys: str,
    catalog_root: Optional[str] = None,
    compactor_version: str = "V2",
    sort_keys: Optional[str] = None,
    hash_bucket_count: Optional[int] = None,
    records_per_file: int = 1000000,
    table_writer_compression: str = "lz4",
):
    """
    Submit the compactor as a remote Ray job using a remote job client.

    Builds a ``python compactor.py ...`` shell command from the given
    parameters and runs it through ``deltacat.job_client("./aws/deltacat.yaml")``
    with ``./deltacat/examples/compactor/`` as the job's working directory.

    Every value is escaped with ``shlex.quote`` so that values containing
    quotes, whitespace or shell metacharacters reach ``compactor.py`` as a
    single, unmodified argument.

    Args:
        namespace: Source table namespace
        table_name: Source table name
        table_version: Source table version
        partition_values: Comma-separated partition values for source
        dest_namespace: Destination table namespace
        dest_table_name: Destination table name
        dest_table_version: Destination table version
        dest_partition_values: Comma-separated partition values for destination
        last_stream_position: Last stream position to compact
        primary_keys: Comma-separated primary keys
        catalog_root: Root path for catalog. The flag is omitted when empty or
            None; the default is then chosen by compactor.py (presumably a
            temp directory - confirm against compactor.py).
        compactor_version: Compactor version to use (V1 or V2)
        sort_keys: Comma-separated sort keys (optional)
        hash_bucket_count: Number of hash buckets; the flag is omitted when
            None (reportedly required for V2 - confirm against compactor.py).
        records_per_file: Records per compacted file; the flag is omitted when
            equal to 1000000.
        table_writer_compression: Compression type for table writer; the flag
            is omitted when equal to "lz4".

    Returns:
        The object returned by ``client.run_job``. This function reads its
        ``job_status`` attribute.
    """
    import shlex

    from deltacat import job_client

    def quote(value) -> str:
        # str() first so that integer parameters can be quoted as well.
        return shlex.quote(str(value))

    # Build command arguments - same as local job
    cmd_args = [
        "python compactor.py",
        f"--namespace {quote(namespace)}",
        f"--table-name {quote(table_name)}",
        f"--table-version {quote(table_version)}",
        f"--partition-values {quote(partition_values)}",
        f"--dest-namespace {quote(dest_namespace)}",
        f"--dest-table-name {quote(dest_table_name)}",
        f"--dest-table-version {quote(dest_table_version)}",
        f"--dest-partition-values {quote(dest_partition_values)}",
        f"--last-stream-position {quote(last_stream_position)}",
        f"--primary-keys {quote(primary_keys)}",
        f"--compactor-version {quote(compactor_version)}",
    ]

    # Add optional arguments only when they are set / differ from the default
    if catalog_root:
        cmd_args.append(f"--catalog-root {quote(catalog_root)}")
    if sort_keys:
        cmd_args.append(f"--sort-keys {quote(sort_keys)}")
    if hash_bucket_count is not None:
        cmd_args.append(f"--hash-bucket-count {quote(hash_bucket_count)}")
    if records_per_file != 1000000:
        cmd_args.append(f"--records-per-file {quote(records_per_file)}")
    if table_writer_compression != "lz4":
        cmd_args.append(
            f"--table-writer-compression {quote(table_writer_compression)}"
        )

    # Join all arguments into a single shell command
    entrypoint = " ".join(cmd_args)
    print(f"Submitting remote Ray job with entrypoint: {entrypoint}")

    # Submit the job
    # TODO(pdames): Take cloud as an input parameter.
    client = job_client(
        "./aws/deltacat.yaml"
    )  # or job_client() to use current directory
    job_run_result = client.run_job(
        entrypoint=entrypoint,
        runtime_env={"working_dir": "./deltacat/examples/compactor/"},
    )

    print(f"Job completed with status: {job_run_result.job_status}")
    return job_run_result
243
+
244
+
245
if __name__ == "__main__":
    # DeltaCAT Job Runner Example - Run compactor jobs using different methods
    #
    # This script demonstrates three ways to run the DeltaCAT compactor:
    #   1. Locally in the current process
    #   2. As a local Ray job
    #   3. As a remote Ray job
    #
    # Example usage:
    #   $ python job_runner.py \
    #       --namespace 'events' \
    #       --table-name 'user_events' \
    #       --table-version '2' \
    #       --partition-values 'region=us-west-2' \
    #       --dest-namespace 'events' \
    #       --dest-table-name 'user_events_compacted' \
    #       --dest-table-version '1' \
    #       --dest-partition-values 'region=us-west-2' \
    #       --last-stream-position 5000 \
    #       --primary-keys 'user_id,event_id' \
    #       --sort-keys 'timestamp,event_type' \
    #       --compactor-version 'V2' \
    #       --hash-bucket-count 1 \
    #       --job-type 'local'

    parser = argparse.ArgumentParser(
        description="DeltaCAT Job Runner Example - Run compactor jobs using different methods"
    )

    # Source table coordinates.
    parser.add_argument(
        "--namespace", help="Source table namespace", type=str, required=True
    )
    parser.add_argument(
        "--table-name", help="Source table name", type=str, required=True
    )
    parser.add_argument(
        "--table-version", help="Source table version", type=str, required=True
    )
    parser.add_argument(
        "--partition-values",
        help="Comma-separated partition values for source (leave empty for no partition values)",
        type=str,
        default="",
    )

    # Destination table coordinates.
    parser.add_argument(
        "--dest-namespace", help="Destination table namespace", type=str, required=True
    )
    parser.add_argument(
        "--dest-table-name", help="Destination table name", type=str, required=True
    )
    parser.add_argument(
        "--dest-table-version",
        help="Destination table version",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--dest-partition-values",
        help="Comma-separated partition values for destination (leave empty for no partition values)",
        type=str,
        default="",
    )

    # Compaction inputs.
    parser.add_argument(
        "--last-stream-position",
        help="Last stream position to compact",
        type=int,
        required=True,
    )
    parser.add_argument(
        "--primary-keys", help="Comma-separated primary keys", type=str, required=True
    )
    parser.add_argument(
        "--catalog-root",
        help="Root path for catalog (defaults to temp directory)",
        type=str,
        default=None,
    )
    parser.add_argument(
        "--compactor-version",
        help="Compactor version to use (V1 or V2)",
        type=str,
        choices=["V1", "V2"],
        default="V2",
    )
    parser.add_argument(
        "--sort-keys",
        help="Comma-separated sort keys (optional)",
        type=str,
        default=None,
    )
    parser.add_argument(
        "--hash-bucket-count",
        help="Number of hash buckets (required for V2, ignored for V1)",
        type=int,
        default=None,
    )
    parser.add_argument(
        "--records-per-file",
        help="Records per compacted file",
        type=int,
        default=1000000,
    )
    parser.add_argument(
        "--table-writer-compression",
        help="Compression type for table writer",
        type=str,
        choices=["lz4", "snappy", "gzip", "brotli", "zstd"],
        default="lz4",
    )

    # Execution mode; consumed here and not forwarded to the compactor runners.
    parser.add_argument(
        "--job-type",
        help="Type of job execution",
        type=str,
        choices=["local", "local-job", "remote-job"],
        default="local",
    )

    cli_args = parser.parse_args()
    print(f"Command Line Arguments: {cli_args}")

    # vars() exposes the namespace's own attribute dict, so popping the job
    # type leaves exactly the keyword arguments the runner functions receive.
    job_kwargs = vars(cli_args)
    job_type = job_kwargs.pop("job_type")

    if job_type == "local":
        print("Running compactor locally...")
        run_compactor_local(**job_kwargs)
    elif job_type == "local-job":
        print("Submitting local Ray job...")
        run_compactor_local_job(**job_kwargs)
    elif job_type == "remote-job":
        print("Submitting remote Ray job...")
        run_compactor_remote_job(**job_kwargs)
    else:
        # argparse `choices` should already reject anything else; kept as a
        # guard in case the choices list and this dispatch drift apart.
        raise ValueError(f"Invalid job type: {job_type}")

    print("Job runner completed!")
@@ -0,0 +1 @@
1
+ # Common utilities for DeltaCAT compactor examples
@@ -0,0 +1,261 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Common utilities for DeltaCAT compactor examples.
4
+
5
+ This module contains shared functionality used across bootstrap.py, explorer.py,
6
+ and compactor.py to reduce code duplication.
7
+ """
8
+
9
+ from typing import Set, List, Optional, Tuple
10
+
11
+ import deltacat as dc
12
+ from deltacat import DeltaCatUrl
13
+ from deltacat.catalog import Catalog, put_catalog, get_table
14
+ from deltacat.catalog.model.properties import CatalogProperties
15
+ from deltacat.storage import metastore
16
+ from deltacat.storage.model.partition import PartitionLocator
17
+ from deltacat.storage.model.sort_key import SortKey
18
+ from deltacat.storage.model.types import SortOrder
19
+
20
+
21
def get_default_catalog_root() -> str:
    """
    Return the catalog root path used when a caller does not supply one.

    Returns:
        The fixed path "/tmp/deltacat_test".
    """
    default_root = "/tmp/deltacat_test"
    return default_root
24
+
25
+
26
def initialize_catalog(
    catalog_root: Optional[str] = None, catalog_name: str = "default"
) -> CatalogProperties:
    """
    Build catalog properties for a root directory and register a catalog.

    Args:
        catalog_root: Root directory for the catalog. When None, the value of
            get_default_catalog_root() is used instead.
        catalog_name: Name passed to put_catalog for the new Catalog.

    Returns:
        The CatalogProperties instance the registered Catalog was built from.
    """
    root = get_default_catalog_root() if catalog_root is None else catalog_root
    properties = CatalogProperties(root=root)

    # Wrap the properties in a Catalog and register it under the given name.
    put_catalog(catalog_name, Catalog(config=properties))

    return properties
49
+
50
+
51
def initialize_deltacat_url_catalog(
    catalog_root: Optional[str] = None, catalog_name: str = "compactor_test_catalog"
) -> DeltaCatUrl:
    """
    Initialize DeltaCAT and put a catalog at a "dc://<catalog_name>" URL.

    Args:
        catalog_root: Root directory for the catalog. When None, the value of
            get_default_catalog_root() is used instead.
        catalog_name: Name embedded in the catalog URL.

    Returns:
        The DeltaCatUrl that the catalog was put at.
    """
    root = catalog_root if catalog_root is not None else get_default_catalog_root()

    dc.init()
    url = DeltaCatUrl(f"dc://{catalog_name}")
    dc.put(url, root=root)

    return url
72
+
73
+
74
def parse_primary_keys(primary_keys_str: str) -> Set[str]:
    """
    Parse a comma-separated primary key string into a set of key names.

    Args:
        primary_keys_str: Comma-separated key names, e.g. "user_id, event_id".

    Returns:
        Set of key names with surrounding whitespace removed. Entries that are
        empty after stripping (for example from a trailing comma) are dropped,
        and duplicate names collapse into one.
    """
    # Strip each entry once, then keep only the non-empty results.
    stripped_keys = (key.strip() for key in primary_keys_str.split(","))
    return {key for key in stripped_keys if key}
77
+
78
+
79
def parse_partition_values(partition_values_str: Optional[str]) -> List[str]:
    """
    Parse a comma-separated partition value string into a list.

    Args:
        partition_values_str: Comma-separated partition values. None, an empty
            string, or a whitespace-only string all mean "no partition values".

    Returns:
        Partition values in their original order with surrounding whitespace
        removed. Entries that are empty after stripping are dropped.
    """
    # Treat a missing value like an empty one instead of failing on None.
    if not partition_values_str:
        return []

    # Whitespace-only input yields no surviving entries, so it needs no
    # separate check.
    stripped_values = (value.strip() for value in partition_values_str.split(","))
    return [value for value in stripped_values if value]
84
+
85
+
86
def parse_sort_keys(sort_keys_str: str) -> List[SortKey]:
    """
    Parse a comma-separated sort key string into SortKey objects.

    Args:
        sort_keys_str: Comma-separated key names. A falsy or whitespace-only
            value produces an empty list.

    Returns:
        One SortKey per non-empty name, in input order, each created with
        SortOrder.ASCENDING.
    """
    if not (sort_keys_str and sort_keys_str.strip()):
        return []

    key_names = (part.strip() for part in sort_keys_str.split(","))
    return [
        SortKey.of(key=name, sort_order=SortOrder.ASCENDING)
        for name in key_names
        if name
    ]
97
+
98
+
99
def create_partition_locator(
    namespace: str,
    table_name: str,
    table_version: str,
    partition_values: List[str],
) -> PartitionLocator:
    """
    Build a partition locator directly from table coordinates.

    No partition ID is supplied here, so the result may not work for all
    operations. Use get_actual_partition_locator() for operations that
    require the actual partition ID.

    Args:
        namespace: Table namespace
        table_name: Table name
        table_version: Table version
        partition_values: Partition values

    Returns:
        The PartitionLocator produced by PartitionLocator.of.
    """
    # NOTE(review): confirm that PartitionLocator.of accepts these keyword
    # arguments in the installed deltacat version.
    locator_fields = {
        "namespace": namespace,
        "table_name": table_name,
        "table_version": table_version,
        "partition_values": partition_values,
    }
    return PartitionLocator.of(**locator_fields)
117
+
118
+
119
def get_actual_partition_locator(
    namespace: str,
    table_name: str,
    table_version: str,
    partition_values: List[str],
    catalog: CatalogProperties,
    catalog_name: str = "default",
) -> PartitionLocator:
    """
    Resolve a partition locator by looking the partition up in the metastore.

    The catalog is registered under catalog_name, the table definition is
    fetched, and the partition is read from the table's stream. If any of
    those steps raises, a warning is printed and the locator built by
    create_partition_locator() is returned instead.

    Args:
        namespace: Table namespace
        table_name: Table name
        table_version: Table version (only used by the fallback locator)
        partition_values: Partition values (can be empty list)
        catalog: CatalogProperties instance
        catalog_name: Name of the registered catalog

    Returns:
        The locator of the partition found in the metastore, or the fallback
        locator when the lookup fails.
    """
    try:
        # Register the catalog so get_table can be addressed by catalog_name.
        put_catalog(catalog_name, Catalog(config=catalog))

        # NOTE(review): table_version is not passed to get_table here --
        # confirm which table version this lookup resolves to.
        table_definition = get_table(
            name=table_name, namespace=namespace, catalog=catalog_name
        )

        # An empty partition_values list is passed to the metastore as None.
        resolved_partition = metastore.get_partition(
            stream_locator=table_definition.stream.locator,
            partition_values=partition_values or None,
            catalog=catalog,
        )
        return resolved_partition.locator

    except Exception as e:
        # Deliberate best-effort: report the failure and fall back.
        print(f"⚠️ Failed to get actual partition locator: {e}")
        print(" Falling back to basic partition locator")
        return create_partition_locator(
            namespace, table_name, table_version, partition_values
        )
167
+
168
+
169
def format_partition_values_for_command(partition_values: Optional[List[str]]) -> str:
    """
    Render partition values as one comma-separated command line argument.

    Args:
        partition_values: Partition values, or None/empty for no values.

    Returns:
        The values converted with str() and joined by ",", or "" when there
        are no values.
    """
    return ",".join(map(str, partition_values)) if partition_values else ""
174
+
175
+
176
def get_max_stream_position_from_partition(
    namespace: str,
    table_name: str,
    table_version: str,
    partition_values: List[str],
    catalog: CatalogProperties,
    catalog_name: str = "default",
) -> int:
    """
    Find the highest stream position among a partition's deltas.

    Args:
        namespace: Table namespace
        table_name: Table name
        table_version: Table version
        partition_values: Partition values
        catalog: CatalogProperties instance
        catalog_name: Name of the registered catalog

    Returns:
        The maximum stream_position of the listed deltas. Returns 1000 when
        no deltas are listed or when any step raises (a warning is printed in
        the latter case).
    """
    fallback_position = 1000

    try:
        locator = get_actual_partition_locator(
            namespace,
            table_name,
            table_version,
            partition_values,
            catalog,
            catalog_name,
        )

        # list_partition_deltas is handed a minimal stand-in object that only
        # carries a `locator` attribute.
        partition_like = type("obj", (object,), {"locator": locator})()

        deltas = metastore.list_partition_deltas(
            partition_like=partition_like,
            include_manifest=True,
            catalog=catalog,
        ).all_items()

        if not deltas:
            return fallback_position
        return max(delta.stream_position for delta in deltas)

    except Exception as e:
        # Deliberate best-effort: report the failure and use the fallback.
        print(f"⚠️ Failed to get max stream position: {e}")
        return fallback_position
228
+
229
+
230
def get_bootstrap_destination_info(
    source_namespace: str, source_table: str
) -> Tuple[str, str]:
    """
    Map a source table to the destination used for its compacted output.

    Args:
        source_namespace: Source namespace
        source_table: Source table name

    Returns:
        Tuple of (dest_namespace, dest_table_name). The source
        "compactor_test_source"/"events" maps to
        "compactor_test_dest"/"events_compacted"; any other source keeps its
        namespace and gets "_compacted" appended to the table name.
    """
    bootstrap_source = ("compactor_test_source", "events")
    if (source_namespace, source_table) == bootstrap_source:
        return "compactor_test_dest", "events_compacted"

    # Generic fallback for tables other than the bootstrap test table.
    return source_namespace, f"{source_table}_compacted"
248
+
249
+
250
def print_section_header(title: str, char: str = "=", width: int = 80) -> None:
    """
    Print a title framed by a rule line above and below it.

    Args:
        title: Text printed between the two rule lines.
        char: String repeated to form each rule line.
        width: Number of times char is repeated.
    """
    rule = char * width
    for line in (rule, title, rule):
        print(line)
255
+
256
+
257
def print_subsection_header(title: str, char: str = "-", width: int = 70) -> None:
    """
    Print a title framed by a rule line above and below it.

    Args:
        title: Text printed between the two rule lines.
        char: String repeated to form each rule line.
        width: Number of times char is repeated.
    """
    divider = char * width
    for line in (divider, title, divider):
        print(line)
File without changes
File without changes