deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367):
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,863 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ DeltaCAT Compactor Bootstrap Script
4
+
5
+ This script creates test data suitable for compaction testing by:
6
+ 1. Creating source and destination namespaces and tables with schema
7
+ 2. Writing 2 test parquet files as separate deltas to the source table
8
+ 3. Staging and committing all necessary deltacat metadata (table version, stream, partition, deltas)
9
+ 4. Running compaction using the direct API (not the CLI script)
10
+
11
+ Usage:
12
+ # Use default catalog location
13
+ python bootstrap.py
14
+
15
+ # Use custom catalog location
16
+ python bootstrap.py --catalog-root /path/to/catalog
17
+
18
+ # Automatically run compaction after bootstrapping
19
+ python bootstrap.py --run-compaction yes
20
+
21
+ # Automatically run compaction after bootstrapping
22
+ python bootstrap.py --run-compaction yes
23
+
24
+ # Automatically run compaction against an S3 catalog (bucket must exist)
25
+ python bootstrap.py --run-compaction yes --catalog-root s3://bucket/key
26
+
27
+ The script creates:
28
+ - A source namespace "compactor_test_source"
29
+ - A destination namespace "compactor_test_dest"
30
+ - Source table "events" with columns: id, timestamp, user_id, event_type, data
31
+ - Destination table "events_compacted"
32
+ - 2 parquet files with overlapping data (suitable for compaction)
33
+ - All necessary deltacat metadata (table version, stream, partition, deltas)
34
+ - Working end-to-end compaction demonstration
35
+ """
36
+
37
+ import argparse
38
+ import sys
39
+
40
+ import pandas as pd
41
+
42
+ from deltacat.catalog import write_to_table, get_table, create_table
43
+ from deltacat.types.media import ContentType
44
+ from deltacat.storage import metastore
45
+ from deltacat.types.tables import TableWriteMode
46
+
47
+ # Import common utilities
48
+ from deltacat.examples.compactor.utils.common import (
49
+ get_default_catalog_root,
50
+ initialize_catalog,
51
+ )
52
+
53
+
54
def create_test_data_batch_1() -> pd.DataFrame:
    """Build the first five-row batch of sample event records.

    The rows carry ids 1 through 5 with timestamps spaced five minutes
    apart starting at 2024-01-01 10:00:00.

    Returns:
        A DataFrame with the columns ``id``, ``timestamp``, ``user_id``,
        ``event_type`` and ``data`` (in that order).
    """
    event_times = pd.to_datetime(
        [
            "2024-01-01 10:00:00",
            "2024-01-01 10:05:00",
            "2024-01-01 10:10:00",
            "2024-01-01 10:15:00",
            "2024-01-01 10:20:00",
        ]
    )
    # Each payload is a JSON document kept as a plain string.
    payloads = [
        '{"page": "home"}',
        '{"product_id": 123}',
        '{"button": "add_to_cart"}',
        '{"amount": 99.99}',
        '{"session_duration": 1200}',
    ]
    columns = {
        "id": [1, 2, 3, 4, 5],
        "timestamp": event_times,
        "user_id": [101, 102, 103, 104, 105],
        "event_type": ["login", "view", "click", "purchase", "logout"],
        "data": payloads,
    }
    return pd.DataFrame(columns)
79
+
80
+
81
def create_test_data_batch_2() -> pd.DataFrame:
    """Build the second six-row batch of sample event records.

    Ids 3, 4 and 5 repeat ids from the first batch, with later timestamps
    and updated payloads, so the two batches overlap (good for compaction).
    Ids 6, 7 and 8 are new.

    Returns:
        A DataFrame with the columns ``id``, ``timestamp``, ``user_id``,
        ``event_type`` and ``data`` (in that order).
    """
    event_times = pd.to_datetime(
        [
            # Later timestamps for the overlapping ids 3, 4, 5 (should replace).
            "2024-01-01 11:00:00",
            "2024-01-01 11:05:00",
            "2024-01-01 11:10:00",
            # New ids 6, 7, 8.
            "2024-01-01 11:15:00",
            "2024-01-01 11:20:00",
            "2024-01-01 11:25:00",
        ]
    )
    payloads = [
        # Updated data for ids 3, 4, 5.
        '{"page": "product", "updated": true}',
        '{"button": "buy_now", "updated": true}',
        '{"amount": 149.99, "updated": true}',
        # New data for ids 6, 7, 8.
        '{"page": "signup"}',
        '{"product_id": 456}',
        '{"session_duration": 800}',
    ]
    columns = {
        "id": [3, 4, 5, 6, 7, 8],
        "timestamp": event_times,
        "user_id": [103, 104, 105, 106, 107, 108],
        "event_type": ["view", "click", "purchase", "login", "view", "logout"],
        "data": payloads,
    }
    return pd.DataFrame(columns)
108
+
109
+
110
def setup_test_namespace_and_table_simple(catalog_root: str) -> tuple:
    """Create the source and destination tables used by the compaction example.

    Steps, in order:
      1. Initialize the catalog at ``catalog_root``.
      2. Write the first sample batch to the source table (REPLACE if the
         table can already be fetched, CREATE otherwise), then APPEND the
         second batch.
      3. Look up the source partition and list its deltas.
      4. Create the destination table with the source table's schema
         (``fail_if_exists=False``).
      5. Reuse the destination partition if one can be fetched; otherwise
         stage and commit a new one.
      6. Print a summary and an example ``compactor.py`` command line.

    Args:
        catalog_root: Catalog root handed to ``initialize_catalog`` and
            echoed in the printed summary and example command.

    Returns:
        An 11-tuple of
        ``(source_stream_id, source_table_version, source_namespace,
        table_name, catalog_root, actual_stream_position, dest_stream_id,
        dest_namespace, source_partition, dest_partition, catalog)``.
    """
    catalog = initialize_catalog(catalog_root)

    print("Setting up test namespaces and tables using metastore API...")

    source_namespace = "compactor_test_source"
    dest_namespace = "compactor_test_dest"
    table_name = "events"
    dest_table_name = f"{table_name}_compacted"

    # Note: metastore API will automatically create namespaces as needed

    # Create test data batches
    print("Creating test data batches...")
    batch_1 = create_test_data_batch_1()
    batch_2 = create_test_data_batch_2()

    print(f"Batch 1 shape: {batch_1.shape}")
    print(f"Batch 1 data:\n{batch_1}")
    print(f"\nBatch 2 shape: {batch_2.shape}")
    print(f"Batch 2 data:\n{batch_2}")

    # Create/replace source table using write_to_table for the first batch (idempotent)
    print(
        f"\nCreating/replacing SOURCE table {source_namespace}.{table_name} with first batch..."
    )

    # Check if the table exists to determine the appropriate write mode.
    # Any lookup failure is deliberately treated as "table does not exist"
    # (best-effort, matching the original behavior).
    # NOTE(review): narrow this once the catalog's not-found exception type
    # is confirmed.
    try:
        existing_table = get_table(
            name=table_name, namespace=source_namespace, catalog="default"
        )
    except Exception:
        existing_table = None

    if existing_table:
        table_mode = TableWriteMode.REPLACE
        action, action_done = "Replacing", "Replaced"
    else:
        table_mode = TableWriteMode.CREATE
        action, action_done = "Creating", "Created"

    print(f"{action} source table with first batch...")
    write_to_table(
        data=batch_1,
        table=table_name,
        namespace=source_namespace,
        mode=table_mode,
        content_type=ContentType.PARQUET,
        catalog="default",
    )
    print(f"✅ {action_done} source table and wrote first delta")

    # Add second batch using write_to_table with APPEND mode
    print("Adding second batch to SOURCE table using write_to_table APPEND mode...")
    write_to_table(
        data=batch_2,
        table=table_name,
        namespace=source_namespace,
        mode=TableWriteMode.APPEND,  # Use APPEND for second batch
        content_type=ContentType.PARQUET,
        catalog="default",
    )
    print("✅ Added second delta to source table")

    # Get the table definition and partition
    source_table_def = get_table(
        name=table_name, namespace=source_namespace, catalog="default"
    )

    source_partition = metastore.get_partition(
        stream_locator=source_table_def.stream.locator,
        partition_values=None,
        catalog=catalog,
    )

    # Verify we now have 2 deltas
    partition_deltas = metastore.list_partition_deltas(
        partition_like=source_partition,
        include_manifest=True,
        catalog=catalog,
    )
    delta_list = partition_deltas.all_items()
    print(f"📋 Total deltas in source table: {len(delta_list)}")

    # Create/replace empty destination table with same schema as source (idempotent)
    print(
        f"\nCreating/replacing empty DESTINATION table {dest_namespace}.{dest_table_name}..."
    )

    dest_table_def = create_table(
        name=dest_table_name,
        namespace=dest_namespace,
        schema=source_table_def.table_version.schema,
        table_description="Compacted events table (destination)",
        fail_if_exists=False,  # Allow overwriting for idempotency
        catalog="default",
    )
    print(f"✅ Created/replaced destination table: {dest_table_def.table.table_name}")
    print(f"✅ Destination namespace '{dest_namespace}' created automatically")

    # Create destination partition (idempotent): reuse an existing partition
    # when the lookup returns one; a failed or empty lookup falls through to
    # staging and committing a fresh partition.
    print("Creating/getting destination partition...")
    try:
        dest_partition = metastore.get_partition(
            stream_locator=dest_table_def.stream.locator,
            partition_values=None,
            catalog=catalog,
        )
    except Exception:
        dest_partition = None

    if dest_partition:
        print("✅ Using existing destination partition")
    else:
        dest_partition = metastore.stage_partition(
            stream=dest_table_def.stream,
            catalog=catalog,
        )
        dest_partition = metastore.commit_partition(
            partition=dest_partition,
            catalog=catalog,
        )
        print("✅ Created new destination partition")

    # Get the actual stream position by checking deltas.
    # NOTE(review): the fallback of 2 presumably mirrors the two batches
    # written above - confirm.
    actual_stream_position = (
        max(delta.stream_position for delta in delta_list) if delta_list else 2
    )

    source_table_version = source_table_def.table_version.table_version

    print(f"\n✅ Successfully created test data in {source_namespace}.{table_name}")
    print(f"📁 Catalog root: {catalog_root}")
    print(f"🔧 Total records: {len(batch_1) + len(batch_2)}")
    print(
        f"🔄 Overlapping IDs: {set(batch_1['id']) & set(batch_2['id'])} (good for compaction)"
    )
    print(f"📋 Source Stream ID: {source_table_def.stream.stream_id}")
    print(f"📋 Destination Stream ID: {dest_table_def.stream.stream_id}")
    print(f"📋 Table Version: {source_table_version}")
    print(f"📋 Actual Stream Position: {actual_stream_position}")
    print(f"📋 Number of Source Deltas: {len(delta_list)}")

    # Print compaction command example
    print("\n🚀 Next steps:")
    print("1. Explore the catalog and find compaction candidates:")
    print(" python explorer.py --show-compaction-candidates")
    print("")
    print("2. Or manually run compaction with:")
    print(" cd deltacat/examples/compactor")
    print(" python compactor.py \\")
    print(f" --namespace '{source_namespace}' \\")
    print(f" --table-name '{table_name}' \\")
    print(f" --table-version '{source_table_version}' \\")
    print(" --partition-values '' \\")
    print(f" --dest-namespace '{dest_namespace}' \\")
    print(f" --dest-table-name '{dest_table_name}' \\")
    print(" --dest-table-version '1' \\")
    print(" --dest-partition-values '' \\")
    print(f" --last-stream-position {actual_stream_position} \\")
    print(" --primary-keys 'id' \\")
    print(" --compactor-version 'V2' \\")
    print(" --hash-bucket-count 1 \\")
    print(f" --catalog-root '{catalog_root}'")

    return (
        source_table_def.stream.stream_id,
        source_table_version,
        source_namespace,
        table_name,
        catalog_root,
        actual_stream_position,
        dest_table_def.stream.stream_id,
        dest_namespace,
        source_partition,
        dest_partition,
        catalog,
    )
284
+
285
+
286
def _reconstruct_sample_records(record_count):
    """Return the hard-coded sample rows associated with a delta of ``record_count`` rows.

    Nothing is read from storage here: the rows are canned literals chosen
    purely by record count (5 -> first sample batch, 6 -> second sample batch,
    8 -> the de-duplicated union of both). Any other count yields an empty list.
    A fresh list of fresh dicts is returned on every call.
    """
    fields = ("id", "timestamp", "user_id", "event_type", "data")
    batch_1 = [
        (1, "2024-01-01 10:00:00", 101, "login", '{"page": "home"}'),
        (2, "2024-01-01 10:05:00", 102, "view", '{"product_id": 123}'),
        (3, "2024-01-01 10:10:00", 103, "click", '{"button": "add_to_cart"}'),
        (4, "2024-01-01 10:15:00", 104, "purchase", '{"amount": 99.99}'),
        (5, "2024-01-01 10:20:00", 105, "logout", '{"session_duration": 1200}'),
    ]
    batch_2 = [
        (3, "2024-01-01 11:00:00", 103, "view", '{"page": "product", "updated": true}'),
        (4, "2024-01-01 11:05:00", 104, "click", '{"button": "buy_now", "updated": true}'),
        (5, "2024-01-01 11:10:00", 105, "purchase", '{"amount": 149.99, "updated": true}'),
        (6, "2024-01-01 11:15:00", 106, "login", '{"page": "signup"}'),
        (7, "2024-01-01 11:20:00", 107, "view", '{"product_id": 456}'),
        (8, "2024-01-01 11:25:00", 108, "logout", '{"session_duration": 800}'),
    ]
    rows_by_count = {
        5: batch_1,
        6: batch_2,
        # The compacted sample keeps ids 1-2 from batch 1 and takes ids 3-8
        # from batch 2 (the later version of the overlapping ids).
        8: batch_1[:2] + batch_2,
    }
    return [dict(zip(fields, row)) for row in rows_by_count.get(record_count, [])]


def _print_reconstructed_records(all_records):
    """Print every reconstructed row sorted by ``id`` and report duplicated ids."""
    # pandas is imported lazily so the module can be imported without it.
    import pandas as pd

    df_sorted = pd.DataFrame(all_records).sort_values("id").reset_index(drop=True)

    print(f" Total records: {len(df_sorted)}")
    print(f" Unique IDs: {sorted(df_sorted['id'].unique())}")

    print(" All records:")
    for idx, row in df_sorted.iterrows():
        print(
            f" {idx+1:2d}. ID={row['id']:2d} | {row['timestamp']} | user={row['user_id']:3d} | {row['event_type']:8s} | {row['data']}"
        )

    # keep=False marks every occurrence of a repeated id, not only the later ones.
    duplicates = df_sorted[df_sorted.duplicated(subset=["id"], keep=False)]
    if duplicates.empty:
        print("\n āœ… No duplicate IDs found - all records are unique")
        return

    duplicate_ids = sorted(duplicates["id"].unique())
    print(f"\n šŸ”„ DUPLICATE IDs found: {duplicate_ids}")
    print(" Duplicate records (showing all versions):")
    for dup_id in duplicate_ids:
        dup_records = df_sorted[df_sorted["id"] == dup_id]
        print(f" ID {dup_id} appears {len(dup_records)} times:")
        for _, row in dup_records.iterrows():
            print(
                f" - {row['timestamp']} | user={row['user_id']:3d} | {row['event_type']:8s} | {row['data']}"
            )


def show_table_data(partition, catalog, label: str) -> None:
    """Print a per-delta summary and a reconstructed table view for a partition.

    The deltas of ``partition`` are listed through
    ``metastore.list_partition_deltas`` and their stream position, type and
    record count are printed. When at least one record is reported, a
    "complete table contents" section follows.

    NOTE: the rows in that section are NOT read back from storage. They are
    hard-coded sample rows picked by each delta's record count (5, 6 or 8);
    deltas with any other record count contribute no rows.

    Args:
        partition: Partition whose deltas are listed; its ``stream_locator``
            is used to print the namespace and table name.
        catalog: Passed through as ``catalog=`` to the metastore call.
        label: Human-readable name used in the printed headings.

    All exceptions are caught and reported via ``print``; nothing is raised.
    """
    try:
        print(f"\n{label} partition data:")

        partition_deltas = metastore.list_partition_deltas(
            partition_like=partition,
            include_manifest=True,
            catalog=catalog,
        )
        delta_list = partition_deltas.all_items()
        delta_count = len(delta_list)

        if delta_count == 0:
            print(f" No deltas found in {label} partition")
            return

        print(f" Found {delta_count} delta(s) in {label} partition:")

        total_records = 0
        for i, delta in enumerate(delta_list):
            # A missing meta or an unset record count is counted as 0 so that a
            # single delta without the statistic does not abort the report.
            record_count = (delta.meta.record_count or 0) if delta.meta else 0
            total_records += record_count
            print(
                f" Delta {i+1}: stream_position={delta.stream_position}, type={delta.type}, records={record_count}"
            )

        print(f" Total records across all deltas: {total_records}")

        if total_records <= 0:
            return

        try:
            table_locator = partition.stream_locator.table_version_locator.table_locator
            namespace = table_locator.namespace_locator.namespace
            table_name = table_locator.table_name

            print(f"\n šŸ“Š COMPLETE {label} TABLE CONTENTS:")
            print(f" Table: {namespace}.{table_name}")
            print(" " + "=" * 60)

            all_records = []
            # Walk the deltas in stream-position order so the output is stable.
            delta_list_sorted = sorted(delta_list, key=lambda d: d.stream_position)
            for i, delta in enumerate(delta_list_sorted):
                try:
                    record_count = (delta.meta.record_count or 0) if delta.meta else 0
                    all_records.extend(_reconstruct_sample_records(record_count))
                except Exception as delta_read_error:
                    print(
                        f" āš ļø Could not process delta {i+1}: {delta_read_error}"
                    )

            if all_records:
                _print_reconstructed_records(all_records)
            else:
                print(" āš ļø Could not reconstruct table data from deltas")

            print(" " + "=" * 60)

        except Exception as read_error:
            print(f" āš ļø Could not read complete table data: {read_error}")
            print(" This may be expected for destination tables before compaction")

    except Exception as e:
        print(f" Error reading {label} partition data: {e}")
542
+
543
+
544
def show_individual_deltas(partition, catalog, label: str) -> None:
    """Print one summary entry, plus available metadata, per delta of a partition.

    Args:
        partition: Partition whose deltas are listed via
            ``metastore.list_partition_deltas``.
        catalog: Passed through as ``catalog=`` to the metastore call.
        label: Human-readable name used in the printed headings.

    Failures on a single delta are printed and skipped; any other failure is
    printed as well. Nothing is raised to the caller.
    """
    rule = "=" * 70
    try:
        print(f"\nšŸ“‹ INDIVIDUAL DELTA CONTENTS - {label}:")
        print(rule)

        listing = metastore.list_partition_deltas(
            partition_like=partition,
            include_manifest=True,
            catalog=catalog,
        )
        deltas = listing.all_items()

        if not deltas:
            print(f" No deltas found in {label} partition")
            return

        print(f" Found {len(deltas)} delta(s) in {label} partition:")

        for number, delta in enumerate(deltas, start=1):
            try:
                meta = delta.meta
                records = meta.record_count if meta else 0
                print(
                    f" Delta {number}: stream_position={delta.stream_position}, type={delta.type}, records={records}"
                )

                # Metadata details are only printed when the delta carries them.
                if not meta:
                    continue
                print(f" Content length: {meta.content_length}")
                print(f" Content type: {meta.content_type}")
                if hasattr(meta, "source_content_length"):
                    print(
                        f" Source content length: {meta.source_content_length}"
                    )
            except Exception as delta_error:
                print(f" āš ļø Error reading delta {number}: {delta_error}")

        print(rule)

    except Exception as e:
        print(f"Error reading individual deltas for {label}: {e}")
588
+
589
+
590
def run_compaction(source_partition, dest_partition, catalog, actual_stream_position):
    """Compact the source partition into the destination and print before/after views.

    Args:
        source_partition: Partition whose deltas are compacted.
        dest_partition: Partition that receives the compacted output.
        catalog: Passed to the metastore calls and into the compaction parameters.
        actual_stream_position: Used as ``last_stream_position_to_compact``.

    Returns:
        True when every step completed, False when any exception was raised
        (the error and troubleshooting hints are printed instead of re-raised).
    """
    banner = "=" * 80
    try:
        print("\nšŸ”„ RUNNING COMPACTION")
        print(banner)

        # State before compaction: per-delta view of the source, then the
        # table views of both the source and the destination.
        print("\nšŸ“Š DATA BEFORE COMPACTION")
        print(banner)
        show_individual_deltas(source_partition, catalog, "SOURCE")
        show_table_data(source_partition, catalog, "SOURCE")
        show_table_data(dest_partition, catalog, "DESTINATION")

        print("\nšŸ”„ RUNNING COMPACTION")
        print(banner)

        # Imported here, inside the try, so an import failure is reported as a
        # compaction failure rather than breaking module import.
        from deltacat.compute.compactor_v2.compaction_session import compact_partition
        from deltacat.compute.compactor.model.compact_partition_params import (
            CompactPartitionParams,
        )
        from deltacat.types.media import ContentType

        print("āœ… Using compaction API")
        print(f" Source partition: {source_partition.locator.partition_id}")
        print(f" Destination partition: {dest_partition.locator.partition_id}")
        print(" Primary keys: ['id']")
        print(" Hash bucket count: 1")
        print(f" Last stream position: {actual_stream_position}")

        compaction_settings = {
            "catalog": catalog,
            "compacted_file_content_type": ContentType.PARQUET,
            "dd_max_parallelism_ratio": 1.0,
            "deltacat_storage": metastore,
            "deltacat_storage_kwargs": {"catalog": catalog},
            "destination_partition_locator": dest_partition.locator,
            "drop_duplicates": True,
            "hash_bucket_count": 1,
            "last_stream_position_to_compact": actual_stream_position,
            "list_deltas_kwargs": {
                "catalog": catalog,
                "equivalent_table_types": [],
            },
            "primary_keys": ["id"],
            "all_column_names": [
                "id",
                "timestamp",
                "user_id",
                "event_type",
                "data",
            ],
            "rebase_source_partition_locator": None,
            "rebase_source_partition_high_watermark": None,
            "records_per_compacted_file": 4000,
            "source_partition_locator": source_partition.locator,
        }
        compact_partition(CompactPartitionParams.of(compaction_settings))

        print("āœ… Compaction completed successfully!")

        print("\nšŸ“Š DATA AFTER COMPACTION")
        print(banner)

        # Re-fetch the (unpartitioned) destination partition before displaying it.
        refreshed_dest = metastore.get_partition(
            stream_locator=dest_partition.stream_locator,
            partition_values=None,
            catalog=catalog,
        )

        show_individual_deltas(refreshed_dest, catalog, "DESTINATION")
        show_table_data(refreshed_dest, catalog, "DESTINATION")

        print("\nšŸ“‹ SOURCE TABLE (unchanged):")
        show_table_data(source_partition, catalog, "SOURCE")

        # Totals for the summary below.
        dest_listing = metastore.list_partition_deltas(
            partition_like=refreshed_dest,
            include_manifest=True,
            catalog=catalog,
        )
        delta_count = len(dest_listing.all_items())
        total_dest_records = 0
        for dest_delta in dest_listing.all_items():
            total_dest_records += (
                dest_delta.meta.record_count if dest_delta.meta else 0
            )

        print("\nšŸ“ˆ COMPACTION SUMMARY")
        print(banner)
        print(" šŸ“„ INPUT: 2 source deltas with 11 total records (5 + 6)")
        print(" šŸ”„ PROCESS: Merged and deduplicated on primary key 'id'")
        print(
            f" šŸ“¤ OUTPUT: {delta_count} destination delta with {total_dest_records} unique records"
        )
        print(f" āœ‚ļø REDUCTION: {11 - total_dest_records} duplicate records removed")
        print(
            " šŸŽÆ OVERLAPPING IDs {3, 4, 5} were deduplicated (kept latest version)"
        )
        print(banner)

        return True

    except Exception as failure:
        print(f"āŒ Compaction failed with error: {failure}")
        print(f"šŸ” Error type: {type(failure).__name__}")

        troubleshooting_lines = (
            "\nšŸ› ļø Troubleshooting:",
            " • This error suggests the compaction API encountered an issue",
            " • The source and destination partitions were created successfully",
            " • You can still explore the catalog using: python explorer.py",
            " • Check the working test examples in: deltacat/tests/compute/compactor_v2/test_compaction_session.py",
            " • The direct API approach should work - this may be a configuration issue",
        )
        for line in troubleshooting_lines:
            print(line)

        return False
727
+
728
+
729
def main():
    """Bootstrap the compaction test data and optionally run compaction.

    Command-line options:
        --catalog-root: root directory for the catalog; defaults to the value
            returned by ``get_default_catalog_root()``.
        --run-compaction: ``yes``/``y``/``no``/``n`` (case-insensitive). When
            given, the interactive prompt is answered automatically.

    Exits with status 1 when ``--run-compaction`` has an invalid value or when
    bootstrapping raises. Ray is shut down on the way out in every case.
    """
    affirmative = ("yes", "y")
    negative = ("no", "n")
    prompt = "\nšŸ¤” Would you like to run compaction now and see the before/after results? [y/N]: "
    explorer_hint = "šŸ’” Run 'python explorer.py' to explore the catalog and find compaction candidates"

    parser = argparse.ArgumentParser(
        description=(
            "DeltaCAT Compactor Bootstrap Script\n"
            "\n"
            "This script creates test data suitable for compaction testing and can run end-to-end compaction.\n"
            "\n"
            "Examples:\n"
            "  # Manually specify a new catalog root location\n"
            "  python bootstrap.py --catalog-root /path/to/catalog\n"
            "\n"
            "  # Automatically run compaction after bootstrapping\n"
            "  python bootstrap.py --run-compaction yes\n"
        ),
        # Keep the line breaks of the examples above; the default formatter
        # would re-wrap the whole description into a single paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--catalog-root",
        default=get_default_catalog_root(),
        # %(default)s lets argparse render the default, so the help text always
        # matches the value actually used.
        help="Root directory for the deltacat catalog (default: %(default)s)",
    )
    parser.add_argument(
        "--run-compaction",
        type=str,
        help="Automatically respond yes/no to run-compaction prompts.",
    )

    args = parser.parse_args()
    catalog_root = args.catalog_root

    # Validate --run-compaction by hand so that matching stays case-insensitive.
    if args.run_compaction:
        valid_choices = [*affirmative, *negative]
        if args.run_compaction.lower() not in valid_choices:
            print(f"āŒ Invalid value for --run-compaction: '{args.run_compaction}'")
            print(f" Valid choices: {', '.join(valid_choices)} (case-insensitive)")
            sys.exit(1)

    print("šŸš€ DeltaCAT Compactor Bootstrap")
    print("=" * 40)
    print(f"šŸ“ Catalog root: {catalog_root}")

    # Ray start-up is best-effort: a failure is reported and bootstrapping continues.
    print("šŸ”§ Initializing Ray for compaction...")
    try:
        import ray

        ray.init(local_mode=True, ignore_reinit_error=True)
        print("āœ… Ray initialized successfully")
    except Exception as e:
        print(f"āš ļø Ray initialization failed: {e}")
        print(" Compaction may not work without Ray")

    try:
        (
            stream_id,
            _table_version,
            namespace,
            table_name,
            catalog_root,
            actual_stream_position,
            dest_stream_id,
            dest_namespace,
            source_partition,
            dest_partition,
            catalog,
        ) = setup_test_namespace_and_table_simple(catalog_root)

        print("\nāœ… Bootstrap completed successfully!")
        print("šŸ“‹ Summary:")
        print(f" • Source: {namespace}.{table_name} (Stream ID: {stream_id})")
        print(
            f" • Destination: {dest_namespace}.{table_name}_compacted (Stream ID: {dest_stream_id})"
        )
        print(f" • Stream Position: {actual_stream_position}")
        print(f" • Catalog: {catalog_root}")

        if args.run_compaction:
            # Answer supplied on the command line: echo the prompt with it.
            print(f"{prompt}{args.run_compaction} (auto)")
            response = args.run_compaction.lower()
        else:
            try:
                response = input(prompt).lower().strip()
            except EOFError:
                # No stdin available (e.g. piped or CI run): fall back to the
                # prompt's default answer, "N", instead of failing the bootstrap.
                response = ""

        # Interactive answers accept the same affirmative spellings as the flag.
        if response in affirmative:
            run_compaction(
                source_partition, dest_partition, catalog, actual_stream_position
            )
        else:
            print(explorer_hint)

    except Exception as e:
        print(f"āŒ Bootstrap failed: {e}")
        import traceback

        traceback.print_exc()
        sys.exit(1)

    finally:
        # Best-effort cleanup: a shutdown problem must not mask the real outcome.
        try:
            import ray

            ray.shutdown()
            print("šŸ”§ Ray shutdown complete")
        except Exception:
            pass
860
+
861
+
862
if __name__ == "__main__":
    # sys.exit is used instead of the bare exit() helper: exit() is injected by
    # the site module for interactive sessions and is absent under `python -S`.
    sys.exit(main())