deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367)
  1. deltacat/__init__.py +150 -12
  2. deltacat/annotations.py +36 -0
  3. deltacat/api.py +578 -0
  4. deltacat/aws/constants.py +0 -23
  5. deltacat/aws/s3u.py +4 -631
  6. deltacat/benchmarking/benchmark_engine.py +84 -0
  7. deltacat/benchmarking/benchmark_report.py +86 -0
  8. deltacat/benchmarking/benchmark_suite.py +11 -0
  9. deltacat/benchmarking/conftest.py +22 -19
  10. deltacat/benchmarking/data/random_row_generator.py +94 -0
  11. deltacat/benchmarking/data/row_generator.py +10 -0
  12. deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
  13. deltacat/catalog/__init__.py +73 -0
  14. deltacat/catalog/delegate.py +615 -140
  15. deltacat/catalog/interface.py +404 -81
  16. deltacat/catalog/main/impl.py +2882 -0
  17. deltacat/catalog/model/catalog.py +348 -46
  18. deltacat/catalog/model/properties.py +155 -0
  19. deltacat/catalog/model/table_definition.py +32 -1
  20. deltacat/compute/__init__.py +14 -0
  21. deltacat/compute/compactor/compaction_session.py +97 -75
  22. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  23. deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
  24. deltacat/compute/compactor/model/delta_annotated.py +3 -3
  25. deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
  26. deltacat/compute/compactor/model/delta_file_locator.py +3 -1
  27. deltacat/compute/compactor/model/round_completion_info.py +19 -9
  28. deltacat/compute/compactor/model/table_object_store.py +3 -2
  29. deltacat/compute/compactor/repartition_session.py +9 -22
  30. deltacat/compute/compactor/steps/dedupe.py +11 -4
  31. deltacat/compute/compactor/steps/hash_bucket.py +6 -6
  32. deltacat/compute/compactor/steps/materialize.py +15 -9
  33. deltacat/compute/compactor/steps/repartition.py +12 -11
  34. deltacat/compute/compactor/utils/io.py +7 -6
  35. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  36. deltacat/compute/compactor/utils/sort_key.py +9 -2
  37. deltacat/compute/compactor/utils/system_columns.py +3 -1
  38. deltacat/compute/compactor_v2/compaction_session.py +13 -14
  39. deltacat/compute/compactor_v2/deletes/utils.py +3 -3
  40. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  41. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  42. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  43. deltacat/compute/compactor_v2/model/merge_input.py +28 -9
  44. deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
  45. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  46. deltacat/compute/compactor_v2/steps/merge.py +156 -53
  47. deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
  48. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  49. deltacat/compute/compactor_v2/utils/io.py +10 -3
  50. deltacat/compute/compactor_v2/utils/merge.py +14 -2
  51. deltacat/compute/compactor_v2/utils/task_options.py +2 -10
  52. deltacat/compute/converter/constants.py +9 -0
  53. deltacat/compute/converter/converter_session.py +298 -0
  54. deltacat/compute/converter/model/convert_input.py +96 -0
  55. deltacat/compute/converter/model/convert_input_files.py +78 -0
  56. deltacat/compute/converter/model/convert_result.py +80 -0
  57. deltacat/compute/converter/model/converter_session_params.py +144 -0
  58. deltacat/compute/converter/pyiceberg/catalog.py +78 -0
  59. deltacat/compute/converter/pyiceberg/overrides.py +263 -0
  60. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
  61. deltacat/compute/converter/steps/convert.py +366 -0
  62. deltacat/compute/converter/steps/dedupe.py +94 -0
  63. deltacat/compute/converter/utils/__init__.py +0 -0
  64. deltacat/compute/converter/utils/convert_task_options.py +132 -0
  65. deltacat/compute/converter/utils/converter_session_utils.py +175 -0
  66. deltacat/compute/converter/utils/iceberg_columns.py +87 -0
  67. deltacat/compute/converter/utils/io.py +203 -0
  68. deltacat/compute/converter/utils/s3u.py +148 -0
  69. deltacat/compute/janitor.py +205 -0
  70. deltacat/compute/jobs/__init__.py +0 -0
  71. deltacat/compute/jobs/client.py +417 -0
  72. deltacat/compute/resource_estimation/delta.py +11 -1
  73. deltacat/constants.py +90 -1
  74. deltacat/docs/__init__.py +0 -0
  75. deltacat/docs/autogen/__init__.py +0 -0
  76. deltacat/docs/autogen/schema/__init__.py +0 -0
  77. deltacat/docs/autogen/schema/inference/__init__.py +0 -0
  78. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  79. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  80. deltacat/env.py +61 -0
  81. deltacat/examples/__init__.py +0 -0
  82. deltacat/examples/basic_logging.py +101 -0
  83. deltacat/examples/compactor/__init__.py +0 -0
  84. deltacat/examples/compactor/aws/__init__.py +1 -0
  85. deltacat/examples/compactor/bootstrap.py +863 -0
  86. deltacat/examples/compactor/compactor.py +373 -0
  87. deltacat/examples/compactor/explorer.py +473 -0
  88. deltacat/examples/compactor/gcp/__init__.py +1 -0
  89. deltacat/examples/compactor/job_runner.py +439 -0
  90. deltacat/examples/compactor/utils/__init__.py +1 -0
  91. deltacat/examples/compactor/utils/common.py +261 -0
  92. deltacat/examples/experimental/__init__.py +0 -0
  93. deltacat/examples/experimental/iceberg/__init__.py +0 -0
  94. deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
  95. deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
  96. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  97. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  98. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  99. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  100. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  101. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  102. deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
  103. deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
  104. deltacat/examples/hello_world.py +29 -0
  105. deltacat/examples/indexer/__init__.py +0 -0
  106. deltacat/examples/indexer/aws/__init__.py +0 -0
  107. deltacat/examples/indexer/gcp/__init__.py +0 -0
  108. deltacat/examples/indexer/indexer.py +163 -0
  109. deltacat/examples/indexer/job_runner.py +198 -0
  110. deltacat/exceptions.py +116 -12
  111. deltacat/experimental/__init__.py +0 -0
  112. deltacat/experimental/catalog/__init__.py +0 -0
  113. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  114. deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
  115. deltacat/experimental/catalog/iceberg/impl.py +399 -0
  116. deltacat/experimental/catalog/iceberg/overrides.py +72 -0
  117. deltacat/experimental/compatibility/__init__.py +0 -0
  118. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  119. deltacat/experimental/converter_agent/__init__.py +0 -0
  120. deltacat/experimental/converter_agent/beam/__init__.py +0 -0
  121. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  122. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  123. deltacat/experimental/daft/__init__.py +4 -0
  124. deltacat/experimental/daft/daft_catalog.py +229 -0
  125. deltacat/experimental/storage/__init__.py +0 -0
  126. deltacat/experimental/storage/iceberg/__init__.py +0 -0
  127. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  128. deltacat/experimental/storage/iceberg/impl.py +739 -0
  129. deltacat/experimental/storage/iceberg/model.py +713 -0
  130. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  131. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  132. deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
  133. deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
  134. deltacat/experimental/storage/rivulet/dataset.py +745 -0
  135. deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
  136. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  137. deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
  138. deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
  139. deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
  140. deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
  141. deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
  142. deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
  143. deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
  144. deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
  145. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  146. deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
  147. deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
  148. deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
  149. deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
  150. deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
  151. deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
  152. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  153. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  154. deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
  155. deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
  156. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  157. deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
  158. deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
  159. deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
  160. deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
  161. deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
  162. deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
  163. deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
  164. deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
  165. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  166. deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
  167. deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
  168. deltacat/experimental/storage/rivulet/serializer.py +40 -0
  169. deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
  170. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  171. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  172. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  173. deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
  174. deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
  175. deltacat/io/__init__.py +13 -0
  176. deltacat/io/dataset/__init__.py +0 -0
  177. deltacat/io/dataset/deltacat_dataset.py +91 -0
  178. deltacat/io/datasink/__init__.py +0 -0
  179. deltacat/io/datasink/deltacat_datasink.py +207 -0
  180. deltacat/io/datasource/__init__.py +0 -0
  181. deltacat/io/datasource/deltacat_datasource.py +579 -0
  182. deltacat/io/reader/__init__.py +0 -0
  183. deltacat/io/reader/deltacat_read_api.py +172 -0
  184. deltacat/logs.py +4 -1
  185. deltacat/storage/__init__.py +138 -28
  186. deltacat/storage/interface.py +260 -155
  187. deltacat/storage/main/__init__.py +0 -0
  188. deltacat/storage/main/impl.py +3030 -0
  189. deltacat/storage/model/delta.py +142 -71
  190. deltacat/storage/model/expression/__init__.py +47 -0
  191. deltacat/storage/model/expression/expression.py +656 -0
  192. deltacat/storage/model/expression/visitor.py +248 -0
  193. deltacat/storage/model/interop.py +24 -0
  194. deltacat/storage/model/list_result.py +8 -0
  195. deltacat/storage/model/locator.py +93 -9
  196. deltacat/storage/model/manifest.py +643 -0
  197. deltacat/storage/model/metafile.py +1421 -0
  198. deltacat/storage/model/namespace.py +41 -18
  199. deltacat/storage/model/partition.py +443 -43
  200. deltacat/storage/model/scan/__init__.py +0 -0
  201. deltacat/storage/model/scan/push_down.py +46 -0
  202. deltacat/storage/model/scan/scan_plan.py +10 -0
  203. deltacat/storage/model/scan/scan_task.py +34 -0
  204. deltacat/storage/model/schema.py +3160 -0
  205. deltacat/storage/model/shard.py +51 -0
  206. deltacat/storage/model/sort_key.py +210 -13
  207. deltacat/storage/model/stream.py +215 -80
  208. deltacat/storage/model/table.py +134 -29
  209. deltacat/storage/model/table_version.py +333 -46
  210. deltacat/storage/model/transaction.py +1733 -0
  211. deltacat/storage/model/transform.py +274 -58
  212. deltacat/storage/model/types.py +138 -16
  213. deltacat/storage/util/__init__.py +0 -0
  214. deltacat/storage/util/scan_planner.py +26 -0
  215. deltacat/tests/_io/__init__.py +1 -0
  216. deltacat/tests/_io/reader/__init__.py +0 -0
  217. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  218. deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
  219. deltacat/tests/aws/test_s3u.py +2 -31
  220. deltacat/tests/catalog/data/__init__.py +0 -0
  221. deltacat/tests/catalog/main/__init__.py +0 -0
  222. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  223. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  224. deltacat/tests/catalog/model/__init__.py +0 -0
  225. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  226. deltacat/tests/catalog/test_catalogs.py +321 -0
  227. deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
  228. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
  229. deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
  230. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
  231. deltacat/tests/compute/compact_partition_test_cases.py +23 -30
  232. deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
  233. deltacat/tests/compute/compactor/utils/test_io.py +125 -123
  234. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  235. deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
  236. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
  237. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
  238. deltacat/tests/compute/conftest.py +39 -0
  239. deltacat/tests/compute/converter/__init__.py +0 -0
  240. deltacat/tests/compute/converter/conftest.py +80 -0
  241. deltacat/tests/compute/converter/test_convert_session.py +826 -0
  242. deltacat/tests/compute/converter/utils.py +132 -0
  243. deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
  244. deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
  245. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
  246. deltacat/tests/compute/test_compact_partition_params.py +16 -11
  247. deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
  248. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
  249. deltacat/tests/compute/test_janitor.py +236 -0
  250. deltacat/tests/compute/test_util_common.py +726 -46
  251. deltacat/tests/compute/test_util_constant.py +0 -1
  252. deltacat/tests/conftest.py +25 -0
  253. deltacat/tests/daft/__init__.py +0 -0
  254. deltacat/tests/daft/test_model.py +97 -0
  255. deltacat/tests/experimental/__init__.py +1 -0
  256. deltacat/tests/experimental/catalog/__init__.py +0 -0
  257. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  258. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  259. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  260. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  261. deltacat/tests/experimental/daft/__init__.py +0 -0
  262. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  263. deltacat/tests/experimental/storage/__init__.py +0 -0
  264. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  265. deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
  266. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  267. deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
  268. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  269. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  270. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  271. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  272. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  273. deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
  274. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  275. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  276. deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
  277. deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
  278. deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
  279. deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
  280. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  281. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
  282. deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
  283. deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
  284. deltacat/tests/storage/__init__.py +0 -0
  285. deltacat/tests/storage/main/__init__.py +0 -0
  286. deltacat/tests/storage/main/test_main_storage.py +8204 -0
  287. deltacat/tests/storage/model/__init__.py +0 -0
  288. deltacat/tests/storage/model/test_delete_parameters.py +21 -0
  289. deltacat/tests/storage/model/test_expression.py +327 -0
  290. deltacat/tests/storage/model/test_manifest.py +129 -0
  291. deltacat/tests/storage/model/test_metafile_io.py +2440 -0
  292. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  293. deltacat/tests/storage/model/test_schema.py +479 -0
  294. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  295. deltacat/tests/storage/model/test_shard.py +24 -0
  296. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  297. deltacat/tests/storage/model/test_table_version.py +110 -0
  298. deltacat/tests/storage/model/test_transaction.py +653 -0
  299. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  300. deltacat/tests/test_deltacat_api.py +1064 -0
  301. deltacat/tests/test_exceptions.py +9 -5
  302. deltacat/tests/test_utils/filesystem.py +14 -0
  303. deltacat/tests/test_utils/message_pack_utils.py +54 -0
  304. deltacat/tests/test_utils/pyarrow.py +50 -26
  305. deltacat/tests/test_utils/storage.py +256 -4
  306. deltacat/tests/types/__init__.py +0 -0
  307. deltacat/tests/types/test_tables.py +104 -0
  308. deltacat/tests/utils/exceptions.py +22 -0
  309. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  310. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  311. deltacat/tests/utils/test_daft.py +124 -34
  312. deltacat/tests/utils/test_numpy.py +1193 -0
  313. deltacat/tests/utils/test_pandas.py +1106 -0
  314. deltacat/tests/utils/test_polars.py +1040 -0
  315. deltacat/tests/utils/test_pyarrow.py +1107 -258
  316. deltacat/types/media.py +345 -37
  317. deltacat/types/partial_download.py +1 -1
  318. deltacat/types/tables.py +2345 -47
  319. deltacat/utils/arguments.py +33 -1
  320. deltacat/utils/daft.py +824 -40
  321. deltacat/utils/export.py +61 -0
  322. deltacat/utils/filesystem.py +450 -0
  323. deltacat/utils/metafile_locator.py +74 -0
  324. deltacat/utils/numpy.py +118 -26
  325. deltacat/utils/pandas.py +577 -48
  326. deltacat/utils/polars.py +759 -0
  327. deltacat/utils/pyarrow.py +1212 -178
  328. deltacat/utils/ray_utils/concurrency.py +1 -1
  329. deltacat/utils/ray_utils/dataset.py +101 -10
  330. deltacat/utils/ray_utils/runtime.py +56 -4
  331. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  332. deltacat/utils/url.py +1325 -0
  333. deltacat-2.0.0.dist-info/METADATA +1163 -0
  334. deltacat-2.0.0.dist-info/RECORD +439 -0
  335. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
  336. deltacat/aws/redshift/__init__.py +0 -19
  337. deltacat/aws/redshift/model/manifest.py +0 -394
  338. deltacat/catalog/default_catalog_impl/__init__.py +0 -369
  339. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  340. deltacat/compute/merge_on_read/__init__.py +0 -4
  341. deltacat/compute/merge_on_read/daft.py +0 -40
  342. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  343. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  344. deltacat/io/dataset.py +0 -73
  345. deltacat/io/read_api.py +0 -143
  346. deltacat/storage/model/delete_parameters.py +0 -40
  347. deltacat/storage/model/partition_spec.py +0 -71
  348. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  349. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
  350. deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
  351. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  352. deltacat/utils/s3fs.py +0 -21
  353. deltacat-1.1.38.dist-info/METADATA +0 -64
  354. deltacat-1.1.38.dist-info/RECORD +0 -219
  355. /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
  356. /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
  357. /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
  358. /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
  359. /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
  360. /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
  361. /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
  362. /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
  363. /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
  364. /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
  365. /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
  366. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
  367. {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,399 @@
1
+ import logging
2
+ import sys
3
+
4
+ from typing import Any, Dict, List, Optional, Union
5
+
6
+ from daft import DataFrame, context
7
+ from daft.daft import ScanOperatorHandle, StorageConfig
8
+ from daft.logical.builder import LogicalPlanBuilder
9
+
10
+ from deltacat import logs
11
+ from deltacat.catalog.model.catalog import Catalog
12
+ from deltacat.catalog.model.table_definition import TableDefinition
13
+ from deltacat.utils.daft import DeltaCatScanOperator
14
+ from deltacat.exceptions import TableAlreadyExistsError
15
+ from deltacat.experimental.storage.iceberg.iceberg_scan_planner import (
16
+ IcebergScanPlanner,
17
+ )
18
+ from deltacat.experimental.storage.iceberg.model import (
19
+ PartitionSchemeMapper,
20
+ SchemaMapper,
21
+ )
22
+ from deltacat.storage.model.partition import PartitionScheme
23
+ from deltacat.experimental.storage.iceberg.impl import _get_native_catalog
24
+ from deltacat.storage.model.sort_key import SortScheme
25
+ from deltacat.storage.model.list_result import ListResult
26
+ from deltacat.storage.model.namespace import Namespace, NamespaceProperties
27
+ from deltacat.storage.model.schema import Schema
28
+ from deltacat.storage.model.table import TableProperties
29
+ from deltacat.storage.model.types import (
30
+ DistributedDataset,
31
+ LifecycleState,
32
+ LocalDataset,
33
+ LocalTable,
34
+ StreamFormat,
35
+ )
36
+ from deltacat.experimental.storage.iceberg import impl as IcebergStorage
37
+ from deltacat.types.media import ContentType
38
+ from deltacat.types.tables import TableWriteMode
39
+ from deltacat.constants import DEFAULT_NAMESPACE
40
+ from deltacat.experimental.catalog.iceberg.iceberg_catalog_config import (
41
+ IcebergCatalogConfig,
42
+ )
43
+
44
+ from pyiceberg.catalog import Catalog as PyIcebergCatalog, load_catalog
45
+ from pyiceberg.transforms import BucketTransform
46
+
47
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
48
+
49
+ IcebergCatalog = sys.modules[__name__]
50
+
51
+
52
def from_config(config: IcebergCatalogConfig, *args, **kwargs) -> Catalog:
    """
    Build a DeltaCAT catalog that uses this module as its implementation.

    Args:
        config: Iceberg catalog configuration passed as the first positional
            argument to the Catalog constructor.
        *args: Additional positional arguments forwarded to the Catalog
            constructor after ``config``.
        **kwargs: Additional keyword arguments forwarded to the Catalog
            constructor.

    Returns:
        Catalog: A catalog constructed with ``impl`` set to this module.
    """
    iceberg_backed_catalog = Catalog(config, *args, impl=IcebergCatalog, **kwargs)
    return iceberg_backed_catalog
57
+
58
+
59
+ # catalog functions
60
def initialize(config: IcebergCatalogConfig, **kwargs) -> PyIcebergCatalog:
    """
    Load a PyIceberg catalog described by the given Iceberg catalog config.

    NOTE: because PyIceberg catalogs are not pickle-able, we cannot accept them as catalog initialization parameters,
    since catalog initialization parameters are passed to Ray actors (see: :class:`deltacat.catalog.Catalogs`)

    Args:
        config: Iceberg catalog configuration. Its ``type`` value and its
            ``properties`` mapping are passed to
            ``pyiceberg.catalog.load_catalog``.
        **kwargs: Additional arguments passed to
            ``pyiceberg.catalog.load_catalog``. On a key collision these take
            precedence over ``config.properties``, which in turn takes
            precedence over the ``type`` derived from ``config.type``.

    Returns:
        PyIcebergCatalog: The catalog returned by ``load_catalog``.
    """
    # Later entries win in a dict literal, so kwargs override config.properties,
    # and both override the default "type" entry.
    load_catalog_args = {"type": config.type.value, **config.properties, **kwargs}
    return load_catalog(**load_catalog_args)
81
+
82
+
83
+ # table functions
84
def write_to_table(
    data: Union[LocalTable, LocalDataset, DistributedDataset],
    table: str,
    *args,
    namespace: Optional[str] = None,
    mode: TableWriteMode = TableWriteMode.AUTO,
    content_type: ContentType = ContentType.PARQUET,
    **kwargs,
) -> None:
    """Write local or distributed data to a table. Raises an error if the
    table does not exist and the table write mode is not CREATE or AUTO.

    When creating a table, all `create_table` parameters may be optionally
    specified as additional keyword arguments. When appending to, or replacing,
    an existing table, all `alter_table` parameters may be optionally specified
    as additional keyword arguments.

    Only Daft DataFrames written as Parquet to tables whose partition fields
    all use Iceberg bucket transforms are currently supported. One Parquet
    write is issued per partition field.

    Args:
        data: Data to write. Must be a Daft DataFrame.
        table: Name of the table to write to.
        *args: Positional arguments forwarded to `create_table` / `get_table`.
        namespace: Optional table namespace.
        mode: Table write mode. AUTO and CREATE resolve the table through
            `create_table`; every other mode resolves it through `get_table`.
        content_type: Content type of the written files. Only PARQUET is
            implemented.
        **kwargs: Forwarded to `create_table` / `get_table`. The key
            ``fail_if_exists`` is always overwritten (True only for CREATE),
            and ``path`` (if truthy) overrides the default output location of
            ``<table location>/data``.

    Raises:
        ValueError: If the table could not be resolved (e.g. it does not exist
            and the write mode is neither CREATE nor AUTO).
        NotImplementedError: If `data` is not a Daft DataFrame, the table has
            no partition fields, a partition field does not use a bucket
            transform, or `content_type` is not PARQUET.
    """

    # TODO (pdames): derive schema automatically from data if not
    #  explicitly specified in kwargs, and table needs to be created
    # kwargs["schema"] = kwargs["schema"] or derived_schema
    kwargs["fail_if_exists"] = mode == TableWriteMode.CREATE
    if mode == TableWriteMode.AUTO or mode == TableWriteMode.CREATE:
        table_definition = create_table(table, *args, namespace=namespace, **kwargs)
    else:
        table_definition = get_table(table, *args, namespace=namespace, **kwargs)

    # Fail with an explicit message instead of an AttributeError on None below.
    if not table_definition:
        raise ValueError(
            f"Table `{namespace}.{table}` does not exist. "
            f"Use table write mode CREATE or AUTO to create it."
        )

    # TODO(pdames): Use native DeltaCAT models to map from Iceberg partitioning to Daft partitioning...
    #  this lets us re-use a single model-mapper instead of different per-catalog model mappers
    schema = SchemaMapper.unmap(table_definition.table_version.schema)
    partition_spec = PartitionSchemeMapper.unmap(
        table_definition.table_version.partition_scheme,
        schema,
    )
    if not isinstance(data, DataFrame):
        raise NotImplementedError(
            f"iceberg write-back not implemented for data type: {type(data)}"
        )

    partition_fields = list(partition_spec.fields)
    if not partition_fields:
        # Without this guard the loop below never runs and the data is
        # silently dropped while the caller believes the write succeeded.
        raise NotImplementedError(
            "iceberg writes not implemented for unpartitioned tables"
        )

    # Kept as a function-local import like the original (presumably to avoid
    # an import cycle or import-time side effects — TODO confirm).
    from deltacat.experimental.catalog.iceberg import overrides

    # TODO(pdames): Get a type-checked Iceberg Table automatically via unmap()
    native_table = table_definition.table.native_object
    path = kwargs.get("path") or f"{native_table.location()}/data"

    for partition_field in partition_fields:
        transform = partition_field.transform
        if not isinstance(transform, BucketTransform):
            raise NotImplementedError(
                f"daft partitioning not implemented for iceberg transform: {partition_field.transform}"
            )
        if content_type != ContentType.PARQUET:
            raise NotImplementedError(
                f"iceberg writes not implemented for content type: {content_type}"
            )
        source_field = schema.find_field(name_or_id=partition_field.source_id)
        out_df = data.write_parquet(
            path,
            partition_cols=[
                data[source_field.name].partitioning.iceberg_bucket(
                    transform.num_buckets
                ),
            ],
        )
        # The first column of the write result is treated as the written file
        # paths (NOTE(review): confirm against Daft's write_parquet output).
        # TODO(pdames): only append s3:// to output file paths when writing to S3!
        out_file_paths = [f"s3://{val}" for val in out_df.to_arrow()[0]]
        overrides.append(native_table, out_file_paths)
162
+
163
+
164
def read_table(
    table: str, *args, namespace: Optional[str] = None, **kwargs
) -> DistributedDataset:
    """Read a table into a distributed dataset.

    Args:
        table: Name of the table to read.
        *args: Accepted but not forwarded to `get_table`.
        namespace: Optional table namespace.
        **kwargs: Keyword arguments forwarded to `get_table`.

    Returns:
        A Daft DataFrame built from a scan over the resolved table.
    """
    # TODO: more proper IO configuration
    daft_context = context.get_context()
    io_config = daft_context.daft_planning_config.default_io_config
    # Multithreaded IO is enabled for every runner except the one named "ray".
    runner_name = daft_context.get_or_create_runner().name
    multithreaded_io = runner_name != "ray"
    storage_config = StorageConfig(multithreaded_io, io_config)

    dc_table = get_table(name=table, namespace=namespace, **kwargs)
    scan_handle = ScanOperatorHandle.from_python_scan_operator(
        DeltaCatScanOperator(dc_table, storage_config)
    )
    return DataFrame(LogicalPlanBuilder.from_tabular_scan(scan_operator=scan_handle))
179
+
180
+
181
def alter_table(
    table: str,
    *args,
    namespace: Optional[str] = None,
    lifecycle_state: Optional[LifecycleState] = None,
    schema_updates: Optional[Dict[str, Any]] = None,
    partition_updates: Optional[Dict[str, Any]] = None,
    sort_keys: Optional[SortScheme] = None,
    description: Optional[str] = None,
    properties: Optional[TableProperties] = None,
    **kwargs,
) -> None:
    """Alter table definition.

    Not implemented for this catalog; all arguments are ignored.

    Raises:
        NotImplementedError: Always.
    """
    unsupported_msg = "alter_table not implemented"
    raise NotImplementedError(unsupported_msg)
195
+
196
+
197
def create_table(
    name: str,
    *args,
    namespace: Optional[str] = None,
    table_version: Optional[str] = None,
    lifecycle_state: Optional[LifecycleState] = None,
    schema: Optional[Schema] = None,
    partition_scheme: Optional[PartitionScheme] = None,
    sort_keys: Optional[SortScheme] = None,
    description: Optional[str] = None,
    table_properties: Optional[TableProperties] = None,
    namespace_properties: Optional[NamespaceProperties] = None,
    content_types: Optional[List[ContentType]] = None,
    fail_if_exists: bool = True,
    **kwargs,
) -> TableDefinition:
    """Create an empty table in the catalog.

    The namespace falls back to `default_namespace()` when not given, and is
    created (with `namespace_properties`) if it does not exist yet.

    Note: `lifecycle_state`, `description` and `content_types` are accepted
    but not used by this function body.

    Args:
        name: Name of the table to create.
        *args: Positional arguments forwarded to `get_table`.
        namespace: Optional table namespace.
        table_version: Table version forwarded to table version creation.
        schema: Schema forwarded to table version creation.
        partition_scheme: Partition scheme forwarded to table version creation.
        sort_keys: Sort scheme forwarded to table version creation.
        table_properties: Table properties forwarded to table version creation.
        namespace_properties: Properties used if the namespace is created.
        fail_if_exists: If True, raise when the table already exists;
            otherwise return the existing table definition.
        **kwargs: Forwarded to `get_table` and to the Iceberg storage calls.

    Returns:
        TableDefinition: The existing table (when found and
        `fail_if_exists` is False) or the result of `get_table` after creation.

    Raises:
        TableAlreadyExistsError: If the table exists and `fail_if_exists` is
            True.
    """

    namespace = namespace or default_namespace()
    existing_table = get_table(name, *args, namespace=namespace, **kwargs)

    if existing_table and fail_if_exists:
        raise TableAlreadyExistsError(
            f"Table `{namespace}.{name}` already exists. "
            f"To suppress this error, rerun `create_table()` with "
            f"`fail_if_exists=False`."
        )
    if existing_table:
        logger.debug(f"Returning existing table: `{namespace}.{name}`")
        return existing_table

    namespace_found = IcebergStorage.namespace_exists(namespace, **kwargs)
    if not namespace_found:
        logger.debug(f"Namespace {namespace} doesn't exist. Creating it...")
        IcebergStorage.create_namespace(
            namespace,
            properties=namespace_properties or {},
            **kwargs,
        )

    IcebergStorage.create_table_version(
        namespace=namespace,
        table_name=name,
        table_version=table_version,
        schema=schema,
        partition_scheme=partition_scheme,
        sort_keys=sort_keys,
        table_properties=table_properties,
        **kwargs,
    )

    # Re-read the table so the caller gets the definition as stored.
    created_table = get_table(name, *args, namespace=namespace, **kwargs)
    return created_table
259
+
260
+
261
def drop_table(
    name: str,
    *args,
    namespace: Optional[str] = None,
    table_version: Optional[str] = None,
    purge: bool = False,
    **kwargs,
) -> None:
    """Drop a table from the catalog, optionally purging it (not implemented).

    The arguments are accepted for interface compatibility and are not
    inspected.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("drop_table not implemented")
272
+
273
+
274
def refresh_table(
    table: str,
    *args,
    namespace: Optional[str] = None,
    **kwargs,
) -> None:
    """Refresh cached metadata for the given table (not implemented).

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("refresh_table not implemented")
277
+
278
+
279
def list_tables(
    *args,
    namespace: Optional[str] = None,
    **kwargs,
) -> ListResult[TableDefinition]:
    """List a page of table definitions (not implemented).

    The arguments are accepted for interface compatibility and are not
    inspected.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("list_tables not implemented")
285
+
286
+
287
def get_table(
    name: str,
    *args,
    namespace: Optional[str] = None,
    table_version: Optional[str] = None,
    stream_format: StreamFormat = StreamFormat.DELTACAT,
    **kwargs,
) -> Optional[TableDefinition]:
    """Look up the definition of a table.

    Args:
        name: Name of the table to retrieve.
        namespace: Namespace of the table; falls back to `default_namespace()`.
        table_version: Specific table version to retrieve. When falsy, the
            latest table version is looked up instead.
        stream_format: Accepted for interface compatibility; not used by this
            implementation.
        **kwargs: Forwarded to every storage call and to the native catalog
            lookup. Positional `*args` are ignored.

    Returns:
        A TableDefinition, or None when the stream, the table, or the
        requested table version cannot be resolved.
    """
    namespace = namespace or default_namespace()

    stream = IcebergStorage.get_stream(namespace=namespace, table_name=name, **kwargs)
    if not stream:
        return None

    table_obj = IcebergStorage.get_table(namespace=namespace, table_name=name, **kwargs)
    if not table_obj:
        return None

    # An explicit version wins; otherwise resolve the latest one.
    table_version_obj = (
        IcebergStorage.get_table_version(
            namespace=namespace, table_name=name, table_version=table_version, **kwargs
        )
        if table_version
        else IcebergStorage.get_latest_table_version(
            namespace=namespace, table_name=name, **kwargs
        )
    )
    if not table_version_obj:
        return None

    native_catalog = _get_native_catalog(**kwargs)
    return TableDefinition.of(
        table=table_obj,
        table_version=table_version_obj,
        stream=stream,
        native_object=table_obj.native_object,
        scan_planner=IcebergScanPlanner(native_catalog),
    )
333
+
334
+
335
def truncate_table(
    table: str,
    *args,
    namespace: Optional[str] = None,
    **kwargs,
) -> None:
    """Truncate the data of a table (not implemented).

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("truncate_table not implemented")
340
+
341
+
342
def rename_table(
    table: str,
    new_name: str,
    *args,
    namespace: Optional[str] = None,
    **kwargs,
) -> None:
    """Rename a table (not implemented).

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("rename_table not implemented")
347
+
348
+
349
def table_exists(
    table: str,
    *args,
    namespace: Optional[str] = None,
    **kwargs,
) -> bool:
    """Return whether the given table exists.

    The check is delegated to `IcebergStorage.table_exists`; `namespace`
    falls back to `default_namespace()` and positional `*args` are ignored.
    """
    resolved_namespace = namespace or default_namespace()
    return IcebergStorage.table_exists(
        namespace=resolved_namespace,
        table_name=table,
        **kwargs,
    )
353
+
354
+
355
+ # namespace functions
356
def list_namespaces(*args, **kwargs) -> ListResult[Namespace]:
    """List a page of table namespaces.

    Delegates to `IcebergStorage.list_namespaces`, forwarding only the keyword
    arguments; positional `*args` are ignored.
    """
    namespaces = IcebergStorage.list_namespaces(**kwargs)
    return namespaces
359
+
360
+
361
def get_namespace(namespace: str, *args, **kwargs) -> Optional[Namespace]:
    """Get the metadata of the given table namespace.

    Delegates to `IcebergStorage.get_namespace`, forwarding the namespace name
    and keyword arguments; positional `*args` are ignored. Per the declared
    return type, the result may be None.
    """
    namespace_obj = IcebergStorage.get_namespace(namespace, **kwargs)
    return namespace_obj
365
+
366
+
367
def namespace_exists(namespace: str, *args, **kwargs) -> bool:
    """Return whether the given table namespace exists.

    Delegates to `IcebergStorage.namespace_exists`; positional `*args` are
    ignored.
    """
    exists = IcebergStorage.namespace_exists(namespace, **kwargs)
    return exists
370
+
371
+
372
def create_namespace(
    namespace: str,
    *args,
    properties: Optional[NamespaceProperties] = None,
    **kwargs,
) -> Namespace:
    """Create a table namespace with the given properties (not implemented).

    The arguments are accepted for interface compatibility and are not
    inspected.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("create_namespace not implemented")
378
+
379
+
380
def alter_namespace(
    namespace: str,
    *args,
    properties: Optional[NamespaceProperties] = None,
    new_namespace: Optional[str] = None,
    **kwargs,
) -> None:
    """Alter a table namespace definition (not implemented).

    The arguments are accepted for interface compatibility and are not
    inspected.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("alter_namespace not implemented")
389
+
390
+
391
def drop_namespace(
    namespace: str,
    *args,
    purge: bool = False,
    **kwargs,
) -> None:
    """Drop a namespace and all of its tables, optionally purging them
    (not implemented).

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("drop_namespace not implemented")
395
+
396
+
397
def default_namespace(*args, **kwargs) -> str:
    """Return the catalog's default namespace.

    All arguments are ignored; the value is the module-level
    `DEFAULT_NAMESPACE` constant.
    """
    namespace = DEFAULT_NAMESPACE
    return namespace
@@ -0,0 +1,72 @@
1
+ import pyarrow.parquet as pq
2
+
3
+ from typing import Iterator, List
4
+
5
+ from pyarrow.fs import FileSystem
6
+
7
+ from pyiceberg.io.pyarrow import (
8
+ data_file_statistics_from_parquet_metadata,
9
+ compute_statistics_plan,
10
+ parquet_path_to_id_mapping,
11
+ )
12
+ from pyiceberg.table import Table
13
+ from pyiceberg.manifest import DataFile, DataFileContent, FileFormat
14
+ from pyiceberg.types import StructType, NestedField, IntegerType
15
+ from pyiceberg.typedef import Record
16
+
17
+
18
def append(table: Table, paths: List[str]) -> None:
    """Append already-written data files to an Iceberg table.

    Data file entries are built for every path with `write_file` and are then
    added to the table through a fast-append snapshot update inside a single
    transaction.

    Args:
        table: Table to append to.
        paths: Paths of the files to register with the table.
    """
    # NOTE: tables that define a sort order are not rejected here.
    new_data_files = write_file(table, paths)
    with table.transaction() as txn, txn.update_snapshot().fast_append() as snapshot_update:
        for new_data_file in new_data_files:
            snapshot_update.append_data_file(new_data_file)
30
+
31
+
32
def write_file(table: Table, paths: Iterator[str]) -> List[DataFile]:
    """Build Iceberg `DataFile` entries for existing Parquet files.

    No data is written: each file's Parquet footer is read to derive column
    statistics, and a `DataFile` describing the file is created.

    The partition value is parsed from the file's parent directory name,
    taking the text after the last `=` as an integer (e.g.
    `.../<field>=3/<file>.parquet`). Only the first field of the table's
    partition spec is populated, and it is always typed as an integer.

    Args:
        table: Table whose schema, properties and partition spec describe the
            files.
        paths: URIs of the Parquet files. The value is only iterated once, so
            any iterable of strings (such as a list) works.

    Returns:
        A list with one `DataFile` per input path, in input order.

    Raises:
        IndexError: If a path has no parent directory component.
        ValueError: If the parent directory name does not end in an integer.
    """
    data_files = []
    for file_path in paths:
        partition_dir = file_path.split("/")[-2]
        partition_value = int(partition_dir.split("=")[-1])
        # Resolve the filesystem implementation and the path within it.
        fs, fs_path = FileSystem.from_uri(file_path)
        with fs.open_input_file(fs_path) as native_file:
            parquet_metadata = pq.read_metadata(native_file)
            statistics = data_file_statistics_from_parquet_metadata(
                parquet_metadata=parquet_metadata,
                stats_columns=compute_statistics_plan(table.schema(), table.properties),
                parquet_column_mapping=parquet_path_to_id_mapping(table.schema()),
            )
            partition_field_name = table.spec().fields[0].name
            data_file = DataFile(
                content=DataFileContent.DATA,
                file_path=file_path,
                file_format=FileFormat.PARQUET,
                partition=Record(
                    **{
                        "struct": StructType(
                            NestedField(
                                0,
                                partition_field_name,
                                IntegerType(),
                                required=False,
                            )
                        ),
                        partition_field_name: partition_value,
                    }
                ),
                # The size is read while the file handle is still open.
                file_size_in_bytes=native_file.size(),
                sort_order_id=None,
                spec_id=table.spec().spec_id,
                equality_ids=None,
                key_metadata=None,
                **statistics.to_serialized_dict(),
            )
            data_files.append(data_file)
    return data_files
File without changes
@@ -0,0 +1,201 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Backfill script for backwards compatibility with canonical_string changes.
4
+
5
+ This script migrates existing DeltaCAT catalogs from the old global canonical string
6
+ format (with parent hexdigest) to the new hierarchical format (without parent hexdigest).
7
+
8
+ The old format was: {parent_hexdigest}|{name_parts}
9
+ The new format is: {name_parts}
10
+
11
+ Strategy:
12
+ 1. Patch canonical_string method to use old format for reading existing name mappings
13
+ 2. Use dc.list() to recursively discover all objects with old canonical_string
14
+ 3. Copy each object's name mappings using new canonical_string format for writing
15
+ 4. Works with any PyArrow-supported filesystem (local, S3, GCS, etc.)
16
+
17
+ Usage:
18
+ python deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py --catalog-root /path/to/catalog --destination /path/to/new_catalog [--dry-run] [--verbose]
19
+ """
20
+
21
+ import argparse
22
+ import logging
23
+ import contextlib
24
+
25
+ import deltacat as dc
26
+ from deltacat.utils.url import DeltaCatUrl
27
+ from deltacat.storage.model.locator import Locator
28
+ from deltacat.api import _copy_objects_in_order
29
+
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+
34
def canonical_string_old(locator, separator: str = "|") -> str:
    """Build a locator's canonical string in the old format.

    The old format prefixes the locator's name parts with the hexdigest of its
    parent locator (when there is a parent and its hexdigest is non-empty).
    It is used to read existing name resolution directories.

    Args:
        locator: Object exposing `parent` (with `hexdigest()`) and `name`
            (with `parts()`).
        separator: String placed between the parts.

    Returns:
        All parts converted to `str` and joined with `separator`.
    """
    parent = locator.parent
    prefix = parent.hexdigest() if parent else None
    # A missing parent or an empty digest contributes no prefix.
    components = [prefix] if prefix else []
    components += list(locator.name.parts())
    return separator.join(str(component) for component in components)
45
+
46
+
47
@contextlib.contextmanager
def patched_canonical_string(use_old_format: bool = True):
    """Temporarily swap `Locator.canonical_string` for the old implementation.

    Args:
        use_old_format: When True, `Locator.canonical_string` is replaced by
            `canonical_string_old` for the duration of the `with` block. When
            False, the current implementation is left in place.

    On exit (including on error), the method captured on entry is assigned
    back to `Locator.canonical_string`.
    """
    saved_method = Locator.canonical_string
    try:
        # Nothing to patch when the new format is requested.
        if use_old_format:
            Locator.canonical_string = canonical_string_old
        yield
    finally:
        Locator.canonical_string = saved_method
69
+
70
+
71
def migrate_catalog(
    source_url: str, destination_url: str, dry_run: bool = False
) -> bool:
    """Migrate a catalog from the old to the new canonical string format.

    Source objects are discovered while `Locator.canonical_string` is patched
    to the old format, then copied to the destination using the current (new)
    format. In a dry run only the discovery step is performed.

    Args:
        source_url: Source catalog URL (e.g., 'dc://catalog_root/')
        destination_url: Destination catalog URL (e.g., 'dc://new_catalog_root/')
        dry_run: If True, just show what would be migrated

    Returns:
        True if the migration (or dry run) completed, False if any exception
        was raised; the exception is logged with its traceback and not
        re-raised.
    """
    try:
        src_url = DeltaCatUrl(source_url)
        dst_url = DeltaCatUrl(destination_url)

        logger.info(f"Starting migration from {source_url} to {destination_url}")

        if dry_run:
            logger.info("DRY RUN - No actual changes will be made")
            logger.info(
                "DRY RUN - Discovering objects using old canonical string format..."
            )
        else:
            logger.info(
                "Step 1: Reading all objects using old canonical string format..."
            )

        # Discovery is identical for dry and real runs: existing name mappings
        # can only be resolved with the old canonical string format.
        with patched_canonical_string(use_old_format=True):
            src_objects = dc.list(src_url, recursive=True)

        log_prefix = "DRY RUN - " if dry_run else ""
        # The listing result may not support len(), so report the count only
        # when it is available.
        if hasattr(src_objects, "__len__"):
            logger.info(f"{log_prefix}Found {len(src_objects)} objects to migrate")
        else:
            logger.info(f"{log_prefix}Found objects to migrate (count unknown)")

        if dry_run:
            logger.info(
                "DRY RUN - Would copy objects using new canonical string format"
            )
            return True

        logger.info("Step 2: Writing objects using new canonical string format...")
        with patched_canonical_string(use_old_format=False):
            _copy_objects_in_order(src_objects, dst_url)

        logger.info("Migration completed successfully!")
        return True

    except Exception as e:
        # Top-level boundary of the script: report the failure with its
        # traceback through the logger and signal it via the return value.
        logger.exception(f"Migration failed: {e}")
        return False
135
+
136
+
137
def main():
    """Parse command-line arguments and run the catalog migration.

    Returns:
        A process exit status: 0 when the migration (or dry run) succeeded,
        1 when it failed or an unexpected exception was raised.
    """
    parser = argparse.ArgumentParser(
        description="Backfill locator-to-ID mappings for DeltaCAT canonical string changes"
    )
    parser.add_argument(
        "--catalog-root",
        required=True,
        help="Path to the source DeltaCAT catalog root directory",
    )
    parser.add_argument(
        "--destination",
        required=True,
        help="Path to the destination DeltaCAT catalog root directory",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be migrated without making changes",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable verbose logging. Writes logs to /tmp/deltacat/ by default.",
    )

    args = parser.parse_args()

    # Configure logging
    level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(level=level, format="%(asctime)s - %(levelname)s - %(message)s")

    # Initialize DeltaCAT with the source catalog
    catalog_config = {
        "local": {
            "root": args.catalog_root,
        }
    }
    dc.init(catalogs=catalog_config)

    try:
        # Migrate to a different location
        source_url = f"dc://{args.catalog_root}/"
        dest_url = f"dc://{args.destination}/"

        if not args.dry_run:
            # Initialize destination catalog
            # NOTE(review): whether a second dc.init() call registers this
            # catalog in addition to the source one depends on deltacat's
            # init semantics - confirm.
            dest_config = {
                "dest": {
                    "root": args.destination,
                }
            }
            dc.init(catalogs=dest_config)

        success = migrate_catalog(source_url, dest_url, args.dry_run)

        # Exit status convention: 0 means success. Converting the boolean
        # directly with int() would report success as failure.
        return 0 if success else 1

    except Exception as e:
        logger.error(f"Migration failed: {e}")
        return 1
198
+
199
+
200
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status. SystemExit
    # is raised directly because the exit() helper is installed by the `site`
    # module and is intended for interactive use.
    raise SystemExit(main())
File without changes
File without changes