deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (298) hide show
  1. deltacat/__init__.py +96 -17
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +0 -18
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2435 -279
  12. deltacat/catalog/model/catalog.py +154 -77
  13. deltacat/catalog/model/properties.py +63 -22
  14. deltacat/compute/compactor/compaction_session.py +97 -75
  15. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  16. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  17. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  18. deltacat/compute/compactor/repartition_session.py +8 -21
  19. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  20. deltacat/compute/compactor/steps/materialize.py +9 -7
  21. deltacat/compute/compactor/steps/repartition.py +12 -11
  22. deltacat/compute/compactor/utils/io.py +6 -5
  23. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  24. deltacat/compute/compactor/utils/system_columns.py +3 -1
  25. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  26. deltacat/compute/compactor_v2/constants.py +30 -1
  27. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  28. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  29. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  30. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  31. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  32. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  33. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  34. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  35. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  36. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  37. deltacat/compute/compactor_v2/utils/io.py +11 -4
  38. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  39. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  40. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  41. deltacat/compute/converter/converter_session.py +145 -32
  42. deltacat/compute/converter/model/convert_input.py +26 -19
  43. deltacat/compute/converter/model/convert_input_files.py +33 -16
  44. deltacat/compute/converter/model/convert_result.py +35 -16
  45. deltacat/compute/converter/model/converter_session_params.py +24 -21
  46. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  47. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  48. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  49. deltacat/compute/converter/steps/convert.py +157 -50
  50. deltacat/compute/converter/steps/dedupe.py +24 -11
  51. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  52. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  53. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  54. deltacat/compute/converter/utils/io.py +101 -12
  55. deltacat/compute/converter/utils/s3u.py +33 -27
  56. deltacat/compute/janitor.py +205 -0
  57. deltacat/compute/jobs/client.py +25 -12
  58. deltacat/compute/resource_estimation/delta.py +38 -6
  59. deltacat/compute/resource_estimation/model.py +8 -0
  60. deltacat/constants.py +45 -2
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/env.py +10 -0
  64. deltacat/examples/basic_logging.py +1 -3
  65. deltacat/examples/compactor/aws/__init__.py +1 -0
  66. deltacat/examples/compactor/bootstrap.py +863 -0
  67. deltacat/examples/compactor/compactor.py +373 -0
  68. deltacat/examples/compactor/explorer.py +473 -0
  69. deltacat/examples/compactor/gcp/__init__.py +1 -0
  70. deltacat/examples/compactor/job_runner.py +439 -0
  71. deltacat/examples/compactor/utils/__init__.py +1 -0
  72. deltacat/examples/compactor/utils/common.py +261 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  79. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  80. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  81. deltacat/examples/indexer/indexer.py +2 -2
  82. deltacat/examples/indexer/job_runner.py +1 -2
  83. deltacat/exceptions.py +66 -4
  84. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  85. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  86. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
  87. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  88. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  89. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  90. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  91. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  92. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  93. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  94. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  95. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  96. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  97. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  98. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  99. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  100. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  101. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  102. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  103. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  104. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  105. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  107. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  108. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  109. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  110. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  111. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  112. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  113. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  114. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  115. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  116. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  117. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  118. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  119. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  120. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  121. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  122. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  123. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  124. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  125. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  126. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  127. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  128. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  129. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  130. deltacat/io/datasource/deltacat_datasource.py +0 -1
  131. deltacat/io/reader/deltacat_read_api.py +1 -1
  132. deltacat/storage/__init__.py +20 -2
  133. deltacat/storage/interface.py +54 -32
  134. deltacat/storage/main/impl.py +1494 -541
  135. deltacat/storage/model/delta.py +27 -3
  136. deltacat/storage/model/locator.py +6 -12
  137. deltacat/storage/model/manifest.py +182 -6
  138. deltacat/storage/model/metafile.py +151 -78
  139. deltacat/storage/model/namespace.py +8 -1
  140. deltacat/storage/model/partition.py +117 -42
  141. deltacat/storage/model/schema.py +2427 -159
  142. deltacat/storage/model/shard.py +6 -2
  143. deltacat/storage/model/sort_key.py +40 -0
  144. deltacat/storage/model/stream.py +9 -2
  145. deltacat/storage/model/table.py +12 -1
  146. deltacat/storage/model/table_version.py +11 -0
  147. deltacat/storage/model/transaction.py +1184 -208
  148. deltacat/storage/model/transform.py +81 -2
  149. deltacat/storage/model/types.py +48 -26
  150. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  151. deltacat/tests/aws/test_s3u.py +2 -31
  152. deltacat/tests/catalog/data/__init__.py +0 -0
  153. deltacat/tests/catalog/main/__init__.py +0 -0
  154. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  155. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  156. deltacat/tests/catalog/model/__init__.py +0 -0
  157. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  158. deltacat/tests/catalog/test_catalogs.py +103 -106
  159. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  160. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  161. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  162. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  163. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  164. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  165. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  166. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  167. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  168. deltacat/tests/compute/conftest.py +8 -44
  169. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  170. deltacat/tests/compute/converter/utils.py +15 -6
  171. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  172. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  173. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  174. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  175. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  176. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  177. deltacat/tests/compute/test_janitor.py +236 -0
  178. deltacat/tests/compute/test_util_common.py +716 -43
  179. deltacat/tests/compute/test_util_constant.py +0 -1
  180. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  181. deltacat/tests/daft/__init__.py +0 -0
  182. deltacat/tests/daft/test_model.py +97 -0
  183. deltacat/tests/experimental/__init__.py +1 -0
  184. deltacat/tests/experimental/catalog/__init__.py +0 -0
  185. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  186. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  187. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  188. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  189. deltacat/tests/experimental/daft/__init__.py +0 -0
  190. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  191. deltacat/tests/experimental/storage/__init__.py +0 -0
  192. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  193. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  194. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  195. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
  196. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  197. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  198. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  199. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  200. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  201. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  202. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  203. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  204. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
  205. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  206. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  207. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  208. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  209. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  210. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  211. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  212. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  213. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  214. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  215. deltacat/tests/storage/model/test_schema.py +171 -0
  216. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  217. deltacat/tests/storage/model/test_shard.py +3 -1
  218. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  219. deltacat/tests/storage/model/test_transaction.py +393 -48
  220. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  221. deltacat/tests/test_deltacat_api.py +988 -4
  222. deltacat/tests/test_exceptions.py +9 -5
  223. deltacat/tests/test_utils/pyarrow.py +52 -21
  224. deltacat/tests/test_utils/storage.py +23 -34
  225. deltacat/tests/types/__init__.py +0 -0
  226. deltacat/tests/types/test_tables.py +104 -0
  227. deltacat/tests/utils/exceptions.py +22 -0
  228. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  229. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  230. deltacat/tests/utils/test_daft.py +121 -31
  231. deltacat/tests/utils/test_numpy.py +1193 -0
  232. deltacat/tests/utils/test_pandas.py +1106 -0
  233. deltacat/tests/utils/test_polars.py +1040 -0
  234. deltacat/tests/utils/test_pyarrow.py +1370 -89
  235. deltacat/types/media.py +224 -14
  236. deltacat/types/tables.py +2329 -59
  237. deltacat/utils/arguments.py +33 -1
  238. deltacat/utils/daft.py +823 -36
  239. deltacat/utils/export.py +3 -1
  240. deltacat/utils/filesystem.py +100 -0
  241. deltacat/utils/metafile_locator.py +2 -1
  242. deltacat/utils/numpy.py +118 -26
  243. deltacat/utils/pandas.py +577 -48
  244. deltacat/utils/polars.py +658 -27
  245. deltacat/utils/pyarrow.py +1258 -213
  246. deltacat/utils/ray_utils/dataset.py +101 -10
  247. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  248. deltacat/utils/url.py +57 -16
  249. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  250. deltacat-2.0.0b12.dist-info/RECORD +439 -0
  251. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  252. deltacat/catalog/iceberg/__init__.py +0 -4
  253. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  254. deltacat/compute/merge_on_read/__init__.py +0 -4
  255. deltacat/compute/merge_on_read/daft.py +0 -40
  256. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  257. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  258. deltacat/daft/daft_scan.py +0 -115
  259. deltacat/daft/model.py +0 -258
  260. deltacat/daft/translator.py +0 -126
  261. deltacat/examples/common/fixtures.py +0 -15
  262. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  263. deltacat/storage/rivulet/__init__.py +0 -11
  264. deltacat/storage/rivulet/feather/__init__.py +0 -5
  265. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  266. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  267. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  268. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  269. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  270. deltacat/utils/s3fs.py +0 -21
  271. deltacat-2.0.0b10.dist-info/METADATA +0 -68
  272. deltacat-2.0.0b10.dist-info/RECORD +0 -381
  273. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  274. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  275. /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
  276. /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
  277. /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
  278. /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
  279. /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
  280. /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
  281. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  282. /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
  283. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  284. /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
  285. /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
  286. /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
  287. /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
  288. /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
  289. /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  290. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
  291. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  292. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  293. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  294. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  295. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  296. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  297. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  298. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,201 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Backfill script for backwards compatibility with canonical_string changes.
4
+
5
+ This script migrates existing DeltaCAT catalogs from the old global canonical string
6
+ format (with parent hexdigest) to the new hierarchical format (without parent hexdigest).
7
+
8
+ The old format was: {parent_hexdigest}|{name_parts}
9
+ The new format is: {name_parts}
10
+
11
+ Strategy:
12
+ 1. Patch canonical_string method to use old format for reading existing name mappings
13
+ 2. Use dc.list() to recursively discover all objects with old canonical_string
14
+ 3. Copy each object's name mappings using new canonical_string format for writing
15
+ 4. Works with any PyArrow-supported filesystem (local, S3, GCS, etc.)
16
+
17
+ Usage:
18
+ python deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py --catalog-root /path/to/catalog --destination /path/to/new_catalog [--dry-run] [--verbose]
19
+ """
20
+
21
+ import argparse
22
+ import logging
23
+ import contextlib
24
+
25
+ import deltacat as dc
26
+ from deltacat.utils.url import DeltaCatUrl
27
+ from deltacat.storage.model.locator import Locator
28
+ from deltacat.api import _copy_objects_in_order
29
+
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+
34
def canonical_string_old(locator, separator: str = "|") -> str:
    """
    Build the legacy canonical string for a locator.

    The legacy form puts the parent locator's hexdigest (when the locator has
    a parent and that digest is non-empty) in front of the locator's own name
    parts. It is used to read existing name resolution directories.

    Args:
        locator: Object exposing ``parent`` (with ``hexdigest()``) and
            ``name`` (with ``parts()``).
        separator: String placed between the joined parts.

    Returns:
        The parts, each converted with ``str``, joined by ``separator``.
    """
    parent = locator.parent
    digest = parent.hexdigest() if parent else None
    # Only a truthy digest is emitted; a missing parent contributes nothing.
    pieces = [digest] if digest else []
    pieces.extend(locator.name.parts())
    return separator.join(str(piece) for piece in pieces)
45
+
46
+
47
@contextlib.contextmanager
def patched_canonical_string(use_old_format: bool = True):
    """
    Temporarily swap ``Locator.canonical_string`` for the legacy implementation.

    Args:
        use_old_format: When True, ``Locator.canonical_string`` is replaced by
            ``canonical_string_old`` for the duration of the ``with`` body.
            When False, the current implementation is left in place.

    Yields:
        None. On exit (normal or exceptional) the method that was installed
        when the context was entered is put back.
    """
    # Remember whatever is installed right now so it can be restored on exit.
    saved_impl = Locator.canonical_string

    try:
        # Nothing to patch when the caller wants the current (new) format.
        if use_old_format:
            Locator.canonical_string = canonical_string_old

        yield
    finally:
        Locator.canonical_string = saved_impl
69
+
70
+
71
def migrate_catalog(
    source_url: str, destination_url: str, dry_run: bool = False
) -> bool:
    """
    Migrate a catalog from the old to the new canonical string format.

    Objects are listed from the source while ``Locator.canonical_string`` is
    patched to the old format, then copied to the destination with the
    current (new) format in effect.

    Args:
        source_url: Source catalog URL (e.g., 'dc://catalog_root/')
        destination_url: Destination catalog URL (e.g., 'dc://new_catalog_root/')
        dry_run: If True, only list the source objects and log what would be
            migrated; nothing is copied.

    Returns:
        True if the migration (or dry run) completed, False if any exception
        was raised along the way. Exceptions are logged and their traceback
        printed rather than propagated.
    """
    try:
        src_url = DeltaCatUrl(source_url)
        dst_url = DeltaCatUrl(destination_url)

        logger.info(f"Starting migration from {source_url} to {destination_url}")

        if dry_run:
            logger.info("DRY RUN - No actual changes will be made")

            # Discovery only: read with the old format and report the count.
            logger.info(
                "DRY RUN - Discovering objects using old canonical string format..."
            )
            with patched_canonical_string(use_old_format=True):
                discovered = dc.list(src_url, recursive=True)

            # The listing may not be sized, in which case no count is logged.
            if not hasattr(discovered, "__len__"):
                logger.info("DRY RUN - Found objects to migrate (count unknown)")
            else:
                logger.info(f"DRY RUN - Found {len(discovered)} objects to migrate")

            logger.info(
                "DRY RUN - Would copy objects using new canonical string format"
            )
            return True

        # Real migration, phase 1: read everything with the old format.
        logger.info("Step 1: Reading all objects using old canonical string format...")
        with patched_canonical_string(use_old_format=True):
            discovered = dc.list(src_url, recursive=True)

        if not hasattr(discovered, "__len__"):
            logger.info("Found objects to migrate (count unknown)")
        else:
            logger.info(f"Found {len(discovered)} objects to migrate")

        # Phase 2: write with the current (new) format left in place.
        logger.info("Step 2: Writing objects using new canonical string format...")
        with patched_canonical_string(use_old_format=False):
            _copy_objects_in_order(discovered, dst_url)

        logger.info("Migration completed successfully!")
        return True

    except Exception as e:
        logger.error(f"Migration failed: {e}")
        import traceback

        traceback.print_exc()
        return False
135
+
136
+
137
def main():
    """
    Command-line entry point for the canonical-string backfill.

    Parses ``--catalog-root``, ``--destination``, ``--dry-run`` and
    ``--verbose``, configures logging, initializes DeltaCAT for the source
    catalog (and for the destination catalog unless this is a dry run), then
    runs ``migrate_catalog``.

    Returns:
        Process exit status: 0 when ``migrate_catalog`` reports success,
        1 when it reports failure or when an exception is raised while
        preparing or running the migration.
    """
    parser = argparse.ArgumentParser(
        description="Backfill locator-to-ID mappings for DeltaCAT canonical string changes"
    )
    parser.add_argument(
        "--catalog-root",
        required=True,
        help="Path to the source DeltaCAT catalog root directory",
    )
    parser.add_argument(
        "--destination",
        required=True,
        help="Path to the destination DeltaCAT catalog root directory",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be migrated without making changes",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable verbose logging. Writes logs to /tmp/deltacat/ by default.",
    )

    args = parser.parse_args()

    # Configure logging
    level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(level=level, format="%(asctime)s - %(levelname)s - %(message)s")

    # Initialize DeltaCAT with the source catalog
    catalog_config = {
        "local": {
            "root": args.catalog_root,
        }
    }
    dc.init(catalogs=catalog_config)

    try:
        # Migrate to a different location
        source_url = f"dc://{args.catalog_root}/"
        dest_url = f"dc://{args.destination}/"

        if not args.dry_run:
            # The destination catalog is only needed when objects are copied.
            dest_config = {
                "dest": {
                    "root": args.destination,
                }
            }
            dc.init(catalogs=dest_config)

        success = migrate_catalog(source_url, dest_url, args.dry_run)

        # The return value is used as the process exit status, where 0 means
        # success. int(success) would report 1 (failure) for a good migration
        # and 0 (success) for a failed one.
        return 0 if success else 1

    except Exception as e:
        logger.error(f"Migration failed: {e}")
        return 1
198
+
199
+
200
if __name__ == "__main__":
    # Raise SystemExit directly: the bare exit() builtin is injected by the
    # site module for interactive use and is missing when site is disabled.
    raise SystemExit(main())
@@ -0,0 +1,173 @@
1
+ """
2
+ DeltaCAT Job-based Managed I/O for Apache Beam
3
+
4
+ This module provides a job-based implementation of the DeltaCAT table monitor
5
+ that uses Ray jobs for better scalability and resource management instead of
6
+ threading.
7
+
8
+ Key Features:
9
+ - Uses DeltaCAT jobs for table monitoring
10
+ - Unique job IDs prevent duplicate monitoring jobs
11
+ - Supports both local and remote Ray clusters
12
+ - Backward compatible with existing managed.py interface
13
+ """
14
+
15
+ import logging
16
+ from typing import Dict, Any
17
+
18
+ import apache_beam as beam
19
+ from pyiceberg.catalog import CatalogType
20
+
21
+ from deltacat.experimental.converter_agent.table_monitor import submit_table_monitor_job
22
+ from deltacat.compute.converter.constants import DEFAULT_CONVERTER_TASK_MAX_PARALLELISM
23
+ import deltacat.logs as logs
24
+
25
+ # Initialize DeltaCAT logger
26
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
27
+
28
+ # Store original functions before monkey-patching
29
+ _original_write = beam.managed.Write
30
+
31
+
32
+ # Create a dictionary of Java catalog impl to CatalogType
33
+ JAVA_ICEBERG_CATALOG_IMPL_TO_TYPE = {
34
+ "org.apache.iceberg.rest.restcatalog": CatalogType.REST,
35
+ "org.apache.iceberg.hive.hivecatalog": CatalogType.HIVE,
36
+ "org.apache.iceberg.aws.glue.gluecatalog": CatalogType.GLUE,
37
+ "org.apache.iceberg.jdbc.jdbccatalog": CatalogType.SQL,
38
+ }
39
+
40
+
41
def _extract_catalog_config_from_beam(config: Dict[str, Any]) -> Dict[str, Any]:
    """
    Pull the catalog settings out of a Beam managed-I/O config dict.

    Args:
        config: Beam config; its optional ``"catalog_properties"`` mapping is
            the only key read here.

    Returns:
        A dict with ``"catalog_impl"`` (from ``"catalog-impl"``), ``"type"``,
        ``"warehouse"`` and ``"uri"`` (both default to ``""``), plus the
        ``"catalog_properties"`` mapping itself (``{}`` when absent).
    """
    props = config.get("catalog_properties", {})
    return {
        # Catalog implementation class name, None when not configured.
        "catalog_impl": props.get("catalog-impl"),
        # Catalog type string, None when not configured.
        "type": props.get("type"),
        "warehouse": props.get("warehouse", ""),
        "uri": props.get("uri", ""),
        "catalog_properties": props,
    }
62
+
63
+
64
def write(*args, **kwargs):
    """
    Wrapper over beam.managed.Write that also submits a DeltaCAT table monitor job.

    The ``config`` keyword argument is shallow-copied, its
    ``"deltacat_converter_properties"`` entry is removed and used to configure
    the monitor job, and the remaining config is forwarded to the original
    ``beam.managed.Write``. A failure to submit the monitor job is logged and
    does not prevent the write from being delegated.

    Recognized ``deltacat_converter_properties`` keys:
        deltacat_converter_interval: Monitor interval (default 3.0; logged
            with an "s" suffix).
        merge_keys: Passed through as ``merge_keys`` (default None).
        filesystem: Optional filesystem object (default None).
        cluster_cfg_file_path: Optional cluster config path (default None).
        max_converter_parallelism: Defaults to
            ``DEFAULT_CONVERTER_TASK_MAX_PARALLELISM``.
        ray_inactivity_timeout: Defaults to 10 (logged with an "s" suffix).

    Raises:
        ValueError: If the config has no ``"table"``, if ``"catalog-impl"`` is
            set but not in ``JAVA_ICEBERG_CATALOG_IMPL_TO_TYPE``, or if neither
            a catalog implementation nor a catalog type is configured.
            ``CatalogType(...)`` may also raise for an unrecognized type string.

    Returns:
        Whatever the original ``beam.managed.Write`` returns for the given
        arguments (with the DeltaCAT-specific config entry stripped).
    """
    logger.debug("Starting DeltaCAT write operation")
    logger.debug(f"args: {args}")
    logger.debug(f"kwargs keys: {list(kwargs.keys()) if kwargs else 'None'}")

    # Copy so that popping the DeltaCAT entry never mutates the caller's dict.
    supplied_config = kwargs.get("config")
    config = supplied_config.copy() if supplied_config else {}

    # DeltaCAT-only settings must not reach Beam, hence pop rather than get.
    converter_props = config.pop("deltacat_converter_properties", {})

    deltacat_converter_interval = converter_props.get(
        "deltacat_converter_interval", 3.0
    )
    merge_keys = converter_props.get("merge_keys")
    filesystem = converter_props.get("filesystem", None)
    cluster_cfg_file_path = converter_props.get("cluster_cfg_file_path", None)
    max_converter_parallelism = converter_props.get(
        "max_converter_parallelism",
        DEFAULT_CONVERTER_TASK_MAX_PARALLELISM,
    )
    ray_inactivity_timeout = converter_props.get("ray_inactivity_timeout", 10)

    table_identifier = config.get("table")
    if not table_identifier:
        raise ValueError("Table is required")

    # "<namespace>.<table>" splits at the first dot; a bare name goes to "default".
    if "." in table_identifier:
        namespace, table_name = table_identifier.split(".", 1)
    else:
        namespace, table_name = "default", table_identifier

    warehouse_path = config.get("catalog_properties", {}).get("warehouse", "")

    beam_catalog_config = _extract_catalog_config_from_beam(config)

    # Prefer the Java implementation class; fall back to the "type" property.
    catalog_impl = beam_catalog_config.get("catalog_impl")
    if catalog_impl:
        catalog_type = JAVA_ICEBERG_CATALOG_IMPL_TO_TYPE.get(catalog_impl.lower())
        if not catalog_type:
            raise ValueError(f"Unsupported catalog implementation: {catalog_impl}")
    else:
        catalog_type_str = beam_catalog_config.get("type")
        if not catalog_type_str:
            raise ValueError(
                f"No catalog implementation or type found in config: {beam_catalog_config}"
            )
        catalog_type = CatalogType(catalog_type_str.lower())

    # Forward the stripped config only if the caller passed one by keyword.
    if "config" in kwargs:
        kwargs["config"] = config

    logger.debug("Preparing to submit table monitor job...")
    logger.debug(f"table_name: {table_name}")
    logger.debug(f"deltacat_converter_interval: {deltacat_converter_interval}s")
    logger.debug(f"merge_keys: {merge_keys}")
    logger.debug(f"warehouse_path: {warehouse_path}")
    logger.debug(
        f"filesystem: {type(filesystem).__name__ if filesystem else 'None (auto-resolve)'}"
    )
    logger.debug(f"cluster_cfg_file_path: {cluster_cfg_file_path or 'None (local)'}")
    logger.debug(f"max_converter_parallelism: {max_converter_parallelism}")
    logger.debug(f"ray_inactivity_timeout: {ray_inactivity_timeout}s")
    logger.debug(
        f"using deltacat_converter_properties: {len(converter_props) > 0}"
    )
    logger.debug(f"catalog_type: {catalog_type}")

    try:
        submit_table_monitor_job(
            warehouse_path=warehouse_path,
            catalog_type=catalog_type,
            catalog_uri=beam_catalog_config.get("uri"),
            namespace=namespace,
            table_name=table_name,
            merge_keys=merge_keys,
            monitor_interval=deltacat_converter_interval,
            filesystem=filesystem,
            cluster_cfg_file_path=cluster_cfg_file_path,
            max_converter_parallelism=max_converter_parallelism,
            ray_inactivity_timeout=ray_inactivity_timeout,
        )
    except Exception as e:
        # Best effort: a monitoring failure must not block the actual write.
        logger.error(f"Failed to submit table monitor job: {e}")
        logger.error("Exception traceback:", exc_info=True)

    logger.info("Delegating to beam.managed.Write")
    return _original_write(*args, **kwargs)