deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (298) hide show
  1. deltacat/__init__.py +96 -17
  2. deltacat/api.py +122 -67
  3. deltacat/aws/constants.py +0 -23
  4. deltacat/aws/s3u.py +4 -631
  5. deltacat/benchmarking/benchmark_engine.py +4 -2
  6. deltacat/benchmarking/conftest.py +0 -18
  7. deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
  8. deltacat/catalog/__init__.py +64 -5
  9. deltacat/catalog/delegate.py +445 -63
  10. deltacat/catalog/interface.py +188 -62
  11. deltacat/catalog/main/impl.py +2435 -279
  12. deltacat/catalog/model/catalog.py +154 -77
  13. deltacat/catalog/model/properties.py +63 -22
  14. deltacat/compute/compactor/compaction_session.py +97 -75
  15. deltacat/compute/compactor/model/compact_partition_params.py +75 -30
  16. deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
  17. deltacat/compute/compactor/model/round_completion_info.py +16 -6
  18. deltacat/compute/compactor/repartition_session.py +8 -21
  19. deltacat/compute/compactor/steps/hash_bucket.py +5 -5
  20. deltacat/compute/compactor/steps/materialize.py +9 -7
  21. deltacat/compute/compactor/steps/repartition.py +12 -11
  22. deltacat/compute/compactor/utils/io.py +6 -5
  23. deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
  24. deltacat/compute/compactor/utils/system_columns.py +3 -1
  25. deltacat/compute/compactor_v2/compaction_session.py +17 -14
  26. deltacat/compute/compactor_v2/constants.py +30 -1
  27. deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
  28. deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
  29. deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
  30. deltacat/compute/compactor_v2/model/merge_input.py +33 -8
  31. deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
  32. deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
  33. deltacat/compute/compactor_v2/steps/merge.py +267 -55
  34. deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
  35. deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
  36. deltacat/compute/compactor_v2/utils/delta.py +5 -3
  37. deltacat/compute/compactor_v2/utils/io.py +11 -4
  38. deltacat/compute/compactor_v2/utils/merge.py +15 -2
  39. deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
  40. deltacat/compute/compactor_v2/utils/task_options.py +45 -33
  41. deltacat/compute/converter/converter_session.py +145 -32
  42. deltacat/compute/converter/model/convert_input.py +26 -19
  43. deltacat/compute/converter/model/convert_input_files.py +33 -16
  44. deltacat/compute/converter/model/convert_result.py +35 -16
  45. deltacat/compute/converter/model/converter_session_params.py +24 -21
  46. deltacat/compute/converter/pyiceberg/catalog.py +21 -18
  47. deltacat/compute/converter/pyiceberg/overrides.py +18 -9
  48. deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
  49. deltacat/compute/converter/steps/convert.py +157 -50
  50. deltacat/compute/converter/steps/dedupe.py +24 -11
  51. deltacat/compute/converter/utils/convert_task_options.py +27 -12
  52. deltacat/compute/converter/utils/converter_session_utils.py +126 -60
  53. deltacat/compute/converter/utils/iceberg_columns.py +8 -8
  54. deltacat/compute/converter/utils/io.py +101 -12
  55. deltacat/compute/converter/utils/s3u.py +33 -27
  56. deltacat/compute/janitor.py +205 -0
  57. deltacat/compute/jobs/client.py +25 -12
  58. deltacat/compute/resource_estimation/delta.py +38 -6
  59. deltacat/compute/resource_estimation/model.py +8 -0
  60. deltacat/constants.py +45 -2
  61. deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
  62. deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
  63. deltacat/env.py +10 -0
  64. deltacat/examples/basic_logging.py +1 -3
  65. deltacat/examples/compactor/aws/__init__.py +1 -0
  66. deltacat/examples/compactor/bootstrap.py +863 -0
  67. deltacat/examples/compactor/compactor.py +373 -0
  68. deltacat/examples/compactor/explorer.py +473 -0
  69. deltacat/examples/compactor/gcp/__init__.py +1 -0
  70. deltacat/examples/compactor/job_runner.py +439 -0
  71. deltacat/examples/compactor/utils/__init__.py +1 -0
  72. deltacat/examples/compactor/utils/common.py +261 -0
  73. deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
  74. deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
  75. deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
  76. deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
  77. deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
  78. deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
  79. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
  80. deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
  81. deltacat/examples/indexer/indexer.py +2 -2
  82. deltacat/examples/indexer/job_runner.py +1 -2
  83. deltacat/exceptions.py +66 -4
  84. deltacat/experimental/catalog/iceberg/__init__.py +6 -0
  85. deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
  86. deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
  87. deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
  88. deltacat/experimental/converter_agent/beam/managed.py +173 -0
  89. deltacat/experimental/converter_agent/table_monitor.py +479 -0
  90. deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
  91. deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
  92. deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
  93. deltacat/experimental/storage/iceberg/visitor.py +119 -0
  94. deltacat/experimental/storage/rivulet/__init__.py +11 -0
  95. deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
  96. deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
  97. deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
  98. deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
  99. deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
  100. deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
  101. deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
  102. deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
  103. deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
  104. deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
  105. deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
  106. deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
  107. deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
  108. deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
  109. deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
  110. deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
  111. deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
  112. deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
  113. deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
  114. deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
  115. deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
  116. deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
  117. deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
  118. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
  119. deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
  120. deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
  121. deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
  122. deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
  123. deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
  124. deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
  125. deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
  126. deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
  127. deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
  128. deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
  129. deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
  130. deltacat/io/datasource/deltacat_datasource.py +0 -1
  131. deltacat/io/reader/deltacat_read_api.py +1 -1
  132. deltacat/storage/__init__.py +20 -2
  133. deltacat/storage/interface.py +54 -32
  134. deltacat/storage/main/impl.py +1494 -541
  135. deltacat/storage/model/delta.py +27 -3
  136. deltacat/storage/model/locator.py +6 -12
  137. deltacat/storage/model/manifest.py +182 -6
  138. deltacat/storage/model/metafile.py +151 -78
  139. deltacat/storage/model/namespace.py +8 -1
  140. deltacat/storage/model/partition.py +117 -42
  141. deltacat/storage/model/schema.py +2427 -159
  142. deltacat/storage/model/shard.py +6 -2
  143. deltacat/storage/model/sort_key.py +40 -0
  144. deltacat/storage/model/stream.py +9 -2
  145. deltacat/storage/model/table.py +12 -1
  146. deltacat/storage/model/table_version.py +11 -0
  147. deltacat/storage/model/transaction.py +1184 -208
  148. deltacat/storage/model/transform.py +81 -2
  149. deltacat/storage/model/types.py +48 -26
  150. deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
  151. deltacat/tests/aws/test_s3u.py +2 -31
  152. deltacat/tests/catalog/data/__init__.py +0 -0
  153. deltacat/tests/catalog/main/__init__.py +0 -0
  154. deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
  155. deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
  156. deltacat/tests/catalog/model/__init__.py +0 -0
  157. deltacat/tests/catalog/model/test_table_definition.py +16 -0
  158. deltacat/tests/catalog/test_catalogs.py +103 -106
  159. deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
  160. deltacat/tests/compute/compact_partition_test_cases.py +35 -8
  161. deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
  162. deltacat/tests/compute/compactor/utils/test_io.py +124 -120
  163. deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
  164. deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
  165. deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
  166. deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
  167. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
  168. deltacat/tests/compute/conftest.py +8 -44
  169. deltacat/tests/compute/converter/test_convert_session.py +675 -490
  170. deltacat/tests/compute/converter/utils.py +15 -6
  171. deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
  172. deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
  173. deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
  174. deltacat/tests/compute/test_compact_partition_params.py +13 -8
  175. deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
  176. deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
  177. deltacat/tests/compute/test_janitor.py +236 -0
  178. deltacat/tests/compute/test_util_common.py +716 -43
  179. deltacat/tests/compute/test_util_constant.py +0 -1
  180. deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
  181. deltacat/tests/daft/__init__.py +0 -0
  182. deltacat/tests/daft/test_model.py +97 -0
  183. deltacat/tests/experimental/__init__.py +1 -0
  184. deltacat/tests/experimental/catalog/__init__.py +0 -0
  185. deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
  186. deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
  187. deltacat/tests/experimental/compatibility/__init__.py +1 -0
  188. deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
  189. deltacat/tests/experimental/daft/__init__.py +0 -0
  190. deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
  191. deltacat/tests/experimental/storage/__init__.py +0 -0
  192. deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
  193. deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
  194. deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
  195. deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
  196. deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
  197. deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
  198. deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
  199. deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
  200. deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
  201. deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
  202. deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
  203. deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
  204. deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
  205. deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
  206. deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
  207. deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
  208. deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
  209. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
  210. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
  211. deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
  212. deltacat/tests/storage/main/test_main_storage.py +6900 -95
  213. deltacat/tests/storage/model/test_metafile_io.py +78 -173
  214. deltacat/tests/storage/model/test_partition_scheme.py +85 -0
  215. deltacat/tests/storage/model/test_schema.py +171 -0
  216. deltacat/tests/storage/model/test_schema_update.py +1925 -0
  217. deltacat/tests/storage/model/test_shard.py +3 -1
  218. deltacat/tests/storage/model/test_sort_scheme.py +90 -0
  219. deltacat/tests/storage/model/test_transaction.py +393 -48
  220. deltacat/tests/storage/model/test_transaction_history.py +886 -0
  221. deltacat/tests/test_deltacat_api.py +988 -4
  222. deltacat/tests/test_exceptions.py +9 -5
  223. deltacat/tests/test_utils/pyarrow.py +52 -21
  224. deltacat/tests/test_utils/storage.py +23 -34
  225. deltacat/tests/types/__init__.py +0 -0
  226. deltacat/tests/types/test_tables.py +104 -0
  227. deltacat/tests/utils/exceptions.py +22 -0
  228. deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
  229. deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
  230. deltacat/tests/utils/test_daft.py +121 -31
  231. deltacat/tests/utils/test_numpy.py +1193 -0
  232. deltacat/tests/utils/test_pandas.py +1106 -0
  233. deltacat/tests/utils/test_polars.py +1040 -0
  234. deltacat/tests/utils/test_pyarrow.py +1370 -89
  235. deltacat/types/media.py +224 -14
  236. deltacat/types/tables.py +2329 -59
  237. deltacat/utils/arguments.py +33 -1
  238. deltacat/utils/daft.py +823 -36
  239. deltacat/utils/export.py +3 -1
  240. deltacat/utils/filesystem.py +100 -0
  241. deltacat/utils/metafile_locator.py +2 -1
  242. deltacat/utils/numpy.py +118 -26
  243. deltacat/utils/pandas.py +577 -48
  244. deltacat/utils/polars.py +658 -27
  245. deltacat/utils/pyarrow.py +1258 -213
  246. deltacat/utils/ray_utils/dataset.py +101 -10
  247. deltacat/utils/reader_compatibility_mapping.py +3083 -0
  248. deltacat/utils/url.py +57 -16
  249. deltacat-2.0.0b12.dist-info/METADATA +1163 -0
  250. deltacat-2.0.0b12.dist-info/RECORD +439 -0
  251. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
  252. deltacat/catalog/iceberg/__init__.py +0 -4
  253. deltacat/compute/compactor/utils/round_completion_file.py +0 -97
  254. deltacat/compute/merge_on_read/__init__.py +0 -4
  255. deltacat/compute/merge_on_read/daft.py +0 -40
  256. deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
  257. deltacat/compute/merge_on_read/utils/delta.py +0 -42
  258. deltacat/daft/daft_scan.py +0 -115
  259. deltacat/daft/model.py +0 -258
  260. deltacat/daft/translator.py +0 -126
  261. deltacat/examples/common/fixtures.py +0 -15
  262. deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
  263. deltacat/storage/rivulet/__init__.py +0 -11
  264. deltacat/storage/rivulet/feather/__init__.py +0 -5
  265. deltacat/storage/rivulet/parquet/__init__.py +0 -5
  266. deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
  267. deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
  268. deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
  269. deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
  270. deltacat/utils/s3fs.py +0 -21
  271. deltacat-2.0.0b10.dist-info/METADATA +0 -68
  272. deltacat-2.0.0b10.dist-info/RECORD +0 -381
  273. /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
  274. /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
  275. /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
  276. /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
  277. /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
  278. /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
  279. /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
  280. /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
  281. /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
  282. /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
  283. /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
  284. /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
  285. /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
  286. /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
  287. /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
  288. /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
  289. /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
  290. /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
  291. /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
  292. /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
  293. /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
  294. /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
  295. /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
  296. /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
  297. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
  298. {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,582 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Unit tests for the backfill script that migrates catalogs from old to new canonical string format.
4
+
5
+ Tests verify that catalogs created with the old canonical_string format (with parent hexdigest)
6
+ can be successfully migrated to the new hierarchical format (without parent hexdigest).
7
+ """
8
+ import os
9
+ import tempfile
10
+ import shutil
11
+ import uuid
12
+ from typing import Dict, Any
13
+ import pandas as pd
14
+ import pyarrow as pa
15
+
16
+ import deltacat as dc
17
+ from deltacat import Catalog
18
+ from deltacat.catalog.main import impl as catalog
19
+ from deltacat.catalog.model.properties import CatalogProperties
20
+ from deltacat.storage.model.schema import Schema, Field
21
+ from deltacat.types.tables import TableWriteMode
22
+ from deltacat import DatasetType
23
+ from deltacat.utils.url import DeltaCatUrl
24
+ from deltacat.storage.model.metafile import Metafile
25
+ from deltacat.storage.model.namespace import NamespaceLocator
26
+ from deltacat.storage.model.table import TableLocator
27
+ from deltacat.storage.model.table_version import TableVersionLocator
28
+ from deltacat.storage.model.stream import StreamLocator
29
+
30
+
31
+ from deltacat.experimental.compatibility.backfill_locator_to_id_mappings import (
32
+ patched_canonical_string,
33
+ migrate_catalog,
34
+ )
35
+
36
+
37
def get_catalog_properties(root: str) -> CatalogProperties:
    """Build the CatalogProperties the tests use for a catalog rooted at ``root``."""
    properties = CatalogProperties(root=root)
    return properties
40
+
41
+
42
def create_test_schema() -> Schema:
    """Return the three-field (id, name, value) schema used by the tests."""
    # Fields are built one at a time, in column order, then wrapped in a Schema.
    id_field = Field.of(pa.field("id", pa.int64()))
    name_field = Field.of(pa.field("name", pa.string()))
    value_field = Field.of(pa.field("value", pa.float64()))
    return Schema.of([id_field, name_field, value_field])
51
+
52
+
53
def create_test_data() -> pd.DataFrame:
    """Return a three-row DataFrame with ``id``, ``name`` and ``value`` columns."""
    # Column order here is the column order of the resulting frame.
    columns = {}
    columns["id"] = [1, 2, 3]
    columns["name"] = ["Alice", "Bob", "Charlie"]
    columns["value"] = [10.5, 20.0, 30.5]
    return pd.DataFrame(columns)
62
+
63
+
64
+ class TestBackfillLocatorToIdMappings:
65
+ """Test the backfill script for canonical string migration."""
66
+
67
@classmethod
def setup_class(cls):
    """Create the class-level source and destination scratch directories."""
    # temp_dir is created first, then dest_dir; each is stored on the class.
    for attr_name in ("temp_dir", "dest_dir"):
        setattr(cls, attr_name, tempfile.mkdtemp())
72
+
73
@classmethod
def teardown_class(cls):
    """Remove the class-level scratch directories, ignoring removal errors."""
    for attr_name in ("temp_dir", "dest_dir"):
        shutil.rmtree(getattr(cls, attr_name), ignore_errors=True)
78
+
79
def setup_method(self):
    """Give each test a fresh, empty pair of catalog root directories.

    The directories are tracked on the class rather than on the instance.
    pytest creates a new instance of the test class for every test method,
    so paths stored on ``self`` would be invisible to the next
    ``setup_method`` call and to ``teardown_class``, and the directories
    created here would never be removed. Storing them on the class lets the
    next ``setup_method`` (or ``teardown_class`` after the last test) delete
    them.

    Sets ``self.catalog_properties`` and ``self.dest_catalog_properties`` to
    catalog properties rooted at the new source and destination directories.
    """
    owner = type(self)

    # Drop whatever the previous test (or setup_class) left behind.
    shutil.rmtree(owner.temp_dir, ignore_errors=True)
    shutil.rmtree(owner.dest_dir, ignore_errors=True)

    # Re-create both roots; ``self.temp_dir`` / ``self.dest_dir`` still
    # resolve to these values through normal class-attribute lookup.
    owner.temp_dir = tempfile.mkdtemp()
    owner.dest_dir = tempfile.mkdtemp()

    self.catalog_properties = get_catalog_properties(root=self.temp_dir)
    self.dest_catalog_properties = get_catalog_properties(root=self.dest_dir)
89
+
90
def create_old_format_catalog(self) -> Dict[str, Any]:
    """
    Populate the source catalog while the old canonical_string format is patched in.

    Registers a uniquely named catalog backed by ``self.catalog_properties``,
    then, inside ``patched_canonical_string(use_old_format=True)``, creates one
    namespace and writes three tables; "table_two" receives a second write in
    APPEND mode and "table_three" is created with an explicit schema.

    Returns:
        Dict with the namespace name, the per-table expectations, the catalog
        root directory and the registered catalog name, for later verification.
    """
    props = self.catalog_properties

    # Initialize DeltaCAT and register the source catalog under a unique name
    dc.init()
    source_catalog_name = f"test_source_{uuid.uuid4()}"
    dc.put_catalog(source_catalog_name, catalog=Catalog(config=props))

    namespace_name = "test_namespace"
    tables_info = []

    def write(table_name, frame, mode, **extra):
        # Every write targets the same namespace and catalog properties.
        catalog.write_to_table(
            data=frame,
            table=table_name,
            namespace=namespace_name,
            mode=mode,
            inner=props,
            **extra,
        )

    def record(table_name):
        # All three tables carry the same stream/partition expectations.
        tables_info.append(
            {
                "name": table_name,
                "expected_streams": ["deltacat"],
                "expected_partitions": [("default",)],
            }
        )

    # Everything below is created with the old canonical_string format active
    with patched_canonical_string(use_old_format=True):
        catalog.create_namespace(namespace=namespace_name, inner=props)

        # Table 1: single CREATE write with no extra options
        write("table_one", create_test_data(), TableWriteMode.CREATE)
        record("table_one")

        # Table 2: CREATE write followed by an APPEND write of three more rows
        first_batch = create_test_data()
        second_batch = pd.DataFrame(
            {
                "id": [4, 5, 6],
                "name": ["David", "Eve", "Frank"],
                "value": [40.0, 50.5, 60.0],
            }
        )
        write("table_two", first_batch, TableWriteMode.CREATE)
        write("table_two", second_batch, TableWriteMode.APPEND)
        record("table_two")

        # Table 3: CREATE write with an explicit schema
        third_table_data = create_test_data()
        write(
            "table_three",
            third_table_data,
            TableWriteMode.CREATE,
            schema=create_test_schema(),
        )
        record("table_three")

    return {
        "namespace": namespace_name,
        "tables": tables_info,
        "catalog_root": self.temp_dir,
        "catalog_name": source_catalog_name,
    }
196
+
197
def verify_catalog_integrity(
    self, catalog_root: str, expected_objects: Dict[str, Any]
):
    """
    Assert that a catalog holds the expected namespace, tables and deltas,
    and that every expected table can be read back.

    Args:
        catalog_root: Path to catalog root. Only used to register a
            throwaway catalog when ``expected_objects`` has no
            "catalog_name" entry.
        expected_objects: Dict with "namespace", "tables" and, optionally,
            "catalog_name".
    """
    if "catalog_name" not in expected_objects:
        # No registered catalog was supplied: register one for this check.
        verify_catalog_name = f"verify_{uuid.uuid4()}"
        catalog_props = get_catalog_properties(root=catalog_root)
        dc.put_catalog(verify_catalog_name, catalog=Catalog(config=catalog_props))
    else:
        verify_catalog_name = expected_objects["catalog_name"]

    namespace_name = expected_objects["namespace"]

    # The namespace must be present
    namespace_found = dc.namespace_exists(
        namespace=namespace_name, catalog=verify_catalog_name
    )
    assert namespace_found, f"Namespace {namespace_name} should exist"

    expected_columns = ["id", "name", "value"]

    for table_info in expected_objects["tables"]:
        table_name = table_info["name"]

        # Arguments shared by every call that pins table version "1"
        versioned = {
            "table": table_name,
            "namespace": namespace_name,
            "catalog": verify_catalog_name,
            "table_version": "1",
        }

        # The table must exist at that version
        assert dc.table_exists(
            **versioned
        ), f"Table {namespace_name}/{table_name} should exist"

        # Its definition must be retrievable and carry the right name
        table_def = dc.get_table(**versioned)
        assert table_def is not None
        assert table_def.table.table_name == table_name

        # Listing with table=<name> must return at least one item
        tables_list = dc.list_tables(
            namespace=namespace_name,
            catalog=verify_catalog_name,
            table=table_name,
        )
        assert (
            len(tables_list.all_items()) > 0
        ), f"Table {table_name} should have versions"

        # The data must be readable as a pandas dataset
        table_data = dc.read_table(read_as=DatasetType.PANDAS, **versioned)
        assert (
            table_data is not None
        ), f"Should be able to read data from {table_name}"

        # Shape check against the test data written earlier
        assert (
            list(table_data.columns) == expected_columns
        ), f"Table {table_name} should have columns {expected_columns}"
        assert (
            len(table_data) >= 3
        ), f"Table {table_name} should have at least 3 rows of test data"

    # Walk every object in the catalog and tally them by metafile class name
    all_objects = dc.list(
        DeltaCatUrl(f"dc://{verify_catalog_name}/"), recursive=True
    )
    object_counts = {}
    for obj in all_objects:
        obj_class_name = Metafile.get_class(obj.to_serializable()).__name__
        object_counts.setdefault(obj_class_name, 0)
        object_counts[obj_class_name] += 1

    print(f"Migrated catalog object counts: {object_counts}")

    # Deltas must have been found, and at least three of them
    assert (
        "Delta" in object_counts
    ), "No deltas found in migrated catalog - delta migration may have failed"
    assert (
        object_counts["Delta"] >= 3
    ), f"Expected at least 3 deltas (one for each table, plus one for append), got {object_counts.get('Delta', 0)}"
296
+
297
def test_patched_canonical_string_context_manager(self):
    """Check that patched_canonical_string switches formats and restores on exit."""
    # Table locator nested under a namespace locator
    table_locator = TableLocator(
        {
            "namespaceLocator": NamespaceLocator({"namespace": "test_ns"}),
            "tableName": "test_table",
        }
    )

    # Unpatched (new) format: just the table name
    unpatched = table_locator.canonical_string()
    assert unpatched == "test_table"

    # Patched (old) format: exactly one "|" separator, ending in the table name
    with patched_canonical_string(use_old_format=True):
        patched = table_locator.canonical_string()
        assert patched != unpatched
        assert patched.endswith("|test_table")
        assert patched.count("|") == 1

    # Leaving the context manager must bring back the unpatched result
    after_exit = table_locator.canonical_string()
    assert after_exit == unpatched
320
+
321
def test_migrate_catalog_dry_run(self):
    """Run migrate_catalog with dry_run=True and check the destination stays empty."""
    # Source catalog written with the old format
    old_catalog_info = self.create_old_format_catalog()

    # Register the destination catalog under a unique name
    dest_catalog_name = f"test_dest_{uuid.uuid4()}"
    dest_catalog = Catalog(config=self.dest_catalog_properties)
    dc.put_catalog(dest_catalog_name, catalog=dest_catalog)

    # URLs address catalogs by registered name, not by directory path
    source_url = "dc://{}/".format(old_catalog_info["catalog_name"])
    dest_url = "dc://{}/".format(dest_catalog_name)

    success = migrate_catalog(source_url, dest_url, dry_run=True)
    assert success, "Dry run migration should succeed"

    # Nothing may have been written under the destination root
    if os.path.exists(self.dest_dir):
        dest_contents = os.listdir(self.dest_dir)
    else:
        dest_contents = []
    assert not dest_contents, "Destination should be empty after dry run"
345
+
346
def test_migrate_catalog_full_migration(self):
    """Migrate an old-format catalog for real and compare it with the source.

    After the migration the destination is checked twice: once through
    ``verify_catalog_integrity`` and once by comparing, per metafile class
    name, how many objects a recursive listing returns for the source and
    for the destination.
    """

    def count_by_class(objects):
        # Tally listed objects by the name of their resolved Metafile class.
        tally = {}
        for listed in objects:
            class_name = Metafile.get_class(listed.to_serializable()).__name__
            tally[class_name] = tally.get(class_name, 0) + 1
        return tally

    old_catalog_info = self.create_old_format_catalog()

    # The source was written with the old format, so it is verified with
    # the old canonical_string patched in.
    with patched_canonical_string(use_old_format=True):
        self.verify_catalog_integrity(self.temp_dir, old_catalog_info)

    dest_catalog_name = f"test_dest_{uuid.uuid4()}"
    destination = Catalog(config=self.dest_catalog_properties)
    dc.put_catalog(dest_catalog_name, catalog=destination)

    # Catalogs are addressed by name in the migration URLs.
    source_url = f"dc://{old_catalog_info['catalog_name']}/"
    dest_url = f"dc://{dest_catalog_name}/"
    migrated_ok = migrate_catalog(source_url, dest_url, dry_run=False)
    assert migrated_ok, "Migration should succeed"

    # Same expectations as the source, re-pointed at the destination.
    migrated_catalog_info = old_catalog_info.copy()
    migrated_catalog_info.update(
        catalog_name=dest_catalog_name,
        catalog_root=self.dest_dir,
    )
    self.verify_catalog_integrity(self.dest_dir, migrated_catalog_info)

    # Listing the source still needs the old format; the destination does not.
    with patched_canonical_string(use_old_format=True):
        source_objects = dc.list(
            DeltaCatUrl(f"dc://{old_catalog_info['catalog_name']}/"), recursive=True
        )
    dest_objects = dc.list(
        DeltaCatUrl(f"dc://{dest_catalog_name}/"), recursive=True
    )

    source_counts = count_by_class(source_objects)
    dest_counts = count_by_class(dest_objects)

    # Every object type seen in the source must appear equally often in
    # the destination.
    for obj_type, count in source_counts.items():
        assert (
            obj_type in dest_counts
        ), f"Object type {obj_type} missing from destination"
        assert (
            dest_counts[obj_type] == count
        ), f"Object count mismatch for {obj_type}: source={count}, dest={dest_counts[obj_type]}"
405
+
406
def test_migrate_catalog_preserves_data_integrity(self):
    """Test that migration preserves data integrity.

    Builds an old-format source catalog, migrates it into a freshly
    registered destination catalog, then reads table version "1" of every
    table listed in the source catalog info back as a pandas frame and
    compares it with the frame returned by ``create_test_data()``.

    ``table_two`` is expected to hold exactly twice the base row count and
    to contain every expected row (matched on ``id``, compared on ``name``
    and ``value``). Every other table must equal the expected frame after
    both are sorted by ``id``; dtype differences are ignored.
    """
    # Create catalog with old format
    old_catalog_info = self.create_old_format_catalog()

    # Expected data is identical for every table, so derive the comparison
    # inputs once instead of once per table.
    expected_test_data = create_test_data()
    expected_columns = list(expected_test_data.columns)
    expected_row_count = len(expected_test_data)
    expected_sorted = expected_test_data.sort_values("id").reset_index(drop=True)

    # Create destination catalog for migration
    dest_catalog_name = f"test_dest_{uuid.uuid4()}"
    dc.put_catalog(
        dest_catalog_name, catalog=Catalog(config=self.dest_catalog_properties)
    )

    # Perform migration
    source_url = f"dc://{old_catalog_info['catalog_name']}/"
    dest_url = f"dc://{dest_catalog_name}/"

    success = migrate_catalog(source_url, dest_url, dry_run=False)
    assert success, "Migration should succeed"

    # Read data from migrated catalog and compare
    for table_info in old_catalog_info["tables"]:
        table_name = table_info["name"]
        migrated_data = dc.read_table(
            table=table_name,
            namespace=old_catalog_info["namespace"],
            catalog=dest_catalog_name,
            table_version="1",
            read_as=DatasetType.PANDAS,
        )

        assert (
            migrated_data is not None
        ), f"Should be able to read migrated data from {table_name}"
        assert (
            list(migrated_data.columns) == expected_columns
        ), f"Migrated {table_name} should have expected columns {expected_columns}"

        # "table_two" is the table this test treats as having appended data.
        has_appended_data = table_name == "table_two"

        if has_appended_data:
            # The check is an exact doubling, so the message says so (the
            # previous message claimed "at least" the base row count).
            assert (
                len(migrated_data) == expected_row_count * 2
            ), f"Migrated {table_name} should have exactly {expected_row_count * 2} rows (base data plus appended data)"
        else:
            assert (
                len(migrated_data) == expected_row_count
            ), f"Migrated {table_name} should have exactly {expected_row_count} rows"

        # Sort by 'id' so row order does not affect the comparison.
        migrated_sorted = migrated_data.sort_values("id").reset_index(drop=True)

        if has_appended_data:
            # Only require that each expected row is present; the first row
            # with a matching id must carry the expected name and value.
            for _, expected_row in expected_sorted.iterrows():
                matching_rows = migrated_sorted[
                    migrated_sorted["id"] == expected_row["id"]
                ]
                assert (
                    len(matching_rows) > 0
                ), f"Expected row with id {expected_row['id']} not found in migrated {table_name}"
                actual_row = matching_rows.iloc[0]
                assert (
                    actual_row["name"] == expected_row["name"]
                ), f"Name mismatch for id {expected_row['id']} in {table_name}"
                assert (
                    actual_row["value"] == expected_row["value"]
                ), f"Value mismatch for id {expected_row['id']} in {table_name}"
        else:
            # For tables without appended data, expect exact match
            try:
                pd.testing.assert_frame_equal(
                    expected_sorted,
                    migrated_sorted,
                    check_dtype=False,  # Allow minor type differences
                )
            except AssertionError as e:
                # Re-raise with the table name, chaining the pandas error
                # explicitly so its frame diff stays in the traceback.
                raise AssertionError(
                    f"Data content should match expected test data for {table_name} after migration: {e}"
                ) from e
493
+
494
def test_migrate_empty_catalog(self):
    """Migrate a catalog holding only a namespace and check it arrives."""
    dc.init()

    # Source catalog: registered under a unique name.
    empty_catalog_name = f"empty_{uuid.uuid4()}"
    source_catalog = Catalog(config=self.catalog_properties)
    dc.put_catalog(empty_catalog_name, catalog=source_catalog)

    # Its only content is one namespace (no tables), created while the old
    # canonical-string format is patched in.
    with patched_canonical_string(use_old_format=True):
        dc.create_namespace(namespace="empty_namespace", catalog=empty_catalog_name)

    # Destination catalog: also registered under a unique name.
    dest_catalog_name = f"dest_{uuid.uuid4()}"
    destination_catalog = Catalog(config=self.dest_catalog_properties)
    dc.put_catalog(dest_catalog_name, catalog=destination_catalog)

    source_url = f"dc://{empty_catalog_name}/"
    dest_url = f"dc://{dest_catalog_name}/"
    migrated_ok = migrate_catalog(source_url, dest_url, dry_run=False)
    assert migrated_ok, "Migration of empty catalog should succeed"

    # The namespace must be visible through the destination catalog.
    assert dc.namespace_exists(
        namespace="empty_namespace", catalog=dest_catalog_name
    ), "Namespace should exist in migrated catalog"
524
+
525
def test_migration_error_handling(self):
    """Call migrate_catalog with a bogus source URL and expect a bool back.

    The test does not pin whether the dry run reports success or failure,
    only that the call returns a boolean instead of raising.
    """
    dest_url = f"dc://{self.dest_dir}/"
    # Source URL built from a path under temp_dir that is never created.
    invalid_source = f"dc://{self.temp_dir}/nonexistent/"

    outcome = migrate_catalog(invalid_source, dest_url, dry_run=True)

    assert isinstance(outcome, bool), "Should return boolean result"
535
+
536
def test_canonical_string_format_differences(self):
    """Compare old and new canonical strings at each locator level.

    The namespace locator is expected to render identically in both
    formats. Table, table-version and stream locators are expected to
    differ, with the old format being ``<something>|<new format>``.
    """
    # Locator chain: namespace -> table -> table version -> stream.
    ns_locator = NamespaceLocator({"namespace": "test_ns"})
    table_locator = TableLocator(
        {"namespaceLocator": ns_locator, "tableName": "test_table"}
    )
    table_version_locator = TableVersionLocator(
        {"tableLocator": table_locator, "version": "1"}
    )
    stream_locator = StreamLocator(
        {"tableVersionLocator": table_version_locator, "streamFormat": "deltacat"}
    )

    # (label, locator, formats expected to match?)
    expectations = (
        ("namespace", ns_locator, True),
        ("table", table_locator, False),
        ("table_version", table_version_locator, False),
        ("stream", stream_locator, False),
    )

    for obj_type, locator, should_be_same in expectations:
        # Render once unpatched and once with the old format patched in.
        new_format = locator.canonical_string()
        with patched_canonical_string(use_old_format=True):
            old_format = locator.canonical_string()

        if not should_be_same:
            assert (
                old_format != new_format
            ), f"{obj_type} canonical strings should be different"
            assert (
                "|" in old_format
            ), f"{obj_type} old format should contain separator"
            # The new format must be the trailing segment of the old one.
            assert old_format.endswith(
                f"|{new_format}"
            ), f"{obj_type} old format should end with new format"
            continue

        assert (
            old_format == new_format
        ), f"{obj_type} canonical strings should be the same"
File without changes