deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +117 -18
- deltacat/api.py +536 -126
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +1 -19
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2444 -282
- deltacat/catalog/model/catalog.py +208 -113
- deltacat/catalog/model/properties.py +63 -24
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +207 -52
- deltacat/compute/converter/model/convert_input.py +43 -16
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +64 -19
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +193 -65
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +230 -75
- deltacat/compute/converter/steps/dedupe.py +46 -12
- deltacat/compute/converter/utils/convert_task_options.py +66 -22
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +13 -8
- deltacat/compute/converter/utils/io.py +173 -13
- deltacat/compute/converter/utils/s3u.py +42 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +49 -6
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +6 -6
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
- deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +22 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +231 -6
- deltacat/storage/model/metafile.py +224 -119
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +53 -29
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +697 -349
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1036 -11
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +345 -37
- deltacat/types/tables.py +2344 -46
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +139 -9
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1373 -192
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.dist-info/METADATA +0 -65
- deltacat-2.0.dist-info/RECORD +0 -347
- /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
- /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,201 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
Backfill script for backwards compatibility with canonical_string changes.
|
4
|
+
|
5
|
+
This script migrates existing DeltaCAT catalogs from the old global canonical string
|
6
|
+
format (with parent hexdigest) to the new hierarchical format (without parent hexdigest).
|
7
|
+
|
8
|
+
The old format was: {parent_hexdigest}|{name_parts}
|
9
|
+
The new format is: {name_parts}
|
10
|
+
|
11
|
+
Strategy:
|
12
|
+
1. Patch canonical_string method to use old format for reading existing name mappings
|
13
|
+
2. Use dc.list() to recursively discover all objects with old canonical_string
|
14
|
+
3. Copy each object's name mappings using new canonical_string format for writing
|
15
|
+
4. Works with any PyArrow-supported filesystem (local, S3, GCS, etc.)
|
16
|
+
|
17
|
+
Usage:
|
18
|
+
python deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py --catalog-root /path/to/catalog --destination /path/to/new_catalog [--dry-run] [--verbose]
|
19
|
+
"""
|
20
|
+
|
21
|
+
import argparse
|
22
|
+
import logging
|
23
|
+
import contextlib
|
24
|
+
|
25
|
+
import deltacat as dc
|
26
|
+
from deltacat.utils.url import DeltaCatUrl
|
27
|
+
from deltacat.storage.model.locator import Locator
|
28
|
+
from deltacat.api import _copy_objects_in_order
|
29
|
+
|
30
|
+
|
31
|
+
logger = logging.getLogger(__name__)
|
32
|
+
|
33
|
+
|
34
|
+
def canonical_string_old(locator, separator: str = "|") -> str:
    """
    Build a locator's canonical string in the legacy (pre-migration) layout.

    The legacy layout prefixes the locator's own name parts with the
    hexdigest of its parent locator, when it has a parent with a non-empty
    digest. It is used to read name resolution directories written before
    the format change.

    Args:
        locator: Object exposing ``parent`` (``None``/falsy, or an object with
            ``hexdigest()``) and ``name.parts()``.
        separator: String placed between the joined parts.

    Returns:
        ``{parent_hexdigest}{separator}{name_parts...}`` when a parent digest
        exists, otherwise just the joined name parts.
    """
    digest = locator.parent.hexdigest() if locator.parent else None
    # A missing parent and an empty digest are treated the same: no prefix.
    prefix = [digest] if digest else []
    pieces = prefix + list(locator.name.parts())
    return separator.join(str(piece) for piece in pieces)
|
45
|
+
|
46
|
+
|
47
|
+
@contextlib.contextmanager
def patched_canonical_string(use_old_format: bool = True):
    """
    Temporarily swap ``Locator.canonical_string`` for the legacy version.

    The attribute is replaced on the ``Locator`` class itself, so the patch
    is visible to every user of that class for the duration of the ``with``
    block, not just the caller.

    Args:
        use_old_format: When True, install ``canonical_string_old`` for the
            duration of the block. When False, nothing is patched and the
            current implementation stays in place.
    """
    saved_impl = Locator.canonical_string

    try:
        if use_old_format:
            Locator.canonical_string = canonical_string_old
        yield
    finally:
        # Runs on normal exit and on exceptions alike, so the class never
        # stays patched after the block ends.
        Locator.canonical_string = saved_impl
|
69
|
+
|
70
|
+
|
71
|
+
def migrate_catalog(
    source_url: str, destination_url: str, dry_run: bool = False
) -> bool:
    """
    Copy a catalog from the old canonical string format to the new one.

    Objects are listed from the source while ``Locator.canonical_string`` is
    patched to the legacy implementation, then copied to the destination
    with the current implementation in effect.

    Args:
        source_url: Source catalog URL (e.g., 'dc://catalog_root/')
        destination_url: Destination catalog URL (e.g., 'dc://new_catalog_root/')
        dry_run: If True, only list the source objects and log what would
            be copied; nothing is written.

    Returns:
        True if the migration (or dry run) completed, False if any
        exception was raised along the way.
    """
    try:
        src_url = DeltaCatUrl(source_url)
        dst_url = DeltaCatUrl(destination_url)

        logger.info(f"Starting migration from {source_url} to {destination_url}")

        if not dry_run:
            # Step 1: discover everything through the legacy name mappings.
            logger.info(
                "Step 1: Reading all objects using old canonical string format..."
            )
            with patched_canonical_string(use_old_format=True):
                src_objects = dc.list(src_url, recursive=True)

            # NOTE(review): the listing may not be a sized container, hence
            # the __len__ probe before reporting a count.
            size_known = hasattr(src_objects, "__len__")
            if size_known:
                logger.info(f"Found {len(src_objects)} objects to migrate")
            else:
                logger.info("Found objects to migrate (count unknown)")

            # Step 2: write them back out under the current format.
            logger.info("Step 2: Writing objects using new canonical string format...")
            with patched_canonical_string(use_old_format=False):
                _copy_objects_in_order(src_objects, dst_url)

            logger.info("Migration completed successfully!")
            return True

        # Dry run: list through the legacy mappings, report, write nothing.
        logger.info("DRY RUN - No actual changes will be made")
        logger.info(
            "DRY RUN - Discovering objects using old canonical string format..."
        )
        with patched_canonical_string(use_old_format=True):
            src_objects = dc.list(src_url, recursive=True)

        size_known = hasattr(src_objects, "__len__")
        if size_known:
            logger.info(f"DRY RUN - Found {len(src_objects)} objects to migrate")
        else:
            logger.info("DRY RUN - Found objects to migrate (count unknown)")

        logger.info(
            "DRY RUN - Would copy objects using new canonical string format"
        )
        return True

    except Exception as e:
        # Report the failure through the return value rather than raising;
        # the traceback goes to stderr for diagnosis.
        logger.error(f"Migration failed: {e}")
        import traceback

        traceback.print_exc()
        return False
|
135
|
+
|
136
|
+
|
137
|
+
def main():
    """
    Command-line entry point for the canonical string backfill.

    Parses ``--catalog-root``, ``--destination``, ``--dry-run`` and
    ``--verbose``, configures logging, initializes DeltaCAT with the source
    catalog (and, unless this is a dry run, the destination catalog), then
    runs ``migrate_catalog``.

    Returns:
        Process exit status: 0 if the migration (or dry run) succeeded,
        1 if it failed or an exception was raised.
    """
    parser = argparse.ArgumentParser(
        description="Backfill locator-to-ID mappings for DeltaCAT canonical string changes"
    )
    parser.add_argument(
        "--catalog-root",
        required=True,
        help="Path to the source DeltaCAT catalog root directory",
    )
    parser.add_argument(
        "--destination",
        required=True,
        help="Path to the destination DeltaCAT catalog root directory",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be migrated without making changes",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable verbose logging. Writes logs to /tmp/deltacat/ by default.",
    )

    args = parser.parse_args()

    # Configure logging
    level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(level=level, format="%(asctime)s - %(levelname)s - %(message)s")

    # Initialize DeltaCAT with the source catalog
    catalog_config = {
        "local": {
            "root": args.catalog_root,
        }
    }
    dc.init(catalogs=catalog_config)

    try:
        # Source and destination are addressed by their root paths.
        source_url = f"dc://{args.catalog_root}/"
        dest_url = f"dc://{args.destination}/"

        if not args.dry_run:
            # Initialize destination catalog (skipped on dry runs, which never write).
            # NOTE(review): this is a second dc.init() call in the same process;
            # confirm it registers "dest" alongside "local" rather than being
            # ignored or replacing the first catalog.
            dest_config = {
                "dest": {
                    "root": args.destination,
                }
            }
            dc.init(catalogs=dest_config)

        success = migrate_catalog(source_url, dest_url, args.dry_run)

        # Exit status convention: 0 means success, non-zero means failure.
        # int(success) would report 1 on success and 0 on failure.
        return 0 if success else 1

    except Exception as e:
        logger.error(f"Migration failed: {e}")
        return 1
|
198
|
+
|
199
|
+
|
200
|
+
if __name__ == "__main__":
    # The bare `exit` name is injected by the `site` module for interactive
    # sessions and is missing under `python -S`; raising SystemExit directly
    # propagates main()'s status code without relying on it.
    raise SystemExit(main())
|
File without changes
|
File without changes
|
@@ -0,0 +1,173 @@
|
|
1
|
+
"""
|
2
|
+
DeltaCAT Job-based Managed I/O for Apache Beam
|
3
|
+
|
4
|
+
This module provides a job-based implementation of the DeltaCAT table monitor
|
5
|
+
that uses Ray jobs for better scalability and resource management instead of
|
6
|
+
threading.
|
7
|
+
|
8
|
+
Key Features:
|
9
|
+
- Uses DeltaCAT jobs for table monitoring
|
10
|
+
- Unique job IDs prevent duplicate monitoring jobs
|
11
|
+
- Supports both local and remote Ray clusters
|
12
|
+
- Backward compatible with existing managed.py interface
|
13
|
+
"""
|
14
|
+
|
15
|
+
import logging
|
16
|
+
from typing import Dict, Any
|
17
|
+
|
18
|
+
import apache_beam as beam
|
19
|
+
from pyiceberg.catalog import CatalogType
|
20
|
+
|
21
|
+
from deltacat.experimental.converter_agent.table_monitor import submit_table_monitor_job
|
22
|
+
from deltacat.compute.converter.constants import DEFAULT_CONVERTER_TASK_MAX_PARALLELISM
|
23
|
+
import deltacat.logs as logs
|
24
|
+
|
25
|
+
# Initialize DeltaCAT logger
|
26
|
+
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
27
|
+
|
28
|
+
# Store original functions before monkey-patching
|
29
|
+
_original_write = beam.managed.Write
|
30
|
+
|
31
|
+
|
32
|
+
# Map Java Iceberg catalog implementation class names to pyiceberg CatalogType.
# Keys are written entirely in lowercase: write() lowercases the configured
# "catalog-impl" value before looking it up here, so matching is
# case-insensitive and any new key must be lowercase as well.
JAVA_ICEBERG_CATALOG_IMPL_TO_TYPE = {
    "org.apache.iceberg.rest.restcatalog": CatalogType.REST,
    "org.apache.iceberg.hive.hivecatalog": CatalogType.HIVE,
    "org.apache.iceberg.aws.glue.gluecatalog": CatalogType.GLUE,
    "org.apache.iceberg.jdbc.jdbccatalog": CatalogType.SQL,
}
|
39
|
+
|
40
|
+
|
41
|
+
def _extract_catalog_config_from_beam(config: Dict[str, Any]) -> Dict[str, Any]:
|
42
|
+
"""Extract catalog configuration from Beam config."""
|
43
|
+
catalog_properties = config.get("catalog_properties", {})
|
44
|
+
|
45
|
+
# Extract catalog implementation class
|
46
|
+
catalog_impl = catalog_properties.get("catalog-impl")
|
47
|
+
|
48
|
+
# Extract catalog type
|
49
|
+
catalog_type = catalog_properties.get("type")
|
50
|
+
|
51
|
+
# Extract other relevant properties
|
52
|
+
warehouse = catalog_properties.get("warehouse", "")
|
53
|
+
uri = catalog_properties.get("uri", "")
|
54
|
+
|
55
|
+
return {
|
56
|
+
"catalog_impl": catalog_impl,
|
57
|
+
"type": catalog_type,
|
58
|
+
"warehouse": warehouse,
|
59
|
+
"uri": uri,
|
60
|
+
"catalog_properties": catalog_properties,
|
61
|
+
}
|
62
|
+
|
63
|
+
|
64
|
+
def write(*args, **kwargs):
    """
    Wrap beam.managed.Write and submit a DeltaCAT table monitor job first.

    DeltaCAT-specific settings are read from the
    ``"deltacat_converter_properties"`` entry of the ``config`` keyword
    argument. That entry is removed from the copy of the config that is
    forwarded to Beam. Only a ``config`` passed by keyword is inspected.

    Recognized converter properties (with defaults):
        deltacat_converter_interval (3.0), merge_keys (None),
        filesystem (None), cluster_cfg_file_path (None),
        max_converter_parallelism (DEFAULT_CONVERTER_TASK_MAX_PARALLELISM),
        ray_inactivity_timeout (10).

    Raises:
        ValueError: If the config has no ``"table"``, names an unsupported
            ``catalog-impl``, or provides neither ``catalog-impl`` nor
            ``type`` in its catalog properties.

    Returns:
        Whatever the original ``beam.managed.Write`` returns for the given
        arguments. A failure to submit the monitor job is logged and does
        not prevent the write.
    """
    logger.debug(f"Starting DeltaCAT write operation")
    logger.debug(f"args: {args}")
    logger.debug(f"kwargs keys: {list(kwargs.keys()) if kwargs else 'None'}")

    # Shallow-copy the caller's config so popping the DeltaCAT-only key below
    # does not modify the dict the caller passed in.
    config = kwargs.get("config", {}).copy() if kwargs.get("config") else {}

    # All DeltaCAT settings live under this single nested key; it is stripped
    # from the config that Beam eventually receives.
    deltacat_converter_properties = config.pop("deltacat_converter_properties", {})
    read_prop = deltacat_converter_properties.get

    deltacat_converter_interval = read_prop("deltacat_converter_interval", 3.0)
    merge_keys = read_prop("merge_keys")
    # Optional filesystem; read from the converter properties only.
    filesystem = read_prop("filesystem", None)
    # Cluster configuration file path (for remote jobs); None means local.
    cluster_cfg_file_path = read_prop("cluster_cfg_file_path", None)
    max_converter_parallelism = read_prop(
        "max_converter_parallelism",
        DEFAULT_CONVERTER_TASK_MAX_PARALLELISM,
    )
    ray_inactivity_timeout = read_prop("ray_inactivity_timeout", 10)

    # Table identifier: "<namespace>.<table>" or a bare table name.
    table_identifier = config.get("table")
    if not table_identifier:
        raise ValueError("Table is required")

    if "." in table_identifier:
        # Split on the first dot only; the remainder is the table name.
        namespace, table_name = table_identifier.split(".", 1)
    else:
        namespace, table_name = "default", table_identifier

    warehouse_path = config.get("catalog_properties", {}).get("warehouse", "")

    # Catalog configuration for the monitor job.
    beam_catalog_config = _extract_catalog_config_from_beam(config)

    # Resolve the CatalogType: an explicit Java "catalog-impl" takes
    # precedence over the "type" property.
    catalog_impl = beam_catalog_config.get("catalog_impl")
    if catalog_impl:
        catalog_type = JAVA_ICEBERG_CATALOG_IMPL_TO_TYPE.get(catalog_impl.lower())
        if not catalog_type:
            raise ValueError(f"Unsupported catalog implementation: {catalog_impl}")
    else:
        catalog_type_str = beam_catalog_config.get("type")
        if not catalog_type_str:
            raise ValueError(
                f"No catalog implementation or type found in config: {beam_catalog_config}"
            )
        catalog_type = CatalogType(catalog_type_str.lower())

    # Forward the stripped config to Beam (only when the caller supplied one).
    if "config" in kwargs:
        kwargs["config"] = config

    logger.debug(f"Preparing to submit table monitor job...")
    logger.debug(f"table_name: {table_name}")
    logger.debug(f"deltacat_converter_interval: {deltacat_converter_interval}s")
    logger.debug(f"merge_keys: {merge_keys}")
    logger.debug(f"warehouse_path: {warehouse_path}")
    logger.debug(
        f"filesystem: {type(filesystem).__name__ if filesystem else 'None (auto-resolve)'}"
    )
    logger.debug(f"cluster_cfg_file_path: {cluster_cfg_file_path or 'None (local)'}")
    logger.debug(f"max_converter_parallelism: {max_converter_parallelism}")
    logger.debug(f"ray_inactivity_timeout: {ray_inactivity_timeout}s")
    logger.debug(
        f"using deltacat_converter_properties: {len(deltacat_converter_properties) > 0}"
    )
    logger.debug(f"catalog_type: {catalog_type}")

    monitor_job_args = {
        "warehouse_path": warehouse_path,
        "catalog_type": catalog_type,
        "catalog_uri": beam_catalog_config.get("uri"),
        "namespace": namespace,
        "table_name": table_name,
        "merge_keys": merge_keys,
        "monitor_interval": deltacat_converter_interval,
        "filesystem": filesystem,
        "cluster_cfg_file_path": cluster_cfg_file_path,
        "max_converter_parallelism": max_converter_parallelism,
        "ray_inactivity_timeout": ray_inactivity_timeout,
    }

    # Submit monitoring job
    try:
        submit_table_monitor_job(**monitor_job_args)
    except Exception as e:
        # Best effort: a failed submission must not fail the write itself.
        logger.error(f"Failed to submit table monitor job: {e}")
        logger.error(f"Exception traceback:", exc_info=True)

    logger.info(f"Delegating to beam.managed.Write")
    return _original_write(*args, **kwargs)
|