deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +96 -17
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2435 -279
- deltacat/catalog/model/catalog.py +154 -77
- deltacat/catalog/model/properties.py +63 -22
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +25 -12
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +45 -2
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +1 -3
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg ā experimental/iceberg}/iceberg_bucket_writer.py +3 -5
- deltacat/examples/{iceberg ā experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/indexer/indexer.py +2 -2
- deltacat/examples/indexer/job_runner.py +1 -2
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog ā experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog ā experimental/catalog}/iceberg/impl.py +29 -11
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage ā experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage ā experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/{storage ā experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage ā experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage ā experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage ā experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage ā experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/{storage ā experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage ā experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage ā experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage ā experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage ā experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage ā experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage ā experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage ā experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage ā experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage ā experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage ā experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage ā experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage ā experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage ā experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage ā experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage ā experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage ā experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage ā experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage ā experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage ā experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage ā experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/io/reader/deltacat_read_api.py +1 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py ā conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage ā experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage ā experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage ā experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage ā experimental/storage}/rivulet/test_dataset.py +5 -3
- deltacat/tests/{storage ā experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage ā experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage ā experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage ā experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage ā experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage ā experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +224 -14
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +823 -36
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +57 -16
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- deltacat-2.0.0b12.dist-info/RECORD +439 -0
- {deltacat-2.0.0b10.dist-info ā deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/daft/daft_scan.py +0 -115
- deltacat/daft/model.py +0 -258
- deltacat/daft/translator.py +0 -126
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b10.dist-info/METADATA +0 -68
- deltacat-2.0.0b10.dist-info/RECORD +0 -381
- /deltacat/{compute/merge_on_read/model ā docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils ā docs/autogen}/__init__.py +0 -0
- /deltacat/{daft ā docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{examples/common ā docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/examples/{iceberg ā compactor}/__init__.py +0 -0
- /deltacat/{storage/iceberg ā examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow ā examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs ā examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore ā examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader ā experimental/catalog}/__init__.py +0 -0
- /deltacat/{catalog ā experimental/catalog}/iceberg/overrides.py +0 -0
- /deltacat/{storage/rivulet/schema ā experimental/compatibility}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer ā experimental/converter_agent}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet ā experimental/converter_agent/beam}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs ā experimental/storage}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema ā experimental/storage/iceberg}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer ā experimental/storage/rivulet/arrow}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py ā experimental/storage/rivulet/fs/__init__.py} +0 -0
- /deltacat/{storage ā experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage ā experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage ā experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage ā experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage ā experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage ā experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.0b10.dist-info ā deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b10.dist-info ā deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,473 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
DeltaCAT Catalog Explorer
|
4
|
+
|
5
|
+
Discover candidate streams and partitions for compaction.
|
6
|
+
It provides an easy way to explore catalog contents and generates example compaction commands.
|
7
|
+
|
8
|
+
Usage:
|
9
|
+
# Explore default test catalog (from bootstrap.py)
|
10
|
+
python explorer.py
|
11
|
+
|
12
|
+
# Explore with custom catalog root
|
13
|
+
python explorer.py --catalog-root /path/to/catalog
|
14
|
+
|
15
|
+
# Explore specific URL
|
16
|
+
python explorer.py --url "dc://my_catalog/my_namespace"
|
17
|
+
|
18
|
+
# Non-recursive listing
|
19
|
+
python explorer.py --no-recursive
|
20
|
+
|
21
|
+
Examples:
|
22
|
+
# After running bootstrap.py
|
23
|
+
python bootstrap.py --catalog-root /tmp/deltacat_test
|
24
|
+
python explorer.py --catalog-root /tmp/deltacat_test
|
25
|
+
|
26
|
+
# Explore and find compaction candidates
|
27
|
+
python explorer.py --catalog-root /tmp/deltacat_test --show-compaction-candidates
|
28
|
+
"""
|
29
|
+
|
30
|
+
import argparse
|
31
|
+
import os
|
32
|
+
import sys
|
33
|
+
from typing import List, Tuple
|
34
|
+
|
35
|
+
import deltacat as dc
|
36
|
+
from deltacat import DeltaCatUrl
|
37
|
+
from deltacat.storage.model.namespace import Namespace
|
38
|
+
from deltacat.storage.model.table import Table
|
39
|
+
from deltacat.storage.model.table_version import TableVersion
|
40
|
+
from deltacat.storage.model.stream import Stream
|
41
|
+
from deltacat.storage.model.partition import Partition
|
42
|
+
from deltacat.storage.model.delta import Delta
|
43
|
+
|
44
|
+
# Import common utilities
|
45
|
+
from deltacat.examples.compactor.utils.common import (
|
46
|
+
get_default_catalog_root,
|
47
|
+
initialize_deltacat_url_catalog,
|
48
|
+
format_partition_values_for_command,
|
49
|
+
get_max_stream_position_from_partition,
|
50
|
+
get_bootstrap_destination_info,
|
51
|
+
)
|
52
|
+
|
53
|
+
|
54
|
+
def setup_catalog(
    catalog_root: str, catalog_name: str = "compactor_test_catalog"
) -> DeltaCatUrl:
    """Announce the catalog settings, then initialize and register the catalog.

    Args:
        catalog_root: Root location of the catalog to open.
        catalog_name: Name under which the catalog is registered.

    Returns:
        The value returned by ``initialize_deltacat_url_catalog`` for the
        given root and name.
    """
    banner = (
        "š§ Initializing catalog...",
        f" Catalog root: {catalog_root}",
        f" Catalog name: {catalog_name}",
    )
    for text in banner:
        print(text)

    return initialize_deltacat_url_catalog(catalog_root, catalog_name)
|
63
|
+
|
64
|
+
|
65
|
+
def find_compaction_candidates(
    all_objects: List,
) -> List[Tuple[Partition, Stream, TableVersion, Table, Namespace]]:
    """Select the partitions in a catalog listing that can be compacted.

    A partition qualifies when its namespace, table, table version and stream
    are all present in ``all_objects``, the stream state equals "committed",
    and at least one delta in the listing belongs to the partition.

    Args:
        all_objects: Flat listing of catalog objects (namespaces, tables,
            table versions, streams, partitions and deltas).

    Returns:
        One ``(partition, stream, table_version, table, namespace)`` tuple per
        qualifying partition, in the order the partitions appear in the
        listing.
    """

    def partition_path(item):
        # Deltas and partitions are matched on the same five attributes.
        return (
            item.namespace,
            item.table_name,
            item.table_version,
            item.stream_id,
            item.partition_id,
        )

    # Index each kind of parent object by its qualified name; later
    # duplicates overwrite earlier ones.
    namespace_by_name = {}
    table_by_path = {}
    version_by_path = {}
    stream_by_path = {}
    partition_list = []
    delta_list = []
    for item in all_objects:
        if isinstance(item, Namespace):
            namespace_by_name[item.namespace] = item
        if isinstance(item, Table):
            table_by_path[(item.namespace, item.table_name)] = item
        if isinstance(item, TableVersion):
            version_by_path[
                (item.namespace, item.table_name, item.table_version)
            ] = item
        if isinstance(item, Stream):
            stream_by_path[
                (item.namespace, item.table_name, item.table_version, item.stream_id)
            ] = item
        if isinstance(item, Partition):
            partition_list.append(item)
        if isinstance(item, Delta):
            delta_list.append(item)

    # Bucket deltas under the partition they belong to.
    deltas_by_path = {}
    for entry in delta_list:
        deltas_by_path.setdefault(partition_path(entry), []).append(entry)

    found = []
    for part in partition_list:
        owner_namespace = namespace_by_name.get(part.namespace)
        owner_table = table_by_path.get((part.namespace, part.table_name))
        owner_version = version_by_path.get(
            (part.namespace, part.table_name, part.table_version)
        )
        owner_stream = stream_by_path.get(
            (part.namespace, part.table_name, part.table_version, part.stream_id)
        )
        related_deltas = deltas_by_path.get(partition_path(part), [])

        # Skip partitions whose parent chain is incomplete in this listing.
        if not all((owner_namespace, owner_table, owner_version, owner_stream)):
            continue

        # Only committed streams with at least one delta are candidates.
        if owner_stream.state == "committed" and related_deltas:
            found.append(
                (part, owner_stream, owner_version, owner_table, owner_namespace)
            )

    return found
|
136
|
+
|
137
|
+
|
138
|
+
def generate_compaction_command(
    partition: Partition,
    stream: Stream,
    table_version: TableVersion,
    table: Table,
    namespace: Namespace,
    catalog_root: str,
) -> str:
    """Build an example ``compactor.py`` invocation for one partition.

    Args:
        partition: Partition to compact; its partition values feed both the
            source and destination partition arguments.
        stream: Stream owning the partition (not read by this function).
        table_version: Source table version.
        table: Source table.
        namespace: Source namespace.
        catalog_root: Catalog root, used to look up the last stream position
            and echoed into the command.

    Returns:
        A multi-line shell command string with backslash line continuations.
    """
    source_namespace = namespace.namespace
    source_table = table.table_name
    source_version = table_version.table_version

    # Partition values rendered for the command line.
    partition_values = format_partition_values_for_command(partition.partition_values)

    dest_namespace, dest_table_name = get_bootstrap_destination_info(
        source_namespace, source_table
    )

    # The stream position is looked up for generic tables too.
    from deltacat.catalog.model.properties import CatalogProperties

    catalog = CatalogProperties(root=catalog_root)

    if partition.partition_values:
        partition_values_list = list(partition.partition_values)
    else:
        partition_values_list = []

    max_stream_position = get_max_stream_position_from_partition(
        source_namespace,
        source_table,
        source_version,
        partition_values_list,
        catalog,
    )

    return f"""python compactor.py \\
--namespace '{source_namespace}' \\
--table-name '{source_table}' \\
--table-version '{source_version}' \\
--partition-values '{partition_values}' \\
--dest-namespace '{dest_namespace}' \\
--dest-table-name '{dest_table_name}' \\
--dest-table-version '1' \\
--dest-partition-values '{partition_values}' \\
--last-stream-position {max_stream_position} \\
--primary-keys 'id' \\
--compactor-version 'V2' \\
--hash-bucket-count 1 \\
--catalog-root '{catalog_root}'"""
|
187
|
+
|
188
|
+
|
189
|
+
def print_catalog_summary(all_objects: List) -> None:
    """Print how many objects of each catalog type the listing contains.

    Args:
        all_objects: Flat listing of catalog objects; anything that is not a
            namespace, table, table version, stream, partition or delta only
            contributes to the total.
    """

    def tally(matches) -> int:
        # Count the entries accepted by the given predicate.
        return sum(1 for entry in all_objects if matches(entry))

    # Compute every count up front, then print them in a fixed order.
    namespace_total = tally(lambda entry: isinstance(entry, Namespace))
    table_total = tally(lambda entry: isinstance(entry, Table))
    version_total = tally(lambda entry: isinstance(entry, TableVersion))
    stream_total = tally(lambda entry: isinstance(entry, Stream))
    partition_total = tally(lambda entry: isinstance(entry, Partition))
    delta_total = tally(lambda entry: isinstance(entry, Delta))

    print("\nš Catalog Summary:")
    print(f" Namespaces: {namespace_total}")
    print(f" Tables: {table_total}")
    print(f" Table Versions: {version_total}")
    print(f" Streams: {stream_total}")
    print(f" Partitions: {partition_total}")
    print(f" Deltas: {delta_total}")
    print(f" Total Objects: {len(all_objects)}")
|
206
|
+
|
207
|
+
|
208
|
+
def print_detailed_listing(all_objects: List) -> None:
    """Print every catalog object in listing order, with deltas under their partition.

    Namespaces, tables, table versions, streams and partitions are printed as
    they are encountered in ``all_objects``. Deltas are not printed at their
    own position; they are grouped by owning partition and printed beneath
    that partition, sorted by stream position.

    Args:
        all_objects: Flat listing of catalog objects.
    """
    print(f"\nš Detailed Catalog Listing:")

    # Group deltas by the partition they belong to so each partition can show
    # its own deltas.
    deltas_by_partition = {}
    for delta in (d for d in all_objects if isinstance(d, Delta)):
        partition_key = (
            delta.namespace,
            delta.table_name,
            delta.table_version,
            delta.stream_id,
            delta.partition_id,
        )
        deltas_by_partition.setdefault(partition_key, []).append(delta)

    for obj in all_objects:
        if isinstance(obj, Namespace):
            print(f"š Namespace: {obj.namespace}")
        elif isinstance(obj, Table):
            print(f" š Table: {obj.table_name}")
        elif isinstance(obj, TableVersion):
            print(f" š Table Version: {obj.table_version} (state: {obj.state})")
        elif isinstance(obj, Stream):
            print(f" š Stream: {obj.stream_id}")
            print(f" Format: {obj.stream_format}")
            print(f" State: {obj.state}")
        elif isinstance(obj, Partition):
            print(f" š¦ Partition: {obj.partition_id}")
            if obj.partition_values:
                print(f" Values: {obj.partition_values}")

            # Show the deltas recorded for this partition, oldest position first.
            partition_key = (
                obj.namespace,
                obj.table_name,
                obj.table_version,
                obj.stream_id,
                obj.partition_id,
            )
            partition_deltas = deltas_by_partition.get(partition_key, [])
            if partition_deltas:
                for delta in sorted(
                    partition_deltas, key=lambda d: d.stream_position
                ):
                    print(f" š Delta at position: {delta.stream_position}")
            else:
                print(f" ā ļø No deltas found")
|
265
|
+
|
266
|
+
|
267
|
+
def _collect_partition_delta_stats(
    partition, stream, table_version, table, namespace, catalog_root
) -> Tuple:
    """Best-effort lookup of delta statistics for one candidate partition.

    Returns:
        ``(delta_count, max_stream_position, total_records, error)``. On
        success ``error`` is ``None``. If anything in the lookup raises, the
        three statistics are the string "unknown" and ``error`` is the
        exception that was caught.
    """
    try:
        from deltacat.storage import metastore
        from deltacat.catalog.model.properties import CatalogProperties

        catalog = CatalogProperties(root=catalog_root)

        # Hand-built partition locator identifying the candidate partition.
        partition_locator = {
            "streamLocator": {
                "tableVersionLocator": {
                    "tableLocator": {
                        "namespaceLocator": {"namespace": namespace.namespace},
                        "tableName": table.table_name,
                    },
                    "tableVersion": table_version.table_version,
                },
                "streamId": stream.stream_id,
                "format": "deltacat",
            },
            "partitionValues": None,
            "partitionId": partition.partition_id,
        }

        # The listing call is given a throwaway object whose only attribute
        # is ``locator``.
        # NOTE(review): presumably list_partition_deltas only reads
        # ``.locator`` from partition_like; confirm a plain dict locator is
        # accepted.
        partition_deltas = metastore.list_partition_deltas(
            partition_like=type("obj", (object,), {"locator": partition_locator})(),
            include_manifest=True,
            catalog=catalog,
        )

        delta_list = partition_deltas.all_items()
        delta_count = len(delta_list)
        max_stream_position = (
            max(delta.stream_position for delta in delta_list) if delta_list else 0
        )
        total_records = sum(
            delta.meta.record_count if delta.meta else 0 for delta in delta_list
        )
    except Exception as exc:
        # Deliberately broad: the statistics are optional extras, so any
        # failure degrades to "unknown" instead of aborting the report.
        return "unknown", "unknown", "unknown", exc

    return delta_count, max_stream_position, total_records, None


def print_compaction_candidates(candidates: List, catalog_root: str) -> None:
    """Print each compaction candidate, plus an example command for the first.

    Args:
        candidates: ``(partition, stream, table_version, table, namespace)``
            tuples, as produced by ``find_compaction_candidates``.
        catalog_root: Catalog root used to look up delta statistics and
            echoed into the example command.

    Delta statistics are best-effort: if they cannot be read, "unknown" is
    shown together with the reason, and the report continues.
    """
    if not candidates:
        print(f"\nā ļø No compaction candidates found.")
        print(
            f"š” Tip: Compaction candidates are partitions with committed streams and deltas."
        )
        print(f" Tables need multiple deltas to benefit from compaction.")
        return

    print(f"\nšÆ Compaction Candidates:")
    print(f" Found {len(candidates)} partition(s) ready for compaction")

    for i, (partition, stream, table_version, table, namespace) in enumerate(
        candidates, 1
    ):
        (
            delta_count,
            max_stream_position,
            total_records,
            error,
        ) = _collect_partition_delta_stats(
            partition, stream, table_version, table, namespace, catalog_root
        )

        print(f"\nš¦ Candidate {i}:")
        print(f" Namespace: {namespace.namespace}")
        print(f" Table: {table.table_name}")
        print(f" Table Version: {table_version.table_version}")
        print(f" Stream: {stream.stream_id}")
        print(f" Partition: {partition.partition_id}")
        print(f" Stream State: {stream.state}")
        print(f" Deltas: {delta_count}")
        if error is not None:
            # Report why the statistics are missing instead of hiding it.
            print(f" Delta stats unavailable: {error!r}")
        elif delta_count > 0:
            print(f" Total Records: {total_records}")
            print(f" Max Stream Position: {max_stream_position}")
            if delta_count > 1:
                print(
                    f" šÆ Good candidate: Multiple deltas available for compaction"
                )
            else:
                print(f" ā ļø Single delta: Limited compaction benefit")

        if i == 1:  # Show command for first candidate only
            command = generate_compaction_command(
                partition, stream, table_version, table, namespace, catalog_root
            )
            print(f"\nš Compaction command for candidate {i}:")
            print(f" cd deltacat/examples/compactor")
            for line in command.split("\n"):
                if line.strip():
                    print(f" {line}")
|
357
|
+
|
358
|
+
|
359
|
+
def main():
    """Run the catalog explorer command-line interface.

    Parses the command line, registers the catalog rooted at
    ``--catalog-root`` through ``setup_catalog``, lists the objects under
    either the whole catalog or the ``--url`` that was given, and prints a
    summary followed by a detailed listing. When
    ``--show-compaction-candidates`` is set, the compaction candidates found
    in that listing are printed as well.

    Returns:
        int: ``0`` on success (an empty listing also counts as success),
        ``1`` when the catalog root is not an existing directory or when
        exploring the catalog raised an exception.
    """
    parser = argparse.ArgumentParser(
        description="Explore DeltaCAT catalog contents and find compaction candidates",
        # Keep the hand-formatted examples in the epilog exactly as written.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Explore default test catalog (after running bootstrap.py)
  python explorer.py --catalog-root /tmp/deltacat_test

  # Explore specific URL
  python explorer.py --url "dc://my_catalog/my_namespace"

  # Show compaction candidates with example commands
  python explorer.py --catalog-root /tmp/deltacat_test --show-compaction-candidates

  # Non-recursive listing (step by step)
  python explorer.py --catalog-root /tmp/deltacat_test --no-recursive
""",
    )

    # argparse %-formats help strings, so the default is referenced with
    # %(default)s instead of being interpolated: a literal "%" in the default
    # path would otherwise break --help.
    parser.add_argument(
        "--catalog-root",
        type=str,
        default=get_default_catalog_root(),
        help="Root directory for the DeltaCAT catalog (default: %(default)s, same as bootstrap.py)",
    )
    parser.add_argument(
        "--url",
        type=str,
        help="Specific DeltaCAT URL to explore (e.g., 'dc://catalog/namespace'). If not provided, uses the full catalog.",
    )
    parser.add_argument(
        "--no-recursive",
        action="store_true",
        help="Disable recursive listing (only list top-level objects)",
    )
    parser.add_argument(
        "--show-compaction-candidates",
        action="store_true",
        help="Show partitions that are candidates for compaction with example commands",
    )
    parser.add_argument(
        "--catalog-name",
        type=str,
        default="compactor_test_catalog",
        help="Name to register the catalog under (default: %(default)s)",
    )

    args = parser.parse_args()

    # The catalog root has to be a directory; a regular file at that path is
    # rejected here rather than failing later with a less helpful error.
    if not os.path.isdir(args.catalog_root):
        print(f"❌ Error: Catalog root directory does not exist: {args.catalog_root}")
        print("💡 Tip: Run bootstrap.py first to create test data:")
        print(f" python bootstrap.py --catalog-root {args.catalog_root}")
        return 1

    print("🔍 DeltaCAT Catalog Explorer")
    print("=" * 50)

    try:
        # Register the catalog and obtain the URL that addresses all of it.
        catalog_url = setup_catalog(args.catalog_root, args.catalog_name)

        # An explicit --url narrows the exploration; otherwise list everything.
        if args.url:
            explore_url = DeltaCatUrl(args.url)
            print(f"🎯 Exploring specific URL: {args.url}")
        else:
            explore_url = catalog_url
            print(f"🎯 Exploring full catalog: {catalog_url.url}")

        recursive = not args.no_recursive
        print(f"📋 Listing mode: {'Recursive' if recursive else 'Non-recursive'}")

        all_objects = dc.list(explore_url, recursive=recursive)

        # An empty catalog is not an error: point the user at bootstrap.py.
        if not all_objects:
            print("\n⚠️ No objects found in catalog.")
            print("💡 Tip: Run bootstrap.py to create test data:")
            print(f" python bootstrap.py --catalog-root {args.catalog_root}")
            return 0

        print_catalog_summary(all_objects)
        print_detailed_listing(all_objects)

        if args.show_compaction_candidates:
            candidates = find_compaction_candidates(all_objects)
            print_compaction_candidates(candidates, args.catalog_root)

        print("\n✅ Catalog exploration completed!")

    except Exception as e:
        # Top-level boundary of the script: report the failure together with
        # the full traceback and signal it through the exit code.
        print(f"\n❌ Error exploring catalog: {e}")
        import traceback

        traceback.print_exc()
        return 1

    return 0
|
470
|
+
|
471
|
+
|
472
|
+
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())
|
@@ -0,0 +1 @@
|
|
1
|
+
# DeltaCAT Compactor GCP Examples
|