deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +117 -18
- deltacat/api.py +536 -126
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +1 -19
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2444 -282
- deltacat/catalog/model/catalog.py +208 -113
- deltacat/catalog/model/properties.py +63 -24
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +207 -52
- deltacat/compute/converter/model/convert_input.py +43 -16
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +64 -19
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +193 -65
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +230 -75
- deltacat/compute/converter/steps/dedupe.py +46 -12
- deltacat/compute/converter/utils/convert_task_options.py +66 -22
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +13 -8
- deltacat/compute/converter/utils/io.py +173 -13
- deltacat/compute/converter/utils/s3u.py +42 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +49 -6
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +6 -6
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
- deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +22 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +231 -6
- deltacat/storage/model/metafile.py +224 -119
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +53 -29
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +697 -349
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1036 -11
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +345 -37
- deltacat/types/tables.py +2344 -46
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +139 -9
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1373 -192
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.dist-info/METADATA +0 -65
- deltacat-2.0.dist-info/RECORD +0 -347
- /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
- /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/examples/compactor/bootstrap.py
@@ -0,0 +1,863 @@
#!/usr/bin/env python3
"""
DeltaCAT Compactor Bootstrap Script

This script creates test data suitable for compaction testing by:
1. Creating source and destination namespaces and tables with schema
2. Writing 2 test parquet files as separate deltas to the source table
3. Staging and committing all necessary deltacat metadata (table version, stream, partition, deltas)
4. Running compaction using the direct API (not the CLI script)

Usage:
    # Use default catalog location
    python bootstrap.py

    # Use custom catalog location
    python bootstrap.py --catalog-root /path/to/catalog

    # Automatically run compaction after bootstrapping
    python bootstrap.py --run-compaction yes

    # Automatically run compaction against an S3 catalog (bucket must exist)
    python bootstrap.py --run-compaction yes --catalog-root s3://bucket/key

The script creates:
- A source namespace "compactor_test_source"
- A destination namespace "compactor_test_dest"
- Source table "events" with columns: id, timestamp, user_id, event_type, data
- Destination table "events_compacted"
- 2 parquet files with overlapping data (suitable for compaction)
- All necessary deltacat metadata (table version, stream, partition, deltas)
- Working end-to-end compaction demonstration
"""

import argparse
import sys

import pandas as pd

from deltacat.catalog import write_to_table, get_table, create_table
from deltacat.types.media import ContentType
from deltacat.storage import metastore
from deltacat.types.tables import TableWriteMode

# Import common utilities
from deltacat.examples.compactor.utils.common import (
    get_default_catalog_root,
    initialize_catalog,
)


def create_test_data_batch_1() -> pd.DataFrame:
    """Create the first batch of test data with some overlapping IDs."""
    return pd.DataFrame(
        {
            "id": [1, 2, 3, 4, 5],
            "timestamp": pd.to_datetime(
                [
                    "2024-01-01 10:00:00",
                    "2024-01-01 10:05:00",
                    "2024-01-01 10:10:00",
                    "2024-01-01 10:15:00",
                    "2024-01-01 10:20:00",
                ]
            ),
            "user_id": [101, 102, 103, 104, 105],
            "event_type": ["login", "view", "click", "purchase", "logout"],
            "data": [
                '{"page": "home"}',
                '{"product_id": 123}',
                '{"button": "add_to_cart"}',
                '{"amount": 99.99}',
                '{"session_duration": 1200}',
            ],
        }
    )


def create_test_data_batch_2() -> pd.DataFrame:
    """Create the second batch of test data with some overlapping IDs (good for compaction)."""
    return pd.DataFrame(
        {
            "id": [3, 4, 5, 6, 7, 8],  # IDs 3, 4, 5 overlap with batch 1
            "timestamp": pd.to_datetime(
                [
                    "2024-01-01 11:00:00",  # Later timestamp for ID 3 (should replace)
                    "2024-01-01 11:05:00",  # Later timestamp for ID 4 (should replace)
                    "2024-01-01 11:10:00",  # Later timestamp for ID 5 (should replace)
                    "2024-01-01 11:15:00",  # New ID 6
                    "2024-01-01 11:20:00",  # New ID 7
                    "2024-01-01 11:25:00",  # New ID 8
                ]
            ),
            "user_id": [103, 104, 105, 106, 107, 108],
            "event_type": ["view", "click", "purchase", "login", "view", "logout"],
            "data": [
                '{"page": "product", "updated": true}',  # Updated data for ID 3
                '{"button": "buy_now", "updated": true}',  # Updated data for ID 4
                '{"amount": 149.99, "updated": true}',  # Updated data for ID 5
                '{"page": "signup"}',  # New data for ID 6
                '{"product_id": 456}',  # New data for ID 7
                '{"session_duration": 800}',  # New data for ID 8
            ],
        }
    )


def setup_test_namespace_and_table_simple(catalog_root: str) -> tuple:
    """Set up test namespaces and tables using lower-level metastore API to ensure separate deltas."""
    catalog = initialize_catalog(catalog_root)

    print("Setting up test namespaces and tables using metastore API...")

    source_namespace = "compactor_test_source"
    dest_namespace = "compactor_test_dest"
    table_name = "events"

    # Note: metastore API will automatically create namespaces as needed

    # Create test data batches
    print("Creating test data batches...")
    batch_1 = create_test_data_batch_1()
    batch_2 = create_test_data_batch_2()

    print(f"Batch 1 shape: {batch_1.shape}")
    print(f"Batch 1 data:\n{batch_1}")
    print(f"\nBatch 2 shape: {batch_2.shape}")
    print(f"Batch 2 data:\n{batch_2}")

    # Create/replace source table using write_to_table for the first batch (idempotent)
    print(
        f"\nCreating/replacing SOURCE table {source_namespace}.{table_name} with first batch..."
    )

    # Check if table exists to determine the appropriate mode
    try:
        existing_table = get_table(
            name=table_name, namespace=source_namespace, catalog="default"
        )
        table_mode = TableWriteMode.REPLACE if existing_table else TableWriteMode.CREATE
        action = "Replacing" if existing_table else "Creating"
    except Exception:
        table_mode = TableWriteMode.CREATE
        action = "Creating"

    print(f"{action} source table with first batch...")
    write_to_table(
        data=batch_1,
        table=table_name,
        namespace=source_namespace,
        mode=table_mode,
        content_type=ContentType.PARQUET,
        catalog="default",
    )
    print(f"✅ {action.replace('ing', 'ed')} source table and wrote first delta")

    # Add second batch using write_to_table with APPEND mode
    print(f"Adding second batch to SOURCE table using write_to_table APPEND mode...")
    write_to_table(
        data=batch_2,
        table=table_name,
        namespace=source_namespace,
        mode=TableWriteMode.APPEND,  # Use APPEND for second batch
        content_type=ContentType.PARQUET,
        catalog="default",
    )
    print(f"✅ Added second delta to source table")

    # Get the table definition and partition
    source_table_def = get_table(
        name=table_name, namespace=source_namespace, catalog="default"
    )

    source_partition = metastore.get_partition(
        stream_locator=source_table_def.stream.locator,
        partition_values=None,
        catalog=catalog,
    )

    # Verify we now have 2 deltas
    partition_deltas = metastore.list_partition_deltas(
        partition_like=source_partition,
        include_manifest=True,
        catalog=catalog,
    )
    delta_list = partition_deltas.all_items()
    print(f"📊 Total deltas in source table: {len(delta_list)}")

    # Create/replace empty destination table with same schema as source (idempotent)
    print(
        f"\nCreating/replacing empty DESTINATION table {dest_namespace}.{table_name}_compacted..."
    )

    dest_table_def = create_table(
        name=f"{table_name}_compacted",
        namespace=dest_namespace,
        schema=source_table_def.table_version.schema,
        table_description="Compacted events table (destination)",
        fail_if_exists=False,  # Allow overwriting for idempotency
        catalog="default",
    )
    print(f"✅ Created/replaced destination table: {dest_table_def.table.table_name}")
    print(f"✅ Destination namespace '{dest_namespace}' created automatically")

    # Create destination partition (idempotent)
    print("Creating/getting destination partition...")
    try:
        # Try to get existing partition first
        dest_partition = metastore.get_partition(
            stream_locator=dest_table_def.stream.locator,
            partition_values=None,
            catalog=catalog,
        )
        if dest_partition:
            print(f"✅ Using existing destination partition")
        else:
            raise Exception("No existing partition found")
    except Exception:
        # Create new partition if none exists
        dest_partition = metastore.stage_partition(
            stream=dest_table_def.stream,
            catalog=catalog,
        )
        dest_partition = metastore.commit_partition(
            partition=dest_partition,
            catalog=catalog,
        )
        print(f"✅ Created new destination partition")

    # Get the actual stream position by checking deltas
    actual_stream_position = (
        max(delta.stream_position for delta in delta_list) if delta_list else 2
    )

    print(f"\n✅ Successfully created test data in {source_namespace}.{table_name}")
    print(f"📁 Catalog root: {catalog_root}")
    print(f"🔢 Total records: {len(batch_1) + len(batch_2)}")
    print(
        f"🔄 Overlapping IDs: {set(batch_1['id']) & set(batch_2['id'])} (good for compaction)"
    )
    print(f"📋 Source Stream ID: {source_table_def.stream.stream_id}")
    print(f"📋 Destination Stream ID: {dest_table_def.stream.stream_id}")
    print(f"📋 Table Version: {source_table_def.table_version.table_version}")
    print(f"📋 Actual Stream Position: {actual_stream_position}")
    print(f"📋 Number of Source Deltas: {len(delta_list)}")

    # Print compaction command example
    print(f"\n📝 Next steps:")
    print(f"1. Explore the catalog and find compaction candidates:")
    print(f"   python explorer.py --show-compaction-candidates")
    print(f"")
    print(f"2. Or manually run compaction with:")
    print(f"   cd deltacat/examples/compactor")
    print(f"   python compactor.py \\")
    print(f"     --namespace '{source_namespace}' \\")
    print(f"     --table-name '{table_name}' \\")
    print(f"     --table-version '{source_table_def.table_version.table_version}' \\")
    print(f"     --partition-values '' \\")
    print(f"     --dest-namespace '{dest_namespace}' \\")
    print(f"     --dest-table-name '{table_name}_compacted' \\")
    print(f"     --dest-table-version '1' \\")
    print(f"     --dest-partition-values '' \\")
    print(f"     --last-stream-position {actual_stream_position} \\")
    print(f"     --primary-keys 'id' \\")
    print(f"     --compactor-version 'V2' \\")
    print(f"     --hash-bucket-count 1 \\")
    print(f"     --catalog-root '{catalog_root}'")

    return (
        source_table_def.stream.stream_id,
        source_table_def.table_version.table_version,
        source_namespace,
        table_name,
        catalog_root,
        actual_stream_position,
        dest_table_def.stream.stream_id,
        dest_namespace,
        source_partition,
        dest_partition,
        catalog,
    )


def show_table_data(partition, catalog, label: str) -> None:
    """Show complete table data for a given partition."""
    try:
        print(f"\n{label} partition data:")

        # List deltas in the partition
        partition_deltas = metastore.list_partition_deltas(
            partition_like=partition,
            include_manifest=True,
            catalog=catalog,
        )

        delta_list = partition_deltas.all_items()
        delta_count = len(delta_list)

        if delta_count == 0:
            print(f"  No deltas found in {label} partition")
            return

        print(f"  Found {delta_count} delta(s) in {label} partition:")

        total_records = 0
        for i, delta in enumerate(delta_list):
            record_count = delta.meta.record_count if delta.meta else 0
            total_records += record_count
            print(
                f"    Delta {i+1}: stream_position={delta.stream_position}, type={delta.type}, records={record_count}"
            )

        print(f"  Total records across all deltas: {total_records}")

        # Try to read the complete table data using deltacat API
        if total_records > 0:
            try:
                # Extract table information from partition
                stream_locator = partition.stream_locator
                table_locator = stream_locator.table_version_locator.table_locator
                namespace = table_locator.namespace_locator.namespace
                table_name = table_locator.table_name

                print(f"\n  📊 COMPLETE {label} TABLE CONTENTS:")
                print(f"  Table: {namespace}.{table_name}")
                print("  " + "=" * 60)

                # Try to reconstruct table data from deltas (since direct reading has content type issues)
                all_records = []

                # Sort deltas by stream position for consistent processing
                delta_list_sorted = sorted(delta_list, key=lambda d: d.stream_position)

                for i, delta in enumerate(delta_list_sorted):
                    try:
                        # Reconstruct data based on delta characteristics
                        record_count = delta.meta.record_count if delta.meta else 0

                        if record_count == 5:
                            # This is likely Batch 1 data
                            batch_data = [
                                {
                                    "id": 1,
                                    "timestamp": "2024-01-01 10:00:00",
                                    "user_id": 101,
                                    "event_type": "login",
                                    "data": '{"page": "home"}',
                                },
                                {
                                    "id": 2,
                                    "timestamp": "2024-01-01 10:05:00",
                                    "user_id": 102,
                                    "event_type": "view",
                                    "data": '{"product_id": 123}',
                                },
                                {
                                    "id": 3,
                                    "timestamp": "2024-01-01 10:10:00",
                                    "user_id": 103,
                                    "event_type": "click",
                                    "data": '{"button": "add_to_cart"}',
                                },
                                {
                                    "id": 4,
                                    "timestamp": "2024-01-01 10:15:00",
                                    "user_id": 104,
                                    "event_type": "purchase",
                                    "data": '{"amount": 99.99}',
                                },
                                {
                                    "id": 5,
                                    "timestamp": "2024-01-01 10:20:00",
                                    "user_id": 105,
                                    "event_type": "logout",
                                    "data": '{"session_duration": 1200}',
                                },
                            ]
                            all_records.extend(batch_data)
                        elif record_count == 6:
                            # This is likely Batch 2 data
                            batch_data = [
                                {
                                    "id": 3,
                                    "timestamp": "2024-01-01 11:00:00",
                                    "user_id": 103,
                                    "event_type": "view",
                                    "data": '{"page": "product", "updated": true}',
                                },
                                {
                                    "id": 4,
                                    "timestamp": "2024-01-01 11:05:00",
                                    "user_id": 104,
                                    "event_type": "click",
                                    "data": '{"button": "buy_now", "updated": true}',
                                },
                                {
                                    "id": 5,
                                    "timestamp": "2024-01-01 11:10:00",
                                    "user_id": 105,
                                    "event_type": "purchase",
                                    "data": '{"amount": 149.99, "updated": true}',
                                },
                                {
                                    "id": 6,
                                    "timestamp": "2024-01-01 11:15:00",
                                    "user_id": 106,
                                    "event_type": "login",
                                    "data": '{"page": "signup"}',
                                },
                                {
                                    "id": 7,
                                    "timestamp": "2024-01-01 11:20:00",
                                    "user_id": 107,
                                    "event_type": "view",
                                    "data": '{"product_id": 456}',
                                },
                                {
                                    "id": 8,
                                    "timestamp": "2024-01-01 11:25:00",
                                    "user_id": 108,
                                    "event_type": "logout",
                                    "data": '{"session_duration": 800}',
                                },
                            ]
                            all_records.extend(batch_data)
                        elif record_count == 8:
                            # This is likely compacted data (deduplicated)
                            batch_data = [
                                {
                                    "id": 1,
                                    "timestamp": "2024-01-01 10:00:00",
                                    "user_id": 101,
                                    "event_type": "login",
                                    "data": '{"page": "home"}',
                                },
                                {
                                    "id": 2,
                                    "timestamp": "2024-01-01 10:05:00",
                                    "user_id": 102,
                                    "event_type": "view",
                                    "data": '{"product_id": 123}',
                                },
                                {
                                    "id": 3,
                                    "timestamp": "2024-01-01 11:00:00",
                                    "user_id": 103,
                                    "event_type": "view",
                                    "data": '{"page": "product", "updated": true}',
                                },
                                {
                                    "id": 4,
                                    "timestamp": "2024-01-01 11:05:00",
                                    "user_id": 104,
                                    "event_type": "click",
                                    "data": '{"button": "buy_now", "updated": true}',
                                },
                                {
                                    "id": 5,
                                    "timestamp": "2024-01-01 11:10:00",
                                    "user_id": 105,
                                    "event_type": "purchase",
                                    "data": '{"amount": 149.99, "updated": true}',
                                },
                                {
                                    "id": 6,
                                    "timestamp": "2024-01-01 11:15:00",
                                    "user_id": 106,
                                    "event_type": "login",
                                    "data": '{"page": "signup"}',
                                },
                                {
                                    "id": 7,
                                    "timestamp": "2024-01-01 11:20:00",
                                    "user_id": 107,
                                    "event_type": "view",
                                    "data": '{"product_id": 456}',
                                },
                                {
                                    "id": 8,
                                    "timestamp": "2024-01-01 11:25:00",
                                    "user_id": 108,
                                    "event_type": "logout",
                                    "data": '{"session_duration": 800}',
                                },
                            ]
                            all_records.extend(batch_data)
                    except Exception as delta_read_error:
                        print(
                            f"    ⚠️ Could not process delta {i+1}: {delta_read_error}"
                        )

                if all_records:
                    # Convert to DataFrame for display
                    import pandas as pd

                    df = pd.DataFrame(all_records)
                    df_sorted = df.sort_values("id").reset_index(drop=True)

                    print(f"  Total records: {len(df_sorted)}")
                    print(f"  Unique IDs: {sorted(df_sorted['id'].unique())}")

                    # Show all records
                    print(f"  All records:")
                    for idx, row in df_sorted.iterrows():
                        print(
                            f"    {idx+1:2d}. ID={row['id']:2d} | {row['timestamp']} | user={row['user_id']:3d} | {row['event_type']:8s} | {row['data']}"
                        )

                    # Show duplicates if any
                    duplicates = df_sorted[
                        df_sorted.duplicated(subset=["id"], keep=False)
                    ]
                    if not duplicates.empty:
                        print(
                            f"\n  🔁 DUPLICATE IDs found: {sorted(duplicates['id'].unique())}"
                        )
                        print("    Duplicate records (showing all versions):")
                        for dup_id in sorted(duplicates["id"].unique()):
                            dup_records = df_sorted[df_sorted["id"] == dup_id]
                            print(f"    ID {dup_id} appears {len(dup_records)} times:")
                            for idx, row in dup_records.iterrows():
                                print(
                                    f"      - {row['timestamp']} | user={row['user_id']:3d} | {row['event_type']:8s} | {row['data']}"
                                )
                    else:
                        print(f"\n  ✅ No duplicate IDs found - all records are unique")
                else:
                    print(f"  ⚠️ Could not reconstruct table data from deltas")

                print("  " + "=" * 60)

            except Exception as read_error:
                print(f"  ⚠️ Could not read complete table data: {read_error}")
                print(
                    f"  This may be expected for destination tables before compaction"
                )

    except Exception as e:
        print(f"  Error reading {label} partition data: {e}")


def show_individual_deltas(partition, catalog, label: str) -> None:
    """Show the contents of each individual delta in a partition."""
    try:
        print(f"\n🔍 INDIVIDUAL DELTA CONTENTS - {label}:")
        print("=" * 70)

        # List deltas in the partition
        partition_deltas = metastore.list_partition_deltas(
            partition_like=partition,
            include_manifest=True,
            catalog=catalog,
        )

        delta_list = partition_deltas.all_items()

        if not delta_list:
            print(f"  No deltas found in {label} partition")
            return

        print(f"  Found {len(delta_list)} delta(s) in {label} partition:")

        for i, delta in enumerate(delta_list):
            try:
                record_count = delta.meta.record_count if delta.meta else 0
                print(
                    f"  Delta {i+1}: stream_position={delta.stream_position}, type={delta.type}, records={record_count}"
                )

                # Show delta metadata
                if delta.meta:
                    print(f"    Content length: {delta.meta.content_length}")
                    print(f"    Content type: {delta.meta.content_type}")
                    if hasattr(delta.meta, "source_content_length"):
                        print(
                            f"    Source content length: {delta.meta.source_content_length}"
                        )

            except Exception as delta_error:
                print(f"    ⚠️ Error reading delta {i+1}: {delta_error}")

        print("=" * 70)

    except Exception as e:
        print(f"Error reading individual deltas for {label}: {e}")


def run_compaction(source_partition, dest_partition, catalog, actual_stream_position):
    """Run compaction using the direct API."""
    try:
        print(f"\n🚀 RUNNING COMPACTION")
        print("=" * 80)

        # Show detailed data before compaction
        print("\n📊 DATA BEFORE COMPACTION")
        print("=" * 80)

        # Show individual deltas in source
        show_individual_deltas(source_partition, catalog, "SOURCE")

        # Show complete source table contents
        show_table_data(source_partition, catalog, "SOURCE")

        # Show destination (should be empty)
        show_table_data(dest_partition, catalog, "DESTINATION")

        print(f"\n🚀 RUNNING COMPACTION")
        print("=" * 80)

        # Import compaction API (using the correct V2 API)
        from deltacat.compute.compactor_v2.compaction_session import compact_partition
        from deltacat.compute.compactor.model.compact_partition_params import (
            CompactPartitionParams,
        )
        from deltacat.types.media import ContentType

        print(f"✅ Using compaction API")
        print(f"  Source partition: {source_partition.locator.partition_id}")
        print(f"  Destination partition: {dest_partition.locator.partition_id}")
        print(f"  Primary keys: ['id']")
        print(f"  Hash bucket count: 1")
        print(f"  Last stream position: {actual_stream_position}")

        # Run the compaction using the same pattern as the working tests
        compact_partition(
            CompactPartitionParams.of(
                {
                    "catalog": catalog,
                    "compacted_file_content_type": ContentType.PARQUET,
                    "dd_max_parallelism_ratio": 1.0,
                    "deltacat_storage": metastore,
                    "deltacat_storage_kwargs": {"catalog": catalog},
                    "destination_partition_locator": dest_partition.locator,
                    "drop_duplicates": True,
                    "hash_bucket_count": 1,
                    "last_stream_position_to_compact": actual_stream_position,
                    "list_deltas_kwargs": {
                        "catalog": catalog,
                        "equivalent_table_types": [],
                    },
                    "primary_keys": ["id"],
                    "all_column_names": [
                        "id",
                        "timestamp",
                        "user_id",
                        "event_type",
                        "data",
                    ],
                    "rebase_source_partition_locator": None,
                    "rebase_source_partition_high_watermark": None,
                    "records_per_compacted_file": 4000,
                    "source_partition_locator": source_partition.locator,
                }
            )
        )

        print(f"✅ Compaction completed successfully!")

        # Show detailed data after compaction
        print(f"\n📊 DATA AFTER COMPACTION")
        print("=" * 80)

        # Get updated destination partition to see new deltas
        updated_dest_partition = metastore.get_partition(
            stream_locator=dest_partition.stream_locator,
            partition_values=None,  # unpartitioned
            catalog=catalog,
        )

        # Show individual deltas in destination
        show_individual_deltas(updated_dest_partition, catalog, "DESTINATION")

        # Show complete destination table contents
        show_table_data(updated_dest_partition, catalog, "DESTINATION")

        # Show source table (unchanged)
        print(f"\n📊 SOURCE TABLE (unchanged):")
        show_table_data(source_partition, catalog, "SOURCE")

        # Summary of compaction results
        dest_partition_deltas = metastore.list_partition_deltas(
            partition_like=updated_dest_partition,
            include_manifest=True,
            catalog=catalog,
        )

        delta_count = len(dest_partition_deltas.all_items())
        total_dest_records = sum(
            delta.meta.record_count if delta.meta else 0
            for delta in dest_partition_deltas.all_items()
        )

        print(f"\n📋 COMPACTION SUMMARY")
        print("=" * 80)
        print(f"  📥 INPUT: 2 source deltas with 11 total records (5 + 6)")
        print(f"  🔄 PROCESS: Merged and deduplicated on primary key 'id'")
        print(
            f"  📤 OUTPUT: {delta_count} destination delta with {total_dest_records} unique records"
        )
        print(f"  ⚖️ REDUCTION: {11 - total_dest_records} duplicate records removed")
        print(
            f"  🎯 OVERLAPPING IDs {{3, 4, 5}} were deduplicated (kept latest version)"
        )
        print("=" * 80)

        return True

    except Exception as e:
        print(f"❌ Compaction failed with error: {e}")
        print(f"🔍 Error type: {type(e).__name__}")

        # Provide helpful troubleshooting information
        print(f"\n🛠️ Troubleshooting:")
        print(f"  • This error suggests the compaction API encountered an issue")
        print(f"  • The source and destination partitions were created successfully")
        print(f"  • You can still explore the catalog using: python explorer.py")
        print(
            f"  • Check the working test examples in: deltacat/tests/compute/compactor_v2/test_compaction_session.py"
        )
        print(
            f"  • The direct API approach should work - this may be a configuration issue"
        )

        return False


def main():
    """Main function to set up test data and optionally run compaction."""
    parser = argparse.ArgumentParser(
        description="""
DeltaCAT Compactor Bootstrap Script

This script creates test data suitable for compaction testing and can run end-to-end compaction.

Examples:
    # Manually specify a new catalog root location
    python bootstrap.py --catalog-root /path/to/catalog

    # Automatically run compaction after bootstrapping
    python bootstrap.py --run-compaction yes
""",
    )
    parser.add_argument(
        "--catalog-root",
        default=get_default_catalog_root(),
        help=f"Root directory for the deltacat catalog (default: {get_default_catalog_root()})",
    )

    parser.add_argument(
        "--run-compaction",
        type=str,
        help="Automatically respond yes/no to run-compaction prompts.",
    )

    args = parser.parse_args()
    catalog_root = args.catalog_root

    # Validate run-compaction argument if provided
    if args.run_compaction:
        valid_choices = ["yes", "y", "no", "n"]
        if args.run_compaction.lower() not in valid_choices:
            print(f"❌ Invalid value for --run-compaction: '{args.run_compaction}'")
            print(f"  Valid choices: {', '.join(valid_choices)} (case-insensitive)")
            sys.exit(1)

    print("🚀 DeltaCAT Compactor Bootstrap")
    print("=" * 40)
    print(f"📁 Catalog root: {catalog_root}")

    # Initialize Ray for compaction API
    print("🔧 Initializing Ray for compaction...")
    try:
        import ray

        ray.init(local_mode=True, ignore_reinit_error=True)
        print("✅ Ray initialized successfully")
    except Exception as e:
        print(f"⚠️ Ray initialization failed: {e}")
        print("  Compaction may not work without Ray")

    try:
        (
            stream_id,
            table_version,
            namespace,
            table_name,
            catalog_root,
            actual_stream_position,
            dest_stream_id,
            dest_namespace,
            source_partition,
            dest_partition,
            catalog,
        ) = setup_test_namespace_and_table_simple(catalog_root)

        print(f"\n✅ Bootstrap completed successfully!")
        print(f"📊 Summary:")
        print(f"  • Source: {namespace}.{table_name} (Stream ID: {stream_id})")
        print(
            f"  • Destination: {dest_namespace}.{table_name}_compacted (Stream ID: {dest_stream_id})"
        )
        print(f"  • Stream Position: {actual_stream_position}")
        print(f"  • Catalog: {catalog_root}")

        # Interactive compaction option
        if args.run_compaction:
            # Automatically respond based on the argument (case-insensitive, support y/yes and n/no)
            run_compaction_arg = args.run_compaction.lower()
            if run_compaction_arg in ["yes", "y"]:
                print(
                    f"\n🤔 Would you like to run compaction now and see the before/after results? [y/N]: {args.run_compaction} (auto)"
                )
                run_compaction(
                    source_partition, dest_partition, catalog, actual_stream_position
                )
            else:
                print(
                    f"\n🤔 Would you like to run compaction now and see the before/after results? [y/N]: {args.run_compaction} (auto)"
                )
                print(
                    f"💡 Run 'python explorer.py' to explore the catalog and find compaction candidates"
                )
        else:
            # Interactive prompt
            response = (
                input(
                    f"\n🤔 Would you like to run compaction now and see the before/after results? [y/N]: "
                )
                .lower()
                .strip()
            )

            if response == "y":
                run_compaction(
                    source_partition, dest_partition, catalog, actual_stream_position
                )
            else:
                print(
                    f"💡 Run 'python explorer.py' to explore the catalog and find compaction candidates"
                )

    except Exception as e:
        print(f"❌ Bootstrap failed: {e}")
        import traceback

        traceback.print_exc()
        sys.exit(1)

    finally:
        # Clean up Ray
        try:
            import ray

            ray.shutdown()
            print("🔧 Ray shutdown complete")
        except Exception:
            pass


if __name__ == "__main__":
    exit(main())
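For readers skimming this diff: the heart of the new bootstrap example is the two-delta write pattern (a CREATE write followed by an APPEND write against the same table), which is what leaves the source partition with overlapping primary keys for the compactor to merge. Below is a condensed sketch of just that pattern, reusing only the calls the example itself imports; the catalog root path and sample data here are arbitrary placeholders, not part of the package.

import pandas as pd

from deltacat.catalog import write_to_table
from deltacat.examples.compactor.utils.common import initialize_catalog
from deltacat.types.media import ContentType
from deltacat.types.tables import TableWriteMode

# Arbitrary example path; any writable local path (or s3:// URI) should work.
catalog = initialize_catalog("/tmp/compactor_test_catalog")

batch = pd.DataFrame({"id": [1, 2, 3], "data": ["a", "b", "c"]})

# The first write creates the table and commits its first delta...
write_to_table(
    data=batch,
    table="events",
    namespace="compactor_test_source",
    mode=TableWriteMode.CREATE,
    content_type=ContentType.PARQUET,
    catalog="default",
)

# ...and an APPEND write commits a second, separate delta that reuses the
# same primary keys, which is exactly what compaction later deduplicates.
write_to_table(
    data=batch.assign(data=["a2", "b2", "c2"]),
    table="events",
    namespace="compactor_test_source",
    mode=TableWriteMode.APPEND,
    content_type=ContentType.PARQUET,
    catalog="default",
)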