deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +150 -12
- deltacat/annotations.py +36 -0
- deltacat/api.py +578 -0
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +84 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +22 -19
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
- deltacat/catalog/__init__.py +73 -0
- deltacat/catalog/delegate.py +615 -140
- deltacat/catalog/interface.py +404 -81
- deltacat/catalog/main/impl.py +2882 -0
- deltacat/catalog/model/catalog.py +348 -46
- deltacat/catalog/model/properties.py +155 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +19 -9
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +9 -22
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +6 -6
- deltacat/compute/compactor/steps/materialize.py +15 -9
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +7 -6
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +13 -14
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +28 -9
- deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +156 -53
- deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +10 -3
- deltacat/compute/compactor_v2/utils/merge.py +14 -2
- deltacat/compute/compactor_v2/utils/task_options.py +2 -10
- deltacat/compute/converter/constants.py +9 -0
- deltacat/compute/converter/converter_session.py +298 -0
- deltacat/compute/converter/model/convert_input.py +96 -0
- deltacat/compute/converter/model/convert_input_files.py +78 -0
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +144 -0
- deltacat/compute/converter/pyiceberg/catalog.py +78 -0
- deltacat/compute/converter/pyiceberg/overrides.py +263 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
- deltacat/compute/converter/steps/convert.py +366 -0
- deltacat/compute/converter/steps/dedupe.py +94 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +132 -0
- deltacat/compute/converter/utils/converter_session_utils.py +175 -0
- deltacat/compute/converter/utils/iceberg_columns.py +87 -0
- deltacat/compute/converter/utils/io.py +203 -0
- deltacat/compute/converter/utils/s3u.py +148 -0
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +11 -1
- deltacat/constants.py +90 -1
- deltacat/docs/__init__.py +0 -0
- deltacat/docs/autogen/__init__.py +0 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +61 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
- deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
- deltacat/examples/hello_world.py +29 -0
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +116 -12
- deltacat/experimental/__init__.py +0 -0
- deltacat/experimental/catalog/__init__.py +0 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/experimental/catalog/iceberg/impl.py +399 -0
- deltacat/experimental/catalog/iceberg/overrides.py +72 -0
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/experimental/storage/iceberg/impl.py +739 -0
- deltacat/experimental/storage/iceberg/model.py +713 -0
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
- deltacat/experimental/storage/rivulet/dataset.py +745 -0
- deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
- deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
- deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
- deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
- deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
- deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
- deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
- deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
- deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
- deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
- deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
- deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
- deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
- deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
- deltacat/experimental/storage/rivulet/serializer.py +40 -0
- deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +138 -28
- deltacat/storage/interface.py +260 -155
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +3030 -0
- deltacat/storage/model/delta.py +142 -71
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -9
- deltacat/storage/model/manifest.py +643 -0
- deltacat/storage/model/metafile.py +1421 -0
- deltacat/storage/model/namespace.py +41 -18
- deltacat/storage/model/partition.py +443 -43
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +46 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +3160 -0
- deltacat/storage/model/shard.py +51 -0
- deltacat/storage/model/sort_key.py +210 -13
- deltacat/storage/model/stream.py +215 -80
- deltacat/storage/model/table.py +134 -29
- deltacat/storage/model/table_version.py +333 -46
- deltacat/storage/model/transaction.py +1733 -0
- deltacat/storage/model/transform.py +274 -58
- deltacat/storage/model/types.py +138 -16
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +321 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +23 -30
- deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
- deltacat/tests/compute/compactor/utils/test_io.py +125 -123
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
- deltacat/tests/compute/conftest.py +39 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +826 -0
- deltacat/tests/compute/converter/utils.py +132 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
- deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
- deltacat/tests/compute/test_compact_partition_params.py +16 -11
- deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +726 -46
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/conftest.py +25 -0
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
- deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +8204 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +2440 -0
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +479 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +24 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +653 -0
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1064 -0
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +50 -26
- deltacat/tests/test_utils/storage.py +256 -4
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +124 -34
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1107 -258
- deltacat/types/media.py +345 -37
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +2345 -47
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +61 -0
- deltacat/utils/filesystem.py +450 -0
- deltacat/utils/metafile_locator.py +74 -0
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1212 -178
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/aws/redshift/model/manifest.py +0 -394
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-1.1.38.dist-info/METADATA +0 -64
- deltacat-1.1.38.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
- /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
- /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -1,29 +1,29 @@
|
|
1
|
-
import ray
|
2
|
-
from moto import mock_s3
|
3
|
-
import pytest
|
4
|
-
import os
|
5
1
|
import logging
|
6
|
-
import
|
7
|
-
|
8
|
-
|
2
|
+
from typing import Any, Dict, List, Optional, Set, Tuple, Callable
|
3
|
+
import uuid
|
4
|
+
import pytest
|
5
|
+
|
9
6
|
import pyarrow as pa
|
7
|
+
import ray
|
8
|
+
|
10
9
|
from pytest_benchmark.fixture import BenchmarkFixture
|
11
10
|
from deltacat.types.media import StorageType
|
12
11
|
|
13
12
|
from deltacat.tests.compute.test_util_common import (
|
14
|
-
|
13
|
+
get_rci_from_partition,
|
14
|
+
read_audit_file,
|
15
|
+
PartitionKeyType,
|
15
16
|
)
|
16
|
-
from deltacat.compute.
|
17
|
-
|
18
|
-
|
19
|
-
create_src_w_deltas_destination_plus_destination,
|
20
|
-
add_late_deltas_to_partition,
|
17
|
+
from deltacat.tests.compute.test_util_common import (
|
18
|
+
add_late_deltas_to_partition_main,
|
19
|
+
create_src_w_deltas_destination_plus_destination_main,
|
21
20
|
)
|
21
|
+
from deltacat.compute.compactor.model.compactor_version import CompactorVersion
|
22
|
+
|
22
23
|
from deltacat.tests.compute.compact_partition_test_cases import (
|
23
24
|
INCREMENTAL_TEST_CASES,
|
24
25
|
)
|
25
26
|
from deltacat.tests.compute.test_util_constant import (
|
26
|
-
TEST_S3_RCF_BUCKET_NAME,
|
27
27
|
DEFAULT_NUM_WORKERS,
|
28
28
|
DEFAULT_WORKER_INSTANCE_CPUS,
|
29
29
|
)
|
@@ -37,6 +37,7 @@ from deltacat.storage import (
|
|
37
37
|
DeltaLocator,
|
38
38
|
Partition,
|
39
39
|
PartitionLocator,
|
40
|
+
metastore,
|
40
41
|
)
|
41
42
|
from deltacat.types.media import ContentType
|
42
43
|
from deltacat.compute.compactor.model.compaction_session_audit_info import (
|
@@ -50,11 +51,6 @@ from deltacat.utils.placement import (
|
|
50
51
|
)
|
51
52
|
from deltacat import logs
|
52
53
|
|
53
|
-
DATABASE_FILE_PATH_KEY, DATABASE_FILE_PATH_VALUE = (
|
54
|
-
"db_file_path",
|
55
|
-
"deltacat/tests/local_deltacat_storage/db_test.sqlite",
|
56
|
-
)
|
57
|
-
|
58
54
|
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
59
55
|
|
60
56
|
|
@@ -70,55 +66,11 @@ def setup_ray_cluster():
|
|
70
66
|
ray.shutdown()
|
71
67
|
|
72
68
|
|
73
|
-
@pytest.fixture(autouse=True, scope="module")
|
74
|
-
def mock_aws_credential():
|
75
|
-
os.environ["AWS_ACCESS_KEY_ID"] = "testing"
|
76
|
-
os.environ["AWS_SECRET_ACCESS_ID"] = "testing"
|
77
|
-
os.environ["AWS_SECURITY_TOKEN"] = "testing"
|
78
|
-
os.environ["AWS_SESSION_TOKEN"] = "testing"
|
79
|
-
os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
|
80
|
-
yield
|
81
|
-
|
82
|
-
|
83
|
-
@pytest.fixture(autouse=True, scope="module")
|
84
|
-
def cleanup_the_database_file_after_all_compaction_session_package_tests_complete():
|
85
|
-
# make sure the database file is deleted after all the compactor package tests are completed
|
86
|
-
yield
|
87
|
-
if os.path.exists(DATABASE_FILE_PATH_VALUE):
|
88
|
-
os.remove(DATABASE_FILE_PATH_VALUE)
|
89
|
-
|
90
|
-
|
91
|
-
@pytest.fixture(scope="module")
|
92
|
-
def s3_resource():
|
93
|
-
with mock_s3():
|
94
|
-
yield boto3.resource("s3")
|
95
|
-
|
96
|
-
|
97
|
-
@pytest.fixture(autouse=True, scope="module")
|
98
|
-
def setup_compaction_artifacts_s3_bucket(s3_resource: ServiceResource):
|
99
|
-
s3_resource.create_bucket(
|
100
|
-
ACL="authenticated-read",
|
101
|
-
Bucket=TEST_S3_RCF_BUCKET_NAME,
|
102
|
-
)
|
103
|
-
yield
|
104
|
-
|
105
|
-
|
106
69
|
"""
|
107
70
|
FUNCTION scoped fixtures
|
108
71
|
"""
|
109
72
|
|
110
73
|
|
111
|
-
@pytest.fixture(scope="function")
|
112
|
-
def offer_local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
|
113
|
-
# see deltacat/tests/local_deltacat_storage/README.md for documentation
|
114
|
-
kwargs_for_local_deltacat_storage: Dict[str, Any] = {
|
115
|
-
DATABASE_FILE_PATH_KEY: DATABASE_FILE_PATH_VALUE,
|
116
|
-
}
|
117
|
-
yield kwargs_for_local_deltacat_storage
|
118
|
-
if os.path.exists(DATABASE_FILE_PATH_VALUE):
|
119
|
-
os.remove(DATABASE_FILE_PATH_VALUE)
|
120
|
-
|
121
|
-
|
122
74
|
@pytest.fixture(autouse=True, scope="function")
|
123
75
|
def enable_bucketing_spec_validation(monkeypatch):
|
124
76
|
"""
|
@@ -134,6 +86,11 @@ def enable_bucketing_spec_validation(monkeypatch):
|
|
134
86
|
)
|
135
87
|
|
136
88
|
|
89
|
+
@pytest.fixture(scope="function")
|
90
|
+
def temp_dir(tmp_path):
|
91
|
+
return str(tmp_path)
|
92
|
+
|
93
|
+
|
137
94
|
@pytest.mark.parametrize(
|
138
95
|
[
|
139
96
|
"test_name",
|
@@ -207,9 +164,8 @@ def enable_bucketing_spec_validation(monkeypatch):
|
|
207
164
|
],
|
208
165
|
ids=[test_name for test_name in INCREMENTAL_TEST_CASES],
|
209
166
|
)
|
210
|
-
def
|
211
|
-
|
212
|
-
offer_local_deltacat_storage_kwargs: Dict[str, Any],
|
167
|
+
def test_compact_partition_incremental_main(
|
168
|
+
main_deltacat_storage_kwargs: Dict[str, Any],
|
213
169
|
test_name: str,
|
214
170
|
primary_keys: Set[str],
|
215
171
|
sort_keys: Dict[str, str],
|
@@ -233,9 +189,16 @@ def test_compact_partition_incremental(
|
|
233
189
|
compact_partition_func: Callable,
|
234
190
|
benchmark: BenchmarkFixture,
|
235
191
|
):
|
236
|
-
|
192
|
+
# Skip in-place compaction tests for main storage as it's not yet implemented
|
193
|
+
if is_inplace:
|
194
|
+
pytest.skip(
|
195
|
+
"In-place compaction not yet implemented in main storage (delta prepending limitation)"
|
196
|
+
)
|
237
197
|
|
238
|
-
ds_mock_kwargs: Dict[str, Any] =
|
198
|
+
ds_mock_kwargs: Dict[str, Any] = main_deltacat_storage_kwargs
|
199
|
+
|
200
|
+
# Extract catalog from storage kwargs
|
201
|
+
catalog = ds_mock_kwargs.get("inner")
|
239
202
|
|
240
203
|
# setup
|
241
204
|
partition_keys = partition_keys_param
|
@@ -246,8 +209,7 @@ def test_compact_partition_incremental(
|
|
246
209
|
source_table_namespace,
|
247
210
|
source_table_name,
|
248
211
|
source_table_version,
|
249
|
-
) =
|
250
|
-
primary_keys,
|
212
|
+
) = create_src_w_deltas_destination_plus_destination_main(
|
251
213
|
sort_keys,
|
252
214
|
partition_keys,
|
253
215
|
input_deltas,
|
@@ -256,15 +218,38 @@ def test_compact_partition_incremental(
|
|
256
218
|
ds_mock_kwargs,
|
257
219
|
is_inplace,
|
258
220
|
)
|
259
|
-
|
221
|
+
|
222
|
+
# Convert partition values to correct types for get_partition call
|
223
|
+
converted_partition_values = []
|
224
|
+
if partition_values_param and partition_keys:
|
225
|
+
# partition_values_param is a single string, but we need to handle it as a list
|
226
|
+
partition_values_list = (
|
227
|
+
[partition_values_param]
|
228
|
+
if isinstance(partition_values_param, str)
|
229
|
+
else partition_values_param
|
230
|
+
)
|
231
|
+
for i, (value, pk) in enumerate(zip(partition_values_list, partition_keys)):
|
232
|
+
if pk.key_type == PartitionKeyType.INT:
|
233
|
+
converted_partition_values.append(int(value))
|
234
|
+
else:
|
235
|
+
converted_partition_values.append(value)
|
236
|
+
else:
|
237
|
+
converted_partition_values = (
|
238
|
+
[partition_values_param] if partition_values_param else []
|
239
|
+
)
|
240
|
+
|
241
|
+
source_partition: Partition = metastore.get_partition(
|
260
242
|
source_table_stream.locator,
|
261
|
-
|
243
|
+
converted_partition_values,
|
244
|
+
partition_scheme_id="default_partition_scheme" if partition_keys else None,
|
262
245
|
**ds_mock_kwargs,
|
263
246
|
)
|
247
|
+
# Generate a destination partition ID based on the source partition
|
248
|
+
destination_partition_id = str(uuid.uuid4())
|
264
249
|
destination_partition_locator: PartitionLocator = PartitionLocator.of(
|
265
250
|
destination_table_stream.locator,
|
266
|
-
|
267
|
-
|
251
|
+
converted_partition_values,
|
252
|
+
destination_partition_id,
|
268
253
|
)
|
269
254
|
num_workers, worker_instance_cpu = DEFAULT_NUM_WORKERS, DEFAULT_WORKER_INSTANCE_CPUS
|
270
255
|
total_cpus: int = num_workers * worker_instance_cpu
|
@@ -275,12 +260,18 @@ def test_compact_partition_incremental(
|
|
275
260
|
if create_placement_group_param
|
276
261
|
else None
|
277
262
|
)
|
263
|
+
all_column_names = metastore.get_table_version_column_names(
|
264
|
+
destination_table_stream.locator.table_locator.namespace,
|
265
|
+
destination_table_stream.locator.table_locator.table_name,
|
266
|
+
destination_table_stream.locator.table_version_locator.table_version,
|
267
|
+
catalog=catalog,
|
268
|
+
)
|
278
269
|
compact_partition_params = CompactPartitionParams.of(
|
279
270
|
{
|
280
|
-
"
|
271
|
+
"catalog": catalog,
|
281
272
|
"compacted_file_content_type": ContentType.PARQUET,
|
282
273
|
"dd_max_parallelism_ratio": 1.0,
|
283
|
-
"deltacat_storage":
|
274
|
+
"deltacat_storage": metastore,
|
284
275
|
"deltacat_storage_kwargs": ds_mock_kwargs,
|
285
276
|
"destination_partition_locator": destination_partition_locator,
|
286
277
|
"drop_duplicates": drop_duplicates_param,
|
@@ -289,11 +280,11 @@ def test_compact_partition_incremental(
|
|
289
280
|
"list_deltas_kwargs": {**ds_mock_kwargs, **{"equivalent_table_types": []}},
|
290
281
|
"pg_config": pgm,
|
291
282
|
"primary_keys": primary_keys,
|
283
|
+
"all_column_names": all_column_names,
|
292
284
|
"read_kwargs_provider": read_kwargs_provider_param,
|
293
285
|
"rebase_source_partition_locator": None,
|
294
286
|
"rebase_source_partition_high_watermark": None,
|
295
287
|
"records_per_compacted_file": records_per_compacted_file_param,
|
296
|
-
"s3_client_kwargs": {},
|
297
288
|
"source_partition_locator": source_partition.locator,
|
298
289
|
"sort_keys": sort_keys if sort_keys else None,
|
299
290
|
}
|
@@ -304,18 +295,17 @@ def test_compact_partition_incremental(
|
|
304
295
|
"""
|
305
296
|
This callable runs right before invoking the benchmark target function (compaction).
|
306
297
|
This is needed as the benchmark module will invoke the target function multiple times
|
307
|
-
in a single test run, which can lead to non-idempotent behavior if
|
298
|
+
in a single test run, which can lead to non-idempotent behavior if RCIs are generated.
|
308
299
|
|
309
300
|
Returns: args, kwargs
|
310
301
|
"""
|
311
|
-
s3_resource.Bucket(TEST_S3_RCF_BUCKET_NAME).objects.all().delete()
|
312
302
|
return (compact_partition_params,), {}
|
313
303
|
|
314
304
|
if add_late_deltas:
|
315
305
|
# NOTE: In the case of in-place compaction it is plausible that new deltas may be added to the source partition during compaction
|
316
306
|
# (so that the source_partitition.stream_position > last_stream_position_to_compact).
|
317
307
|
# This parameter helps simulate the case to check that no late deltas are dropped even when the compacted partition is created.
|
318
|
-
latest_delta, _ =
|
308
|
+
latest_delta, _ = add_late_deltas_to_partition_main(
|
319
309
|
add_late_deltas, source_partition, ds_mock_kwargs
|
320
310
|
)
|
321
311
|
if expected_terminal_exception:
|
@@ -323,27 +313,28 @@ def test_compact_partition_incremental(
|
|
323
313
|
compact_partition_func(compact_partition_params)
|
324
314
|
assert expected_terminal_exception_message in str(exc_info.value)
|
325
315
|
return
|
326
|
-
|
327
|
-
compact_partition_func, setup=_incremental_compaction_setup
|
328
|
-
)
|
316
|
+
benchmark.pedantic(compact_partition_func, setup=_incremental_compaction_setup)
|
329
317
|
|
330
|
-
# validate
|
331
|
-
round_completion_info: RoundCompletionInfo =
|
318
|
+
# validate - get RoundCompletionInfo from the compacted partition
|
319
|
+
round_completion_info: RoundCompletionInfo = get_rci_from_partition(
|
320
|
+
destination_partition_locator, metastore, catalog=catalog
|
321
|
+
)
|
332
322
|
compacted_delta_locator: DeltaLocator = (
|
333
323
|
round_completion_info.compacted_delta_locator
|
334
324
|
)
|
335
|
-
audit_bucket, audit_key = RoundCompletionInfo.get_audit_bucket_name_and_key(
|
336
|
-
round_completion_info.compaction_audit_url
|
337
|
-
)
|
338
325
|
|
339
|
-
|
340
|
-
|
326
|
+
# Get catalog root for audit file resolution
|
327
|
+
catalog_root = catalog.root
|
328
|
+
|
329
|
+
compaction_audit_obj: Dict[str, Any] = read_audit_file(
|
330
|
+
round_completion_info.compaction_audit_url, catalog_root
|
341
331
|
)
|
332
|
+
|
342
333
|
compaction_audit: CompactionSessionAuditInfo = CompactionSessionAuditInfo(
|
343
334
|
**compaction_audit_obj
|
344
335
|
)
|
345
336
|
|
346
|
-
# assert if
|
337
|
+
# assert if RCI covers all files
|
347
338
|
if compactor_version != CompactorVersion.V1.value:
|
348
339
|
previous_end = None
|
349
340
|
for start, end in round_completion_info.hb_index_to_entry_range.values():
|
@@ -353,7 +344,7 @@ def test_compact_partition_incremental(
|
|
353
344
|
previous_end == round_completion_info.compacted_pyarrow_write_result.files
|
354
345
|
)
|
355
346
|
|
356
|
-
tables =
|
347
|
+
tables = metastore.download_delta(
|
357
348
|
compacted_delta_locator, storage_type=StorageType.LOCAL, **ds_mock_kwargs
|
358
349
|
)
|
359
350
|
actual_compacted_table = pa.concat_tables(tables)
|
@@ -387,25 +378,27 @@ def test_compact_partition_incremental(
|
|
387
378
|
== destination_partition_locator.partition_values
|
388
379
|
and source_partition.locator.stream_id
|
389
380
|
== destination_partition_locator.stream_id
|
390
|
-
), f"The source partition: {source_partition.locator
|
381
|
+
), f"The source partition: {source_partition.locator} should match the destination partition: {destination_partition_locator}"
|
391
382
|
assert (
|
392
383
|
compacted_delta_locator.stream_id == source_partition.locator.stream_id
|
393
384
|
), "The compacted delta should be in the same stream as the source"
|
394
|
-
source_partition: Partition =
|
385
|
+
source_partition: Partition = metastore.get_partition(
|
395
386
|
source_table_stream.locator,
|
396
|
-
|
387
|
+
converted_partition_values,
|
388
|
+
partition_scheme_id="default_partition_scheme" if partition_keys else None,
|
397
389
|
**ds_mock_kwargs,
|
398
390
|
)
|
399
|
-
compacted_partition: Optional[Partition] =
|
391
|
+
compacted_partition: Optional[Partition] = metastore.get_partition(
|
400
392
|
compacted_delta_locator.stream_locator,
|
401
|
-
|
393
|
+
converted_partition_values,
|
394
|
+
partition_scheme_id="default_partition_scheme" if partition_keys else None,
|
402
395
|
**ds_mock_kwargs,
|
403
396
|
)
|
404
397
|
assert (
|
405
398
|
compacted_partition.state == source_partition.state == CommitState.COMMITTED
|
406
399
|
), f"The compacted/source table partition should be in {CommitState.COMMITTED} state and not {CommitState.DEPRECATED}"
|
407
400
|
if add_late_deltas:
|
408
|
-
compacted_partition_deltas: List[Delta] =
|
401
|
+
compacted_partition_deltas: List[Delta] = metastore.list_partition_deltas(
|
409
402
|
partition_like=compacted_partition,
|
410
403
|
ascending_order=False,
|
411
404
|
**ds_mock_kwargs,
|
@@ -1,43 +1,38 @@
|
|
1
|
-
import
|
2
|
-
import
|
3
|
-
from moto import mock_s3
|
1
|
+
import tempfile
|
2
|
+
from typing import Any, Dict, List, Optional, Set, Callable
|
4
3
|
import pytest
|
5
|
-
import boto3
|
6
|
-
from boto3.resources.base import ServiceResource
|
7
4
|
import pyarrow as pa
|
5
|
+
import ray
|
6
|
+
|
8
7
|
from deltacat.io.file_object_store import FileObjectStore
|
9
8
|
from pytest_benchmark.fixture import BenchmarkFixture
|
10
|
-
import tempfile
|
11
9
|
|
12
10
|
from deltacat.tests.compute.test_util_constant import (
|
13
|
-
TEST_S3_RCF_BUCKET_NAME,
|
14
11
|
DEFAULT_NUM_WORKERS,
|
15
12
|
DEFAULT_WORKER_INSTANCE_CPUS,
|
16
13
|
)
|
17
14
|
from deltacat.tests.compute.test_util_common import (
|
18
|
-
|
15
|
+
get_rci_from_partition,
|
16
|
+
read_audit_file,
|
17
|
+
PartitionKey,
|
18
|
+
get_compacted_delta_locator_from_partition,
|
19
19
|
)
|
20
|
-
from deltacat.tests.test_utils.utils import read_s3_contents
|
21
|
-
from deltacat.compute.compactor.model.compactor_version import CompactorVersion
|
22
20
|
from deltacat.tests.compute.test_util_common import (
|
23
|
-
|
21
|
+
multiple_rounds_create_src_w_deltas_destination_rebase_w_deltas_strategy_main,
|
24
22
|
)
|
23
|
+
|
24
|
+
from deltacat.compute.compactor.model.compactor_version import CompactorVersion
|
25
25
|
from deltacat.compute.compactor.model.compaction_session_audit_info import (
|
26
26
|
CompactionSessionAuditInfo,
|
27
27
|
)
|
28
|
-
from deltacat.tests.compute.test_util_create_table_deltas_repo import (
|
29
|
-
multiple_rounds_create_src_w_deltas_destination_rebase_w_deltas_strategy,
|
30
|
-
)
|
31
28
|
from deltacat.tests.compute.compact_partition_multiple_rounds_test_cases import (
|
32
29
|
MULTIPLE_ROUNDS_TEST_CASES,
|
33
30
|
)
|
34
|
-
from
|
35
|
-
from deltacat.types.media import StorageType
|
31
|
+
from deltacat.types.media import StorageType, ContentType
|
36
32
|
from deltacat.storage import (
|
37
33
|
DeltaLocator,
|
38
34
|
Partition,
|
39
35
|
)
|
40
|
-
from deltacat.types.media import ContentType
|
41
36
|
from deltacat.compute.compactor.model.compact_partition_params import (
|
42
37
|
CompactPartitionParams,
|
43
38
|
)
|
@@ -47,11 +42,7 @@ from deltacat.compute.compactor import (
|
|
47
42
|
from deltacat.utils.placement import (
|
48
43
|
PlacementGroupManager,
|
49
44
|
)
|
50
|
-
|
51
|
-
DATABASE_FILE_PATH_KEY, DATABASE_FILE_PATH_VALUE = (
|
52
|
-
"db_file_path",
|
53
|
-
"deltacat/tests/local_deltacat_storage/db_test.sqlite",
|
54
|
-
)
|
45
|
+
from deltacat.storage import metastore
|
55
46
|
|
56
47
|
|
57
48
|
"""
|
@@ -66,54 +57,11 @@ def setup_ray_cluster():
|
|
66
57
|
ray.shutdown()
|
67
58
|
|
68
59
|
|
69
|
-
@pytest.fixture(autouse=True, scope="module")
|
70
|
-
def mock_aws_credential():
|
71
|
-
os.environ["AWS_ACCESS_KEY_ID"] = "testing"
|
72
|
-
os.environ["AWS_SECRET_ACCESS_ID"] = "testing"
|
73
|
-
os.environ["AWS_SECURITY_TOKEN"] = "testing"
|
74
|
-
os.environ["AWS_SESSION_TOKEN"] = "testing"
|
75
|
-
os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
|
76
|
-
yield
|
77
|
-
|
78
|
-
|
79
|
-
@pytest.fixture(autouse=True, scope="module")
|
80
|
-
def cleanup_the_database_file_after_all_compaction_session_package_tests_complete():
|
81
|
-
# make sure the database file is deleted after all the compactor package tests are completed
|
82
|
-
if os.path.exists(DATABASE_FILE_PATH_VALUE):
|
83
|
-
os.remove(DATABASE_FILE_PATH_VALUE)
|
84
|
-
|
85
|
-
|
86
|
-
@pytest.fixture(scope="module")
|
87
|
-
def s3_resource(mock_aws_credential):
|
88
|
-
with mock_s3():
|
89
|
-
yield boto3.resource("s3")
|
90
|
-
|
91
|
-
|
92
|
-
@pytest.fixture(autouse=True, scope="module")
|
93
|
-
def setup_compaction_artifacts_s3_bucket(s3_resource: ServiceResource):
|
94
|
-
s3_resource.create_bucket(
|
95
|
-
ACL="authenticated-read",
|
96
|
-
Bucket=TEST_S3_RCF_BUCKET_NAME,
|
97
|
-
)
|
98
|
-
yield
|
99
|
-
|
100
|
-
|
101
60
|
"""
|
102
61
|
FUNCTION scoped fixtures
|
103
62
|
"""
|
104
63
|
|
105
64
|
|
106
|
-
@pytest.fixture(scope="function")
|
107
|
-
def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
|
108
|
-
# see deltacat/tests/local_deltacat_storage/README.md for documentation
|
109
|
-
kwargs_for_local_deltacat_storage: Dict[str, Any] = {
|
110
|
-
DATABASE_FILE_PATH_KEY: DATABASE_FILE_PATH_VALUE,
|
111
|
-
}
|
112
|
-
yield kwargs_for_local_deltacat_storage
|
113
|
-
if os.path.exists(DATABASE_FILE_PATH_VALUE):
|
114
|
-
os.remove(DATABASE_FILE_PATH_VALUE)
|
115
|
-
|
116
|
-
|
117
65
|
@pytest.fixture(autouse=True, scope="function")
|
118
66
|
def enable_bucketing_spec_validation(monkeypatch):
|
119
67
|
"""
|
@@ -199,14 +147,13 @@ def enable_bucketing_spec_validation(monkeypatch):
|
|
199
147
|
],
|
200
148
|
ids=[test_name for test_name in MULTIPLE_ROUNDS_TEST_CASES],
|
201
149
|
)
|
202
|
-
def
|
150
|
+
def test_compact_partition_rebase_multiple_rounds_same_source_and_destination_main(
|
203
151
|
mocker,
|
204
|
-
|
205
|
-
local_deltacat_storage_kwargs: Dict[str, Any],
|
152
|
+
main_deltacat_storage_kwargs: Dict[str, Any],
|
206
153
|
test_name: str,
|
207
154
|
primary_keys: Set[str],
|
208
155
|
sort_keys: List[Optional[Any]],
|
209
|
-
partition_keys_param: Optional[List[
|
156
|
+
partition_keys_param: Optional[List[PartitionKey]],
|
210
157
|
partition_values_param: List[Optional[str]],
|
211
158
|
input_deltas_param: List[pa.Array],
|
212
159
|
expected_terminal_compact_partition_result: pa.Table,
|
@@ -225,38 +172,63 @@ def test_compact_partition_rebase_multiple_rounds_same_source_and_destination(
|
|
225
172
|
num_rounds_param: int,
|
226
173
|
benchmark: BenchmarkFixture,
|
227
174
|
):
|
228
|
-
|
229
|
-
|
230
|
-
ds_mock_kwargs = local_deltacat_storage_kwargs
|
175
|
+
ds_mock_kwargs = main_deltacat_storage_kwargs
|
231
176
|
"""
|
232
177
|
This test tests different multi-round compaction rebase configurations,
|
233
|
-
as specified in compact_partition_multiple_rounds_test_cases.py
|
178
|
+
as specified in compact_partition_multiple_rounds_test_cases.py.
|
234
179
|
These tests do not test multi-round compaction backfill, which is
|
235
180
|
currently unsupported.
|
181
|
+
|
182
|
+
This version uses the main metastore implementation instead of local storage.
|
236
183
|
"""
|
237
184
|
(
|
238
185
|
source_table_stream,
|
239
186
|
_,
|
240
187
|
rebased_table_stream,
|
241
188
|
_,
|
242
|
-
) =
|
243
|
-
primary_keys,
|
189
|
+
) = multiple_rounds_create_src_w_deltas_destination_rebase_w_deltas_strategy_main(
|
244
190
|
sort_keys,
|
245
191
|
partition_keys_param,
|
246
192
|
input_deltas_param,
|
247
193
|
partition_values_param,
|
248
194
|
ds_mock_kwargs,
|
249
195
|
)
|
250
|
-
|
251
|
-
|
252
|
-
|
196
|
+
# Convert partition values for partition lookup (same as in the helper function)
|
197
|
+
converted_partition_values_for_lookup = partition_values_param
|
198
|
+
if partition_values_param and partition_keys_param:
|
199
|
+
converted_partition_values_for_lookup = []
|
200
|
+
for i, (value, key) in enumerate(
|
201
|
+
zip(partition_values_param, partition_keys_param)
|
202
|
+
):
|
203
|
+
if key.key_type == "int":
|
204
|
+
converted_partition_values_for_lookup.append(int(value))
|
205
|
+
elif key.key_type == "string":
|
206
|
+
converted_partition_values_for_lookup.append(str(value))
|
207
|
+
elif key.key_type == "timestamp":
|
208
|
+
converted_partition_values_for_lookup.append(
|
209
|
+
value
|
210
|
+
) # Keep as is for now
|
211
|
+
else:
|
212
|
+
converted_partition_values_for_lookup.append(value)
|
213
|
+
|
214
|
+
source_partition: Partition = metastore.get_partition(
|
215
|
+
stream_locator=source_table_stream.locator,
|
216
|
+
partition_values=converted_partition_values_for_lookup,
|
217
|
+
partition_scheme_id=source_table_stream.partition_scheme.id,
|
253
218
|
**ds_mock_kwargs,
|
254
219
|
)
|
255
|
-
rebased_partition: Partition =
|
256
|
-
rebased_table_stream.locator,
|
257
|
-
|
220
|
+
rebased_partition: Partition = metastore.get_partition(
|
221
|
+
stream_locator=rebased_table_stream.locator,
|
222
|
+
partition_values=converted_partition_values_for_lookup,
|
223
|
+
partition_scheme_id=rebased_table_stream.partition_scheme.id,
|
258
224
|
**ds_mock_kwargs,
|
259
225
|
)
|
226
|
+
all_column_names = metastore.get_table_version_column_names(
|
227
|
+
rebased_table_stream.locator.table_locator.namespace,
|
228
|
+
rebased_table_stream.locator.table_locator.table_name,
|
229
|
+
rebased_table_stream.locator.table_version_locator.table_version,
|
230
|
+
catalog=ds_mock_kwargs.get("inner"),
|
231
|
+
)
|
260
232
|
total_cpus = DEFAULT_NUM_WORKERS * DEFAULT_WORKER_INSTANCE_CPUS
|
261
233
|
pgm = None
|
262
234
|
if create_placement_group_param:
|
@@ -266,10 +238,10 @@ def test_compact_partition_rebase_multiple_rounds_same_source_and_destination(
|
|
266
238
|
with tempfile.TemporaryDirectory() as test_dir:
|
267
239
|
compact_partition_params = CompactPartitionParams.of(
|
268
240
|
{
|
269
|
-
"
|
241
|
+
"catalog": ds_mock_kwargs.get("inner"),
|
270
242
|
"compacted_file_content_type": ContentType.PARQUET,
|
271
243
|
"dd_max_parallelism_ratio": 1.0,
|
272
|
-
"deltacat_storage":
|
244
|
+
"deltacat_storage": metastore,
|
273
245
|
"deltacat_storage_kwargs": ds_mock_kwargs,
|
274
246
|
"destination_partition_locator": rebased_partition.locator,
|
275
247
|
"hash_bucket_count": hash_bucket_count_param,
|
@@ -281,11 +253,11 @@ def test_compact_partition_rebase_multiple_rounds_same_source_and_destination(
|
|
281
253
|
"object_store": FileObjectStore(test_dir),
|
282
254
|
"pg_config": pgm,
|
283
255
|
"primary_keys": primary_keys,
|
256
|
+
"all_column_names": all_column_names,
|
284
257
|
"read_kwargs_provider": read_kwargs_provider_param,
|
285
258
|
"rebase_source_partition_locator": source_partition.locator,
|
286
259
|
"rebase_source_partition_high_watermark": rebased_partition.stream_position,
|
287
260
|
"records_per_compacted_file": records_per_compacted_file_param,
|
288
|
-
"s3_client_kwargs": {},
|
289
261
|
"source_partition_locator": rebased_partition.locator,
|
290
262
|
"sort_keys": sort_keys if sort_keys else None,
|
291
263
|
"num_rounds": num_rounds_param,
|
@@ -308,23 +280,25 @@ def test_compact_partition_rebase_multiple_rounds_same_source_and_destination(
|
|
308
280
|
object_store_clear_spy = mocker.spy(FileObjectStore, "clear")
|
309
281
|
|
310
282
|
# execute
|
311
|
-
|
283
|
+
benchmark(compact_partition_func, compact_partition_params)
|
312
284
|
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
audit_bucket, audit_key = RoundCompletionInfo.get_audit_bucket_name_and_key(
|
317
|
-
round_completion_info.compaction_audit_url
|
285
|
+
# Get RoundCompletionInfo from the compacted partition
|
286
|
+
round_completion_info: RoundCompletionInfo = get_rci_from_partition(
|
287
|
+
rebased_partition.locator, metastore, catalog=ds_mock_kwargs.get("inner")
|
318
288
|
)
|
319
289
|
|
320
|
-
|
321
|
-
|
290
|
+
# Get catalog root for audit file resolution
|
291
|
+
catalog = ds_mock_kwargs.get("inner")
|
292
|
+
catalog_root = catalog.root
|
293
|
+
|
294
|
+
compaction_audit_obj: Dict[str, Any] = read_audit_file(
|
295
|
+
round_completion_info.compaction_audit_url, catalog_root
|
322
296
|
)
|
323
297
|
compaction_audit: CompactionSessionAuditInfo = CompactionSessionAuditInfo(
|
324
298
|
**compaction_audit_obj
|
325
299
|
)
|
326
300
|
|
327
|
-
# assert if
|
301
|
+
# assert if RCI covers all files
|
328
302
|
# multiple rounds feature is only supported in V2 compactor
|
329
303
|
previous_end = None
|
330
304
|
for start, end in round_completion_info.hb_index_to_entry_range.values():
|
@@ -338,16 +312,24 @@ def test_compact_partition_rebase_multiple_rounds_same_source_and_destination(
|
|
338
312
|
assert (
|
339
313
|
execute_compaction_result_spy.call_args.args[-1] is False
|
340
314
|
), "Table version erroneously marked as in-place compacted!"
|
341
|
-
compacted_delta_locator: DeltaLocator =
|
342
|
-
|
315
|
+
compacted_delta_locator: DeltaLocator = (
|
316
|
+
get_compacted_delta_locator_from_partition(
|
317
|
+
rebased_partition.locator,
|
318
|
+
metastore,
|
319
|
+
catalog=ds_mock_kwargs.get("inner"),
|
320
|
+
)
|
343
321
|
)
|
344
|
-
tables =
|
322
|
+
tables = metastore.download_delta(
|
345
323
|
compacted_delta_locator, storage_type=StorageType.LOCAL, **ds_mock_kwargs
|
346
324
|
)
|
347
325
|
actual_rebase_compacted_table = pa.concat_tables(tables)
|
348
326
|
# if no primary key is specified then sort by sort_key for consistent assertion
|
349
327
|
sorting_cols: List[Any] = (
|
350
|
-
[(val, "ascending") for val in primary_keys]
|
328
|
+
[(val, "ascending") for val in primary_keys]
|
329
|
+
if primary_keys
|
330
|
+
else [pa_key for key in sort_keys for pa_key in key.arrow]
|
331
|
+
if sort_keys
|
332
|
+
else []
|
351
333
|
)
|
352
334
|
rebase_expected_compact_partition_result = (
|
353
335
|
rebase_expected_compact_partition_result.combine_chunks().sort_by(
|