deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +96 -17
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2435 -279
- deltacat/catalog/model/catalog.py +154 -77
- deltacat/catalog/model/properties.py +63 -22
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +25 -12
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +45 -2
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +1 -3
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/indexer/indexer.py +2 -2
- deltacat/examples/indexer/job_runner.py +1 -2
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/io/reader/deltacat_read_api.py +1 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +224 -14
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +823 -36
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +57 -16
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- deltacat-2.0.0b12.dist-info/RECORD +439 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/daft/daft_scan.py +0 -115
- deltacat/daft/model.py +0 -258
- deltacat/daft/translator.py +0 -126
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b10.dist-info/METADATA +0 -68
- deltacat-2.0.0b10.dist-info/RECORD +0 -381
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
- /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
- /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
- /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -1,29 +1,29 @@
|
|
1
|
-
import ray
|
2
|
-
from moto import mock_s3
|
3
|
-
import pytest
|
4
|
-
import os
|
5
1
|
import logging
|
6
|
-
import
|
7
|
-
|
8
|
-
|
2
|
+
from typing import Any, Dict, List, Optional, Set, Tuple, Callable
|
3
|
+
import uuid
|
4
|
+
import pytest
|
5
|
+
|
9
6
|
import pyarrow as pa
|
7
|
+
import ray
|
8
|
+
|
10
9
|
from pytest_benchmark.fixture import BenchmarkFixture
|
11
10
|
from deltacat.types.media import StorageType
|
12
11
|
|
13
12
|
from deltacat.tests.compute.test_util_common import (
|
14
|
-
|
13
|
+
get_rci_from_partition,
|
14
|
+
read_audit_file,
|
15
|
+
PartitionKeyType,
|
15
16
|
)
|
16
|
-
from deltacat.compute.
|
17
|
-
|
18
|
-
|
19
|
-
create_src_w_deltas_destination_plus_destination,
|
20
|
-
add_late_deltas_to_partition,
|
17
|
+
from deltacat.tests.compute.test_util_common import (
|
18
|
+
add_late_deltas_to_partition_main,
|
19
|
+
create_src_w_deltas_destination_plus_destination_main,
|
21
20
|
)
|
21
|
+
from deltacat.compute.compactor.model.compactor_version import CompactorVersion
|
22
|
+
|
22
23
|
from deltacat.tests.compute.compact_partition_test_cases import (
|
23
24
|
INCREMENTAL_TEST_CASES,
|
24
25
|
)
|
25
26
|
from deltacat.tests.compute.test_util_constant import (
|
26
|
-
TEST_S3_RCF_BUCKET_NAME,
|
27
27
|
DEFAULT_NUM_WORKERS,
|
28
28
|
DEFAULT_WORKER_INSTANCE_CPUS,
|
29
29
|
)
|
@@ -37,6 +37,7 @@ from deltacat.storage import (
|
|
37
37
|
DeltaLocator,
|
38
38
|
Partition,
|
39
39
|
PartitionLocator,
|
40
|
+
metastore,
|
40
41
|
)
|
41
42
|
from deltacat.types.media import ContentType
|
42
43
|
from deltacat.compute.compactor.model.compaction_session_audit_info import (
|
@@ -65,34 +66,29 @@ def setup_ray_cluster():
|
|
65
66
|
ray.shutdown()
|
66
67
|
|
67
68
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
os.environ["AWS_SECRET_ACCESS_ID"] = "testing"
|
72
|
-
os.environ["AWS_SECURITY_TOKEN"] = "testing"
|
73
|
-
os.environ["AWS_SESSION_TOKEN"] = "testing"
|
74
|
-
os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
|
75
|
-
yield
|
76
|
-
|
69
|
+
"""
|
70
|
+
FUNCTION scoped fixtures
|
71
|
+
"""
|
77
72
|
|
78
|
-
@pytest.fixture(scope="module")
|
79
|
-
def s3_resource():
|
80
|
-
with mock_s3():
|
81
|
-
yield boto3.resource("s3")
|
82
73
|
|
74
|
+
@pytest.fixture(autouse=True, scope="function")
|
75
|
+
def enable_bucketing_spec_validation(monkeypatch):
|
76
|
+
"""
|
77
|
+
Enable the bucketing spec validation for all tests.
|
78
|
+
This will help catch hash bucket drift in testing.
|
79
|
+
"""
|
80
|
+
import deltacat.compute.compactor_v2.steps.merge
|
83
81
|
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
Bucket=TEST_S3_RCF_BUCKET_NAME,
|
82
|
+
monkeypatch.setattr(
|
83
|
+
deltacat.compute.compactor_v2.steps.merge,
|
84
|
+
"BUCKETING_SPEC_COMPLIANCE_PROFILE",
|
85
|
+
"ASSERT",
|
89
86
|
)
|
90
|
-
yield
|
91
87
|
|
92
88
|
|
93
|
-
""
|
94
|
-
|
95
|
-
|
89
|
+
@pytest.fixture(scope="function")
|
90
|
+
def temp_dir(tmp_path):
|
91
|
+
return str(tmp_path)
|
96
92
|
|
97
93
|
|
98
94
|
@pytest.mark.parametrize(
|
@@ -168,9 +164,8 @@ FUNCTION scoped fixtures
|
|
168
164
|
],
|
169
165
|
ids=[test_name for test_name in INCREMENTAL_TEST_CASES],
|
170
166
|
)
|
171
|
-
def
|
172
|
-
|
173
|
-
local_deltacat_storage_kwargs: Dict[str, Any],
|
167
|
+
def test_compact_partition_incremental_main(
|
168
|
+
main_deltacat_storage_kwargs: Dict[str, Any],
|
174
169
|
test_name: str,
|
175
170
|
primary_keys: Set[str],
|
176
171
|
sort_keys: Dict[str, str],
|
@@ -194,9 +189,16 @@ def test_compact_partition_incremental(
|
|
194
189
|
compact_partition_func: Callable,
|
195
190
|
benchmark: BenchmarkFixture,
|
196
191
|
):
|
197
|
-
|
192
|
+
# Skip in-place compaction tests for main storage as it's not yet implemented
|
193
|
+
if is_inplace:
|
194
|
+
pytest.skip(
|
195
|
+
"In-place compaction not yet implemented in main storage (delta prepending limitation)"
|
196
|
+
)
|
197
|
+
|
198
|
+
ds_mock_kwargs: Dict[str, Any] = main_deltacat_storage_kwargs
|
198
199
|
|
199
|
-
|
200
|
+
# Extract catalog from storage kwargs
|
201
|
+
catalog = ds_mock_kwargs.get("inner")
|
200
202
|
|
201
203
|
# setup
|
202
204
|
partition_keys = partition_keys_param
|
@@ -207,7 +209,7 @@ def test_compact_partition_incremental(
|
|
207
209
|
source_table_namespace,
|
208
210
|
source_table_name,
|
209
211
|
source_table_version,
|
210
|
-
) =
|
212
|
+
) = create_src_w_deltas_destination_plus_destination_main(
|
211
213
|
sort_keys,
|
212
214
|
partition_keys,
|
213
215
|
input_deltas,
|
@@ -216,15 +218,38 @@ def test_compact_partition_incremental(
|
|
216
218
|
ds_mock_kwargs,
|
217
219
|
is_inplace,
|
218
220
|
)
|
219
|
-
|
221
|
+
|
222
|
+
# Convert partition values to correct types for get_partition call
|
223
|
+
converted_partition_values = []
|
224
|
+
if partition_values_param and partition_keys:
|
225
|
+
# partition_values_param is a single string, but we need to handle it as a list
|
226
|
+
partition_values_list = (
|
227
|
+
[partition_values_param]
|
228
|
+
if isinstance(partition_values_param, str)
|
229
|
+
else partition_values_param
|
230
|
+
)
|
231
|
+
for i, (value, pk) in enumerate(zip(partition_values_list, partition_keys)):
|
232
|
+
if pk.key_type == PartitionKeyType.INT:
|
233
|
+
converted_partition_values.append(int(value))
|
234
|
+
else:
|
235
|
+
converted_partition_values.append(value)
|
236
|
+
else:
|
237
|
+
converted_partition_values = (
|
238
|
+
[partition_values_param] if partition_values_param else []
|
239
|
+
)
|
240
|
+
|
241
|
+
source_partition: Partition = metastore.get_partition(
|
220
242
|
source_table_stream.locator,
|
221
|
-
|
243
|
+
converted_partition_values,
|
244
|
+
partition_scheme_id="default_partition_scheme" if partition_keys else None,
|
222
245
|
**ds_mock_kwargs,
|
223
246
|
)
|
247
|
+
# Generate a destination partition ID based on the source partition
|
248
|
+
destination_partition_id = str(uuid.uuid4())
|
224
249
|
destination_partition_locator: PartitionLocator = PartitionLocator.of(
|
225
250
|
destination_table_stream.locator,
|
226
|
-
|
227
|
-
|
251
|
+
converted_partition_values,
|
252
|
+
destination_partition_id,
|
228
253
|
)
|
229
254
|
num_workers, worker_instance_cpu = DEFAULT_NUM_WORKERS, DEFAULT_WORKER_INSTANCE_CPUS
|
230
255
|
total_cpus: int = num_workers * worker_instance_cpu
|
@@ -235,12 +260,18 @@ def test_compact_partition_incremental(
|
|
235
260
|
if create_placement_group_param
|
236
261
|
else None
|
237
262
|
)
|
263
|
+
all_column_names = metastore.get_table_version_column_names(
|
264
|
+
destination_table_stream.locator.table_locator.namespace,
|
265
|
+
destination_table_stream.locator.table_locator.table_name,
|
266
|
+
destination_table_stream.locator.table_version_locator.table_version,
|
267
|
+
catalog=catalog,
|
268
|
+
)
|
238
269
|
compact_partition_params = CompactPartitionParams.of(
|
239
270
|
{
|
240
|
-
"
|
271
|
+
"catalog": catalog,
|
241
272
|
"compacted_file_content_type": ContentType.PARQUET,
|
242
273
|
"dd_max_parallelism_ratio": 1.0,
|
243
|
-
"deltacat_storage":
|
274
|
+
"deltacat_storage": metastore,
|
244
275
|
"deltacat_storage_kwargs": ds_mock_kwargs,
|
245
276
|
"destination_partition_locator": destination_partition_locator,
|
246
277
|
"drop_duplicates": drop_duplicates_param,
|
@@ -249,11 +280,11 @@ def test_compact_partition_incremental(
|
|
249
280
|
"list_deltas_kwargs": {**ds_mock_kwargs, **{"equivalent_table_types": []}},
|
250
281
|
"pg_config": pgm,
|
251
282
|
"primary_keys": primary_keys,
|
283
|
+
"all_column_names": all_column_names,
|
252
284
|
"read_kwargs_provider": read_kwargs_provider_param,
|
253
285
|
"rebase_source_partition_locator": None,
|
254
286
|
"rebase_source_partition_high_watermark": None,
|
255
287
|
"records_per_compacted_file": records_per_compacted_file_param,
|
256
|
-
"s3_client_kwargs": {},
|
257
288
|
"source_partition_locator": source_partition.locator,
|
258
289
|
"sort_keys": sort_keys if sort_keys else None,
|
259
290
|
}
|
@@ -264,18 +295,17 @@ def test_compact_partition_incremental(
|
|
264
295
|
"""
|
265
296
|
This callable runs right before invoking the benchmark target function (compaction).
|
266
297
|
This is needed as the benchmark module will invoke the target function multiple times
|
267
|
-
in a single test run, which can lead to non-idempotent behavior if
|
298
|
+
in a single test run, which can lead to non-idempotent behavior if RCIs are generated.
|
268
299
|
|
269
300
|
Returns: args, kwargs
|
270
301
|
"""
|
271
|
-
s3_resource.Bucket(TEST_S3_RCF_BUCKET_NAME).objects.all().delete()
|
272
302
|
return (compact_partition_params,), {}
|
273
303
|
|
274
304
|
if add_late_deltas:
|
275
305
|
# NOTE: In the case of in-place compaction it is plausible that new deltas may be added to the source partition during compaction
|
276
306
|
# (so that the source_partitition.stream_position > last_stream_position_to_compact).
|
277
307
|
# This parameter helps simulate the case to check that no late deltas are dropped even when the compacted partition is created.
|
278
|
-
latest_delta, _ =
|
308
|
+
latest_delta, _ = add_late_deltas_to_partition_main(
|
279
309
|
add_late_deltas, source_partition, ds_mock_kwargs
|
280
310
|
)
|
281
311
|
if expected_terminal_exception:
|
@@ -283,27 +313,28 @@ def test_compact_partition_incremental(
|
|
283
313
|
compact_partition_func(compact_partition_params)
|
284
314
|
assert expected_terminal_exception_message in str(exc_info.value)
|
285
315
|
return
|
286
|
-
|
287
|
-
compact_partition_func, setup=_incremental_compaction_setup
|
288
|
-
)
|
316
|
+
benchmark.pedantic(compact_partition_func, setup=_incremental_compaction_setup)
|
289
317
|
|
290
|
-
# validate
|
291
|
-
round_completion_info: RoundCompletionInfo =
|
318
|
+
# validate - get RoundCompletionInfo from the compacted partition
|
319
|
+
round_completion_info: RoundCompletionInfo = get_rci_from_partition(
|
320
|
+
destination_partition_locator, metastore, catalog=catalog
|
321
|
+
)
|
292
322
|
compacted_delta_locator: DeltaLocator = (
|
293
323
|
round_completion_info.compacted_delta_locator
|
294
324
|
)
|
295
|
-
audit_bucket, audit_key = RoundCompletionInfo.get_audit_bucket_name_and_key(
|
296
|
-
round_completion_info.compaction_audit_url
|
297
|
-
)
|
298
325
|
|
299
|
-
|
300
|
-
|
326
|
+
# Get catalog root for audit file resolution
|
327
|
+
catalog_root = catalog.root
|
328
|
+
|
329
|
+
compaction_audit_obj: Dict[str, Any] = read_audit_file(
|
330
|
+
round_completion_info.compaction_audit_url, catalog_root
|
301
331
|
)
|
332
|
+
|
302
333
|
compaction_audit: CompactionSessionAuditInfo = CompactionSessionAuditInfo(
|
303
334
|
**compaction_audit_obj
|
304
335
|
)
|
305
336
|
|
306
|
-
# assert if
|
337
|
+
# assert if RCI covers all files
|
307
338
|
if compactor_version != CompactorVersion.V1.value:
|
308
339
|
previous_end = None
|
309
340
|
for start, end in round_completion_info.hb_index_to_entry_range.values():
|
@@ -313,7 +344,7 @@ def test_compact_partition_incremental(
|
|
313
344
|
previous_end == round_completion_info.compacted_pyarrow_write_result.files
|
314
345
|
)
|
315
346
|
|
316
|
-
tables =
|
347
|
+
tables = metastore.download_delta(
|
317
348
|
compacted_delta_locator, storage_type=StorageType.LOCAL, **ds_mock_kwargs
|
318
349
|
)
|
319
350
|
actual_compacted_table = pa.concat_tables(tables)
|
@@ -347,25 +378,27 @@ def test_compact_partition_incremental(
|
|
347
378
|
== destination_partition_locator.partition_values
|
348
379
|
and source_partition.locator.stream_id
|
349
380
|
== destination_partition_locator.stream_id
|
350
|
-
), f"The source partition: {source_partition.locator
|
381
|
+
), f"The source partition: {source_partition.locator} should match the destination partition: {destination_partition_locator}"
|
351
382
|
assert (
|
352
383
|
compacted_delta_locator.stream_id == source_partition.locator.stream_id
|
353
384
|
), "The compacted delta should be in the same stream as the source"
|
354
|
-
source_partition: Partition =
|
385
|
+
source_partition: Partition = metastore.get_partition(
|
355
386
|
source_table_stream.locator,
|
356
|
-
|
387
|
+
converted_partition_values,
|
388
|
+
partition_scheme_id="default_partition_scheme" if partition_keys else None,
|
357
389
|
**ds_mock_kwargs,
|
358
390
|
)
|
359
|
-
compacted_partition: Optional[Partition] =
|
391
|
+
compacted_partition: Optional[Partition] = metastore.get_partition(
|
360
392
|
compacted_delta_locator.stream_locator,
|
361
|
-
|
393
|
+
converted_partition_values,
|
394
|
+
partition_scheme_id="default_partition_scheme" if partition_keys else None,
|
362
395
|
**ds_mock_kwargs,
|
363
396
|
)
|
364
397
|
assert (
|
365
398
|
compacted_partition.state == source_partition.state == CommitState.COMMITTED
|
366
399
|
), f"The compacted/source table partition should be in {CommitState.COMMITTED} state and not {CommitState.DEPRECATED}"
|
367
400
|
if add_late_deltas:
|
368
|
-
compacted_partition_deltas: List[Delta] =
|
401
|
+
compacted_partition_deltas: List[Delta] = metastore.list_partition_deltas(
|
369
402
|
partition_like=compacted_partition,
|
370
403
|
ascending_order=False,
|
371
404
|
**ds_mock_kwargs,
|
@@ -1,43 +1,38 @@
|
|
1
|
-
import
|
2
|
-
import
|
3
|
-
from moto import mock_s3
|
1
|
+
import tempfile
|
2
|
+
from typing import Any, Dict, List, Optional, Set, Callable
|
4
3
|
import pytest
|
5
|
-
import boto3
|
6
|
-
from boto3.resources.base import ServiceResource
|
7
4
|
import pyarrow as pa
|
5
|
+
import ray
|
6
|
+
|
8
7
|
from deltacat.io.file_object_store import FileObjectStore
|
9
8
|
from pytest_benchmark.fixture import BenchmarkFixture
|
10
|
-
import tempfile
|
11
9
|
|
12
10
|
from deltacat.tests.compute.test_util_constant import (
|
13
|
-
TEST_S3_RCF_BUCKET_NAME,
|
14
11
|
DEFAULT_NUM_WORKERS,
|
15
12
|
DEFAULT_WORKER_INSTANCE_CPUS,
|
16
13
|
)
|
17
14
|
from deltacat.tests.compute.test_util_common import (
|
18
|
-
|
15
|
+
get_rci_from_partition,
|
16
|
+
read_audit_file,
|
17
|
+
PartitionKey,
|
18
|
+
get_compacted_delta_locator_from_partition,
|
19
19
|
)
|
20
|
-
from deltacat.tests.test_utils.utils import read_s3_contents
|
21
|
-
from deltacat.compute.compactor.model.compactor_version import CompactorVersion
|
22
20
|
from deltacat.tests.compute.test_util_common import (
|
23
|
-
|
21
|
+
multiple_rounds_create_src_w_deltas_destination_rebase_w_deltas_strategy_main,
|
24
22
|
)
|
23
|
+
|
24
|
+
from deltacat.compute.compactor.model.compactor_version import CompactorVersion
|
25
25
|
from deltacat.compute.compactor.model.compaction_session_audit_info import (
|
26
26
|
CompactionSessionAuditInfo,
|
27
27
|
)
|
28
|
-
from deltacat.tests.compute.test_util_create_table_deltas_repo import (
|
29
|
-
multiple_rounds_create_src_w_deltas_destination_rebase_w_deltas_strategy,
|
30
|
-
)
|
31
28
|
from deltacat.tests.compute.compact_partition_multiple_rounds_test_cases import (
|
32
29
|
MULTIPLE_ROUNDS_TEST_CASES,
|
33
30
|
)
|
34
|
-
from
|
35
|
-
from deltacat.types.media import StorageType
|
31
|
+
from deltacat.types.media import StorageType, ContentType
|
36
32
|
from deltacat.storage import (
|
37
33
|
DeltaLocator,
|
38
34
|
Partition,
|
39
35
|
)
|
40
|
-
from deltacat.types.media import ContentType
|
41
36
|
from deltacat.compute.compactor.model.compact_partition_params import (
|
42
37
|
CompactPartitionParams,
|
43
38
|
)
|
@@ -47,6 +42,8 @@ from deltacat.compute.compactor import (
|
|
47
42
|
from deltacat.utils.placement import (
|
48
43
|
PlacementGroupManager,
|
49
44
|
)
|
45
|
+
from deltacat.storage import metastore
|
46
|
+
|
50
47
|
|
51
48
|
"""
|
52
49
|
MODULE scoped fixtures
|
@@ -60,29 +57,24 @@ def setup_ray_cluster():
|
|
60
57
|
ray.shutdown()
|
61
58
|
|
62
59
|
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
os.environ["AWS_SECRET_ACCESS_ID"] = "testing"
|
67
|
-
os.environ["AWS_SECURITY_TOKEN"] = "testing"
|
68
|
-
os.environ["AWS_SESSION_TOKEN"] = "testing"
|
69
|
-
os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
|
70
|
-
yield
|
71
|
-
|
60
|
+
"""
|
61
|
+
FUNCTION scoped fixtures
|
62
|
+
"""
|
72
63
|
|
73
|
-
@pytest.fixture(scope="module")
|
74
|
-
def s3_resource(mock_aws_credential):
|
75
|
-
with mock_s3():
|
76
|
-
yield boto3.resource("s3")
|
77
64
|
|
65
|
+
@pytest.fixture(autouse=True, scope="function")
|
66
|
+
def enable_bucketing_spec_validation(monkeypatch):
|
67
|
+
"""
|
68
|
+
Enable the bucketing spec validation for all tests.
|
69
|
+
This will help catch hash bucket drift in testing.
|
70
|
+
"""
|
71
|
+
import deltacat.compute.compactor_v2.steps.merge
|
78
72
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
Bucket=TEST_S3_RCF_BUCKET_NAME,
|
73
|
+
monkeypatch.setattr(
|
74
|
+
deltacat.compute.compactor_v2.steps.merge,
|
75
|
+
"BUCKETING_SPEC_COMPLIANCE_PROFILE",
|
76
|
+
"ASSERT",
|
84
77
|
)
|
85
|
-
yield
|
86
78
|
|
87
79
|
|
88
80
|
@pytest.mark.parametrize(
|
@@ -155,14 +147,13 @@ def setup_compaction_artifacts_s3_bucket(s3_resource: ServiceResource):
|
|
155
147
|
],
|
156
148
|
ids=[test_name for test_name in MULTIPLE_ROUNDS_TEST_CASES],
|
157
149
|
)
|
158
|
-
def
|
150
|
+
def test_compact_partition_rebase_multiple_rounds_same_source_and_destination_main(
|
159
151
|
mocker,
|
160
|
-
|
161
|
-
local_deltacat_storage_kwargs: Dict[str, Any],
|
152
|
+
main_deltacat_storage_kwargs: Dict[str, Any],
|
162
153
|
test_name: str,
|
163
154
|
primary_keys: Set[str],
|
164
155
|
sort_keys: List[Optional[Any]],
|
165
|
-
partition_keys_param: Optional[List[
|
156
|
+
partition_keys_param: Optional[List[PartitionKey]],
|
166
157
|
partition_values_param: List[Optional[str]],
|
167
158
|
input_deltas_param: List[pa.Array],
|
168
159
|
expected_terminal_compact_partition_result: pa.Table,
|
@@ -181,37 +172,63 @@ def test_compact_partition_rebase_multiple_rounds_same_source_and_destination(
|
|
181
172
|
num_rounds_param: int,
|
182
173
|
benchmark: BenchmarkFixture,
|
183
174
|
):
|
184
|
-
|
185
|
-
|
186
|
-
ds_mock_kwargs = local_deltacat_storage_kwargs
|
175
|
+
ds_mock_kwargs = main_deltacat_storage_kwargs
|
187
176
|
"""
|
188
177
|
This test tests different multi-round compaction rebase configurations,
|
189
|
-
as specified in compact_partition_multiple_rounds_test_cases.py
|
178
|
+
as specified in compact_partition_multiple_rounds_test_cases.py.
|
190
179
|
These tests do not test multi-round compaction backfill, which is
|
191
180
|
currently unsupported.
|
181
|
+
|
182
|
+
This version uses the main metastore implementation instead of local storage.
|
192
183
|
"""
|
193
184
|
(
|
194
185
|
source_table_stream,
|
195
186
|
_,
|
196
187
|
rebased_table_stream,
|
197
188
|
_,
|
198
|
-
) =
|
189
|
+
) = multiple_rounds_create_src_w_deltas_destination_rebase_w_deltas_strategy_main(
|
199
190
|
sort_keys,
|
200
191
|
partition_keys_param,
|
201
192
|
input_deltas_param,
|
202
193
|
partition_values_param,
|
203
194
|
ds_mock_kwargs,
|
204
195
|
)
|
205
|
-
|
206
|
-
|
207
|
-
|
196
|
+
# Convert partition values for partition lookup (same as in the helper function)
|
197
|
+
converted_partition_values_for_lookup = partition_values_param
|
198
|
+
if partition_values_param and partition_keys_param:
|
199
|
+
converted_partition_values_for_lookup = []
|
200
|
+
for i, (value, key) in enumerate(
|
201
|
+
zip(partition_values_param, partition_keys_param)
|
202
|
+
):
|
203
|
+
if key.key_type == "int":
|
204
|
+
converted_partition_values_for_lookup.append(int(value))
|
205
|
+
elif key.key_type == "string":
|
206
|
+
converted_partition_values_for_lookup.append(str(value))
|
207
|
+
elif key.key_type == "timestamp":
|
208
|
+
converted_partition_values_for_lookup.append(
|
209
|
+
value
|
210
|
+
) # Keep as is for now
|
211
|
+
else:
|
212
|
+
converted_partition_values_for_lookup.append(value)
|
213
|
+
|
214
|
+
source_partition: Partition = metastore.get_partition(
|
215
|
+
stream_locator=source_table_stream.locator,
|
216
|
+
partition_values=converted_partition_values_for_lookup,
|
217
|
+
partition_scheme_id=source_table_stream.partition_scheme.id,
|
208
218
|
**ds_mock_kwargs,
|
209
219
|
)
|
210
|
-
rebased_partition: Partition =
|
211
|
-
rebased_table_stream.locator,
|
212
|
-
|
220
|
+
rebased_partition: Partition = metastore.get_partition(
|
221
|
+
stream_locator=rebased_table_stream.locator,
|
222
|
+
partition_values=converted_partition_values_for_lookup,
|
223
|
+
partition_scheme_id=rebased_table_stream.partition_scheme.id,
|
213
224
|
**ds_mock_kwargs,
|
214
225
|
)
|
226
|
+
all_column_names = metastore.get_table_version_column_names(
|
227
|
+
rebased_table_stream.locator.table_locator.namespace,
|
228
|
+
rebased_table_stream.locator.table_locator.table_name,
|
229
|
+
rebased_table_stream.locator.table_version_locator.table_version,
|
230
|
+
catalog=ds_mock_kwargs.get("inner"),
|
231
|
+
)
|
215
232
|
total_cpus = DEFAULT_NUM_WORKERS * DEFAULT_WORKER_INSTANCE_CPUS
|
216
233
|
pgm = None
|
217
234
|
if create_placement_group_param:
|
@@ -221,10 +238,10 @@ def test_compact_partition_rebase_multiple_rounds_same_source_and_destination(
|
|
221
238
|
with tempfile.TemporaryDirectory() as test_dir:
|
222
239
|
compact_partition_params = CompactPartitionParams.of(
|
223
240
|
{
|
224
|
-
"
|
241
|
+
"catalog": ds_mock_kwargs.get("inner"),
|
225
242
|
"compacted_file_content_type": ContentType.PARQUET,
|
226
243
|
"dd_max_parallelism_ratio": 1.0,
|
227
|
-
"deltacat_storage":
|
244
|
+
"deltacat_storage": metastore,
|
228
245
|
"deltacat_storage_kwargs": ds_mock_kwargs,
|
229
246
|
"destination_partition_locator": rebased_partition.locator,
|
230
247
|
"hash_bucket_count": hash_bucket_count_param,
|
@@ -236,11 +253,11 @@ def test_compact_partition_rebase_multiple_rounds_same_source_and_destination(
|
|
236
253
|
"object_store": FileObjectStore(test_dir),
|
237
254
|
"pg_config": pgm,
|
238
255
|
"primary_keys": primary_keys,
|
256
|
+
"all_column_names": all_column_names,
|
239
257
|
"read_kwargs_provider": read_kwargs_provider_param,
|
240
258
|
"rebase_source_partition_locator": source_partition.locator,
|
241
259
|
"rebase_source_partition_high_watermark": rebased_partition.stream_position,
|
242
260
|
"records_per_compacted_file": records_per_compacted_file_param,
|
243
|
-
"s3_client_kwargs": {},
|
244
261
|
"source_partition_locator": rebased_partition.locator,
|
245
262
|
"sort_keys": sort_keys if sort_keys else None,
|
246
263
|
"num_rounds": num_rounds_param,
|
@@ -263,23 +280,25 @@ def test_compact_partition_rebase_multiple_rounds_same_source_and_destination(
|
|
263
280
|
object_store_clear_spy = mocker.spy(FileObjectStore, "clear")
|
264
281
|
|
265
282
|
# execute
|
266
|
-
|
283
|
+
benchmark(compact_partition_func, compact_partition_params)
|
267
284
|
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
audit_bucket, audit_key = RoundCompletionInfo.get_audit_bucket_name_and_key(
|
272
|
-
round_completion_info.compaction_audit_url
|
285
|
+
# Get RoundCompletionInfo from the compacted partition
|
286
|
+
round_completion_info: RoundCompletionInfo = get_rci_from_partition(
|
287
|
+
rebased_partition.locator, metastore, catalog=ds_mock_kwargs.get("inner")
|
273
288
|
)
|
274
289
|
|
275
|
-
|
276
|
-
|
290
|
+
# Get catalog root for audit file resolution
|
291
|
+
catalog = ds_mock_kwargs.get("inner")
|
292
|
+
catalog_root = catalog.root
|
293
|
+
|
294
|
+
compaction_audit_obj: Dict[str, Any] = read_audit_file(
|
295
|
+
round_completion_info.compaction_audit_url, catalog_root
|
277
296
|
)
|
278
297
|
compaction_audit: CompactionSessionAuditInfo = CompactionSessionAuditInfo(
|
279
298
|
**compaction_audit_obj
|
280
299
|
)
|
281
300
|
|
282
|
-
# assert if
|
301
|
+
# assert if RCI covers all files
|
283
302
|
# multiple rounds feature is only supported in V2 compactor
|
284
303
|
previous_end = None
|
285
304
|
for start, end in round_completion_info.hb_index_to_entry_range.values():
|
@@ -293,10 +312,14 @@ def test_compact_partition_rebase_multiple_rounds_same_source_and_destination(
|
|
293
312
|
assert (
|
294
313
|
execute_compaction_result_spy.call_args.args[-1] is False
|
295
314
|
), "Table version erroneously marked as in-place compacted!"
|
296
|
-
compacted_delta_locator: DeltaLocator =
|
297
|
-
|
315
|
+
compacted_delta_locator: DeltaLocator = (
|
316
|
+
get_compacted_delta_locator_from_partition(
|
317
|
+
rebased_partition.locator,
|
318
|
+
metastore,
|
319
|
+
catalog=ds_mock_kwargs.get("inner"),
|
320
|
+
)
|
298
321
|
)
|
299
|
-
tables =
|
322
|
+
tables = metastore.download_delta(
|
300
323
|
compacted_delta_locator, storage_type=StorageType.LOCAL, **ds_mock_kwargs
|
301
324
|
)
|
302
325
|
actual_rebase_compacted_table = pa.concat_tables(tables)
|
@@ -1,6 +1,7 @@
|
|
1
1
|
import json
|
2
|
-
|
2
|
+
import tempfile
|
3
3
|
import unittest
|
4
|
+
import uuid
|
4
5
|
|
5
6
|
|
6
7
|
class TestCompactPartitionParams(unittest.TestCase):
|
@@ -8,9 +9,14 @@ class TestCompactPartitionParams(unittest.TestCase):
|
|
8
9
|
def setUpClass(cls):
|
9
10
|
from deltacat.types.media import ContentType
|
10
11
|
from deltacat.utils.metrics import MetricsConfig, MetricsTarget
|
12
|
+
from deltacat.catalog import CatalogProperties
|
13
|
+
|
14
|
+
# Create a temporary catalog for testing
|
15
|
+
tmpdir = tempfile.mkdtemp()
|
16
|
+
cls.test_catalog = CatalogProperties(root=tmpdir)
|
11
17
|
|
12
18
|
cls.VALID_COMPACT_PARTITION_PARAMS = {
|
13
|
-
"
|
19
|
+
"catalog": cls.test_catalog,
|
14
20
|
"compacted_file_content_type": ContentType.PARQUET,
|
15
21
|
"deltacat_storage": "foobar",
|
16
22
|
"destination_partition_locator": {
|
@@ -26,12 +32,13 @@ class TestCompactPartitionParams(unittest.TestCase):
|
|
26
32
|
"format": "fooType",
|
27
33
|
},
|
28
34
|
"partitionValues": [],
|
29
|
-
"partitionId":
|
35
|
+
"partitionId": str(uuid.uuid4()),
|
30
36
|
},
|
31
37
|
"hash_bucket_count": 200,
|
32
38
|
"last_stream_position_to_compact": 168000000000,
|
33
39
|
"list_deltas_kwargs": {"equivalent_table_types": []},
|
34
40
|
"primary_keys": {"id"},
|
41
|
+
"all_column_names": ["id", "foo", "bar", "baz"],
|
35
42
|
"properties": {
|
36
43
|
"parent_stream_position": "1688000000000",
|
37
44
|
},
|
@@ -52,7 +59,7 @@ class TestCompactPartitionParams(unittest.TestCase):
|
|
52
59
|
"partitionValues": [],
|
53
60
|
"partitionId": "79612ea39ac5493eae925abe60767d42",
|
54
61
|
},
|
55
|
-
"
|
62
|
+
"table_writer_kwargs": {
|
56
63
|
"version": "1.0",
|
57
64
|
"flavor": "foobar",
|
58
65
|
"coerce_timestamps": "ms",
|
@@ -103,10 +110,8 @@ class TestCompactPartitionParams(unittest.TestCase):
|
|
103
110
|
json.loads(serialized_params)["compacted_file_content_type"]
|
104
111
|
== params.compacted_file_content_type
|
105
112
|
)
|
106
|
-
|
107
|
-
|
108
|
-
== params.compaction_artifact_s3_bucket
|
109
|
-
)
|
113
|
+
catalog_json = json.loads(serialized_params)["catalog"]
|
114
|
+
assert catalog_json["_root"] == params.catalog.root
|
110
115
|
assert (
|
111
116
|
json.loads(serialized_params)["hash_bucket_count"]
|
112
117
|
== params.hash_bucket_count
|