deltacat 2.0.0b11__py3-none-any.whl → 2.0.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0.post1.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/top_level.txt +0 -0
@@ -1,45 +1,42 @@
|
|
1
|
-
import
|
2
|
-
import
|
3
|
-
|
1
|
+
import tempfile
|
2
|
+
from typing import Any, Dict, List, Optional, Set, Tuple, Callable
|
3
|
+
import uuid
|
4
4
|
import pytest
|
5
|
-
|
6
|
-
from boto3.resources.base import ServiceResource
|
5
|
+
|
7
6
|
import pyarrow as pa
|
8
|
-
|
7
|
+
import ray
|
8
|
+
import pandas as pd
|
9
|
+
|
10
|
+
from deltacat.io.file_object_store import FileObjectStore
|
9
11
|
from pytest_benchmark.fixture import BenchmarkFixture
|
10
12
|
|
11
13
|
from deltacat.tests.compute.test_util_constant import (
|
12
14
|
BASE_TEST_SOURCE_NAMESPACE,
|
13
15
|
BASE_TEST_SOURCE_TABLE_NAME,
|
14
16
|
BASE_TEST_SOURCE_TABLE_VERSION,
|
15
|
-
TEST_S3_RCF_BUCKET_NAME,
|
16
17
|
DEFAULT_NUM_WORKERS,
|
17
18
|
DEFAULT_WORKER_INSTANCE_CPUS,
|
18
19
|
)
|
19
20
|
from deltacat.compute.compactor.model.compactor_version import CompactorVersion
|
20
21
|
from deltacat.tests.compute.test_util_common import (
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
from deltacat.tests.compute.test_util_create_table_deltas_repo import (
|
28
|
-
create_incremental_deltas_on_source_table,
|
29
|
-
)
|
30
|
-
from deltacat.tests.compute.test_util_create_table_deltas_repo import (
|
31
|
-
create_src_w_deltas_destination_rebase_w_deltas_strategy,
|
22
|
+
create_src_w_deltas_destination_rebase_w_deltas_strategy_main,
|
23
|
+
create_incremental_deltas_on_source_table_main,
|
24
|
+
get_rci_from_partition,
|
25
|
+
read_audit_file,
|
26
|
+
PartitionKey,
|
27
|
+
get_compacted_delta_locator_from_partition,
|
32
28
|
)
|
33
29
|
from deltacat.tests.compute.compact_partition_rebase_then_incremental_test_cases import (
|
34
30
|
REBASE_THEN_INCREMENTAL_TEST_CASES,
|
35
31
|
)
|
36
|
-
|
32
|
+
|
37
33
|
from deltacat.types.media import StorageType
|
38
34
|
from deltacat.storage import (
|
39
35
|
DeltaType,
|
40
36
|
DeltaLocator,
|
41
37
|
Partition,
|
42
38
|
PartitionLocator,
|
39
|
+
metastore,
|
43
40
|
)
|
44
41
|
from deltacat.types.media import ContentType
|
45
42
|
from deltacat.compute.compactor.model.compact_partition_params import (
|
@@ -64,29 +61,24 @@ def setup_ray_cluster():
|
|
64
61
|
ray.shutdown()
|
65
62
|
|
66
63
|
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
os.environ["AWS_SECRET_ACCESS_ID"] = "testing"
|
71
|
-
os.environ["AWS_SECURITY_TOKEN"] = "testing"
|
72
|
-
os.environ["AWS_SESSION_TOKEN"] = "testing"
|
73
|
-
os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
|
74
|
-
yield
|
75
|
-
|
64
|
+
"""
|
65
|
+
FUNCTION scoped fixtures
|
66
|
+
"""
|
76
67
|
|
77
|
-
@pytest.fixture(scope="module")
|
78
|
-
def s3_resource(mock_aws_credential):
|
79
|
-
with mock_s3():
|
80
|
-
yield boto3.resource("s3")
|
81
68
|
|
69
|
+
@pytest.fixture(autouse=True, scope="function")
|
70
|
+
def enable_bucketing_spec_validation(monkeypatch):
|
71
|
+
"""
|
72
|
+
Enable the bucketing spec validation for all tests.
|
73
|
+
This will help catch hash bucket drift in testing.
|
74
|
+
"""
|
75
|
+
import deltacat.compute.compactor_v2.steps.merge
|
82
76
|
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
Bucket=TEST_S3_RCF_BUCKET_NAME,
|
77
|
+
monkeypatch.setattr(
|
78
|
+
deltacat.compute.compactor_v2.steps.merge,
|
79
|
+
"BUCKETING_SPEC_COMPLIANCE_PROFILE",
|
80
|
+
"ASSERT",
|
88
81
|
)
|
89
|
-
yield
|
90
82
|
|
91
83
|
|
92
84
|
@pytest.mark.parametrize(
|
@@ -162,13 +154,12 @@ def setup_compaction_artifacts_s3_bucket(s3_resource: ServiceResource):
|
|
162
154
|
],
|
163
155
|
ids=[test_name for test_name in REBASE_THEN_INCREMENTAL_TEST_CASES],
|
164
156
|
)
|
165
|
-
def
|
166
|
-
|
167
|
-
local_deltacat_storage_kwargs: Dict[str, Any],
|
157
|
+
def test_compact_partition_rebase_then_incremental_main(
|
158
|
+
main_deltacat_storage_kwargs: Dict[str, Any],
|
168
159
|
test_name: str,
|
169
160
|
primary_keys: Set[str],
|
170
161
|
sort_keys: List[Optional[Any]],
|
171
|
-
partition_keys_param: Optional[List[
|
162
|
+
partition_keys_param: Optional[List[PartitionKey]],
|
172
163
|
partition_values_param: List[Optional[str]],
|
173
164
|
input_deltas_param: List[pa.Array],
|
174
165
|
input_deltas_delta_type: str,
|
@@ -188,9 +179,15 @@ def test_compact_partition_rebase_then_incremental(
|
|
188
179
|
compact_partition_func: Callable,
|
189
180
|
benchmark: BenchmarkFixture,
|
190
181
|
):
|
191
|
-
|
182
|
+
ds_mock_kwargs = main_deltacat_storage_kwargs
|
183
|
+
"""
|
184
|
+
This test performs rebase compaction first, then incremental compaction on the same data.
|
185
|
+
This tests the scenario where we first do a rebase (with different source/destination partitions)
|
186
|
+
and then follow up with incremental compaction using the result of the rebase.
|
187
|
+
|
188
|
+
This version uses the main metastore implementation instead of local storage.
|
189
|
+
"""
|
192
190
|
|
193
|
-
ds_mock_kwargs = local_deltacat_storage_kwargs
|
194
191
|
"""
|
195
192
|
REBASE
|
196
193
|
"""
|
@@ -199,7 +196,7 @@ def test_compact_partition_rebase_then_incremental(
|
|
199
196
|
source_table_stream,
|
200
197
|
destination_table_stream,
|
201
198
|
rebased_table_stream,
|
202
|
-
) =
|
199
|
+
) = create_src_w_deltas_destination_rebase_w_deltas_strategy_main(
|
203
200
|
sort_keys,
|
204
201
|
partition_keys,
|
205
202
|
input_deltas_param,
|
@@ -207,19 +204,48 @@ def test_compact_partition_rebase_then_incremental(
|
|
207
204
|
partition_values_param,
|
208
205
|
ds_mock_kwargs,
|
209
206
|
)
|
210
|
-
|
207
|
+
|
208
|
+
# Convert partition values for partition lookup (same as in other helper functions)
|
209
|
+
converted_partition_values_for_lookup = partition_values_param
|
210
|
+
if partition_values_param and partition_keys:
|
211
|
+
converted_partition_values_for_lookup = []
|
212
|
+
for i, (value, pk) in enumerate(zip(partition_values_param, partition_keys)):
|
213
|
+
if pk.key_type.value == "int": # Use .value to get string representation
|
214
|
+
converted_partition_values_for_lookup.append(int(value))
|
215
|
+
elif pk.key_type.value == "timestamp":
|
216
|
+
# Handle timestamp partition values
|
217
|
+
if isinstance(value, str) and "T" in value and value.endswith("Z"):
|
218
|
+
ts = pd.to_datetime(value)
|
219
|
+
# Convert to microseconds since epoch for PyArrow timestamp[us]
|
220
|
+
converted_partition_values_for_lookup.append(
|
221
|
+
int(ts.timestamp() * 1_000_000)
|
222
|
+
)
|
223
|
+
else:
|
224
|
+
converted_partition_values_for_lookup.append(value)
|
225
|
+
else:
|
226
|
+
converted_partition_values_for_lookup.append(value)
|
227
|
+
|
228
|
+
source_partition: Partition = metastore.get_partition(
|
211
229
|
source_table_stream.locator,
|
212
|
-
|
230
|
+
converted_partition_values_for_lookup,
|
213
231
|
**ds_mock_kwargs,
|
214
232
|
)
|
233
|
+
# Generate a destination partition ID based on the source partition
|
234
|
+
destination_partition_id = str(uuid.uuid4())
|
215
235
|
destination_partition_locator: PartitionLocator = PartitionLocator.of(
|
216
236
|
destination_table_stream.locator,
|
217
|
-
|
218
|
-
|
237
|
+
converted_partition_values_for_lookup,
|
238
|
+
destination_partition_id,
|
239
|
+
)
|
240
|
+
all_column_names = metastore.get_table_version_column_names(
|
241
|
+
destination_partition_locator.namespace,
|
242
|
+
destination_partition_locator.table_name,
|
243
|
+
destination_partition_locator.table_version,
|
244
|
+
**ds_mock_kwargs,
|
219
245
|
)
|
220
|
-
rebased_partition: Partition =
|
246
|
+
rebased_partition: Partition = metastore.get_partition(
|
221
247
|
rebased_table_stream.locator,
|
222
|
-
|
248
|
+
converted_partition_values_for_lookup,
|
223
249
|
**ds_mock_kwargs,
|
224
250
|
)
|
225
251
|
num_workers, worker_instance_cpu = DEFAULT_NUM_WORKERS, DEFAULT_WORKER_INSTANCE_CPUS
|
@@ -230,151 +256,195 @@ def test_compact_partition_rebase_then_incremental(
|
|
230
256
|
pgm = PlacementGroupManager(
|
231
257
|
1, total_cpus, worker_instance_cpu, memory_per_bundle=4000000
|
232
258
|
).pgs[0]
|
233
|
-
compact_partition_params = CompactPartitionParams.of(
|
234
|
-
{
|
235
|
-
"compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
|
236
|
-
"compacted_file_content_type": ContentType.PARQUET,
|
237
|
-
"dd_max_parallelism_ratio": 1.0,
|
238
|
-
"deltacat_storage": ds,
|
239
|
-
"deltacat_storage_kwargs": ds_mock_kwargs,
|
240
|
-
"destination_partition_locator": destination_partition_locator,
|
241
|
-
"hash_bucket_count": hash_bucket_count_param,
|
242
|
-
"last_stream_position_to_compact": source_partition.stream_position,
|
243
|
-
"list_deltas_kwargs": {**ds_mock_kwargs, **{"equivalent_table_types": []}},
|
244
|
-
"object_store": RayPlasmaObjectStore(),
|
245
|
-
"pg_config": pgm,
|
246
|
-
"primary_keys": primary_keys,
|
247
|
-
"read_kwargs_provider": read_kwargs_provider_param,
|
248
|
-
"rebase_source_partition_locator": source_partition.locator,
|
249
|
-
"records_per_compacted_file": records_per_compacted_file_param,
|
250
|
-
"s3_client_kwargs": {},
|
251
|
-
"source_partition_locator": rebased_partition.locator,
|
252
|
-
"sort_keys": sort_keys if sort_keys else None,
|
253
|
-
}
|
254
|
-
)
|
255
|
-
# execute
|
256
|
-
rcf_file_s3_uri = benchmark(compact_partition_func, compact_partition_params)
|
257
|
-
compacted_delta_locator: DeltaLocator = get_compacted_delta_locator_from_rcf(
|
258
|
-
s3_resource, rcf_file_s3_uri
|
259
|
-
)
|
260
|
-
tables = ds.download_delta(
|
261
|
-
compacted_delta_locator, storage_type=StorageType.LOCAL, **ds_mock_kwargs
|
262
|
-
)
|
263
|
-
actual_rebase_compacted_table = pa.concat_tables(tables)
|
264
|
-
# if no primary key is specified then sort by sort_key for consistent assertion
|
265
|
-
sorting_cols: List[Any] = (
|
266
|
-
[(val, "ascending") for val in primary_keys]
|
267
|
-
if primary_keys
|
268
|
-
else [pa_key for key in sort_keys for pa_key in key.arrow]
|
269
|
-
if sort_keys
|
270
|
-
else []
|
271
|
-
)
|
272
|
-
rebase_expected_compact_partition_result = (
|
273
|
-
rebase_expected_compact_partition_result.combine_chunks().sort_by(sorting_cols)
|
274
|
-
)
|
275
|
-
actual_rebase_compacted_table = (
|
276
|
-
actual_rebase_compacted_table.combine_chunks().sort_by(sorting_cols)
|
277
|
-
)
|
278
|
-
assert actual_rebase_compacted_table.equals(
|
279
|
-
rebase_expected_compact_partition_result
|
280
|
-
), f"{actual_rebase_compacted_table} does not match {rebase_expected_compact_partition_result}"
|
281
|
-
"""
|
282
|
-
INCREMENTAL
|
283
|
-
"""
|
284
|
-
(
|
285
|
-
source_partition_locator_w_deltas,
|
286
|
-
new_delta,
|
287
|
-
incremental_delta_length,
|
288
|
-
has_delete_deltas,
|
289
|
-
) = create_incremental_deltas_on_source_table(
|
290
|
-
BASE_TEST_SOURCE_NAMESPACE,
|
291
|
-
BASE_TEST_SOURCE_TABLE_NAME,
|
292
|
-
BASE_TEST_SOURCE_TABLE_VERSION,
|
293
|
-
source_table_stream,
|
294
|
-
partition_values_param,
|
295
|
-
incremental_deltas,
|
296
|
-
ds_mock_kwargs,
|
297
|
-
)
|
298
|
-
compact_partition_params = CompactPartitionParams.of(
|
299
|
-
{
|
300
|
-
"compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
|
301
|
-
"compacted_file_content_type": ContentType.PARQUET,
|
302
|
-
"dd_max_parallelism_ratio": 1.0,
|
303
|
-
"deltacat_storage": ds,
|
304
|
-
"deltacat_storage_kwargs": ds_mock_kwargs,
|
305
|
-
"destination_partition_locator": compacted_delta_locator.partition_locator,
|
306
|
-
"drop_duplicates": drop_duplicates_param,
|
307
|
-
"hash_bucket_count": hash_bucket_count_param,
|
308
|
-
"last_stream_position_to_compact": new_delta.stream_position,
|
309
|
-
"list_deltas_kwargs": {**ds_mock_kwargs, **{"equivalent_table_types": []}},
|
310
|
-
"object_store": RayPlasmaObjectStore(),
|
311
|
-
"pg_config": pgm,
|
312
|
-
"primary_keys": primary_keys,
|
313
|
-
"read_kwargs_provider": read_kwargs_provider_param,
|
314
|
-
"rebase_source_partition_locator": None,
|
315
|
-
"rebase_source_partition_high_watermark": None,
|
316
|
-
"records_per_compacted_file": records_per_compacted_file_param,
|
317
|
-
"s3_client_kwargs": {},
|
318
|
-
"source_partition_locator": source_partition_locator_w_deltas,
|
319
|
-
"sort_keys": sort_keys if sort_keys else None,
|
320
|
-
}
|
321
|
-
)
|
322
|
-
if expected_terminal_exception:
|
323
|
-
with pytest.raises(expected_terminal_exception) as exc_info:
|
324
|
-
compact_partition_func(compact_partition_params)
|
325
|
-
assert expected_terminal_exception_message in str(exc_info.value)
|
326
|
-
return
|
327
|
-
rcf_file_s3_uri = compact_partition_func(compact_partition_params)
|
328
|
-
round_completion_info = get_rcf(s3_resource, rcf_file_s3_uri)
|
329
|
-
compacted_delta_locator_incremental: DeltaLocator = (
|
330
|
-
round_completion_info.compacted_delta_locator
|
331
|
-
)
|
332
|
-
# assert if RCF covers all files
|
333
|
-
if compactor_version != CompactorVersion.V1.value:
|
334
|
-
previous_end = None
|
335
|
-
for start, end in round_completion_info.hb_index_to_entry_range.values():
|
336
|
-
assert (previous_end is None and start == 0) or start == previous_end
|
337
|
-
previous_end = end
|
338
|
-
assert (
|
339
|
-
previous_end == round_completion_info.compacted_pyarrow_write_result.files
|
340
|
-
)
|
341
259
|
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
compaction_audit_obj: dict = read_s3_contents(s3_resource, audit_bucket, audit_key)
|
346
|
-
compaction_audit: CompactionSessionAuditInfo = CompactionSessionAuditInfo(
|
347
|
-
**compaction_audit_obj
|
348
|
-
)
|
260
|
+
with tempfile.TemporaryDirectory() as test_dir:
|
261
|
+
# Extract catalog from storage kwargs
|
262
|
+
catalog = ds_mock_kwargs.get("inner")
|
349
263
|
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
264
|
+
compact_partition_params = CompactPartitionParams.of(
|
265
|
+
{
|
266
|
+
"catalog": catalog,
|
267
|
+
"compacted_file_content_type": ContentType.PARQUET,
|
268
|
+
"dd_max_parallelism_ratio": 1.0,
|
269
|
+
"deltacat_storage": metastore,
|
270
|
+
"deltacat_storage_kwargs": ds_mock_kwargs,
|
271
|
+
"destination_partition_locator": destination_partition_locator,
|
272
|
+
"hash_bucket_count": hash_bucket_count_param,
|
273
|
+
"last_stream_position_to_compact": source_partition.stream_position,
|
274
|
+
"list_deltas_kwargs": {
|
275
|
+
**ds_mock_kwargs,
|
276
|
+
**{"equivalent_table_types": []},
|
277
|
+
},
|
278
|
+
"object_store": FileObjectStore(test_dir),
|
279
|
+
"original_fields": {
|
280
|
+
"pk_col_1",
|
281
|
+
"pk_col_2",
|
282
|
+
"sk_col_1",
|
283
|
+
"sk_col_2",
|
284
|
+
"col_1",
|
285
|
+
"col_2",
|
286
|
+
"region_id",
|
287
|
+
},
|
288
|
+
"pg_config": pgm,
|
289
|
+
"primary_keys": primary_keys,
|
290
|
+
"all_column_names": all_column_names,
|
291
|
+
"read_kwargs_provider": read_kwargs_provider_param,
|
292
|
+
"rebase_source_partition_locator": source_partition.locator,
|
293
|
+
"records_per_compacted_file": records_per_compacted_file_param,
|
294
|
+
"source_partition_locator": rebased_partition.locator,
|
295
|
+
"sort_keys": sort_keys if sort_keys else None,
|
296
|
+
}
|
359
297
|
)
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
)
|
369
|
-
|
370
|
-
|
298
|
+
# execute
|
299
|
+
benchmark(compact_partition_func, compact_partition_params)
|
300
|
+
compacted_delta_locator: DeltaLocator = (
|
301
|
+
get_compacted_delta_locator_from_partition(
|
302
|
+
destination_partition_locator,
|
303
|
+
metastore,
|
304
|
+
catalog=catalog,
|
305
|
+
)
|
306
|
+
)
|
307
|
+
tables = metastore.download_delta(
|
308
|
+
compacted_delta_locator,
|
309
|
+
storage_type=StorageType.LOCAL,
|
310
|
+
**ds_mock_kwargs,
|
311
|
+
)
|
312
|
+
actual_rebase_compacted_table = pa.concat_tables(tables)
|
313
|
+
all_column_names = metastore.get_table_version_column_names(
|
314
|
+
destination_partition_locator.namespace,
|
315
|
+
destination_partition_locator.table_name,
|
316
|
+
destination_partition_locator.table_version,
|
317
|
+
**ds_mock_kwargs,
|
318
|
+
)
|
319
|
+
# if no primary key is specified then sort by sort_key for consistent assertion
|
320
|
+
sorting_cols: List[Any] = (
|
321
|
+
[(val, "ascending") for val in primary_keys]
|
322
|
+
if primary_keys
|
323
|
+
else [pa_key for key in sort_keys for pa_key in key.arrow]
|
324
|
+
if sort_keys
|
325
|
+
else []
|
326
|
+
)
|
327
|
+
rebase_expected_compact_partition_result = (
|
328
|
+
rebase_expected_compact_partition_result.combine_chunks().sort_by(
|
329
|
+
sorting_cols
|
330
|
+
)
|
331
|
+
)
|
332
|
+
actual_rebase_compacted_table = (
|
333
|
+
actual_rebase_compacted_table.combine_chunks().sort_by(sorting_cols)
|
334
|
+
)
|
335
|
+
assert actual_rebase_compacted_table.equals(
|
336
|
+
rebase_expected_compact_partition_result
|
337
|
+
), f"{actual_rebase_compacted_table} does not match {rebase_expected_compact_partition_result}"
|
338
|
+
|
339
|
+
"""
|
340
|
+
INCREMENTAL
|
341
|
+
"""
|
342
|
+
(
|
343
|
+
source_partition_locator_w_deltas,
|
344
|
+
new_delta,
|
345
|
+
incremental_delta_length,
|
346
|
+
has_delete_deltas,
|
347
|
+
) = create_incremental_deltas_on_source_table_main(
|
348
|
+
BASE_TEST_SOURCE_NAMESPACE,
|
349
|
+
BASE_TEST_SOURCE_TABLE_NAME,
|
350
|
+
BASE_TEST_SOURCE_TABLE_VERSION,
|
351
|
+
source_table_stream,
|
352
|
+
partition_values_param,
|
353
|
+
incremental_deltas,
|
354
|
+
ds_mock_kwargs,
|
371
355
|
)
|
372
356
|
|
373
|
-
|
374
|
-
|
375
|
-
|
357
|
+
# Handle empty incremental deltas case
|
358
|
+
if new_delta is None:
|
359
|
+
# For empty incremental deltas, the expected result should be the same as rebase result
|
360
|
+
# Skip incremental compaction and just verify the rebase result
|
361
|
+
actual_compact_partition_result = actual_rebase_compacted_table
|
362
|
+
compaction_audit = None
|
363
|
+
else:
|
364
|
+
# Perform incremental compaction when there are actual deltas
|
365
|
+
last_stream_position = new_delta.stream_position
|
376
366
|
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
367
|
+
compact_partition_params = CompactPartitionParams.of(
|
368
|
+
{
|
369
|
+
"catalog": catalog,
|
370
|
+
"compacted_file_content_type": ContentType.PARQUET,
|
371
|
+
"dd_max_parallelism_ratio": 1.0,
|
372
|
+
"deltacat_storage": metastore,
|
373
|
+
"deltacat_storage_kwargs": ds_mock_kwargs,
|
374
|
+
"destination_partition_locator": compacted_delta_locator.partition_locator,
|
375
|
+
"drop_duplicates": drop_duplicates_param,
|
376
|
+
"hash_bucket_count": hash_bucket_count_param,
|
377
|
+
"last_stream_position_to_compact": last_stream_position,
|
378
|
+
"list_deltas_kwargs": {
|
379
|
+
**ds_mock_kwargs,
|
380
|
+
**{"equivalent_table_types": []},
|
381
|
+
},
|
382
|
+
"object_store": FileObjectStore(test_dir),
|
383
|
+
"original_fields": {
|
384
|
+
"pk_col_1",
|
385
|
+
"pk_col_2",
|
386
|
+
"sk_col_1",
|
387
|
+
"sk_col_2",
|
388
|
+
"col_1",
|
389
|
+
"col_2",
|
390
|
+
"region_id",
|
391
|
+
},
|
392
|
+
"pg_config": pgm,
|
393
|
+
"primary_keys": primary_keys,
|
394
|
+
"all_column_names": all_column_names,
|
395
|
+
"read_kwargs_provider": read_kwargs_provider_param,
|
396
|
+
"rebase_source_partition_locator": None,
|
397
|
+
"rebase_source_partition_high_watermark": None,
|
398
|
+
"records_per_compacted_file": records_per_compacted_file_param,
|
399
|
+
"source_partition_locator": source_partition_locator_w_deltas,
|
400
|
+
"sort_keys": sort_keys if sort_keys else None,
|
401
|
+
}
|
402
|
+
)
|
403
|
+
if expected_terminal_exception:
|
404
|
+
with pytest.raises(expected_terminal_exception) as exc_info:
|
405
|
+
compact_partition_func(compact_partition_params)
|
406
|
+
assert expected_terminal_exception_message in str(exc_info.value)
|
407
|
+
return
|
408
|
+
compact_partition_func(compact_partition_params)
|
409
|
+
# assert
|
410
|
+
compacted_delta_locator: DeltaLocator = (
|
411
|
+
get_compacted_delta_locator_from_partition(
|
412
|
+
destination_partition_locator, metastore, catalog=catalog
|
413
|
+
)
|
414
|
+
)
|
415
|
+
tables = metastore.download_delta(
|
416
|
+
compacted_delta_locator,
|
417
|
+
storage_type=StorageType.LOCAL,
|
418
|
+
**ds_mock_kwargs,
|
419
|
+
)
|
420
|
+
actual_compact_partition_result = pa.concat_tables(tables)
|
421
|
+
|
422
|
+
# Get compaction audit for verification if needed
|
423
|
+
round_completion_info = get_rci_from_partition(
|
424
|
+
destination_partition_locator, metastore, catalog=catalog
|
425
|
+
)
|
426
|
+
# Get catalog root for audit file resolution
|
427
|
+
catalog_root = catalog.root
|
428
|
+
|
429
|
+
compaction_audit_obj: dict = read_audit_file(
|
430
|
+
round_completion_info.compaction_audit_url, catalog_root
|
431
|
+
)
|
432
|
+
compaction_audit = CompactionSessionAuditInfo(**compaction_audit_obj)
|
433
|
+
|
434
|
+
# Verify the final result
|
435
|
+
actual_compact_partition_result = (
|
436
|
+
actual_compact_partition_result.combine_chunks().sort_by(sorting_cols)
|
437
|
+
)
|
438
|
+
expected_terminal_compact_partition_result = (
|
439
|
+
expected_terminal_compact_partition_result.combine_chunks().sort_by(
|
440
|
+
sorting_cols
|
441
|
+
)
|
442
|
+
)
|
443
|
+
assert actual_compact_partition_result.equals(
|
444
|
+
expected_terminal_compact_partition_result
|
445
|
+
), f"{actual_compact_partition_result} does not match {expected_terminal_compact_partition_result}"
|
446
|
+
|
447
|
+
if assert_compaction_audit is not None and compaction_audit is not None:
|
448
|
+
if not assert_compaction_audit(compactor_version, compaction_audit):
|
449
|
+
pytest.fail("Compaction audit assertion failed")
|
450
|
+
return
|