deltacat 1.1.36__py3-none-any.whl → 2.0.0b2__py3-none-any.whl
This diff compares the contents of two publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their respective public registries.
- deltacat/__init__.py +42 -3
- deltacat/annotations.py +36 -0
- deltacat/api.py +168 -0
- deltacat/aws/s3u.py +4 -4
- deltacat/benchmarking/benchmark_engine.py +82 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +21 -0
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
- deltacat/catalog/__init__.py +14 -0
- deltacat/catalog/delegate.py +199 -106
- deltacat/catalog/iceberg/__init__.py +4 -0
- deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/catalog/iceberg/impl.py +368 -0
- deltacat/catalog/iceberg/overrides.py +74 -0
- deltacat/catalog/interface.py +273 -76
- deltacat/catalog/main/impl.py +720 -0
- deltacat/catalog/model/catalog.py +227 -20
- deltacat/catalog/model/properties.py +116 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +5 -5
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +1 -1
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +1 -1
- deltacat/compute/compactor/steps/materialize.py +6 -2
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor_v2/compaction_session.py +5 -9
- deltacat/compute/compactor_v2/constants.py +1 -30
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/merge_input.py +1 -7
- deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
- deltacat/compute/compactor_v2/steps/merge.py +17 -126
- deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/io.py +1 -1
- deltacat/compute/compactor_v2/utils/merge.py +0 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
- deltacat/compute/compactor_v2/utils/task_options.py +23 -43
- deltacat/compute/converter/constants.py +4 -0
- deltacat/compute/converter/converter_session.py +143 -0
- deltacat/compute/converter/model/convert_input.py +69 -0
- deltacat/compute/converter/model/convert_input_files.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +99 -0
- deltacat/compute/converter/pyiceberg/__init__.py +0 -0
- deltacat/compute/converter/pyiceberg/catalog.py +75 -0
- deltacat/compute/converter/pyiceberg/overrides.py +135 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
- deltacat/compute/converter/steps/__init__.py +0 -0
- deltacat/compute/converter/steps/convert.py +211 -0
- deltacat/compute/converter/steps/dedupe.py +60 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +88 -0
- deltacat/compute/converter/utils/converter_session_utils.py +109 -0
- deltacat/compute/converter/utils/iceberg_columns.py +82 -0
- deltacat/compute/converter/utils/io.py +43 -0
- deltacat/compute/converter/utils/s3u.py +133 -0
- deltacat/compute/resource_estimation/delta.py +1 -19
- deltacat/constants.py +47 -1
- deltacat/env.py +51 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/common/__init__.py +0 -0
- deltacat/examples/common/fixtures.py +15 -0
- deltacat/examples/hello_world.py +27 -0
- deltacat/examples/iceberg/__init__.py +0 -0
- deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
- deltacat/examples/iceberg/iceberg_reader.py +149 -0
- deltacat/exceptions.py +51 -9
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +118 -28
- deltacat/storage/iceberg/__init__.py +0 -0
- deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
- deltacat/storage/iceberg/impl.py +737 -0
- deltacat/storage/iceberg/model.py +709 -0
- deltacat/storage/interface.py +217 -134
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +2077 -0
- deltacat/storage/model/delta.py +118 -71
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -3
- deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
- deltacat/storage/model/metafile.py +1316 -0
- deltacat/storage/model/namespace.py +34 -18
- deltacat/storage/model/partition.py +362 -37
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +19 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +892 -0
- deltacat/storage/model/shard.py +47 -0
- deltacat/storage/model/sort_key.py +170 -13
- deltacat/storage/model/stream.py +208 -80
- deltacat/storage/model/table.py +123 -29
- deltacat/storage/model/table_version.py +322 -46
- deltacat/storage/model/transaction.py +757 -0
- deltacat/storage/model/transform.py +198 -61
- deltacat/storage/model/types.py +111 -13
- deltacat/storage/rivulet/__init__.py +11 -0
- deltacat/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/storage/rivulet/arrow/serializer.py +75 -0
- deltacat/storage/rivulet/dataset.py +744 -0
- deltacat/storage/rivulet/dataset_executor.py +87 -0
- deltacat/storage/rivulet/feather/__init__.py +5 -0
- deltacat/storage/rivulet/feather/file_reader.py +136 -0
- deltacat/storage/rivulet/feather/serializer.py +35 -0
- deltacat/storage/rivulet/fs/__init__.py +0 -0
- deltacat/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/storage/rivulet/fs/file_store.py +130 -0
- deltacat/storage/rivulet/fs/input_file.py +76 -0
- deltacat/storage/rivulet/fs/output_file.py +86 -0
- deltacat/storage/rivulet/logical_plan.py +105 -0
- deltacat/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/storage/rivulet/metastore/delta.py +190 -0
- deltacat/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/storage/rivulet/metastore/sst.py +82 -0
- deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/storage/rivulet/mvp/Table.py +101 -0
- deltacat/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/storage/rivulet/parquet/file_reader.py +127 -0
- deltacat/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/storage/rivulet/reader/__init__.py +0 -0
- deltacat/storage/rivulet/reader/block_scanner.py +378 -0
- deltacat/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/storage/rivulet/reader/data_scan.py +63 -0
- deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
- deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
- deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
- deltacat/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/storage/rivulet/schema/__init__.py +0 -0
- deltacat/storage/rivulet/schema/datatype.py +128 -0
- deltacat/storage/rivulet/schema/schema.py +251 -0
- deltacat/storage/rivulet/serializer.py +40 -0
- deltacat/storage/rivulet/serializer_factory.py +42 -0
- deltacat/storage/rivulet/writer/__init__.py +0 -0
- deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/catalog/test_catalogs.py +324 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +19 -53
- deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
- deltacat/tests/compute/compactor/utils/test_io.py +6 -8
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
- deltacat/tests/compute/conftest.py +75 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +478 -0
- deltacat/tests/compute/converter/utils.py +123 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
- deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
- deltacat/tests/compute/test_compact_partition_params.py +3 -3
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
- deltacat/tests/compute/test_util_common.py +19 -12
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
- deltacat/tests/local_deltacat_storage/__init__.py +76 -103
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/conftest.py +25 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +1399 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_metafile_io.py +2535 -0
- deltacat/tests/storage/model/test_schema.py +308 -0
- deltacat/tests/storage/model/test_shard.py +22 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +308 -0
- deltacat/tests/storage/rivulet/__init__.py +0 -0
- deltacat/tests/storage/rivulet/conftest.py +149 -0
- deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
- deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/storage/rivulet/test_dataset.py +406 -0
- deltacat/tests/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/storage/rivulet/test_utils.py +122 -0
- deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/test_deltacat_api.py +39 -0
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +8 -15
- deltacat/tests/test_utils/storage.py +266 -3
- deltacat/tests/utils/test_daft.py +3 -3
- deltacat/tests/utils/test_pyarrow.py +0 -432
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +1 -1
- deltacat/utils/export.py +59 -0
- deltacat/utils/filesystem.py +320 -0
- deltacat/utils/metafile_locator.py +73 -0
- deltacat/utils/pyarrow.py +36 -183
- deltacat-2.0.0b2.dist-info/METADATA +65 -0
- deltacat-2.0.0b2.dist-info/RECORD +349 -0
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
- deltacat-1.1.36.dist-info/METADATA +0 -64
- deltacat-1.1.36.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/LICENSE +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/WHEEL +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/top_level.txt +0 -0
deltacat/tests/compute/compactor/utils/test_io.py (+6 -8):

```diff
@@ -1,12 +1,11 @@
 import unittest
 from unittest import mock
-from deltacat.tests.test_utils.constants import TEST_UPSERT_DELTA
-from typing import Any, Dict
 
-
-
-
+from deltacat.tests.compute.conftest import (
+    create_local_deltacat_storage_file,
+    clean_up_local_deltacat_storage_file,
 )
+from deltacat.tests.test_utils.constants import TEST_UPSERT_DELTA
 
 
 class TestFitInputDeltas(unittest.TestCase):
@@ -19,9 +18,7 @@ class TestFitInputDeltas(unittest.TestCase):
             CompactionSessionAuditInfo,
         )
 
-        cls.kwargs_for_local_deltacat_storage
-            DATABASE_FILE_PATH_KEY: DATABASE_FILE_PATH_VALUE,
-        }
+        cls.kwargs_for_local_deltacat_storage = create_local_deltacat_storage_file()
 
         cls.COMPACTION_AUDIT = CompactionSessionAuditInfo("1.0", "2.3", "test")
 
@@ -30,6 +27,7 @@ class TestFitInputDeltas(unittest.TestCase):
     @classmethod
     def tearDownClass(cls) -> None:
         cls.module_patcher.stop()
+        clean_up_local_deltacat_storage_file(cls.kwargs_for_local_deltacat_storage)
 
     def test_sanity(self):
        from deltacat.compute.compactor.utils import io
```
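The helpers imported above come from the new `deltacat/tests/compute/conftest.py` module (+75 lines, listed in the summary but not expanded in this diff). The sketch below is only an illustration of what those helpers might look like, inferred from their call sites in this hunk and from the hand-built `db_file_path` kwargs that the old code used; the actual implementation may differ.

```python
# Hypothetical sketch only -- the real helpers live in the new
# deltacat/tests/compute/conftest.py, which is not shown in this diff.
# Signatures are inferred from the call sites in the hunks above.
import os
import tempfile
from typing import Any, Dict


def create_local_deltacat_storage_file() -> Dict[str, Any]:
    # Create a throwaway sqlite file and return the kwargs dict that
    # deltacat.tests.local_deltacat_storage expects (a "db_file_path" key,
    # matching the constant the old test modules defined by hand).
    db_file = tempfile.NamedTemporaryFile(suffix=".sqlite", delete=False)
    db_file.close()
    return {"db_file_path": db_file.name}


def clean_up_local_deltacat_storage_file(kwargs: Dict[str, Any]) -> None:
    # Remove the sqlite file created above, if it still exists.
    db_path = kwargs.get("db_file_path")
    if db_path and os.path.exists(db_path):
        os.remove(db_path)
```

Centralizing the sqlite setup and teardown in a shared conftest removes the hard-coded `deltacat/tests/local_deltacat_storage/db_test.sqlite` path that individual test modules previously duplicated.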
deltacat/tests/compute/compactor_v2/test_compaction_session.py (+0 -466):

```diff
@@ -1,14 +1,10 @@
-from typing import Dict, Any
 import ray
 import os
-import pyarrow as pa
 import pytest
 import boto3
-import json
 from deltacat.compute.compactor.model.compaction_session_audit_info import (
     CompactionSessionAuditInfo,
 )
-from deltacat.exceptions import ValidationError
 from boto3.resources.base import ServiceResource
 import deltacat.tests.local_deltacat_storage as ds
 from deltacat.types.media import ContentType
@@ -31,11 +27,6 @@ from deltacat.tests.test_utils.pyarrow import (
 )
 from moto import mock_s3
 
-DATABASE_FILE_PATH_KEY, DATABASE_FILE_PATH_VALUE = (
-    "db_file_path",
-    "deltacat/tests/local_deltacat_storage/db_test.sqlite",
-)
-
 
 @pytest.fixture(autouse=True, scope="module")
 def setup_ray_cluster():
@@ -69,38 +60,6 @@ def setup_compaction_artifacts_s3_bucket(s3_resource: ServiceResource):
     yield
 
 
-@pytest.fixture(scope="function")
-def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
-    kwargs_for_local_deltacat_storage: Dict[str, Any] = {
-        DATABASE_FILE_PATH_KEY: DATABASE_FILE_PATH_VALUE,
-    }
-    yield kwargs_for_local_deltacat_storage
-    if os.path.exists(DATABASE_FILE_PATH_VALUE):
-        os.remove(DATABASE_FILE_PATH_VALUE)
-
-
-@pytest.fixture(scope="function")
-def disable_sha1(monkeypatch):
-    import deltacat.compute.compactor_v2.utils.primary_key_index
-
-    monkeypatch.setattr(
-        deltacat.compute.compactor_v2.utils.primary_key_index,
-        "SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED",
-        True,
-    )
-
-
-@pytest.fixture(scope="function")
-def enable_bucketing_spec_validation(monkeypatch):
-    import deltacat.compute.compactor_v2.steps.merge
-
-    monkeypatch.setattr(
-        deltacat.compute.compactor_v2.steps.merge,
-        "BUCKETING_SPEC_COMPLIANCE_PROFILE",
-        "ASSERT",
-    )
-
-
 class TestCompactionSession:
     """
     This class adds specific tests that aren't part of the parametrized test suite.
@@ -581,428 +540,3 @@ class TestCompactionSession:
                 }
             )
         )
-
-    def test_compact_partition_when_incremental_pk_hash_is_over_2gb(
-        self, s3_resource, local_deltacat_storage_kwargs, disable_sha1
-    ):
-        """
-        A test case which ensures the compaction succeeds even if the incremental
-        arrow table size is over 2GB. It is added to prevent ArrowCapacityError
-        when running is_in operation during merge.
-
-        Note that we set SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED to bypass sha1 hashing
-        which truncates the lengths of pk strings when deduping.
-        """
-        # setup
-        staged_source = stage_partition_from_file_paths(
-            self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
-        )
-        # we create chunked array to avoid ArrowCapacityError
-        chunked_pk_array = pa.chunked_array([["13bytesstring"], ["12bytestring"]])
-        table = pa.table([chunked_pk_array], names=["pk"])
-        source_delta = commit_delta_to_staged_partition(
-            staged_source, pa_table=table, **local_deltacat_storage_kwargs
-        )
-
-        staged_dest = stage_partition_from_file_paths(
-            self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
-        )
-        dest_partition = ds.commit_partition(
-            staged_dest, **local_deltacat_storage_kwargs
-        )
-
-        # rebase first
-        rebase_url = compact_partition(
-            CompactPartitionParams.of(
-                {
-                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
-                    "compacted_file_content_type": ContentType.PARQUET,
-                    "dd_max_parallelism_ratio": 1.0,
-                    "deltacat_storage": ds,
-                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
-                    "destination_partition_locator": dest_partition.locator,
-                    "drop_duplicates": True,
-                    "hash_bucket_count": 1,
-                    "last_stream_position_to_compact": source_delta.stream_position,
-                    "list_deltas_kwargs": {
-                        **local_deltacat_storage_kwargs,
-                        **{"equivalent_table_types": []},
-                    },
-                    "primary_keys": ["pk"],
-                    "rebase_source_partition_locator": source_delta.partition_locator,
-                    "rebase_source_partition_high_watermark": source_delta.stream_position,
-                    "records_per_compacted_file": 4000,
-                    "s3_client_kwargs": {},
-                    "source_partition_locator": source_delta.partition_locator,
-                    "resource_estimation_method": ResourceEstimationMethod.PREVIOUS_INFLATION,
-                }
-            )
-        )
-
-        rebased_rcf = get_rcf(s3_resource, rebase_url)
-
-        assert rebased_rcf.compacted_pyarrow_write_result.files == 1
-        assert rebased_rcf.compacted_pyarrow_write_result.records == 2
-
-        # Run incremental with a small delta on source
-        chunked_pk_array = pa.chunked_array(
-            [["13bytesstring" * 95_000_000], ["12bytestring" * 95_000_000]]
-        )  # 2.3GB
-        table = pa.table([chunked_pk_array], names=["pk"])
-
-        incremental_source_delta = commit_delta_to_partition(
-            source_delta.partition_locator,
-            pa_table=table,
-            **local_deltacat_storage_kwargs,
-        )
-        assert (
-            incremental_source_delta.partition_locator == source_delta.partition_locator
-        ), "source partition locator should not change"
-        dest_partition = ds.get_partition(
-            dest_partition.stream_locator,
-            dest_partition.partition_values,
-            **local_deltacat_storage_kwargs,
-        )
-
-        assert (
-            dest_partition.locator
-            == rebased_rcf.compacted_delta_locator.partition_locator
-        ), "The new destination partition should be same as compacted partition"
-
-        # Run incremental
-        incremental_url = compact_partition(
-            CompactPartitionParams.of(
-                {
-                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
-                    "compacted_file_content_type": ContentType.PARQUET,
-                    "dd_max_parallelism_ratio": 1.0,
-                    "deltacat_storage": ds,
-                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
-                    "destination_partition_locator": dest_partition.locator,
-                    "drop_duplicates": True,
-                    "hash_bucket_count": 1,
-                    "last_stream_position_to_compact": incremental_source_delta.stream_position,
-                    "list_deltas_kwargs": {
-                        **local_deltacat_storage_kwargs,
-                        **{"equivalent_table_types": []},
-                    },
-                    "primary_keys": ["pk"],
-                    "records_per_compacted_file": 4000,
-                    "s3_client_kwargs": {},
-                    "source_partition_locator": incremental_source_delta.partition_locator,
-                    "resource_estimation_method": ResourceEstimationMethod.PREVIOUS_INFLATION,
-                }
-            )
-        )
-
-        incremental_rcf = get_rcf(s3_resource, incremental_url)
-
-        assert incremental_rcf.compacted_pyarrow_write_result.files == 1
-        assert (
-            incremental_rcf.compacted_pyarrow_write_result.pyarrow_bytes >= 2300000000
-        )
-        assert incremental_rcf.compacted_pyarrow_write_result.records == 4
-
-    def test_compact_partition_when_bucket_spec_validation_fails(
-        self,
-        s3_resource,
-        local_deltacat_storage_kwargs,
-        enable_bucketing_spec_validation,
-    ):
-        """
-        A test case which asserts the bucketing spec validation throws an assertion error
-        when the validation has failed.
-        """
-
-        # setup
-        staged_source = stage_partition_from_file_paths(
-            self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
-        )
-
-        source_delta = commit_delta_to_staged_partition(
-            staged_source, [self.BACKFILL_FILE_PATH], **local_deltacat_storage_kwargs
-        )
-
-        staged_dest = stage_partition_from_file_paths(
-            self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
-        )
-        dest_partition = ds.commit_partition(
-            staged_dest, **local_deltacat_storage_kwargs
-        )
-
-        # action
-        rcf_url = compact_partition(
-            CompactPartitionParams.of(
-                {
-                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
-                    "compacted_file_content_type": ContentType.PARQUET,
-                    "dd_max_parallelism_ratio": 1.0,
-                    "deltacat_storage": ds,
-                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
-                    "destination_partition_locator": dest_partition.locator,
-                    "drop_duplicates": True,
-                    "hash_bucket_count": 4,
-                    "last_stream_position_to_compact": source_delta.stream_position,
-                    "list_deltas_kwargs": {
-                        **local_deltacat_storage_kwargs,
-                        **{"equivalent_table_types": []},
-                    },
-                    "primary_keys": ["pk"],
-                    "rebase_source_partition_locator": source_delta.partition_locator,
-                    "rebase_source_partition_high_watermark": source_delta.stream_position,
-                    "records_per_compacted_file": 1,
-                    "s3_client_kwargs": {},
-                    "source_partition_locator": source_delta.partition_locator,
-                }
-            )
-        )
-
-        backfill_rcf = get_rcf(s3_resource, rcf_url)
-        bucket, backfill_key1, backfill_key2 = rcf_url.strip("s3://").split("/")
-        # Move the records to different hash buckets to simulate a validation failure.
-        backfill_rcf["hbIndexToEntryRange"] = {"1": [0, 3]}
-        s3_resource.Bucket(bucket).put_object(
-            Key=f"{backfill_key1}/{backfill_key2}", Body=json.dumps(backfill_rcf)
-        )
-
-        # Now run an incremental compaction and verify if the previous RCF was read properly.
-        new_source_delta = commit_delta_to_partition(
-            source_delta.partition_locator,
-            [self.INCREMENTAL_FILE_PATH],
-            **local_deltacat_storage_kwargs,
-        )
-
-        new_destination_partition = ds.get_partition(
-            dest_partition.stream_locator, [], **local_deltacat_storage_kwargs
-        )
-
-        with pytest.raises(ValidationError) as excinfo:
-            compact_partition(
-                CompactPartitionParams.of(
-                    {
-                        "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
-                        "compacted_file_content_type": ContentType.PARQUET,
-                        "dd_max_parallelism_ratio": 1.0,
-                        "deltacat_storage": ds,
-                        "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
-                        "destination_partition_locator": new_destination_partition.locator,
-                        "drop_duplicates": True,
-                        "hash_bucket_count": 4,
-                        "last_stream_position_to_compact": new_source_delta.stream_position,
-                        "list_deltas_kwargs": {
-                            **local_deltacat_storage_kwargs,
-                            **{"equivalent_table_types": []},
-                        },
-                        "primary_keys": ["pk"],
-                        "rebase_source_partition_locator": None,
-                        "rebase_source_partition_high_watermark": None,
-                        "records_per_compacted_file": 4000,
-                        "s3_client_kwargs": {},
-                        "source_partition_locator": new_source_delta.partition_locator,
-                    }
-                )
-            )
-
-        assert (
-            "Hash bucket drift detected at index: 0. Expected hash bucket index to be 1 but found 0"
-            in str(excinfo.value)
-        )
-
-    def test_compact_partition_when_bucket_spec_validation_fails_but_env_variable_disabled(
-        self,
-        s3_resource,
-        local_deltacat_storage_kwargs,
-    ):
-        """
-        A test case which asserts even if bucketing spec validation fails, compaction doesn't
-        throw an error if the feature is not enabled.
-        """
-
-        # setup
-        staged_source = stage_partition_from_file_paths(
-            self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
-        )
-
-        source_delta = commit_delta_to_staged_partition(
-            staged_source, [self.BACKFILL_FILE_PATH], **local_deltacat_storage_kwargs
-        )
-
-        staged_dest = stage_partition_from_file_paths(
-            self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
-        )
-        dest_partition = ds.commit_partition(
-            staged_dest, **local_deltacat_storage_kwargs
-        )
-
-        # action
-        rcf_url = compact_partition(
-            CompactPartitionParams.of(
-                {
-                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
-                    "compacted_file_content_type": ContentType.PARQUET,
-                    "dd_max_parallelism_ratio": 1.0,
-                    "deltacat_storage": ds,
-                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
-                    "destination_partition_locator": dest_partition.locator,
-                    "drop_duplicates": True,
-                    "hash_bucket_count": 4,
-                    "last_stream_position_to_compact": source_delta.stream_position,
-                    "list_deltas_kwargs": {
-                        **local_deltacat_storage_kwargs,
-                        **{"equivalent_table_types": []},
-                    },
-                    "primary_keys": ["pk"],
-                    "rebase_source_partition_locator": source_delta.partition_locator,
-                    "rebase_source_partition_high_watermark": source_delta.stream_position,
-                    "records_per_compacted_file": 1,
-                    "s3_client_kwargs": {},
-                    "source_partition_locator": source_delta.partition_locator,
-                }
-            )
-        )
-
-        backfill_rcf = get_rcf(s3_resource, rcf_url)
-        bucket, backfill_key1, backfill_key2 = rcf_url.strip("s3://").split("/")
-        # Move the records to different hash buckets to simulate a validation failure.
-        backfill_rcf["hbIndexToEntryRange"] = {"1": [0, 3]}
-        s3_resource.Bucket(bucket).put_object(
-            Key=f"{backfill_key1}/{backfill_key2}", Body=json.dumps(backfill_rcf)
-        )
-
-        # Now run an incremental compaction and verify if the previous RCF was read properly.
-        new_source_delta = commit_delta_to_partition(
-            source_delta.partition_locator,
-            [self.INCREMENTAL_FILE_PATH],
-            **local_deltacat_storage_kwargs,
-        )
-
-        new_destination_partition = ds.get_partition(
-            dest_partition.stream_locator, [], **local_deltacat_storage_kwargs
-        )
-
-        new_rcf = compact_partition(
-            CompactPartitionParams.of(
-                {
-                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
-                    "compacted_file_content_type": ContentType.PARQUET,
-                    "dd_max_parallelism_ratio": 1.0,
-                    "deltacat_storage": ds,
-                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
-                    "destination_partition_locator": new_destination_partition.locator,
-                    "drop_duplicates": True,
-                    "hash_bucket_count": 4,
-                    "last_stream_position_to_compact": new_source_delta.stream_position,
-                    "list_deltas_kwargs": {
-                        **local_deltacat_storage_kwargs,
-                        **{"equivalent_table_types": []},
-                    },
-                    "primary_keys": ["pk"],
-                    "rebase_source_partition_locator": None,
-                    "rebase_source_partition_high_watermark": None,
-                    "records_per_compacted_file": 4000,
-                    "s3_client_kwargs": {},
-                    "source_partition_locator": new_source_delta.partition_locator,
-                }
-            )
-        )
-
-        incremental_rcf = get_rcf(s3_resource, new_rcf)
-        assert incremental_rcf.hash_bucket_count == 4
-        assert len(incremental_rcf.hb_index_to_entry_range) == 2
-
-    def test_compact_partition_when_bucket_spec_validation_succeeds(
-        self,
-        s3_resource,
-        local_deltacat_storage_kwargs,
-        enable_bucketing_spec_validation,
-    ):
-        """
-        A test case which asserts the bucketing spec validation does not throw
-        and error when the validation succeeds.
-        """
-
-        # setup
-        staged_source = stage_partition_from_file_paths(
-            self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
-        )
-
-        source_delta = commit_delta_to_staged_partition(
-            staged_source, [self.BACKFILL_FILE_PATH], **local_deltacat_storage_kwargs
-        )
-
-        staged_dest = stage_partition_from_file_paths(
-            self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
-        )
-        dest_partition = ds.commit_partition(
-            staged_dest, **local_deltacat_storage_kwargs
-        )
-
-        # action
-        rcf_url = compact_partition(
-            CompactPartitionParams.of(
-                {
-                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
-                    "compacted_file_content_type": ContentType.PARQUET,
-                    "dd_max_parallelism_ratio": 1.0,
-                    "deltacat_storage": ds,
-                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
-                    "destination_partition_locator": dest_partition.locator,
-                    "drop_duplicates": True,
-                    "hash_bucket_count": 4,
-                    "last_stream_position_to_compact": source_delta.stream_position,
-                    "list_deltas_kwargs": {
-                        **local_deltacat_storage_kwargs,
-                        **{"equivalent_table_types": []},
-                    },
-                    "primary_keys": ["pk"],
-                    "rebase_source_partition_locator": source_delta.partition_locator,
-                    "rebase_source_partition_high_watermark": source_delta.stream_position,
-                    "records_per_compacted_file": 1,
-                    "s3_client_kwargs": {},
-                    "source_partition_locator": source_delta.partition_locator,
-                }
-            )
-        )
-
-        rcf = get_rcf(s3_resource, rcf_url)
-        assert rcf.hash_bucket_count == 4
-
-        # Now run an incremental compaction and verify if the previous RCF was read properly.
-        new_source_delta = commit_delta_to_partition(
-            source_delta.partition_locator,
-            [self.INCREMENTAL_FILE_PATH],
-            **local_deltacat_storage_kwargs,
-        )
-
-        new_destination_partition = ds.get_partition(
-            dest_partition.stream_locator, [], **local_deltacat_storage_kwargs
-        )
-
-        new_uri = compact_partition(
-            CompactPartitionParams.of(
-                {
-                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
-                    "compacted_file_content_type": ContentType.PARQUET,
-                    "dd_max_parallelism_ratio": 1.0,
-                    "deltacat_storage": ds,
-                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
-                    "destination_partition_locator": new_destination_partition.locator,
-                    "drop_duplicates": True,
-                    "hash_bucket_count": 4,
-                    "last_stream_position_to_compact": new_source_delta.stream_position,
-                    "list_deltas_kwargs": {
-                        **local_deltacat_storage_kwargs,
-                        **{"equivalent_table_types": []},
-                    },
-                    "primary_keys": ["pk"],
-                    "rebase_source_partition_locator": None,
-                    "rebase_source_partition_high_watermark": None,
-                    "records_per_compacted_file": 4000,
-                    "s3_client_kwargs": {},
-                    "source_partition_locator": new_source_delta.partition_locator,
-                }
-            )
-        )
-
-        rcf = get_rcf(s3_resource, new_uri)
-        assert rcf.hash_bucket_count == 4
```