deltacat 2.0.0b11__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
@@ -103,11 +103,6 @@ class IncrementalCompactionTestCaseParams(BaseCompactorTestCase):
|
|
103
103
|
add_late_deltas: Optional[List[Tuple[pa.Table, DeltaType, Optional[EntryParams]]]]
|
104
104
|
|
105
105
|
|
106
|
-
@dataclass(frozen=True)
|
107
|
-
class NoRCFOutputCompactionTestCaseParams(BaseCompactorTestCase):
|
108
|
-
pass
|
109
|
-
|
110
|
-
|
111
106
|
def with_compactor_version_func_test_param(
|
112
107
|
test_cases: Dict[str, BaseCompactorTestCase] = None
|
113
108
|
):
|
@@ -135,7 +130,7 @@ def with_compactor_version_func_test_param(
|
|
135
130
|
|
136
131
|
|
137
132
|
INCREMENTAL_TEST_CASES: Dict[str, IncrementalCompactionTestCaseParams] = {
|
138
|
-
"1-incremental-pkstr-sknone-
|
133
|
+
"1-incremental-pkstr-sknone-norci": IncrementalCompactionTestCaseParams(
|
139
134
|
primary_keys={"pk_col_1"},
|
140
135
|
sort_keys=ZERO_VALUED_SORT_KEY,
|
141
136
|
partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
|
@@ -161,7 +156,7 @@ INCREMENTAL_TEST_CASES: Dict[str, IncrementalCompactionTestCaseParams] = {
|
|
161
156
|
skip_enabled_compact_partition_drivers=None,
|
162
157
|
assert_compaction_audit=assert_compaction_audit,
|
163
158
|
),
|
164
|
-
"2-incremental-pkstr-skstr-
|
159
|
+
"2-incremental-pkstr-skstr-norci": IncrementalCompactionTestCaseParams(
|
165
160
|
primary_keys={"pk_col_1"},
|
166
161
|
sort_keys=ZERO_VALUED_SORT_KEY,
|
167
162
|
partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
|
@@ -190,7 +185,7 @@ INCREMENTAL_TEST_CASES: Dict[str, IncrementalCompactionTestCaseParams] = {
|
|
190
185
|
skip_enabled_compact_partition_drivers=None,
|
191
186
|
assert_compaction_audit=assert_compaction_audit,
|
192
187
|
),
|
193
|
-
"3-incremental-pkstr-multiskstr-
|
188
|
+
"3-incremental-pkstr-multiskstr-norci": IncrementalCompactionTestCaseParams(
|
194
189
|
primary_keys={"pk_col_1"},
|
195
190
|
sort_keys=[
|
196
191
|
SortKey.of(key=["sk_col_1"]),
|
@@ -599,6 +594,38 @@ INCREMENTAL_TEST_CASES: Dict[str, IncrementalCompactionTestCaseParams] = {
|
|
599
594
|
skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
|
600
595
|
assert_compaction_audit=None,
|
601
596
|
),
|
597
|
+
"15-incremental-empty-input-with-single-hash-bucket": IncrementalCompactionTestCaseParams(
|
598
|
+
primary_keys={"pk_col_1"},
|
599
|
+
sort_keys=[SortKey.of(key=["sk_col_1"])],
|
600
|
+
partition_keys=ZERO_VALUED_PARTITION_KEYS_PARAM,
|
601
|
+
partition_values=ZERO_VALUED_PARTITION_VALUES_PARAM,
|
602
|
+
input_deltas=pa.Table.from_arrays(
|
603
|
+
[
|
604
|
+
pa.array([]),
|
605
|
+
pa.array([]),
|
606
|
+
],
|
607
|
+
names=["pk_col_1", "sk_col_1"],
|
608
|
+
),
|
609
|
+
input_deltas_delta_type=DeltaType.UPSERT,
|
610
|
+
expected_terminal_compact_partition_result=pa.Table.from_arrays(
|
611
|
+
[
|
612
|
+
pa.array([]),
|
613
|
+
pa.array([]),
|
614
|
+
],
|
615
|
+
names=["pk_col_1", "sk_col_1"],
|
616
|
+
),
|
617
|
+
expected_terminal_exception=None,
|
618
|
+
expected_terminal_exception_message=None,
|
619
|
+
do_create_placement_group=False,
|
620
|
+
records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
|
621
|
+
hash_bucket_count=1,
|
622
|
+
read_kwargs_provider=None,
|
623
|
+
drop_duplicates=True,
|
624
|
+
is_inplace=False,
|
625
|
+
add_late_deltas=None,
|
626
|
+
skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
|
627
|
+
assert_compaction_audit=assert_compaction_audit_no_hash_bucket,
|
628
|
+
),
|
602
629
|
}
|
603
630
|
|
604
631
|
INCREMENTAL_TEST_CASES = with_compactor_version_func_test_param(INCREMENTAL_TEST_CASES)
|
@@ -49,7 +49,7 @@ class TestRepartitionRange(unittest.TestCase):
|
|
49
49
|
self.destination_partition: Partition = MagicMock()
|
50
50
|
self.repartition_args = {"column": "last_updated", "ranges": [1678665487112747]}
|
51
51
|
self.max_records_per_output_file = 2
|
52
|
-
self.
|
52
|
+
self.table_writer_kwargs = {}
|
53
53
|
self.repartitioned_file_content_type = ContentType.PARQUET
|
54
54
|
self.deltacat_storage = MagicMock()
|
55
55
|
self.deltacat_storage_kwargs = MagicMock()
|
@@ -60,7 +60,7 @@ class TestRepartitionRange(unittest.TestCase):
|
|
60
60
|
self.destination_partition,
|
61
61
|
self.repartition_args,
|
62
62
|
self.max_records_per_output_file,
|
63
|
-
self.
|
63
|
+
self.table_writer_kwargs,
|
64
64
|
self.repartitioned_file_content_type,
|
65
65
|
self.deltacat_storage,
|
66
66
|
self.deltacat_storage_kwargs,
|
@@ -87,7 +87,7 @@ class TestRepartitionRange(unittest.TestCase):
|
|
87
87
|
self.destination_partition,
|
88
88
|
self.repartition_args,
|
89
89
|
self.max_records_per_output_file,
|
90
|
-
self.
|
90
|
+
self.table_writer_kwargs,
|
91
91
|
self.repartitioned_file_content_type,
|
92
92
|
self.deltacat_storage,
|
93
93
|
self.deltacat_storage_kwargs,
|
@@ -101,7 +101,7 @@ class TestRepartitionRange(unittest.TestCase):
|
|
101
101
|
self.destination_partition,
|
102
102
|
self.repartition_args,
|
103
103
|
self.max_records_per_output_file,
|
104
|
-
self.
|
104
|
+
self.table_writer_kwargs,
|
105
105
|
self.repartitioned_file_content_type,
|
106
106
|
self.deltacat_storage,
|
107
107
|
self.deltacat_storage_kwargs,
|
@@ -114,7 +114,7 @@ class TestRepartitionRange(unittest.TestCase):
|
|
114
114
|
self.destination_partition,
|
115
115
|
self.repartition_args,
|
116
116
|
self.max_records_per_output_file,
|
117
|
-
self.
|
117
|
+
self.table_writer_kwargs,
|
118
118
|
self.repartitioned_file_content_type,
|
119
119
|
self.deltacat_storage,
|
120
120
|
self.deltacat_storage_kwargs,
|
@@ -128,7 +128,7 @@ class TestRepartitionRange(unittest.TestCase):
|
|
128
128
|
self.destination_partition,
|
129
129
|
self.repartition_args,
|
130
130
|
self.max_records_per_output_file,
|
131
|
-
self.
|
131
|
+
self.table_writer_kwargs,
|
132
132
|
self.repartitioned_file_content_type,
|
133
133
|
self.deltacat_storage,
|
134
134
|
self.deltacat_storage_kwargs,
|
@@ -143,7 +143,7 @@ class TestRepartitionRange(unittest.TestCase):
|
|
143
143
|
self.destination_partition,
|
144
144
|
self.repartition_args,
|
145
145
|
self.max_records_per_output_file,
|
146
|
-
self.
|
146
|
+
self.table_writer_kwargs,
|
147
147
|
self.repartitioned_file_content_type,
|
148
148
|
self.deltacat_storage,
|
149
149
|
self.deltacat_storage_kwargs,
|
@@ -158,7 +158,7 @@ class TestRepartitionRange(unittest.TestCase):
|
|
158
158
|
self.destination_partition,
|
159
159
|
self.repartition_args,
|
160
160
|
self.max_records_per_output_file,
|
161
|
-
self.
|
161
|
+
self.table_writer_kwargs,
|
162
162
|
self.repartitioned_file_content_type,
|
163
163
|
self.deltacat_storage,
|
164
164
|
self.deltacat_storage_kwargs,
|
@@ -175,7 +175,7 @@ class TestRepartitionRange(unittest.TestCase):
|
|
175
175
|
self.destination_partition,
|
176
176
|
self.repartition_args,
|
177
177
|
self.max_records_per_output_file,
|
178
|
-
self.
|
178
|
+
self.table_writer_kwargs,
|
179
179
|
self.repartitioned_file_content_type,
|
180
180
|
self.deltacat_storage,
|
181
181
|
self.deltacat_storage_kwargs,
|
@@ -189,7 +189,7 @@ class TestRepartitionRange(unittest.TestCase):
|
|
189
189
|
self.destination_partition,
|
190
190
|
self.repartition_args,
|
191
191
|
self.max_records_per_output_file,
|
192
|
-
self.
|
192
|
+
self.table_writer_kwargs,
|
193
193
|
self.repartitioned_file_content_type,
|
194
194
|
self.deltacat_storage,
|
195
195
|
self.deltacat_storage_kwargs,
|
@@ -206,7 +206,7 @@ class TestRepartitionRange(unittest.TestCase):
|
|
206
206
|
self.destination_partition,
|
207
207
|
self.repartition_args,
|
208
208
|
self.max_records_per_output_file,
|
209
|
-
self.
|
209
|
+
self.table_writer_kwargs,
|
210
210
|
self.repartitioned_file_content_type,
|
211
211
|
self.deltacat_storage,
|
212
212
|
),
|
@@ -233,7 +233,7 @@ class TestRepartitionRange(unittest.TestCase):
|
|
233
233
|
self.destination_partition,
|
234
234
|
self.repartition_args,
|
235
235
|
self.max_records_per_output_file,
|
236
|
-
self.
|
236
|
+
self.table_writer_kwargs,
|
237
237
|
self.repartitioned_file_content_type,
|
238
238
|
self.deltacat_storage,
|
239
239
|
self.deltacat_storage_kwargs,
|
@@ -1,131 +1,135 @@
|
|
1
|
-
import
|
1
|
+
import pytest
|
2
2
|
from unittest import mock
|
3
3
|
|
4
|
-
from deltacat.tests.compute.conftest import (
|
5
|
-
create_local_deltacat_storage_file,
|
6
|
-
clean_up_local_deltacat_storage_file,
|
7
|
-
)
|
8
4
|
from deltacat.tests.test_utils.constants import TEST_UPSERT_DELTA
|
9
5
|
|
10
6
|
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
7
|
+
@pytest.fixture(scope="module", autouse=True)
|
8
|
+
def mock_ray():
|
9
|
+
"""Mock ray module for all tests in this module"""
|
10
|
+
module_patcher = mock.patch.dict("sys.modules", {"ray": mock.MagicMock()})
|
11
|
+
module_patcher.start()
|
12
|
+
yield
|
13
|
+
module_patcher.stop()
|
14
|
+
|
15
|
+
|
16
|
+
@pytest.fixture
|
17
|
+
def compaction_audit():
|
18
|
+
"""Fixture for CompactionSessionAuditInfo"""
|
19
|
+
from deltacat.compute.compactor.model.compaction_session_audit_info import (
|
20
|
+
CompactionSessionAuditInfo,
|
21
|
+
)
|
22
|
+
|
23
|
+
return CompactionSessionAuditInfo("1.0", "2.3", "test")
|
24
|
+
|
25
|
+
|
26
|
+
def test_sanity(main_deltacat_storage_kwargs, compaction_audit):
|
27
|
+
from deltacat.compute.compactor.utils import io
|
28
|
+
from deltacat.storage import metastore
|
29
|
+
|
30
|
+
(
|
31
|
+
delta_list,
|
32
|
+
hash_bucket_count,
|
33
|
+
high_watermark,
|
34
|
+
require_multiple_rounds,
|
35
|
+
) = io.fit_input_deltas(
|
36
|
+
[TEST_UPSERT_DELTA],
|
37
|
+
{"CPU": 1, "memory": 20000000},
|
38
|
+
compaction_audit,
|
39
|
+
None,
|
40
|
+
metastore,
|
41
|
+
main_deltacat_storage_kwargs,
|
42
|
+
)
|
43
|
+
|
44
|
+
assert hash_bucket_count is not None
|
45
|
+
assert len(delta_list) == 1
|
46
|
+
assert high_watermark is not None
|
47
|
+
assert require_multiple_rounds is False
|
48
|
+
assert compaction_audit.hash_bucket_count is not None
|
49
|
+
assert compaction_audit.input_file_count is not None
|
50
|
+
assert compaction_audit.input_size_bytes is not None
|
51
|
+
assert compaction_audit.total_cluster_memory_bytes is not None
|
52
|
+
|
53
|
+
|
54
|
+
def test_when_hash_bucket_count_overridden(
|
55
|
+
main_deltacat_storage_kwargs, compaction_audit
|
56
|
+
):
|
57
|
+
from deltacat.compute.compactor.utils import io
|
58
|
+
from deltacat.storage import metastore
|
59
|
+
|
60
|
+
(
|
61
|
+
delta_list,
|
62
|
+
hash_bucket_count,
|
63
|
+
high_watermark,
|
64
|
+
require_multiple_rounds,
|
65
|
+
) = io.fit_input_deltas(
|
66
|
+
[TEST_UPSERT_DELTA],
|
67
|
+
{"CPU": 1, "memory": 20000000},
|
68
|
+
compaction_audit,
|
69
|
+
20,
|
70
|
+
metastore,
|
71
|
+
main_deltacat_storage_kwargs,
|
72
|
+
)
|
73
|
+
|
74
|
+
assert hash_bucket_count == 20
|
75
|
+
assert len(delta_list) == 1
|
76
|
+
assert high_watermark is not None
|
77
|
+
assert require_multiple_rounds is False
|
78
|
+
|
79
|
+
|
80
|
+
def test_when_not_enough_memory_splits_manifest_entries(
|
81
|
+
main_deltacat_storage_kwargs, compaction_audit
|
82
|
+
):
|
83
|
+
from deltacat.compute.compactor.utils import io
|
84
|
+
from deltacat.storage import metastore
|
85
|
+
|
86
|
+
(
|
87
|
+
delta_list,
|
88
|
+
hash_bucket_count,
|
89
|
+
high_watermark,
|
90
|
+
require_multiple_rounds,
|
91
|
+
) = io.fit_input_deltas(
|
92
|
+
[TEST_UPSERT_DELTA],
|
93
|
+
{"CPU": 2, "memory": 10},
|
94
|
+
compaction_audit,
|
95
|
+
20,
|
96
|
+
metastore,
|
97
|
+
main_deltacat_storage_kwargs,
|
98
|
+
)
|
99
|
+
|
100
|
+
assert hash_bucket_count is not None
|
101
|
+
assert len(delta_list) == 2
|
102
|
+
assert high_watermark is not None
|
103
|
+
assert require_multiple_rounds is False
|
104
|
+
|
105
|
+
|
106
|
+
def test_when_no_input_deltas(main_deltacat_storage_kwargs, compaction_audit):
|
107
|
+
from deltacat.compute.compactor.utils import io
|
108
|
+
from deltacat.storage import metastore
|
109
|
+
|
110
|
+
with pytest.raises(AssertionError):
|
111
|
+
io.fit_input_deltas(
|
112
|
+
[],
|
113
|
+
{"CPU": 100, "memory": 20000.0},
|
114
|
+
compaction_audit,
|
115
|
+
None,
|
116
|
+
metastore,
|
117
|
+
main_deltacat_storage_kwargs,
|
19
118
|
)
|
20
119
|
|
21
|
-
cls.kwargs_for_local_deltacat_storage = create_local_deltacat_storage_file()
|
22
|
-
|
23
|
-
cls.COMPACTION_AUDIT = CompactionSessionAuditInfo("1.0", "2.3", "test")
|
24
|
-
|
25
|
-
super().setUpClass()
|
26
|
-
|
27
|
-
@classmethod
|
28
|
-
def tearDownClass(cls) -> None:
|
29
|
-
cls.module_patcher.stop()
|
30
|
-
clean_up_local_deltacat_storage_file(cls.kwargs_for_local_deltacat_storage)
|
31
120
|
|
32
|
-
|
33
|
-
|
34
|
-
|
121
|
+
def test_when_cpu_resources_is_not_passed(
|
122
|
+
main_deltacat_storage_kwargs, compaction_audit
|
123
|
+
):
|
124
|
+
from deltacat.compute.compactor.utils import io
|
125
|
+
from deltacat.storage import metastore
|
35
126
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
) = io.fit_input_deltas(
|
42
|
-
[TEST_UPSERT_DELTA],
|
43
|
-
{"CPU": 1, "memory": 20000000},
|
44
|
-
self.COMPACTION_AUDIT,
|
127
|
+
with pytest.raises(KeyError):
|
128
|
+
io.fit_input_deltas(
|
129
|
+
[],
|
130
|
+
{},
|
131
|
+
compaction_audit,
|
45
132
|
None,
|
46
|
-
|
47
|
-
|
133
|
+
metastore,
|
134
|
+
main_deltacat_storage_kwargs,
|
48
135
|
)
|
49
|
-
|
50
|
-
self.assertIsNotNone(hash_bucket_count)
|
51
|
-
self.assertTrue(1, len(delta_list))
|
52
|
-
self.assertIsNotNone(high_watermark)
|
53
|
-
self.assertFalse(require_multiple_rounds)
|
54
|
-
self.assertIsNotNone(hash_bucket_count, self.COMPACTION_AUDIT.hash_bucket_count)
|
55
|
-
self.assertIsNotNone(self.COMPACTION_AUDIT.input_file_count)
|
56
|
-
self.assertIsNotNone(self.COMPACTION_AUDIT.input_size_bytes)
|
57
|
-
self.assertIsNotNone(self.COMPACTION_AUDIT.total_cluster_memory_bytes)
|
58
|
-
|
59
|
-
def test_when_hash_bucket_count_overridden(self):
|
60
|
-
from deltacat.compute.compactor.utils import io
|
61
|
-
import deltacat.tests.local_deltacat_storage as ds
|
62
|
-
|
63
|
-
(
|
64
|
-
delta_list,
|
65
|
-
hash_bucket_count,
|
66
|
-
high_watermark,
|
67
|
-
require_multiple_rounds,
|
68
|
-
) = io.fit_input_deltas(
|
69
|
-
[TEST_UPSERT_DELTA],
|
70
|
-
{"CPU": 1, "memory": 20000000},
|
71
|
-
self.COMPACTION_AUDIT,
|
72
|
-
20,
|
73
|
-
ds,
|
74
|
-
self.kwargs_for_local_deltacat_storage,
|
75
|
-
)
|
76
|
-
|
77
|
-
self.assertEqual(20, hash_bucket_count)
|
78
|
-
self.assertEqual(1, len(delta_list))
|
79
|
-
self.assertIsNotNone(high_watermark)
|
80
|
-
self.assertFalse(require_multiple_rounds)
|
81
|
-
|
82
|
-
def test_when_not_enough_memory_splits_manifest_entries(self):
|
83
|
-
from deltacat.compute.compactor.utils import io
|
84
|
-
import deltacat.tests.local_deltacat_storage as ds
|
85
|
-
|
86
|
-
(
|
87
|
-
delta_list,
|
88
|
-
hash_bucket_count,
|
89
|
-
high_watermark,
|
90
|
-
require_multiple_rounds,
|
91
|
-
) = io.fit_input_deltas(
|
92
|
-
[TEST_UPSERT_DELTA],
|
93
|
-
{"CPU": 2, "memory": 10},
|
94
|
-
self.COMPACTION_AUDIT,
|
95
|
-
20,
|
96
|
-
ds,
|
97
|
-
self.kwargs_for_local_deltacat_storage,
|
98
|
-
)
|
99
|
-
|
100
|
-
self.assertIsNotNone(hash_bucket_count)
|
101
|
-
self.assertTrue(2, len(delta_list))
|
102
|
-
self.assertIsNotNone(high_watermark)
|
103
|
-
self.assertFalse(require_multiple_rounds)
|
104
|
-
|
105
|
-
def test_when_no_input_deltas(self):
|
106
|
-
from deltacat.compute.compactor.utils import io
|
107
|
-
import deltacat.tests.local_deltacat_storage as ds
|
108
|
-
|
109
|
-
with self.assertRaises(AssertionError):
|
110
|
-
io.fit_input_deltas(
|
111
|
-
[],
|
112
|
-
{"CPU": 100, "memory": 20000.0},
|
113
|
-
self.COMPACTION_AUDIT,
|
114
|
-
None,
|
115
|
-
ds,
|
116
|
-
self.kwargs_for_local_deltacat_storage,
|
117
|
-
)
|
118
|
-
|
119
|
-
def test_when_cpu_resources_is_not_passed(self):
|
120
|
-
from deltacat.compute.compactor.utils import io
|
121
|
-
import deltacat.tests.local_deltacat_storage as ds
|
122
|
-
|
123
|
-
with self.assertRaises(KeyError):
|
124
|
-
io.fit_input_deltas(
|
125
|
-
[],
|
126
|
-
{},
|
127
|
-
self.COMPACTION_AUDIT,
|
128
|
-
None,
|
129
|
-
ds,
|
130
|
-
self.kwargs_for_local_deltacat_storage,
|
131
|
-
)
|
@@ -0,0 +1,254 @@
|
|
1
|
+
from unittest.mock import Mock
|
2
|
+
from deltacat.compute.compactor.utils.round_completion_reader import (
|
3
|
+
read_round_completion_info,
|
4
|
+
)
|
5
|
+
from deltacat.tests.compute.test_util_common import get_test_partition_locator
|
6
|
+
from deltacat.compute.compactor import RoundCompletionInfo
|
7
|
+
from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
|
8
|
+
from deltacat.storage.model.partition import Partition
|
9
|
+
|
10
|
+
|
11
|
+
class TestRoundCompletionInfoInPartition:
|
12
|
+
def test_read_round_completion_info_from_partition_with_matching_source(self):
|
13
|
+
"""
|
14
|
+
Test reading RoundCompletionInfo from partition metafile with matching source partition locator.
|
15
|
+
"""
|
16
|
+
source_locator = get_test_partition_locator("source")
|
17
|
+
destination_locator = get_test_partition_locator("destination")
|
18
|
+
|
19
|
+
# Create a test RoundCompletionInfo with prev_source_partition_locator
|
20
|
+
pawr = PyArrowWriteResult.of(
|
21
|
+
file_count=1, pyarrow_bytes=1000, file_bytes=1000, record_count=100
|
22
|
+
)
|
23
|
+
|
24
|
+
expected_rci = RoundCompletionInfo.of(
|
25
|
+
high_watermark=122,
|
26
|
+
compacted_delta_locator=None,
|
27
|
+
compacted_pyarrow_write_result=pawr,
|
28
|
+
sort_keys_bit_width=12,
|
29
|
+
prev_source_partition_locator=source_locator,
|
30
|
+
)
|
31
|
+
|
32
|
+
# Create a partition with RoundCompletionInfo
|
33
|
+
partition = Partition.of(
|
34
|
+
locator=destination_locator,
|
35
|
+
content_types=None,
|
36
|
+
compaction_round_completion_info=expected_rci,
|
37
|
+
)
|
38
|
+
|
39
|
+
# Mock the storage
|
40
|
+
mock_storage = Mock()
|
41
|
+
|
42
|
+
# Test reading with partition provided (no storage call needed)
|
43
|
+
rci = read_round_completion_info(
|
44
|
+
source_partition_locator=source_locator,
|
45
|
+
destination_partition_locator=destination_locator,
|
46
|
+
deltacat_storage=mock_storage,
|
47
|
+
deltacat_storage_kwargs={},
|
48
|
+
destination_partition=partition,
|
49
|
+
)
|
50
|
+
|
51
|
+
assert rci is not None
|
52
|
+
assert rci == expected_rci
|
53
|
+
assert rci.high_watermark == 122
|
54
|
+
assert rci.sort_keys_bit_width == 12
|
55
|
+
assert (
|
56
|
+
rci.prev_source_partition_locator.partition_id
|
57
|
+
== source_locator.partition_id
|
58
|
+
)
|
59
|
+
|
60
|
+
# Verify storage was not called since partition was provided
|
61
|
+
mock_storage.get_partition.assert_not_called()
|
62
|
+
|
63
|
+
def test_read_round_completion_info_from_partition_with_mismatched_source(self):
|
64
|
+
"""
|
65
|
+
Test reading RoundCompletionInfo from partition metafile with mismatched source partition locator.
|
66
|
+
Should return None and log a warning.
|
67
|
+
"""
|
68
|
+
source_locator = get_test_partition_locator("source")
|
69
|
+
different_source_locator = get_test_partition_locator("different_source")
|
70
|
+
destination_locator = get_test_partition_locator("destination")
|
71
|
+
|
72
|
+
# Create a test RoundCompletionInfo with different prev_source_partition_locator
|
73
|
+
pawr = PyArrowWriteResult.of(
|
74
|
+
file_count=1, pyarrow_bytes=1000, file_bytes=1000, record_count=100
|
75
|
+
)
|
76
|
+
|
77
|
+
expected_rci = RoundCompletionInfo.of(
|
78
|
+
high_watermark=122,
|
79
|
+
compacted_delta_locator=None,
|
80
|
+
compacted_pyarrow_write_result=pawr,
|
81
|
+
sort_keys_bit_width=12,
|
82
|
+
prev_source_partition_locator=different_source_locator, # Different from source_locator
|
83
|
+
)
|
84
|
+
|
85
|
+
# Create a partition with RoundCompletionInfo
|
86
|
+
partition = Partition.of(
|
87
|
+
locator=destination_locator,
|
88
|
+
content_types=None,
|
89
|
+
compaction_round_completion_info=expected_rci,
|
90
|
+
)
|
91
|
+
|
92
|
+
# Mock the storage
|
93
|
+
mock_storage = Mock()
|
94
|
+
|
95
|
+
# Test reading with mismatched source locator
|
96
|
+
rci = read_round_completion_info(
|
97
|
+
source_partition_locator=source_locator, # Different from the one in RoundCompletionInfo
|
98
|
+
destination_partition_locator=destination_locator,
|
99
|
+
deltacat_storage=mock_storage,
|
100
|
+
deltacat_storage_kwargs={},
|
101
|
+
destination_partition=partition,
|
102
|
+
)
|
103
|
+
|
104
|
+
# Should return None due to mismatch
|
105
|
+
assert rci is None
|
106
|
+
|
107
|
+
# Verify storage was not called since partition was provided
|
108
|
+
mock_storage.get_partition.assert_not_called()
|
109
|
+
|
110
|
+
def test_read_round_completion_info_from_storage_when_partition_not_provided(self):
    """
    When no destination partition is passed in, the partition is fetched from
    storage and its RoundCompletionInfo is returned.
    """
    src = get_test_partition_locator("source")
    dest = get_test_partition_locator("destination")

    # Completion info recorded for the same source that is requested below.
    expected_rci = RoundCompletionInfo.of(
        high_watermark=122,
        compacted_delta_locator=None,
        compacted_pyarrow_write_result=PyArrowWriteResult.of(
            file_count=1, pyarrow_bytes=1000, file_bytes=1000, record_count=100
        ),
        sort_keys_bit_width=12,
        prev_source_partition_locator=src,
    )

    # Storage double that serves a destination partition holding that info.
    stored_partition = Partition.of(
        locator=dest,
        content_types=None,
        compaction_round_completion_info=expected_rci,
    )
    storage = Mock()
    storage.get_partition.return_value = stored_partition

    # No destination_partition argument: the lookup has to go through storage.
    rci = read_round_completion_info(
        source_partition_locator=src,
        destination_partition_locator=dest,
        deltacat_storage=storage,
        deltacat_storage_kwargs={"test_arg": "test_value"},
    )

    assert rci is not None
    assert rci == expected_rci
    assert rci.high_watermark == 122

    # Exactly one lookup, keyed by the destination's stream locator and
    # partition values, with the storage kwargs forwarded as keyword args.
    storage.get_partition.assert_called_once_with(
        dest.stream_locator,
        dest.partition_values,
        test_arg="test_value",
    )
|
159
|
+
|
160
|
+
def test_read_round_completion_info_when_partition_not_found(self):
    """
    A storage lookup that yields no destination partition results in None.
    """
    # Storage double reporting that the partition does not exist.
    storage = Mock()
    storage.get_partition.return_value = None

    src = get_test_partition_locator("source")
    dest = get_test_partition_locator("destination")

    # No destination partition is supplied, so storage is the only source.
    rci = read_round_completion_info(
        source_partition_locator=src,
        destination_partition_locator=dest,
        deltacat_storage=storage,
        deltacat_storage_kwargs={},
    )

    # Nothing to read from a missing partition.
    assert rci is None

    # The lookup itself must still have been attempted exactly once.
    storage.get_partition.assert_called_once()
|
184
|
+
|
185
|
+
def test_read_round_completion_info_when_no_completion_info_in_partition(self):
    """
    Test reading RoundCompletionInfo when partition exists but has no completion info.

    The destination partition is served by the mocked storage, so the test
    also verifies that storage was actually consulted; otherwise a None
    result could come from a code path that never looked at the partition.
    """
    source_locator = get_test_partition_locator("source")
    destination_locator = get_test_partition_locator("destination")

    # Create a partition without RoundCompletionInfo
    partition = Partition.of(
        locator=destination_locator,
        content_types=None,
        compaction_round_completion_info=None,
    )

    # Mock the storage to return the partition
    mock_storage = Mock()
    mock_storage.get_partition.return_value = partition

    # Test reading when no completion info in partition
    rci = read_round_completion_info(
        source_partition_locator=source_locator,
        destination_partition_locator=destination_locator,
        deltacat_storage=mock_storage,
        deltacat_storage_kwargs={},
    )

    # Should return None when no completion info
    assert rci is None

    # Verify storage was called (same check as the partition-not-found test,
    # which drives the reader with identical arguments)
    mock_storage.get_partition.assert_called_once()
|
213
|
+
|
214
|
+
def test_read_with_missing_prev_source_partition_locator_returns_none(self):
    """
    Completion info that carries no prev_source_partition_locator is treated
    as unusable: the read returns None.
    """
    storage = Mock()

    src = get_test_partition_locator("source")
    dest = get_test_partition_locator("destination")

    # Completion info deliberately built without a previous source locator.
    write_result = PyArrowWriteResult.of(
        file_count=1, pyarrow_bytes=1000, file_bytes=1000, record_count=100
    )
    rci_without_source = RoundCompletionInfo.of(
        high_watermark=122,
        compacted_delta_locator=None,
        compacted_pyarrow_write_result=write_result,
        sort_keys_bit_width=12,
        prev_source_partition_locator=None,
    )

    # Destination partition holding the incomplete completion info.
    destination_partition = Partition.of(
        locator=dest,
        content_types=None,
        compaction_round_completion_info=rci_without_source,
    )

    # The partition is passed in directly alongside the mocked storage.
    result = read_round_completion_info(
        source_partition_locator=src,
        destination_partition_locator=dest,
        deltacat_storage=storage,
        deltacat_storage_kwargs={},
        destination_partition=destination_partition,
    )

    # Without a recorded previous source there is nothing to match against.
    assert result is None
|