deltacat 1.1.36__py3-none-any.whl → 2.0.0b2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +42 -3
- deltacat/annotations.py +36 -0
- deltacat/api.py +168 -0
- deltacat/aws/s3u.py +4 -4
- deltacat/benchmarking/benchmark_engine.py +82 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +21 -0
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
- deltacat/catalog/__init__.py +14 -0
- deltacat/catalog/delegate.py +199 -106
- deltacat/catalog/iceberg/__init__.py +4 -0
- deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/catalog/iceberg/impl.py +368 -0
- deltacat/catalog/iceberg/overrides.py +74 -0
- deltacat/catalog/interface.py +273 -76
- deltacat/catalog/main/impl.py +720 -0
- deltacat/catalog/model/catalog.py +227 -20
- deltacat/catalog/model/properties.py +116 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +5 -5
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +1 -1
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +1 -1
- deltacat/compute/compactor/steps/materialize.py +6 -2
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor_v2/compaction_session.py +5 -9
- deltacat/compute/compactor_v2/constants.py +1 -30
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/merge_input.py +1 -7
- deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
- deltacat/compute/compactor_v2/steps/merge.py +17 -126
- deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/io.py +1 -1
- deltacat/compute/compactor_v2/utils/merge.py +0 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
- deltacat/compute/compactor_v2/utils/task_options.py +23 -43
- deltacat/compute/converter/constants.py +4 -0
- deltacat/compute/converter/converter_session.py +143 -0
- deltacat/compute/converter/model/convert_input.py +69 -0
- deltacat/compute/converter/model/convert_input_files.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +99 -0
- deltacat/compute/converter/pyiceberg/__init__.py +0 -0
- deltacat/compute/converter/pyiceberg/catalog.py +75 -0
- deltacat/compute/converter/pyiceberg/overrides.py +135 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
- deltacat/compute/converter/steps/__init__.py +0 -0
- deltacat/compute/converter/steps/convert.py +211 -0
- deltacat/compute/converter/steps/dedupe.py +60 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +88 -0
- deltacat/compute/converter/utils/converter_session_utils.py +109 -0
- deltacat/compute/converter/utils/iceberg_columns.py +82 -0
- deltacat/compute/converter/utils/io.py +43 -0
- deltacat/compute/converter/utils/s3u.py +133 -0
- deltacat/compute/resource_estimation/delta.py +1 -19
- deltacat/constants.py +47 -1
- deltacat/env.py +51 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/common/__init__.py +0 -0
- deltacat/examples/common/fixtures.py +15 -0
- deltacat/examples/hello_world.py +27 -0
- deltacat/examples/iceberg/__init__.py +0 -0
- deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
- deltacat/examples/iceberg/iceberg_reader.py +149 -0
- deltacat/exceptions.py +51 -9
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +118 -28
- deltacat/storage/iceberg/__init__.py +0 -0
- deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
- deltacat/storage/iceberg/impl.py +737 -0
- deltacat/storage/iceberg/model.py +709 -0
- deltacat/storage/interface.py +217 -134
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +2077 -0
- deltacat/storage/model/delta.py +118 -71
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -3
- deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
- deltacat/storage/model/metafile.py +1316 -0
- deltacat/storage/model/namespace.py +34 -18
- deltacat/storage/model/partition.py +362 -37
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +19 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +892 -0
- deltacat/storage/model/shard.py +47 -0
- deltacat/storage/model/sort_key.py +170 -13
- deltacat/storage/model/stream.py +208 -80
- deltacat/storage/model/table.py +123 -29
- deltacat/storage/model/table_version.py +322 -46
- deltacat/storage/model/transaction.py +757 -0
- deltacat/storage/model/transform.py +198 -61
- deltacat/storage/model/types.py +111 -13
- deltacat/storage/rivulet/__init__.py +11 -0
- deltacat/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/storage/rivulet/arrow/serializer.py +75 -0
- deltacat/storage/rivulet/dataset.py +744 -0
- deltacat/storage/rivulet/dataset_executor.py +87 -0
- deltacat/storage/rivulet/feather/__init__.py +5 -0
- deltacat/storage/rivulet/feather/file_reader.py +136 -0
- deltacat/storage/rivulet/feather/serializer.py +35 -0
- deltacat/storage/rivulet/fs/__init__.py +0 -0
- deltacat/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/storage/rivulet/fs/file_store.py +130 -0
- deltacat/storage/rivulet/fs/input_file.py +76 -0
- deltacat/storage/rivulet/fs/output_file.py +86 -0
- deltacat/storage/rivulet/logical_plan.py +105 -0
- deltacat/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/storage/rivulet/metastore/delta.py +190 -0
- deltacat/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/storage/rivulet/metastore/sst.py +82 -0
- deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/storage/rivulet/mvp/Table.py +101 -0
- deltacat/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/storage/rivulet/parquet/file_reader.py +127 -0
- deltacat/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/storage/rivulet/reader/__init__.py +0 -0
- deltacat/storage/rivulet/reader/block_scanner.py +378 -0
- deltacat/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/storage/rivulet/reader/data_scan.py +63 -0
- deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
- deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
- deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
- deltacat/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/storage/rivulet/schema/__init__.py +0 -0
- deltacat/storage/rivulet/schema/datatype.py +128 -0
- deltacat/storage/rivulet/schema/schema.py +251 -0
- deltacat/storage/rivulet/serializer.py +40 -0
- deltacat/storage/rivulet/serializer_factory.py +42 -0
- deltacat/storage/rivulet/writer/__init__.py +0 -0
- deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/catalog/test_catalogs.py +324 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +19 -53
- deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
- deltacat/tests/compute/compactor/utils/test_io.py +6 -8
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
- deltacat/tests/compute/conftest.py +75 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +478 -0
- deltacat/tests/compute/converter/utils.py +123 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
- deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
- deltacat/tests/compute/test_compact_partition_params.py +3 -3
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
- deltacat/tests/compute/test_util_common.py +19 -12
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
- deltacat/tests/local_deltacat_storage/__init__.py +76 -103
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/conftest.py +25 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +1399 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_metafile_io.py +2535 -0
- deltacat/tests/storage/model/test_schema.py +308 -0
- deltacat/tests/storage/model/test_shard.py +22 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +308 -0
- deltacat/tests/storage/rivulet/__init__.py +0 -0
- deltacat/tests/storage/rivulet/conftest.py +149 -0
- deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
- deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/storage/rivulet/test_dataset.py +406 -0
- deltacat/tests/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/storage/rivulet/test_utils.py +122 -0
- deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/test_deltacat_api.py +39 -0
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +8 -15
- deltacat/tests/test_utils/storage.py +266 -3
- deltacat/tests/utils/test_daft.py +3 -3
- deltacat/tests/utils/test_pyarrow.py +0 -432
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +1 -1
- deltacat/utils/export.py +59 -0
- deltacat/utils/filesystem.py +320 -0
- deltacat/utils/metafile_locator.py +73 -0
- deltacat/utils/pyarrow.py +36 -183
- deltacat-2.0.0b2.dist-info/METADATA +65 -0
- deltacat-2.0.0b2.dist-info/RECORD +349 -0
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
- deltacat-1.1.36.dist-info/METADATA +0 -64
- deltacat-1.1.36.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/LICENSE +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/WHEEL +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.0b2.dist-info}/top_level.txt +0 -0
@@ -50,11 +50,6 @@ from deltacat.utils.placement import (
|
|
50
50
|
)
|
51
51
|
from deltacat import logs
|
52
52
|
|
53
|
-
DATABASE_FILE_PATH_KEY, DATABASE_FILE_PATH_VALUE = (
|
54
|
-
"db_file_path",
|
55
|
-
"deltacat/tests/local_deltacat_storage/db_test.sqlite",
|
56
|
-
)
|
57
|
-
|
58
53
|
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
59
54
|
|
60
55
|
|
@@ -80,14 +75,6 @@ def mock_aws_credential():
|
|
80
75
|
yield
|
81
76
|
|
82
77
|
|
83
|
-
@pytest.fixture(autouse=True, scope="module")
|
84
|
-
def cleanup_the_database_file_after_all_compaction_session_package_tests_complete():
|
85
|
-
# make sure the database file is deleted after all the compactor package tests are completed
|
86
|
-
yield
|
87
|
-
if os.path.exists(DATABASE_FILE_PATH_VALUE):
|
88
|
-
os.remove(DATABASE_FILE_PATH_VALUE)
|
89
|
-
|
90
|
-
|
91
78
|
@pytest.fixture(scope="module")
|
92
79
|
def s3_resource():
|
93
80
|
with mock_s3():
|
@@ -108,32 +95,6 @@ FUNCTION scoped fixtures
|
|
108
95
|
"""
|
109
96
|
|
110
97
|
|
111
|
-
@pytest.fixture(scope="function")
|
112
|
-
def offer_local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
|
113
|
-
# see deltacat/tests/local_deltacat_storage/README.md for documentation
|
114
|
-
kwargs_for_local_deltacat_storage: Dict[str, Any] = {
|
115
|
-
DATABASE_FILE_PATH_KEY: DATABASE_FILE_PATH_VALUE,
|
116
|
-
}
|
117
|
-
yield kwargs_for_local_deltacat_storage
|
118
|
-
if os.path.exists(DATABASE_FILE_PATH_VALUE):
|
119
|
-
os.remove(DATABASE_FILE_PATH_VALUE)
|
120
|
-
|
121
|
-
|
122
|
-
@pytest.fixture(autouse=True, scope="function")
|
123
|
-
def enable_bucketing_spec_validation(monkeypatch):
|
124
|
-
"""
|
125
|
-
Enable the bucketing spec validation for all tests.
|
126
|
-
This will help catch hash bucket drift in testing.
|
127
|
-
"""
|
128
|
-
import deltacat.compute.compactor_v2.steps.merge
|
129
|
-
|
130
|
-
monkeypatch.setattr(
|
131
|
-
deltacat.compute.compactor_v2.steps.merge,
|
132
|
-
"BUCKETING_SPEC_COMPLIANCE_PROFILE",
|
133
|
-
"ASSERT",
|
134
|
-
)
|
135
|
-
|
136
|
-
|
137
98
|
@pytest.mark.parametrize(
|
138
99
|
[
|
139
100
|
"test_name",
|
@@ -209,7 +170,7 @@ def enable_bucketing_spec_validation(monkeypatch):
|
|
209
170
|
)
|
210
171
|
def test_compact_partition_incremental(
|
211
172
|
s3_resource: ServiceResource,
|
212
|
-
|
173
|
+
local_deltacat_storage_kwargs: Dict[str, Any],
|
213
174
|
test_name: str,
|
214
175
|
primary_keys: Set[str],
|
215
176
|
sort_keys: Dict[str, str],
|
@@ -235,7 +196,7 @@ def test_compact_partition_incremental(
|
|
235
196
|
):
|
236
197
|
import deltacat.tests.local_deltacat_storage as ds
|
237
198
|
|
238
|
-
ds_mock_kwargs: Dict[str, Any] =
|
199
|
+
ds_mock_kwargs: Dict[str, Any] = local_deltacat_storage_kwargs
|
239
200
|
|
240
201
|
# setup
|
241
202
|
partition_keys = partition_keys_param
|
@@ -247,7 +208,6 @@ def test_compact_partition_incremental(
|
|
247
208
|
source_table_name,
|
248
209
|
source_table_version,
|
249
210
|
) = create_src_w_deltas_destination_plus_destination(
|
250
|
-
primary_keys,
|
251
211
|
sort_keys,
|
252
212
|
partition_keys,
|
253
213
|
input_deltas,
|
@@ -48,12 +48,6 @@ from deltacat.utils.placement import (
|
|
48
48
|
PlacementGroupManager,
|
49
49
|
)
|
50
50
|
|
51
|
-
DATABASE_FILE_PATH_KEY, DATABASE_FILE_PATH_VALUE = (
|
52
|
-
"db_file_path",
|
53
|
-
"deltacat/tests/local_deltacat_storage/db_test.sqlite",
|
54
|
-
)
|
55
|
-
|
56
|
-
|
57
51
|
"""
|
58
52
|
MODULE scoped fixtures
|
59
53
|
"""
|
@@ -76,13 +70,6 @@ def mock_aws_credential():
|
|
76
70
|
yield
|
77
71
|
|
78
72
|
|
79
|
-
@pytest.fixture(autouse=True, scope="module")
|
80
|
-
def cleanup_the_database_file_after_all_compaction_session_package_tests_complete():
|
81
|
-
# make sure the database file is deleted after all the compactor package tests are completed
|
82
|
-
if os.path.exists(DATABASE_FILE_PATH_VALUE):
|
83
|
-
os.remove(DATABASE_FILE_PATH_VALUE)
|
84
|
-
|
85
|
-
|
86
73
|
@pytest.fixture(scope="module")
|
87
74
|
def s3_resource(mock_aws_credential):
|
88
75
|
with mock_s3():
|
@@ -98,37 +85,6 @@ def setup_compaction_artifacts_s3_bucket(s3_resource: ServiceResource):
|
|
98
85
|
yield
|
99
86
|
|
100
87
|
|
101
|
-
"""
|
102
|
-
FUNCTION scoped fixtures
|
103
|
-
"""
|
104
|
-
|
105
|
-
|
106
|
-
@pytest.fixture(scope="function")
|
107
|
-
def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
|
108
|
-
# see deltacat/tests/local_deltacat_storage/README.md for documentation
|
109
|
-
kwargs_for_local_deltacat_storage: Dict[str, Any] = {
|
110
|
-
DATABASE_FILE_PATH_KEY: DATABASE_FILE_PATH_VALUE,
|
111
|
-
}
|
112
|
-
yield kwargs_for_local_deltacat_storage
|
113
|
-
if os.path.exists(DATABASE_FILE_PATH_VALUE):
|
114
|
-
os.remove(DATABASE_FILE_PATH_VALUE)
|
115
|
-
|
116
|
-
|
117
|
-
@pytest.fixture(autouse=True, scope="function")
|
118
|
-
def enable_bucketing_spec_validation(monkeypatch):
|
119
|
-
"""
|
120
|
-
Enable the bucketing spec validation for all tests.
|
121
|
-
This will help catch hash bucket drift in testing.
|
122
|
-
"""
|
123
|
-
import deltacat.compute.compactor_v2.steps.merge
|
124
|
-
|
125
|
-
monkeypatch.setattr(
|
126
|
-
deltacat.compute.compactor_v2.steps.merge,
|
127
|
-
"BUCKETING_SPEC_COMPLIANCE_PROFILE",
|
128
|
-
"ASSERT",
|
129
|
-
)
|
130
|
-
|
131
|
-
|
132
88
|
@pytest.mark.parametrize(
|
133
89
|
[
|
134
90
|
"test_name",
|
@@ -240,7 +196,6 @@ def test_compact_partition_rebase_multiple_rounds_same_source_and_destination(
|
|
240
196
|
rebased_table_stream,
|
241
197
|
_,
|
242
198
|
) = multiple_rounds_create_src_w_deltas_destination_rebase_w_deltas_strategy(
|
243
|
-
primary_keys,
|
244
199
|
sort_keys,
|
245
200
|
partition_keys_param,
|
246
201
|
input_deltas_param,
|
@@ -347,7 +302,11 @@ def test_compact_partition_rebase_multiple_rounds_same_source_and_destination(
|
|
347
302
|
actual_rebase_compacted_table = pa.concat_tables(tables)
|
348
303
|
# if no primary key is specified then sort by sort_key for consistent assertion
|
349
304
|
sorting_cols: List[Any] = (
|
350
|
-
[(val, "ascending") for val in primary_keys]
|
305
|
+
[(val, "ascending") for val in primary_keys]
|
306
|
+
if primary_keys
|
307
|
+
else [pa_key for key in sort_keys for pa_key in key.arrow]
|
308
|
+
if sort_keys
|
309
|
+
else []
|
351
310
|
)
|
352
311
|
rebase_expected_compact_partition_result = (
|
353
312
|
rebase_expected_compact_partition_result.combine_chunks().sort_by(
|
@@ -23,7 +23,7 @@ class TestCompactPartitionParams(unittest.TestCase):
|
|
23
23
|
"tableVersion": "1",
|
24
24
|
},
|
25
25
|
"streamId": "foobar",
|
26
|
-
"
|
26
|
+
"format": "fooType",
|
27
27
|
},
|
28
28
|
"partitionValues": [],
|
29
29
|
"partitionId": None,
|
@@ -47,7 +47,7 @@ class TestCompactPartitionParams(unittest.TestCase):
|
|
47
47
|
"table_version": "1",
|
48
48
|
},
|
49
49
|
"streamId": "foobar",
|
50
|
-
"
|
50
|
+
"format": "fooType",
|
51
51
|
},
|
52
52
|
"partitionValues": [],
|
53
53
|
"partitionId": "79612ea39ac5493eae925abe60767d42",
|
@@ -67,7 +67,7 @@ class TestCompactPartitionParams(unittest.TestCase):
|
|
67
67
|
"tableVersion": "2",
|
68
68
|
},
|
69
69
|
"streamId": "foobar",
|
70
|
-
"
|
70
|
+
"format": "fooType",
|
71
71
|
},
|
72
72
|
"partitionValues": [],
|
73
73
|
"partitionId": "79612ea39ac5493eae925abe60767d42",
|
@@ -48,12 +48,6 @@ from deltacat.utils.placement import (
|
|
48
48
|
PlacementGroupManager,
|
49
49
|
)
|
50
50
|
|
51
|
-
DATABASE_FILE_PATH_KEY, DATABASE_FILE_PATH_VALUE = (
|
52
|
-
"db_file_path",
|
53
|
-
"deltacat/tests/local_deltacat_storage/db_test.sqlite",
|
54
|
-
)
|
55
|
-
|
56
|
-
|
57
51
|
"""
|
58
52
|
MODULE scoped fixtures
|
59
53
|
"""
|
@@ -76,13 +70,6 @@ def mock_aws_credential():
|
|
76
70
|
yield
|
77
71
|
|
78
72
|
|
79
|
-
@pytest.fixture(autouse=True, scope="module")
|
80
|
-
def cleanup_the_database_file_after_all_compaction_session_package_tests_complete():
|
81
|
-
# make sure the database file is deleted after all the compactor package tests are completed
|
82
|
-
if os.path.exists(DATABASE_FILE_PATH_VALUE):
|
83
|
-
os.remove(DATABASE_FILE_PATH_VALUE)
|
84
|
-
|
85
|
-
|
86
73
|
@pytest.fixture(scope="module")
|
87
74
|
def s3_resource(mock_aws_credential):
|
88
75
|
with mock_s3():
|
@@ -98,37 +85,6 @@ def setup_compaction_artifacts_s3_bucket(s3_resource: ServiceResource):
|
|
98
85
|
yield
|
99
86
|
|
100
87
|
|
101
|
-
"""
|
102
|
-
FUNCTION scoped fixtures
|
103
|
-
"""
|
104
|
-
|
105
|
-
|
106
|
-
@pytest.fixture(scope="function")
|
107
|
-
def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
|
108
|
-
# see deltacat/tests/local_deltacat_storage/README.md for documentation
|
109
|
-
kwargs_for_local_deltacat_storage: Dict[str, Any] = {
|
110
|
-
DATABASE_FILE_PATH_KEY: DATABASE_FILE_PATH_VALUE,
|
111
|
-
}
|
112
|
-
yield kwargs_for_local_deltacat_storage
|
113
|
-
if os.path.exists(DATABASE_FILE_PATH_VALUE):
|
114
|
-
os.remove(DATABASE_FILE_PATH_VALUE)
|
115
|
-
|
116
|
-
|
117
|
-
@pytest.fixture(autouse=True, scope="function")
|
118
|
-
def enable_bucketing_spec_validation(monkeypatch):
|
119
|
-
"""
|
120
|
-
Enable the bucketing spec validation for all tests.
|
121
|
-
This will help catch hash bucket drift in testing.
|
122
|
-
"""
|
123
|
-
import deltacat.compute.compactor_v2.steps.merge
|
124
|
-
|
125
|
-
monkeypatch.setattr(
|
126
|
-
deltacat.compute.compactor_v2.steps.merge,
|
127
|
-
"BUCKETING_SPEC_COMPLIANCE_PROFILE",
|
128
|
-
"ASSERT",
|
129
|
-
)
|
130
|
-
|
131
|
-
|
132
88
|
@pytest.mark.parametrize(
|
133
89
|
[
|
134
90
|
"test_name",
|
@@ -239,7 +195,6 @@ def test_compact_partition_rebase_same_source_and_destination(
|
|
239
195
|
_,
|
240
196
|
rebased_table_stream,
|
241
197
|
) = create_src_w_deltas_destination_rebase_w_deltas_strategy(
|
242
|
-
primary_keys,
|
243
198
|
sort_keys,
|
244
199
|
partition_keys,
|
245
200
|
input_deltas_param,
|
@@ -351,7 +306,7 @@ def test_compact_partition_rebase_same_source_and_destination(
|
|
351
306
|
if primary_keys:
|
352
307
|
sorting_cols.extend([(val, "ascending") for val in primary_keys])
|
353
308
|
if sort_keys:
|
354
|
-
sorting_cols.extend(sort_keys)
|
309
|
+
sorting_cols.extend([pa_key for key in sort_keys for pa_key in key.arrow])
|
355
310
|
|
356
311
|
rebase_expected_compact_partition_result = (
|
357
312
|
rebase_expected_compact_partition_result.combine_chunks().sort_by(
|
@@ -52,12 +52,6 @@ from deltacat.compute.compactor.model.compaction_session_audit_info import (
|
|
52
52
|
CompactionSessionAuditInfo,
|
53
53
|
)
|
54
54
|
|
55
|
-
DATABASE_FILE_PATH_KEY, DATABASE_FILE_PATH_VALUE = (
|
56
|
-
"db_file_path",
|
57
|
-
"deltacat/tests/local_deltacat_storage/db_test.sqlite",
|
58
|
-
)
|
59
|
-
|
60
|
-
|
61
55
|
"""
|
62
56
|
MODULE scoped fixtures
|
63
57
|
"""
|
@@ -80,13 +74,6 @@ def mock_aws_credential():
|
|
80
74
|
yield
|
81
75
|
|
82
76
|
|
83
|
-
@pytest.fixture(autouse=True, scope="module")
|
84
|
-
def cleanup_the_database_file_after_all_compaction_session_package_tests_complete():
|
85
|
-
# make sure the database file is deleted after all the compactor package tests are completed
|
86
|
-
if os.path.exists(DATABASE_FILE_PATH_VALUE):
|
87
|
-
os.remove(DATABASE_FILE_PATH_VALUE)
|
88
|
-
|
89
|
-
|
90
77
|
@pytest.fixture(scope="module")
|
91
78
|
def s3_resource(mock_aws_credential):
|
92
79
|
with mock_s3():
|
@@ -102,37 +89,6 @@ def setup_compaction_artifacts_s3_bucket(s3_resource: ServiceResource):
|
|
102
89
|
yield
|
103
90
|
|
104
91
|
|
105
|
-
"""
|
106
|
-
FUNCTION scoped fixtures
|
107
|
-
"""
|
108
|
-
|
109
|
-
|
110
|
-
@pytest.fixture(scope="function")
|
111
|
-
def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
|
112
|
-
# see deltacat/tests/local_deltacat_storage/README.md for documentation
|
113
|
-
kwargs_for_local_deltacat_storage: Dict[str, Any] = {
|
114
|
-
DATABASE_FILE_PATH_KEY: DATABASE_FILE_PATH_VALUE,
|
115
|
-
}
|
116
|
-
yield kwargs_for_local_deltacat_storage
|
117
|
-
if os.path.exists(DATABASE_FILE_PATH_VALUE):
|
118
|
-
os.remove(DATABASE_FILE_PATH_VALUE)
|
119
|
-
|
120
|
-
|
121
|
-
@pytest.fixture(autouse=True, scope="function")
|
122
|
-
def enable_bucketing_spec_validation(monkeypatch):
|
123
|
-
"""
|
124
|
-
Enable the bucketing spec validation for all tests.
|
125
|
-
This will help catch hash bucket drift in testing.
|
126
|
-
"""
|
127
|
-
import deltacat.compute.compactor_v2.steps.merge
|
128
|
-
|
129
|
-
monkeypatch.setattr(
|
130
|
-
deltacat.compute.compactor_v2.steps.merge,
|
131
|
-
"BUCKETING_SPEC_COMPLIANCE_PROFILE",
|
132
|
-
"ASSERT",
|
133
|
-
)
|
134
|
-
|
135
|
-
|
136
92
|
@pytest.mark.parametrize(
|
137
93
|
[
|
138
94
|
"test_name",
|
@@ -244,7 +200,6 @@ def test_compact_partition_rebase_then_incremental(
|
|
244
200
|
destination_table_stream,
|
245
201
|
rebased_table_stream,
|
246
202
|
) = create_src_w_deltas_destination_rebase_w_deltas_strategy(
|
247
|
-
primary_keys,
|
248
203
|
sort_keys,
|
249
204
|
partition_keys,
|
250
205
|
input_deltas_param,
|
@@ -308,7 +263,11 @@ def test_compact_partition_rebase_then_incremental(
|
|
308
263
|
actual_rebase_compacted_table = pa.concat_tables(tables)
|
309
264
|
# if no primary key is specified then sort by sort_key for consistent assertion
|
310
265
|
sorting_cols: List[Any] = (
|
311
|
-
[(val, "ascending") for val in primary_keys]
|
266
|
+
[(val, "ascending") for val in primary_keys]
|
267
|
+
if primary_keys
|
268
|
+
else [pa_key for key in sort_keys for pa_key in key.arrow]
|
269
|
+
if sort_keys
|
270
|
+
else []
|
312
271
|
)
|
313
272
|
rebase_expected_compact_partition_result = (
|
314
273
|
rebase_expected_compact_partition_result.combine_chunks().sort_by(sorting_cols)
|
@@ -1,7 +1,7 @@
|
|
1
1
|
# Allow classes to use self-referencing Type hints in Python 3.7.
|
2
2
|
from __future__ import annotations
|
3
3
|
from enum import Enum
|
4
|
-
from typing import Any, Dict, List, Optional
|
4
|
+
from typing import Any, Dict, List, Optional
|
5
5
|
import datetime as dt
|
6
6
|
from boto3.resources.base import ServiceResource
|
7
7
|
from datetime import timezone
|
@@ -27,11 +27,18 @@ from deltacat.compute.compactor.model.compaction_session_audit_info import (
|
|
27
27
|
CompactionSessionAuditInfo,
|
28
28
|
)
|
29
29
|
|
30
|
-
from deltacat.storage.model.partition import
|
30
|
+
from deltacat.storage.model.partition import (
|
31
|
+
PartitionLocator,
|
32
|
+
PartitionScheme,
|
33
|
+
PartitionKey as PartitionSchemeKey,
|
34
|
+
)
|
31
35
|
from deltacat.storage.model.stream import StreamLocator
|
32
36
|
from deltacat.storage.model.table_version import TableVersionLocator
|
33
37
|
from deltacat.storage.model.table import TableLocator
|
34
38
|
from deltacat.storage.model.namespace import NamespaceLocator
|
39
|
+
from deltacat.storage.model.sort_key import (
|
40
|
+
SortScheme,
|
41
|
+
)
|
35
42
|
from deltacat.compute.compactor.model.compactor_version import CompactorVersion
|
36
43
|
|
37
44
|
|
@@ -77,7 +84,6 @@ def _create_table(
|
|
77
84
|
namespace: str,
|
78
85
|
table_name: str,
|
79
86
|
table_version: str,
|
80
|
-
primary_keys: Set[str],
|
81
87
|
sort_keys: Optional[List[Any]],
|
82
88
|
partition_keys: Optional[List[PartitionKey]],
|
83
89
|
ds_mock_kwargs: Optional[Dict[str, Any]],
|
@@ -86,13 +92,20 @@ def _create_table(
|
|
86
92
|
from deltacat.types.media import ContentType
|
87
93
|
|
88
94
|
ds.create_namespace(namespace, {}, **ds_mock_kwargs)
|
95
|
+
partition_scheme = (
|
96
|
+
PartitionScheme.of(
|
97
|
+
[PartitionSchemeKey.of(key.key_name) for key in partition_keys]
|
98
|
+
)
|
99
|
+
if partition_keys
|
100
|
+
else None
|
101
|
+
)
|
102
|
+
sort_scheme = SortScheme.of(sort_keys) if sort_keys else None
|
89
103
|
ds.create_table_version(
|
90
104
|
namespace,
|
91
105
|
table_name,
|
92
106
|
table_version,
|
93
|
-
|
94
|
-
|
95
|
-
partition_keys=partition_keys,
|
107
|
+
sort_keys=sort_scheme,
|
108
|
+
partition_scheme=partition_scheme,
|
96
109
|
supported_content_types=[ContentType.PARQUET],
|
97
110
|
**ds_mock_kwargs,
|
98
111
|
)
|
@@ -100,7 +113,6 @@ def _create_table(
|
|
100
113
|
|
101
114
|
|
102
115
|
def create_src_table(
|
103
|
-
primary_keys: Set[str],
|
104
116
|
sort_keys: Optional[List[Any]],
|
105
117
|
partition_keys: Optional[List[PartitionKey]],
|
106
118
|
ds_mock_kwargs: Optional[Dict[str, Any]],
|
@@ -112,7 +124,6 @@ def create_src_table(
|
|
112
124
|
source_namespace,
|
113
125
|
source_table_name,
|
114
126
|
source_table_version,
|
115
|
-
primary_keys,
|
116
127
|
sort_keys,
|
117
128
|
partition_keys,
|
118
129
|
ds_mock_kwargs,
|
@@ -120,7 +131,6 @@ def create_src_table(
|
|
120
131
|
|
121
132
|
|
122
133
|
def create_destination_table(
|
123
|
-
primary_keys: Set[str],
|
124
134
|
sort_keys: Optional[List[Any]],
|
125
135
|
partition_keys: Optional[List[PartitionKey]],
|
126
136
|
ds_mock_kwargs: Optional[Dict[str, Any]],
|
@@ -132,7 +142,6 @@ def create_destination_table(
|
|
132
142
|
destination_namespace,
|
133
143
|
destination_table_name,
|
134
144
|
destination_table_version,
|
135
|
-
primary_keys,
|
136
145
|
sort_keys,
|
137
146
|
partition_keys,
|
138
147
|
ds_mock_kwargs,
|
@@ -140,7 +149,6 @@ def create_destination_table(
|
|
140
149
|
|
141
150
|
|
142
151
|
def create_rebase_table(
|
143
|
-
primary_keys: Set[str],
|
144
152
|
sort_keys: Optional[List[Any]],
|
145
153
|
partition_keys: Optional[List[PartitionKey]],
|
146
154
|
ds_mock_kwargs: Optional[Dict[str, Any]],
|
@@ -152,7 +160,6 @@ def create_rebase_table(
|
|
152
160
|
rebasing_namespace,
|
153
161
|
rebasing_table_name,
|
154
162
|
rebasing_table_version,
|
155
|
-
primary_keys,
|
156
163
|
sort_keys,
|
157
164
|
partition_keys,
|
158
165
|
ds_mock_kwargs,
|
@@ -1,6 +1,6 @@
|
|
1
1
|
# Allow classes to use self-referencing Type hints in Python 3.7.
|
2
2
|
from __future__ import annotations
|
3
|
-
from typing import Any, Dict, List, Optional,
|
3
|
+
from typing import Any, Dict, List, Optional, Tuple
|
4
4
|
import pyarrow as pa
|
5
5
|
|
6
6
|
from deltacat.tests.compute.test_util_common import (
|
@@ -38,7 +38,7 @@ def _add_deltas_to_partition(
|
|
38
38
|
delta_data,
|
39
39
|
partition,
|
40
40
|
delta_type,
|
41
|
-
|
41
|
+
entry_params=delete_parameters,
|
42
42
|
**ds_mock_kwargs,
|
43
43
|
)
|
44
44
|
incremental_delta = ds.commit_delta(
|
@@ -87,7 +87,7 @@ def create_incremental_deltas_on_source_table(
|
|
87
87
|
incremental_data,
|
88
88
|
src_partition,
|
89
89
|
incremental_delta_type,
|
90
|
-
|
90
|
+
entry_params=incremental_delete_parameters,
|
91
91
|
**ds_mock_kwargs,
|
92
92
|
),
|
93
93
|
**ds_mock_kwargs,
|
@@ -113,7 +113,6 @@ def create_incremental_deltas_on_source_table(
|
|
113
113
|
|
114
114
|
|
115
115
|
def create_src_w_deltas_destination_plus_destination(
|
116
|
-
primary_keys: Set[str],
|
117
116
|
sort_keys: Optional[List[Any]],
|
118
117
|
partition_keys: Optional[List[PartitionKey]],
|
119
118
|
input_deltas: pa.Table,
|
@@ -125,7 +124,7 @@ def create_src_w_deltas_destination_plus_destination(
|
|
125
124
|
import deltacat.tests.local_deltacat_storage as ds
|
126
125
|
|
127
126
|
source_namespace, source_table_name, source_table_version = create_src_table(
|
128
|
-
|
127
|
+
sort_keys, partition_keys, ds_mock_kwargs
|
129
128
|
)
|
130
129
|
|
131
130
|
source_table_stream: Stream = ds.get_stream(
|
@@ -158,9 +157,7 @@ def create_src_w_deltas_destination_plus_destination(
|
|
158
157
|
destination_table_namespace,
|
159
158
|
destination_table_name,
|
160
159
|
destination_table_version,
|
161
|
-
) = create_destination_table(
|
162
|
-
primary_keys, sort_keys, partition_keys, ds_mock_kwargs
|
163
|
-
)
|
160
|
+
) = create_destination_table(sort_keys, partition_keys, ds_mock_kwargs)
|
164
161
|
else:
|
165
162
|
# not creating a table as in-place
|
166
163
|
destination_table_namespace = source_namespace
|
@@ -184,7 +181,6 @@ def create_src_w_deltas_destination_plus_destination(
|
|
184
181
|
|
185
182
|
|
186
183
|
def create_src_w_deltas_destination_rebase_w_deltas_strategy(
|
187
|
-
primary_keys: Set[str],
|
188
184
|
sort_keys: Optional[List[Any]],
|
189
185
|
partition_keys: Optional[List[PartitionKey]],
|
190
186
|
input_deltas: pa.Table,
|
@@ -198,7 +194,7 @@ def create_src_w_deltas_destination_rebase_w_deltas_strategy(
|
|
198
194
|
|
199
195
|
last_stream_position = current_time_ms()
|
200
196
|
source_namespace, source_table_name, source_table_version = create_src_table(
|
201
|
-
|
197
|
+
sort_keys, partition_keys, ds_mock_kwargs
|
202
198
|
)
|
203
199
|
|
204
200
|
source_table_stream: Stream = ds.get_stream(
|
@@ -230,15 +226,13 @@ def create_src_w_deltas_destination_rebase_w_deltas_strategy(
|
|
230
226
|
destination_table_namespace,
|
231
227
|
destination_table_name,
|
232
228
|
destination_table_version,
|
233
|
-
) = create_destination_table(
|
234
|
-
primary_keys, sort_keys, partition_keys, ds_mock_kwargs
|
235
|
-
)
|
229
|
+
) = create_destination_table(sort_keys, partition_keys, ds_mock_kwargs)
|
236
230
|
# create the rebase table
|
237
231
|
(
|
238
232
|
rebase_table_namespace,
|
239
233
|
rebase_table_name,
|
240
234
|
rebase_table_version,
|
241
|
-
) = create_rebase_table(
|
235
|
+
) = create_rebase_table(sort_keys, partition_keys, ds_mock_kwargs)
|
242
236
|
rebasing_table_stream: Stream = ds.get_stream(
|
243
237
|
namespace=rebase_table_namespace,
|
244
238
|
table_name=rebase_table_name,
|
@@ -280,7 +274,6 @@ def create_src_w_deltas_destination_rebase_w_deltas_strategy(
|
|
280
274
|
|
281
275
|
|
282
276
|
def multiple_rounds_create_src_w_deltas_destination_rebase_w_deltas_strategy(
|
283
|
-
primary_keys: Set[str],
|
284
277
|
sort_keys: Optional[List[Any]],
|
285
278
|
partition_keys: Optional[List[PartitionKey]],
|
286
279
|
input_deltas: List[pa.Table],
|
@@ -291,7 +284,7 @@ def multiple_rounds_create_src_w_deltas_destination_rebase_w_deltas_strategy(
|
|
291
284
|
from deltacat.storage import Partition, Stream
|
292
285
|
|
293
286
|
source_namespace, source_table_name, source_table_version = create_src_table(
|
294
|
-
|
287
|
+
sort_keys, partition_keys, ds_mock_kwargs
|
295
288
|
)
|
296
289
|
|
297
290
|
source_table_stream: Stream = ds.get_stream(
|
@@ -316,7 +309,7 @@ def multiple_rounds_create_src_w_deltas_destination_rebase_w_deltas_strategy(
|
|
316
309
|
input_delta,
|
317
310
|
staged_partition,
|
318
311
|
input_delta_type,
|
319
|
-
|
312
|
+
entry_params=input_delta_parameters,
|
320
313
|
**ds_mock_kwargs,
|
321
314
|
)
|
322
315
|
ds.commit_delta(
|
@@ -336,15 +329,13 @@ def multiple_rounds_create_src_w_deltas_destination_rebase_w_deltas_strategy(
|
|
336
329
|
destination_table_namespace,
|
337
330
|
destination_table_name,
|
338
331
|
destination_table_version,
|
339
|
-
) = create_destination_table(
|
340
|
-
primary_keys, sort_keys, partition_keys, ds_mock_kwargs
|
341
|
-
)
|
332
|
+
) = create_destination_table(sort_keys, partition_keys, ds_mock_kwargs)
|
342
333
|
# create the rebase table
|
343
334
|
(
|
344
335
|
rebase_table_namespace,
|
345
336
|
rebase_table_name,
|
346
337
|
rebase_table_version,
|
347
|
-
) = create_rebase_table(
|
338
|
+
) = create_rebase_table(sort_keys, partition_keys, ds_mock_kwargs)
|
348
339
|
rebasing_table_stream: Stream = ds.get_stream(
|
349
340
|
namespace=rebase_table_namespace,
|
350
341
|
table_name=rebase_table_name,
|
@@ -366,7 +357,7 @@ def multiple_rounds_create_src_w_deltas_destination_rebase_w_deltas_strategy(
|
|
366
357
|
input_delta,
|
367
358
|
staged_partition,
|
368
359
|
input_delta_type,
|
369
|
-
|
360
|
+
entry_params=input_delta_parameters,
|
370
361
|
**ds_mock_kwargs,
|
371
362
|
)
|
372
363
|
ds.commit_delta(
|