deltacat 2.0.0b11__py3-none-any.whl → 2.0.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +78 -3
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/catalog/__init__.py +2 -0
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2417 -271
- deltacat/catalog/model/catalog.py +49 -10
- deltacat/catalog/model/properties.py +38 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +19 -8
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +44 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/impl.py +2 -2
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +105 -4
- deltacat/experimental/storage/iceberg/impl.py +5 -3
- deltacat/experimental/storage/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/dataset.py +0 -3
- deltacat/experimental/storage/rivulet/metastore/delta.py +0 -2
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +3 -2
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1606 -70
- deltacat/tests/catalog/test_catalogs.py +54 -11
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -71
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +221 -11
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +411 -150
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +56 -15
- deltacat-2.0.0.post1.dist-info/METADATA +1163 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/RECORD +183 -145
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b11.dist-info/METADATA +0 -67
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b11.dist-info → deltacat-2.0.0.post1.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,12 @@
|
|
1
1
|
import uuid
|
2
2
|
import logging
|
3
|
+
from collections import defaultdict
|
4
|
+
|
3
5
|
from pyiceberg.exceptions import NoSuchTableError
|
6
|
+
from pyiceberg.manifest import DataFileContent
|
7
|
+
from deltacat.compute.converter.pyiceberg.overrides import (
|
8
|
+
parquet_files_dict_to_iceberg_data_files,
|
9
|
+
)
|
4
10
|
from deltacat import logs
|
5
11
|
|
6
12
|
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
@@ -14,9 +20,6 @@ def get_s3_file_system():
|
|
14
20
|
secret_key="password",
|
15
21
|
endpoint_override="http://localhost:9000",
|
16
22
|
)
|
17
|
-
# 'region="us-east-1", proxy_options={'scheme': 'http', 'host': 'localhost',
|
18
|
-
# 'port': 9000, 'username': 'admin',
|
19
|
-
# 'password': 'password'})
|
20
23
|
|
21
24
|
|
22
25
|
def write_equality_data_table(
|
@@ -110,10 +113,16 @@ def commit_equality_delete_to_table(
|
|
110
113
|
)
|
111
114
|
]
|
112
115
|
|
113
|
-
|
114
|
-
|
116
|
+
equality_delete_dict_list = defaultdict()
|
117
|
+
equality_delete_dict_list[partition_value] = data_files
|
118
|
+
equality_file_list = parquet_files_dict_to_iceberg_data_files(
|
119
|
+
io=table.io,
|
120
|
+
table_metadata=table.metadata,
|
121
|
+
files_dict=equality_delete_dict_list,
|
122
|
+
file_content_type=DataFileContent.EQUALITY_DELETES,
|
115
123
|
)
|
116
|
-
|
124
|
+
|
125
|
+
return equality_file_list
|
117
126
|
|
118
127
|
|
119
128
|
def drop_table_if_exists(table, catalog):
|
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
from deltacat.storage import metastore
|
2
2
|
from deltacat.types.media import ContentType
|
3
3
|
import pytest
|
4
4
|
from deltacat.storage import Delta
|
@@ -21,7 +21,7 @@ Function scoped fixtures
|
|
21
21
|
|
22
22
|
|
23
23
|
@pytest.fixture(scope="function")
|
24
|
-
def parquet_delta_with_manifest(
|
24
|
+
def parquet_delta_with_manifest(main_deltacat_storage_kwargs):
|
25
25
|
"""
|
26
26
|
These fixtures are function scoped as functions can modify the delta.
|
27
27
|
"""
|
@@ -31,7 +31,7 @@ def parquet_delta_with_manifest(local_deltacat_storage_kwargs):
|
|
31
31
|
"test_namespace",
|
32
32
|
file_paths=[DELTA_CSV_FILE_PATH],
|
33
33
|
content_type=ContentType.PARQUET,
|
34
|
-
**
|
34
|
+
**main_deltacat_storage_kwargs
|
35
35
|
)
|
36
36
|
|
37
37
|
result.meta["source_content_length"] = 0
|
@@ -44,14 +44,14 @@ def parquet_delta_with_manifest(local_deltacat_storage_kwargs):
|
|
44
44
|
|
45
45
|
|
46
46
|
@pytest.fixture(scope="function")
|
47
|
-
def utsv_delta_with_manifest(
|
47
|
+
def utsv_delta_with_manifest(main_deltacat_storage_kwargs):
|
48
48
|
from deltacat.tests.test_utils.pyarrow import create_delta_from_csv_file
|
49
49
|
|
50
50
|
result = create_delta_from_csv_file(
|
51
51
|
"test_namespace",
|
52
52
|
file_paths=[DELTA_CSV_FILE_PATH],
|
53
53
|
content_type=ContentType.UNESCAPED_TSV,
|
54
|
-
**
|
54
|
+
**main_deltacat_storage_kwargs
|
55
55
|
)
|
56
56
|
|
57
57
|
result.meta["source_content_length"] = 0
|
@@ -64,14 +64,14 @@ def utsv_delta_with_manifest(local_deltacat_storage_kwargs):
|
|
64
64
|
|
65
65
|
|
66
66
|
@pytest.fixture(scope="function")
|
67
|
-
def delta_without_manifest(
|
67
|
+
def delta_without_manifest(main_deltacat_storage_kwargs):
|
68
68
|
from deltacat.tests.test_utils.pyarrow import create_delta_from_csv_file
|
69
69
|
|
70
70
|
delta = create_delta_from_csv_file(
|
71
71
|
"test_namespace",
|
72
72
|
file_paths=[DELTA_CSV_FILE_PATH],
|
73
73
|
content_type=ContentType.PARQUET,
|
74
|
-
**
|
74
|
+
**main_deltacat_storage_kwargs
|
75
75
|
)
|
76
76
|
|
77
77
|
# now we intentionally remove manifest
|
@@ -83,14 +83,14 @@ def delta_without_manifest(local_deltacat_storage_kwargs):
|
|
83
83
|
|
84
84
|
|
85
85
|
@pytest.fixture(scope="function")
|
86
|
-
def delta_with_populated_meta(
|
86
|
+
def delta_with_populated_meta(main_deltacat_storage_kwargs):
|
87
87
|
from deltacat.tests.test_utils.pyarrow import create_delta_from_csv_file
|
88
88
|
|
89
89
|
delta = create_delta_from_csv_file(
|
90
90
|
"test_namespace",
|
91
91
|
file_paths=[DELTA_CSV_FILE_PATH],
|
92
92
|
content_type=ContentType.PARQUET,
|
93
|
-
**
|
93
|
+
**main_deltacat_storage_kwargs
|
94
94
|
)
|
95
95
|
|
96
96
|
return delta
|
@@ -98,14 +98,14 @@ def delta_with_populated_meta(local_deltacat_storage_kwargs):
|
|
98
98
|
|
99
99
|
class TestEstimateResourcesRequiredToProcessDelta:
|
100
100
|
def test_delta_with_prepopulated_meta_returns_directly(
|
101
|
-
self,
|
101
|
+
self, main_deltacat_storage_kwargs, delta_with_populated_meta: Delta
|
102
102
|
):
|
103
103
|
|
104
104
|
result = estimate_resources_required_to_process_delta(
|
105
105
|
delta=delta_with_populated_meta,
|
106
106
|
operation_type=OperationType.PYARROW_DOWNLOAD,
|
107
|
-
deltacat_storage=
|
108
|
-
deltacat_storage_kwargs=
|
107
|
+
deltacat_storage=metastore,
|
108
|
+
deltacat_storage_kwargs=main_deltacat_storage_kwargs,
|
109
109
|
)
|
110
110
|
|
111
111
|
assert (
|
@@ -125,7 +125,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
125
125
|
)
|
126
126
|
|
127
127
|
def test_delta_manifest_empty_when_default_method(
|
128
|
-
self,
|
128
|
+
self, main_deltacat_storage_kwargs, delta_without_manifest: Delta
|
129
129
|
):
|
130
130
|
params = EstimateResourcesParams.of(
|
131
131
|
resource_estimation_method=ResourceEstimationMethod.DEFAULT,
|
@@ -136,8 +136,8 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
136
136
|
result = estimate_resources_required_to_process_delta(
|
137
137
|
delta=delta_without_manifest,
|
138
138
|
operation_type=OperationType.PYARROW_DOWNLOAD,
|
139
|
-
deltacat_storage=
|
140
|
-
deltacat_storage_kwargs=
|
139
|
+
deltacat_storage=metastore,
|
140
|
+
deltacat_storage_kwargs=main_deltacat_storage_kwargs,
|
141
141
|
estimate_resources_params=params,
|
142
142
|
)
|
143
143
|
|
@@ -156,7 +156,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
156
156
|
)
|
157
157
|
|
158
158
|
def test_delta_manifest_exists_when_default_method(
|
159
|
-
self,
|
159
|
+
self, main_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
|
160
160
|
):
|
161
161
|
params = EstimateResourcesParams.of(
|
162
162
|
resource_estimation_method=ResourceEstimationMethod.DEFAULT,
|
@@ -167,8 +167,8 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
167
167
|
result = estimate_resources_required_to_process_delta(
|
168
168
|
delta=parquet_delta_with_manifest,
|
169
169
|
operation_type=OperationType.PYARROW_DOWNLOAD,
|
170
|
-
deltacat_storage=
|
171
|
-
deltacat_storage_kwargs=
|
170
|
+
deltacat_storage=metastore,
|
171
|
+
deltacat_storage_kwargs=main_deltacat_storage_kwargs,
|
172
172
|
estimate_resources_params=params,
|
173
173
|
)
|
174
174
|
|
@@ -191,7 +191,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
191
191
|
)
|
192
192
|
|
193
193
|
def test_previous_inflation_arg_not_passed_when_default_method(
|
194
|
-
self,
|
194
|
+
self, main_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
|
195
195
|
):
|
196
196
|
with pytest.raises(AssertionError):
|
197
197
|
params = EstimateResourcesParams.of(
|
@@ -202,13 +202,13 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
202
202
|
estimate_resources_required_to_process_delta(
|
203
203
|
delta=parquet_delta_with_manifest,
|
204
204
|
operation_type=OperationType.PYARROW_DOWNLOAD,
|
205
|
-
deltacat_storage=
|
206
|
-
deltacat_storage_kwargs=
|
205
|
+
deltacat_storage=metastore,
|
206
|
+
deltacat_storage_kwargs=main_deltacat_storage_kwargs,
|
207
207
|
estimate_resources_params=params,
|
208
208
|
)
|
209
209
|
|
210
210
|
def test_estimate_resources_params_not_passed_assumes_default(
|
211
|
-
self,
|
211
|
+
self, main_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
|
212
212
|
):
|
213
213
|
params = EstimateResourcesParams.of(
|
214
214
|
previous_inflation=7,
|
@@ -218,8 +218,8 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
218
218
|
result = estimate_resources_required_to_process_delta(
|
219
219
|
delta=parquet_delta_with_manifest,
|
220
220
|
operation_type=OperationType.PYARROW_DOWNLOAD,
|
221
|
-
deltacat_storage=
|
222
|
-
deltacat_storage_kwargs=
|
221
|
+
deltacat_storage=metastore,
|
222
|
+
deltacat_storage_kwargs=main_deltacat_storage_kwargs,
|
223
223
|
estimate_resources_params=params,
|
224
224
|
)
|
225
225
|
|
@@ -242,7 +242,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
242
242
|
)
|
243
243
|
|
244
244
|
def test_delta_manifest_empty_when_content_type_meta(
|
245
|
-
self,
|
245
|
+
self, main_deltacat_storage_kwargs, delta_without_manifest: Delta
|
246
246
|
):
|
247
247
|
params = EstimateResourcesParams.of(
|
248
248
|
resource_estimation_method=ResourceEstimationMethod.CONTENT_TYPE_META,
|
@@ -252,8 +252,8 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
252
252
|
result = estimate_resources_required_to_process_delta(
|
253
253
|
delta=delta_without_manifest,
|
254
254
|
operation_type=OperationType.PYARROW_DOWNLOAD,
|
255
|
-
deltacat_storage=
|
256
|
-
deltacat_storage_kwargs=
|
255
|
+
deltacat_storage=metastore,
|
256
|
+
deltacat_storage_kwargs=main_deltacat_storage_kwargs,
|
257
257
|
estimate_resources_params=params,
|
258
258
|
)
|
259
259
|
|
@@ -267,7 +267,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
267
267
|
assert result.statistics.record_count == 7
|
268
268
|
|
269
269
|
def test_delta_manifest_exists_when_content_type_meta(
|
270
|
-
self,
|
270
|
+
self, main_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
|
271
271
|
):
|
272
272
|
params = EstimateResourcesParams.of(
|
273
273
|
resource_estimation_method=ResourceEstimationMethod.CONTENT_TYPE_META,
|
@@ -277,8 +277,8 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
277
277
|
result = estimate_resources_required_to_process_delta(
|
278
278
|
delta=parquet_delta_with_manifest,
|
279
279
|
operation_type=OperationType.PYARROW_DOWNLOAD,
|
280
|
-
deltacat_storage=
|
281
|
-
deltacat_storage_kwargs=
|
280
|
+
deltacat_storage=metastore,
|
281
|
+
deltacat_storage_kwargs=main_deltacat_storage_kwargs,
|
282
282
|
estimate_resources_params=params,
|
283
283
|
)
|
284
284
|
|
@@ -292,7 +292,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
292
292
|
assert result.statistics.record_count == 7
|
293
293
|
|
294
294
|
def test_delta_manifest_empty_when_intelligent_estimation(
|
295
|
-
self,
|
295
|
+
self, main_deltacat_storage_kwargs, delta_without_manifest: Delta
|
296
296
|
):
|
297
297
|
params = EstimateResourcesParams.of(
|
298
298
|
resource_estimation_method=ResourceEstimationMethod.INTELLIGENT_ESTIMATION,
|
@@ -302,8 +302,8 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
302
302
|
result = estimate_resources_required_to_process_delta(
|
303
303
|
delta=delta_without_manifest,
|
304
304
|
operation_type=OperationType.PYARROW_DOWNLOAD,
|
305
|
-
deltacat_storage=
|
306
|
-
deltacat_storage_kwargs=
|
305
|
+
deltacat_storage=metastore,
|
306
|
+
deltacat_storage_kwargs=main_deltacat_storage_kwargs,
|
307
307
|
estimate_resources_params=params,
|
308
308
|
)
|
309
309
|
|
@@ -317,7 +317,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
317
317
|
assert result.statistics.record_count == 7
|
318
318
|
|
319
319
|
def test_delta_manifest_exists_when_intelligent_estimation(
|
320
|
-
self,
|
320
|
+
self, main_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
|
321
321
|
):
|
322
322
|
params = EstimateResourcesParams.of(
|
323
323
|
resource_estimation_method=ResourceEstimationMethod.INTELLIGENT_ESTIMATION,
|
@@ -327,8 +327,8 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
327
327
|
result = estimate_resources_required_to_process_delta(
|
328
328
|
delta=parquet_delta_with_manifest,
|
329
329
|
operation_type=OperationType.PYARROW_DOWNLOAD,
|
330
|
-
deltacat_storage=
|
331
|
-
deltacat_storage_kwargs=
|
330
|
+
deltacat_storage=metastore,
|
331
|
+
deltacat_storage_kwargs=main_deltacat_storage_kwargs,
|
332
332
|
estimate_resources_params=params,
|
333
333
|
)
|
334
334
|
|
@@ -342,7 +342,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
342
342
|
assert result.statistics.record_count == 7
|
343
343
|
|
344
344
|
def test_delta_manifest_exists_inflation_absent_when_intelligent_estimation(
|
345
|
-
self,
|
345
|
+
self, main_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
|
346
346
|
):
|
347
347
|
params = EstimateResourcesParams.of(
|
348
348
|
resource_estimation_method=ResourceEstimationMethod.INTELLIGENT_ESTIMATION,
|
@@ -352,15 +352,15 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
352
352
|
result = estimate_resources_required_to_process_delta(
|
353
353
|
delta=parquet_delta_with_manifest,
|
354
354
|
operation_type=OperationType.PYARROW_DOWNLOAD,
|
355
|
-
deltacat_storage=
|
356
|
-
deltacat_storage_kwargs=
|
355
|
+
deltacat_storage=metastore,
|
356
|
+
deltacat_storage_kwargs=main_deltacat_storage_kwargs,
|
357
357
|
estimate_resources_params=params,
|
358
358
|
)
|
359
359
|
|
360
360
|
assert result is None
|
361
361
|
|
362
362
|
def test_delta_utsv_data_when_intelligent_estimation(
|
363
|
-
self,
|
363
|
+
self, main_deltacat_storage_kwargs, utsv_delta_with_manifest: Delta
|
364
364
|
):
|
365
365
|
params = EstimateResourcesParams.of(
|
366
366
|
resource_estimation_method=ResourceEstimationMethod.INTELLIGENT_ESTIMATION,
|
@@ -370,15 +370,15 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
370
370
|
result = estimate_resources_required_to_process_delta(
|
371
371
|
delta=utsv_delta_with_manifest,
|
372
372
|
operation_type=OperationType.PYARROW_DOWNLOAD,
|
373
|
-
deltacat_storage=
|
374
|
-
deltacat_storage_kwargs=
|
373
|
+
deltacat_storage=metastore,
|
374
|
+
deltacat_storage_kwargs=main_deltacat_storage_kwargs,
|
375
375
|
estimate_resources_params=params,
|
376
376
|
)
|
377
377
|
|
378
378
|
assert result is None
|
379
379
|
|
380
380
|
def test_empty_delta_sampled_when_file_sampling(
|
381
|
-
self,
|
381
|
+
self, main_deltacat_storage_kwargs, delta_without_manifest: Delta
|
382
382
|
):
|
383
383
|
params = EstimateResourcesParams.of(
|
384
384
|
resource_estimation_method=ResourceEstimationMethod.FILE_SAMPLING,
|
@@ -388,8 +388,31 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
388
388
|
result = estimate_resources_required_to_process_delta(
|
389
389
|
delta=delta_without_manifest,
|
390
390
|
operation_type=OperationType.PYARROW_DOWNLOAD,
|
391
|
-
deltacat_storage=
|
392
|
-
deltacat_storage_kwargs=
|
391
|
+
deltacat_storage=metastore,
|
392
|
+
deltacat_storage_kwargs=main_deltacat_storage_kwargs,
|
393
|
+
estimate_resources_params=params,
|
394
|
+
)
|
395
|
+
|
396
|
+
assert delta_without_manifest.manifest is not None
|
397
|
+
assert result.memory_bytes is not None
|
398
|
+
assert (
|
399
|
+
result.statistics.on_disk_size_bytes
|
400
|
+
== delta_without_manifest.meta.content_length
|
401
|
+
)
|
402
|
+
|
403
|
+
def test_empty_delta_sampled_when_file_sampling_with_previous_inflation(
|
404
|
+
self, main_deltacat_storage_kwargs, delta_without_manifest: Delta
|
405
|
+
):
|
406
|
+
params = EstimateResourcesParams.of(
|
407
|
+
resource_estimation_method=ResourceEstimationMethod.FILE_SAMPLING_WITH_PREVIOUS_INFLATION,
|
408
|
+
max_files_to_sample=2,
|
409
|
+
)
|
410
|
+
|
411
|
+
result = estimate_resources_required_to_process_delta(
|
412
|
+
delta=delta_without_manifest,
|
413
|
+
operation_type=OperationType.PYARROW_DOWNLOAD,
|
414
|
+
deltacat_storage=metastore,
|
415
|
+
deltacat_storage_kwargs=main_deltacat_storage_kwargs,
|
393
416
|
estimate_resources_params=params,
|
394
417
|
)
|
395
418
|
|
@@ -401,7 +424,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
401
424
|
)
|
402
425
|
|
403
426
|
def test_delta_manifest_parquet_when_file_sampling(
|
404
|
-
self,
|
427
|
+
self, main_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
|
405
428
|
):
|
406
429
|
params = EstimateResourcesParams.of(
|
407
430
|
resource_estimation_method=ResourceEstimationMethod.FILE_SAMPLING,
|
@@ -411,8 +434,29 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
411
434
|
result = estimate_resources_required_to_process_delta(
|
412
435
|
delta=parquet_delta_with_manifest,
|
413
436
|
operation_type=OperationType.PYARROW_DOWNLOAD,
|
414
|
-
deltacat_storage=
|
415
|
-
deltacat_storage_kwargs=
|
437
|
+
deltacat_storage=metastore,
|
438
|
+
deltacat_storage_kwargs=main_deltacat_storage_kwargs,
|
439
|
+
estimate_resources_params=params,
|
440
|
+
)
|
441
|
+
assert result.memory_bytes is not None
|
442
|
+
assert (
|
443
|
+
result.statistics.on_disk_size_bytes
|
444
|
+
== parquet_delta_with_manifest.meta.content_length
|
445
|
+
)
|
446
|
+
|
447
|
+
def test_delta_manifest_parquet_when_file_sampling_with_previous_inflation(
|
448
|
+
self, main_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
|
449
|
+
):
|
450
|
+
params = EstimateResourcesParams.of(
|
451
|
+
resource_estimation_method=ResourceEstimationMethod.FILE_SAMPLING_WITH_PREVIOUS_INFLATION,
|
452
|
+
max_files_to_sample=2,
|
453
|
+
)
|
454
|
+
|
455
|
+
result = estimate_resources_required_to_process_delta(
|
456
|
+
delta=parquet_delta_with_manifest,
|
457
|
+
operation_type=OperationType.PYARROW_DOWNLOAD,
|
458
|
+
deltacat_storage=metastore,
|
459
|
+
deltacat_storage_kwargs=main_deltacat_storage_kwargs,
|
416
460
|
estimate_resources_params=params,
|
417
461
|
)
|
418
462
|
assert result.memory_bytes is not None
|
@@ -423,7 +467,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
423
467
|
|
424
468
|
def test_parquet_delta_when_file_sampling_and_arrow_size_zero(
|
425
469
|
self,
|
426
|
-
|
470
|
+
main_deltacat_storage_kwargs,
|
427
471
|
parquet_delta_with_manifest: Delta,
|
428
472
|
monkeypatch,
|
429
473
|
):
|
@@ -441,13 +485,13 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
441
485
|
|
442
486
|
return MockedValue()
|
443
487
|
|
444
|
-
monkeypatch.setattr(
|
488
|
+
monkeypatch.setattr(metastore, "download_delta_manifest_entry", mock_func)
|
445
489
|
|
446
490
|
result = estimate_resources_required_to_process_delta(
|
447
491
|
delta=parquet_delta_with_manifest,
|
448
492
|
operation_type=OperationType.PYARROW_DOWNLOAD,
|
449
|
-
deltacat_storage=
|
450
|
-
deltacat_storage_kwargs=
|
493
|
+
deltacat_storage=metastore,
|
494
|
+
deltacat_storage_kwargs=main_deltacat_storage_kwargs,
|
451
495
|
estimate_resources_params=params,
|
452
496
|
)
|
453
497
|
|
@@ -459,7 +503,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
459
503
|
)
|
460
504
|
|
461
505
|
def test_delta_manifest_utsv_when_file_sampling(
|
462
|
-
self,
|
506
|
+
self, main_deltacat_storage_kwargs, utsv_delta_with_manifest: Delta
|
463
507
|
):
|
464
508
|
params = EstimateResourcesParams.of(
|
465
509
|
resource_estimation_method=ResourceEstimationMethod.FILE_SAMPLING,
|
@@ -469,8 +513,8 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
469
513
|
result = estimate_resources_required_to_process_delta(
|
470
514
|
delta=utsv_delta_with_manifest,
|
471
515
|
operation_type=OperationType.PYARROW_DOWNLOAD,
|
472
|
-
deltacat_storage=
|
473
|
-
deltacat_storage_kwargs=
|
516
|
+
deltacat_storage=metastore,
|
517
|
+
deltacat_storage_kwargs=main_deltacat_storage_kwargs,
|
474
518
|
estimate_resources_params=params,
|
475
519
|
)
|
476
520
|
assert result.memory_bytes is not None
|
@@ -480,7 +524,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
480
524
|
)
|
481
525
|
|
482
526
|
def test_delta_manifest_utsv_when_file_sampling_zero_files_to_sample(
|
483
|
-
self,
|
527
|
+
self, main_deltacat_storage_kwargs, utsv_delta_with_manifest: Delta
|
484
528
|
):
|
485
529
|
params = EstimateResourcesParams.of(
|
486
530
|
resource_estimation_method=ResourceEstimationMethod.FILE_SAMPLING,
|
@@ -490,14 +534,36 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
490
534
|
result = estimate_resources_required_to_process_delta(
|
491
535
|
delta=utsv_delta_with_manifest,
|
492
536
|
operation_type=OperationType.PYARROW_DOWNLOAD,
|
493
|
-
deltacat_storage=
|
494
|
-
deltacat_storage_kwargs=
|
537
|
+
deltacat_storage=metastore,
|
538
|
+
deltacat_storage_kwargs=main_deltacat_storage_kwargs,
|
495
539
|
estimate_resources_params=params,
|
496
540
|
)
|
497
541
|
assert result is None
|
498
542
|
|
543
|
+
def test_delta_manifest_utsv_when_file_sampling_with_previous_inflation_zero_files_to_sample(
|
544
|
+
self, main_deltacat_storage_kwargs, utsv_delta_with_manifest: Delta
|
545
|
+
):
|
546
|
+
previous_inflation = 7
|
547
|
+
params = EstimateResourcesParams.of(
|
548
|
+
resource_estimation_method=ResourceEstimationMethod.FILE_SAMPLING_WITH_PREVIOUS_INFLATION,
|
549
|
+
max_files_to_sample=None,
|
550
|
+
previous_inflation=previous_inflation,
|
551
|
+
)
|
552
|
+
|
553
|
+
result = estimate_resources_required_to_process_delta(
|
554
|
+
delta=utsv_delta_with_manifest,
|
555
|
+
operation_type=OperationType.PYARROW_DOWNLOAD,
|
556
|
+
deltacat_storage=metastore,
|
557
|
+
deltacat_storage_kwargs=main_deltacat_storage_kwargs,
|
558
|
+
estimate_resources_params=params,
|
559
|
+
)
|
560
|
+
assert result is not None
|
561
|
+
assert result.memory_bytes == (
|
562
|
+
utsv_delta_with_manifest.meta.content_length * previous_inflation
|
563
|
+
)
|
564
|
+
|
499
565
|
def test_empty_delta_when_default_v2(
|
500
|
-
self,
|
566
|
+
self, main_deltacat_storage_kwargs, delta_without_manifest: Delta
|
501
567
|
):
|
502
568
|
params = EstimateResourcesParams.of(
|
503
569
|
resource_estimation_method=ResourceEstimationMethod.DEFAULT_V2,
|
@@ -509,8 +575,8 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
509
575
|
result = estimate_resources_required_to_process_delta(
|
510
576
|
delta=delta_without_manifest,
|
511
577
|
operation_type=OperationType.PYARROW_DOWNLOAD,
|
512
|
-
deltacat_storage=
|
513
|
-
deltacat_storage_kwargs=
|
578
|
+
deltacat_storage=metastore,
|
579
|
+
deltacat_storage_kwargs=main_deltacat_storage_kwargs,
|
514
580
|
estimate_resources_params=params,
|
515
581
|
)
|
516
582
|
|
@@ -522,7 +588,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
522
588
|
)
|
523
589
|
|
524
590
|
def test_parquet_delta_when_default_v2(
|
525
|
-
self,
|
591
|
+
self, main_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
|
526
592
|
):
|
527
593
|
params = EstimateResourcesParams.of(
|
528
594
|
resource_estimation_method=ResourceEstimationMethod.DEFAULT_V2,
|
@@ -535,8 +601,8 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
535
601
|
result = estimate_resources_required_to_process_delta(
|
536
602
|
delta=parquet_delta_with_manifest,
|
537
603
|
operation_type=OperationType.PYARROW_DOWNLOAD,
|
538
|
-
deltacat_storage=
|
539
|
-
deltacat_storage_kwargs=
|
604
|
+
deltacat_storage=metastore,
|
605
|
+
deltacat_storage_kwargs=main_deltacat_storage_kwargs,
|
540
606
|
estimate_resources_params=params,
|
541
607
|
)
|
542
608
|
|
@@ -548,7 +614,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
548
614
|
)
|
549
615
|
|
550
616
|
def test_parquet_delta_when_default_v2_without_avg_record_size_and_sampling(
|
551
|
-
self,
|
617
|
+
self, main_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
|
552
618
|
):
|
553
619
|
params = EstimateResourcesParams.of(
|
554
620
|
resource_estimation_method=ResourceEstimationMethod.DEFAULT_V2,
|
@@ -559,8 +625,8 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
559
625
|
result = estimate_resources_required_to_process_delta(
|
560
626
|
delta=parquet_delta_with_manifest,
|
561
627
|
operation_type=OperationType.PYARROW_DOWNLOAD,
|
562
|
-
deltacat_storage=
|
563
|
-
deltacat_storage_kwargs=
|
628
|
+
deltacat_storage=metastore,
|
629
|
+
deltacat_storage_kwargs=main_deltacat_storage_kwargs,
|
564
630
|
estimate_resources_params=params,
|
565
631
|
)
|
566
632
|
|
@@ -572,7 +638,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
572
638
|
)
|
573
639
|
|
574
640
|
def test_parquet_delta_when_default_v2_and_files_to_sample_zero(
|
575
|
-
self,
|
641
|
+
self, main_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
|
576
642
|
):
|
577
643
|
params = EstimateResourcesParams.of(
|
578
644
|
resource_estimation_method=ResourceEstimationMethod.DEFAULT_V2,
|
@@ -585,8 +651,8 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
585
651
|
result = estimate_resources_required_to_process_delta(
|
586
652
|
delta=parquet_delta_with_manifest,
|
587
653
|
operation_type=OperationType.PYARROW_DOWNLOAD,
|
588
|
-
deltacat_storage=
|
589
|
-
deltacat_storage_kwargs=
|
654
|
+
deltacat_storage=metastore,
|
655
|
+
deltacat_storage_kwargs=main_deltacat_storage_kwargs,
|
590
656
|
estimate_resources_params=params,
|
591
657
|
)
|
592
658
|
|
@@ -598,7 +664,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
598
664
|
)
|
599
665
|
|
600
666
|
def test_utsv_delta_when_default_v2(
|
601
|
-
self,
|
667
|
+
self, main_deltacat_storage_kwargs, utsv_delta_with_manifest: Delta
|
602
668
|
):
|
603
669
|
params = EstimateResourcesParams.of(
|
604
670
|
resource_estimation_method=ResourceEstimationMethod.DEFAULT_V2,
|
@@ -611,8 +677,8 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
611
677
|
result = estimate_resources_required_to_process_delta(
|
612
678
|
delta=utsv_delta_with_manifest,
|
613
679
|
operation_type=OperationType.PYARROW_DOWNLOAD,
|
614
|
-
deltacat_storage=
|
615
|
-
deltacat_storage_kwargs=
|
680
|
+
deltacat_storage=metastore,
|
681
|
+
deltacat_storage_kwargs=main_deltacat_storage_kwargs,
|
616
682
|
estimate_resources_params=params,
|
617
683
|
)
|
618
684
|
|
@@ -624,7 +690,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
624
690
|
)
|
625
691
|
|
626
692
|
def test_utsv_delta_when_default_v2_without_avg_record_size(
|
627
|
-
self,
|
693
|
+
self, main_deltacat_storage_kwargs, utsv_delta_with_manifest: Delta
|
628
694
|
):
|
629
695
|
params = EstimateResourcesParams.of(
|
630
696
|
resource_estimation_method=ResourceEstimationMethod.DEFAULT_V2,
|
@@ -636,8 +702,8 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
636
702
|
result = estimate_resources_required_to_process_delta(
|
637
703
|
delta=utsv_delta_with_manifest,
|
638
704
|
operation_type=OperationType.PYARROW_DOWNLOAD,
|
639
|
-
deltacat_storage=
|
640
|
-
deltacat_storage_kwargs=
|
705
|
+
deltacat_storage=metastore,
|
706
|
+
deltacat_storage_kwargs=main_deltacat_storage_kwargs,
|
641
707
|
estimate_resources_params=params,
|
642
708
|
)
|
643
709
|
|
@@ -650,7 +716,7 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
650
716
|
)
|
651
717
|
|
652
718
|
def test_parquet_delta_without_inflation_when_default_v2(
|
653
|
-
self,
|
719
|
+
self, main_deltacat_storage_kwargs, parquet_delta_with_manifest: Delta
|
654
720
|
):
|
655
721
|
params = EstimateResourcesParams.of(
|
656
722
|
resource_estimation_method=ResourceEstimationMethod.DEFAULT_V2,
|
@@ -663,8 +729,8 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
663
729
|
result = estimate_resources_required_to_process_delta(
|
664
730
|
delta=parquet_delta_with_manifest,
|
665
731
|
operation_type=OperationType.PYARROW_DOWNLOAD,
|
666
|
-
deltacat_storage=
|
667
|
-
deltacat_storage_kwargs=
|
732
|
+
deltacat_storage=metastore,
|
733
|
+
deltacat_storage_kwargs=main_deltacat_storage_kwargs,
|
668
734
|
estimate_resources_params=params,
|
669
735
|
)
|
670
736
|
|