deltacat 1.1.36__py3-none-any.whl → 2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +42 -3
- deltacat/annotations.py +36 -0
- deltacat/api.py +168 -0
- deltacat/aws/s3u.py +4 -4
- deltacat/benchmarking/benchmark_engine.py +82 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +21 -0
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
- deltacat/catalog/__init__.py +14 -0
- deltacat/catalog/delegate.py +199 -106
- deltacat/catalog/iceberg/__init__.py +4 -0
- deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/catalog/iceberg/impl.py +368 -0
- deltacat/catalog/iceberg/overrides.py +74 -0
- deltacat/catalog/interface.py +273 -76
- deltacat/catalog/main/impl.py +720 -0
- deltacat/catalog/model/catalog.py +227 -20
- deltacat/catalog/model/properties.py +116 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +5 -5
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +1 -1
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +1 -1
- deltacat/compute/compactor/steps/materialize.py +6 -2
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor_v2/compaction_session.py +5 -9
- deltacat/compute/compactor_v2/constants.py +1 -30
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/merge_input.py +1 -7
- deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
- deltacat/compute/compactor_v2/steps/merge.py +17 -126
- deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/io.py +1 -1
- deltacat/compute/compactor_v2/utils/merge.py +0 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
- deltacat/compute/compactor_v2/utils/task_options.py +23 -43
- deltacat/compute/converter/constants.py +4 -0
- deltacat/compute/converter/converter_session.py +143 -0
- deltacat/compute/converter/model/convert_input.py +69 -0
- deltacat/compute/converter/model/convert_input_files.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +99 -0
- deltacat/compute/converter/pyiceberg/__init__.py +0 -0
- deltacat/compute/converter/pyiceberg/catalog.py +75 -0
- deltacat/compute/converter/pyiceberg/overrides.py +135 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
- deltacat/compute/converter/steps/__init__.py +0 -0
- deltacat/compute/converter/steps/convert.py +211 -0
- deltacat/compute/converter/steps/dedupe.py +60 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +88 -0
- deltacat/compute/converter/utils/converter_session_utils.py +109 -0
- deltacat/compute/converter/utils/iceberg_columns.py +82 -0
- deltacat/compute/converter/utils/io.py +43 -0
- deltacat/compute/converter/utils/s3u.py +133 -0
- deltacat/compute/resource_estimation/delta.py +1 -19
- deltacat/constants.py +47 -1
- deltacat/env.py +51 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/common/__init__.py +0 -0
- deltacat/examples/common/fixtures.py +15 -0
- deltacat/examples/hello_world.py +27 -0
- deltacat/examples/iceberg/__init__.py +0 -0
- deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
- deltacat/examples/iceberg/iceberg_reader.py +149 -0
- deltacat/exceptions.py +51 -9
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +118 -28
- deltacat/storage/iceberg/__init__.py +0 -0
- deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
- deltacat/storage/iceberg/impl.py +737 -0
- deltacat/storage/iceberg/model.py +709 -0
- deltacat/storage/interface.py +217 -134
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +2077 -0
- deltacat/storage/model/delta.py +118 -71
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -3
- deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
- deltacat/storage/model/metafile.py +1316 -0
- deltacat/storage/model/namespace.py +34 -18
- deltacat/storage/model/partition.py +362 -37
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +19 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +892 -0
- deltacat/storage/model/shard.py +47 -0
- deltacat/storage/model/sort_key.py +170 -13
- deltacat/storage/model/stream.py +208 -80
- deltacat/storage/model/table.py +123 -29
- deltacat/storage/model/table_version.py +322 -46
- deltacat/storage/model/transaction.py +757 -0
- deltacat/storage/model/transform.py +198 -61
- deltacat/storage/model/types.py +111 -13
- deltacat/storage/rivulet/__init__.py +11 -0
- deltacat/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/storage/rivulet/arrow/serializer.py +75 -0
- deltacat/storage/rivulet/dataset.py +744 -0
- deltacat/storage/rivulet/dataset_executor.py +87 -0
- deltacat/storage/rivulet/feather/__init__.py +5 -0
- deltacat/storage/rivulet/feather/file_reader.py +136 -0
- deltacat/storage/rivulet/feather/serializer.py +35 -0
- deltacat/storage/rivulet/fs/__init__.py +0 -0
- deltacat/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/storage/rivulet/fs/file_store.py +130 -0
- deltacat/storage/rivulet/fs/input_file.py +76 -0
- deltacat/storage/rivulet/fs/output_file.py +86 -0
- deltacat/storage/rivulet/logical_plan.py +105 -0
- deltacat/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/storage/rivulet/metastore/delta.py +190 -0
- deltacat/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/storage/rivulet/metastore/sst.py +82 -0
- deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/storage/rivulet/mvp/Table.py +101 -0
- deltacat/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/storage/rivulet/parquet/file_reader.py +127 -0
- deltacat/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/storage/rivulet/reader/__init__.py +0 -0
- deltacat/storage/rivulet/reader/block_scanner.py +378 -0
- deltacat/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/storage/rivulet/reader/data_scan.py +63 -0
- deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
- deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
- deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
- deltacat/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/storage/rivulet/schema/__init__.py +0 -0
- deltacat/storage/rivulet/schema/datatype.py +128 -0
- deltacat/storage/rivulet/schema/schema.py +251 -0
- deltacat/storage/rivulet/serializer.py +40 -0
- deltacat/storage/rivulet/serializer_factory.py +42 -0
- deltacat/storage/rivulet/writer/__init__.py +0 -0
- deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/catalog/test_catalogs.py +324 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +19 -53
- deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
- deltacat/tests/compute/compactor/utils/test_io.py +6 -8
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
- deltacat/tests/compute/conftest.py +75 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +478 -0
- deltacat/tests/compute/converter/utils.py +123 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
- deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
- deltacat/tests/compute/test_compact_partition_params.py +3 -3
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
- deltacat/tests/compute/test_util_common.py +19 -12
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
- deltacat/tests/local_deltacat_storage/__init__.py +76 -103
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/conftest.py +25 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +1399 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_metafile_io.py +2535 -0
- deltacat/tests/storage/model/test_schema.py +308 -0
- deltacat/tests/storage/model/test_shard.py +22 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +308 -0
- deltacat/tests/storage/rivulet/__init__.py +0 -0
- deltacat/tests/storage/rivulet/conftest.py +149 -0
- deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
- deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/storage/rivulet/test_dataset.py +406 -0
- deltacat/tests/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/storage/rivulet/test_utils.py +122 -0
- deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/test_deltacat_api.py +39 -0
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +8 -15
- deltacat/tests/test_utils/storage.py +266 -3
- deltacat/tests/utils/test_daft.py +3 -3
- deltacat/tests/utils/test_pyarrow.py +0 -432
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +1 -1
- deltacat/utils/export.py +59 -0
- deltacat/utils/filesystem.py +320 -0
- deltacat/utils/metafile_locator.py +73 -0
- deltacat/utils/pyarrow.py +36 -183
- deltacat-2.0.dist-info/METADATA +65 -0
- deltacat-2.0.dist-info/RECORD +347 -0
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
- deltacat-1.1.36.dist-info/METADATA +0 -64
- deltacat-1.1.36.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
@@ -1,37 +1,6 @@
|
|
1
1
|
import unittest
|
2
2
|
import ray
|
3
|
-
from deltacat.compute.compactor_v2.utils.task_options import
|
4
|
-
_get_task_options,
|
5
|
-
_get_merge_task_options,
|
6
|
-
logger,
|
7
|
-
)
|
8
|
-
from deltacat.compute.resource_estimation.model import (
|
9
|
-
EstimateResourcesParams,
|
10
|
-
ResourceEstimationMethod,
|
11
|
-
)
|
12
|
-
from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
|
13
|
-
from deltacat.compute.compactor import (
|
14
|
-
PyArrowWriteResult,
|
15
|
-
RoundCompletionInfo,
|
16
|
-
)
|
17
|
-
from deltacat.types.media import (
|
18
|
-
ContentType,
|
19
|
-
ContentEncoding,
|
20
|
-
)
|
21
|
-
from deltacat.storage import (
|
22
|
-
DeltaLocator,
|
23
|
-
Manifest,
|
24
|
-
ManifestMeta,
|
25
|
-
ManifestEntry,
|
26
|
-
ManifestEntryList,
|
27
|
-
PartitionValues,
|
28
|
-
)
|
29
|
-
from unittest.mock import MagicMock
|
30
|
-
from typing import Optional
|
31
|
-
|
32
|
-
from deltacat.compute.compactor_v2.constants import (
|
33
|
-
AVERAGE_RECORD_SIZE_BYTES as DEFAULT_AVERAGE_RECORD_SIZE_BYTES,
|
34
|
-
)
|
3
|
+
from deltacat.compute.compactor_v2.utils.task_options import _get_task_options
|
35
4
|
|
36
5
|
|
37
6
|
@ray.remote
|
@@ -45,95 +14,11 @@ def throwing_func():
|
|
45
14
|
|
46
15
|
|
47
16
|
class TestTaskOptions(unittest.TestCase):
|
48
|
-
TEST_INDEX = 0
|
49
|
-
TEST_HB_GROUP_IDX = 0
|
50
|
-
TEST_STREAM_POSITION = 1_000_000
|
51
|
-
TEST_NUM_HASH_GROUPS = 1
|
52
|
-
|
53
17
|
@classmethod
|
54
18
|
def setUpClass(cls):
|
55
19
|
ray.init(local_mode=True, ignore_reinit_error=True)
|
56
20
|
super().setUpClass()
|
57
21
|
|
58
|
-
@classmethod
|
59
|
-
def tearDownClass(cls) -> None:
|
60
|
-
ray.shutdown()
|
61
|
-
|
62
|
-
def _make_estimate_resource_params(
|
63
|
-
cls,
|
64
|
-
resource_estimation_method: Optional[
|
65
|
-
ResourceEstimationMethod
|
66
|
-
] = ResourceEstimationMethod.DEFAULT,
|
67
|
-
previous_inflation: Optional[int] = 7,
|
68
|
-
average_record_size_bytes: Optional[int] = 1000,
|
69
|
-
):
|
70
|
-
return EstimateResourcesParams.of(
|
71
|
-
resource_estimation_method=resource_estimation_method,
|
72
|
-
previous_inflation=previous_inflation,
|
73
|
-
average_record_size_bytes=average_record_size_bytes,
|
74
|
-
)
|
75
|
-
|
76
|
-
def _make_manifest(
|
77
|
-
self,
|
78
|
-
source_content_length: Optional[int] = 1000,
|
79
|
-
content_type: Optional[ContentType] = ContentType.PARQUET,
|
80
|
-
content_encoding: Optional[ContentEncoding] = ContentEncoding.IDENTITY,
|
81
|
-
partition_values: Optional[PartitionValues] = None,
|
82
|
-
uri: Optional[str] = "test",
|
83
|
-
url: Optional[str] = "test",
|
84
|
-
author: Optional[str] = "foo",
|
85
|
-
entry_uuid: Optional[str] = "foo",
|
86
|
-
manifest_uuid: Optional[str] = "bar",
|
87
|
-
) -> Manifest:
|
88
|
-
meta = ManifestMeta.of(
|
89
|
-
10,
|
90
|
-
10,
|
91
|
-
content_type=content_type,
|
92
|
-
content_encoding=content_encoding,
|
93
|
-
source_content_length=source_content_length,
|
94
|
-
partition_values=partition_values,
|
95
|
-
)
|
96
|
-
|
97
|
-
return Manifest.of(
|
98
|
-
entries=ManifestEntryList.of(
|
99
|
-
[
|
100
|
-
ManifestEntry.of(
|
101
|
-
uri=uri, url=url, meta=meta, mandatory=True, uuid=entry_uuid
|
102
|
-
)
|
103
|
-
]
|
104
|
-
),
|
105
|
-
author=author,
|
106
|
-
uuid=manifest_uuid,
|
107
|
-
)
|
108
|
-
|
109
|
-
def make_round_completion_info(
|
110
|
-
self,
|
111
|
-
high_watermark: Optional[int] = 1_000_000,
|
112
|
-
compacted_delta_locator: Optional[DeltaLocator] = None,
|
113
|
-
records_written: Optional[int] = 10,
|
114
|
-
bytes_written: Optional[int] = 10,
|
115
|
-
files_written: Optional[int] = 10,
|
116
|
-
rows_dropped: Optional[int] = 10,
|
117
|
-
sort_keys_bit_width: Optional[int] = 0,
|
118
|
-
hash_bucket_count: Optional[int] = 1,
|
119
|
-
hb_index_to_entry_range: Optional[dict] = None,
|
120
|
-
) -> RoundCompletionInfo:
|
121
|
-
if compacted_delta_locator is None:
|
122
|
-
compacted_delta_locator = MagicMock(spec=DeltaLocator)
|
123
|
-
|
124
|
-
hb_index_to_entry_range = hb_index_to_entry_range or {"0": (0, 1)}
|
125
|
-
|
126
|
-
return RoundCompletionInfo.of(
|
127
|
-
compacted_delta_locator=compacted_delta_locator,
|
128
|
-
high_watermark=high_watermark,
|
129
|
-
compacted_pyarrow_write_result=PyArrowWriteResult.of(
|
130
|
-
records_written, bytes_written, files_written, rows_dropped
|
131
|
-
),
|
132
|
-
sort_keys_bit_width=sort_keys_bit_width,
|
133
|
-
hb_index_to_entry_range=hb_index_to_entry_range,
|
134
|
-
hash_bucket_count=hash_bucket_count,
|
135
|
-
)
|
136
|
-
|
137
22
|
def test_get_task_options_sanity(self):
|
138
23
|
opts = _get_task_options(0.01, 0.01)
|
139
24
|
result_ref = valid_func.options(**opts).remote()
|
@@ -146,160 +31,3 @@ class TestTaskOptions(unittest.TestCase):
|
|
146
31
|
result_ref = throwing_func.options(**opts).remote()
|
147
32
|
|
148
33
|
self.assertRaises(ConnectionAbortedError, lambda: ray.get(result_ref))
|
149
|
-
|
150
|
-
def test_get_merge_task_options_memory_logs_enabled_sanity(self):
|
151
|
-
test_index = 0
|
152
|
-
test_hb_group_idx = 0
|
153
|
-
test_debug_memory_params = {"merge_task_index": test_index}
|
154
|
-
test_estimate_memory_params = self._make_estimate_resource_params()
|
155
|
-
test_ray_custom_resources = {}
|
156
|
-
test_rcf = self.make_round_completion_info()
|
157
|
-
test_manifest = self._make_manifest()
|
158
|
-
expected_task_opts = {
|
159
|
-
"max_retries": 3,
|
160
|
-
"memory": 1680.64,
|
161
|
-
"num_cpus": 0.01,
|
162
|
-
"scheduling_strategy": "SPREAD",
|
163
|
-
}
|
164
|
-
expected_previous_inflation = 1.0
|
165
|
-
expected_average_record_size = 1.0
|
166
|
-
with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
|
167
|
-
# At least one log of level DEBUG must be emitted
|
168
|
-
actual_merge_tasks_opts = _get_merge_task_options(
|
169
|
-
index=test_index,
|
170
|
-
hb_group_idx=test_hb_group_idx,
|
171
|
-
data_size=1,
|
172
|
-
pk_size_bytes=1,
|
173
|
-
num_rows=1,
|
174
|
-
num_hash_groups=1,
|
175
|
-
total_memory_buffer_percentage=1,
|
176
|
-
incremental_index_array_size=1,
|
177
|
-
debug_memory_params=test_debug_memory_params,
|
178
|
-
ray_custom_resources=test_ray_custom_resources,
|
179
|
-
estimate_resources_params=test_estimate_memory_params,
|
180
|
-
round_completion_info=test_rcf,
|
181
|
-
compacted_delta_manifest=test_manifest,
|
182
|
-
memory_logs_enabled=True,
|
183
|
-
)
|
184
|
-
assert {k: actual_merge_tasks_opts[k] for k in expected_task_opts}
|
185
|
-
log_message_round_completion_info = cm.records[0].getMessage()
|
186
|
-
log_message_debug_memory_params = cm.records[1].getMessage()
|
187
|
-
self.assertIn(
|
188
|
-
f"[Merge task {test_index}]: Using previous compaction rounds to calculate merge memory",
|
189
|
-
log_message_round_completion_info,
|
190
|
-
)
|
191
|
-
self.assertIn(
|
192
|
-
f"[Merge task {test_index}]: Params used for calculating merge memory",
|
193
|
-
log_message_debug_memory_params,
|
194
|
-
)
|
195
|
-
self.assertIn(
|
196
|
-
f"'previous_inflation': {expected_previous_inflation}",
|
197
|
-
log_message_debug_memory_params,
|
198
|
-
)
|
199
|
-
self.assertIn(
|
200
|
-
f"'average_record_size': {expected_average_record_size}",
|
201
|
-
log_message_debug_memory_params,
|
202
|
-
)
|
203
|
-
|
204
|
-
def test_get_merge_task_options_memory_logs_enabled_fallback_previous_inflation_fallback_average_record_size(
|
205
|
-
self,
|
206
|
-
):
|
207
|
-
test_index = 0
|
208
|
-
test_hb_group_idx = 0
|
209
|
-
test_debug_memory_params = {"merge_task_index": test_index}
|
210
|
-
test_estimate_memory_params = self._make_estimate_resource_params()
|
211
|
-
test_ray_custom_resources = {}
|
212
|
-
test_rcf = self.make_round_completion_info(
|
213
|
-
bytes_written=0, records_written=0, files_written=0, rows_dropped=0
|
214
|
-
)
|
215
|
-
test_manifest = self._make_manifest()
|
216
|
-
expected_task_opts = {
|
217
|
-
"max_retries": 3,
|
218
|
-
"memory": 1680.64,
|
219
|
-
"num_cpus": 0.01,
|
220
|
-
"scheduling_strategy": "SPREAD",
|
221
|
-
}
|
222
|
-
expected_previous_inflation = PYARROW_INFLATION_MULTIPLIER
|
223
|
-
expected_average_record_size = DEFAULT_AVERAGE_RECORD_SIZE_BYTES
|
224
|
-
with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
|
225
|
-
# At least one log of level DEBUG must be emitted
|
226
|
-
actual_merge_tasks_opts = _get_merge_task_options(
|
227
|
-
index=test_index,
|
228
|
-
hb_group_idx=test_hb_group_idx,
|
229
|
-
data_size=1,
|
230
|
-
pk_size_bytes=1,
|
231
|
-
num_rows=1,
|
232
|
-
num_hash_groups=1,
|
233
|
-
total_memory_buffer_percentage=1,
|
234
|
-
incremental_index_array_size=1,
|
235
|
-
debug_memory_params=test_debug_memory_params,
|
236
|
-
ray_custom_resources=test_ray_custom_resources,
|
237
|
-
estimate_resources_params=test_estimate_memory_params,
|
238
|
-
round_completion_info=test_rcf,
|
239
|
-
compacted_delta_manifest=test_manifest,
|
240
|
-
memory_logs_enabled=True,
|
241
|
-
)
|
242
|
-
assert {k: actual_merge_tasks_opts[k] for k in expected_task_opts}
|
243
|
-
log_message_round_completion_info = cm.records[0].getMessage()
|
244
|
-
log_message_debug_memory_params = cm.records[1].getMessage()
|
245
|
-
self.assertIn(
|
246
|
-
f"[Merge task {test_index}]: Using previous compaction rounds to calculate merge memory",
|
247
|
-
log_message_round_completion_info,
|
248
|
-
)
|
249
|
-
self.assertIn(
|
250
|
-
f"[Merge task {test_index}]: Params used for calculating merge memory",
|
251
|
-
log_message_debug_memory_params,
|
252
|
-
)
|
253
|
-
self.assertIn(
|
254
|
-
f"'previous_inflation': {expected_previous_inflation}",
|
255
|
-
log_message_debug_memory_params,
|
256
|
-
)
|
257
|
-
self.assertIn(
|
258
|
-
f"'average_record_size': {expected_average_record_size}",
|
259
|
-
log_message_debug_memory_params,
|
260
|
-
)
|
261
|
-
|
262
|
-
def test_get_merge_task_options_memory_logs_enabled_not_using_previous_round_completion_info(
|
263
|
-
self,
|
264
|
-
):
|
265
|
-
test_index = 0
|
266
|
-
test_hb_group_idx = 0
|
267
|
-
test_debug_memory_params = {"merge_task_index": test_index}
|
268
|
-
test_estimate_memory_params = self._make_estimate_resource_params()
|
269
|
-
test_ray_custom_resources = {}
|
270
|
-
test_rcf = None
|
271
|
-
test_manifest = self._make_manifest()
|
272
|
-
expected_task_opts = {
|
273
|
-
"max_retries": 3,
|
274
|
-
"memory": 1680.64,
|
275
|
-
"num_cpus": 0.01,
|
276
|
-
"scheduling_strategy": "SPREAD",
|
277
|
-
}
|
278
|
-
with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
|
279
|
-
# At least one log of level DEBUG must be emitted
|
280
|
-
actual_merge_tasks_opts = _get_merge_task_options(
|
281
|
-
index=test_index,
|
282
|
-
hb_group_idx=test_hb_group_idx,
|
283
|
-
data_size=1,
|
284
|
-
pk_size_bytes=1,
|
285
|
-
num_rows=1,
|
286
|
-
num_hash_groups=1,
|
287
|
-
total_memory_buffer_percentage=1,
|
288
|
-
incremental_index_array_size=1,
|
289
|
-
debug_memory_params=test_debug_memory_params,
|
290
|
-
ray_custom_resources=test_ray_custom_resources,
|
291
|
-
estimate_resources_params=test_estimate_memory_params,
|
292
|
-
round_completion_info=test_rcf,
|
293
|
-
compacted_delta_manifest=test_manifest,
|
294
|
-
memory_logs_enabled=True,
|
295
|
-
)
|
296
|
-
assert {k: actual_merge_tasks_opts[k] for k in expected_task_opts}
|
297
|
-
log_message_debug_memory_params = cm.records[0].getMessage()
|
298
|
-
self.assertIn(
|
299
|
-
f"[Merge task {test_index}]: Params used for calculating merge memory",
|
300
|
-
log_message_debug_memory_params,
|
301
|
-
)
|
302
|
-
self.assertNotIn(
|
303
|
-
"'average_record_size'",
|
304
|
-
log_message_debug_memory_params,
|
305
|
-
)
|
@@ -0,0 +1,75 @@
|
|
1
|
+
import os
|
2
|
+
import tempfile
|
3
|
+
import shutil
|
4
|
+
from typing import Dict
|
5
|
+
|
6
|
+
import pytest
|
7
|
+
|
8
|
+
|
9
|
+
@pytest.fixture
def temp_dir():
    """
    Fixture that creates a temporary directory for a test and removes it afterwards.

    Yields:
        str: Path to the temporary directory
    """
    dir_path = tempfile.mkdtemp()
    try:
        # Hand the directory path to the test
        yield dir_path
    finally:
        # Cleanup: the test (or a helper such as
        # clean_up_local_deltacat_storage_file) may already have removed the
        # directory; that must not surface as a teardown error.
        try:
            shutil.rmtree(dir_path)
        except FileNotFoundError:
            pass
|
25
|
+
|
26
|
+
|
27
|
+
@pytest.fixture(scope="function")
def local_deltacat_storage_kwargs(temp_dir):
    """
    Function-scoped fixture that yields local storage kwargs.

    The kwargs point at a database file path inside the ``temp_dir`` fixture's
    directory. This fixture only computes the path; it does not create the file.

    Yields:
        dict: ``{"db_file_path": <path ending in db_test.sqlite>}``
    """
    sqlite_path = os.path.join(temp_dir, "db_test.sqlite")

    yield {"db_file_path": sqlite_path}

    # Teardown: nothing to do unless something was left at the database path
    if not os.path.exists(sqlite_path):
        return
    os.remove(sqlite_path)
|
46
|
+
|
47
|
+
|
48
|
+
def create_local_deltacat_storage_file() -> Dict[str, str]:
    """
    Build local deltacat storage kwargs backed by a fresh temporary directory.

    Same approach as the local_deltacat_storage_kwargs fixture, for consumers
    that do not want a function scoped fixture. A new directory is created on
    every call; the database file itself is not created here. Pair with
    clean_up_local_deltacat_storage_file to remove the directory afterwards.

    Returns: kwargs to use for local deltacat storage, i.e. {"db_file_path": $db_file}
    """
    return {"db_file_path": os.path.join(tempfile.mkdtemp(), "db_test.sqlite")}
|
60
|
+
|
61
|
+
|
62
|
+
def clean_up_local_deltacat_storage_file(local_storage_kwargs: Dict[str, str]):
    """
    Remove the database file and its parent directory, as created by
    create_local_deltacat_storage_file.

    Args:
        local_storage_kwargs: dict whose "db_file_path" entry is the database
            file path. The file's entire parent directory is deleted.
    """
    db_path = local_storage_kwargs["db_file_path"]
    parent_dir = os.path.dirname(db_path)

    # File first, then the directory that held it; each step is skipped when
    # its target is already gone.
    for target, remover in ((db_path, os.remove), (parent_dir, shutil.rmtree)):
        if os.path.exists(target):
            remover(target)
|
File without changes
|
@@ -0,0 +1,80 @@
|
|
1
|
+
import pytest
|
2
|
+
from pyspark.sql import SparkSession
|
3
|
+
import os
|
4
|
+
import ray
|
5
|
+
from pyiceberg.catalog import Catalog, load_catalog
|
6
|
+
|
7
|
+
|
8
|
+
@pytest.fixture
def spark():
    """
    Fixture that builds (or reuses) the SparkSession for the Iceberg tests.

    Sets PYSPARK_SUBMIT_ARGS with ``--packages`` entries for the Iceberg Spark
    runtime (keyed by the installed pyspark major.minor version) and the
    Iceberg AWS bundle, sets AWS region/credential environment variables, and
    configures two Iceberg catalogs: ``integration`` (REST, the default
    catalog) and ``hive``. The environment variables stay set after the
    fixture returns and the session is not stopped here.

    Returns:
        SparkSession: the session returned by ``getOrCreate()``
    """
    import importlib.metadata

    iceberg_version = "1.6.0"
    scala_version = "2.12"
    # Only major.minor of the installed pyspark appears in the artifact name
    installed_pyspark = importlib.metadata.version("pyspark")
    spark_version = ".".join(installed_pyspark.split(".")[:2])

    packages = ",".join(
        [
            f"org.apache.iceberg:iceberg-spark-runtime-{spark_version}_{scala_version}:{iceberg_version}",
            f"org.apache.iceberg:iceberg-aws-bundle:{iceberg_version}",
        ]
    )
    os.environ.update(
        {
            "PYSPARK_SUBMIT_ARGS": f"--packages {packages} pyspark-shell",
            "AWS_REGION": "us-east-1",
            "AWS_ACCESS_KEY_ID": "admin",
            "AWS_SECRET_ACCESS_KEY": "password",
        }
    )

    # Session options, applied in the order listed
    settings = [
        ("spark.sql.session.timeZone", "UTC"),
        (
            "spark.sql.extensions",
            "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
        ),
        ("spark.sql.catalog.integration", "org.apache.iceberg.spark.SparkCatalog"),
        (
            "spark.sql.catalog.integration.catalog-impl",
            "org.apache.iceberg.rest.RESTCatalog",
        ),
        ("spark.sql.catalog.integration.cache-enabled", "false"),
        ("spark.sql.catalog.integration.uri", "http://localhost:8181"),
        (
            "spark.sql.catalog.integration.io-impl",
            "org.apache.iceberg.aws.s3.S3FileIO",
        ),
        ("spark.sql.catalog.integration.warehouse", "s3://warehouse/wh/"),
        ("spark.sql.catalog.integration.s3.endpoint", "http://localhost:9000"),
        ("spark.sql.catalog.integration.s3.path-style-access", "true"),
        ("spark.sql.defaultCatalog", "integration"),
        ("spark.sql.catalog.hive", "org.apache.iceberg.spark.SparkCatalog"),
        ("spark.sql.catalog.hive.type", "hive"),
        ("spark.sql.catalog.hive.uri", "http://localhost:9083"),
        ("spark.sql.catalog.hive.io-impl", "org.apache.iceberg.aws.s3.S3FileIO"),
        ("spark.sql.catalog.hive.warehouse", "s3://warehouse/hive/"),
        ("spark.sql.catalog.hive.s3.endpoint", "http://localhost:9000"),
        ("spark.sql.catalog.hive.s3.path-style-access", "true"),
        ("spark.sql.execution.arrow.pyspark.enabled", "true"),
    ]

    builder = SparkSession.builder.appName("PyIceberg integration test")
    for option, value in settings:
        builder = builder.config(option, value)

    return builder.getOrCreate()
|
60
|
+
|
61
|
+
|
62
|
+
@pytest.fixture(scope="session")
def session_catalog() -> Catalog:
    """
    Session-scoped fixture that loads the PyIceberg catalog named "local".

    The catalog is configured as a REST catalog at http://localhost:8181 with
    an S3 endpoint at http://localhost:9000 and fixed access credentials.

    Returns:
        Catalog: the result of ``load_catalog``
    """
    catalog_properties = {
        "type": "rest",
        "uri": "http://localhost:8181",
        "s3.endpoint": "http://localhost:9000",
        "s3.access-key-id": "admin",
        "s3.secret-access-key": "password",
    }
    return load_catalog("local", **catalog_properties)
|
74
|
+
|
75
|
+
|
76
|
+
@pytest.fixture(autouse=True, scope="module")
def setup_ray_cluster():
    """
    Autouse, module-scoped fixture: initialize Ray in local mode before the
    module's tests and shut it down once they are done.
    """
    # ignore_reinit_error is passed so an already-initialized Ray is tolerated
    ray.init(ignore_reinit_error=True, local_mode=True)
    yield None
    ray.shutdown()
|