deltacat 1.1.30__py3-none-any.whl → 1.1.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor_v2/utils/task_options.py +43 -23
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +273 -1
- deltacat/tests/utils/test_pyarrow.py +106 -4
- deltacat/utils/pyarrow.py +11 -5
- {deltacat-1.1.30.dist-info → deltacat-1.1.32.dist-info}/METADATA +1 -1
- {deltacat-1.1.30.dist-info → deltacat-1.1.32.dist-info}/RECORD +10 -10
- {deltacat-1.1.30.dist-info → deltacat-1.1.32.dist-info}/LICENSE +0 -0
- {deltacat-1.1.30.dist-info → deltacat-1.1.32.dist-info}/WHEEL +0 -0
- {deltacat-1.1.30.dist-info → deltacat-1.1.32.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor_v2/utils/task_options.py
CHANGED
@@ -1,11 +1,16 @@
 import logging
 from typing import Dict, Optional, List, Tuple, Any
 from deltacat import logs
+from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
+from deltacat.compute.compactor_v2.constants import (
+    AVERAGE_RECORD_SIZE_BYTES as DEFAULT_AVERAGE_RECORD_SIZE_BYTES,
+)
 from deltacat.compute.compactor_v2.model.merge_file_group import (
     LocalMergeFileGroupsProvider,
 )
 from deltacat.storage import (
     Manifest,
+    ManifestEntry,
     interface as unimplemented_deltacat_storage,
 )
 from deltacat.compute.compactor.model.delta_annotated import DeltaAnnotated
@@ -81,16 +86,27 @@ def _get_merge_task_options(
         and compacted_delta_manifest
         and round_completion_info.hb_index_to_entry_range
     ):
-
-
-
-
+        logger.debug_conditional(
+            f"[Merge task {index}]: Using previous compaction rounds to calculate merge memory: {round_completion_info.compacted_pyarrow_write_result}",
+            memory_logs_enabled,
+        )
+        previous_inflation: float = (
+            (
+                round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
+                / round_completion_info.compacted_pyarrow_write_result.file_bytes
+            )
+            if round_completion_info.compacted_pyarrow_write_result.file_bytes
+            else PYARROW_INFLATION_MULTIPLIER
         )
         debug_memory_params["previous_inflation"] = previous_inflation
 
-        average_record_size = (
-
-
+        average_record_size: float = (
+            (
+                round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
+                / round_completion_info.compacted_pyarrow_write_result.records
+            )
+            if round_completion_info.compacted_pyarrow_write_result.records
+            else DEFAULT_AVERAGE_RECORD_SIZE_BYTES
         )
         debug_memory_params["average_record_size"] = average_record_size
 
@@ -106,31 +122,36 @@ def _get_merge_task_options(
                 str(hb_idx)
             ]
             for entry_index in range(entry_start, entry_end):
-                entry = compacted_delta_manifest.entries[entry_index]
-
-
-
-
-
+                entry: ManifestEntry = compacted_delta_manifest.entries[entry_index]
+                current_entry_size: float = (
+                    estimate_manifest_entry_size_bytes(
+                        entry=entry,
+                        operation_type=OperationType.PYARROW_DOWNLOAD,
+                        estimate_resources_params=estimate_resources_params,
+                    )
+                    or 0.0
                 )
-                current_entry_rows =
-
-
-
+                current_entry_rows: int = (
+                    estimate_manifest_entry_num_rows(
+                        entry=entry,
+                        operation_type=OperationType.PYARROW_DOWNLOAD,
+                        estimate_resources_params=estimate_resources_params,
+                    )
+                    or 0
                 )
-
+                # NOTE: We can treat the current_entry_size and current_entry_rows as 0 as a None estimated entry size implies a 0 value
                 data_size += current_entry_size
                 num_rows += current_entry_rows
-
                 if primary_keys:
-                    pk_size
+                    pk_size: Optional[
+                        float
+                    ] = estimate_manifest_entry_column_size_bytes(
                         entry=entry,
                         columns=primary_keys,
                         operation_type=OperationType.PYARROW_DOWNLOAD,
                         estimate_resources_params=estimate_resources_params,
                     )
-
-                    if pk_size is None:
+                    if not pk_size:
                         pk_size_bytes += current_entry_size
                     else:
                         pk_size_bytes += pk_size
@@ -159,7 +180,6 @@ def _get_merge_task_options(
         f"[Merge task {index}]: Params used for calculating merge memory: {debug_memory_params}",
         memory_logs_enabled,
     )
-
     return _get_task_options(0.01, total_memory, ray_custom_resources)
 
 
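The substance of the task_options.py change above is the pair of new guards: both ratios previously divided by values taken directly from the previous round's PyArrowWriteResult, so a round that wrote zero files or zero records could divide by zero. Below is a minimal sketch of the new fallback behavior, with hypothetical stand-in values for the two deltacat constants rather than their real values:

# Sketch of the fallback logic in _get_merge_task_options. The two constants
# here are hypothetical placeholders; the real code uses
# deltacat.constants.PYARROW_INFLATION_MULTIPLIER and
# deltacat.compute.compactor_v2.constants.AVERAGE_RECORD_SIZE_BYTES.
PYARROW_INFLATION_MULTIPLIER = 2.5  # assumed placeholder value
DEFAULT_AVERAGE_RECORD_SIZE_BYTES = 512  # assumed placeholder value


def estimate_from_previous_round(pyarrow_bytes: int, file_bytes: int, records: int):
    # Dividing by file_bytes or records would raise ZeroDivisionError when a
    # previous round wrote nothing, so fall back to library-wide defaults.
    previous_inflation = (
        pyarrow_bytes / file_bytes if file_bytes else PYARROW_INFLATION_MULTIPLIER
    )
    average_record_size = (
        pyarrow_bytes / records if records else DEFAULT_AVERAGE_RECORD_SIZE_BYTES
    )
    return previous_inflation, average_record_size


# Normal case: estimates derived from the previous round's write result.
assert estimate_from_previous_round(100, 50, 10) == (2.0, 10.0)
# Degenerate case: an empty previous round falls back to the defaults.
assert estimate_from_previous_round(0, 0, 0) == (2.5, 512)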
|
deltacat/tests/compute/compactor_v2/utils/test_task_options.py
CHANGED
@@ -1,6 +1,37 @@
 import unittest
 import ray
-from deltacat.compute.compactor_v2.utils.task_options import
+from deltacat.compute.compactor_v2.utils.task_options import (
+    _get_task_options,
+    _get_merge_task_options,
+    logger,
+)
+from deltacat.compute.resource_estimation.model import (
+    EstimateResourcesParams,
+    ResourceEstimationMethod,
+)
+from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
+from deltacat.compute.compactor import (
+    PyArrowWriteResult,
+    RoundCompletionInfo,
+)
+from deltacat.types.media import (
+    ContentType,
+    ContentEncoding,
+)
+from deltacat.storage import (
+    DeltaLocator,
+    Manifest,
+    ManifestMeta,
+    ManifestEntry,
+    ManifestEntryList,
+    PartitionValues,
+)
+from unittest.mock import MagicMock
+from typing import Optional
+
+from deltacat.compute.compactor_v2.constants import (
+    AVERAGE_RECORD_SIZE_BYTES as DEFAULT_AVERAGE_RECORD_SIZE_BYTES,
+)
 
 
 @ray.remote
@@ -14,11 +45,95 @@ def throwing_func():
 
 
 class TestTaskOptions(unittest.TestCase):
+    TEST_INDEX = 0
+    TEST_HB_GROUP_IDX = 0
+    TEST_STREAM_POSITION = 1_000_000
+    TEST_NUM_HASH_GROUPS = 1
+
     @classmethod
     def setUpClass(cls):
         ray.init(local_mode=True, ignore_reinit_error=True)
         super().setUpClass()
 
+    @classmethod
+    def tearDownClass(cls) -> None:
+        ray.shutdown()
+
+    def _make_estimate_resource_params(
+        cls,
+        resource_estimation_method: Optional[
+            ResourceEstimationMethod
+        ] = ResourceEstimationMethod.DEFAULT,
+        previous_inflation: Optional[int] = 7,
+        average_record_size_bytes: Optional[int] = 1000,
+    ):
+        return EstimateResourcesParams.of(
+            resource_estimation_method=resource_estimation_method,
+            previous_inflation=previous_inflation,
+            average_record_size_bytes=average_record_size_bytes,
+        )
+
+    def _make_manifest(
+        self,
+        source_content_length: Optional[int] = 1000,
+        content_type: Optional[ContentType] = ContentType.PARQUET,
+        content_encoding: Optional[ContentEncoding] = ContentEncoding.IDENTITY,
+        partition_values: Optional[PartitionValues] = None,
+        uri: Optional[str] = "test",
+        url: Optional[str] = "test",
+        author: Optional[str] = "foo",
+        entry_uuid: Optional[str] = "foo",
+        manifest_uuid: Optional[str] = "bar",
+    ) -> Manifest:
+        meta = ManifestMeta.of(
+            10,
+            10,
+            content_type=content_type,
+            content_encoding=content_encoding,
+            source_content_length=source_content_length,
+            partition_values=partition_values,
+        )
+
+        return Manifest.of(
+            entries=ManifestEntryList.of(
+                [
+                    ManifestEntry.of(
+                        uri=uri, url=url, meta=meta, mandatory=True, uuid=entry_uuid
+                    )
+                ]
+            ),
+            author=author,
+            uuid=manifest_uuid,
+        )
+
+    def make_round_completion_info(
+        self,
+        high_watermark: Optional[int] = 1_000_000,
+        compacted_delta_locator: Optional[DeltaLocator] = None,
+        records_written: Optional[int] = 10,
+        bytes_written: Optional[int] = 10,
+        files_written: Optional[int] = 10,
+        rows_dropped: Optional[int] = 10,
+        sort_keys_bit_width: Optional[int] = 0,
+        hash_bucket_count: Optional[int] = 1,
+        hb_index_to_entry_range: Optional[dict] = None,
+    ) -> RoundCompletionInfo:
+        if compacted_delta_locator is None:
+            compacted_delta_locator = MagicMock(spec=DeltaLocator)
+
+        hb_index_to_entry_range = hb_index_to_entry_range or {"0": (0, 1)}
+
+        return RoundCompletionInfo.of(
+            compacted_delta_locator=compacted_delta_locator,
+            high_watermark=high_watermark,
+            compacted_pyarrow_write_result=PyArrowWriteResult.of(
+                records_written, bytes_written, files_written, rows_dropped
+            ),
+            sort_keys_bit_width=sort_keys_bit_width,
+            hb_index_to_entry_range=hb_index_to_entry_range,
+            hash_bucket_count=hash_bucket_count,
+        )
+
     def test_get_task_options_sanity(self):
         opts = _get_task_options(0.01, 0.01)
         result_ref = valid_func.options(**opts).remote()
@@ -31,3 +146,160 @@ class TestTaskOptions(unittest.TestCase):
         result_ref = throwing_func.options(**opts).remote()
 
         self.assertRaises(ConnectionAbortedError, lambda: ray.get(result_ref))
+
+    def test_get_merge_task_options_memory_logs_enabled_sanity(self):
+        test_index = 0
+        test_hb_group_idx = 0
+        test_debug_memory_params = {"merge_task_index": test_index}
+        test_estimate_memory_params = self._make_estimate_resource_params()
+        test_ray_custom_resources = {}
+        test_rcf = self.make_round_completion_info()
+        test_manifest = self._make_manifest()
+        expected_task_opts = {
+            "max_retries": 3,
+            "memory": 1680.64,
+            "num_cpus": 0.01,
+            "scheduling_strategy": "SPREAD",
+        }
+        expected_previous_inflation = 1.0
+        expected_average_record_size = 1.0
+        with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
+            # At least one log of level DEBUG must be emitted
+            actual_merge_tasks_opts = _get_merge_task_options(
+                index=test_index,
+                hb_group_idx=test_hb_group_idx,
+                data_size=1,
+                pk_size_bytes=1,
+                num_rows=1,
+                num_hash_groups=1,
+                total_memory_buffer_percentage=1,
+                incremental_index_array_size=1,
+                debug_memory_params=test_debug_memory_params,
+                ray_custom_resources=test_ray_custom_resources,
+                estimate_resources_params=test_estimate_memory_params,
+                round_completion_info=test_rcf,
+                compacted_delta_manifest=test_manifest,
+                memory_logs_enabled=True,
+            )
+        assert {k: actual_merge_tasks_opts[k] for k in expected_task_opts}
+        log_message_round_completion_info = cm.records[0].getMessage()
+        log_message_debug_memory_params = cm.records[1].getMessage()
+        self.assertIn(
+            f"[Merge task {test_index}]: Using previous compaction rounds to calculate merge memory",
+            log_message_round_completion_info,
+        )
+        self.assertIn(
+            f"[Merge task {test_index}]: Params used for calculating merge memory",
+            log_message_debug_memory_params,
+        )
+        self.assertIn(
+            f"'previous_inflation': {expected_previous_inflation}",
+            log_message_debug_memory_params,
+        )
+        self.assertIn(
+            f"'average_record_size': {expected_average_record_size}",
+            log_message_debug_memory_params,
+        )
+
+    def test_get_merge_task_options_memory_logs_enabled_fallback_previous_inflation_fallback_average_record_size(
+        self,
+    ):
+        test_index = 0
+        test_hb_group_idx = 0
+        test_debug_memory_params = {"merge_task_index": test_index}
+        test_estimate_memory_params = self._make_estimate_resource_params()
+        test_ray_custom_resources = {}
+        test_rcf = self.make_round_completion_info(
+            bytes_written=0, records_written=0, files_written=0, rows_dropped=0
+        )
+        test_manifest = self._make_manifest()
+        expected_task_opts = {
+            "max_retries": 3,
+            "memory": 1680.64,
+            "num_cpus": 0.01,
+            "scheduling_strategy": "SPREAD",
+        }
+        expected_previous_inflation = PYARROW_INFLATION_MULTIPLIER
+        expected_average_record_size = DEFAULT_AVERAGE_RECORD_SIZE_BYTES
+        with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
+            # At least one log of level DEBUG must be emitted
+            actual_merge_tasks_opts = _get_merge_task_options(
+                index=test_index,
+                hb_group_idx=test_hb_group_idx,
+                data_size=1,
+                pk_size_bytes=1,
+                num_rows=1,
+                num_hash_groups=1,
+                total_memory_buffer_percentage=1,
+                incremental_index_array_size=1,
+                debug_memory_params=test_debug_memory_params,
+                ray_custom_resources=test_ray_custom_resources,
+                estimate_resources_params=test_estimate_memory_params,
+                round_completion_info=test_rcf,
+                compacted_delta_manifest=test_manifest,
+                memory_logs_enabled=True,
+            )
+        assert {k: actual_merge_tasks_opts[k] for k in expected_task_opts}
+        log_message_round_completion_info = cm.records[0].getMessage()
+        log_message_debug_memory_params = cm.records[1].getMessage()
+        self.assertIn(
+            f"[Merge task {test_index}]: Using previous compaction rounds to calculate merge memory",
+            log_message_round_completion_info,
+        )
+        self.assertIn(
+            f"[Merge task {test_index}]: Params used for calculating merge memory",
+            log_message_debug_memory_params,
+        )
+        self.assertIn(
+            f"'previous_inflation': {expected_previous_inflation}",
+            log_message_debug_memory_params,
+        )
+        self.assertIn(
+            f"'average_record_size': {expected_average_record_size}",
+            log_message_debug_memory_params,
+        )
+
+    def test_get_merge_task_options_memory_logs_enabled_not_using_previous_round_completion_info(
+        self,
+    ):
+        test_index = 0
+        test_hb_group_idx = 0
+        test_debug_memory_params = {"merge_task_index": test_index}
+        test_estimate_memory_params = self._make_estimate_resource_params()
+        test_ray_custom_resources = {}
+        test_rcf = None
+        test_manifest = self._make_manifest()
+        expected_task_opts = {
+            "max_retries": 3,
+            "memory": 1680.64,
+            "num_cpus": 0.01,
+            "scheduling_strategy": "SPREAD",
+        }
+        with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
+            # At least one log of level DEBUG must be emitted
+            actual_merge_tasks_opts = _get_merge_task_options(
+                index=test_index,
+                hb_group_idx=test_hb_group_idx,
+                data_size=1,
+                pk_size_bytes=1,
+                num_rows=1,
+                num_hash_groups=1,
+                total_memory_buffer_percentage=1,
+                incremental_index_array_size=1,
+                debug_memory_params=test_debug_memory_params,
+                ray_custom_resources=test_ray_custom_resources,
+                estimate_resources_params=test_estimate_memory_params,
+                round_completion_info=test_rcf,
+                compacted_delta_manifest=test_manifest,
+                memory_logs_enabled=True,
+            )
+        assert {k: actual_merge_tasks_opts[k] for k in expected_task_opts}
+        log_message_debug_memory_params = cm.records[0].getMessage()
+        self.assertIn(
+            f"[Merge task {test_index}]: Params used for calculating merge memory",
+            log_message_debug_memory_params,
+        )
+        self.assertNotIn(
+            "'average_record_size'",
+            log_message_debug_memory_params,
+        )
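These tests capture the conditional debug output with unittest's assertLogs context manager and then assert against individual captured records. A self-contained sketch of that pattern, using a throwaway logger name rather than deltacat's real logger:

# Minimal sketch of the assertLogs pattern used above; the "sketch" logger
# name is a placeholder, not part of deltacat.
import logging
import unittest


class AssertLogsPatternTest(unittest.TestCase):
    def test_captures_debug_records(self):
        log = logging.getLogger("sketch")
        with self.assertLogs(logger=log.name, level="DEBUG") as cm:
            log.debug("[Merge task 0]: Params used for calculating merge memory: {}")
        # cm.records holds every captured LogRecord, in emission order.
        self.assertIn("merge memory", cm.records[0].getMessage())


if __name__ == "__main__":
    unittest.main()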
deltacat/tests/utils/test_pyarrow.py
CHANGED
@@ -2,9 +2,12 @@ from unittest import TestCase
 from deltacat.utils.pyarrow import (
     s3_partial_parquet_file_to_table,
     pyarrow_read_csv,
+    ContentTypeValidationError,
     content_type_to_reader_kwargs,
     _add_column_kwargs,
+    logger,
     s3_file_to_table,
+    s3_file_to_parquet,
     ReadKwargsProviderPyArrowSchemaOverride,
     RAISE_ON_EMPTY_CSV_KWARG,
     RAISE_ON_DECIMAL_OVERFLOW,
@@ -435,7 +438,7 @@ class TestReadCSV(TestCase):
             pa.lib.ArrowInvalid,
             lambda: pyarrow_read_csv(
                 OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH,
-                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True},
             ),
         )
 
@@ -479,7 +482,7 @@ class TestReadCSV(TestCase):
             pa.lib.ArrowInvalid,
             lambda: pyarrow_read_csv(
                 OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
-                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True},
            ),
         )
 
@@ -590,7 +593,7 @@ class TestReadCSV(TestCase):
             pa.lib.ArrowNotImplementedError,
             lambda: pyarrow_read_csv(
                 OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
-                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True},
             ),
         )
 
@@ -818,8 +821,11 @@ class TestS3FileToTable(TestCase):
         schema = pa.schema(
             [("is_active", pa.string()), ("ship_datetime_utc", pa.timestamp("us"))]
         )
-
         # OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG has no effect on uTSV files
+        pa_kwargs_provider = lambda content_type, kwargs: {
+            "reader_type": "pyarrow",
+            **kwargs,
+        }
         pa_kwargs_provider = lambda content_type, kwargs: {
             "reader_type": "pyarrow",
             OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
@@ -864,3 +870,99 @@ class TestS3FileToTable(TestCase):
         schema = result.schema
         schema_index = schema.get_field_index("n_legs")
         self.assertEqual(schema.field(schema_index).type, "int64")
+
+
+class TestS3FileToParquet(TestCase):
+    def test_s3_file_to_parquet_sanity(self):
+        test_s3_url = PARQUET_FILE_PATH
+        test_content_type = ContentType.PARQUET.value
+        test_content_encoding = ContentEncoding.IDENTITY.value
+        pa_kwargs_provider = lambda content_type, kwargs: {
+            "reader_type": "pyarrow",
+            **kwargs,
+        }
+        with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
+            result_parquet_file: ParquetFile = s3_file_to_parquet(
+                test_s3_url,
+                test_content_type,
+                test_content_encoding,
+                ["n_legs", "animal"],
+                ["n_legs"],
+                pa_read_func_kwargs_provider=pa_kwargs_provider,
+            )
+        log_message_log_args = cm.records[0].getMessage()
+        log_message_presanitize_kwargs = cm.records[1].getMessage()
+        self.assertIn(
+            f"Reading {test_s3_url} to PyArrow ParquetFile. Content type: {test_content_type}. Encoding: {test_content_encoding}",
+            log_message_log_args,
+        )
+        self.assertIn("{'reader_type': 'pyarrow'}", log_message_presanitize_kwargs)
+        for index, field in enumerate(result_parquet_file.schema_arrow):
+            self.assertEqual(
+                field.name, result_parquet_file.schema_arrow.field(index).name
+            )
+        self.assertEqual(result_parquet_file.schema_arrow.field(0).type, "int64")
+
+    def test_s3_file_to_parquet_when_parquet_gzip_encoding_and_overridden_returns_success(
+        self,
+    ):
+        test_s3_url = PARQUET_FILE_PATH
+        test_content_type = ContentType.PARQUET.value
+        test_content_encoding = ContentEncoding.GZIP.value
+        pa_kwargs_provider = lambda content_type, kwargs: {
+            "reader_type": "pyarrow",
+            OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
+            **kwargs,
+        }
+        with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
+            result_parquet_file: ParquetFile = s3_file_to_parquet(
+                test_s3_url,
+                test_content_type,
+                test_content_encoding,
+                ["n_legs", "animal"],
+                ["n_legs"],
+                pa_read_func_kwargs_provider=pa_kwargs_provider,
+            )
+        log_message_log_args = cm.records[0].getMessage()
+        log_message_log_new_content_encoding = cm.records[1].getMessage()
+        log_message_presanitize_kwargs = cm.records[2].getMessage()
+        self.assertIn(
+            f"Reading {test_s3_url} to PyArrow ParquetFile. Content type: {test_content_type}. Encoding: {test_content_encoding}",
+            log_message_log_args,
+        )
+        self.assertIn(
+            f"Overriding {test_s3_url} content encoding from {ContentEncoding.GZIP.value} to {ContentEncoding.IDENTITY.value}",
+            log_message_log_new_content_encoding,
+        )
+        self.assertIn("{'reader_type': 'pyarrow'}", log_message_presanitize_kwargs)
+        for index, field in enumerate(result_parquet_file.schema_arrow):
+            self.assertEqual(
+                field.name, result_parquet_file.schema_arrow.field(index).name
+            )
+        self.assertEqual(result_parquet_file.schema_arrow.field(0).type, "int64")
+
+    def test_s3_file_to_parquet_when_parquet_gzip_encoding_not_overridden_throws_error(
+        self,
+    ):
+        test_s3_url = PARQUET_FILE_PATH
+        test_content_type = ContentType.PARQUET.value
+        test_content_encoding = ContentEncoding.GZIP.value
+        pa_kwargs_provider = lambda content_type, kwargs: {
+            "reader_type": "pyarrow",
+            **kwargs,
+        }
+        with self.assertRaises(ContentTypeValidationError):
+            with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
+                s3_file_to_parquet(
+                    test_s3_url,
+                    test_content_type,
+                    test_content_encoding,
+                    ["n_legs", "animal"],
+                    ["n_legs"],
+                    pa_read_func_kwargs_provider=pa_kwargs_provider,
+                )
+        log_message_log_args = cm.records[0].getMessage()
+        self.assertIn(
+            f"Reading {test_s3_url} to PyArrow ParquetFile. Content type: {test_content_type}. Encoding: {test_content_encoding}",
+            log_message_log_args,
+        )
deltacat/utils/pyarrow.py
CHANGED
@@ -617,7 +617,18 @@ def s3_file_to_parquet(
         f"Reading {s3_url} to PyArrow ParquetFile. "
         f"Content type: {content_type}. Encoding: {content_encoding}"
     )
+    kwargs = {}
+    if pa_read_func_kwargs_provider:
+        kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
 
+    if OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG in kwargs:
+        new_content_encoding = kwargs.pop(OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG)
+        if content_type == ContentType.PARQUET.value:
+            logger.debug(
+                f"Overriding {s3_url} content encoding from {content_encoding} "
+                f"to {new_content_encoding}"
+            )
+            content_encoding = new_content_encoding
     if (
         content_type != ContentType.PARQUET.value
         or content_encoding != ContentEncoding.IDENTITY
@@ -630,15 +641,10 @@ def s3_file_to_parquet(
     if s3_client_kwargs is None:
         s3_client_kwargs = {}
 
-    kwargs = {}
-
     if s3_url.startswith("s3://"):
         s3_file_system = create_s3_file_system(s3_client_kwargs)
         kwargs["filesystem"] = s3_file_system
 
-    if pa_read_func_kwargs_provider:
-        kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
-
     logger.debug(f"Pre-sanitize kwargs for {s3_url}: {kwargs}")
 
     kwargs = sanitize_kwargs_to_callable(ParquetFile.__init__, kwargs)
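The effect of this reordering in s3_file_to_parquet is that the kwargs provider now runs before content-type validation, so OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG can rewrite a mislabeled encoding before the Parquet-plus-identity check raises ContentTypeValidationError. A usage sketch mirroring the new test; the bucket URL is a hypothetical placeholder:

# Sketch only: opens a Parquet file whose catalog metadata mislabels its
# content encoding as GZIP. The S3 URL below is a placeholder.
from pyarrow.parquet import ParquetFile

from deltacat.types.media import ContentEncoding, ContentType
from deltacat.utils.pyarrow import (
    OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG,
    s3_file_to_parquet,
)

# The provider runs before content-type validation, so the override kwarg can
# rewrite the encoding to IDENTITY before the Parquet/identity check fires.
pa_kwargs_provider = lambda content_type, kwargs: {
    OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
    **kwargs,
}

parquet_file: ParquetFile = s3_file_to_parquet(
    "s3://my-bucket/mislabeled-file.parquet",  # placeholder URL
    ContentType.PARQUET.value,
    ContentEncoding.GZIP.value,  # mislabeled encoding, overridden by the provider
    pa_read_func_kwargs_provider=pa_kwargs_provider,
)
print(parquet_file.schema_arrow)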
{deltacat-1.1.30.dist-info → deltacat-1.1.32.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-deltacat/__init__.py,sha256=
+deltacat/__init__.py,sha256=amNk91Zxauag8dm3s8SuUKinWdeAA2EaiWG9_SdboQE,1778
 deltacat/constants.py,sha256=TUJLXUJ9xq1Ryil72yLkKR8EDH_Irp5wUg56QstbRNE,2181
 deltacat/exceptions.py,sha256=7sjk3BuMY5Oo-6OvAfHncZx_OcvtEL47BblWr2F7waE,12740
 deltacat/logs.py,sha256=EQSDin1deehzz5xlLV1_TrFJrO_IBZ9Ahp7MdL-4cK8,9363
@@ -77,7 +77,7 @@ deltacat/compute/compactor_v2/utils/delta.py,sha256=I7Yvda8NVbpKXG3nM2Ku1utvR2r2
 deltacat/compute/compactor_v2/utils/io.py,sha256=3m4dorxj-WD6Yu9_3gRE6gz3C-eNJA7nn02sHKwo-J8,6018
 deltacat/compute/compactor_v2/utils/merge.py,sha256=EV_iKhNc3WflgfLW1Q46dXUvyClx8VebWHGtninEfsI,5311
 deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=SbQ97M1Cxld-zZik2QMSzlj20g6JlENaQx_0PhlCIP8,12034
-deltacat/compute/compactor_v2/utils/task_options.py,sha256=
+deltacat/compute/compactor_v2/utils/task_options.py,sha256=0GoB_DLkCN1q8CVKTlWlDYt55qnpTDIa9fPyXJwB-cU,13801
 deltacat/compute/merge_on_read/__init__.py,sha256=ckbgngmqPjYBYz_NySsR1vNTOb_hNpeL1sYkZKvBI9M,214
 deltacat/compute/merge_on_read/daft.py,sha256=1oC38u5ig_aTrq7EzyWBo8Ui54rb6yERYMk-vEFbpxM,1400
 deltacat/compute/merge_on_read/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -156,7 +156,7 @@ deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=y8nNHq9ADH
 deltacat/tests/compute/compactor_v2/test_hashlib.py,sha256=8csF2hFWtBvY2MbX3-6iphCsVXxRp0zP1NTnKhfdmkg,328
 deltacat/tests/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py,sha256=aFb9rzT_EK9k8qAMHPtpqd5btyEmll1So1loDmZkotQ,1769
-deltacat/tests/compute/compactor_v2/utils/test_task_options.py,sha256=
+deltacat/tests/compute/compactor_v2/utils/test_task_options.py,sha256=YDQKUKv3Vv8S1fe0YQmjHTrwnWSliqKHIWGu0fEdKnI,11478
 deltacat/tests/compute/resource_estimation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/tests/compute/resource_estimation/test_delta.py,sha256=HCL2oUnCqm0E26T3HLJjMhoAsHTJIWPYGwIKRgM_H7E,25712
 deltacat/tests/compute/resource_estimation/test_manifest.py,sha256=yrMvqDjolExdRf6Vtg5XaKDuaKz9ok15PCZ7_aJOYrI,32893
@@ -180,7 +180,7 @@ deltacat/tests/utils/test_cloudpickle.py,sha256=J0pnBY3-PxlUh6MamZAN1PuquKQPr2iy
 deltacat/tests/utils/test_daft.py,sha256=kY8lkXoQvyWunok8UvOsh1An297rb3jcnstTuIAyAlc,8232
 deltacat/tests/utils/test_metrics.py,sha256=Ym9nOz1EtB180pLmvugihj1sDTNDMb5opIjjr5Nmcls,16339
 deltacat/tests/utils/test_placement.py,sha256=g61wVOMkHe4YJeR9Oxg_BOVQ6bhHHbC3IBYv8YhUu94,597
-deltacat/tests/utils/test_pyarrow.py,sha256=
+deltacat/tests/utils/test_pyarrow.py,sha256=tuh6HzQOuAHPFxK5Mhgjjdm76Z9Z72H3MZPcJ4RnZn8,37372
 deltacat/tests/utils/test_record_batch_tables.py,sha256=AkG1WyljQmjnl-AxhbFWyo5LnMIKRyLScfgC2B_ES-s,11321
 deltacat/tests/utils/test_resources.py,sha256=HtpvDrfPZQNtGDXUlsIzc_yd7Vf1cDscZ3YbN0oTvO8,2560
 deltacat/tests/utils/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -201,7 +201,7 @@ deltacat/utils/numpy.py,sha256=SpHKKvC-K8NINTWGVfTZ5-gBFTGYqaXjjgKFhsdUjwg,2049
 deltacat/utils/pandas.py,sha256=q99mlRB7tymICMcNbfGLfLqFu_C-feyPZKZm2CWJJVc,9574
 deltacat/utils/performance.py,sha256=7ZLaMkS1ehPSIhT5uOQVBHvjC70iKHzoFquFo-KL0PI,645
 deltacat/utils/placement.py,sha256=Lj20fb-eq8rgMdm_M2MBMfDLwhDM1sS1nJj2DvIK56s,12060
-deltacat/utils/pyarrow.py,sha256=
+deltacat/utils/pyarrow.py,sha256=MFCsHJKapqrhaaBeVAvwR2F1MglsNNhVZeCbk7YIdyI,35266
 deltacat/utils/resources.py,sha256=Ax1OgLLbZI4oYpp4Ki27OLaST-7I-AJgZwU87FVfY8g,8253
 deltacat/utils/s3fs.py,sha256=PmUJ5Fm1WmD-_zp_M6yd9VbXvIoJuBeK6ApOdJJApLE,662
 deltacat/utils/schema.py,sha256=m4Wm4ZQcpttzOUxex4dVneGlHy1_E36HspTcjNYzvVM,1564
@@ -211,8 +211,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
 deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
 deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
 deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
-deltacat-1.1.
-deltacat-1.1.
-deltacat-1.1.
-deltacat-1.1.
-deltacat-1.1.
+deltacat-1.1.32.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+deltacat-1.1.32.dist-info/METADATA,sha256=KqU11gn6r8cnfoyKq4_C8widB7w_wdmfN_ikhHjSZfI,1733
+deltacat-1.1.32.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+deltacat-1.1.32.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
+deltacat-1.1.32.dist-info/RECORD,,
{deltacat-1.1.30.dist-info → deltacat-1.1.32.dist-info}/LICENSE
File without changes
{deltacat-1.1.30.dist-info → deltacat-1.1.32.dist-info}/WHEEL
File without changes
{deltacat-1.1.30.dist-info → deltacat-1.1.32.dist-info}/top_level.txt
File without changes