deltacat 1.1.29__py3-none-any.whl → 1.1.31__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor_v2/utils/task_options.py +43 -23
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +273 -1
- deltacat/tests/utils/test_pyarrow.py +52 -0
- deltacat/utils/pyarrow.py +10 -0
- {deltacat-1.1.29.dist-info → deltacat-1.1.31.dist-info}/METADATA +1 -1
- {deltacat-1.1.29.dist-info → deltacat-1.1.31.dist-info}/RECORD +10 -10
- {deltacat-1.1.29.dist-info → deltacat-1.1.31.dist-info}/LICENSE +0 -0
- {deltacat-1.1.29.dist-info → deltacat-1.1.31.dist-info}/WHEEL +0 -0
- {deltacat-1.1.29.dist-info → deltacat-1.1.31.dist-info}/top_level.txt +0 -0
deltacat/__init__.py
CHANGED
@@ -1,11 +1,16 @@
|
|
1
1
|
import logging
|
2
2
|
from typing import Dict, Optional, List, Tuple, Any
|
3
3
|
from deltacat import logs
|
4
|
+
from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
|
5
|
+
from deltacat.compute.compactor_v2.constants import (
|
6
|
+
AVERAGE_RECORD_SIZE_BYTES as DEFAULT_AVERAGE_RECORD_SIZE_BYTES,
|
7
|
+
)
|
4
8
|
from deltacat.compute.compactor_v2.model.merge_file_group import (
|
5
9
|
LocalMergeFileGroupsProvider,
|
6
10
|
)
|
7
11
|
from deltacat.storage import (
|
8
12
|
Manifest,
|
13
|
+
ManifestEntry,
|
9
14
|
interface as unimplemented_deltacat_storage,
|
10
15
|
)
|
11
16
|
from deltacat.compute.compactor.model.delta_annotated import DeltaAnnotated
|
@@ -81,16 +86,27 @@ def _get_merge_task_options(
|
|
81
86
|
and compacted_delta_manifest
|
82
87
|
and round_completion_info.hb_index_to_entry_range
|
83
88
|
):
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
89
|
+
logger.debug_conditional(
|
90
|
+
f"[Merge task {index}]: Using previous compaction rounds to calculate merge memory: {round_completion_info.compacted_pyarrow_write_result}",
|
91
|
+
memory_logs_enabled,
|
92
|
+
)
|
93
|
+
previous_inflation: float = (
|
94
|
+
(
|
95
|
+
round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
|
96
|
+
/ round_completion_info.compacted_pyarrow_write_result.file_bytes
|
97
|
+
)
|
98
|
+
if round_completion_info.compacted_pyarrow_write_result.file_bytes
|
99
|
+
else PYARROW_INFLATION_MULTIPLIER
|
88
100
|
)
|
89
101
|
debug_memory_params["previous_inflation"] = previous_inflation
|
90
102
|
|
91
|
-
average_record_size = (
|
92
|
-
|
93
|
-
|
103
|
+
average_record_size: float = (
|
104
|
+
(
|
105
|
+
round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
|
106
|
+
/ round_completion_info.compacted_pyarrow_write_result.records
|
107
|
+
)
|
108
|
+
if round_completion_info.compacted_pyarrow_write_result.records
|
109
|
+
else DEFAULT_AVERAGE_RECORD_SIZE_BYTES
|
94
110
|
)
|
95
111
|
debug_memory_params["average_record_size"] = average_record_size
|
96
112
|
|
@@ -106,31 +122,36 @@ def _get_merge_task_options(
|
|
106
122
|
str(hb_idx)
|
107
123
|
]
|
108
124
|
for entry_index in range(entry_start, entry_end):
|
109
|
-
entry = compacted_delta_manifest.entries[entry_index]
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
125
|
+
entry: ManifestEntry = compacted_delta_manifest.entries[entry_index]
|
126
|
+
current_entry_size: float = (
|
127
|
+
estimate_manifest_entry_size_bytes(
|
128
|
+
entry=entry,
|
129
|
+
operation_type=OperationType.PYARROW_DOWNLOAD,
|
130
|
+
estimate_resources_params=estimate_resources_params,
|
131
|
+
)
|
132
|
+
or 0.0
|
115
133
|
)
|
116
|
-
current_entry_rows =
|
117
|
-
|
118
|
-
|
119
|
-
|
134
|
+
current_entry_rows: int = (
|
135
|
+
estimate_manifest_entry_num_rows(
|
136
|
+
entry=entry,
|
137
|
+
operation_type=OperationType.PYARROW_DOWNLOAD,
|
138
|
+
estimate_resources_params=estimate_resources_params,
|
139
|
+
)
|
140
|
+
or 0
|
120
141
|
)
|
121
|
-
|
142
|
+
# NOTE: We can treat the current_entry_size and current_entry_rows as 0 as a None estimated entry size implies a 0 value
|
122
143
|
data_size += current_entry_size
|
123
144
|
num_rows += current_entry_rows
|
124
|
-
|
125
145
|
if primary_keys:
|
126
|
-
pk_size
|
146
|
+
pk_size: Optional[
|
147
|
+
float
|
148
|
+
] = estimate_manifest_entry_column_size_bytes(
|
127
149
|
entry=entry,
|
128
150
|
columns=primary_keys,
|
129
151
|
operation_type=OperationType.PYARROW_DOWNLOAD,
|
130
152
|
estimate_resources_params=estimate_resources_params,
|
131
153
|
)
|
132
|
-
|
133
|
-
if pk_size is None:
|
154
|
+
if not pk_size:
|
134
155
|
pk_size_bytes += current_entry_size
|
135
156
|
else:
|
136
157
|
pk_size_bytes += pk_size
|
@@ -159,7 +180,6 @@ def _get_merge_task_options(
|
|
159
180
|
f"[Merge task {index}]: Params used for calculating merge memory: {debug_memory_params}",
|
160
181
|
memory_logs_enabled,
|
161
182
|
)
|
162
|
-
|
163
183
|
return _get_task_options(0.01, total_memory, ray_custom_resources)
|
164
184
|
|
165
185
|
|
@@ -1,6 +1,37 @@
|
|
1
1
|
import unittest
|
2
2
|
import ray
|
3
|
-
from deltacat.compute.compactor_v2.utils.task_options import
|
3
|
+
from deltacat.compute.compactor_v2.utils.task_options import (
|
4
|
+
_get_task_options,
|
5
|
+
_get_merge_task_options,
|
6
|
+
logger,
|
7
|
+
)
|
8
|
+
from deltacat.compute.resource_estimation.model import (
|
9
|
+
EstimateResourcesParams,
|
10
|
+
ResourceEstimationMethod,
|
11
|
+
)
|
12
|
+
from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
|
13
|
+
from deltacat.compute.compactor import (
|
14
|
+
PyArrowWriteResult,
|
15
|
+
RoundCompletionInfo,
|
16
|
+
)
|
17
|
+
from deltacat.types.media import (
|
18
|
+
ContentType,
|
19
|
+
ContentEncoding,
|
20
|
+
)
|
21
|
+
from deltacat.storage import (
|
22
|
+
DeltaLocator,
|
23
|
+
Manifest,
|
24
|
+
ManifestMeta,
|
25
|
+
ManifestEntry,
|
26
|
+
ManifestEntryList,
|
27
|
+
PartitionValues,
|
28
|
+
)
|
29
|
+
from unittest.mock import MagicMock
|
30
|
+
from typing import Optional
|
31
|
+
|
32
|
+
from deltacat.compute.compactor_v2.constants import (
|
33
|
+
AVERAGE_RECORD_SIZE_BYTES as DEFAULT_AVERAGE_RECORD_SIZE_BYTES,
|
34
|
+
)
|
4
35
|
|
5
36
|
|
6
37
|
@ray.remote
|
@@ -14,11 +45,95 @@ def throwing_func():
|
|
14
45
|
|
15
46
|
|
16
47
|
class TestTaskOptions(unittest.TestCase):
|
48
|
+
TEST_INDEX = 0
|
49
|
+
TEST_HB_GROUP_IDX = 0
|
50
|
+
TEST_STREAM_POSITION = 1_000_000
|
51
|
+
TEST_NUM_HASH_GROUPS = 1
|
52
|
+
|
17
53
|
@classmethod
|
18
54
|
def setUpClass(cls):
|
19
55
|
ray.init(local_mode=True, ignore_reinit_error=True)
|
20
56
|
super().setUpClass()
|
21
57
|
|
58
|
+
@classmethod
|
59
|
+
def tearDownClass(cls) -> None:
|
60
|
+
ray.shutdown()
|
61
|
+
|
62
|
+
def _make_estimate_resource_params(
|
63
|
+
cls,
|
64
|
+
resource_estimation_method: Optional[
|
65
|
+
ResourceEstimationMethod
|
66
|
+
] = ResourceEstimationMethod.DEFAULT,
|
67
|
+
previous_inflation: Optional[int] = 7,
|
68
|
+
average_record_size_bytes: Optional[int] = 1000,
|
69
|
+
):
|
70
|
+
return EstimateResourcesParams.of(
|
71
|
+
resource_estimation_method=resource_estimation_method,
|
72
|
+
previous_inflation=previous_inflation,
|
73
|
+
average_record_size_bytes=average_record_size_bytes,
|
74
|
+
)
|
75
|
+
|
76
|
+
def _make_manifest(
|
77
|
+
self,
|
78
|
+
source_content_length: Optional[int] = 1000,
|
79
|
+
content_type: Optional[ContentType] = ContentType.PARQUET,
|
80
|
+
content_encoding: Optional[ContentEncoding] = ContentEncoding.IDENTITY,
|
81
|
+
partition_values: Optional[PartitionValues] = None,
|
82
|
+
uri: Optional[str] = "test",
|
83
|
+
url: Optional[str] = "test",
|
84
|
+
author: Optional[str] = "foo",
|
85
|
+
entry_uuid: Optional[str] = "foo",
|
86
|
+
manifest_uuid: Optional[str] = "bar",
|
87
|
+
) -> Manifest:
|
88
|
+
meta = ManifestMeta.of(
|
89
|
+
10,
|
90
|
+
10,
|
91
|
+
content_type=content_type,
|
92
|
+
content_encoding=content_encoding,
|
93
|
+
source_content_length=source_content_length,
|
94
|
+
partition_values=partition_values,
|
95
|
+
)
|
96
|
+
|
97
|
+
return Manifest.of(
|
98
|
+
entries=ManifestEntryList.of(
|
99
|
+
[
|
100
|
+
ManifestEntry.of(
|
101
|
+
uri=uri, url=url, meta=meta, mandatory=True, uuid=entry_uuid
|
102
|
+
)
|
103
|
+
]
|
104
|
+
),
|
105
|
+
author=author,
|
106
|
+
uuid=manifest_uuid,
|
107
|
+
)
|
108
|
+
|
109
|
+
def make_round_completion_info(
|
110
|
+
self,
|
111
|
+
high_watermark: Optional[int] = 1_000_000,
|
112
|
+
compacted_delta_locator: Optional[DeltaLocator] = None,
|
113
|
+
records_written: Optional[int] = 10,
|
114
|
+
bytes_written: Optional[int] = 10,
|
115
|
+
files_written: Optional[int] = 10,
|
116
|
+
rows_dropped: Optional[int] = 10,
|
117
|
+
sort_keys_bit_width: Optional[int] = 0,
|
118
|
+
hash_bucket_count: Optional[int] = 1,
|
119
|
+
hb_index_to_entry_range: Optional[dict] = None,
|
120
|
+
) -> RoundCompletionInfo:
|
121
|
+
if compacted_delta_locator is None:
|
122
|
+
compacted_delta_locator = MagicMock(spec=DeltaLocator)
|
123
|
+
|
124
|
+
hb_index_to_entry_range = hb_index_to_entry_range or {"0": (0, 1)}
|
125
|
+
|
126
|
+
return RoundCompletionInfo.of(
|
127
|
+
compacted_delta_locator=compacted_delta_locator,
|
128
|
+
high_watermark=high_watermark,
|
129
|
+
compacted_pyarrow_write_result=PyArrowWriteResult.of(
|
130
|
+
records_written, bytes_written, files_written, rows_dropped
|
131
|
+
),
|
132
|
+
sort_keys_bit_width=sort_keys_bit_width,
|
133
|
+
hb_index_to_entry_range=hb_index_to_entry_range,
|
134
|
+
hash_bucket_count=hash_bucket_count,
|
135
|
+
)
|
136
|
+
|
22
137
|
def test_get_task_options_sanity(self):
|
23
138
|
opts = _get_task_options(0.01, 0.01)
|
24
139
|
result_ref = valid_func.options(**opts).remote()
|
@@ -31,3 +146,160 @@ class TestTaskOptions(unittest.TestCase):
|
|
31
146
|
result_ref = throwing_func.options(**opts).remote()
|
32
147
|
|
33
148
|
self.assertRaises(ConnectionAbortedError, lambda: ray.get(result_ref))
|
149
|
+
|
150
|
+
def test_get_merge_task_options_memory_logs_enabled_sanity(self):
|
151
|
+
test_index = 0
|
152
|
+
test_hb_group_idx = 0
|
153
|
+
test_debug_memory_params = {"merge_task_index": test_index}
|
154
|
+
test_estimate_memory_params = self._make_estimate_resource_params()
|
155
|
+
test_ray_custom_resources = {}
|
156
|
+
test_rcf = self.make_round_completion_info()
|
157
|
+
test_manifest = self._make_manifest()
|
158
|
+
expected_task_opts = {
|
159
|
+
"max_retries": 3,
|
160
|
+
"memory": 1680.64,
|
161
|
+
"num_cpus": 0.01,
|
162
|
+
"scheduling_strategy": "SPREAD",
|
163
|
+
}
|
164
|
+
expected_previous_inflation = 1.0
|
165
|
+
expected_average_record_size = 1.0
|
166
|
+
with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
|
167
|
+
# At least one log of level DEBUG must be emitted
|
168
|
+
actual_merge_tasks_opts = _get_merge_task_options(
|
169
|
+
index=test_index,
|
170
|
+
hb_group_idx=test_hb_group_idx,
|
171
|
+
data_size=1,
|
172
|
+
pk_size_bytes=1,
|
173
|
+
num_rows=1,
|
174
|
+
num_hash_groups=1,
|
175
|
+
total_memory_buffer_percentage=1,
|
176
|
+
incremental_index_array_size=1,
|
177
|
+
debug_memory_params=test_debug_memory_params,
|
178
|
+
ray_custom_resources=test_ray_custom_resources,
|
179
|
+
estimate_resources_params=test_estimate_memory_params,
|
180
|
+
round_completion_info=test_rcf,
|
181
|
+
compacted_delta_manifest=test_manifest,
|
182
|
+
memory_logs_enabled=True,
|
183
|
+
)
|
184
|
+
assert {k: actual_merge_tasks_opts[k] for k in expected_task_opts}
|
185
|
+
log_message_round_completion_info = cm.records[0].getMessage()
|
186
|
+
log_message_debug_memory_params = cm.records[1].getMessage()
|
187
|
+
self.assertIn(
|
188
|
+
f"[Merge task {test_index}]: Using previous compaction rounds to calculate merge memory",
|
189
|
+
log_message_round_completion_info,
|
190
|
+
)
|
191
|
+
self.assertIn(
|
192
|
+
f"[Merge task {test_index}]: Params used for calculating merge memory",
|
193
|
+
log_message_debug_memory_params,
|
194
|
+
)
|
195
|
+
self.assertIn(
|
196
|
+
f"'previous_inflation': {expected_previous_inflation}",
|
197
|
+
log_message_debug_memory_params,
|
198
|
+
)
|
199
|
+
self.assertIn(
|
200
|
+
f"'average_record_size': {expected_average_record_size}",
|
201
|
+
log_message_debug_memory_params,
|
202
|
+
)
|
203
|
+
|
204
|
+
def test_get_merge_task_options_memory_logs_enabled_fallback_previous_inflation_fallback_average_record_size(
|
205
|
+
self,
|
206
|
+
):
|
207
|
+
test_index = 0
|
208
|
+
test_hb_group_idx = 0
|
209
|
+
test_debug_memory_params = {"merge_task_index": test_index}
|
210
|
+
test_estimate_memory_params = self._make_estimate_resource_params()
|
211
|
+
test_ray_custom_resources = {}
|
212
|
+
test_rcf = self.make_round_completion_info(
|
213
|
+
bytes_written=0, records_written=0, files_written=0, rows_dropped=0
|
214
|
+
)
|
215
|
+
test_manifest = self._make_manifest()
|
216
|
+
expected_task_opts = {
|
217
|
+
"max_retries": 3,
|
218
|
+
"memory": 1680.64,
|
219
|
+
"num_cpus": 0.01,
|
220
|
+
"scheduling_strategy": "SPREAD",
|
221
|
+
}
|
222
|
+
expected_previous_inflation = PYARROW_INFLATION_MULTIPLIER
|
223
|
+
expected_average_record_size = DEFAULT_AVERAGE_RECORD_SIZE_BYTES
|
224
|
+
with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
|
225
|
+
# At least one log of level DEBUG must be emitted
|
226
|
+
actual_merge_tasks_opts = _get_merge_task_options(
|
227
|
+
index=test_index,
|
228
|
+
hb_group_idx=test_hb_group_idx,
|
229
|
+
data_size=1,
|
230
|
+
pk_size_bytes=1,
|
231
|
+
num_rows=1,
|
232
|
+
num_hash_groups=1,
|
233
|
+
total_memory_buffer_percentage=1,
|
234
|
+
incremental_index_array_size=1,
|
235
|
+
debug_memory_params=test_debug_memory_params,
|
236
|
+
ray_custom_resources=test_ray_custom_resources,
|
237
|
+
estimate_resources_params=test_estimate_memory_params,
|
238
|
+
round_completion_info=test_rcf,
|
239
|
+
compacted_delta_manifest=test_manifest,
|
240
|
+
memory_logs_enabled=True,
|
241
|
+
)
|
242
|
+
assert {k: actual_merge_tasks_opts[k] for k in expected_task_opts}
|
243
|
+
log_message_round_completion_info = cm.records[0].getMessage()
|
244
|
+
log_message_debug_memory_params = cm.records[1].getMessage()
|
245
|
+
self.assertIn(
|
246
|
+
f"[Merge task {test_index}]: Using previous compaction rounds to calculate merge memory",
|
247
|
+
log_message_round_completion_info,
|
248
|
+
)
|
249
|
+
self.assertIn(
|
250
|
+
f"[Merge task {test_index}]: Params used for calculating merge memory",
|
251
|
+
log_message_debug_memory_params,
|
252
|
+
)
|
253
|
+
self.assertIn(
|
254
|
+
f"'previous_inflation': {expected_previous_inflation}",
|
255
|
+
log_message_debug_memory_params,
|
256
|
+
)
|
257
|
+
self.assertIn(
|
258
|
+
f"'average_record_size': {expected_average_record_size}",
|
259
|
+
log_message_debug_memory_params,
|
260
|
+
)
|
261
|
+
|
262
|
+
def test_get_merge_task_options_memory_logs_enabled_not_using_previous_round_completion_info(
|
263
|
+
self,
|
264
|
+
):
|
265
|
+
test_index = 0
|
266
|
+
test_hb_group_idx = 0
|
267
|
+
test_debug_memory_params = {"merge_task_index": test_index}
|
268
|
+
test_estimate_memory_params = self._make_estimate_resource_params()
|
269
|
+
test_ray_custom_resources = {}
|
270
|
+
test_rcf = None
|
271
|
+
test_manifest = self._make_manifest()
|
272
|
+
expected_task_opts = {
|
273
|
+
"max_retries": 3,
|
274
|
+
"memory": 1680.64,
|
275
|
+
"num_cpus": 0.01,
|
276
|
+
"scheduling_strategy": "SPREAD",
|
277
|
+
}
|
278
|
+
with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
|
279
|
+
# At least one log of level DEBUG must be emitted
|
280
|
+
actual_merge_tasks_opts = _get_merge_task_options(
|
281
|
+
index=test_index,
|
282
|
+
hb_group_idx=test_hb_group_idx,
|
283
|
+
data_size=1,
|
284
|
+
pk_size_bytes=1,
|
285
|
+
num_rows=1,
|
286
|
+
num_hash_groups=1,
|
287
|
+
total_memory_buffer_percentage=1,
|
288
|
+
incremental_index_array_size=1,
|
289
|
+
debug_memory_params=test_debug_memory_params,
|
290
|
+
ray_custom_resources=test_ray_custom_resources,
|
291
|
+
estimate_resources_params=test_estimate_memory_params,
|
292
|
+
round_completion_info=test_rcf,
|
293
|
+
compacted_delta_manifest=test_manifest,
|
294
|
+
memory_logs_enabled=True,
|
295
|
+
)
|
296
|
+
assert {k: actual_merge_tasks_opts[k] for k in expected_task_opts}
|
297
|
+
log_message_debug_memory_params = cm.records[0].getMessage()
|
298
|
+
self.assertIn(
|
299
|
+
f"[Merge task {test_index}]: Params used for calculating merge memory",
|
300
|
+
log_message_debug_memory_params,
|
301
|
+
)
|
302
|
+
self.assertNotIn(
|
303
|
+
"'average_record_size'",
|
304
|
+
log_message_debug_memory_params,
|
305
|
+
)
|
@@ -8,6 +8,7 @@ from deltacat.utils.pyarrow import (
|
|
8
8
|
ReadKwargsProviderPyArrowSchemaOverride,
|
9
9
|
RAISE_ON_EMPTY_CSV_KWARG,
|
10
10
|
RAISE_ON_DECIMAL_OVERFLOW,
|
11
|
+
OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG,
|
11
12
|
)
|
12
13
|
import decimal
|
13
14
|
from deltacat.types.media import ContentEncoding, ContentType
|
@@ -812,3 +813,54 @@ class TestS3FileToTable(TestCase):
|
|
812
813
|
schema = result.schema
|
813
814
|
schema_index = schema.get_field_index("n_legs")
|
814
815
|
self.assertEqual(schema.field(schema_index).type, "int64")
|
816
|
+
|
817
|
+
def test_s3_file_to_table_when_utsv_gzip_and_content_type_overridden(self):
|
818
|
+
schema = pa.schema(
|
819
|
+
[("is_active", pa.string()), ("ship_datetime_utc", pa.timestamp("us"))]
|
820
|
+
)
|
821
|
+
|
822
|
+
# OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG has no effect on uTSV files
|
823
|
+
pa_kwargs_provider = lambda content_type, kwargs: {
|
824
|
+
"reader_type": "pyarrow",
|
825
|
+
OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
|
826
|
+
**kwargs,
|
827
|
+
}
|
828
|
+
|
829
|
+
result = s3_file_to_table(
|
830
|
+
GZIP_COMPRESSED_FILE_UTSV_PATH,
|
831
|
+
ContentType.UNESCAPED_TSV.value,
|
832
|
+
ContentEncoding.GZIP.value,
|
833
|
+
["is_active", "ship_datetime_utc"],
|
834
|
+
None,
|
835
|
+
pa_read_func_kwargs_provider=pa_kwargs_provider,
|
836
|
+
)
|
837
|
+
|
838
|
+
self.assertEqual(len(result), 3)
|
839
|
+
self.assertEqual(len(result.column_names), 2)
|
840
|
+
result_schema = result.schema
|
841
|
+
for index, field in enumerate(result_schema):
|
842
|
+
self.assertEqual(field.name, schema.field(index).name)
|
843
|
+
|
844
|
+
self.assertEqual(result.schema.field(0).type, "string")
|
845
|
+
|
846
|
+
def test_s3_file_to_table_when_parquet_gzip_and_encoding_overridden(self):
|
847
|
+
pa_kwargs_provider = lambda content_type, kwargs: {
|
848
|
+
"reader_type": "pyarrow",
|
849
|
+
OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
|
850
|
+
**kwargs,
|
851
|
+
}
|
852
|
+
|
853
|
+
result = s3_file_to_table(
|
854
|
+
PARQUET_FILE_PATH,
|
855
|
+
ContentType.PARQUET.value,
|
856
|
+
ContentEncoding.GZIP.value,
|
857
|
+
["n_legs", "animal"],
|
858
|
+
["n_legs"],
|
859
|
+
pa_read_func_kwargs_provider=pa_kwargs_provider,
|
860
|
+
)
|
861
|
+
|
862
|
+
self.assertEqual(len(result), 6)
|
863
|
+
self.assertEqual(len(result.column_names), 1)
|
864
|
+
schema = result.schema
|
865
|
+
schema_index = schema.get_field_index("n_legs")
|
866
|
+
self.assertEqual(schema.field(schema_index).type, "int64")
|
deltacat/utils/pyarrow.py
CHANGED
@@ -47,6 +47,7 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
|
47
47
|
|
48
48
|
RAISE_ON_EMPTY_CSV_KWARG = "raise_on_empty_csv"
|
49
49
|
READER_TYPE_KWARG = "reader_type"
|
50
|
+
OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG = "override_content_encoding_for_parquet"
|
50
51
|
|
51
52
|
"""
|
52
53
|
By default, round decimal values using half_to_even round mode when
|
@@ -543,6 +544,15 @@ def s3_file_to_table(
|
|
543
544
|
if pa_read_func_kwargs_provider is not None:
|
544
545
|
kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
|
545
546
|
|
547
|
+
if OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG in kwargs:
|
548
|
+
new_content_encoding = kwargs.pop(OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG)
|
549
|
+
if content_type == ContentType.PARQUET.value:
|
550
|
+
logger.debug(
|
551
|
+
f"Overriding {s3_url} content encoding from {content_encoding} "
|
552
|
+
f"to {new_content_encoding}"
|
553
|
+
)
|
554
|
+
content_encoding = new_content_encoding
|
555
|
+
|
546
556
|
if (
|
547
557
|
content_type == ContentType.PARQUET.value
|
548
558
|
and content_encoding == ContentEncoding.IDENTITY.value
|
@@ -1,4 +1,4 @@
|
|
1
|
-
deltacat/__init__.py,sha256=
|
1
|
+
deltacat/__init__.py,sha256=gdOpCNy03T2HEQIQqSqopv0b0UL5pwXWa4McRHxMlAw,1778
|
2
2
|
deltacat/constants.py,sha256=TUJLXUJ9xq1Ryil72yLkKR8EDH_Irp5wUg56QstbRNE,2181
|
3
3
|
deltacat/exceptions.py,sha256=7sjk3BuMY5Oo-6OvAfHncZx_OcvtEL47BblWr2F7waE,12740
|
4
4
|
deltacat/logs.py,sha256=EQSDin1deehzz5xlLV1_TrFJrO_IBZ9Ahp7MdL-4cK8,9363
|
@@ -77,7 +77,7 @@ deltacat/compute/compactor_v2/utils/delta.py,sha256=I7Yvda8NVbpKXG3nM2Ku1utvR2r2
|
|
77
77
|
deltacat/compute/compactor_v2/utils/io.py,sha256=3m4dorxj-WD6Yu9_3gRE6gz3C-eNJA7nn02sHKwo-J8,6018
|
78
78
|
deltacat/compute/compactor_v2/utils/merge.py,sha256=EV_iKhNc3WflgfLW1Q46dXUvyClx8VebWHGtninEfsI,5311
|
79
79
|
deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=SbQ97M1Cxld-zZik2QMSzlj20g6JlENaQx_0PhlCIP8,12034
|
80
|
-
deltacat/compute/compactor_v2/utils/task_options.py,sha256=
|
80
|
+
deltacat/compute/compactor_v2/utils/task_options.py,sha256=0GoB_DLkCN1q8CVKTlWlDYt55qnpTDIa9fPyXJwB-cU,13801
|
81
81
|
deltacat/compute/merge_on_read/__init__.py,sha256=ckbgngmqPjYBYz_NySsR1vNTOb_hNpeL1sYkZKvBI9M,214
|
82
82
|
deltacat/compute/merge_on_read/daft.py,sha256=1oC38u5ig_aTrq7EzyWBo8Ui54rb6yERYMk-vEFbpxM,1400
|
83
83
|
deltacat/compute/merge_on_read/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -156,7 +156,7 @@ deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=y8nNHq9ADH
|
|
156
156
|
deltacat/tests/compute/compactor_v2/test_hashlib.py,sha256=8csF2hFWtBvY2MbX3-6iphCsVXxRp0zP1NTnKhfdmkg,328
|
157
157
|
deltacat/tests/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
158
158
|
deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py,sha256=aFb9rzT_EK9k8qAMHPtpqd5btyEmll1So1loDmZkotQ,1769
|
159
|
-
deltacat/tests/compute/compactor_v2/utils/test_task_options.py,sha256=
|
159
|
+
deltacat/tests/compute/compactor_v2/utils/test_task_options.py,sha256=YDQKUKv3Vv8S1fe0YQmjHTrwnWSliqKHIWGu0fEdKnI,11478
|
160
160
|
deltacat/tests/compute/resource_estimation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
161
161
|
deltacat/tests/compute/resource_estimation/test_delta.py,sha256=HCL2oUnCqm0E26T3HLJjMhoAsHTJIWPYGwIKRgM_H7E,25712
|
162
162
|
deltacat/tests/compute/resource_estimation/test_manifest.py,sha256=yrMvqDjolExdRf6Vtg5XaKDuaKz9ok15PCZ7_aJOYrI,32893
|
@@ -180,7 +180,7 @@ deltacat/tests/utils/test_cloudpickle.py,sha256=J0pnBY3-PxlUh6MamZAN1PuquKQPr2iy
|
|
180
180
|
deltacat/tests/utils/test_daft.py,sha256=kY8lkXoQvyWunok8UvOsh1An297rb3jcnstTuIAyAlc,8232
|
181
181
|
deltacat/tests/utils/test_metrics.py,sha256=Ym9nOz1EtB180pLmvugihj1sDTNDMb5opIjjr5Nmcls,16339
|
182
182
|
deltacat/tests/utils/test_placement.py,sha256=g61wVOMkHe4YJeR9Oxg_BOVQ6bhHHbC3IBYv8YhUu94,597
|
183
|
-
deltacat/tests/utils/test_pyarrow.py,sha256=
|
183
|
+
deltacat/tests/utils/test_pyarrow.py,sha256=JmhcuphXD8B2SLnOgrPgrqCcdHg_BL6IjFAiNRmuA1I,32790
|
184
184
|
deltacat/tests/utils/test_record_batch_tables.py,sha256=AkG1WyljQmjnl-AxhbFWyo5LnMIKRyLScfgC2B_ES-s,11321
|
185
185
|
deltacat/tests/utils/test_resources.py,sha256=HtpvDrfPZQNtGDXUlsIzc_yd7Vf1cDscZ3YbN0oTvO8,2560
|
186
186
|
deltacat/tests/utils/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -201,7 +201,7 @@ deltacat/utils/numpy.py,sha256=SpHKKvC-K8NINTWGVfTZ5-gBFTGYqaXjjgKFhsdUjwg,2049
|
|
201
201
|
deltacat/utils/pandas.py,sha256=q99mlRB7tymICMcNbfGLfLqFu_C-feyPZKZm2CWJJVc,9574
|
202
202
|
deltacat/utils/performance.py,sha256=7ZLaMkS1ehPSIhT5uOQVBHvjC70iKHzoFquFo-KL0PI,645
|
203
203
|
deltacat/utils/placement.py,sha256=Lj20fb-eq8rgMdm_M2MBMfDLwhDM1sS1nJj2DvIK56s,12060
|
204
|
-
deltacat/utils/pyarrow.py,sha256=
|
204
|
+
deltacat/utils/pyarrow.py,sha256=9Dggs8waJrbgP62NG4ssZsl-9fl3cJ4fjYLsJ1HjhHQ,34847
|
205
205
|
deltacat/utils/resources.py,sha256=Ax1OgLLbZI4oYpp4Ki27OLaST-7I-AJgZwU87FVfY8g,8253
|
206
206
|
deltacat/utils/s3fs.py,sha256=PmUJ5Fm1WmD-_zp_M6yd9VbXvIoJuBeK6ApOdJJApLE,662
|
207
207
|
deltacat/utils/schema.py,sha256=m4Wm4ZQcpttzOUxex4dVneGlHy1_E36HspTcjNYzvVM,1564
|
@@ -211,8 +211,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
|
|
211
211
|
deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
|
212
212
|
deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
|
213
213
|
deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
|
214
|
-
deltacat-1.1.
|
215
|
-
deltacat-1.1.
|
216
|
-
deltacat-1.1.
|
217
|
-
deltacat-1.1.
|
218
|
-
deltacat-1.1.
|
214
|
+
deltacat-1.1.31.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
215
|
+
deltacat-1.1.31.dist-info/METADATA,sha256=JrWYw0uKVprpH34i-_cOUYjWI3egRQx0rhCn--OnE_0,1733
|
216
|
+
deltacat-1.1.31.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
217
|
+
deltacat-1.1.31.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
|
218
|
+
deltacat-1.1.31.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|