deltacat 1.1.29__py3-none-any.whl → 1.1.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor_v2/utils/task_options.py +43 -23
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +273 -1
- deltacat/tests/utils/test_pyarrow.py +52 -0
- deltacat/utils/pyarrow.py +10 -0
- {deltacat-1.1.29.dist-info → deltacat-1.1.31.dist-info}/METADATA +1 -1
- {deltacat-1.1.29.dist-info → deltacat-1.1.31.dist-info}/RECORD +10 -10
- {deltacat-1.1.29.dist-info → deltacat-1.1.31.dist-info}/LICENSE +0 -0
- {deltacat-1.1.29.dist-info → deltacat-1.1.31.dist-info}/WHEEL +0 -0
- {deltacat-1.1.29.dist-info → deltacat-1.1.31.dist-info}/top_level.txt +0 -0
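deltacat/__init__.py
CHANGED
[The +1 -1 hunk for this file is not present in this extract; the hunks below belong to task_options.py.]
deltacat/compute/compactor_v2/utils/task_options.py
CHANGED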
@@ -1,11 +1,16 @@
|
|
1
1
|
import logging
|
2
2
|
from typing import Dict, Optional, List, Tuple, Any
|
3
3
|
from deltacat import logs
|
4
|
+
from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
|
5
|
+
from deltacat.compute.compactor_v2.constants import (
|
6
|
+
AVERAGE_RECORD_SIZE_BYTES as DEFAULT_AVERAGE_RECORD_SIZE_BYTES,
|
7
|
+
)
|
4
8
|
from deltacat.compute.compactor_v2.model.merge_file_group import (
|
5
9
|
LocalMergeFileGroupsProvider,
|
6
10
|
)
|
7
11
|
from deltacat.storage import (
|
8
12
|
Manifest,
|
13
|
+
ManifestEntry,
|
9
14
|
interface as unimplemented_deltacat_storage,
|
10
15
|
)
|
11
16
|
from deltacat.compute.compactor.model.delta_annotated import DeltaAnnotated
|
@@ -81,16 +86,27 @@ def _get_merge_task_options(
|
|
81
86
|
and compacted_delta_manifest
|
82
87
|
and round_completion_info.hb_index_to_entry_range
|
83
88
|
):
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
89
|
+
logger.debug_conditional(
|
90
|
+
f"[Merge task {index}]: Using previous compaction rounds to calculate merge memory: {round_completion_info.compacted_pyarrow_write_result}",
|
91
|
+
memory_logs_enabled,
|
92
|
+
)
|
93
|
+
previous_inflation: float = (
|
94
|
+
(
|
95
|
+
round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
|
96
|
+
/ round_completion_info.compacted_pyarrow_write_result.file_bytes
|
97
|
+
)
|
98
|
+
if round_completion_info.compacted_pyarrow_write_result.file_bytes
|
99
|
+
else PYARROW_INFLATION_MULTIPLIER
|
88
100
|
)
|
89
101
|
debug_memory_params["previous_inflation"] = previous_inflation
|
90
102
|
|
91
|
-
average_record_size = (
|
92
|
-
|
93
|
-
|
103
|
+
average_record_size: float = (
|
104
|
+
(
|
105
|
+
round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
|
106
|
+
/ round_completion_info.compacted_pyarrow_write_result.records
|
107
|
+
)
|
108
|
+
if round_completion_info.compacted_pyarrow_write_result.records
|
109
|
+
else DEFAULT_AVERAGE_RECORD_SIZE_BYTES
|
94
110
|
)
|
95
111
|
debug_memory_params["average_record_size"] = average_record_size
|
96
112
|
|
@@ -106,31 +122,36 @@ def _get_merge_task_options(
|
|
106
122
|
str(hb_idx)
|
107
123
|
]
|
108
124
|
for entry_index in range(entry_start, entry_end):
|
109
|
-
entry = compacted_delta_manifest.entries[entry_index]
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
125
|
+
entry: ManifestEntry = compacted_delta_manifest.entries[entry_index]
|
126
|
+
current_entry_size: float = (
|
127
|
+
estimate_manifest_entry_size_bytes(
|
128
|
+
entry=entry,
|
129
|
+
operation_type=OperationType.PYARROW_DOWNLOAD,
|
130
|
+
estimate_resources_params=estimate_resources_params,
|
131
|
+
)
|
132
|
+
or 0.0
|
115
133
|
)
|
116
|
-
current_entry_rows =
|
117
|
-
|
118
|
-
|
119
|
-
|
134
|
+
current_entry_rows: int = (
|
135
|
+
estimate_manifest_entry_num_rows(
|
136
|
+
entry=entry,
|
137
|
+
operation_type=OperationType.PYARROW_DOWNLOAD,
|
138
|
+
estimate_resources_params=estimate_resources_params,
|
139
|
+
)
|
140
|
+
or 0
|
120
141
|
)
|
121
|
-
|
142
|
+
# NOTE: We can treat the current_entry_size and current_entry_rows as 0 as a None estimated entry size implies a 0 value
|
122
143
|
data_size += current_entry_size
|
123
144
|
num_rows += current_entry_rows
|
124
|
-
|
125
145
|
if primary_keys:
|
126
|
-
pk_size
|
146
|
+
pk_size: Optional[
|
147
|
+
float
|
148
|
+
] = estimate_manifest_entry_column_size_bytes(
|
127
149
|
entry=entry,
|
128
150
|
columns=primary_keys,
|
129
151
|
operation_type=OperationType.PYARROW_DOWNLOAD,
|
130
152
|
estimate_resources_params=estimate_resources_params,
|
131
153
|
)
|
132
|
-
|
133
|
-
if pk_size is None:
|
154
|
+
if not pk_size:
|
134
155
|
pk_size_bytes += current_entry_size
|
135
156
|
else:
|
136
157
|
pk_size_bytes += pk_size
|
@@ -159,7 +180,6 @@ def _get_merge_task_options(
|
|
159
180
|
f"[Merge task {index}]: Params used for calculating merge memory: {debug_memory_params}",
|
160
181
|
memory_logs_enabled,
|
161
182
|
)
|
162
|
-
|
163
183
|
return _get_task_options(0.01, total_memory, ray_custom_resources)
|
164
184
|
|
165
185
|
|
deltacat/tests/compute/compactor_v2/utils/test_task_options.py
CHANGED
@@ -1,6 +1,37 @@
|
|
1
1
|
import unittest
|
2
2
|
import ray
|
3
|
-
from deltacat.compute.compactor_v2.utils.task_options import
|
3
|
+
from deltacat.compute.compactor_v2.utils.task_options import (
|
4
|
+
_get_task_options,
|
5
|
+
_get_merge_task_options,
|
6
|
+
logger,
|
7
|
+
)
|
8
|
+
from deltacat.compute.resource_estimation.model import (
|
9
|
+
EstimateResourcesParams,
|
10
|
+
ResourceEstimationMethod,
|
11
|
+
)
|
12
|
+
from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
|
13
|
+
from deltacat.compute.compactor import (
|
14
|
+
PyArrowWriteResult,
|
15
|
+
RoundCompletionInfo,
|
16
|
+
)
|
17
|
+
from deltacat.types.media import (
|
18
|
+
ContentType,
|
19
|
+
ContentEncoding,
|
20
|
+
)
|
21
|
+
from deltacat.storage import (
|
22
|
+
DeltaLocator,
|
23
|
+
Manifest,
|
24
|
+
ManifestMeta,
|
25
|
+
ManifestEntry,
|
26
|
+
ManifestEntryList,
|
27
|
+
PartitionValues,
|
28
|
+
)
|
29
|
+
from unittest.mock import MagicMock
|
30
|
+
from typing import Optional
|
31
|
+
|
32
|
+
from deltacat.compute.compactor_v2.constants import (
|
33
|
+
AVERAGE_RECORD_SIZE_BYTES as DEFAULT_AVERAGE_RECORD_SIZE_BYTES,
|
34
|
+
)
|
4
35
|
|
5
36
|
|
6
37
|
@ray.remote
|
@@ -14,11 +45,95 @@ def throwing_func():
|
|
14
45
|
|
15
46
|
|
16
47
|
class TestTaskOptions(unittest.TestCase):
|
48
|
+
TEST_INDEX = 0
|
49
|
+
TEST_HB_GROUP_IDX = 0
|
50
|
+
TEST_STREAM_POSITION = 1_000_000
|
51
|
+
TEST_NUM_HASH_GROUPS = 1
|
52
|
+
|
17
53
|
@classmethod
|
18
54
|
def setUpClass(cls):
|
19
55
|
ray.init(local_mode=True, ignore_reinit_error=True)
|
20
56
|
super().setUpClass()
|
21
57
|
|
58
|
+
@classmethod
|
59
|
+
def tearDownClass(cls) -> None:
|
60
|
+
ray.shutdown()
|
61
|
+
|
62
|
+
def _make_estimate_resource_params(
|
63
|
+
cls,
|
64
|
+
resource_estimation_method: Optional[
|
65
|
+
ResourceEstimationMethod
|
66
|
+
] = ResourceEstimationMethod.DEFAULT,
|
67
|
+
previous_inflation: Optional[int] = 7,
|
68
|
+
average_record_size_bytes: Optional[int] = 1000,
|
69
|
+
):
|
70
|
+
return EstimateResourcesParams.of(
|
71
|
+
resource_estimation_method=resource_estimation_method,
|
72
|
+
previous_inflation=previous_inflation,
|
73
|
+
average_record_size_bytes=average_record_size_bytes,
|
74
|
+
)
|
75
|
+
|
76
|
+
def _make_manifest(
|
77
|
+
self,
|
78
|
+
source_content_length: Optional[int] = 1000,
|
79
|
+
content_type: Optional[ContentType] = ContentType.PARQUET,
|
80
|
+
content_encoding: Optional[ContentEncoding] = ContentEncoding.IDENTITY,
|
81
|
+
partition_values: Optional[PartitionValues] = None,
|
82
|
+
uri: Optional[str] = "test",
|
83
|
+
url: Optional[str] = "test",
|
84
|
+
author: Optional[str] = "foo",
|
85
|
+
entry_uuid: Optional[str] = "foo",
|
86
|
+
manifest_uuid: Optional[str] = "bar",
|
87
|
+
) -> Manifest:
|
88
|
+
meta = ManifestMeta.of(
|
89
|
+
10,
|
90
|
+
10,
|
91
|
+
content_type=content_type,
|
92
|
+
content_encoding=content_encoding,
|
93
|
+
source_content_length=source_content_length,
|
94
|
+
partition_values=partition_values,
|
95
|
+
)
|
96
|
+
|
97
|
+
return Manifest.of(
|
98
|
+
entries=ManifestEntryList.of(
|
99
|
+
[
|
100
|
+
ManifestEntry.of(
|
101
|
+
uri=uri, url=url, meta=meta, mandatory=True, uuid=entry_uuid
|
102
|
+
)
|
103
|
+
]
|
104
|
+
),
|
105
|
+
author=author,
|
106
|
+
uuid=manifest_uuid,
|
107
|
+
)
|
108
|
+
|
109
|
+
def make_round_completion_info(
|
110
|
+
self,
|
111
|
+
high_watermark: Optional[int] = 1_000_000,
|
112
|
+
compacted_delta_locator: Optional[DeltaLocator] = None,
|
113
|
+
records_written: Optional[int] = 10,
|
114
|
+
bytes_written: Optional[int] = 10,
|
115
|
+
files_written: Optional[int] = 10,
|
116
|
+
rows_dropped: Optional[int] = 10,
|
117
|
+
sort_keys_bit_width: Optional[int] = 0,
|
118
|
+
hash_bucket_count: Optional[int] = 1,
|
119
|
+
hb_index_to_entry_range: Optional[dict] = None,
|
120
|
+
) -> RoundCompletionInfo:
|
121
|
+
if compacted_delta_locator is None:
|
122
|
+
compacted_delta_locator = MagicMock(spec=DeltaLocator)
|
123
|
+
|
124
|
+
hb_index_to_entry_range = hb_index_to_entry_range or {"0": (0, 1)}
|
125
|
+
|
126
|
+
return RoundCompletionInfo.of(
|
127
|
+
compacted_delta_locator=compacted_delta_locator,
|
128
|
+
high_watermark=high_watermark,
|
129
|
+
compacted_pyarrow_write_result=PyArrowWriteResult.of(
|
130
|
+
records_written, bytes_written, files_written, rows_dropped
|
131
|
+
),
|
132
|
+
sort_keys_bit_width=sort_keys_bit_width,
|
133
|
+
hb_index_to_entry_range=hb_index_to_entry_range,
|
134
|
+
hash_bucket_count=hash_bucket_count,
|
135
|
+
)
|
136
|
+
|
22
137
|
def test_get_task_options_sanity(self):
|
23
138
|
opts = _get_task_options(0.01, 0.01)
|
24
139
|
result_ref = valid_func.options(**opts).remote()
|
@@ -31,3 +146,160 @@ class TestTaskOptions(unittest.TestCase):
|
|
31
146
|
result_ref = throwing_func.options(**opts).remote()
|
32
147
|
|
33
148
|
self.assertRaises(ConnectionAbortedError, lambda: ray.get(result_ref))
|
149
|
+
|
150
|
+
def test_get_merge_task_options_memory_logs_enabled_sanity(self):
|
151
|
+
test_index = 0
|
152
|
+
test_hb_group_idx = 0
|
153
|
+
test_debug_memory_params = {"merge_task_index": test_index}
|
154
|
+
test_estimate_memory_params = self._make_estimate_resource_params()
|
155
|
+
test_ray_custom_resources = {}
|
156
|
+
test_rcf = self.make_round_completion_info()
|
157
|
+
test_manifest = self._make_manifest()
|
158
|
+
expected_task_opts = {
|
159
|
+
"max_retries": 3,
|
160
|
+
"memory": 1680.64,
|
161
|
+
"num_cpus": 0.01,
|
162
|
+
"scheduling_strategy": "SPREAD",
|
163
|
+
}
|
164
|
+
expected_previous_inflation = 1.0
|
165
|
+
expected_average_record_size = 1.0
|
166
|
+
with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
|
167
|
+
# At least one log of level DEBUG must be emitted
|
168
|
+
actual_merge_tasks_opts = _get_merge_task_options(
|
169
|
+
index=test_index,
|
170
|
+
hb_group_idx=test_hb_group_idx,
|
171
|
+
data_size=1,
|
172
|
+
pk_size_bytes=1,
|
173
|
+
num_rows=1,
|
174
|
+
num_hash_groups=1,
|
175
|
+
total_memory_buffer_percentage=1,
|
176
|
+
incremental_index_array_size=1,
|
177
|
+
debug_memory_params=test_debug_memory_params,
|
178
|
+
ray_custom_resources=test_ray_custom_resources,
|
179
|
+
estimate_resources_params=test_estimate_memory_params,
|
180
|
+
round_completion_info=test_rcf,
|
181
|
+
compacted_delta_manifest=test_manifest,
|
182
|
+
memory_logs_enabled=True,
|
183
|
+
)
|
184
|
+
assert {k: actual_merge_tasks_opts[k] for k in expected_task_opts}
|
185
|
+
log_message_round_completion_info = cm.records[0].getMessage()
|
186
|
+
log_message_debug_memory_params = cm.records[1].getMessage()
|
187
|
+
self.assertIn(
|
188
|
+
f"[Merge task {test_index}]: Using previous compaction rounds to calculate merge memory",
|
189
|
+
log_message_round_completion_info,
|
190
|
+
)
|
191
|
+
self.assertIn(
|
192
|
+
f"[Merge task {test_index}]: Params used for calculating merge memory",
|
193
|
+
log_message_debug_memory_params,
|
194
|
+
)
|
195
|
+
self.assertIn(
|
196
|
+
f"'previous_inflation': {expected_previous_inflation}",
|
197
|
+
log_message_debug_memory_params,
|
198
|
+
)
|
199
|
+
self.assertIn(
|
200
|
+
f"'average_record_size': {expected_average_record_size}",
|
201
|
+
log_message_debug_memory_params,
|
202
|
+
)
|
203
|
+
|
204
|
+
def test_get_merge_task_options_memory_logs_enabled_fallback_previous_inflation_fallback_average_record_size(
|
205
|
+
self,
|
206
|
+
):
|
207
|
+
test_index = 0
|
208
|
+
test_hb_group_idx = 0
|
209
|
+
test_debug_memory_params = {"merge_task_index": test_index}
|
210
|
+
test_estimate_memory_params = self._make_estimate_resource_params()
|
211
|
+
test_ray_custom_resources = {}
|
212
|
+
test_rcf = self.make_round_completion_info(
|
213
|
+
bytes_written=0, records_written=0, files_written=0, rows_dropped=0
|
214
|
+
)
|
215
|
+
test_manifest = self._make_manifest()
|
216
|
+
expected_task_opts = {
|
217
|
+
"max_retries": 3,
|
218
|
+
"memory": 1680.64,
|
219
|
+
"num_cpus": 0.01,
|
220
|
+
"scheduling_strategy": "SPREAD",
|
221
|
+
}
|
222
|
+
expected_previous_inflation = PYARROW_INFLATION_MULTIPLIER
|
223
|
+
expected_average_record_size = DEFAULT_AVERAGE_RECORD_SIZE_BYTES
|
224
|
+
with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
|
225
|
+
# At least one log of level DEBUG must be emitted
|
226
|
+
actual_merge_tasks_opts = _get_merge_task_options(
|
227
|
+
index=test_index,
|
228
|
+
hb_group_idx=test_hb_group_idx,
|
229
|
+
data_size=1,
|
230
|
+
pk_size_bytes=1,
|
231
|
+
num_rows=1,
|
232
|
+
num_hash_groups=1,
|
233
|
+
total_memory_buffer_percentage=1,
|
234
|
+
incremental_index_array_size=1,
|
235
|
+
debug_memory_params=test_debug_memory_params,
|
236
|
+
ray_custom_resources=test_ray_custom_resources,
|
237
|
+
estimate_resources_params=test_estimate_memory_params,
|
238
|
+
round_completion_info=test_rcf,
|
239
|
+
compacted_delta_manifest=test_manifest,
|
240
|
+
memory_logs_enabled=True,
|
241
|
+
)
|
242
|
+
assert {k: actual_merge_tasks_opts[k] for k in expected_task_opts}
|
243
|
+
log_message_round_completion_info = cm.records[0].getMessage()
|
244
|
+
log_message_debug_memory_params = cm.records[1].getMessage()
|
245
|
+
self.assertIn(
|
246
|
+
f"[Merge task {test_index}]: Using previous compaction rounds to calculate merge memory",
|
247
|
+
log_message_round_completion_info,
|
248
|
+
)
|
249
|
+
self.assertIn(
|
250
|
+
f"[Merge task {test_index}]: Params used for calculating merge memory",
|
251
|
+
log_message_debug_memory_params,
|
252
|
+
)
|
253
|
+
self.assertIn(
|
254
|
+
f"'previous_inflation': {expected_previous_inflation}",
|
255
|
+
log_message_debug_memory_params,
|
256
|
+
)
|
257
|
+
self.assertIn(
|
258
|
+
f"'average_record_size': {expected_average_record_size}",
|
259
|
+
log_message_debug_memory_params,
|
260
|
+
)
|
261
|
+
|
262
|
+
def test_get_merge_task_options_memory_logs_enabled_not_using_previous_round_completion_info(
|
263
|
+
self,
|
264
|
+
):
|
265
|
+
test_index = 0
|
266
|
+
test_hb_group_idx = 0
|
267
|
+
test_debug_memory_params = {"merge_task_index": test_index}
|
268
|
+
test_estimate_memory_params = self._make_estimate_resource_params()
|
269
|
+
test_ray_custom_resources = {}
|
270
|
+
test_rcf = None
|
271
|
+
test_manifest = self._make_manifest()
|
272
|
+
expected_task_opts = {
|
273
|
+
"max_retries": 3,
|
274
|
+
"memory": 1680.64,
|
275
|
+
"num_cpus": 0.01,
|
276
|
+
"scheduling_strategy": "SPREAD",
|
277
|
+
}
|
278
|
+
with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
|
279
|
+
# At least one log of level DEBUG must be emitted
|
280
|
+
actual_merge_tasks_opts = _get_merge_task_options(
|
281
|
+
index=test_index,
|
282
|
+
hb_group_idx=test_hb_group_idx,
|
283
|
+
data_size=1,
|
284
|
+
pk_size_bytes=1,
|
285
|
+
num_rows=1,
|
286
|
+
num_hash_groups=1,
|
287
|
+
total_memory_buffer_percentage=1,
|
288
|
+
incremental_index_array_size=1,
|
289
|
+
debug_memory_params=test_debug_memory_params,
|
290
|
+
ray_custom_resources=test_ray_custom_resources,
|
291
|
+
estimate_resources_params=test_estimate_memory_params,
|
292
|
+
round_completion_info=test_rcf,
|
293
|
+
compacted_delta_manifest=test_manifest,
|
294
|
+
memory_logs_enabled=True,
|
295
|
+
)
|
296
|
+
assert {k: actual_merge_tasks_opts[k] for k in expected_task_opts}
|
297
|
+
log_message_debug_memory_params = cm.records[0].getMessage()
|
298
|
+
self.assertIn(
|
299
|
+
f"[Merge task {test_index}]: Params used for calculating merge memory",
|
300
|
+
log_message_debug_memory_params,
|
301
|
+
)
|
302
|
+
self.assertNotIn(
|
303
|
+
"'average_record_size'",
|
304
|
+
log_message_debug_memory_params,
|
305
|
+
)
|
deltacat/tests/utils/test_pyarrow.py
CHANGED
@@ -8,6 +8,7 @@ from deltacat.utils.pyarrow import (
|
|
8
8
|
ReadKwargsProviderPyArrowSchemaOverride,
|
9
9
|
RAISE_ON_EMPTY_CSV_KWARG,
|
10
10
|
RAISE_ON_DECIMAL_OVERFLOW,
|
11
|
+
OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG,
|
11
12
|
)
|
12
13
|
import decimal
|
13
14
|
from deltacat.types.media import ContentEncoding, ContentType
|
@@ -812,3 +813,54 @@ class TestS3FileToTable(TestCase):
|
|
812
813
|
schema = result.schema
|
813
814
|
schema_index = schema.get_field_index("n_legs")
|
814
815
|
self.assertEqual(schema.field(schema_index).type, "int64")
|
816
|
+
|
817
|
+
def test_s3_file_to_table_when_utsv_gzip_and_content_type_overridden(self):
|
818
|
+
schema = pa.schema(
|
819
|
+
[("is_active", pa.string()), ("ship_datetime_utc", pa.timestamp("us"))]
|
820
|
+
)
|
821
|
+
|
822
|
+
# OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG has no effect on uTSV files
|
823
|
+
pa_kwargs_provider = lambda content_type, kwargs: {
|
824
|
+
"reader_type": "pyarrow",
|
825
|
+
OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
|
826
|
+
**kwargs,
|
827
|
+
}
|
828
|
+
|
829
|
+
result = s3_file_to_table(
|
830
|
+
GZIP_COMPRESSED_FILE_UTSV_PATH,
|
831
|
+
ContentType.UNESCAPED_TSV.value,
|
832
|
+
ContentEncoding.GZIP.value,
|
833
|
+
["is_active", "ship_datetime_utc"],
|
834
|
+
None,
|
835
|
+
pa_read_func_kwargs_provider=pa_kwargs_provider,
|
836
|
+
)
|
837
|
+
|
838
|
+
self.assertEqual(len(result), 3)
|
839
|
+
self.assertEqual(len(result.column_names), 2)
|
840
|
+
result_schema = result.schema
|
841
|
+
for index, field in enumerate(result_schema):
|
842
|
+
self.assertEqual(field.name, schema.field(index).name)
|
843
|
+
|
844
|
+
self.assertEqual(result.schema.field(0).type, "string")
|
845
|
+
|
846
|
+
def test_s3_file_to_table_when_parquet_gzip_and_encoding_overridden(self):
|
847
|
+
pa_kwargs_provider = lambda content_type, kwargs: {
|
848
|
+
"reader_type": "pyarrow",
|
849
|
+
OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
|
850
|
+
**kwargs,
|
851
|
+
}
|
852
|
+
|
853
|
+
result = s3_file_to_table(
|
854
|
+
PARQUET_FILE_PATH,
|
855
|
+
ContentType.PARQUET.value,
|
856
|
+
ContentEncoding.GZIP.value,
|
857
|
+
["n_legs", "animal"],
|
858
|
+
["n_legs"],
|
859
|
+
pa_read_func_kwargs_provider=pa_kwargs_provider,
|
860
|
+
)
|
861
|
+
|
862
|
+
self.assertEqual(len(result), 6)
|
863
|
+
self.assertEqual(len(result.column_names), 1)
|
864
|
+
schema = result.schema
|
865
|
+
schema_index = schema.get_field_index("n_legs")
|
866
|
+
self.assertEqual(schema.field(schema_index).type, "int64")
|
deltacat/utils/pyarrow.py
CHANGED
@@ -47,6 +47,7 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
|
47
47
|
|
48
48
|
RAISE_ON_EMPTY_CSV_KWARG = "raise_on_empty_csv"
|
49
49
|
READER_TYPE_KWARG = "reader_type"
|
50
|
+
OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG = "override_content_encoding_for_parquet"
|
50
51
|
|
51
52
|
"""
|
52
53
|
By default, round decimal values using half_to_even round mode when
|
@@ -543,6 +544,15 @@ def s3_file_to_table(
|
|
543
544
|
if pa_read_func_kwargs_provider is not None:
|
544
545
|
kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
|
545
546
|
|
547
|
+
if OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG in kwargs:
|
548
|
+
new_content_encoding = kwargs.pop(OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG)
|
549
|
+
if content_type == ContentType.PARQUET.value:
|
550
|
+
logger.debug(
|
551
|
+
f"Overriding {s3_url} content encoding from {content_encoding} "
|
552
|
+
f"to {new_content_encoding}"
|
553
|
+
)
|
554
|
+
content_encoding = new_content_encoding
|
555
|
+
|
546
556
|
if (
|
547
557
|
content_type == ContentType.PARQUET.value
|
548
558
|
and content_encoding == ContentEncoding.IDENTITY.value
|
{deltacat-1.1.29.dist-info → deltacat-1.1.31.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
deltacat/__init__.py,sha256=
|
1
|
+
deltacat/__init__.py,sha256=gdOpCNy03T2HEQIQqSqopv0b0UL5pwXWa4McRHxMlAw,1778
|
2
2
|
deltacat/constants.py,sha256=TUJLXUJ9xq1Ryil72yLkKR8EDH_Irp5wUg56QstbRNE,2181
|
3
3
|
deltacat/exceptions.py,sha256=7sjk3BuMY5Oo-6OvAfHncZx_OcvtEL47BblWr2F7waE,12740
|
4
4
|
deltacat/logs.py,sha256=EQSDin1deehzz5xlLV1_TrFJrO_IBZ9Ahp7MdL-4cK8,9363
|
@@ -77,7 +77,7 @@ deltacat/compute/compactor_v2/utils/delta.py,sha256=I7Yvda8NVbpKXG3nM2Ku1utvR2r2
|
|
77
77
|
deltacat/compute/compactor_v2/utils/io.py,sha256=3m4dorxj-WD6Yu9_3gRE6gz3C-eNJA7nn02sHKwo-J8,6018
|
78
78
|
deltacat/compute/compactor_v2/utils/merge.py,sha256=EV_iKhNc3WflgfLW1Q46dXUvyClx8VebWHGtninEfsI,5311
|
79
79
|
deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=SbQ97M1Cxld-zZik2QMSzlj20g6JlENaQx_0PhlCIP8,12034
|
80
|
-
deltacat/compute/compactor_v2/utils/task_options.py,sha256=
|
80
|
+
deltacat/compute/compactor_v2/utils/task_options.py,sha256=0GoB_DLkCN1q8CVKTlWlDYt55qnpTDIa9fPyXJwB-cU,13801
|
81
81
|
deltacat/compute/merge_on_read/__init__.py,sha256=ckbgngmqPjYBYz_NySsR1vNTOb_hNpeL1sYkZKvBI9M,214
|
82
82
|
deltacat/compute/merge_on_read/daft.py,sha256=1oC38u5ig_aTrq7EzyWBo8Ui54rb6yERYMk-vEFbpxM,1400
|
83
83
|
deltacat/compute/merge_on_read/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -156,7 +156,7 @@ deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=y8nNHq9ADH
|
|
156
156
|
deltacat/tests/compute/compactor_v2/test_hashlib.py,sha256=8csF2hFWtBvY2MbX3-6iphCsVXxRp0zP1NTnKhfdmkg,328
|
157
157
|
deltacat/tests/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
158
158
|
deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py,sha256=aFb9rzT_EK9k8qAMHPtpqd5btyEmll1So1loDmZkotQ,1769
|
159
|
-
deltacat/tests/compute/compactor_v2/utils/test_task_options.py,sha256=
|
159
|
+
deltacat/tests/compute/compactor_v2/utils/test_task_options.py,sha256=YDQKUKv3Vv8S1fe0YQmjHTrwnWSliqKHIWGu0fEdKnI,11478
|
160
160
|
deltacat/tests/compute/resource_estimation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
161
161
|
deltacat/tests/compute/resource_estimation/test_delta.py,sha256=HCL2oUnCqm0E26T3HLJjMhoAsHTJIWPYGwIKRgM_H7E,25712
|
162
162
|
deltacat/tests/compute/resource_estimation/test_manifest.py,sha256=yrMvqDjolExdRf6Vtg5XaKDuaKz9ok15PCZ7_aJOYrI,32893
|
@@ -180,7 +180,7 @@ deltacat/tests/utils/test_cloudpickle.py,sha256=J0pnBY3-PxlUh6MamZAN1PuquKQPr2iy
|
|
180
180
|
deltacat/tests/utils/test_daft.py,sha256=kY8lkXoQvyWunok8UvOsh1An297rb3jcnstTuIAyAlc,8232
|
181
181
|
deltacat/tests/utils/test_metrics.py,sha256=Ym9nOz1EtB180pLmvugihj1sDTNDMb5opIjjr5Nmcls,16339
|
182
182
|
deltacat/tests/utils/test_placement.py,sha256=g61wVOMkHe4YJeR9Oxg_BOVQ6bhHHbC3IBYv8YhUu94,597
|
183
|
-
deltacat/tests/utils/test_pyarrow.py,sha256=
|
183
|
+
deltacat/tests/utils/test_pyarrow.py,sha256=JmhcuphXD8B2SLnOgrPgrqCcdHg_BL6IjFAiNRmuA1I,32790
|
184
184
|
deltacat/tests/utils/test_record_batch_tables.py,sha256=AkG1WyljQmjnl-AxhbFWyo5LnMIKRyLScfgC2B_ES-s,11321
|
185
185
|
deltacat/tests/utils/test_resources.py,sha256=HtpvDrfPZQNtGDXUlsIzc_yd7Vf1cDscZ3YbN0oTvO8,2560
|
186
186
|
deltacat/tests/utils/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -201,7 +201,7 @@ deltacat/utils/numpy.py,sha256=SpHKKvC-K8NINTWGVfTZ5-gBFTGYqaXjjgKFhsdUjwg,2049
|
|
201
201
|
deltacat/utils/pandas.py,sha256=q99mlRB7tymICMcNbfGLfLqFu_C-feyPZKZm2CWJJVc,9574
|
202
202
|
deltacat/utils/performance.py,sha256=7ZLaMkS1ehPSIhT5uOQVBHvjC70iKHzoFquFo-KL0PI,645
|
203
203
|
deltacat/utils/placement.py,sha256=Lj20fb-eq8rgMdm_M2MBMfDLwhDM1sS1nJj2DvIK56s,12060
|
204
|
-
deltacat/utils/pyarrow.py,sha256=
|
204
|
+
deltacat/utils/pyarrow.py,sha256=9Dggs8waJrbgP62NG4ssZsl-9fl3cJ4fjYLsJ1HjhHQ,34847
|
205
205
|
deltacat/utils/resources.py,sha256=Ax1OgLLbZI4oYpp4Ki27OLaST-7I-AJgZwU87FVfY8g,8253
|
206
206
|
deltacat/utils/s3fs.py,sha256=PmUJ5Fm1WmD-_zp_M6yd9VbXvIoJuBeK6ApOdJJApLE,662
|
207
207
|
deltacat/utils/schema.py,sha256=m4Wm4ZQcpttzOUxex4dVneGlHy1_E36HspTcjNYzvVM,1564
|
@@ -211,8 +211,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
|
|
211
211
|
deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
|
212
212
|
deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
|
213
213
|
deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
|
214
|
-
deltacat-1.1.
|
215
|
-
deltacat-1.1.
|
216
|
-
deltacat-1.1.
|
217
|
-
deltacat-1.1.
|
218
|
-
deltacat-1.1.
|
214
|
+
deltacat-1.1.31.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
215
|
+
deltacat-1.1.31.dist-info/METADATA,sha256=JrWYw0uKVprpH34i-_cOUYjWI3egRQx0rhCn--OnE_0,1733
|
216
|
+
deltacat-1.1.31.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
217
|
+
deltacat-1.1.31.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
|
218
|
+
deltacat-1.1.31.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|