deltacat 1.1.30__py3-none-any.whl → 1.1.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deltacat/__init__.py CHANGED
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode

  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))

- __version__ = "1.1.30"
+ __version__ = "1.1.32"


  __all__ = [
deltacat/compute/compactor_v2/utils/task_options.py CHANGED
@@ -1,11 +1,16 @@
  import logging
  from typing import Dict, Optional, List, Tuple, Any
  from deltacat import logs
+ from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
+ from deltacat.compute.compactor_v2.constants import (
+     AVERAGE_RECORD_SIZE_BYTES as DEFAULT_AVERAGE_RECORD_SIZE_BYTES,
+ )
  from deltacat.compute.compactor_v2.model.merge_file_group import (
      LocalMergeFileGroupsProvider,
  )
  from deltacat.storage import (
      Manifest,
+     ManifestEntry,
      interface as unimplemented_deltacat_storage,
  )
  from deltacat.compute.compactor.model.delta_annotated import DeltaAnnotated
@@ -81,16 +86,27 @@ def _get_merge_task_options(
          and compacted_delta_manifest
          and round_completion_info.hb_index_to_entry_range
      ):
-
-         previous_inflation = (
-             round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
-             / round_completion_info.compacted_pyarrow_write_result.file_bytes
+         logger.debug_conditional(
+             f"[Merge task {index}]: Using previous compaction rounds to calculate merge memory: {round_completion_info.compacted_pyarrow_write_result}",
+             memory_logs_enabled,
+         )
+         previous_inflation: float = (
+             (
+                 round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
+                 / round_completion_info.compacted_pyarrow_write_result.file_bytes
+             )
+             if round_completion_info.compacted_pyarrow_write_result.file_bytes
+             else PYARROW_INFLATION_MULTIPLIER
          )
          debug_memory_params["previous_inflation"] = previous_inflation

-         average_record_size = (
-             round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
-             / round_completion_info.compacted_pyarrow_write_result.records
+         average_record_size: float = (
+             (
+                 round_completion_info.compacted_pyarrow_write_result.pyarrow_bytes
+                 / round_completion_info.compacted_pyarrow_write_result.records
+             )
+             if round_completion_info.compacted_pyarrow_write_result.records
+             else DEFAULT_AVERAGE_RECORD_SIZE_BYTES
          )
          debug_memory_params["average_record_size"] = average_record_size

@@ -106,31 +122,36 @@ def _get_merge_task_options(
              str(hb_idx)
          ]
          for entry_index in range(entry_start, entry_end):
-             entry = compacted_delta_manifest.entries[entry_index]
-
-             current_entry_size = estimate_manifest_entry_size_bytes(
-                 entry=entry,
-                 operation_type=OperationType.PYARROW_DOWNLOAD,
-                 estimate_resources_params=estimate_resources_params,
+             entry: ManifestEntry = compacted_delta_manifest.entries[entry_index]
+             current_entry_size: float = (
+                 estimate_manifest_entry_size_bytes(
+                     entry=entry,
+                     operation_type=OperationType.PYARROW_DOWNLOAD,
+                     estimate_resources_params=estimate_resources_params,
+                 )
+                 or 0.0
              )
-             current_entry_rows = estimate_manifest_entry_num_rows(
-                 entry=entry,
-                 operation_type=OperationType.PYARROW_DOWNLOAD,
-                 estimate_resources_params=estimate_resources_params,
+             current_entry_rows: int = (
+                 estimate_manifest_entry_num_rows(
+                     entry=entry,
+                     operation_type=OperationType.PYARROW_DOWNLOAD,
+                     estimate_resources_params=estimate_resources_params,
+                 )
+                 or 0
              )
-
+             # NOTE: We can treat current_entry_size and current_entry_rows as 0, since a None estimated entry size implies a 0 value
              data_size += current_entry_size
              num_rows += current_entry_rows
-
              if primary_keys:
-                 pk_size = estimate_manifest_entry_column_size_bytes(
+                 pk_size: Optional[
+                     float
+                 ] = estimate_manifest_entry_column_size_bytes(
                      entry=entry,
                      columns=primary_keys,
                      operation_type=OperationType.PYARROW_DOWNLOAD,
                      estimate_resources_params=estimate_resources_params,
                  )
-
-                 if pk_size is None:
+                 if not pk_size:
                      pk_size_bytes += current_entry_size
                  else:
                      pk_size_bytes += pk_size
@@ -159,7 +180,6 @@ def _get_merge_task_options(
          f"[Merge task {index}]: Params used for calculating merge memory: {debug_memory_params}",
          memory_logs_enabled,
      )
-
      return _get_task_options(0.01, total_memory, ray_custom_resources)


deltacat/tests/compute/compactor_v2/utils/test_task_options.py CHANGED
@@ -1,6 +1,37 @@
  import unittest
  import ray
- from deltacat.compute.compactor_v2.utils.task_options import _get_task_options
+ from deltacat.compute.compactor_v2.utils.task_options import (
+     _get_task_options,
+     _get_merge_task_options,
+     logger,
+ )
+ from deltacat.compute.resource_estimation.model import (
+     EstimateResourcesParams,
+     ResourceEstimationMethod,
+ )
+ from deltacat.constants import PYARROW_INFLATION_MULTIPLIER
+ from deltacat.compute.compactor import (
+     PyArrowWriteResult,
+     RoundCompletionInfo,
+ )
+ from deltacat.types.media import (
+     ContentType,
+     ContentEncoding,
+ )
+ from deltacat.storage import (
+     DeltaLocator,
+     Manifest,
+     ManifestMeta,
+     ManifestEntry,
+     ManifestEntryList,
+     PartitionValues,
+ )
+ from unittest.mock import MagicMock
+ from typing import Optional
+
+ from deltacat.compute.compactor_v2.constants import (
+     AVERAGE_RECORD_SIZE_BYTES as DEFAULT_AVERAGE_RECORD_SIZE_BYTES,
+ )


  @ray.remote
@@ -14,11 +45,95 @@ def throwing_func():


  class TestTaskOptions(unittest.TestCase):
+     TEST_INDEX = 0
+     TEST_HB_GROUP_IDX = 0
+     TEST_STREAM_POSITION = 1_000_000
+     TEST_NUM_HASH_GROUPS = 1
+
      @classmethod
      def setUpClass(cls):
          ray.init(local_mode=True, ignore_reinit_error=True)
          super().setUpClass()

+     @classmethod
+     def tearDownClass(cls) -> None:
+         ray.shutdown()
+
+     def _make_estimate_resource_params(
+         cls,
+         resource_estimation_method: Optional[
+             ResourceEstimationMethod
+         ] = ResourceEstimationMethod.DEFAULT,
+         previous_inflation: Optional[int] = 7,
+         average_record_size_bytes: Optional[int] = 1000,
+     ):
+         return EstimateResourcesParams.of(
+             resource_estimation_method=resource_estimation_method,
+             previous_inflation=previous_inflation,
+             average_record_size_bytes=average_record_size_bytes,
+         )
+
+     def _make_manifest(
+         self,
+         source_content_length: Optional[int] = 1000,
+         content_type: Optional[ContentType] = ContentType.PARQUET,
+         content_encoding: Optional[ContentEncoding] = ContentEncoding.IDENTITY,
+         partition_values: Optional[PartitionValues] = None,
+         uri: Optional[str] = "test",
+         url: Optional[str] = "test",
+         author: Optional[str] = "foo",
+         entry_uuid: Optional[str] = "foo",
+         manifest_uuid: Optional[str] = "bar",
+     ) -> Manifest:
+         meta = ManifestMeta.of(
+             10,
+             10,
+             content_type=content_type,
+             content_encoding=content_encoding,
+             source_content_length=source_content_length,
+             partition_values=partition_values,
+         )
+
+         return Manifest.of(
+             entries=ManifestEntryList.of(
+                 [
+                     ManifestEntry.of(
+                         uri=uri, url=url, meta=meta, mandatory=True, uuid=entry_uuid
+                     )
+                 ]
+             ),
+             author=author,
+             uuid=manifest_uuid,
+         )
+
+     def make_round_completion_info(
+         self,
+         high_watermark: Optional[int] = 1_000_000,
+         compacted_delta_locator: Optional[DeltaLocator] = None,
+         records_written: Optional[int] = 10,
+         bytes_written: Optional[int] = 10,
+         files_written: Optional[int] = 10,
+         rows_dropped: Optional[int] = 10,
+         sort_keys_bit_width: Optional[int] = 0,
+         hash_bucket_count: Optional[int] = 1,
+         hb_index_to_entry_range: Optional[dict] = None,
+     ) -> RoundCompletionInfo:
+         if compacted_delta_locator is None:
+             compacted_delta_locator = MagicMock(spec=DeltaLocator)
+
+         hb_index_to_entry_range = hb_index_to_entry_range or {"0": (0, 1)}
+
+         return RoundCompletionInfo.of(
+             compacted_delta_locator=compacted_delta_locator,
+             high_watermark=high_watermark,
+             compacted_pyarrow_write_result=PyArrowWriteResult.of(
+                 records_written, bytes_written, files_written, rows_dropped
+             ),
+             sort_keys_bit_width=sort_keys_bit_width,
+             hb_index_to_entry_range=hb_index_to_entry_range,
+             hash_bucket_count=hash_bucket_count,
+         )
+
      def test_get_task_options_sanity(self):
          opts = _get_task_options(0.01, 0.01)
          result_ref = valid_func.options(**opts).remote()
@@ -31,3 +146,160 @@ class TestTaskOptions(unittest.TestCase):
          result_ref = throwing_func.options(**opts).remote()

          self.assertRaises(ConnectionAbortedError, lambda: ray.get(result_ref))
+
+     def test_get_merge_task_options_memory_logs_enabled_sanity(self):
+         test_index = 0
+         test_hb_group_idx = 0
+         test_debug_memory_params = {"merge_task_index": test_index}
+         test_estimate_memory_params = self._make_estimate_resource_params()
+         test_ray_custom_resources = {}
+         test_rcf = self.make_round_completion_info()
+         test_manifest = self._make_manifest()
+         expected_task_opts = {
+             "max_retries": 3,
+             "memory": 1680.64,
+             "num_cpus": 0.01,
+             "scheduling_strategy": "SPREAD",
+         }
+         expected_previous_inflation = 1.0
+         expected_average_record_size = 1.0
+         with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
+             # At least one log of level DEBUG must be emitted
+             actual_merge_tasks_opts = _get_merge_task_options(
+                 index=test_index,
+                 hb_group_idx=test_hb_group_idx,
+                 data_size=1,
+                 pk_size_bytes=1,
+                 num_rows=1,
+                 num_hash_groups=1,
+                 total_memory_buffer_percentage=1,
+                 incremental_index_array_size=1,
+                 debug_memory_params=test_debug_memory_params,
+                 ray_custom_resources=test_ray_custom_resources,
+                 estimate_resources_params=test_estimate_memory_params,
+                 round_completion_info=test_rcf,
+                 compacted_delta_manifest=test_manifest,
+                 memory_logs_enabled=True,
+             )
+         assert {k: actual_merge_tasks_opts[k] for k in expected_task_opts}
+         log_message_round_completion_info = cm.records[0].getMessage()
+         log_message_debug_memory_params = cm.records[1].getMessage()
+         self.assertIn(
+             f"[Merge task {test_index}]: Using previous compaction rounds to calculate merge memory",
+             log_message_round_completion_info,
+         )
+         self.assertIn(
+             f"[Merge task {test_index}]: Params used for calculating merge memory",
+             log_message_debug_memory_params,
+         )
+         self.assertIn(
+             f"'previous_inflation': {expected_previous_inflation}",
+             log_message_debug_memory_params,
+         )
+         self.assertIn(
+             f"'average_record_size': {expected_average_record_size}",
+             log_message_debug_memory_params,
+         )
+
+     def test_get_merge_task_options_memory_logs_enabled_fallback_previous_inflation_fallback_average_record_size(
+         self,
+     ):
+         test_index = 0
+         test_hb_group_idx = 0
+         test_debug_memory_params = {"merge_task_index": test_index}
+         test_estimate_memory_params = self._make_estimate_resource_params()
+         test_ray_custom_resources = {}
+         test_rcf = self.make_round_completion_info(
+             bytes_written=0, records_written=0, files_written=0, rows_dropped=0
+         )
+         test_manifest = self._make_manifest()
+         expected_task_opts = {
+             "max_retries": 3,
+             "memory": 1680.64,
+             "num_cpus": 0.01,
+             "scheduling_strategy": "SPREAD",
+         }
+         expected_previous_inflation = PYARROW_INFLATION_MULTIPLIER
+         expected_average_record_size = DEFAULT_AVERAGE_RECORD_SIZE_BYTES
+         with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
+             # At least one log of level DEBUG must be emitted
+             actual_merge_tasks_opts = _get_merge_task_options(
+                 index=test_index,
+                 hb_group_idx=test_hb_group_idx,
+                 data_size=1,
+                 pk_size_bytes=1,
+                 num_rows=1,
+                 num_hash_groups=1,
+                 total_memory_buffer_percentage=1,
+                 incremental_index_array_size=1,
+                 debug_memory_params=test_debug_memory_params,
+                 ray_custom_resources=test_ray_custom_resources,
+                 estimate_resources_params=test_estimate_memory_params,
+                 round_completion_info=test_rcf,
+                 compacted_delta_manifest=test_manifest,
+                 memory_logs_enabled=True,
+             )
+         assert {k: actual_merge_tasks_opts[k] for k in expected_task_opts}
+         log_message_round_completion_info = cm.records[0].getMessage()
+         log_message_debug_memory_params = cm.records[1].getMessage()
+         self.assertIn(
+             f"[Merge task {test_index}]: Using previous compaction rounds to calculate merge memory",
+             log_message_round_completion_info,
+         )
+         self.assertIn(
+             f"[Merge task {test_index}]: Params used for calculating merge memory",
+             log_message_debug_memory_params,
+         )
+         self.assertIn(
+             f"'previous_inflation': {expected_previous_inflation}",
+             log_message_debug_memory_params,
+         )
+         self.assertIn(
+             f"'average_record_size': {expected_average_record_size}",
+             log_message_debug_memory_params,
+         )
+
+     def test_get_merge_task_options_memory_logs_enabled_not_using_previous_round_completion_info(
+         self,
+     ):
+         test_index = 0
+         test_hb_group_idx = 0
+         test_debug_memory_params = {"merge_task_index": test_index}
+         test_estimate_memory_params = self._make_estimate_resource_params()
+         test_ray_custom_resources = {}
+         test_rcf = None
+         test_manifest = self._make_manifest()
+         expected_task_opts = {
+             "max_retries": 3,
+             "memory": 1680.64,
+             "num_cpus": 0.01,
+             "scheduling_strategy": "SPREAD",
+         }
+         with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
+             # At least one log of level DEBUG must be emitted
+             actual_merge_tasks_opts = _get_merge_task_options(
+                 index=test_index,
+                 hb_group_idx=test_hb_group_idx,
+                 data_size=1,
+                 pk_size_bytes=1,
+                 num_rows=1,
+                 num_hash_groups=1,
+                 total_memory_buffer_percentage=1,
+                 incremental_index_array_size=1,
+                 debug_memory_params=test_debug_memory_params,
+                 ray_custom_resources=test_ray_custom_resources,
+                 estimate_resources_params=test_estimate_memory_params,
+                 round_completion_info=test_rcf,
+                 compacted_delta_manifest=test_manifest,
+                 memory_logs_enabled=True,
+             )
+         assert {k: actual_merge_tasks_opts[k] for k in expected_task_opts}
+         log_message_debug_memory_params = cm.records[0].getMessage()
+         self.assertIn(
+             f"[Merge task {test_index}]: Params used for calculating merge memory",
+             log_message_debug_memory_params,
+         )
+         self.assertNotIn(
+             "'average_record_size'",
+             log_message_debug_memory_params,
+         )
deltacat/tests/utils/test_pyarrow.py CHANGED
@@ -2,9 +2,12 @@ from unittest import TestCase
  from deltacat.utils.pyarrow import (
      s3_partial_parquet_file_to_table,
      pyarrow_read_csv,
+     ContentTypeValidationError,
      content_type_to_reader_kwargs,
      _add_column_kwargs,
+     logger,
      s3_file_to_table,
+     s3_file_to_parquet,
      ReadKwargsProviderPyArrowSchemaOverride,
      RAISE_ON_EMPTY_CSV_KWARG,
      RAISE_ON_DECIMAL_OVERFLOW,
@@ -435,7 +438,7 @@ class TestReadCSV(TestCase):
              pa.lib.ArrowInvalid,
              lambda: pyarrow_read_csv(
                  OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH,
-                 **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+                 **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True},
              ),
          )

@@ -479,7 +482,7 @@ class TestReadCSV(TestCase):
              pa.lib.ArrowInvalid,
              lambda: pyarrow_read_csv(
                  OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
-                 **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+                 **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True},
              ),
          )

@@ -590,7 +593,7 @@ class TestReadCSV(TestCase):
              pa.lib.ArrowNotImplementedError,
              lambda: pyarrow_read_csv(
                  OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
-                 **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+                 **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True},
              ),
          )

@@ -818,8 +821,11 @@ class TestS3FileToTable(TestCase):
          schema = pa.schema(
              [("is_active", pa.string()), ("ship_datetime_utc", pa.timestamp("us"))]
          )
-
          # OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG has no effect on uTSV files
+         pa_kwargs_provider = lambda content_type, kwargs: {
+             "reader_type": "pyarrow",
+             **kwargs,
+         }
          pa_kwargs_provider = lambda content_type, kwargs: {
              "reader_type": "pyarrow",
              OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
@@ -864,3 +870,99 @@ class TestS3FileToTable(TestCase):
          schema = result.schema
          schema_index = schema.get_field_index("n_legs")
          self.assertEqual(schema.field(schema_index).type, "int64")
+
+
+ class TestS3FileToParquet(TestCase):
+     def test_s3_file_to_parquet_sanity(self):
+         test_s3_url = PARQUET_FILE_PATH
+         test_content_type = ContentType.PARQUET.value
+         test_content_encoding = ContentEncoding.IDENTITY.value
+         pa_kwargs_provider = lambda content_type, kwargs: {
+             "reader_type": "pyarrow",
+             **kwargs,
+         }
+         with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
+             result_parquet_file: ParquetFile = s3_file_to_parquet(
+                 test_s3_url,
+                 test_content_type,
+                 test_content_encoding,
+                 ["n_legs", "animal"],
+                 ["n_legs"],
+                 pa_read_func_kwargs_provider=pa_kwargs_provider,
+             )
+         log_message_log_args = cm.records[0].getMessage()
+         log_message_presanitize_kwargs = cm.records[1].getMessage()
+         self.assertIn(
+             f"Reading {test_s3_url} to PyArrow ParquetFile. Content type: {test_content_type}. Encoding: {test_content_encoding}",
+             log_message_log_args,
+         )
+         self.assertIn("{'reader_type': 'pyarrow'}", log_message_presanitize_kwargs)
+         for index, field in enumerate(result_parquet_file.schema_arrow):
+             self.assertEqual(
+                 field.name, result_parquet_file.schema_arrow.field(index).name
+             )
+         self.assertEqual(result_parquet_file.schema_arrow.field(0).type, "int64")
+
+     def test_s3_file_to_parquet_when_parquet_gzip_encoding_and_overridden_returns_success(
+         self,
+     ):
+         test_s3_url = PARQUET_FILE_PATH
+         test_content_type = ContentType.PARQUET.value
+         test_content_encoding = ContentEncoding.GZIP.value
+         pa_kwargs_provider = lambda content_type, kwargs: {
+             "reader_type": "pyarrow",
+             OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
+             **kwargs,
+         }
+         with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
+             result_parquet_file: ParquetFile = s3_file_to_parquet(
+                 test_s3_url,
+                 test_content_type,
+                 test_content_encoding,
+                 ["n_legs", "animal"],
+                 ["n_legs"],
+                 pa_read_func_kwargs_provider=pa_kwargs_provider,
+             )
+         log_message_log_args = cm.records[0].getMessage()
+         log_message_log_new_content_encoding = cm.records[1].getMessage()
+         log_message_presanitize_kwargs = cm.records[2].getMessage()
+         self.assertIn(
+             f"Reading {test_s3_url} to PyArrow ParquetFile. Content type: {test_content_type}. Encoding: {test_content_encoding}",
+             log_message_log_args,
+         )
+         self.assertIn(
+             f"Overriding {test_s3_url} content encoding from {ContentEncoding.GZIP.value} to {ContentEncoding.IDENTITY.value}",
+             log_message_log_new_content_encoding,
+         )
+         self.assertIn("{'reader_type': 'pyarrow'}", log_message_presanitize_kwargs)
+         for index, field in enumerate(result_parquet_file.schema_arrow):
+             self.assertEqual(
+                 field.name, result_parquet_file.schema_arrow.field(index).name
+             )
+         self.assertEqual(result_parquet_file.schema_arrow.field(0).type, "int64")
+
+     def test_s3_file_to_parquet_when_parquet_gzip_encoding_not_overridden_throws_error(
+         self,
+     ):
+         test_s3_url = PARQUET_FILE_PATH
+         test_content_type = ContentType.PARQUET.value
+         test_content_encoding = ContentEncoding.GZIP.value
+         pa_kwargs_provider = lambda content_type, kwargs: {
+             "reader_type": "pyarrow",
+             **kwargs,
+         }
+         with self.assertRaises(ContentTypeValidationError):
+             with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
+                 s3_file_to_parquet(
+                     test_s3_url,
+                     test_content_type,
+                     test_content_encoding,
+                     ["n_legs", "animal"],
+                     ["n_legs"],
+                     pa_read_func_kwargs_provider=pa_kwargs_provider,
+                 )
+         log_message_log_args = cm.records[0].getMessage()
+         self.assertIn(
+             f"Reading {test_s3_url} to PyArrow ParquetFile. Content type: {test_content_type}. Encoding: {test_content_encoding}",
+             log_message_log_args,
+         )
deltacat/utils/pyarrow.py CHANGED
@@ -617,7 +617,18 @@ def s3_file_to_parquet(
          f"Reading {s3_url} to PyArrow ParquetFile. "
          f"Content type: {content_type}. Encoding: {content_encoding}"
      )
+     kwargs = {}
+     if pa_read_func_kwargs_provider:
+         kwargs = pa_read_func_kwargs_provider(content_type, kwargs)

+     if OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG in kwargs:
+         new_content_encoding = kwargs.pop(OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG)
+         if content_type == ContentType.PARQUET.value:
+             logger.debug(
+                 f"Overriding {s3_url} content encoding from {content_encoding} "
+                 f"to {new_content_encoding}"
+             )
+             content_encoding = new_content_encoding
      if (
          content_type != ContentType.PARQUET.value
          or content_encoding != ContentEncoding.IDENTITY
@@ -630,15 +641,10 @@ def s3_file_to_parquet(
      if s3_client_kwargs is None:
          s3_client_kwargs = {}

-     kwargs = {}
-
      if s3_url.startswith("s3://"):
          s3_file_system = create_s3_file_system(s3_client_kwargs)
          kwargs["filesystem"] = s3_file_system

-     if pa_read_func_kwargs_provider:
-         kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
-
      logger.debug(f"Pre-sanitize kwargs for {s3_url}: {kwargs}")

      kwargs = sanitize_kwargs_to_callable(ParquetFile.__init__, kwargs)
{deltacat-1.1.30.dist-info → deltacat-1.1.32.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: deltacat
- Version: 1.1.30
+ Version: 1.1.32
  Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
  Home-page: https://github.com/ray-project/deltacat
  Author: Ray Team
{deltacat-1.1.30.dist-info → deltacat-1.1.32.dist-info}/RECORD CHANGED
@@ -1,4 +1,4 @@
- deltacat/__init__.py,sha256=tvf604BxhCSEXRkDh5BdZzFHPZmoSOElBRJJd34KNuo,1778
+ deltacat/__init__.py,sha256=amNk91Zxauag8dm3s8SuUKinWdeAA2EaiWG9_SdboQE,1778
  deltacat/constants.py,sha256=TUJLXUJ9xq1Ryil72yLkKR8EDH_Irp5wUg56QstbRNE,2181
  deltacat/exceptions.py,sha256=7sjk3BuMY5Oo-6OvAfHncZx_OcvtEL47BblWr2F7waE,12740
  deltacat/logs.py,sha256=EQSDin1deehzz5xlLV1_TrFJrO_IBZ9Ahp7MdL-4cK8,9363
@@ -77,7 +77,7 @@ deltacat/compute/compactor_v2/utils/delta.py,sha256=I7Yvda8NVbpKXG3nM2Ku1utvR2r2
  deltacat/compute/compactor_v2/utils/io.py,sha256=3m4dorxj-WD6Yu9_3gRE6gz3C-eNJA7nn02sHKwo-J8,6018
  deltacat/compute/compactor_v2/utils/merge.py,sha256=EV_iKhNc3WflgfLW1Q46dXUvyClx8VebWHGtninEfsI,5311
  deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=SbQ97M1Cxld-zZik2QMSzlj20g6JlENaQx_0PhlCIP8,12034
- deltacat/compute/compactor_v2/utils/task_options.py,sha256=W0jyWIIZ0tcSAGp8mhpnu1G8p3rmX4d3juCPpAJxnDM,12649
+ deltacat/compute/compactor_v2/utils/task_options.py,sha256=0GoB_DLkCN1q8CVKTlWlDYt55qnpTDIa9fPyXJwB-cU,13801
  deltacat/compute/merge_on_read/__init__.py,sha256=ckbgngmqPjYBYz_NySsR1vNTOb_hNpeL1sYkZKvBI9M,214
  deltacat/compute/merge_on_read/daft.py,sha256=1oC38u5ig_aTrq7EzyWBo8Ui54rb6yERYMk-vEFbpxM,1400
  deltacat/compute/merge_on_read/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -156,7 +156,7 @@ deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=y8nNHq9ADH
  deltacat/tests/compute/compactor_v2/test_hashlib.py,sha256=8csF2hFWtBvY2MbX3-6iphCsVXxRp0zP1NTnKhfdmkg,328
  deltacat/tests/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py,sha256=aFb9rzT_EK9k8qAMHPtpqd5btyEmll1So1loDmZkotQ,1769
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py,sha256=37DkR1u_XwhedV9cGed6FFuJTC0XmuiowHJIa_Op6uA,865
+ deltacat/tests/compute/compactor_v2/utils/test_task_options.py,sha256=YDQKUKv3Vv8S1fe0YQmjHTrwnWSliqKHIWGu0fEdKnI,11478
  deltacat/tests/compute/resource_estimation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  deltacat/tests/compute/resource_estimation/test_delta.py,sha256=HCL2oUnCqm0E26T3HLJjMhoAsHTJIWPYGwIKRgM_H7E,25712
  deltacat/tests/compute/resource_estimation/test_manifest.py,sha256=yrMvqDjolExdRf6Vtg5XaKDuaKz9ok15PCZ7_aJOYrI,32893
@@ -180,7 +180,7 @@ deltacat/tests/utils/test_cloudpickle.py,sha256=J0pnBY3-PxlUh6MamZAN1PuquKQPr2iy
  deltacat/tests/utils/test_daft.py,sha256=kY8lkXoQvyWunok8UvOsh1An297rb3jcnstTuIAyAlc,8232
  deltacat/tests/utils/test_metrics.py,sha256=Ym9nOz1EtB180pLmvugihj1sDTNDMb5opIjjr5Nmcls,16339
  deltacat/tests/utils/test_placement.py,sha256=g61wVOMkHe4YJeR9Oxg_BOVQ6bhHHbC3IBYv8YhUu94,597
- deltacat/tests/utils/test_pyarrow.py,sha256=JmhcuphXD8B2SLnOgrPgrqCcdHg_BL6IjFAiNRmuA1I,32790
+ deltacat/tests/utils/test_pyarrow.py,sha256=tuh6HzQOuAHPFxK5Mhgjjdm76Z9Z72H3MZPcJ4RnZn8,37372
  deltacat/tests/utils/test_record_batch_tables.py,sha256=AkG1WyljQmjnl-AxhbFWyo5LnMIKRyLScfgC2B_ES-s,11321
  deltacat/tests/utils/test_resources.py,sha256=HtpvDrfPZQNtGDXUlsIzc_yd7Vf1cDscZ3YbN0oTvO8,2560
  deltacat/tests/utils/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -201,7 +201,7 @@ deltacat/utils/numpy.py,sha256=SpHKKvC-K8NINTWGVfTZ5-gBFTGYqaXjjgKFhsdUjwg,2049
  deltacat/utils/pandas.py,sha256=q99mlRB7tymICMcNbfGLfLqFu_C-feyPZKZm2CWJJVc,9574
  deltacat/utils/performance.py,sha256=7ZLaMkS1ehPSIhT5uOQVBHvjC70iKHzoFquFo-KL0PI,645
  deltacat/utils/placement.py,sha256=Lj20fb-eq8rgMdm_M2MBMfDLwhDM1sS1nJj2DvIK56s,12060
- deltacat/utils/pyarrow.py,sha256=9Dggs8waJrbgP62NG4ssZsl-9fl3cJ4fjYLsJ1HjhHQ,34847
+ deltacat/utils/pyarrow.py,sha256=MFCsHJKapqrhaaBeVAvwR2F1MglsNNhVZeCbk7YIdyI,35266
  deltacat/utils/resources.py,sha256=Ax1OgLLbZI4oYpp4Ki27OLaST-7I-AJgZwU87FVfY8g,8253
  deltacat/utils/s3fs.py,sha256=PmUJ5Fm1WmD-_zp_M6yd9VbXvIoJuBeK6ApOdJJApLE,662
  deltacat/utils/schema.py,sha256=m4Wm4ZQcpttzOUxex4dVneGlHy1_E36HspTcjNYzvVM,1564
@@ -211,8 +211,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
  deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
  deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
  deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
- deltacat-1.1.30.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- deltacat-1.1.30.dist-info/METADATA,sha256=rlPQCyZovCT28JZm694aOiYCH8SJ9R37yq_l_Yba0vg,1733
- deltacat-1.1.30.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
- deltacat-1.1.30.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
- deltacat-1.1.30.dist-info/RECORD,,
+ deltacat-1.1.32.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ deltacat-1.1.32.dist-info/METADATA,sha256=KqU11gn6r8cnfoyKq4_C8widB7w_wdmfN_ikhHjSZfI,1733
+ deltacat-1.1.32.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ deltacat-1.1.32.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
+ deltacat-1.1.32.dist-info/RECORD,,