deltacat 1.1.31__py3-none-any.whl → 1.1.32__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- deltacat/__init__.py +1 -1
- deltacat/tests/utils/test_pyarrow.py +106 -4
- deltacat/utils/pyarrow.py +11 -5
- {deltacat-1.1.31.dist-info → deltacat-1.1.32.dist-info}/METADATA +1 -1
- {deltacat-1.1.31.dist-info → deltacat-1.1.32.dist-info}/RECORD +8 -8
- {deltacat-1.1.31.dist-info → deltacat-1.1.32.dist-info}/LICENSE +0 -0
- {deltacat-1.1.31.dist-info → deltacat-1.1.32.dist-info}/WHEEL +0 -0
- {deltacat-1.1.31.dist-info → deltacat-1.1.32.dist-info}/top_level.txt +0 -0
deltacat/tests/utils/test_pyarrow.py
CHANGED
@@ -2,9 +2,12 @@ from unittest import TestCase
 from deltacat.utils.pyarrow import (
     s3_partial_parquet_file_to_table,
     pyarrow_read_csv,
+    ContentTypeValidationError,
     content_type_to_reader_kwargs,
     _add_column_kwargs,
+    logger,
     s3_file_to_table,
+    s3_file_to_parquet,
     ReadKwargsProviderPyArrowSchemaOverride,
     RAISE_ON_EMPTY_CSV_KWARG,
     RAISE_ON_DECIMAL_OVERFLOW,
@@ -435,7 +438,7 @@ class TestReadCSV(TestCase):
             pa.lib.ArrowInvalid,
             lambda: pyarrow_read_csv(
                 OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH,
-                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True},
             ),
         )
 
@@ -479,7 +482,7 @@ class TestReadCSV(TestCase):
             pa.lib.ArrowInvalid,
             lambda: pyarrow_read_csv(
                 OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
-                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True},
             ),
         )
 
@@ -590,7 +593,7 @@ class TestReadCSV(TestCase):
             pa.lib.ArrowNotImplementedError,
             lambda: pyarrow_read_csv(
                 OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
-                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True},
             ),
         )
 
@@ -818,8 +821,11 @@ class TestS3FileToTable(TestCase):
         schema = pa.schema(
             [("is_active", pa.string()), ("ship_datetime_utc", pa.timestamp("us"))]
         )
-
         # OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG has no effect on uTSV files
+        pa_kwargs_provider = lambda content_type, kwargs: {
+            "reader_type": "pyarrow",
+            **kwargs,
+        }
         pa_kwargs_provider = lambda content_type, kwargs: {
             "reader_type": "pyarrow",
             OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
@@ -864,3 +870,99 @@
         schema = result.schema
         schema_index = schema.get_field_index("n_legs")
         self.assertEqual(schema.field(schema_index).type, "int64")
+
+
+class TestS3FileToParquet(TestCase):
+    def test_s3_file_to_parquet_sanity(self):
+        test_s3_url = PARQUET_FILE_PATH
+        test_content_type = ContentType.PARQUET.value
+        test_content_encoding = ContentEncoding.IDENTITY.value
+        pa_kwargs_provider = lambda content_type, kwargs: {
+            "reader_type": "pyarrow",
+            **kwargs,
+        }
+        with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
+            result_parquet_file: ParquetFile = s3_file_to_parquet(
+                test_s3_url,
+                test_content_type,
+                test_content_encoding,
+                ["n_legs", "animal"],
+                ["n_legs"],
+                pa_read_func_kwargs_provider=pa_kwargs_provider,
+            )
+        log_message_log_args = cm.records[0].getMessage()
+        log_message_presanitize_kwargs = cm.records[1].getMessage()
+        self.assertIn(
+            f"Reading {test_s3_url} to PyArrow ParquetFile. Content type: {test_content_type}. Encoding: {test_content_encoding}",
+            log_message_log_args,
+        )
+        self.assertIn("{'reader_type': 'pyarrow'}", log_message_presanitize_kwargs)
+        for index, field in enumerate(result_parquet_file.schema_arrow):
+            self.assertEqual(
+                field.name, result_parquet_file.schema_arrow.field(index).name
+            )
+        self.assertEqual(result_parquet_file.schema_arrow.field(0).type, "int64")
+
+    def test_s3_file_to_parquet_when_parquet_gzip_encoding_and_overridden_returns_success(
+        self,
+    ):
+        test_s3_url = PARQUET_FILE_PATH
+        test_content_type = ContentType.PARQUET.value
+        test_content_encoding = ContentEncoding.GZIP.value
+        pa_kwargs_provider = lambda content_type, kwargs: {
+            "reader_type": "pyarrow",
+            OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
+            **kwargs,
+        }
+        with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
+            result_parquet_file: ParquetFile = s3_file_to_parquet(
+                test_s3_url,
+                test_content_type,
+                test_content_encoding,
+                ["n_legs", "animal"],
+                ["n_legs"],
+                pa_read_func_kwargs_provider=pa_kwargs_provider,
+            )
+        log_message_log_args = cm.records[0].getMessage()
+        log_message_log_new_content_encoding = cm.records[1].getMessage()
+        log_message_presanitize_kwargs = cm.records[2].getMessage()
+        self.assertIn(
+            f"Reading {test_s3_url} to PyArrow ParquetFile. Content type: {test_content_type}. Encoding: {test_content_encoding}",
+            log_message_log_args,
+        )
+        self.assertIn(
+            f"Overriding {test_s3_url} content encoding from {ContentEncoding.GZIP.value} to {ContentEncoding.IDENTITY.value}",
+            log_message_log_new_content_encoding,
+        )
+        self.assertIn("{'reader_type': 'pyarrow'}", log_message_presanitize_kwargs)
+        for index, field in enumerate(result_parquet_file.schema_arrow):
+            self.assertEqual(
+                field.name, result_parquet_file.schema_arrow.field(index).name
+            )
+        self.assertEqual(result_parquet_file.schema_arrow.field(0).type, "int64")
+
+    def test_s3_file_to_parquet_when_parquet_gzip_encoding_not_overridden_throws_error(
+        self,
+    ):
+        test_s3_url = PARQUET_FILE_PATH
+        test_content_type = ContentType.PARQUET.value
+        test_content_encoding = ContentEncoding.GZIP.value
+        pa_kwargs_provider = lambda content_type, kwargs: {
+            "reader_type": "pyarrow",
+            **kwargs,
+        }
+        with self.assertRaises(ContentTypeValidationError):
+            with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
+                s3_file_to_parquet(
+                    test_s3_url,
+                    test_content_type,
+                    test_content_encoding,
+                    ["n_legs", "animal"],
+                    ["n_legs"],
+                    pa_read_func_kwargs_provider=pa_kwargs_provider,
+                )
+        log_message_log_args = cm.records[0].getMessage()
+        self.assertIn(
+            f"Reading {test_s3_url} to PyArrow ParquetFile. Content type: {test_content_type}. Encoding: {test_content_encoding}",
+            log_message_log_args,
+        )
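The three `TestS3FileToParquet` cases above pin down the new contract of `s3_file_to_parquet`: identity-encoded Parquet reads succeed, a GZIP content-encoding label can be rewritten through `OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG` in the kwargs provider, and an unoverridden GZIP label raises `ContentTypeValidationError`. A minimal sketch of the override path, assuming a Parquet URL of your own in place of the tests' local `PARQUET_FILE_PATH` fixture, that `ContentType`/`ContentEncoding` come from `deltacat.types.media`, and that the kwarg constant is importable from `deltacat.utils.pyarrow`, where the diff references it:

```python
from pyarrow.parquet import ParquetFile

from deltacat.types.media import ContentEncoding, ContentType
from deltacat.utils.pyarrow import (
    OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG,
    s3_file_to_parquet,
)

# Hypothetical input; the tests read a local fixture file instead.
parquet_url = "s3://my-bucket/mytable.parquet"

# As in the tests: the provider's kwargs are consumed first, so the
# override below rewrites the (incorrect) GZIP label before validation.
pa_kwargs_provider = lambda content_type, kwargs: {
    "reader_type": "pyarrow",
    OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
    **kwargs,
}

parquet_file: ParquetFile = s3_file_to_parquet(
    parquet_url,
    ContentType.PARQUET.value,
    ContentEncoding.GZIP.value,  # mislabeled encoding, overridden above
    ["n_legs", "animal"],  # column arguments mirror the tests
    ["n_legs"],
    pa_read_func_kwargs_provider=pa_kwargs_provider,
)
print(parquet_file.schema_arrow)
```

Dropping the override entry from the provider reproduces the failure case: the same call then raises `ContentTypeValidationError` instead of returning a `ParquetFile`.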
deltacat/utils/pyarrow.py
CHANGED
@@ -617,7 +617,18 @@ def s3_file_to_parquet(
         f"Reading {s3_url} to PyArrow ParquetFile. "
         f"Content type: {content_type}. Encoding: {content_encoding}"
     )
+    kwargs = {}
+    if pa_read_func_kwargs_provider:
+        kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
 
+    if OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG in kwargs:
+        new_content_encoding = kwargs.pop(OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG)
+        if content_type == ContentType.PARQUET.value:
+            logger.debug(
+                f"Overriding {s3_url} content encoding from {content_encoding} "
+                f"to {new_content_encoding}"
+            )
+            content_encoding = new_content_encoding
     if (
         content_type != ContentType.PARQUET.value
         or content_encoding != ContentEncoding.IDENTITY
@@ -630,15 +641,10 @@ def s3_file_to_parquet(
     if s3_client_kwargs is None:
         s3_client_kwargs = {}
 
-    kwargs = {}
-
     if s3_url.startswith("s3://"):
         s3_file_system = create_s3_file_system(s3_client_kwargs)
         kwargs["filesystem"] = s3_file_system
 
-    if pa_read_func_kwargs_provider:
-        kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
-
     logger.debug(f"Pre-sanitize kwargs for {s3_url}: {kwargs}")
 
     kwargs = sanitize_kwargs_to_callable(ParquetFile.__init__, kwargs)
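The implementation change behind those tests is an ordering fix. Previously `kwargs = {}` and the `pa_read_func_kwargs_provider` call sat after the content-type/encoding validation, so an encoding override supplied by the provider arrived too late; in 1.1.32 the provider runs first, `OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG` is popped before it can leak into `ParquetFile.__init__`, and only then is the `ContentType.PARQUET`/`ContentEncoding.IDENTITY` check applied. A self-contained sketch of the reordered control flow (simplified stand-in names, not the library's exact code):

```python
# Minimal sketch of the 1.1.32 ordering with stand-in constants; the real
# function also wires up an S3 filesystem and sanitizes kwargs afterwards.
OVERRIDE_KWARG = "override_content_encoding_for_parquet"

PARQUET = "application/parquet"
IDENTITY = "identity"


class ContentTypeValidationError(ValueError):
    """Stand-in for deltacat's exception of the same name."""


def resolve_encoding(content_type, content_encoding, kwargs_provider=None):
    # 1. Build reader kwargs first (the part the release moved up).
    kwargs = {}
    if kwargs_provider:
        kwargs = kwargs_provider(content_type, kwargs)

    # 2. Pop the override so it never reaches the reader's constructor,
    #    and apply it only to Parquet content.
    if OVERRIDE_KWARG in kwargs:
        new_encoding = kwargs.pop(OVERRIDE_KWARG)
        if content_type == PARQUET:
            content_encoding = new_encoding

    # 3. Validate only after any override has been applied.
    if content_type != PARQUET or content_encoding != IDENTITY:
        raise ContentTypeValidationError(
            f"unsupported combination: {content_type} / {content_encoding}"
        )
    return kwargs


# GZIP-labeled Parquet passes once the provider overrides the encoding:
provider = lambda ct, kw: {OVERRIDE_KWARG: IDENTITY, **kw}
assert resolve_encoding(PARQUET, "gzip", provider) == {}
```

The key design point is that the override is popped rather than merely read: it must not survive into the kwargs that are later passed through to `ParquetFile`.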
{deltacat-1.1.31.dist-info → deltacat-1.1.32.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-deltacat/__init__.py,sha256=
+deltacat/__init__.py,sha256=amNk91Zxauag8dm3s8SuUKinWdeAA2EaiWG9_SdboQE,1778
 deltacat/constants.py,sha256=TUJLXUJ9xq1Ryil72yLkKR8EDH_Irp5wUg56QstbRNE,2181
 deltacat/exceptions.py,sha256=7sjk3BuMY5Oo-6OvAfHncZx_OcvtEL47BblWr2F7waE,12740
 deltacat/logs.py,sha256=EQSDin1deehzz5xlLV1_TrFJrO_IBZ9Ahp7MdL-4cK8,9363
@@ -180,7 +180,7 @@ deltacat/tests/utils/test_cloudpickle.py,sha256=J0pnBY3-PxlUh6MamZAN1PuquKQPr2iy
 deltacat/tests/utils/test_daft.py,sha256=kY8lkXoQvyWunok8UvOsh1An297rb3jcnstTuIAyAlc,8232
 deltacat/tests/utils/test_metrics.py,sha256=Ym9nOz1EtB180pLmvugihj1sDTNDMb5opIjjr5Nmcls,16339
 deltacat/tests/utils/test_placement.py,sha256=g61wVOMkHe4YJeR9Oxg_BOVQ6bhHHbC3IBYv8YhUu94,597
-deltacat/tests/utils/test_pyarrow.py,sha256=
+deltacat/tests/utils/test_pyarrow.py,sha256=tuh6HzQOuAHPFxK5Mhgjjdm76Z9Z72H3MZPcJ4RnZn8,37372
 deltacat/tests/utils/test_record_batch_tables.py,sha256=AkG1WyljQmjnl-AxhbFWyo5LnMIKRyLScfgC2B_ES-s,11321
 deltacat/tests/utils/test_resources.py,sha256=HtpvDrfPZQNtGDXUlsIzc_yd7Vf1cDscZ3YbN0oTvO8,2560
 deltacat/tests/utils/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -201,7 +201,7 @@ deltacat/utils/numpy.py,sha256=SpHKKvC-K8NINTWGVfTZ5-gBFTGYqaXjjgKFhsdUjwg,2049
 deltacat/utils/pandas.py,sha256=q99mlRB7tymICMcNbfGLfLqFu_C-feyPZKZm2CWJJVc,9574
 deltacat/utils/performance.py,sha256=7ZLaMkS1ehPSIhT5uOQVBHvjC70iKHzoFquFo-KL0PI,645
 deltacat/utils/placement.py,sha256=Lj20fb-eq8rgMdm_M2MBMfDLwhDM1sS1nJj2DvIK56s,12060
-deltacat/utils/pyarrow.py,sha256=
+deltacat/utils/pyarrow.py,sha256=MFCsHJKapqrhaaBeVAvwR2F1MglsNNhVZeCbk7YIdyI,35266
 deltacat/utils/resources.py,sha256=Ax1OgLLbZI4oYpp4Ki27OLaST-7I-AJgZwU87FVfY8g,8253
 deltacat/utils/s3fs.py,sha256=PmUJ5Fm1WmD-_zp_M6yd9VbXvIoJuBeK6ApOdJJApLE,662
 deltacat/utils/schema.py,sha256=m4Wm4ZQcpttzOUxex4dVneGlHy1_E36HspTcjNYzvVM,1564
@@ -211,8 +211,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
 deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
 deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
 deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
-deltacat-1.1.
-deltacat-1.1.
-deltacat-1.1.
-deltacat-1.1.
-deltacat-1.1.
+deltacat-1.1.32.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+deltacat-1.1.32.dist-info/METADATA,sha256=KqU11gn6r8cnfoyKq4_C8widB7w_wdmfN_ikhHjSZfI,1733
+deltacat-1.1.32.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+deltacat-1.1.32.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
+deltacat-1.1.32.dist-info/RECORD,,
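For reference, each RECORD line has the form `path,sha256=<digest>,<size>`, where the digest is the unpadded urlsafe-base64 SHA-256 of the file contents (per PEP 376 and the wheel spec). A small sketch that recomputes an entry from an extracted wheel, so the hashes above can be checked:

```python
import base64
import hashlib


def record_entry(path: str) -> str:
    """Rebuild a wheel RECORD line for one file: path,sha256=<digest>,<size>."""
    with open(path, "rb") as f:
        data = f.read()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return f"{path},sha256={digest.decode()},{len(data)}"


# Run from the root of the unpacked 1.1.32 wheel; this should reproduce
# the deltacat/__init__.py line shown in the first RECORD hunk above.
print(record_entry("deltacat/__init__.py"))
```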
{deltacat-1.1.31.dist-info → deltacat-1.1.32.dist-info}/LICENSE
File without changes
{deltacat-1.1.31.dist-info → deltacat-1.1.32.dist-info}/WHEEL
File without changes
{deltacat-1.1.31.dist-info → deltacat-1.1.32.dist-info}/top_level.txt
File without changes