deltacat 1.1.29__py3-none-any.whl → 1.1.30__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- deltacat/__init__.py +1 -1
- deltacat/tests/utils/test_pyarrow.py +52 -0
- deltacat/utils/pyarrow.py +10 -0
- {deltacat-1.1.29.dist-info → deltacat-1.1.30.dist-info}/METADATA +1 -1
- {deltacat-1.1.29.dist-info → deltacat-1.1.30.dist-info}/RECORD +8 -8
- {deltacat-1.1.29.dist-info → deltacat-1.1.30.dist-info}/LICENSE +0 -0
- {deltacat-1.1.29.dist-info → deltacat-1.1.30.dist-info}/WHEEL +0 -0
- {deltacat-1.1.29.dist-info → deltacat-1.1.30.dist-info}/top_level.txt +0 -0
deltacat/__init__.py
CHANGED
@@ -8,6 +8,7 @@ from deltacat.utils.pyarrow import (
|
|
8
8
|
ReadKwargsProviderPyArrowSchemaOverride,
|
9
9
|
RAISE_ON_EMPTY_CSV_KWARG,
|
10
10
|
RAISE_ON_DECIMAL_OVERFLOW,
|
11
|
+
OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG,
|
11
12
|
)
|
12
13
|
import decimal
|
13
14
|
from deltacat.types.media import ContentEncoding, ContentType
|
@@ -812,3 +813,54 @@ class TestS3FileToTable(TestCase):
|
|
812
813
|
schema = result.schema
|
813
814
|
schema_index = schema.get_field_index("n_legs")
|
814
815
|
self.assertEqual(schema.field(schema_index).type, "int64")
|
816
|
+
|
817
|
+
def test_s3_file_to_table_when_utsv_gzip_and_content_type_overridden(self):
|
818
|
+
schema = pa.schema(
|
819
|
+
[("is_active", pa.string()), ("ship_datetime_utc", pa.timestamp("us"))]
|
820
|
+
)
|
821
|
+
|
822
|
+
# OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG has no effect on uTSV files
|
823
|
+
pa_kwargs_provider = lambda content_type, kwargs: {
|
824
|
+
"reader_type": "pyarrow",
|
825
|
+
OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
|
826
|
+
**kwargs,
|
827
|
+
}
|
828
|
+
|
829
|
+
result = s3_file_to_table(
|
830
|
+
GZIP_COMPRESSED_FILE_UTSV_PATH,
|
831
|
+
ContentType.UNESCAPED_TSV.value,
|
832
|
+
ContentEncoding.GZIP.value,
|
833
|
+
["is_active", "ship_datetime_utc"],
|
834
|
+
None,
|
835
|
+
pa_read_func_kwargs_provider=pa_kwargs_provider,
|
836
|
+
)
|
837
|
+
|
838
|
+
self.assertEqual(len(result), 3)
|
839
|
+
self.assertEqual(len(result.column_names), 2)
|
840
|
+
result_schema = result.schema
|
841
|
+
for index, field in enumerate(result_schema):
|
842
|
+
self.assertEqual(field.name, schema.field(index).name)
|
843
|
+
|
844
|
+
self.assertEqual(result.schema.field(0).type, "string")
|
845
|
+
|
846
|
+
def test_s3_file_to_table_when_parquet_gzip_and_encoding_overridden(self):
|
847
|
+
pa_kwargs_provider = lambda content_type, kwargs: {
|
848
|
+
"reader_type": "pyarrow",
|
849
|
+
OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
|
850
|
+
**kwargs,
|
851
|
+
}
|
852
|
+
|
853
|
+
result = s3_file_to_table(
|
854
|
+
PARQUET_FILE_PATH,
|
855
|
+
ContentType.PARQUET.value,
|
856
|
+
ContentEncoding.GZIP.value,
|
857
|
+
["n_legs", "animal"],
|
858
|
+
["n_legs"],
|
859
|
+
pa_read_func_kwargs_provider=pa_kwargs_provider,
|
860
|
+
)
|
861
|
+
|
862
|
+
self.assertEqual(len(result), 6)
|
863
|
+
self.assertEqual(len(result.column_names), 1)
|
864
|
+
schema = result.schema
|
865
|
+
schema_index = schema.get_field_index("n_legs")
|
866
|
+
self.assertEqual(schema.field(schema_index).type, "int64")
|
deltacat/utils/pyarrow.py
CHANGED
@@ -47,6 +47,7 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
|
47
47
|
|
48
48
|
RAISE_ON_EMPTY_CSV_KWARG = "raise_on_empty_csv"
|
49
49
|
READER_TYPE_KWARG = "reader_type"
|
50
|
+
OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG = "override_content_encoding_for_parquet"
|
50
51
|
|
51
52
|
"""
|
52
53
|
By default, round decimal values using half_to_even round mode when
|
@@ -543,6 +544,15 @@ def s3_file_to_table(
|
|
543
544
|
if pa_read_func_kwargs_provider is not None:
|
544
545
|
kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
|
545
546
|
|
547
|
+
if OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG in kwargs:
|
548
|
+
new_content_encoding = kwargs.pop(OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG)
|
549
|
+
if content_type == ContentType.PARQUET.value:
|
550
|
+
logger.debug(
|
551
|
+
f"Overriding {s3_url} content encoding from {content_encoding} "
|
552
|
+
f"to {new_content_encoding}"
|
553
|
+
)
|
554
|
+
content_encoding = new_content_encoding
|
555
|
+
|
546
556
|
if (
|
547
557
|
content_type == ContentType.PARQUET.value
|
548
558
|
and content_encoding == ContentEncoding.IDENTITY.value
|
@@ -1,4 +1,4 @@
|
|
1
|
-
deltacat/__init__.py,sha256=
|
1
|
+
deltacat/__init__.py,sha256=tvf604BxhCSEXRkDh5BdZzFHPZmoSOElBRJJd34KNuo,1778
|
2
2
|
deltacat/constants.py,sha256=TUJLXUJ9xq1Ryil72yLkKR8EDH_Irp5wUg56QstbRNE,2181
|
3
3
|
deltacat/exceptions.py,sha256=7sjk3BuMY5Oo-6OvAfHncZx_OcvtEL47BblWr2F7waE,12740
|
4
4
|
deltacat/logs.py,sha256=EQSDin1deehzz5xlLV1_TrFJrO_IBZ9Ahp7MdL-4cK8,9363
|
@@ -180,7 +180,7 @@ deltacat/tests/utils/test_cloudpickle.py,sha256=J0pnBY3-PxlUh6MamZAN1PuquKQPr2iy
|
|
180
180
|
deltacat/tests/utils/test_daft.py,sha256=kY8lkXoQvyWunok8UvOsh1An297rb3jcnstTuIAyAlc,8232
|
181
181
|
deltacat/tests/utils/test_metrics.py,sha256=Ym9nOz1EtB180pLmvugihj1sDTNDMb5opIjjr5Nmcls,16339
|
182
182
|
deltacat/tests/utils/test_placement.py,sha256=g61wVOMkHe4YJeR9Oxg_BOVQ6bhHHbC3IBYv8YhUu94,597
|
183
|
-
deltacat/tests/utils/test_pyarrow.py,sha256=
|
183
|
+
deltacat/tests/utils/test_pyarrow.py,sha256=JmhcuphXD8B2SLnOgrPgrqCcdHg_BL6IjFAiNRmuA1I,32790
|
184
184
|
deltacat/tests/utils/test_record_batch_tables.py,sha256=AkG1WyljQmjnl-AxhbFWyo5LnMIKRyLScfgC2B_ES-s,11321
|
185
185
|
deltacat/tests/utils/test_resources.py,sha256=HtpvDrfPZQNtGDXUlsIzc_yd7Vf1cDscZ3YbN0oTvO8,2560
|
186
186
|
deltacat/tests/utils/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -201,7 +201,7 @@ deltacat/utils/numpy.py,sha256=SpHKKvC-K8NINTWGVfTZ5-gBFTGYqaXjjgKFhsdUjwg,2049
|
|
201
201
|
deltacat/utils/pandas.py,sha256=q99mlRB7tymICMcNbfGLfLqFu_C-feyPZKZm2CWJJVc,9574
|
202
202
|
deltacat/utils/performance.py,sha256=7ZLaMkS1ehPSIhT5uOQVBHvjC70iKHzoFquFo-KL0PI,645
|
203
203
|
deltacat/utils/placement.py,sha256=Lj20fb-eq8rgMdm_M2MBMfDLwhDM1sS1nJj2DvIK56s,12060
|
204
|
-
deltacat/utils/pyarrow.py,sha256=
|
204
|
+
deltacat/utils/pyarrow.py,sha256=9Dggs8waJrbgP62NG4ssZsl-9fl3cJ4fjYLsJ1HjhHQ,34847
|
205
205
|
deltacat/utils/resources.py,sha256=Ax1OgLLbZI4oYpp4Ki27OLaST-7I-AJgZwU87FVfY8g,8253
|
206
206
|
deltacat/utils/s3fs.py,sha256=PmUJ5Fm1WmD-_zp_M6yd9VbXvIoJuBeK6ApOdJJApLE,662
|
207
207
|
deltacat/utils/schema.py,sha256=m4Wm4ZQcpttzOUxex4dVneGlHy1_E36HspTcjNYzvVM,1564
|
@@ -211,8 +211,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
|
|
211
211
|
deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
|
212
212
|
deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
|
213
213
|
deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
|
214
|
-
deltacat-1.1.29.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
215
|
-
deltacat-1.1.29.dist-info/METADATA,sha256=
|
216
|
-
deltacat-1.1.29.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
217
|
-
deltacat-1.1.29.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
|
218
|
-
deltacat-1.1.29.dist-info/RECORD,,
|
214
|
+
deltacat-1.1.30.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
215
|
+
deltacat-1.1.30.dist-info/METADATA,sha256=rlPQCyZovCT28JZm694aOiYCH8SJ9R37yq_l_Yba0vg,1733
|
216
|
+
deltacat-1.1.30.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
217
|
+
deltacat-1.1.30.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
|
218
|
+
deltacat-1.1.30.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|