deltacat 1.1.31__py3-none-any.whl → 1.1.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deltacat/__init__.py CHANGED
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode
 
  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
 
- __version__ = "1.1.31"
+ __version__ = "1.1.32"
 
 
  __all__ = [
deltacat/tests/utils/test_pyarrow.py CHANGED
@@ -2,9 +2,12 @@ from unittest import TestCase
  from deltacat.utils.pyarrow import (
      s3_partial_parquet_file_to_table,
      pyarrow_read_csv,
+     ContentTypeValidationError,
      content_type_to_reader_kwargs,
      _add_column_kwargs,
+     logger,
      s3_file_to_table,
+     s3_file_to_parquet,
      ReadKwargsProviderPyArrowSchemaOverride,
      RAISE_ON_EMPTY_CSV_KWARG,
      RAISE_ON_DECIMAL_OVERFLOW,
@@ -435,7 +438,7 @@ class TestReadCSV(TestCase):
              pa.lib.ArrowInvalid,
              lambda: pyarrow_read_csv(
                  OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH,
-                 **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+                 **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True},
              ),
          )
 
@@ -479,7 +482,7 @@ class TestReadCSV(TestCase):
              pa.lib.ArrowInvalid,
              lambda: pyarrow_read_csv(
                  OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
-                 **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+                 **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True},
              ),
          )
 
@@ -590,7 +593,7 @@ class TestReadCSV(TestCase):
              pa.lib.ArrowNotImplementedError,
              lambda: pyarrow_read_csv(
                  OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
-                 **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+                 **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True},
              ),
          )
 
@@ -818,8 +821,11 @@ class TestS3FileToTable(TestCase):
          schema = pa.schema(
              [("is_active", pa.string()), ("ship_datetime_utc", pa.timestamp("us"))]
          )
-
          # OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG has no effect on uTSV files
+         pa_kwargs_provider = lambda content_type, kwargs: {
+             "reader_type": "pyarrow",
+             **kwargs,
+         }
          pa_kwargs_provider = lambda content_type, kwargs: {
              "reader_type": "pyarrow",
              OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
@@ -864,3 +870,99 @@ class TestS3FileToTable(TestCase):
          schema = result.schema
          schema_index = schema.get_field_index("n_legs")
          self.assertEqual(schema.field(schema_index).type, "int64")
+
+
+ class TestS3FileToParquet(TestCase):
+     def test_s3_file_to_parquet_sanity(self):
+         test_s3_url = PARQUET_FILE_PATH
+         test_content_type = ContentType.PARQUET.value
+         test_content_encoding = ContentEncoding.IDENTITY.value
+         pa_kwargs_provider = lambda content_type, kwargs: {
+             "reader_type": "pyarrow",
+             **kwargs,
+         }
+         with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
+             result_parquet_file: ParquetFile = s3_file_to_parquet(
+                 test_s3_url,
+                 test_content_type,
+                 test_content_encoding,
+                 ["n_legs", "animal"],
+                 ["n_legs"],
+                 pa_read_func_kwargs_provider=pa_kwargs_provider,
+             )
+         log_message_log_args = cm.records[0].getMessage()
+         log_message_presanitize_kwargs = cm.records[1].getMessage()
+         self.assertIn(
+             f"Reading {test_s3_url} to PyArrow ParquetFile. Content type: {test_content_type}. Encoding: {test_content_encoding}",
+             log_message_log_args,
+         )
+         self.assertIn("{'reader_type': 'pyarrow'}", log_message_presanitize_kwargs)
+         for index, field in enumerate(result_parquet_file.schema_arrow):
+             self.assertEqual(
+                 field.name, result_parquet_file.schema_arrow.field(index).name
+             )
+         self.assertEqual(result_parquet_file.schema_arrow.field(0).type, "int64")
+
+     def test_s3_file_to_parquet_when_parquet_gzip_encoding_and_overridden_returns_success(
+         self,
+     ):
+         test_s3_url = PARQUET_FILE_PATH
+         test_content_type = ContentType.PARQUET.value
+         test_content_encoding = ContentEncoding.GZIP.value
+         pa_kwargs_provider = lambda content_type, kwargs: {
+             "reader_type": "pyarrow",
+             OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
+             **kwargs,
+         }
+         with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
+             result_parquet_file: ParquetFile = s3_file_to_parquet(
+                 test_s3_url,
+                 test_content_type,
+                 test_content_encoding,
+                 ["n_legs", "animal"],
+                 ["n_legs"],
+                 pa_read_func_kwargs_provider=pa_kwargs_provider,
+             )
+         log_message_log_args = cm.records[0].getMessage()
+         log_message_log_new_content_encoding = cm.records[1].getMessage()
+         log_message_presanitize_kwargs = cm.records[2].getMessage()
+         self.assertIn(
+             f"Reading {test_s3_url} to PyArrow ParquetFile. Content type: {test_content_type}. Encoding: {test_content_encoding}",
+             log_message_log_args,
+         )
+         self.assertIn(
+             f"Overriding {test_s3_url} content encoding from {ContentEncoding.GZIP.value} to {ContentEncoding.IDENTITY.value}",
+             log_message_log_new_content_encoding,
+         )
+         self.assertIn("{'reader_type': 'pyarrow'}", log_message_presanitize_kwargs)
+         for index, field in enumerate(result_parquet_file.schema_arrow):
+             self.assertEqual(
+                 field.name, result_parquet_file.schema_arrow.field(index).name
+             )
+         self.assertEqual(result_parquet_file.schema_arrow.field(0).type, "int64")
+
+     def test_s3_file_to_parquet_when_parquet_gzip_encoding_not_overridden_throws_error(
+         self,
+     ):
+         test_s3_url = PARQUET_FILE_PATH
+         test_content_type = ContentType.PARQUET.value
+         test_content_encoding = ContentEncoding.GZIP.value
+         pa_kwargs_provider = lambda content_type, kwargs: {
+             "reader_type": "pyarrow",
+             **kwargs,
+         }
+         with self.assertRaises(ContentTypeValidationError):
+             with self.assertLogs(logger=logger.name, level="DEBUG") as cm:
+                 s3_file_to_parquet(
+                     test_s3_url,
+                     test_content_type,
+                     test_content_encoding,
+                     ["n_legs", "animal"],
+                     ["n_legs"],
+                     pa_read_func_kwargs_provider=pa_kwargs_provider,
+                 )
+         log_message_log_args = cm.records[0].getMessage()
+         self.assertIn(
+             f"Reading {test_s3_url} to PyArrow ParquetFile. Content type: {test_content_type}. Encoding: {test_content_encoding}",
+             log_message_log_args,
+         )
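The new TestS3FileToParquet cases can be run in isolation from a source checkout of the repository, since they read the local fixture referenced by PARQUET_FILE_PATH rather than a live S3 bucket. A minimal sketch using only the standard library's unittest loader; the dotted module path is taken from the RECORD listing further below, and everything else (a checkout on sys.path with its bundled test data) is an assumption:

# Sketch: load and run only the new ParquetFile tests.
# Assumes a deltacat source checkout with its test data is importable.
import unittest

suite = unittest.defaultTestLoader.loadTestsFromName(
    "deltacat.tests.utils.test_pyarrow.TestS3FileToParquet"
)
unittest.TextTestRunner(verbosity=2).run(suite)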
deltacat/utils/pyarrow.py CHANGED
@@ -617,7 +617,18 @@ def s3_file_to_parquet(
          f"Reading {s3_url} to PyArrow ParquetFile. "
          f"Content type: {content_type}. Encoding: {content_encoding}"
      )
+     kwargs = {}
+     if pa_read_func_kwargs_provider:
+         kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
 
+     if OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG in kwargs:
+         new_content_encoding = kwargs.pop(OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG)
+         if content_type == ContentType.PARQUET.value:
+             logger.debug(
+                 f"Overriding {s3_url} content encoding from {content_encoding} "
+                 f"to {new_content_encoding}"
+             )
+             content_encoding = new_content_encoding
      if (
          content_type != ContentType.PARQUET.value
          or content_encoding != ContentEncoding.IDENTITY
@@ -630,15 +641,10 @@
      if s3_client_kwargs is None:
          s3_client_kwargs = {}
 
-     kwargs = {}
-
      if s3_url.startswith("s3://"):
          s3_file_system = create_s3_file_system(s3_client_kwargs)
          kwargs["filesystem"] = s3_file_system
 
-     if pa_read_func_kwargs_provider:
-         kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
-
      logger.debug(f"Pre-sanitize kwargs for {s3_url}: {kwargs}")
 
      kwargs = sanitize_kwargs_to_callable(ParquetFile.__init__, kwargs)
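Taken together, the two hunks above move the pa_read_func_kwargs_provider call ahead of the content-type/encoding validation, so a provider can supply OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG and have the declared encoding rewritten (for Parquet content only) before s3_file_to_parquet raises ContentTypeValidationError. A minimal caller-side sketch modeled on the new tests; the positional argument order mirrors the test calls above, while the import locations and the local path "my-file.parquet" are assumptions, not part of this diff:

# Sketch only: read a Parquet file whose declared content encoding is GZIP,
# overriding it to IDENTITY through the kwargs provider so validation passes.
from pyarrow.parquet import ParquetFile

from deltacat.types.media import ContentEncoding, ContentType  # assumed import path
from deltacat.utils.pyarrow import (  # assumed import path for the override constant
    OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG,
    s3_file_to_parquet,
)

# The provider now runs before validation, so the override key is honored.
pa_kwargs_provider = lambda content_type, kwargs: {
    OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
    **kwargs,
}

parquet_file: ParquetFile = s3_file_to_parquet(
    "my-file.parquet",           # hypothetical local path; s3:// URLs are also supported
    ContentType.PARQUET.value,
    ContentEncoding.GZIP.value,  # declared encoding that the provider overrides
    ["n_legs", "animal"],        # column names, as in the new tests
    ["n_legs"],                  # include columns
    pa_read_func_kwargs_provider=pa_kwargs_provider,
)
print(parquet_file.schema_arrow)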
deltacat-1.1.32.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: deltacat
- Version: 1.1.31
+ Version: 1.1.32
  Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
  Home-page: https://github.com/ray-project/deltacat
  Author: Ray Team
deltacat-1.1.32.dist-info/RECORD CHANGED
@@ -1,4 +1,4 @@
- deltacat/__init__.py,sha256=gdOpCNy03T2HEQIQqSqopv0b0UL5pwXWa4McRHxMlAw,1778
+ deltacat/__init__.py,sha256=amNk91Zxauag8dm3s8SuUKinWdeAA2EaiWG9_SdboQE,1778
  deltacat/constants.py,sha256=TUJLXUJ9xq1Ryil72yLkKR8EDH_Irp5wUg56QstbRNE,2181
  deltacat/exceptions.py,sha256=7sjk3BuMY5Oo-6OvAfHncZx_OcvtEL47BblWr2F7waE,12740
  deltacat/logs.py,sha256=EQSDin1deehzz5xlLV1_TrFJrO_IBZ9Ahp7MdL-4cK8,9363
@@ -180,7 +180,7 @@ deltacat/tests/utils/test_cloudpickle.py,sha256=J0pnBY3-PxlUh6MamZAN1PuquKQPr2iy
  deltacat/tests/utils/test_daft.py,sha256=kY8lkXoQvyWunok8UvOsh1An297rb3jcnstTuIAyAlc,8232
  deltacat/tests/utils/test_metrics.py,sha256=Ym9nOz1EtB180pLmvugihj1sDTNDMb5opIjjr5Nmcls,16339
  deltacat/tests/utils/test_placement.py,sha256=g61wVOMkHe4YJeR9Oxg_BOVQ6bhHHbC3IBYv8YhUu94,597
- deltacat/tests/utils/test_pyarrow.py,sha256=JmhcuphXD8B2SLnOgrPgrqCcdHg_BL6IjFAiNRmuA1I,32790
+ deltacat/tests/utils/test_pyarrow.py,sha256=tuh6HzQOuAHPFxK5Mhgjjdm76Z9Z72H3MZPcJ4RnZn8,37372
  deltacat/tests/utils/test_record_batch_tables.py,sha256=AkG1WyljQmjnl-AxhbFWyo5LnMIKRyLScfgC2B_ES-s,11321
  deltacat/tests/utils/test_resources.py,sha256=HtpvDrfPZQNtGDXUlsIzc_yd7Vf1cDscZ3YbN0oTvO8,2560
  deltacat/tests/utils/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -201,7 +201,7 @@ deltacat/utils/numpy.py,sha256=SpHKKvC-K8NINTWGVfTZ5-gBFTGYqaXjjgKFhsdUjwg,2049
  deltacat/utils/pandas.py,sha256=q99mlRB7tymICMcNbfGLfLqFu_C-feyPZKZm2CWJJVc,9574
  deltacat/utils/performance.py,sha256=7ZLaMkS1ehPSIhT5uOQVBHvjC70iKHzoFquFo-KL0PI,645
  deltacat/utils/placement.py,sha256=Lj20fb-eq8rgMdm_M2MBMfDLwhDM1sS1nJj2DvIK56s,12060
- deltacat/utils/pyarrow.py,sha256=9Dggs8waJrbgP62NG4ssZsl-9fl3cJ4fjYLsJ1HjhHQ,34847
+ deltacat/utils/pyarrow.py,sha256=MFCsHJKapqrhaaBeVAvwR2F1MglsNNhVZeCbk7YIdyI,35266
  deltacat/utils/resources.py,sha256=Ax1OgLLbZI4oYpp4Ki27OLaST-7I-AJgZwU87FVfY8g,8253
  deltacat/utils/s3fs.py,sha256=PmUJ5Fm1WmD-_zp_M6yd9VbXvIoJuBeK6ApOdJJApLE,662
  deltacat/utils/schema.py,sha256=m4Wm4ZQcpttzOUxex4dVneGlHy1_E36HspTcjNYzvVM,1564
@@ -211,8 +211,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
  deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
  deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
  deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
- deltacat-1.1.31.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- deltacat-1.1.31.dist-info/METADATA,sha256=JrWYw0uKVprpH34i-_cOUYjWI3egRQx0rhCn--OnE_0,1733
- deltacat-1.1.31.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
- deltacat-1.1.31.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
- deltacat-1.1.31.dist-info/RECORD,,
+ deltacat-1.1.32.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ deltacat-1.1.32.dist-info/METADATA,sha256=KqU11gn6r8cnfoyKq4_C8widB7w_wdmfN_ikhHjSZfI,1733
+ deltacat-1.1.32.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ deltacat-1.1.32.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
+ deltacat-1.1.32.dist-info/RECORD,,