deltacat-1.1.29-py3-none-any.whl → deltacat-1.1.30-py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
deltacat/__init__.py CHANGED
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode
44
44
 
45
45
  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
46
46
 
47
- __version__ = "1.1.29"
47
+ __version__ = "1.1.30"
48
48
 
49
49
 
50
50
  __all__ = [
deltacat/tests/utils/test_pyarrow.py CHANGED
@@ -8,6 +8,7 @@ from deltacat.utils.pyarrow import (
8
8
  ReadKwargsProviderPyArrowSchemaOverride,
9
9
  RAISE_ON_EMPTY_CSV_KWARG,
10
10
  RAISE_ON_DECIMAL_OVERFLOW,
11
+ OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG,
11
12
  )
12
13
  import decimal
13
14
  from deltacat.types.media import ContentEncoding, ContentType
@@ -812,3 +813,54 @@ class TestS3FileToTable(TestCase):
812
813
  schema = result.schema
813
814
  schema_index = schema.get_field_index("n_legs")
814
815
  self.assertEqual(schema.field(schema_index).type, "int64")
816
+
817
+ def test_s3_file_to_table_when_utsv_gzip_and_content_type_overridden(self):
818
+ schema = pa.schema(
819
+ [("is_active", pa.string()), ("ship_datetime_utc", pa.timestamp("us"))]
820
+ )
821
+
822
+ # OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG has no effect on uTSV files
823
+ pa_kwargs_provider = lambda content_type, kwargs: {
824
+ "reader_type": "pyarrow",
825
+ OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
826
+ **kwargs,
827
+ }
828
+
829
+ result = s3_file_to_table(
830
+ GZIP_COMPRESSED_FILE_UTSV_PATH,
831
+ ContentType.UNESCAPED_TSV.value,
832
+ ContentEncoding.GZIP.value,
833
+ ["is_active", "ship_datetime_utc"],
834
+ None,
835
+ pa_read_func_kwargs_provider=pa_kwargs_provider,
836
+ )
837
+
838
+ self.assertEqual(len(result), 3)
839
+ self.assertEqual(len(result.column_names), 2)
840
+ result_schema = result.schema
841
+ for index, field in enumerate(result_schema):
842
+ self.assertEqual(field.name, schema.field(index).name)
843
+
844
+ self.assertEqual(result.schema.field(0).type, "string")
845
+
846
+ def test_s3_file_to_table_when_parquet_gzip_and_encoding_overridden(self):
847
+ pa_kwargs_provider = lambda content_type, kwargs: {
848
+ "reader_type": "pyarrow",
849
+ OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG: ContentEncoding.IDENTITY.value,
850
+ **kwargs,
851
+ }
852
+
853
+ result = s3_file_to_table(
854
+ PARQUET_FILE_PATH,
855
+ ContentType.PARQUET.value,
856
+ ContentEncoding.GZIP.value,
857
+ ["n_legs", "animal"],
858
+ ["n_legs"],
859
+ pa_read_func_kwargs_provider=pa_kwargs_provider,
860
+ )
861
+
862
+ self.assertEqual(len(result), 6)
863
+ self.assertEqual(len(result.column_names), 1)
864
+ schema = result.schema
865
+ schema_index = schema.get_field_index("n_legs")
866
+ self.assertEqual(schema.field(schema_index).type, "int64")
deltacat/utils/pyarrow.py CHANGED
@@ -47,6 +47,7 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
47
47
 
48
48
  RAISE_ON_EMPTY_CSV_KWARG = "raise_on_empty_csv"
49
49
  READER_TYPE_KWARG = "reader_type"
50
+ OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG = "override_content_encoding_for_parquet"
50
51
 
51
52
  """
52
53
  By default, round decimal values using half_to_even round mode when
@@ -543,6 +544,15 @@ def s3_file_to_table(
543
544
  if pa_read_func_kwargs_provider is not None:
544
545
  kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
545
546
 
547
+ if OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG in kwargs:
548
+ new_content_encoding = kwargs.pop(OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG)
549
+ if content_type == ContentType.PARQUET.value:
550
+ logger.debug(
551
+ f"Overriding {s3_url} content encoding from {content_encoding} "
552
+ f"to {new_content_encoding}"
553
+ )
554
+ content_encoding = new_content_encoding
555
+
546
556
  if (
547
557
  content_type == ContentType.PARQUET.value
548
558
  and content_encoding == ContentEncoding.IDENTITY.value
deltacat-1.1.29.dist-info/METADATA → deltacat-1.1.30.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deltacat
3
- Version: 1.1.29
3
+ Version: 1.1.30
4
4
  Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
5
5
  Home-page: https://github.com/ray-project/deltacat
6
6
  Author: Ray Team
deltacat-1.1.29.dist-info/RECORD → deltacat-1.1.30.dist-info/RECORD CHANGED
@@ -1,4 +1,4 @@
1
- deltacat/__init__.py,sha256=DoUiDxmgMh8HUGOEAG7CUY0Q9Ip-S7gePDsL8XQO5kk,1778
1
+ deltacat/__init__.py,sha256=tvf604BxhCSEXRkDh5BdZzFHPZmoSOElBRJJd34KNuo,1778
2
2
  deltacat/constants.py,sha256=TUJLXUJ9xq1Ryil72yLkKR8EDH_Irp5wUg56QstbRNE,2181
3
3
  deltacat/exceptions.py,sha256=7sjk3BuMY5Oo-6OvAfHncZx_OcvtEL47BblWr2F7waE,12740
4
4
  deltacat/logs.py,sha256=EQSDin1deehzz5xlLV1_TrFJrO_IBZ9Ahp7MdL-4cK8,9363
@@ -180,7 +180,7 @@ deltacat/tests/utils/test_cloudpickle.py,sha256=J0pnBY3-PxlUh6MamZAN1PuquKQPr2iy
180
180
  deltacat/tests/utils/test_daft.py,sha256=kY8lkXoQvyWunok8UvOsh1An297rb3jcnstTuIAyAlc,8232
181
181
  deltacat/tests/utils/test_metrics.py,sha256=Ym9nOz1EtB180pLmvugihj1sDTNDMb5opIjjr5Nmcls,16339
182
182
  deltacat/tests/utils/test_placement.py,sha256=g61wVOMkHe4YJeR9Oxg_BOVQ6bhHHbC3IBYv8YhUu94,597
183
- deltacat/tests/utils/test_pyarrow.py,sha256=fDjDkGPjdRZA3kgjgiQRym9shdeDYgkdDPYU2a7IEUk,30790
183
+ deltacat/tests/utils/test_pyarrow.py,sha256=JmhcuphXD8B2SLnOgrPgrqCcdHg_BL6IjFAiNRmuA1I,32790
184
184
  deltacat/tests/utils/test_record_batch_tables.py,sha256=AkG1WyljQmjnl-AxhbFWyo5LnMIKRyLScfgC2B_ES-s,11321
185
185
  deltacat/tests/utils/test_resources.py,sha256=HtpvDrfPZQNtGDXUlsIzc_yd7Vf1cDscZ3YbN0oTvO8,2560
186
186
  deltacat/tests/utils/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -201,7 +201,7 @@ deltacat/utils/numpy.py,sha256=SpHKKvC-K8NINTWGVfTZ5-gBFTGYqaXjjgKFhsdUjwg,2049
201
201
  deltacat/utils/pandas.py,sha256=q99mlRB7tymICMcNbfGLfLqFu_C-feyPZKZm2CWJJVc,9574
202
202
  deltacat/utils/performance.py,sha256=7ZLaMkS1ehPSIhT5uOQVBHvjC70iKHzoFquFo-KL0PI,645
203
203
  deltacat/utils/placement.py,sha256=Lj20fb-eq8rgMdm_M2MBMfDLwhDM1sS1nJj2DvIK56s,12060
204
- deltacat/utils/pyarrow.py,sha256=R3KkJPenE48rS3VrfFKSkJerX94f4e7X2dUPBQg44DY,34339
204
+ deltacat/utils/pyarrow.py,sha256=9Dggs8waJrbgP62NG4ssZsl-9fl3cJ4fjYLsJ1HjhHQ,34847
205
205
  deltacat/utils/resources.py,sha256=Ax1OgLLbZI4oYpp4Ki27OLaST-7I-AJgZwU87FVfY8g,8253
206
206
  deltacat/utils/s3fs.py,sha256=PmUJ5Fm1WmD-_zp_M6yd9VbXvIoJuBeK6ApOdJJApLE,662
207
207
  deltacat/utils/schema.py,sha256=m4Wm4ZQcpttzOUxex4dVneGlHy1_E36HspTcjNYzvVM,1564
@@ -211,8 +211,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
211
211
  deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
212
212
  deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
213
213
  deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
214
- deltacat-1.1.29.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
215
- deltacat-1.1.29.dist-info/METADATA,sha256=ZverlgFUJV4wGJao8tusRCv_sRNX4KJ4RTNAGvBCJes,1733
216
- deltacat-1.1.29.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
217
- deltacat-1.1.29.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
218
- deltacat-1.1.29.dist-info/RECORD,,
214
+ deltacat-1.1.30.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
215
+ deltacat-1.1.30.dist-info/METADATA,sha256=rlPQCyZovCT28JZm694aOiYCH8SJ9R37yq_l_Yba0vg,1733
216
+ deltacat-1.1.30.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
217
+ deltacat-1.1.30.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
218
+ deltacat-1.1.30.dist-info/RECORD,,