data-prep-toolkit 0.2.2.dev2__py3-none-any.whl → 0.2.3.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: data_prep_toolkit
3
- Version: 0.2.2.dev2
3
+ Version: 0.2.3.dev0
4
4
  Summary: Data Preparation Toolkit Library for Ray and Python
5
5
  Author-email: Maroun Touma <touma@us.ibm.com>
6
6
  License: Apache-2.0
@@ -13,6 +13,7 @@ Requires-Dist: boto3==1.34.69
13
13
  Requires-Dist: argparse
14
14
  Requires-Dist: mmh3
15
15
  Requires-Dist: psutil
16
+ Requires-Dist: polars>=1.9.0
16
17
  Provides-Extra: dev
17
18
  Requires-Dist: twine; extra == "dev"
18
19
  Requires-Dist: pytest>=7.3.2; extra == "dev"
@@ -47,7 +47,7 @@ data_processing/utils/params_utils.py,sha256=oAKY3wC8b17rDUJGqX19-rAQHDc9SQn1ksT
47
47
  data_processing/utils/pipinstaller.py,sha256=PxFNwEy8v4FqjwYgrPhH0UTrCgsJvM5WAE2fKylsk2Q,2511
48
48
  data_processing/utils/transform_configuration.json,sha256=6YBw0Hk2mokY6JBn1kR6L9AkV_yivbFrpSoHecAJp9o,4562
49
49
  data_processing/utils/transform_configurator.py,sha256=9OHSCQ8rFSoDdMW6ZCHYdNe6thRwV9zOaRPnLkWNMYE,3601
50
- data_processing/utils/transform_utils.py,sha256=KGNioN35B1i1h-MIsfm3QvXLlU1aGXimheva7NbUhMM,8496
50
+ data_processing/utils/transform_utils.py,sha256=1IEowOYQA6HOGEalqujbDVatrBaImnuY5OKmUYGaGwI,9068
51
51
  data_processing/utils/unrecoverable.py,sha256=cbF74AGK1IdRor_L1w_hPwglV_b2blP6Ad4ET79xrl0,831
52
52
  data_processing_ray/runtime/ray/__init__.py,sha256=vjQOvb_OJNq3c1F_tG3WjO-pciY77Z1lETO2Ha_GVbw,784
53
53
  data_processing_ray/runtime/ray/execution_configuration.py,sha256=C9YFixlATr7PPpkVQ0WzjCCPTWFuP80W2rnzY1bbp5I,4628
@@ -72,7 +72,7 @@ data_processing_spark/runtime/spark/transform_runtime.py,sha256=je27rTRdd-5Wtd8n
72
72
  data_processing_spark/test_support/transform/__init__.py,sha256=FQJyj7z1hXQynngMVQlCTJxTh2bdc4jN4220CBmLTqE,872
73
73
  data_processing_spark/test_support/transform/noop_folder_transform.py,sha256=z0jXCVKJYHPqB9ZTfUxnQkUVDnmfWjvss4_I3QZ8JZ4,2187
74
74
  data_processing_spark/test_support/transform/noop_transform.py,sha256=0FR3o-LnXf-UFS5gU0j-i4LVlw1mHDxGaPI40dkkIKY,1694
75
- data_prep_toolkit-0.2.2.dev2.dist-info/METADATA,sha256=XgskYjPA5pddqDgaBrPpe1IeqOpHPB2WscNM4dRh7XQ,2240
76
- data_prep_toolkit-0.2.2.dev2.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
77
- data_prep_toolkit-0.2.2.dev2.dist-info/top_level.txt,sha256=XGMDmY55_pe5KeRWvO0un9a640e2v99tzbBBtjNybPM,58
78
- data_prep_toolkit-0.2.2.dev2.dist-info/RECORD,,
75
+ data_prep_toolkit-0.2.3.dev0.dist-info/METADATA,sha256=u2UV51dNeRPKa4R0bEOI6udMGEodA8JzlCffyt-xXt8,2269
76
+ data_prep_toolkit-0.2.3.dev0.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
77
+ data_prep_toolkit-0.2.3.dev0.dist-info/top_level.txt,sha256=XGMDmY55_pe5KeRWvO0un9a640e2v99tzbBBtjNybPM,58
78
+ data_prep_toolkit-0.2.3.dev0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.3.0)
2
+ Generator: setuptools (75.6.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -11,6 +11,7 @@
11
11
  ################################################################################
12
12
 
13
13
  import hashlib
14
+ import io
14
15
  import os
15
16
  import string
16
17
  import sys
@@ -144,8 +145,21 @@ class TransformUtils:
144
145
  table = pq.read_table(reader, schema=schema)
145
146
  return table
146
147
  except Exception as e:
147
- logger.error(f"Failed to convert byte array to arrow table, exception {e}. Skipping it")
148
- return None
148
+ logger.warning(f"Could not convert bytes to pyarrow: {e}")
149
+
150
+ # We have seen this exception before when using pyarrow, but polars does not throw it.
151
+ # "Nested data conversions not implemented for chunked array outputs"
152
+ # See issue 816 https://github.com/IBM/data-prep-kit/issues/816.
153
+ logger.info(f"Attempting read of pyarrow Table using polars")
154
+ try:
155
+ import polars
156
+
157
+ df = polars.read_parquet(io.BytesIO(data))
158
+ table = df.to_arrow()
159
+ except Exception as e:
160
+ logger.error(f"Could not convert bytes to pyarrow using polars: {e}. Skipping.")
161
+ table = None
162
+ return table
149
163
 
150
164
  @staticmethod
151
165
  def convert_arrow_to_binary(table: pa.Table) -> bytes: