deltacat 1.1.27__py3-none-any.whl → 1.1.28__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deltacat/__init__.py CHANGED
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode
 
 deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
 
-__version__ = "1.1.27"
+__version__ = "1.1.28"
 
 
 __all__ = [
deltacat/tests/utils/test_pyarrow.py CHANGED
@@ -7,7 +7,9 @@ from deltacat.utils.pyarrow import (
     s3_file_to_table,
     ReadKwargsProviderPyArrowSchemaOverride,
     RAISE_ON_EMPTY_CSV_KWARG,
+    RAISE_ON_DECIMAL_OVERFLOW,
 )
+import decimal
 from deltacat.types.media import ContentEncoding, ContentType
 from deltacat.types.partial_download import PartialParquetParameters
 from pyarrow.parquet import ParquetFile
@@ -16,6 +18,12 @@ import pyarrow as pa
 PARQUET_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet"
 EMPTY_UTSV_PATH = "deltacat/tests/utils/data/empty.csv"
 NON_EMPTY_VALID_UTSV_PATH = "deltacat/tests/utils/data/non_empty_valid.csv"
+OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH = (
+    "deltacat/tests/utils/data/overflowing_decimal_precision.csv"
+)
+OVERFLOWING_DECIMAL_SCALE_UTSV_PATH = (
+    "deltacat/tests/utils/data/overflowing_decimal_scale.csv"
+)
 GZIP_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.gz"
 BZ2_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.bz2"
 
@@ -407,6 +415,253 @@ class TestReadCSV(TestCase):
             ),
         )
 
+    def test_read_csv_when_decimal_precision_overflows_and_raise_kwarg_specified(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(4, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+        self.assertRaises(
+            pa.lib.ArrowInvalid,
+            lambda: pyarrow_read_csv(
+                OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH,
+                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+            ),
+        )
+
+    def test_read_csv_when_decimal_precision_overflows_sanity(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(4, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        self.assertRaises(
+            pa.lib.ArrowInvalid,
+            lambda: pyarrow_read_csv(OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH, **kwargs),
+        )
+
+    def test_read_csv_when_decimal_scale_overflows_and_raise_kwarg_specified(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        self.assertRaises(
+            pa.lib.ArrowInvalid,
+            lambda: pyarrow_read_csv(
+                OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
+                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+            ),
+        )
+
+    def test_read_csv_when_decimal_scale_overflows_sanity(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
+
+        self.assertEqual(len(result), 3)
+        self.assertEqual(
+            result[1][0].as_py(), decimal.Decimal("322236.66")
+        )  # rounded decimal
+        self.assertEqual(result[1][1].as_py(), decimal.Decimal("32.33"))  # not rounded
+        self.assertEqual(len(result.column_names), 2)
+        result_schema = result.schema
+        self.assertEqual(result_schema.field(0).type, "string")
+        self.assertEqual(result_schema.field(1).type, pa.decimal128(20, 2))
+
+    def test_read_csv_when_decimal_scale_overflows_and_negative_scale(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, -2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
+
+        self.assertEqual(len(result), 3)
+        self.assertEqual(
+            result[1][0].as_py(),
+            decimal.Decimal("322200"),  # consequence of negative scale
+        )  # rounded decimal
+        self.assertEqual(result[1][1].as_py(), decimal.Decimal("00"))
+        self.assertEqual(len(result.column_names), 2)
+        result_schema = result.schema
+        self.assertEqual(result_schema.field(0).type, "string")
+        self.assertEqual(result_schema.field(1).type, pa.decimal128(20, -2))
+
+    def test_read_csv_when_decimal_scale_overflows_with_decimal256(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal256(20, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
+
+        self.assertEqual(len(result), 3)
+        self.assertEqual(
+            result[1][0].as_py(), decimal.Decimal("322236.66")
+        )  # rounded decimal
+        self.assertEqual(result[1][1].as_py(), decimal.Decimal("32.33"))  # not rounded
+        self.assertEqual(len(result.column_names), 2)
+        result_schema = result.schema
+        self.assertEqual(result_schema.field(0).type, "string")
+        self.assertEqual(result_schema.field(1).type, pa.decimal256(20, 2))
+
+    def test_read_csv_when_decimal_scale_overflows_with_decimal256_and_raise_on_overflow(
+        self,
+    ):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal256(20, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        self.assertRaises(
+            pa.lib.ArrowNotImplementedError,
+            lambda: pyarrow_read_csv(
+                OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
+                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+            ),
+        )
+
+    def test_read_csv_when_decimal_scale_overflows_without_any_schema_then_infers(self):
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=None)
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
+
+        # The default behavior of pyarrow is to skip invalid rows
+        self.assertEqual(len(result), 2)
+        self.assertEqual(result[1][0].as_py(), 32.33)  # rounded decimal
+        self.assertEqual(result[1][1].as_py(), 0.4)  # not rounded
+        self.assertEqual(len(result.column_names), 2)
+        result_schema = result.schema
+        self.assertEqual(result_schema.field(0).type, "string")
+        self.assertEqual(result_schema.field(1).type, pa.float64())
+
+    def test_read_csv_when_decimal_scale_and_precision_overflow_and_raise_on_overflow(
+        self,
+    ):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(5, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        self.assertRaises(
+            pa.lib.ArrowInvalid,
+            lambda: pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs),
+        )
+
+    def test_read_csv_when_decimal_scale_overflow_and_file_like_obj_passed(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(15, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        with open(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, "rb") as file:
+            result = pyarrow_read_csv(file, **kwargs)
+
+        self.assertEqual(len(result), 3)
+        self.assertEqual(
+            result[1][0].as_py(), decimal.Decimal("322236.66")
+        )  # rounded decimal
+        self.assertEqual(
+            result[1][1].as_py(), decimal.Decimal("32.33")
+        )  # not rounded
+        self.assertEqual(len(result.column_names), 2)
+        result_schema = result.schema
+        self.assertEqual(result_schema.field(0).type, "string")
+        self.assertEqual(result_schema.field(1).type, pa.decimal128(15, 2))
+
 
 class TestS3FileToTable(TestCase):
     def test_s3_file_to_table_identity_sanity(self):
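Editor's note: the tests above read overflow fixture files shipped in the repository. As a minimal, self-contained sketch of the same before/after behavior, the snippet below feeds a hypothetical in-memory TSV through the same helpers the tests use (content_type_to_reader_kwargs, _add_column_kwargs, ReadKwargsProviderPyArrowSchemaOverride), which the surrounding code suggests are importable from deltacat.utils.pyarrow:

import decimal
import io

import pyarrow as pa

from deltacat.types.media import ContentType
from deltacat.utils.pyarrow import (
    RAISE_ON_DECIMAL_OVERFLOW,
    ReadKwargsProviderPyArrowSchemaOverride,
    _add_column_kwargs,
    content_type_to_reader_kwargs,
    pyarrow_read_csv,
)

# "322236.667" carries scale 3; decimal128(20, 2) only holds scale 2.
data = io.BytesIO(b"True\t322236.667\n")
schema = pa.schema(
    [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, 2))]
)
kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
_add_column_kwargs(
    ContentType.UNESCAPED_TSV.value,
    ["is_active", "decimal_value"],
    ["is_active", "decimal_value"],
    kwargs,
)
kwargs = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)(
    ContentType.UNESCAPED_TSV.value, kwargs
)

# Default: the overflowing value is rounded (half_to_even) to fit the scale.
table = pyarrow_read_csv(data, **kwargs)
assert table["decimal_value"][0].as_py() == decimal.Decimal("322236.67")

# Opt-in strictness: the same read raises instead of rounding.
data.seek(0)
try:
    pyarrow_read_csv(data, **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True})
except pa.lib.ArrowInvalid as e:
    print(f"raised as requested: {e}")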
deltacat/utils/pyarrow.py CHANGED
@@ -1,6 +1,7 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations
 
+import copy
 import bz2
 import gzip
 import io
@@ -47,6 +48,17 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 RAISE_ON_EMPTY_CSV_KWARG = "raise_on_empty_csv"
 READER_TYPE_KWARG = "reader_type"
 
+"""
+By default, decimal values are rounded using the half_to_even rounding mode
+when rescaling a decimal to the scale and precision given in the schema would
+cause data loss. Setting any non-null value for this argument raises an error
+instead.
+"""
+RAISE_ON_DECIMAL_OVERFLOW = "raise_on_decimal_overflow"
+# Note the maximum from https://arrow.apache.org/docs/python/generated/pyarrow.Decimal256Type.html#pyarrow.Decimal256Type
+DECIMAL256_DEFAULT_SCALE = 38
+DECIMAL256_MAX_PRECISION = 76
+
 
 def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Schema:
 
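Editor's note: the two new constants record the widest decimal shape Arrow supports (76 digits of precision, rescaled here through scale 38). A quick sketch in plain pyarrow of the widen-round-narrow maneuver the new reader code performs, with an illustrative value:

import pyarrow as pa
import pyarrow.compute as pc

# Parse an overflowing value as text, widen it, round it, then narrow it.
arr = pa.array(["322236.667"])  # scale 3 won't fit decimal128(20, 2)
wide = pc.cast(arr, pa.decimal256(76, 38))
rounded = pc.round(wide, ndigits=2)  # pc.round defaults to half_to_even
print(pc.cast(rounded, pa.decimal128(20, 2))[0])  # -> 322236.67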
@@ -64,45 +76,162 @@ def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Sche
     return target_schema
 
 
-def pyarrow_read_csv(*args, **kwargs) -> pa.Table:
-    try:
-        new_kwargs = sanitize_kwargs_by_supported_kwargs(
-            ["read_options", "parse_options", "convert_options", "memory_pool"], kwargs
+def _extract_arrow_schema_from_read_csv_kwargs(kwargs: Dict[str, Any]) -> pa.Schema:
+    schema = None
+    if (
+        "convert_options" in kwargs
+        and kwargs["convert_options"].column_types is not None
+    ):
+        schema = kwargs["convert_options"].column_types
+        if not isinstance(schema, pa.Schema):
+            schema = pa.schema(schema)
+        if kwargs["convert_options"].include_columns:
+            schema = _filter_schema_for_columns(
+                schema, kwargs["convert_options"].include_columns
+            )
+        elif (
+            kwargs.get("read_options") is not None
+            and kwargs["read_options"].column_names
+        ):
+            schema = _filter_schema_for_columns(
+                schema, kwargs["read_options"].column_names
+            )
+    else:
+        logger.debug(
+            "Schema not specified in the kwargs."
+            " Hence, schema could not be inferred from the empty CSV."
         )
+
+    return schema
+
+
+def _new_schema_with_replaced_fields(
+    schema: pa.Schema, field_to_replace: Callable[[pa.Field], Optional[pa.Field]]
+) -> pa.Schema:
+    if schema is None:
+        return None
+
+    new_schema_fields = []
+    for field in schema:
+        new_field = field_to_replace(field)
+        if new_field is not None:
+            new_schema_fields.append(new_field)
+        else:
+            new_schema_fields.append(field)
+
+    return pa.schema(new_schema_fields, metadata=schema.metadata)
+
+
+def _read_csv_rounding_decimal_columns_to_fit_scale(
+    schema: pa.Schema, reader_args: List[Any], reader_kwargs: Dict[str, Any]
+) -> pa.Table:
+    # Note: We read decimals as strings first because CSV
+    # conversion to decimal256 isn't implemented as of pyarrow==12.0.1
+    new_schema = _new_schema_with_replaced_fields(
+        schema,
+        lambda fld: pa.field(fld.name, pa.string(), metadata=fld.metadata)
+        if pa.types.is_decimal128(fld.type) or pa.types.is_decimal256(fld.type)
+        else None,
+    )
+    new_kwargs = sanitize_kwargs_by_supported_kwargs(
+        ["read_options", "parse_options", "convert_options", "memory_pool"],
+        reader_kwargs,
+    )
+    # Creating a shallow copy for efficiency
+    new_convert_options = copy.copy(new_kwargs["convert_options"])
+    new_convert_options.column_types = new_schema
+    new_reader_kwargs = {**new_kwargs, "convert_options": new_convert_options}
+    arrow_table = pacsv.read_csv(*reader_args, **new_reader_kwargs)
+
+    for column_index, field in enumerate(schema):
+        if pa.types.is_decimal128(field.type) or pa.types.is_decimal256(field.type):
+            column_array = arrow_table[field.name]
+            # We always cast to decimal256 to accommodate the fixed scale of 38
+            cast_to_type = pa.decimal256(
+                DECIMAL256_MAX_PRECISION, DECIMAL256_DEFAULT_SCALE
+            )
+            casted_decimal_array = pc.cast(column_array, cast_to_type)
+            # Note that scale can be negative
+            rounded_column_array = pc.round(
+                casted_decimal_array, ndigits=field.type.scale
+            )
+            final_decimal_array = pc.cast(rounded_column_array, field.type)
+            arrow_table = arrow_table.set_column(
+                column_index,
+                field,
+                final_decimal_array,
+            )
+            logger.debug(
+                f"Rounded decimal column: {field.name} to {field.type.scale} scale and"
+                f" {field.type.precision} precision"
+            )
+
+    return arrow_table
+
+
+def pyarrow_read_csv_default(*args, **kwargs):
+    new_kwargs = sanitize_kwargs_by_supported_kwargs(
+        ["read_options", "parse_options", "convert_options", "memory_pool"], kwargs
+    )
+
+    try:
         return pacsv.read_csv(*args, **new_kwargs)
     except pa.lib.ArrowInvalid as e:
-        if e.__str__() == "Empty CSV file" and not kwargs.get(RAISE_ON_EMPTY_CSV_KWARG):
-            schema = None
-            if (
-                "convert_options" in kwargs
-                and kwargs["convert_options"].column_types is not None
-            ):
-                schema = kwargs["convert_options"].column_types
-                if not isinstance(schema, pa.Schema):
-                    schema = pa.schema(schema)
-                if kwargs["convert_options"].include_columns:
-                    schema = _filter_schema_for_columns(
-                        schema, kwargs["convert_options"].include_columns
-                    )
-                elif (
-                    kwargs.get("read_options") is not None
-                    and kwargs["read_options"].column_names
+        error_str = e.__str__()
+        schema = _extract_arrow_schema_from_read_csv_kwargs(kwargs)
+
+        if error_str == "Empty CSV file" and not kwargs.get(RAISE_ON_EMPTY_CSV_KWARG):
+            logger.debug(f"Read CSV empty schema being used: {schema}")
+            return pa.Table.from_pylist([], schema=schema)
+        if not kwargs.get(RAISE_ON_DECIMAL_OVERFLOW):
+            # Note, this logic requires expensive casting. To prevent downgrading performance
+            # for happy path reads, we are handling this case in response to an error.
+            logger.warning(
+                "Rescaling Decimal to the given scale in the schema. "
+                f"Original error: {error_str}"
+            )
+
+            if schema is not None and "convert_options" in kwargs:
+                if (
+                    "Rescaling Decimal" in error_str
+                    and "value would cause data loss" in error_str
                 ):
-                    schema = _filter_schema_for_columns(
-                        schema, kwargs["read_options"].column_names
+                    logger.debug(f"Checking if the file: {args[0]}...")
+                    # Since we are re-reading the file, we have to seek to beginning
+                    if isinstance(args[0], io.IOBase) and args[0].seekable():
+                        logger.debug(f"Seeking to the beginning of the file {args[0]}")
+                        args[0].seek(0)
+                    return _read_csv_rounding_decimal_columns_to_fit_scale(
+                        schema=schema, reader_args=args, reader_kwargs=kwargs
                     )
-
             else:
                 logger.debug(
-                    "Schema not specified in the kwargs."
-                    " Hence, schema could not be inferred from the empty CSV."
+                    "Schema is None when trying to adjust decimal values. "
+                    "Hence, bubbling up exception..."
                 )
 
-            logger.debug(f"Read CSV empty schema being used: {schema}")
-            return pa.Table.from_pylist([], schema=schema)
         raise e
 
 
+def pyarrow_read_csv(*args, **kwargs) -> pa.Table:
+    schema = _extract_arrow_schema_from_read_csv_kwargs(kwargs)
+
+    # CSV conversion to decimal256 isn't supported as of pyarrow==12.0.1
+    # Below ensures decimal256 is cast properly.
+    schema_includes_decimal256 = (
+        (True if any([pa.types.is_decimal256(x.type) for x in schema]) else False)
+        if schema is not None
+        else None
+    )
+    if schema_includes_decimal256 and not kwargs.get(RAISE_ON_DECIMAL_OVERFLOW):
+        # falling back to expensive method of reading CSV
+        return _read_csv_rounding_decimal_columns_to_fit_scale(
+            schema, reader_args=args, reader_kwargs=kwargs
+        )
+    else:
+        return pyarrow_read_csv_default(*args, **kwargs)
+
+
 CONTENT_TYPE_TO_PA_READ_FUNC: Dict[str, Callable] = {
     ContentType.UNESCAPED_TSV.value: pyarrow_read_csv,
     ContentType.TSV.value: pyarrow_read_csv,
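Editor's note: the wrapper checks for decimal256 up front because, per the comments above, the CSV reader cannot target decimal256 at all in this pyarrow generation; the error-driven fallback in pyarrow_read_csv_default would therefore see ArrowNotImplementedError rather than the "Rescaling Decimal" ArrowInvalid it knows how to recover from. A small sketch of that underlying limitation in plain pyarrow (assuming pyarrow==12.0.1 as stated above; newer releases may lift it):

import io

import pyarrow as pa
from pyarrow import csv as pacsv

data = io.BytesIO(b"decimal_value\n322236.667\n")
opts = pacsv.ConvertOptions(column_types={"decimal_value": pa.decimal256(20, 2)})
try:
    pacsv.read_csv(data, convert_options=opts)
except pa.lib.ArrowNotImplementedError as e:
    # Matches what the new decimal256 test expects to bubble up.
    print(f"CSV -> decimal256 unsupported: {e}")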
deltacat-1.1.28.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deltacat
-Version: 1.1.27
+Version: 1.1.28
 Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
 Home-page: https://github.com/ray-project/deltacat
 Author: Ray Team
deltacat-1.1.28.dist-info/RECORD CHANGED
@@ -1,4 +1,4 @@
-deltacat/__init__.py,sha256=NNgt1N6a4dwztCKl6C7klF3mQEn-S-sBHNZPKPqRHko,1778
+deltacat/__init__.py,sha256=GPlTQc6AW4ig_nZJ7kMVe-kbZxYfrSVGFN1YEqY8dXU,1778
 deltacat/constants.py,sha256=TUJLXUJ9xq1Ryil72yLkKR8EDH_Irp5wUg56QstbRNE,2181
 deltacat/exceptions.py,sha256=7sjk3BuMY5Oo-6OvAfHncZx_OcvtEL47BblWr2F7waE,12740
 deltacat/logs.py,sha256=EQSDin1deehzz5xlLV1_TrFJrO_IBZ9Ahp7MdL-4cK8,9363
@@ -179,7 +179,7 @@ deltacat/tests/utils/test_cloudpickle.py,sha256=J0pnBY3-PxlUh6MamZAN1PuquKQPr2iy
 deltacat/tests/utils/test_daft.py,sha256=kY8lkXoQvyWunok8UvOsh1An297rb3jcnstTuIAyAlc,8232
 deltacat/tests/utils/test_metrics.py,sha256=Ym9nOz1EtB180pLmvugihj1sDTNDMb5opIjjr5Nmcls,16339
 deltacat/tests/utils/test_placement.py,sha256=g61wVOMkHe4YJeR9Oxg_BOVQ6bhHHbC3IBYv8YhUu94,597
-deltacat/tests/utils/test_pyarrow.py,sha256=AWx0DTsBA1PkQag48w_HeQdz7tlBzJsm9v7Nd6-dhEY,19607
+deltacat/tests/utils/test_pyarrow.py,sha256=YDuyFYNjy6thzfA6Z2a0dOytUugsExu1uMUOhmP_aXc,29977
 deltacat/tests/utils/test_record_batch_tables.py,sha256=AkG1WyljQmjnl-AxhbFWyo5LnMIKRyLScfgC2B_ES-s,11321
 deltacat/tests/utils/test_resources.py,sha256=HtpvDrfPZQNtGDXUlsIzc_yd7Vf1cDscZ3YbN0oTvO8,2560
 deltacat/tests/utils/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -200,7 +200,7 @@ deltacat/utils/numpy.py,sha256=SpHKKvC-K8NINTWGVfTZ5-gBFTGYqaXjjgKFhsdUjwg,2049
 deltacat/utils/pandas.py,sha256=q99mlRB7tymICMcNbfGLfLqFu_C-feyPZKZm2CWJJVc,9574
 deltacat/utils/performance.py,sha256=7ZLaMkS1ehPSIhT5uOQVBHvjC70iKHzoFquFo-KL0PI,645
 deltacat/utils/placement.py,sha256=Lj20fb-eq8rgMdm_M2MBMfDLwhDM1sS1nJj2DvIK56s,12060
-deltacat/utils/pyarrow.py,sha256=nW_eD6fWAlbyHUzPj1rOOfnUbpP3RnAgNSuuVNyvhZ4,29174
+deltacat/utils/pyarrow.py,sha256=xEZRzbTBU6uj9K4DtvngIPtQkTA8haVgQ4Y4vjwHvtM,34311
 deltacat/utils/resources.py,sha256=Ax1OgLLbZI4oYpp4Ki27OLaST-7I-AJgZwU87FVfY8g,8253
 deltacat/utils/s3fs.py,sha256=PmUJ5Fm1WmD-_zp_M6yd9VbXvIoJuBeK6ApOdJJApLE,662
 deltacat/utils/schema.py,sha256=m4Wm4ZQcpttzOUxex4dVneGlHy1_E36HspTcjNYzvVM,1564
@@ -210,8 +210,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
 deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
 deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
 deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
-deltacat-1.1.27.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-deltacat-1.1.27.dist-info/METADATA,sha256=VL7sWG3lO3cV3tzwTiCTgpm7h0K5Dh3GtKiqojgSgHI,1733
-deltacat-1.1.27.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
-deltacat-1.1.27.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
-deltacat-1.1.27.dist-info/RECORD,,
+deltacat-1.1.28.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+deltacat-1.1.28.dist-info/METADATA,sha256=-ZnMp9C26vVxi015Il3UtBMm9pHqA8aZmTnNTgr8Tb8,1733
+deltacat-1.1.28.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+deltacat-1.1.28.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
+deltacat-1.1.28.dist-info/RECORD,,
deltacat-1.1.28.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.44.0)
+Generator: bdist_wheel (0.45.1)
 Root-Is-Purelib: true
 Tag: py3-none-any
 