deltacat 1.1.27__py3-none-any.whl → 1.1.28__py3-none-any.whl
- deltacat/__init__.py +1 -1
- deltacat/tests/utils/test_pyarrow.py +255 -0
- deltacat/utils/pyarrow.py +156 -27
- {deltacat-1.1.27.dist-info → deltacat-1.1.28.dist-info}/METADATA +1 -1
- {deltacat-1.1.27.dist-info → deltacat-1.1.28.dist-info}/RECORD +8 -8
- {deltacat-1.1.27.dist-info → deltacat-1.1.28.dist-info}/WHEEL +1 -1
- {deltacat-1.1.27.dist-info → deltacat-1.1.28.dist-info}/LICENSE +0 -0
- {deltacat-1.1.27.dist-info → deltacat-1.1.28.dist-info}/top_level.txt +0 -0
deltacat/tests/utils/test_pyarrow.py
CHANGED
@@ -7,7 +7,9 @@ from deltacat.utils.pyarrow import (
     s3_file_to_table,
     ReadKwargsProviderPyArrowSchemaOverride,
     RAISE_ON_EMPTY_CSV_KWARG,
+    RAISE_ON_DECIMAL_OVERFLOW,
 )
+import decimal
 from deltacat.types.media import ContentEncoding, ContentType
 from deltacat.types.partial_download import PartialParquetParameters
 from pyarrow.parquet import ParquetFile
@@ -16,6 +18,12 @@ import pyarrow as pa
 PARQUET_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet"
 EMPTY_UTSV_PATH = "deltacat/tests/utils/data/empty.csv"
 NON_EMPTY_VALID_UTSV_PATH = "deltacat/tests/utils/data/non_empty_valid.csv"
+OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH = (
+    "deltacat/tests/utils/data/overflowing_decimal_precision.csv"
+)
+OVERFLOWING_DECIMAL_SCALE_UTSV_PATH = (
+    "deltacat/tests/utils/data/overflowing_decimal_scale.csv"
+)
 GZIP_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.gz"
 BZ2_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.bz2"
 
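Editor's note on the two new fixtures: in pyarrow, decimal128(precision, scale) stores at most `precision` total digits with `scale` of them after the decimal point, so decimal128(4, 2) tops out at 99.99. A value overflows precision when it needs more total digits than the type allows, and overflows scale when it merely carries extra fractional digits. A minimal sketch of the distinction using plain pyarrow (the fixture files' exact contents are not shown in this diff; the literals below are illustrative only):

import pyarrow as pa
import pyarrow.compute as pc

# Precision overflow: 123.45 needs 5 significant digits, but
# decimal128(4, 2) only holds 4, so the cast raises ArrowInvalid.
try:
    pc.cast(pa.array(["123.45"]), pa.decimal128(4, 2))
except pa.lib.ArrowInvalid as e:
    print(f"precision overflow: {e}")

# Scale overflow: 32.335 fits decimal128(20, 2) once rounded to 2 fractional
# digits; the raw value simply has more scale than the target type.
staged = pc.cast(pa.array(["32.335"]), pa.decimal128(20, 3))
print(pc.cast(pc.round(staged, ndigits=2), pa.decimal128(20, 2)))  # [32.34]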
@@ -407,6 +415,253 @@ class TestReadCSV(TestCase):
             ),
         )
 
+    def test_read_csv_when_decimal_precision_overflows_and_raise_kwarg_specified(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(4, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+        self.assertRaises(
+            pa.lib.ArrowInvalid,
+            lambda: pyarrow_read_csv(
+                OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH,
+                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+            ),
+        )
+
+    def test_read_csv_when_decimal_precision_overflows_sanity(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(4, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        self.assertRaises(
+            pa.lib.ArrowInvalid,
+            lambda: pyarrow_read_csv(OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH, **kwargs),
+        )
+
+    def test_read_csv_when_decimal_scale_overflows_and_raise_kwarg_specified(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        self.assertRaises(
+            pa.lib.ArrowInvalid,
+            lambda: pyarrow_read_csv(
+                OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
+                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+            ),
+        )
+
+    def test_read_csv_when_decimal_scale_overflows_sanity(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
+
+        self.assertEqual(len(result), 3)
+        self.assertEqual(
+            result[1][0].as_py(), decimal.Decimal("322236.66")
+        )  # rounding decimal
+        self.assertEqual(result[1][1].as_py(), decimal.Decimal("32.33"))  # not rounded
+        self.assertEqual(len(result.column_names), 2)
+        result_schema = result.schema
+        self.assertEqual(result_schema.field(0).type, "string")
+        self.assertEqual(result_schema.field(1).type, pa.decimal128(20, 2))
+
+    def test_read_csv_when_decimal_scale_overflows_and_negative_scale(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, -2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
+
+        self.assertEqual(len(result), 3)
+        self.assertEqual(
+            result[1][0].as_py(),
+            decimal.Decimal("322200"),  # consequence of negative scale
+        )  # rounding decimal
+        self.assertEqual(result[1][1].as_py(), decimal.Decimal("00"))
+        self.assertEqual(len(result.column_names), 2)
+        result_schema = result.schema
+        self.assertEqual(result_schema.field(0).type, "string")
+        self.assertEqual(result_schema.field(1).type, pa.decimal128(20, -2))
+
+    def test_read_csv_when_decimal_scale_overflows_with_decimal256(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal256(20, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
+
+        self.assertEqual(len(result), 3)
+        self.assertEqual(
+            result[1][0].as_py(), decimal.Decimal("322236.66")
+        )  # rounding decimal
+        self.assertEqual(result[1][1].as_py(), decimal.Decimal("32.33"))  # not rounded
+        self.assertEqual(len(result.column_names), 2)
+        result_schema = result.schema
+        self.assertEqual(result_schema.field(0).type, "string")
+        self.assertEqual(result_schema.field(1).type, pa.decimal256(20, 2))
+
+    def test_read_csv_when_decimal_scale_overflows_with_decimal256_and_raise_on_overflow(
+        self,
+    ):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal256(20, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        self.assertRaises(
+            pa.lib.ArrowNotImplementedError,
+            lambda: pyarrow_read_csv(
+                OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
+                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+            ),
+        )
+
+    def test_read_csv_when_decimal_scale_overflows_without_any_schema_then_infers(self):
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=None)
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
+
+        # The default behavior of pyarrow is to skip invalid rows
+        self.assertEqual(len(result), 2)
+        self.assertEqual(result[1][0].as_py(), 32.33)  # rounding decimal
+        self.assertEqual(result[1][1].as_py(), 0.4)  # not rounded
+        self.assertEqual(len(result.column_names), 2)
+        result_schema = result.schema
+        self.assertEqual(result_schema.field(0).type, "string")
+        self.assertEqual(result_schema.field(1).type, pa.float64())
+
+    def test_read_csv_when_decimal_scale_and_precision_overflow_and_raise_on_overflow(
+        self,
+    ):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(5, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        self.assertRaises(
+            pa.lib.ArrowInvalid,
+            lambda: pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs),
+        )
+
+    def test_read_csv_when_decimal_scale_overflow_and_file_like_obj_passed(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(15, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        with open(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, "rb") as file:
+            result = pyarrow_read_csv(file, **kwargs)
+
+            self.assertEqual(len(result), 3)
+            self.assertEqual(
+                result[1][0].as_py(), decimal.Decimal("322236.66")
+            )  # rounding decimal
+            self.assertEqual(
+                result[1][1].as_py(), decimal.Decimal("32.33")
+            )  # not rounded
+            self.assertEqual(len(result.column_names), 2)
+            result_schema = result.schema
+            self.assertEqual(result_schema.field(0).type, "string")
+            self.assertEqual(result_schema.field(1).type, pa.decimal128(15, 2))
+
 
 class TestS3FileToTable(TestCase):
     def test_s3_file_to_table_identity_sanity(self):
deltacat/utils/pyarrow.py
CHANGED
@@ -1,6 +1,7 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations
 
+import copy
 import bz2
 import gzip
 import io
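The new `import copy` supports the shallow copy of ConvertOptions made in the rounding helper further below, where only the top-level column_types attribute is reassigned. A rough illustration of why a shallow copy suffices, assuming pyarrow's ConvertOptions permits attribute reassignment (the deltacat code below relies on the same behavior):

import copy

import pyarrow as pa
from pyarrow.csv import ConvertOptions

opts = ConvertOptions(column_types=pa.schema([("a", pa.decimal128(10, 2))]))
patched = copy.copy(opts)  # cheap shallow copy
patched.column_types = pa.schema([("a", pa.string())])  # only this attribute changes
print(opts.column_types)     # original decimal schema, untouched
print(patched.column_types)  # replacement string schema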
@@ -47,6 +48,17 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 RAISE_ON_EMPTY_CSV_KWARG = "raise_on_empty_csv"
 READER_TYPE_KWARG = "reader_type"
 
+"""
+By default, round decimal values using half_to_even round mode when
+rescaling a decimal to the given scale and precision in the schema would cause
+data loss. Setting any non null value of this argument will result
+in an error instead.
+"""
+RAISE_ON_DECIMAL_OVERFLOW = "raise_on_decimal_overflow"
+# Note the maximum from https://arrow.apache.org/docs/python/generated/pyarrow.Decimal256Type.html#pyarrow.Decimal256Type
+DECIMAL256_DEFAULT_SCALE = 38
+DECIMAL256_MAX_PRECISION = 76
+
 
 def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Schema:
 
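These two constants drive the rounding fallback added below: each decimal column is first widened to a decimal256 staging type that splits the maximum precision of 76 between 38 integer and 38 fractional digits, rounded with pyarrow's default half_to_even mode, then cast back to the declared type. A minimal sketch of that staging step, assuming only pyarrow (the literal is illustrative):

import pyarrow as pa
import pyarrow.compute as pc

staging = pa.decimal256(76, 38)                 # mirrors the constants above
raw = pa.array(["322236.6650"])                 # decimals arrive as strings first
widened = pc.cast(raw, staging)                 # lossless widening
rounded = pc.round(widened, ndigits=2)          # half_to_even is the default mode
print(pc.cast(rounded, pa.decimal128(20, 2)))   # [322236.66]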
@@ -64,45 +76,162 @@ def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Schema:
     return target_schema
 
 
-def pyarrow_read_csv(*args, **kwargs) -> pa.Table:
-    … (removed: the remainder of the 1.1.27 implementation, which extracted the
-    schema and handled the empty-CSV case inline; most removed lines are
-    truncated in the source)
+def _extract_arrow_schema_from_read_csv_kwargs(kwargs: Dict[str, Any]) -> pa.Schema:
+    schema = None
+    if (
+        "convert_options" in kwargs
+        and kwargs["convert_options"].column_types is not None
+    ):
+        schema = kwargs["convert_options"].column_types
+        if not isinstance(schema, pa.Schema):
+            schema = pa.schema(schema)
+        if kwargs["convert_options"].include_columns:
+            schema = _filter_schema_for_columns(
+                schema, kwargs["convert_options"].include_columns
+            )
+        elif (
+            kwargs.get("read_options") is not None
+            and kwargs["read_options"].column_names
+        ):
+            schema = _filter_schema_for_columns(
+                schema, kwargs["read_options"].column_names
+            )
+    else:
+        logger.debug(
+            "Schema not specified in the kwargs."
+            " Hence, schema could not be inferred from the empty CSV."
         )
+
+    return schema
+
+
+def _new_schema_with_replaced_fields(
+    schema: pa.Schema, field_to_replace: Callable[[pa.Field], Optional[pa.Field]]
+) -> pa.Schema:
+    if schema is None:
+        return None
+
+    new_schema_fields = []
+    for field in schema:
+        new_field = field_to_replace(field)
+        if new_field is not None:
+            new_schema_fields.append(new_field)
+        else:
+            new_schema_fields.append(field)
+
+    return pa.schema(new_schema_fields, metadata=schema.metadata)
+
+
+def _read_csv_rounding_decimal_columns_to_fit_scale(
+    schema: pa.Schema, reader_args: List[Any], reader_kwargs: Dict[str, Any]
+) -> pa.Table:
+    # Note: We read decimals as strings first because CSV
+    # conversion to decimal256 isn't implemented as of pyarrow==12.0.1
+    new_schema = _new_schema_with_replaced_fields(
+        schema,
+        lambda fld: pa.field(fld.name, pa.string(), metadata=fld.metadata)
+        if pa.types.is_decimal128(fld.type) or pa.types.is_decimal256(fld.type)
+        else None,
+    )
+    new_kwargs = sanitize_kwargs_by_supported_kwargs(
+        ["read_options", "parse_options", "convert_options", "memory_pool"],
+        reader_kwargs,
+    )
+    # Creating a shallow copy for efficiency
+    new_convert_options = copy.copy(new_kwargs["convert_options"])
+    new_convert_options.column_types = new_schema
+    new_reader_kwargs = {**new_kwargs, "convert_options": new_convert_options}
+    arrow_table = pacsv.read_csv(*reader_args, **new_reader_kwargs)
+
+    for column_index, field in enumerate(schema):
+        if pa.types.is_decimal128(field.type) or pa.types.is_decimal256(field.type):
+            column_array = arrow_table[field.name]
+            # We always cast to decimal256 to accommodate the fixed scale of 38
+            cast_to_type = pa.decimal256(
+                DECIMAL256_MAX_PRECISION, DECIMAL256_DEFAULT_SCALE
+            )
+            casted_decimal_array = pc.cast(column_array, cast_to_type)
+            # Note that scale can be negative
+            rounded_column_array = pc.round(
+                casted_decimal_array, ndigits=field.type.scale
+            )
+            final_decimal_array = pc.cast(rounded_column_array, field.type)
+            arrow_table = arrow_table.set_column(
+                column_index,
+                field,
+                final_decimal_array,
+            )
+            logger.debug(
+                f"Rounded decimal column: {field.name} to {field.type.scale} scale and"
+                f" {field.type.precision} precision"
+            )
+
+    return arrow_table
+
+
+def pyarrow_read_csv_default(*args, **kwargs):
+    new_kwargs = sanitize_kwargs_by_supported_kwargs(
+        ["read_options", "parse_options", "convert_options", "memory_pool"], kwargs
+    )
+
+    try:
         return pacsv.read_csv(*args, **new_kwargs)
     except pa.lib.ArrowInvalid as e:
+        error_str = e.__str__()
+        schema = _extract_arrow_schema_from_read_csv_kwargs(kwargs)
+
+        if error_str == "Empty CSV file" and not kwargs.get(RAISE_ON_EMPTY_CSV_KWARG):
+            logger.debug(f"Read CSV empty schema being used: {schema}")
+            return pa.Table.from_pylist([], schema=schema)
+        if not kwargs.get(RAISE_ON_DECIMAL_OVERFLOW):
+            # Note, this logic requires expensive casting. To prevent downgrading performance
+            # for happy path reads, we are handling this case in response to an error.
+            logger.warning(
+                "Rescaling Decimal to the given scale in the schema. "
+                f"Original error: {error_str}"
+            )
+
+            if schema is not None and "convert_options" in kwargs:
+                if (
+                    "Rescaling Decimal" in error_str
+                    and "value would cause data loss" in error_str
                 ):
+                    logger.debug(f"Checking if the file: {args[0]}...")
+                    # Since we are re-reading the file, we have to seek to beginning
+                    if isinstance(args[0], io.IOBase) and args[0].seekable():
+                        logger.debug(f"Seeking to the beginning of the file {args[0]}")
+                        args[0].seek(0)
+                    return _read_csv_rounding_decimal_columns_to_fit_scale(
+                        schema=schema, reader_args=args, reader_kwargs=kwargs
                     )
             else:
                 logger.debug(
+                    "Schema is None when trying to adjust decimal values. "
+                    "Hence, bubbling up exception..."
                 )
 
-            logger.debug(f"Read CSV empty schema being used: {schema}")
-            return pa.Table.from_pylist([], schema=schema)
         raise e
 
 
+def pyarrow_read_csv(*args, **kwargs) -> pa.Table:
+    schema = _extract_arrow_schema_from_read_csv_kwargs(kwargs)
+
+    # CSV conversion to decimal256 isn't supported as of pyarrow=12.0.1
+    # Below ensures decimal256 is casted properly.
+    schema_includes_decimal256 = (
+        (True if any([pa.types.is_decimal256(x.type) for x in schema]) else False)
+        if schema is not None
+        else None
+    )
+    if schema_includes_decimal256 and not kwargs.get(RAISE_ON_DECIMAL_OVERFLOW):
+        # falling back to expensive method of reading CSV
+        return _read_csv_rounding_decimal_columns_to_fit_scale(
+            schema, reader_args=args, reader_kwargs=kwargs
+        )
+    else:
+        return pyarrow_read_csv_default(*args, **kwargs)
+
+
 CONTENT_TYPE_TO_PA_READ_FUNC: Dict[str, Callable] = {
     ContentType.UNESCAPED_TSV.value: pyarrow_read_csv,
     ContentType.TSV.value: pyarrow_read_csv,
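One detail worth calling out in `_read_csv_rounding_decimal_columns_to_fit_scale` is the schema rewrite: decimal fields are temporarily demoted to strings so `pacsv.read_csv` never attempts a decimal conversion that could fail, and only afterwards does the cast-round-cast pass rebuild each column. A standalone sketch of that rewrite, assuming only pyarrow:

import pyarrow as pa

schema = pa.schema(
    [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, 2))]
)

# Demote decimal fields to strings, preserving field names, order, and metadata,
# mirroring the lambda passed to _new_schema_with_replaced_fields above.
string_safe = pa.schema(
    [
        pa.field(f.name, pa.string(), metadata=f.metadata)
        if pa.types.is_decimal128(f.type) or pa.types.is_decimal256(f.type)
        else f
        for f in schema
    ],
    metadata=schema.metadata,
)
print(string_safe)  # is_active: string, decimal_value: string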
{deltacat-1.1.27.dist-info → deltacat-1.1.28.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-deltacat/__init__.py,sha256=…
+deltacat/__init__.py,sha256=GPlTQc6AW4ig_nZJ7kMVe-kbZxYfrSVGFN1YEqY8dXU,1778
 deltacat/constants.py,sha256=TUJLXUJ9xq1Ryil72yLkKR8EDH_Irp5wUg56QstbRNE,2181
 deltacat/exceptions.py,sha256=7sjk3BuMY5Oo-6OvAfHncZx_OcvtEL47BblWr2F7waE,12740
 deltacat/logs.py,sha256=EQSDin1deehzz5xlLV1_TrFJrO_IBZ9Ahp7MdL-4cK8,9363
@@ -179,7 +179,7 @@ deltacat/tests/utils/test_cloudpickle.py,sha256=J0pnBY3-PxlUh6MamZAN1PuquKQPr2iy…
 deltacat/tests/utils/test_daft.py,sha256=kY8lkXoQvyWunok8UvOsh1An297rb3jcnstTuIAyAlc,8232
 deltacat/tests/utils/test_metrics.py,sha256=Ym9nOz1EtB180pLmvugihj1sDTNDMb5opIjjr5Nmcls,16339
 deltacat/tests/utils/test_placement.py,sha256=g61wVOMkHe4YJeR9Oxg_BOVQ6bhHHbC3IBYv8YhUu94,597
-deltacat/tests/utils/test_pyarrow.py,sha256=…
+deltacat/tests/utils/test_pyarrow.py,sha256=YDuyFYNjy6thzfA6Z2a0dOytUugsExu1uMUOhmP_aXc,29977
 deltacat/tests/utils/test_record_batch_tables.py,sha256=AkG1WyljQmjnl-AxhbFWyo5LnMIKRyLScfgC2B_ES-s,11321
 deltacat/tests/utils/test_resources.py,sha256=HtpvDrfPZQNtGDXUlsIzc_yd7Vf1cDscZ3YbN0oTvO8,2560
 deltacat/tests/utils/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -200,7 +200,7 @@ deltacat/utils/numpy.py,sha256=SpHKKvC-K8NINTWGVfTZ5-gBFTGYqaXjjgKFhsdUjwg,2049
 deltacat/utils/pandas.py,sha256=q99mlRB7tymICMcNbfGLfLqFu_C-feyPZKZm2CWJJVc,9574
 deltacat/utils/performance.py,sha256=7ZLaMkS1ehPSIhT5uOQVBHvjC70iKHzoFquFo-KL0PI,645
 deltacat/utils/placement.py,sha256=Lj20fb-eq8rgMdm_M2MBMfDLwhDM1sS1nJj2DvIK56s,12060
-deltacat/utils/pyarrow.py,sha256=…
+deltacat/utils/pyarrow.py,sha256=xEZRzbTBU6uj9K4DtvngIPtQkTA8haVgQ4Y4vjwHvtM,34311
 deltacat/utils/resources.py,sha256=Ax1OgLLbZI4oYpp4Ki27OLaST-7I-AJgZwU87FVfY8g,8253
 deltacat/utils/s3fs.py,sha256=PmUJ5Fm1WmD-_zp_M6yd9VbXvIoJuBeK6ApOdJJApLE,662
 deltacat/utils/schema.py,sha256=m4Wm4ZQcpttzOUxex4dVneGlHy1_E36HspTcjNYzvVM,1564
@@ -210,8 +210,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD…
 deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
 deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
 deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
-deltacat-1.1.27.dist-info/LICENSE,sha256=…
-deltacat-1.1.27.dist-info/METADATA,sha256=…
-deltacat-1.1.27.dist-info/WHEEL,sha256=…
-deltacat-1.1.27.dist-info/top_level.txt,sha256=…
-deltacat-1.1.27.dist-info/RECORD,,
+deltacat-1.1.28.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+deltacat-1.1.28.dist-info/METADATA,sha256=-ZnMp9C26vVxi015Il3UtBMm9pHqA8aZmTnNTgr8Tb8,1733
+deltacat-1.1.28.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+deltacat-1.1.28.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
+deltacat-1.1.28.dist-info/RECORD,,
{deltacat-1.1.27.dist-info → deltacat-1.1.28.dist-info}/LICENSE
File without changes
{deltacat-1.1.27.dist-info → deltacat-1.1.28.dist-info}/top_level.txt
File without changes