deltacat 1.1.27__py3-none-any.whl → 1.1.28__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/tests/utils/test_pyarrow.py +255 -0
- deltacat/utils/pyarrow.py +156 -27
- {deltacat-1.1.27.dist-info → deltacat-1.1.28.dist-info}/METADATA +1 -1
- {deltacat-1.1.27.dist-info → deltacat-1.1.28.dist-info}/RECORD +8 -8
- {deltacat-1.1.27.dist-info → deltacat-1.1.28.dist-info}/WHEEL +1 -1
- {deltacat-1.1.27.dist-info → deltacat-1.1.28.dist-info}/LICENSE +0 -0
- {deltacat-1.1.27.dist-info → deltacat-1.1.28.dist-info}/top_level.txt +0 -0
deltacat/__init__.py
CHANGED
@@ -7,7 +7,9 @@ from deltacat.utils.pyarrow import (
|
|
7
7
|
s3_file_to_table,
|
8
8
|
ReadKwargsProviderPyArrowSchemaOverride,
|
9
9
|
RAISE_ON_EMPTY_CSV_KWARG,
|
10
|
+
RAISE_ON_DECIMAL_OVERFLOW,
|
10
11
|
)
|
12
|
+
import decimal
|
11
13
|
from deltacat.types.media import ContentEncoding, ContentType
|
12
14
|
from deltacat.types.partial_download import PartialParquetParameters
|
13
15
|
from pyarrow.parquet import ParquetFile
|
@@ -16,6 +18,12 @@ import pyarrow as pa
|
|
16
18
|
PARQUET_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet"
|
17
19
|
EMPTY_UTSV_PATH = "deltacat/tests/utils/data/empty.csv"
|
18
20
|
NON_EMPTY_VALID_UTSV_PATH = "deltacat/tests/utils/data/non_empty_valid.csv"
|
21
|
+
OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH = (
|
22
|
+
"deltacat/tests/utils/data/overflowing_decimal_precision.csv"
|
23
|
+
)
|
24
|
+
OVERFLOWING_DECIMAL_SCALE_UTSV_PATH = (
|
25
|
+
"deltacat/tests/utils/data/overflowing_decimal_scale.csv"
|
26
|
+
)
|
19
27
|
GZIP_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.gz"
|
20
28
|
BZ2_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.bz2"
|
21
29
|
|
@@ -407,6 +415,253 @@ class TestReadCSV(TestCase):
|
|
407
415
|
),
|
408
416
|
)
|
409
417
|
|
418
|
+
def test_read_csv_when_decimal_precision_overflows_and_raise_kwarg_specified(self):
|
419
|
+
schema = pa.schema(
|
420
|
+
[("is_active", pa.string()), ("decimal_value", pa.decimal128(4, 2))]
|
421
|
+
)
|
422
|
+
kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
|
423
|
+
_add_column_kwargs(
|
424
|
+
ContentType.UNESCAPED_TSV.value,
|
425
|
+
["is_active", "decimal_value"],
|
426
|
+
["is_active", "decimal_value"],
|
427
|
+
kwargs,
|
428
|
+
)
|
429
|
+
read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
|
430
|
+
|
431
|
+
kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
|
432
|
+
self.assertRaises(
|
433
|
+
pa.lib.ArrowInvalid,
|
434
|
+
lambda: pyarrow_read_csv(
|
435
|
+
OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH,
|
436
|
+
**{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
|
437
|
+
),
|
438
|
+
)
|
439
|
+
|
440
|
+
def test_read_csv_when_decimal_precision_overflows_sanity(self):
|
441
|
+
schema = pa.schema(
|
442
|
+
[("is_active", pa.string()), ("decimal_value", pa.decimal128(4, 2))]
|
443
|
+
)
|
444
|
+
kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
|
445
|
+
_add_column_kwargs(
|
446
|
+
ContentType.UNESCAPED_TSV.value,
|
447
|
+
["is_active", "decimal_value"],
|
448
|
+
["is_active", "decimal_value"],
|
449
|
+
kwargs,
|
450
|
+
)
|
451
|
+
|
452
|
+
read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
|
453
|
+
|
454
|
+
kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
|
455
|
+
|
456
|
+
self.assertRaises(
|
457
|
+
pa.lib.ArrowInvalid,
|
458
|
+
lambda: pyarrow_read_csv(OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH, **kwargs),
|
459
|
+
)
|
460
|
+
|
461
|
+
def test_read_csv_when_decimal_scale_overflows_and_raise_kwarg_specified(self):
|
462
|
+
schema = pa.schema(
|
463
|
+
[("is_active", pa.string()), ("decimal_value", pa.decimal128(20, 2))]
|
464
|
+
)
|
465
|
+
kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
|
466
|
+
_add_column_kwargs(
|
467
|
+
ContentType.UNESCAPED_TSV.value,
|
468
|
+
["is_active", "decimal_value"],
|
469
|
+
["is_active", "decimal_value"],
|
470
|
+
kwargs,
|
471
|
+
)
|
472
|
+
read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
|
473
|
+
|
474
|
+
kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
|
475
|
+
|
476
|
+
self.assertRaises(
|
477
|
+
pa.lib.ArrowInvalid,
|
478
|
+
lambda: pyarrow_read_csv(
|
479
|
+
OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
|
480
|
+
**{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
|
481
|
+
),
|
482
|
+
)
|
483
|
+
|
484
|
+
def test_read_csv_when_decimal_scale_overflows_sanity(self):
|
485
|
+
schema = pa.schema(
|
486
|
+
[("is_active", pa.string()), ("decimal_value", pa.decimal128(20, 2))]
|
487
|
+
)
|
488
|
+
kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
|
489
|
+
_add_column_kwargs(
|
490
|
+
ContentType.UNESCAPED_TSV.value,
|
491
|
+
["is_active", "decimal_value"],
|
492
|
+
["is_active", "decimal_value"],
|
493
|
+
kwargs,
|
494
|
+
)
|
495
|
+
|
496
|
+
read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
|
497
|
+
|
498
|
+
kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
|
499
|
+
|
500
|
+
result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
|
501
|
+
|
502
|
+
self.assertEqual(len(result), 3)
|
503
|
+
self.assertEqual(
|
504
|
+
result[1][0].as_py(), decimal.Decimal("322236.66")
|
505
|
+
) # rounding decimal
|
506
|
+
self.assertEqual(result[1][1].as_py(), decimal.Decimal("32.33")) # not rounded
|
507
|
+
self.assertEqual(len(result.column_names), 2)
|
508
|
+
result_schema = result.schema
|
509
|
+
self.assertEqual(result_schema.field(0).type, "string")
|
510
|
+
self.assertEqual(result_schema.field(1).type, pa.decimal128(20, 2))
|
511
|
+
|
512
|
+
def test_read_csv_when_decimal_scale_overflows_and_negative_scale(self):
|
513
|
+
schema = pa.schema(
|
514
|
+
[("is_active", pa.string()), ("decimal_value", pa.decimal128(20, -2))]
|
515
|
+
)
|
516
|
+
kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
|
517
|
+
_add_column_kwargs(
|
518
|
+
ContentType.UNESCAPED_TSV.value,
|
519
|
+
["is_active", "decimal_value"],
|
520
|
+
["is_active", "decimal_value"],
|
521
|
+
kwargs,
|
522
|
+
)
|
523
|
+
|
524
|
+
read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
|
525
|
+
|
526
|
+
kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
|
527
|
+
|
528
|
+
result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
|
529
|
+
|
530
|
+
self.assertEqual(len(result), 3)
|
531
|
+
self.assertEqual(
|
532
|
+
result[1][0].as_py(),
|
533
|
+
decimal.Decimal("322200"), # consequence of negative scale
|
534
|
+
) # rounding decimal
|
535
|
+
self.assertEqual(result[1][1].as_py(), decimal.Decimal("00"))
|
536
|
+
self.assertEqual(len(result.column_names), 2)
|
537
|
+
result_schema = result.schema
|
538
|
+
self.assertEqual(result_schema.field(0).type, "string")
|
539
|
+
self.assertEqual(result_schema.field(1).type, pa.decimal128(20, -2))
|
540
|
+
|
541
|
+
def test_read_csv_when_decimal_scale_overflows_with_decimal256(self):
|
542
|
+
schema = pa.schema(
|
543
|
+
[("is_active", pa.string()), ("decimal_value", pa.decimal256(20, 2))]
|
544
|
+
)
|
545
|
+
kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
|
546
|
+
_add_column_kwargs(
|
547
|
+
ContentType.UNESCAPED_TSV.value,
|
548
|
+
["is_active", "decimal_value"],
|
549
|
+
["is_active", "decimal_value"],
|
550
|
+
kwargs,
|
551
|
+
)
|
552
|
+
|
553
|
+
read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
|
554
|
+
|
555
|
+
kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
|
556
|
+
|
557
|
+
result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
|
558
|
+
|
559
|
+
self.assertEqual(len(result), 3)
|
560
|
+
self.assertEqual(
|
561
|
+
result[1][0].as_py(), decimal.Decimal("322236.66")
|
562
|
+
) # rounding decimal
|
563
|
+
self.assertEqual(result[1][1].as_py(), decimal.Decimal("32.33")) # not rounded
|
564
|
+
self.assertEqual(len(result.column_names), 2)
|
565
|
+
result_schema = result.schema
|
566
|
+
self.assertEqual(result_schema.field(0).type, "string")
|
567
|
+
self.assertEqual(result_schema.field(1).type, pa.decimal256(20, 2))
|
568
|
+
|
569
|
+
def test_read_csv_when_decimal_scale_overflows_with_decimal256_and_raise_on_overflow(
|
570
|
+
self,
|
571
|
+
):
|
572
|
+
schema = pa.schema(
|
573
|
+
[("is_active", pa.string()), ("decimal_value", pa.decimal256(20, 2))]
|
574
|
+
)
|
575
|
+
kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
|
576
|
+
_add_column_kwargs(
|
577
|
+
ContentType.UNESCAPED_TSV.value,
|
578
|
+
["is_active", "decimal_value"],
|
579
|
+
["is_active", "decimal_value"],
|
580
|
+
kwargs,
|
581
|
+
)
|
582
|
+
|
583
|
+
read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
|
584
|
+
|
585
|
+
kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
|
586
|
+
|
587
|
+
self.assertRaises(
|
588
|
+
pa.lib.ArrowNotImplementedError,
|
589
|
+
lambda: pyarrow_read_csv(
|
590
|
+
OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
|
591
|
+
**{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
|
592
|
+
),
|
593
|
+
)
|
594
|
+
|
595
|
+
def test_read_csv_when_decimal_scale_overflows_without_any_schema_then_infers(self):
|
596
|
+
kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
|
597
|
+
read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=None)
|
598
|
+
kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
|
599
|
+
|
600
|
+
result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
|
601
|
+
|
602
|
+
# The default behavior of pyarrow is to skip invalid rows
|
603
|
+
self.assertEqual(len(result), 2)
|
604
|
+
self.assertEqual(result[1][0].as_py(), 32.33) # rounding decimal
|
605
|
+
self.assertEqual(result[1][1].as_py(), 0.4) # not rounded
|
606
|
+
self.assertEqual(len(result.column_names), 2)
|
607
|
+
result_schema = result.schema
|
608
|
+
self.assertEqual(result_schema.field(0).type, "string")
|
609
|
+
self.assertEqual(result_schema.field(1).type, pa.float64())
|
610
|
+
|
611
|
+
def test_read_csv_when_decimal_scale_and_precision_overflow_and_raise_on_overflow(
|
612
|
+
self,
|
613
|
+
):
|
614
|
+
schema = pa.schema(
|
615
|
+
[("is_active", pa.string()), ("decimal_value", pa.decimal128(5, 2))]
|
616
|
+
)
|
617
|
+
kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
|
618
|
+
_add_column_kwargs(
|
619
|
+
ContentType.UNESCAPED_TSV.value,
|
620
|
+
["is_active", "decimal_value"],
|
621
|
+
["is_active", "decimal_value"],
|
622
|
+
kwargs,
|
623
|
+
)
|
624
|
+
|
625
|
+
read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
|
626
|
+
|
627
|
+
kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
|
628
|
+
|
629
|
+
self.assertRaises(
|
630
|
+
pa.lib.ArrowInvalid,
|
631
|
+
lambda: pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs),
|
632
|
+
)
|
633
|
+
|
634
|
+
def test_read_csv_when_decimal_scale_overflow_and_file_like_obj_passed(self):
|
635
|
+
schema = pa.schema(
|
636
|
+
[("is_active", pa.string()), ("decimal_value", pa.decimal128(15, 2))]
|
637
|
+
)
|
638
|
+
kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
|
639
|
+
_add_column_kwargs(
|
640
|
+
ContentType.UNESCAPED_TSV.value,
|
641
|
+
["is_active", "decimal_value"],
|
642
|
+
["is_active", "decimal_value"],
|
643
|
+
kwargs,
|
644
|
+
)
|
645
|
+
|
646
|
+
read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
|
647
|
+
|
648
|
+
kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
|
649
|
+
|
650
|
+
with open(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, "rb") as file:
|
651
|
+
result = pyarrow_read_csv(file, **kwargs)
|
652
|
+
|
653
|
+
self.assertEqual(len(result), 3)
|
654
|
+
self.assertEqual(
|
655
|
+
result[1][0].as_py(), decimal.Decimal("322236.66")
|
656
|
+
) # rounding decimal
|
657
|
+
self.assertEqual(
|
658
|
+
result[1][1].as_py(), decimal.Decimal("32.33")
|
659
|
+
) # not rounded
|
660
|
+
self.assertEqual(len(result.column_names), 2)
|
661
|
+
result_schema = result.schema
|
662
|
+
self.assertEqual(result_schema.field(0).type, "string")
|
663
|
+
self.assertEqual(result_schema.field(1).type, pa.decimal128(15, 2))
|
664
|
+
|
410
665
|
|
411
666
|
class TestS3FileToTable(TestCase):
|
412
667
|
def test_s3_file_to_table_identity_sanity(self):
|
deltacat/utils/pyarrow.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# Allow classes to use self-referencing Type hints in Python 3.7.
|
2
2
|
from __future__ import annotations
|
3
3
|
|
4
|
+
import copy
|
4
5
|
import bz2
|
5
6
|
import gzip
|
6
7
|
import io
|
@@ -47,6 +48,17 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
|
47
48
|
RAISE_ON_EMPTY_CSV_KWARG = "raise_on_empty_csv"
|
48
49
|
READER_TYPE_KWARG = "reader_type"
|
49
50
|
|
51
|
+
"""
|
52
|
+
By default, round decimal values using half_to_even round mode when
|
53
|
+
rescaling a decimal to the given scale and precision in the schema would cause
|
54
|
+
data loss. Setting any truthy value for this argument will result
|
55
|
+
in an error instead.
|
56
|
+
"""
|
57
|
+
RAISE_ON_DECIMAL_OVERFLOW = "raise_on_decimal_overflow"
|
58
|
+
# Note the maximum from https://arrow.apache.org/docs/python/generated/pyarrow.Decimal256Type.html#pyarrow.Decimal256Type
|
59
|
+
DECIMAL256_DEFAULT_SCALE = 38
|
60
|
+
DECIMAL256_MAX_PRECISION = 76
|
61
|
+
|
50
62
|
|
51
63
|
def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Schema:
|
52
64
|
|
@@ -64,45 +76,162 @@ def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Sche
|
|
64
76
|
return target_schema
|
65
77
|
|
66
78
|
|
67
|
-
def
|
68
|
-
|
69
|
-
|
70
|
-
|
79
|
+
def _extract_arrow_schema_from_read_csv_kwargs(kwargs: Dict[str, Any]) -> pa.Schema:
|
80
|
+
schema = None
|
81
|
+
if (
|
82
|
+
"convert_options" in kwargs
|
83
|
+
and kwargs["convert_options"].column_types is not None
|
84
|
+
):
|
85
|
+
schema = kwargs["convert_options"].column_types
|
86
|
+
if not isinstance(schema, pa.Schema):
|
87
|
+
schema = pa.schema(schema)
|
88
|
+
if kwargs["convert_options"].include_columns:
|
89
|
+
schema = _filter_schema_for_columns(
|
90
|
+
schema, kwargs["convert_options"].include_columns
|
91
|
+
)
|
92
|
+
elif (
|
93
|
+
kwargs.get("read_options") is not None
|
94
|
+
and kwargs["read_options"].column_names
|
95
|
+
):
|
96
|
+
schema = _filter_schema_for_columns(
|
97
|
+
schema, kwargs["read_options"].column_names
|
98
|
+
)
|
99
|
+
else:
|
100
|
+
logger.debug(
|
101
|
+
"Schema not specified in the kwargs."
|
102
|
+
" Hence, schema could not be inferred from the empty CSV."
|
71
103
|
)
|
104
|
+
|
105
|
+
return schema
|
106
|
+
|
107
|
+
|
108
|
+
def _new_schema_with_replaced_fields(
|
109
|
+
schema: pa.Schema, field_to_replace: Callable[[pa.Field], Optional[pa.Field]]
|
110
|
+
) -> pa.Schema:
|
111
|
+
if schema is None:
|
112
|
+
return None
|
113
|
+
|
114
|
+
new_schema_fields = []
|
115
|
+
for field in schema:
|
116
|
+
new_field = field_to_replace(field)
|
117
|
+
if new_field is not None:
|
118
|
+
new_schema_fields.append(new_field)
|
119
|
+
else:
|
120
|
+
new_schema_fields.append(field)
|
121
|
+
|
122
|
+
return pa.schema(new_schema_fields, metadata=schema.metadata)
|
123
|
+
|
124
|
+
|
125
|
+
def _read_csv_rounding_decimal_columns_to_fit_scale(
|
126
|
+
schema: pa.Schema, reader_args: List[Any], reader_kwargs: Dict[str, Any]
|
127
|
+
) -> pa.Table:
|
128
|
+
# Note: We read decimals as strings first because CSV
|
129
|
+
# conversion to decimal256 isn't implemented as of pyarrow==12.0.1
|
130
|
+
new_schema = _new_schema_with_replaced_fields(
|
131
|
+
schema,
|
132
|
+
lambda fld: pa.field(fld.name, pa.string(), metadata=fld.metadata)
|
133
|
+
if pa.types.is_decimal128(fld.type) or pa.types.is_decimal256(fld.type)
|
134
|
+
else None,
|
135
|
+
)
|
136
|
+
new_kwargs = sanitize_kwargs_by_supported_kwargs(
|
137
|
+
["read_options", "parse_options", "convert_options", "memory_pool"],
|
138
|
+
reader_kwargs,
|
139
|
+
)
|
140
|
+
# Creating a shallow copy for efficiency
|
141
|
+
new_convert_options = copy.copy(new_kwargs["convert_options"])
|
142
|
+
new_convert_options.column_types = new_schema
|
143
|
+
new_reader_kwargs = {**new_kwargs, "convert_options": new_convert_options}
|
144
|
+
arrow_table = pacsv.read_csv(*reader_args, **new_reader_kwargs)
|
145
|
+
|
146
|
+
for column_index, field in enumerate(schema):
|
147
|
+
if pa.types.is_decimal128(field.type) or pa.types.is_decimal256(field.type):
|
148
|
+
column_array = arrow_table[field.name]
|
149
|
+
# We always cast to decimal256 to accommodate fixed scale of 38
|
150
|
+
cast_to_type = pa.decimal256(
|
151
|
+
DECIMAL256_MAX_PRECISION, DECIMAL256_DEFAULT_SCALE
|
152
|
+
)
|
153
|
+
casted_decimal_array = pc.cast(column_array, cast_to_type)
|
154
|
+
# Note that scale can be negative
|
155
|
+
rounded_column_array = pc.round(
|
156
|
+
casted_decimal_array, ndigits=field.type.scale
|
157
|
+
)
|
158
|
+
final_decimal_array = pc.cast(rounded_column_array, field.type)
|
159
|
+
arrow_table = arrow_table.set_column(
|
160
|
+
column_index,
|
161
|
+
field,
|
162
|
+
final_decimal_array,
|
163
|
+
)
|
164
|
+
logger.debug(
|
165
|
+
f"Rounded decimal column: {field.name} to {field.type.scale} scale and"
|
166
|
+
f" {field.type.precision} precision"
|
167
|
+
)
|
168
|
+
|
169
|
+
return arrow_table
|
170
|
+
|
171
|
+
|
172
|
+
def pyarrow_read_csv_default(*args, **kwargs):
|
173
|
+
new_kwargs = sanitize_kwargs_by_supported_kwargs(
|
174
|
+
["read_options", "parse_options", "convert_options", "memory_pool"], kwargs
|
175
|
+
)
|
176
|
+
|
177
|
+
try:
|
72
178
|
return pacsv.read_csv(*args, **new_kwargs)
|
73
179
|
except pa.lib.ArrowInvalid as e:
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
)
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
180
|
+
error_str = e.__str__()
|
181
|
+
schema = _extract_arrow_schema_from_read_csv_kwargs(kwargs)
|
182
|
+
|
183
|
+
if error_str == "Empty CSV file" and not kwargs.get(RAISE_ON_EMPTY_CSV_KWARG):
|
184
|
+
logger.debug(f"Read CSV empty schema being used: {schema}")
|
185
|
+
return pa.Table.from_pylist([], schema=schema)
|
186
|
+
if not kwargs.get(RAISE_ON_DECIMAL_OVERFLOW):
|
187
|
+
# Note, this logic requires expensive casting. To prevent degrading performance
|
188
|
+
# for happy path reads, we are handling this case in response to an error.
|
189
|
+
logger.warning(
|
190
|
+
"Rescaling Decimal to the given scale in the schema. "
|
191
|
+
f"Original error: {error_str}"
|
192
|
+
)
|
193
|
+
|
194
|
+
if schema is not None and "convert_options" in kwargs:
|
195
|
+
if (
|
196
|
+
"Rescaling Decimal" in error_str
|
197
|
+
and "value would cause data loss" in error_str
|
90
198
|
):
|
91
|
-
|
92
|
-
|
199
|
+
logger.debug(f"Checking if the file: {args[0]}...")
|
200
|
+
# Since we are re-reading the file, we have to seek to beginning
|
201
|
+
if isinstance(args[0], io.IOBase) and args[0].seekable():
|
202
|
+
logger.debug(f"Seeking to the beginning of the file {args[0]}")
|
203
|
+
args[0].seek(0)
|
204
|
+
return _read_csv_rounding_decimal_columns_to_fit_scale(
|
205
|
+
schema=schema, reader_args=args, reader_kwargs=kwargs
|
93
206
|
)
|
94
|
-
|
95
207
|
else:
|
96
208
|
logger.debug(
|
97
|
-
"Schema
|
98
|
-
"
|
209
|
+
"Schema is None when trying to adjust decimal values. "
|
210
|
+
"Hence, bubbling up exception..."
|
99
211
|
)
|
100
212
|
|
101
|
-
logger.debug(f"Read CSV empty schema being used: {schema}")
|
102
|
-
return pa.Table.from_pylist([], schema=schema)
|
103
213
|
raise e
|
104
214
|
|
105
215
|
|
216
|
+
def pyarrow_read_csv(*args, **kwargs) -> pa.Table:
|
217
|
+
schema = _extract_arrow_schema_from_read_csv_kwargs(kwargs)
|
218
|
+
|
219
|
+
# CSV conversion to decimal256 isn't supported as of pyarrow=12.0.1
|
220
|
+
# Below ensures decimal256 is cast properly.
|
221
|
+
schema_includes_decimal256 = (
|
222
|
+
(True if any([pa.types.is_decimal256(x.type) for x in schema]) else False)
|
223
|
+
if schema is not None
|
224
|
+
else None
|
225
|
+
)
|
226
|
+
if schema_includes_decimal256 and not kwargs.get(RAISE_ON_DECIMAL_OVERFLOW):
|
227
|
+
# falling back to expensive method of reading CSV
|
228
|
+
return _read_csv_rounding_decimal_columns_to_fit_scale(
|
229
|
+
schema, reader_args=args, reader_kwargs=kwargs
|
230
|
+
)
|
231
|
+
else:
|
232
|
+
return pyarrow_read_csv_default(*args, **kwargs)
|
233
|
+
|
234
|
+
|
106
235
|
CONTENT_TYPE_TO_PA_READ_FUNC: Dict[str, Callable] = {
|
107
236
|
ContentType.UNESCAPED_TSV.value: pyarrow_read_csv,
|
108
237
|
ContentType.TSV.value: pyarrow_read_csv,
|
@@ -1,4 +1,4 @@
|
|
1
|
-
deltacat/__init__.py,sha256=
|
1
|
+
deltacat/__init__.py,sha256=GPlTQc6AW4ig_nZJ7kMVe-kbZxYfrSVGFN1YEqY8dXU,1778
|
2
2
|
deltacat/constants.py,sha256=TUJLXUJ9xq1Ryil72yLkKR8EDH_Irp5wUg56QstbRNE,2181
|
3
3
|
deltacat/exceptions.py,sha256=7sjk3BuMY5Oo-6OvAfHncZx_OcvtEL47BblWr2F7waE,12740
|
4
4
|
deltacat/logs.py,sha256=EQSDin1deehzz5xlLV1_TrFJrO_IBZ9Ahp7MdL-4cK8,9363
|
@@ -179,7 +179,7 @@ deltacat/tests/utils/test_cloudpickle.py,sha256=J0pnBY3-PxlUh6MamZAN1PuquKQPr2iy
|
|
179
179
|
deltacat/tests/utils/test_daft.py,sha256=kY8lkXoQvyWunok8UvOsh1An297rb3jcnstTuIAyAlc,8232
|
180
180
|
deltacat/tests/utils/test_metrics.py,sha256=Ym9nOz1EtB180pLmvugihj1sDTNDMb5opIjjr5Nmcls,16339
|
181
181
|
deltacat/tests/utils/test_placement.py,sha256=g61wVOMkHe4YJeR9Oxg_BOVQ6bhHHbC3IBYv8YhUu94,597
|
182
|
-
deltacat/tests/utils/test_pyarrow.py,sha256=
|
182
|
+
deltacat/tests/utils/test_pyarrow.py,sha256=YDuyFYNjy6thzfA6Z2a0dOytUugsExu1uMUOhmP_aXc,29977
|
183
183
|
deltacat/tests/utils/test_record_batch_tables.py,sha256=AkG1WyljQmjnl-AxhbFWyo5LnMIKRyLScfgC2B_ES-s,11321
|
184
184
|
deltacat/tests/utils/test_resources.py,sha256=HtpvDrfPZQNtGDXUlsIzc_yd7Vf1cDscZ3YbN0oTvO8,2560
|
185
185
|
deltacat/tests/utils/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -200,7 +200,7 @@ deltacat/utils/numpy.py,sha256=SpHKKvC-K8NINTWGVfTZ5-gBFTGYqaXjjgKFhsdUjwg,2049
|
|
200
200
|
deltacat/utils/pandas.py,sha256=q99mlRB7tymICMcNbfGLfLqFu_C-feyPZKZm2CWJJVc,9574
|
201
201
|
deltacat/utils/performance.py,sha256=7ZLaMkS1ehPSIhT5uOQVBHvjC70iKHzoFquFo-KL0PI,645
|
202
202
|
deltacat/utils/placement.py,sha256=Lj20fb-eq8rgMdm_M2MBMfDLwhDM1sS1nJj2DvIK56s,12060
|
203
|
-
deltacat/utils/pyarrow.py,sha256=
|
203
|
+
deltacat/utils/pyarrow.py,sha256=xEZRzbTBU6uj9K4DtvngIPtQkTA8haVgQ4Y4vjwHvtM,34311
|
204
204
|
deltacat/utils/resources.py,sha256=Ax1OgLLbZI4oYpp4Ki27OLaST-7I-AJgZwU87FVfY8g,8253
|
205
205
|
deltacat/utils/s3fs.py,sha256=PmUJ5Fm1WmD-_zp_M6yd9VbXvIoJuBeK6ApOdJJApLE,662
|
206
206
|
deltacat/utils/schema.py,sha256=m4Wm4ZQcpttzOUxex4dVneGlHy1_E36HspTcjNYzvVM,1564
|
@@ -210,8 +210,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
|
|
210
210
|
deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
|
211
211
|
deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
|
212
212
|
deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
|
213
|
-
deltacat-1.1.
|
214
|
-
deltacat-1.1.
|
215
|
-
deltacat-1.1.
|
216
|
-
deltacat-1.1.
|
217
|
-
deltacat-1.1.
|
213
|
+
deltacat-1.1.28.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
214
|
+
deltacat-1.1.28.dist-info/METADATA,sha256=-ZnMp9C26vVxi015Il3UtBMm9pHqA8aZmTnNTgr8Tb8,1733
|
215
|
+
deltacat-1.1.28.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
216
|
+
deltacat-1.1.28.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
|
217
|
+
deltacat-1.1.28.dist-info/RECORD,,
|
File without changes
|
File without changes
|