deltacat 1.1.27__py3-none-any.whl → 1.1.29__py3-none-any.whl
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor_v2/constants.py +15 -1
- deltacat/compute/compactor_v2/steps/merge.py +30 -5
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +15 -3
- deltacat/tests/compute/compact_partition_test_cases.py +32 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +133 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/test_utils/pyarrow.py +15 -8
- deltacat/tests/utils/test_pyarrow.py +278 -0
- deltacat/utils/pyarrow.py +162 -31
- {deltacat-1.1.27.dist-info → deltacat-1.1.29.dist-info}/METADATA +1 -1
- {deltacat-1.1.27.dist-info → deltacat-1.1.29.dist-info}/RECORD +16 -15
- {deltacat-1.1.27.dist-info → deltacat-1.1.29.dist-info}/WHEEL +1 -1
- {deltacat-1.1.27.dist-info → deltacat-1.1.29.dist-info}/LICENSE +0 -0
- {deltacat-1.1.27.dist-info → deltacat-1.1.29.dist-info}/top_level.txt +0 -0
deltacat/__init__.py
CHANGED
deltacat/compute/compactor_v2/constants.py
CHANGED
@@ -1,3 +1,5 @@
+from deltacat.utils.common import env_bool, env_integer
+
 TOTAL_BYTES_IN_SHA1_HASH = 20
 
 PK_DELIMITER = "L6kl7u5f"
@@ -31,7 +33,9 @@ TOTAL_MEMORY_BUFFER_PERCENTAGE = 30
 # The total size of records that will be hash bucketed at once
 # Since, sorting is nlogn, we ensure that is not performed
 # on a very large dataset for best performance.
-MAX_SIZE_OF_RECORD_BATCH_IN_GIB =
+MAX_SIZE_OF_RECORD_BATCH_IN_GIB = env_integer(
+    "MAX_SIZE_OF_RECORD_BATCH_IN_GIB", 2 * 1024 * 1024 * 1024
+)
 
 # Whether to drop duplicates during merge.
 DROP_DUPLICATES = True
@@ -78,3 +82,13 @@ COMPACT_PARTITION_METRIC_PREFIX = "compact_partition"
 # Number of rounds to run hash/merge for a single
 # partition. (For large table support)
 DEFAULT_NUM_ROUNDS = 1
+
+# Whether to perform sha1 hashing when required to
+# optimize memory. For example, hashing is always
+# required for bucketing where it's not mandatory
+# when dropping duplicates. Setting this to True
+# will disable sha1 hashing in cases where it isn't
+# mandatory. This flag is False by default.
+SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED = env_bool(
+    "SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED", False
+)
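Both constants above are now environment-driven (env_integer and env_bool come from deltacat.utils.common), so the record-batch size cap and the sha1-hashing opt-out can be tuned without code changes. A minimal sketch of overriding them, assuming the variables are exported before the constants module is first imported and that env_bool treats a non-empty string such as "True" as truthy:

```python
import os

# Assumption: set these before deltacat.compute.compactor_v2.constants is imported,
# since env_bool/env_integer are evaluated at module import time.
os.environ["SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED"] = "True"
os.environ["MAX_SIZE_OF_RECORD_BATCH_IN_GIB"] = str(1 * 1024 * 1024 * 1024)  # 1 GiB cap

from deltacat.compute.compactor_v2 import constants

print(constants.SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED)  # expected: truthy
print(constants.MAX_SIZE_OF_RECORD_BATCH_IN_GIB)                # expected: 1073741824
```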
deltacat/compute/compactor_v2/steps/merge.py
CHANGED
@@ -7,6 +7,7 @@ import ray
 import itertools
 import time
 import pyarrow.compute as pc
+from deltacat.utils.pyarrow import MAX_INT_BYTES
 import deltacat.compute.compactor_v2.utils.merge as merge_utils
 from uuid import uuid4
 from deltacat import logs
@@ -147,10 +148,32 @@ def _merge_tables(
     if compacted_table:
         compacted_table = all_tables[0]
 
+        compacted_pk_hash_str = compacted_table[sc._PK_HASH_STRING_COLUMN_NAME]
+        incremental_pk_hash_str = incremental_table[sc._PK_HASH_STRING_COLUMN_NAME]
+
+        logger.info(
+            f"Size of compacted pk hash={compacted_pk_hash_str.nbytes} "
+            f"and incremental pk hash={incremental_pk_hash_str.nbytes}."
+        )
+
+        if (
+            compacted_table[sc._PK_HASH_STRING_COLUMN_NAME].nbytes >= MAX_INT_BYTES
+            or incremental_table[sc._PK_HASH_STRING_COLUMN_NAME].nbytes >= MAX_INT_BYTES
+        ):
+            logger.info("Casting compacted and incremental pk hash to large_string...")
+            # is_in combines the chunks of the chunked array passed which can cause
+            # ArrowCapacityError if the total size of string array is over 2GB.
+            # Using a large_string would resolve this issue.
+            # The cast here should be zero-copy in most cases.
+            compacted_pk_hash_str = pc.cast(compacted_pk_hash_str, pa.large_string())
+            incremental_pk_hash_str = pc.cast(
+                incremental_pk_hash_str, pa.large_string()
+            )
+
         records_to_keep = pc.invert(
             pc.is_in(
-
-
+                compacted_pk_hash_str,
+                incremental_pk_hash_str,
             )
         )
 
@@ -492,9 +515,11 @@ def _copy_manifests_from_hash_bucketing(
 def _timed_merge(input: MergeInput) -> MergeResult:
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
-    with
-        f"merge_{worker_id}_{task_id}.bin"
-
+    with (
+        memray.Tracker(f"merge_{worker_id}_{task_id}.bin")
+        if input.enable_profiler
+        else nullcontext()
+    ):
         total_input_records, total_deduped_records = 0, 0
         total_dropped_records = 0
         materialized_results: List[MaterializeResult] = []
deltacat/compute/compactor_v2/utils/dedupe.py
CHANGED
@@ -25,7 +25,7 @@ def _create_chunked_index_array(array: pa.Array) -> pa.Array:
         result[index] = np.arange(cl, dtype="int32")
 
     chunk_lengths = ([0] + chunk_lengths)[:-1]
-    result = pa.chunked_array(result + np.cumsum(chunk_lengths))
+    result = pa.chunked_array(result + np.cumsum(chunk_lengths), type=pa.int32())
     return result
 
 
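A short standalone sketch (plain numpy/pyarrow, not deltacat's helper) of what the explicit type=pa.int32() above buys: the chunked index is built from int32 ranges shifted by numpy cumsum offsets, and without an explicit type pa.chunked_array infers the integer width from whatever dtype numpy's promotion rules produce for those sums.

```python
import numpy as np
import pyarrow as pa

chunk_lengths = [3, 2]
chunks = [np.arange(cl, dtype="int32") for cl in chunk_lengths]
offsets = np.cumsum(([0] + chunk_lengths)[:-1])  # [0, 3], int64 by default

# Shift each chunk by its starting offset so indices are global across chunks.
shifted = [chunk + offset for chunk, offset in zip(chunks, offsets)]

# Pinning int32 keeps the index column's type stable instead of depending on how
# numpy promoted the shifted chunks.
chunked = pa.chunked_array(shifted, type=pa.int32())
print(chunked.type)         # int32
print(chunked.to_pylist())  # [0, 1, 2, 3, 4]
```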
deltacat/compute/compactor_v2/utils/primary_key_index.py
CHANGED
@@ -10,6 +10,7 @@ from deltacat.compute.compactor_v2.constants import (
     TOTAL_BYTES_IN_SHA1_HASH,
     PK_DELIMITER,
     MAX_SIZE_OF_RECORD_BATCH_IN_GIB,
+    SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED,
 )
 import time
 from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelope
@@ -48,6 +49,13 @@ def _is_sha1_desired(hash_columns: List[pa.Array]) -> bool:
         f"Found total length of hash column={total_len} and total_size={total_size}"
     )
 
+    if SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED:
+        logger.info(
+            f"SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED is True. "
+            f"Returning False for is_sha1_desired"
+        )
+        return False
+
     return total_size > TOTAL_BYTES_IN_SHA1_HASH * total_len
 
 
@@ -108,9 +116,10 @@ def _optimized_group_record_batches_by_hash_bucket(
     record_batches = []
     result_len = 0
     for record_batch in table_batches:
-
-
-
+        if (
+            record_batches
+            and current_bytes + record_batch.nbytes >= MAX_SIZE_OF_RECORD_BATCH_IN_GIB
+        ):
             logger.info(
                 f"Total number of record batches without exceeding {MAX_SIZE_OF_RECORD_BATCH_IN_GIB} "
                 f"is {len(record_batches)} and size {current_bytes}"
@@ -128,6 +137,9 @@ def _optimized_group_record_batches_by_hash_bucket(
             current_bytes = 0
             record_batches.clear()
 
+        current_bytes += record_batch.nbytes
+        record_batches.append(record_batch)
+
     if record_batches:
         appended_len, append_latency = timed_invocation(
             _append_table_by_hash_bucket,
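The loop above now flushes the accumulated record batches before appending one that would push the group past MAX_SIZE_OF_RECORD_BATCH_IN_GIB, then starts the next group with that batch. A miniature sketch of the same accumulate-then-flush pattern with a tiny byte cap (plain pyarrow, not the deltacat function):

```python
import pyarrow as pa

batch = pa.record_batch([pa.array([1, 2, 3, 4])], names=["x"])
table_batches = [batch] * 5

# Cap sized so three toy batches fit per group; the compactor's cap is ~2 GiB.
MAX_GROUP_BYTES = batch.nbytes * 3 + 1

groups, record_batches, current_bytes = [], [], 0
for record_batch in table_batches:
    # Flush the current group first if adding this batch would cross the cap.
    if record_batches and current_bytes + record_batch.nbytes >= MAX_GROUP_BYTES:
        groups.append(pa.Table.from_batches(record_batches))
        record_batches, current_bytes = [], 0
    current_bytes += record_batch.nbytes
    record_batches.append(record_batch)

if record_batches:
    groups.append(pa.Table.from_batches(record_batches))

print([g.num_rows for g in groups])  # [12, 8]: three batches, then the remaining two
```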
deltacat/tests/compute/compact_partition_test_cases.py
CHANGED
@@ -601,6 +601,38 @@ INCREMENTAL_TEST_CASES: Dict[str, IncrementalCompactionTestCaseParams] = {
         skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
         assert_compaction_audit=None,
     ),
+    "15-incremental-empty-input-with-single-hash-bucket": IncrementalCompactionTestCaseParams(
+        primary_keys={"pk_col_1"},
+        sort_keys=[SortKey.of(key_name="sk_col_1")],
+        partition_keys=ZERO_VALUED_PARTITION_KEYS_PARAM,
+        partition_values=ZERO_VALUED_PARTITION_VALUES_PARAM,
+        input_deltas=pa.Table.from_arrays(
+            [
+                pa.array([]),
+                pa.array([]),
+            ],
+            names=["pk_col_1", "sk_col_1"],
+        ),
+        input_deltas_delta_type=DeltaType.UPSERT,
+        expected_terminal_compact_partition_result=pa.Table.from_arrays(
+            [
+                pa.array([]),
+                pa.array([]),
+            ],
+            names=["pk_col_1", "sk_col_1"],
+        ),
+        expected_terminal_exception=None,
+        expected_terminal_exception_message=None,
+        do_create_placement_group=False,
+        records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
+        hash_bucket_count=1,
+        read_kwargs_provider=None,
+        drop_duplicates=True,
+        is_inplace=False,
+        add_late_deltas=None,
+        skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
+        assert_compaction_audit=assert_compaction_audit_no_hash_bucket,
+    ),
 }
 
 INCREMENTAL_TEST_CASES = with_compactor_version_func_test_param(INCREMENTAL_TEST_CASES)
deltacat/tests/compute/compactor_v2/test_compaction_session.py
CHANGED
@@ -1,6 +1,7 @@
 from typing import Dict, Any
 import ray
 import os
+import pyarrow as pa
 import pytest
 import boto3
 from deltacat.compute.compactor.model.compaction_session_audit_info import (
@@ -76,6 +77,17 @@ def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
     os.remove(DATABASE_FILE_PATH_VALUE)
 
 
+@pytest.fixture(scope="function")
+def disable_sha1(monkeypatch):
+    import deltacat.compute.compactor_v2.utils.primary_key_index
+
+    monkeypatch.setattr(
+        deltacat.compute.compactor_v2.utils.primary_key_index,
+        "SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED",
+        True,
+    )
+
+
 class TestCompactionSession:
     """
     This class adds specific tests that aren't part of the parametrized test suite.
@@ -556,3 +568,124 @@
                 }
             )
         )
+
+    def test_compact_partition_when_incremental_pk_hash_is_over_2gb(
+        self, s3_resource, local_deltacat_storage_kwargs, disable_sha1
+    ):
+        """
+        A test case which ensures the compaction succeeds even if the incremental
+        arrow table size is over 2GB. It is added to prevent ArrowCapacityError
+        when running is_in operation during merge.
+
+        Note that we set SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED to bypass sha1 hashing
+        which truncates the lengths of pk strings when deduping.
+        """
+        # setup
+        staged_source = stage_partition_from_file_paths(
+            self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
+        )
+        # we create chunked array to avoid ArrowCapacityError
+        chunked_pk_array = pa.chunked_array([["13bytesstring"], ["12bytestring"]])
+        table = pa.table([chunked_pk_array], names=["pk"])
+        source_delta = commit_delta_to_staged_partition(
+            staged_source, pa_table=table, **local_deltacat_storage_kwargs
+        )
+
+        staged_dest = stage_partition_from_file_paths(
+            self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
+        )
+        dest_partition = ds.commit_partition(
+            staged_dest, **local_deltacat_storage_kwargs
+        )
+
+        # rebase first
+        rebase_url = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": dest_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 1,
+                    "last_stream_position_to_compact": source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "rebase_source_partition_locator": source_delta.partition_locator,
+                    "rebase_source_partition_high_watermark": source_delta.stream_position,
+                    "records_per_compacted_file": 4000,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": source_delta.partition_locator,
+                    "resource_estimation_method": ResourceEstimationMethod.PREVIOUS_INFLATION,
+                }
+            )
+        )
+
+        rebased_rcf = get_rcf(s3_resource, rebase_url)
+
+        assert rebased_rcf.compacted_pyarrow_write_result.files == 1
+        assert rebased_rcf.compacted_pyarrow_write_result.records == 2
+
+        # Run incremental with a small delta on source
+        chunked_pk_array = pa.chunked_array(
+            [["13bytesstring" * 95_000_000], ["12bytestring" * 95_000_000]]
+        )  # 2.3GB
+        table = pa.table([chunked_pk_array], names=["pk"])
+
+        incremental_source_delta = commit_delta_to_partition(
+            source_delta.partition_locator,
+            pa_table=table,
+            **local_deltacat_storage_kwargs,
+        )
+        assert (
+            incremental_source_delta.partition_locator == source_delta.partition_locator
+        ), "source partition locator should not change"
+        dest_partition = ds.get_partition(
+            dest_partition.stream_locator,
+            dest_partition.partition_values,
+            **local_deltacat_storage_kwargs,
+        )
+
+        assert (
+            dest_partition.locator
+            == rebased_rcf.compacted_delta_locator.partition_locator
+        ), "The new destination partition should be same as compacted partition"
+
+        # Run incremental
+        incremental_url = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": dest_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 1,
+                    "last_stream_position_to_compact": incremental_source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "records_per_compacted_file": 4000,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": incremental_source_delta.partition_locator,
+                    "resource_estimation_method": ResourceEstimationMethod.PREVIOUS_INFLATION,
+                }
+            )
+        )
+
+        incremental_rcf = get_rcf(s3_resource, incremental_url)
+
+        assert incremental_rcf.compacted_pyarrow_write_result.files == 1
+        assert (
+            incremental_rcf.compacted_pyarrow_write_result.pyarrow_bytes >= 2300000000
+        )
+        assert incremental_rcf.compacted_pyarrow_write_result.records == 4
deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py
ADDED
@@ -0,0 +1,45 @@
+import pyarrow as pa
+from deltacat.compute.compactor_v2.utils.primary_key_index import (
+    group_by_pk_hash_bucket,
+)
+
+
+class TestGroupByPkHashBucket:
+    def test_sanity(self):
+        record = pa.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
+        pk = pa.array(["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"])
+        record_batch = pa.RecordBatch.from_arrays([record, pk], names=["record", "pk"])
+        table = pa.Table.from_batches([record_batch])
+        grouped_array = group_by_pk_hash_bucket(table, 3, ["pk"])
+
+        assert len(grouped_array) == 3
+        total_records = 0
+        for arr in grouped_array:
+            if arr is not None:
+                total_records += len(arr[1])
+
+        assert total_records == len(table)
+
+    def test_when_record_batches_exceed_int_max_size(self):
+        record = pa.array(["12bytestring" * 90_000_000])
+        record_batch = pa.RecordBatch.from_arrays([record], names=["pk"])
+        table = pa.Table.from_batches([record_batch, record_batch])
+
+        grouped_array = group_by_pk_hash_bucket(table, 3, ["pk"])
+
+        assert len(grouped_array) == 3
+        # two record batches are preserved as combining them
+        # would exceed 2GB.
+        assert len(grouped_array[2].to_batches()) == 2
+
+    def test_when_record_batches_less_than_int_max_size(self):
+        record = pa.array(["12bytestring" * 90_000])
+        record_batch = pa.RecordBatch.from_arrays([record], names=["pk"])
+        table = pa.Table.from_batches([record_batch, record_batch])
+
+        grouped_array = group_by_pk_hash_bucket(table, 3, ["pk"])
+
+        assert len(grouped_array) == 3
+        # Combined the arrays into one record batch as the size
+        # would not exceed 2GB.
+        assert len(grouped_array[1].to_batches()) == 1
deltacat/tests/test_utils/pyarrow.py
CHANGED
@@ -47,7 +47,8 @@ def stage_partition_from_file_paths(
 
 def commit_delta_to_staged_partition(
     staged_partition,
-    file_paths: List[str],
+    file_paths: List[str] = None,
+    pa_table: pa.Table = None,
     content_type: ContentType = ContentType.PARQUET,
     *args,
     **kwargs,
@@ -57,6 +58,7 @@ def commit_delta_to_staged_partition(
         *args,
        file_paths=file_paths,
        content_type=content_type,
+        pa_table=pa_table,
        **kwargs,
    )
    ds.commit_partition(staged_partition, **kwargs)
@@ -76,23 +78,28 @@ def download_delta(delta_like: Union[Delta, DeltaLocator], *args, **kwargs) -> D
 
 def commit_delta_to_partition(
     partition: Union[Partition, PartitionLocator],
-    file_paths: List[str],
+    file_paths: List[str] = None,
+    pa_table: pa.Table = None,
     content_type: ContentType = ContentType.PARQUET,
     *args,
     **kwargs,
 ) -> Delta:
-    tables = []
 
     if isinstance(partition, PartitionLocator):
         partition = ds.get_partition(
             partition.stream_locator, partition.partition_values, *args, **kwargs
         )
+    if pa_table is None:
+        assert file_paths is not None, "One of pa_table or file_paths must be passed."
+        tables = []
+        for file_path in file_paths:
+            table = pa.csv.read_csv(file_path)
+            tables.append(table)
 
-
-        table = pa.csv.read_csv(file_path)
-        tables.append(table)
+        pa_table = pa.concat_tables(tables)
 
-
-
+    staged_delta = ds.stage_delta(
+        pa_table, partition, content_type=content_type, **kwargs
+    )
 
     return ds.commit_delta(staged_delta, **kwargs)
deltacat/tests/utils/test_pyarrow.py
CHANGED
@@ -7,15 +7,24 @@ from deltacat.utils.pyarrow import (
     s3_file_to_table,
     ReadKwargsProviderPyArrowSchemaOverride,
     RAISE_ON_EMPTY_CSV_KWARG,
+    RAISE_ON_DECIMAL_OVERFLOW,
 )
+import decimal
 from deltacat.types.media import ContentEncoding, ContentType
 from deltacat.types.partial_download import PartialParquetParameters
 from pyarrow.parquet import ParquetFile
 import pyarrow as pa
 
 PARQUET_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet"
+PARQUET_GZIP_COMPRESSED_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet.gz"
 EMPTY_UTSV_PATH = "deltacat/tests/utils/data/empty.csv"
 NON_EMPTY_VALID_UTSV_PATH = "deltacat/tests/utils/data/non_empty_valid.csv"
+OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH = (
+    "deltacat/tests/utils/data/overflowing_decimal_precision.csv"
+)
+OVERFLOWING_DECIMAL_SCALE_UTSV_PATH = (
+    "deltacat/tests/utils/data/overflowing_decimal_scale.csv"
+)
 GZIP_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.gz"
 BZ2_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.bz2"
 
@@ -407,6 +416,253 @@ class TestReadCSV(TestCase):
             ),
         )
 
+    def test_read_csv_when_decimal_precision_overflows_and_raise_kwarg_specified(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(4, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+        self.assertRaises(
+            pa.lib.ArrowInvalid,
+            lambda: pyarrow_read_csv(
+                OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH,
+                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+            ),
+        )
+
+    def test_read_csv_when_decimal_precision_overflows_sanity(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(4, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        self.assertRaises(
+            pa.lib.ArrowInvalid,
+            lambda: pyarrow_read_csv(OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH, **kwargs),
+        )
+
+    def test_read_csv_when_decimal_scale_overflows_and_raise_kwarg_specified(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        self.assertRaises(
+            pa.lib.ArrowInvalid,
+            lambda: pyarrow_read_csv(
+                OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
+                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+            ),
+        )
+
+    def test_read_csv_when_decimal_scale_overflows_sanity(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
+
+        self.assertEqual(len(result), 3)
+        self.assertEqual(
+            result[1][0].as_py(), decimal.Decimal("322236.66")
+        )  # rounding decimal
+        self.assertEqual(result[1][1].as_py(), decimal.Decimal("32.33"))  # not rounded
+        self.assertEqual(len(result.column_names), 2)
+        result_schema = result.schema
+        self.assertEqual(result_schema.field(0).type, "string")
+        self.assertEqual(result_schema.field(1).type, pa.decimal128(20, 2))
+
+    def test_read_csv_when_decimal_scale_overflows_and_negative_scale(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, -2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
+
+        self.assertEqual(len(result), 3)
+        self.assertEqual(
+            result[1][0].as_py(),
+            decimal.Decimal("322200"),  # consequence of negative scale
+        )  # rounding decimal
+        self.assertEqual(result[1][1].as_py(), decimal.Decimal("00"))
+        self.assertEqual(len(result.column_names), 2)
+        result_schema = result.schema
+        self.assertEqual(result_schema.field(0).type, "string")
+        self.assertEqual(result_schema.field(1).type, pa.decimal128(20, -2))
+
+    def test_read_csv_when_decimal_scale_overflows_with_decimal256(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal256(20, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
+
+        self.assertEqual(len(result), 3)
+        self.assertEqual(
+            result[1][0].as_py(), decimal.Decimal("322236.66")
+        )  # rounding decimal
+        self.assertEqual(result[1][1].as_py(), decimal.Decimal("32.33"))  # not rounded
+        self.assertEqual(len(result.column_names), 2)
+        result_schema = result.schema
+        self.assertEqual(result_schema.field(0).type, "string")
+        self.assertEqual(result_schema.field(1).type, pa.decimal256(20, 2))
+
+    def test_read_csv_when_decimal_scale_overflows_with_decimal256_and_raise_on_overflow(
+        self,
+    ):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal256(20, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        self.assertRaises(
+            pa.lib.ArrowNotImplementedError,
+            lambda: pyarrow_read_csv(
+                OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
+                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+            ),
+        )
+
+    def test_read_csv_when_decimal_scale_overflows_without_any_schema_then_infers(self):
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=None)
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
+
+        # The default behavior of pyarrow is to invalid skip rows
+        self.assertEqual(len(result), 2)
+        self.assertEqual(result[1][0].as_py(), 32.33)  # rounding decimal
+        self.assertEqual(result[1][1].as_py(), 0.4)  # not rounded
+        self.assertEqual(len(result.column_names), 2)
+        result_schema = result.schema
+        self.assertEqual(result_schema.field(0).type, "string")
+        self.assertEqual(result_schema.field(1).type, pa.float64())
+
+    def test_read_csv_when_decimal_scale_and_precision_overflow_and_raise_on_overflow(
+        self,
+    ):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(5, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        self.assertRaises(
+            pa.lib.ArrowInvalid,
+            lambda: pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs),
+        )
+
+    def test_read_csv_when_decimal_scale_overflow_and_file_like_obj_passed(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(15, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        with open(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, "rb") as file:
+            result = pyarrow_read_csv(file, **kwargs)
+
+        self.assertEqual(len(result), 3)
+        self.assertEqual(
+            result[1][0].as_py(), decimal.Decimal("322236.66")
+        )  # rounding decimal
+        self.assertEqual(
+            result[1][1].as_py(), decimal.Decimal("32.33")
+        )  # not rounded
+        self.assertEqual(len(result.column_names), 2)
+        result_schema = result.schema
+        self.assertEqual(result_schema.field(0).type, "string")
+        self.assertEqual(result_schema.field(1).type, pa.decimal128(15, 2))
+
 
 class TestS3FileToTable(TestCase):
     def test_s3_file_to_table_identity_sanity(self):
@@ -534,3 +790,25 @@ class TestS3FileToTable(TestCase):
         self.assertEqual(field.name, schema.field(index).name)
 
         self.assertEqual(result.schema.field(1).type, "string")
+
+    def test_s3_file_to_table_when_parquet_gzip(self):
+
+        pa_kwargs_provider = lambda content_type, kwargs: {
+            "reader_type": "pyarrow",
+            **kwargs,
+        }
+
+        result = s3_file_to_table(
+            PARQUET_GZIP_COMPRESSED_FILE_PATH,
+            ContentType.PARQUET.value,
+            ContentEncoding.GZIP.value,
+            ["n_legs", "animal"],
+            ["n_legs"],
+            pa_read_func_kwargs_provider=pa_kwargs_provider,
+        )
+
+        self.assertEqual(len(result), 6)
+        self.assertEqual(len(result.column_names), 1)
+        schema = result.schema
+        schema_index = schema.get_field_index("n_legs")
+        self.assertEqual(schema.field(schema_index).type, "int64")
deltacat/utils/pyarrow.py
CHANGED
@@ -1,6 +1,7 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations
 
+import copy
 import bz2
 import gzip
 import io
@@ -47,6 +48,18 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 RAISE_ON_EMPTY_CSV_KWARG = "raise_on_empty_csv"
 READER_TYPE_KWARG = "reader_type"
 
+"""
+By default, round decimal values using half_to_even round mode when
+rescaling a decimal to the given scale and precision in the schema would cause
+data loss. Setting any non null value of this argument will result
+in an error instead.
+"""
+RAISE_ON_DECIMAL_OVERFLOW = "raise_on_decimal_overflow"
+# Note the maximum from https://arrow.apache.org/docs/python/generated/pyarrow.Decimal256Type.html#pyarrow.Decimal256Type
+DECIMAL256_DEFAULT_SCALE = 38
+DECIMAL256_MAX_PRECISION = 76
+MAX_INT_BYTES = 2147483646
+
 
 def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Schema:
 
@@ -64,45 +77,164 @@ def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Sche
     return target_schema
 
 
-def
-
-
-
+def _extract_arrow_schema_from_read_csv_kwargs(kwargs: Dict[str, Any]) -> pa.Schema:
+    schema = None
+    if (
+        "convert_options" in kwargs
+        and kwargs["convert_options"].column_types is not None
+    ):
+        schema = kwargs["convert_options"].column_types
+        if not isinstance(schema, pa.Schema):
+            schema = pa.schema(schema)
+        if kwargs["convert_options"].include_columns:
+            schema = _filter_schema_for_columns(
+                schema, kwargs["convert_options"].include_columns
+            )
+    elif (
+        kwargs.get("read_options") is not None
+        and kwargs["read_options"].column_names
+    ):
+        schema = _filter_schema_for_columns(
+            schema, kwargs["read_options"].column_names
+        )
+    else:
+        logger.debug(
+            "Schema not specified in the kwargs."
+            " Hence, schema could not be inferred from the empty CSV."
         )
+
+    return schema
+
+
+def _new_schema_with_replaced_fields(
+    schema: pa.Schema, field_to_replace: Callable[[pa.Field], Optional[pa.Field]]
+) -> pa.Schema:
+    if schema is None:
+        return None
+
+    new_schema_fields = []
+    for field in schema:
+        new_field = field_to_replace(field)
+        if new_field is not None:
+            new_schema_fields.append(new_field)
+        else:
+            new_schema_fields.append(field)
+
+    return pa.schema(new_schema_fields, metadata=schema.metadata)
+
+
+def _read_csv_rounding_decimal_columns_to_fit_scale(
+    schema: pa.Schema, reader_args: List[Any], reader_kwargs: Dict[str, Any]
+) -> pa.Table:
+    # Note: We read decimals as strings first because CSV
+    # conversion to decimal256 isn't implemented as of pyarrow==12.0.1
+    new_schema = _new_schema_with_replaced_fields(
+        schema,
+        lambda fld: (
+            pa.field(fld.name, pa.string(), metadata=fld.metadata)
+            if pa.types.is_decimal128(fld.type) or pa.types.is_decimal256(fld.type)
+            else None
+        ),
+    )
+    new_kwargs = sanitize_kwargs_by_supported_kwargs(
+        ["read_options", "parse_options", "convert_options", "memory_pool"],
+        reader_kwargs,
+    )
+    # Creating a shallow copy for efficiency
+    new_convert_options = copy.copy(new_kwargs["convert_options"])
+    new_convert_options.column_types = new_schema
+    new_reader_kwargs = {**new_kwargs, "convert_options": new_convert_options}
+    arrow_table = pacsv.read_csv(*reader_args, **new_reader_kwargs)
+
+    for column_index, field in enumerate(schema):
+        if pa.types.is_decimal128(field.type) or pa.types.is_decimal256(field.type):
+            column_array = arrow_table[field.name]
+            # We always cast to decimal256 to accomodate fixed scale of 38
+            cast_to_type = pa.decimal256(
+                DECIMAL256_MAX_PRECISION, DECIMAL256_DEFAULT_SCALE
+            )
+            casted_decimal_array = pc.cast(column_array, cast_to_type)
+            # Note that scale can be negative
+            rounded_column_array = pc.round(
+                casted_decimal_array, ndigits=field.type.scale
+            )
+            final_decimal_array = pc.cast(rounded_column_array, field.type)
+            arrow_table = arrow_table.set_column(
+                column_index,
+                field,
+                final_decimal_array,
+            )
+            logger.debug(
+                f"Rounded decimal column: {field.name} to {field.type.scale} scale and"
+                f" {field.type.precision} precision"
+            )
+
+    return arrow_table
+
+
+def pyarrow_read_csv_default(*args, **kwargs):
+    new_kwargs = sanitize_kwargs_by_supported_kwargs(
+        ["read_options", "parse_options", "convert_options", "memory_pool"], kwargs
+    )
+
+    try:
         return pacsv.read_csv(*args, **new_kwargs)
     except pa.lib.ArrowInvalid as e:
-
-
-
-
-
-        )
-
-
-
-
-
-
-
-
-
+        error_str = e.__str__()
+        schema = _extract_arrow_schema_from_read_csv_kwargs(kwargs)
+
+        if error_str == "Empty CSV file" and not kwargs.get(RAISE_ON_EMPTY_CSV_KWARG):
+            logger.debug(f"Read CSV empty schema being used: {schema}")
+            return pa.Table.from_pylist([], schema=schema)
+        if not kwargs.get(RAISE_ON_DECIMAL_OVERFLOW):
+            # Note, this logic requires expensive casting. To prevent downgrading performance
+            # for happy path reads, we are handling this case in response to an error.
+            logger.warning(
+                "Rescaling Decimal to the given scale in the schema. "
+                f"Original error: {error_str}"
+            )
+
+            if schema is not None and "convert_options" in kwargs:
+                if (
+                    "Rescaling Decimal" in error_str
+                    and "value would cause data loss" in error_str
                 ):
-
-
+                    logger.debug(f"Checking if the file: {args[0]}...")
+                    # Since we are re-reading the file, we have to seek to beginning
+                    if isinstance(args[0], io.IOBase) and args[0].seekable():
+                        logger.debug(f"Seeking to the beginning of the file {args[0]}")
+                        args[0].seek(0)
+                    return _read_csv_rounding_decimal_columns_to_fit_scale(
+                        schema=schema, reader_args=args, reader_kwargs=kwargs
                     )
-
             else:
                 logger.debug(
-                    "Schema
-                    "
+                    "Schema is None when trying to adjust decimal values. "
+                    "Hence, bubbling up exception..."
                 )
 
-            logger.debug(f"Read CSV empty schema being used: {schema}")
-            return pa.Table.from_pylist([], schema=schema)
        raise e
 
 
+def pyarrow_read_csv(*args, **kwargs) -> pa.Table:
+    schema = _extract_arrow_schema_from_read_csv_kwargs(kwargs)
+
+    # CSV conversion to decimal256 isn't supported as of pyarrow=12.0.1
+    # Below ensures decimal256 is casted properly.
+    schema_includes_decimal256 = (
+        (True if any([pa.types.is_decimal256(x.type) for x in schema]) else False)
+        if schema is not None
+        else None
+    )
+    if schema_includes_decimal256 and not kwargs.get(RAISE_ON_DECIMAL_OVERFLOW):
+        # falling back to expensive method of reading CSV
+        return _read_csv_rounding_decimal_columns_to_fit_scale(
+            schema, reader_args=args, reader_kwargs=kwargs
+        )
+    else:
+        return pyarrow_read_csv_default(*args, **kwargs)
+
+
 CONTENT_TYPE_TO_PA_READ_FUNC: Dict[str, Callable] = {
     ContentType.UNESCAPED_TSV.value: pyarrow_read_csv,
     ContentType.TSV.value: pyarrow_read_csv,
@@ -440,8 +572,8 @@ def s3_file_to_table(
         **s3_client_kwargs,
     )
 
-
-
+    if READER_TYPE_KWARG in kwargs:
+        kwargs.pop(READER_TYPE_KWARG)
 
     filesystem = io
     if s3_url.startswith("s3://"):
@@ -783,7 +915,6 @@ def sliced_string_cast(array: pa.ChunkedArray) -> pa.ChunkedArray:
     TODO: deprecate this function when pyarrow performs proper ChunkedArray -> ChunkedArray casting
     """
     dtype = array.type
-    MAX_BYTES = 2147483646
     max_str_len = None
     if pa.types.is_integer(dtype):
        max_str_len = _int_max_string_len()
@@ -795,7 +926,7 @@ def sliced_string_cast(array: pa.ChunkedArray) -> pa.ChunkedArray:
        max_str_len = _max_decimal256_string_len()
 
    if max_str_len is not None:
-        max_elems_per_chunk =
+        max_elems_per_chunk = MAX_INT_BYTES // (2 * max_str_len)  # safety factor of 2
        all_chunks = []
        for chunk in array.chunks:
            if len(chunk) < max_elems_per_chunk:
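The decimal fallback above reads the affected columns as strings, widens them to decimal256 at the maximum scale, rounds to the schema's scale, and casts down to the requested decimal type. A standalone sketch of just that cast-round-cast step (not the deltacat reader itself), assuming a pyarrow version where string-to-decimal256 casts and decimal rounding are available, as the code above relies on:

```python
import pyarrow as pa
import pyarrow.compute as pc

DECIMAL256_MAX_PRECISION = 76
DECIMAL256_DEFAULT_SCALE = 38

# A value with more fractional digits than the target decimal128(20, 2) allows.
raw = pa.array(["322236.6666"], type=pa.string())
target_type = pa.decimal128(20, 2)

# Widen through decimal256 at the maximum scale so no digits are lost yet...
wide = pc.cast(raw, pa.decimal256(DECIMAL256_MAX_PRECISION, DECIMAL256_DEFAULT_SCALE))
# ...round to the target scale (half_to_even by default)...
rounded = pc.round(wide, ndigits=target_type.scale)
# ...then cast down to the schema's decimal type.
final = pc.cast(rounded, target_type)
print(final[0])  # 322236.67
```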
{deltacat-1.1.27.dist-info → deltacat-1.1.29.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-deltacat/__init__.py,sha256=
+deltacat/__init__.py,sha256=DoUiDxmgMh8HUGOEAG7CUY0Q9Ip-S7gePDsL8XQO5kk,1778
 deltacat/constants.py,sha256=TUJLXUJ9xq1Ryil72yLkKR8EDH_Irp5wUg56QstbRNE,2181
 deltacat/exceptions.py,sha256=7sjk3BuMY5Oo-6OvAfHncZx_OcvtEL47BblWr2F7waE,12740
 deltacat/logs.py,sha256=EQSDin1deehzz5xlLV1_TrFJrO_IBZ9Ahp7MdL-4cK8,9363
@@ -51,7 +51,7 @@ deltacat/compute/compactor/utils/sort_key.py,sha256=oK6otg-CSsma6zlGPaKg-KNEvcZR
 deltacat/compute/compactor/utils/system_columns.py,sha256=CNIgAGos0xAGEpdaQIH7KfbSRrGZgjRbItXMararqXQ,9399
 deltacat/compute/compactor_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/compaction_session.py,sha256=COtol2s63DRPbd-AN9KCiWr4exLX8x5Tvxea_7cOGEQ,8078
-deltacat/compute/compactor_v2/constants.py,sha256=
+deltacat/compute/compactor_v2/constants.py,sha256=wvd34d7RGdniGbbiJcMljxRrRas4_uy9F9UaqXfS_Ag,3034
 deltacat/compute/compactor_v2/deletes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/deletes/delete_file_envelope.py,sha256=AeuH9JRMwp6mvQf6P2cqL92hUEtResQq6qUTS0kIKac,3111
 deltacat/compute/compactor_v2/deletes/delete_strategy.py,sha256=SMEJOxR-5r92kvKNqtu2w6HmwtmhljcZX1wcNEuS-4w,2833
@@ -69,14 +69,14 @@ deltacat/compute/compactor_v2/private/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCe
 deltacat/compute/compactor_v2/private/compaction_utils.py,sha256=fMWXg1SCIIgjk9p_OFYrcm760dOKNbFO1Lj3_JI3GCY,30929
 deltacat/compute/compactor_v2/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/steps/hash_bucket.py,sha256=1R5xLUkl7GqL1nY-apAgY1czKDEHjIVYSRi9qLOMass,6726
-deltacat/compute/compactor_v2/steps/merge.py,sha256=
+deltacat/compute/compactor_v2/steps/merge.py,sha256=qxmb3cmiKvOgfuOzlJT4Q60zOyWNjsiuZSzxdh6KTm8,22909
 deltacat/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/utils/content_type_params.py,sha256=1P9CDpuWErsFcTTlRCeuUQHDokVI92he_MsL82uRAdA,7424
-deltacat/compute/compactor_v2/utils/dedupe.py,sha256=
+deltacat/compute/compactor_v2/utils/dedupe.py,sha256=Jz1QbBOdZJwT8K1vD9q01eOn7hdLNZ_AF7bJ0wficr0,1949
 deltacat/compute/compactor_v2/utils/delta.py,sha256=I7Yvda8NVbpKXG3nM2Ku1utvR2r2OpHvUMqUL2ja3aw,3626
 deltacat/compute/compactor_v2/utils/io.py,sha256=3m4dorxj-WD6Yu9_3gRE6gz3C-eNJA7nn02sHKwo-J8,6018
 deltacat/compute/compactor_v2/utils/merge.py,sha256=EV_iKhNc3WflgfLW1Q46dXUvyClx8VebWHGtninEfsI,5311
-deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=
+deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=SbQ97M1Cxld-zZik2QMSzlj20g6JlENaQx_0PhlCIP8,12034
 deltacat/compute/compactor_v2/utils/task_options.py,sha256=W0jyWIIZ0tcSAGp8mhpnu1G8p3rmX4d3juCPpAJxnDM,12649
 deltacat/compute/merge_on_read/__init__.py,sha256=ckbgngmqPjYBYz_NySsR1vNTOb_hNpeL1sYkZKvBI9M,214
 deltacat/compute/merge_on_read/daft.py,sha256=1oC38u5ig_aTrq7EzyWBo8Ui54rb6yERYMk-vEFbpxM,1400
@@ -136,7 +136,7 @@ deltacat/tests/compute/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3
 deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py,sha256=kWyIJQMyF9oBemvgOp3ngGhMpH9zjkznV-67ewELgHQ,37719
 deltacat/tests/compute/compact_partition_rebase_test_cases.py,sha256=8HVr3EIFYFqNaJoqeCuj9xIBjM4Ch2bx-mJcO4BRrLo,16839
 deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py,sha256=l_6-pAKOsRY3NbtfHsYmEaJEkq6IJueYuLsjyJxNgz4,81564
-deltacat/tests/compute/compact_partition_test_cases.py,sha256=
+deltacat/tests/compute/compact_partition_test_cases.py,sha256=HJ15Xyawv8ImFju8wDwt22fH5okoPhyS-QAygkXDb7Q,27422
 deltacat/tests/compute/test_compact_partition_incremental.py,sha256=lkfAraOJmEmieesf7b1BqlfTS26YjYM5xXOXoTMrsos,14989
 deltacat/tests/compute/test_compact_partition_multiple_rounds.py,sha256=xXBA66TTfARR90m5KQs31nmiokuMy9iGQt7Z9evyG7M,12950
 deltacat/tests/compute/test_compact_partition_params.py,sha256=Dm5eLyHo8oGMeO3XBbpj1rZqHtPZ1hAB7z2qvzc4Lxk,8497
@@ -152,9 +152,10 @@ deltacat/tests/compute/compactor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JC
 deltacat/tests/compute/compactor/utils/test_io.py,sha256=st5mlU4cVU-eQl7B4mvPgNA3izuNwbVawYOp-NcoyrI,4326
 deltacat/tests/compute/compactor/utils/test_round_completion_file.py,sha256=LAQ4usiRF4oTx4cA85L0eOcBa_Z-febc-CuzUijSGrI,7439
 deltacat/tests/compute/compactor_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=
+deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=y8nNHq9ADHENzUKMQYguB45zOD7F2lZgcBYYTvbTsdM,28957
 deltacat/tests/compute/compactor_v2/test_hashlib.py,sha256=8csF2hFWtBvY2MbX3-6iphCsVXxRp0zP1NTnKhfdmkg,328
 deltacat/tests/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py,sha256=aFb9rzT_EK9k8qAMHPtpqd5btyEmll1So1loDmZkotQ,1769
 deltacat/tests/compute/compactor_v2/utils/test_task_options.py,sha256=37DkR1u_XwhedV9cGed6FFuJTC0XmuiowHJIa_Op6uA,865
 deltacat/tests/compute/resource_estimation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/tests/compute/resource_estimation/test_delta.py,sha256=HCL2oUnCqm0E26T3HLJjMhoAsHTJIWPYGwIKRgM_H7E,25712
@@ -171,7 +172,7 @@ deltacat/tests/local_deltacat_storage/__init__.py,sha256=5T9ubNIS42-BotEH0yrUiWE
 deltacat/tests/local_deltacat_storage/exceptions.py,sha256=oxZ0psmrEO0M6P2r8gHQ2E8E-Y8UBfUCBUIwfuHcx38,251
 deltacat/tests/test_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/tests/test_utils/constants.py,sha256=UYe--9T_clYjiOpv0M7TtAMGdpje_SMZ-w8n0IeCAjc,214
-deltacat/tests/test_utils/pyarrow.py,sha256=
+deltacat/tests/test_utils/pyarrow.py,sha256=QDdGilzsJ2xUESiGotdNVZde9yD7ja9MvNhhssnox-E,3083
 deltacat/tests/test_utils/storage.py,sha256=93GEn4A5WbMHWk0Ec4Bd7RxeHoSEnBfSarfWhKOSNtM,972
 deltacat/tests/test_utils/utils.py,sha256=a32qEwcSSd1lvRi0aJJ4ZLnc1ZyXmoQF_K95zaQRk2M,455
 deltacat/tests/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -179,7 +180,7 @@ deltacat/tests/utils/test_cloudpickle.py,sha256=J0pnBY3-PxlUh6MamZAN1PuquKQPr2iy
 deltacat/tests/utils/test_daft.py,sha256=kY8lkXoQvyWunok8UvOsh1An297rb3jcnstTuIAyAlc,8232
 deltacat/tests/utils/test_metrics.py,sha256=Ym9nOz1EtB180pLmvugihj1sDTNDMb5opIjjr5Nmcls,16339
 deltacat/tests/utils/test_placement.py,sha256=g61wVOMkHe4YJeR9Oxg_BOVQ6bhHHbC3IBYv8YhUu94,597
-deltacat/tests/utils/test_pyarrow.py,sha256=
+deltacat/tests/utils/test_pyarrow.py,sha256=fDjDkGPjdRZA3kgjgiQRym9shdeDYgkdDPYU2a7IEUk,30790
 deltacat/tests/utils/test_record_batch_tables.py,sha256=AkG1WyljQmjnl-AxhbFWyo5LnMIKRyLScfgC2B_ES-s,11321
 deltacat/tests/utils/test_resources.py,sha256=HtpvDrfPZQNtGDXUlsIzc_yd7Vf1cDscZ3YbN0oTvO8,2560
 deltacat/tests/utils/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -200,7 +201,7 @@ deltacat/utils/numpy.py,sha256=SpHKKvC-K8NINTWGVfTZ5-gBFTGYqaXjjgKFhsdUjwg,2049
 deltacat/utils/pandas.py,sha256=q99mlRB7tymICMcNbfGLfLqFu_C-feyPZKZm2CWJJVc,9574
 deltacat/utils/performance.py,sha256=7ZLaMkS1ehPSIhT5uOQVBHvjC70iKHzoFquFo-KL0PI,645
 deltacat/utils/placement.py,sha256=Lj20fb-eq8rgMdm_M2MBMfDLwhDM1sS1nJj2DvIK56s,12060
-deltacat/utils/pyarrow.py,sha256=
+deltacat/utils/pyarrow.py,sha256=R3KkJPenE48rS3VrfFKSkJerX94f4e7X2dUPBQg44DY,34339
 deltacat/utils/resources.py,sha256=Ax1OgLLbZI4oYpp4Ki27OLaST-7I-AJgZwU87FVfY8g,8253
 deltacat/utils/s3fs.py,sha256=PmUJ5Fm1WmD-_zp_M6yd9VbXvIoJuBeK6ApOdJJApLE,662
 deltacat/utils/schema.py,sha256=m4Wm4ZQcpttzOUxex4dVneGlHy1_E36HspTcjNYzvVM,1564
@@ -210,8 +211,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
 deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
 deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
 deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
-deltacat-1.1.
-deltacat-1.1.
-deltacat-1.1.
-deltacat-1.1.
-deltacat-1.1.
+deltacat-1.1.29.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+deltacat-1.1.29.dist-info/METADATA,sha256=ZverlgFUJV4wGJao8tusRCv_sRNX4KJ4RTNAGvBCJes,1733
+deltacat-1.1.29.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+deltacat-1.1.29.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
+deltacat-1.1.29.dist-info/RECORD,,
{deltacat-1.1.27.dist-info → deltacat-1.1.29.dist-info}/LICENSE
File without changes
{deltacat-1.1.27.dist-info → deltacat-1.1.29.dist-info}/top_level.txt
File without changes