deltacat 1.1.27__py3-none-any.whl → 1.1.29__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor_v2/constants.py +15 -1
- deltacat/compute/compactor_v2/steps/merge.py +30 -5
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +15 -3
- deltacat/tests/compute/compact_partition_test_cases.py +32 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +133 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/test_utils/pyarrow.py +15 -8
- deltacat/tests/utils/test_pyarrow.py +278 -0
- deltacat/utils/pyarrow.py +162 -31
- {deltacat-1.1.27.dist-info → deltacat-1.1.29.dist-info}/METADATA +1 -1
- {deltacat-1.1.27.dist-info → deltacat-1.1.29.dist-info}/RECORD +16 -15
- {deltacat-1.1.27.dist-info → deltacat-1.1.29.dist-info}/WHEEL +1 -1
- {deltacat-1.1.27.dist-info → deltacat-1.1.29.dist-info}/LICENSE +0 -0
- {deltacat-1.1.27.dist-info → deltacat-1.1.29.dist-info}/top_level.txt +0 -0
deltacat/__init__.py
CHANGED
deltacat/compute/compactor_v2/constants.py
CHANGED
@@ -1,3 +1,5 @@
+from deltacat.utils.common import env_bool, env_integer
+
 TOTAL_BYTES_IN_SHA1_HASH = 20
 
 PK_DELIMITER = "L6kl7u5f"
@@ -31,7 +33,9 @@ TOTAL_MEMORY_BUFFER_PERCENTAGE = 30
 # The total size of records that will be hash bucketed at once
 # Since, sorting is nlogn, we ensure that is not performed
 # on a very large dataset for best performance.
-MAX_SIZE_OF_RECORD_BATCH_IN_GIB = 2 * 1024 * 1024 * 1024
+MAX_SIZE_OF_RECORD_BATCH_IN_GIB = env_integer(
+    "MAX_SIZE_OF_RECORD_BATCH_IN_GIB", 2 * 1024 * 1024 * 1024
+)
 
 # Whether to drop duplicates during merge.
 DROP_DUPLICATES = True
@@ -78,3 +82,13 @@ COMPACT_PARTITION_METRIC_PREFIX = "compact_partition"
 # Number of rounds to run hash/merge for a single
 # partition. (For large table support)
 DEFAULT_NUM_ROUNDS = 1
+
+# Whether to perform sha1 hashing when required to
+# optimize memory. For example, hashing is always
+# required for bucketing where it's not mandatory
+# when dropping duplicates. Setting this to True
+# will disable sha1 hashing in cases where it isn't
+# mandatory. This flag is False by default.
+SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED = env_bool(
+    "SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED", False
+)
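Both constants above are now environment-driven. A minimal sketch of overriding them for a deployment, assuming the env_integer/env_bool helpers read the named variables from the process environment and that the constants are evaluated when the module is first imported (neither detail is shown in this diff):

    import os

    # Set the variables before any deltacat compactor module is imported,
    # since the constants are assigned at module import time.
    os.environ["MAX_SIZE_OF_RECORD_BATCH_IN_GIB"] = str(4 * 1024 * 1024 * 1024)
    os.environ["SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED"] = "True"

    from deltacat.compute.compactor_v2 import constants

    print(constants.MAX_SIZE_OF_RECORD_BATCH_IN_GIB)
    print(constants.SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED)

How env_bool parses the string value is not visible here, so treat the "True" literal as illustrative.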
deltacat/compute/compactor_v2/steps/merge.py
CHANGED
@@ -7,6 +7,7 @@ import ray
 import itertools
 import time
 import pyarrow.compute as pc
+from deltacat.utils.pyarrow import MAX_INT_BYTES
 import deltacat.compute.compactor_v2.utils.merge as merge_utils
 from uuid import uuid4
 from deltacat import logs
@@ -147,10 +148,32 @@ def _merge_tables(
     if compacted_table:
         compacted_table = all_tables[0]
 
+        compacted_pk_hash_str = compacted_table[sc._PK_HASH_STRING_COLUMN_NAME]
+        incremental_pk_hash_str = incremental_table[sc._PK_HASH_STRING_COLUMN_NAME]
+
+        logger.info(
+            f"Size of compacted pk hash={compacted_pk_hash_str.nbytes} "
+            f"and incremental pk hash={incremental_pk_hash_str.nbytes}."
+        )
+
+        if (
+            compacted_table[sc._PK_HASH_STRING_COLUMN_NAME].nbytes >= MAX_INT_BYTES
+            or incremental_table[sc._PK_HASH_STRING_COLUMN_NAME].nbytes >= MAX_INT_BYTES
+        ):
+            logger.info("Casting compacted and incremental pk hash to large_string...")
+            # is_in combines the chunks of the chunked array passed which can cause
+            # ArrowCapacityError if the total size of string array is over 2GB.
+            # Using a large_string would resolve this issue.
+            # The cast here should be zero-copy in most cases.
+            compacted_pk_hash_str = pc.cast(compacted_pk_hash_str, pa.large_string())
+            incremental_pk_hash_str = pc.cast(
+                incremental_pk_hash_str, pa.large_string()
+            )
+
         records_to_keep = pc.invert(
             pc.is_in(
-                compacted_table[sc._PK_HASH_STRING_COLUMN_NAME],
-                incremental_table[sc._PK_HASH_STRING_COLUMN_NAME],
+                compacted_pk_hash_str,
+                incremental_pk_hash_str,
             )
         )
 
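For context on the large_string cast above: Arrow's plain string type uses 32-bit offsets, so any operation that combines chunks (as is_in does with the arrays it is given) is capped at roughly 2 GiB per array, while large_string uses 64-bit offsets. A small sketch of the same pattern on toy data, since an actual reproduction needs multi-gigabyte inputs:

    import pyarrow as pa
    import pyarrow.compute as pc

    compacted = pa.chunked_array([["a", "b"], ["c"]], type=pa.string())
    incremental = pa.chunked_array([["b"], ["d"]], type=pa.string())

    # string -> large_string switches to 64-bit offsets; the data buffer is
    # typically reused, so the cast is cheap even for large columns.
    compacted = pc.cast(compacted, pa.large_string())
    incremental = pc.cast(incremental, pa.large_string())

    records_to_keep = pc.invert(pc.is_in(compacted, value_set=incremental))
    print(records_to_keep)  # true, false, true (chunk layout may vary)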
@@ -492,9 +515,11 @@ def _copy_manifests_from_hash_bucketing(
 def _timed_merge(input: MergeInput) -> MergeResult:
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
-    with memray.Tracker(
-        f"merge_{worker_id}_{task_id}.bin"
-    ) if input.enable_profiler else nullcontext():
+    with (
+        memray.Tracker(f"merge_{worker_id}_{task_id}.bin")
+        if input.enable_profiler
+        else nullcontext()
+    ):
         total_input_records, total_deduped_records = 0, 0
         total_dropped_records = 0
         materialized_results: List[MaterializeResult] = []
deltacat/compute/compactor_v2/utils/dedupe.py
CHANGED
@@ -25,7 +25,7 @@ def _create_chunked_index_array(array: pa.Array) -> pa.Array:
         result[index] = np.arange(cl, dtype="int32")
 
     chunk_lengths = ([0] + chunk_lengths)[:-1]
-    result = pa.chunked_array(result + np.cumsum(chunk_lengths))
+    result = pa.chunked_array(result + np.cumsum(chunk_lengths), type=pa.int32())
     return result
 
 
deltacat/compute/compactor_v2/utils/primary_key_index.py
CHANGED
@@ -10,6 +10,7 @@ from deltacat.compute.compactor_v2.constants import (
     TOTAL_BYTES_IN_SHA1_HASH,
     PK_DELIMITER,
     MAX_SIZE_OF_RECORD_BATCH_IN_GIB,
+    SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED,
 )
 import time
 from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelope
@@ -48,6 +49,13 @@ def _is_sha1_desired(hash_columns: List[pa.Array]) -> bool:
         f"Found total length of hash column={total_len} and total_size={total_size}"
     )
 
+    if SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED:
+        logger.info(
+            f"SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED is True. "
+            f"Returning False for is_sha1_desired"
+        )
+        return False
+
     return total_size > TOTAL_BYTES_IN_SHA1_HASH * total_len
 
 
@@ -108,9 +116,10 @@ def _optimized_group_record_batches_by_hash_bucket(
     record_batches = []
     result_len = 0
     for record_batch in table_batches:
-        current_bytes += record_batch.nbytes
-        record_batches.append(record_batch)
-        if current_bytes >= MAX_SIZE_OF_RECORD_BATCH_IN_GIB:
+        if (
+            record_batches
+            and current_bytes + record_batch.nbytes >= MAX_SIZE_OF_RECORD_BATCH_IN_GIB
+        ):
             logger.info(
                 f"Total number of record batches without exceeding {MAX_SIZE_OF_RECORD_BATCH_IN_GIB} "
                 f"is {len(record_batches)} and size {current_bytes}"
@@ -128,6 +137,9 @@ def _optimized_group_record_batches_by_hash_bucket(
             current_bytes = 0
            record_batches.clear()
 
+        current_bytes += record_batch.nbytes
+        record_batches.append(record_batch)
+
     if record_batches:
         appended_len, append_latency = timed_invocation(
             _append_table_by_hash_bucket,
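The loop above is a greedy grouping under a byte budget: record batches accumulate until adding the next one would cross MAX_SIZE_OF_RECORD_BATCH_IN_GIB, the accumulated group is flushed, and only then is the current batch appended (that re-ordering is the fix in this release). A standalone sketch of the same pattern, with a hypothetical flush callback standing in for _append_table_by_hash_bucket:

    from typing import Callable, List

    import pyarrow as pa

    MAX_GROUP_BYTES = 2 * 1024 * 1024 * 1024  # mirrors the default budget


    def group_batches_under_budget(
        batches: List[pa.RecordBatch], flush: Callable[[pa.Table], None]
    ) -> None:
        group: List[pa.RecordBatch] = []
        group_bytes = 0
        for batch in batches:
            # Flush first if appending this batch would exceed the budget.
            if group and group_bytes + batch.nbytes >= MAX_GROUP_BYTES:
                flush(pa.Table.from_batches(group))
                group.clear()
                group_bytes = 0
            group_bytes += batch.nbytes
            group.append(batch)
        if group:
            flush(pa.Table.from_batches(group))


    batch = pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], names=["x"])
    group_batches_under_budget(
        [batch, batch], lambda t: print(f"flushed {t.num_rows} rows")
    )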
deltacat/tests/compute/compact_partition_test_cases.py
CHANGED
@@ -601,6 +601,38 @@ INCREMENTAL_TEST_CASES: Dict[str, IncrementalCompactionTestCaseParams] = {
         skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
         assert_compaction_audit=None,
     ),
+    "15-incremental-empty-input-with-single-hash-bucket": IncrementalCompactionTestCaseParams(
+        primary_keys={"pk_col_1"},
+        sort_keys=[SortKey.of(key_name="sk_col_1")],
+        partition_keys=ZERO_VALUED_PARTITION_KEYS_PARAM,
+        partition_values=ZERO_VALUED_PARTITION_VALUES_PARAM,
+        input_deltas=pa.Table.from_arrays(
+            [
+                pa.array([]),
+                pa.array([]),
+            ],
+            names=["pk_col_1", "sk_col_1"],
+        ),
+        input_deltas_delta_type=DeltaType.UPSERT,
+        expected_terminal_compact_partition_result=pa.Table.from_arrays(
+            [
+                pa.array([]),
+                pa.array([]),
+            ],
+            names=["pk_col_1", "sk_col_1"],
+        ),
+        expected_terminal_exception=None,
+        expected_terminal_exception_message=None,
+        do_create_placement_group=False,
+        records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
+        hash_bucket_count=1,
+        read_kwargs_provider=None,
+        drop_duplicates=True,
+        is_inplace=False,
+        add_late_deltas=None,
+        skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
+        assert_compaction_audit=assert_compaction_audit_no_hash_bucket,
+    ),
 }
 
 INCREMENTAL_TEST_CASES = with_compactor_version_func_test_param(INCREMENTAL_TEST_CASES)
deltacat/tests/compute/compactor_v2/test_compaction_session.py
CHANGED
@@ -1,6 +1,7 @@
 from typing import Dict, Any
 import ray
 import os
+import pyarrow as pa
 import pytest
 import boto3
 from deltacat.compute.compactor.model.compaction_session_audit_info import (
@@ -76,6 +77,17 @@ def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
     os.remove(DATABASE_FILE_PATH_VALUE)
 
 
+@pytest.fixture(scope="function")
+def disable_sha1(monkeypatch):
+    import deltacat.compute.compactor_v2.utils.primary_key_index
+
+    monkeypatch.setattr(
+        deltacat.compute.compactor_v2.utils.primary_key_index,
+        "SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED",
+        True,
+    )
+
+
 class TestCompactionSession:
     """
     This class adds specific tests that aren't part of the parametrized test suite.
@@ -556,3 +568,124 @@ class TestCompactionSession:
                 }
             )
         )
+
+    def test_compact_partition_when_incremental_pk_hash_is_over_2gb(
+        self, s3_resource, local_deltacat_storage_kwargs, disable_sha1
+    ):
+        """
+        A test case which ensures the compaction succeeds even if the incremental
+        arrow table size is over 2GB. It is added to prevent ArrowCapacityError
+        when running is_in operation during merge.
+
+        Note that we set SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED to bypass sha1 hashing
+        which truncates the lengths of pk strings when deduping.
+        """
+        # setup
+        staged_source = stage_partition_from_file_paths(
+            self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
+        )
+        # we create chunked array to avoid ArrowCapacityError
+        chunked_pk_array = pa.chunked_array([["13bytesstring"], ["12bytestring"]])
+        table = pa.table([chunked_pk_array], names=["pk"])
+        source_delta = commit_delta_to_staged_partition(
+            staged_source, pa_table=table, **local_deltacat_storage_kwargs
+        )
+
+        staged_dest = stage_partition_from_file_paths(
+            self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
+        )
+        dest_partition = ds.commit_partition(
+            staged_dest, **local_deltacat_storage_kwargs
+        )
+
+        # rebase first
+        rebase_url = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": dest_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 1,
+                    "last_stream_position_to_compact": source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "rebase_source_partition_locator": source_delta.partition_locator,
+                    "rebase_source_partition_high_watermark": source_delta.stream_position,
+                    "records_per_compacted_file": 4000,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": source_delta.partition_locator,
+                    "resource_estimation_method": ResourceEstimationMethod.PREVIOUS_INFLATION,
+                }
+            )
+        )
+
+        rebased_rcf = get_rcf(s3_resource, rebase_url)
+
+        assert rebased_rcf.compacted_pyarrow_write_result.files == 1
+        assert rebased_rcf.compacted_pyarrow_write_result.records == 2
+
+        # Run incremental with a small delta on source
+        chunked_pk_array = pa.chunked_array(
+            [["13bytesstring" * 95_000_000], ["12bytestring" * 95_000_000]]
+        )  # 2.3GB
+        table = pa.table([chunked_pk_array], names=["pk"])
+
+        incremental_source_delta = commit_delta_to_partition(
+            source_delta.partition_locator,
+            pa_table=table,
+            **local_deltacat_storage_kwargs,
+        )
+        assert (
+            incremental_source_delta.partition_locator == source_delta.partition_locator
+        ), "source partition locator should not change"
+        dest_partition = ds.get_partition(
+            dest_partition.stream_locator,
+            dest_partition.partition_values,
+            **local_deltacat_storage_kwargs,
+        )
+
+        assert (
+            dest_partition.locator
+            == rebased_rcf.compacted_delta_locator.partition_locator
+        ), "The new destination partition should be same as compacted partition"
+
+        # Run incremental
+        incremental_url = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": dest_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 1,
+                    "last_stream_position_to_compact": incremental_source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "records_per_compacted_file": 4000,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": incremental_source_delta.partition_locator,
+                    "resource_estimation_method": ResourceEstimationMethod.PREVIOUS_INFLATION,
+                }
+            )
+        )
+
+        incremental_rcf = get_rcf(s3_resource, incremental_url)
+
+        assert incremental_rcf.compacted_pyarrow_write_result.files == 1
+        assert (
+            incremental_rcf.compacted_pyarrow_write_result.pyarrow_bytes >= 2300000000
+        )
+        assert incremental_rcf.compacted_pyarrow_write_result.records == 4
deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py
ADDED
@@ -0,0 +1,45 @@
+import pyarrow as pa
+from deltacat.compute.compactor_v2.utils.primary_key_index import (
+    group_by_pk_hash_bucket,
+)
+
+
+class TestGroupByPkHashBucket:
+    def test_sanity(self):
+        record = pa.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
+        pk = pa.array(["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"])
+        record_batch = pa.RecordBatch.from_arrays([record, pk], names=["record", "pk"])
+        table = pa.Table.from_batches([record_batch])
+        grouped_array = group_by_pk_hash_bucket(table, 3, ["pk"])
+
+        assert len(grouped_array) == 3
+        total_records = 0
+        for arr in grouped_array:
+            if arr is not None:
+                total_records += len(arr[1])
+
+        assert total_records == len(table)
+
+    def test_when_record_batches_exceed_int_max_size(self):
+        record = pa.array(["12bytestring" * 90_000_000])
+        record_batch = pa.RecordBatch.from_arrays([record], names=["pk"])
+        table = pa.Table.from_batches([record_batch, record_batch])
+
+        grouped_array = group_by_pk_hash_bucket(table, 3, ["pk"])
+
+        assert len(grouped_array) == 3
+        # two record batches are preserved as combining them
+        # would exceed 2GB.
+        assert len(grouped_array[2].to_batches()) == 2
+
+    def test_when_record_batches_less_than_int_max_size(self):
+        record = pa.array(["12bytestring" * 90_000])
+        record_batch = pa.RecordBatch.from_arrays([record], names=["pk"])
+        table = pa.Table.from_batches([record_batch, record_batch])
+
+        grouped_array = group_by_pk_hash_bucket(table, 3, ["pk"])
+
+        assert len(grouped_array) == 3
+        # Combined the arrays into one record batch as the size
+        # would not exceed 2GB.
+        assert len(grouped_array[1].to_batches()) == 1
deltacat/tests/test_utils/pyarrow.py
CHANGED
@@ -47,7 +47,8 @@ def stage_partition_from_file_paths(
 
 def commit_delta_to_staged_partition(
     staged_partition,
-    file_paths: List[str],
+    file_paths: List[str] = None,
+    pa_table: pa.Table = None,
     content_type: ContentType = ContentType.PARQUET,
     *args,
     **kwargs,
@@ -57,6 +58,7 @@ def commit_delta_to_staged_partition(
         *args,
         file_paths=file_paths,
         content_type=content_type,
+        pa_table=pa_table,
         **kwargs,
     )
     ds.commit_partition(staged_partition, **kwargs)
@@ -76,23 +78,28 @@ def download_delta(delta_like: Union[Delta, DeltaLocator], *args, **kwargs) -> D
 
 def commit_delta_to_partition(
     partition: Union[Partition, PartitionLocator],
-    file_paths: List[str],
+    file_paths: List[str] = None,
+    pa_table: pa.Table = None,
     content_type: ContentType = ContentType.PARQUET,
     *args,
    **kwargs,
 ) -> Delta:
-    tables = []
 
     if isinstance(partition, PartitionLocator):
         partition = ds.get_partition(
             partition.stream_locator, partition.partition_values, *args, **kwargs
         )
+    if pa_table is None:
+        assert file_paths is not None, "One of pa_table or file_paths must be passed."
+        tables = []
+        for file_path in file_paths:
+            table = pa.csv.read_csv(file_path)
+            tables.append(table)
 
-    for file_path in file_paths:
-        table = pa.csv.read_csv(file_path)
-        tables.append(table)
+        pa_table = pa.concat_tables(tables)
 
-    table = pa.concat_tables(tables)
-    staged_delta = ds.stage_delta(table, partition, content_type=content_type, **kwargs)
+    staged_delta = ds.stage_delta(
+        pa_table, partition, content_type=content_type, **kwargs
+    )
 
     return ds.commit_delta(staged_delta, **kwargs)
deltacat/tests/utils/test_pyarrow.py
CHANGED
@@ -7,15 +7,24 @@ from deltacat.utils.pyarrow import (
     s3_file_to_table,
     ReadKwargsProviderPyArrowSchemaOverride,
     RAISE_ON_EMPTY_CSV_KWARG,
+    RAISE_ON_DECIMAL_OVERFLOW,
 )
+import decimal
 from deltacat.types.media import ContentEncoding, ContentType
 from deltacat.types.partial_download import PartialParquetParameters
 from pyarrow.parquet import ParquetFile
 import pyarrow as pa
 
 PARQUET_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet"
+PARQUET_GZIP_COMPRESSED_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet.gz"
 EMPTY_UTSV_PATH = "deltacat/tests/utils/data/empty.csv"
 NON_EMPTY_VALID_UTSV_PATH = "deltacat/tests/utils/data/non_empty_valid.csv"
+OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH = (
+    "deltacat/tests/utils/data/overflowing_decimal_precision.csv"
+)
+OVERFLOWING_DECIMAL_SCALE_UTSV_PATH = (
+    "deltacat/tests/utils/data/overflowing_decimal_scale.csv"
+)
 GZIP_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.gz"
 BZ2_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.bz2"
 
@@ -407,6 +416,253 @@ class TestReadCSV(TestCase):
             ),
         )
 
+    def test_read_csv_when_decimal_precision_overflows_and_raise_kwarg_specified(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(4, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+        self.assertRaises(
+            pa.lib.ArrowInvalid,
+            lambda: pyarrow_read_csv(
+                OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH,
+                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+            ),
+        )
+
+    def test_read_csv_when_decimal_precision_overflows_sanity(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(4, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        self.assertRaises(
+            pa.lib.ArrowInvalid,
+            lambda: pyarrow_read_csv(OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH, **kwargs),
+        )
+
+    def test_read_csv_when_decimal_scale_overflows_and_raise_kwarg_specified(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        self.assertRaises(
+            pa.lib.ArrowInvalid,
+            lambda: pyarrow_read_csv(
+                OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
+                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+            ),
+        )
+
+    def test_read_csv_when_decimal_scale_overflows_sanity(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
+
+        self.assertEqual(len(result), 3)
+        self.assertEqual(
+            result[1][0].as_py(), decimal.Decimal("322236.66")
+        )  # rounding decimal
+        self.assertEqual(result[1][1].as_py(), decimal.Decimal("32.33"))  # not rounded
+        self.assertEqual(len(result.column_names), 2)
+        result_schema = result.schema
+        self.assertEqual(result_schema.field(0).type, "string")
+        self.assertEqual(result_schema.field(1).type, pa.decimal128(20, 2))
+
+    def test_read_csv_when_decimal_scale_overflows_and_negative_scale(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, -2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
+
+        self.assertEqual(len(result), 3)
+        self.assertEqual(
+            result[1][0].as_py(),
+            decimal.Decimal("322200"),  # consequence of negative scale
+        )  # rounding decimal
+        self.assertEqual(result[1][1].as_py(), decimal.Decimal("00"))
+        self.assertEqual(len(result.column_names), 2)
+        result_schema = result.schema
+        self.assertEqual(result_schema.field(0).type, "string")
+        self.assertEqual(result_schema.field(1).type, pa.decimal128(20, -2))
+
+    def test_read_csv_when_decimal_scale_overflows_with_decimal256(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal256(20, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
+
+        self.assertEqual(len(result), 3)
+        self.assertEqual(
+            result[1][0].as_py(), decimal.Decimal("322236.66")
+        )  # rounding decimal
+        self.assertEqual(result[1][1].as_py(), decimal.Decimal("32.33"))  # not rounded
+        self.assertEqual(len(result.column_names), 2)
+        result_schema = result.schema
+        self.assertEqual(result_schema.field(0).type, "string")
+        self.assertEqual(result_schema.field(1).type, pa.decimal256(20, 2))
+
+    def test_read_csv_when_decimal_scale_overflows_with_decimal256_and_raise_on_overflow(
+        self,
+    ):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal256(20, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        self.assertRaises(
+            pa.lib.ArrowNotImplementedError,
+            lambda: pyarrow_read_csv(
+                OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
+                **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+            ),
+        )
+
+    def test_read_csv_when_decimal_scale_overflows_without_any_schema_then_infers(self):
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=None)
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
+
+        # The default behavior of pyarrow is to invalid skip rows
+        self.assertEqual(len(result), 2)
+        self.assertEqual(result[1][0].as_py(), 32.33)  # rounding decimal
+        self.assertEqual(result[1][1].as_py(), 0.4)  # not rounded
+        self.assertEqual(len(result.column_names), 2)
+        result_schema = result.schema
+        self.assertEqual(result_schema.field(0).type, "string")
+        self.assertEqual(result_schema.field(1).type, pa.float64())
+
+    def test_read_csv_when_decimal_scale_and_precision_overflow_and_raise_on_overflow(
+        self,
+    ):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(5, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        self.assertRaises(
+            pa.lib.ArrowInvalid,
+            lambda: pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs),
+        )
+
+    def test_read_csv_when_decimal_scale_overflow_and_file_like_obj_passed(self):
+        schema = pa.schema(
+            [("is_active", pa.string()), ("decimal_value", pa.decimal128(15, 2))]
+        )
+        kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+        _add_column_kwargs(
+            ContentType.UNESCAPED_TSV.value,
+            ["is_active", "decimal_value"],
+            ["is_active", "decimal_value"],
+            kwargs,
+        )
+
+        read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+        kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+        with open(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, "rb") as file:
+            result = pyarrow_read_csv(file, **kwargs)
+
+            self.assertEqual(len(result), 3)
+            self.assertEqual(
+                result[1][0].as_py(), decimal.Decimal("322236.66")
+            )  # rounding decimal
+            self.assertEqual(
+                result[1][1].as_py(), decimal.Decimal("32.33")
+            )  # not rounded
+            self.assertEqual(len(result.column_names), 2)
+            result_schema = result.schema
+            self.assertEqual(result_schema.field(0).type, "string")
+            self.assertEqual(result_schema.field(1).type, pa.decimal128(15, 2))
+
 
 class TestS3FileToTable(TestCase):
     def test_s3_file_to_table_identity_sanity(self):
@@ -534,3 +790,25 @@ class TestS3FileToTable(TestCase):
             self.assertEqual(field.name, schema.field(index).name)
 
         self.assertEqual(result.schema.field(1).type, "string")
+
+    def test_s3_file_to_table_when_parquet_gzip(self):
+
+        pa_kwargs_provider = lambda content_type, kwargs: {
+            "reader_type": "pyarrow",
+            **kwargs,
+        }
+
+        result = s3_file_to_table(
+            PARQUET_GZIP_COMPRESSED_FILE_PATH,
+            ContentType.PARQUET.value,
+            ContentEncoding.GZIP.value,
+            ["n_legs", "animal"],
+            ["n_legs"],
+            pa_read_func_kwargs_provider=pa_kwargs_provider,
+        )
+
+        self.assertEqual(len(result), 6)
+        self.assertEqual(len(result.column_names), 1)
+        schema = result.schema
+        schema_index = schema.get_field_index("n_legs")
+        self.assertEqual(schema.field(schema_index).type, "int64")
deltacat/utils/pyarrow.py
CHANGED
@@ -1,6 +1,7 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations
 
+import copy
 import bz2
 import gzip
 import io
@@ -47,6 +48,18 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 RAISE_ON_EMPTY_CSV_KWARG = "raise_on_empty_csv"
 READER_TYPE_KWARG = "reader_type"
 
+"""
+By default, round decimal values using half_to_even round mode when
+rescaling a decimal to the given scale and precision in the schema would cause
+data loss. Setting any non null value of this argument will result
+in an error instead.
+"""
+RAISE_ON_DECIMAL_OVERFLOW = "raise_on_decimal_overflow"
+# Note the maximum from https://arrow.apache.org/docs/python/generated/pyarrow.Decimal256Type.html#pyarrow.Decimal256Type
+DECIMAL256_DEFAULT_SCALE = 38
+DECIMAL256_MAX_PRECISION = 76
+MAX_INT_BYTES = 2147483646
+
 
 def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Schema:
 
@@ -64,45 +77,164 @@ def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Sche
     return target_schema
 
 
-def pyarrow_read_csv(*args, **kwargs) -> pa.Table:
-    new_kwargs = sanitize_kwargs_by_supported_kwargs(
-        ["read_options", "parse_options", "convert_options", "memory_pool"], kwargs
-    )
-    try:
-        return pacsv.read_csv(*args, **new_kwargs)
-    except pa.lib.ArrowInvalid as e:
-        if e.__str__() == "Empty CSV file" and not kwargs.get(RAISE_ON_EMPTY_CSV_KWARG):
-            schema = None
-            if (
-                "convert_options" in kwargs
-                and kwargs["convert_options"].column_types is not None
-            ):
-                schema = kwargs["convert_options"].column_types
-                if not isinstance(schema, pa.Schema):
-                    schema = pa.schema(schema)
-                if kwargs["convert_options"].include_columns:
-                    schema = _filter_schema_for_columns(
-                        schema, kwargs["convert_options"].include_columns
-                    )
-            elif (
-                kwargs.get("read_options") is not None
-                and kwargs["read_options"].column_names
-            ):
-                schema = _filter_schema_for_columns(
-                    schema, kwargs["read_options"].column_names
-                )
-            else:
-                logger.debug(
-                    "Schema not specified in the kwargs."
-                    " Hence, schema could not be inferred from the empty CSV."
-                )
-
-            logger.debug(f"Read CSV empty schema being used: {schema}")
-            return pa.Table.from_pylist([], schema=schema)
-        raise e
+def _extract_arrow_schema_from_read_csv_kwargs(kwargs: Dict[str, Any]) -> pa.Schema:
+    schema = None
+    if (
+        "convert_options" in kwargs
+        and kwargs["convert_options"].column_types is not None
+    ):
+        schema = kwargs["convert_options"].column_types
+        if not isinstance(schema, pa.Schema):
+            schema = pa.schema(schema)
+        if kwargs["convert_options"].include_columns:
+            schema = _filter_schema_for_columns(
+                schema, kwargs["convert_options"].include_columns
+            )
+    elif (
+        kwargs.get("read_options") is not None
+        and kwargs["read_options"].column_names
+    ):
+        schema = _filter_schema_for_columns(
+            schema, kwargs["read_options"].column_names
+        )
+    else:
+        logger.debug(
+            "Schema not specified in the kwargs."
+            " Hence, schema could not be inferred from the empty CSV."
+        )
+
+    return schema
+
+
+def _new_schema_with_replaced_fields(
+    schema: pa.Schema, field_to_replace: Callable[[pa.Field], Optional[pa.Field]]
+) -> pa.Schema:
+    if schema is None:
+        return None
+
+    new_schema_fields = []
+    for field in schema:
+        new_field = field_to_replace(field)
+        if new_field is not None:
+            new_schema_fields.append(new_field)
+        else:
+            new_schema_fields.append(field)
+
+    return pa.schema(new_schema_fields, metadata=schema.metadata)
+
+
+def _read_csv_rounding_decimal_columns_to_fit_scale(
+    schema: pa.Schema, reader_args: List[Any], reader_kwargs: Dict[str, Any]
+) -> pa.Table:
+    # Note: We read decimals as strings first because CSV
+    # conversion to decimal256 isn't implemented as of pyarrow==12.0.1
+    new_schema = _new_schema_with_replaced_fields(
+        schema,
+        lambda fld: (
+            pa.field(fld.name, pa.string(), metadata=fld.metadata)
+            if pa.types.is_decimal128(fld.type) or pa.types.is_decimal256(fld.type)
+            else None
+        ),
+    )
+    new_kwargs = sanitize_kwargs_by_supported_kwargs(
+        ["read_options", "parse_options", "convert_options", "memory_pool"],
+        reader_kwargs,
+    )
+    # Creating a shallow copy for efficiency
+    new_convert_options = copy.copy(new_kwargs["convert_options"])
+    new_convert_options.column_types = new_schema
+    new_reader_kwargs = {**new_kwargs, "convert_options": new_convert_options}
+    arrow_table = pacsv.read_csv(*reader_args, **new_reader_kwargs)
+
+    for column_index, field in enumerate(schema):
+        if pa.types.is_decimal128(field.type) or pa.types.is_decimal256(field.type):
+            column_array = arrow_table[field.name]
+            # We always cast to decimal256 to accomodate fixed scale of 38
+            cast_to_type = pa.decimal256(
+                DECIMAL256_MAX_PRECISION, DECIMAL256_DEFAULT_SCALE
+            )
+            casted_decimal_array = pc.cast(column_array, cast_to_type)
+            # Note that scale can be negative
+            rounded_column_array = pc.round(
+                casted_decimal_array, ndigits=field.type.scale
+            )
+            final_decimal_array = pc.cast(rounded_column_array, field.type)
+            arrow_table = arrow_table.set_column(
+                column_index,
+                field,
+                final_decimal_array,
+            )
+            logger.debug(
+                f"Rounded decimal column: {field.name} to {field.type.scale} scale and"
+                f" {field.type.precision} precision"
+            )
+
+    return arrow_table
+
+
+def pyarrow_read_csv_default(*args, **kwargs):
+    new_kwargs = sanitize_kwargs_by_supported_kwargs(
+        ["read_options", "parse_options", "convert_options", "memory_pool"], kwargs
+    )
+
+    try:
+        return pacsv.read_csv(*args, **new_kwargs)
+    except pa.lib.ArrowInvalid as e:
+        error_str = e.__str__()
+        schema = _extract_arrow_schema_from_read_csv_kwargs(kwargs)
+
+        if error_str == "Empty CSV file" and not kwargs.get(RAISE_ON_EMPTY_CSV_KWARG):
+            logger.debug(f"Read CSV empty schema being used: {schema}")
+            return pa.Table.from_pylist([], schema=schema)
+        if not kwargs.get(RAISE_ON_DECIMAL_OVERFLOW):
+            # Note, this logic requires expensive casting. To prevent downgrading performance
+            # for happy path reads, we are handling this case in response to an error.
+            logger.warning(
+                "Rescaling Decimal to the given scale in the schema. "
+                f"Original error: {error_str}"
+            )
+
+            if schema is not None and "convert_options" in kwargs:
+                if (
+                    "Rescaling Decimal" in error_str
+                    and "value would cause data loss" in error_str
+                ):
+                    logger.debug(f"Checking if the file: {args[0]}...")
+                    # Since we are re-reading the file, we have to seek to beginning
+                    if isinstance(args[0], io.IOBase) and args[0].seekable():
+                        logger.debug(f"Seeking to the beginning of the file {args[0]}")
+                        args[0].seek(0)
+                    return _read_csv_rounding_decimal_columns_to_fit_scale(
+                        schema=schema, reader_args=args, reader_kwargs=kwargs
+                    )
+            else:
+                logger.debug(
+                    "Schema is None when trying to adjust decimal values. "
+                    "Hence, bubbling up exception..."
+                )
+
+        raise e
 
 
+def pyarrow_read_csv(*args, **kwargs) -> pa.Table:
+    schema = _extract_arrow_schema_from_read_csv_kwargs(kwargs)
+
+    # CSV conversion to decimal256 isn't supported as of pyarrow=12.0.1
+    # Below ensures decimal256 is casted properly.
+    schema_includes_decimal256 = (
+        (True if any([pa.types.is_decimal256(x.type) for x in schema]) else False)
+        if schema is not None
+        else None
+    )
+    if schema_includes_decimal256 and not kwargs.get(RAISE_ON_DECIMAL_OVERFLOW):
+        # falling back to expensive method of reading CSV
+        return _read_csv_rounding_decimal_columns_to_fit_scale(
+            schema, reader_args=args, reader_kwargs=kwargs
+        )
+    else:
+        return pyarrow_read_csv_default(*args, **kwargs)
+
+
 CONTENT_TYPE_TO_PA_READ_FUNC: Dict[str, Callable] = {
     ContentType.UNESCAPED_TSV.value: pyarrow_read_csv,
     ContentType.TSV.value: pyarrow_read_csv,
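The fallback path above handles each declared decimal column in three steps: read it as text, cast it up to decimal256(76, 38), round to the scale declared in the schema, then cast down to the declared type. A condensed sketch of that sequence on an in-memory column, assuming a pyarrow version where string-to-decimal casts and decimal rounding are implemented (the new code itself depends on both):

    import pyarrow as pa
    import pyarrow.compute as pc

    # "322236.6623" carries more fractional digits than decimal128(20, 2)
    # allows, so a direct cast would raise the "would cause data loss" error.
    raw = pa.array(["322236.6623", "32.33"], type=pa.string())

    widened = pc.cast(raw, pa.decimal256(76, 38))   # max precision and scale
    rounded = pc.round(widened, ndigits=2)          # half_to_even by default
    final = pc.cast(rounded, pa.decimal128(20, 2))  # the declared schema type

    print(final)  # 322236.66 and 32.33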
@@ -440,8 +572,8 @@ def s3_file_to_table(
                 **s3_client_kwargs,
             )
 
-        if READER_TYPE_KWARG in kwargs:
-            kwargs.pop(READER_TYPE_KWARG)
+    if READER_TYPE_KWARG in kwargs:
+        kwargs.pop(READER_TYPE_KWARG)
 
     filesystem = io
     if s3_url.startswith("s3://"):
@@ -783,7 +915,6 @@ def sliced_string_cast(array: pa.ChunkedArray) -> pa.ChunkedArray:
     TODO: deprecate this function when pyarrow performs proper ChunkedArray -> ChunkedArray casting
     """
     dtype = array.type
-    MAX_BYTES = 2147483646
     max_str_len = None
     if pa.types.is_integer(dtype):
         max_str_len = _int_max_string_len()
@@ -795,7 +926,7 @@ def sliced_string_cast(array: pa.ChunkedArray) -> pa.ChunkedArray:
         max_str_len = _max_decimal256_string_len()
 
     if max_str_len is not None:
-        max_elems_per_chunk = MAX_BYTES // (2 * max_str_len)  # safety factor of 2
+        max_elems_per_chunk = MAX_INT_BYTES // (2 * max_str_len)  # safety factor of 2
         all_chunks = []
         for chunk in array.chunks:
             if len(chunk) < max_elems_per_chunk:
{deltacat-1.1.27.dist-info → deltacat-1.1.29.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-deltacat/__init__.py,sha256=
+deltacat/__init__.py,sha256=DoUiDxmgMh8HUGOEAG7CUY0Q9Ip-S7gePDsL8XQO5kk,1778
 deltacat/constants.py,sha256=TUJLXUJ9xq1Ryil72yLkKR8EDH_Irp5wUg56QstbRNE,2181
 deltacat/exceptions.py,sha256=7sjk3BuMY5Oo-6OvAfHncZx_OcvtEL47BblWr2F7waE,12740
 deltacat/logs.py,sha256=EQSDin1deehzz5xlLV1_TrFJrO_IBZ9Ahp7MdL-4cK8,9363
@@ -51,7 +51,7 @@ deltacat/compute/compactor/utils/sort_key.py,sha256=oK6otg-CSsma6zlGPaKg-KNEvcZR
 deltacat/compute/compactor/utils/system_columns.py,sha256=CNIgAGos0xAGEpdaQIH7KfbSRrGZgjRbItXMararqXQ,9399
 deltacat/compute/compactor_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/compaction_session.py,sha256=COtol2s63DRPbd-AN9KCiWr4exLX8x5Tvxea_7cOGEQ,8078
-deltacat/compute/compactor_v2/constants.py,sha256=
+deltacat/compute/compactor_v2/constants.py,sha256=wvd34d7RGdniGbbiJcMljxRrRas4_uy9F9UaqXfS_Ag,3034
 deltacat/compute/compactor_v2/deletes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/deletes/delete_file_envelope.py,sha256=AeuH9JRMwp6mvQf6P2cqL92hUEtResQq6qUTS0kIKac,3111
 deltacat/compute/compactor_v2/deletes/delete_strategy.py,sha256=SMEJOxR-5r92kvKNqtu2w6HmwtmhljcZX1wcNEuS-4w,2833
@@ -69,14 +69,14 @@ deltacat/compute/compactor_v2/private/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCe
 deltacat/compute/compactor_v2/private/compaction_utils.py,sha256=fMWXg1SCIIgjk9p_OFYrcm760dOKNbFO1Lj3_JI3GCY,30929
 deltacat/compute/compactor_v2/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/steps/hash_bucket.py,sha256=1R5xLUkl7GqL1nY-apAgY1czKDEHjIVYSRi9qLOMass,6726
-deltacat/compute/compactor_v2/steps/merge.py,sha256=
+deltacat/compute/compactor_v2/steps/merge.py,sha256=qxmb3cmiKvOgfuOzlJT4Q60zOyWNjsiuZSzxdh6KTm8,22909
 deltacat/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/utils/content_type_params.py,sha256=1P9CDpuWErsFcTTlRCeuUQHDokVI92he_MsL82uRAdA,7424
-deltacat/compute/compactor_v2/utils/dedupe.py,sha256=
+deltacat/compute/compactor_v2/utils/dedupe.py,sha256=Jz1QbBOdZJwT8K1vD9q01eOn7hdLNZ_AF7bJ0wficr0,1949
 deltacat/compute/compactor_v2/utils/delta.py,sha256=I7Yvda8NVbpKXG3nM2Ku1utvR2r2OpHvUMqUL2ja3aw,3626
 deltacat/compute/compactor_v2/utils/io.py,sha256=3m4dorxj-WD6Yu9_3gRE6gz3C-eNJA7nn02sHKwo-J8,6018
 deltacat/compute/compactor_v2/utils/merge.py,sha256=EV_iKhNc3WflgfLW1Q46dXUvyClx8VebWHGtninEfsI,5311
-deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=
+deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=SbQ97M1Cxld-zZik2QMSzlj20g6JlENaQx_0PhlCIP8,12034
 deltacat/compute/compactor_v2/utils/task_options.py,sha256=W0jyWIIZ0tcSAGp8mhpnu1G8p3rmX4d3juCPpAJxnDM,12649
 deltacat/compute/merge_on_read/__init__.py,sha256=ckbgngmqPjYBYz_NySsR1vNTOb_hNpeL1sYkZKvBI9M,214
 deltacat/compute/merge_on_read/daft.py,sha256=1oC38u5ig_aTrq7EzyWBo8Ui54rb6yERYMk-vEFbpxM,1400
@@ -136,7 +136,7 @@ deltacat/tests/compute/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3
 deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py,sha256=kWyIJQMyF9oBemvgOp3ngGhMpH9zjkznV-67ewELgHQ,37719
 deltacat/tests/compute/compact_partition_rebase_test_cases.py,sha256=8HVr3EIFYFqNaJoqeCuj9xIBjM4Ch2bx-mJcO4BRrLo,16839
 deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py,sha256=l_6-pAKOsRY3NbtfHsYmEaJEkq6IJueYuLsjyJxNgz4,81564
-deltacat/tests/compute/compact_partition_test_cases.py,sha256=
+deltacat/tests/compute/compact_partition_test_cases.py,sha256=HJ15Xyawv8ImFju8wDwt22fH5okoPhyS-QAygkXDb7Q,27422
 deltacat/tests/compute/test_compact_partition_incremental.py,sha256=lkfAraOJmEmieesf7b1BqlfTS26YjYM5xXOXoTMrsos,14989
 deltacat/tests/compute/test_compact_partition_multiple_rounds.py,sha256=xXBA66TTfARR90m5KQs31nmiokuMy9iGQt7Z9evyG7M,12950
 deltacat/tests/compute/test_compact_partition_params.py,sha256=Dm5eLyHo8oGMeO3XBbpj1rZqHtPZ1hAB7z2qvzc4Lxk,8497
@@ -152,9 +152,10 @@ deltacat/tests/compute/compactor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JC
 deltacat/tests/compute/compactor/utils/test_io.py,sha256=st5mlU4cVU-eQl7B4mvPgNA3izuNwbVawYOp-NcoyrI,4326
 deltacat/tests/compute/compactor/utils/test_round_completion_file.py,sha256=LAQ4usiRF4oTx4cA85L0eOcBa_Z-febc-CuzUijSGrI,7439
 deltacat/tests/compute/compactor_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=
+deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=y8nNHq9ADHENzUKMQYguB45zOD7F2lZgcBYYTvbTsdM,28957
 deltacat/tests/compute/compactor_v2/test_hashlib.py,sha256=8csF2hFWtBvY2MbX3-6iphCsVXxRp0zP1NTnKhfdmkg,328
 deltacat/tests/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py,sha256=aFb9rzT_EK9k8qAMHPtpqd5btyEmll1So1loDmZkotQ,1769
 deltacat/tests/compute/compactor_v2/utils/test_task_options.py,sha256=37DkR1u_XwhedV9cGed6FFuJTC0XmuiowHJIa_Op6uA,865
 deltacat/tests/compute/resource_estimation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/tests/compute/resource_estimation/test_delta.py,sha256=HCL2oUnCqm0E26T3HLJjMhoAsHTJIWPYGwIKRgM_H7E,25712
@@ -171,7 +172,7 @@ deltacat/tests/local_deltacat_storage/__init__.py,sha256=5T9ubNIS42-BotEH0yrUiWE
 deltacat/tests/local_deltacat_storage/exceptions.py,sha256=oxZ0psmrEO0M6P2r8gHQ2E8E-Y8UBfUCBUIwfuHcx38,251
 deltacat/tests/test_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/tests/test_utils/constants.py,sha256=UYe--9T_clYjiOpv0M7TtAMGdpje_SMZ-w8n0IeCAjc,214
-deltacat/tests/test_utils/pyarrow.py,sha256=
+deltacat/tests/test_utils/pyarrow.py,sha256=QDdGilzsJ2xUESiGotdNVZde9yD7ja9MvNhhssnox-E,3083
 deltacat/tests/test_utils/storage.py,sha256=93GEn4A5WbMHWk0Ec4Bd7RxeHoSEnBfSarfWhKOSNtM,972
 deltacat/tests/test_utils/utils.py,sha256=a32qEwcSSd1lvRi0aJJ4ZLnc1ZyXmoQF_K95zaQRk2M,455
 deltacat/tests/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -179,7 +180,7 @@ deltacat/tests/utils/test_cloudpickle.py,sha256=J0pnBY3-PxlUh6MamZAN1PuquKQPr2iy
 deltacat/tests/utils/test_daft.py,sha256=kY8lkXoQvyWunok8UvOsh1An297rb3jcnstTuIAyAlc,8232
 deltacat/tests/utils/test_metrics.py,sha256=Ym9nOz1EtB180pLmvugihj1sDTNDMb5opIjjr5Nmcls,16339
 deltacat/tests/utils/test_placement.py,sha256=g61wVOMkHe4YJeR9Oxg_BOVQ6bhHHbC3IBYv8YhUu94,597
-deltacat/tests/utils/test_pyarrow.py,sha256=
+deltacat/tests/utils/test_pyarrow.py,sha256=fDjDkGPjdRZA3kgjgiQRym9shdeDYgkdDPYU2a7IEUk,30790
 deltacat/tests/utils/test_record_batch_tables.py,sha256=AkG1WyljQmjnl-AxhbFWyo5LnMIKRyLScfgC2B_ES-s,11321
 deltacat/tests/utils/test_resources.py,sha256=HtpvDrfPZQNtGDXUlsIzc_yd7Vf1cDscZ3YbN0oTvO8,2560
 deltacat/tests/utils/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -200,7 +201,7 @@ deltacat/utils/numpy.py,sha256=SpHKKvC-K8NINTWGVfTZ5-gBFTGYqaXjjgKFhsdUjwg,2049
 deltacat/utils/pandas.py,sha256=q99mlRB7tymICMcNbfGLfLqFu_C-feyPZKZm2CWJJVc,9574
 deltacat/utils/performance.py,sha256=7ZLaMkS1ehPSIhT5uOQVBHvjC70iKHzoFquFo-KL0PI,645
 deltacat/utils/placement.py,sha256=Lj20fb-eq8rgMdm_M2MBMfDLwhDM1sS1nJj2DvIK56s,12060
-deltacat/utils/pyarrow.py,sha256=
+deltacat/utils/pyarrow.py,sha256=R3KkJPenE48rS3VrfFKSkJerX94f4e7X2dUPBQg44DY,34339
 deltacat/utils/resources.py,sha256=Ax1OgLLbZI4oYpp4Ki27OLaST-7I-AJgZwU87FVfY8g,8253
 deltacat/utils/s3fs.py,sha256=PmUJ5Fm1WmD-_zp_M6yd9VbXvIoJuBeK6ApOdJJApLE,662
 deltacat/utils/schema.py,sha256=m4Wm4ZQcpttzOUxex4dVneGlHy1_E36HspTcjNYzvVM,1564
@@ -210,8 +211,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
 deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
 deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
 deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
-deltacat-1.1.
-deltacat-1.1.
-deltacat-1.1.
-deltacat-1.1.
-deltacat-1.1.
+deltacat-1.1.29.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+deltacat-1.1.29.dist-info/METADATA,sha256=ZverlgFUJV4wGJao8tusRCv_sRNX4KJ4RTNAGvBCJes,1733
+deltacat-1.1.29.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+deltacat-1.1.29.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
+deltacat-1.1.29.dist-info/RECORD,,

{deltacat-1.1.27.dist-info → deltacat-1.1.29.dist-info}/LICENSE
File without changes
{deltacat-1.1.27.dist-info → deltacat-1.1.29.dist-info}/top_level.txt
File without changes