deltacat 1.1.28__py3-none-any.whl → 1.1.29__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor_v2/constants.py +15 -1
- deltacat/compute/compactor_v2/steps/merge.py +30 -5
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +15 -3
- deltacat/tests/compute/compact_partition_test_cases.py +32 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +133 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/test_utils/pyarrow.py +15 -8
- deltacat/tests/utils/test_pyarrow.py +23 -0
- deltacat/utils/pyarrow.py +9 -7
- {deltacat-1.1.28.dist-info → deltacat-1.1.29.dist-info}/METADATA +1 -1
- {deltacat-1.1.28.dist-info → deltacat-1.1.29.dist-info}/RECORD +16 -15
- {deltacat-1.1.28.dist-info → deltacat-1.1.29.dist-info}/LICENSE +0 -0
- {deltacat-1.1.28.dist-info → deltacat-1.1.29.dist-info}/WHEEL +0 -0
- {deltacat-1.1.28.dist-info → deltacat-1.1.29.dist-info}/top_level.txt +0 -0
deltacat/__init__.py CHANGED
deltacat/compute/compactor_v2/constants.py CHANGED
@@ -1,3 +1,5 @@
+from deltacat.utils.common import env_bool, env_integer
+
 TOTAL_BYTES_IN_SHA1_HASH = 20
 
 PK_DELIMITER = "L6kl7u5f"
@@ -31,7 +33,9 @@ TOTAL_MEMORY_BUFFER_PERCENTAGE = 30
 # The total size of records that will be hash bucketed at once
 # Since, sorting is nlogn, we ensure that is not performed
 # on a very large dataset for best performance.
-MAX_SIZE_OF_RECORD_BATCH_IN_GIB =
+MAX_SIZE_OF_RECORD_BATCH_IN_GIB = env_integer(
+    "MAX_SIZE_OF_RECORD_BATCH_IN_GIB", 2 * 1024 * 1024 * 1024
+)
 
 # Whether to drop duplicates during merge.
 DROP_DUPLICATES = True
@@ -78,3 +82,13 @@ COMPACT_PARTITION_METRIC_PREFIX = "compact_partition"
 # Number of rounds to run hash/merge for a single
 # partition. (For large table support)
 DEFAULT_NUM_ROUNDS = 1
+
+# Whether to perform sha1 hashing when required to
+# optimize memory. For example, hashing is always
+# required for bucketing where it's not mandatory
+# when dropping duplicates. Setting this to True
+# will disable sha1 hashing in cases where it isn't
+# mandatory. This flag is False by default.
+SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED = env_bool(
+    "SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED", False
+)
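Both new settings are read from the process environment through deltacat's `env_integer` and `env_bool` helpers, so they can be overridden without a code change. A minimal sketch of such an override, assuming the variables are exported before any deltacat module is imported and that `env_bool` accepts the literal string "True" (the values below are illustrative):

```python
import os

# Illustrative overrides; set these before importing deltacat so that the
# env_integer/env_bool helpers in deltacat.utils.common can pick them up.
os.environ["MAX_SIZE_OF_RECORD_BATCH_IN_GIB"] = str(1 * 1024 * 1024 * 1024)  # 1 GiB
os.environ["SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED"] = "True"

from deltacat.compute.compactor_v2 import constants

print(constants.MAX_SIZE_OF_RECORD_BATCH_IN_GIB)
print(constants.SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED)
```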
deltacat/compute/compactor_v2/steps/merge.py CHANGED
@@ -7,6 +7,7 @@ import ray
 import itertools
 import time
 import pyarrow.compute as pc
+from deltacat.utils.pyarrow import MAX_INT_BYTES
 import deltacat.compute.compactor_v2.utils.merge as merge_utils
 from uuid import uuid4
 from deltacat import logs
@@ -147,10 +148,32 @@ def _merge_tables(
     if compacted_table:
         compacted_table = all_tables[0]
 
+        compacted_pk_hash_str = compacted_table[sc._PK_HASH_STRING_COLUMN_NAME]
+        incremental_pk_hash_str = incremental_table[sc._PK_HASH_STRING_COLUMN_NAME]
+
+        logger.info(
+            f"Size of compacted pk hash={compacted_pk_hash_str.nbytes} "
+            f"and incremental pk hash={incremental_pk_hash_str.nbytes}."
+        )
+
+        if (
+            compacted_table[sc._PK_HASH_STRING_COLUMN_NAME].nbytes >= MAX_INT_BYTES
+            or incremental_table[sc._PK_HASH_STRING_COLUMN_NAME].nbytes >= MAX_INT_BYTES
+        ):
+            logger.info("Casting compacted and incremental pk hash to large_string...")
+            # is_in combines the chunks of the chunked array passed which can cause
+            # ArrowCapacityError if the total size of string array is over 2GB.
+            # Using a large_string would resolve this issue.
+            # The cast here should be zero-copy in most cases.
+            compacted_pk_hash_str = pc.cast(compacted_pk_hash_str, pa.large_string())
+            incremental_pk_hash_str = pc.cast(
+                incremental_pk_hash_str, pa.large_string()
+            )
+
         records_to_keep = pc.invert(
             pc.is_in(
-
-
+                compacted_pk_hash_str,
+                incremental_pk_hash_str,
             )
         )
 
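For context on the cast above: the diff's own comment notes that `is_in` combines the chunks of the chunked array it receives, and a plain `string` column uses 32-bit offsets, so a combined column over roughly 2GB raises ArrowCapacityError, while `large_string` uses 64-bit offsets and avoids the limit. A toy-sized sketch of the same pattern (small data, so it never actually approaches 2GB):

```python
import pyarrow as pa
import pyarrow.compute as pc

# Toy stand-ins for the compacted and incremental pk-hash columns.
compacted_pk_hash = pa.chunked_array([["a", "b"], ["c"]])
incremental_pk_hash = pa.chunked_array([["b"], ["d"]])

# string -> large_string switches the column to 64-bit offsets; this is the
# cast that avoids ArrowCapacityError for columns whose data exceeds 2GB.
compacted_pk_hash = pc.cast(compacted_pk_hash, pa.large_string())
incremental_pk_hash = pc.cast(incremental_pk_hash, pa.large_string())

# Keep compacted rows whose pk hash is absent from the incremental delta.
records_to_keep = pc.invert(pc.is_in(compacted_pk_hash, value_set=incremental_pk_hash))
print(records_to_keep.to_pylist())  # [True, False, True]
```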
@@ -492,9 +515,11 @@ def _copy_manifests_from_hash_bucketing(
 def _timed_merge(input: MergeInput) -> MergeResult:
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
-    with
-        f"merge_{worker_id}_{task_id}.bin"
-
+    with (
+        memray.Tracker(f"merge_{worker_id}_{task_id}.bin")
+        if input.enable_profiler
+        else nullcontext()
+    ):
         total_input_records, total_deduped_records = 0, 0
         total_dropped_records = 0
         materialized_results: List[MaterializeResult] = []
deltacat/compute/compactor_v2/utils/dedupe.py CHANGED
@@ -25,7 +25,7 @@ def _create_chunked_index_array(array: pa.Array) -> pa.Array:
         result[index] = np.arange(cl, dtype="int32")
 
     chunk_lengths = ([0] + chunk_lengths)[:-1]
-    result = pa.chunked_array(result + np.cumsum(chunk_lengths))
+    result = pa.chunked_array(result + np.cumsum(chunk_lengths), type=pa.int32())
     return result
 
 
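The dedupe.py change pins the chunked index array to int32. The running offsets produced by `np.cumsum` are typically 64-bit, so adding them to the int32 `arange` chunks can upcast the chunks to int64; passing `type=pa.int32()` forces the assembled ChunkedArray back to the intended 32-bit type. A simplified sketch of the offset construction (generic names, not the deltacat helper itself):

```python
import numpy as np
import pyarrow as pa

chunk_lengths = [3, 2]
offsets = np.cumsum(([0] + chunk_lengths)[:-1])  # running offsets per chunk: [0, 3]
shifted = [
    np.arange(length, dtype="int32") + offset
    for length, offset in zip(chunk_lengths, offsets)
]

# Without an explicit type, pyarrow infers the chunk dtype, which may have
# been upcast to int64 by the offset addition above.
inferred = pa.chunked_array(shifted)

# Pinning type=pa.int32() guarantees a 32-bit index array either way.
pinned = pa.chunked_array(shifted, type=pa.int32())
print(inferred.type, pinned.type)
```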
deltacat/compute/compactor_v2/utils/primary_key_index.py CHANGED
@@ -10,6 +10,7 @@ from deltacat.compute.compactor_v2.constants import (
     TOTAL_BYTES_IN_SHA1_HASH,
     PK_DELIMITER,
     MAX_SIZE_OF_RECORD_BATCH_IN_GIB,
+    SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED,
 )
 import time
 from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelope
@@ -48,6 +49,13 @@ def _is_sha1_desired(hash_columns: List[pa.Array]) -> bool:
         f"Found total length of hash column={total_len} and total_size={total_size}"
     )
 
+    if SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED:
+        logger.info(
+            f"SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED is True. "
+            f"Returning False for is_sha1_desired"
+        )
+        return False
+
     return total_size > TOTAL_BYTES_IN_SHA1_HASH * total_len
 
 
@@ -108,9 +116,10 @@ def _optimized_group_record_batches_by_hash_bucket(
     record_batches = []
     result_len = 0
     for record_batch in table_batches:
-
-
-
+        if (
+            record_batches
+            and current_bytes + record_batch.nbytes >= MAX_SIZE_OF_RECORD_BATCH_IN_GIB
+        ):
             logger.info(
                 f"Total number of record batches without exceeding {MAX_SIZE_OF_RECORD_BATCH_IN_GIB} "
                 f"is {len(record_batches)} and size {current_bytes}"
@@ -128,6 +137,9 @@ def _optimized_group_record_batches_by_hash_bucket(
             current_bytes = 0
             record_batches.clear()
 
+        current_bytes += record_batch.nbytes
+        record_batches.append(record_batch)
+
     if record_batches:
         appended_len, append_latency = timed_invocation(
             _append_table_by_hash_bucket,
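The two hunks above restore the accumulation step of the size-bounded grouping loop: a group of record batches is flushed just before appending the next batch would cross MAX_SIZE_OF_RECORD_BATCH_IN_GIB, and the current batch is then counted and appended. A simplified, self-contained sketch of that pattern (generic names, not the actual deltacat function):

```python
from typing import Iterable, Iterator, List

import pyarrow as pa


def group_batches_by_size(
    batches: Iterable[pa.RecordBatch], max_bytes: int
) -> Iterator[List[pa.RecordBatch]]:
    """Group batches, flushing a group before it would reach max_bytes."""
    group: List[pa.RecordBatch] = []
    current_bytes = 0
    for batch in batches:
        # Flush the current group before it would grow past the limit.
        if group and current_bytes + batch.nbytes >= max_bytes:
            yield group
            group = []
            current_bytes = 0
        current_bytes += batch.nbytes
        group.append(batch)
    if group:
        yield group


batch = pa.RecordBatch.from_arrays([pa.array(["x" * 100] * 10)], names=["pk"])
print([len(g) for g in group_batches_by_size([batch] * 5, max_bytes=3 * batch.nbytes)])
```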
deltacat/tests/compute/compact_partition_test_cases.py CHANGED
@@ -601,6 +601,38 @@ INCREMENTAL_TEST_CASES: Dict[str, IncrementalCompactionTestCaseParams] = {
         skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
         assert_compaction_audit=None,
     ),
+    "15-incremental-empty-input-with-single-hash-bucket": IncrementalCompactionTestCaseParams(
+        primary_keys={"pk_col_1"},
+        sort_keys=[SortKey.of(key_name="sk_col_1")],
+        partition_keys=ZERO_VALUED_PARTITION_KEYS_PARAM,
+        partition_values=ZERO_VALUED_PARTITION_VALUES_PARAM,
+        input_deltas=pa.Table.from_arrays(
+            [
+                pa.array([]),
+                pa.array([]),
+            ],
+            names=["pk_col_1", "sk_col_1"],
+        ),
+        input_deltas_delta_type=DeltaType.UPSERT,
+        expected_terminal_compact_partition_result=pa.Table.from_arrays(
+            [
+                pa.array([]),
+                pa.array([]),
+            ],
+            names=["pk_col_1", "sk_col_1"],
+        ),
+        expected_terminal_exception=None,
+        expected_terminal_exception_message=None,
+        do_create_placement_group=False,
+        records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
+        hash_bucket_count=1,
+        read_kwargs_provider=None,
+        drop_duplicates=True,
+        is_inplace=False,
+        add_late_deltas=None,
+        skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
+        assert_compaction_audit=assert_compaction_audit_no_hash_bucket,
+    ),
 }
 
 INCREMENTAL_TEST_CASES = with_compactor_version_func_test_param(INCREMENTAL_TEST_CASES)
deltacat/tests/compute/compactor_v2/test_compaction_session.py CHANGED
@@ -1,6 +1,7 @@
 from typing import Dict, Any
 import ray
 import os
+import pyarrow as pa
 import pytest
 import boto3
 from deltacat.compute.compactor.model.compaction_session_audit_info import (
@@ -76,6 +77,17 @@ def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
     os.remove(DATABASE_FILE_PATH_VALUE)
 
 
+@pytest.fixture(scope="function")
+def disable_sha1(monkeypatch):
+    import deltacat.compute.compactor_v2.utils.primary_key_index
+
+    monkeypatch.setattr(
+        deltacat.compute.compactor_v2.utils.primary_key_index,
+        "SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED",
+        True,
+    )
+
+
 class TestCompactionSession:
     """
     This class adds specific tests that aren't part of the parametrized test suite.
@@ -556,3 +568,124 @@ class TestCompactionSession:
                 }
             )
         )
+
+    def test_compact_partition_when_incremental_pk_hash_is_over_2gb(
+        self, s3_resource, local_deltacat_storage_kwargs, disable_sha1
+    ):
+        """
+        A test case which ensures the compaction succeeds even if the incremental
+        arrow table size is over 2GB. It is added to prevent ArrowCapacityError
+        when running is_in operation during merge.
+
+        Note that we set SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED to bypass sha1 hashing
+        which truncates the lengths of pk strings when deduping.
+        """
+        # setup
+        staged_source = stage_partition_from_file_paths(
+            self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
+        )
+        # we create chunked array to avoid ArrowCapacityError
+        chunked_pk_array = pa.chunked_array([["13bytesstring"], ["12bytestring"]])
+        table = pa.table([chunked_pk_array], names=["pk"])
+        source_delta = commit_delta_to_staged_partition(
+            staged_source, pa_table=table, **local_deltacat_storage_kwargs
+        )
+
+        staged_dest = stage_partition_from_file_paths(
+            self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
+        )
+        dest_partition = ds.commit_partition(
+            staged_dest, **local_deltacat_storage_kwargs
+        )
+
+        # rebase first
+        rebase_url = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": dest_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 1,
+                    "last_stream_position_to_compact": source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "rebase_source_partition_locator": source_delta.partition_locator,
+                    "rebase_source_partition_high_watermark": source_delta.stream_position,
+                    "records_per_compacted_file": 4000,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": source_delta.partition_locator,
+                    "resource_estimation_method": ResourceEstimationMethod.PREVIOUS_INFLATION,
+                }
+            )
+        )
+
+        rebased_rcf = get_rcf(s3_resource, rebase_url)
+
+        assert rebased_rcf.compacted_pyarrow_write_result.files == 1
+        assert rebased_rcf.compacted_pyarrow_write_result.records == 2
+
+        # Run incremental with a small delta on source
+        chunked_pk_array = pa.chunked_array(
+            [["13bytesstring" * 95_000_000], ["12bytestring" * 95_000_000]]
+        )  # 2.3GB
+        table = pa.table([chunked_pk_array], names=["pk"])
+
+        incremental_source_delta = commit_delta_to_partition(
+            source_delta.partition_locator,
+            pa_table=table,
+            **local_deltacat_storage_kwargs,
+        )
+        assert (
+            incremental_source_delta.partition_locator == source_delta.partition_locator
+        ), "source partition locator should not change"
+        dest_partition = ds.get_partition(
+            dest_partition.stream_locator,
+            dest_partition.partition_values,
+            **local_deltacat_storage_kwargs,
+        )
+
+        assert (
+            dest_partition.locator
+            == rebased_rcf.compacted_delta_locator.partition_locator
+        ), "The new destination partition should be same as compacted partition"
+
+        # Run incremental
+        incremental_url = compact_partition(
+            CompactPartitionParams.of(
+                {
+                    "compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
+                    "compacted_file_content_type": ContentType.PARQUET,
+                    "dd_max_parallelism_ratio": 1.0,
+                    "deltacat_storage": ds,
+                    "deltacat_storage_kwargs": local_deltacat_storage_kwargs,
+                    "destination_partition_locator": dest_partition.locator,
+                    "drop_duplicates": True,
+                    "hash_bucket_count": 1,
+                    "last_stream_position_to_compact": incremental_source_delta.stream_position,
+                    "list_deltas_kwargs": {
+                        **local_deltacat_storage_kwargs,
+                        **{"equivalent_table_types": []},
+                    },
+                    "primary_keys": ["pk"],
+                    "records_per_compacted_file": 4000,
+                    "s3_client_kwargs": {},
+                    "source_partition_locator": incremental_source_delta.partition_locator,
+                    "resource_estimation_method": ResourceEstimationMethod.PREVIOUS_INFLATION,
+                }
+            )
+        )
+
+        incremental_rcf = get_rcf(s3_resource, incremental_url)
+
+        assert incremental_rcf.compacted_pyarrow_write_result.files == 1
+        assert (
+            incremental_rcf.compacted_pyarrow_write_result.pyarrow_bytes >= 2300000000
+        )
+        assert incremental_rcf.compacted_pyarrow_write_result.records == 4
deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py ADDED
@@ -0,0 +1,45 @@
+import pyarrow as pa
+from deltacat.compute.compactor_v2.utils.primary_key_index import (
+    group_by_pk_hash_bucket,
+)
+
+
+class TestGroupByPkHashBucket:
+    def test_sanity(self):
+        record = pa.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
+        pk = pa.array(["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"])
+        record_batch = pa.RecordBatch.from_arrays([record, pk], names=["record", "pk"])
+        table = pa.Table.from_batches([record_batch])
+        grouped_array = group_by_pk_hash_bucket(table, 3, ["pk"])
+
+        assert len(grouped_array) == 3
+        total_records = 0
+        for arr in grouped_array:
+            if arr is not None:
+                total_records += len(arr[1])
+
+        assert total_records == len(table)
+
+    def test_when_record_batches_exceed_int_max_size(self):
+        record = pa.array(["12bytestring" * 90_000_000])
+        record_batch = pa.RecordBatch.from_arrays([record], names=["pk"])
+        table = pa.Table.from_batches([record_batch, record_batch])
+
+        grouped_array = group_by_pk_hash_bucket(table, 3, ["pk"])
+
+        assert len(grouped_array) == 3
+        # two record batches are preserved as combining them
+        # would exceed 2GB.
+        assert len(grouped_array[2].to_batches()) == 2
+
+    def test_when_record_batches_less_than_int_max_size(self):
+        record = pa.array(["12bytestring" * 90_000])
+        record_batch = pa.RecordBatch.from_arrays([record], names=["pk"])
+        table = pa.Table.from_batches([record_batch, record_batch])
+
+        grouped_array = group_by_pk_hash_bucket(table, 3, ["pk"])
+
+        assert len(grouped_array) == 3
+        # Combined the arrays into one record batch as the size
+        # would not exceed 2GB.
+        assert len(grouped_array[1].to_batches()) == 1
deltacat/tests/test_utils/pyarrow.py CHANGED
@@ -47,7 +47,8 @@ def stage_partition_from_file_paths(
 
 def commit_delta_to_staged_partition(
     staged_partition,
-    file_paths: List[str],
+    file_paths: List[str] = None,
+    pa_table: pa.Table = None,
     content_type: ContentType = ContentType.PARQUET,
     *args,
     **kwargs,
@@ -57,6 +58,7 @@ def commit_delta_to_staged_partition(
         *args,
         file_paths=file_paths,
        content_type=content_type,
+        pa_table=pa_table,
         **kwargs,
     )
     ds.commit_partition(staged_partition, **kwargs)
@@ -76,23 +78,28 @@ def download_delta(delta_like: Union[Delta, DeltaLocator], *args, **kwargs) -> D
 
 def commit_delta_to_partition(
     partition: Union[Partition, PartitionLocator],
-    file_paths: List[str],
+    file_paths: List[str] = None,
+    pa_table: pa.Table = None,
     content_type: ContentType = ContentType.PARQUET,
     *args,
     **kwargs,
 ) -> Delta:
-    tables = []
 
     if isinstance(partition, PartitionLocator):
         partition = ds.get_partition(
             partition.stream_locator, partition.partition_values, *args, **kwargs
         )
+    if pa_table is None:
+        assert file_paths is not None, "One of pa_table or file_paths must be passed."
+        tables = []
+        for file_path in file_paths:
+            table = pa.csv.read_csv(file_path)
+            tables.append(table)
 
-
-        table = pa.csv.read_csv(file_path)
-        tables.append(table)
+        pa_table = pa.concat_tables(tables)
 
-
-
+    staged_delta = ds.stage_delta(
+        pa_table, partition, content_type=content_type, **kwargs
+    )
 
     return ds.commit_delta(staged_delta, **kwargs)
deltacat/tests/utils/test_pyarrow.py CHANGED
@@ -16,6 +16,7 @@ from pyarrow.parquet import ParquetFile
 import pyarrow as pa
 
 PARQUET_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet"
+PARQUET_GZIP_COMPRESSED_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet.gz"
 EMPTY_UTSV_PATH = "deltacat/tests/utils/data/empty.csv"
 NON_EMPTY_VALID_UTSV_PATH = "deltacat/tests/utils/data/non_empty_valid.csv"
 OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH = (
@@ -789,3 +790,25 @@ class TestS3FileToTable(TestCase):
             self.assertEqual(field.name, schema.field(index).name)
 
         self.assertEqual(result.schema.field(1).type, "string")
+
+    def test_s3_file_to_table_when_parquet_gzip(self):
+
+        pa_kwargs_provider = lambda content_type, kwargs: {
+            "reader_type": "pyarrow",
+            **kwargs,
+        }
+
+        result = s3_file_to_table(
+            PARQUET_GZIP_COMPRESSED_FILE_PATH,
+            ContentType.PARQUET.value,
+            ContentEncoding.GZIP.value,
+            ["n_legs", "animal"],
+            ["n_legs"],
+            pa_read_func_kwargs_provider=pa_kwargs_provider,
+        )
+
+        self.assertEqual(len(result), 6)
+        self.assertEqual(len(result.column_names), 1)
+        schema = result.schema
+        schema_index = schema.get_field_index("n_legs")
+        self.assertEqual(schema.field(schema_index).type, "int64")
deltacat/utils/pyarrow.py CHANGED
@@ -58,6 +58,7 @@ RAISE_ON_DECIMAL_OVERFLOW = "raise_on_decimal_overflow"
 # Note the maximum from https://arrow.apache.org/docs/python/generated/pyarrow.Decimal256Type.html#pyarrow.Decimal256Type
 DECIMAL256_DEFAULT_SCALE = 38
 DECIMAL256_MAX_PRECISION = 76
+MAX_INT_BYTES = 2147483646
 
 
 def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Schema:
@@ -129,9 +130,11 @@ def _read_csv_rounding_decimal_columns_to_fit_scale(
     # conversion to decimal256 isn't implemented as of pyarrow==12.0.1
     new_schema = _new_schema_with_replaced_fields(
         schema,
-        lambda fld:
-
-
+        lambda fld: (
+            pa.field(fld.name, pa.string(), metadata=fld.metadata)
+            if pa.types.is_decimal128(fld.type) or pa.types.is_decimal256(fld.type)
+            else None
+        ),
     )
     new_kwargs = sanitize_kwargs_by_supported_kwargs(
         ["read_options", "parse_options", "convert_options", "memory_pool"],
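The rewritten lambda above is the field-replacement callback handed to deltacat's internal `_new_schema_with_replaced_fields`: decimal columns are re-declared as strings so CSV conversion does not fail on overflowing values, with rounding applied afterwards (as the enclosing function's name suggests). A generic sketch of the same schema rewrite, without the internal helper:

```python
import pyarrow as pa

schema = pa.schema([("amount", pa.decimal128(20, 2)), ("note", pa.string())])

# Replace decimal fields with string fields (preserving metadata) so values
# that would overflow the declared precision/scale can be parsed as text and
# rounded later instead of failing during CSV conversion.
replaced = pa.schema(
    [
        pa.field(fld.name, pa.string(), metadata=fld.metadata)
        if pa.types.is_decimal128(fld.type) or pa.types.is_decimal256(fld.type)
        else fld
        for fld in schema
    ]
)
print(replaced)
```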
@@ -569,8 +572,8 @@ def s3_file_to_table(
         **s3_client_kwargs,
     )
 
-
-
+    if READER_TYPE_KWARG in kwargs:
+        kwargs.pop(READER_TYPE_KWARG)
 
     filesystem = io
     if s3_url.startswith("s3://"):
@@ -912,7 +915,6 @@ def sliced_string_cast(array: pa.ChunkedArray) -> pa.ChunkedArray:
     TODO: deprecate this function when pyarrow performs proper ChunkedArray -> ChunkedArray casting
     """
     dtype = array.type
-    MAX_BYTES = 2147483646
     max_str_len = None
     if pa.types.is_integer(dtype):
         max_str_len = _int_max_string_len()
@@ -924,7 +926,7 @@ def sliced_string_cast(array: pa.ChunkedArray) -> pa.ChunkedArray:
         max_str_len = _max_decimal256_string_len()
 
     if max_str_len is not None:
-        max_elems_per_chunk =
+        max_elems_per_chunk = MAX_INT_BYTES // (2 * max_str_len)  # safety factor of 2
         all_chunks = []
         for chunk in array.chunks:
             if len(chunk) < max_elems_per_chunk:
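The last two hunks fold the function-local MAX_BYTES constant into the module-level MAX_INT_BYTES without changing the slicing math: each chunk is capped at MAX_INT_BYTES // (2 * max_str_len) elements so the stringified output of any single chunk stays well below the 2GB offset limit. A small worked example of that arithmetic (the 20-character width is an illustrative figure for stringified int64 values):

```python
MAX_INT_BYTES = 2147483646  # module-level constant added above

max_str_len = 20  # illustrative: widest decimal rendering of an int64 value
max_elems_per_chunk = MAX_INT_BYTES // (2 * max_str_len)  # safety factor of 2

print(max_elems_per_chunk)  # 53687091 elements per slice
```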
{deltacat-1.1.28.dist-info → deltacat-1.1.29.dist-info}/RECORD CHANGED
@@ -1,4 +1,4 @@
-deltacat/__init__.py,sha256=
+deltacat/__init__.py,sha256=DoUiDxmgMh8HUGOEAG7CUY0Q9Ip-S7gePDsL8XQO5kk,1778
 deltacat/constants.py,sha256=TUJLXUJ9xq1Ryil72yLkKR8EDH_Irp5wUg56QstbRNE,2181
 deltacat/exceptions.py,sha256=7sjk3BuMY5Oo-6OvAfHncZx_OcvtEL47BblWr2F7waE,12740
 deltacat/logs.py,sha256=EQSDin1deehzz5xlLV1_TrFJrO_IBZ9Ahp7MdL-4cK8,9363
@@ -51,7 +51,7 @@ deltacat/compute/compactor/utils/sort_key.py,sha256=oK6otg-CSsma6zlGPaKg-KNEvcZR
 deltacat/compute/compactor/utils/system_columns.py,sha256=CNIgAGos0xAGEpdaQIH7KfbSRrGZgjRbItXMararqXQ,9399
 deltacat/compute/compactor_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/compaction_session.py,sha256=COtol2s63DRPbd-AN9KCiWr4exLX8x5Tvxea_7cOGEQ,8078
-deltacat/compute/compactor_v2/constants.py,sha256=
+deltacat/compute/compactor_v2/constants.py,sha256=wvd34d7RGdniGbbiJcMljxRrRas4_uy9F9UaqXfS_Ag,3034
 deltacat/compute/compactor_v2/deletes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/deletes/delete_file_envelope.py,sha256=AeuH9JRMwp6mvQf6P2cqL92hUEtResQq6qUTS0kIKac,3111
 deltacat/compute/compactor_v2/deletes/delete_strategy.py,sha256=SMEJOxR-5r92kvKNqtu2w6HmwtmhljcZX1wcNEuS-4w,2833
@@ -69,14 +69,14 @@ deltacat/compute/compactor_v2/private/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCe
 deltacat/compute/compactor_v2/private/compaction_utils.py,sha256=fMWXg1SCIIgjk9p_OFYrcm760dOKNbFO1Lj3_JI3GCY,30929
 deltacat/compute/compactor_v2/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/steps/hash_bucket.py,sha256=1R5xLUkl7GqL1nY-apAgY1czKDEHjIVYSRi9qLOMass,6726
-deltacat/compute/compactor_v2/steps/merge.py,sha256=
+deltacat/compute/compactor_v2/steps/merge.py,sha256=qxmb3cmiKvOgfuOzlJT4Q60zOyWNjsiuZSzxdh6KTm8,22909
 deltacat/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor_v2/utils/content_type_params.py,sha256=1P9CDpuWErsFcTTlRCeuUQHDokVI92he_MsL82uRAdA,7424
-deltacat/compute/compactor_v2/utils/dedupe.py,sha256=
+deltacat/compute/compactor_v2/utils/dedupe.py,sha256=Jz1QbBOdZJwT8K1vD9q01eOn7hdLNZ_AF7bJ0wficr0,1949
 deltacat/compute/compactor_v2/utils/delta.py,sha256=I7Yvda8NVbpKXG3nM2Ku1utvR2r2OpHvUMqUL2ja3aw,3626
 deltacat/compute/compactor_v2/utils/io.py,sha256=3m4dorxj-WD6Yu9_3gRE6gz3C-eNJA7nn02sHKwo-J8,6018
 deltacat/compute/compactor_v2/utils/merge.py,sha256=EV_iKhNc3WflgfLW1Q46dXUvyClx8VebWHGtninEfsI,5311
-deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=
+deltacat/compute/compactor_v2/utils/primary_key_index.py,sha256=SbQ97M1Cxld-zZik2QMSzlj20g6JlENaQx_0PhlCIP8,12034
 deltacat/compute/compactor_v2/utils/task_options.py,sha256=W0jyWIIZ0tcSAGp8mhpnu1G8p3rmX4d3juCPpAJxnDM,12649
 deltacat/compute/merge_on_read/__init__.py,sha256=ckbgngmqPjYBYz_NySsR1vNTOb_hNpeL1sYkZKvBI9M,214
 deltacat/compute/merge_on_read/daft.py,sha256=1oC38u5ig_aTrq7EzyWBo8Ui54rb6yERYMk-vEFbpxM,1400
@@ -136,7 +136,7 @@ deltacat/tests/compute/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3
 deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py,sha256=kWyIJQMyF9oBemvgOp3ngGhMpH9zjkznV-67ewELgHQ,37719
 deltacat/tests/compute/compact_partition_rebase_test_cases.py,sha256=8HVr3EIFYFqNaJoqeCuj9xIBjM4Ch2bx-mJcO4BRrLo,16839
 deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py,sha256=l_6-pAKOsRY3NbtfHsYmEaJEkq6IJueYuLsjyJxNgz4,81564
-deltacat/tests/compute/compact_partition_test_cases.py,sha256=
+deltacat/tests/compute/compact_partition_test_cases.py,sha256=HJ15Xyawv8ImFju8wDwt22fH5okoPhyS-QAygkXDb7Q,27422
 deltacat/tests/compute/test_compact_partition_incremental.py,sha256=lkfAraOJmEmieesf7b1BqlfTS26YjYM5xXOXoTMrsos,14989
 deltacat/tests/compute/test_compact_partition_multiple_rounds.py,sha256=xXBA66TTfARR90m5KQs31nmiokuMy9iGQt7Z9evyG7M,12950
 deltacat/tests/compute/test_compact_partition_params.py,sha256=Dm5eLyHo8oGMeO3XBbpj1rZqHtPZ1hAB7z2qvzc4Lxk,8497
@@ -152,9 +152,10 @@ deltacat/tests/compute/compactor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JC
 deltacat/tests/compute/compactor/utils/test_io.py,sha256=st5mlU4cVU-eQl7B4mvPgNA3izuNwbVawYOp-NcoyrI,4326
 deltacat/tests/compute/compactor/utils/test_round_completion_file.py,sha256=LAQ4usiRF4oTx4cA85L0eOcBa_Z-febc-CuzUijSGrI,7439
 deltacat/tests/compute/compactor_v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=
+deltacat/tests/compute/compactor_v2/test_compaction_session.py,sha256=y8nNHq9ADHENzUKMQYguB45zOD7F2lZgcBYYTvbTsdM,28957
 deltacat/tests/compute/compactor_v2/test_hashlib.py,sha256=8csF2hFWtBvY2MbX3-6iphCsVXxRp0zP1NTnKhfdmkg,328
 deltacat/tests/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py,sha256=aFb9rzT_EK9k8qAMHPtpqd5btyEmll1So1loDmZkotQ,1769
 deltacat/tests/compute/compactor_v2/utils/test_task_options.py,sha256=37DkR1u_XwhedV9cGed6FFuJTC0XmuiowHJIa_Op6uA,865
 deltacat/tests/compute/resource_estimation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/tests/compute/resource_estimation/test_delta.py,sha256=HCL2oUnCqm0E26T3HLJjMhoAsHTJIWPYGwIKRgM_H7E,25712
@@ -171,7 +172,7 @@ deltacat/tests/local_deltacat_storage/__init__.py,sha256=5T9ubNIS42-BotEH0yrUiWE
 deltacat/tests/local_deltacat_storage/exceptions.py,sha256=oxZ0psmrEO0M6P2r8gHQ2E8E-Y8UBfUCBUIwfuHcx38,251
 deltacat/tests/test_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/tests/test_utils/constants.py,sha256=UYe--9T_clYjiOpv0M7TtAMGdpje_SMZ-w8n0IeCAjc,214
-deltacat/tests/test_utils/pyarrow.py,sha256=
+deltacat/tests/test_utils/pyarrow.py,sha256=QDdGilzsJ2xUESiGotdNVZde9yD7ja9MvNhhssnox-E,3083
 deltacat/tests/test_utils/storage.py,sha256=93GEn4A5WbMHWk0Ec4Bd7RxeHoSEnBfSarfWhKOSNtM,972
 deltacat/tests/test_utils/utils.py,sha256=a32qEwcSSd1lvRi0aJJ4ZLnc1ZyXmoQF_K95zaQRk2M,455
 deltacat/tests/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -179,7 +180,7 @@ deltacat/tests/utils/test_cloudpickle.py,sha256=J0pnBY3-PxlUh6MamZAN1PuquKQPr2iy
 deltacat/tests/utils/test_daft.py,sha256=kY8lkXoQvyWunok8UvOsh1An297rb3jcnstTuIAyAlc,8232
 deltacat/tests/utils/test_metrics.py,sha256=Ym9nOz1EtB180pLmvugihj1sDTNDMb5opIjjr5Nmcls,16339
 deltacat/tests/utils/test_placement.py,sha256=g61wVOMkHe4YJeR9Oxg_BOVQ6bhHHbC3IBYv8YhUu94,597
-deltacat/tests/utils/test_pyarrow.py,sha256=
+deltacat/tests/utils/test_pyarrow.py,sha256=fDjDkGPjdRZA3kgjgiQRym9shdeDYgkdDPYU2a7IEUk,30790
 deltacat/tests/utils/test_record_batch_tables.py,sha256=AkG1WyljQmjnl-AxhbFWyo5LnMIKRyLScfgC2B_ES-s,11321
 deltacat/tests/utils/test_resources.py,sha256=HtpvDrfPZQNtGDXUlsIzc_yd7Vf1cDscZ3YbN0oTvO8,2560
 deltacat/tests/utils/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -200,7 +201,7 @@ deltacat/utils/numpy.py,sha256=SpHKKvC-K8NINTWGVfTZ5-gBFTGYqaXjjgKFhsdUjwg,2049
 deltacat/utils/pandas.py,sha256=q99mlRB7tymICMcNbfGLfLqFu_C-feyPZKZm2CWJJVc,9574
 deltacat/utils/performance.py,sha256=7ZLaMkS1ehPSIhT5uOQVBHvjC70iKHzoFquFo-KL0PI,645
 deltacat/utils/placement.py,sha256=Lj20fb-eq8rgMdm_M2MBMfDLwhDM1sS1nJj2DvIK56s,12060
-deltacat/utils/pyarrow.py,sha256=
+deltacat/utils/pyarrow.py,sha256=R3KkJPenE48rS3VrfFKSkJerX94f4e7X2dUPBQg44DY,34339
 deltacat/utils/resources.py,sha256=Ax1OgLLbZI4oYpp4Ki27OLaST-7I-AJgZwU87FVfY8g,8253
 deltacat/utils/s3fs.py,sha256=PmUJ5Fm1WmD-_zp_M6yd9VbXvIoJuBeK6ApOdJJApLE,662
 deltacat/utils/schema.py,sha256=m4Wm4ZQcpttzOUxex4dVneGlHy1_E36HspTcjNYzvVM,1564
@@ -210,8 +211,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
 deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
 deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
 deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
-deltacat-1.1.
-deltacat-1.1.
-deltacat-1.1.
-deltacat-1.1.
-deltacat-1.1.
+deltacat-1.1.29.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+deltacat-1.1.29.dist-info/METADATA,sha256=ZverlgFUJV4wGJao8tusRCv_sRNX4KJ4RTNAGvBCJes,1733
+deltacat-1.1.29.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+deltacat-1.1.29.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
+deltacat-1.1.29.dist-info/RECORD,,

{deltacat-1.1.28.dist-info → deltacat-1.1.29.dist-info}/LICENSE
File without changes

{deltacat-1.1.28.dist-info → deltacat-1.1.29.dist-info}/WHEEL
File without changes

{deltacat-1.1.28.dist-info → deltacat-1.1.29.dist-info}/top_level.txt
File without changes