deltacat 0.1.18b15__py3-none-any.whl → 0.1.18b16__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in the public registry.
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor/model/compact_partition_params.py +11 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +13 -0
- deltacat/compute/compactor/model/delta_annotated.py +10 -6
- deltacat/compute/compactor/repartition_session.py +2 -0
- deltacat/compute/compactor/steps/repartition.py +6 -0
- deltacat/compute/compactor_v2/compaction_session.py +72 -69
- deltacat/compute/compactor_v2/constants.py +3 -0
- deltacat/compute/compactor_v2/model/merge_input.py +17 -1
- deltacat/compute/compactor_v2/steps/merge.py +430 -2
- deltacat/compute/compactor_v2/utils/content_type_params.py +43 -14
- deltacat/compute/compactor_v2/utils/dedupe.py +58 -0
- deltacat/compute/compactor_v2/utils/io.py +11 -8
- deltacat/compute/compactor_v2/utils/primary_key_index.py +58 -25
- deltacat/compute/compactor_v2/utils/task_options.py +8 -15
- deltacat/tests/compute/common.py +1 -1
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -0
- deltacat/tests/compute/test_compaction_session_incremental.py +16 -1
- deltacat/tests/compute/testcases.py +7 -2
- deltacat/tests/test_utils/pyarrow.py +23 -6
- deltacat/types/partial_download.py +1 -0
- deltacat/types/tables.py +5 -0
- deltacat/utils/arguments.py +1 -2
- deltacat/utils/pyarrow.py +5 -0
- {deltacat-0.1.18b15.dist-info → deltacat-0.1.18b16.dist-info}/METADATA +1 -1
- {deltacat-0.1.18b15.dist-info → deltacat-0.1.18b16.dist-info}/RECORD +29 -30
- deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +0 -199
- {deltacat-0.1.18b15.dist-info → deltacat-0.1.18b16.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b15.dist-info → deltacat-0.1.18b16.dist-info}/WHEEL +0 -0
- {deltacat-0.1.18b15.dist-info → deltacat-0.1.18b16.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor_v2/utils/primary_key_index.py
CHANGED

@@ -31,8 +31,19 @@ def _append_sha1_hash_to_table(table: pa.Table, hash_column: pa.Array) -> pa.Tab
     return sc.append_pk_hash_string_column(table, result)
 
 
-def _is_sha1_desired(
-
+def _is_sha1_desired(hash_columns: List[pa.Array]) -> bool:
+    total_size = 0
+    total_len = 0
+
+    for hash_column in hash_columns:
+        total_size += hash_column.nbytes
+        total_len += len(hash_column)
+
+    logger.info(
+        f"Found total length of hash column={total_len} and total_size={total_size}"
+    )
+
+    return total_size > TOTAL_BYTES_IN_SHA1_HASH * total_len
 
 
 def _append_table_by_hash_bucket(
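The new `_is_sha1_desired` helper decides whether the concatenated primary-key strings should be replaced by a SHA-1 digest, based on their total byte size. A minimal standalone sketch of that heuristic follows; the value of `TOTAL_BYTES_IN_SHA1_HASH` is an assumption here (20 bytes, the size of a raw SHA-1 digest), the real constant is defined in `compactor_v2/constants.py`.

```python
import pyarrow as pa

TOTAL_BYTES_IN_SHA1_HASH = 20  # assumed value, see compactor_v2/constants.py


def is_sha1_desired(hash_columns) -> bool:
    total_size = sum(col.nbytes for col in hash_columns)
    total_len = sum(len(col) for col in hash_columns)
    # Hash only when the raw concatenated key strings average more than one
    # digest's worth of bytes per row, i.e. when hashing would shrink them.
    return total_size > TOTAL_BYTES_IN_SHA1_HASH * total_len


short_keys = [pa.array(["a|1", "b|2"])]       # a few bytes per row -> keep raw strings
long_keys = [pa.array(["x" * 64, "y" * 64])]  # 64 bytes per row -> SHA-1 saves space
print(is_sha1_desired(short_keys), is_sha1_desired(long_keys))  # False True
```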
@@ -61,7 +72,9 @@ def _append_table_by_hash_bucket(
     for i, group_count in enumerate(group_count_array):
         hb_idx = hb_group_array[i].as_py()
         pyarrow_table = hb_pk_table.slice(offset=result_len, length=group_count.as_py())
-        pyarrow_table = pyarrow_table.drop(
+        pyarrow_table = pyarrow_table.drop(
+            [sc._HASH_BUCKET_IDX_COLUMN_NAME, sc._PK_HASH_STRING_COLUMN_NAME]
+        )
         if hash_bucket_to_table[hb_idx] is None:
             hash_bucket_to_table[hb_idx] = []
         hash_bucket_to_table[hb_idx].append(pyarrow_table)

@@ -142,7 +155,7 @@ def _optimized_group_record_batches_by_hash_bucket(
 def group_by_pk_hash_bucket(
     table: pa.Table, num_buckets: int, primary_keys: List[str]
 ) -> np.ndarray:
-    table = generate_pk_hash_column(table, primary_keys, requires_sha1=True)
+    table = generate_pk_hash_column([table], primary_keys, requires_sha1=True)[0]
 
     # group hash bucket record indices
     result = group_record_indices_by_hash_bucket(
@@ -154,53 +167,73 @@ def group_by_pk_hash_bucket(
 
 
 def generate_pk_hash_column(
-
+    tables: List[pa.Table],
     primary_keys: Optional[List[str]] = None,
     requires_sha1: bool = False,
-) -> pa.Table:
+) -> List[pa.Table]:
     """
-    Returns a new table after generating the primary key hash if desired.
+    Returns a new table list after generating the primary key hash if desired.
 
     1. If there are no primary keys, each hash will be unique uuid/sha1 hex
-    2. If there are more than 0 primary keys, returns a table with
+    2. If there are more than 0 primary keys, returns a table with pk hash column appended.
     """
 
-
-
-    can_sha1 = False
-    if primary_keys:
+    def _generate_pk_hash(table: pa.Table) -> pa.Array:
         pk_columns = []
         for pk_name in primary_keys:
             pk_columns.append(pc.cast(table[pk_name], pa.string()))
 
         pk_columns.append(PK_DELIMITER)
         hash_column = pc.binary_join_element_wise(*pk_columns)
+        return hash_column
 
-
-    else:
+    def _generate_uuid(table: pa.Table) -> pa.Array:
         hash_column = pa.array(
             [uuid.uuid4().hex for _ in range(len(table))], pa.string()
         )
+        return hash_column
+
+    start = time.monotonic()
+
+    hash_column_list = []
+
+    can_sha1 = False
+    if primary_keys:
+        hash_column_list = [_generate_pk_hash(table) for table in tables]
+
+        can_sha1 = requires_sha1 or _is_sha1_desired(hash_column_list)
+    else:
+        hash_column_list = [_generate_uuid(table) for table in tables]
 
     logger.info(
-        f"can_generate_sha1={can_sha1} for the table
-        f"={hash_column.nbytes} bytes, num_rows={len(hash_column)}, "
-        f"and requires_sha1={requires_sha1}"
+        f"can_generate_sha1={can_sha1} for the table and requires_sha1={requires_sha1}"
     )
 
-
-
-
-
+    result = []
+
+    total_len = 0
+    total_size = 0
+    for index, table in enumerate(tables):
+        if can_sha1:
+            table = _append_sha1_hash_to_table(table, hash_column_list[index])
+        else:
+            table = table.append_column(
+                sc._PK_HASH_STRING_COLUMN_FIELD, hash_column_list[index]
+            )
+
+        total_len += len(table)
+        total_size += hash_column_list[index].nbytes
+
+        result.append(table)
 
     end = time.monotonic()
 
     logger.info(
-        f"Took {end - start}s to generate pk hash of len: {
-        f"
+        f"Took {end - start}s to generate pk hash of len: {total_len}"
+        f" for size: {total_size} bytes"
     )
 
-    return
+    return result
 
 
 def group_record_indices_by_hash_bucket(
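`generate_pk_hash_column` now accepts a list of tables and returns a list, so the SHA-1 decision can be made once across all inputs. The following is a simplified, standalone sketch of the per-table hash-column construction it performs; the delimiter value and the `pk_hash_columns` helper name are illustrative, not deltacat's actual identifiers.

```python
from typing import List

import pyarrow as pa
import pyarrow.compute as pc

PK_DELIMITER = "|"  # illustrative delimiter; deltacat defines its own PK_DELIMITER


def pk_hash_columns(tables: List[pa.Table], primary_keys: List[str]) -> List[pa.Array]:
    """Concatenate each table's primary-key columns into one string column per table."""
    results = []
    for table in tables:
        pk_columns = [pc.cast(table[pk], pa.string()) for pk in primary_keys]
        pk_columns.append(PK_DELIMITER)
        results.append(pc.binary_join_element_wise(*pk_columns))
    return results


tables = [pa.table({"id": [1, 2], "region": ["us", "eu"]})]
hashes = pk_hash_columns(tables, ["id", "region"])
print(hashes[0])  # ["1|us", "2|eu"]
```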
@@ -298,7 +331,7 @@ def hash_group_index_to_hash_bucket_indices(
     if hb_group > num_buckets:
         return []
 
-    return range(hb_group,
+    return range(hb_group, num_buckets, num_groups)
 
 
 def pk_digest_to_hash_bucket_index(digest: str, num_buckets: int) -> int:
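The fix above restores the full `range(start, stop, step)` call. A small sketch of the round-robin mapping this implies, assuming a hash bucket is assigned to group `bucket % num_groups` (the helper names below are illustrative):

```python
def bucket_to_group(hb_index: int, num_groups: int) -> int:
    # Assumed assignment rule: buckets are spread over groups round-robin.
    return hb_index % num_groups


def group_to_buckets(hb_group: int, num_buckets: int, num_groups: int) -> range:
    # Inverse of the rule above: every num_groups-th bucket starting at hb_group.
    return range(hb_group, num_buckets, num_groups)


# With 8 buckets spread over 3 groups:
print([bucket_to_group(b, 3) for b in range(8)])  # [0, 1, 2, 0, 1, 2, 0, 1]
print(list(group_to_buckets(1, 8, 3)))            # [1, 4, 7]
```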
deltacat/compute/compactor_v2/utils/task_options.py
CHANGED

@@ -2,7 +2,7 @@ from typing import Dict, Optional, List, Tuple
 from deltacat.types.media import ContentEncoding, ContentType
 from deltacat.types.partial_download import PartialParquetParameters
 from deltacat.storage import (
-
+    Manifest,
     ManifestEntry,
     interface as unimplemented_deltacat_storage,
 )

@@ -11,9 +11,6 @@ from deltacat.compute.compactor.model.round_completion_info import RoundCompleti
 from deltacat.compute.compactor_v2.utils.primary_key_index import (
     hash_group_index_to_hash_bucket_indices,
 )
-from deltacat.compute.compactor_v2.utils.content_type_params import (
-    append_content_type_params,
-)
 from deltacat.compute.compactor_v2.constants import TOTAL_MEMORY_BUFFER_PERCENTAGE
 
 

@@ -24,6 +21,7 @@ def _get_parquet_type_params_if_exist(
         entry.meta
         and entry.meta.content_type == ContentType.PARQUET
         and entry.meta.content_encoding == ContentEncoding.IDENTITY
+        and entry.meta.content_type_parameters
     ):
         for type_params in entry.meta.content_type_parameters:
             if isinstance(type_params, PartialParquetParameters):

@@ -93,7 +91,7 @@ def estimate_manifest_entry_column_size_bytes(
 
     type_params = _get_parquet_type_params_if_exist(entry=entry)
 
-    if type_params.pq_metadata:
+    if type_params and type_params.pq_metadata:
        return _calculate_parquet_column_size(type_params=type_params, columns=columns)
 
     return None
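The extra guard skips estimation when no `PartialParquetParameters` are attached to the manifest entry. For readers unfamiliar with the parquet-metadata path, here is a hedged, self-contained sketch of estimating a column's size from row-group metadata; it mirrors the idea behind `_calculate_parquet_column_size` but is not deltacat's implementation.

```python
import io

import pyarrow as pa
import pyarrow.parquet as papq

# Build a small parquet file in memory just to have real metadata to inspect.
table = pa.table({"id": list(range(1000)), "payload": ["x" * 32] * 1000})
buf = io.BytesIO()
papq.write_table(table, buf)
buf.seek(0)

meta = papq.ParquetFile(buf).metadata
wanted = {"payload"}

estimate = 0
for rg in range(meta.num_row_groups):
    row_group = meta.row_group(rg)
    for col in range(row_group.num_columns):
        chunk = row_group.column(col)
        if chunk.path_in_schema in wanted:
            # total_uncompressed_size approximates the in-memory footprint
            estimate += chunk.total_uncompressed_size

print(estimate)
```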
@@ -153,7 +151,7 @@ def merge_resource_options_provider(
     hash_group_size_bytes: Dict[int, int],
     hash_group_num_rows: Dict[int, int],
     round_completion_info: Optional[RoundCompletionInfo] = None,
-
+    compacted_delta_manifest: Optional[Manifest] = None,
     primary_keys: Optional[List[str]] = None,
     deltacat_storage=unimplemented_deltacat_storage,
     deltacat_storage_kwargs: Optional[Dict] = {},

@@ -168,8 +166,8 @@ def merge_resource_options_provider(
 
     if (
         round_completion_info
-        and
-        and round_completion_info.
+        and compacted_delta_manifest
+        and round_completion_info.hb_index_to_entry_range
     ):
 
         previous_inflation = (

@@ -187,15 +185,10 @@ def merge_resource_options_provider(
 
         for hb_idx in iterable:
             entry_start, entry_end = round_completion_info.hb_index_to_entry_range[
-                hb_idx
+                str(hb_idx)
             ]
             for entry_index in range(entry_start, entry_end):
-                entry =
-                    compacted_delta,
-                    entry_index=entry_index,
-                    deltacat_storage=deltacat_storage,
-                    deltacat_storage_kwargs=deltacat_storage_kwargs,
-                )
+                entry = compacted_delta_manifest.entries[entry_index]
 
                 current_entry_size = estimate_manifest_entry_size_bytes(
                     entry=entry, previous_inflation=previous_inflation
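The resource provider now reads entries directly from the compacted delta manifest and looks up the entry range with `str(hb_idx)`. A plausible reason for the string key (an assumption, not stated in the diff) is that the round completion info is persisted as JSON, where integer dict keys round-trip as strings. A small sketch of that effect:

```python
import json

# Hypothetical round-completion payload: after a JSON round trip, the integer
# hash-bucket keys become strings, which is why the lookup would use str(hb_idx).
hb_index_to_entry_range = json.loads(json.dumps({0: [0, 2], 1: [2, 5]}))
print(hb_index_to_entry_range)  # {'0': [0, 2], '1': [2, 5]}

hb_idx = 1
entry_start, entry_end = hb_index_to_entry_range[str(hb_idx)]
for entry_index in range(entry_start, entry_end):
    # in deltacat this would read compacted_delta_manifest.entries[entry_index]
    print("estimate size of entry", entry_index)
```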
deltacat/tests/compute/common.py
CHANGED

@@ -15,7 +15,7 @@ BASE_TEST_DESTINATION_NAMESPACE = "destination_test_namespace"
 BASE_TEST_DESTINATION_TABLE_NAME = "destination_test_table_RAY"
 BASE_TEST_DESTINATION_TABLE_VERSION = "1"
 
-HASH_BUCKET_COUNT: int =
+HASH_BUCKET_COUNT: int = 3
 
 MAX_RECORDS_PER_FILE: int = 1
 

deltacat/tests/compute/compactor/steps/test_repartition.py
CHANGED

@@ -49,6 +49,7 @@ class TestRepartitionRange(unittest.TestCase):
         self.destination_partition: PartitionLocator = MagicMock()
         self.repartition_args = {"column": "last_updated", "ranges": [1678665487112747]}
         self.max_records_per_output_file = 2
+        self.s3_table_writer_kwargs = {}
         self.repartitioned_file_content_type = ContentType.PARQUET
         self.deltacat_storage = MagicMock()
         self.deltacat_storage_kwargs = MagicMock()

@@ -59,6 +60,7 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
             self.deltacat_storage_kwargs,

@@ -85,6 +87,7 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
             self.deltacat_storage_kwargs,

@@ -98,6 +101,7 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
             self.deltacat_storage_kwargs,

@@ -110,6 +114,7 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
             self.deltacat_storage_kwargs,

@@ -123,6 +128,7 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
             self.deltacat_storage_kwargs,

@@ -137,6 +143,7 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
             self.deltacat_storage_kwargs,

@@ -151,6 +158,7 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
             self.deltacat_storage_kwargs,

@@ -167,6 +175,7 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
             self.deltacat_storage_kwargs,

@@ -180,6 +189,7 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
             self.deltacat_storage_kwargs,

@@ -196,6 +206,7 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
         ),

@@ -222,6 +233,7 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
             self.deltacat_storage_kwargs,
deltacat/tests/compute/test_compaction_session_incremental.py
CHANGED

@@ -238,7 +238,8 @@ def test_compact_partition_incremental(
     rebase_source_partition_locator_param,
     partition_values_param,
     expected_result,
-
+    # use and implement func and func_kwargs if you want to run additional validations apart from the ones in the test
+    validation_callback_func,
     validation_callback_func_kwargs,
     do_teardown_local_deltacat_storage_db,
     use_prev_compacted,

@@ -281,6 +282,10 @@ def test_compact_partition_incremental(
         arrow_arrays_param,
         partition_values_param,
         ds_mock_kwargs,
+        f"{test_name}_src_namespace",
+        f"{test_name}_table_src",
+        f"{test_name}_dest_namespace",
+        f"{test_name}_table_dest",
     )
     ray.shutdown()
     ray.init(local_mode=True)

@@ -334,6 +339,16 @@ def test_compact_partition_incremental(
     compacted_delta_locator = round_completion_info.compacted_delta_locator
     tables = ds.download_delta(compacted_delta_locator, **ds_mock_kwargs)
     compacted_table = pa.concat_tables(tables)
+
+    # the compacted table may contain multiple files and chunks
+    # and order of records may be incorrect due to multiple files.
+    expected_result = expected_result.combine_chunks().sort_by(
+        [(val, "ascending") for val in primary_keys_param]
+    )
+    compacted_table = compacted_table.combine_chunks().sort_by(
+        [(val, "ascending") for val in primary_keys_param]
+    )
+
     assert compacted_table.equals(
         expected_result
     ), f"{compacted_table} does not match {expected_result}"
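The test now compares the compacted output order-insensitively, since the compacted delta can span multiple files whose rows arrive in any order. A standalone sketch of the same comparison pattern:

```python
import pyarrow as pa

# Sorting both tables by the primary keys makes equals() ignore row order and chunking.
primary_keys = ["pk"]
expected = pa.table({"pk": [1, 2, 3], "value": ["a", "b", "c"]})
actual = pa.concat_tables(
    [pa.table({"pk": [3], "value": ["c"]}), pa.table({"pk": [1, 2], "value": ["a", "b"]})]
)

sort_keys = [(pk, "ascending") for pk in primary_keys]
assert not actual.equals(expected)  # differs only by row order and chunking
assert actual.combine_chunks().sort_by(sort_keys).equals(
    expected.combine_chunks().sort_by(sort_keys)
)
```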
deltacat/tests/compute/testcases.py
CHANGED

@@ -12,13 +12,18 @@ from deltacat.tests.compute.common import (
 from deltacat.compute.compactor.compaction_session import (
     compact_partition_from_request as compact_partition_v1,
 )
+from deltacat.compute.compactor_v2.compaction_session import (
+    compact_partition as compact_partition_v2,
+)
 
 
 def create_tests_cases_for_all_compactor_versions(test_cases: Dict[str, List]):
     final_cases = {}
-    for version, compact_partition_func in enumerate(
+    for version, compact_partition_func in enumerate(
+        [compact_partition_v1, compact_partition_v2]
+    ):
         for case_name, case_value in test_cases.items():
-            final_cases[f"{case_name}_v{version}"] = [
+            final_cases[f"{case_name}_v{version + 1}"] = [
                 *case_value,
                 compact_partition_func,
             ]
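A runnable sketch of how this loop fans each case out across both compactor versions; the v1/v2 stubs stand in for the imported compaction functions and are not deltacat's implementations.

```python
def compact_partition_v1(**kwargs): ...
def compact_partition_v2(**kwargs): ...


def create_tests_cases_for_all_compactor_versions(test_cases):
    final_cases = {}
    for version, compact_partition_func in enumerate(
        [compact_partition_v1, compact_partition_v2]
    ):
        for case_name, case_value in test_cases.items():
            # enumerate() starts at 0, so "+ 1" produces the human-facing _v1/_v2 suffix
            final_cases[f"{case_name}_v{version + 1}"] = [*case_value, compact_partition_func]
    return final_cases


print(list(create_tests_cases_for_all_compactor_versions({"basic": [1]}).keys()))
# ['basic_v1', 'basic_v2']
```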
deltacat/tests/test_utils/pyarrow.py
CHANGED

@@ -1,24 +1,42 @@
 from typing import List
 import pyarrow as pa
-from deltacat.storage import Delta
+from deltacat.storage import Delta, Partition
 import deltacat.tests.local_deltacat_storage as ds
 
 
 def create_delta_from_csv_file(
     namespace: str, file_paths: List[str], *args, **kwargs
 ) -> Delta:
-
+    staged_partition = stage_partition_from_csv_file(
+        namespace, file_paths, *args, **kwargs
+    )
 
-
-
-
+    committed_delta = commit_delta_to_staged_partition(
+        staged_partition, file_paths, *args, **kwargs
+    )
 
+    return committed_delta
+
+
+def stage_partition_from_csv_file(
+    namespace: str, file_paths: List[str], *args, **kwargs
+) -> Partition:
     ds.create_namespace(namespace, {}, **kwargs)
     table_name = "-".join(file_paths).replace("/", "_")
     ds.create_table_version(namespace, table_name, "1", **kwargs)
     stream = ds.get_stream(namespace, table_name, "1", **kwargs)
     staged_partition = ds.stage_partition(stream, [], **kwargs)
+    return staged_partition
 
+
+def commit_delta_to_staged_partition(
+    staged_partition, file_paths: List[str], *args, **kwargs
+) -> Delta:
+    tables = []
+
+    for file_path in file_paths:
+        table = pa.csv.read_csv(file_path)
+        tables.append(table)
     deltas = []
 
     for table in tables:

@@ -28,5 +46,4 @@ def create_delta_from_csv_file(
     merged_delta = Delta.merge_deltas(deltas=deltas)
     committed_delta = ds.commit_delta(merged_delta, **kwargs)
     ds.commit_partition(staged_partition, **kwargs)
-
     return committed_delta

deltacat/types/partial_download.py
CHANGED

@@ -38,6 +38,7 @@ class PartialParquetParameters(PartialFileDownloadParams):
         num_row_groups = pq_metadata.num_row_groups
         row_groups_to_download = [rg for rg in range(num_row_groups)]
         in_memory_size_bytes = 0.0
+        num_rows = pq_metadata.num_rows
 
         for rg in row_groups_to_download:
             row_group_meta = pq_metadata.row_group(rg)
deltacat/types/tables.py
CHANGED

@@ -4,6 +4,7 @@ from typing import Callable, Dict, Type, Union
 import numpy as np
 import pandas as pd
 import pyarrow as pa
+import pyarrow.parquet as papq
 from ray.data.dataset import Dataset
 from ray.data.read_api import (
     from_arrow,

@@ -49,6 +50,7 @@ TABLE_CLASS_TO_SIZE_FUNC: Dict[
     Type[Union[dcs.LocalTable, dcs.DistributedDataset]], Callable
 ] = {
     pa.Table: pa_utils.table_size,
+    papq.ParquetFile: pa_utils.parquet_file_size,
     pd.DataFrame: pd_utils.dataframe_size,
     np.ndarray: np_utils.ndarray_size,
     Dataset: ds_utils.dataset_size,

@@ -56,18 +58,21 @@ TABLE_CLASS_TO_SIZE_FUNC: Dict[
 
 TABLE_CLASS_TO_TABLE_TYPE: Dict[Type[dcs.LocalTable], str] = {
     pa.Table: TableType.PYARROW.value,
+    papq.ParquetFile: TableType.PYARROW_PARQUET.value,
     pd.DataFrame: TableType.PANDAS.value,
     np.ndarray: TableType.NUMPY.value,
 }
 
 TABLE_TYPE_TO_DATASET_CREATE_FUNC: Dict[str, Callable] = {
     TableType.PYARROW.value: from_arrow,
+    TableType.PYARROW_PARQUET.value: from_arrow,
     TableType.NUMPY.value: from_numpy,
     TableType.PANDAS.value: from_pandas,
 }
 
 TABLE_TYPE_TO_DATASET_CREATE_FUNC_REFS: Dict[str, Callable] = {
     TableType.PYARROW.value: from_arrow_refs,
+    TableType.PYARROW_PARQUET.value: from_arrow_refs,
     TableType.NUMPY.value: from_numpy,
     TableType.PANDAS.value: from_pandas_refs,
 }
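These registries key behavior off the Python class of a local table, so adding `papq.ParquetFile` entries is enough to teach the rest of the library about the new table type. A hedged sketch of that dispatch pattern; the dict below uses stand-in lambdas, not deltacat's actual size functions.

```python
from typing import Callable, Dict, Type

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as papq

# Illustrative registry mapping a table class to a size function.
TABLE_CLASS_TO_SIZE_FUNC: Dict[Type, Callable] = {
    pa.Table: lambda t: t.nbytes,
    papq.ParquetFile: lambda f: f.metadata.serialized_size,
    pd.DataFrame: lambda df: int(df.memory_usage(deep=True).sum()),
}


def table_size(table) -> int:
    size_func = TABLE_CLASS_TO_SIZE_FUNC.get(type(table))
    if size_func is None:
        raise ValueError(f"No size function registered for {type(table)}")
    return size_func(table)


print(table_size(pa.table({"a": [1, 2, 3]})))
```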
deltacat/utils/arguments.py
CHANGED

@@ -1,5 +1,4 @@
 import inspect
-import copy
 from typing import Any, Dict
 
 

@@ -13,7 +12,7 @@ def sanitize_kwargs_to_callable(callable: Any, kwargs: Dict) -> Dict:
     signature = inspect.signature(callable)
     params = signature.parameters
 
-    new_kwargs =
+    new_kwargs = {**kwargs}
 
     for key in params:
         if params[key].kind == inspect.Parameter.VAR_KEYWORD:
deltacat/utils/pyarrow.py
CHANGED

@@ -294,6 +294,7 @@ def s3_partial_parquet_file_to_table(
         content_type=content_type,
         content_encoding=content_encoding,
         partial_file_download_params=partial_file_download_params,
+        pa_read_func_kwargs_provider=pa_read_func_kwargs_provider,
         **s3_client_kwargs,
     )
 

@@ -488,6 +489,10 @@ def table_size(table: pa.Table) -> int:
     return table.nbytes
 
 
+def parquet_file_size(table: papq.ParquetFile) -> int:
+    return table.metadata.serialized_size
+
+
 def table_to_file(
     table: pa.Table,
     base_path: str,