deltacat 0.1.18b14__py3-none-any.whl → 0.1.18b15__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
- deltacat/__init__.py +1 -1
- deltacat/aws/clients.py +17 -6
- deltacat/aws/redshift/model/manifest.py +4 -0
- deltacat/aws/s3u.py +24 -1
- deltacat/compute/compactor/compaction_session.py +42 -18
- deltacat/compute/compactor/model/compact_partition_params.py +287 -58
- deltacat/compute/compactor/model/compaction_session_audit_info.py +150 -9
- deltacat/compute/compactor/model/delta_annotated.py +91 -9
- deltacat/compute/compactor/model/delta_file_envelope.py +14 -2
- deltacat/compute/compactor/model/round_completion_info.py +17 -1
- deltacat/compute/compactor/repartition_session.py +2 -1
- deltacat/compute/compactor/steps/dedupe.py +9 -6
- deltacat/compute/compactor/steps/hash_bucket.py +24 -3
- deltacat/compute/compactor/steps/materialize.py +11 -6
- deltacat/compute/compactor/steps/repartition.py +16 -1
- deltacat/compute/compactor/utils/io.py +40 -23
- deltacat/compute/compactor/utils/sort_key.py +5 -0
- deltacat/compute/compactor/utils/system_columns.py +43 -0
- deltacat/compute/compactor_v2/compaction_session.py +506 -0
- deltacat/compute/compactor_v2/constants.py +34 -0
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
- deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
- deltacat/compute/compactor_v2/model/merge_input.py +127 -0
- deltacat/compute/compactor_v2/model/merge_result.py +12 -0
- deltacat/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
- deltacat/compute/compactor_v2/steps/merge.py +41 -0
- deltacat/compute/compactor_v2/utils/__init__.py +0 -0
- deltacat/compute/compactor_v2/utils/content_type_params.py +37 -0
- deltacat/compute/compactor_v2/utils/io.py +149 -0
- deltacat/compute/compactor_v2/utils/primary_key_index.py +308 -0
- deltacat/compute/compactor_v2/utils/task_options.py +228 -0
- deltacat/compute/metastats/meta_stats.py +4 -2
- deltacat/compute/metastats/stats.py +1 -0
- deltacat/compute/metastats/utils/io.py +4 -0
- deltacat/compute/stats/utils/io.py +20 -5
- deltacat/exceptions.py +4 -0
- deltacat/io/memcached_object_store.py +37 -14
- deltacat/logs.py +4 -3
- deltacat/storage/interface.py +8 -1
- deltacat/storage/model/types.py +2 -1
- deltacat/tests/aws/test_clients.py +16 -3
- deltacat/tests/compute/__init__.py +0 -0
- deltacat/tests/compute/common.py +96 -0
- deltacat/tests/compute/compactor/__init__.py +0 -0
- deltacat/tests/compute/compactor/steps/__init__.py +0 -0
- deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +22 -8
- deltacat/tests/compute/compactor/utils/__init__.py +0 -0
- deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
- deltacat/tests/compute/compactor_v2/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +199 -0
- deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
- deltacat/tests/compute/test_compaction_session_incremental.py +348 -0
- deltacat/tests/compute/testcases.py +390 -0
- deltacat/tests/io/test_memcached_object_store.py +5 -4
- deltacat/tests/local_deltacat_storage/__init__.py +62 -19
- deltacat/tests/test_utils/pyarrow.py +32 -0
- deltacat/tests/test_utils/utils.py +13 -0
- deltacat/tests/utils/data/__init__.py +0 -0
- deltacat/tests/utils/test_daft.py +76 -0
- deltacat/tests/utils/test_pyarrow.py +133 -0
- deltacat/tests/utils/test_resources.py +23 -20
- deltacat/types/media.py +1 -0
- deltacat/types/partial_download.py +82 -0
- deltacat/types/tables.py +1 -0
- deltacat/utils/arguments.py +26 -0
- deltacat/utils/daft.py +87 -0
- deltacat/utils/placement.py +20 -3
- deltacat/utils/pyarrow.py +213 -1
- deltacat/utils/ray_utils/concurrency.py +26 -1
- deltacat/utils/resources.py +72 -1
- deltacat/utils/s3fs.py +21 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/METADATA +17 -3
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/RECORD +80 -47
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/WHEEL +1 -1
- /deltacat/{tests/compactor → compute/compactor_v2}/__init__.py +0 -0
- /deltacat/{tests/compactor/utils → compute/compactor_v2/model}/__init__.py +0 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b15.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,390 @@
+import pyarrow as pa
+from typing import Dict, List
+from deltacat.tests.compute.common import (
+    MAX_RECORDS_PER_FILE,
+    offer_iso8601_timestamp_list,
+)
+from deltacat.tests.compute.common import (
+    BASE_TEST_SOURCE_TABLE_VERSION,
+    BASE_TEST_DESTINATION_TABLE_VERSION,
+    HASH_BUCKET_COUNT,
+)
+from deltacat.compute.compactor.compaction_session import (
+    compact_partition_from_request as compact_partition_v1,
+)
+
+
+def create_tests_cases_for_all_compactor_versions(test_cases: Dict[str, List]):
+    final_cases = {}
+    for version, compact_partition_func in enumerate([compact_partition_v1]):
+        for case_name, case_value in test_cases.items():
+            final_cases[f"{case_name}_v{version}"] = [
+                *case_value,
+                compact_partition_func,
+            ]
+
+    return final_cases
+
+
+"""
+TODO Test Cases:
+1. incremental w/wout round completion file
+2. Backfill w/wout round completion
+3. Rebase w/wout round completion file
+4. Rebase then incremental (use same round completion file)
+"""
+
+
+INCREMENTAL_INDEPENDENT_TEST_CASES = {
+    "1-incremental-pkstr-sknone-norcf": [
+        BASE_TEST_SOURCE_TABLE_VERSION,
+        BASE_TEST_DESTINATION_TABLE_VERSION,
+        {"pk_col_1"},  # Primary key columns
+        [],  # Sort key columns
+        [{"key_name": "region_id", "key_type": "int"}],  # Partition keys
+        ["pk_col_1"],  # column_names
+        [pa.array([str(i) for i in range(10)])],  # arrow arrays
+        None,  # rebase_source_partition_locator_param
+        ["1"],  # partition_values_param
+        pa.Table.from_arrays(
+            [pa.array([str(i) for i in range(10)])],
+            names=["pk_col_1"],  # expected_result
+        ),
+        None,  # validation_callback_func
+        None,  # validation_callback_func_kwargs
+        True,  # teardown_local_deltacat_storage_db
+        False,  # use_prev_compacted
+        True,  # create_placement_group_param
+        MAX_RECORDS_PER_FILE,  # records_per_compacted_file_param
+        HASH_BUCKET_COUNT,  # hash_bucket_count_param
+    ],
+    "2-incremental-pkstr-skstr-norcf": [
+        BASE_TEST_SOURCE_TABLE_VERSION,
+        BASE_TEST_DESTINATION_TABLE_VERSION,
+        ["pk_col_1"],
+        [
+            {
+                "key_name": "sk_col_1",
+            }
+        ],
+        [],
+        ["pk_col_1", "sk_col_1"],
+        [pa.array([str(i) for i in range(10)]), pa.array(["test"] * 10)],
+        None,
+        ["1"],
+        pa.Table.from_arrays(
+            [pa.array([str(i) for i in range(10)]), pa.array(["test"] * 10)],
+            names=["pk_col_1", "sk_col_1"],
+        ),
+        None,
+        None,
+        True,
+        False,
+        True,
+        MAX_RECORDS_PER_FILE,
+        HASH_BUCKET_COUNT,
+    ],
+    "3-incremental-pkstr-multiskstr-norcf": [
+        BASE_TEST_SOURCE_TABLE_VERSION,
+        BASE_TEST_DESTINATION_TABLE_VERSION,
+        ["pk_col_1"],
+        [
+            {
+                "key_name": "sk_col_1",
+            },
+            {
+                "key_name": "sk_col_2",
+            },
+        ],
+        [],
+        ["pk_col_1", "sk_col_1", "sk_col_2"],
+        [
+            pa.array([str(i) for i in range(10)]),
+            pa.array(["test"] * 10),
+            pa.array(["foo"] * 10),
+        ],
+        None,
+        ["1"],
+        pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array(["test"] * 10),
+                pa.array(["foo"] * 10),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2"],
+        ),
+        None,
+        None,
+        True,
+        False,
+        True,
+        MAX_RECORDS_PER_FILE,
+        HASH_BUCKET_COUNT,
+    ],
+    "4-incremental-duplicate-pk": [
+        BASE_TEST_SOURCE_TABLE_VERSION,
+        BASE_TEST_DESTINATION_TABLE_VERSION,
+        ["pk_col_1"],
+        [
+            {
+                "key_name": "sk_col_1",
+            },
+            {
+                "key_name": "sk_col_2",
+            },
+        ],
+        [],
+        ["pk_col_1", "sk_col_1", "sk_col_2"],
+        [
+            pa.array([str(i) for i in range(5)] + ["6", "6", "6", "6", "6"]),
+            pa.array([str(i) for i in range(10)]),
+            pa.array(["foo"] * 10),
+        ],
+        None,
+        ["1"],
+        pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(5)] + ["6"]),
+                pa.array([str(i) for i in range(5)] + ["9"]),
+                pa.array(["foo"] * 6),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2"],
+        ),
+        None,
+        None,
+        True,
+        False,
+        True,
+        MAX_RECORDS_PER_FILE,
+        HASH_BUCKET_COUNT,
+    ],
+    "5-incremental-decimal-pk-simple": [
+        BASE_TEST_SOURCE_TABLE_VERSION,
+        BASE_TEST_DESTINATION_TABLE_VERSION,
+        ["pk_col_1"],
+        [
+            {
+                "key_name": "sk_col_1",
+            },
+        ],
+        [],
+        ["pk_col_1", "sk_col_1"],
+        [
+            pa.array([i / 10 for i in range(0, 10)]),
+            pa.array([str(i) for i in range(10)]),
+        ],
+        None,
+        ["1"],
+        pa.Table.from_arrays(
+            [
+                pa.array([i / 10 for i in range(0, 10)]),
+                pa.array([str(i) for i in range(10)]),
+            ],
+            names=["pk_col_1", "sk_col_1"],
+        ),
+        None,
+        None,
+        True,
+        False,
+        True,
+        MAX_RECORDS_PER_FILE,
+        HASH_BUCKET_COUNT,
+    ],
+    "7-incremental-integer-pk-simple": [
+        BASE_TEST_SOURCE_TABLE_VERSION,
+        BASE_TEST_DESTINATION_TABLE_VERSION,
+        ["pk_col_1"],
+        [
+            {
+                "key_name": "sk_col_1",
+            },
+        ],
+        [],
+        ["pk_col_1", "sk_col_1"],
+        [
+            pa.array([i for i in range(0, 10)]),
+            pa.array([str(i) for i in range(10)]),
+        ],
+        None,
+        ["1"],
+        pa.Table.from_arrays(
+            [
+                pa.array([i for i in range(0, 10)]),
+                pa.array([str(i) for i in range(10)]),
+            ],
+            names=["pk_col_1", "sk_col_1"],
+        ),
+        None,
+        None,
+        True,
+        False,
+        True,
+        MAX_RECORDS_PER_FILE,
+        HASH_BUCKET_COUNT,
+    ],
+    "8-incremental-timestamp-pk-simple": [
+        BASE_TEST_SOURCE_TABLE_VERSION,
+        BASE_TEST_DESTINATION_TABLE_VERSION,
+        ["pk_col_1"],
+        [
+            {
+                "key_name": "sk_col_1",
+            },
+        ],
+        [],
+        ["pk_col_1", "sk_col_1"],
+        [
+            pa.array(offer_iso8601_timestamp_list(10, "minutes")),
+            pa.array([str(i) for i in range(10)]),
+        ],
+        None,
+        ["1"],
+        pa.Table.from_arrays(
+            [
+                pa.array(offer_iso8601_timestamp_list(10, "minutes")),
+                pa.array([str(i) for i in range(10)]),
+            ],
+            names=["pk_col_1", "sk_col_1"],
+        ),
+        None,
+        None,
+        True,
+        False,
+        True,
+        MAX_RECORDS_PER_FILE,
+        HASH_BUCKET_COUNT,
+    ],
+    "9-incremental-decimal-timestamp-pk-multi": [
+        BASE_TEST_SOURCE_TABLE_VERSION,
+        BASE_TEST_DESTINATION_TABLE_VERSION,
+        ["pk_col_1", "pk_col_2"],
+        [
+            {
+                "key_name": "sk_col_1",
+            },
+        ],
+        [],
+        ["pk_col_1", "pk_col_2", "sk_col_1"],
+        [
+            pa.array([i / 10 for i in range(0, 20)]),
+            pa.array(offer_iso8601_timestamp_list(20, "minutes")),
+            pa.array([0.1] * 4 + [0.2] * 4 + [0.3] * 4 + [0.4] * 4 + [0.5] * 4),
+        ],
+        None,
+        ["1"],
+        pa.Table.from_arrays(
+            [
+                pa.array([i / 10 for i in range(0, 20)]),
+                pa.array(offer_iso8601_timestamp_list(20, "minutes")),
+                pa.array([0.1] * 4 + [0.2] * 4 + [0.3] * 4 + [0.4] * 4 + [0.5] * 4),
+            ],
+            names=["pk_col_1", "pk_col_2", "sk_col_1"],
+        ),
+        None,
+        None,
+        True,
+        False,
+        True,
+        MAX_RECORDS_PER_FILE,
+        HASH_BUCKET_COUNT,
+    ],
+    "10-incremental-decimal-pk-multi-dup": [
+        BASE_TEST_SOURCE_TABLE_VERSION,
+        BASE_TEST_DESTINATION_TABLE_VERSION,
+        ["pk_col_1"],
+        [
+            {
+                "key_name": "sk_col_1",
+            },
+        ],
+        [],
+        ["pk_col_1", "sk_col_1"],
+        [
+            pa.array([0.1] * 4 + [0.2] * 4 + [0.3] * 4 + [0.4] * 4 + [0.5] * 4),
+            pa.array(reversed([i for i in range(20)])),
+        ],
+        None,
+        ["1"],
+        pa.Table.from_arrays(
+            [
+                pa.array([0.1, 0.2, 0.3, 0.4, 0.5]),
+                pa.array([19, 15, 11, 7, 3]),
+            ],
+            names=["pk_col_1", "sk_col_1"],
+        ),
+        None,
+        None,
+        True,
+        False,
+        True,
+        MAX_RECORDS_PER_FILE,
+        HASH_BUCKET_COUNT,
+    ],
+}
+
+"""
+for test_name, (
+    source_table_version,
+    destination_table_version,
+    primary_keys_param,
+    sort_keys_param,
+    partition_keys_param,
+    column_names_param,
+    arrow_arrays_param,
+    rebase_source_partition_locator_param,
+    partition_values_param,
+    expected_result,
+    validation_callback_func,
+    validation_callback_func_kwargs,
+    do_teardown_local_deltacat_storage_db,
+    use_prev_compacted,
+    create_placement_group_param,
+    records_per_compacted_file_param,
+    hash_bucket_count_param,
+) in INCREMENTAL_TEST_CASES.items()
+"""
+
+# TODO: Add test cases where next tc is dependent on the previous compacted table existing
+INCREMENTAL_DEPENDENT_TEST_CASES = {
+    "11-incremental-multi-dup-retain-table": (
+        BASE_TEST_SOURCE_TABLE_VERSION,
+        BASE_TEST_DESTINATION_TABLE_VERSION,
+        ["pk_col_1", "pk_col_2"],
+        [
+            {
+                "key_name": "sk_col_1",
+            },
+        ],
+        [],
+        ["pk_col_1", "pk_col_2", "sk_col_1"],
+        [
+            pa.array([i / 10 for i in range(0, 20)]),
+            pa.array(offer_iso8601_timestamp_list(20, "minutes")),
+            pa.array([0.1] * 4 + [0.2] * 4 + [0.3] * 4 + [0.4] * 4 + [0.5] * 4),
+        ],
+        None,
+        ["1"],
+        pa.Table.from_arrays(
+            [
+                pa.array([i / 10 for i in range(0, 20)]),
+                pa.array(offer_iso8601_timestamp_list(20, "minutes")),
+                pa.array([0.1] * 4 + [0.2] * 4 + [0.3] * 4 + [0.4] * 4 + [0.5] * 4),
+            ],
+            names=["pk_col_1", "pk_col_2", "sk_col_1"],
+        ),
+        None,
+        None,
+        False,
+        False,
+        True,
+        MAX_RECORDS_PER_FILE,
+        HASH_BUCKET_COUNT,
+    ),
+}
+
+INCREMENTAL_TEST_CASES = create_tests_cases_for_all_compactor_versions(
+    {
+        **INCREMENTAL_INDEPENDENT_TEST_CASES,
+        **INCREMENTAL_DEPENDENT_TEST_CASES,
+    }
+)
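For context on how testcases.py above is consumed: create_tests_cases_for_all_compactor_versions suffixes each case name with a compactor version index and appends that version's compact_partition entry point as the final list element. A minimal sketch of a parametrized consumer, assuming pytest (the test function below is hypothetical, not part of this release):

```python
import pytest
from deltacat.tests.compute.testcases import INCREMENTAL_TEST_CASES


@pytest.mark.parametrize(
    "case_name,case_params", list(INCREMENTAL_TEST_CASES.items())
)
def test_compact_partition_case(case_name, case_params):
    # The final element appended by create_tests_cases_for_all_compactor_versions
    # is the versioned compact_partition function; everything before it is the
    # positional fixture data documented in the commented docstring above.
    *fixture_params, compact_partition_func = case_params
    assert callable(compact_partition_func)
```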
@@ -11,7 +11,9 @@ class TestMemcachedObjectStore(unittest.TestCase):
     def setUp(self):
         from deltacat.io.memcached_object_store import MemcachedObjectStore

-        self.object_store = MemcachedObjectStore(
+        self.object_store = MemcachedObjectStore(
+            storage_node_ips=["172.1.1.1", "172.2.2.2", "172.3.3.3"]
+        )

     @mock.patch("deltacat.io.memcached_object_store.Client")
     @mock.patch("deltacat.io.memcached_object_store.RetryingClient")
@@ -29,11 +31,10 @@ class TestMemcachedObjectStore(unittest.TestCase):
         mock_retrying_client.return_value = mock_client.return_value
         mock_client.return_value.set_many.return_value = []

-        result = self.object_store.put_many(["a", "b"])
+        result = self.object_store.put_many(["a", "b", "c"])

-        self.assertEqual(
+        self.assertEqual(3, len(result))
         self.assertRegex(result[0], ".*_.*")
-        self.assertEqual(1, mock_client.return_value.set_many.call_count)

     @mock.patch("deltacat.io.memcached_object_store.Client")
     @mock.patch("deltacat.io.memcached_object_store.RetryingClient")
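The setUp change above reflects that MemcachedObjectStore is now constructed with an explicit list of storage node IPs, and the updated assertions show put_many returning one reference per stored object. A small usage sketch under those assumptions (the IP addresses are placeholders):

```python
from deltacat.io.memcached_object_store import MemcachedObjectStore

# Placeholder node IPs; objects are distributed across these memcached nodes.
store = MemcachedObjectStore(storage_node_ips=["10.0.0.1", "10.0.0.2"])

refs = store.put_many(["a", "b", "c"])  # one ref per object, matching ".*_.*"
objects = store.get_many(refs)          # round-trips the stored objects
```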
@@ -181,16 +181,22 @@ def list_deltas(
     partition = get_partition(stream.locator, partition_values, *args, **kwargs)

     all_deltas = list_partition_deltas(
-        partition,
+        partition,
+        first_stream_position=first_stream_position,
+        last_stream_position=last_stream_position,
+        ascending_order=ascending_order,
+        include_manifest=include_manifest,
+        *args,
+        **kwargs,
     ).all_items()

     result = []

     for delta in all_deltas:
         if (
-            not first_stream_position or first_stream_position
+            not first_stream_position or first_stream_position < delta.stream_position
         ) and (
-            not last_stream_position or
+            not last_stream_position or delta.stream_position <= last_stream_position
         ):
             result.append(delta)

@@ -202,16 +208,38 @@ def list_deltas(


 def list_partition_deltas(
-
+    partition_like: Union[Partition, PartitionLocator],
+    first_stream_position: Optional[int] = None,
+    last_stream_position: Optional[int] = None,
+    ascending_order: bool = False,
+    include_manifest: bool = False,
+    *args,
+    **kwargs,
 ) -> ListResult[Delta]:
     cur, con = _get_sqlite3_cursor_con(kwargs)

-    if
+    if partition_like is None:
         return ListResult.of([], None, None)

+    if first_stream_position is None:
+        first_stream_position = 0
+
+    if last_stream_position is None:
+        last_stream_position = float("inf")
+
+    assert isinstance(partition_like, Partition) or isinstance(
+        partition_like, PartitionLocator
+    ), f"Expected a Partition or PartitionLocator as an input argument but found {partition_like}"
+
+    partition_locator = None
+    if isinstance(partition_like, Partition):
+        partition_locator = partition_like.locator
+    else:
+        partition_locator = partition_like
+
     res = cur.execute(
         "SELECT * FROM deltas WHERE partition_locator = ?",
-        (
+        (partition_locator.canonical_string(),),
     )

     serialized_items = res.fetchall()
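As the new signature above shows, list_partition_deltas now accepts either a Partition or a PartitionLocator, plus optional stream-position bounds that default to (0, infinity). A usage sketch against the local test storage; deltas_in_range is a hypothetical helper, and kwargs stands for whatever sqlite connection arguments the caller already threads through:

```python
import deltacat.tests.local_deltacat_storage as ds
from deltacat.storage import Partition


def deltas_in_range(partition: Partition, lo: int, hi: int, **kwargs):
    # Per the range filter in the next hunk, hi is inclusive
    # (first_stream_position <= stream_position <= last_stream_position).
    return ds.list_partition_deltas(
        partition,
        first_stream_position=lo,
        last_stream_position=hi,
        ascending_order=True,
        **kwargs,
    ).all_items()
```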
@@ -222,12 +250,19 @@ def list_partition_deltas(
     result = []
     for item in serialized_items:
         current_delta = Delta(json.loads(item[2]))
-
+        if (
+            first_stream_position
+            <= current_delta.stream_position
+            <= last_stream_position
+        ):
+            result.append(current_delta)

         if not include_manifest:
             current_delta.manifest = None

-    result.sort(
+    result.sort(
+        reverse=True if not ascending_order else False, key=lambda d: d.stream_position
+    )
     return ListResult.of(result, None, None)

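A note on the sort added above: `reverse=True if not ascending_order else False` is equivalent to the more direct `reverse=not ascending_order`, i.e.:

```python
result.sort(key=lambda d: d.stream_position, reverse=not ascending_order)
```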
@@ -334,7 +369,6 @@ def download_delta_manifest_entry(
     cur, con = _get_sqlite3_cursor_con(kwargs)

     manifest = get_delta_manifest(delta_like, *args, **kwargs)
-
     if entry_index >= len(manifest.entries):
         raise IndexError(
             f"Manifest entry index {entry_index} does not exist. "
@@ -352,7 +386,6 @@ def download_delta_manifest_entry(
     )

     serialized_data = serialized_data[0]
-
     if entry.meta.content_type == ContentType.PARQUET:
         if table_type == TableType.PYARROW_PARQUET:
             table = pa.parquet.ParquetFile(io.BytesIO(serialized_data))
@@ -388,18 +421,17 @@ def download_delta_manifest_entry(

 def get_delta_manifest(
     delta_like: Union[Delta, DeltaLocator], *args, **kwargs
-) -> Manifest:
+) -> Optional[Manifest]:
     delta = get_delta(
-        delta_like.namespace,
-        delta_like.table_name,
-        delta_like.stream_position,
-        delta_like.partition_values,
-        delta_like.table_version,
-        True,
+        namespace=delta_like.namespace,
+        table_name=delta_like.table_name,
+        stream_position=delta_like.stream_position,
+        partition_values=delta_like.partition_values,
+        table_version=delta_like.table_version,
+        include_manifest=True,
         *args,
         **kwargs,
     )
-
     if not delta:
         return None

@@ -462,7 +494,6 @@ def create_table_version(
     cur, con = _get_sqlite3_cursor_con(kwargs)

     latest_version = get_latest_table_version(namespace, table_name, *args, **kwargs)
-
     if (
         table_version is not None
         and latest_version
@@ -762,7 +793,18 @@ def commit_partition(partition: Partition, *args, **kwargs) -> Partition:
     params = (json.dumps(pv_partition), pv_partition.locator.canonical_string())
     cur.execute("UPDATE partitions SET value = ? WHERE locator = ?", params)

+    deltas = list_partition_deltas(partition, *args, **kwargs).all_items()
+    deltas.sort(reverse=True, key=lambda x: x.stream_position)
+
+    stream_position = partition.stream_position
+    if deltas:
+        stream_position = deltas[0].stream_position
+
     partition.state = CommitState.COMMITTED
+    partition.stream_position = stream_position
+    partition.previous_stream_position = (
+        pv_partition.stream_position if pv_partition else None
+    )
     params = (json.dumps(partition), partition.locator.canonical_string())
     cur.execute("UPDATE partitions SET value = ? WHERE locator = ?", params)
     con.commit()
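The commit_partition change above rolls the partition's stream position forward to its highest committed delta and records the prior committed partition's position. A sketch of the resulting invariant, using the names from the hunk (illustrative only, not library code):

```python
deltas = list_partition_deltas(partition, **kwargs).all_items()
if deltas:
    # After commit, the partition points at its most recent delta,
    assert partition.stream_position == max(d.stream_position for d in deltas)
# and previous_stream_position is the previously committed partition's
# stream position, or None when no prior committed partition existed.
```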
@@ -1032,6 +1074,7 @@ def get_stream(
     *args,
     **kwargs,
 ) -> Optional[Stream]:
+    assert not isinstance(table_version, int), f"Passed an integer as the table version"
     obj = get_table_version(namespace, table_name, table_version, *args, **kwargs)

     if obj is None:
@@ -0,0 +1,32 @@
+from typing import List
+import pyarrow as pa
+from deltacat.storage import Delta
+import deltacat.tests.local_deltacat_storage as ds
+
+
+def create_delta_from_csv_file(
+    namespace: str, file_paths: List[str], *args, **kwargs
+) -> Delta:
+    tables = []
+
+    for file_path in file_paths:
+        table = pa.csv.read_csv(file_path)
+        tables.append(table)
+
+    ds.create_namespace(namespace, {}, **kwargs)
+    table_name = "-".join(file_paths).replace("/", "_")
+    ds.create_table_version(namespace, table_name, "1", **kwargs)
+    stream = ds.get_stream(namespace, table_name, "1", **kwargs)
+    staged_partition = ds.stage_partition(stream, [], **kwargs)
+
+    deltas = []
+
+    for table in tables:
+        delta = ds.stage_delta(table, staged_partition, **kwargs)
+        deltas.append(delta)
+
+    merged_delta = Delta.merge_deltas(deltas=deltas)
+    committed_delta = ds.commit_delta(merged_delta, **kwargs)
+    ds.commit_partition(staged_partition, **kwargs)
+
+    return committed_delta
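A hypothetical call to the new create_delta_from_csv_file helper above; the namespace, CSV path, and sqlite kwargs are placeholders (the exact kwargs depend on how deltacat.tests.local_deltacat_storage resolves its sqlite connection):

```python
from deltacat.tests.test_utils.pyarrow import create_delta_from_csv_file

# Placeholder sqlite connection kwargs for the local test storage.
ds_kwargs = {"db_file_path": "deltacat/tests/local_deltacat_storage/db_test.sqlite"}

delta = create_delta_from_csv_file("test_namespace", ["data/sample.csv"], **ds_kwargs)
```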
@@ -0,0 +1,13 @@
+# Allow classes to use self-referencing Type hints in Python 3.7.
+from __future__ import annotations
+import json
+from typing import Any, Dict
+from boto3.resources.base import ServiceResource
+
+
+def read_s3_contents(
+    s3_resource: ServiceResource, bucket_name: str, key: str
+) -> Dict[str, Any]:
+    response = s3_resource.Object(bucket_name, key).get()
+    file_content: str = response["Body"].read().decode("utf-8")
+    return json.loads(file_content)
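A usage sketch for the new read_s3_contents helper, assuming moto's mocked S3 as the test double (bucket and key names are placeholders):

```python
import boto3
import moto

from deltacat.tests.test_utils.utils import read_s3_contents

with moto.mock_s3():
    s3 = boto3.resource("s3", region_name="us-east-1")
    s3.create_bucket(Bucket="test-bucket")
    s3.Object("test-bucket", "audit.json").put(Body=b'{"ok": true}')
    assert read_s3_contents(s3, "test-bucket", "audit.json") == {"ok": True}
```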
File without changes
@@ -0,0 +1,76 @@
+import unittest
+from deltacat.types.media import ContentEncoding, ContentType
+from deltacat.utils.daft import daft_s3_file_to_table
+
+from deltacat.utils.pyarrow import ReadKwargsProviderPyArrowSchemaOverride
+from deltacat.types.partial_download import PartialParquetParameters
+import pyarrow as pa
+
+from pyarrow import parquet as pq
+
+
+class TestDaftParquetReader(unittest.TestCase):
+    MVP_PATH = "deltacat/tests/utils/data/mvp.parquet"
+
+    def test_read_from_s3_all_columns(self):
+        table = daft_s3_file_to_table(
+            self.MVP_PATH,
+            content_encoding=ContentEncoding.IDENTITY.value,
+            content_type=ContentType.PARQUET.value,
+        )
+        self.assertEqual(table.schema.names, ["a", "b"])
+        self.assertEqual(table.num_rows, 100)
+
+    def test_read_from_s3_single_column_via_include_columns(self):
+        table = daft_s3_file_to_table(
+            self.MVP_PATH,
+            content_encoding=ContentEncoding.IDENTITY.value,
+            content_type=ContentType.PARQUET.value,
+            include_columns=["b"],
+        )
+        self.assertEqual(table.schema.names, ["b"])
+        self.assertEqual(table.num_rows, 100)
+
+    def test_read_from_s3_single_column_via_column_names(self):
+        table = daft_s3_file_to_table(
+            self.MVP_PATH,
+            content_encoding=ContentEncoding.IDENTITY.value,
+            content_type=ContentType.PARQUET.value,
+            column_names=["b"],
+        )
+        self.assertEqual(table.schema.names, ["b"])
+        self.assertEqual(table.num_rows, 100)
+
+    def test_read_from_s3_single_column_with_schema(self):
+        schema = pa.schema([("a", pa.int64()), ("b", pa.string())])
+        pa_read_func_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
+            schema=schema
+        )
+        table = daft_s3_file_to_table(
+            self.MVP_PATH,
+            content_encoding=ContentEncoding.IDENTITY.value,
+            content_type=ContentType.PARQUET.value,
+            include_columns=["b"],
+            pa_read_func_kwargs_provider=pa_read_func_kwargs_provider,
+        )
+        self.assertEqual(table.schema.names, ["b"])
+        self.assertEqual(table.num_rows, 100)
+
+    def test_read_from_s3_single_column_with_row_groups(self):
+
+        metadata = pq.read_metadata(self.MVP_PATH)
+        ppp = PartialParquetParameters.of(pq_metadata=metadata)
+        ppp["row_groups_to_download"] = ppp.row_groups_to_download[1:2]
+        table = daft_s3_file_to_table(
+            self.MVP_PATH,
+            content_encoding=ContentEncoding.IDENTITY.value,
+            content_type=ContentType.PARQUET.value,
+            column_names=["b"],
+            partial_file_download_params=ppp,
+        )
+        self.assertEqual(table.schema.names, ["b"])
+        self.assertEqual(table.num_rows, 10)
+
+
+if __name__ == "__main__":
+    unittest.main()
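Why num_rows == 10 in the final test above: the slice ppp.row_groups_to_download[1:2] keeps a single row group, so only that group's rows are downloaded. Assuming mvp.parquet holds its 100 rows in 10 equal row groups, one group contributes 100 / 10 = 10 rows; the file's metadata confirms the arithmetic:

```python
from pyarrow import parquet as pq

metadata = pq.read_metadata("deltacat/tests/utils/data/mvp.parquet")
print(metadata.num_rows, metadata.num_row_groups)  # e.g. 100, 10 (assumed layout)
print(metadata.row_group(1).num_rows)              # rows in the one downloaded group
```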