deltacat 0.1.18b13__py3-none-any.whl → 0.1.18b15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. deltacat/__init__.py +3 -2
  2. deltacat/aws/clients.py +123 -3
  3. deltacat/aws/redshift/model/manifest.py +4 -0
  4. deltacat/aws/s3u.py +24 -1
  5. deltacat/benchmarking/benchmark_parquet_reads.py +53 -0
  6. deltacat/benchmarking/conftest.py +61 -0
  7. deltacat/catalog/delegate.py +1 -1
  8. deltacat/catalog/interface.py +1 -1
  9. deltacat/compute/compactor/__init__.py +0 -3
  10. deltacat/compute/compactor/compaction_session.py +45 -20
  11. deltacat/compute/compactor/model/compact_partition_params.py +287 -58
  12. deltacat/compute/compactor/model/compaction_session_audit_info.py +150 -9
  13. deltacat/compute/compactor/model/delta_annotated.py +91 -9
  14. deltacat/compute/compactor/model/delta_file_envelope.py +15 -3
  15. deltacat/compute/compactor/model/primary_key_index.py +1 -1
  16. deltacat/compute/compactor/model/round_completion_info.py +17 -1
  17. deltacat/compute/compactor/repartition_session.py +5 -3
  18. deltacat/compute/compactor/steps/dedupe.py +10 -8
  19. deltacat/compute/compactor/steps/hash_bucket.py +25 -4
  20. deltacat/compute/compactor/steps/materialize.py +11 -6
  21. deltacat/compute/compactor/steps/repartition.py +16 -1
  22. deltacat/compute/compactor/utils/io.py +40 -23
  23. deltacat/compute/compactor/utils/primary_key_index.py +1 -15
  24. deltacat/compute/compactor/utils/sort_key.py +57 -0
  25. deltacat/compute/compactor/utils/system_columns.py +43 -0
  26. deltacat/compute/compactor_v2/compaction_session.py +506 -0
  27. deltacat/compute/compactor_v2/constants.py +34 -0
  28. deltacat/compute/compactor_v2/model/__init__.py +0 -0
  29. deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
  30. deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
  31. deltacat/compute/compactor_v2/model/merge_input.py +127 -0
  32. deltacat/compute/compactor_v2/model/merge_result.py +12 -0
  33. deltacat/compute/compactor_v2/steps/__init__.py +0 -0
  34. deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
  35. deltacat/compute/compactor_v2/steps/merge.py +41 -0
  36. deltacat/compute/compactor_v2/utils/__init__.py +0 -0
  37. deltacat/compute/compactor_v2/utils/content_type_params.py +37 -0
  38. deltacat/compute/compactor_v2/utils/io.py +149 -0
  39. deltacat/compute/compactor_v2/utils/primary_key_index.py +308 -0
  40. deltacat/compute/compactor_v2/utils/task_options.py +228 -0
  41. deltacat/compute/metastats/meta_stats.py +4 -2
  42. deltacat/compute/metastats/stats.py +1 -0
  43. deltacat/compute/metastats/utils/io.py +4 -0
  44. deltacat/compute/stats/utils/io.py +20 -5
  45. deltacat/exceptions.py +4 -0
  46. deltacat/io/memcached_object_store.py +37 -14
  47. deltacat/logs.py +4 -3
  48. deltacat/storage/__init__.py +3 -0
  49. deltacat/storage/interface.py +11 -2
  50. deltacat/storage/model/sort_key.py +33 -0
  51. deltacat/storage/model/table_version.py +11 -0
  52. deltacat/storage/model/types.py +2 -1
  53. deltacat/tests/aws/__init__.py +0 -0
  54. deltacat/tests/aws/test_clients.py +80 -0
  55. deltacat/tests/compute/__init__.py +0 -0
  56. deltacat/tests/compute/common.py +96 -0
  57. deltacat/tests/compute/compactor/__init__.py +0 -0
  58. deltacat/tests/compute/compactor/steps/__init__.py +0 -0
  59. deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +22 -8
  60. deltacat/tests/compute/compactor/utils/__init__.py +0 -0
  61. deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
  62. deltacat/tests/compute/compactor_v2/__init__.py +0 -0
  63. deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
  64. deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +199 -0
  65. deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
  66. deltacat/tests/compute/test_compaction_session_incremental.py +348 -0
  67. deltacat/tests/compute/testcases.py +390 -0
  68. deltacat/tests/io/test_memcached_object_store.py +5 -4
  69. deltacat/tests/local_deltacat_storage/__init__.py +1109 -0
  70. deltacat/tests/test_utils/pyarrow.py +32 -0
  71. deltacat/tests/test_utils/utils.py +13 -0
  72. deltacat/tests/utils/data/__init__.py +0 -0
  73. deltacat/tests/utils/test_daft.py +76 -0
  74. deltacat/tests/utils/test_pyarrow.py +133 -0
  75. deltacat/tests/utils/test_resources.py +23 -20
  76. deltacat/types/media.py +1 -0
  77. deltacat/types/partial_download.py +82 -0
  78. deltacat/types/tables.py +1 -0
  79. deltacat/utils/arguments.py +26 -0
  80. deltacat/utils/daft.py +87 -0
  81. deltacat/utils/performance.py +4 -2
  82. deltacat/utils/placement.py +20 -3
  83. deltacat/utils/pyarrow.py +213 -1
  84. deltacat/utils/ray_utils/concurrency.py +26 -1
  85. deltacat/utils/resources.py +72 -1
  86. deltacat/utils/s3fs.py +21 -0
  87. {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/METADATA +27 -13
  88. deltacat-0.1.18b15.dist-info/RECORD +176 -0
  89. {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/WHEEL +1 -1
  90. deltacat/compute/compactor/model/sort_key.py +0 -98
  91. deltacat-0.1.18b13.dist-info/RECORD +0 -136
  92. /deltacat/{tests/compactor → benchmarking}/__init__.py +0 -0
  93. /deltacat/{tests/compactor/utils → compute/compactor_v2}/__init__.py +0 -0
  94. {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/LICENSE +0 -0
  95. {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,390 @@
1
+ import pyarrow as pa
2
+ from typing import Dict, List
3
+ from deltacat.tests.compute.common import (
4
+ MAX_RECORDS_PER_FILE,
5
+ offer_iso8601_timestamp_list,
6
+ )
7
+ from deltacat.tests.compute.common import (
8
+ BASE_TEST_SOURCE_TABLE_VERSION,
9
+ BASE_TEST_DESTINATION_TABLE_VERSION,
10
+ HASH_BUCKET_COUNT,
11
+ )
12
+ from deltacat.compute.compactor.compaction_session import (
13
+ compact_partition_from_request as compact_partition_v1,
14
+ )
15
+
16
+
17
def create_tests_cases_for_all_compactor_versions(test_cases: Dict[str, List]):
    """Expand every test case into one entry per compactor entry point.

    Each key of ``test_cases`` is suffixed with ``_v<index>``, where
    ``<index>`` is the zero-based position of the compactor entry point in the
    list of implementations below, and that entry point is appended as the
    last element of a fresh list holding the case's parameters.

    Args:
        test_cases: Mapping of test-case name to its positional parameters.

    Returns:
        A new dict mapping each suffixed name to a list made of the original
        parameters followed by the compactor entry point.
    """
    # Only one implementation is listed, so every generated name ends in "_v0".
    compactors = [compact_partition_v1]
    expanded = {}
    for index, compactor in enumerate(compactors):
        expanded.update(
            (f"{name}_v{index}", [*params, compactor])
            for name, params in test_cases.items()
        )
    return expanded
27
+
28
+
29
+ """
30
+ TODO Test Cases:
31
+ 1. Incremental with/without round completion file
32
+ 2. Backfill with/without round completion file
33
+ 3. Rebase with/without round completion file
34
+ 4. Rebase then incremental (use same round completion file)
35
+ """
36
+
37
+
38
# Test cases keyed by name. Each value is a positional list laid out as:
#   0: source table version            9: expected result (pyarrow Table)
#   1: destination table version      10: validation callback function
#   2: primary key columns            11: validation callback kwargs
#   3: sort key columns               12: teardown local deltacat storage db
#   4: partition keys                 13: use previously compacted table
#   5: column names                   14: create placement group
#   6: arrow arrays (input columns)   15: records per compacted file
#   7: rebase source partition locator 16: hash bucket count
#   8: partition values
# NOTE(review): the case numbering jumps from 5 to 7; confirm whether a case 6
# was dropped intentionally.
INCREMENTAL_INDEPENDENT_TEST_CASES = {
    # String primary key, no sort keys, one partition key; the expected table
    # is built from the same expression as the input column.
    # NOTE(review): this is the only case that passes the primary keys as a
    # set and a non-empty partition key list; the others use lists and [].
    "1-incremental-pkstr-sknone-norcf": [
        BASE_TEST_SOURCE_TABLE_VERSION,
        BASE_TEST_DESTINATION_TABLE_VERSION,
        {"pk_col_1"},  # Primary key columns
        [],  # Sort key columns
        [{"key_name": "region_id", "key_type": "int"}],  # Partition keys
        ["pk_col_1"],  # column_names
        [pa.array([str(i) for i in range(10)])],  # arrow arrays
        None,  # rebase_source_partition_locator_param
        ["1"],  # partition_values_param
        pa.Table.from_arrays(
            [pa.array([str(i) for i in range(10)])],
            names=["pk_col_1"],  # expected_result
        ),
        None,  # validation_callback_func
        None,  # validation_callback_func_kwargs
        True,  # teardown_local_deltacat_storage_db
        False,  # use_prev_compacted
        True,  # create_placement_group_param
        MAX_RECORDS_PER_FILE,  # records_per_compacted_file_param
        HASH_BUCKET_COUNT,  # hash_bucket_count_param
    ],
    # String primary key plus one sort key column; ten distinct keys, expected
    # table built from the same expressions as the input.
    "2-incremental-pkstr-skstr-norcf": [
        BASE_TEST_SOURCE_TABLE_VERSION,
        BASE_TEST_DESTINATION_TABLE_VERSION,
        ["pk_col_1"],
        [
            {
                "key_name": "sk_col_1",
            }
        ],
        [],
        ["pk_col_1", "sk_col_1"],
        [pa.array([str(i) for i in range(10)]), pa.array(["test"] * 10)],
        None,
        ["1"],
        pa.Table.from_arrays(
            [pa.array([str(i) for i in range(10)]), pa.array(["test"] * 10)],
            names=["pk_col_1", "sk_col_1"],
        ),
        None,
        None,
        True,
        False,
        True,
        MAX_RECORDS_PER_FILE,
        HASH_BUCKET_COUNT,
    ],
    # String primary key plus two sort key columns; ten distinct keys, expected
    # table built from the same expressions as the input.
    "3-incremental-pkstr-multiskstr-norcf": [
        BASE_TEST_SOURCE_TABLE_VERSION,
        BASE_TEST_DESTINATION_TABLE_VERSION,
        ["pk_col_1"],
        [
            {
                "key_name": "sk_col_1",
            },
            {
                "key_name": "sk_col_2",
            },
        ],
        [],
        ["pk_col_1", "sk_col_1", "sk_col_2"],
        [
            pa.array([str(i) for i in range(10)]),
            pa.array(["test"] * 10),
            pa.array(["foo"] * 10),
        ],
        None,
        ["1"],
        pa.Table.from_arrays(
            [
                pa.array([str(i) for i in range(10)]),
                pa.array(["test"] * 10),
                pa.array(["foo"] * 10),
            ],
            names=["pk_col_1", "sk_col_1", "sk_col_2"],
        ),
        None,
        None,
        True,
        False,
        True,
        MAX_RECORDS_PER_FILE,
        HASH_BUCKET_COUNT,
    ],
    # Primary key "6" appears five times in the input (with sk_col_1 "5".."9");
    # the expected table holds a single "6" row, the one with sk_col_1 "9".
    "4-incremental-duplicate-pk": [
        BASE_TEST_SOURCE_TABLE_VERSION,
        BASE_TEST_DESTINATION_TABLE_VERSION,
        ["pk_col_1"],
        [
            {
                "key_name": "sk_col_1",
            },
            {
                "key_name": "sk_col_2",
            },
        ],
        [],
        ["pk_col_1", "sk_col_1", "sk_col_2"],
        [
            pa.array([str(i) for i in range(5)] + ["6", "6", "6", "6", "6"]),
            pa.array([str(i) for i in range(10)]),
            pa.array(["foo"] * 10),
        ],
        None,
        ["1"],
        pa.Table.from_arrays(
            [
                pa.array([str(i) for i in range(5)] + ["6"]),
                pa.array([str(i) for i in range(5)] + ["9"]),
                pa.array(["foo"] * 6),
            ],
            names=["pk_col_1", "sk_col_1", "sk_col_2"],
        ),
        None,
        None,
        True,
        False,
        True,
        MAX_RECORDS_PER_FILE,
        HASH_BUCKET_COUNT,
    ],
    # Primary key values are the Python floats i / 10 for i in 0..9, all
    # distinct; expected table built from the same expressions as the input.
    "5-incremental-decimal-pk-simple": [
        BASE_TEST_SOURCE_TABLE_VERSION,
        BASE_TEST_DESTINATION_TABLE_VERSION,
        ["pk_col_1"],
        [
            {
                "key_name": "sk_col_1",
            },
        ],
        [],
        ["pk_col_1", "sk_col_1"],
        [
            pa.array([i / 10 for i in range(0, 10)]),
            pa.array([str(i) for i in range(10)]),
        ],
        None,
        ["1"],
        pa.Table.from_arrays(
            [
                pa.array([i / 10 for i in range(0, 10)]),
                pa.array([str(i) for i in range(10)]),
            ],
            names=["pk_col_1", "sk_col_1"],
        ),
        None,
        None,
        True,
        False,
        True,
        MAX_RECORDS_PER_FILE,
        HASH_BUCKET_COUNT,
    ],
    # Primary key values are the integers 0..9, all distinct; expected table
    # built from the same expressions as the input.
    "7-incremental-integer-pk-simple": [
        BASE_TEST_SOURCE_TABLE_VERSION,
        BASE_TEST_DESTINATION_TABLE_VERSION,
        ["pk_col_1"],
        [
            {
                "key_name": "sk_col_1",
            },
        ],
        [],
        ["pk_col_1", "sk_col_1"],
        [
            pa.array([i for i in range(0, 10)]),
            pa.array([str(i) for i in range(10)]),
        ],
        None,
        ["1"],
        pa.Table.from_arrays(
            [
                pa.array([i for i in range(0, 10)]),
                pa.array([str(i) for i in range(10)]),
            ],
            names=["pk_col_1", "sk_col_1"],
        ),
        None,
        None,
        True,
        False,
        True,
        MAX_RECORDS_PER_FILE,
        HASH_BUCKET_COUNT,
    ],
    # Primary key values come from offer_iso8601_timestamp_list(10, "minutes")
    # (presumably ten ISO 8601 timestamps one minute apart; verify against the
    # helper in deltacat.tests.compute.common).
    "8-incremental-timestamp-pk-simple": [
        BASE_TEST_SOURCE_TABLE_VERSION,
        BASE_TEST_DESTINATION_TABLE_VERSION,
        ["pk_col_1"],
        [
            {
                "key_name": "sk_col_1",
            },
        ],
        [],
        ["pk_col_1", "sk_col_1"],
        [
            pa.array(offer_iso8601_timestamp_list(10, "minutes")),
            pa.array([str(i) for i in range(10)]),
        ],
        None,
        ["1"],
        pa.Table.from_arrays(
            [
                pa.array(offer_iso8601_timestamp_list(10, "minutes")),
                pa.array([str(i) for i in range(10)]),
            ],
            names=["pk_col_1", "sk_col_1"],
        ),
        None,
        None,
        True,
        False,
        True,
        MAX_RECORDS_PER_FILE,
        HASH_BUCKET_COUNT,
    ],
    # Two primary key columns (floats i / 10 and timestamps) over twenty rows;
    # the sort key column repeats each of 0.1..0.5 four times. The expected
    # table is built from the same expressions as the input.
    "9-incremental-decimal-timestamp-pk-multi": [
        BASE_TEST_SOURCE_TABLE_VERSION,
        BASE_TEST_DESTINATION_TABLE_VERSION,
        ["pk_col_1", "pk_col_2"],
        [
            {
                "key_name": "sk_col_1",
            },
        ],
        [],
        ["pk_col_1", "pk_col_2", "sk_col_1"],
        [
            pa.array([i / 10 for i in range(0, 20)]),
            pa.array(offer_iso8601_timestamp_list(20, "minutes")),
            pa.array([0.1] * 4 + [0.2] * 4 + [0.3] * 4 + [0.4] * 4 + [0.5] * 4),
        ],
        None,
        ["1"],
        pa.Table.from_arrays(
            [
                pa.array([i / 10 for i in range(0, 20)]),
                pa.array(offer_iso8601_timestamp_list(20, "minutes")),
                pa.array([0.1] * 4 + [0.2] * 4 + [0.3] * 4 + [0.4] * 4 + [0.5] * 4),
            ],
            names=["pk_col_1", "pk_col_2", "sk_col_1"],
        ),
        None,
        None,
        True,
        False,
        True,
        MAX_RECORDS_PER_FILE,
        HASH_BUCKET_COUNT,
    ],
    # Each primary key value 0.1..0.5 appears four times while sk_col_1 counts
    # down from 19 to 0; the expected table keeps one row per key, carrying the
    # largest sk_col_1 of its group (19, 15, 11, 7, 3).
    "10-incremental-decimal-pk-multi-dup": [
        BASE_TEST_SOURCE_TABLE_VERSION,
        BASE_TEST_DESTINATION_TABLE_VERSION,
        ["pk_col_1"],
        [
            {
                "key_name": "sk_col_1",
            },
        ],
        [],
        ["pk_col_1", "sk_col_1"],
        [
            pa.array([0.1] * 4 + [0.2] * 4 + [0.3] * 4 + [0.4] * 4 + [0.5] * 4),
            pa.array(reversed([i for i in range(20)])),
        ],
        None,
        ["1"],
        pa.Table.from_arrays(
            [
                pa.array([0.1, 0.2, 0.3, 0.4, 0.5]),
                pa.array([19, 15, 11, 7, 3]),
            ],
            names=["pk_col_1", "sk_col_1"],
        ),
        None,
        None,
        True,
        False,
        True,
        MAX_RECORDS_PER_FILE,
        HASH_BUCKET_COUNT,
    ],
}
324
+
325
+ """
326
+ for test_name, (
327
+ source_table_version,
328
+ destination_table_version,
329
+ primary_keys_param,
330
+ sort_keys_param,
331
+ partition_keys_param,
332
+ column_names_param,
333
+ arrow_arrays_param,
334
+ rebase_source_partition_locator_param,
335
+ partition_values_param,
336
+ expected_result,
337
+ validation_callback_func,
338
+ validation_callback_func_kwargs,
339
+ do_teardown_local_deltacat_storage_db,
340
+ use_prev_compacted,
341
+ create_placement_group_param,
342
+ records_per_compacted_file_param,
343
+ hash_bucket_count_param,
344
+ ) in INCREMENTAL_TEST_CASES.items()
345
+ """
346
+
347
# TODO: Add test cases where the next test case depends on the compacted table
# produced by the previous test case still existing.
# NOTE(review): unlike the independent cases, this value is a tuple rather than
# a list and sets the teardown flag to False. The input has no repeated
# pk_col_1 values, and the expected table is built from the same expressions
# as the input arrays.
INCREMENTAL_DEPENDENT_TEST_CASES = {
    "11-incremental-multi-dup-retain-table": (
        BASE_TEST_SOURCE_TABLE_VERSION,  # source_table_version
        BASE_TEST_DESTINATION_TABLE_VERSION,  # destination_table_version
        ["pk_col_1", "pk_col_2"],  # primary_keys_param
        [
            {
                "key_name": "sk_col_1",
            },
        ],  # sort_keys_param
        [],  # partition_keys_param
        ["pk_col_1", "pk_col_2", "sk_col_1"],  # column_names_param
        [
            pa.array([i / 10 for i in range(0, 20)]),
            pa.array(offer_iso8601_timestamp_list(20, "minutes")),
            pa.array([0.1] * 4 + [0.2] * 4 + [0.3] * 4 + [0.4] * 4 + [0.5] * 4),
        ],  # arrow_arrays_param
        None,  # rebase_source_partition_locator_param
        ["1"],  # partition_values_param
        pa.Table.from_arrays(
            [
                pa.array([i / 10 for i in range(0, 20)]),
                pa.array(offer_iso8601_timestamp_list(20, "minutes")),
                pa.array([0.1] * 4 + [0.2] * 4 + [0.3] * 4 + [0.4] * 4 + [0.5] * 4),
            ],
            names=["pk_col_1", "pk_col_2", "sk_col_1"],
        ),  # expected_result
        None,  # validation_callback_func
        None,  # validation_callback_func_kwargs
        False,  # do_teardown_local_deltacat_storage_db
        False,  # use_prev_compacted
        True,  # create_placement_group_param
        MAX_RECORDS_PER_FILE,  # records_per_compacted_file_param
        HASH_BUCKET_COUNT,  # hash_bucket_count_param
    ),
}
384
+
385
# Merge the independent and dependent cases into one dict (on a duplicate name
# the dependent entry would replace the independent one) and expand it with
# create_tests_cases_for_all_compactor_versions. Each resulting name gains a
# "_v<index>" suffix and each value gains the compactor entry point as an extra
# trailing element after hash_bucket_count_param.
INCREMENTAL_TEST_CASES = create_tests_cases_for_all_compactor_versions(
    {
        **INCREMENTAL_INDEPENDENT_TEST_CASES,
        **INCREMENTAL_DEPENDENT_TEST_CASES,
    }
)
@@ -11,7 +11,9 @@ class TestMemcachedObjectStore(unittest.TestCase):
11
11
  def setUp(self):
12
12
  from deltacat.io.memcached_object_store import MemcachedObjectStore
13
13
 
14
- self.object_store = MemcachedObjectStore()
14
+ self.object_store = MemcachedObjectStore(
15
+ storage_node_ips=["172.1.1.1", "172.2.2.2", "172.3.3.3"]
16
+ )
15
17
 
16
18
  @mock.patch("deltacat.io.memcached_object_store.Client")
17
19
  @mock.patch("deltacat.io.memcached_object_store.RetryingClient")
@@ -29,11 +31,10 @@ class TestMemcachedObjectStore(unittest.TestCase):
29
31
  mock_retrying_client.return_value = mock_client.return_value
30
32
  mock_client.return_value.set_many.return_value = []
31
33
 
32
- result = self.object_store.put_many(["a", "b"])
34
+ result = self.object_store.put_many(["a", "b", "c"])
33
35
 
34
- self.assertEqual(2, len(result))
36
+ self.assertEqual(3, len(result))
35
37
  self.assertRegex(result[0], ".*_.*")
36
- self.assertEqual(1, mock_client.return_value.set_many.call_count)
37
38
 
38
39
  @mock.patch("deltacat.io.memcached_object_store.Client")
39
40
  @mock.patch("deltacat.io.memcached_object_store.RetryingClient")