deltacat 0.1.18b14__py3-none-any.whl → 0.1.18b16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/clients.py +17 -6
  3. deltacat/aws/redshift/model/manifest.py +4 -0
  4. deltacat/aws/s3u.py +24 -1
  5. deltacat/compute/compactor/compaction_session.py +42 -18
  6. deltacat/compute/compactor/model/compact_partition_params.py +297 -58
  7. deltacat/compute/compactor/model/compaction_session_audit_info.py +163 -9
  8. deltacat/compute/compactor/model/delta_annotated.py +95 -9
  9. deltacat/compute/compactor/model/delta_file_envelope.py +14 -2
  10. deltacat/compute/compactor/model/round_completion_info.py +17 -1
  11. deltacat/compute/compactor/repartition_session.py +4 -1
  12. deltacat/compute/compactor/steps/dedupe.py +9 -6
  13. deltacat/compute/compactor/steps/hash_bucket.py +24 -3
  14. deltacat/compute/compactor/steps/materialize.py +11 -6
  15. deltacat/compute/compactor/steps/repartition.py +22 -1
  16. deltacat/compute/compactor/utils/io.py +40 -23
  17. deltacat/compute/compactor/utils/sort_key.py +5 -0
  18. deltacat/compute/compactor/utils/system_columns.py +43 -0
  19. deltacat/compute/compactor_v2/compaction_session.py +509 -0
  20. deltacat/compute/compactor_v2/constants.py +37 -0
  21. deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
  22. deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
  23. deltacat/compute/compactor_v2/model/merge_input.py +143 -0
  24. deltacat/compute/compactor_v2/model/merge_result.py +12 -0
  25. deltacat/compute/compactor_v2/steps/__init__.py +0 -0
  26. deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
  27. deltacat/compute/compactor_v2/steps/merge.py +469 -0
  28. deltacat/compute/compactor_v2/utils/__init__.py +0 -0
  29. deltacat/compute/compactor_v2/utils/content_type_params.py +66 -0
  30. deltacat/compute/compactor_v2/utils/dedupe.py +58 -0
  31. deltacat/compute/compactor_v2/utils/io.py +152 -0
  32. deltacat/compute/compactor_v2/utils/primary_key_index.py +341 -0
  33. deltacat/compute/compactor_v2/utils/task_options.py +221 -0
  34. deltacat/compute/metastats/meta_stats.py +4 -2
  35. deltacat/compute/metastats/stats.py +1 -0
  36. deltacat/compute/metastats/utils/io.py +4 -0
  37. deltacat/compute/stats/utils/io.py +20 -5
  38. deltacat/exceptions.py +4 -0
  39. deltacat/io/memcached_object_store.py +37 -14
  40. deltacat/logs.py +4 -3
  41. deltacat/storage/interface.py +8 -1
  42. deltacat/storage/model/types.py +2 -1
  43. deltacat/tests/aws/test_clients.py +16 -3
  44. deltacat/tests/compute/__init__.py +0 -0
  45. deltacat/tests/compute/common.py +96 -0
  46. deltacat/tests/compute/compactor/__init__.py +0 -0
  47. deltacat/tests/compute/compactor/steps/__init__.py +0 -0
  48. deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +34 -8
  49. deltacat/tests/compute/compactor/utils/__init__.py +0 -0
  50. deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
  51. deltacat/tests/compute/compactor_v2/__init__.py +0 -0
  52. deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
  53. deltacat/tests/compute/test_compaction_session_incremental.py +363 -0
  54. deltacat/tests/compute/testcases.py +395 -0
  55. deltacat/tests/io/test_memcached_object_store.py +5 -4
  56. deltacat/tests/local_deltacat_storage/__init__.py +62 -19
  57. deltacat/tests/test_utils/pyarrow.py +49 -0
  58. deltacat/tests/test_utils/utils.py +13 -0
  59. deltacat/tests/utils/data/__init__.py +0 -0
  60. deltacat/tests/utils/test_daft.py +76 -0
  61. deltacat/tests/utils/test_pyarrow.py +133 -0
  62. deltacat/tests/utils/test_resources.py +23 -20
  63. deltacat/types/media.py +1 -0
  64. deltacat/types/partial_download.py +83 -0
  65. deltacat/types/tables.py +6 -0
  66. deltacat/utils/arguments.py +25 -0
  67. deltacat/utils/daft.py +87 -0
  68. deltacat/utils/placement.py +20 -3
  69. deltacat/utils/pyarrow.py +218 -1
  70. deltacat/utils/ray_utils/concurrency.py +26 -1
  71. deltacat/utils/resources.py +72 -1
  72. deltacat/utils/s3fs.py +21 -0
  73. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/METADATA +17 -3
  74. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/RECORD +79 -47
  75. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/WHEEL +1 -1
  76. /deltacat/{tests/compactor → compute/compactor_v2}/__init__.py +0 -0
  77. /deltacat/{tests/compactor/utils → compute/compactor_v2/model}/__init__.py +0 -0
  78. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/LICENSE +0 -0
  79. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/top_level.txt +0 -0
deltacat/tests/compute/testcases.py
@@ -0,0 +1,395 @@
+import pyarrow as pa
+from typing import Dict, List
+from deltacat.tests.compute.common import (
+    MAX_RECORDS_PER_FILE,
+    offer_iso8601_timestamp_list,
+)
+from deltacat.tests.compute.common import (
+    BASE_TEST_SOURCE_TABLE_VERSION,
+    BASE_TEST_DESTINATION_TABLE_VERSION,
+    HASH_BUCKET_COUNT,
+)
+from deltacat.compute.compactor.compaction_session import (
+    compact_partition_from_request as compact_partition_v1,
+)
+from deltacat.compute.compactor_v2.compaction_session import (
+    compact_partition as compact_partition_v2,
+)
+
+
+def create_tests_cases_for_all_compactor_versions(test_cases: Dict[str, List]):
+    final_cases = {}
+    for version, compact_partition_func in enumerate(
+        [compact_partition_v1, compact_partition_v2]
+    ):
+        for case_name, case_value in test_cases.items():
+            final_cases[f"{case_name}_v{version + 1}"] = [
+                *case_value,
+                compact_partition_func,
+            ]
+
+    return final_cases
+
+
+"""
+TODO Test Cases:
+1. incremental w/wout round completion file
+2. Backfill w/wout round completion
+3. Rebase w/wout round completion file
+4. Rebase then incremental (use same round completion file)
+"""
+
+
+INCREMENTAL_INDEPENDENT_TEST_CASES = {
+    "1-incremental-pkstr-sknone-norcf": [
+        BASE_TEST_SOURCE_TABLE_VERSION,
+        BASE_TEST_DESTINATION_TABLE_VERSION,
+        {"pk_col_1"},  # Primary key columns
+        [],  # Sort key columns
+        [{"key_name": "region_id", "key_type": "int"}],  # Partition keys
+        ["pk_col_1"],  # column_names
+        [pa.array([str(i) for i in range(10)])],  # arrow arrays
+        None,  # rebase_source_partition_locator_param
+        ["1"],  # partition_values_param
+        pa.Table.from_arrays(
+            [pa.array([str(i) for i in range(10)])],
+            names=["pk_col_1"],  # expected_result
+        ),
+        None,  # validation_callback_func
+        None,  # validation_callback_func_kwargs
+        True,  # teardown_local_deltacat_storage_db
+        False,  # use_prev_compacted
+        True,  # create_placement_group_param
+        MAX_RECORDS_PER_FILE,  # records_per_compacted_file_param
+        HASH_BUCKET_COUNT,  # hash_bucket_count_param
+    ],
+    "2-incremental-pkstr-skstr-norcf": [
+        BASE_TEST_SOURCE_TABLE_VERSION,
+        BASE_TEST_DESTINATION_TABLE_VERSION,
+        ["pk_col_1"],
+        [
+            {
+                "key_name": "sk_col_1",
+            }
+        ],
+        [],
+        ["pk_col_1", "sk_col_1"],
+        [pa.array([str(i) for i in range(10)]), pa.array(["test"] * 10)],
+        None,
+        ["1"],
+        pa.Table.from_arrays(
+            [pa.array([str(i) for i in range(10)]), pa.array(["test"] * 10)],
+            names=["pk_col_1", "sk_col_1"],
+        ),
+        None,
+        None,
+        True,
+        False,
+        True,
+        MAX_RECORDS_PER_FILE,
+        HASH_BUCKET_COUNT,
+    ],
+    "3-incremental-pkstr-multiskstr-norcf": [
+        BASE_TEST_SOURCE_TABLE_VERSION,
+        BASE_TEST_DESTINATION_TABLE_VERSION,
+        ["pk_col_1"],
+        [
+            {
+                "key_name": "sk_col_1",
+            },
+            {
+                "key_name": "sk_col_2",
+            },
+        ],
+        [],
+        ["pk_col_1", "sk_col_1", "sk_col_2"],
+        [
+            pa.array([str(i) for i in range(10)]),
+            pa.array(["test"] * 10),
+            pa.array(["foo"] * 10),
+        ],
+        None,
+        ["1"],
+        pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array(["test"] * 10),
+                pa.array(["foo"] * 10),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2"],
+        ),
+        None,
+        None,
+        True,
+        False,
+        True,
+        MAX_RECORDS_PER_FILE,
+        HASH_BUCKET_COUNT,
+    ],
+    "4-incremental-duplicate-pk": [
+        BASE_TEST_SOURCE_TABLE_VERSION,
+        BASE_TEST_DESTINATION_TABLE_VERSION,
+        ["pk_col_1"],
+        [
+            {
+                "key_name": "sk_col_1",
+            },
+            {
+                "key_name": "sk_col_2",
+            },
+        ],
+        [],
+        ["pk_col_1", "sk_col_1", "sk_col_2"],
+        [
+            pa.array([str(i) for i in range(5)] + ["6", "6", "6", "6", "6"]),
+            pa.array([str(i) for i in range(10)]),
+            pa.array(["foo"] * 10),
+        ],
+        None,
+        ["1"],
+        pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(5)] + ["6"]),
+                pa.array([str(i) for i in range(5)] + ["9"]),
+                pa.array(["foo"] * 6),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2"],
+        ),
+        None,
+        None,
+        True,
+        False,
+        True,
+        MAX_RECORDS_PER_FILE,
+        HASH_BUCKET_COUNT,
+    ],
+    "5-incremental-decimal-pk-simple": [
+        BASE_TEST_SOURCE_TABLE_VERSION,
+        BASE_TEST_DESTINATION_TABLE_VERSION,
+        ["pk_col_1"],
+        [
+            {
+                "key_name": "sk_col_1",
+            },
+        ],
+        [],
+        ["pk_col_1", "sk_col_1"],
+        [
+            pa.array([i / 10 for i in range(0, 10)]),
+            pa.array([str(i) for i in range(10)]),
+        ],
+        None,
+        ["1"],
+        pa.Table.from_arrays(
+            [
+                pa.array([i / 10 for i in range(0, 10)]),
+                pa.array([str(i) for i in range(10)]),
+            ],
+            names=["pk_col_1", "sk_col_1"],
+        ),
+        None,
+        None,
+        True,
+        False,
+        True,
+        MAX_RECORDS_PER_FILE,
+        HASH_BUCKET_COUNT,
+    ],
+    "7-incremental-integer-pk-simple": [
+        BASE_TEST_SOURCE_TABLE_VERSION,
+        BASE_TEST_DESTINATION_TABLE_VERSION,
+        ["pk_col_1"],
+        [
+            {
+                "key_name": "sk_col_1",
+            },
+        ],
+        [],
+        ["pk_col_1", "sk_col_1"],
+        [
+            pa.array([i for i in range(0, 10)]),
+            pa.array([str(i) for i in range(10)]),
+        ],
+        None,
+        ["1"],
+        pa.Table.from_arrays(
+            [
+                pa.array([i for i in range(0, 10)]),
+                pa.array([str(i) for i in range(10)]),
+            ],
+            names=["pk_col_1", "sk_col_1"],
+        ),
+        None,
+        None,
+        True,
+        False,
+        True,
+        MAX_RECORDS_PER_FILE,
+        HASH_BUCKET_COUNT,
+    ],
+    "8-incremental-timestamp-pk-simple": [
+        BASE_TEST_SOURCE_TABLE_VERSION,
+        BASE_TEST_DESTINATION_TABLE_VERSION,
+        ["pk_col_1"],
+        [
+            {
+                "key_name": "sk_col_1",
+            },
+        ],
+        [],
+        ["pk_col_1", "sk_col_1"],
+        [
+            pa.array(offer_iso8601_timestamp_list(10, "minutes")),
+            pa.array([str(i) for i in range(10)]),
+        ],
+        None,
+        ["1"],
+        pa.Table.from_arrays(
+            [
+                pa.array(offer_iso8601_timestamp_list(10, "minutes")),
+                pa.array([str(i) for i in range(10)]),
+            ],
+            names=["pk_col_1", "sk_col_1"],
+        ),
+        None,
+        None,
+        True,
+        False,
+        True,
+        MAX_RECORDS_PER_FILE,
+        HASH_BUCKET_COUNT,
+    ],
+    "9-incremental-decimal-timestamp-pk-multi": [
+        BASE_TEST_SOURCE_TABLE_VERSION,
+        BASE_TEST_DESTINATION_TABLE_VERSION,
+        ["pk_col_1", "pk_col_2"],
+        [
+            {
+                "key_name": "sk_col_1",
+            },
+        ],
+        [],
+        ["pk_col_1", "pk_col_2", "sk_col_1"],
+        [
+            pa.array([i / 10 for i in range(0, 20)]),
+            pa.array(offer_iso8601_timestamp_list(20, "minutes")),
+            pa.array([0.1] * 4 + [0.2] * 4 + [0.3] * 4 + [0.4] * 4 + [0.5] * 4),
+        ],
+        None,
+        ["1"],
+        pa.Table.from_arrays(
+            [
+                pa.array([i / 10 for i in range(0, 20)]),
+                pa.array(offer_iso8601_timestamp_list(20, "minutes")),
+                pa.array([0.1] * 4 + [0.2] * 4 + [0.3] * 4 + [0.4] * 4 + [0.5] * 4),
+            ],
+            names=["pk_col_1", "pk_col_2", "sk_col_1"],
+        ),
+        None,
+        None,
+        True,
+        False,
+        True,
+        MAX_RECORDS_PER_FILE,
+        HASH_BUCKET_COUNT,
+    ],
+    "10-incremental-decimal-pk-multi-dup": [
+        BASE_TEST_SOURCE_TABLE_VERSION,
+        BASE_TEST_DESTINATION_TABLE_VERSION,
+        ["pk_col_1"],
+        [
+            {
+                "key_name": "sk_col_1",
+            },
+        ],
+        [],
+        ["pk_col_1", "sk_col_1"],
+        [
+            pa.array([0.1] * 4 + [0.2] * 4 + [0.3] * 4 + [0.4] * 4 + [0.5] * 4),
+            pa.array(reversed([i for i in range(20)])),
+        ],
+        None,
+        ["1"],
+        pa.Table.from_arrays(
+            [
+                pa.array([0.1, 0.2, 0.3, 0.4, 0.5]),
+                pa.array([19, 15, 11, 7, 3]),
+            ],
+            names=["pk_col_1", "sk_col_1"],
+        ),
+        None,
+        None,
+        True,
+        False,
+        True,
+        MAX_RECORDS_PER_FILE,
+        HASH_BUCKET_COUNT,
+    ],
+}
+
+"""
+for test_name, (
+    source_table_version,
+    destination_table_version,
+    primary_keys_param,
+    sort_keys_param,
+    partition_keys_param,
+    column_names_param,
+    arrow_arrays_param,
+    rebase_source_partition_locator_param,
+    partition_values_param,
+    expected_result,
+    validation_callback_func,
+    validation_callback_func_kwargs,
+    do_teardown_local_deltacat_storage_db,
+    use_prev_compacted,
+    create_placement_group_param,
+    records_per_compacted_file_param,
+    hash_bucket_count_param,
+) in INCREMENTAL_TEST_CASES.items()
+"""
+
+# TODO: Add test cases where next tc is dependent on the previous compacted table existing
+INCREMENTAL_DEPENDENT_TEST_CASES = {
+    "11-incremental-multi-dup-retain-table": (
+        BASE_TEST_SOURCE_TABLE_VERSION,
+        BASE_TEST_DESTINATION_TABLE_VERSION,
+        ["pk_col_1", "pk_col_2"],
+        [
+            {
+                "key_name": "sk_col_1",
+            },
+        ],
+        [],
+        ["pk_col_1", "pk_col_2", "sk_col_1"],
+        [
+            pa.array([i / 10 for i in range(0, 20)]),
+            pa.array(offer_iso8601_timestamp_list(20, "minutes")),
+            pa.array([0.1] * 4 + [0.2] * 4 + [0.3] * 4 + [0.4] * 4 + [0.5] * 4),
+        ],
+        None,
+        ["1"],
+        pa.Table.from_arrays(
+            [
+                pa.array([i / 10 for i in range(0, 20)]),
+                pa.array(offer_iso8601_timestamp_list(20, "minutes")),
+                pa.array([0.1] * 4 + [0.2] * 4 + [0.3] * 4 + [0.4] * 4 + [0.5] * 4),
+            ],
+            names=["pk_col_1", "pk_col_2", "sk_col_1"],
+        ),
+        None,
+        None,
+        False,
+        False,
+        True,
+        MAX_RECORDS_PER_FILE,
+        HASH_BUCKET_COUNT,
+    ),
+}
+
+INCREMENTAL_TEST_CASES = create_tests_cases_for_all_compactor_versions(
+    {
+        **INCREMENTAL_INDEPENDENT_TEST_CASES,
+        **INCREMENTAL_DEPENDENT_TEST_CASES,
+    }
+)
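Reviewer note (not part of the diff): `create_tests_cases_for_all_compactor_versions` appends the compactor entry point to each case and suffixes the case name with `_v1`/`_v2`, so one case definition exercises both compactor versions. A minimal sketch of how the resulting `INCREMENTAL_TEST_CASES` dict could be consumed; the pytest wiring and test body below are hypothetical:

```python
# Hypothetical consumer of INCREMENTAL_TEST_CASES (not part of this diff).
import pytest

from deltacat.tests.compute.testcases import INCREMENTAL_TEST_CASES


@pytest.mark.parametrize(
    "test_case",
    list(INCREMENTAL_TEST_CASES.values()),
    ids=list(INCREMENTAL_TEST_CASES.keys()),
)
def test_compact_partition(test_case):
    # The helper appends the compactor entry point last, so each "<case>_v1"
    # id runs compact_partition_v1 and each "<case>_v2" runs
    # compact_partition_v2 against the same fixture data.
    *case_params, compact_partition_func = test_case
    assert callable(compact_partition_func)
```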
deltacat/tests/io/test_memcached_object_store.py
@@ -11,7 +11,9 @@ class TestMemcachedObjectStore(unittest.TestCase):
     def setUp(self):
         from deltacat.io.memcached_object_store import MemcachedObjectStore
 
-        self.object_store = MemcachedObjectStore()
+        self.object_store = MemcachedObjectStore(
+            storage_node_ips=["172.1.1.1", "172.2.2.2", "172.3.3.3"]
+        )
 
     @mock.patch("deltacat.io.memcached_object_store.Client")
     @mock.patch("deltacat.io.memcached_object_store.RetryingClient")
@@ -29,11 +31,10 @@ class TestMemcachedObjectStore(unittest.TestCase):
         mock_retrying_client.return_value = mock_client.return_value
         mock_client.return_value.set_many.return_value = []
 
-        result = self.object_store.put_many(["a", "b"])
+        result = self.object_store.put_many(["a", "b", "c"])
 
-        self.assertEqual(2, len(result))
+        self.assertEqual(3, len(result))
         self.assertRegex(result[0], ".*_.*")
-        self.assertEqual(1, mock_client.return_value.set_many.call_count)
 
     @mock.patch("deltacat.io.memcached_object_store.Client")
     @mock.patch("deltacat.io.memcached_object_store.RetryingClient")
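For context (not in the diff): the constructor change above means callers must now supply the memcached node addresses up front. A minimal usage sketch with placeholder IPs:

```python
# Usage sketch; the node IPs are placeholders, not part of this diff.
from deltacat.io.memcached_object_store import MemcachedObjectStore

object_store = MemcachedObjectStore(
    storage_node_ips=["172.1.1.1", "172.2.2.2", "172.3.3.3"]
)
refs = object_store.put_many(["a", "b", "c"])
# One ref per stored object; the test above only asserts the "<id>_<suffix>" shape.
assert len(refs) == 3
```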
deltacat/tests/local_deltacat_storage/__init__.py
@@ -181,16 +181,22 @@ def list_deltas(
     partition = get_partition(stream.locator, partition_values, *args, **kwargs)
 
     all_deltas = list_partition_deltas(
-        partition, include_manifest, *args, **kwargs
+        partition,
+        first_stream_position=first_stream_position,
+        last_stream_position=last_stream_position,
+        ascending_order=ascending_order,
+        include_manifest=include_manifest,
+        *args,
+        **kwargs,
     ).all_items()
 
     result = []
 
     for delta in all_deltas:
         if (
-            not first_stream_position or first_stream_position <= delta.stream_position
+            not first_stream_position or first_stream_position < delta.stream_position
         ) and (
-            not last_stream_position or last_stream_position > delta.stream_position
+            not last_stream_position or delta.stream_position <= last_stream_position
         ):
             result.append(delta)
 
@@ -202,16 +208,38 @@ def list_deltas(
 
 
 def list_partition_deltas(
-    partition: Partition, include_manifest: bool = False, *args, **kwargs
+    partition_like: Union[Partition, PartitionLocator],
+    first_stream_position: Optional[int] = None,
+    last_stream_position: Optional[int] = None,
+    ascending_order: bool = False,
+    include_manifest: bool = False,
+    *args,
+    **kwargs,
 ) -> ListResult[Delta]:
     cur, con = _get_sqlite3_cursor_con(kwargs)
 
-    if partition is None:
+    if partition_like is None:
         return ListResult.of([], None, None)
 
+    if first_stream_position is None:
+        first_stream_position = 0
+
+    if last_stream_position is None:
+        last_stream_position = float("inf")
+
+    assert isinstance(partition_like, Partition) or isinstance(
+        partition_like, PartitionLocator
+    ), f"Expected a Partition or PartitionLocator as an input argument but found {partition_like}"
+
+    partition_locator = None
+    if isinstance(partition_like, Partition):
+        partition_locator = partition_like.locator
+    else:
+        partition_locator = partition_like
+
     res = cur.execute(
         "SELECT * FROM deltas WHERE partition_locator = ?",
-        (partition.locator.canonical_string(),),
+        (partition_locator.canonical_string(),),
     )
 
     serialized_items = res.fetchall()
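Not part of the diff: the reworked `list_partition_deltas` now accepts either a `Partition` or a `PartitionLocator` and filters by an inclusive stream-position range, defaulting to `0` and `+inf`. A call-shape sketch, where `partition` and `kwargs` are assumed to come from the surrounding test setup:

```python
# Call-shape sketch for the new signature (assumed caller context).
deltas = list_partition_deltas(
    partition,                 # Partition or PartitionLocator both work now
    first_stream_position=1,   # inclusive lower bound; defaults to 0
    last_stream_position=100,  # inclusive upper bound; defaults to +inf
    ascending_order=True,      # default False keeps the descending sort
    include_manifest=False,
    **kwargs,
).all_items()
```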
@@ -222,12 +250,19 @@ def list_partition_deltas(
     result = []
     for item in serialized_items:
         current_delta = Delta(json.loads(item[2]))
-        result.append(current_delta)
+        if (
+            first_stream_position
+            <= current_delta.stream_position
+            <= last_stream_position
+        ):
+            result.append(current_delta)
 
         if not include_manifest:
             current_delta.manifest = None
 
-    result.sort(reverse=True, key=lambda d: d.stream_position)
+    result.sort(
+        reverse=True if not ascending_order else False, key=lambda d: d.stream_position
+    )
     return ListResult.of(result, None, None)
 
 
@@ -334,7 +369,6 @@ def download_delta_manifest_entry(
     cur, con = _get_sqlite3_cursor_con(kwargs)
 
     manifest = get_delta_manifest(delta_like, *args, **kwargs)
-
     if entry_index >= len(manifest.entries):
         raise IndexError(
             f"Manifest entry index {entry_index} does not exist. "
@@ -352,7 +386,6 @@ def download_delta_manifest_entry(
     )
 
     serialized_data = serialized_data[0]
-
     if entry.meta.content_type == ContentType.PARQUET:
         if table_type == TableType.PYARROW_PARQUET:
             table = pa.parquet.ParquetFile(io.BytesIO(serialized_data))
@@ -388,18 +421,17 @@ def download_delta_manifest_entry(
 
 def get_delta_manifest(
     delta_like: Union[Delta, DeltaLocator], *args, **kwargs
-) -> Manifest:
+) -> Optional[Manifest]:
     delta = get_delta(
-        delta_like.namespace,
-        delta_like.table_name,
-        delta_like.stream_position,
-        delta_like.partition_values,
-        delta_like.table_version,
-        True,
+        namespace=delta_like.namespace,
+        table_name=delta_like.table_name,
+        stream_position=delta_like.stream_position,
+        partition_values=delta_like.partition_values,
+        table_version=delta_like.table_version,
+        include_manifest=True,
         *args,
         **kwargs,
     )
-
     if not delta:
         return None
 
@@ -462,7 +494,6 @@ def create_table_version(
     cur, con = _get_sqlite3_cursor_con(kwargs)
 
     latest_version = get_latest_table_version(namespace, table_name, *args, **kwargs)
-
     if (
         table_version is not None
        and latest_version
@@ -762,7 +793,18 @@ def commit_partition(partition: Partition, *args, **kwargs) -> Partition:
     params = (json.dumps(pv_partition), pv_partition.locator.canonical_string())
     cur.execute("UPDATE partitions SET value = ? WHERE locator = ?", params)
 
+    deltas = list_partition_deltas(partition, *args, **kwargs).all_items()
+    deltas.sort(reverse=True, key=lambda x: x.stream_position)
+
+    stream_position = partition.stream_position
+    if deltas:
+        stream_position = deltas[0].stream_position
+
     partition.state = CommitState.COMMITTED
+    partition.stream_position = stream_position
+    partition.previous_stream_position = (
+        pv_partition.stream_position if pv_partition else None
+    )
     params = (json.dumps(partition), partition.locator.canonical_string())
     cur.execute("UPDATE partitions SET value = ? WHERE locator = ?", params)
     con.commit()
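Not part of the diff: with this change, committing a partition rolls its stream position forward to its newest delta and records the replaced partition's position. A sketch of the implied post-commit state, assuming the partition has at least one delta and that `commit_partition` returns the mutated partition:

```python
# Post-commit invariants implied by the hunk above (sketch; names assumed
# to be in scope within the module).
committed = commit_partition(staged_partition, **kwargs)
assert committed.state == CommitState.COMMITTED
# committed.stream_position now equals the newest delta's stream position,
# and committed.previous_stream_position carries the superseded partition's
# stream position (or None when no prior committed partition existed).
```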
@@ -1032,6 +1074,7 @@ def get_stream(
     *args,
     **kwargs,
 ) -> Optional[Stream]:
+    assert not isinstance(table_version, int), f"Passed an integer as the table version"
     obj = get_table_version(namespace, table_name, table_version, *args, **kwargs)
 
     if obj is None:
deltacat/tests/test_utils/pyarrow.py
@@ -0,0 +1,49 @@
+from typing import List
+import pyarrow as pa
+from deltacat.storage import Delta, Partition
+import deltacat.tests.local_deltacat_storage as ds
+
+
+def create_delta_from_csv_file(
+    namespace: str, file_paths: List[str], *args, **kwargs
+) -> Delta:
+    staged_partition = stage_partition_from_csv_file(
+        namespace, file_paths, *args, **kwargs
+    )
+
+    committed_delta = commit_delta_to_staged_partition(
+        staged_partition, file_paths, *args, **kwargs
+    )
+
+    return committed_delta
+
+
+def stage_partition_from_csv_file(
+    namespace: str, file_paths: List[str], *args, **kwargs
+) -> Partition:
+    ds.create_namespace(namespace, {}, **kwargs)
+    table_name = "-".join(file_paths).replace("/", "_")
+    ds.create_table_version(namespace, table_name, "1", **kwargs)
+    stream = ds.get_stream(namespace, table_name, "1", **kwargs)
+    staged_partition = ds.stage_partition(stream, [], **kwargs)
+    return staged_partition
+
+
+def commit_delta_to_staged_partition(
+    staged_partition, file_paths: List[str], *args, **kwargs
+) -> Delta:
+    tables = []
+
+    for file_path in file_paths:
+        table = pa.csv.read_csv(file_path)
+        tables.append(table)
+    deltas = []
+
+    for table in tables:
+        delta = ds.stage_delta(table, staged_partition, **kwargs)
+        deltas.append(delta)
+
+    merged_delta = Delta.merge_deltas(deltas=deltas)
+    committed_delta = ds.commit_delta(merged_delta, **kwargs)
+    ds.commit_partition(staged_partition, **kwargs)
+    return committed_delta
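A hedged usage sketch of the new helper (not in the diff). It stages a partition, stages one delta per CSV file, merges them, and commits; the kwargs dict and CSV path below are hypothetical stand-ins for whatever the sqlite-backed `local_deltacat_storage` fixture expects:

```python
# Usage sketch; the kwarg name and file path are hypothetical, not from this diff.
from deltacat.tests.test_utils.pyarrow import create_delta_from_csv_file

local_storage_kwargs = {"db_file_path": "db_test.sqlite"}  # assumed kwarg name
delta = create_delta_from_csv_file(
    "test_namespace",
    ["deltacat/tests/data/sample.csv"],  # hypothetical CSV fixture
    **local_storage_kwargs,              # passed through to every ds.* call
)
print(delta.stream_position)
```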
deltacat/tests/test_utils/utils.py
@@ -0,0 +1,13 @@
+# Allow classes to use self-referencing Type hints in Python 3.7.
+from __future__ import annotations
+import json
+from typing import Any, Dict
+from boto3.resources.base import ServiceResource
+
+
+def read_s3_contents(
+    s3_resource: ServiceResource, bucket_name: str, key: str
+) -> Dict[str, Any]:
+    response = s3_resource.Object(bucket_name, key).get()
+    file_content: str = response["Body"].read().decode("utf-8")
+    return json.loads(file_content)
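`read_s3_contents` fetches an object through a boto3 `ServiceResource` and JSON-decodes its body. A test sketch (not in the diff) against moto's in-memory S3, using the pre-moto-5 `mock_s3` decorator; the bucket and key names are made up:

```python
# Test sketch using moto's mock_s3; bucket/key names are hypothetical.
import json

import boto3
from moto import mock_s3

from deltacat.tests.test_utils.utils import read_s3_contents


@mock_s3
def test_read_s3_contents_roundtrip():
    s3 = boto3.resource("s3", region_name="us-east-1")
    s3.create_bucket(Bucket="test-bucket")
    s3.Object("test-bucket", "audit.json").put(Body=json.dumps({"ok": True}))
    assert read_s3_contents(s3, "test-bucket", "audit.json") == {"ok": True}
```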