deltacat 0.1.18b14__py3-none-any.whl → 0.1.18b16__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Files changed (79)
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/clients.py +17 -6
  3. deltacat/aws/redshift/model/manifest.py +4 -0
  4. deltacat/aws/s3u.py +24 -1
  5. deltacat/compute/compactor/compaction_session.py +42 -18
  6. deltacat/compute/compactor/model/compact_partition_params.py +297 -58
  7. deltacat/compute/compactor/model/compaction_session_audit_info.py +163 -9
  8. deltacat/compute/compactor/model/delta_annotated.py +95 -9
  9. deltacat/compute/compactor/model/delta_file_envelope.py +14 -2
  10. deltacat/compute/compactor/model/round_completion_info.py +17 -1
  11. deltacat/compute/compactor/repartition_session.py +4 -1
  12. deltacat/compute/compactor/steps/dedupe.py +9 -6
  13. deltacat/compute/compactor/steps/hash_bucket.py +24 -3
  14. deltacat/compute/compactor/steps/materialize.py +11 -6
  15. deltacat/compute/compactor/steps/repartition.py +22 -1
  16. deltacat/compute/compactor/utils/io.py +40 -23
  17. deltacat/compute/compactor/utils/sort_key.py +5 -0
  18. deltacat/compute/compactor/utils/system_columns.py +43 -0
  19. deltacat/compute/compactor_v2/compaction_session.py +509 -0
  20. deltacat/compute/compactor_v2/constants.py +37 -0
  21. deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
  22. deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
  23. deltacat/compute/compactor_v2/model/merge_input.py +143 -0
  24. deltacat/compute/compactor_v2/model/merge_result.py +12 -0
  25. deltacat/compute/compactor_v2/steps/__init__.py +0 -0
  26. deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
  27. deltacat/compute/compactor_v2/steps/merge.py +469 -0
  28. deltacat/compute/compactor_v2/utils/__init__.py +0 -0
  29. deltacat/compute/compactor_v2/utils/content_type_params.py +66 -0
  30. deltacat/compute/compactor_v2/utils/dedupe.py +58 -0
  31. deltacat/compute/compactor_v2/utils/io.py +152 -0
  32. deltacat/compute/compactor_v2/utils/primary_key_index.py +341 -0
  33. deltacat/compute/compactor_v2/utils/task_options.py +221 -0
  34. deltacat/compute/metastats/meta_stats.py +4 -2
  35. deltacat/compute/metastats/stats.py +1 -0
  36. deltacat/compute/metastats/utils/io.py +4 -0
  37. deltacat/compute/stats/utils/io.py +20 -5
  38. deltacat/exceptions.py +4 -0
  39. deltacat/io/memcached_object_store.py +37 -14
  40. deltacat/logs.py +4 -3
  41. deltacat/storage/interface.py +8 -1
  42. deltacat/storage/model/types.py +2 -1
  43. deltacat/tests/aws/test_clients.py +16 -3
  44. deltacat/tests/compute/__init__.py +0 -0
  45. deltacat/tests/compute/common.py +96 -0
  46. deltacat/tests/compute/compactor/__init__.py +0 -0
  47. deltacat/tests/compute/compactor/steps/__init__.py +0 -0
  48. deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +34 -8
  49. deltacat/tests/compute/compactor/utils/__init__.py +0 -0
  50. deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
  51. deltacat/tests/compute/compactor_v2/__init__.py +0 -0
  52. deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
  53. deltacat/tests/compute/test_compaction_session_incremental.py +363 -0
  54. deltacat/tests/compute/testcases.py +395 -0
  55. deltacat/tests/io/test_memcached_object_store.py +5 -4
  56. deltacat/tests/local_deltacat_storage/__init__.py +62 -19
  57. deltacat/tests/test_utils/pyarrow.py +49 -0
  58. deltacat/tests/test_utils/utils.py +13 -0
  59. deltacat/tests/utils/data/__init__.py +0 -0
  60. deltacat/tests/utils/test_daft.py +76 -0
  61. deltacat/tests/utils/test_pyarrow.py +133 -0
  62. deltacat/tests/utils/test_resources.py +23 -20
  63. deltacat/types/media.py +1 -0
  64. deltacat/types/partial_download.py +83 -0
  65. deltacat/types/tables.py +6 -0
  66. deltacat/utils/arguments.py +25 -0
  67. deltacat/utils/daft.py +87 -0
  68. deltacat/utils/placement.py +20 -3
  69. deltacat/utils/pyarrow.py +218 -1
  70. deltacat/utils/ray_utils/concurrency.py +26 -1
  71. deltacat/utils/resources.py +72 -1
  72. deltacat/utils/s3fs.py +21 -0
  73. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/METADATA +17 -3
  74. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/RECORD +79 -47
  75. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/WHEEL +1 -1
  76. /deltacat/{tests/compactor → compute/compactor_v2}/__init__.py +0 -0
  77. /deltacat/{tests/compactor/utils → compute/compactor_v2/model}/__init__.py +0 -0
  78. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/LICENSE +0 -0
  79. {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor_v2/steps/merge.py (new file)
@@ -0,0 +1,469 @@
+ import logging
+ import importlib
+ from deltacat.compute.compactor_v2.model.merge_input import MergeInput
+ import numpy as np
+ import pyarrow as pa
+ import ray
+ import time
+ import pyarrow.compute as pc
+ from uuid import uuid4
+ from collections import defaultdict
+ from deltacat import logs
+ from typing import List, Optional
+ from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES
+ from deltacat.compute.compactor_v2.model.merge_result import MergeResult
+ from deltacat.compute.compactor.model.materialize_result import MaterializeResult
+ from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
+ from deltacat.compute.compactor import (
+     RoundCompletionInfo,
+     DeltaFileEnvelope,
+ )
+ from deltacat.utils.common import ReadKwargsProvider
+
+ from contextlib import nullcontext
+ from deltacat.types.tables import TABLE_CLASS_TO_SIZE_FUNC
+ from deltacat.utils.ray_utils.runtime import (
+     get_current_ray_task_id,
+     get_current_ray_worker_id,
+ )
+ from deltacat.compute.compactor.utils import system_columns as sc
+
+ from deltacat.utils.performance import timed_invocation
+ from deltacat.utils.metrics import emit_timer_metrics
+ from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes
+ from deltacat.compute.compactor_v2.utils.primary_key_index import (
+     generate_pk_hash_column,
+     hash_group_index_to_hash_bucket_indices,
+ )
+ from deltacat.storage import (
+     Delta,
+     DeltaLocator,
+     DeltaType,
+     Manifest,
+     Partition,
+     interface as unimplemented_deltacat_storage,
+ )
+ from deltacat.compute.compactor_v2.utils.dedupe import drop_duplicates
+
+
+ if importlib.util.find_spec("memray"):
+     import memray
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ def _append_delta_type_column(table: pa.Table, value: np.bool_):
+     return table.append_column(
+         sc._DELTA_TYPE_COLUMN_FIELD,
+         pa.array(np.repeat(value, len(table)), sc._DELTA_TYPE_COLUMN_TYPE),
+     )
+
+
+ def _drop_delta_type_rows(table: pa.Table, delta_type: DeltaType) -> pa.Table:
+     if sc._DELTA_TYPE_COLUMN_NAME not in table.column_names:
+         return table
+
+     delta_type_value = sc.delta_type_to_field(delta_type)
+
+     result = table.filter(
+         pc.not_equal(table[sc._DELTA_TYPE_COLUMN_NAME], delta_type_value)
+     )
+
+     return result.drop([sc._DELTA_TYPE_COLUMN_NAME])
+
+
+ def _build_incremental_table(
+     hash_bucket_index: int,
+     df_envelopes_list: List[List[DeltaFileEnvelope]],
+ ) -> pa.Table:
+
+     logger.info(
+         f"[Hash bucket index {hash_bucket_index}] Reading dedupe input for "
+         f"{len(df_envelopes_list)} delta file envelope lists..."
+     )
+     hb_tables = []
+     # sort by delta file stream position now instead of sorting every row later
+     df_envelopes = [d for dfe_list in df_envelopes_list for d in dfe_list]
+     df_envelopes = sorted(
+         df_envelopes,
+         key=lambda df: (df.stream_position, df.file_index),
+         reverse=False,  # ascending
+     )
+     is_delete = False
+     for df_envelope in df_envelopes:
+         assert (
+             df_envelope.delta_type != DeltaType.APPEND
+         ), "APPEND type deltas are not supported. Kindly use UPSERT or DELETE"
+         if df_envelope.delta_type == DeltaType.DELETE:
+             is_delete = True
+
+     for df_envelope in df_envelopes:
+         table = df_envelope.table
+         if is_delete:
+             table = _append_delta_type_column(
+                 table, np.bool_(sc.delta_type_to_field(df_envelope.delta_type))
+             )
+
+         hb_tables.append(table)
+
+     result = pa.concat_tables(hb_tables)
+
+     return result
+
+
+ def _merge_tables(
+     table: pa.Table,
+     primary_keys: List[str],
+     can_drop_duplicates: bool,
+     compacted_table: Optional[pa.Table] = None,
+ ) -> pa.Table:
+     """
+     Merges the table with compacted table dropping duplicates where necessary.
+
+     This method ensures the appropriate deltas of types DELETE/UPSERT are correctly
+     appended to the table.
+     """
+
+     all_tables = []
+     incremental_idx = 0
+
+     if compacted_table:
+         incremental_idx = 1
+         all_tables.append(compacted_table)
+
+     all_tables.append(table)
+
+     if not primary_keys or not can_drop_duplicates:
+         logger.info(
+             f"Not dropping duplicates for primary keys={primary_keys} "
+             f"and can_drop_duplicates={can_drop_duplicates}"
+         )
+         all_tables[incremental_idx] = _drop_delta_type_rows(
+             all_tables[incremental_idx], DeltaType.DELETE
+         )
+         # we need not drop duplicates
+         return pa.concat_tables(all_tables)
+
+     all_tables = generate_pk_hash_column(all_tables, primary_keys=primary_keys)
+
+     result_table_list = []
+
+     incremental_table = drop_duplicates(
+         all_tables[incremental_idx], on=sc._PK_HASH_STRING_COLUMN_NAME
+     )
+
+     if compacted_table:
+         compacted_table = all_tables[0]
+
+         records_to_keep = pc.invert(
+             pc.is_in(
+                 compacted_table[sc._PK_HASH_STRING_COLUMN_NAME],
+                 incremental_table[sc._PK_HASH_STRING_COLUMN_NAME],
+             )
+         )
+
+         result_table_list.append(compacted_table.filter(records_to_keep))
+
+     incremental_table = _drop_delta_type_rows(incremental_table, DeltaType.DELETE)
+     result_table_list.append(incremental_table)
+
+     final_table = pa.concat_tables(result_table_list)
+     final_table = final_table.drop([sc._PK_HASH_STRING_COLUMN_NAME])
+
+     return final_table
+
+
+ def _download_compacted_table(
+     hb_index: int,
+     rcf: RoundCompletionInfo,
+     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
+     deltacat_storage=unimplemented_deltacat_storage,
+     deltacat_storage_kwargs: Optional[dict] = None,
+ ) -> pa.Table:
+     tables = []
+     hb_index_to_indices = rcf.hb_index_to_entry_range
+
+     if str(hb_index) not in hb_index_to_indices:
+         return None
+
+     indices = hb_index_to_indices[str(hb_index)]
+
+     assert (
+         indices is not None and len(indices) == 2
+     ), "indices should not be none and contains exactly two elements"
+
+     for offset in range(indices[1] - indices[0]):
+         table = deltacat_storage.download_delta_manifest_entry(
+             rcf.compacted_delta_locator,
+             entry_index=(indices[0] + offset),
+             file_reader_kwargs_provider=read_kwargs_provider,
+             **deltacat_storage_kwargs,
+         )
+
+         tables.append(table)
+
+     return pa.concat_tables(tables)
+
+
+ def _copy_all_manifest_files_from_old_hash_buckets(
+     hb_index_copy_by_reference: List[int],
+     round_completion_info: RoundCompletionInfo,
+     write_to_partition: Partition,
+     deltacat_storage=unimplemented_deltacat_storage,
+     deltacat_storage_kwargs: Optional[dict] = None,
+ ) -> List[MaterializeResult]:
+
+     compacted_delta_locator = round_completion_info.compacted_delta_locator
+     manifest = deltacat_storage.get_delta_manifest(
+         compacted_delta_locator, **deltacat_storage_kwargs
+     )
+
+     manifest_entry_referenced_list = []
+     materialize_result_list = []
+     hb_index_to_indices = round_completion_info.hb_index_to_entry_range
+
+     if hb_index_to_indices is None:
+         logger.info(f"Nothing to copy by reference. Skipping...")
+         return []
+
+     for hb_index in hb_index_copy_by_reference:
+         if str(hb_index) not in hb_index_to_indices:
+             continue
+
+         indices = hb_index_to_indices[str(hb_index)]
+         for offset in range(indices[1] - indices[0]):
+             entry_index = indices[0] + offset
+             assert entry_index < len(
+                 manifest.entries
+             ), f"entry index: {entry_index} >= {len(manifest.entries)}"
+             manifest_entry = manifest.entries[entry_index]
+             manifest_entry_referenced_list.append(manifest_entry)
+
+         manifest = Manifest.of(
+             entries=manifest_entry_referenced_list, uuid=str(uuid4())
+         )
+         delta = Delta.of(
+             locator=DeltaLocator.of(write_to_partition.locator),
+             delta_type=DeltaType.UPSERT,
+             meta=manifest.meta,
+             manifest=manifest,
+             previous_stream_position=write_to_partition.stream_position,
+             properties={},
+         )
+         materialize_result = MaterializeResult.of(
+             delta=delta,
+             task_index=hb_index,
+             pyarrow_write_result=None,
+             # TODO (pdames): Generalize WriteResult to contain in-memory-table-type
+             # and in-memory-table-bytes instead of tight coupling to paBytes
+             referenced_pyarrow_write_result=PyArrowWriteResult.of(
+                 len(manifest_entry_referenced_list),
+                 manifest.meta.source_content_length,
+                 manifest.meta.content_length,
+                 manifest.meta.record_count,
+             ),
+         )
+         materialize_result_list.append(materialize_result)
+     return materialize_result_list
+
+
+ def _timed_merge(input: MergeInput) -> MergeResult:
+     def _materialize(
+         hash_bucket_index,
+         compacted_tables: List[pa.Table],
+     ) -> MaterializeResult:
+         compacted_table = pa.concat_tables(compacted_tables)
+         if input.compacted_file_content_type in DELIMITED_TEXT_CONTENT_TYPES:
+             # TODO (rkenmi): Investigate if we still need to convert this table to pandas DataFrame
+             # TODO (pdames): compare performance to pandas-native materialize path
+             df = compacted_table.to_pandas(split_blocks=True, self_destruct=True)
+             compacted_table = df
+         delta, stage_delta_time = timed_invocation(
+             input.deltacat_storage.stage_delta,
+             compacted_table,
+             input.write_to_partition,
+             max_records_per_entry=input.max_records_per_output_file,
+             content_type=input.compacted_file_content_type,
+             s3_table_writer_kwargs=input.s3_table_writer_kwargs,
+             **input.deltacat_storage_kwargs,
+         )
+         compacted_table_size = TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](
+             compacted_table
+         )
+         logger.debug(
+             f"Time taken for materialize task"
+             f" to upload {len(compacted_table)} records"
+             f" of size {compacted_table_size} is: {stage_delta_time}s"
+         )
+         manifest = delta.manifest
+         manifest_records = manifest.meta.record_count
+         assert manifest_records == len(compacted_table), (
+             f"Unexpected Error: Materialized delta manifest record count "
+             f"({manifest_records}) does not equal compacted table record count "
+             f"({len(compacted_table)})"
+         )
+         materialize_result = MaterializeResult.of(
+             delta=delta,
+             task_index=hash_bucket_index,
+             # TODO (pdames): Generalize WriteResult to contain in-memory-table-type
+             # and in-memory-table-bytes instead of tight coupling to paBytes
+             pyarrow_write_result=PyArrowWriteResult.of(
+                 len(manifest.entries),
+                 TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](compacted_table),
+                 manifest.meta.content_length,
+                 len(compacted_table),
+             ),
+         )
+         logger.info(f"Materialize result: {materialize_result}")
+         return materialize_result
+
+     task_id = get_current_ray_task_id()
+     worker_id = get_current_ray_worker_id()
+     with memray.Tracker(
+         f"merge_{worker_id}_{task_id}.bin"
+     ) if input.enable_profiler else nullcontext():
+         # In V2, we need to mitigate risk of running out of memory here in cases of
+         # severe skew of primary key updates in deltas. By severe skew, we mean
+         # one hash bucket require more memory than a worker instance have.
+         logger.info(
+             f"[Merge task {input.merge_task_index}] Getting delta file envelope "
+             f"groups for {len(input.dfe_groups_refs)} object refs..."
+         )
+
+         delta_file_envelope_groups_list = input.object_store.get_many(
+             input.dfe_groups_refs
+         )
+         hb_index_to_delta_file_envelopes_list = defaultdict(list)
+         for delta_file_envelope_groups in delta_file_envelope_groups_list:
+             assert input.hash_bucket_count == len(delta_file_envelope_groups), (
+                 f"The hash bucket count must match the dfe size as {input.hash_bucket_count}"
+                 f" != {len(delta_file_envelope_groups)}"
+             )
+
+             for hb_idx, dfes in enumerate(delta_file_envelope_groups):
+                 if dfes:
+                     hb_index_to_delta_file_envelopes_list[hb_idx].append(dfes)
+
+         valid_hb_indices_iterable = hash_group_index_to_hash_bucket_indices(
+             input.hash_group_index, input.hash_bucket_count, input.num_hash_groups
+         )
+
+         total_deduped_records = 0
+         total_dfes_found = 0
+
+         materialized_results: List[MaterializeResult] = []
+         hb_index_copy_by_reference = []
+         for hb_idx in valid_hb_indices_iterable:
+             dfe_list = hb_index_to_delta_file_envelopes_list.get(hb_idx)
+
+             if dfe_list:
+                 total_dfes_found += 1
+                 table = _build_incremental_table(hb_idx, dfe_list)
+
+                 incremental_len = len(table)
+                 logger.info(
+                     f"Got the incremental table of length {incremental_len} for hash bucket {hb_idx}"
+                 )
+
+                 if input.sort_keys:
+                     # Incremental is sorted and merged, as sorting
+                     # on non event based sort key does not produce consistent
+                     # compaction results. E.g., compaction(delta1, delta2, delta3)
+                     # will not be equal to compaction(compaction(delta1, delta2), delta3).
+                     table = table.sort_by(input.sort_keys)
+
+                 compacted_table = None
+                 if (
+                     input.round_completion_info
+                     and input.round_completion_info.hb_index_to_entry_range
+                     and input.round_completion_info.hb_index_to_entry_range.get(
+                         str(hb_idx)
+                     )
+                     is not None
+                 ):
+
+                     compacted_table = _download_compacted_table(
+                         hb_index=hb_idx,
+                         rcf=input.round_completion_info,
+                         read_kwargs_provider=input.read_kwargs_provider,
+                         deltacat_storage=input.deltacat_storage,
+                         deltacat_storage_kwargs=input.deltacat_storage_kwargs,
+                     )
+
+                 hb_table_record_count = len(table) + (
+                     len(compacted_table) if compacted_table else 0
+                 )
+
+                 table, merge_time = timed_invocation(
+                     func=_merge_tables,
+                     table=table,
+                     primary_keys=input.primary_keys,
+                     can_drop_duplicates=input.drop_duplicates,
+                     compacted_table=compacted_table,
+                 )
+                 total_deduped_records += hb_table_record_count - len(table)
+
+                 logger.info(
+                     f"[Merge task index {input.merge_task_index}] Merged "
+                     f"record count: {len(table)}, took: {merge_time}s"
+                 )
+
+                 materialized_results.append(_materialize(hb_idx, [table]))
+             else:
+                 hb_index_copy_by_reference.append(hb_idx)
+
+         if input.round_completion_info and hb_index_copy_by_reference:
+             referenced_materialized_results = (
+                 _copy_all_manifest_files_from_old_hash_buckets(
+                     hb_index_copy_by_reference,
+                     input.round_completion_info,
+                     input.write_to_partition,
+                     input.deltacat_storage,
+                     input.deltacat_storage_kwargs,
+                 )
+             )
+             logger.info(
+                 f"Copying {len(referenced_materialized_results)} manifest files by reference..."
+             )
+             materialized_results.extend(referenced_materialized_results)
+
+         assert total_dfes_found == len(hb_index_to_delta_file_envelopes_list), (
+             "The total dfe list does not match the input dfes from hash bucket as "
+             f"{total_dfes_found} != {len(hb_index_to_delta_file_envelopes_list)}"
+         )
+
+         peak_memory_usage_bytes = get_current_node_peak_memory_usage_in_bytes()
+
+         return MergeResult(
+             materialized_results,
+             np.int64(total_deduped_records),
+             np.double(peak_memory_usage_bytes),
+             np.double(0.0),
+             np.double(time.time()),
+         )
+
+
+ @ray.remote
+ def merge(input: MergeInput) -> MergeResult:
+
+     logger.info(f"Starting merge task...")
+     merge_result, duration = timed_invocation(func=_timed_merge, input=input)
+
+     emit_metrics_time = 0.0
+     if input.metrics_config:
+         emit_result, latency = timed_invocation(
+             func=emit_timer_metrics,
+             metrics_name="merge",
+             value=duration,
+             metrics_config=input.metrics_config,
+         )
+         emit_metrics_time = latency
+
+     logger.info(f"Finished merge task...")
+     return MergeResult(
+         merge_result[0],
+         merge_result[1],
+         merge_result[2],
+         np.double(emit_metrics_time),
+         merge_result[4],
+     )
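The core of the new merge step is _merge_tables above: incremental rows are deduplicated on a primary-key hash, matching rows in the previously compacted table are replaced, and DELETE rows are dropped. The following standalone sketch (plain pyarrow with a hypothetical pk_hash column and toy values, not deltacat's API) illustrates the replace-on-match behavior:

import pyarrow as pa
import pyarrow.compute as pc

# Previously compacted rows and new, already-deduplicated incremental rows.
compacted = pa.table({"pk_hash": ["a", "b", "c"], "val": [1, 2, 3]})
incremental = pa.table({"pk_hash": ["b", "d"], "val": [20, 40]})

# Keep only compacted rows whose key is not updated by the incremental table,
# then append the incremental rows (mirrors the pc.invert(pc.is_in(...)) above).
keep_mask = pc.invert(pc.is_in(compacted["pk_hash"], value_set=incremental["pk_hash"]))
merged = pa.concat_tables([compacted.filter(keep_mask), incremental])
print(merged.to_pydict())  # {'pk_hash': ['a', 'c', 'b', 'd'], 'val': [1, 3, 20, 40]}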
File without changes
deltacat/compute/compactor_v2/utils/content_type_params.py (new file)
@@ -0,0 +1,66 @@
+ import logging
+ from deltacat import logs
+ from deltacat.storage import (
+     Delta,
+     interface as unimplemented_deltacat_storage,
+ )
+ from typing import Dict, Optional, Any
+ from deltacat.types.media import TableType, StorageType
+ from deltacat.types.media import ContentType
+ from deltacat.types.partial_download import PartialParquetParameters
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ def append_content_type_params(
+     delta: Delta,
+     entry_index: Optional[int] = None,
+     deltacat_storage=unimplemented_deltacat_storage,
+     deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
+ ) -> None:
+
+     if delta.meta.content_type != ContentType.PARQUET.value:
+         logger.info(
+             f"Delta with locator {delta.locator} is not a parquet delta, "
+             "skipping appending content type parameters."
+         )
+         return
+
+     manifest_entries = delta.manifest.entries
+     ordered_pq_meta = []
+
+     if entry_index is not None:
+         manifest_entries = [delta.manifest.entries[entry_index]]
+
+         pq_file = deltacat_storage.download_delta_manifest_entry(
+             delta,
+             entry_index=entry_index,
+             table_type=TableType.PYARROW_PARQUET,
+             **deltacat_storage_kwargs,
+         )
+
+         partial_file_meta = PartialParquetParameters.of(pq_metadata=pq_file.metadata)
+         ordered_pq_meta.append(partial_file_meta)
+
+     else:
+         pq_files = deltacat_storage.download_delta(
+             delta,
+             table_type=TableType.PYARROW_PARQUET,
+             storage_type=StorageType.DISTRIBUTED,
+             **deltacat_storage_kwargs,
+         )
+
+         assert len(pq_files) == len(
+             manifest_entries
+         ), f"Expected {len(manifest_entries)} pq files, got {len(pq_files)}"
+
+         ordered_pq_meta = [
+             PartialParquetParameters.of(pq_metadata=pq_file.metadata)
+             for pq_file in pq_files
+         ]
+
+     for entry_index, entry in enumerate(manifest_entries):
+         if not entry.meta.content_type_parameters:
+             entry.meta.content_type_parameters = []
+
+         entry.meta.content_type_parameters.append(ordered_pq_meta[entry_index])
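append_content_type_params attaches PartialParquetParameters derived from each Parquet entry's footer metadata to that entry's content type parameters. A rough sketch of the footer statistics involved, using plain pyarrow on a hypothetical local file rather than deltacat storage:

import pyarrow.parquet as pq

pq_file = pq.ParquetFile("entry-0.parquet")  # hypothetical local manifest entry
meta = pq_file.metadata  # pyarrow FileMetaData, read from the footer only
print(meta.num_rows, meta.num_row_groups)
for rg in range(meta.num_row_groups):
    # Per-row-group sizes are the kind of information captured per entry.
    print(f"row group {rg}: {meta.row_group(rg).total_byte_size} bytes")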
deltacat/compute/compactor_v2/utils/dedupe.py (new file)
@@ -0,0 +1,58 @@
+ import logging
+ from deltacat import logs
+ import pyarrow.compute as pc
+ import pyarrow as pa
+ import numpy as np
+ from deltacat.utils.performance import timed_invocation
+ from deltacat.compute.compactor.utils import system_columns as sc
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ def _create_chunked_index_array(array: pa.Array) -> pa.Array:
+     """
+     Creates an chunked array where each chunk is of same size in the input array.
+     """
+     chunk_lengths = [
+         len(array.chunk(chunk_index)) for chunk_index in range(len(array.chunks))
+     ]
+     result = np.array([np.arange(cl) for cl in chunk_lengths], dtype="object")
+     chunk_lengths = ([0] + chunk_lengths)[:-1]
+     result = pa.chunked_array(result + np.cumsum(chunk_lengths))
+     return result
+
+
+ def drop_duplicates(table: pa.Table, on: str) -> pa.Table:
+     """
+     It is important to not combine the chunks for performance reasons.
+     """
+
+     if on not in table.column_names:
+         return table
+
+     index_array, array_latency = timed_invocation(
+         _create_chunked_index_array, table[on]
+     )
+
+     logger.info(
+         "Created a chunked index array of length "
+         f" {len(index_array)} in {array_latency}s"
+     )
+
+     table = table.set_column(
+         table.shape[1], sc._ORDERED_RECORD_IDX_COLUMN_NAME, index_array
+     )
+     selector = table.group_by([on]).aggregate(
+         [(sc._ORDERED_RECORD_IDX_COLUMN_NAME, "max")]
+     )
+
+     table = table.filter(
+         pc.is_in(
+             table[sc._ORDERED_RECORD_IDX_COLUMN_NAME],
+             value_set=selector[f"{sc._ORDERED_RECORD_IDX_COLUMN_NAME}_max"],
+         )
+     )
+
+     table = table.drop([sc._ORDERED_RECORD_IDX_COLUMN_NAME])
+
+     return table
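drop_duplicates above keeps, for each key, the row with the greatest ordered-record index, i.e. the last occurrence in table order (which upstream code sorts by delta stream position). A minimal standalone sketch of that group-by-max trick, using illustrative column names rather than the sc.* system columns:

import pyarrow as pa
import pyarrow.compute as pc

table = pa.table({"pk_hash": ["a", "b", "a", "b"], "val": [1, 2, 10, 20]})
# Tag each row with its position so "latest" can be resolved per key.
table = table.append_column("record_idx", pa.array(range(len(table))))
latest = table.group_by(["pk_hash"]).aggregate([("record_idx", "max")])
deduped = table.filter(
    pc.is_in(table["record_idx"], value_set=latest["record_idx_max"])
).drop(["record_idx"])
print(deduped.to_pydict())  # {'pk_hash': ['a', 'b'], 'val': [10, 20]}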