deltacat 0.1.8__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (105)
  1. deltacat/__init__.py +41 -15
  2. deltacat/aws/clients.py +12 -31
  3. deltacat/aws/constants.py +1 -1
  4. deltacat/aws/redshift/__init__.py +7 -2
  5. deltacat/aws/redshift/model/manifest.py +54 -50
  6. deltacat/aws/s3u.py +188 -218
  7. deltacat/catalog/delegate.py +151 -185
  8. deltacat/catalog/interface.py +78 -97
  9. deltacat/catalog/model/catalog.py +21 -21
  10. deltacat/catalog/model/table_definition.py +11 -9
  11. deltacat/compute/compactor/__init__.py +12 -16
  12. deltacat/compute/compactor/compaction_session.py +259 -316
  13. deltacat/compute/compactor/model/delta_annotated.py +60 -44
  14. deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
  15. deltacat/compute/compactor/model/delta_file_locator.py +10 -8
  16. deltacat/compute/compactor/model/materialize_result.py +6 -7
  17. deltacat/compute/compactor/model/primary_key_index.py +38 -34
  18. deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
  19. deltacat/compute/compactor/model/round_completion_info.py +25 -19
  20. deltacat/compute/compactor/model/sort_key.py +18 -15
  21. deltacat/compute/compactor/steps/dedupe.py +152 -259
  22. deltacat/compute/compactor/steps/hash_bucket.py +57 -73
  23. deltacat/compute/compactor/steps/materialize.py +138 -99
  24. deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
  25. deltacat/compute/compactor/steps/rehash/rewrite_index.py +11 -13
  26. deltacat/compute/compactor/utils/io.py +59 -47
  27. deltacat/compute/compactor/utils/primary_key_index.py +131 -90
  28. deltacat/compute/compactor/utils/round_completion_file.py +22 -23
  29. deltacat/compute/compactor/utils/system_columns.py +33 -42
  30. deltacat/compute/metastats/meta_stats.py +235 -157
  31. deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
  32. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
  33. deltacat/compute/metastats/stats.py +95 -64
  34. deltacat/compute/metastats/utils/io.py +100 -53
  35. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
  36. deltacat/compute/metastats/utils/ray_utils.py +38 -33
  37. deltacat/compute/stats/basic.py +107 -69
  38. deltacat/compute/stats/models/delta_column_stats.py +11 -8
  39. deltacat/compute/stats/models/delta_stats.py +59 -32
  40. deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
  41. deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
  42. deltacat/compute/stats/models/stats_result.py +24 -14
  43. deltacat/compute/stats/utils/intervals.py +16 -9
  44. deltacat/compute/stats/utils/io.py +86 -51
  45. deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
  46. deltacat/constants.py +8 -10
  47. deltacat/io/__init__.py +2 -2
  48. deltacat/io/aws/redshift/redshift_datasource.py +157 -143
  49. deltacat/io/dataset.py +14 -17
  50. deltacat/io/read_api.py +36 -33
  51. deltacat/logs.py +94 -42
  52. deltacat/storage/__init__.py +18 -8
  53. deltacat/storage/interface.py +196 -213
  54. deltacat/storage/model/delta.py +45 -51
  55. deltacat/storage/model/list_result.py +12 -8
  56. deltacat/storage/model/namespace.py +4 -5
  57. deltacat/storage/model/partition.py +42 -42
  58. deltacat/storage/model/stream.py +29 -30
  59. deltacat/storage/model/table.py +14 -14
  60. deltacat/storage/model/table_version.py +32 -31
  61. deltacat/storage/model/types.py +1 -0
  62. deltacat/tests/stats/test_intervals.py +11 -24
  63. deltacat/tests/utils/test_record_batch_tables.py +284 -0
  64. deltacat/types/media.py +3 -4
  65. deltacat/types/tables.py +31 -21
  66. deltacat/utils/common.py +5 -11
  67. deltacat/utils/numpy.py +20 -22
  68. deltacat/utils/pandas.py +73 -100
  69. deltacat/utils/performance.py +3 -9
  70. deltacat/utils/placement.py +276 -231
  71. deltacat/utils/pyarrow.py +302 -89
  72. deltacat/utils/ray_utils/collections.py +2 -1
  73. deltacat/utils/ray_utils/concurrency.py +38 -32
  74. deltacat/utils/ray_utils/dataset.py +28 -28
  75. deltacat/utils/ray_utils/performance.py +5 -9
  76. deltacat/utils/ray_utils/runtime.py +9 -10
  77. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/METADATA +22 -12
  78. deltacat-0.1.11.dist-info/RECORD +110 -0
  79. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/WHEEL +1 -1
  80. deltacat/autoscaler/events/__init__.py +0 -0
  81. deltacat/autoscaler/events/compaction/__init__.py +0 -0
  82. deltacat/autoscaler/events/compaction/cluster.py +0 -82
  83. deltacat/autoscaler/events/compaction/collections/__init__.py +0 -0
  84. deltacat/autoscaler/events/compaction/collections/partition_key_value.py +0 -36
  85. deltacat/autoscaler/events/compaction/dispatcher.py +0 -28
  86. deltacat/autoscaler/events/compaction/input.py +0 -27
  87. deltacat/autoscaler/events/compaction/process.py +0 -25
  88. deltacat/autoscaler/events/compaction/session_manager.py +0 -13
  89. deltacat/autoscaler/events/compaction/utils.py +0 -216
  90. deltacat/autoscaler/events/compaction/workflow.py +0 -303
  91. deltacat/autoscaler/events/dispatcher.py +0 -95
  92. deltacat/autoscaler/events/dynamodb/__init__.py +0 -0
  93. deltacat/autoscaler/events/dynamodb/event_store.py +0 -164
  94. deltacat/autoscaler/events/event_store.py +0 -55
  95. deltacat/autoscaler/events/exceptions.py +0 -6
  96. deltacat/autoscaler/events/processor.py +0 -177
  97. deltacat/autoscaler/events/session_manager.py +0 -25
  98. deltacat/autoscaler/events/states.py +0 -88
  99. deltacat/autoscaler/events/workflow.py +0 -54
  100. deltacat/autoscaler/node_group.py +0 -230
  101. deltacat/autoscaler/utils.py +0 -69
  102. deltacat-0.1.8.dist-info/RECORD +0 -131
  103. /deltacat/{autoscaler → tests/utils}/__init__.py +0 -0
  104. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/LICENSE +0 -0
  105. {deltacat-0.1.8.dist-info → deltacat-0.1.11.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor/steps/dedupe.py
@@ -1,26 +1,29 @@
 import logging
+from collections import defaultdict
+from typing import Any, Dict, List, Optional, Tuple
+
+import numpy as np
 import pyarrow as pa
-import ray
-import time
 import pyarrow.compute as pc
-import numpy as np
-from deltacat.compute.compactor.utils.system_columns import get_minimal_hb_schema
-from deltacat.utils.pyarrow import ReadKwargsProviderPyArrowSchemaOverride
+import ray
 from ray import cloudpickle
 from ray.types import ObjectRef
 
 from deltacat import logs
-from collections import defaultdict
-from itertools import repeat
-from deltacat.storage import DeltaType
-from deltacat.compute.compactor import SortKey, SortOrder, \
-    RoundCompletionInfo, PrimaryKeyIndexVersionLocator, DeltaFileEnvelope, \
-    DeltaFileLocator, PyArrowWriteResult
-from deltacat.compute.compactor.utils import system_columns as sc, \
-    primary_key_index as pki
-
-from typing import Any, Dict, List, Optional, Tuple
-from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
+from deltacat.compute.compactor import (
+    DeltaFileEnvelope,
+    DeltaFileLocator,
+    PrimaryKeyIndexVersionLocator,
+    PyArrowWriteResult,
+    RoundCompletionInfo,
+    SortKey,
+    SortOrder,
+)
+from deltacat.compute.compactor.utils import primary_key_index as pki
+from deltacat.compute.compactor.utils import system_columns as sc
+from deltacat.compute.compactor.utils.system_columns import get_minimal_hb_schema
+from deltacat.utils.performance import timed_invocation
+from deltacat.utils.pyarrow import ReadKwargsProviderPyArrowSchemaOverride
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
@@ -32,18 +35,21 @@ DedupeTaskIndexWithObjectId = Tuple[DedupeTaskIndex, PickledObjectRef]
 DedupeResult = Tuple[
     Dict[MaterializeBucketIndex, DedupeTaskIndexWithObjectId],
     List[ObjectRef[DeltaFileLocatorToRecords]],
-    PyArrowWriteResult
+    PyArrowWriteResult,
 ]
 
 
-def union_primary_key_indices(
-        s3_bucket: str,
-        round_completion_info: RoundCompletionInfo,
-        hash_bucket_index: int,
-        df_envelopes_list: List[List[DeltaFileEnvelope]]) -> pa.Table:
+def _union_primary_key_indices(
+    s3_bucket: str,
+    round_completion_info: RoundCompletionInfo,
+    hash_bucket_index: int,
+    df_envelopes_list: List[List[DeltaFileEnvelope]],
+) -> pa.Table:
 
-    logger.info(f"Reading dedupe input for {len(df_envelopes_list)} "
-                f"delta file envelope lists...")
+    logger.info(
+        f"[Hash bucket index {hash_bucket_index}] Reading dedupe input for "
+        f"{len(df_envelopes_list)} delta file envelope lists..."
+    )
     # read compacted input parquet files first
     # (which implicitly have older stream positions than deltas)
     hb_tables = []
@@ -53,37 +59,15 @@ def union_primary_key_indices(
             hash_bucket_index,
             round_completion_info.primary_key_index_version_locator,
             # Enforce consistent column ordering by reading from a schema, to prevent schema mismatch errors
-            file_reader_kwargs_provider=ReadKwargsProviderPyArrowSchemaOverride(schema=get_minimal_hb_schema())
+            file_reader_kwargs_provider=ReadKwargsProviderPyArrowSchemaOverride(
+                schema=get_minimal_hb_schema()
+            ),
         )
         if tables:
-            prev_compacted_delta_stream_pos = round_completion_info\
-                .compacted_delta_locator \
-                .stream_position
-            if prev_compacted_delta_stream_pos is None:
-                raise ValueError(f"Unexpected Error: No previous compacted "
-                                 f"delta stream position found in round "
-                                 f"completion info: {round_completion_info}")
             prior_pk_index_table = pa.concat_tables(tables)
-            prior_pk_index_table = sc.append_stream_position_column(
-                prior_pk_index_table,
-                repeat(
-                    prev_compacted_delta_stream_pos,
-                    len(prior_pk_index_table),
-                ),
-            )
-            prior_pk_index_table = sc.append_delta_type_col(
-                prior_pk_index_table,
-                repeat(
-                    sc.delta_type_to_field(DeltaType.UPSERT),
-                    len(prior_pk_index_table),
-                )
-            )
-            prior_pk_index_table = sc.append_is_source_col(
-                prior_pk_index_table,
-                repeat(
-                    False,
-                    len(prior_pk_index_table),
-                )
+            logger.info(
+                f"Number of records in prior primary index for hash bucket"
+                f" {hash_bucket_index}: {prior_pk_index_table.num_rows}"
             )
             hb_tables.append(prior_pk_index_table)
 
@@ -99,114 +83,56 @@ def union_primary_key_indices(
 
     hb_table = pa.concat_tables(hb_tables)
 
+    logger.info(
+        f"Total records in hash bucket {hash_bucket_index} is {hb_table.num_rows}"
+    )
     return hb_table
 
 
-def drop_duplicates_by_primary_key_hash(table: pa.Table) -> pa.Table:
+def _drop_duplicates_by_primary_key_hash(table: pa.Table) -> pa.Table:
     value_to_last_row_idx = {}
-    row_idx = 0
-    pk_op_chunk_iter = zip(
-        sc.pk_hash_column(table).iterchunks(),
-        sc.delta_type_column(table).iterchunks(),
+
+    pk_hash_np = sc.pk_hash_column_np(table)
+    op_type_np = sc.delta_type_column_np(table)
+
+    assert len(pk_hash_np) == len(op_type_np), (
+        f"Primary key digest column length ({len(pk_hash_np)}) doesn't "
+        f"match delta type column length ({len(op_type_np)})."
     )
-    for (pk_chunk, op_chunk) in pk_op_chunk_iter:
-        pk_op_val_iter = zip(
-            pk_chunk.to_numpy(zero_copy_only=False),
-            op_chunk.to_numpy(zero_copy_only=False),
-        )
-        for (pk_val, op_val) in pk_op_val_iter:
-            # operation type is True for `UPSERT` and False for `DELETE`
-            if op_val:
-                # UPSERT this row
-                value_to_last_row_idx[pk_val] = row_idx
-            else:
-                # DELETE this row
-                value_to_last_row_idx.pop(pk_val, None)
-            row_idx += 1
+
+    # TODO(raghumdani): move the dedupe to C++ using arrow methods or similar.
+    row_idx = 0
+    pk_op_val_iter = zip(pk_hash_np, op_type_np)
+    for (pk_val, op_val) in pk_op_val_iter:
+
+        # operation type is True for `UPSERT` and False for `DELETE`
+        if op_val:
+            # UPSERT this row
+            value_to_last_row_idx[pk_val] = row_idx
+        else:
+            # DELETE this row
+            value_to_last_row_idx.pop(pk_val, None)
+
+        row_idx += 1
+
     return table.take(list(value_to_last_row_idx.values()))
 
 
-def write_new_primary_key_index(
-        s3_bucket: str,
-        new_primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
-        max_rows_per_index_file: int,
-        max_rows_per_mat_file: int,
-        num_materialize_buckets: int,
-        dedupe_task_index: int,
-        deduped_tables: List[Tuple[int, pa.Table]],
-        row_counts: Dict[int, Dict[Tuple[np.bool_, np.int64, np.int32],
-                                   Dict[int, int]]]) -> PyArrowWriteResult:
-
-    logger.info(f"Writing new deduped primary key index: "
-                f"{new_primary_key_index_version_locator}")
-    # TODO (pdames): move to RecordCountsPendingMaterialize.finalize()?
-    file_idx = 0
-    prev_file_idx = 0
-    dest_file_indices = defaultdict(
-        lambda: defaultdict(
-            lambda: defaultdict(int)
-        )
-    )
-    dest_file_row_indices = defaultdict(
-        lambda: defaultdict(
-            lambda: defaultdict(int)
-        )
+def _write_new_primary_key_index(
+    s3_bucket: str,
+    new_primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
+    max_rows_per_index_file: int,
+    dedupe_task_index: int,
+    deduped_tables: List[Tuple[int, pa.Table]],
+) -> PyArrowWriteResult:
+
+    logger.info(
+        f"[Dedupe task index {dedupe_task_index}] Writing new deduped primary key index: "
+        f"{new_primary_key_index_version_locator}"
    )
-    for mat_bucket in sorted(row_counts.keys()):
-        mat_bucket_row_idx = 0
-        sorted_src_dfls = sorted(row_counts[mat_bucket].keys())
-        for src_dfl in sorted_src_dfls:
-            sorted_dd_tasks = sorted(row_counts[mat_bucket][src_dfl].keys())
-            for dd_task_idx in sorted_dd_tasks:
-                dest_file_row_indices[mat_bucket][src_dfl][dd_task_idx] = \
-                    mat_bucket_row_idx % max_rows_per_mat_file
-                file_idx = prev_file_idx + int(
-                    mat_bucket_row_idx / max_rows_per_mat_file
-                )
-                dest_file_indices[mat_bucket][src_dfl][dd_task_idx] = file_idx
-                row_count = row_counts[mat_bucket][src_dfl][dd_task_idx]
-                mat_bucket_row_idx += row_count
-        prev_file_idx = file_idx + 1
 
     pki_results = []
-    src_dfl_row_counts = defaultdict(int)
     for hb_index, table in deduped_tables:
-        is_source_col = sc.is_source_column_np(table)
-        stream_pos_col = sc.stream_position_column_np(table)
-        file_idx_col = sc.file_index_column_np(table)
-        dest_file_idx_col = []
-        dest_file_row_idx_col = []
-        for row_idx in range(len(table)):
-            src_dfl = DeltaFileLocator.of(
-                is_source_col[row_idx],
-                stream_pos_col[row_idx],
-                file_idx_col[row_idx],
-            )
-            mat_bucket = delta_file_locator_to_mat_bucket_index(
-                src_dfl,
-                num_materialize_buckets,
-            )
-            dest_file_start_idx = \
-                dest_file_indices[mat_bucket][src_dfl][dedupe_task_index]
-            dest_file_row_idx_offset = src_dfl_row_counts[src_dfl] + \
-                dest_file_row_indices[mat_bucket][src_dfl][dedupe_task_index]
-            dest_file_idx_offset = int(
-                dest_file_row_idx_offset / max_rows_per_mat_file
-            )
-            dest_file_idx = dest_file_start_idx + dest_file_idx_offset
-            dest_file_idx_col.append(dest_file_idx)
-            dest_file_row_idx = dest_file_row_idx_offset % max_rows_per_mat_file
-            dest_file_row_idx_col.append(dest_file_row_idx)
-            src_dfl_row_counts[src_dfl] += 1
-        table = table.drop([
-            sc._IS_SOURCE_COLUMN_NAME,
-            sc._PARTITION_STREAM_POSITION_COLUMN_NAME,
-            sc._ORDERED_FILE_IDX_COLUMN_NAME,
-            sc._ORDERED_RECORD_IDX_COLUMN_NAME,
-        ])
-        table = sc.append_file_idx_column(table, dest_file_idx_col)
-        table = sc.append_record_idx_col(table, dest_file_row_idx_col)
-
         hb_pki_result = pki.write_primary_key_index_files(
             table,
             new_primary_key_index_version_locator,
@@ -217,77 +143,44 @@ def write_new_primary_key_index(
         pki_results.append(hb_pki_result)
 
     result = PyArrowWriteResult.union(pki_results)
-    logger.info(f"Wrote new deduped primary key index: "
-                f"{new_primary_key_index_version_locator}. Result: {result}")
+    logger.info(
+        f"[Dedupe task index {dedupe_task_index}] Wrote new deduped primary key index: "
+        f"{new_primary_key_index_version_locator}. Result: {result}"
+    )
     return result
 
 
 def delta_file_locator_to_mat_bucket_index(
-        df_locator: DeltaFileLocator,
-        materialize_bucket_count: int) -> int:
+    df_locator: DeltaFileLocator, materialize_bucket_count: int
+) -> int:
     digest = df_locator.digest()
     return int.from_bytes(digest, "big") % materialize_bucket_count
 
 
-@ray.remote(num_cpus=0.1)
-class RecordCountsPendingMaterialize:
-    def __init__(self, expected_result_count: int):
-        # materialize_bucket -> src_file_id
-        self.record_counts = defaultdict(
-            # delta_file_locator -> dedupe task index
-            lambda: defaultdict(
-                # dedupe task index -> row count
-                lambda: defaultdict(int)
-            )
-        )
-        self.expected_result_count = expected_result_count
-        self.actual_result_count = 0
-
-    def add_record_counts(
-            self,
-            result_idx: int,
-            record_counts:
-            Dict[int, Dict[Tuple[np.bool_, np.int64, np.int32], int]]) -> None:
-        for mat_bucket, df_locator_rows in record_counts.items():
-            for df_locator, rows in df_locator_rows.items():
-                self.record_counts[mat_bucket][df_locator][result_idx] += rows
-        self.actual_result_count += 1
-
-    def get_record_counts(self) -> \
-            Dict[int, Dict[Tuple[np.bool_, np.int64, np.int32],
-                           Dict[int, int]]]:
-        return self.record_counts
-
-    def get_expected_result_count(self) -> int:
-        return self.expected_result_count
-
-    def get_actual_result_count(self) -> int:
-        return self.actual_result_count
-
-    def is_finalized(self) -> bool:
-        return self.actual_result_count == self.expected_result_count
-
-
 @ray.remote(num_returns=3)
 def dedupe(
-        compaction_artifact_s3_bucket: str,
-        round_completion_info: Optional[RoundCompletionInfo],
-        new_primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
-        object_ids: List[Any],
-        sort_keys: List[SortKey],
-        max_records_per_index_file: int,
-        max_records_per_materialized_file: int,
-        num_materialize_buckets: int,
-        dedupe_task_index: int,
-        delete_old_primary_key_index: bool,
-        record_counts_pending_materialize: RecordCountsPendingMaterialize) -> DedupeResult:
-
-    logger.info(f"Starting dedupe task...")
+    compaction_artifact_s3_bucket: str,
+    round_completion_info: Optional[RoundCompletionInfo],
+    new_primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
+    object_ids: List[Any],
+    sort_keys: List[SortKey],
+    max_records_per_index_file: int,
+    num_materialize_buckets: int,
+    dedupe_task_index: int,
+    delete_old_primary_key_index: bool,
+) -> DedupeResult:
+
+    logger.info(f"[Dedupe task {dedupe_task_index}] Starting dedupe task...")
     # TODO (pdames): mitigate risk of running out of memory here in cases of
     # severe skew of primary key updates in deltas
     src_file_records_obj_refs = [
-        cloudpickle.loads(obj_id_pkl) for obj_id_pkl in object_ids]
-    logger.info(f"Getting delta file envelope groups object refs...")
+        cloudpickle.loads(obj_id_pkl) for obj_id_pkl in object_ids
+    ]
+    logger.info(
+        f"[Dedupe task {dedupe_task_index}] Getting delta file envelope "
+        f"groups for {len(src_file_records_obj_refs)} object refs..."
+    )
+
     delta_file_envelope_groups_list = ray.get(src_file_records_obj_refs)
     hb_index_to_delta_file_envelopes_list = defaultdict(list)
     for delta_file_envelope_groups in delta_file_envelope_groups_list:
@@ -296,36 +189,51 @@ def dedupe(
             hb_index_to_delta_file_envelopes_list[hb_idx].append(dfes)
     src_file_id_to_row_indices = defaultdict(list)
     deduped_tables = []
-    logger.info(f"Running {len(hb_index_to_delta_file_envelopes_list)} "
-                f"dedupe rounds...")
+    logger.info(
+        f"[Dedupe task {dedupe_task_index}] Running {len(hb_index_to_delta_file_envelopes_list)} "
+        f"dedupe rounds..."
+    )
     for hb_idx, dfe_list in hb_index_to_delta_file_envelopes_list.items():
-        table = union_primary_key_indices(
-            compaction_artifact_s3_bucket,
-            round_completion_info,
-            hb_idx,
-            dfe_list,
+        logger.info(f"{dedupe_task_index}: union primary keys for hb_index: {hb_idx}")
+
+        table, union_time = timed_invocation(
+            func=_union_primary_key_indices,
+            s3_bucket=compaction_artifact_s3_bucket,
+            round_completion_info=round_completion_info,
+            hash_bucket_index=hb_idx,
+            df_envelopes_list=dfe_list,
+        )
+        logger.info(
+            f"[Dedupe {dedupe_task_index}] Dedupe round input "
+            f"record count: {len(table)}, took {union_time}s"
        )
-        logger.info(f"Dedupe round input record count: {len(table)}")
 
         # sort by sort keys
         if len(sort_keys):
            # TODO (pdames): convert to O(N) dedupe w/ sort keys
-            sort_keys.extend([
-                SortKey.of(
-                    sc._PARTITION_STREAM_POSITION_COLUMN_NAME,
-                    SortOrder.ASCENDING
-                ),
-                SortKey.of(
-                    sc._ORDERED_FILE_IDX_COLUMN_NAME,
-                    SortOrder.ASCENDING
-                ),
-            ])
+            sort_keys.extend(
+                [
+                    SortKey.of(
+                        sc._PARTITION_STREAM_POSITION_COLUMN_NAME, SortOrder.ASCENDING
+                    ),
+                    SortKey.of(sc._ORDERED_FILE_IDX_COLUMN_NAME, SortOrder.ASCENDING),
+                ]
+            )
             table = table.take(pc.sort_indices(table, sort_keys=sort_keys))
 
         # drop duplicates by primary key hash column
-        table = drop_duplicates_by_primary_key_hash(table)
-        table = table.drop([sc._DELTA_TYPE_COLUMN_NAME])
-        logger.info(f"Dedupe round output record count: {len(table)}")
+        logger.info(
+            f"[Dedupe task index {dedupe_task_index}] Dropping duplicates for {hb_idx}"
+        )
+
+        table, drop_time = timed_invocation(
+            func=_drop_duplicates_by_primary_key_hash, table=table
+        )
+
+        logger.info(
+            f"[Dedupe task index {dedupe_task_index}] Dedupe round output "
+            f"record count: {len(table)}, took: {drop_time}s"
        )
 
         deduped_tables.append((hb_idx, table))
 
@@ -344,7 +252,9 @@ def dedupe(
 
     logger.info(f"Finished all dedupe rounds...")
     mat_bucket_to_src_file_record_count = defaultdict(dict)
-    mat_bucket_to_src_file_records: Dict[MaterializeBucketIndex, DeltaFileLocatorToRecords] = defaultdict(dict)
+    mat_bucket_to_src_file_records: Dict[
+        MaterializeBucketIndex, DeltaFileLocatorToRecords
+    ] = defaultdict(dict)
     for src_dfl, src_row_indices in src_file_id_to_row_indices.items():
         mat_bucket = delta_file_locator_to_mat_bucket_index(
             src_dfl,
@@ -353,48 +263,33 @@ def dedupe(
         mat_bucket_to_src_file_records[mat_bucket][src_dfl] = np.array(
             src_row_indices,
         )
-        mat_bucket_to_src_file_record_count[mat_bucket][src_dfl] = \
-            len(src_row_indices)
+        mat_bucket_to_src_file_record_count[mat_bucket][src_dfl] = len(src_row_indices)
 
-    mat_bucket_to_dd_idx_obj_id: Dict[MaterializeBucketIndex, DedupeTaskIndexWithObjectId] = {}
+    mat_bucket_to_dd_idx_obj_id: Dict[
+        MaterializeBucketIndex, DedupeTaskIndexWithObjectId
+    ] = {}
     src_file_records_obj_refs: List[ObjectRef[DeltaFileLocatorToRecords]] = []
     for mat_bucket, src_file_records in mat_bucket_to_src_file_records.items():
         object_ref = ray.put(src_file_records)
-        src_file_records_obj_refs.append(object_ref)
+        pickled_object_ref = cloudpickle.dumps(object_ref)
+        src_file_records_obj_refs.append(pickled_object_ref)
         mat_bucket_to_dd_idx_obj_id[mat_bucket] = (
             dedupe_task_index,
-            cloudpickle.dumps(object_ref),
+            pickled_object_ref,
         )
-    logger.info(f"Count of materialize buckets with object refs: "
-                f"{len(mat_bucket_to_dd_idx_obj_id)}")
-
-    record_counts_pending_materialize.add_record_counts.remote(
-        dedupe_task_index,
-        mat_bucket_to_src_file_record_count,
-    )
-
-    # wait for all dedupe tasks to reach this point before continuing
+        del object_ref
+        del pickled_object_ref
     logger.info(
-        f"Waiting for all dedupe tasks to finish writing record counts...")
-    finalized = False
-    while not finalized:
-        finalized = ray.get(
-            record_counts_pending_materialize.is_finalized.remote()
-        )
-        time.sleep(0.25)
-    record_counts = ray.get(
-        record_counts_pending_materialize.get_record_counts.remote()
+        f"Count of materialize buckets with object refs: "
+        f"{len(mat_bucket_to_dd_idx_obj_id)}"
    )
 
-    write_pki_result: PyArrowWriteResult = write_new_primary_key_index(
+    write_pki_result: PyArrowWriteResult = _write_new_primary_key_index(
         compaction_artifact_s3_bucket,
         new_primary_key_index_version_locator,
         max_records_per_index_file,
-        max_records_per_materialized_file,
-        num_materialize_buckets,
         dedupe_task_index,
         deduped_tables,
-        record_counts,
     )
 
     if delete_old_primary_key_index:
@@ -402,7 +297,5 @@ def dedupe(
         compaction_artifact_s3_bucket,
         round_completion_info.primary_key_index_version_locator,
     )
-    logger.info(f"Finished dedupe task...")
-    return mat_bucket_to_dd_idx_obj_id, \
-        src_file_records_obj_refs, \
-        write_pki_result
+    logger.info(f"[Dedupe task index {dedupe_task_index}] Finished dedupe task...")
+    return mat_bucket_to_dd_idx_obj_id, src_file_records_obj_refs, write_pki_result
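
For readers skimming this diff: the rewritten _drop_duplicates_by_primary_key_hash makes a single linear pass over the primary-key-hash and delta-type columns, keeping only the last surviving row index per key (an UPSERT records the row, a DELETE discards any previously kept row), then materializes the survivors with Table.take. Below is a minimal standalone sketch of that idea, not DeltaCat's actual API: the pk_hash/is_upsert column names and the drop_duplicates_by_pk_hash helper are illustrative placeholders, and rows are assumed to already be ordered by stream position so that later rows win.

import pyarrow as pa


def drop_duplicates_by_pk_hash(table: pa.Table) -> pa.Table:
    """Keep the last surviving row per primary-key hash, honoring deletes."""
    value_to_last_row_idx = {}
    pk_hashes = table.column("pk_hash").to_pylist()
    is_upserts = table.column("is_upsert").to_pylist()
    for row_idx, (pk_val, is_upsert) in enumerate(zip(pk_hashes, is_upserts)):
        if is_upsert:
            # UPSERT: the latest row index wins for this key
            value_to_last_row_idx[pk_val] = row_idx
        else:
            # DELETE: drop any previously retained row for this key
            value_to_last_row_idx.pop(pk_val, None)
    return table.take(list(value_to_last_row_idx.values()))


# Rows are assumed to already be sorted by stream position, so later rows win.
table = pa.table(
    {
        "pk_hash": ["a", "b", "a", "c", "b"],
        "is_upsert": [True, True, True, True, False],  # False marks a DELETE
        "payload": [1, 2, 3, 4, 5],
    }
)
print(drop_duplicates_by_pk_hash(table).to_pydict())
# {'pk_hash': ['a', 'c'], 'is_upsert': [True, True], 'payload': [3, 4]}

Running the sketch keeps row 2 (the latest UPSERT for key "a") and row 3 (key "c"), while key "b" is removed by its trailing DELETE.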