deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. deltacat/__init__.py +41 -15
  2. deltacat/aws/clients.py +12 -31
  3. deltacat/aws/constants.py +1 -1
  4. deltacat/aws/redshift/__init__.py +7 -2
  5. deltacat/aws/redshift/model/manifest.py +54 -50
  6. deltacat/aws/s3u.py +176 -187
  7. deltacat/catalog/delegate.py +151 -185
  8. deltacat/catalog/interface.py +78 -97
  9. deltacat/catalog/model/catalog.py +21 -21
  10. deltacat/catalog/model/table_definition.py +11 -9
  11. deltacat/compute/compactor/__init__.py +12 -16
  12. deltacat/compute/compactor/compaction_session.py +237 -166
  13. deltacat/compute/compactor/model/delta_annotated.py +60 -44
  14. deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
  15. deltacat/compute/compactor/model/delta_file_locator.py +10 -8
  16. deltacat/compute/compactor/model/materialize_result.py +6 -7
  17. deltacat/compute/compactor/model/primary_key_index.py +38 -34
  18. deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
  19. deltacat/compute/compactor/model/round_completion_info.py +25 -19
  20. deltacat/compute/compactor/model/sort_key.py +18 -15
  21. deltacat/compute/compactor/steps/dedupe.py +119 -94
  22. deltacat/compute/compactor/steps/hash_bucket.py +48 -47
  23. deltacat/compute/compactor/steps/materialize.py +86 -92
  24. deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
  25. deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
  26. deltacat/compute/compactor/utils/io.py +59 -47
  27. deltacat/compute/compactor/utils/primary_key_index.py +91 -80
  28. deltacat/compute/compactor/utils/round_completion_file.py +22 -23
  29. deltacat/compute/compactor/utils/system_columns.py +33 -45
  30. deltacat/compute/metastats/meta_stats.py +235 -157
  31. deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
  32. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
  33. deltacat/compute/metastats/stats.py +95 -64
  34. deltacat/compute/metastats/utils/io.py +100 -53
  35. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
  36. deltacat/compute/metastats/utils/ray_utils.py +38 -33
  37. deltacat/compute/stats/basic.py +107 -69
  38. deltacat/compute/stats/models/delta_column_stats.py +11 -8
  39. deltacat/compute/stats/models/delta_stats.py +59 -32
  40. deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
  41. deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
  42. deltacat/compute/stats/models/stats_result.py +24 -14
  43. deltacat/compute/stats/utils/intervals.py +16 -9
  44. deltacat/compute/stats/utils/io.py +86 -51
  45. deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
  46. deltacat/constants.py +4 -13
  47. deltacat/io/__init__.py +2 -2
  48. deltacat/io/aws/redshift/redshift_datasource.py +157 -143
  49. deltacat/io/dataset.py +14 -17
  50. deltacat/io/read_api.py +36 -33
  51. deltacat/logs.py +94 -42
  52. deltacat/storage/__init__.py +18 -8
  53. deltacat/storage/interface.py +196 -213
  54. deltacat/storage/model/delta.py +45 -51
  55. deltacat/storage/model/list_result.py +12 -8
  56. deltacat/storage/model/namespace.py +4 -5
  57. deltacat/storage/model/partition.py +42 -42
  58. deltacat/storage/model/stream.py +29 -30
  59. deltacat/storage/model/table.py +14 -14
  60. deltacat/storage/model/table_version.py +32 -31
  61. deltacat/storage/model/types.py +1 -0
  62. deltacat/tests/stats/test_intervals.py +11 -24
  63. deltacat/tests/utils/__init__.py +0 -0
  64. deltacat/tests/utils/test_record_batch_tables.py +284 -0
  65. deltacat/types/media.py +3 -4
  66. deltacat/types/tables.py +31 -21
  67. deltacat/utils/common.py +5 -11
  68. deltacat/utils/numpy.py +20 -22
  69. deltacat/utils/pandas.py +73 -100
  70. deltacat/utils/performance.py +3 -9
  71. deltacat/utils/placement.py +259 -230
  72. deltacat/utils/pyarrow.py +302 -89
  73. deltacat/utils/ray_utils/collections.py +2 -1
  74. deltacat/utils/ray_utils/concurrency.py +27 -28
  75. deltacat/utils/ray_utils/dataset.py +28 -28
  76. deltacat/utils/ray_utils/performance.py +5 -9
  77. deltacat/utils/ray_utils/runtime.py +9 -10
  78. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
  79. deltacat-0.1.12.dist-info/RECORD +110 -0
  80. deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
  81. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
  82. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
  83. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
@@ -1,51 +1,66 @@
- import logging
  import functools
- import ray
-
+ import logging
  from collections import defaultdict
+ from typing import Dict, List, Optional, Set, Tuple
+
+ import pyarrow as pa
+ import ray

  from deltacat import logs
+ from deltacat.compute.compactor import (
+     PrimaryKeyIndexLocator,
+     PrimaryKeyIndexMeta,
+     PrimaryKeyIndexVersionLocator,
+     PrimaryKeyIndexVersionMeta,
+     PyArrowWriteResult,
+     RoundCompletionInfo,
+     SortKey,
+ )
+ from deltacat.compute.compactor.steps import dedupe as dd
+ from deltacat.compute.compactor.steps import hash_bucket as hb
+ from deltacat.compute.compactor.steps import materialize as mat
+ from deltacat.compute.compactor.utils import io
+ from deltacat.compute.compactor.utils import primary_key_index as pki
+ from deltacat.compute.compactor.utils import round_completion_file as rcf
  from deltacat.compute.stats.models.delta_stats import DeltaStats
- from deltacat.storage import Delta, DeltaLocator, Partition, \
-     PartitionLocator, interface as unimplemented_deltacat_storage
- from deltacat.utils.ray_utils.concurrency import invoke_parallel, \
-     round_robin_options_provider
- from deltacat.utils.ray_utils.runtime import live_node_resource_keys
- from deltacat.compute.compactor.steps import hash_bucket as hb, dedupe as dd, \
-     materialize as mat
- from deltacat.compute.compactor import SortKey, PrimaryKeyIndexMeta, \
-     PrimaryKeyIndexLocator, PrimaryKeyIndexVersionMeta, \
-     PrimaryKeyIndexVersionLocator, RoundCompletionInfo, \
-     PyArrowWriteResult
- from deltacat.compute.compactor.utils import round_completion_file as rcf, io, \
-     primary_key_index as pki
+ from deltacat.storage import Delta, DeltaLocator, Partition, PartitionLocator
+ from deltacat.storage import interface as unimplemented_deltacat_storage
  from deltacat.types.media import ContentType
  from deltacat.utils.placement import PlacementGroupConfig
- from typing import List, Set, Optional, Tuple, Dict
+ from deltacat.utils.ray_utils.concurrency import (
+     invoke_parallel,
+     round_robin_options_provider,
+ )
+ from deltacat.utils.ray_utils.runtime import live_node_resource_keys

- import pyarrow as pa
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

  _PRIMARY_KEY_INDEX_ALGORITHM_VERSION: str = "1.0"


  def check_preconditions(
-         source_partition_locator: PartitionLocator,
-         compacted_partition_locator: PartitionLocator,
-         sort_keys: List[SortKey],
-         max_records_per_output_file: int,
-         new_hash_bucket_count: Optional[int],
-         deltacat_storage=unimplemented_deltacat_storage) -> int:
-
-     assert source_partition_locator.partition_values \
-         == compacted_partition_locator.partition_values, \
-         "In-place compaction must use the same partition values for the " \
+     source_partition_locator: PartitionLocator,
+     compacted_partition_locator: PartitionLocator,
+     sort_keys: List[SortKey],
+     max_records_per_output_file: int,
+     new_hash_bucket_count: Optional[int],
+     deltacat_storage=unimplemented_deltacat_storage,
+ ) -> int:
+
+     assert (
+         source_partition_locator.partition_values
+         == compacted_partition_locator.partition_values
+     ), (
+         "In-place compaction must use the same partition values for the "
          "source and destination."
-     assert max_records_per_output_file >= 1, \
-         "Max records per output file must be a positive value"
+     )
+     assert (
+         max_records_per_output_file >= 1
+     ), "Max records per output file must be a positive value"
      if new_hash_bucket_count is not None:
-         assert new_hash_bucket_count >= 1, \
-             "New hash bucket count must be a positive value"
+         assert (
+             new_hash_bucket_count >= 1
+         ), "New hash bucket count must be a positive value"
      return SortKey.validate_sort_keys(
          source_partition_locator,
          sort_keys,
@@ -54,95 +69,110 @@ def check_preconditions(


  def compact_partition(
-         source_partition_locator: PartitionLocator,
-         compacted_partition_locator: PartitionLocator,
-         primary_keys: Set[str],
-         compaction_artifact_s3_bucket: str,
-         last_stream_position_to_compact: int,
-         hash_bucket_count: Optional[int] = None,
-         sort_keys: List[SortKey] = None,
-         records_per_primary_key_index_file: int = 38_000_000,
-         records_per_compacted_file: int = 4_000_000,
-         input_deltas_stats: Dict[int, DeltaStats] = None,
-         min_pk_index_pa_bytes: int = 0,
-         min_hash_bucket_chunk_size: int = 0,
-         compacted_file_content_type: ContentType = ContentType.PARQUET,
-         delete_prev_primary_key_index: bool = False,
-         read_round_completion: bool = False,
-         pg_config: Optional[PlacementGroupConfig] = None,
-         schema_on_read: Optional[pa.schema] = None, # TODO (ricmiyam): Remove this and retrieve schema from storage API
-         deltacat_storage=unimplemented_deltacat_storage):
+     source_partition_locator: PartitionLocator,
+     destination_partition_locator: PartitionLocator,
+     primary_keys: Set[str],
+     compaction_artifact_s3_bucket: str,
+     last_stream_position_to_compact: int,
+     *,
+     hash_bucket_count: Optional[int] = None,
+     sort_keys: List[SortKey] = None,
+     records_per_primary_key_index_file: int = 38_000_000,
+     records_per_compacted_file: int = 4_000_000,
+     input_deltas_stats: Dict[int, DeltaStats] = None,
+     min_pk_index_pa_bytes: int = 0,
+     min_hash_bucket_chunk_size: int = 0,
+     compacted_file_content_type: ContentType = ContentType.PARQUET,
+     delete_prev_primary_key_index: bool = False,
+     pg_config: Optional[PlacementGroupConfig] = None,
+     schema_on_read: Optional[
+         pa.schema
+     ] = None, # TODO (ricmiyam): Remove this and retrieve schema from storage API
+     rebase_source_partition_locator: Optional[PartitionLocator] = None,
+     rebase_source_partition_high_watermark: Optional[int] = None,
+     deltacat_storage=unimplemented_deltacat_storage,
+ ) -> Optional[str]:

      logger.info(f"Starting compaction session for: {source_partition_locator}")
      partition = None
      compaction_rounds_executed = 0
      has_next_compaction_round = True
+     new_rcf_s3_url = None
      while has_next_compaction_round:
-         has_next_compaction_round, new_partition, new_rci = \
-             _execute_compaction_round(
-                 source_partition_locator,
-                 compacted_partition_locator,
-                 primary_keys,
-                 compaction_artifact_s3_bucket,
-                 last_stream_position_to_compact,
-                 hash_bucket_count,
-                 sort_keys,
-                 records_per_primary_key_index_file,
-                 records_per_compacted_file,
-                 input_deltas_stats,
-                 min_pk_index_pa_bytes,
-                 min_hash_bucket_chunk_size,
-                 compacted_file_content_type,
-                 delete_prev_primary_key_index,
-                 read_round_completion,
-                 schema_on_read,
-                 deltacat_storage=deltacat_storage,
-                 pg_config=pg_config
-             )
+         (
+             has_next_compaction_round,
+             new_partition,
+             new_rci,
+             new_rcf_s3_url,
+         ) = _execute_compaction_round(
+             source_partition_locator,
+             destination_partition_locator,
+             primary_keys,
+             compaction_artifact_s3_bucket,
+             last_stream_position_to_compact,
+             hash_bucket_count,
+             sort_keys,
+             records_per_primary_key_index_file,
+             records_per_compacted_file,
+             input_deltas_stats,
+             min_pk_index_pa_bytes,
+             min_hash_bucket_chunk_size,
+             compacted_file_content_type,
+             delete_prev_primary_key_index,
+             pg_config,
+             schema_on_read,
+             rebase_source_partition_locator,
+             rebase_source_partition_high_watermark,
+             deltacat_storage,
+         )
          if new_partition:
              partition = new_partition
-             compacted_partition_locator = new_partition.locator
+             destination_partition_locator = new_partition.locator
              compaction_rounds_executed += 1
          # Take new primary key index sizes into account for subsequent compaction rounds and their dedupe steps
          if new_rci:
              min_pk_index_pa_bytes = new_rci.pk_index_pyarrow_write_result.pyarrow_bytes

-     logger.info(f"Partition-{source_partition_locator.partition_values}-> Compaction session data processing completed in "
-         f"{compaction_rounds_executed} rounds.")
+     logger.info(
+         f"Partition-{source_partition_locator.partition_values}-> Compaction session data processing completed in "
+         f"{compaction_rounds_executed} rounds."
+     )
      if partition:
          logger.info(f"Committing compacted partition to: {partition.locator}")
          partition = deltacat_storage.commit_partition(partition)
          logger.info(f"Committed compacted partition: {partition}")
      logger.info(f"Completed compaction session for: {source_partition_locator}")
+     return new_rcf_s3_url


  def _execute_compaction_round(
-         source_partition_locator: PartitionLocator,
-         compacted_partition_locator: PartitionLocator,
-         primary_keys: Set[str],
-         compaction_artifact_s3_bucket: str,
-         last_stream_position_to_compact: int,
-         new_hash_bucket_count: Optional[int],
-         sort_keys: List[SortKey],
-         records_per_primary_key_index_file: int,
-         records_per_compacted_file: int,
-         input_deltas_stats: Dict[int, DeltaStats],
-         min_pk_index_pa_bytes: int,
-         min_hash_bucket_chunk_size: int,
-         compacted_file_content_type: ContentType,
-         delete_prev_primary_key_index: bool,
-         read_round_completion: bool,
-         schema_on_read: Optional[pa.schema],
-         deltacat_storage = unimplemented_deltacat_storage,
-         pg_config: Optional[PlacementGroupConfig] = None) \
-         -> Tuple[bool, Optional[Partition], Optional[RoundCompletionInfo]]:
-
+     source_partition_locator: PartitionLocator,
+     compacted_partition_locator: PartitionLocator,
+     primary_keys: Set[str],
+     compaction_artifact_s3_bucket: str,
+     last_stream_position_to_compact: int,
+     new_hash_bucket_count: Optional[int],
+     sort_keys: List[SortKey],
+     records_per_primary_key_index_file: int,
+     records_per_compacted_file: int,
+     input_deltas_stats: Dict[int, DeltaStats],
+     min_pk_index_pa_bytes: int,
+     min_hash_bucket_chunk_size: int,
+     compacted_file_content_type: ContentType,
+     delete_prev_primary_key_index: bool,
+     pg_config: Optional[PlacementGroupConfig],
+     schema_on_read: Optional[pa.schema],
+     rebase_source_partition_locator: Optional[PartitionLocator],
+     rebase_source_partition_high_watermark: Optional[int],
+     deltacat_storage=unimplemented_deltacat_storage,
+ ) -> Tuple[bool, Optional[Partition], Optional[RoundCompletionInfo], Optional[str]]:

      if not primary_keys:
          # TODO (pdames): run simple rebatch to reduce all deltas into 1 delta
          # with normalized manifest entry sizes
          raise NotImplementedError(
-             "Compaction only supports tables with 1 or more primary keys")
+             "Compaction only supports tables with 1 or more primary keys"
+         )
      if sort_keys is None:
          sort_keys = []
      # TODO (pdames): detect and handle schema evolution (at least ensure that
@@ -166,23 +196,25 @@ def _execute_compaction_round(
      cluster_resources = ray.cluster_resources()
      logger.info(f"Total cluster resources: {cluster_resources}")
      node_resource_keys = None
-     if pg_config: # use resource in each placement group
+     if pg_config:  # use resource in each placement group
          cluster_resources = pg_config.resource
-         cluster_cpus = cluster_resources['CPU']
-     else: # use all cluster resource
+         cluster_cpus = cluster_resources["CPU"]
+     else:  # use all cluster resource
          logger.info(f"Available cluster resources: {ray.available_resources()}")
          cluster_cpus = int(cluster_resources["CPU"])
          logger.info(f"Total cluster CPUs: {cluster_cpus}")
          node_resource_keys = live_node_resource_keys()
-         logger.info(f"Found {len(node_resource_keys)} live cluster nodes: "
-             f"{node_resource_keys}")
+         logger.info(
+             f"Found {len(node_resource_keys)} live cluster nodes: "
+             f"{node_resource_keys}"
+         )

      # create a remote options provider to round-robin tasks across all nodes or allocated bundles
      logger.info(f"Setting round robin scheduling with node id:{node_resource_keys}")
      round_robin_opt_provider = functools.partial(
          round_robin_options_provider,
          resource_keys=node_resource_keys,
-         pg_config = pg_config.opts if pg_config else None
+         pg_config=pg_config.opts if pg_config else None,
      )

      # assign a distinct index to each node in the cluster
@@ -206,16 +238,20 @@ def _execute_compaction_round(
          _PRIMARY_KEY_INDEX_ALGORITHM_VERSION,
      )
      compatible_primary_key_index_locator = PrimaryKeyIndexLocator.of(
-         compatible_primary_key_index_meta)
-     compatible_primary_key_index_root_path = \
+         compatible_primary_key_index_meta
+     )
+     compatible_primary_key_index_root_path = (
          compatible_primary_key_index_locator.primary_key_index_root_path
+     )

      # read the results from any previously completed compaction round that used
      # a compatible primary key index
      round_completion_info = None
-     if read_round_completion:
-         logger.info(f"Reading round completion file for compatible "
-             f"primary key index root path {compatible_primary_key_index_root_path}")
+     if not rebase_source_partition_locator:
+         logger.info(
+             f"Reading round completion file for compatible "
+             f"primary key index root path: {compatible_primary_key_index_root_path}"
+         )
          round_completion_info = rcf.read_round_completion_file(
              compaction_artifact_s3_bucket,
              source_partition_locator,
@@ -226,21 +262,34 @@ def _execute_compaction_round(
      # read the previous compaction round's hash bucket count, if any
      old_hash_bucket_count = None
      if round_completion_info:
-         old_pki_version_locator = round_completion_info\
-             .primary_key_index_version_locator
-         old_hash_bucket_count = old_pki_version_locator\
-             .primary_key_index_version_meta \
-             .hash_bucket_count
-         min_pk_index_pa_bytes = round_completion_info.pk_index_pyarrow_write_result.pyarrow_bytes
+         old_pki_version_locator = (
+             round_completion_info.primary_key_index_version_locator
+         )
+         old_hash_bucket_count = (
+             old_pki_version_locator.primary_key_index_version_meta.hash_bucket_count
+         )
+         min_pk_index_pa_bytes = (
+             round_completion_info.pk_index_pyarrow_write_result.pyarrow_bytes
+         )
+     else:
+         logger.info(
+             f"No prior round info read. Source partition: "
+             f"{source_partition_locator}. Primary key index locator: "
+             f"{compatible_primary_key_index_locator}. Rebase source "
+             f"partition locator: {rebase_source_partition_locator}"
+         )

      # use the new hash bucket count if provided, or fall back to old count
-     hash_bucket_count = new_hash_bucket_count \
-         if new_hash_bucket_count is not None \
+     hash_bucket_count = (
+         new_hash_bucket_count
+         if new_hash_bucket_count is not None
          else old_hash_bucket_count
+     )

      # discover input delta files
-     high_watermark = round_completion_info.high_watermark \
-         if round_completion_info else None
+     high_watermark = (
+         round_completion_info.high_watermark if round_completion_info else None
+     )

      input_deltas = io.discover_deltas(
          source_partition_locator,
@@ -251,25 +300,29 @@ def _execute_compaction_round(

      if not input_deltas:
          logger.info("No input deltas found to compact.")
-         return False, None, None
+         return False, None, None, None

      # limit the input deltas to fit on this cluster and convert them to
      # annotated deltas of equivalent size for easy parallel distribution

-     uniform_deltas, hash_bucket_count, last_stream_position_compacted = \
-         io.limit_input_deltas(
-             input_deltas,
-             cluster_resources,
-             hash_bucket_count,
-             min_pk_index_pa_bytes,
-             min_hash_bucket_chunk_size,
-             input_deltas_stats=input_deltas_stats,
-             deltacat_storage=deltacat_storage
-         )
+     (
+         uniform_deltas,
+         hash_bucket_count,
+         last_stream_position_compacted,
+     ) = io.limit_input_deltas(
+         input_deltas,
+         cluster_resources,
+         hash_bucket_count,
+         min_pk_index_pa_bytes,
+         min_hash_bucket_chunk_size,
+         input_deltas_stats=input_deltas_stats,
+         deltacat_storage=deltacat_storage,
+     )

-     assert hash_bucket_count is not None and hash_bucket_count > 0, \
-         f"Unexpected Error: Default hash bucket count ({hash_bucket_count}) " \
-         f"is invalid."
+     assert hash_bucket_count is not None and hash_bucket_count > 0, (
+         f"Expected hash bucket count to be a positive integer, but found "
+         f"`{hash_bucket_count}`"
+     )

      # rehash the primary key index if necessary
      if round_completion_info:
@@ -277,8 +330,8 @@ def _execute_compaction_round(
          # the previous primary key index is compatible with the current, but
          # will need to be rehashed if the hash bucket count has changed
          if hash_bucket_count != old_hash_bucket_count:
-             # TODO(draghave): manually test the path after prior primary key
-             # index was already built
+             # TODO(draghave): manually test the path after prior primary key
+             # index was already built
              round_completion_info = pki.rehash(
                  round_robin_opt_provider,
                  compaction_artifact_s3_bucket,
@@ -289,10 +342,6 @@ def _execute_compaction_round(
                  records_per_primary_key_index_file,
                  delete_prev_primary_key_index,
              )
-     else:
-         logger.info(f"No prior round completion file found. Source partition: "
-             f"{source_partition_locator}. Primary key index locator: "
-             f"{compatible_primary_key_index_locator}")

      # parallel step 1:
      # group like primary keys together by hashing them into buckets
@@ -315,7 +364,7 @@ def _execute_compaction_round(
      for hash_group_index, object_id in enumerate(hash_group_idx_to_obj_id):
          if object_id:
              all_hash_group_idx_to_obj_id[hash_group_index].append(object_id)
-     hash_group_count = dedupe_task_count = len(all_hash_group_idx_to_obj_id)
+     hash_group_count = len(all_hash_group_idx_to_obj_id)
      logger.info(f"Hash bucket groups created: {hash_group_count}")

      # TODO (pdames): when resources are freed during the last round of hash
@@ -343,9 +392,11 @@ def _execute_compaction_round(
          _PRIMARY_KEY_INDEX_ALGORITHM_VERSION,
      )
      new_primary_key_index_locator = PrimaryKeyIndexLocator.of(
-         new_primary_key_index_meta)
-     new_primary_key_index_root_path = new_primary_key_index_locator\
-         .primary_key_index_root_path
+         new_primary_key_index_meta
+     )
+     new_primary_key_index_root_path = (
+         new_primary_key_index_locator.primary_key_index_root_path
+     )

      # generate a new primary key index version locator for this round
      new_primary_key_index_version_meta = PrimaryKeyIndexVersionMeta.of(
@@ -353,8 +404,8 @@ def _execute_compaction_round(
          hash_bucket_count,
      )
      new_pki_version_locator = PrimaryKeyIndexVersionLocator.generate(
-         new_primary_key_index_version_meta)
-
+         new_primary_key_index_version_meta
+     )

      # parallel step 2:
      # discover records with duplicate primary keys in each hash bucket, and
@@ -366,30 +417,34 @@ def _execute_compaction_round(
          ray_task=dd.dedupe,
          max_parallelism=max_parallelism,
          options_provider=round_robin_opt_provider,
-         kwargs_provider=lambda index, item: {"dedupe_task_index": index,
-             "object_ids": item},
+         kwargs_provider=lambda index, item: {
+             "dedupe_task_index": index,
+             "object_ids": item,
+         },
          compaction_artifact_s3_bucket=compaction_artifact_s3_bucket,
          round_completion_info=round_completion_info,
          new_primary_key_index_version_locator=new_pki_version_locator,
          sort_keys=sort_keys,
          max_records_per_index_file=records_per_primary_key_index_file,
          num_materialize_buckets=num_materialize_buckets,
-         delete_old_primary_key_index=delete_prev_primary_key_index
+         delete_old_primary_key_index=delete_prev_primary_key_index,
      )
      logger.info(f"Getting {len(dd_tasks_pending)} dedupe results...")
      dd_results = ray.get([t[0] for t in dd_tasks_pending])
      logger.info(f"Got {len(dd_results)} dedupe results.")
      all_mat_buckets_to_obj_id = defaultdict(list)
      for mat_bucket_idx_to_obj_id in dd_results:
-         for bucket_idx, dd_task_index_and_object_id_tuple in \
-                 mat_bucket_idx_to_obj_id.items():
+         for (
+             bucket_idx,
+             dd_task_index_and_object_id_tuple,
+         ) in mat_bucket_idx_to_obj_id.items():
              all_mat_buckets_to_obj_id[bucket_idx].append(
-                 dd_task_index_and_object_id_tuple)
+                 dd_task_index_and_object_id_tuple
+             )
      logger.info(f"Getting {len(dd_tasks_pending)} dedupe result stat(s)...")
      pki_stats = ray.get([t[2] for t in dd_tasks_pending])
      logger.info(f"Got {len(pki_stats)} dedupe result stat(s).")
-     logger.info(f"Materialize buckets created: "
-         f"{len(all_mat_buckets_to_obj_id)}")
+     logger.info(f"Materialize buckets created: " f"{len(all_mat_buckets_to_obj_id)}")

      # TODO(pdames): when resources are freed during the last round of deduping
      # start running materialize tasks that read materialization source file
@@ -408,9 +463,9 @@ def _execute_compaction_round(
          ray_task=mat.materialize,
          max_parallelism=max_parallelism,
          options_provider=round_robin_opt_provider,
-         kwargs_provider=lambda index, mat_bucket_idx_to_obj_id: {
-             "mat_bucket_index": mat_bucket_idx_to_obj_id[0],
-             "dedupe_task_idx_and_obj_id_tuples": mat_bucket_idx_to_obj_id[1],
+         kwargs_provider=lambda index, mat_bucket_index_to_obj_id: {
+             "mat_bucket_index": mat_bucket_index_to_obj_id[0],
+             "dedupe_task_idx_and_obj_id_tuples": mat_bucket_index_to_obj_id[1],
          },
          schema=schema_on_read,
          round_completion_info=round_completion_info,
@@ -435,24 +490,40 @@ def _execute_compaction_round(
          compacted_delta.stream_position,
      )

-     round_completion_info = RoundCompletionInfo.of(
-         last_stream_position_compacted,
+     rci_high_watermark = (
+         rebase_source_partition_high_watermark
+         if rebase_source_partition_high_watermark
+         else last_stream_position_compacted
+     )
+     new_round_completion_info = RoundCompletionInfo.of(
+         rci_high_watermark,
          new_compacted_delta_locator,
-         PyArrowWriteResult.union([m.pyarrow_write_result
-             for m in mat_results]),
+         PyArrowWriteResult.union([m.pyarrow_write_result for m in mat_results]),
          PyArrowWriteResult.union(pki_stats),
          bit_width_of_sort_keys,
          new_pki_version_locator,
+         rebase_source_partition_locator
+         or round_completion_info.rebase_source_partition_locator,
      )
-     rcf.write_round_completion_file(
+     rcf_source_partition_locator = (
+         rebase_source_partition_locator
+         if rebase_source_partition_locator
+         else source_partition_locator
+     )
+     round_completion_file_s3_url = rcf.write_round_completion_file(
          compaction_artifact_s3_bucket,
-         source_partition_locator,
+         rcf_source_partition_locator,
          new_primary_key_index_root_path,
-         round_completion_info,
+         new_round_completion_info,
+     )
+     logger.info(
+         f"partition-{source_partition_locator.partition_values},"
+         f"compacted at: {last_stream_position_compacted},"
+         f"last position: {last_stream_position_to_compact}"
+     )
+     return (
+         (last_stream_position_compacted < last_stream_position_to_compact),
+         partition,
+         new_round_completion_info,
+         round_completion_file_s3_url,
      )
-     logger.info(f"partition-{source_partition_locator.partition_values},compacted at:{last_stream_position_compacted}, last position:{last_stream_position_to_compact}")
-     return \
-         (last_stream_position_compacted < last_stream_position_to_compact), \
-         partition, \
-         round_completion_info
-
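
The most visible API change in this diff is to compact_partition in deltacat/compute/compactor/compaction_session.py: compacted_partition_locator is renamed to destination_partition_locator, every argument after last_stream_position_to_compact becomes keyword-only (note the new `*`), the read_round_completion flag is replaced by the new rebase_source_partition_locator and rebase_source_partition_high_watermark parameters, and the function now returns the S3 URL of the round completion file written by the final round instead of returning nothing. The sketch below illustrates a call against the 0.1.12 signature; the storage module and the locator and stream-position values are hypothetical placeholders assumed to be built elsewhere, not part of this diff.

# Hypothetical usage sketch for the 0.1.12 compact_partition signature.
# `my_storage`, the two PartitionLocator values, and `last_stream_position` are
# placeholders; only the parameter names and return value come from this diff.
from deltacat.compute.compactor.compaction_session import compact_partition
from deltacat.types.media import ContentType

rcf_s3_url = compact_partition(
    source_partition_locator,       # PartitionLocator of the partition to compact
    destination_partition_locator,  # renamed from compacted_partition_locator
    primary_keys={"id"},
    compaction_artifact_s3_bucket="my-compaction-artifacts",
    last_stream_position_to_compact=last_stream_position,
    # everything below must now be passed by keyword
    records_per_compacted_file=4_000_000,
    compacted_file_content_type=ContentType.PARQUET,
    rebase_source_partition_locator=None,  # when set, skips reading the prior round completion file
    deltacat_storage=my_storage,           # module implementing deltacat.storage.interface
)
# 0.1.12 returns the round completion file's S3 URL from the final round
# (None if no input deltas were found); 0.1.10.dev0 returned nothing.
print(rcf_s3_url)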