deltacat 0.1.6__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
Files changed (83)
  1. deltacat/__init__.py +41 -15
  2. deltacat/aws/clients.py +12 -31
  3. deltacat/aws/constants.py +1 -1
  4. deltacat/aws/redshift/__init__.py +7 -2
  5. deltacat/aws/redshift/model/manifest.py +54 -50
  6. deltacat/aws/s3u.py +183 -194
  7. deltacat/catalog/delegate.py +151 -185
  8. deltacat/catalog/interface.py +78 -97
  9. deltacat/catalog/model/catalog.py +21 -21
  10. deltacat/catalog/model/table_definition.py +11 -9
  11. deltacat/compute/compactor/__init__.py +12 -16
  12. deltacat/compute/compactor/compaction_session.py +249 -198
  13. deltacat/compute/compactor/model/delta_annotated.py +60 -44
  14. deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
  15. deltacat/compute/compactor/model/delta_file_locator.py +10 -8
  16. deltacat/compute/compactor/model/materialize_result.py +6 -7
  17. deltacat/compute/compactor/model/primary_key_index.py +38 -34
  18. deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
  19. deltacat/compute/compactor/model/round_completion_info.py +25 -19
  20. deltacat/compute/compactor/model/sort_key.py +18 -15
  21. deltacat/compute/compactor/steps/dedupe.py +153 -260
  22. deltacat/compute/compactor/steps/hash_bucket.py +56 -56
  23. deltacat/compute/compactor/steps/materialize.py +139 -100
  24. deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
  25. deltacat/compute/compactor/steps/rehash/rewrite_index.py +11 -13
  26. deltacat/compute/compactor/utils/io.py +59 -47
  27. deltacat/compute/compactor/utils/primary_key_index.py +131 -90
  28. deltacat/compute/compactor/utils/round_completion_file.py +22 -23
  29. deltacat/compute/compactor/utils/system_columns.py +33 -42
  30. deltacat/compute/metastats/meta_stats.py +235 -157
  31. deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
  32. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
  33. deltacat/compute/metastats/stats.py +95 -64
  34. deltacat/compute/metastats/utils/io.py +100 -53
  35. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
  36. deltacat/compute/metastats/utils/ray_utils.py +38 -33
  37. deltacat/compute/stats/basic.py +107 -69
  38. deltacat/compute/stats/models/delta_column_stats.py +11 -8
  39. deltacat/compute/stats/models/delta_stats.py +59 -32
  40. deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
  41. deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
  42. deltacat/compute/stats/models/stats_result.py +24 -14
  43. deltacat/compute/stats/utils/intervals.py +16 -9
  44. deltacat/compute/stats/utils/io.py +86 -51
  45. deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
  46. deltacat/constants.py +8 -10
  47. deltacat/io/__init__.py +2 -2
  48. deltacat/io/aws/redshift/redshift_datasource.py +157 -143
  49. deltacat/io/dataset.py +14 -17
  50. deltacat/io/read_api.py +36 -33
  51. deltacat/logs.py +94 -42
  52. deltacat/storage/__init__.py +18 -8
  53. deltacat/storage/interface.py +196 -213
  54. deltacat/storage/model/delta.py +45 -51
  55. deltacat/storage/model/list_result.py +12 -8
  56. deltacat/storage/model/namespace.py +4 -5
  57. deltacat/storage/model/partition.py +42 -42
  58. deltacat/storage/model/stream.py +29 -30
  59. deltacat/storage/model/table.py +14 -14
  60. deltacat/storage/model/table_version.py +32 -31
  61. deltacat/storage/model/types.py +1 -0
  62. deltacat/tests/stats/test_intervals.py +11 -24
  63. deltacat/tests/utils/__init__.py +0 -0
  64. deltacat/tests/utils/test_record_batch_tables.py +284 -0
  65. deltacat/types/media.py +3 -4
  66. deltacat/types/tables.py +31 -21
  67. deltacat/utils/common.py +5 -11
  68. deltacat/utils/numpy.py +20 -22
  69. deltacat/utils/pandas.py +73 -100
  70. deltacat/utils/performance.py +3 -9
  71. deltacat/utils/placement.py +276 -228
  72. deltacat/utils/pyarrow.py +302 -89
  73. deltacat/utils/ray_utils/collections.py +2 -1
  74. deltacat/utils/ray_utils/concurrency.py +36 -29
  75. deltacat/utils/ray_utils/dataset.py +28 -28
  76. deltacat/utils/ray_utils/performance.py +5 -9
  77. deltacat/utils/ray_utils/runtime.py +9 -10
  78. {deltacat-0.1.6.dist-info → deltacat-0.1.11.dist-info}/METADATA +21 -11
  79. deltacat-0.1.11.dist-info/RECORD +110 -0
  80. {deltacat-0.1.6.dist-info → deltacat-0.1.11.dist-info}/WHEEL +1 -1
  81. deltacat-0.1.6.dist-info/RECORD +0 -108
  82. {deltacat-0.1.6.dist-info → deltacat-0.1.11.dist-info}/LICENSE +0 -0
  83. {deltacat-0.1.6.dist-info → deltacat-0.1.11.dist-info}/top_level.txt +0 -0
@@ -1,54 +1,66 @@
- import logging
- import time
  import functools
- import ray
-
+ import logging
  from collections import defaultdict
+ from typing import Dict, List, Optional, Set, Tuple
+
+ import pyarrow as pa
+ import ray

  from deltacat import logs
+ from deltacat.compute.compactor import (
+     PrimaryKeyIndexLocator,
+     PrimaryKeyIndexMeta,
+     PrimaryKeyIndexVersionLocator,
+     PrimaryKeyIndexVersionMeta,
+     PyArrowWriteResult,
+     RoundCompletionInfo,
+     SortKey,
+ )
+ from deltacat.compute.compactor.steps import dedupe as dd
+ from deltacat.compute.compactor.steps import hash_bucket as hb
+ from deltacat.compute.compactor.steps import materialize as mat
+ from deltacat.compute.compactor.utils import io
+ from deltacat.compute.compactor.utils import primary_key_index as pki
+ from deltacat.compute.compactor.utils import round_completion_file as rcf
  from deltacat.compute.stats.models.delta_stats import DeltaStats
- from deltacat.storage import Delta, DeltaLocator, Partition, \
-     PartitionLocator, interface as unimplemented_deltacat_storage
- from deltacat.utils.ray_utils.concurrency import invoke_parallel, \
-     round_robin_options_provider
- from deltacat.utils.ray_utils.runtime import live_node_resource_keys
- from deltacat.compute.compactor.steps import hash_bucket as hb, dedupe as dd, \
-     materialize as mat
- from deltacat.compute.compactor import SortKey, PrimaryKeyIndexMeta, \
-     PrimaryKeyIndexLocator, PrimaryKeyIndexVersionMeta, \
-     PrimaryKeyIndexVersionLocator, RoundCompletionInfo, \
-     PyArrowWriteResult
- from deltacat.compute.compactor.utils import round_completion_file as rcf, io, \
-     primary_key_index as pki
+ from deltacat.storage import Delta, DeltaLocator, Partition, PartitionLocator
+ from deltacat.storage import interface as unimplemented_deltacat_storage
  from deltacat.types.media import ContentType
+ from deltacat.utils.placement import PlacementGroupConfig
+ from deltacat.utils.ray_utils.concurrency import (
+     invoke_parallel,
+     round_robin_options_provider,
+ )
+ from deltacat.utils.ray_utils.runtime import live_node_resource_keys

- from typing import List, Set, Optional, Tuple, Dict, Union, Any
-
- import pyarrow as pa
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

- _SORT_KEY_NAME_INDEX: int = 0
- _SORT_KEY_ORDER_INDEX: int = 1
  _PRIMARY_KEY_INDEX_ALGORITHM_VERSION: str = "1.0"


  def check_preconditions(
-         source_partition_locator: PartitionLocator,
-         compacted_partition_locator: PartitionLocator,
-         sort_keys: List[SortKey],
-         max_records_per_output_file: int,
-         new_hash_bucket_count: Optional[int],
-         deltacat_storage=unimplemented_deltacat_storage) -> int:
-
-     assert source_partition_locator.partition_values \
-         == compacted_partition_locator.partition_values, \
-         "In-place compaction must use the same partition values for the " \
+     source_partition_locator: PartitionLocator,
+     compacted_partition_locator: PartitionLocator,
+     sort_keys: List[SortKey],
+     max_records_per_output_file: int,
+     new_hash_bucket_count: Optional[int],
+     deltacat_storage=unimplemented_deltacat_storage,
+ ) -> int:
+
+     assert (
+         source_partition_locator.partition_values
+         == compacted_partition_locator.partition_values
+     ), (
+         "In-place compaction must use the same partition values for the "
          "source and destination."
-     assert max_records_per_output_file >= 1, \
-         "Max records per output file must be a positive value"
+     )
+     assert (
+         max_records_per_output_file >= 1
+     ), "Max records per output file must be a positive value"
      if new_hash_bucket_count is not None:
-         assert new_hash_bucket_count >= 1, \
-             "New hash bucket count must be a positive value"
+         assert (
+             new_hash_bucket_count >= 1
+         ), "New hash bucket count must be a positive value"
      return SortKey.validate_sort_keys(
          source_partition_locator,
          sort_keys,
@@ -57,101 +69,111 @@ def check_preconditions(


  def compact_partition(
-         source_partition_locator: PartitionLocator,
-         compacted_partition_locator: PartitionLocator,
-         primary_keys: Set[str],
-         compaction_artifact_s3_bucket: str,
-         last_stream_position_to_compact: int,
-         hash_bucket_count: Optional[int] = None,
-         sort_keys: List[SortKey] = None,
-         records_per_primary_key_index_file: int = 38_000_000,
-         records_per_compacted_file: int = 4_000_000,
-         input_deltas_stats: Dict[int, DeltaStats] = None,
-         min_pk_index_pa_bytes: int = 0,
-         min_hash_bucket_chunk_size: int = 0,
-         compacted_file_content_type: ContentType = ContentType.PARQUET,
-         delete_prev_primary_key_index: bool = False,
-         read_round_completion: bool = False,
-         pg_config: Optional[List[Dict[str, Any]]] = None,
-         schema_on_read: Optional[pa.schema] = None, # TODO (ricmiyam): Remove this and retrieve schema from storage API
-         deltacat_storage=unimplemented_deltacat_storage):
+     source_partition_locator: PartitionLocator,
+     destination_partition_locator: PartitionLocator,
+     primary_keys: Set[str],
+     compaction_artifact_s3_bucket: str,
+     last_stream_position_to_compact: int,
+     *,
+     hash_bucket_count: Optional[int] = None,
+     sort_keys: List[SortKey] = None,
+     records_per_primary_key_index_file: int = 38_000_000,
+     records_per_compacted_file: int = 4_000_000,
+     input_deltas_stats: Dict[int, DeltaStats] = None,
+     min_pk_index_pa_bytes: int = 0,
+     min_hash_bucket_chunk_size: int = 0,
+     compacted_file_content_type: ContentType = ContentType.PARQUET,
+     delete_prev_primary_key_index: bool = False,
+     pg_config: Optional[PlacementGroupConfig] = None,
+     schema_on_read: Optional[
+         pa.schema
+     ] = None,  # TODO (ricmiyam): Remove this and retrieve schema from storage API
+     rebase_source_partition_locator: Optional[PartitionLocator] = None,
+     rebase_source_partition_high_watermark: Optional[int] = None,
+     deltacat_storage=unimplemented_deltacat_storage,
+ ) -> Optional[str]:

      logger.info(f"Starting compaction session for: {source_partition_locator}")
      partition = None
      compaction_rounds_executed = 0
      has_next_compaction_round = True
-     opts={}
-     if pg_config:
-         opts=pg_config[0]
+     new_rcf_s3_url = None
      while has_next_compaction_round:
-         has_next_compaction_round_obj, new_partition_obj, new_rci_obj = \
-             _execute_compaction_round.options(**opts).remote(
-                 source_partition_locator,
-                 compacted_partition_locator,
-                 primary_keys,
-                 compaction_artifact_s3_bucket,
-                 last_stream_position_to_compact,
-                 hash_bucket_count,
-                 sort_keys,
-                 records_per_primary_key_index_file,
-                 records_per_compacted_file,
-                 input_deltas_stats,
-                 min_pk_index_pa_bytes,
-                 min_hash_bucket_chunk_size,
-                 compacted_file_content_type,
-                 delete_prev_primary_key_index,
-                 read_round_completion,
-                 schema_on_read,
-                 deltacat_storage=deltacat_storage,
-                 pg_config=pg_config
-             )
-         has_next_compaction_round = ray.get(has_next_compaction_round_obj)
-         new_partition = ray.get(new_partition_obj)
-         new_rci = ray.get(new_rci_obj)
+         (
+             has_next_compaction_round,
+             new_partition,
+             new_rci,
+             new_rcf_s3_url,
+         ) = _execute_compaction_round(
+             source_partition_locator,
+             destination_partition_locator,
+             primary_keys,
+             compaction_artifact_s3_bucket,
+             last_stream_position_to_compact,
+             hash_bucket_count,
+             sort_keys,
+             records_per_primary_key_index_file,
+             records_per_compacted_file,
+             input_deltas_stats,
+             min_pk_index_pa_bytes,
+             min_hash_bucket_chunk_size,
+             compacted_file_content_type,
+             delete_prev_primary_key_index,
+             pg_config,
+             schema_on_read,
+             rebase_source_partition_locator,
+             rebase_source_partition_high_watermark,
+             deltacat_storage,
+         )
          if new_partition:
              partition = new_partition
-             compacted_partition_locator = new_partition.locator
+             destination_partition_locator = new_partition.locator
              compaction_rounds_executed += 1
          # Take new primary key index sizes into account for subsequent compaction rounds and their dedupe steps
          if new_rci:
              min_pk_index_pa_bytes = new_rci.pk_index_pyarrow_write_result.pyarrow_bytes

-     logger.info(f"Partition-{source_partition_locator.partition_values}-> Compaction session data processing completed in "
-                 f"{compaction_rounds_executed} rounds.")
+     logger.info(
+         f"Partition-{source_partition_locator.partition_values}-> Compaction session data processing completed in "
+         f"{compaction_rounds_executed} rounds."
+     )
      if partition:
          logger.info(f"Committing compacted partition to: {partition.locator}")
          partition = deltacat_storage.commit_partition(partition)
          logger.info(f"Committed compacted partition: {partition}")
      logger.info(f"Completed compaction session for: {source_partition_locator}")
+     return new_rcf_s3_url

- @ray.remote(num_cpus=0.1,num_returns=3)
- def _execute_compaction_round(
-         source_partition_locator: PartitionLocator,
-         compacted_partition_locator: PartitionLocator,
-         primary_keys: Set[str],
-         compaction_artifact_s3_bucket: str,
-         last_stream_position_to_compact: int,
-         new_hash_bucket_count: Optional[int],
-         sort_keys: List[SortKey],
-         records_per_primary_key_index_file: int,
-         records_per_compacted_file: int,
-         input_deltas_stats: Dict[int, DeltaStats],
-         min_pk_index_pa_bytes: int,
-         min_hash_bucket_chunk_size: int,
-         compacted_file_content_type: ContentType,
-         delete_prev_primary_key_index: bool,
-         read_round_completion: bool,
-         schema_on_read: Optional[pa.schema],
-         deltacat_storage = unimplemented_deltacat_storage,
-         pg_config: Optional[List[Dict[str, Any]]] = None) \
-         -> Tuple[bool, Optional[Partition], Optional[RoundCompletionInfo]]:

+ @ray.remote(num_cpus=0.1, num_returns=3)
+ def _execute_compaction_round(
+     source_partition_locator: PartitionLocator,
+     compacted_partition_locator: PartitionLocator,
+     primary_keys: Set[str],
+     compaction_artifact_s3_bucket: str,
+     last_stream_position_to_compact: int,
+     new_hash_bucket_count: Optional[int],
+     sort_keys: List[SortKey],
+     records_per_primary_key_index_file: int,
+     records_per_compacted_file: int,
+     input_deltas_stats: Dict[int, DeltaStats],
+     min_pk_index_pa_bytes: int,
+     min_hash_bucket_chunk_size: int,
+     compacted_file_content_type: ContentType,
+     delete_prev_primary_key_index: bool,
+     pg_config: Optional[PlacementGroupConfig],
+     schema_on_read: Optional[pa.schema],
+     rebase_source_partition_locator: Optional[PartitionLocator],
+     rebase_source_partition_high_watermark: Optional[int],
+     deltacat_storage=unimplemented_deltacat_storage,
+ ) -> Tuple[bool, Optional[Partition], Optional[RoundCompletionInfo], Optional[str]]:

      if not primary_keys:
          # TODO (pdames): run simple rebatch to reduce all deltas into 1 delta
          # with normalized manifest entry sizes
          raise NotImplementedError(
-             "Compaction only supports tables with 1 or more primary keys")
+             "Compaction only supports tables with 1 or more primary keys"
+         )
      if sort_keys is None:
          sort_keys = []
      # TODO (pdames): detect and handle schema evolution (at least ensure that
@@ -172,39 +194,30 @@ def _execute_compaction_round(
      # sort primary keys to produce the same pk digest regardless of input order
      primary_keys = sorted(primary_keys)

-     # collect cluster resource stats
-     # cluster_resources = ray.cluster_resources()
-     # logger.info(f"Total cluster resources: {cluster_resources}")
-     # logger.info(f"Available cluster resources: {ray.available_resources()}")
-     # cluster_cpus = int(cluster_resources["CPU"])
-     # logger.info(f"Total cluster CPUs: {cluster_cpus}")
-
-     # collect node group resources
-
      cluster_resources = ray.cluster_resources()
      logger.info(f"Total cluster resources: {cluster_resources}")
-     if pg_config: # use resource in each placement group
-         node_resource_keys=None
-         cluster_resources = pg_config[1]
-         cluster_cpus = cluster_resources['CPU']
-     else: # use all cluster resource
+     node_resource_keys = None
+     if pg_config:  # use resource in each placement group
+         cluster_resources = pg_config.resource
+         cluster_cpus = cluster_resources["CPU"]
+     else:  # use all cluster resource
          logger.info(f"Available cluster resources: {ray.available_resources()}")
          cluster_cpus = int(cluster_resources["CPU"])
          logger.info(f"Total cluster CPUs: {cluster_cpus}")
          node_resource_keys = live_node_resource_keys()
-         logger.info(f"Found {len(node_resource_keys)} live cluster nodes: "
-                     f"{node_resource_keys}")
-
-     if node_resource_keys:
-         # create a remote options provider to round-robin tasks across all nodes
-         logger.info(f"Setting round robin scheduling with node id:{node_resource_keys}")
-         round_robin_opt_provider = functools.partial(
-             round_robin_options_provider,
-             resource_keys=node_resource_keys,
+         logger.info(
+             f"Found {len(node_resource_keys)} live cluster nodes: "
+             f"{node_resource_keys}"
          )
-     else:
-         logger.info("Setting round robin scheduling to None")
-         round_robin_opt_provider = None
+
+     # create a remote options provider to round-robin tasks across all nodes or allocated bundles
+     logger.info(f"Setting round robin scheduling with node id:{node_resource_keys}")
+     round_robin_opt_provider = functools.partial(
+         round_robin_options_provider,
+         resource_keys=node_resource_keys,
+         pg_config=pg_config.opts if pg_config else None,
+     )
+
      # assign a distinct index to each node in the cluster
      # head_node_ip = urllib.request.urlopen(
      #     "http://169.254.169.254/latest/meta-data/local-ipv4"
@@ -226,38 +239,58 @@ def _execute_compaction_round(
          _PRIMARY_KEY_INDEX_ALGORITHM_VERSION,
      )
      compatible_primary_key_index_locator = PrimaryKeyIndexLocator.of(
-         compatible_primary_key_index_meta)
-     compatible_primary_key_index_root_path = \
+         compatible_primary_key_index_meta
+     )
+     compatible_primary_key_index_root_path = (
          compatible_primary_key_index_locator.primary_key_index_root_path
+     )

      # read the results from any previously completed compaction round that used
      # a compatible primary key index
      round_completion_info = None
-     if read_round_completion:
+     if not rebase_source_partition_locator:
+         logger.info(
+             f"Reading round completion file for compatible "
+             f"primary key index root path: {compatible_primary_key_index_root_path}"
+         )
          round_completion_info = rcf.read_round_completion_file(
              compaction_artifact_s3_bucket,
              source_partition_locator,
              compatible_primary_key_index_root_path,
          )
+         logger.info(f"Round completion file: {round_completion_info}")

      # read the previous compaction round's hash bucket count, if any
      old_hash_bucket_count = None
      if round_completion_info:
-         old_pki_version_locator = round_completion_info\
-             .primary_key_index_version_locator
-         old_hash_bucket_count = old_pki_version_locator\
-             .primary_key_index_version_meta \
-             .hash_bucket_count
-         min_pk_index_pa_bytes = round_completion_info.pk_index_pyarrow_write_result.pyarrow_bytes
+         old_pki_version_locator = (
+             round_completion_info.primary_key_index_version_locator
+         )
+         old_hash_bucket_count = (
+             old_pki_version_locator.primary_key_index_version_meta.hash_bucket_count
+         )
+         min_pk_index_pa_bytes = (
+             round_completion_info.pk_index_pyarrow_write_result.pyarrow_bytes
+         )
+     else:
+         logger.info(
+             f"No prior round info read. Source partition: "
+             f"{source_partition_locator}. Primary key index locator: "
+             f"{compatible_primary_key_index_locator}. Rebase source "
+             f"partition locator: {rebase_source_partition_locator}"
+         )

      # use the new hash bucket count if provided, or fall back to old count
-     hash_bucket_count = new_hash_bucket_count \
-         if new_hash_bucket_count is not None \
+     hash_bucket_count = (
+         new_hash_bucket_count
+         if new_hash_bucket_count is not None
          else old_hash_bucket_count
+     )

      # discover input delta files
-     high_watermark = round_completion_info.high_watermark \
-         if round_completion_info else None
+     high_watermark = (
+         round_completion_info.high_watermark if round_completion_info else None
+     )

      input_deltas = io.discover_deltas(
          source_partition_locator,
@@ -268,33 +301,38 @@ def _execute_compaction_round(

      if not input_deltas:
          logger.info("No input deltas found to compact.")
-         return False, None, None
+         return False, None, None, None

      # limit the input deltas to fit on this cluster and convert them to
      # annotated deltas of equivalent size for easy parallel distribution

-     uniform_deltas, hash_bucket_count, last_stream_position_compacted = \
-         io.limit_input_deltas(
-             input_deltas,
-             cluster_resources,
-             hash_bucket_count,
-             min_pk_index_pa_bytes,
-             min_hash_bucket_chunk_size,
-             input_deltas_stats=input_deltas_stats,
-             deltacat_storage=deltacat_storage
-         )
+     (
+         uniform_deltas,
+         hash_bucket_count,
+         last_stream_position_compacted,
+     ) = io.limit_input_deltas(
+         input_deltas,
+         cluster_resources,
+         hash_bucket_count,
+         min_pk_index_pa_bytes,
+         min_hash_bucket_chunk_size,
+         input_deltas_stats=input_deltas_stats,
+         deltacat_storage=deltacat_storage,
+     )

-     assert hash_bucket_count is not None and hash_bucket_count > 0, \
-         f"Unexpected Error: Default hash bucket count ({hash_bucket_count}) " \
-         f"is invalid."
+     assert hash_bucket_count is not None and hash_bucket_count > 0, (
+         f"Expected hash bucket count to be a positive integer, but found "
+         f"`{hash_bucket_count}`"
+     )

      # rehash the primary key index if necessary
-     round_completion_info = None
      if round_completion_info:
          logger.info(f"Round completion file contents: {round_completion_info}")
          # the previous primary key index is compatible with the current, but
          # will need to be rehashed if the hash bucket count has changed
          if hash_bucket_count != old_hash_bucket_count:
+             # TODO(draghave): manually test the path after prior primary key
+             # index was already built
              round_completion_info = pki.rehash(
                  round_robin_opt_provider,
                  compaction_artifact_s3_bucket,
@@ -305,10 +343,6 @@ def _execute_compaction_round(
                  records_per_primary_key_index_file,
                  delete_prev_primary_key_index,
              )
-     else:
-         logger.info(f"No prior round completion file found. Source partition: "
-                     f"{source_partition_locator}. Primary key index locator: "
-                     f"{compatible_primary_key_index_locator}")

      # parallel step 1:
      # group like primary keys together by hashing them into buckets
@@ -331,7 +365,7 @@ def _execute_compaction_round(
          for hash_group_index, object_id in enumerate(hash_group_idx_to_obj_id):
              if object_id:
                  all_hash_group_idx_to_obj_id[hash_group_index].append(object_id)
-     hash_group_count = dedupe_task_count = len(all_hash_group_idx_to_obj_id)
+     hash_group_count = len(all_hash_group_idx_to_obj_id)
      logger.info(f"Hash bucket groups created: {hash_group_count}")

      # TODO (pdames): when resources are freed during the last round of hash
@@ -359,9 +393,11 @@ def _execute_compaction_round(
          _PRIMARY_KEY_INDEX_ALGORITHM_VERSION,
      )
      new_primary_key_index_locator = PrimaryKeyIndexLocator.of(
-         new_primary_key_index_meta)
-     new_primary_key_index_root_path = new_primary_key_index_locator\
-         .primary_key_index_root_path
+         new_primary_key_index_meta
+     )
+     new_primary_key_index_root_path = (
+         new_primary_key_index_locator.primary_key_index_root_path
+     )

      # generate a new primary key index version locator for this round
      new_primary_key_index_version_meta = PrimaryKeyIndexVersionMeta.of(
@@ -369,47 +405,47 @@ def _execute_compaction_round(
          hash_bucket_count,
      )
      new_pki_version_locator = PrimaryKeyIndexVersionLocator.generate(
-         new_primary_key_index_version_meta)
-
+         new_primary_key_index_version_meta
+     )

      # parallel step 2:
      # discover records with duplicate primary keys in each hash bucket, and
      # identify the index of records to keep or drop based on sort keys
      num_materialize_buckets = max_parallelism
      logger.info(f"Materialize Bucket Count: {num_materialize_buckets}")
-     record_counts_pending_materialize = \
-         dd.RecordCountsPendingMaterialize.remote(dedupe_task_count)
      dd_tasks_pending = invoke_parallel(
          items=all_hash_group_idx_to_obj_id.values(),
          ray_task=dd.dedupe,
          max_parallelism=max_parallelism,
          options_provider=round_robin_opt_provider,
-         kwargs_provider=lambda index, item: {"dedupe_task_index": index,
-                                              "object_ids": item},
+         kwargs_provider=lambda index, item: {
+             "dedupe_task_index": index,
+             "object_ids": item,
+         },
          compaction_artifact_s3_bucket=compaction_artifact_s3_bucket,
          round_completion_info=round_completion_info,
          new_primary_key_index_version_locator=new_pki_version_locator,
          sort_keys=sort_keys,
          max_records_per_index_file=records_per_primary_key_index_file,
-         max_records_per_materialized_file=records_per_compacted_file,
          num_materialize_buckets=num_materialize_buckets,
          delete_old_primary_key_index=delete_prev_primary_key_index,
-         record_counts_pending_materialize=record_counts_pending_materialize,
      )
      logger.info(f"Getting {len(dd_tasks_pending)} dedupe results...")
      dd_results = ray.get([t[0] for t in dd_tasks_pending])
      logger.info(f"Got {len(dd_results)} dedupe results.")
      all_mat_buckets_to_obj_id = defaultdict(list)
      for mat_bucket_idx_to_obj_id in dd_results:
-         for bucket_idx, dd_task_index_and_object_id_tuple in \
-                 mat_bucket_idx_to_obj_id.items():
+         for (
+             bucket_idx,
+             dd_task_index_and_object_id_tuple,
+         ) in mat_bucket_idx_to_obj_id.items():
              all_mat_buckets_to_obj_id[bucket_idx].append(
-                 dd_task_index_and_object_id_tuple)
+                 dd_task_index_and_object_id_tuple
+             )
      logger.info(f"Getting {len(dd_tasks_pending)} dedupe result stat(s)...")
      pki_stats = ray.get([t[2] for t in dd_tasks_pending])
      logger.info(f"Got {len(pki_stats)} dedupe result stat(s).")
-     logger.info(f"Materialize buckets created: "
-                 f"{len(all_mat_buckets_to_obj_id)}")
+     logger.info(f"Materialize buckets created: " f"{len(all_mat_buckets_to_obj_id)}")

      # TODO(pdames): when resources are freed during the last round of deduping
      # start running materialize tasks that read materialization source file
@@ -428,9 +464,9 @@ def _execute_compaction_round(
          ray_task=mat.materialize,
          max_parallelism=max_parallelism,
          options_provider=round_robin_opt_provider,
-         kwargs_provider=lambda index, mat_bucket_idx_to_obj_id: {
-             "mat_bucket_index": mat_bucket_idx_to_obj_id[0],
-             "dedupe_task_idx_and_obj_id_tuples": mat_bucket_idx_to_obj_id[1],
+         kwargs_provider=lambda index, mat_bucket_index_to_obj_id: {
+             "mat_bucket_index": mat_bucket_index_to_obj_id[0],
+             "dedupe_task_idx_and_obj_id_tuples": mat_bucket_index_to_obj_id[1],
          },
          schema=schema_on_read,
          round_completion_info=round_completion_info,
@@ -455,25 +491,40 @@ def _execute_compaction_round(
          compacted_delta.stream_position,
      )

-     round_completion_info = RoundCompletionInfo.of(
-         last_stream_position_compacted,
+     rci_high_watermark = (
+         rebase_source_partition_high_watermark
+         if rebase_source_partition_high_watermark
+         else last_stream_position_compacted
+     )
+     new_round_completion_info = RoundCompletionInfo.of(
+         rci_high_watermark,
          new_compacted_delta_locator,
-         PyArrowWriteResult.union([m.pyarrow_write_result
-                                   for m in mat_results]),
+         PyArrowWriteResult.union([m.pyarrow_write_result for m in mat_results]),
          PyArrowWriteResult.union(pki_stats),
          bit_width_of_sort_keys,
          new_pki_version_locator,
+         rebase_source_partition_locator
+         or round_completion_info.rebase_source_partition_locator,
+     )
+     rcf_source_partition_locator = (
+         rebase_source_partition_locator
+         if rebase_source_partition_locator
+         else source_partition_locator
      )
-     rcf.write_round_completion_file(
+     round_completion_file_s3_url = rcf.write_round_completion_file(
          compaction_artifact_s3_bucket,
-         source_partition_locator,
+         rcf_source_partition_locator,
          new_primary_key_index_root_path,
-         round_completion_info,
+         new_round_completion_info,
+     )
+     logger.info(
+         f"partition-{source_partition_locator.partition_values},"
+         f"compacted at: {last_stream_position_compacted},"
+         f"last position: {last_stream_position_to_compact}"
+     )
+     return (
+         (last_stream_position_compacted < last_stream_position_to_compact),
+         partition,
+         new_round_completion_info,
+         round_completion_file_s3_url,
      )
-     time_mat_e = time.time()
-     logger.info(f"partition-{source_partition_locator.partition_values},compacted at:{last_stream_position_compacted}, last position:{last_stream_position_to_compact}")
-     return \
-         (last_stream_position_compacted < last_stream_position_to_compact), \
-         partition, \
-         round_completion_info
-
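
The hunks above rework the public compact_partition entry point in deltacat/compute/compactor/compaction_session.py: the destination locator parameter is renamed, everything after last_stream_position_to_compact becomes keyword-only, read_round_completion is replaced by rebase-aware parameters (rebase_source_partition_locator, rebase_source_partition_high_watermark), pg_config is now a PlacementGroupConfig, and the session returns the S3 URL of the round completion file it writes. A minimal calling sketch under those assumptions follows; the wrapper function, primary key, bucket name, and locators below are placeholders rather than anything shipped in the package.

# Sketch only: illustrates the 0.1.11 call shape shown in the diff above.
from typing import Optional

from deltacat.compute.compactor.compaction_session import compact_partition
from deltacat.storage import PartitionLocator
from deltacat.storage import interface as unimplemented_deltacat_storage
from deltacat.types.media import ContentType


def run_compaction_session(
    source: PartitionLocator,
    destination: PartitionLocator,
    last_stream_position: int,
) -> Optional[str]:
    # Returns the S3 URL of the round completion file written by the session,
    # or None when there were no input deltas to compact.
    return compact_partition(
        source,
        destination,
        primary_keys={"id"},  # assumption: table keyed on a single "id" column
        compaction_artifact_s3_bucket="my-compaction-artifacts",  # hypothetical bucket
        last_stream_position_to_compact=last_stream_position,
        # everything below this point is keyword-only in 0.1.11
        records_per_compacted_file=4_000_000,
        compacted_file_content_type=ContentType.PARQUET,
        deltacat_storage=unimplemented_deltacat_storage,  # swap in a real storage impl
    )

Per the diff, the session relies on only two attributes of the PlacementGroupConfig passed as pg_config: .resource (a bundle resource dict with a "CPU" entry, used for sizing) and .opts (Ray options forwarded to round_robin_options_provider).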