deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. deltacat/__init__.py +41 -15
  2. deltacat/aws/clients.py +12 -31
  3. deltacat/aws/constants.py +1 -1
  4. deltacat/aws/redshift/__init__.py +7 -2
  5. deltacat/aws/redshift/model/manifest.py +54 -50
  6. deltacat/aws/s3u.py +176 -187
  7. deltacat/catalog/delegate.py +151 -185
  8. deltacat/catalog/interface.py +78 -97
  9. deltacat/catalog/model/catalog.py +21 -21
  10. deltacat/catalog/model/table_definition.py +11 -9
  11. deltacat/compute/compactor/__init__.py +12 -16
  12. deltacat/compute/compactor/compaction_session.py +237 -166
  13. deltacat/compute/compactor/model/delta_annotated.py +60 -44
  14. deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
  15. deltacat/compute/compactor/model/delta_file_locator.py +10 -8
  16. deltacat/compute/compactor/model/materialize_result.py +6 -7
  17. deltacat/compute/compactor/model/primary_key_index.py +38 -34
  18. deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
  19. deltacat/compute/compactor/model/round_completion_info.py +25 -19
  20. deltacat/compute/compactor/model/sort_key.py +18 -15
  21. deltacat/compute/compactor/steps/dedupe.py +119 -94
  22. deltacat/compute/compactor/steps/hash_bucket.py +48 -47
  23. deltacat/compute/compactor/steps/materialize.py +86 -92
  24. deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
  25. deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
  26. deltacat/compute/compactor/utils/io.py +59 -47
  27. deltacat/compute/compactor/utils/primary_key_index.py +91 -80
  28. deltacat/compute/compactor/utils/round_completion_file.py +22 -23
  29. deltacat/compute/compactor/utils/system_columns.py +33 -45
  30. deltacat/compute/metastats/meta_stats.py +235 -157
  31. deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
  32. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
  33. deltacat/compute/metastats/stats.py +95 -64
  34. deltacat/compute/metastats/utils/io.py +100 -53
  35. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
  36. deltacat/compute/metastats/utils/ray_utils.py +38 -33
  37. deltacat/compute/stats/basic.py +107 -69
  38. deltacat/compute/stats/models/delta_column_stats.py +11 -8
  39. deltacat/compute/stats/models/delta_stats.py +59 -32
  40. deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
  41. deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
  42. deltacat/compute/stats/models/stats_result.py +24 -14
  43. deltacat/compute/stats/utils/intervals.py +16 -9
  44. deltacat/compute/stats/utils/io.py +86 -51
  45. deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
  46. deltacat/constants.py +4 -13
  47. deltacat/io/__init__.py +2 -2
  48. deltacat/io/aws/redshift/redshift_datasource.py +157 -143
  49. deltacat/io/dataset.py +14 -17
  50. deltacat/io/read_api.py +36 -33
  51. deltacat/logs.py +94 -42
  52. deltacat/storage/__init__.py +18 -8
  53. deltacat/storage/interface.py +196 -213
  54. deltacat/storage/model/delta.py +45 -51
  55. deltacat/storage/model/list_result.py +12 -8
  56. deltacat/storage/model/namespace.py +4 -5
  57. deltacat/storage/model/partition.py +42 -42
  58. deltacat/storage/model/stream.py +29 -30
  59. deltacat/storage/model/table.py +14 -14
  60. deltacat/storage/model/table_version.py +32 -31
  61. deltacat/storage/model/types.py +1 -0
  62. deltacat/tests/stats/test_intervals.py +11 -24
  63. deltacat/tests/utils/__init__.py +0 -0
  64. deltacat/tests/utils/test_record_batch_tables.py +284 -0
  65. deltacat/types/media.py +3 -4
  66. deltacat/types/tables.py +31 -21
  67. deltacat/utils/common.py +5 -11
  68. deltacat/utils/numpy.py +20 -22
  69. deltacat/utils/pandas.py +73 -100
  70. deltacat/utils/performance.py +3 -9
  71. deltacat/utils/placement.py +259 -230
  72. deltacat/utils/pyarrow.py +302 -89
  73. deltacat/utils/ray_utils/collections.py +2 -1
  74. deltacat/utils/ray_utils/concurrency.py +27 -28
  75. deltacat/utils/ray_utils/dataset.py +28 -28
  76. deltacat/utils/ray_utils/performance.py +5 -9
  77. deltacat/utils/ray_utils/runtime.py +9 -10
  78. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
  79. deltacat-0.1.12.dist-info/RECORD +110 -0
  80. deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
  81. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
  82. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
  83. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor/steps/materialize.py

@@ -1,14 +1,11 @@
  import logging
- import ray
- import pyarrow as pa
-
+ import time
  from collections import defaultdict
  from itertools import chain, repeat
  from typing import List, Optional, Tuple

  import pyarrow as pa
  import ray
- from pyarrow import compute as pc
  from ray import cloudpickle

  from deltacat import logs
@@ -21,14 +18,15 @@ from deltacat.compute.compactor.steps.dedupe import (
      DedupeTaskIndexWithObjectId,
      DeltaFileLocatorToRecords,
  )
- from deltacat.compute.compactor.utils import system_columns as sc
  from deltacat.storage import Delta, DeltaLocator, Partition, PartitionLocator
  from deltacat.storage import interface as unimplemented_deltacat_storage
  from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES, ContentType
  from deltacat.types.tables import TABLE_CLASS_TO_SIZE_FUNC
+ from deltacat.utils.performance import timed_invocation
  from deltacat.utils.pyarrow import (
      ReadKwargsProviderPyArrowCsvPureUtf8,
      ReadKwargsProviderPyArrowSchemaOverride,
+     RecordBatchTables,
  )

  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
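
The two new imports above drive the rest of this file's diff: timed_invocation wraps the storage calls below with latency logging, and RecordBatchTables replaces the hand-rolled output batching. As a rough sketch of the timing pattern (an illustration only, assuming the helper simply returns the wrapped call's result together with its elapsed seconds, which is the shape the new call sites imply):

    import time
    from typing import Any, Callable, Tuple

    def timed_invocation(func: Callable, *args, **kwargs) -> Tuple[Any, float]:
        # Run the wrapped callable and report wall-clock latency alongside its result.
        start = time.time()
        result = func(*args, **kwargs)
        return result, time.time() - start

    # Mirrors the call sites added in this diff, e.g.:
    #   delta, stage_delta_time = timed_invocation(
    #       deltacat_storage.stage_delta, compacted_table, partition, ...
    #   )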
@@ -36,47 +34,49 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

  @ray.remote
  def materialize(
-         source_partition_locator: PartitionLocator,
-         round_completion_info: Optional[RoundCompletionInfo],
-         partition: Partition,
-         mat_bucket_index: int,
-         dedupe_task_idx_and_obj_id_tuples: List[DedupeTaskIndexWithObjectId],
-         max_records_per_output_file: int,
-         compacted_file_content_type: ContentType,
-         schema: Optional[pa.Schema] = None,
-         deltacat_storage=unimplemented_deltacat_storage) -> MaterializeResult:
-
-     def _materialize(
-             compacted_tables: List[pa.Table],
-             compacted_tables_record_count: int) -> MaterializeResult:
-         compacted_tables_size = sum([TABLE_CLASS_TO_SIZE_FUNC[type(tbl)](tbl)
-                                      for tbl in compacted_tables])
-         logger.debug(f"Uploading {len(compacted_tables)} compacted tables "
-                      f"with size: {compacted_tables_size} bytes "
-                      f"and record count: {compacted_tables_record_count}")
+     source_partition_locator: PartitionLocator,
+     round_completion_info: Optional[RoundCompletionInfo],
+     partition: Partition,
+     mat_bucket_index: int,
+     dedupe_task_idx_and_obj_id_tuples: List[DedupeTaskIndexWithObjectId],
+     max_records_per_output_file: int,
+     compacted_file_content_type: ContentType,
+     schema: Optional[pa.Schema] = None,
+     deltacat_storage=unimplemented_deltacat_storage,
+ ) -> MaterializeResult:
+     # TODO (rkenmi): Add docstrings for the steps in the compaction workflow
+     # https://github.com/ray-project/deltacat/issues/79
+     def _materialize(compacted_tables: List[pa.Table]) -> MaterializeResult:
          compacted_table = pa.concat_tables(compacted_tables)
+
          if compacted_file_content_type in DELIMITED_TEXT_CONTENT_TYPES:
-             # TODO (ricmiyam): Investigate if we still need to convert this table to pandas DataFrame
+             # TODO (rkenmi): Investigate if we still need to convert this table to pandas DataFrame
              # TODO (pdames): compare performance to pandas-native materialize path
-             df = compacted_table.to_pandas(
-                 split_blocks=True,
-                 self_destruct=True,
-                 zero_copy_only=True
-             )
+             df = compacted_table.to_pandas(split_blocks=True, self_destruct=True)
              compacted_table = df
-         delta = deltacat_storage.stage_delta(
+         delta, stage_delta_time = timed_invocation(
+             deltacat_storage.stage_delta,
              compacted_table,
              partition,
              max_records_per_entry=max_records_per_output_file,
              content_type=compacted_file_content_type,
          )
+         compacted_table_size = TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](
+             compacted_table
+         )
+         logger.debug(
+             f"Time taken for materialize task"
+             f" to upload {len(compacted_table)} records"
+             f" of size {compacted_table_size} is: {stage_delta_time}s"
+         )
          manifest = delta.manifest
          manifest_records = manifest.meta.record_count
-         assert(manifest_records == len(compacted_table),
-                f"Unexpected Error: Materialized delta manifest record count "
-                f"({manifest_records}) does not equal compacted table record count "
-                f"({len(compacted_table)})")
-
+         assert (
+             manifest_records == len(compacted_table),
+             f"Unexpected Error: Materialized delta manifest record count "
+             f"({manifest_records}) does not equal compacted table record count "
+             f"({len(compacted_table)})",
+         )
          materialize_result = MaterializeResult.of(
              delta,
              mat_bucket_index,
@@ -92,17 +92,20 @@ def materialize(
          logger.info(f"Materialize result: {materialize_result}")
          return materialize_result

-     logger.info(f"Starting materialize task...")
+     logger.info(
+         f"Starting materialize task with"
+         f" materialize bucket index: {mat_bucket_index}..."
+     )
+     start = time.time()
      dedupe_task_idx_and_obj_ref_tuples = [
          (
              t1,
              cloudpickle.loads(t2),
-         ) for t1, t2 in dedupe_task_idx_and_obj_id_tuples
+         )
+         for t1, t2 in dedupe_task_idx_and_obj_id_tuples
      ]
      logger.info(f"Resolved materialize task obj refs...")
-     dedupe_task_indices, obj_refs = zip(
-         *dedupe_task_idx_and_obj_ref_tuples
-     )
+     dedupe_task_indices, obj_refs = zip(*dedupe_task_idx_and_obj_ref_tuples)
      # this depends on `ray.get` result order matching input order, as per the
      # contract established in: https://github.com/ray-project/ray/pull/16763
      src_file_records_list = ray.get(list(obj_refs))
@@ -114,21 +117,23 @@ def materialize(
                  (record_numbers, repeat(dedupe_task_idx, len(record_numbers)))
              )
      manifest_cache = {}
-     compacted_tables = []
      materialized_results: List[MaterializeResult] = []
-     total_record_count = 0
+     record_batch_tables = RecordBatchTables(max_records_per_output_file)
      for src_dfl in sorted(all_src_file_records.keys()):
-         record_numbers_dd_task_idx_tpl_list: List[Tuple[DeltaFileLocatorToRecords, repeat]] = \
-             all_src_file_records[src_dfl]
+         record_numbers_dd_task_idx_tpl_list: List[
+             Tuple[DeltaFileLocatorToRecords, repeat]
+         ] = all_src_file_records[src_dfl]
          record_numbers_tpl, dedupe_task_idx_iter_tpl = zip(
              *record_numbers_dd_task_idx_tpl_list
          )
          is_src_partition_file_np = src_dfl.is_source_delta
          src_stream_position_np = src_dfl.stream_position
          src_file_idx_np = src_dfl.file_index
-         src_file_partition_locator = source_partition_locator \
-             if is_src_partition_file_np \
+         src_file_partition_locator = (
+             source_partition_locator
+             if is_src_partition_file_np
              else round_completion_info.compacted_delta_locator.partition_locator
+         )
          delta_locator = DeltaLocator.of(
              src_file_partition_locator,
              src_stream_position_np.item(),
@@ -148,65 +153,54 @@ def materialize(
          # enforce a consistent schema if provided, when reading files into PyArrow tables
          elif schema is not None:
              read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
-                 schema=schema)
-         pa_table = deltacat_storage.download_delta_manifest_entry(
+                 schema=schema
+             )
+         pa_table, download_delta_manifest_entry_time = timed_invocation(
+             deltacat_storage.download_delta_manifest_entry,
              Delta.of(delta_locator, None, None, None, manifest),
              src_file_idx_np.item(),
              file_reader_kwargs_provider=read_kwargs_provider,
          )
-         record_count = len(pa_table)
-         mask_pylist = list(repeat(False, record_count))
+         logger.debug(
+             f"Time taken for materialize task"
+             f" to download delta locator {delta_locator} with entry ID {src_file_idx_np.item()}"
+             f" is: {download_delta_manifest_entry_time}s"
+         )
+         mask_pylist = list(repeat(False, len(pa_table)))
          record_numbers = chain.from_iterable(record_numbers_tpl)
          # TODO(raghumdani): reference the same file URIs while writing the files
-         # instead of copying the data over and creating new files.
+         #  instead of copying the data over and creating new files.
          for record_number in record_numbers:
              mask_pylist[record_number] = True
          mask = pa.array(mask_pylist)
          pa_table = pa_table.filter(mask)
+         record_batch_tables.append(pa_table)
+         if record_batch_tables.has_batches():
+             batched_tables = record_batch_tables.evict()
+             materialized_results.append(_materialize(batched_tables))

-         # appending, sorting, taking, and dropping has 2-3X latency of a
-         # single filter on average, and thus provides better average
-         # performance than repeatedly filtering the table in dedupe task index
-         # order
-         dedupe_task_indices = chain.from_iterable(dedupe_task_idx_iter_tpl)
-         pa_table = sc.append_dedupe_task_idx_col(
-             pa_table,
-             dedupe_task_indices,
-         )
-         pa_sort_keys = [(sc._DEDUPE_TASK_IDX_COLUMN_NAME, "ascending")]
-         pa_table = pa_table.take(
-             pc.sort_indices(pa_table, sort_keys=pa_sort_keys),
-         )
-         pa_table = pa_table.drop(
-             [sc._DEDUPE_TASK_IDX_COLUMN_NAME]
-         )
-
-         # Write manifests up to max_records_per_output_file
-         # TODO(raghumdani): Write exactly the same number of records into each file to
-         # produce a read-optimized view of the tables.
-         if compacted_tables and \
-                 total_record_count + record_count > max_records_per_output_file:
-             materialized_results.append(_materialize(compacted_tables, total_record_count))
-             # Free up written tables in memory
-             compacted_tables.clear()
-             total_record_count = 0
-
-         total_record_count += record_count
-         compacted_tables.append(pa_table)
-
-     materialized_results.append(_materialize(compacted_tables, total_record_count))
-     # Free up written tables in memory
-     compacted_tables.clear()
+     if record_batch_tables.has_remaining():
+         materialized_results.append(_materialize(record_batch_tables.remaining))

      merged_delta = Delta.merge_deltas([mr.delta for mr in materialized_results])
-     assert materialized_results and len(materialized_results) > 0, \
-         f"Expected at least one materialized result in materialize step."
-
+     assert (
+         materialized_results and len(materialized_results) > 0
+     ), f"Expected at least one materialized result in materialize step."
+
+     write_results = [mr.pyarrow_write_result for mr in materialized_results]
+     logger.debug(
+         f"{len(write_results)} files written"
+         f" with records: {[wr.records for wr in write_results]}"
+     )
      # Merge all new deltas into one for this materialize bucket index
-     merged_materialize_result = MaterializeResult.of(merged_delta,
-                                                      materialized_results[0].task_index,
-                                                      PyArrowWriteResult.union([mr.pyarrow_write_result
-                                                                                for mr in materialized_results]))
-
+     merged_materialize_result = MaterializeResult.of(
+         merged_delta,
+         materialized_results[0].task_index,
+         PyArrowWriteResult.union(
+             [mr.pyarrow_write_result for mr in materialized_results]
+         ),
+     )
      logger.info(f"Finished materialize task...")
+     end = time.time()
+     logger.info(f"Materialize task ended in {end - start}s")
      return merged_materialize_result
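
The batching change above is the behavioral core of this file's diff: filtered source tables are appended to a RecordBatchTables (new in deltacat.utils.pyarrow and covered by the added deltacat/tests/utils/test_record_batch_tables.py), full batches are evicted and handed to _materialize as they accumulate, and any remainder is flushed once at the end. A minimal usage sketch with invented toy tables; the eviction semantics here are assumed from the call sites above rather than from the class itself:

    import pyarrow as pa
    from deltacat.utils.pyarrow import RecordBatchTables

    max_records_per_output_file = 4
    record_batch_tables = RecordBatchTables(max_records_per_output_file)

    for chunk in ([1, 2], [3, 4, 5], [6]):  # stand-ins for filtered source tables
        record_batch_tables.append(pa.table({"pk": chunk}))
        if record_batch_tables.has_batches():
            # Assumed: evict() hands back the tables that currently fill whole
            # batches of max_records_per_output_file, keeping any remainder.
            full_batches = record_batch_tables.evict()
            # ... write full_batches out as one compacted delta ...

    if record_batch_tables.has_remaining():
        leftover = record_batch_tables.remaining
        # ... write the final partial batch ...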

deltacat/compute/compactor/steps/rehash/rehash_bucket.py

@@ -1,22 +1,21 @@
  import logging
- import ray
- import pyarrow as pa
- import numpy as np
+ from typing import List, Tuple

+ import numpy as np
+ import pyarrow as pa
+ import ray
  from ray.types import ObjectRef

  from deltacat import logs
  from deltacat.compute.compactor import PrimaryKeyIndexVersionLocator
  from deltacat.compute.compactor.utils import primary_key_index as pki

- from typing import List, Tuple
-
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


  def group_file_records_by_pk_hash_bucket(
-         pki_table: pa.Table,
-         num_buckets: int) -> np.ndarray:
+     pki_table: pa.Table, num_buckets: int
+ ) -> np.ndarray:
      # generate the new table for each new hash bucket
      hash_bucket_to_indices = pki.group_record_indices_by_hash_bucket(
          pki_table,
@@ -29,13 +28,14 @@ def group_file_records_by_pk_hash_bucket(
      return hash_bucket_to_table


- @ray.remote(num_cpus=1,num_returns=2)
+ @ray.remote(num_cpus=1, num_returns=2)
  def rehash_bucket(
-         hash_bucket_index: int,
-         s3_bucket: str,
-         old_pki_version_locator: PrimaryKeyIndexVersionLocator,
-         num_buckets: int,
-         num_groups: int) -> Tuple[np.ndarray, List[ObjectRef]]:
+     hash_bucket_index: int,
+     s3_bucket: str,
+     old_pki_version_locator: PrimaryKeyIndexVersionLocator,
+     num_buckets: int,
+     num_groups: int,
+ ) -> Tuple[np.ndarray, List[ObjectRef]]:

      logger.info(f"Starting rehash bucket task...")
      tables = pki.download_hash_bucket_entries(

deltacat/compute/compactor/steps/rehash/rewrite_index.py

@@ -16,11 +16,11 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

  @ray.remote(num_cpus=1, num_returns=2)
  def rewrite_index(
-         object_ids: List[Any],
-         s3_bucket: str,
-         new_primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
-         max_records_per_index_file: int) -> \
-         Tuple[PyArrowWriteResult, List[ObjectRef]]:
+     object_ids: List[Any],
+     s3_bucket: str,
+     new_primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
+     max_records_per_index_file: int,
+ ) -> Tuple[PyArrowWriteResult, List[ObjectRef]]:

      logger.info(f"Starting rewrite primary key index task...")
      object_refs = [cloudpickle.loads(obj_id_pkl) for obj_id_pkl in object_ids]

deltacat/compute/compactor/utils/io.py

@@ -1,24 +1,23 @@
  import logging
- import time
  import math
- from deltacat.compute.stats.models.delta_stats import DeltaStats
- from deltacat.constants import PYARROW_INFLATION_MULTIPLIER, BYTES_PER_MEBIBYTE
+ from typing import Dict, List, Optional, Tuple

- from deltacat.storage import PartitionLocator, Delta, \
-     interface as unimplemented_deltacat_storage
  from deltacat import logs
  from deltacat.compute.compactor import DeltaAnnotated
-
- from typing import Dict, List, Optional, Tuple
+ from deltacat.compute.stats.models.delta_stats import DeltaStats
+ from deltacat.constants import BYTES_PER_MEBIBYTE, PYARROW_INFLATION_MULTIPLIER
+ from deltacat.storage import Delta, PartitionLocator
+ from deltacat.storage import interface as unimplemented_deltacat_storage

  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


  def discover_deltas(
-         source_partition_locator: PartitionLocator,
-         start_position_exclusive: Optional[int],
-         end_position_inclusive: int,
-         deltacat_storage=unimplemented_deltacat_storage) -> List[Delta]:
+     source_partition_locator: PartitionLocator,
+     start_position_exclusive: Optional[int],
+     end_position_inclusive: int,
+     deltacat_storage=unimplemented_deltacat_storage,
+ ) -> List[Delta]:

      stream_locator = source_partition_locator.stream_locator
      namespace = stream_locator.namespace
@@ -36,32 +35,38 @@ def discover_deltas(
      )
      deltas = deltas_list_result.all_items()
      if not deltas:
-         raise RuntimeError(f"Unexpected Error: Couldn't find any deltas to "
-                            f"compact in delta stream position range "
-                            f"('{start_position_exclusive}', "
-                            f"'{end_position_inclusive}']. Source partition: "
-                            f"{source_partition_locator}")
+         raise RuntimeError(
+             f"Unexpected Error: Couldn't find any deltas to "
+             f"compact in delta stream position range "
+             f"('{start_position_exclusive}', "
+             f"'{end_position_inclusive}']. Source partition: "
+             f"{source_partition_locator}"
+         )
      if start_position_exclusive:
          first_delta = deltas.pop(0)
-         logger.info(f"Removed exclusive start delta w/ expected stream "
-                     f"position '{start_position_exclusive}' from deltas to "
-                     f"compact: {first_delta}")
-     logger.info(f"Count of deltas to compact in delta stream "
-                 f"position range ('{start_position_exclusive}', "
-                 f"'{end_position_inclusive}']: {len(deltas)}. Source "
-                 f"partition: '{source_partition_locator}'")
+         logger.info(
+             f"Removed exclusive start delta w/ expected stream "
+             f"position '{start_position_exclusive}' from deltas to "
+             f"compact: {first_delta}"
+         )
+     logger.info(
+         f"Count of deltas to compact in delta stream "
+         f"position range ('{start_position_exclusive}', "
+         f"'{end_position_inclusive}']: {len(deltas)}. Source "
+         f"partition: '{source_partition_locator}'"
+     )
      return deltas


  def limit_input_deltas(
-         input_deltas: List[Delta],
-         cluster_resources: Dict[str, float],
-         hash_bucket_count: int,
-         min_pk_index_pa_bytes: int,
-         user_hash_bucket_chunk_size: int,
-         input_deltas_stats: Dict[int, DeltaStats],
-         deltacat_storage=unimplemented_deltacat_storage) \
-         -> Tuple[List[DeltaAnnotated], int, int]:
+     input_deltas: List[Delta],
+     cluster_resources: Dict[str, float],
+     hash_bucket_count: int,
+     min_pk_index_pa_bytes: int,
+     user_hash_bucket_chunk_size: int,
+     input_deltas_stats: Dict[int, DeltaStats],
+     deltacat_storage=unimplemented_deltacat_storage,
+ ) -> Tuple[List[DeltaAnnotated], int, int]:

      # TODO (pdames): when row counts are available in metadata, use them
      # instead of bytes - memory consumption depends more on number of
@@ -78,9 +83,10 @@ def limit_input_deltas(
      # )
      if min_pk_index_pa_bytes > 0:
          required_heap_mem_for_dedupe = worker_obj_store_mem - min_pk_index_pa_bytes
-         assert required_heap_mem_for_dedupe > 0, \
-             f"Not enough required memory available to re-batch input deltas" \
+         assert required_heap_mem_for_dedupe > 0, (
+             f"Not enough required memory available to re-batch input deltas"
              f"and initiate the dedupe step."
+         )
      # Size of batched deltas must also be reduced to have enough space for primary
      # key index files (from earlier compaction rounds) in the dedupe step, since
      # they will be loaded into worker heap memory.
@@ -88,8 +94,7 @@

      logger.info(f"Total worker object store memory: {worker_obj_store_mem}")
      worker_obj_store_mem_per_task = worker_obj_store_mem / worker_cpus
-     logger.info(f"Worker object store memory/task: "
-                 f"{worker_obj_store_mem_per_task}")
+     logger.info(f"Worker object store memory/task: " f"{worker_obj_store_mem_per_task}")
      worker_task_mem = cluster_resources["memory"]
      logger.info(f"Total worker memory: {worker_task_mem}")
      # TODO (pdames): ensure fixed memory per CPU in heterogenous clusters
@@ -105,8 +110,10 @@
      if input_deltas_stats is None:
          input_deltas_stats = {}

-     input_deltas_stats = {int(stream_pos): DeltaStats(delta_stats)
-                           for stream_pos, delta_stats in input_deltas_stats.items()}
+     input_deltas_stats = {
+         int(stream_pos): DeltaStats(delta_stats)
+         for stream_pos, delta_stats in input_deltas_stats.items()
+     }
      for delta in input_deltas:
          manifest = deltacat_storage.get_delta_manifest(delta)
          delta.manifest = manifest
@@ -118,7 +125,8 @@
              # TODO (pdames): ensure pyarrow object fits in per-task obj store mem
              logger.warning(
                  f"Stats are missing for delta stream position {delta.stream_position}, "
-                 f"materialized delta may not fit in per-task object store memory.")
+                 f"materialized delta may not fit in per-task object store memory."
+             )
          manifest_entries = delta.manifest.entries
          delta_manifest_entries += len(manifest_entries)
          for entry in manifest_entries:
@@ -130,13 +138,13 @@
              logger.info(
                  f"Input deltas limited to "
                  f"{len(limited_input_da_list)} by object store mem "
-                 f"({delta_bytes_pyarrow} > {worker_obj_store_mem})")
+                 f"({delta_bytes_pyarrow} > {worker_obj_store_mem})"
+             )
              break
          delta_annotated = DeltaAnnotated.of(delta)
          limited_input_da_list.append(delta_annotated)

-     logger.info(f"Input deltas to compact this round: "
-                 f"{len(limited_input_da_list)}")
+     logger.info(f"Input deltas to compact this round: " f"{len(limited_input_da_list)}")
      logger.info(f"Input delta bytes to compact: {delta_bytes}")
      logger.info(f"Input delta files to compact: {delta_manifest_entries}")
      logger.info(f"Latest input delta stream position: {latest_stream_position}")
@@ -146,10 +154,12 @@

      # TODO (pdames): determine min hash buckets from size of all deltas
      # (not just deltas for this round)
-     min_hash_bucket_count = int(max(
-         math.ceil(delta_bytes_pyarrow / worker_obj_store_mem_per_task),
-         min(worker_cpus, 256),
-     ))
+     min_hash_bucket_count = int(
+         max(
+             math.ceil(delta_bytes_pyarrow / worker_obj_store_mem_per_task),
+             min(worker_cpus, 256),
+         )
+     )
      logger.info(f"Minimum recommended hash buckets: {min_hash_bucket_count}")

      if hash_bucket_count is None:
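
To make the sizing logic above concrete, here is a worked example of the minimum hash bucket recommendation; the cluster numbers are illustrative assumptions, not values taken from this release:

    import math

    # Assumed example cluster: 32 worker CPUs sharing 64 GiB of object store
    # memory, with input deltas estimated at 512 GiB once decoded into PyArrow.
    worker_cpus = 32
    worker_obj_store_mem = 64 * 1024**3
    worker_obj_store_mem_per_task = worker_obj_store_mem / worker_cpus  # 2 GiB/task
    delta_bytes_pyarrow = 512 * 1024**3

    min_hash_bucket_count = int(
        max(
            math.ceil(delta_bytes_pyarrow / worker_obj_store_mem_per_task),  # 256
            min(worker_cpus, 256),  # 32
        )
    )
    print(min_hash_bucket_count)  # 256 buckets, so each bucket fits per-task object store memory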
@@ -168,7 +178,8 @@
              f"resolve this problem either specify a larger number of hash "
              f"buckets when running compaction, omit a custom hash bucket "
              f"count when running compaction, or provision workers with more "
-             f"task memory per CPU.")
+             f"task memory per CPU."
+         )

      hash_bucket_chunk_size = user_hash_bucket_chunk_size
      max_hash_bucket_chunk_size = math.ceil(
@@ -185,7 +196,8 @@
              f"specify a smaller hash bucket chunk size when running "
              f"compaction, omit a custom hash bucket chunk size when running "
              f"compaction, or provision workers with more task and object "
-             f"store memory per CPU.")
+             f"store memory per CPU."
+         )
      elif not hash_bucket_chunk_size:
          hash_bucket_chunk_size_load_balanced = max(
              math.ceil(max(delta_bytes, delta_bytes_pyarrow) / worker_cpus),