deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. deltacat/__init__.py +41 -15
  2. deltacat/aws/clients.py +12 -31
  3. deltacat/aws/constants.py +1 -1
  4. deltacat/aws/redshift/__init__.py +7 -2
  5. deltacat/aws/redshift/model/manifest.py +54 -50
  6. deltacat/aws/s3u.py +176 -187
  7. deltacat/catalog/delegate.py +151 -185
  8. deltacat/catalog/interface.py +78 -97
  9. deltacat/catalog/model/catalog.py +21 -21
  10. deltacat/catalog/model/table_definition.py +11 -9
  11. deltacat/compute/compactor/__init__.py +12 -16
  12. deltacat/compute/compactor/compaction_session.py +237 -166
  13. deltacat/compute/compactor/model/delta_annotated.py +60 -44
  14. deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
  15. deltacat/compute/compactor/model/delta_file_locator.py +10 -8
  16. deltacat/compute/compactor/model/materialize_result.py +6 -7
  17. deltacat/compute/compactor/model/primary_key_index.py +38 -34
  18. deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
  19. deltacat/compute/compactor/model/round_completion_info.py +25 -19
  20. deltacat/compute/compactor/model/sort_key.py +18 -15
  21. deltacat/compute/compactor/steps/dedupe.py +119 -94
  22. deltacat/compute/compactor/steps/hash_bucket.py +48 -47
  23. deltacat/compute/compactor/steps/materialize.py +86 -92
  24. deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
  25. deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
  26. deltacat/compute/compactor/utils/io.py +59 -47
  27. deltacat/compute/compactor/utils/primary_key_index.py +91 -80
  28. deltacat/compute/compactor/utils/round_completion_file.py +22 -23
  29. deltacat/compute/compactor/utils/system_columns.py +33 -45
  30. deltacat/compute/metastats/meta_stats.py +235 -157
  31. deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
  32. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
  33. deltacat/compute/metastats/stats.py +95 -64
  34. deltacat/compute/metastats/utils/io.py +100 -53
  35. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
  36. deltacat/compute/metastats/utils/ray_utils.py +38 -33
  37. deltacat/compute/stats/basic.py +107 -69
  38. deltacat/compute/stats/models/delta_column_stats.py +11 -8
  39. deltacat/compute/stats/models/delta_stats.py +59 -32
  40. deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
  41. deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
  42. deltacat/compute/stats/models/stats_result.py +24 -14
  43. deltacat/compute/stats/utils/intervals.py +16 -9
  44. deltacat/compute/stats/utils/io.py +86 -51
  45. deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
  46. deltacat/constants.py +4 -13
  47. deltacat/io/__init__.py +2 -2
  48. deltacat/io/aws/redshift/redshift_datasource.py +157 -143
  49. deltacat/io/dataset.py +14 -17
  50. deltacat/io/read_api.py +36 -33
  51. deltacat/logs.py +94 -42
  52. deltacat/storage/__init__.py +18 -8
  53. deltacat/storage/interface.py +196 -213
  54. deltacat/storage/model/delta.py +45 -51
  55. deltacat/storage/model/list_result.py +12 -8
  56. deltacat/storage/model/namespace.py +4 -5
  57. deltacat/storage/model/partition.py +42 -42
  58. deltacat/storage/model/stream.py +29 -30
  59. deltacat/storage/model/table.py +14 -14
  60. deltacat/storage/model/table_version.py +32 -31
  61. deltacat/storage/model/types.py +1 -0
  62. deltacat/tests/stats/test_intervals.py +11 -24
  63. deltacat/tests/utils/__init__.py +0 -0
  64. deltacat/tests/utils/test_record_batch_tables.py +284 -0
  65. deltacat/types/media.py +3 -4
  66. deltacat/types/tables.py +31 -21
  67. deltacat/utils/common.py +5 -11
  68. deltacat/utils/numpy.py +20 -22
  69. deltacat/utils/pandas.py +73 -100
  70. deltacat/utils/performance.py +3 -9
  71. deltacat/utils/placement.py +259 -230
  72. deltacat/utils/pyarrow.py +302 -89
  73. deltacat/utils/ray_utils/collections.py +2 -1
  74. deltacat/utils/ray_utils/concurrency.py +27 -28
  75. deltacat/utils/ray_utils/dataset.py +28 -28
  76. deltacat/utils/ray_utils/performance.py +5 -9
  77. deltacat/utils/ray_utils/runtime.py +9 -10
  78. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
  79. deltacat-0.1.12.dist-info/RECORD +110 -0
  80. deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
  81. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
  82. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
  83. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
@@ -1,52 +1,59 @@
1
1
  # Allow classes to use self-referencing Type hints in Python 3.7.
2
2
  from __future__ import annotations
3
3
 
4
- import ray
5
- import os
6
4
  import functools
7
5
  import logging
8
-
6
+ import os
9
7
  import pathlib
8
+ from typing import Dict, List, Optional, Set
10
9
 
11
- from typing import Dict, Set, List, Optional, Tuple
12
-
13
- from deltacat.compute.stats.models.delta_stats import DeltaStats
10
+ import ray
14
11
  from ray.types import ObjectRef
15
12
 
16
13
  from deltacat import logs
17
- from deltacat.constants import BYTES_PER_GIBIBYTE
18
- from deltacat.compute.stats.models.delta_stats_cache_result import DeltaStatsCacheResult
19
- from deltacat.compute.stats.models.stats_result import StatsResult
14
+ from deltacat.compute.compactor import DeltaAnnotated
15
+ from deltacat.compute.metastats.model.stats_cluster_size_estimator import (
16
+ StatsClusterSizeEstimator,
17
+ )
18
+ from deltacat.compute.metastats.stats import start_stats_collection
19
+ from deltacat.compute.metastats.utils.constants import (
20
+ DEFAULT_CPUS_PER_INSTANCE_R5_8XLARGE,
21
+ DEFAULT_JOB_RUN_TRACE_ID,
22
+ HEAD_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO,
23
+ MANIFEST_FILE_COUNT_PER_CPU,
24
+ STATS_CLUSTER_R5_INSTANCE_TYPE,
25
+ WORKER_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO,
26
+ )
20
27
  from deltacat.compute.metastats.utils.io import read_cached_partition_stats
28
+ from deltacat.compute.metastats.utils.pyarrow_memory_estimation_function import (
29
+ estimation_function,
30
+ )
31
+ from deltacat.compute.metastats.utils.ray_utils import replace_cluster_cfg_vars
32
+ from deltacat.compute.stats.models.delta_stats import DeltaStats
33
+ from deltacat.compute.stats.models.delta_stats_cache_result import DeltaStatsCacheResult
34
+ from deltacat.compute.stats.utils.intervals import DeltaRange, merge_intervals
21
35
  from deltacat.compute.stats.utils.io import get_deltas_from_range
22
- from deltacat.compute.stats.utils.intervals import merge_intervals, DeltaRange
23
-
24
- from deltacat.compute.metastats.utils.ray_utils import ray_up, ray_init, get_head_node_ip, replace_cluster_cfg_vars, ray_down, clean_up_cluster_cfg_file
25
- from deltacat.compute.metastats.utils.constants import MANIFEST_FILE_COUNT_PER_CPU, R5_MEMORY_PER_CPU, HEAD_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO, \
26
- WORKER_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO, STATS_CLUSTER_R5_INSTANCE_TYPE, DEFAULT_JOB_RUN_TRACE_ID, DEFAULT_CPUS_PER_INSTANCE_R5_8XLARGE
27
- from deltacat.compute.metastats.model.stats_cluster_size_estimator import StatsClusterSizeEstimator
28
- from deltacat.compute.metastats.utils.pyarrow_memory_estimation_function import estimation_function
29
-
30
- from deltacat.storage import PartitionLocator, DeltaLocator, Delta
36
+ from deltacat.constants import (
37
+ BYTES_PER_GIBIBYTE,
38
+ PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS,
39
+ )
40
+ from deltacat.storage import Delta, DeltaLocator, PartitionLocator
31
41
  from deltacat.storage import interface as unimplemented_deltacat_storage
32
-
33
- from deltacat.constants import PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS
34
- from deltacat.compute.compactor import DeltaAnnotated
35
- from deltacat.compute.metastats.stats import start_stats_collection
36
-
37
42
  from deltacat.utils.performance import timed_invocation
38
43
 
39
44
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
40
45
 
41
46
 
42
- def collect_metastats(source_partition_locators: List[PartitionLocator],
43
- columns: Optional[List[str]] = None,
44
- file_count_per_cpu: Optional[int] = MANIFEST_FILE_COUNT_PER_CPU,
45
- stat_results_s3_bucket: Optional[str] = None,
46
- metastats_results_s3_bucket: Optional[str] = None,
47
- deltacat_storage=unimplemented_deltacat_storage,
48
- *args,
49
- **kwargs) -> Dict[str, Dict[int, DeltaStats]]:
47
+ def collect_metastats(
48
+ source_partition_locators: List[PartitionLocator],
49
+ columns: Optional[List[str]] = None,
50
+ file_count_per_cpu: Optional[int] = MANIFEST_FILE_COUNT_PER_CPU,
51
+ stat_results_s3_bucket: Optional[str] = None,
52
+ metastats_results_s3_bucket: Optional[str] = None,
53
+ deltacat_storage=unimplemented_deltacat_storage,
54
+ *args,
55
+ **kwargs,
56
+ ) -> Dict[str, Dict[int, DeltaStats]]:
50
57
 
51
58
  # TODO: Add CompactionEventDispatcher for metastats collection started event
52
59
  stats_res_all_partitions: Dict[str, Dict[int, DeltaStats]] = {}
@@ -68,7 +75,7 @@ def collect_metastats(source_partition_locators: List[PartitionLocator],
68
75
  file_count_per_cpu=file_count_per_cpu,
69
76
  deltacat_storage=deltacat_storage,
70
77
  *args,
71
- **kwargs
78
+ **kwargs,
72
79
  )
73
80
  stats_res_obj_ref_all_partitions[partition_value_string] = stats_res_obj_ref
74
81
  for pv, stats_res_obj_ref in stats_res_obj_ref_all_partitions.items():
@@ -88,63 +95,80 @@ def collect_metastats(source_partition_locators: List[PartitionLocator],
88
95
 
89
96
  for stats in stats_column_result.get("column_stats"):
90
97
  partition_pyarrow_sum += stats.get("stats").get("pyarrowTableBytes")
91
- aggregate_partition_stats_for_validation[partition_val] = [partition_stats_sum_row_count, partition_pyarrow_sum]
92
- logger.info(f"partitions_stats_result for partition value: {partition_val}: rowCount: {partition_stats_sum_row_count}; pyarrowTableBytes: {partition_pyarrow_sum}")
98
+ aggregate_partition_stats_for_validation[partition_val] = [
99
+ partition_stats_sum_row_count,
100
+ partition_pyarrow_sum,
101
+ ]
102
+ logger.info(
103
+ f"partitions_stats_result for partition value: {partition_val}: rowCount: {partition_stats_sum_row_count}; pyarrowTableBytes: {partition_pyarrow_sum}"
104
+ )
93
105
  return aggregate_partition_stats_for_validation
94
106
 
95
107
  # return stats_res_all_partitions
96
108
 
109
+
97
110
  @ray.remote(num_cpus=1)
98
- def collect_from_partition(source_partition_locator: PartitionLocator,
99
- partition_value_string,
100
- partition_canonical_string,
101
- delta_stream_position_range_set: Optional[Set[DeltaRange]] = None,
102
- columns: Optional[List[str]] = None,
103
- file_count_per_cpu: Optional[int] = MANIFEST_FILE_COUNT_PER_CPU,
104
- stat_results_s3_bucket: Optional[str] = None,
105
- metastats_results_s3_bucket: Optional[str] = None,
106
- deltacat_storage=unimplemented_deltacat_storage,
107
- *args,
108
- **kwargs) -> ObjectRef[Dict[int, DeltaStats]]:
111
+ def collect_from_partition(
112
+ source_partition_locator: PartitionLocator,
113
+ partition_value_string,
114
+ partition_canonical_string,
115
+ delta_stream_position_range_set: Optional[Set[DeltaRange]] = None,
116
+ columns: Optional[List[str]] = None,
117
+ file_count_per_cpu: Optional[int] = MANIFEST_FILE_COUNT_PER_CPU,
118
+ stat_results_s3_bucket: Optional[str] = None,
119
+ metastats_results_s3_bucket: Optional[str] = None,
120
+ deltacat_storage=unimplemented_deltacat_storage,
121
+ *args,
122
+ **kwargs,
123
+ ) -> ObjectRef[Dict[int, DeltaStats]]:
109
124
 
110
125
  if not columns:
111
- columns = deltacat_storage.get_table_version_column_names(source_partition_locator.namespace,
112
- source_partition_locator.table_name,
113
- source_partition_locator.table_version)
114
- deltas = _find_deltas(source_partition_locator,
115
- delta_stream_position_range_set,
116
- deltacat_storage)
126
+ columns = deltacat_storage.get_table_version_column_names(
127
+ source_partition_locator.namespace,
128
+ source_partition_locator.table_name,
129
+ source_partition_locator.table_version,
130
+ )
131
+ deltas = _find_deltas(
132
+ source_partition_locator, delta_stream_position_range_set, deltacat_storage
133
+ )
117
134
 
118
135
  logger.info(f"Find {len(deltas)} deltas!")
119
136
  trace_id = DEFAULT_JOB_RUN_TRACE_ID
120
137
  if "trace_id" in kwargs:
121
138
  trace_id = kwargs.get("trace_id")
122
139
  else:
123
- logger.warning(f"No job run trace id specified, default to {DEFAULT_JOB_RUN_TRACE_ID}")
140
+ logger.warning(
141
+ f"No job run trace id specified, default to {DEFAULT_JOB_RUN_TRACE_ID}"
142
+ )
124
143
 
125
144
  cpus_per_instance = DEFAULT_CPUS_PER_INSTANCE_R5_8XLARGE
126
145
  if cpus_per_instance in kwargs:
127
146
  cpus_per_instance = kwargs.get("cpus_per_instance")
128
147
  else:
129
- logger.info(f"Stats cluster CPUS per instance not specified, default to {DEFAULT_CPUS_PER_INSTANCE_R5_8XLARGE}")
148
+ logger.info(
149
+ f"Stats cluster CPUS per instance not specified, default to {DEFAULT_CPUS_PER_INSTANCE_R5_8XLARGE}"
150
+ )
130
151
 
131
152
  stats_res_obj_ref = _start_all_stats_collection_from_deltas(
132
- deltas,
133
- partition_value_string,
134
- partition_canonical_string,
135
- columns,
136
- trace_id,
137
- file_count_per_cpu,
138
- cpus_per_instance,
139
- stat_results_s3_bucket,
140
- metastats_results_s3_bucket,
141
- deltacat_storage)
153
+ deltas,
154
+ partition_value_string,
155
+ partition_canonical_string,
156
+ columns,
157
+ trace_id,
158
+ file_count_per_cpu,
159
+ cpus_per_instance,
160
+ stat_results_s3_bucket,
161
+ metastats_results_s3_bucket,
162
+ deltacat_storage,
163
+ )
142
164
  return stats_res_obj_ref
143
165
 
144
166
 
145
- def _find_deltas(source_partition_locator: PartitionLocator,
146
- delta_stream_position_range_set: Optional[Set[DeltaRange]] = None,
147
- deltacat_storage=unimplemented_deltacat_storage) -> List[Delta]:
167
+ def _find_deltas(
168
+ source_partition_locator: PartitionLocator,
169
+ delta_stream_position_range_set: Optional[Set[DeltaRange]] = None,
170
+ deltacat_storage=unimplemented_deltacat_storage,
171
+ ) -> List[Delta]:
148
172
 
149
173
  if delta_stream_position_range_set is None:
150
174
  delta_stream_position_range_set = {(None, None)}
@@ -152,8 +176,9 @@ def _find_deltas(source_partition_locator: PartitionLocator,
152
176
 
153
177
  for range_pair in merge_intervals(delta_stream_position_range_set):
154
178
  begin, end = range_pair
155
- promise: ObjectRef[List[Delta]] = get_deltas_from_range.remote(source_partition_locator, begin, end,
156
- deltacat_storage)
179
+ promise: ObjectRef[List[Delta]] = get_deltas_from_range.remote(
180
+ source_partition_locator, begin, end, deltacat_storage
181
+ )
157
182
  delta_range_lookup_pending.append(promise)
158
183
 
159
184
  delta_list_by_ranges: List[List[Delta]] = ray.get(delta_range_lookup_pending)
@@ -162,25 +187,28 @@ def _find_deltas(source_partition_locator: PartitionLocator,
162
187
 
163
188
 
164
189
  def _start_all_stats_collection_from_deltas(
165
- deltas: List[Delta],
166
- partition_value_string: Optional[str],
167
- partition_canonical_string: Optional[str],
168
- columns: Optional[List[str]] = None,
169
- trace_id: Optional[str] = None,
170
- file_count_per_cpu: Optional[int] = MANIFEST_FILE_COUNT_PER_CPU,
171
- cpus_per_instance: Optional[int] = DEFAULT_CPUS_PER_INSTANCE_R5_8XLARGE,
172
- stat_results_s3_bucket: Optional[str] = None,
173
- metastats_results_s3_bucket: Optional[str] = None,
174
- deltacat_storage=unimplemented_deltacat_storage) -> Dict[int, DeltaStats]:
175
-
176
- delta_cache_lookup_pending: List[List[ObjectRef[DeltaStatsCacheResult], Delta]] = []
190
+ deltas: List[Delta],
191
+ partition_value_string: Optional[str],
192
+ partition_canonical_string: Optional[str],
193
+ columns: Optional[List[str]] = None,
194
+ trace_id: Optional[str] = None,
195
+ file_count_per_cpu: Optional[int] = MANIFEST_FILE_COUNT_PER_CPU,
196
+ cpus_per_instance: Optional[int] = DEFAULT_CPUS_PER_INSTANCE_R5_8XLARGE,
197
+ stat_results_s3_bucket: Optional[str] = None,
198
+ metastats_results_s3_bucket: Optional[str] = None,
199
+ deltacat_storage=unimplemented_deltacat_storage,
200
+ ) -> Dict[int, DeltaStats]:
201
+
177
202
  delta_stats_compute_list: List[DeltaLocator] = []
178
203
  meta_stats_list_ready: List[DeltaLocator] = []
179
204
  meta_stats_list_to_compute: List[DeltaLocator] = []
180
205
 
181
206
  if stat_results_s3_bucket:
182
- found_columns_stats_map: Dict[int, List[DeltaStatsCacheResult]] = \
183
- read_cached_partition_stats(partition_canonical_string, stat_results_s3_bucket)
207
+ found_columns_stats_map: Dict[
208
+ int, List[DeltaStatsCacheResult]
209
+ ] = read_cached_partition_stats(
210
+ partition_canonical_string, stat_results_s3_bucket
211
+ )
184
212
 
185
213
  delta_cache_res: List[DeltaStats] = []
186
214
  for delta in deltas:
@@ -188,10 +216,11 @@ def _start_all_stats_collection_from_deltas(
188
216
  cached_result = found_columns_stats_map[delta.stream_position]
189
217
  if cached_result.hits:
190
218
  delta_cache_res.append(cached_result.hits)
191
- meta_stats_list_ready.append(cached_result.hits.column_stats[0].manifest_stats.delta_locator)
219
+ meta_stats_list_ready.append(
220
+ cached_result.hits.column_stats[0].manifest_stats.delta_locator
221
+ )
192
222
 
193
223
  if cached_result.misses:
194
- missed_column_names: List[str] = cached_result.misses.column_names
195
224
  delta_locator: DeltaLocator = cached_result.misses.delta_locator
196
225
  delta_stats_compute_list.append(delta_locator)
197
226
  meta_stats_list_to_compute.append(delta_locator)
@@ -202,46 +231,55 @@ def _start_all_stats_collection_from_deltas(
202
231
  logger.info(f"Collecting stats on {len(delta_stats_compute_list)} deltas!")
203
232
  delta_stats_compute_res: Dict[int, DeltaStats] = {}
204
233
  if delta_stats_compute_list:
205
- delta_stats_compute_res = _start_metadata_stats_collection(delta_stats_compute_list=delta_stats_compute_list,
206
- meta_stats_list_ready=meta_stats_list_ready,
207
- meta_stats_list_to_compute=meta_stats_list_to_compute,
208
- partition_value_string=partition_value_string,
209
- partition_canonical_string=partition_canonical_string,
210
- columns=columns,
211
- trace_id=trace_id,
212
- file_count_per_cpu=file_count_per_cpu,
213
- cpus_per_instance=cpus_per_instance,
214
- stat_results_s3_bucket=stat_results_s3_bucket,
215
- metastats_results_s3_bucket=metastats_results_s3_bucket,
216
- deltacat_storage=deltacat_storage)
234
+ delta_stats_compute_res = _start_metadata_stats_collection(
235
+ delta_stats_compute_list=delta_stats_compute_list,
236
+ meta_stats_list_ready=meta_stats_list_ready,
237
+ meta_stats_list_to_compute=meta_stats_list_to_compute,
238
+ partition_value_string=partition_value_string,
239
+ partition_canonical_string=partition_canonical_string,
240
+ columns=columns,
241
+ trace_id=trace_id,
242
+ file_count_per_cpu=file_count_per_cpu,
243
+ cpus_per_instance=cpus_per_instance,
244
+ stat_results_s3_bucket=stat_results_s3_bucket,
245
+ metastats_results_s3_bucket=metastats_results_s3_bucket,
246
+ deltacat_storage=deltacat_storage,
247
+ )
217
248
 
218
249
  delta_stream_range_stats: Dict[int, DeltaStats] = {}
219
250
  for delta_column_stats in delta_cache_res:
220
- assert len(delta_column_stats.column_stats) > 0, \
221
- f"Expected columns of `{delta_column_stats}` to be non-empty"
222
- stream_position = delta_column_stats.column_stats[0].manifest_stats.delta_locator.stream_position
251
+ assert (
252
+ len(delta_column_stats.column_stats) > 0
253
+ ), f"Expected columns of `{delta_column_stats}` to be non-empty"
254
+ stream_position = delta_column_stats.column_stats[
255
+ 0
256
+ ].manifest_stats.delta_locator.stream_position
223
257
  delta_stream_range_stats[stream_position] = delta_column_stats
224
258
 
225
259
  # stats collection result: if we have cached stats and missed column stats for same delta, stats collection for this delta is still needed
226
260
  # and the final result will use the newly collected stats for this delta.
227
- stats_collection_res: Dict[int, DeltaStats] = {**delta_stream_range_stats, **delta_stats_compute_res}
261
+ stats_collection_res: Dict[int, DeltaStats] = {
262
+ **delta_stream_range_stats,
263
+ **delta_stats_compute_res,
264
+ }
228
265
 
229
266
  return stats_collection_res
230
267
 
231
268
 
232
269
  def _start_metadata_stats_collection(
233
- delta_stats_compute_list: List[DeltaLocator],
234
- meta_stats_list_ready: List[DeltaLocator],
235
- meta_stats_list_to_compute: List[DeltaLocator],
236
- partition_value_string: Optional[str],
237
- partition_canonical_string: Optional[str],
238
- columns: Optional[List[str]] = None,
239
- trace_id: Optional[str] = None,
240
- file_count_per_cpu: Optional[int] = MANIFEST_FILE_COUNT_PER_CPU,
241
- cpus_per_instance: Optional[int] = DEFAULT_CPUS_PER_INSTANCE_R5_8XLARGE,
242
- stat_results_s3_bucket: Optional[str] = None,
243
- metastats_results_s3_bucket: Optional[str] = None,
244
- deltacat_storage=unimplemented_deltacat_storage) -> Dict[int, DeltaStats]:
270
+ delta_stats_compute_list: List[DeltaLocator],
271
+ meta_stats_list_ready: List[DeltaLocator],
272
+ meta_stats_list_to_compute: List[DeltaLocator],
273
+ partition_value_string: Optional[str],
274
+ partition_canonical_string: Optional[str],
275
+ columns: Optional[List[str]] = None,
276
+ trace_id: Optional[str] = None,
277
+ file_count_per_cpu: Optional[int] = MANIFEST_FILE_COUNT_PER_CPU,
278
+ cpus_per_instance: Optional[int] = DEFAULT_CPUS_PER_INSTANCE_R5_8XLARGE,
279
+ stat_results_s3_bucket: Optional[str] = None,
280
+ metastats_results_s3_bucket: Optional[str] = None,
281
+ deltacat_storage=unimplemented_deltacat_storage,
282
+ ) -> Dict[int, DeltaStats]:
245
283
 
246
284
  meta_stats_res_ready: Dict[int, int] = {}
247
285
 
@@ -254,7 +292,11 @@ def _start_metadata_stats_collection(
254
292
  delta_meta_count += entry.meta.content_length
255
293
  meta_stats_res_ready[delta.stream_position] = delta_meta_count
256
294
 
257
- first_delta_locator = meta_stats_list_ready[0] if meta_stats_list_ready else meta_stats_list_to_compute[0]
295
+ first_delta_locator = (
296
+ meta_stats_list_ready[0]
297
+ if meta_stats_list_ready
298
+ else meta_stats_list_to_compute[0]
299
+ )
258
300
  manifest = deltacat_storage.get_delta_manifest(first_delta_locator)
259
301
  content_type = manifest.meta.content_type
260
302
  content_encoding = manifest.meta.content_type
@@ -272,40 +314,42 @@ def _start_metadata_stats_collection(
272
314
  delta_meta_count += entry.meta.content_length
273
315
  meta_stats_to_compute[delta.stream_position] = delta_meta_count
274
316
 
275
- min_cpus = _estimate_cpus_needed(meta_stats_to_compute, R5_MEMORY_PER_CPU, file_count_per_cpu, manifest_file_count_to_compute,
276
- partition_value_string)
277
- min_workers = int(min_cpus // cpus_per_instance) + 1
278
-
279
- batched_delta_stats_compute_list = _batch_deltas(delta_stats_compute_list,
280
- file_count_per_cpu,
281
- cpus_per_instance,
282
- deltacat_storage,
283
- content_type,
284
- content_encoding)
317
+ batched_delta_stats_compute_list = _batch_deltas(
318
+ delta_stats_compute_list,
319
+ file_count_per_cpu,
320
+ cpus_per_instance,
321
+ deltacat_storage,
322
+ content_type,
323
+ content_encoding,
324
+ )
285
325
 
286
326
  # out_cluster_cfg = _setup_stats_cluster(min_workers,
287
327
  # partition_value_string,
288
328
  # trace_id,
289
329
  # cpus_per_instance)
290
330
  out_cluster_cfg = None
291
- delta_stats_res: Dict[int, DeltaStats] = _start_stats_cluster(out_cluster_cfg,
292
- batched_delta_stats_compute_list,
293
- columns,
294
- stat_results_s3_bucket,
295
- metastats_results_s3_bucket,
296
- deltacat_storage,
297
- partition_canonical_string)
331
+ delta_stats_res: Dict[int, DeltaStats] = _start_stats_cluster(
332
+ out_cluster_cfg,
333
+ batched_delta_stats_compute_list,
334
+ columns,
335
+ stat_results_s3_bucket,
336
+ metastats_results_s3_bucket,
337
+ deltacat_storage,
338
+ partition_canonical_string,
339
+ )
298
340
 
299
341
  return delta_stats_res
300
342
 
301
343
 
302
- def _start_stats_cluster(out_cluster_cfg: str,
303
- batched_delta_stats_compute_list: List[DeltaAnnotated],
304
- columns: List[str],
305
- stat_results_s3_bucket: Optional[str] = None,
306
- metastats_results_s3_bucket: Optional[str] = None,
307
- deltacat_storage=unimplemented_deltacat_storage,
308
- partition_val:Optional[str]="partition_val"):
344
+ def _start_stats_cluster(
345
+ out_cluster_cfg: str,
346
+ batched_delta_stats_compute_list: List[DeltaAnnotated],
347
+ columns: List[str],
348
+ stat_results_s3_bucket: Optional[str] = None,
349
+ metastats_results_s3_bucket: Optional[str] = None,
350
+ deltacat_storage=unimplemented_deltacat_storage,
351
+ partition_val: Optional[str] = "partition_val",
352
+ ):
309
353
  # ray_up_latency = timed_invocation(
310
354
  # func=ray_up,
311
355
  # cluster_cfg=out_cluster_cfg
@@ -321,46 +365,72 @@ def _start_stats_cluster(out_cluster_cfg: str,
321
365
  columns=columns,
322
366
  stat_results_s3_bucket=stat_results_s3_bucket,
323
367
  metastats_results_s3_bucket=metastats_results_s3_bucket,
324
- deltacat_storage=deltacat_storage
368
+ deltacat_storage=deltacat_storage,
369
+ )
370
+ logger.info(
371
+ f"actual_stats_collection_latency: {partition_val}: {stats_collection_latency}"
325
372
  )
326
- logger.info(f"actual_stats_collection_latency: {partition_val}: {stats_collection_latency}")
327
373
  # client.disconnect()
328
374
  # ray_down(out_cluster_cfg)
329
375
  # clean_up_cluster_cfg_file(out_cluster_cfg)
330
376
  return delta_stream_range_stats
331
377
 
332
378
 
333
- def _estimate_cpus_needed(meta_stats_to_compute, memory_gb_per_cpu, file_count_per_cpu, manifest_file_count_to_compute, partition_val):
379
+ def _estimate_cpus_needed(
380
+ meta_stats_to_compute,
381
+ memory_gb_per_cpu,
382
+ file_count_per_cpu,
383
+ manifest_file_count_to_compute,
384
+ partition_val,
385
+ ):
334
386
  content_length_sum = 0
335
387
  for val in meta_stats_to_compute.values():
336
388
  content_length_sum += val
337
389
  manifest_file_count_sum = 0
338
390
  for val in manifest_file_count_to_compute.values():
339
391
  manifest_file_count_sum += val
340
- estimated_memory_bytes_needed = content_length_sum * PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS
392
+ estimated_memory_bytes_needed = (
393
+ content_length_sum * PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS
394
+ )
341
395
  estimated_memory_gib_needed = estimated_memory_bytes_needed / BYTES_PER_GIBIBYTE
342
396
 
343
- logger.info(f"estimated_memory_gib_needed: {partition_val} : {estimated_memory_gib_needed}")
397
+ logger.info(
398
+ f"estimated_memory_gib_needed: {partition_val} : {estimated_memory_gib_needed}"
399
+ )
344
400
  logger.info(f"manifest_file_count_sum: {partition_val} : {manifest_file_count_sum}")
345
401
 
346
- memory_per_cpu_available = memory_gb_per_cpu * (1 - WORKER_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO)
347
- estimator = StatsClusterSizeEstimator.of(memory_per_cpu_available, file_count_per_cpu,
348
- estimated_memory_gib_needed, manifest_file_count_sum)
402
+ memory_per_cpu_available = memory_gb_per_cpu * (
403
+ 1 - WORKER_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO
404
+ )
405
+ estimator = StatsClusterSizeEstimator.of(
406
+ memory_per_cpu_available,
407
+ file_count_per_cpu,
408
+ estimated_memory_gib_needed,
409
+ manifest_file_count_sum,
410
+ )
349
411
  min_cpus = StatsClusterSizeEstimator.estimate_cpus_needed(estimator)
350
412
  return min_cpus
351
413
 
352
414
 
353
- def _batch_deltas(delta_stats_compute_list,
354
- file_count_per_cpu,
355
- cpu_per_instance,
356
- deltacat_storage,
357
- content_type,
358
- content_encoding) -> List[DeltaAnnotated]:
359
- worker_node_mem = cpu_per_instance * (1 - WORKER_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO) * BYTES_PER_GIBIBYTE
415
+ def _batch_deltas(
416
+ delta_stats_compute_list,
417
+ file_count_per_cpu,
418
+ cpu_per_instance,
419
+ deltacat_storage,
420
+ content_type,
421
+ content_encoding,
422
+ ) -> List[DeltaAnnotated]:
423
+ worker_node_mem = (
424
+ cpu_per_instance
425
+ * (1 - WORKER_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO)
426
+ * BYTES_PER_GIBIBYTE
427
+ )
360
428
  delta_list = []
361
429
 
362
430
  estimate_based_on_content_length = functools.partial(
363
- estimation_function, content_type=content_type, content_encoding=content_encoding,
431
+ estimation_function,
432
+ content_type=content_type,
433
+ content_encoding=content_encoding,
364
434
  )
365
435
 
366
436
  for delta_locator in delta_stats_compute_list:
@@ -381,8 +451,14 @@ def _batch_deltas(delta_stats_compute_list,
381
451
  return rebatched_da_list
382
452
 
383
453
 
384
- def _setup_stats_cluster(min_workers, partition_value_string, trace_id, cpus_per_instance):
385
- stats_cluster_instance_type = int(cpus_per_instance // 4) if cpus_per_instance else STATS_CLUSTER_R5_INSTANCE_TYPE
454
+ def _setup_stats_cluster(
455
+ min_workers, partition_value_string, trace_id, cpus_per_instance
456
+ ):
457
+ stats_cluster_instance_type = (
458
+ int(cpus_per_instance // 4)
459
+ if cpus_per_instance
460
+ else STATS_CLUSTER_R5_INSTANCE_TYPE
461
+ )
386
462
  stats_cluster_instance_type_str = f"r5.{stats_cluster_instance_type}xlarge".strip()
387
463
  parent_dir_path = pathlib.Path(__file__).parent.resolve()
388
464
  in_cfg = os.path.join(parent_dir_path, "config", "stats_cluster_example.yaml")
@@ -393,7 +469,9 @@ def _setup_stats_cluster(min_workers, partition_value_string, trace_id, cpus_per
393
469
  min_workers=min_workers,
394
470
  head_type=stats_cluster_instance_type_str,
395
471
  worker_type=stats_cluster_instance_type_str,
396
- head_object_store_memory_pct=HEAD_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO*100,
397
- worker_object_store_memory_pct=WORKER_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO*100)
472
+ head_object_store_memory_pct=HEAD_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO * 100,
473
+ worker_object_store_memory_pct=WORKER_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO
474
+ * 100,
475
+ )
398
476
 
399
477
  return out_cluster_cfg_file_path
@@ -1,20 +1,16 @@
1
1
  # Allow classes to use self-referencing Type hints in Python 3.7.
2
2
  from __future__ import annotations
3
3
 
4
- from collections import defaultdict
5
- from typing import List, Dict, Optional, Set, Any, NamedTuple
4
+ from typing import Dict
6
5
 
7
6
  from deltacat.compute.stats.models.delta_stats import DeltaStats
8
- from deltacat.compute.stats.models.manifest_entry_stats import ManifestEntryStats
9
- from deltacat.compute.stats.models.stats_result import StatsResult
10
- from deltacat.compute.stats.types import StatsType
11
- from deltacat.storage import DeltaLocator
12
7
 
13
8
 
14
9
  class PartitionStats(dict):
15
-
16
10
  @staticmethod
17
- def of(delta_stats: Dict[DeltaStats], partition_canonical_string: str) -> PartitionStats:
11
+ def of(
12
+ delta_stats: Dict[DeltaStats], partition_canonical_string: str
13
+ ) -> PartitionStats:
18
14
  ps = PartitionStats()
19
15
  ps["delta_stats"] = delta_stats
20
16
  ps["partition_canonical_string"] = partition_canonical_string
@@ -25,7 +21,9 @@ class PartitionStats(dict):
25
21
  delta_stats_dict = {}
26
22
  for stream_position, delta_stats in partition_stats["delta_stats"].items():
27
23
  delta_stats_dict[stream_position] = DeltaStats.build_from_dict(delta_stats)
28
- return PartitionStats.of(delta_stats_dict, partition_stats["partition_canonical_string"])
24
+ return PartitionStats.of(
25
+ delta_stats_dict, partition_stats["partition_canonical_string"]
26
+ )
29
27
 
30
28
  @property
31
29
  def delta_stats(self) -> Dict[DeltaStats]:
@@ -34,4 +32,3 @@ class PartitionStats(dict):
34
32
  @property
35
33
  def partition_canonical_string(self) -> str:
36
34
  return self["partition_canonical_string"]
37
-
@@ -5,9 +5,13 @@ from deltacat.compute.stats.models.delta_stats import DeltaStats
5
5
 
6
6
 
7
7
  class StatsClusterSizeEstimator(dict):
8
-
9
8
  @staticmethod
10
- def of(memory_per_cpu: int, file_count_per_cpu: int, total_memory_needed: int, total_file_count: int) -> DeltaStats:
9
+ def of(
10
+ memory_per_cpu: int,
11
+ file_count_per_cpu: int,
12
+ total_memory_needed: int,
13
+ total_file_count: int,
14
+ ) -> DeltaStats:
11
15
  estimator = StatsClusterSizeEstimator()
12
16
  estimator["memory_per_cpu"] = memory_per_cpu
13
17
  estimator["file_count_per_cpu"] = file_count_per_cpu
@@ -55,6 +59,10 @@ class StatsClusterSizeEstimator(dict):
55
59
  # 4. If not enough stats for this content_type and content_encoding combination: use the basic PYARROW_INFLATION_MULTIPLIER instead.
56
60
  # So, only option 4 is implemented here since this pre-requirement for first 3 options are not met for first round of metastats&stats collection.
57
61
 
58
- min_cpus_based_on_memory = (estimator.total_memory_needed // estimator.memory_per_cpu) + 1
59
- min_cpus_based_on_file_count = (estimator.total_file_count // estimator.file_count_per_cpu) + 1
60
- return max(min_cpus_based_on_memory, min_cpus_based_on_file_count)
62
+ min_cpus_based_on_memory = (
63
+ estimator.total_memory_needed // estimator.memory_per_cpu
64
+ ) + 1
65
+ min_cpus_based_on_file_count = (
66
+ estimator.total_file_count // estimator.file_count_per_cpu
67
+ ) + 1
68
+ return max(min_cpus_based_on_memory, min_cpus_based_on_file_count)