deltacat 1.1.17__py3-none-any.whl → 1.1.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/constants.py +0 -1
  3. deltacat/compute/compactor/model/compact_partition_params.py +76 -0
  4. deltacat/compute/compactor/model/compaction_session_audit_info.py +26 -0
  5. deltacat/compute/compactor/model/delta_annotated.py +16 -9
  6. deltacat/compute/compactor_v2/constants.py +3 -0
  7. deltacat/compute/compactor_v2/private/compaction_utils.py +9 -5
  8. deltacat/compute/compactor_v2/utils/content_type_params.py +185 -34
  9. deltacat/compute/compactor_v2/utils/io.py +28 -14
  10. deltacat/compute/compactor_v2/utils/primary_key_index.py +9 -4
  11. deltacat/compute/compactor_v2/utils/task_options.py +128 -183
  12. deltacat/compute/resource_estimation/__init__.py +27 -0
  13. deltacat/compute/resource_estimation/delta.py +271 -0
  14. deltacat/compute/resource_estimation/manifest.py +394 -0
  15. deltacat/compute/resource_estimation/model.py +165 -0
  16. deltacat/compute/resource_estimation/parquet.py +108 -0
  17. deltacat/constants.py +5 -0
  18. deltacat/exceptions.py +2 -4
  19. deltacat/logs.py +8 -0
  20. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +77 -0
  21. deltacat/tests/compute/compact_partition_rebase_test_cases.py +308 -0
  22. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +159 -0
  23. deltacat/tests/compute/compactor_v2/test_compaction_session.py +157 -0
  24. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +3 -3
  25. deltacat/tests/compute/resource_estimation/test_delta.py +605 -0
  26. deltacat/tests/compute/resource_estimation/test_manifest.py +921 -0
  27. deltacat/tests/compute/test_compact_partition_rebase.py +13 -4
  28. deltacat/tests/compute/test_util_common.py +2 -0
  29. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -5
  30. deltacat/tests/test_logs.py +34 -0
  31. deltacat/tests/test_utils/pyarrow.py +15 -5
  32. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/METADATA +2 -2
  33. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/RECORD +38 -54
  34. deltacat/compute/metastats/meta_stats.py +0 -479
  35. deltacat/compute/metastats/model/__init__.py +0 -0
  36. deltacat/compute/metastats/model/partition_stats_dict.py +0 -34
  37. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +0 -68
  38. deltacat/compute/metastats/stats.py +0 -182
  39. deltacat/compute/metastats/utils/__init__.py +0 -0
  40. deltacat/compute/metastats/utils/constants.py +0 -16
  41. deltacat/compute/metastats/utils/io.py +0 -223
  42. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +0 -18
  43. deltacat/compute/metastats/utils/ray_utils.py +0 -129
  44. deltacat/compute/stats/basic.py +0 -226
  45. deltacat/compute/stats/models/__init__.py +0 -0
  46. deltacat/compute/stats/models/delta_column_stats.py +0 -98
  47. deltacat/compute/stats/models/delta_stats.py +0 -233
  48. deltacat/compute/stats/models/delta_stats_cache_result.py +0 -49
  49. deltacat/compute/stats/models/manifest_entry_stats.py +0 -72
  50. deltacat/compute/stats/models/stats_result.py +0 -104
  51. deltacat/compute/stats/utils/__init__.py +0 -0
  52. deltacat/compute/stats/utils/intervals.py +0 -94
  53. deltacat/compute/stats/utils/io.py +0 -230
  54. deltacat/compute/stats/utils/manifest_stats_file.py +0 -100
  55. deltacat/tests/stats/__init__.py +0 -0
  56. deltacat/tests/stats/test_intervals.py +0 -49
  57. /deltacat/{compute/metastats → tests/compute/resource_estimation}/__init__.py +0 -0
  58. /deltacat/{compute/metastats/config → tests/compute/resource_estimation/data}/__init__.py +0 -0
  59. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/LICENSE +0 -0
  60. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/WHEEL +0 -0
  61. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/top_level.txt +0 -0
deltacat/compute/metastats/stats.py (deleted)
@@ -1,182 +0,0 @@
- import logging
- from collections import defaultdict
- from typing import Dict, List, Optional
-
- import ray
- from ray.types import ObjectRef
-
- from deltacat import logs
- from deltacat.aws import s3u as s3_utils
- from deltacat.aws.clients import client_cache
- from deltacat.aws.constants import AWS_REGION
- from deltacat.compute.compactor import DeltaAnnotated
- from deltacat.compute.metastats.utils.io import (
-     cache_inflation_rate_data_for_delta_stats_ready,
-     cache_partition_stats_to_s3,
-     collect_stats_by_columns,
- )
- from deltacat.compute.stats.models.delta_column_stats import DeltaColumnStats
- from deltacat.compute.stats.models.delta_stats import DeltaStats
- from deltacat.compute.stats.models.manifest_entry_stats import ManifestEntryStats
- from deltacat.compute.stats.models.stats_result import StatsResult
- from deltacat.storage import DeltaLocator, PartitionLocator
- from deltacat.storage import interface as unimplemented_deltacat_storage
-
- logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
-
- # TODO: get cpu info from ray.nodes() resource key
- DEFAULT_CPUS_STATS_CLUSTER_INSTANCE = 32
-
-
- def start_stats_collection(
-     batched_delta_stats_compute_list: List[DeltaAnnotated],
-     columns: List[str],
-     stat_results_s3_bucket: Optional[str] = None,
-     metastats_results_s3_bucket: Optional[str] = None,
-     deltacat_storage=unimplemented_deltacat_storage,
-     **kwargs,
- ) -> Dict[str, List[DeltaStats]]:
-     """Collects statistics on deltas, given a set of delta stream position ranges.
-     Example:
-         >>> collect(locator, set((1, 5), (4, 8), (13, 16)))
-         {
-             1: DeltaStats(),  # DeltaStats for stream positions 1 - 8
-             13: DeltaStats()  # DeltaStats for stream positions 13 - 16
-         }
-     Args:
-         source_partition_locator: Reference to the partition locator tied to the given delta stream positions
-         delta_stream_position_range_set: A set of intervals with an int type representing finite,
-             closed bounded values, and a None type representing unbounded infinity.
-         columns: Columns can be optionally included to collect stats on specific columns.
-             By default, all columns will be calculated.
-         stat_results_s3_bucket: Used as a cache file storage for computed delta stats
-         metastats_results_s3_bucket: Used as cache file storage for inflation rate meta stats
-         deltacat_storage: Client implementation of the DeltaCAT storage interface
-     Returns:
-         A mapping of stream positions to their corresponding delta stats.
-     """
-     # TODO: Add CompactionEventDispatcher for stats collection started event
-     delta_stats_compute_pending: List[ObjectRef[Dict[str, List[StatsResult, int]]]] = []
-
-     for batched_deltas in batched_delta_stats_compute_list:
-         splitted_annotated_deltas = DeltaAnnotated.split(
-             batched_deltas, DEFAULT_CPUS_STATS_CLUSTER_INSTANCE
-         )
-         for splitted_annotated_delta in splitted_annotated_deltas:
-             delta_stats_compute_pending.append(
-                 collect_stats_by_columns.remote(
-                     splitted_annotated_delta, columns, deltacat_storage
-                 )
-             )
-
-     column_stats_map = _process_stats(delta_stats_compute_pending)
-
-     if not batched_delta_stats_compute_list:
-         logger.info("No new delta need stats collection")
-     else:
-         (
-             delta_stream_range_stats,
-             partition_canonical_string,
-         ) = resolve_annotated_delta_stats_to_original_deltas_stats(
-             column_stats_map, columns, batched_delta_stats_compute_list[0]
-         )
-
-         _cache_stats_res_to_s3(
-             stat_results_s3_bucket, delta_stream_range_stats, partition_canonical_string
-         )
-
-         base_path = s3_utils.parse_s3_url(metastats_results_s3_bucket).url
-         inflation_rate_stats_s3_url = f"{base_path}/inflation-rates.json"
-         cache_inflation_rate_data_for_delta_stats_ready(
-             delta_stream_range_stats, inflation_rate_stats_s3_url, deltacat_storage
-         )
-         # TODO: Add CompactionEventDispatcher for stats collection completed event
-         return delta_stream_range_stats
-
-
- def _get_account_id() -> str:
-     client = client_cache("sts", region_name=AWS_REGION)
-     account_id = client.get_caller_identity()["Account"]
-     return account_id
-
-
- def _process_stats(
-     delta_stats_compute_pending: List[ObjectRef[DeltaStats]],
- ) -> List[DeltaStats]:
-     delta_stats_processed_list: List[DeltaStats] = _resolve_pending_stats(
-         delta_stats_compute_pending
-     )
-
-     return delta_stats_processed_list
-
-
- def _resolve_pending_stats(
-     delta_stats_pending_list: List[ObjectRef[DeltaStats]],
- ) -> List[DeltaStats]:
-     delta_stats_processed_list: List[DeltaStats] = []
-
-     while delta_stats_pending_list:
-         ready, delta_stats_pending_list = ray.wait(delta_stats_pending_list)
-         processed_stats_batch: List[DeltaStats] = ray.get(ready)
-         delta_stats_processed_list.extend(processed_stats_batch)
-
-     return delta_stats_processed_list
-
-
- def _cache_stats_res_to_s3(
-     stat_results_s3_bucket, delta_stream_range_stats, partition_canonical_string
- ):
-     if stat_results_s3_bucket:
-         # Cache the stats into the file store
-         cache_partition_stats_to_s3(
-             stat_results_s3_bucket, delta_stream_range_stats, partition_canonical_string
-         )
-
-
- def resolve_annotated_delta_stats_to_original_deltas_stats(
-     column_stats_map, column_names, delta_annotated
- ) -> Dict[int, DeltaStats]:
-
-     partition_values = delta_annotated["deltaLocator"]["partitionLocator"][
-         "partitionValues"
-     ]
-     partition_id = delta_annotated["deltaLocator"]["partitionLocator"]["partitionId"]
-     stream_locator = delta_annotated["deltaLocator"]["partitionLocator"][
-         "streamLocator"
-     ]
-     partition_locator = PartitionLocator.of(
-         stream_locator, partition_values, partition_id
-     )
-
-     # Dict[stream_position: List[StatsResult]]
-     manifest_column_stats_list = defaultdict(lambda: [])
-     for i in range(len(column_stats_map)):
-         for column_name in column_names:
-             for j in range(len(column_stats_map[i][column_name])):
-                 manifest_column_stats_list[
-                     column_stats_map[i][column_name][j][1]
-                 ].append([column_stats_map[i][column_name][j][0], column_name])
-
-     stats_res: Dict[int, List[DeltaStats]] = {}
-     for key, value in manifest_column_stats_list.items():
-         delta_locator = DeltaLocator.of(partition_locator, key)
-
-         # Dict[column_name: List[StatsResult]]
-         manifest_stats_list = defaultdict(lambda: [])
-         for manifest_stat in value:
-             manifest_stats_list[manifest_stat[1]].append(manifest_stat[0])
-         delta_ds_column_stats: List[DeltaColumnStats] = []
-         for column_name, column_manifest_stats_list in manifest_stats_list.items():
-
-             column_manifest_stats = ManifestEntryStats.of(
-                 column_manifest_stats_list, delta_locator
-             )
-             dataset_column_stats = DeltaColumnStats.of(
-                 column_name, column_manifest_stats
-             )
-             delta_ds_column_stats.append(dataset_column_stats)
-
-         dataset_stats: DeltaStats = DeltaStats.of(delta_ds_column_stats)
-         stats_res[key] = dataset_stats
-
-     return stats_res, partition_locator.canonical_string()
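The removed module above fans stats work out as Ray tasks (`collect_stats_by_columns.remote`) and then drains the pending `ObjectRef`s with `ray.wait` in `_resolve_pending_stats`. Below is a minimal, self-contained sketch of that fan-out/drain pattern using plain Ray only; `compute_stats` and its inputs are hypothetical stand-ins, not DeltaCAT APIs.

```python
from typing import Dict, List

import ray


@ray.remote
def compute_stats(batch: List[int]) -> Dict[str, int]:
    # Hypothetical stand-in for collect_stats_by_columns.remote(...):
    # return simple per-batch statistics instead of DeltaStats.
    return {"row_count": len(batch), "byte_count": sum(batch)}


def drain(pending: List["ray.ObjectRef"]) -> List[Dict[str, int]]:
    # Same drain loop as the removed _resolve_pending_stats: block until
    # some tasks finish, collect their results, repeat until none are pending.
    results = []
    while pending:
        ready, pending = ray.wait(pending)
        results.extend(ray.get(ready))
    return results


if __name__ == "__main__":
    ray.init()
    batches = [[1, 2, 3], [4, 5], [6, 7, 8, 9]]
    refs = [compute_stats.remote(b) for b in batches]
    print(drain(refs))
```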
deltacat/compute/metastats/utils/constants.py (deleted)
@@ -1,16 +0,0 @@
- # Default to use r5.8xlarge instance type for stats collection cluster
- STATS_CLUSTER_R5_INSTANCE_TYPE = 8
- # Using R5 instance type, 8GiB memory is available per cpu
- R5_MEMORY_PER_CPU = 8
- # Default to use r5.8xlarge instance type for stats collection cluster
- DEFAULT_CPUS_PER_INSTANCE_R5_8XLARGE = 32
- # memory reserved for head node object store
- HEAD_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO = 0.3
- # memory reserved for worker node object store
- WORKER_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO = 0.1
- # each cpu should not be processing more than this number of files to avoid unreasonable S3 I/O latency
- MANIFEST_FILE_COUNT_PER_CPU = 200
- # MAX_WORKER_MULTIPLIER * min_workers = max_workers to determine max workers based on min workers given
- MAX_WORKER_MULTIPLIER = 2
- # default trace id used for metastats collection triggered without trace id
- DEFAULT_JOB_RUN_TRACE_ID = "0"
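These constants describe how the stats cluster was sized: at most 200 manifest files per vCPU, 32 vCPUs and 8 GiB per vCPU on an r5.8xlarge, and `max_workers = 2 * min_workers`. The sketch below is a rough illustration of how the constants compose, not the exact formula the removed meta_stats module used; `size_stats_cluster` is a hypothetical helper.

```python
import math

# Values copied from the removed constants module.
R5_MEMORY_PER_CPU = 8                      # GiB available per vCPU on r5 instances
DEFAULT_CPUS_PER_INSTANCE_R5_8XLARGE = 32  # vCPUs on r5.8xlarge
MANIFEST_FILE_COUNT_PER_CPU = 200          # max files one vCPU should scan
MAX_WORKER_MULTIPLIER = 2                  # max_workers = 2 * min_workers


def size_stats_cluster(manifest_file_count: int) -> dict:
    """Rough sizing illustration for a given number of manifest files."""
    cpus_needed = math.ceil(manifest_file_count / MANIFEST_FILE_COUNT_PER_CPU)
    min_workers = max(1, math.ceil(cpus_needed / DEFAULT_CPUS_PER_INSTANCE_R5_8XLARGE))
    max_workers = min_workers * MAX_WORKER_MULTIPLIER
    memory_per_instance_gib = DEFAULT_CPUS_PER_INSTANCE_R5_8XLARGE * R5_MEMORY_PER_CPU
    return {
        "min_workers": min_workers,
        "max_workers": max_workers,
        "memory_per_instance_gib": memory_per_instance_gib,
    }


# 50,000 manifest files -> 250 vCPUs -> 8 min workers, 16 max workers, 256 GiB per instance.
print(size_stats_cluster(50_000))
```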
deltacat/compute/metastats/utils/io.py (deleted)
@@ -1,223 +0,0 @@
- import json
- import logging
- from collections import defaultdict
- from typing import Any, Dict, List, Optional
-
- import pyarrow
- import ray
-
- from deltacat import LocalTable, TableType, logs
- from deltacat.aws import s3u as s3_utils
- from deltacat.compute.compactor import DeltaAnnotated
- from deltacat.compute.metastats.model.partition_stats_dict import PartitionStats
- from deltacat.compute.stats.models.delta_column_stats import DeltaColumnStats
- from deltacat.compute.stats.models.delta_stats import DeltaStats, DeltaStatsCacheMiss
- from deltacat.compute.stats.models.delta_stats_cache_result import DeltaStatsCacheResult
- from deltacat.compute.stats.models.stats_result import StatsResult
- from deltacat.storage import Delta
- from deltacat.storage import interface as unimplemented_deltacat_storage
- from deltacat.utils.common import sha1_hexdigest
-
- logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
-
-
- def cache_inflation_rate_data_for_delta_stats_ready(
-     delta_stats_processed_list, inflation_rate_stats_s3_url, deltacat_storage
- ):
-     meta_stats_processed_list: Dict[int, int] = {}
-
-     for key, value in delta_stats_processed_list.items():
-         delta_locator = value.column_stats[0].manifest_stats.delta_locator
-         delta_meta_count = 0
-         manifest = deltacat_storage.get_delta_manifest(delta_locator)
-         delta = Delta.of(delta_locator, None, None, None, manifest)
-         for entry in delta.manifest.entries:
-             delta_meta_count += entry.meta.content_length
-         meta_stats_processed_list[delta.stream_position] = delta_meta_count
-
-     cache_inflation_rate_res = dict()
-
-     for key, value in delta_stats_processed_list.items():
-         delta_stats_pyarrow_bytes_sum = 0
-         delta_stats_row_count = 0
-         for column_stats in (
-             delta_stats_processed_list[key].column_stats[0].manifest_stats.stats
-         ):
-             delta_stats_row_count += column_stats.get("rowCount")
-         for stats in delta_stats_processed_list[key].get("column_stats"):
-
-             delta_stats_pyarrow_bytes_sum += stats.get("stats").get("pyarrowTableBytes")
-         cache_inflation_rate_res[key] = [
-             meta_stats_processed_list[key],
-             delta_stats_row_count,
-             delta_stats_pyarrow_bytes_sum,
-         ]
-
-     if inflation_rate_stats_s3_url:
-         logger.warning(
-             f"reading previous inflation rate stats from: {inflation_rate_stats_s3_url}"
-         )
-
-         result = s3_utils.download(inflation_rate_stats_s3_url, fail_if_not_found=False)
-
-         prev_inflation_rate_stats = dict()
-         if result:
-             json_str = result["Body"].read().decode("utf-8")
-             prev_inflation_rate_stats_read = json.loads(json_str)
-             prev_inflation_rate_stats = (
-                 prev_inflation_rate_stats_read
-                 if prev_inflation_rate_stats_read
-                 else dict()
-             )
-             logger.debug(
-                 f"read stats completion info: {prev_inflation_rate_stats_read}"
-             )
-         logger.debug(
-             f"writing inflation rate info to S3: {inflation_rate_stats_s3_url}"
-         )
-         prev_inflation_rate_stats.update(cache_inflation_rate_res)
-         logger.debug(
-             f"writing current inflation rate info to S3: {prev_inflation_rate_stats}"
-         )
-         s3_utils.upload(
-             inflation_rate_stats_s3_url, json.dumps(prev_inflation_rate_stats)
-         )
-     else:
-         logger.warning(
-             f"No valid s3 url received to cache inflation rate stats, got {inflation_rate_stats_s3_url}"
-         )
-
-
- def read_cached_partition_stats(
-     partition_canonical_string: str, stat_results_s3_bucket: str
- ):
-     partition_stats_url = get_partition_stats_s3_url(
-         partition_canonical_string, stat_results_s3_bucket
-     )
-     logger.info(f"reading partition stats completion file from: {partition_stats_url}")
-
-     result = s3_utils.download(partition_stats_url, fail_if_not_found=False)
-     delta_stats_cache_res_map: Dict[int, List[DeltaStatsCacheResult]] = {}
-     if result:
-         json_str = result["Body"].read().decode("utf-8")
-         partition_stats_str = json.loads(json_str)
-         delta_stats_cache_res_map = get_delta_stats_from_partition_stats(
-             partition_stats_str
-         )
-
-     return delta_stats_cache_res_map
-
-
- def get_partition_stats_s3_url(
-     partition_canonical_string: str, stat_results_s3_bucket: str
- ):
-     stats_partition_canonical_string = f"{partition_canonical_string}"
-     stats_partition_hexdigest = sha1_hexdigest(
-         stats_partition_canonical_string.encode("utf-8")
-     )
-     base_path = s3_utils.parse_s3_url(stat_results_s3_bucket).url
-
-     return f"{base_path}/{stats_partition_hexdigest}.json"
-
-
- def get_delta_stats_from_partition_stats(partition_stats_str: str):
-
-     partition_stats = PartitionStats.build_from_dict(partition_stats_str)
-
-     found_columns_stats_map: Dict[int, List[DeltaStatsCacheResult]] = {}
-     for stream_position, delta_stats in partition_stats.delta_stats.items():
-         found_columns_stats: List[DeltaColumnStats] = []
-         missed_columns: List[str] = []
-         for cs in delta_stats.column_stats:
-             if cs.manifest_stats:
-                 found_columns_stats.append(cs)
-             else:
-                 missed_columns.append(cs.column)
-
-         delta_locator = delta_stats.column_stats[0].manifest_stats.delta_locator
-         found_stats: Optional[DeltaStats] = (
-             DeltaStats.of(found_columns_stats) if found_columns_stats else None
-         )
-         missed_stats: Optional[DeltaStatsCacheMiss] = (
-             DeltaStatsCacheMiss(missed_columns, delta_locator)
-             if missed_columns
-             else None
-         )
-         delta_stats_cache_res = DeltaStatsCacheResult.of(found_stats, missed_stats)
-         found_columns_stats_map[int(stream_position)] = delta_stats_cache_res
-     return found_columns_stats_map
-
-
- def cache_partition_stats_to_s3(
-     stat_results_s3_bucket, delta_stream_range_stats, partition_canonical_string
- ):
-     partition_stats = PartitionStats.of(
-         delta_stream_range_stats, partition_canonical_string
-     )
-     logger.info(f"writing partition stats completion for {partition_canonical_string}")
-     partition_stats_completion_file_s3_url = get_partition_stats_s3_url(
-         partition_canonical_string, stat_results_s3_bucket
-     )
-     s3_utils.upload(
-         partition_stats_completion_file_s3_url, str(json.dumps(partition_stats))
-     )
-     logger.debug(
-         f"stats completion file written to: {partition_stats_completion_file_s3_url}"
-     )
-
-
- @ray.remote
- def collect_stats_by_columns(
-     delta_annotated: DeltaAnnotated,
-     columns_to_compute: Optional[List[str]] = None,
-     deltacat_storage=unimplemented_deltacat_storage,
-     deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
- ) -> Dict[str, Any]:
-     """Materializes one manifest entry at a time to save memory usage and calculate stats from each of its columns.
-
-     Args:
-         delta: A delta object to calculate stats for
-         columns_to_compute: Columns to calculate stats for. If not provided, all columns are considered.
-         deltacat_storage: Client implementation of the DeltaCAT storage interface
-
-     Returns:
-         A delta wide stats container
-     """
-     if deltacat_storage_kwargs is None:
-         deltacat_storage_kwargs = {}
-     total_tables_size = 0
-
-     # Mapping of column_name -> [stats_file_idx_1, stats_file_idx_2, ... stats_file_idx_n]
-     column_stats_map = defaultdict(
-         lambda: [[None, None]] * len(delta_annotated["manifest"].get("entries"))
-     )
-     src_da_entries = delta_annotated["manifest"].get("entries")
-     manifest_annotations = delta_annotated["annotations"]
-     for file_idx, manifest in enumerate(src_da_entries):
-         entry_pyarrow_table: LocalTable = (
-             deltacat_storage.download_delta_manifest_entry(
-                 delta_annotated,
-                 file_idx,
-                 TableType.PYARROW,
-                 columns_to_compute,
-                 equivalent_table_types="uncompacted",
-                 **deltacat_storage_kwargs,
-             )
-         )
-         assert isinstance(entry_pyarrow_table, pyarrow.Table), (
-             f"Stats collection is only supported for PyArrow tables, but received a table of "
-             f"type '{type(entry_pyarrow_table)}' for manifest entry {file_idx} of delta: {delta_annotated.locator}."
-         )
-         total_tables_size += entry_pyarrow_table.nbytes
-         if not columns_to_compute:
-             columns_to_compute = entry_pyarrow_table.column_names
-
-         for column_idx, pyarrow_column in enumerate(entry_pyarrow_table.columns):
-             column_name = columns_to_compute[column_idx]
-             origin_delta_stream_position = manifest_annotations[file_idx][-1]
-             column_stats_map[column_name][file_idx] = [
-                 StatsResult.of(len(pyarrow_column), pyarrow_column.nbytes),
-                 origin_delta_stream_position,
-             ]
-
-     return column_stats_map
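The inflation-rate record this module cached per stream position is a triple of on-disk content length (summed from manifest entries), row count, and in-memory PyArrow bytes ("pyarrowTableBytes"). Below is a standalone sketch of the same measurement for a single Parquet file using plain PyArrow, with no DeltaCAT manifests; the table contents and temp path are illustrative only.

```python
import os
import tempfile

import pyarrow as pa
import pyarrow.parquet as pq

# Build a small table and write it to Parquet so we can compare the
# compressed on-disk size with the decoded in-memory Arrow size.
table = pa.table(
    {"id": list(range(100_000)), "value": [i * 0.5 for i in range(100_000)]}
)

with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, "part-0.parquet")
    pq.write_table(table, path)

    content_length = os.path.getsize(path)  # analogous to entry.meta.content_length
    pyarrow_table_bytes = table.nbytes       # analogous to pyarrowTableBytes
    row_count = table.num_rows               # analogous to the summed rowCount

    # Inflation rate: decoded Arrow bytes per byte of on-disk content.
    print(row_count, content_length, pyarrow_table_bytes,
          round(pyarrow_table_bytes / content_length, 2))
```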
deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py (deleted)
@@ -1,18 +0,0 @@
- from deltacat.constants import PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS
-
-
- def estimation_function(
-     content_length, content_type, content_encoding, *args, **kwargs
- ):
-     # TODO(zyiqin): update the estimation here to be consistent with number of required worker nodes estimate.
-     # Current implementation is only a rough guess using the PYARROW_INFLATION_MULTIPLIER(content_length to pyarrow butes(all columns).
-     # The full implementation logic should be:
-     # 1. liner regression with a confidence level: pull metastats data for all deltas for this partition if len(datapoints) > 30.
-     # 2. if not enough previous stats collected for same partition: Fall back to datapoints for all paritions for same table.
-     # 3. If not enough stats collected for this table: use average content length to each content_type and content_encoding inflation rates
-     # 4. If not enough stats for this content_type and content_encoding combination: use the basic PYARROW_INFLATION_MULTIPLIER instead.
-
-     if content_length:
-         return content_length * PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS
-     else:
-         return 0
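The removed estimator is just a linear scale of a manifest entry's on-disk content length. A worked illustration follows; the multiplier value below is an assumed placeholder for this example, not the real PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS from deltacat.constants.

```python
# Assumed placeholder value; the real constant lives in deltacat.constants.
PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS = 4.0


def estimated_pyarrow_bytes(content_length: int) -> float:
    # Mirrors the removed estimation_function: a linear guess, returning 0
    # for entries whose content length is missing.
    if content_length:
        return content_length * PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS
    return 0


# With the assumed 4x multiplier, a 256 MiB entry is estimated at ~1 GiB in Arrow.
print(estimated_pyarrow_bytes(256 * 1024 * 1024) / (1024 ** 3))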
deltacat/compute/metastats/utils/ray_utils.py (deleted)
@@ -1,129 +0,0 @@
- import errno
- import logging
- import os
- import subprocess
- from subprocess import run
- from typing import Any
-
- import ray
- from tenacity import RetryError, Retrying, stop_after_attempt, wait_fixed
-
- from deltacat import logs
- from deltacat.compute.metastats.utils.constants import (
-     MAX_WORKER_MULTIPLIER,
-     WORKER_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO,
- )
-
- logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
-
- RAY_DOWN_DEFAULT_RETRY_ATTEMPTS = 3
-
-
- def run_cmd_exit_code(cmd: str) -> int:
-     logger.info(f"running command {cmd}")
-     exit_code = int(os.system(cmd))
-     logger.info(f"Got {exit_code} when running {cmd}")
-
-
- def run_cmd_with_retry(cmd: str) -> None:
-     retrying = Retrying(
-         wait=wait_fixed(2), stop=stop_after_attempt(RAY_DOWN_DEFAULT_RETRY_ATTEMPTS)
-     )
-     try:
-         retrying(run_cmd_exit_code(cmd))
-     except RetryError:
-         logger.info(f"{cmd} failed after {RAY_DOWN_DEFAULT_RETRY_ATTEMPTS} retries.")
-
-
- def run_cmd(cmd: str) -> None:
-     result = run(cmd, shell=True, capture_output=True)
-     exit_code = int(result.returncode)
-     assert exit_code == 0, (
-         f"`{cmd}` failed. Exit code: {exit_code} " f"Error Trace: {result.stderr}"
-     )
-
-
- def ray_up(cluster_cfg: str) -> None:
-     logger.info(f"Starting Ray cluster '{cluster_cfg}'")
-     run_cmd(f"ray up {cluster_cfg} -y --no-config-cache --no-restart")
-     logger.info(f"Started Ray cluster '{cluster_cfg}'")
-
-
- def ray_down(cluster_cfg: str) -> None:
-     logger.info(f"Destroying Ray cluster '{cluster_cfg}'")
-     run_cmd_with_retry(f"ray down {cluster_cfg} -y")
-     logger.info(f"Destroyed Ray cluster '{cluster_cfg}'")
-
-
- def clean_up_cluster_cfg_file(cluster_cfg) -> None:
-     logger.info(f"Removing stats cluster config at: '{cluster_cfg}'")
-     run_cmd(f"rm -f {cluster_cfg}")
-     logger.info(f"Removed stats cluster config at: '{cluster_cfg}'")
-
-
- def get_head_node_ip(cluster_cfg: str) -> str:
-     logger.info(f"Getting Ray cluster head node IP for '{cluster_cfg}'")
-     proc = subprocess.run(
-         f"ray get-head-ip {cluster_cfg}",
-         shell=True,
-         capture_output=True,
-         text=True,
-         check=True,
-     )
-     # the head node IP should be the last line printed to stdout
-     head_node_ip = proc.stdout.splitlines()[-1]
-     logger.info(f"Ray cluster head node IP for '{cluster_cfg}': {head_node_ip}")
-     return head_node_ip
-
-
- def ray_init(host, port) -> Any:
-     ray_init_uri = f"ray://{host}:{port}"
-     logger.info(f"Connecting Ray Client to '{ray_init_uri}'")
-     client = ray.init(ray_init_uri, allow_multiple=True)
-     logger.info(f"Connected Ray Client to '{ray_init_uri}'")
-     return client
-
-
- def replace_cluster_cfg_vars(
-     partition_canonical_string: str,
-     trace_id: str,
-     file_path: str,
-     min_workers: int,
-     head_type: str,
-     worker_type: str,
-     head_object_store_memory_pct: int,
-     worker_object_store_memory_pct: int,
- ) -> str:
-
-     head_object_store_memory_pct = head_object_store_memory_pct if not None else 30
-     worker_object_store_memory_pct = WORKER_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO * 100
-
-     max_workers = int(min_workers * MAX_WORKER_MULTIPLIER)
-     with open(file_path, "r+") as file:
-         contents = file.read().replace("{{use-internal-ips}}", "True")
-         contents = contents.replace(
-             "{{partition_canonical_string}}", partition_canonical_string
-         )
-         contents = contents.replace("'{{trace_id}}'", trace_id)
-         contents = contents.replace("'{{min-workers}}'", str(min_workers))
-         contents = contents.replace("'{{max-workers}}'", str(max_workers))
-         contents = contents.replace("'{{head-instance-type}}'", head_type)
-         contents = contents.replace("'{{worker-instance-type}}'", worker_type)
-         contents = contents.replace(
-             "'{{head-object-store-memory-pct}}'", str(head_object_store_memory_pct)
-         )
-         contents = contents.replace(
-             "'{{worker-object-store-memory-pct}}'", str(worker_object_store_memory_pct)
-         )
-     partition_id = partition_canonical_string.split("|")[-1]
-     out_file_name = f"{trace_id}-{partition_id}.{os.path.basename(file_path)}"
-     out_file_dir = os.path.join(os.path.dirname(file_path), "tmp")
-     out_file_path = os.path.join(out_file_dir, out_file_name)
-     try:
-         os.makedirs(os.path.dirname(out_file_path))
-     except OSError as e:
-         if e.errno != errno.EEXIST:
-             raise
-     with open(out_file_path, "w") as output:
-         output.write(contents)
-     return out_file_path
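The removed `run_cmd_with_retry` wraps a shell command in tenacity's `Retrying` so that `ray down` is attempted a few times before giving up. Below is a minimal standalone sketch of that retry pattern; note that a `Retrying` instance is invoked with the callable and its arguments passed separately, and that the command here is only a placeholder, not a real cluster teardown.

```python
import subprocess

from tenacity import RetryError, Retrying, stop_after_attempt, wait_fixed

RAY_DOWN_DEFAULT_RETRY_ATTEMPTS = 3


def run_cmd(cmd: str) -> None:
    # Raise on a non-zero exit code so tenacity has an exception to retry on.
    subprocess.run(cmd, shell=True, check=True, capture_output=True)


def run_cmd_with_retry(cmd: str) -> None:
    # Re-invokes run_cmd(cmd) up to 3 times, waiting 2 seconds between tries;
    # after the attempts are exhausted tenacity raises RetryError.
    retryer = Retrying(
        wait=wait_fixed(2), stop=stop_after_attempt(RAY_DOWN_DEFAULT_RETRY_ATTEMPTS)
    )
    try:
        retryer(run_cmd, cmd)
    except RetryError:
        print(f"{cmd} failed after {RAY_DOWN_DEFAULT_RETRY_ATTEMPTS} attempts")


# Placeholder usage; in the removed module the command was f"ray down {cluster_cfg} -y".
run_cmd_with_retry("echo tearing down cluster")
```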