deltacat 1.1.17__py3-none-any.whl → 1.1.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/constants.py +0 -1
- deltacat/compute/compactor/model/compact_partition_params.py +76 -0
- deltacat/compute/compactor/model/compaction_session_audit_info.py +26 -0
- deltacat/compute/compactor/model/delta_annotated.py +16 -9
- deltacat/compute/compactor_v2/constants.py +3 -0
- deltacat/compute/compactor_v2/private/compaction_utils.py +9 -5
- deltacat/compute/compactor_v2/utils/content_type_params.py +185 -34
- deltacat/compute/compactor_v2/utils/io.py +28 -14
- deltacat/compute/compactor_v2/utils/primary_key_index.py +9 -4
- deltacat/compute/compactor_v2/utils/task_options.py +128 -183
- deltacat/compute/resource_estimation/__init__.py +27 -0
- deltacat/compute/resource_estimation/delta.py +271 -0
- deltacat/compute/resource_estimation/manifest.py +394 -0
- deltacat/compute/resource_estimation/model.py +165 -0
- deltacat/compute/resource_estimation/parquet.py +108 -0
- deltacat/constants.py +5 -0
- deltacat/exceptions.py +2 -4
- deltacat/logs.py +8 -0
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +77 -0
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +308 -0
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +159 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +157 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +3 -3
- deltacat/tests/compute/resource_estimation/test_delta.py +605 -0
- deltacat/tests/compute/resource_estimation/test_manifest.py +921 -0
- deltacat/tests/compute/test_compact_partition_rebase.py +13 -4
- deltacat/tests/compute/test_util_common.py +2 -0
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -5
- deltacat/tests/test_logs.py +34 -0
- deltacat/tests/test_utils/pyarrow.py +15 -5
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/METADATA +2 -2
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/RECORD +38 -54
- deltacat/compute/metastats/meta_stats.py +0 -479
- deltacat/compute/metastats/model/__init__.py +0 -0
- deltacat/compute/metastats/model/partition_stats_dict.py +0 -34
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +0 -68
- deltacat/compute/metastats/stats.py +0 -182
- deltacat/compute/metastats/utils/__init__.py +0 -0
- deltacat/compute/metastats/utils/constants.py +0 -16
- deltacat/compute/metastats/utils/io.py +0 -223
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +0 -18
- deltacat/compute/metastats/utils/ray_utils.py +0 -129
- deltacat/compute/stats/basic.py +0 -226
- deltacat/compute/stats/models/__init__.py +0 -0
- deltacat/compute/stats/models/delta_column_stats.py +0 -98
- deltacat/compute/stats/models/delta_stats.py +0 -233
- deltacat/compute/stats/models/delta_stats_cache_result.py +0 -49
- deltacat/compute/stats/models/manifest_entry_stats.py +0 -72
- deltacat/compute/stats/models/stats_result.py +0 -104
- deltacat/compute/stats/utils/__init__.py +0 -0
- deltacat/compute/stats/utils/intervals.py +0 -94
- deltacat/compute/stats/utils/io.py +0 -230
- deltacat/compute/stats/utils/manifest_stats_file.py +0 -100
- deltacat/tests/stats/__init__.py +0 -0
- deltacat/tests/stats/test_intervals.py +0 -49
- /deltacat/{compute/metastats → tests/compute/resource_estimation}/__init__.py +0 -0
- /deltacat/{compute/metastats/config → tests/compute/resource_estimation/data}/__init__.py +0 -0
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/LICENSE +0 -0
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/WHEEL +0 -0
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/top_level.txt +0 -0
deltacat/compute/metastats/stats.py

```diff
@@ -1,182 +0,0 @@
-import logging
-from collections import defaultdict
-from typing import Dict, List, Optional, Tuple
-
-import ray
-from ray.types import ObjectRef
-
-from deltacat import logs
-from deltacat.aws import s3u as s3_utils
-from deltacat.aws.clients import client_cache
-from deltacat.aws.constants import AWS_REGION
-from deltacat.compute.compactor import DeltaAnnotated
-from deltacat.compute.metastats.utils.io import (
-    cache_inflation_rate_data_for_delta_stats_ready,
-    cache_partition_stats_to_s3,
-    collect_stats_by_columns,
-)
-from deltacat.compute.stats.models.delta_column_stats import DeltaColumnStats
-from deltacat.compute.stats.models.delta_stats import DeltaStats
-from deltacat.compute.stats.models.manifest_entry_stats import ManifestEntryStats
-from deltacat.compute.stats.models.stats_result import StatsResult
-from deltacat.storage import DeltaLocator, PartitionLocator
-from deltacat.storage import interface as unimplemented_deltacat_storage
-
-logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
-
-# TODO: get cpu info from ray.nodes() resource key
-DEFAULT_CPUS_STATS_CLUSTER_INSTANCE = 32
-
-
-def start_stats_collection(
-    batched_delta_stats_compute_list: List[DeltaAnnotated],
-    columns: List[str],
-    stat_results_s3_bucket: Optional[str] = None,
-    metastats_results_s3_bucket: Optional[str] = None,
-    deltacat_storage=unimplemented_deltacat_storage,
-    **kwargs,
-) -> Dict[int, DeltaStats]:
-    """Collects statistics on deltas, given a set of delta stream position ranges.
-
-    Example:
-        >>> collect(locator, {(1, 5), (4, 8), (13, 16)})
-        {
-            1: DeltaStats(),   # DeltaStats for stream positions 1 - 8
-            13: DeltaStats(),  # DeltaStats for stream positions 13 - 16
-        }
-
-    Args:
-        batched_delta_stats_compute_list: Batched annotated deltas to collect stats for.
-        columns: Columns can be optionally included to collect stats on specific columns.
-            By default, stats will be calculated for all columns.
-        stat_results_s3_bucket: Used as cache file storage for computed delta stats.
-        metastats_results_s3_bucket: Used as cache file storage for inflation rate meta stats.
-        deltacat_storage: Client implementation of the DeltaCAT storage interface.
-
-    Returns:
-        A mapping of stream positions to their corresponding delta stats.
-    """
-    # TODO: Add CompactionEventDispatcher for stats collection started event
-    delta_stats_compute_pending: List[
-        ObjectRef[Dict[str, List[Tuple[StatsResult, int]]]]
-    ] = []
-
-    for batched_deltas in batched_delta_stats_compute_list:
-        splitted_annotated_deltas = DeltaAnnotated.split(
-            batched_deltas, DEFAULT_CPUS_STATS_CLUSTER_INSTANCE
-        )
-        for splitted_annotated_delta in splitted_annotated_deltas:
-            delta_stats_compute_pending.append(
-                collect_stats_by_columns.remote(
-                    splitted_annotated_delta, columns, deltacat_storage
-                )
-            )
-
-    column_stats_map = _process_stats(delta_stats_compute_pending)
-
-    if not batched_delta_stats_compute_list:
-        logger.info("No new deltas need stats collection")
-    else:
-        (
-            delta_stream_range_stats,
-            partition_canonical_string,
-        ) = resolve_annotated_delta_stats_to_original_deltas_stats(
-            column_stats_map, columns, batched_delta_stats_compute_list[0]
-        )
-
-        _cache_stats_res_to_s3(
-            stat_results_s3_bucket, delta_stream_range_stats, partition_canonical_string
-        )
-
-        base_path = s3_utils.parse_s3_url(metastats_results_s3_bucket).url
-        inflation_rate_stats_s3_url = f"{base_path}/inflation-rates.json"
-        cache_inflation_rate_data_for_delta_stats_ready(
-            delta_stream_range_stats, inflation_rate_stats_s3_url, deltacat_storage
-        )
-        # TODO: Add CompactionEventDispatcher for stats collection completed event
-        return delta_stream_range_stats
-
-
-def _get_account_id() -> str:
-    client = client_cache("sts", region_name=AWS_REGION)
-    account_id = client.get_caller_identity()["Account"]
-    return account_id
-
-
-def _process_stats(
-    delta_stats_compute_pending: List[ObjectRef[DeltaStats]],
-) -> List[DeltaStats]:
-    delta_stats_processed_list: List[DeltaStats] = _resolve_pending_stats(
-        delta_stats_compute_pending
-    )
-
-    return delta_stats_processed_list
-
-
-def _resolve_pending_stats(
-    delta_stats_pending_list: List[ObjectRef[DeltaStats]],
-) -> List[DeltaStats]:
-    delta_stats_processed_list: List[DeltaStats] = []
-
-    while delta_stats_pending_list:
-        ready, delta_stats_pending_list = ray.wait(delta_stats_pending_list)
-        processed_stats_batch: List[DeltaStats] = ray.get(ready)
-        delta_stats_processed_list.extend(processed_stats_batch)
-
-    return delta_stats_processed_list
-
-
-def _cache_stats_res_to_s3(
-    stat_results_s3_bucket, delta_stream_range_stats, partition_canonical_string
-):
-    if stat_results_s3_bucket:
-        # Cache the stats into the file store
-        cache_partition_stats_to_s3(
-            stat_results_s3_bucket, delta_stream_range_stats, partition_canonical_string
-        )
-
-
-def resolve_annotated_delta_stats_to_original_deltas_stats(
-    column_stats_map, column_names, delta_annotated
-) -> Tuple[Dict[int, DeltaStats], str]:
-    partition_values = delta_annotated["deltaLocator"]["partitionLocator"][
-        "partitionValues"
-    ]
-    partition_id = delta_annotated["deltaLocator"]["partitionLocator"]["partitionId"]
-    stream_locator = delta_annotated["deltaLocator"]["partitionLocator"][
-        "streamLocator"
-    ]
-    partition_locator = PartitionLocator.of(
-        stream_locator, partition_values, partition_id
-    )
-
-    # Dict[stream_position: List[StatsResult]]
-    manifest_column_stats_list = defaultdict(list)
-    for i in range(len(column_stats_map)):
-        for column_name in column_names:
-            for j in range(len(column_stats_map[i][column_name])):
-                manifest_column_stats_list[
-                    column_stats_map[i][column_name][j][1]
-                ].append([column_stats_map[i][column_name][j][0], column_name])
-
-    stats_res: Dict[int, DeltaStats] = {}
-    for key, value in manifest_column_stats_list.items():
-        delta_locator = DeltaLocator.of(partition_locator, key)
-
-        # Dict[column_name: List[StatsResult]]
-        manifest_stats_list = defaultdict(list)
-        for manifest_stat in value:
-            manifest_stats_list[manifest_stat[1]].append(manifest_stat[0])
-        delta_ds_column_stats: List[DeltaColumnStats] = []
-        for column_name, column_manifest_stats_list in manifest_stats_list.items():
-            column_manifest_stats = ManifestEntryStats.of(
-                column_manifest_stats_list, delta_locator
-            )
-            dataset_column_stats = DeltaColumnStats.of(
-                column_name, column_manifest_stats
-            )
-            delta_ds_column_stats.append(dataset_column_stats)
-
-        dataset_stats: DeltaStats = DeltaStats.of(delta_ds_column_stats)
-        stats_res[key] = dataset_stats
-
-    return stats_res, partition_locator.canonical_string()
```
deltacat/compute/metastats/utils/constants.py

```diff
@@ -1,16 +0,0 @@
-# Default to use r5.8xlarge instance type for stats collection cluster
-STATS_CLUSTER_R5_INSTANCE_TYPE = 8
-# Using R5 instance type, 8GiB memory is available per cpu
-R5_MEMORY_PER_CPU = 8
-# Default to use r5.8xlarge instance type for stats collection cluster
-DEFAULT_CPUS_PER_INSTANCE_R5_8XLARGE = 32
-# memory reserved for head node object store
-HEAD_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO = 0.3
-# memory reserved for worker node object store
-WORKER_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO = 0.1
-# each cpu should not be processing more than this number of files to avoid unreasonable S3 I/O latency
-MANIFEST_FILE_COUNT_PER_CPU = 200
-# MAX_WORKER_MULTIPLIER * min_workers = max_workers to determine max workers based on min workers given
-MAX_WORKER_MULTIPLIER = 2
-# default trace id used for metastats collection triggered without trace id
-DEFAULT_JOB_RUN_TRACE_ID = "0"
```
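These constants combine into straightforward cluster-sizing arithmetic. A hedged worked example (the 256 GiB per instance follows from the constants themselves; the 128,000-file delta is an invented input):

```python
R5_MEMORY_PER_CPU = 8  # GiB
DEFAULT_CPUS_PER_INSTANCE_R5_8XLARGE = 32
WORKER_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO = 0.1
MANIFEST_FILE_COUNT_PER_CPU = 200
MAX_WORKER_MULTIPLIER = 2

# Per-worker memory and the slice reserved for Ray's object store.
instance_memory_gib = R5_MEMORY_PER_CPU * DEFAULT_CPUS_PER_INSTANCE_R5_8XLARGE  # 256 GiB
object_store_gib = instance_memory_gib * WORKER_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO  # 25.6 GiB

# Sizing for a hypothetical delta with 128,000 manifest files:
manifest_file_count = 128_000
cpus_needed = manifest_file_count / MANIFEST_FILE_COUNT_PER_CPU  # 640.0 vCPUs
min_workers = int(cpus_needed / DEFAULT_CPUS_PER_INSTANCE_R5_8XLARGE)  # 20 instances
max_workers = MAX_WORKER_MULTIPLIER * min_workers  # autoscaler ceiling: 40
```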
deltacat/compute/metastats/utils/io.py

```diff
@@ -1,223 +0,0 @@
-import json
-import logging
-from collections import defaultdict
-from typing import Any, Dict, List, Optional
-
-import pyarrow
-import ray
-
-from deltacat import LocalTable, TableType, logs
-from deltacat.aws import s3u as s3_utils
-from deltacat.compute.compactor import DeltaAnnotated
-from deltacat.compute.metastats.model.partition_stats_dict import PartitionStats
-from deltacat.compute.stats.models.delta_column_stats import DeltaColumnStats
-from deltacat.compute.stats.models.delta_stats import DeltaStats, DeltaStatsCacheMiss
-from deltacat.compute.stats.models.delta_stats_cache_result import DeltaStatsCacheResult
-from deltacat.compute.stats.models.stats_result import StatsResult
-from deltacat.storage import Delta
-from deltacat.storage import interface as unimplemented_deltacat_storage
-from deltacat.utils.common import sha1_hexdigest
-
-logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
-
-
-def cache_inflation_rate_data_for_delta_stats_ready(
-    delta_stats_processed_list, inflation_rate_stats_s3_url, deltacat_storage
-):
-    meta_stats_processed_list: Dict[int, int] = {}
-
-    for key, value in delta_stats_processed_list.items():
-        delta_locator = value.column_stats[0].manifest_stats.delta_locator
-        delta_meta_count = 0
-        manifest = deltacat_storage.get_delta_manifest(delta_locator)
-        delta = Delta.of(delta_locator, None, None, None, manifest)
-        for entry in delta.manifest.entries:
-            delta_meta_count += entry.meta.content_length
-        meta_stats_processed_list[delta.stream_position] = delta_meta_count
-
-    cache_inflation_rate_res = dict()
-
-    for key, value in delta_stats_processed_list.items():
-        delta_stats_pyarrow_bytes_sum = 0
-        delta_stats_row_count = 0
-        for column_stats in (
-            delta_stats_processed_list[key].column_stats[0].manifest_stats.stats
-        ):
-            delta_stats_row_count += column_stats.get("rowCount")
-        for stats in delta_stats_processed_list[key].get("column_stats"):
-            delta_stats_pyarrow_bytes_sum += stats.get("stats").get("pyarrowTableBytes")
-        cache_inflation_rate_res[key] = [
-            meta_stats_processed_list[key],
-            delta_stats_row_count,
-            delta_stats_pyarrow_bytes_sum,
-        ]
-
-    if inflation_rate_stats_s3_url:
-        logger.warning(
-            f"reading previous inflation rate stats from: {inflation_rate_stats_s3_url}"
-        )
-
-        result = s3_utils.download(inflation_rate_stats_s3_url, fail_if_not_found=False)
-
-        prev_inflation_rate_stats = dict()
-        if result:
-            json_str = result["Body"].read().decode("utf-8")
-            prev_inflation_rate_stats_read = json.loads(json_str)
-            prev_inflation_rate_stats = (
-                prev_inflation_rate_stats_read
-                if prev_inflation_rate_stats_read
-                else dict()
-            )
-            logger.debug(
-                f"read stats completion info: {prev_inflation_rate_stats_read}"
-            )
-        logger.debug(
-            f"writing inflation rate info to S3: {inflation_rate_stats_s3_url}"
-        )
-        prev_inflation_rate_stats.update(cache_inflation_rate_res)
-        logger.debug(
-            f"writing current inflation rate info to S3: {prev_inflation_rate_stats}"
-        )
-        s3_utils.upload(
-            inflation_rate_stats_s3_url, json.dumps(prev_inflation_rate_stats)
-        )
-    else:
-        logger.warning(
-            f"No valid s3 url received to cache inflation rate stats, got {inflation_rate_stats_s3_url}"
-        )
-
-
-def read_cached_partition_stats(
-    partition_canonical_string: str, stat_results_s3_bucket: str
-):
-    partition_stats_url = get_partition_stats_s3_url(
-        partition_canonical_string, stat_results_s3_bucket
-    )
-    logger.info(f"reading partition stats completion file from: {partition_stats_url}")
-
-    result = s3_utils.download(partition_stats_url, fail_if_not_found=False)
-    delta_stats_cache_res_map: Dict[int, DeltaStatsCacheResult] = {}
-    if result:
-        json_str = result["Body"].read().decode("utf-8")
-        partition_stats_str = json.loads(json_str)
-        delta_stats_cache_res_map = get_delta_stats_from_partition_stats(
-            partition_stats_str
-        )
-
-    return delta_stats_cache_res_map
-
-
-def get_partition_stats_s3_url(
-    partition_canonical_string: str, stat_results_s3_bucket: str
-):
-    stats_partition_canonical_string = f"{partition_canonical_string}"
-    stats_partition_hexdigest = sha1_hexdigest(
-        stats_partition_canonical_string.encode("utf-8")
-    )
-    base_path = s3_utils.parse_s3_url(stat_results_s3_bucket).url
-
-    return f"{base_path}/{stats_partition_hexdigest}.json"
-
-
-def get_delta_stats_from_partition_stats(partition_stats_str: str):
-    partition_stats = PartitionStats.build_from_dict(partition_stats_str)
-
-    found_columns_stats_map: Dict[int, DeltaStatsCacheResult] = {}
-    for stream_position, delta_stats in partition_stats.delta_stats.items():
-        found_columns_stats: List[DeltaColumnStats] = []
-        missed_columns: List[str] = []
-        for cs in delta_stats.column_stats:
-            if cs.manifest_stats:
-                found_columns_stats.append(cs)
-            else:
-                missed_columns.append(cs.column)
-
-        delta_locator = delta_stats.column_stats[0].manifest_stats.delta_locator
-        found_stats: Optional[DeltaStats] = (
-            DeltaStats.of(found_columns_stats) if found_columns_stats else None
-        )
-        missed_stats: Optional[DeltaStatsCacheMiss] = (
-            DeltaStatsCacheMiss(missed_columns, delta_locator)
-            if missed_columns
-            else None
-        )
-        delta_stats_cache_res = DeltaStatsCacheResult.of(found_stats, missed_stats)
-        found_columns_stats_map[int(stream_position)] = delta_stats_cache_res
-    return found_columns_stats_map
-
-
-def cache_partition_stats_to_s3(
-    stat_results_s3_bucket, delta_stream_range_stats, partition_canonical_string
-):
-    partition_stats = PartitionStats.of(
-        delta_stream_range_stats, partition_canonical_string
-    )
-    logger.info(f"writing partition stats completion for {partition_canonical_string}")
-    partition_stats_completion_file_s3_url = get_partition_stats_s3_url(
-        partition_canonical_string, stat_results_s3_bucket
-    )
-    s3_utils.upload(
-        partition_stats_completion_file_s3_url, str(json.dumps(partition_stats))
-    )
-    logger.debug(
-        f"stats completion file written to: {partition_stats_completion_file_s3_url}"
-    )
-
-
-@ray.remote
-def collect_stats_by_columns(
-    delta_annotated: DeltaAnnotated,
-    columns_to_compute: Optional[List[str]] = None,
-    deltacat_storage=unimplemented_deltacat_storage,
-    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
-) -> Dict[str, Any]:
-    """Materializes one manifest entry at a time to save memory usage and calculate stats from each of its columns.
-
-    Args:
-        delta_annotated: An annotated delta object to calculate stats for.
-        columns_to_compute: Columns to calculate stats for. If not provided, all columns are considered.
-        deltacat_storage: Client implementation of the DeltaCAT storage interface.
-
-    Returns:
-        A delta-wide stats container.
-    """
-    if deltacat_storage_kwargs is None:
-        deltacat_storage_kwargs = {}
-    total_tables_size = 0
-
-    # Mapping of column_name -> [stats_file_idx_1, stats_file_idx_2, ... stats_file_idx_n]
-    column_stats_map = defaultdict(
-        lambda: [[None, None]] * len(delta_annotated["manifest"].get("entries"))
-    )
-    src_da_entries = delta_annotated["manifest"].get("entries")
-    manifest_annotations = delta_annotated["annotations"]
-    for file_idx, manifest in enumerate(src_da_entries):
-        entry_pyarrow_table: LocalTable = (
-            deltacat_storage.download_delta_manifest_entry(
-                delta_annotated,
-                file_idx,
-                TableType.PYARROW,
-                columns_to_compute,
-                equivalent_table_types="uncompacted",
-                **deltacat_storage_kwargs,
-            )
-        )
-        assert isinstance(entry_pyarrow_table, pyarrow.Table), (
-            f"Stats collection is only supported for PyArrow tables, but received a table of "
-            f"type '{type(entry_pyarrow_table)}' for manifest entry {file_idx} of delta: {delta_annotated.locator}."
-        )
-        total_tables_size += entry_pyarrow_table.nbytes
-        if not columns_to_compute:
-            columns_to_compute = entry_pyarrow_table.column_names
-
-        for column_idx, pyarrow_column in enumerate(entry_pyarrow_table.columns):
-            column_name = columns_to_compute[column_idx]
-            origin_delta_stream_position = manifest_annotations[file_idx][-1]
-            column_stats_map[column_name][file_idx] = [
-                StatsResult.of(len(pyarrow_column), pyarrow_column.nbytes),
-                origin_delta_stream_position,
-            ]
-
-    return column_stats_map
```
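Note the shape of the payload `cache_inflation_rate_data_for_delta_stats_ready` writes: each stream position maps to the triple `[summed manifest content_length, row count, in-memory PyArrow bytes]`. A hypothetical consumer of that `inflation-rates.json` file could derive inflation rates like this (all numbers invented):

```python
import json

# Shape written by cache_inflation_rate_data_for_delta_stats_ready:
# stream_position -> [content_length_bytes, row_count, pyarrow_table_bytes]
payload = json.loads(
    '{"1690000000": [104857600, 2500000, 524288000]}'  # invented sample record
)

for stream_position, (content_length, row_count, pyarrow_bytes) in payload.items():
    inflation = pyarrow_bytes / content_length  # here 5.0x: on-disk -> in-memory growth
    bytes_per_row = pyarrow_bytes / row_count
    print(f"{stream_position}: inflation={inflation:.2f}x, bytes/row={bytes_per_row:.1f}")
```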
deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py

```diff
@@ -1,18 +0,0 @@
-from deltacat.constants import PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS
-
-
-def estimation_function(
-    content_length, content_type, content_encoding, *args, **kwargs
-):
-    # TODO(zyiqin): update the estimation here to be consistent with the number of required worker nodes estimate.
-    # The current implementation is only a rough guess using PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS
-    # (content_length to pyarrow bytes, all columns). The full implementation logic should be:
-    # 1. linear regression with a confidence level: pull metastats data for all deltas for this partition if len(datapoints) > 30.
-    # 2. if not enough previous stats were collected for the same partition: fall back to datapoints for all partitions of the same table.
-    # 3. if not enough stats were collected for this table: use the average inflation rate for each content_type and content_encoding.
-    # 4. if not enough stats exist for this content_type and content_encoding combination: use the basic PYARROW_INFLATION_MULTIPLIER instead.
-
-    if content_length:
-        return content_length * PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS
-    else:
-        return 0
```
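The TODO in `estimation_function` sketches a four-step fallback chain that was never implemented. Below is a rough sketch of that shape, substituting a mean inflation ratio where the comment asks for a linear-regression fit; the helper inputs and the multiplier value are hypothetical, not the library's API:

```python
from typing import List, Optional, Tuple

PYARROW_INFLATION_MULTIPLIER = 2.5  # placeholder value, not the library's constant
MIN_DATAPOINTS = 30


def estimate_pyarrow_bytes(
    content_length: int,
    partition_points: List[Tuple[int, int]],  # (content_length, pyarrow_bytes) per delta, this partition
    table_points: List[Tuple[int, int]],      # same datapoints across all partitions of the table
    format_rate: Optional[float],             # mean inflation for this content_type/encoding pair
) -> float:
    def mean_rate(points: List[Tuple[int, int]]) -> float:
        # Mean ratio as a stand-in for the linear-regression fit the TODO describes.
        return sum(b / l for l, b in points) / len(points)

    if len(partition_points) > MIN_DATAPOINTS:  # 1. enough history for this partition
        rate = mean_rate(partition_points)
    elif len(table_points) > MIN_DATAPOINTS:    # 2. fall back to the whole table
        rate = mean_rate(table_points)
    elif format_rate is not None:               # 3. per content_type/encoding average
        rate = format_rate
    else:                                       # 4. static multiplier as a last resort
        rate = PYARROW_INFLATION_MULTIPLIER
    return content_length * rate if content_length else 0
```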
deltacat/compute/metastats/utils/ray_utils.py

```diff
@@ -1,129 +0,0 @@
-import errno
-import logging
-import os
-import subprocess
-from subprocess import run
-from typing import Any
-
-import ray
-from tenacity import RetryError, Retrying, stop_after_attempt, wait_fixed
-
-from deltacat import logs
-from deltacat.compute.metastats.utils.constants import (
-    MAX_WORKER_MULTIPLIER,
-    WORKER_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO,
-)
-
-logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
-
-RAY_DOWN_DEFAULT_RETRY_ATTEMPTS = 3
-
-
-def run_cmd_exit_code(cmd: str) -> int:
-    logger.info(f"running command {cmd}")
-    exit_code = int(os.system(cmd))
-    logger.info(f"Got {exit_code} when running {cmd}")
-    return exit_code
-
-
-def run_cmd_with_retry(cmd: str) -> None:
-    retrying = Retrying(
-        wait=wait_fixed(2), stop=stop_after_attempt(RAY_DOWN_DEFAULT_RETRY_ATTEMPTS)
-    )
-    try:
-        retrying(run_cmd_exit_code, cmd)
-    except RetryError:
-        logger.info(f"{cmd} failed after {RAY_DOWN_DEFAULT_RETRY_ATTEMPTS} retries.")
-
-
-def run_cmd(cmd: str) -> None:
-    result = run(cmd, shell=True, capture_output=True)
-    exit_code = int(result.returncode)
-    assert exit_code == 0, (
-        f"`{cmd}` failed. Exit code: {exit_code} " f"Error Trace: {result.stderr}"
-    )
-
-
-def ray_up(cluster_cfg: str) -> None:
-    logger.info(f"Starting Ray cluster '{cluster_cfg}'")
-    run_cmd(f"ray up {cluster_cfg} -y --no-config-cache --no-restart")
-    logger.info(f"Started Ray cluster '{cluster_cfg}'")
-
-
-def ray_down(cluster_cfg: str) -> None:
-    logger.info(f"Destroying Ray cluster '{cluster_cfg}'")
-    run_cmd_with_retry(f"ray down {cluster_cfg} -y")
-    logger.info(f"Destroyed Ray cluster '{cluster_cfg}'")
-
-
-def clean_up_cluster_cfg_file(cluster_cfg) -> None:
-    logger.info(f"Removing stats cluster config at: '{cluster_cfg}'")
-    run_cmd(f"rm -f {cluster_cfg}")
-    logger.info(f"Removed stats cluster config at: '{cluster_cfg}'")
-
-
-def get_head_node_ip(cluster_cfg: str) -> str:
-    logger.info(f"Getting Ray cluster head node IP for '{cluster_cfg}'")
-    proc = subprocess.run(
-        f"ray get-head-ip {cluster_cfg}",
-        shell=True,
-        capture_output=True,
-        text=True,
-        check=True,
-    )
-    # the head node IP should be the last line printed to stdout
-    head_node_ip = proc.stdout.splitlines()[-1]
-    logger.info(f"Ray cluster head node IP for '{cluster_cfg}': {head_node_ip}")
-    return head_node_ip
-
-
-def ray_init(host, port) -> Any:
-    ray_init_uri = f"ray://{host}:{port}"
-    logger.info(f"Connecting Ray Client to '{ray_init_uri}'")
-    client = ray.init(ray_init_uri, allow_multiple=True)
-    logger.info(f"Connected Ray Client to '{ray_init_uri}'")
-    return client
-
-
-def replace_cluster_cfg_vars(
-    partition_canonical_string: str,
-    trace_id: str,
-    file_path: str,
-    min_workers: int,
-    head_type: str,
-    worker_type: str,
-    head_object_store_memory_pct: int,
-    worker_object_store_memory_pct: int,
-) -> str:
-    head_object_store_memory_pct = (
-        head_object_store_memory_pct
-        if head_object_store_memory_pct is not None
-        else 30
-    )
-    worker_object_store_memory_pct = WORKER_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO * 100
-
-    max_workers = int(min_workers * MAX_WORKER_MULTIPLIER)
-    with open(file_path, "r+") as file:
-        contents = file.read().replace("{{use-internal-ips}}", "True")
-        contents = contents.replace(
-            "{{partition_canonical_string}}", partition_canonical_string
-        )
-        contents = contents.replace("'{{trace_id}}'", trace_id)
-        contents = contents.replace("'{{min-workers}}'", str(min_workers))
-        contents = contents.replace("'{{max-workers}}'", str(max_workers))
-        contents = contents.replace("'{{head-instance-type}}'", head_type)
-        contents = contents.replace("'{{worker-instance-type}}'", worker_type)
-        contents = contents.replace(
-            "'{{head-object-store-memory-pct}}'", str(head_object_store_memory_pct)
-        )
-        contents = contents.replace(
-            "'{{worker-object-store-memory-pct}}'", str(worker_object_store_memory_pct)
-        )
-    partition_id = partition_canonical_string.split("|")[-1]
-    out_file_name = f"{trace_id}-{partition_id}.{os.path.basename(file_path)}"
-    out_file_dir = os.path.join(os.path.dirname(file_path), "tmp")
-    out_file_path = os.path.join(out_file_dir, out_file_name)
-    try:
-        os.makedirs(os.path.dirname(out_file_path))
-    except OSError as e:
-        if e.errno != errno.EEXIST:
-            raise
-    with open(out_file_path, "w") as output:
-        output.write(contents)
-    return out_file_path
```