deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their respective public registries.
- deltacat/__init__.py +41 -15
- deltacat/aws/clients.py +12 -31
- deltacat/aws/constants.py +1 -1
- deltacat/aws/redshift/__init__.py +7 -2
- deltacat/aws/redshift/model/manifest.py +54 -50
- deltacat/aws/s3u.py +176 -187
- deltacat/catalog/delegate.py +151 -185
- deltacat/catalog/interface.py +78 -97
- deltacat/catalog/model/catalog.py +21 -21
- deltacat/catalog/model/table_definition.py +11 -9
- deltacat/compute/compactor/__init__.py +12 -16
- deltacat/compute/compactor/compaction_session.py +237 -166
- deltacat/compute/compactor/model/delta_annotated.py +60 -44
- deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
- deltacat/compute/compactor/model/delta_file_locator.py +10 -8
- deltacat/compute/compactor/model/materialize_result.py +6 -7
- deltacat/compute/compactor/model/primary_key_index.py +38 -34
- deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
- deltacat/compute/compactor/model/round_completion_info.py +25 -19
- deltacat/compute/compactor/model/sort_key.py +18 -15
- deltacat/compute/compactor/steps/dedupe.py +119 -94
- deltacat/compute/compactor/steps/hash_bucket.py +48 -47
- deltacat/compute/compactor/steps/materialize.py +86 -92
- deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
- deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
- deltacat/compute/compactor/utils/io.py +59 -47
- deltacat/compute/compactor/utils/primary_key_index.py +91 -80
- deltacat/compute/compactor/utils/round_completion_file.py +22 -23
- deltacat/compute/compactor/utils/system_columns.py +33 -45
- deltacat/compute/metastats/meta_stats.py +235 -157
- deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
- deltacat/compute/metastats/stats.py +95 -64
- deltacat/compute/metastats/utils/io.py +100 -53
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
- deltacat/compute/metastats/utils/ray_utils.py +38 -33
- deltacat/compute/stats/basic.py +107 -69
- deltacat/compute/stats/models/delta_column_stats.py +11 -8
- deltacat/compute/stats/models/delta_stats.py +59 -32
- deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
- deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
- deltacat/compute/stats/models/stats_result.py +24 -14
- deltacat/compute/stats/utils/intervals.py +16 -9
- deltacat/compute/stats/utils/io.py +86 -51
- deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
- deltacat/constants.py +4 -13
- deltacat/io/__init__.py +2 -2
- deltacat/io/aws/redshift/redshift_datasource.py +157 -143
- deltacat/io/dataset.py +14 -17
- deltacat/io/read_api.py +36 -33
- deltacat/logs.py +94 -42
- deltacat/storage/__init__.py +18 -8
- deltacat/storage/interface.py +196 -213
- deltacat/storage/model/delta.py +45 -51
- deltacat/storage/model/list_result.py +12 -8
- deltacat/storage/model/namespace.py +4 -5
- deltacat/storage/model/partition.py +42 -42
- deltacat/storage/model/stream.py +29 -30
- deltacat/storage/model/table.py +14 -14
- deltacat/storage/model/table_version.py +32 -31
- deltacat/storage/model/types.py +1 -0
- deltacat/tests/stats/test_intervals.py +11 -24
- deltacat/tests/utils/__init__.py +0 -0
- deltacat/tests/utils/test_record_batch_tables.py +284 -0
- deltacat/types/media.py +3 -4
- deltacat/types/tables.py +31 -21
- deltacat/utils/common.py +5 -11
- deltacat/utils/numpy.py +20 -22
- deltacat/utils/pandas.py +73 -100
- deltacat/utils/performance.py +3 -9
- deltacat/utils/placement.py +259 -230
- deltacat/utils/pyarrow.py +302 -89
- deltacat/utils/ray_utils/collections.py +2 -1
- deltacat/utils/ray_utils/concurrency.py +27 -28
- deltacat/utils/ray_utils/dataset.py +28 -28
- deltacat/utils/ray_utils/performance.py +5 -9
- deltacat/utils/ray_utils/runtime.py +9 -10
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
- deltacat-0.1.12.dist-info/RECORD +110 -0
- deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
deltacat/compute/metastats/stats.py
@@ -1,30 +1,25 @@
-import ray
 import logging
-from typing import Dict, Set, Tuple, List, Optional, Any
 from collections import defaultdict
+from typing import Dict, List, Optional
 
-
-from deltacat.compute.stats.models.stats_result import StatsResult
+import ray
 from ray.types import ObjectRef
 
 from deltacat import logs
-from deltacat.compute.stats.models.delta_stats_cache_result import DeltaStatsCacheResult
-
-from deltacat.utils.ray_utils.concurrency import invoke_parallel, \
-    round_robin_options_provider
-from deltacat.compute.metastats.utils.io import collect_stats_by_columns, cache_inflation_rate_data_for_delta_stats_ready, cache_partition_stats_to_s3
-
-from deltacat.storage import PartitionLocator, DeltaLocator, Delta
-from deltacat.storage import interface as unimplemented_deltacat_storage
-
-from deltacat.aws.clients import client_cache
 from deltacat.aws import s3u as s3_utils
-
-
-from deltacat.compute.stats.models.manifest_entry_stats import ManifestEntryStats
-from deltacat.compute.stats.models.delta_column_stats import DeltaColumnStats
-
+from deltacat.aws.clients import client_cache
 from deltacat.compute.compactor import DeltaAnnotated
+from deltacat.compute.metastats.utils.io import (
+    cache_inflation_rate_data_for_delta_stats_ready,
+    cache_partition_stats_to_s3,
+    collect_stats_by_columns,
+)
+from deltacat.compute.stats.models.delta_column_stats import DeltaColumnStats
+from deltacat.compute.stats.models.delta_stats import DeltaStats
+from deltacat.compute.stats.models.manifest_entry_stats import ManifestEntryStats
+from deltacat.compute.stats.models.stats_result import StatsResult
+from deltacat.storage import DeltaLocator, PartitionLocator
+from deltacat.storage import interface as unimplemented_deltacat_storage
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
@@ -32,51 +27,67 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 DEFAULT_CPUS_STATS_CLUSTER_INSTANCE = 32
 
 
-def start_stats_collection(
-
-
-
-
+def start_stats_collection(
+    batched_delta_stats_compute_list: List[DeltaAnnotated],
+    columns: List[str],
+    stat_results_s3_bucket: Optional[str] = None,
+    metastats_results_s3_bucket: Optional[str] = None,
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> Dict[str, List[DeltaStats]]:
     """Collects statistics on deltas, given a set of delta stream position ranges.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    Example:
+        >>> collect(locator, set((1, 5), (4, 8), (13, 16)))
+        {
+            1: DeltaStats(), # DeltaStats for stream positions 1 - 8
+            13: DeltaStats() # DeltaStats for stream positions 13 - 16
+        }
+    Args:
+        source_partition_locator: Reference to the partition locator tied to the given delta stream positions
+        delta_stream_position_range_set: A set of intervals with an int type representing finite,
+            closed bounded values, and a None type representing unbounded infinity.
+        columns: Columns can be optionally included to collect stats on specific columns.
+            By default, all columns will be calculated.
+        stat_results_s3_bucket: Used as a cache file storage for computed delta stats
+        metastats_results_s3_bucket: Used as cache file storage for inflation rate meta stats
+        deltacat_storage: Client implementation of the DeltaCAT storage interface
+    Returns:
+        A mapping of stream positions to their corresponding delta stats.
     """
     # TODO: Add CompactionEventDispatcher for stats collection started event
     delta_stats_compute_pending: List[ObjectRef[Dict[str, List[StatsResult, int]]]] = []
 
     for batched_deltas in batched_delta_stats_compute_list:
-        splitted_annotated_deltas = DeltaAnnotated.split(
+        splitted_annotated_deltas = DeltaAnnotated.split(
+            batched_deltas, DEFAULT_CPUS_STATS_CLUSTER_INSTANCE
+        )
        for splitted_annotated_delta in splitted_annotated_deltas:
-            delta_stats_compute_pending.append(
+            delta_stats_compute_pending.append(
+                collect_stats_by_columns.remote(
+                    splitted_annotated_delta, columns, deltacat_storage
+                )
+            )
 
     column_stats_map = _process_stats(delta_stats_compute_pending)
 
     if not batched_delta_stats_compute_list:
         logger.info("No new delta need stats collection")
     else:
-
+        (
+            delta_stream_range_stats,
+            partition_canonical_string,
+        ) = resolve_annotated_delta_stats_to_original_deltas_stats(
+            column_stats_map, columns, batched_delta_stats_compute_list[0]
+        )
 
-        _cache_stats_res_to_s3(
+        _cache_stats_res_to_s3(
+            stat_results_s3_bucket, delta_stream_range_stats, partition_canonical_string
+        )
 
         base_path = s3_utils.parse_s3_url(metastats_results_s3_bucket).url
         inflation_rate_stats_s3_url = f"{base_path}/inflation-rates.json"
-        cache_inflation_rate_data_for_delta_stats_ready(
-
+        cache_inflation_rate_data_for_delta_stats_ready(
+            delta_stream_range_stats, inflation_rate_stats_s3_url, deltacat_storage
+        )
    # TODO: Add CompactionEventDispatcher for stats collection completed event
     return delta_stream_range_stats
 
@@ -87,13 +98,19 @@ def _get_account_id() -> str:
     return account_id
 
 
-def _process_stats(
-
+def _process_stats(
+    delta_stats_compute_pending: List[ObjectRef[DeltaStats]],
+) -> List[DeltaStats]:
+    delta_stats_processed_list: List[DeltaStats] = _resolve_pending_stats(
+        delta_stats_compute_pending
+    )
 
     return delta_stats_processed_list
 
 
-def _resolve_pending_stats(
+def _resolve_pending_stats(
+    delta_stats_pending_list: List[ObjectRef[DeltaStats]],
+) -> List[DeltaStats]:
     delta_stats_processed_list: List[DeltaStats] = []
 
     while delta_stats_pending_list:
@@ -104,29 +121,39 @@ def _resolve_pending_stats(delta_stats_pending_list: List[ObjectRef[DeltaStats]]
     return delta_stats_processed_list
 
 
-def _cache_stats_res_to_s3(
-
-
+def _cache_stats_res_to_s3(
+    stat_results_s3_bucket, delta_stream_range_stats, partition_canonical_string
+):
     if stat_results_s3_bucket:
         # Cache the stats into the file store
-        cache_partition_stats_to_s3(
+        cache_partition_stats_to_s3(
+            stat_results_s3_bucket, delta_stream_range_stats, partition_canonical_string
+        )
 
 
-def resolve_annotated_delta_stats_to_original_deltas_stats(
-
+def resolve_annotated_delta_stats_to_original_deltas_stats(
+    column_stats_map, column_names, delta_annotated
+) -> Dict[int, DeltaStats]:
 
-    partition_values = delta_annotated["deltaLocator"]["partitionLocator"][
+    partition_values = delta_annotated["deltaLocator"]["partitionLocator"][
+        "partitionValues"
+    ]
     partition_id = delta_annotated["deltaLocator"]["partitionLocator"]["partitionId"]
-    stream_locator = delta_annotated["deltaLocator"]["partitionLocator"][
-
+    stream_locator = delta_annotated["deltaLocator"]["partitionLocator"][
+        "streamLocator"
+    ]
+    partition_locator = PartitionLocator.of(
+        stream_locator, partition_values, partition_id
+    )
 
     # Dict[stream_position: List[StatsResult]]
     manifest_column_stats_list = defaultdict(lambda: [])
     for i in range(len(column_stats_map)):
         for column_name in column_names:
             for j in range(len(column_stats_map[i][column_name])):
-                manifest_column_stats_list[
-
+                manifest_column_stats_list[
+                    column_stats_map[i][column_name][j][1]
+                ].append([column_stats_map[i][column_name][j][0], column_name])
 
     stats_res: Dict[int, List[DeltaStats]] = {}
     for key, value in manifest_column_stats_list.items():
@@ -139,11 +166,15 @@ Dict[int, DeltaStats]:
     delta_ds_column_stats: List[DeltaColumnStats] = []
     for column_name, column_manifest_stats_list in manifest_stats_list.items():
 
-        column_manifest_stats = ManifestEntryStats.of(
-
+        column_manifest_stats = ManifestEntryStats.of(
+            column_manifest_stats_list, delta_locator
+        )
+        dataset_column_stats = DeltaColumnStats.of(
+            column_name, column_manifest_stats
+        )
         delta_ds_column_stats.append(dataset_column_stats)
 
     dataset_stats: DeltaStats = DeltaStats.of(delta_ds_column_stats)
     stats_res[key] = dataset_stats
 
-    return stats_res, partition_locator.canonical_string()
+    return stats_res, partition_locator.canonical_string()
deltacat/compute/metastats/utils/io.py
@@ -1,28 +1,29 @@
-import logging
 import json
+import logging
+from collections import defaultdict
+from typing import Any, Dict, List, Optional
+
 import pyarrow
 import ray
 
-from deltacat import LocalTable, TableType
-from deltacat.storage import Delta
-from deltacat.compute.compactor import DeltaAnnotated
+from deltacat import LocalTable, TableType, logs
 from deltacat.aws import s3u as s3_utils
-from deltacat.
-from deltacat.storage import interface as unimplemented_deltacat_storage
+from deltacat.compute.compactor import DeltaAnnotated
 from deltacat.compute.metastats.model.partition_stats_dict import PartitionStats
-from deltacat.compute.stats.models.delta_stats_cache_result import DeltaStatsCacheResult
 from deltacat.compute.stats.models.delta_column_stats import DeltaColumnStats
 from deltacat.compute.stats.models.delta_stats import DeltaStats, DeltaStatsCacheMiss
+from deltacat.compute.stats.models.delta_stats_cache_result import DeltaStatsCacheResult
 from deltacat.compute.stats.models.stats_result import StatsResult
+from deltacat.storage import Delta
+from deltacat.storage import interface as unimplemented_deltacat_storage
+from deltacat.utils.common import sha1_hexdigest
 
-from typing import Dict, List, Optional, Any
-from collections import defaultdict
-from deltacat import logs
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
-def cache_inflation_rate_data_for_delta_stats_ready(
-
+def cache_inflation_rate_data_for_delta_stats_ready(
+    delta_stats_processed_list, inflation_rate_stats_s3_url, deltacat_storage
+):
     meta_stats_processed_list: Dict[int, int] = {}
 
     for key, value in delta_stats_processed_list.items():
@@ -39,17 +40,23 @@ def cache_inflation_rate_data_for_delta_stats_ready(delta_stats_processed_list,
     for key, value in delta_stats_processed_list.items():
         delta_stats_pyarrow_bytes_sum = 0
         delta_stats_row_count = 0
-        for column_stats in
+        for column_stats in (
+            delta_stats_processed_list[key].column_stats[0].manifest_stats.stats
+        ):
             delta_stats_row_count += column_stats.get("rowCount")
         for stats in delta_stats_processed_list[key].get("column_stats"):
 
             delta_stats_pyarrow_bytes_sum += stats.get("stats").get("pyarrowTableBytes")
-        cache_inflation_rate_res[key] = [
-
+        cache_inflation_rate_res[key] = [
+            meta_stats_processed_list[key],
+            delta_stats_row_count,
+            delta_stats_pyarrow_bytes_sum,
+        ]
 
     if inflation_rate_stats_s3_url:
         logger.warning(
-            f"reading previous inflation rate stats from: {inflation_rate_stats_s3_url}"
+            f"reading previous inflation rate stats from: {inflation_rate_stats_s3_url}"
+        )
 
         result = s3_utils.download(inflation_rate_stats_s3_url, fail_if_not_found=False)
 
@@ -57,38 +64,57 @@ def cache_inflation_rate_data_for_delta_stats_ready(delta_stats_processed_list,
         if result:
             json_str = result["Body"].read().decode("utf-8")
             prev_inflation_rate_stats_read = json.loads(json_str)
-            prev_inflation_rate_stats =
-
+            prev_inflation_rate_stats = (
+                prev_inflation_rate_stats_read
+                if prev_inflation_rate_stats_read
+                else dict()
+            )
+            logger.debug(
+                f"read stats completion info: {prev_inflation_rate_stats_read}"
+            )
            logger.debug(
-                f"writing inflation rate info to S3: {inflation_rate_stats_s3_url}"
+                f"writing inflation rate info to S3: {inflation_rate_stats_s3_url}"
+            )
            prev_inflation_rate_stats.update(cache_inflation_rate_res)
-            logger.debug(
+            logger.debug(
+                f"writing current inflation rate info to S3: {prev_inflation_rate_stats}"
+            )
            s3_utils.upload(
-                inflation_rate_stats_s3_url,
-                json.dumps(prev_inflation_rate_stats)
+                inflation_rate_stats_s3_url, json.dumps(prev_inflation_rate_stats)
             )
     else:
-        logger.warning(
+        logger.warning(
+            f"No valid s3 url received to cache inflation rate stats, got {inflation_rate_stats_s3_url}"
+        )
 
 
-def read_cached_partition_stats(
-
-
-
+def read_cached_partition_stats(
+    partition_canonical_string: str, stat_results_s3_bucket: str
+):
+    partition_stats_url = get_partition_stats_s3_url(
+        partition_canonical_string, stat_results_s3_bucket
+    )
+    logger.info(f"reading partition stats completion file from: {partition_stats_url}")
 
     result = s3_utils.download(partition_stats_url, fail_if_not_found=False)
     delta_stats_cache_res_map: Dict[int, List[DeltaStatsCacheResult]] = {}
     if result:
         json_str = result["Body"].read().decode("utf-8")
         partition_stats_str = json.loads(json_str)
-        delta_stats_cache_res_map = get_delta_stats_from_partition_stats(
+        delta_stats_cache_res_map = get_delta_stats_from_partition_stats(
+            partition_stats_str
+        )
 
     return delta_stats_cache_res_map
 
 
-def get_partition_stats_s3_url(
+def get_partition_stats_s3_url(
+    partition_canonical_string: str, stat_results_s3_bucket: str
+):
     stats_partition_canonical_string = f"{partition_canonical_string}"
-    stats_partition_hexdigest = sha1_hexdigest(
+    stats_partition_hexdigest = sha1_hexdigest(
+        stats_partition_canonical_string.encode("utf-8")
+    )
     base_path = s3_utils.parse_s3_url(stat_results_s3_bucket).url
 
     return f"{base_path}/{stats_partition_hexdigest}.json"
@@ -109,34 +135,43 @@ def get_delta_stats_from_partition_stats(partition_stats_str: str):
                missed_columns.append(cs.column)
 
         delta_locator = delta_stats.column_stats[0].manifest_stats.delta_locator
-        found_stats: Optional[DeltaStats] =
-
-
+        found_stats: Optional[DeltaStats] = (
+            DeltaStats.of(found_columns_stats) if found_columns_stats else None
+        )
+        missed_stats: Optional[DeltaStatsCacheMiss] = (
+            DeltaStatsCacheMiss(missed_columns, delta_locator)
+            if missed_columns
+            else None
+        )
         delta_stats_cache_res = DeltaStatsCacheResult.of(found_stats, missed_stats)
         found_columns_stats_map[int(stream_position)] = delta_stats_cache_res
     return found_columns_stats_map
 
 
-def cache_partition_stats_to_s3(
-
-
-
+def cache_partition_stats_to_s3(
+    stat_results_s3_bucket, delta_stream_range_stats, partition_canonical_string
+):
+    partition_stats = PartitionStats.of(
+        delta_stream_range_stats, partition_canonical_string
+    )
+    logger.info(f"writing partition stats completion for {partition_canonical_string}")
     partition_stats_completion_file_s3_url = get_partition_stats_s3_url(
-        partition_canonical_string,
-        stat_results_s3_bucket
+        partition_canonical_string, stat_results_s3_bucket
     )
     s3_utils.upload(
-        partition_stats_completion_file_s3_url,
-        str(json.dumps(partition_stats))
+        partition_stats_completion_file_s3_url, str(json.dumps(partition_stats))
     )
     logger.debug(
-        f"stats completion file written to: {partition_stats_completion_file_s3_url}"
+        f"stats completion file written to: {partition_stats_completion_file_s3_url}"
+    )
 
 
 @ray.remote
-def collect_stats_by_columns(
-
-
+def collect_stats_by_columns(
+    delta_annotated: DeltaAnnotated,
+    columns_to_compute: Optional[List[str]] = None,
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> Dict[str, Any]:
     """Materializes one manifest entry at a time to save memory usage and calculate stats from each of its columns.
 
     Args:
@@ -150,15 +185,25 @@ def collect_stats_by_columns(delta_annotated: DeltaAnnotated,
     total_tables_size = 0
 
     # Mapping of column_name -> [stats_file_idx_1, stats_file_idx_2, ... stats_file_idx_n]
-    column_stats_map = defaultdict(
+    column_stats_map = defaultdict(
+        lambda: [[None, None]] * len(delta_annotated["manifest"].get("entries"))
+    )
     src_da_entries = delta_annotated["manifest"].get("entries")
     manifest_annotations = delta_annotated["annotations"]
     for file_idx, manifest in enumerate(src_da_entries):
-        entry_pyarrow_table: LocalTable =
-            deltacat_storage.download_delta_manifest_entry(
-
-
+        entry_pyarrow_table: LocalTable = (
+            deltacat_storage.download_delta_manifest_entry(
+                delta_annotated,
+                file_idx,
+                TableType.PYARROW,
+                columns_to_compute,
+                equivalent_table_types="uncompacted",
+            )
+        )
+        assert isinstance(entry_pyarrow_table, pyarrow.Table), (
+            f"Stats collection is only supported for PyArrow tables, but received a table of "
             f"type '{type(entry_pyarrow_table)}' for manifest entry {file_idx} of delta: {delta_annotated.locator}."
+        )
         total_tables_size += entry_pyarrow_table.nbytes
         if not columns_to_compute:
             columns_to_compute = entry_pyarrow_table.column_names
@@ -166,7 +211,9 @@ def collect_stats_by_columns(delta_annotated: DeltaAnnotated,
         for column_idx, pyarrow_column in enumerate(entry_pyarrow_table.columns):
             column_name = columns_to_compute[column_idx]
             origin_delta_stream_position = manifest_annotations[file_idx][-1]
-            column_stats_map[column_name][file_idx] = [
-
+            column_stats_map[column_name][file_idx] = [
+                StatsResult.of(len(pyarrow_column), pyarrow_column.nbytes),
+                origin_delta_stream_position,
+            ]
 
-    return column_stats_map
+    return column_stats_map
deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py
@@ -1,6 +1,9 @@
 from deltacat.constants import PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS
 
-
+
+def estimation_function(
+    content_length, content_type, content_encoding, *args, **kwargs
+):
     # TODO(zyiqin): update the estimation here to be consistent with number of required worker nodes estimate.
     # Current implementation is only a rough guess using the PYARROW_INFLATION_MULTIPLIER(content_length to pyarrow butes(all columns).
     # The full implementation logic should be:
@@ -12,4 +15,4 @@ def estimation_function(content_length, content_type, content_encoding, *args, *
     if content_length:
         return content_length * PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS
     else:
-        return 0
+        return 0
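
The estimator above reduces to a single multiplication of the manifest entry's content length by a fixed inflation constant. A worked example with a stand-in multiplier (the real value of `PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS` ships in `deltacat.constants` and is not shown in this diff):

```python
# Worked example of the rough sizing logic; the multiplier is a stand-in
# value chosen for the demo, not the constant shipped in deltacat.constants.
PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS = 2.5  # assumed value


def estimation_function(content_length, content_type, content_encoding, *args, **kwargs):
    # on-disk bytes -> estimated in-memory PyArrow bytes (all columns)
    if content_length:
        return content_length * PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS
    else:
        return 0


# a 100 MiB file is estimated to inflate to ~250 MiB of Arrow memory
print(estimation_function(100 * 1024**2, "application/parquet", None))
```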
deltacat/compute/metastats/utils/ray_utils.py
@@ -1,15 +1,18 @@
+import errno
+import logging
 import os
 import subprocess
+from subprocess import run
+from typing import Any
+
 import ray
-import
-import logging
+from tenacity import RetryError, Retrying, stop_after_attempt, wait_fixed
 
 from deltacat import logs
-from
-
-
-
-from subprocess import run, PIPE
+from deltacat.compute.metastats.utils.constants import (
+    MAX_WORKER_MULTIPLIER,
+    WORKER_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO,
+)
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
@@ -24,13 +27,10 @@ def run_cmd_exit_code(cmd: str) -> int:
 
 def run_cmd_with_retry(cmd: str) -> None:
     retrying = Retrying(
-        wait=wait_fixed(2),
-        stop=stop_after_attempt(RAY_DOWN_DEFAULT_RETRY_ATTEMPTS)
+        wait=wait_fixed(2), stop=stop_after_attempt(RAY_DOWN_DEFAULT_RETRY_ATTEMPTS)
     )
     try:
-        retrying(
-            run_cmd_exit_code(cmd)
-        )
+        retrying(run_cmd_exit_code(cmd))
     except RetryError:
         logger.info(f"{cmd} failed after {RAY_DOWN_DEFAULT_RETRY_ATTEMPTS} retries.")
 
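`run_cmd_with_retry` above relies on tenacity's callable `Retrying` object. A minimal self-contained sketch of that pattern with a stand-in command runner; note that a `Retrying` instance expects the function and its arguments separately, so `retrying(fn, arg)` retries `fn(arg)`, whereas `retrying(fn(arg))` evaluates `fn(arg)` once before retrying begins:

```python
# Minimal tenacity sketch with a stand-in runner; the cluster config name
# is hypothetical. Retrying(...) retries the callable on raised exceptions.
from tenacity import RetryError, Retrying, stop_after_attempt, wait_fixed


def run_ray_down(cluster_cfg: str) -> int:
    print(f"ray down -y {cluster_cfg}")  # stand-in for a subprocess call
    return 0


retrying = Retrying(wait=wait_fixed(2), stop=stop_after_attempt(3))
try:
    retrying(run_ray_down, "cluster.yaml")  # retries run_ray_down("cluster.yaml")
except RetryError:
    print("ray down still failing after 3 attempts")
```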
@@ -38,8 +38,9 @@ def run_cmd_with_retry(cmd: str) -> None:
 def run_cmd(cmd: str) -> None:
     result = run(cmd, shell=True, capture_output=True)
     exit_code = int(result.returncode)
-    assert exit_code == 0,
-
+    assert exit_code == 0, (
+        f"`{cmd}` failed. Exit code: {exit_code} " f"Error Trace: {result.stderr}"
+    )
 
 
 def ray_up(cluster_cfg: str) -> None:
@@ -67,7 +68,8 @@ def get_head_node_ip(cluster_cfg: str) -> str:
         shell=True,
         capture_output=True,
         text=True,
-        check=True
+        check=True,
+    )
     # the head node IP should be the last line printed to stdout
     head_node_ip = proc.stdout.splitlines()[-1]
     logger.info(f"Ray cluster head node IP for '{cluster_cfg}': {head_node_ip}")
@@ -83,14 +85,15 @@ def ray_init(host, port) -> Any:
 
 
 def replace_cluster_cfg_vars(
-
-
-
-
-
-
-
-
+    partition_canonical_string: str,
+    trace_id: str,
+    file_path: str,
+    min_workers: int,
+    head_type: str,
+    worker_type: str,
+    head_object_store_memory_pct: int,
+    worker_object_store_memory_pct: int,
+) -> str:
 
     head_object_store_memory_pct = head_object_store_memory_pct if not None else 30
     worker_object_store_memory_pct = WORKER_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO * 100
@@ -98,18 +101,20 @@ def replace_cluster_cfg_vars(
     max_workers = int(min_workers * MAX_WORKER_MULTIPLIER)
     with open(file_path, "r+") as file:
         contents = file.read().replace("{{use-internal-ips}}", "True")
-        contents = contents.replace("{{partition_canonical_string}}", partition_canonical_string)
-        contents = contents.replace("{{trace_id}}", trace_id)
-        contents = contents.replace("{{min-workers}}", str(min_workers))
-        contents = contents.replace("{{max-workers}}", str(max_workers))
-        contents = contents.replace("{{head-instance-type}}", head_type)
-        contents = contents.replace("{{worker-instance-type}}", worker_type)
         contents = contents.replace(
-            "{{
-
+            "{{partition_canonical_string}}", partition_canonical_string
+        )
+        contents = contents.replace("'{{trace_id}}'", trace_id)
+        contents = contents.replace("'{{min-workers}}'", str(min_workers))
+        contents = contents.replace("'{{max-workers}}'", str(max_workers))
+        contents = contents.replace("'{{head-instance-type}}'", head_type)
+        contents = contents.replace("'{{worker-instance-type}}'", worker_type)
         contents = contents.replace(
-            "{{
-
+            "'{{head-object-store-memory-pct}}'", str(head_object_store_memory_pct)
+        )
+        contents = contents.replace(
+            "'{{worker-object-store-memory-pct}}'", str(worker_object_store_memory_pct)
+        )
         partition_id = partition_canonical_string.split("|")[-1]
         out_file_name = f"{trace_id}-{partition_id}.{os.path.basename(file_path)}"
         out_file_dir = os.path.join(os.path.dirname(file_path), "tmp")