deltacat 1.1.17__py3-none-any.whl → 1.1.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/constants.py +0 -1
- deltacat/compute/compactor/model/compact_partition_params.py +76 -0
- deltacat/compute/compactor/model/compaction_session_audit_info.py +26 -0
- deltacat/compute/compactor/model/delta_annotated.py +16 -9
- deltacat/compute/compactor_v2/constants.py +3 -0
- deltacat/compute/compactor_v2/private/compaction_utils.py +9 -5
- deltacat/compute/compactor_v2/utils/content_type_params.py +185 -34
- deltacat/compute/compactor_v2/utils/io.py +28 -14
- deltacat/compute/compactor_v2/utils/primary_key_index.py +9 -4
- deltacat/compute/compactor_v2/utils/task_options.py +128 -183
- deltacat/compute/resource_estimation/__init__.py +27 -0
- deltacat/compute/resource_estimation/delta.py +271 -0
- deltacat/compute/resource_estimation/manifest.py +394 -0
- deltacat/compute/resource_estimation/model.py +165 -0
- deltacat/compute/resource_estimation/parquet.py +108 -0
- deltacat/constants.py +5 -0
- deltacat/exceptions.py +2 -4
- deltacat/logs.py +8 -0
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +77 -0
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +308 -0
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +159 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +157 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +3 -3
- deltacat/tests/compute/resource_estimation/test_delta.py +605 -0
- deltacat/tests/compute/resource_estimation/test_manifest.py +921 -0
- deltacat/tests/compute/test_compact_partition_rebase.py +13 -4
- deltacat/tests/compute/test_util_common.py +2 -0
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -5
- deltacat/tests/test_logs.py +34 -0
- deltacat/tests/test_utils/pyarrow.py +15 -5
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/METADATA +2 -2
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/RECORD +38 -54
- deltacat/compute/metastats/meta_stats.py +0 -479
- deltacat/compute/metastats/model/__init__.py +0 -0
- deltacat/compute/metastats/model/partition_stats_dict.py +0 -34
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +0 -68
- deltacat/compute/metastats/stats.py +0 -182
- deltacat/compute/metastats/utils/__init__.py +0 -0
- deltacat/compute/metastats/utils/constants.py +0 -16
- deltacat/compute/metastats/utils/io.py +0 -223
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +0 -18
- deltacat/compute/metastats/utils/ray_utils.py +0 -129
- deltacat/compute/stats/basic.py +0 -226
- deltacat/compute/stats/models/__init__.py +0 -0
- deltacat/compute/stats/models/delta_column_stats.py +0 -98
- deltacat/compute/stats/models/delta_stats.py +0 -233
- deltacat/compute/stats/models/delta_stats_cache_result.py +0 -49
- deltacat/compute/stats/models/manifest_entry_stats.py +0 -72
- deltacat/compute/stats/models/stats_result.py +0 -104
- deltacat/compute/stats/utils/__init__.py +0 -0
- deltacat/compute/stats/utils/intervals.py +0 -94
- deltacat/compute/stats/utils/io.py +0 -230
- deltacat/compute/stats/utils/manifest_stats_file.py +0 -100
- deltacat/tests/stats/__init__.py +0 -0
- deltacat/tests/stats/test_intervals.py +0 -49
- /deltacat/{compute/metastats → tests/compute/resource_estimation}/__init__.py +0 -0
- /deltacat/{compute/metastats/config → tests/compute/resource_estimation/data}/__init__.py +0 -0
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/LICENSE +0 -0
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/WHEEL +0 -0
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/top_level.txt +0 -0
deltacat/compute/stats/basic.py
DELETED
@@ -1,226 +0,0 @@
|
|
1
|
-
from typing import Dict, List, Optional, Set, Tuple
|
2
|
-
|
3
|
-
import ray
|
4
|
-
from ray.types import ObjectRef
|
5
|
-
|
6
|
-
from deltacat.compute.stats.models.delta_stats import DeltaStats
|
7
|
-
from deltacat.compute.stats.models.delta_stats_cache_result import DeltaStatsCacheResult
|
8
|
-
from deltacat.compute.stats.models.stats_result import StatsResult
|
9
|
-
from deltacat.compute.stats.types import StatsType
|
10
|
-
from deltacat.compute.stats.utils.intervals import DeltaRange, merge_intervals
|
11
|
-
from deltacat.compute.stats.utils.io import (
|
12
|
-
cache_delta_column_stats,
|
13
|
-
get_delta_stats,
|
14
|
-
get_deltas_from_range,
|
15
|
-
read_cached_delta_stats,
|
16
|
-
)
|
17
|
-
from deltacat.storage import Delta, DeltaLocator, PartitionLocator
|
18
|
-
from deltacat.storage import interface as unimplemented_deltacat_storage
|
19
|
-
|
20
|
-
# TODO (ricmiyam): Decouple DeltaCAT from S3-based paths
|
21
|
-
# TODO (ricmiyam): Determine cache eviction policy
|
22
|
-
|
23
|
-
|
24
|
-
def collect(
|
25
|
-
source_partition_locator: PartitionLocator,
|
26
|
-
delta_stream_position_range_set: Optional[Set[DeltaRange]] = None,
|
27
|
-
columns: Optional[List[str]] = None,
|
28
|
-
stat_results_s3_bucket: Optional[str] = None,
|
29
|
-
deltacat_storage=unimplemented_deltacat_storage,
|
30
|
-
) -> Dict[int, DeltaStats]:
|
31
|
-
"""Collects statistics on deltas, given a set of delta stream position ranges.
|
32
|
-
|
33
|
-
Example:
|
34
|
-
>>> collect(locator, set((1, 5), (4, 8), (13, 16)))
|
35
|
-
{
|
36
|
-
1: DeltaStats(), # DeltaStats for stream positions 1 - 8
|
37
|
-
13: DeltaStats() # DeltaStats for stream positions 13 - 16
|
38
|
-
}
|
39
|
-
|
40
|
-
Args:
|
41
|
-
source_partition_locator: Reference to the partition locator tied to the given delta stream positions
|
42
|
-
delta_stream_position_range_set: A set of intervals with an int type representing finite,
|
43
|
-
closed bounded values, and a None type representing unbounded infinity.
|
44
|
-
columns: Columns can be optionally included to collect stats on specific columns.
|
45
|
-
By default, all columns will be calculated.
|
46
|
-
stat_results_s3_bucket: Used as a cache file storage for computed delta stats
|
47
|
-
deltacat_storage: Client implementation of the DeltaCAT storage interface
|
48
|
-
|
49
|
-
Returns:
|
50
|
-
A mapping of stream positions to their corresponding delta stats.
|
51
|
-
"""
|
52
|
-
if delta_stream_position_range_set is None:
|
53
|
-
delta_stream_position_range_set = {(None, None)}
|
54
|
-
delta_stream_range_stats: Dict[int, DeltaStats] = {}
|
55
|
-
delta_range_lookup_pending: List[ObjectRef[List[Delta]]] = []
|
56
|
-
|
57
|
-
if not columns:
|
58
|
-
columns = deltacat_storage.get_table_version_column_names(
|
59
|
-
source_partition_locator.namespace,
|
60
|
-
source_partition_locator.table_name,
|
61
|
-
source_partition_locator.table_version,
|
62
|
-
)
|
63
|
-
for range_pair in merge_intervals(delta_stream_position_range_set):
|
64
|
-
begin, end = range_pair
|
65
|
-
promise: ObjectRef[List[Delta]] = get_deltas_from_range.remote(
|
66
|
-
source_partition_locator, begin, end, deltacat_storage
|
67
|
-
)
|
68
|
-
delta_range_lookup_pending.append(promise)
|
69
|
-
|
70
|
-
delta_list_by_ranges: List[List[Delta]] = ray.get(delta_range_lookup_pending)
|
71
|
-
deltas = [delta for delta_list in delta_list_by_ranges for delta in delta_list]
|
72
|
-
|
73
|
-
delta_stats_processed_list: List[DeltaStats] = _collect_stats_from_deltas(
|
74
|
-
deltas, columns, stat_results_s3_bucket, deltacat_storage
|
75
|
-
)
|
76
|
-
|
77
|
-
for delta_column_stats in delta_stats_processed_list:
|
78
|
-
assert (
|
79
|
-
len(delta_column_stats.column_stats) > 0
|
80
|
-
), f"Expected columns of `{delta_column_stats}` to be non-empty"
|
81
|
-
stream_position = delta_column_stats.column_stats[
|
82
|
-
0
|
83
|
-
].manifest_stats.delta_locator.stream_position
|
84
|
-
delta_stream_range_stats[stream_position] = delta_column_stats
|
85
|
-
|
86
|
-
return delta_stream_range_stats
|
87
|
-
|
88
|
-
|
89
|
-
def collect_from_deltas(
|
90
|
-
deltas: List[Delta],
|
91
|
-
stat_types: Set[StatsType],
|
92
|
-
columns: Optional[List[str]] = None,
|
93
|
-
stat_results_s3_bucket: Optional[str] = None,
|
94
|
-
deltacat_storage=unimplemented_deltacat_storage,
|
95
|
-
) -> StatsResult:
|
96
|
-
"""
|
97
|
-
Variant of the `collect` function that takes a list of deltas and computes
|
98
|
-
the aggregate of all the delta stats.
|
99
|
-
"""
|
100
|
-
if columns is None and deltas:
|
101
|
-
delta_locator: DeltaLocator = deltas[0].locator
|
102
|
-
columns = deltacat_storage.get_table_version_column_names(
|
103
|
-
delta_locator.namespace,
|
104
|
-
delta_locator.table_name,
|
105
|
-
delta_locator.table_version,
|
106
|
-
)
|
107
|
-
|
108
|
-
delta_stats_processed_list: List[DeltaStats] = _collect_stats_from_deltas(
|
109
|
-
deltas, columns, stat_results_s3_bucket, deltacat_storage
|
110
|
-
)
|
111
|
-
|
112
|
-
return StatsResult.merge(
|
113
|
-
[delta_ds.stats for delta_ds in delta_stats_processed_list], stat_types
|
114
|
-
)
|
115
|
-
|
116
|
-
|
117
|
-
def _collect_stats_from_deltas(
|
118
|
-
deltas: List[Delta],
|
119
|
-
columns: Optional[List[str]] = None,
|
120
|
-
stat_results_s3_bucket: Optional[str] = None,
|
121
|
-
deltacat_storage=unimplemented_deltacat_storage,
|
122
|
-
) -> List[DeltaStats]:
|
123
|
-
delta_cache_lookup_pending: List[ObjectRef[DeltaStatsCacheResult]] = []
|
124
|
-
delta_stats_compute_pending: List[ObjectRef[DeltaStats]] = []
|
125
|
-
|
126
|
-
for delta in deltas:
|
127
|
-
if stat_results_s3_bucket:
|
128
|
-
promise: ObjectRef[DeltaStatsCacheResult] = read_cached_delta_stats.remote(
|
129
|
-
delta, columns, stat_results_s3_bucket
|
130
|
-
)
|
131
|
-
delta_cache_lookup_pending.append(promise)
|
132
|
-
continue
|
133
|
-
|
134
|
-
delta_stats_compute_pending.append(
|
135
|
-
get_delta_stats.remote(delta.locator, columns, deltacat_storage)
|
136
|
-
)
|
137
|
-
|
138
|
-
return _process_stats(
|
139
|
-
delta_cache_lookup_pending,
|
140
|
-
delta_stats_compute_pending,
|
141
|
-
stat_results_s3_bucket,
|
142
|
-
deltacat_storage,
|
143
|
-
)
|
144
|
-
|
145
|
-
|
146
|
-
def _process_stats(
|
147
|
-
delta_cache_lookup_pending: List[ObjectRef[DeltaStatsCacheResult]],
|
148
|
-
delta_stats_compute_pending: List[ObjectRef[DeltaStats]],
|
149
|
-
stat_results_s3_bucket: Optional[str] = None,
|
150
|
-
deltacat_storage=unimplemented_deltacat_storage,
|
151
|
-
) -> List[DeltaStats]:
|
152
|
-
if stat_results_s3_bucket:
|
153
|
-
delta_stats_processed_list: List[DeltaStats] = _resolve_pending_stats_and_cache(
|
154
|
-
delta_cache_lookup_pending, stat_results_s3_bucket, deltacat_storage
|
155
|
-
)
|
156
|
-
else:
|
157
|
-
delta_stats_processed_list: List[DeltaStats] = _resolve_pending_stats(
|
158
|
-
delta_stats_compute_pending
|
159
|
-
)
|
160
|
-
|
161
|
-
return delta_stats_processed_list
|
162
|
-
|
163
|
-
|
164
|
-
def _resolve_pending_stats_and_cache(
|
165
|
-
delta_cache_lookup_pending: List[ObjectRef[DeltaStatsCacheResult]],
|
166
|
-
stat_results_s3_bucket: str,
|
167
|
-
deltacat_storage,
|
168
|
-
) -> List[DeltaStats]:
|
169
|
-
delta_stats_cached_list, delta_stats_pending_list = _get_cached_and_pending_stats(
|
170
|
-
delta_cache_lookup_pending, deltacat_storage
|
171
|
-
)
|
172
|
-
delta_stats_resolved_list: List[DeltaStats] = _resolve_pending_stats(
|
173
|
-
delta_stats_pending_list
|
174
|
-
)
|
175
|
-
|
176
|
-
# Cache the stats into the file store
|
177
|
-
delta_stats_to_cache: List[ObjectRef] = [
|
178
|
-
cache_delta_column_stats.remote(stat_results_s3_bucket, dcs)
|
179
|
-
for dataset_stats in delta_stats_resolved_list
|
180
|
-
for dcs in dataset_stats.column_stats
|
181
|
-
]
|
182
|
-
ray.get(delta_stats_to_cache)
|
183
|
-
|
184
|
-
return [*delta_stats_cached_list, *delta_stats_resolved_list]
|
185
|
-
|
186
|
-
|
187
|
-
def _get_cached_and_pending_stats(
|
188
|
-
discover_deltas_pending: List[ObjectRef[DeltaStatsCacheResult]],
|
189
|
-
deltacat_storage=unimplemented_deltacat_storage,
|
190
|
-
) -> Tuple[List[DeltaStats], List[ObjectRef[DeltaStats]]]:
|
191
|
-
"""
|
192
|
-
Returns a tuple of a list of delta stats fetched from the cache, and a list of Ray tasks which will
|
193
|
-
calculate the stats for deltas on cache miss.
|
194
|
-
"""
|
195
|
-
delta_stats_processed: List[DeltaStats] = []
|
196
|
-
delta_stats_pending: List[ObjectRef[DeltaStats]] = []
|
197
|
-
while discover_deltas_pending:
|
198
|
-
ready, discover_deltas_pending = ray.wait(discover_deltas_pending)
|
199
|
-
|
200
|
-
cached_results: List[DeltaStatsCacheResult] = ray.get(ready)
|
201
|
-
for cached_result in cached_results:
|
202
|
-
if cached_result.hits:
|
203
|
-
delta_stats_processed.append(cached_result.hits)
|
204
|
-
|
205
|
-
if cached_result.misses:
|
206
|
-
missed_column_names: List[str] = cached_result.misses.column_names
|
207
|
-
delta_locator: DeltaLocator = cached_result.misses.delta_locator
|
208
|
-
delta_stats_pending.append(
|
209
|
-
get_delta_stats.remote(
|
210
|
-
delta_locator, missed_column_names, deltacat_storage
|
211
|
-
)
|
212
|
-
)
|
213
|
-
|
214
|
-
return delta_stats_processed, delta_stats_pending
|
215
|
-
|
216
|
-
|
217
|
-
def _resolve_pending_stats(
|
218
|
-
delta_stats_pending_list: List[ObjectRef[DeltaStats]],
|
219
|
-
) -> List[DeltaStats]:
|
220
|
-
delta_stats_processed_list: List[DeltaStats] = []
|
221
|
-
while delta_stats_pending_list:
|
222
|
-
ready, delta_stats_pending_list = ray.wait(delta_stats_pending_list)
|
223
|
-
processed_stats_batch: List[DeltaStats] = ray.get(ready)
|
224
|
-
delta_stats_processed_list.extend(processed_stats_batch)
|
225
|
-
|
226
|
-
return delta_stats_processed_list
|
deltacat/compute/stats/models/__init__.py
DELETED
File without changes
|
deltacat/compute/stats/models/delta_column_stats.py
DELETED
@@ -1,98 +0,0 @@
|
|
1
|
-
# Allow classes to use self-referencing Type hints in Python 3.7.
|
2
|
-
from __future__ import annotations
|
3
|
-
|
4
|
-
from typing import Any, Dict, List, Optional
|
5
|
-
|
6
|
-
from deltacat.compute.stats.models.manifest_entry_stats import ManifestEntryStats
|
7
|
-
from deltacat.compute.stats.models.stats_result import StatsResult
|
8
|
-
from deltacat.compute.stats.types import StatsType
|
9
|
-
|
10
|
-
|
11
|
-
class DeltaColumnStats(dict):
|
12
|
-
"""
|
13
|
-
Stats container for an individual column of a Delta.
|
14
|
-
Provides distinct stats results for each manifest entry of the Delta.
|
15
|
-
|
16
|
-
Example:
|
17
|
-
Manifest Entry 1
|
18
|
-
=======
|
19
|
-
foo bar baz
|
20
|
-
A B C
|
21
|
-
D E F
|
22
|
-
|
23
|
-
Manifest Entry 2
|
24
|
-
=======
|
25
|
-
foo bar baz
|
26
|
-
G H I
|
27
|
-
J K L
|
28
|
-
|
29
|
-
DeltaColumnStats("foo",
|
30
|
-
ManifestEntryStats([
|
31
|
-
StatsResult([A, D]), # Manifest Entry 1
|
32
|
-
StatsResult([G, J]), # Manifest Entry 2
|
33
|
-
]))
|
34
|
-
DeltaColumnStats("bar",
|
35
|
-
ManifestEntryStats([
|
36
|
-
StatsResult([B, E]), # Manifest Entry 1
|
37
|
-
StatsResult([H, K]), # Manifest Entry 2
|
38
|
-
]))
|
39
|
-
DeltaColumnStats("baz",
|
40
|
-
ManifestEntryStats([
|
41
|
-
StatsResult([C, F]), # Manifest Entry 1
|
42
|
-
StatsResult([I, L]), # Manifest Entry 2
|
43
|
-
]))
|
44
|
-
"""
|
45
|
-
|
46
|
-
@staticmethod
|
47
|
-
def of(column: str, manifest_stats: ManifestEntryStats) -> DeltaColumnStats:
|
48
|
-
"""
|
49
|
-
Creates a container of a column name and the column stats for one or more manifest entries.
|
50
|
-
"""
|
51
|
-
dcs = DeltaColumnStats()
|
52
|
-
dcs["column"] = column
|
53
|
-
dcs["manifestStats"] = manifest_stats
|
54
|
-
|
55
|
-
if manifest_stats:
|
56
|
-
# Omit row count for columnar-centric stats
|
57
|
-
dcs["stats"] = dcs._merge_manifest_stats()
|
58
|
-
|
59
|
-
return dcs
|
60
|
-
|
61
|
-
@staticmethod
|
62
|
-
def build_from_dict(delta_column_stats: List[str, Any]) -> List[DeltaColumnStats]:
|
63
|
-
return DeltaColumnStats.of(
|
64
|
-
delta_column_stats["column"],
|
65
|
-
ManifestEntryStats.build_from_dict(delta_column_stats["manifestStats"]),
|
66
|
-
)
|
67
|
-
|
68
|
-
@property
|
69
|
-
def column(self) -> str:
|
70
|
-
"""Returns the column name."""
|
71
|
-
return self.get("column")
|
72
|
-
|
73
|
-
@property
|
74
|
-
def manifest_stats(self) -> Optional[ManifestEntryStats]:
|
75
|
-
"""Returns a container that represents stats at the manifest level.
|
76
|
-
|
77
|
-
A container holds a list of computed stats for each manifest entry.
|
78
|
-
"""
|
79
|
-
val: Dict[str, Any] = self.get("manifestStats")
|
80
|
-
if val is not None and not isinstance(val, ManifestEntryStats):
|
81
|
-
self["manifestStats"] = val = ManifestEntryStats(val)
|
82
|
-
return val
|
83
|
-
|
84
|
-
@property
|
85
|
-
def stats(self) -> Optional[StatsResult]:
|
86
|
-
"""Combines the numerical stats for every manifest entry and returns it."""
|
87
|
-
val: Dict[str, Any] = self.get("stats")
|
88
|
-
if val is not None and not isinstance(val, StatsResult):
|
89
|
-
self["stats"] = val = StatsResult(val)
|
90
|
-
elif val is None and self.manifest_stats:
|
91
|
-
self["stats"] = val = self._merge_manifest_stats()
|
92
|
-
|
93
|
-
return val
|
94
|
-
|
95
|
-
def _merge_manifest_stats(self) -> StatsResult:
|
96
|
-
return StatsResult.merge(
|
97
|
-
self.manifest_stats.stats, {StatsType.PYARROW_TABLE_BYTES}
|
98
|
-
)
|
deltacat/compute/stats/models/delta_stats.py
DELETED
@@ -1,233 +0,0 @@
|
|
1
|
-
# Allow classes to use self-referencing Type hints in Python 3.7.
|
2
|
-
from __future__ import annotations
|
3
|
-
|
4
|
-
from collections import defaultdict
|
5
|
-
from typing import Any, Dict, List, NamedTuple, Optional, Set
|
6
|
-
|
7
|
-
from deltacat.compute.stats.models.delta_column_stats import DeltaColumnStats
|
8
|
-
from deltacat.compute.stats.models.manifest_entry_stats import ManifestEntryStats
|
9
|
-
from deltacat.compute.stats.models.stats_result import StatsResult
|
10
|
-
from deltacat.compute.stats.types import StatsType
|
11
|
-
from deltacat.storage import DeltaLocator
|
12
|
-
|
13
|
-
|
14
|
-
class DeltaStats(dict):
|
15
|
-
"""
|
16
|
-
Stats container for all columns of a delta.
|
17
|
-
|
18
|
-
Provides distinct stats for each delta manifest entry, aggregate stats across all manifest entries,
|
19
|
-
and a DeltaColumnStats reference for each column.
|
20
|
-
|
21
|
-
Each DeltaColumnStats has a column name and a ManifestEntryStats object,
|
22
|
-
which contains column-level stats for each delta manifest entry.
|
23
|
-
|
24
|
-
Example of visual representation:
|
25
|
-
Manifest Entry 1
|
26
|
-
=======
|
27
|
-
foo bar baz
|
28
|
-
A B C
|
29
|
-
D E F
|
30
|
-
|
31
|
-
Manifest Entry 2
|
32
|
-
=======
|
33
|
-
foo bar baz
|
34
|
-
G H I
|
35
|
-
J K L
|
36
|
-
|
37
|
-
DeltaStats([
|
38
|
-
DeltaColumnStats("foo",
|
39
|
-
ManifestEntryStats([
|
40
|
-
StatsResult([A, D]), # Manifest Entry 1
|
41
|
-
StatsResult([G, J]), # Manifest Entry 2
|
42
|
-
]))
|
43
|
-
DeltaColumnStats("bar",
|
44
|
-
ManifestEntryStats([
|
45
|
-
StatsResult([B, E]), # Manifest Entry 1
|
46
|
-
StatsResult([H, K]), # Manifest Entry 2
|
47
|
-
]))
|
48
|
-
DeltaColumnStats("baz",
|
49
|
-
ManifestEntryStats([
|
50
|
-
StatsResult([C, F]), # Manifest Entry 1
|
51
|
-
StatsResult([I, L]), # Manifest Entry 2
|
52
|
-
]))
|
53
|
-
], Stats(AllDeltaColumnStats))
|
54
|
-
"""
|
55
|
-
|
56
|
-
@staticmethod
|
57
|
-
def of(column_stats: List[DeltaColumnStats]) -> DeltaStats:
|
58
|
-
ds = DeltaStats()
|
59
|
-
ds["column_stats"] = column_stats
|
60
|
-
ds["stats"] = DeltaStats.get_delta_stats(column_stats)
|
61
|
-
return ds
|
62
|
-
|
63
|
-
@staticmethod
|
64
|
-
def build_from_dict(delta_stats: dict) -> DeltaStats:
|
65
|
-
delta_column_stats_list = []
|
66
|
-
for dcs in delta_stats["column_stats"]:
|
67
|
-
delta_column_stats_list.append(DeltaColumnStats.build_from_dict(dcs))
|
68
|
-
return DeltaStats.of(delta_column_stats_list)
|
69
|
-
|
70
|
-
@property
|
71
|
-
def column_stats(self) -> List[DeltaColumnStats]:
|
72
|
-
"""
|
73
|
-
Returns a list of stats associated to each column in this delta.
|
74
|
-
"""
|
75
|
-
return self["column_stats"]
|
76
|
-
|
77
|
-
@property
|
78
|
-
def stats(self) -> Optional[StatsResult]:
|
79
|
-
"""Returns a StatsResult object that represents this delta, aggregated by the column stats of this delta."""
|
80
|
-
val: Dict[str, Any] = self.get("stats")
|
81
|
-
if val is not None and not isinstance(val, StatsResult):
|
82
|
-
self["stats"] = val = StatsResult(val)
|
83
|
-
elif val is None and self.column_stats:
|
84
|
-
self["stats"] = val = DeltaStats.get_delta_stats(self.column_stats)
|
85
|
-
|
86
|
-
return val
|
87
|
-
|
88
|
-
@property
|
89
|
-
def columns(self) -> List[str]:
|
90
|
-
"""Returns a list of column names associated to this delta.
|
91
|
-
|
92
|
-
Returns:
|
93
|
-
A list of column names
|
94
|
-
"""
|
95
|
-
return DeltaStats.get_column_names(self.column_stats)
|
96
|
-
|
97
|
-
def manifest_entry_stats(self, manifest_entry_idx: int) -> StatsResult:
|
98
|
-
"""Calculate the stats of a manifest entry by combining its columnar stats.
|
99
|
-
|
100
|
-
Args:
|
101
|
-
manifest_entry_idx: The manifest entry table to calculate stats for
|
102
|
-
|
103
|
-
Returns:
|
104
|
-
Stats for the manifest entry.
|
105
|
-
"""
|
106
|
-
return StatsResult.merge(
|
107
|
-
DeltaStats.get_manifest_entry_column_stats(
|
108
|
-
self.column_stats, manifest_entry_idx
|
109
|
-
),
|
110
|
-
record_row_count_once=True,
|
111
|
-
)
|
112
|
-
|
113
|
-
def manifest_entry_column_stats(self, manifest_entry_idx: int) -> List[StatsResult]:
|
114
|
-
"""Fetch a list of stats for each column in a manifest entry.
|
115
|
-
|
116
|
-
Args:
|
117
|
-
manifest_entry_idx: The manifest entry table to calculate stats for
|
118
|
-
|
119
|
-
Returns:
|
120
|
-
A list of columnar stats for the manifest entry
|
121
|
-
"""
|
122
|
-
return DeltaStats.get_manifest_entry_column_stats(
|
123
|
-
self.column_stats, manifest_entry_idx
|
124
|
-
)
|
125
|
-
|
126
|
-
@staticmethod
|
127
|
-
def get_manifest_entry_column_stats(
|
128
|
-
columns: List[DeltaColumnStats], manifest_entry_idx: int
|
129
|
-
) -> List[StatsResult]:
|
130
|
-
"""Helper method to provide a list of columnar stats for a specific manifest entry.
|
131
|
-
|
132
|
-
Returns:
|
133
|
-
A list of columnar stats for the manifest entry
|
134
|
-
"""
|
135
|
-
dataset_columnar_stats_list: List[ManifestEntryStats] = [
|
136
|
-
column.manifest_stats
|
137
|
-
for column in columns
|
138
|
-
if column.manifest_stats is not None
|
139
|
-
]
|
140
|
-
try:
|
141
|
-
return [
|
142
|
-
stats.stats[manifest_entry_idx] for stats in dataset_columnar_stats_list
|
143
|
-
]
|
144
|
-
except IndexError:
|
145
|
-
sci: ManifestEntryStats = dataset_columnar_stats_list[0]
|
146
|
-
raise ValueError(
|
147
|
-
f"Table index {manifest_entry_idx} is not present in this dataset of {sci.delta_locator} "
|
148
|
-
f"with manifest table count of {len(sci.stats)}"
|
149
|
-
)
|
150
|
-
|
151
|
-
@staticmethod
|
152
|
-
def get_column_names(columns: List[DeltaColumnStats]) -> List[str]:
|
153
|
-
"""Helper method to get the names of each column from a list of delta column stats
|
154
|
-
|
155
|
-
Args:
|
156
|
-
columns: A list of delta column stats
|
157
|
-
|
158
|
-
Returns:
|
159
|
-
A list of column names
|
160
|
-
"""
|
161
|
-
return [column_stats.column for column_stats in columns] if columns else []
|
162
|
-
|
163
|
-
@staticmethod
|
164
|
-
def get_delta_stats(
|
165
|
-
columns: List[DeltaColumnStats], stat_types: Optional[Set[StatsType]] = None
|
166
|
-
) -> Optional[StatsResult]:
|
167
|
-
"""Calculate the sum of provided column stats and return it
|
168
|
-
|
169
|
-
Args:
|
170
|
-
columns: A list of delta column stats
|
171
|
-
|
172
|
-
Returns:
|
173
|
-
Stats for the calculated sum
|
174
|
-
"""
|
175
|
-
assert columns and len(columns) > 0, (
|
176
|
-
f"Expected columns `{columns}` of type `{type(columns)}` "
|
177
|
-
f"to be a non-empty list of DeltaColumnStats"
|
178
|
-
)
|
179
|
-
|
180
|
-
assert all(
|
181
|
-
[col.manifest_stats for col in columns]
|
182
|
-
), f"Expected stats completion info to be present in each item of {columns} "
|
183
|
-
|
184
|
-
manifest_entry_count = len(columns[0].manifest_stats.stats)
|
185
|
-
column_stats_map: Dict[str, List[Optional[StatsResult]]] = defaultdict(
|
186
|
-
lambda: [None] * manifest_entry_count
|
187
|
-
)
|
188
|
-
|
189
|
-
for column_stats in columns:
|
190
|
-
for file_idx, entry_stats in enumerate(column_stats.manifest_stats.stats):
|
191
|
-
column_stats_map[column_stats.column][file_idx] = entry_stats
|
192
|
-
|
193
|
-
return DeltaStats._merge_stats_from_columns_to_dataset(
|
194
|
-
DeltaStats.get_column_names(columns),
|
195
|
-
column_stats_map,
|
196
|
-
manifest_entry_count,
|
197
|
-
stat_types,
|
198
|
-
)
|
199
|
-
|
200
|
-
@staticmethod
|
201
|
-
def _merge_stats_from_columns_to_dataset(
|
202
|
-
column_names: List[str],
|
203
|
-
column_stats: Dict[str, List[Optional[StatsResult]]],
|
204
|
-
manifest_entries_size: int,
|
205
|
-
stat_types: Optional[Set[StatsType]] = None,
|
206
|
-
) -> StatsResult:
|
207
|
-
manifest_entry_stats_summary_list: List[StatsResult] = []
|
208
|
-
for manifest_entry_idx in range(manifest_entries_size):
|
209
|
-
curr_manifest_entry_column_stats_list: List[StatsResult] = []
|
210
|
-
for column_name in column_names:
|
211
|
-
current_table_column_stats: StatsResult = column_stats[column_name][
|
212
|
-
manifest_entry_idx
|
213
|
-
]
|
214
|
-
curr_manifest_entry_column_stats_list.append(current_table_column_stats)
|
215
|
-
|
216
|
-
curr_manifest_entry_stats_summary = StatsResult.merge(
|
217
|
-
curr_manifest_entry_column_stats_list,
|
218
|
-
stat_types,
|
219
|
-
record_row_count_once=True,
|
220
|
-
)
|
221
|
-
manifest_entry_stats_summary_list.append(curr_manifest_entry_stats_summary)
|
222
|
-
return StatsResult.merge(manifest_entry_stats_summary_list, stat_types)
|
223
|
-
|
224
|
-
|
225
|
-
class DeltaStatsCacheMiss(NamedTuple):
|
226
|
-
"""A helper class for cache miss results from DeltaStatsCacheResult.
|
227
|
-
|
228
|
-
`column_names` represents missing dataset column names from the file system (ex: S3).
|
229
|
-
delta_locator` is tied to the missing dataset columns and provided for future calculations.
|
230
|
-
"""
|
231
|
-
|
232
|
-
column_names: List[str]
|
233
|
-
delta_locator: DeltaLocator
|
deltacat/compute/stats/models/delta_stats_cache_result.py
DELETED
@@ -1,49 +0,0 @@
|
|
1
|
-
# Allow classes to use self-referencing Type hints in Python 3.7.
|
2
|
-
from __future__ import annotations
|
3
|
-
|
4
|
-
from typing import Optional
|
5
|
-
|
6
|
-
from deltacat.compute.stats.models.delta_stats import DeltaStats, DeltaStatsCacheMiss
|
7
|
-
|
8
|
-
|
9
|
-
class DeltaStatsCacheResult(dict):
|
10
|
-
"""A helper class containing the results from a cache query.
|
11
|
-
|
12
|
-
Stats are fetched and cached at the column level, and each column may represent one
|
13
|
-
or more manifest entries.
|
14
|
-
"""
|
15
|
-
|
16
|
-
@staticmethod
|
17
|
-
def of(
|
18
|
-
hits: Optional[DeltaStats], misses: Optional[DeltaStatsCacheMiss]
|
19
|
-
) -> DeltaStatsCacheResult:
|
20
|
-
cds = DeltaStatsCacheResult()
|
21
|
-
cds["hits"] = hits
|
22
|
-
cds["misses"] = misses
|
23
|
-
return cds
|
24
|
-
|
25
|
-
@property
|
26
|
-
def hits(self) -> Optional[DeltaStats]:
|
27
|
-
"""Retrieve stats that were found in the cache
|
28
|
-
|
29
|
-
`hits` represents a DeltaStats object that contains dataset-wide statistics across
|
30
|
-
many of its tables (or manifest entries) and is composed of one or more column-wide
|
31
|
-
DeltaColumnStats.
|
32
|
-
|
33
|
-
Returns:
|
34
|
-
A delta wide stats container
|
35
|
-
"""
|
36
|
-
return self["hits"]
|
37
|
-
|
38
|
-
@property
|
39
|
-
def misses(self) -> Optional[DeltaStatsCacheMiss]:
|
40
|
-
"""Retrieve stats that were missing from the cache
|
41
|
-
|
42
|
-
`misses` represents a DeltaStatsCacheMiss object that contains a list of
|
43
|
-
column names that were not found in the file system (ex: S3) and a `delta_locator`
|
44
|
-
as a reference to the delta metadata tied to the missing dataset columns.
|
45
|
-
|
46
|
-
Returns:
|
47
|
-
A tuple with metadata regarding the cache miss
|
48
|
-
"""
|
49
|
-
return self["misses"]
|
deltacat/compute/stats/models/manifest_entry_stats.py
DELETED
@@ -1,72 +0,0 @@
|
|
1
|
-
# Allow classes to use self-referencing Type hints in Python 3.7.
|
2
|
-
from __future__ import annotations
|
3
|
-
|
4
|
-
from typing import Any, Dict, List
|
5
|
-
|
6
|
-
import pyarrow as pa
|
7
|
-
|
8
|
-
from deltacat.compute.stats.models.stats_result import StatsResult
|
9
|
-
from deltacat.storage import DeltaLocator
|
10
|
-
|
11
|
-
|
12
|
-
class ManifestEntryStats(dict):
|
13
|
-
"""Holds computed statistics for one or more manifest entries (tables) and their corresponding delta locator.
|
14
|
-
|
15
|
-
To be stored/retrieved from a file system (ex: S3).
|
16
|
-
"""
|
17
|
-
|
18
|
-
@staticmethod
|
19
|
-
def of(
|
20
|
-
manifest_entries_stats: List[StatsResult], delta_locator: DeltaLocator
|
21
|
-
) -> ManifestEntryStats:
|
22
|
-
"""
|
23
|
-
Creates a stats container that represents a particular manifest.
|
24
|
-
|
25
|
-
`manifest_entries_stats` are a list of distinct stats for each manifest entry file
|
26
|
-
tied to this manifest. `delta_locator` is provided as a reference to the delta where the
|
27
|
-
manifest entries reside.
|
28
|
-
"""
|
29
|
-
|
30
|
-
mes = ManifestEntryStats()
|
31
|
-
mes["deltaLocator"] = delta_locator
|
32
|
-
mes["stats"] = manifest_entries_stats
|
33
|
-
mes["pyarrowVersion"] = pa.__version__
|
34
|
-
return mes
|
35
|
-
|
36
|
-
@staticmethod
|
37
|
-
def build_from_dict(manifest_entries_stats: dict) -> ManifestEntryStats:
|
38
|
-
stats_res_list = []
|
39
|
-
for stats_res in manifest_entries_stats["stats"]:
|
40
|
-
stats_res_list.append(
|
41
|
-
StatsResult.of(stats_res["rowCount"], stats_res["pyarrowTableBytes"])
|
42
|
-
)
|
43
|
-
return ManifestEntryStats.of(
|
44
|
-
stats_res_list, manifest_entries_stats["deltaLocator"]
|
45
|
-
)
|
46
|
-
|
47
|
-
@property
|
48
|
-
def delta_locator(self) -> DeltaLocator:
|
49
|
-
"""Reference to the delta that holds the manifest entries
|
50
|
-
|
51
|
-
Returns:
|
52
|
-
A delta locator object
|
53
|
-
"""
|
54
|
-
val: Dict[str, Any] = self.get("deltaLocator")
|
55
|
-
if val is not None and not isinstance(val, DeltaLocator):
|
56
|
-
self["deltaLocator"] = val = DeltaLocator(val)
|
57
|
-
return val
|
58
|
-
|
59
|
-
@property
|
60
|
-
def stats(self) -> List[StatsResult]:
|
61
|
-
"""
|
62
|
-
Returns a list of distinct stats for each manifest entry file.
|
63
|
-
"""
|
64
|
-
val = self["stats"]
|
65
|
-
return [StatsResult(_) for _ in val] if val else []
|
66
|
-
|
67
|
-
@property
|
68
|
-
def pyarrow_version(self) -> str:
|
69
|
-
"""
|
70
|
-
Read-only property which returns the PyArrow version number as it was written into a file system.
|
71
|
-
"""
|
72
|
-
return self.get("pyarrowVersion")
|