deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff shows the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- deltacat/__init__.py +41 -15
- deltacat/aws/clients.py +12 -31
- deltacat/aws/constants.py +1 -1
- deltacat/aws/redshift/__init__.py +7 -2
- deltacat/aws/redshift/model/manifest.py +54 -50
- deltacat/aws/s3u.py +176 -187
- deltacat/catalog/delegate.py +151 -185
- deltacat/catalog/interface.py +78 -97
- deltacat/catalog/model/catalog.py +21 -21
- deltacat/catalog/model/table_definition.py +11 -9
- deltacat/compute/compactor/__init__.py +12 -16
- deltacat/compute/compactor/compaction_session.py +237 -166
- deltacat/compute/compactor/model/delta_annotated.py +60 -44
- deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
- deltacat/compute/compactor/model/delta_file_locator.py +10 -8
- deltacat/compute/compactor/model/materialize_result.py +6 -7
- deltacat/compute/compactor/model/primary_key_index.py +38 -34
- deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
- deltacat/compute/compactor/model/round_completion_info.py +25 -19
- deltacat/compute/compactor/model/sort_key.py +18 -15
- deltacat/compute/compactor/steps/dedupe.py +119 -94
- deltacat/compute/compactor/steps/hash_bucket.py +48 -47
- deltacat/compute/compactor/steps/materialize.py +86 -92
- deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
- deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
- deltacat/compute/compactor/utils/io.py +59 -47
- deltacat/compute/compactor/utils/primary_key_index.py +91 -80
- deltacat/compute/compactor/utils/round_completion_file.py +22 -23
- deltacat/compute/compactor/utils/system_columns.py +33 -45
- deltacat/compute/metastats/meta_stats.py +235 -157
- deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
- deltacat/compute/metastats/stats.py +95 -64
- deltacat/compute/metastats/utils/io.py +100 -53
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
- deltacat/compute/metastats/utils/ray_utils.py +38 -33
- deltacat/compute/stats/basic.py +107 -69
- deltacat/compute/stats/models/delta_column_stats.py +11 -8
- deltacat/compute/stats/models/delta_stats.py +59 -32
- deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
- deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
- deltacat/compute/stats/models/stats_result.py +24 -14
- deltacat/compute/stats/utils/intervals.py +16 -9
- deltacat/compute/stats/utils/io.py +86 -51
- deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
- deltacat/constants.py +4 -13
- deltacat/io/__init__.py +2 -2
- deltacat/io/aws/redshift/redshift_datasource.py +157 -143
- deltacat/io/dataset.py +14 -17
- deltacat/io/read_api.py +36 -33
- deltacat/logs.py +94 -42
- deltacat/storage/__init__.py +18 -8
- deltacat/storage/interface.py +196 -213
- deltacat/storage/model/delta.py +45 -51
- deltacat/storage/model/list_result.py +12 -8
- deltacat/storage/model/namespace.py +4 -5
- deltacat/storage/model/partition.py +42 -42
- deltacat/storage/model/stream.py +29 -30
- deltacat/storage/model/table.py +14 -14
- deltacat/storage/model/table_version.py +32 -31
- deltacat/storage/model/types.py +1 -0
- deltacat/tests/stats/test_intervals.py +11 -24
- deltacat/tests/utils/__init__.py +0 -0
- deltacat/tests/utils/test_record_batch_tables.py +284 -0
- deltacat/types/media.py +3 -4
- deltacat/types/tables.py +31 -21
- deltacat/utils/common.py +5 -11
- deltacat/utils/numpy.py +20 -22
- deltacat/utils/pandas.py +73 -100
- deltacat/utils/performance.py +3 -9
- deltacat/utils/placement.py +259 -230
- deltacat/utils/pyarrow.py +302 -89
- deltacat/utils/ray_utils/collections.py +2 -1
- deltacat/utils/ray_utils/concurrency.py +27 -28
- deltacat/utils/ray_utils/dataset.py +28 -28
- deltacat/utils/ray_utils/performance.py +5 -9
- deltacat/utils/ray_utils/runtime.py +9 -10
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
- deltacat-0.1.12.dist-info/RECORD +110 -0
- deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
deltacat/compute/stats/basic.py
CHANGED
@@ -1,30 +1,33 @@
-import
-from typing import Dict, Set, Tuple, List, Optional
+from typing import Dict, List, Optional, Set, Tuple
 
+import ray
 from ray.types import ObjectRef
 
+from deltacat.compute.stats.models.delta_stats import DeltaStats
 from deltacat.compute.stats.models.delta_stats_cache_result import DeltaStatsCacheResult
 from deltacat.compute.stats.models.stats_result import StatsResult
 from deltacat.compute.stats.types import StatsType
-from deltacat.compute.stats.utils.
+from deltacat.compute.stats.utils.intervals import DeltaRange, merge_intervals
+from deltacat.compute.stats.utils.io import (
+    cache_delta_column_stats,
+    get_delta_stats,
+    get_deltas_from_range,
+    read_cached_delta_stats,
+)
+from deltacat.storage import Delta, DeltaLocator, PartitionLocator
 from deltacat.storage import interface as unimplemented_deltacat_storage
 
 # TODO (ricmiyam): Decouple DeltaCAT from S3-based paths
 # TODO (ricmiyam): Determine cache eviction policy
 
 
 def collect(
+    source_partition_locator: PartitionLocator,
+    delta_stream_position_range_set: Optional[Set[DeltaRange]] = None,
+    columns: Optional[List[str]] = None,
+    stat_results_s3_bucket: Optional[str] = None,
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> Dict[int, DeltaStats]:
     """Collects statistics on deltas, given a set of delta stream position ranges.
 
     Example:
@@ -52,109 +55,139 @@ def collect(
     delta_range_lookup_pending: List[ObjectRef[List[Delta]]] = []
 
     if not columns:
-        columns = deltacat_storage.get_table_version_column_names(
+        columns = deltacat_storage.get_table_version_column_names(
+            source_partition_locator.namespace,
+            source_partition_locator.table_name,
+            source_partition_locator.table_version,
+        )
     for range_pair in merge_intervals(delta_stream_position_range_set):
         begin, end = range_pair
-        promise: ObjectRef[List[Delta]] = get_deltas_from_range.remote(
+        promise: ObjectRef[List[Delta]] = get_deltas_from_range.remote(
+            source_partition_locator, begin, end, deltacat_storage
+        )
         delta_range_lookup_pending.append(promise)
 
     delta_list_by_ranges: List[List[Delta]] = ray.get(delta_range_lookup_pending)
     deltas = [delta for delta_list in delta_list_by_ranges for delta in delta_list]
 
-    delta_stats_processed_list: List[DeltaStats] = _collect_stats_from_deltas(
-        deltacat_storage)
+    delta_stats_processed_list: List[DeltaStats] = _collect_stats_from_deltas(
+        deltas, columns, stat_results_s3_bucket, deltacat_storage
+    )
 
     for delta_column_stats in delta_stats_processed_list:
-        assert
+        assert (
+            len(delta_column_stats.column_stats) > 0
+        ), f"Expected columns of `{delta_column_stats}` to be non-empty"
+        stream_position = delta_column_stats.column_stats[
+            0
+        ].manifest_stats.delta_locator.stream_position
         delta_stream_range_stats[stream_position] = delta_column_stats
 
     return delta_stream_range_stats
 
 
 def collect_from_deltas(
+    deltas: List[Delta],
+    stat_types: Set[StatsType],
+    columns: Optional[List[str]] = None,
+    stat_results_s3_bucket: Optional[str] = None,
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> StatsResult:
     """
     Variant of the `collect` function that takes a list of deltas and computes
     the aggregate of all the delta stats.
     """
     if columns is None and deltas:
         delta_locator: DeltaLocator = deltas[0].locator
-        columns = deltacat_storage.get_table_version_column_names(
+        columns = deltacat_storage.get_table_version_column_names(
+            delta_locator.namespace,
+            delta_locator.table_name,
+            delta_locator.table_version,
+        )
 
-    delta_stats_processed_list: List[DeltaStats] =
+    delta_stats_processed_list: List[DeltaStats] = _collect_stats_from_deltas(
+        deltas, columns, stat_results_s3_bucket, deltacat_storage
+    )
 
-    return StatsResult.merge(
+    return StatsResult.merge(
+        [delta_ds.stats for delta_ds in delta_stats_processed_list], stat_types
+    )
 
 
 def _collect_stats_from_deltas(
+    deltas: List[Delta],
+    columns: Optional[List[str]] = None,
+    stat_results_s3_bucket: Optional[str] = None,
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> List[DeltaStats]:
     delta_cache_lookup_pending: List[ObjectRef[DeltaStatsCacheResult]] = []
     delta_stats_compute_pending: List[ObjectRef[DeltaStats]] = []
 
     for delta in deltas:
         if stat_results_s3_bucket:
-            promise: ObjectRef[DeltaStatsCacheResult] =
+            promise: ObjectRef[DeltaStatsCacheResult] = read_cached_delta_stats.remote(
+                delta, columns, stat_results_s3_bucket
+            )
             delta_cache_lookup_pending.append(promise)
             continue
 
-        delta_stats_compute_pending.append(
+        delta_stats_compute_pending.append(
+            get_delta_stats.remote(delta.locator, columns, deltacat_storage)
+        )
 
-    return _process_stats(
+    return _process_stats(
+        delta_cache_lookup_pending,
+        delta_stats_compute_pending,
+        stat_results_s3_bucket,
+        deltacat_storage,
+    )
 
 
 def _process_stats(
+    delta_cache_lookup_pending: List[ObjectRef[DeltaStatsCacheResult]],
+    delta_stats_compute_pending: List[ObjectRef[DeltaStats]],
+    stat_results_s3_bucket: Optional[str] = None,
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> List[DeltaStats]:
     if stat_results_s3_bucket:
-        delta_stats_processed_list: List[DeltaStats] = _resolve_pending_stats_and_cache(
+        delta_stats_processed_list: List[DeltaStats] = _resolve_pending_stats_and_cache(
+            delta_cache_lookup_pending, stat_results_s3_bucket, deltacat_storage
+        )
     else:
-        delta_stats_processed_list: List[DeltaStats] = _resolve_pending_stats(
+        delta_stats_processed_list: List[DeltaStats] = _resolve_pending_stats(
+            delta_stats_compute_pending
+        )
 
     return delta_stats_processed_list
 
 
-def _resolve_pending_stats_and_cache(
+def _resolve_pending_stats_and_cache(
+    delta_cache_lookup_pending: List[ObjectRef[DeltaStatsCacheResult]],
+    stat_results_s3_bucket: str,
+    deltacat_storage,
+) -> List[DeltaStats]:
+    delta_stats_cached_list, delta_stats_pending_list = _get_cached_and_pending_stats(
+        delta_cache_lookup_pending, deltacat_storage
+    )
+    delta_stats_resolved_list: List[DeltaStats] = _resolve_pending_stats(
+        delta_stats_pending_list
+    )
 
     # Cache the stats into the file store
-    delta_stats_to_cache: List[ObjectRef] = [
+    delta_stats_to_cache: List[ObjectRef] = [
+        cache_delta_column_stats.remote(stat_results_s3_bucket, dcs)
+        for dataset_stats in delta_stats_resolved_list
+        for dcs in dataset_stats.column_stats
+    ]
     ray.get(delta_stats_to_cache)
 
     return [*delta_stats_cached_list, *delta_stats_resolved_list]
 
 
-def _get_cached_and_pending_stats(
+def _get_cached_and_pending_stats(
+    discover_deltas_pending: List[ObjectRef[DeltaStatsCacheResult]],
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> Tuple[List[DeltaStats], List[ObjectRef[DeltaStats]]]:
     """
     Returns a tuple of a list of delta stats fetched from the cache, and a list of Ray tasks which will
     calculate the stats for deltas on cache miss.
@@ -172,13 +205,18 @@ def _get_cached_and_pending_stats(discover_deltas_pending: List[ObjectRef[DeltaS
         if cached_result.misses:
             missed_column_names: List[str] = cached_result.misses.column_names
             delta_locator: DeltaLocator = cached_result.misses.delta_locator
-            delta_stats_pending.append(
+            delta_stats_pending.append(
+                get_delta_stats.remote(
+                    delta_locator, missed_column_names, deltacat_storage
+                )
+            )
 
     return delta_stats_processed, delta_stats_pending
 
 
-def _resolve_pending_stats(
+def _resolve_pending_stats(
+    delta_stats_pending_list: List[ObjectRef[DeltaStats]],
+) -> List[DeltaStats]:
     delta_stats_processed_list: List[DeltaStats] = []
     while delta_stats_pending_list:
         ready, delta_stats_pending_list = ray.wait(delta_stats_pending_list)
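
For orientation, a minimal sketch of how the reworked collect() entry point above is called. The partition locator, column names, bucket, and storage module are placeholders (a concrete deltacat storage implementation must be supplied in place of the unimplemented default), and a running Ray cluster is assumed:

import ray

from deltacat.compute.stats.basic import collect

ray.init()

# Placeholders: a real PartitionLocator for the source partition and a
# concrete deltacat storage module are assumed to exist.
stats_by_stream_position = collect(
    source_partition_locator=my_partition_locator,
    delta_stream_position_range_set={(1, 10), (20, None)},  # None = open-ended bound
    columns=["order_id", "amount"],
    stat_results_s3_bucket="my-stats-cache-bucket",
    deltacat_storage=my_storage_impl,
)

# The result maps each delta stream position found in the ranges to its DeltaStats.
for stream_position, delta_stats in stats_by_stream_position.items():
    print(stream_position, delta_stats.stats)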

deltacat/compute/stats/models/delta_column_stats.py
CHANGED
@@ -1,7 +1,7 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations
 
-from typing import
+from typing import Any, Dict, List, Optional
 
 from deltacat.compute.stats.models.manifest_entry_stats import ManifestEntryStats
 from deltacat.compute.stats.models.stats_result import StatsResult
@@ -42,6 +42,7 @@ class DeltaColumnStats(dict):
             StatsResult([I, L]), # Manifest Entry 2
         ]))
     """
+
     @staticmethod
     def of(column: str, manifest_stats: ManifestEntryStats) -> DeltaColumnStats:
         """
@@ -59,13 +60,14 @@ class DeltaColumnStats(dict):
 
     @staticmethod
     def build_from_dict(delta_column_stats: List[str, Any]) -> List[DeltaColumnStats]:
-        return DeltaColumnStats.of(
+        return DeltaColumnStats.of(
+            delta_column_stats["column"],
+            ManifestEntryStats.build_from_dict(delta_column_stats["manifestStats"]),
+        )
 
     @property
     def column(self) -> str:
-        """Returns the column name.
-        """
+        """Returns the column name."""
         return self.get("column")
 
     @property
@@ -81,8 +83,7 @@ class DeltaColumnStats(dict):
 
     @property
     def stats(self) -> Optional[StatsResult]:
-        """
-        """
+        """Combines the numerical stats for every manifest entry and returns it."""
         val: Dict[str, Any] = self.get("stats")
         if val is not None and not isinstance(val, StatsResult):
             self["stats"] = val = StatsResult(val)
@@ -92,4 +93,6 @@ class DeltaColumnStats(dict):
         return val
 
     def _merge_manifest_stats(self) -> StatsResult:
-        return StatsResult.merge(
+        return StatsResult.merge(
+            self.manifest_stats.stats, {StatsType.PYARROW_TABLE_BYTES}
+        )
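
The build_from_dict factories above imply a serialized layout for cached column stats. A minimal sketch of that layout, inferred only from the dictionary keys read in this diff ("column", "manifestStats", "stats", "rowCount", "pyarrowTableBytes", "deltaLocator"); the values are illustrative and the delta locator payload is a placeholder, not the real locator schema:

from deltacat.compute.stats.models.delta_column_stats import DeltaColumnStats

# Keys mirror those read by build_from_dict above; values are illustrative.
serialized = {
    "column": "order_id",
    "manifestStats": {
        "stats": [
            {"rowCount": 1000, "pyarrowTableBytes": 4096},   # manifest entry 0
            {"rowCount": 2500, "pyarrowTableBytes": 10240},  # manifest entry 1
        ],
        "deltaLocator": {},  # placeholder; a serialized DeltaLocator in practice
    },
}

column_stats = DeltaColumnStats.build_from_dict(serialized)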

deltacat/compute/stats/models/delta_stats.py
CHANGED
@@ -2,7 +2,7 @@
 from __future__ import annotations
 
 from collections import defaultdict
-from typing import
+from typing import Any, Dict, List, NamedTuple, Optional, Set
 
 from deltacat.compute.stats.models.delta_column_stats import DeltaColumnStats
 from deltacat.compute.stats.models.manifest_entry_stats import ManifestEntryStats
@@ -76,8 +76,7 @@ class DeltaStats(dict):
 
     @property
     def stats(self) -> Optional[StatsResult]:
-        """Returns a StatsResult object that represents this delta, aggregated by the column stats of this delta.
-        """
+        """Returns a StatsResult object that represents this delta, aggregated by the column stats of this delta."""
         val: Dict[str, Any] = self.get("stats")
         if val is not None and not isinstance(val, StatsResult):
             self["stats"] = val = StatsResult(val)
@@ -104,8 +103,12 @@ class DeltaStats(dict):
         Returns:
             Stats for the manifest entry.
         """
-        return StatsResult.merge(
+        return StatsResult.merge(
+            DeltaStats.get_manifest_entry_column_stats(
+                self.column_stats, manifest_entry_idx
+            ),
+            record_row_count_once=True,
+        )
 
     def manifest_entry_column_stats(self, manifest_entry_idx: int) -> List[StatsResult]:
         """Fetch a list of stats for each column in a manifest entry.
@@ -116,23 +119,34 @@ class DeltaStats(dict):
         Returns:
             A list of columnar stats for the manifest entry
         """
-        return DeltaStats.get_manifest_entry_column_stats(
+        return DeltaStats.get_manifest_entry_column_stats(
+            self.column_stats, manifest_entry_idx
+        )
 
     @staticmethod
-    def get_manifest_entry_column_stats(
+    def get_manifest_entry_column_stats(
+        columns: List[DeltaColumnStats], manifest_entry_idx: int
+    ) -> List[StatsResult]:
         """Helper method to provide a list of columnar stats for a specific manifest entry.
 
         Returns:
             A list of columnar stats for the manifest entry
         """
-        dataset_columnar_stats_list: List[ManifestEntryStats] = [
+        dataset_columnar_stats_list: List[ManifestEntryStats] = [
+            column.manifest_stats
+            for column in columns
+            if column.manifest_stats is not None
+        ]
         try:
-            return [
+            return [
+                stats.stats[manifest_entry_idx] for stats in dataset_columnar_stats_list
+            ]
         except IndexError:
             sci: ManifestEntryStats = dataset_columnar_stats_list[0]
-            raise ValueError(
+            raise ValueError(
+                f"Table index {manifest_entry_idx} is not present in this dataset of {sci.delta_locator} "
+                f"with manifest table count of {len(sci.stats)}"
+            )
 
     @staticmethod
     def get_column_names(columns: List[DeltaColumnStats]) -> List[str]:
@@ -147,8 +161,9 @@ class DeltaStats(dict):
         return [column_stats.column for column_stats in columns] if columns else []
 
     @staticmethod
-    def get_delta_stats(
+    def get_delta_stats(
+        columns: List[DeltaColumnStats], stat_types: Optional[Set[StatsType]] = None
+    ) -> Optional[StatsResult]:
         """Calculate the sum of provided column stats and return it
 
         Args:
@@ -157,41 +172,52 @@ class DeltaStats(dict):
         Returns:
             Stats for the calculated sum
         """
-        assert columns and len(columns) > 0,
-            f"Expected columns `{columns}` of type `{type(columns)}` "
+        assert columns and len(columns) > 0, (
+            f"Expected columns `{columns}` of type `{type(columns)}` "
             f"to be a non-empty list of DeltaColumnStats"
+        )
 
-        assert all(
+        assert all(
+            [col.manifest_stats for col in columns]
+        ), f"Expected stats completion info to be present in each item of {columns} "
 
         manifest_entry_count = len(columns[0].manifest_stats.stats)
-        column_stats_map: Dict[str, List[Optional[StatsResult]]] =
+        column_stats_map: Dict[str, List[Optional[StatsResult]]] = defaultdict(
+            lambda: [None] * manifest_entry_count
+        )
 
         for column_stats in columns:
            for file_idx, entry_stats in enumerate(column_stats.manifest_stats.stats):
                 column_stats_map[column_stats.column][file_idx] = entry_stats
 
-        return DeltaStats._merge_stats_from_columns_to_dataset(
+        return DeltaStats._merge_stats_from_columns_to_dataset(
+            DeltaStats.get_column_names(columns),
+            column_stats_map,
+            manifest_entry_count,
+            stat_types,
+        )
 
     @staticmethod
-    def _merge_stats_from_columns_to_dataset(
+    def _merge_stats_from_columns_to_dataset(
+        column_names: List[str],
+        column_stats: Dict[str, List[Optional[StatsResult]]],
+        manifest_entries_size: int,
+        stat_types: Optional[Set[StatsType]] = None,
+    ) -> StatsResult:
         manifest_entry_stats_summary_list: List[StatsResult] = []
         for manifest_entry_idx in range(manifest_entries_size):
             curr_manifest_entry_column_stats_list: List[StatsResult] = []
             for column_name in column_names:
-                current_table_column_stats: StatsResult = column_stats[column_name][
+                current_table_column_stats: StatsResult = column_stats[column_name][
+                    manifest_entry_idx
+                ]
                 curr_manifest_entry_column_stats_list.append(current_table_column_stats)
 
-            curr_manifest_entry_stats_summary = StatsResult.merge(
+            curr_manifest_entry_stats_summary = StatsResult.merge(
+                curr_manifest_entry_column_stats_list,
+                stat_types,
+                record_row_count_once=True,
+            )
             manifest_entry_stats_summary_list.append(curr_manifest_entry_stats_summary)
         return StatsResult.merge(manifest_entry_stats_summary_list, stat_types)
 
@@ -202,5 +228,6 @@ class DeltaStatsCacheMiss(NamedTuple):
     `column_names` represents missing dataset column names from the file system (ex: S3).
     delta_locator` is tied to the missing dataset columns and provided for future calculations.
     """
+
     column_names: List[str]
     delta_locator: DeltaLocator

deltacat/compute/stats/models/delta_stats_cache_result.py
CHANGED
@@ -12,8 +12,11 @@ class DeltaStatsCacheResult(dict):
     Stats are fetched and cached at the column level, and each column may represent one
     or more manifest entries.
     """
+
     @staticmethod
-    def of(
+    def of(
+        hits: Optional[DeltaStats], misses: Optional[DeltaStatsCacheMiss]
+    ) -> DeltaStatsCacheResult:
         cds = DeltaStatsCacheResult()
         cds["hits"] = hits
         cds["misses"] = misses

deltacat/compute/stats/models/manifest_entry_stats.py
CHANGED
@@ -1,22 +1,24 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations
 
+from typing import Any, Dict, List
+
 import pyarrow as pa
 
 from deltacat.compute.stats.models.stats_result import StatsResult
 from deltacat.storage import DeltaLocator
 
-from typing import Any, Dict, List
 
 class ManifestEntryStats(dict):
     """Holds computed statistics for one or more manifest entries (tables) and their corresponding delta locator.
 
     To be stored/retrieved from a file system (ex: S3).
     """
+
     @staticmethod
-    def of(
+    def of(
+        manifest_entries_stats: List[StatsResult], delta_locator: DeltaLocator
+    ) -> ManifestEntryStats:
         """
         Creates a stats container that represents a particular manifest.
 
@@ -35,8 +37,12 @@ class ManifestEntryStats(dict):
     def build_from_dict(manifest_entries_stats: dict) -> ManifestEntryStats:
         stats_res_list = []
         for stats_res in manifest_entries_stats["stats"]:
-            stats_res_list.append(
+            stats_res_list.append(
+                StatsResult.of(stats_res["rowCount"], stats_res["pyarrowTableBytes"])
+            )
+        return ManifestEntryStats.of(
+            stats_res_list, manifest_entries_stats["deltaLocator"]
+        )
 
     @property
     def delta_locator(self) -> DeltaLocator:

deltacat/compute/stats/models/stats_result.py
CHANGED
@@ -1,17 +1,19 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations
 
-from typing import Optional, List, Set, Dict, Any
-
 from collections import defaultdict
-from
+from typing import Any, Dict, List, Optional, Set
+
+from deltacat.compute.stats.types import ALL_STATS_TYPES, StatsType
+
 
 class StatsResult(dict):
-    """A generic container that holds stats for a single manifest entry file.
+    """A generic container that holds stats for a single manifest entry file."""
+
     @staticmethod
-    def of(
+    def of(
+        row_count: Optional[int] = 0, pyarrow_table_bytes: Optional[int] = 0
+    ) -> StatsResult:
         """Static factory for building a stats result object
 
         Args:
@@ -54,13 +56,20 @@ class StatsResult(dict):
         Returns:
             A stats result object
         """
-        return StatsResult(
+        return StatsResult(
+            {
+                k: v
+                for k, v in stats_types.items()
+                if k in [StatsType.ROW_COUNT, StatsType.PYARROW_TABLE_BYTES]
+            }
+        )
 
     @staticmethod
-    def merge(
+    def merge(
+        stats_list: List[StatsResult],
+        stat_types: Optional[Set[StatsType]] = None,
+        record_row_count_once: bool = False,
+    ) -> StatsResult:
         """Helper method to merge any list of StatsResult objects into one.
 
         StatsResult objects are merged by adding up their numerical stats.
@@ -75,9 +84,10 @@ class StatsResult(dict):
         Returns:
             A stats result object
         """
-        assert isinstance(stats_list, list) and len(stats_list) > 0,
-            f"Expected stats list: {stats_list} of type {type(stats_list)} to be a "
+        assert isinstance(stats_list, list) and len(stats_list) > 0, (
+            f"Expected stats list: {stats_list} of type {type(stats_list)} to be a "
             f"non-empty list of StatsResult objects."
+        )
 
         # Fallback to all stat types if not provided
         stats_to_collect: Set = stat_types or ALL_STATS_TYPES
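
A minimal sketch of the StatsResult factory and merge helper shown above. The record_row_count_once flag is read here the way delta_stats.py uses it: when folding per-column stats that describe the same manifest entry, the shared row count appears to be counted once rather than once per column; that interpretation is inferred from the call sites, not stated by the package:

from deltacat.compute.stats.models.stats_result import StatsResult

# Two columns of the same manifest entry: identical row counts,
# different in-memory PyArrow table sizes (illustrative values).
col_a = StatsResult.of(1000, 4096)
col_b = StatsResult.of(1000, 8192)

# Merge per-column stats into one entry-level result; with
# record_row_count_once=True the 1000 rows should not be double-counted.
entry_stats = StatsResult.merge([col_a, col_b], record_row_count_once=True)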

deltacat/compute/stats/utils/intervals.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import
+from typing import Iterable, List, Optional, Set, Tuple, Union
 
 DeltaPosition = Optional[int]
 NumericDeltaPosition = Union[int, float]  # float is added here to support math.inf
@@ -38,7 +38,9 @@ def merge_intervals(intervals: Set[DeltaRange]) -> Set[DeltaRange]:
     for interval in intervals_list:
         start, end = interval
         if start > end:
-            raise ValueError(
+            raise ValueError(
+                f"Invalid stream position range interval: ({start}, {end})"
+            )
 
         if merge_start is None and merge_end is None:
             merge_start, merge_end = start, end
@@ -57,7 +59,9 @@ def merge_intervals(intervals: Set[DeltaRange]) -> Set[DeltaRange]:
     return merged
 
 
-def _add_merged_interval(
+def _add_merged_interval(
+    result_set: set, start: NumericDeltaPosition, end: NumericDeltaPosition
+):
     start_pos: DeltaPosition = start if isinstance(start, int) else None
     end_pos: DeltaPosition = end if isinstance(end, int) else None
     result_set.add((start_pos, end_pos))
@@ -67,9 +71,9 @@ def _to_numeric_values(intervals_list: List[DeltaRange]):
     for i, interval in enumerate(intervals_list):
         start, end = _get_validated_interval(interval)
         if start is None:
-            start = float(
+            start = float("-inf")
         if end is None:
-            end = float(
+            end = float("inf")
 
         intervals_list[i] = (start, end)
 
@@ -79,9 +83,12 @@ def _get_validated_interval(interval: DeltaRange) -> DeltaRange:
         raise ValueError(f"Interval {interval} must be a tuple of size 2")
 
     start, end = interval
-    if not (isinstance(start, int) or start is None)
+    if not (isinstance(start, int) or start is None) or not (
+        isinstance(end, int) or end is None
+    ):
+        raise ValueError(
+            f"Invalid stream position value types: "
+            f"({start}, {end}) - ({type(start), type(end)})"
+        )
 
     return start, end