deltacat 1.1.20__py3-none-any.whl → 1.1.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor_v2/private/compaction_utils.py +0 -1
- deltacat/compute/stats/models/__init__.py +0 -0
- deltacat/compute/stats/models/delta_column_stats.py +98 -0
- deltacat/compute/stats/models/delta_stats.py +233 -0
- deltacat/compute/stats/models/delta_stats_cache_result.py +49 -0
- deltacat/compute/stats/models/manifest_entry_stats.py +72 -0
- deltacat/compute/stats/models/stats_result.py +104 -0
- {deltacat-1.1.20.dist-info → deltacat-1.1.22.dist-info}/METADATA +1 -1
- {deltacat-1.1.20.dist-info → deltacat-1.1.22.dist-info}/RECORD +13 -7
- {deltacat-1.1.20.dist-info → deltacat-1.1.22.dist-info}/LICENSE +0 -0
- {deltacat-1.1.20.dist-info → deltacat-1.1.22.dist-info}/WHEEL +0 -0
- {deltacat-1.1.20.dist-info → deltacat-1.1.22.dist-info}/top_level.txt +0 -0
deltacat/__init__.py
CHANGED
File without changes
|
@@ -0,0 +1,98 @@
|
|
1
|
+
# Allow classes to use self-referencing Type hints in Python 3.7.
|
2
|
+
from __future__ import annotations
|
3
|
+
|
4
|
+
from typing import Any, Dict, List, Optional
|
5
|
+
|
6
|
+
from deltacat.compute.stats.models.manifest_entry_stats import ManifestEntryStats
|
7
|
+
from deltacat.compute.stats.models.stats_result import StatsResult
|
8
|
+
from deltacat.compute.stats.types import StatsType
|
9
|
+
|
10
|
+
|
11
|
+
class DeltaColumnStats(dict):
    """
    Stats container for an individual column of a Delta.
    Provides distinct stats results for each manifest entry of the Delta.

    The container is a plain ``dict`` that uses three keys: ``"column"`` (the
    column name), ``"manifestStats"`` (the per-manifest-entry stats) and
    ``"stats"`` (the merged stats, stored once they have been computed).

    Example:
        Manifest Entry 1
        =======
        foo bar baz
        A   B   C
        D   E   F

        Manifest Entry 2
        =======
        foo bar baz
        G   H   I
        J   K   L

        DeltaColumnStats("foo",
            ManifestEntryStats([
                StatsResult([A, D]),     # Manifest Entry 1
                StatsResult([G, J]),     # Manifest Entry 2
            ]))
        DeltaColumnStats("bar",
            ManifestEntryStats([
                StatsResult([B, E]),     # Manifest Entry 1
                StatsResult([H, K]),     # Manifest Entry 2
            ]))
        DeltaColumnStats("baz",
            ManifestEntryStats([
                StatsResult([C, F]),     # Manifest Entry 1
                StatsResult([I, L]),     # Manifest Entry 2
            ]))
    """

    @staticmethod
    def of(column: str, manifest_stats: ManifestEntryStats) -> DeltaColumnStats:
        """
        Creates a container of a column name and the column stats for one or more manifest entries.

        Args:
            column: The column name.
            manifest_stats: Stats for each manifest entry of this column. When
                falsy (``None`` or empty) no merged ``"stats"`` entry is stored.

        Returns:
            A new DeltaColumnStats.
        """
        dcs = DeltaColumnStats()
        dcs["column"] = column
        dcs["manifestStats"] = manifest_stats

        if manifest_stats:
            # Omit row count for columnar-centric stats
            dcs["stats"] = dcs._merge_manifest_stats()

        return dcs

    @staticmethod
    def build_from_dict(delta_column_stats: Dict[str, Any]) -> DeltaColumnStats:
        """
        Rebuilds a DeltaColumnStats from its dictionary form.

        Args:
            delta_column_stats: A mapping with a ``"column"`` entry and a
                ``"manifestStats"`` entry; the latter is handed to
                ``ManifestEntryStats.build_from_dict``.

        Returns:
            A single DeltaColumnStats for the described column.

        Raises:
            KeyError: If ``"column"`` or ``"manifestStats"`` is missing.
        """
        return DeltaColumnStats.of(
            delta_column_stats["column"],
            ManifestEntryStats.build_from_dict(delta_column_stats["manifestStats"]),
        )

    @property
    def column(self) -> str:
        """Returns the column name."""
        return self.get("column")

    @property
    def manifest_stats(self) -> Optional[ManifestEntryStats]:
        """Returns a container that represents stats at the manifest level.

        A container holds a list of computed stats for each manifest entry.
        A stored value that is not yet a ManifestEntryStats is wrapped in one
        and written back, so the conversion happens at most once.
        """
        val: Dict[str, Any] = self.get("manifestStats")
        if val is not None and not isinstance(val, ManifestEntryStats):
            self["manifestStats"] = val = ManifestEntryStats(val)
        return val

    @property
    def stats(self) -> Optional[StatsResult]:
        """Combines the numerical stats for every manifest entry and returns it.

        A stored value that is not yet a StatsResult is wrapped in one. When
        nothing is stored but manifest stats are available, the merged result
        is computed and cached under ``"stats"``. Otherwise ``None`` is returned.
        """
        val: Dict[str, Any] = self.get("stats")
        if val is not None and not isinstance(val, StatsResult):
            self["stats"] = val = StatsResult(val)
        elif val is None and self.manifest_stats:
            self["stats"] = val = self._merge_manifest_stats()

        return val

    def _merge_manifest_stats(self) -> StatsResult:
        """Merges the per-manifest-entry stats, collecting only PyArrow table bytes."""
        return StatsResult.merge(
            self.manifest_stats.stats, {StatsType.PYARROW_TABLE_BYTES}
        )
|
@@ -0,0 +1,233 @@
|
|
1
|
+
# Allow classes to use self-referencing Type hints in Python 3.7.
|
2
|
+
from __future__ import annotations
|
3
|
+
|
4
|
+
from collections import defaultdict
|
5
|
+
from typing import Any, Dict, List, NamedTuple, Optional, Set
|
6
|
+
|
7
|
+
from deltacat.compute.stats.models.delta_column_stats import DeltaColumnStats
|
8
|
+
from deltacat.compute.stats.models.manifest_entry_stats import ManifestEntryStats
|
9
|
+
from deltacat.compute.stats.models.stats_result import StatsResult
|
10
|
+
from deltacat.compute.stats.types import StatsType
|
11
|
+
from deltacat.storage import DeltaLocator
|
12
|
+
|
13
|
+
|
14
|
+
class DeltaStats(dict):
    """
    Stats container for all columns of a delta.

    Provides distinct stats for each delta manifest entry, aggregate stats across all manifest entries,
    and a DeltaColumnStats reference for each column.

    Each DeltaColumnStats has a column name and a ManifestEntryStats object,
    which contains column-level stats for each delta manifest entry.

    Example of visual representation:
        Manifest Entry 1
        =======
        foo bar baz
        A   B   C
        D   E   F

        Manifest Entry 2
        =======
        foo bar baz
        G   H   I
        J   K   L

        DeltaStats([
            DeltaColumnStats("foo",
                ManifestEntryStats([
                    StatsResult([A, D]),     # Manifest Entry 1
                    StatsResult([G, J]),     # Manifest Entry 2
                ]))
            DeltaColumnStats("bar",
                ManifestEntryStats([
                    StatsResult([B, E]),     # Manifest Entry 1
                    StatsResult([H, K]),     # Manifest Entry 2
                ]))
            DeltaColumnStats("baz",
                ManifestEntryStats([
                    StatsResult([C, F]),     # Manifest Entry 1
                    StatsResult([I, L]),     # Manifest Entry 2
                ]))
        ], Stats(AllDeltaColumnStats))
    """

    @staticmethod
    def of(column_stats: List[DeltaColumnStats]) -> DeltaStats:
        """Creates a DeltaStats from per-column stats and eagerly stores the aggregate.

        Args:
            column_stats: A non-empty list of delta column stats.

        Returns:
            A new DeltaStats with ``"column_stats"`` and ``"stats"`` populated.

        Raises:
            AssertionError: If ``column_stats`` is empty or any item has no
                manifest stats (raised by ``get_delta_stats``).
        """
        ds = DeltaStats()
        ds["column_stats"] = column_stats
        ds["stats"] = DeltaStats.get_delta_stats(column_stats)
        return ds

    @staticmethod
    def build_from_dict(delta_stats: dict) -> DeltaStats:
        """Rebuilds a DeltaStats from its dictionary form.

        Only the ``"column_stats"`` entry is read; the aggregate stats are
        recomputed from it rather than taken from the input.
        """
        delta_column_stats_list = [
            DeltaColumnStats.build_from_dict(dcs)
            for dcs in delta_stats["column_stats"]
        ]
        return DeltaStats.of(delta_column_stats_list)

    @property
    def column_stats(self) -> List[DeltaColumnStats]:
        """
        Returns a list of stats associated to each column in this delta.
        """
        return self["column_stats"]

    @property
    def stats(self) -> Optional[StatsResult]:
        """Returns a StatsResult object that represents this delta, aggregated by the column stats of this delta.

        A stored value that is not yet a StatsResult is wrapped in one. When
        nothing is stored but column stats are present, the aggregate is
        computed and cached under ``"stats"``.
        """
        val: Dict[str, Any] = self.get("stats")
        if val is not None and not isinstance(val, StatsResult):
            self["stats"] = val = StatsResult(val)
        elif val is None and self.column_stats:
            self["stats"] = val = DeltaStats.get_delta_stats(self.column_stats)

        return val

    @property
    def columns(self) -> List[str]:
        """Returns a list of column names associated to this delta.

        Returns:
            A list of column names
        """
        return DeltaStats.get_column_names(self.column_stats)

    def manifest_entry_stats(self, manifest_entry_idx: int) -> StatsResult:
        """Calculate the stats of a manifest entry by combining its columnar stats.

        Args:
            manifest_entry_idx: The manifest entry table to calculate stats for

        Returns:
            Stats for the manifest entry.
        """
        return StatsResult.merge(
            DeltaStats.get_manifest_entry_column_stats(
                self.column_stats, manifest_entry_idx
            ),
            # Every column of one entry reports the same rows; count them once.
            record_row_count_once=True,
        )

    def manifest_entry_column_stats(self, manifest_entry_idx: int) -> List[StatsResult]:
        """Fetch a list of stats for each column in a manifest entry.

        Args:
            manifest_entry_idx: The manifest entry table to calculate stats for

        Returns:
            A list of columnar stats for the manifest entry
        """
        return DeltaStats.get_manifest_entry_column_stats(
            self.column_stats, manifest_entry_idx
        )

    @staticmethod
    def get_manifest_entry_column_stats(
        columns: List[DeltaColumnStats], manifest_entry_idx: int
    ) -> List[StatsResult]:
        """Helper method to provide a list of columnar stats for a specific manifest entry.

        Columns whose manifest stats are ``None`` are skipped.

        Args:
            columns: A list of delta column stats
            manifest_entry_idx: Index of the manifest entry to read from each column

        Returns:
            A list of columnar stats for the manifest entry

        Raises:
            ValueError: If ``manifest_entry_idx`` is out of range for any
                column; the originating IndexError is attached as the cause.
        """
        dataset_columnar_stats_list: List[ManifestEntryStats] = [
            column.manifest_stats
            for column in columns
            if column.manifest_stats is not None
        ]
        try:
            return [
                stats.stats[manifest_entry_idx] for stats in dataset_columnar_stats_list
            ]
        except IndexError as err:
            # The message reports the first column's locator and entry count.
            sci: ManifestEntryStats = dataset_columnar_stats_list[0]
            raise ValueError(
                f"Table index {manifest_entry_idx} is not present in this dataset of {sci.delta_locator} "
                f"with manifest table count of {len(sci.stats)}"
            ) from err

    @staticmethod
    def get_column_names(columns: List[DeltaColumnStats]) -> List[str]:
        """Helper method to get the names of each column from a list of delta column stats

        Args:
            columns: A list of delta column stats

        Returns:
            A list of column names (empty when ``columns`` is falsy)
        """
        return [column_stats.column for column_stats in columns] if columns else []

    @staticmethod
    def get_delta_stats(
        columns: List[DeltaColumnStats], stat_types: Optional[Set[StatsType]] = None
    ) -> Optional[StatsResult]:
        """Calculate the sum of provided column stats and return it

        Args:
            columns: A list of delta column stats
            stat_types: If provided, only these stats are included; it is
                forwarded unchanged to ``StatsResult.merge``.

        Returns:
            Stats for the calculated sum

        Raises:
            AssertionError: If ``columns`` is empty or any column has no
                manifest stats.
        """
        assert columns and len(columns) > 0, (
            f"Expected columns `{columns}` of type `{type(columns)}` "
            f"to be a non-empty list of DeltaColumnStats"
        )

        assert all(
            col.manifest_stats for col in columns
        ), f"Expected stats completion info to be present in each item of {columns} "

        # The first column determines how many manifest entries are merged.
        manifest_entry_count = len(columns[0].manifest_stats.stats)
        column_stats_map: Dict[str, List[Optional[StatsResult]]] = defaultdict(
            lambda: [None] * manifest_entry_count
        )

        for column_stats in columns:
            for file_idx, entry_stats in enumerate(column_stats.manifest_stats.stats):
                column_stats_map[column_stats.column][file_idx] = entry_stats

        return DeltaStats._merge_stats_from_columns_to_dataset(
            DeltaStats.get_column_names(columns),
            column_stats_map,
            manifest_entry_count,
            stat_types,
        )

    @staticmethod
    def _merge_stats_from_columns_to_dataset(
        column_names: List[str],
        column_stats: Dict[str, List[Optional[StatsResult]]],
        manifest_entries_size: int,
        stat_types: Optional[Set[StatsType]] = None,
    ) -> StatsResult:
        """Merges columns into one summary per manifest entry, then merges the summaries.

        Args:
            column_names: Column names used to look up ``column_stats``
            column_stats: Per-column list of stats, indexed by manifest entry
            manifest_entries_size: Number of manifest entries to merge
            stat_types: If provided, only these stats are included

        Returns:
            A single StatsResult covering every manifest entry
        """
        manifest_entry_stats_summary_list: List[StatsResult] = []
        for manifest_entry_idx in range(manifest_entries_size):
            curr_manifest_entry_column_stats_list: List[StatsResult] = [
                column_stats[column_name][manifest_entry_idx]
                for column_name in column_names
            ]

            # Within one entry the row count is taken once, not per column.
            curr_manifest_entry_stats_summary = StatsResult.merge(
                curr_manifest_entry_column_stats_list,
                stat_types,
                record_row_count_once=True,
            )
            manifest_entry_stats_summary_list.append(curr_manifest_entry_stats_summary)
        return StatsResult.merge(manifest_entry_stats_summary_list, stat_types)
|
223
|
+
|
224
|
+
|
225
|
+
class DeltaStatsCacheMiss(NamedTuple):
    """A helper class for cache miss results from DeltaStatsCacheResult.

    `column_names` represents missing dataset column names from the file system (ex: S3).
    `delta_locator` is tied to the missing dataset columns and provided for future calculations.
    """

    # Names of the columns whose stats were not found.
    column_names: List[str]
    # Locator of the delta that the missing columns belong to.
    delta_locator: DeltaLocator
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# Allow classes to use self-referencing Type hints in Python 3.7.
|
2
|
+
from __future__ import annotations
|
3
|
+
|
4
|
+
from typing import Optional
|
5
|
+
|
6
|
+
from deltacat.compute.stats.models.delta_stats import DeltaStats, DeltaStatsCacheMiss
|
7
|
+
|
8
|
+
|
9
|
+
class DeltaStatsCacheResult(dict):
    """A helper class containing the results from a cache query.

    Stats are fetched and cached at the column level, and each column may represent one
    or more manifest entries.

    The container is a plain ``dict`` with a ``"hits"`` and a ``"misses"`` entry.
    """

    @staticmethod
    def of(
        hits: Optional[DeltaStats], misses: Optional[DeltaStatsCacheMiss]
    ) -> DeltaStatsCacheResult:
        """Creates a cache result from the stats that were found and those that were not.

        Args:
            hits: Stats found in the cache, or ``None``.
            misses: Description of what was missing from the cache, or ``None``.

        Returns:
            A new DeltaStatsCacheResult holding both values.
        """
        cds = DeltaStatsCacheResult()
        cds["hits"] = hits
        cds["misses"] = misses
        return cds

    @property
    def hits(self) -> Optional[DeltaStats]:
        """Retrieve stats that were found in the cache

        `hits` represents a DeltaStats object that contains dataset-wide statistics across
        many of its tables (or manifest entries) and is composed of one or more column-wide
        DeltaColumnStats.

        Returns:
            A delta wide stats container, or ``None`` when no hits are stored
        """
        # `.get` keeps the Optional contract for instances that lack the key.
        return self.get("hits")

    @property
    def misses(self) -> Optional[DeltaStatsCacheMiss]:
        """Retrieve stats that were missing from the cache

        `misses` represents a DeltaStatsCacheMiss object that contains a list of
        column names that were not found in the file system (ex: S3) and a `delta_locator`
        as a reference to the delta metadata tied to the missing dataset columns.

        Returns:
            A tuple with metadata regarding the cache miss, or ``None`` when no misses are stored
        """
        # `.get` keeps the Optional contract for instances that lack the key.
        return self.get("misses")
|
@@ -0,0 +1,72 @@
|
|
1
|
+
# Allow classes to use self-referencing Type hints in Python 3.7.
|
2
|
+
from __future__ import annotations
|
3
|
+
|
4
|
+
from typing import Any, Dict, List
|
5
|
+
|
6
|
+
import pyarrow as pa
|
7
|
+
|
8
|
+
from deltacat.compute.stats.models.stats_result import StatsResult
|
9
|
+
from deltacat.storage import DeltaLocator
|
10
|
+
|
11
|
+
|
12
|
+
class ManifestEntryStats(dict):
    """Holds computed statistics for one or more manifest entries (tables) and their corresponding delta locator.

    To be stored/retrieved from a file system (ex: S3).

    The container is a plain ``dict`` with the keys ``"deltaLocator"``,
    ``"stats"`` and ``"pyarrowVersion"``.
    """

    @staticmethod
    def of(
        manifest_entries_stats: List[StatsResult], delta_locator: DeltaLocator
    ) -> ManifestEntryStats:
        """
        Creates a stats container that represents a particular manifest.

        `manifest_entries_stats` are a list of distinct stats for each manifest entry file
        tied to this manifest. `delta_locator` is provided as a reference to the delta where the
        manifest entries reside.

        The PyArrow version of the running process is recorded alongside the stats.
        """

        mes = ManifestEntryStats()
        mes["deltaLocator"] = delta_locator
        mes["stats"] = manifest_entries_stats
        mes["pyarrowVersion"] = pa.__version__
        return mes

    @staticmethod
    def build_from_dict(manifest_entries_stats: dict) -> ManifestEntryStats:
        """Rebuilds a ManifestEntryStats from its dictionary form.

        Args:
            manifest_entries_stats: A mapping with a ``"stats"`` list, whose
                items each carry ``"rowCount"`` and ``"pyarrowTableBytes"``,
                and a ``"deltaLocator"`` entry.

        Returns:
            A new ManifestEntryStats. Its PyArrow version is the one of the
            running process, not a value read from the input.
        """
        stats_res_list = [
            StatsResult.of(stats_res["rowCount"], stats_res["pyarrowTableBytes"])
            for stats_res in manifest_entries_stats["stats"]
        ]
        return ManifestEntryStats.of(
            stats_res_list, manifest_entries_stats["deltaLocator"]
        )

    @property
    def delta_locator(self) -> DeltaLocator:
        """Reference to the delta that holds the manifest entries

        A stored value that is not yet a DeltaLocator is wrapped in one and
        written back.

        Returns:
            A delta locator object
        """
        val: Dict[str, Any] = self.get("deltaLocator")
        if val is not None and not isinstance(val, DeltaLocator):
            self["deltaLocator"] = val = DeltaLocator(val)
        return val

    @property
    def stats(self) -> List[StatsResult]:
        """
        Returns a list of distinct stats for each manifest entry file.

        Each item is wrapped in a new StatsResult on every access. An empty
        list is returned when no stats are stored.
        """
        # `.get` lets the empty-list fallback also cover a missing "stats" key.
        val = self.get("stats")
        return [StatsResult(_) for _ in val] if val else []

    @property
    def pyarrow_version(self) -> str:
        """
        Read-only property which returns the PyArrow version number as it was written into a file system.
        """
        return self.get("pyarrowVersion")
|
@@ -0,0 +1,104 @@
|
|
1
|
+
# Allow classes to use self-referencing Type hints in Python 3.7.
|
2
|
+
from __future__ import annotations
|
3
|
+
|
4
|
+
from collections import defaultdict
|
5
|
+
from typing import Any, Dict, List, Optional, Set
|
6
|
+
|
7
|
+
from deltacat.compute.stats.types import ALL_STATS_TYPES, StatsType
|
8
|
+
|
9
|
+
|
10
|
+
class StatsResult(dict):
    """A generic container that holds stats for a single manifest entry file."""

    @staticmethod
    def of(
        row_count: Optional[int] = 0, pyarrow_table_bytes: Optional[int] = 0
    ) -> StatsResult:
        """Static factory for building a stats result object

        Args:
            row_count: The total number of rows of a manifest entry
            pyarrow_table_bytes: The total number of bytes when loaded into memory as a PyArrow Table

        Returns:
            A stats result object
        """
        sr = StatsResult()
        sr[StatsType.ROW_COUNT.value] = row_count
        sr[StatsType.PYARROW_TABLE_BYTES.value] = pyarrow_table_bytes
        return sr

    @property
    def row_count(self) -> int:
        """Represents the row count of a manifest entry file.

        Returns:
            The total number of rows of a manifest entry
        """
        return self[StatsType.ROW_COUNT.value]

    @property
    def pyarrow_table_bytes(self) -> int:
        """Represents the size of a manifest entry file (in bytes) as it was loaded into a PyArrow table.

        Returns:
            The total number of bytes when loaded into memory as a PyArrow Table
        """
        return self[StatsType.PYARROW_TABLE_BYTES.value]

    @staticmethod
    def from_stats_types(stats_types: Dict[StatsType, Any]) -> StatsResult:
        """A helper method to filter a dictionary by supported stats and returns a stats result object.

        Args:
            stats_types: Stats that should be included for constructing a stats result

        Returns:
            A stats result object
        """
        # NOTE(review): `merge` passes keys that are `StatsType.value`, so this
        # filter relies on StatsType members comparing equal to their values.
        # Confirm against deltacat.compute.stats.types.
        return StatsResult(
            {
                k: v
                for k, v in stats_types.items()
                if k in [StatsType.ROW_COUNT, StatsType.PYARROW_TABLE_BYTES]
            }
        )

    @staticmethod
    def merge(
        stats_list: List[StatsResult],
        stat_types: Optional[Set[StatsType]] = None,
        record_row_count_once: bool = False,
    ) -> StatsResult:
        """Helper method to merge any list of StatsResult objects into one.

        StatsResult objects are merged by adding up their numerical stats.
        Entries that are falsy (``None`` or empty) are skipped.
        TODO (ricmiyam): Handle non-numerical stats when they are added

        Args:
            stats_list: A non-empty list of stats to merge.
            stat_types: If provided, the calculation will only include the requested stats.
            record_row_count_once: If optionally set to `True`, then row counts are only added
                from the first populated stats entry. One use case for this is merging table-centric stats
                by columns, since the row count is expected to be the same across different columns.

        Returns:
            A stats result object

        Raises:
            AssertionError: If ``stats_list`` is not a non-empty list.
        """
        assert isinstance(stats_list, list) and len(stats_list) > 0, (
            f"Expected stats list: {stats_list} of type {type(stats_list)} to be a "
            f"non-empty list of StatsResult objects."
        )

        # Fallback to all stat types if not provided
        stats_to_collect: Set = stat_types or ALL_STATS_TYPES

        # Keyed by `StatsType.value`, matching the keys written by `of`.
        merged_stats: Dict[Any, int] = defaultdict(int)
        for stats_result in stats_list:
            if not stats_result:
                # Missing or empty entries contribute nothing to the totals.
                continue
            for stat_type in stats_to_collect:
                merged_stats[stat_type.value] += stats_result[stat_type.value]

        if record_row_count_once and StatsType.ROW_COUNT in stats_to_collect:
            # Use the first populated entry so a leading None/empty entry is
            # skipped here just as it is skipped while summing above.
            first_populated = next((entry for entry in stats_list if entry), None)
            if first_populated is not None:
                merged_stats[StatsType.ROW_COUNT.value] = first_populated[
                    StatsType.ROW_COUNT.value
                ]

        return StatsResult.from_stats_types(merged_stats)
|
@@ -1,4 +1,4 @@
|
|
1
|
-
deltacat/__init__.py,sha256=
|
1
|
+
deltacat/__init__.py,sha256=DA3ai-LdX6TopktWC4tQPRs9GXGxAjSkEz-TeJbnWdE,1778
|
2
2
|
deltacat/constants.py,sha256=TUJLXUJ9xq1Ryil72yLkKR8EDH_Irp5wUg56QstbRNE,2181
|
3
3
|
deltacat/exceptions.py,sha256=7sjk3BuMY5Oo-6OvAfHncZx_OcvtEL47BblWr2F7waE,12740
|
4
4
|
deltacat/logs.py,sha256=EQSDin1deehzz5xlLV1_TrFJrO_IBZ9Ahp7MdL-4cK8,9363
|
@@ -66,7 +66,7 @@ deltacat/compute/compactor_v2/model/merge_file_group.py,sha256=1o86t9lc3K6ZvtViV
|
|
66
66
|
deltacat/compute/compactor_v2/model/merge_input.py,sha256=-SxTE0e67z2V7MiMEVz5aMu4E0k8h3-vqohvUUOC0do,5659
|
67
67
|
deltacat/compute/compactor_v2/model/merge_result.py,sha256=_IZTCStpb4UKiRCJYA3g6EhAqjrw0t9vmoDAN8kIK-Y,436
|
68
68
|
deltacat/compute/compactor_v2/private/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
69
|
-
deltacat/compute/compactor_v2/private/compaction_utils.py,sha256=
|
69
|
+
deltacat/compute/compactor_v2/private/compaction_utils.py,sha256=QKGekJQWL_S1DifnENSQ7PQm5k7x27CoDT0m4QQWBIk,30416
|
70
70
|
deltacat/compute/compactor_v2/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
71
71
|
deltacat/compute/compactor_v2/steps/hash_bucket.py,sha256=1R5xLUkl7GqL1nY-apAgY1czKDEHjIVYSRi9qLOMass,6726
|
72
72
|
deltacat/compute/compactor_v2/steps/merge.py,sha256=LpktsDPfj7Of6RgUw9w1f3Y3OBkPDjvtyXjzFaIDoSo,21771
|
@@ -91,6 +91,12 @@ deltacat/compute/resource_estimation/model.py,sha256=psyagFXdpLGt8DfDqy7c8DWiuXC
|
|
91
91
|
deltacat/compute/resource_estimation/parquet.py,sha256=5_apma4EKbKcm-nfV73-qN2nfnCeyhFW23ZHX3jz0Kw,3158
|
92
92
|
deltacat/compute/stats/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
93
93
|
deltacat/compute/stats/types.py,sha256=cp0lT8nITTKbnkc03OysRjXfcfXzQml9a4wqCnR6kqs,215
|
94
|
+
deltacat/compute/stats/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
95
|
+
deltacat/compute/stats/models/delta_column_stats.py,sha256=-wXjB2c0BC1RDheumjL_j5-DfRNql4WsK9GpMFQI1cg,3300
|
96
|
+
deltacat/compute/stats/models/delta_stats.py,sha256=hBith8_hbF9TVr6HocLAt6RJ_kZZKO4zrGP8VOP05vA,8556
|
97
|
+
deltacat/compute/stats/models/delta_stats_cache_result.py,sha256=mbJYxpZd5jaER_BWrCD2hROFy3p1nNdBrj66nUpc6io,1624
|
98
|
+
deltacat/compute/stats/models/manifest_entry_stats.py,sha256=NCDAe2nPDEI4kOkuwNkRFgGPS-rqQaQqLuaLoKk20KQ,2419
|
99
|
+
deltacat/compute/stats/models/stats_result.py,sha256=XQAlmzhUqRmg4jzEMUAOqcYn1HUOBTMryBH1CCVlet8,3820
|
94
100
|
deltacat/io/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
95
101
|
deltacat/io/dataset.py,sha256=pFU5UfK-fD9C4fIeffJtrA6yVQSgAx2UPbxzQ4GMFL8,3203
|
96
102
|
deltacat/io/file_object_store.py,sha256=HCFeXu9cWXPXVk54MHel_nw3-wIuzhMt2RI6jKzjRYM,1346
|
@@ -204,8 +210,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
|
|
204
210
|
deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
|
205
211
|
deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
|
206
212
|
deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
|
207
|
-
deltacat-1.1.
|
208
|
-
deltacat-1.1.
|
209
|
-
deltacat-1.1.
|
210
|
-
deltacat-1.1.
|
211
|
-
deltacat-1.1.
|
213
|
+
deltacat-1.1.22.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
214
|
+
deltacat-1.1.22.dist-info/METADATA,sha256=uaAF2wK6KSi4pooJGoqJERmH_zBIz7WZgUgORbQCnaA,1733
|
215
|
+
deltacat-1.1.22.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
|
216
|
+
deltacat-1.1.22.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
|
217
|
+
deltacat-1.1.22.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|