deltacat 1.1.18__py3-none-any.whl → 1.1.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. deltacat/__init__.py +1 -1
  2. deltacat/compute/compactor/model/compact_partition_params.py +76 -0
  3. deltacat/compute/compactor/model/compaction_session_audit_info.py +26 -0
  4. deltacat/compute/compactor/model/delta_annotated.py +16 -9
  5. deltacat/compute/compactor_v2/constants.py +3 -0
  6. deltacat/compute/compactor_v2/private/compaction_utils.py +11 -5
  7. deltacat/compute/compactor_v2/utils/content_type_params.py +185 -34
  8. deltacat/compute/compactor_v2/utils/io.py +28 -14
  9. deltacat/compute/compactor_v2/utils/task_options.py +128 -183
  10. deltacat/compute/resource_estimation/__init__.py +27 -0
  11. deltacat/compute/resource_estimation/delta.py +271 -0
  12. deltacat/compute/resource_estimation/manifest.py +394 -0
  13. deltacat/compute/resource_estimation/model.py +165 -0
  14. deltacat/compute/resource_estimation/parquet.py +108 -0
  15. deltacat/constants.py +5 -0
  16. deltacat/logs.py +8 -0
  17. deltacat/tests/compute/compactor_v2/test_compaction_session.py +157 -0
  18. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +3 -3
  19. deltacat/tests/compute/resource_estimation/test_delta.py +605 -0
  20. deltacat/tests/compute/resource_estimation/test_manifest.py +921 -0
  21. deltacat/tests/compute/test_util_common.py +2 -0
  22. deltacat/tests/test_logs.py +34 -0
  23. deltacat/tests/test_utils/pyarrow.py +15 -5
  24. {deltacat-1.1.18.dist-info → deltacat-1.1.20.dist-info}/METADATA +2 -2
  25. {deltacat-1.1.18.dist-info → deltacat-1.1.20.dist-info}/RECORD +30 -46
  26. deltacat/compute/metastats/meta_stats.py +0 -479
  27. deltacat/compute/metastats/model/__init__.py +0 -0
  28. deltacat/compute/metastats/model/partition_stats_dict.py +0 -34
  29. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +0 -68
  30. deltacat/compute/metastats/stats.py +0 -182
  31. deltacat/compute/metastats/utils/__init__.py +0 -0
  32. deltacat/compute/metastats/utils/constants.py +0 -16
  33. deltacat/compute/metastats/utils/io.py +0 -223
  34. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +0 -18
  35. deltacat/compute/metastats/utils/ray_utils.py +0 -129
  36. deltacat/compute/stats/basic.py +0 -226
  37. deltacat/compute/stats/models/__init__.py +0 -0
  38. deltacat/compute/stats/models/delta_column_stats.py +0 -98
  39. deltacat/compute/stats/models/delta_stats.py +0 -233
  40. deltacat/compute/stats/models/delta_stats_cache_result.py +0 -49
  41. deltacat/compute/stats/models/manifest_entry_stats.py +0 -72
  42. deltacat/compute/stats/models/stats_result.py +0 -104
  43. deltacat/compute/stats/utils/__init__.py +0 -0
  44. deltacat/compute/stats/utils/intervals.py +0 -94
  45. deltacat/compute/stats/utils/io.py +0 -230
  46. deltacat/compute/stats/utils/manifest_stats_file.py +0 -100
  47. deltacat/tests/stats/__init__.py +0 -0
  48. deltacat/tests/stats/test_intervals.py +0 -49
  49. /deltacat/{compute/metastats → tests/compute/resource_estimation}/__init__.py +0 -0
  50. /deltacat/{compute/metastats/config → tests/compute/resource_estimation/data}/__init__.py +0 -0
  51. {deltacat-1.1.18.dist-info → deltacat-1.1.20.dist-info}/LICENSE +0 -0
  52. {deltacat-1.1.18.dist-info → deltacat-1.1.20.dist-info}/WHEEL +0 -0
  53. {deltacat-1.1.18.dist-info → deltacat-1.1.20.dist-info}/top_level.txt +0 -0
@@ -1,226 +0,0 @@
1
- from typing import Dict, List, Optional, Set, Tuple
2
-
3
- import ray
4
- from ray.types import ObjectRef
5
-
6
- from deltacat.compute.stats.models.delta_stats import DeltaStats
7
- from deltacat.compute.stats.models.delta_stats_cache_result import DeltaStatsCacheResult
8
- from deltacat.compute.stats.models.stats_result import StatsResult
9
- from deltacat.compute.stats.types import StatsType
10
- from deltacat.compute.stats.utils.intervals import DeltaRange, merge_intervals
11
- from deltacat.compute.stats.utils.io import (
12
- cache_delta_column_stats,
13
- get_delta_stats,
14
- get_deltas_from_range,
15
- read_cached_delta_stats,
16
- )
17
- from deltacat.storage import Delta, DeltaLocator, PartitionLocator
18
- from deltacat.storage import interface as unimplemented_deltacat_storage
19
-
20
- # TODO (ricmiyam): Decouple DeltaCAT from S3-based paths
21
- # TODO (ricmiyam): Determine cache eviction policy
22
-
23
-
24
def collect(
    source_partition_locator: PartitionLocator,
    delta_stream_position_range_set: Optional[Set[DeltaRange]] = None,
    columns: Optional[List[str]] = None,
    stat_results_s3_bucket: Optional[str] = None,
    deltacat_storage=unimplemented_deltacat_storage,
) -> Dict[int, DeltaStats]:
    """Collects statistics on deltas, given a set of delta stream position ranges.

    Example:
        >>> collect(locator, {(1, 5), (4, 8), (13, 16)})
        {
            1: DeltaStats(),  # DeltaStats for stream positions 1 - 8
            13: DeltaStats()  # DeltaStats for stream positions 13 - 16
        }

    Args:
        source_partition_locator: Reference to the partition locator tied to the given delta stream positions
        delta_stream_position_range_set: A set of intervals with an int type representing finite,
            closed bounded values, and a None type representing unbounded infinity.
            Defaults to the single interval ``(None, None)``.
        columns: Columns can be optionally included to collect stats on specific columns.
            When empty or None, the column names are looked up through
            ``deltacat_storage.get_table_version_column_names``.
        stat_results_s3_bucket: Used as a cache file storage for computed delta stats
        deltacat_storage: Client implementation of the DeltaCAT storage interface

    Returns:
        A mapping of stream positions to their corresponding delta stats.

    Raises:
        AssertionError: If a resolved DeltaStats carries no column stats
            (only when assertions are enabled).
    """
    if delta_stream_position_range_set is None:
        # No range given: use one interval that is unbounded on both ends.
        delta_stream_position_range_set = {(None, None)}

    if not columns:
        columns = deltacat_storage.get_table_version_column_names(
            source_partition_locator.namespace,
            source_partition_locator.table_name,
            source_partition_locator.table_version,
        )

    # Launch one remote delta lookup per merged interval, then block on all of them.
    delta_range_lookup_pending: List[ObjectRef[List[Delta]]] = [
        get_deltas_from_range.remote(
            source_partition_locator, begin, end, deltacat_storage
        )
        for begin, end in merge_intervals(delta_stream_position_range_set)
    ]
    delta_list_by_ranges: List[List[Delta]] = ray.get(delta_range_lookup_pending)
    deltas = [delta for delta_list in delta_list_by_ranges for delta in delta_list]

    delta_stats_processed_list: List[DeltaStats] = _collect_stats_from_deltas(
        deltas, columns, stat_results_s3_bucket, deltacat_storage
    )

    delta_stream_range_stats: Dict[int, DeltaStats] = {}
    for delta_column_stats in delta_stats_processed_list:
        assert (
            len(delta_column_stats.column_stats) > 0
        ), f"Expected columns of `{delta_column_stats}` to be non-empty"
        # The stream position is read from the first column's manifest stats.
        stream_position = delta_column_stats.column_stats[
            0
        ].manifest_stats.delta_locator.stream_position
        delta_stream_range_stats[stream_position] = delta_column_stats

    return delta_stream_range_stats
87
-
88
-
89
def collect_from_deltas(
    deltas: List[Delta],
    stat_types: Set[StatsType],
    columns: Optional[List[str]] = None,
    stat_results_s3_bucket: Optional[str] = None,
    deltacat_storage=unimplemented_deltacat_storage,
) -> StatsResult:
    """
    Variant of `collect` that starts from an explicit list of deltas and
    returns the `StatsResult.merge` of every delta's stats for `stat_types`.

    When `columns` is None and at least one delta is given, the column names
    are looked up from the table version of the first delta's locator.
    """
    if columns is None and deltas:
        first_locator: DeltaLocator = deltas[0].locator
        columns = deltacat_storage.get_table_version_column_names(
            first_locator.namespace,
            first_locator.table_name,
            first_locator.table_version,
        )

    per_delta_stats: List[DeltaStats] = _collect_stats_from_deltas(
        deltas, columns, stat_results_s3_bucket, deltacat_storage
    )
    stats_to_merge = [entry.stats for entry in per_delta_stats]
    return StatsResult.merge(stats_to_merge, stat_types)
115
-
116
-
117
def _collect_stats_from_deltas(
    deltas: List[Delta],
    columns: Optional[List[str]] = None,
    stat_results_s3_bucket: Optional[str] = None,
    deltacat_storage=unimplemented_deltacat_storage,
) -> List[DeltaStats]:
    """Starts one remote task per delta and hands the pending refs to `_process_stats`.

    With a stats bucket, every delta gets a cache lookup task; without one,
    every delta gets a stats computation task. Only one of the two lists is
    ever populated.
    """
    cache_lookups: List[ObjectRef[DeltaStatsCacheResult]] = []
    compute_tasks: List[ObjectRef[DeltaStats]] = []

    if stat_results_s3_bucket:
        cache_lookups = [
            read_cached_delta_stats.remote(delta, columns, stat_results_s3_bucket)
            for delta in deltas
        ]
    else:
        compute_tasks = [
            get_delta_stats.remote(delta.locator, columns, deltacat_storage)
            for delta in deltas
        ]

    return _process_stats(
        cache_lookups,
        compute_tasks,
        stat_results_s3_bucket,
        deltacat_storage,
    )
144
-
145
-
146
def _process_stats(
    delta_cache_lookup_pending: List[ObjectRef[DeltaStatsCacheResult]],
    delta_stats_compute_pending: List[ObjectRef[DeltaStats]],
    stat_results_s3_bucket: Optional[str] = None,
    deltacat_storage=unimplemented_deltacat_storage,
) -> List[DeltaStats]:
    """Resolves pending stats, going through the cache path only when a bucket is set."""
    if not stat_results_s3_bucket:
        # No cache configured: just wait on the compute tasks.
        return _resolve_pending_stats(delta_stats_compute_pending)

    return _resolve_pending_stats_and_cache(
        delta_cache_lookup_pending, stat_results_s3_bucket, deltacat_storage
    )
162
-
163
-
164
def _resolve_pending_stats_and_cache(
    delta_cache_lookup_pending: List[ObjectRef[DeltaStatsCacheResult]],
    stat_results_s3_bucket: str,
    deltacat_storage,
) -> List[DeltaStats]:
    """Resolves cache lookups, computes the misses, and caches the newly computed stats.

    Returns the cached stats followed by the freshly computed stats.
    """
    cached_stats, compute_tasks = _get_cached_and_pending_stats(
        delta_cache_lookup_pending, deltacat_storage
    )
    computed_stats: List[DeltaStats] = _resolve_pending_stats(compute_tasks)

    # Cache the stats into the file store, one remote write per column.
    cache_writes: List[ObjectRef] = []
    for delta_stats in computed_stats:
        for column_stats in delta_stats.column_stats:
            cache_writes.append(
                cache_delta_column_stats.remote(stat_results_s3_bucket, column_stats)
            )
    # Block until every cache write has finished.
    ray.get(cache_writes)

    return [*cached_stats, *computed_stats]
185
-
186
-
187
def _get_cached_and_pending_stats(
    discover_deltas_pending: List[ObjectRef[DeltaStatsCacheResult]],
    deltacat_storage=unimplemented_deltacat_storage,
) -> Tuple[List[DeltaStats], List[ObjectRef[DeltaStats]]]:
    """
    Drains the cache lookup tasks and splits their results in two: the delta
    stats found in the cache, and newly started Ray tasks that compute the
    stats for the columns reported as cache misses.
    """
    hits_found: List[DeltaStats] = []
    recompute_tasks: List[ObjectRef[DeltaStats]] = []

    remaining = discover_deltas_pending
    while remaining:
        done, remaining = ray.wait(remaining)

        lookups: List[DeltaStatsCacheResult] = ray.get(done)
        for lookup in lookups:
            if lookup.hits:
                hits_found.append(lookup.hits)

            miss = lookup.misses
            if miss:
                # Only the missed columns are recomputed for this delta.
                recompute_tasks.append(
                    get_delta_stats.remote(
                        miss.delta_locator, miss.column_names, deltacat_storage
                    )
                )

    return hits_found, recompute_tasks
215
-
216
-
217
def _resolve_pending_stats(
    delta_stats_pending_list: List[ObjectRef[DeltaStats]],
) -> List[DeltaStats]:
    """Waits on the pending tasks and returns their results in completion order."""
    resolved: List[DeltaStats] = []
    outstanding = delta_stats_pending_list
    while outstanding:
        # ray.wait splits the refs into finished ones and those still running.
        done, outstanding = ray.wait(outstanding)
        resolved.extend(ray.get(done))

    return resolved
File without changes
@@ -1,98 +0,0 @@
1
- # Allow classes to use self-referencing Type hints in Python 3.7.
2
- from __future__ import annotations
3
-
4
- from typing import Any, Dict, List, Optional
5
-
6
- from deltacat.compute.stats.models.manifest_entry_stats import ManifestEntryStats
7
- from deltacat.compute.stats.models.stats_result import StatsResult
8
- from deltacat.compute.stats.types import StatsType
9
-
10
-
11
class DeltaColumnStats(dict):
    """
    Stats container for an individual column of a Delta.
    Provides distinct stats results for each manifest entry of the Delta.

    Backed by the dict keys "column", "manifestStats" and "stats".

    Example:
        Manifest Entry 1
        =======
        foo bar baz
        A   B   C
        D   E   F

        Manifest Entry 2
        =======
        foo bar baz
        G   H   I
        J   K   L

        DeltaColumnStats("foo",
            ManifestEntryStats([
                StatsResult([A, D]),  # Manifest Entry 1
                StatsResult([G, J]),  # Manifest Entry 2
            ]))
        DeltaColumnStats("bar",
            ManifestEntryStats([
                StatsResult([B, E]),  # Manifest Entry 1
                StatsResult([H, K]),  # Manifest Entry 2
            ]))
        DeltaColumnStats("baz",
            ManifestEntryStats([
                StatsResult([C, F]),  # Manifest Entry 1
                StatsResult([I, L]),  # Manifest Entry 2
            ]))
    """

    @staticmethod
    def of(column: str, manifest_stats: ManifestEntryStats) -> DeltaColumnStats:
        """
        Creates a container of a column name and the column stats for one or more manifest entries.

        The merged "stats" entry is only computed when `manifest_stats` is truthy.
        """
        dcs = DeltaColumnStats()
        dcs["column"] = column
        dcs["manifestStats"] = manifest_stats

        if manifest_stats:
            # Omit row count for columnar-centric stats
            dcs["stats"] = dcs._merge_manifest_stats()

        return dcs

    @staticmethod
    def build_from_dict(delta_column_stats: Dict[str, Any]) -> DeltaColumnStats:
        """
        Rebuilds a DeltaColumnStats from a plain dict.

        Args:
            delta_column_stats: A mapping holding a "column" name and a
                "manifestStats" dict accepted by `ManifestEntryStats.build_from_dict`.

        Returns:
            A single DeltaColumnStats for that column.
        """
        return DeltaColumnStats.of(
            delta_column_stats["column"],
            ManifestEntryStats.build_from_dict(delta_column_stats["manifestStats"]),
        )

    @property
    def column(self) -> str:
        """Returns the column name."""
        return self.get("column")

    @property
    def manifest_stats(self) -> Optional[ManifestEntryStats]:
        """Returns a container that represents stats at the manifest level.

        A container holds a list of computed stats for each manifest entry.
        A stored value that is not yet a ManifestEntryStats is wrapped and
        written back on first access.
        """
        val: Dict[str, Any] = self.get("manifestStats")
        if val is not None and not isinstance(val, ManifestEntryStats):
            self["manifestStats"] = val = ManifestEntryStats(val)
        return val

    @property
    def stats(self) -> Optional[StatsResult]:
        """Combines the numerical stats for every manifest entry and returns it.

        A stored non-StatsResult value is wrapped and written back. A missing
        value is computed from the manifest stats, when those are present.
        """
        val: Dict[str, Any] = self.get("stats")
        if val is not None and not isinstance(val, StatsResult):
            self["stats"] = val = StatsResult(val)
        elif val is None and self.manifest_stats:
            self["stats"] = val = self._merge_manifest_stats()

        return val

    def _merge_manifest_stats(self) -> StatsResult:
        """Merges the per-manifest-entry stats, restricted to PYARROW_TABLE_BYTES."""
        return StatsResult.merge(
            self.manifest_stats.stats, {StatsType.PYARROW_TABLE_BYTES}
        )
@@ -1,233 +0,0 @@
1
- # Allow classes to use self-referencing Type hints in Python 3.7.
2
- from __future__ import annotations
3
-
4
- from collections import defaultdict
5
- from typing import Any, Dict, List, NamedTuple, Optional, Set
6
-
7
- from deltacat.compute.stats.models.delta_column_stats import DeltaColumnStats
8
- from deltacat.compute.stats.models.manifest_entry_stats import ManifestEntryStats
9
- from deltacat.compute.stats.models.stats_result import StatsResult
10
- from deltacat.compute.stats.types import StatsType
11
- from deltacat.storage import DeltaLocator
12
-
13
-
14
class DeltaStats(dict):
    """
    Stats container for all columns of a delta.

    Provides distinct stats for each delta manifest entry, aggregate stats across all manifest entries,
    and a DeltaColumnStats reference for each column.

    Each DeltaColumnStats has a column name and a ManifestEntryStats object,
    which contains column-level stats for each delta manifest entry.

    Example of visual representation:
        Manifest Entry 1
        =======
        foo bar baz
        A   B   C
        D   E   F

        Manifest Entry 2
        =======
        foo bar baz
        G   H   I
        J   K   L

        DeltaStats([
            DeltaColumnStats("foo",
                ManifestEntryStats([
                    StatsResult([A, D]),  # Manifest Entry 1
                    StatsResult([G, J]),  # Manifest Entry 2
                ]))
            DeltaColumnStats("bar",
                ManifestEntryStats([
                    StatsResult([B, E]),  # Manifest Entry 1
                    StatsResult([H, K]),  # Manifest Entry 2
                ]))
            DeltaColumnStats("baz",
                ManifestEntryStats([
                    StatsResult([C, F]),  # Manifest Entry 1
                    StatsResult([I, L]),  # Manifest Entry 2
                ]))
        ], Stats(AllDeltaColumnStats))
    """

    @staticmethod
    def of(column_stats: List[DeltaColumnStats]) -> DeltaStats:
        """Creates a DeltaStats from per-column stats and eagerly computes the aggregate stats."""
        ds = DeltaStats()
        ds["column_stats"] = column_stats
        ds["stats"] = DeltaStats.get_delta_stats(column_stats)
        return ds

    @staticmethod
    def build_from_dict(delta_stats: dict) -> DeltaStats:
        """Rebuilds a DeltaStats from a plain dict holding a "column_stats" list of dicts."""
        return DeltaStats.of(
            [
                DeltaColumnStats.build_from_dict(dcs)
                for dcs in delta_stats["column_stats"]
            ]
        )

    @property
    def column_stats(self) -> List[DeltaColumnStats]:
        """
        Returns a list of stats associated to each column in this delta.
        """
        return self["column_stats"]

    @property
    def stats(self) -> Optional[StatsResult]:
        """Returns a StatsResult object that represents this delta, aggregated by the column stats of this delta.

        A stored non-StatsResult value is wrapped and written back. A missing
        value is computed from the column stats, when there are any.
        """
        val: Dict[str, Any] = self.get("stats")
        if val is not None and not isinstance(val, StatsResult):
            self["stats"] = val = StatsResult(val)
        elif val is None and self.column_stats:
            self["stats"] = val = DeltaStats.get_delta_stats(self.column_stats)

        return val

    @property
    def columns(self) -> List[str]:
        """Returns a list of column names associated to this delta.

        Returns:
            A list of column names
        """
        return DeltaStats.get_column_names(self.column_stats)

    def manifest_entry_stats(self, manifest_entry_idx: int) -> StatsResult:
        """Calculate the stats of a manifest entry by combining its columnar stats.

        Args:
            manifest_entry_idx: The manifest entry table to calculate stats for

        Returns:
            Stats for the manifest entry.

        Raises:
            ValueError: If `manifest_entry_idx` is out of range for the column stats.
        """
        return StatsResult.merge(
            DeltaStats.get_manifest_entry_column_stats(
                self.column_stats, manifest_entry_idx
            ),
            record_row_count_once=True,
        )

    def manifest_entry_column_stats(self, manifest_entry_idx: int) -> List[StatsResult]:
        """Fetch a list of stats for each column in a manifest entry.

        Args:
            manifest_entry_idx: The manifest entry table to calculate stats for

        Returns:
            A list of columnar stats for the manifest entry

        Raises:
            ValueError: If `manifest_entry_idx` is out of range for the column stats.
        """
        return DeltaStats.get_manifest_entry_column_stats(
            self.column_stats, manifest_entry_idx
        )

    @staticmethod
    def get_manifest_entry_column_stats(
        columns: List[DeltaColumnStats], manifest_entry_idx: int
    ) -> List[StatsResult]:
        """Helper method to provide a list of columnar stats for a specific manifest entry.

        Columns whose manifest stats are None are skipped.

        Returns:
            A list of columnar stats for the manifest entry

        Raises:
            ValueError: If `manifest_entry_idx` is out of range; the original
                IndexError is attached as the cause.
        """
        dataset_columnar_stats_list: List[ManifestEntryStats] = [
            column.manifest_stats
            for column in columns
            if column.manifest_stats is not None
        ]
        try:
            return [
                stats.stats[manifest_entry_idx] for stats in dataset_columnar_stats_list
            ]
        except IndexError as err:
            # An IndexError implies the list is non-empty, so [0] is safe here.
            sci: ManifestEntryStats = dataset_columnar_stats_list[0]
            raise ValueError(
                f"Table index {manifest_entry_idx} is not present in this dataset of {sci.delta_locator} "
                f"with manifest table count of {len(sci.stats)}"
            ) from err

    @staticmethod
    def get_column_names(columns: List[DeltaColumnStats]) -> List[str]:
        """Helper method to get the names of each column from a list of delta column stats

        Args:
            columns: A list of delta column stats

        Returns:
            A list of column names; empty when `columns` is empty or None
        """
        return [column_stats.column for column_stats in columns] if columns else []

    @staticmethod
    def get_delta_stats(
        columns: List[DeltaColumnStats], stat_types: Optional[Set[StatsType]] = None
    ) -> Optional[StatsResult]:
        """Calculate the sum of provided column stats and return it

        Args:
            columns: A list of delta column stats
            stat_types: Passed through to `StatsResult.merge`

        Returns:
            Stats for the calculated sum

        Raises:
            AssertionError: If `columns` is empty or any column lacks manifest
                stats (only when assertions are enabled).
        """
        assert columns and len(columns) > 0, (
            f"Expected columns `{columns}` of type `{type(columns)}` "
            f"to be a non-empty list of DeltaColumnStats"
        )

        assert all(
            col.manifest_stats for col in columns
        ), f"Expected stats completion info to be present in each item of {columns} "

        # The first column's manifest entry count sizes every per-column slot list.
        manifest_entry_count = len(columns[0].manifest_stats.stats)
        column_stats_map: Dict[str, List[Optional[StatsResult]]] = defaultdict(
            lambda: [None] * manifest_entry_count
        )

        for column_stats in columns:
            for file_idx, entry_stats in enumerate(column_stats.manifest_stats.stats):
                column_stats_map[column_stats.column][file_idx] = entry_stats

        return DeltaStats._merge_stats_from_columns_to_dataset(
            DeltaStats.get_column_names(columns),
            column_stats_map,
            manifest_entry_count,
            stat_types,
        )

    @staticmethod
    def _merge_stats_from_columns_to_dataset(
        column_names: List[str],
        column_stats: Dict[str, List[Optional[StatsResult]]],
        manifest_entries_size: int,
        stat_types: Optional[Set[StatsType]] = None,
    ) -> StatsResult:
        """Merges column stats per manifest entry, then merges those per-entry summaries."""
        manifest_entry_stats_summary_list: List[StatsResult] = []
        for manifest_entry_idx in range(manifest_entries_size):
            # Gather this manifest entry's stats from every column.
            curr_manifest_entry_column_stats_list: List[StatsResult] = [
                column_stats[column_name][manifest_entry_idx]
                for column_name in column_names
            ]
            manifest_entry_stats_summary_list.append(
                StatsResult.merge(
                    curr_manifest_entry_column_stats_list,
                    stat_types,
                    record_row_count_once=True,
                )
            )
        return StatsResult.merge(manifest_entry_stats_summary_list, stat_types)
223
-
224
-
225
class DeltaStatsCacheMiss(NamedTuple):
    """Describes the cache-miss part of a DeltaStatsCacheResult.

    `column_names` lists the dataset columns that were missing from the file system (ex: S3).
    `delta_locator` points at the delta those columns belong to, so the missing
    stats can be calculated later.
    """

    column_names: List[str]
    delta_locator: DeltaLocator
@@ -1,49 +0,0 @@
1
- # Allow classes to use self-referencing Type hints in Python 3.7.
2
- from __future__ import annotations
3
-
4
- from typing import Optional
5
-
6
- from deltacat.compute.stats.models.delta_stats import DeltaStats, DeltaStatsCacheMiss
7
-
8
-
9
class DeltaStatsCacheResult(dict):
    """Outcome of a cache query, split into what was found and what was not.

    Stats are fetched and cached at the column level, and each column may represent one
    or more manifest entries. Backed by the dict keys "hits" and "misses".
    """

    @staticmethod
    def of(
        hits: Optional[DeltaStats], misses: Optional[DeltaStatsCacheMiss]
    ) -> DeltaStatsCacheResult:
        """Creates a cache result from the found stats and the miss metadata."""
        return DeltaStatsCacheResult(hits=hits, misses=misses)

    @property
    def hits(self) -> Optional[DeltaStats]:
        """Stats that were found in the cache.

        `hits` is a DeltaStats object holding dataset-wide statistics across
        many of its tables (or manifest entries), composed of one or more
        column-wide DeltaColumnStats.

        Returns:
            A delta wide stats container
        """
        return self["hits"]

    @property
    def misses(self) -> Optional[DeltaStatsCacheMiss]:
        """Stats that were missing from the cache.

        `misses` is a DeltaStatsCacheMiss object holding the column names that
        were not found in the file system (ex: S3), together with a
        `delta_locator` referencing the delta those columns belong to.

        Returns:
            A tuple with metadata regarding the cache miss
        """
        return self["misses"]
@@ -1,72 +0,0 @@
1
- # Allow classes to use self-referencing Type hints in Python 3.7.
2
- from __future__ import annotations
3
-
4
- from typing import Any, Dict, List
5
-
6
- import pyarrow as pa
7
-
8
- from deltacat.compute.stats.models.stats_result import StatsResult
9
- from deltacat.storage import DeltaLocator
10
-
11
-
12
class ManifestEntryStats(dict):
    """Computed statistics for one or more manifest entries (tables), plus their delta locator.

    To be stored/retrieved from a file system (ex: S3). Backed by the dict
    keys "deltaLocator", "stats" and "pyarrowVersion".
    """

    @staticmethod
    def of(
        manifest_entries_stats: List[StatsResult], delta_locator: DeltaLocator
    ) -> ManifestEntryStats:
        """
        Creates a stats container that represents a particular manifest.

        `manifest_entries_stats` holds one distinct stats object per manifest
        entry file tied to this manifest. `delta_locator` references the delta
        where the manifest entries reside. The running PyArrow version is
        recorded alongside them.
        """
        return ManifestEntryStats(
            deltaLocator=delta_locator,
            stats=manifest_entries_stats,
            pyarrowVersion=pa.__version__,
        )

    @staticmethod
    def build_from_dict(manifest_entries_stats: dict) -> ManifestEntryStats:
        """Rebuilds a ManifestEntryStats from a plain dict with "stats" and "deltaLocator" keys."""
        per_entry_stats = [
            StatsResult.of(raw["rowCount"], raw["pyarrowTableBytes"])
            for raw in manifest_entries_stats["stats"]
        ]
        return ManifestEntryStats.of(
            per_entry_stats, manifest_entries_stats["deltaLocator"]
        )

    @property
    def delta_locator(self) -> DeltaLocator:
        """Reference to the delta that holds the manifest entries

        A stored value that is not yet a DeltaLocator is wrapped and written
        back on first access.

        Returns:
            A delta locator object
        """
        locator: Dict[str, Any] = self.get("deltaLocator")
        if locator is None or isinstance(locator, DeltaLocator):
            return locator
        locator = DeltaLocator(locator)
        self["deltaLocator"] = locator
        return locator

    @property
    def stats(self) -> List[StatsResult]:
        """
        Returns a list of distinct stats for each manifest entry file.
        """
        raw_stats = self["stats"]
        if not raw_stats:
            return []
        return [StatsResult(item) for item in raw_stats]

    @property
    def pyarrow_version(self) -> str:
        """
        Read-only property which returns the PyArrow version number as it was written into a file system.
        """
        return self.get("pyarrowVersion")