deltacat 1.1.17__py3-none-any.whl → 1.1.19__py3-none-any.whl
This diff compares the contents of two publicly released versions of the deltacat package, as published to their public registry. It is provided for informational purposes only and reflects the changes between the two package versions.
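For readers who want to reproduce a listing like the one below locally, here is a minimal standard-library sketch. The wheel file names are assumptions about what `pip download deltacat==<version> --no-deps` places in the working directory:

```python
# A sketch for reproducing a wheel-to-wheel diff locally; assumes both wheels
# were already downloaded, e.g. via `pip download deltacat==1.1.17 --no-deps`.
import difflib
import zipfile

OLD_WHL = "deltacat-1.1.17-py3-none-any.whl"  # assumed local file names
NEW_WHL = "deltacat-1.1.19-py3-none-any.whl"

with zipfile.ZipFile(OLD_WHL) as old, zipfile.ZipFile(NEW_WHL) as new:
    old_names, new_names = set(old.namelist()), set(new.namelist())
    for name in sorted(old_names | new_names):
        # Treat a file missing from one wheel as empty so it diffs as all-added/all-removed.
        a = old.read(name).decode("utf-8", "replace").splitlines() if name in old_names else []
        b = new.read(name).decode("utf-8", "replace").splitlines() if name in new_names else []
        for line in difflib.unified_diff(a, b, f"1.1.17/{name}", f"1.1.19/{name}", lineterm=""):
            print(line)
```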
- deltacat/__init__.py +1 -1
- deltacat/aws/constants.py +0 -1
- deltacat/compute/compactor/model/compact_partition_params.py +76 -0
- deltacat/compute/compactor/model/compaction_session_audit_info.py +26 -0
- deltacat/compute/compactor/model/delta_annotated.py +16 -9
- deltacat/compute/compactor_v2/constants.py +3 -0
- deltacat/compute/compactor_v2/private/compaction_utils.py +9 -5
- deltacat/compute/compactor_v2/utils/content_type_params.py +185 -34
- deltacat/compute/compactor_v2/utils/io.py +28 -14
- deltacat/compute/compactor_v2/utils/primary_key_index.py +9 -4
- deltacat/compute/compactor_v2/utils/task_options.py +128 -183
- deltacat/compute/resource_estimation/__init__.py +27 -0
- deltacat/compute/resource_estimation/delta.py +271 -0
- deltacat/compute/resource_estimation/manifest.py +394 -0
- deltacat/compute/resource_estimation/model.py +165 -0
- deltacat/compute/resource_estimation/parquet.py +108 -0
- deltacat/constants.py +5 -0
- deltacat/exceptions.py +2 -4
- deltacat/logs.py +8 -0
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +77 -0
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +308 -0
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +159 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +157 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +3 -3
- deltacat/tests/compute/resource_estimation/test_delta.py +605 -0
- deltacat/tests/compute/resource_estimation/test_manifest.py +921 -0
- deltacat/tests/compute/test_compact_partition_rebase.py +13 -4
- deltacat/tests/compute/test_util_common.py +2 -0
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -5
- deltacat/tests/test_logs.py +34 -0
- deltacat/tests/test_utils/pyarrow.py +15 -5
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/METADATA +2 -2
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/RECORD +38 -54
- deltacat/compute/metastats/meta_stats.py +0 -479
- deltacat/compute/metastats/model/__init__.py +0 -0
- deltacat/compute/metastats/model/partition_stats_dict.py +0 -34
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +0 -68
- deltacat/compute/metastats/stats.py +0 -182
- deltacat/compute/metastats/utils/__init__.py +0 -0
- deltacat/compute/metastats/utils/constants.py +0 -16
- deltacat/compute/metastats/utils/io.py +0 -223
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +0 -18
- deltacat/compute/metastats/utils/ray_utils.py +0 -129
- deltacat/compute/stats/basic.py +0 -226
- deltacat/compute/stats/models/__init__.py +0 -0
- deltacat/compute/stats/models/delta_column_stats.py +0 -98
- deltacat/compute/stats/models/delta_stats.py +0 -233
- deltacat/compute/stats/models/delta_stats_cache_result.py +0 -49
- deltacat/compute/stats/models/manifest_entry_stats.py +0 -72
- deltacat/compute/stats/models/stats_result.py +0 -104
- deltacat/compute/stats/utils/__init__.py +0 -0
- deltacat/compute/stats/utils/intervals.py +0 -94
- deltacat/compute/stats/utils/io.py +0 -230
- deltacat/compute/stats/utils/manifest_stats_file.py +0 -100
- deltacat/tests/stats/__init__.py +0 -0
- deltacat/tests/stats/test_intervals.py +0 -49
- /deltacat/{compute/metastats → tests/compute/resource_estimation}/__init__.py +0 -0
- /deltacat/{compute/metastats/config → tests/compute/resource_estimation/data}/__init__.py +0 -0
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/LICENSE +0 -0
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/WHEEL +0 -0
- {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/top_level.txt +0 -0
deltacat/compute/stats/models/stats_result.py
DELETED
@@ -1,104 +0,0 @@
-# Allow classes to use self-referencing Type hints in Python 3.7.
-from __future__ import annotations
-
-from collections import defaultdict
-from typing import Any, Dict, List, Optional, Set
-
-from deltacat.compute.stats.types import ALL_STATS_TYPES, StatsType
-
-
-class StatsResult(dict):
-    """A generic container that holds stats for a single manifest entry file."""
-
-    @staticmethod
-    def of(
-        row_count: Optional[int] = 0, pyarrow_table_bytes: Optional[int] = 0
-    ) -> StatsResult:
-        """Static factory for building a stats result object
-
-        Args:
-            row_count: The total number of rows of a manifest entry
-            pyarrow_table_bytes: The total number of bytes when loaded into memory as a PyArrow Table
-
-        Returns:
-            A stats result object
-        """
-        sr = StatsResult()
-        sr[StatsType.ROW_COUNT.value] = row_count
-        sr[StatsType.PYARROW_TABLE_BYTES.value] = pyarrow_table_bytes
-        return sr
-
-    @property
-    def row_count(self) -> int:
-        """Represents the row count of a manifest entry file.
-
-        Returns:
-            The total number of rows of a manifest entry
-        """
-        return self[StatsType.ROW_COUNT.value]
-
-    @property
-    def pyarrow_table_bytes(self) -> int:
-        """Represents the size of a manifest entry file (in bytes) as it was loaded into a PyArrow table.
-
-        Returns:
-            The total number of bytes when loaded into memory as a PyArrow Table
-        """
-        return self[StatsType.PYARROW_TABLE_BYTES.value]
-
-    @staticmethod
-    def from_stats_types(stats_types: Dict[StatsType, Any]) -> StatsResult:
-        """A helper method to filter a dictionary by supported stats and returns a stats result object.
-
-        Args:
-            stats_types: Stats that should be included for constructing a stats result
-
-        Returns:
-            A stats result object
-        """
-        return StatsResult(
-            {
-                k: v
-                for k, v in stats_types.items()
-                if k in [StatsType.ROW_COUNT, StatsType.PYARROW_TABLE_BYTES]
-            }
-        )
-
-    @staticmethod
-    def merge(
-        stats_list: List[StatsResult],
-        stat_types: Optional[Set[StatsType]] = None,
-        record_row_count_once: bool = False,
-    ) -> StatsResult:
-        """Helper method to merge any list of StatsResult objects into one.
-
-        StatsResult objects are merged by adding up their numerical stats.
-        TODO (ricmiyam): Handle non-numerical stats when they are added
-
-        Args:
-            stat_types: If provided, the calculation will only include the requested stats.
-            record_row_count_once: If optionally set to `True`, then row counts are only added
-                from the first stats entry. One use case for this is merging table-centric stats
-                by columns, since the row count is expected to be the same across different columns.
-
-        Returns:
-            A stats result object
-        """
-        assert isinstance(stats_list, list) and len(stats_list) > 0, (
-            f"Expected stats list: {stats_list} of type {type(stats_list)} to be a "
-            f"non-empty list of StatsResult objects."
-        )
-
-        # Fallback to all stat types if not provided
-        stats_to_collect: Set = stat_types or ALL_STATS_TYPES
-
-        merged_stats: Dict[StatsType, int] = defaultdict(int)
-        for stats_result in stats_list:
-            for stat_type in stats_to_collect:
-                if stats_result:
-                    merged_stats[stat_type.value] += stats_result[stat_type.value]
-
-        if record_row_count_once and StatsType.ROW_COUNT in stats_to_collect:
-            merged_stats[StatsType.ROW_COUNT.value] = stats_list[0].row_count
-
-        return StatsResult.from_stats_types(merged_stats)
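As context for the removal, here is a minimal sketch of how the deleted `StatsResult` API was documented to behave. It runs only against deltacat<=1.1.17, and the entry values are made up:

```python
# Requires deltacat<=1.1.17; this module is removed as of 1.1.19.
from deltacat.compute.stats.models.stats_result import StatsResult

# One StatsResult per manifest entry: (row count, in-memory PyArrow bytes).
entry_a = StatsResult.of(row_count=100, pyarrow_table_bytes=4_096)
entry_b = StatsResult.of(row_count=250, pyarrow_table_bytes=10_240)

# Per the docstring above, merge() sums the numerical stats across entries.
combined = StatsResult.merge([entry_a, entry_b])
assert combined.row_count == 350
assert combined.pyarrow_table_bytes == 14_336
```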
deltacat/compute/stats/utils/__init__.py
DELETED
File without changes
deltacat/compute/stats/utils/intervals.py
DELETED
@@ -1,94 +0,0 @@
-from typing import Iterable, List, Optional, Set, Tuple, Union
-
-DeltaPosition = Optional[int]
-NumericDeltaPosition = Union[int, float]  # float is added here to support math.inf
-DeltaRange = Tuple[DeltaPosition, DeltaPosition]
-
-
-def merge_intervals(intervals: Set[DeltaRange]) -> Set[DeltaRange]:
-    """Merges a set of N input intervals into a minimal number of output intervals.
-
-    All input intervals will be merged into a minimal number of output intervals in O(N log N) time.
-
-    Example:
-        >>> merge_intervals((3, 9), (8, 12), (15, 19))
-        ((3, 12), (15, 19))
-
-    Example:
-        >>> merge_intervals((3, 9), (None, 15), (13, 30))
-        ((None, 30))
-
-    Args:
-        intervals: A list of intervals with an int type representing finite, closed bounded values and
-            a None type representing infinity.
-
-    Returns:
-        A minimal number of output intervals
-    """
-    merged: Set[DeltaRange] = set()
-    intervals_list: List[DeltaRange] = list(intervals)
-
-    if len(intervals_list) == 0:
-        return merged
-
-    _to_numeric_values(intervals_list)
-    intervals_list.sort()  # sort by starting range numbers
-
-    merge_start, merge_end = None, None
-    for interval in intervals_list:
-        start, end = interval
-        if start > end:
-            raise ValueError(
-                f"Invalid stream position range interval: ({start}, {end})"
-            )
-
-        if merge_start is None and merge_end is None:
-            merge_start, merge_end = start, end
-            continue
-
-        if merge_end < start:
-            # add current merge interval if no overlap, begin new interval
-            _add_merged_interval(merged, merge_start, merge_end)
-            merge_start, merge_end = start, end
-        elif merge_end < end:
-            merge_end = end  # expand current merge interval if there is an overlap
-
-    # add final merge interval
-    _add_merged_interval(merged, merge_start, merge_end)
-
-    return merged
-
-
-def _add_merged_interval(
-    result_set: set, start: NumericDeltaPosition, end: NumericDeltaPosition
-):
-    start_pos: DeltaPosition = start if isinstance(start, int) else None
-    end_pos: DeltaPosition = end if isinstance(end, int) else None
-    result_set.add((start_pos, end_pos))
-
-
-def _to_numeric_values(intervals_list: List[DeltaRange]):
-    for i, interval in enumerate(intervals_list):
-        start, end = _get_validated_interval(interval)
-        if start is None:
-            start = float("-inf")
-        if end is None:
-            end = float("inf")
-
-        intervals_list[i] = (start, end)
-
-
-def _get_validated_interval(interval: DeltaRange) -> DeltaRange:
-    if not isinstance(interval, Iterable) or len(interval) != 2:
-        raise ValueError(f"Interval {interval} must be a tuple of size 2")
-
-    start, end = interval
-    if not (isinstance(start, int) or start is None) or not (
-        isinstance(end, int) or end is None
-    ):
-        raise ValueError(
-            f"Invalid stream position value types: "
-            f"({start}, {end}) - ({type(start), type(end)})"
-        )
-
-    return start, end
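The removed `merge_intervals` helper takes a single set of `(start, end)` tuples (note that the doctest examples above pass the tuples positionally, which does not match the signature). A minimal usage sketch against deltacat<=1.1.17, with behavior checked against the implementation shown above:

```python
# Requires deltacat<=1.1.17; this module is removed as of 1.1.19.
from deltacat.compute.stats.utils.intervals import merge_intervals

# Overlapping closed intervals are collapsed; None marks an unbounded endpoint.
assert merge_intervals({(3, 9), (8, 12), (15, 19)}) == {(3, 12), (15, 19)}
assert merge_intervals({(3, 9), (None, 15), (13, 30)}) == {(None, 30)}
```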
deltacat/compute/stats/utils/io.py
DELETED
@@ -1,230 +0,0 @@
-import logging
-from collections import defaultdict
-from typing import Any, Dict, List, Optional
-
-import pyarrow
-import ray
-
-from deltacat import LocalTable, TableType, logs
-from deltacat.compute.stats.models.delta_column_stats import DeltaColumnStats
-from deltacat.compute.stats.models.delta_stats import DeltaStats, DeltaStatsCacheMiss
-from deltacat.compute.stats.models.delta_stats_cache_result import DeltaStatsCacheResult
-from deltacat.compute.stats.models.manifest_entry_stats import ManifestEntryStats
-from deltacat.compute.stats.models.stats_result import StatsResult
-from deltacat.compute.stats.utils.intervals import DeltaRange
-from deltacat.compute.stats.utils.manifest_stats_file import (
-    read_manifest_stats_by_columns,
-    write_manifest_stats_file,
-)
-from deltacat.storage import Delta, DeltaLocator, PartitionLocator
-from deltacat.storage import interface as unimplemented_deltacat_storage
-
-logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
-
-
-@ray.remote
-def read_cached_delta_stats(
-    delta: Delta, columns_to_fetch: List[str], stat_results_s3_bucket: str
-):
-    """Read delta stats that are cached in S3
-
-    This Ray distributed task reads delta stats from a file system (i.e. S3) based on specified columns.
-    Stats are extracted from columns that are found (cache hit), while columns that are missing
-    from the file system will record their column names and the delta locator as a cache miss.
-
-    Args:
-        delta: The delta object to look up
-        columns_to_fetch: Columns to look up for this delta
-        stat_results_s3_bucket: The S3 bucket name
-    """
-
-    delta_locator = DeltaLocator.of(delta.partition_locator, delta.stream_position)
-    column_stats_completion_info: List[
-        DeltaColumnStats
-    ] = read_manifest_stats_by_columns(
-        stat_results_s3_bucket, columns_to_fetch, delta_locator
-    )
-
-    found_columns_stats: List[DeltaColumnStats] = []
-    missed_columns: List[str] = []
-    for column_stats in column_stats_completion_info:
-        if column_stats.manifest_stats:
-            found_columns_stats.append(column_stats)
-        else:
-            missed_columns.append(column_stats.column)
-
-    found_stats: Optional[DeltaStats] = (
-        DeltaStats.of(found_columns_stats) if found_columns_stats else None
-    )
-    missed_stats: Optional[DeltaStatsCacheMiss] = (
-        DeltaStatsCacheMiss(missed_columns, delta.locator) if missed_columns else None
-    )
-
-    return DeltaStatsCacheResult.of(found_stats, missed_stats)
-
-
-@ray.remote
-def cache_delta_column_stats(
-    stat_results_s3_bucket: str, dataset_column: DeltaColumnStats
-) -> None:
-    """Ray distributed task to cache the delta column stats into a file system (i.e. S3).
-
-    Args:
-        stat_results_s3_bucket: The S3 bucket name
-        dataset_column: Column-oriented stats for a given delta
-    """
-    write_manifest_stats_file(
-        stat_results_s3_bucket, dataset_column.column, dataset_column.manifest_stats
-    )
-
-
-@ray.remote
-def get_delta_stats(
-    delta_locator: DeltaLocator,
-    columns: Optional[List[str]] = None,
-    deltacat_storage=unimplemented_deltacat_storage,
-    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
-) -> DeltaStats:
-    """Ray distributed task to compute and collect stats for a requested delta.
-    If no columns are requested, stats will be computed for all columns.
-    Args:
-        delta_locator: A reference to the delta
-        columns: Column names to specify for this delta. If not provided, all columns are considered.
-        deltacat_storage: Client implementation of the DeltaCAT storage interface
-    Returns:
-        A delta wide stats container
-    """
-    if deltacat_storage_kwargs is None:
-        deltacat_storage_kwargs = {}
-    manifest = deltacat_storage.get_delta_manifest(
-        delta_locator, **deltacat_storage_kwargs
-    )
-    delta = Delta.of(delta_locator, None, None, None, manifest)
-    return _collect_stats_by_columns(
-        delta, columns, deltacat_storage, deltacat_storage_kwargs
-    )
-
-
-@ray.remote
-def get_deltas_from_range(
-    source_partition_locator: PartitionLocator,
-    start_position_inclusive: DeltaRange,
-    end_position_inclusive: DeltaRange,
-    deltacat_storage=unimplemented_deltacat_storage,
-    **kwargs,
-) -> List[Delta]:
-    """Looks up deltas in the specified partition using Ray, given both starting and ending delta stream positions.
-
-    Args:
-        source_partition_locator: Reference to the partition locator tied to the given delta stream positions
-        start_position_inclusive: Starting stream position of a range interval.
-            Can be an int type to represent a closed bounded range, or a None type to represent unbounded infinity.
-        end_position_inclusive: Ending stream position of a range interval.
-            Can be an int type to represent a closed bounded range, or a None type to represent unbounded infinity.
-        deltacat_storage: Client implementation of the DeltaCAT storage interface
-
-    Returns:
-        a list of delta objects
-    """
-
-    namespace, partition_values = (
-        source_partition_locator.namespace,
-        source_partition_locator.partition_values,
-    )
-    table_name, table_version = (
-        source_partition_locator.table_name,
-        source_partition_locator.table_version,
-    )
-    deltas_list_result = deltacat_storage.list_deltas(
-        namespace,
-        table_name,
-        partition_values,
-        table_version,
-        start_position_inclusive,
-        end_position_inclusive,
-        ascending_order=True,
-        include_manifest=False,
-        **kwargs,
-    )
-    return deltas_list_result.all_items()
-
-
-def _collect_stats_by_columns(
-    delta: Delta,
-    columns_to_compute: Optional[List[str]] = None,
-    deltacat_storage=unimplemented_deltacat_storage,
-    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
-) -> DeltaStats:
-    """Materializes one manifest entry at a time to save memory usage and calculate stats from each of its columns.
-    Args:
-        delta: A delta object to calculate stats for
-        columns_to_compute: Columns to calculate stats for. If not provided, all columns are considered.
-        deltacat_storage: Client implementation of the DeltaCAT storage interface
-    Returns:
-        A delta wide stats container
-    """
-    if deltacat_storage_kwargs is None:
-        deltacat_storage_kwargs = {}
-    assert (
-        delta.manifest is not None
-    ), f"Manifest should not be missing from delta for stats calculation: {delta}"
-
-    # Mapping of column_name -> [stats_file_idx_1, stats_file_idx_2, ... stats_file_idx_n]
-    column_stats_map: Dict[str, List[Optional[StatsResult]]] = defaultdict(
-        lambda: [None] * len(delta.manifest.entries)
-    )
-
-    total_tables_size = 0
-    for file_idx, manifest in enumerate(delta.manifest.entries):
-        entry_pyarrow_table: LocalTable = (
-            deltacat_storage.download_delta_manifest_entry(
-                delta,
-                file_idx,
-                TableType.PYARROW,
-                columns_to_compute,
-                **deltacat_storage_kwargs,
-            )
-        )
-        assert isinstance(entry_pyarrow_table, pyarrow.Table), (
-            f"Stats collection is only supported for PyArrow tables, but received a table of "
-            f"type '{type(entry_pyarrow_table)}' for manifest entry {file_idx} of delta: {delta.locator}."
-        )
-        total_tables_size += entry_pyarrow_table.nbytes
-        if not columns_to_compute:
-            columns_to_compute = entry_pyarrow_table.column_names
-
-        for column_idx, pyarrow_column in enumerate(entry_pyarrow_table.columns):
-            column_name = columns_to_compute[column_idx]
-            column_stats_map[column_name][file_idx] = StatsResult.of(
-                len(pyarrow_column), pyarrow_column.nbytes
-            )
-
-    # Add column-wide stats for a list of tables, these will be used for caching and retrieving later
-    delta_ds_column_stats: List[DeltaColumnStats] = _to_dataset_column_stats(
-        delta.locator, columns_to_compute, column_stats_map
-    )
-
-    dataset_stats: DeltaStats = DeltaStats.of(delta_ds_column_stats)
-
-    # Quick validation for calculations
-    assert dataset_stats.stats.pyarrow_table_bytes == total_tables_size, (
-        f"Expected the size of all PyArrow tables ({total_tables_size} bytes) "
-        f"to match the sum of each of its columns ({dataset_stats.stats.pyarrow_table_bytes} bytes)"
-    )
-
-    return dataset_stats
-
-
-def _to_dataset_column_stats(
-    delta_locator: DeltaLocator,
-    column_names: List[str],
-    column_manifest_map: Dict[str, List[Optional[StatsResult]]],
-) -> List[DeltaColumnStats]:
-    dataset_stats: List[DeltaColumnStats] = []
-    for column_name in column_names:
-        column_manifest_stats = ManifestEntryStats.of(
-            column_manifest_map[column_name], delta_locator
-        )
-        dataset_column_stats = DeltaColumnStats.of(column_name, column_manifest_stats)
-        dataset_stats.append(dataset_column_stats)
-    return dataset_stats
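The entry points above are Ray tasks, so callers invoked them with `.remote(...)`. A hypothetical invocation sketch against deltacat<=1.1.17; `my_delta_locator` and `my_storage` are placeholders for real objects, not deltacat-provided names:

```python
# Requires deltacat<=1.1.17; this module is removed as of 1.1.19.
import ray
from deltacat.compute.stats.utils.io import get_delta_stats

ray.init()

my_delta_locator = ...  # placeholder: a DeltaLocator for the target delta
my_storage = ...        # placeholder: a DeltaCAT storage interface implementation

stats_ref = get_delta_stats.remote(
    delta_locator=my_delta_locator,
    columns=["order_id", "amount"],  # omit to compute stats for all columns
    deltacat_storage=my_storage,
)
delta_stats = ray.get(stats_ref)  # a DeltaStats container, one entry per column
```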
deltacat/compute/stats/utils/manifest_stats_file.py
DELETED
@@ -1,100 +0,0 @@
-import json
-import logging
-from typing import List
-
-from deltacat import logs
-from deltacat.aws import s3u as s3_utils
-from deltacat.compute.stats.models.delta_column_stats import DeltaColumnStats
-from deltacat.compute.stats.models.manifest_entry_stats import ManifestEntryStats
-from deltacat.storage import DeltaLocator
-from deltacat.utils.common import sha1_hexdigest
-
-logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
-
-
-def get_manifest_stats_s3_url(
-    bucket: str, column_name: str, delta_locator: DeltaLocator
-) -> str:
-    """Returns the S3 URL path to the column-oriented delta stats
-
-    Args:
-        bucket: The S3 bucket
-        column_name: The name of the column to look up stats for
-        delta_locator: The reference to the delta corresponding to the manifest entries
-
-    Returns:
-        A S3 URL path
-    """
-    stats_column_id = f"{delta_locator.canonical_string()}|{column_name}"
-    stats_column_hexdigest = sha1_hexdigest(stats_column_id.encode("utf-8"))
-    base_path = s3_utils.parse_s3_url(bucket).url
-    return f"{base_path}/{stats_column_hexdigest}.json"
-
-
-def read_manifest_stats_by_columns(
-    bucket: str, column_names: List[str], delta_locator: DeltaLocator
-) -> List[DeltaColumnStats]:
-    """Fetch a list of delta column stats by reading each column-oriented delta stats file from S3
-
-    Args:
-        bucket: The S3 bucket
-        column_names: A list of column names to look up stats for
-        delta_locator: The reference to the delta corresponding to the manifest entries
-
-    Returns:
-        A list of delta column stats
-    """
-    return [
-        DeltaColumnStats.of(
-            column, read_manifest_stats_file(bucket, column, delta_locator)
-        )
-        for column in column_names
-    ]
-
-
-def read_manifest_stats_file(
-    bucket: str, column_name: str, delta_locator: DeltaLocator
-) -> ManifestEntryStats:
-    """Read a manifest entry stats from S3
-
-    Args:
-        bucket: The S3 bucket
-        column_name: The name of the column to look up stats for
-        delta_locator: The reference to the delta corresponding to the manifest entries
-
-    Returns:
-        A container that holds a list of manifest entry stats for the given column name
-    """
-
-    stats_completion_file_url = get_manifest_stats_s3_url(
-        bucket, column_name, delta_locator
-    )
-    logger.info(f"reading stats completion file from: {stats_completion_file_url}")
-    stats_completion_info_file = None
-    result = s3_utils.download(stats_completion_file_url, fail_if_not_found=False)
-    if result:
-        json_str = result["Body"].read().decode("utf-8")
-        stats_completion_info_file = ManifestEntryStats(json.loads(json_str))
-        logger.info(f"read stats completion info: {stats_completion_info_file}")
-    return stats_completion_info_file
-
-
-def write_manifest_stats_file(
-    bucket: str, column_name: str, manifest_entry_stats: ManifestEntryStats
-) -> None:
-    """Write a manifest entry stats into S3
-
-    Args:
-        bucket: The S3 bucket
-        column_name: The name of the column which represents this manifest entry stats
-        manifest_entry_stats: The manifest entry stats to serialize and store into S3
-    """
-    logger.info(f"writing stats completion file contents: {manifest_entry_stats}")
-    stats_completion_file_s3_url = get_manifest_stats_s3_url(
-        bucket,
-        column_name,
-        manifest_entry_stats.delta_locator,
-    )
-    logger.info(f"writing stats completion file to: {stats_completion_file_s3_url}")
-    s3_utils.upload(stats_completion_file_s3_url, str(json.dumps(manifest_entry_stats)))
-    logger.info(f"stats completion file written to: {stats_completion_file_s3_url}")
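The removed cache layout keyed each stats file by the SHA-1 of `"<delta canonical string>|<column name>"`. A stand-alone sketch of that key scheme, assuming deltacat's `sha1_hexdigest` is a thin wrapper over `hashlib.sha1(...).hexdigest()`; the locator string below is hypothetical:

```python
import hashlib

def manifest_stats_key(delta_canonical_string: str, column_name: str) -> str:
    # Mirrors get_manifest_stats_s3_url above: one JSON object per (delta, column).
    stats_column_id = f"{delta_canonical_string}|{column_name}"
    digest = hashlib.sha1(stats_column_id.encode("utf-8")).hexdigest()
    return f"{digest}.json"

# Hypothetical delta locator canonical string:
print(manifest_stats_key("my_namespace/my_table/1/my_partition/42", "order_id"))
```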
deltacat/tests/stats/__init__.py
DELETED
File without changes
deltacat/tests/stats/test_intervals.py
DELETED
@@ -1,49 +0,0 @@
-import unittest
-from typing import Tuple
-
-from deltacat.compute.stats.utils.intervals import DeltaRange, merge_intervals
-
-
-class TestMergeIntervals(unittest.TestCase):
-    def test_unbounded_start_range(self):
-        intervals = sorted(merge_intervals({(3, 9), (None, 15), (13, 30)}))
-        interval: Tuple[DeltaRange, DeltaRange] = intervals[0]
-        self.assertEqual(interval[0], None)
-        self.assertEqual(interval[1], 30)
-
-    def test_unbounded_end_range(self):
-        intervals = sorted(merge_intervals({(3, 9), (2, None), (13, 30)}))
-        interval: Tuple[DeltaRange, DeltaRange] = intervals[0]
-        self.assertEqual(interval[0], 2)
-        self.assertEqual(interval[1], None)
-
-    def test_unbounded_start_end_range(self):
-        intervals = sorted(merge_intervals({(None, None)}))
-        interval: Tuple[DeltaRange, DeltaRange] = intervals[0]
-        self.assertEqual(interval[0], None)
-        self.assertEqual(interval[1], None)
-
-    def test_no_overlap_range(self):
-        intervals = sorted(merge_intervals({(3, 9), (11, 14), (19, 30)}))
-        interval1: Tuple[DeltaRange, DeltaRange] = intervals[0]
-        interval2: Tuple[DeltaRange, DeltaRange] = intervals[1]
-        interval3: Tuple[DeltaRange, DeltaRange] = intervals[2]
-        self.assertEqual(interval1, (3, 9))
-        self.assertEqual(interval2, (11, 14))
-        self.assertEqual(interval3, (19, 30))
-
-    def test_overlap_range(self):
-        intervals = sorted(merge_intervals({(3, 9), (9, 14), (14, 30)}))
-        interval1: Tuple[DeltaRange, DeltaRange] = intervals[0]
-        self.assertEqual(interval1, (3, 30))
-
-    def test_invalid_range(self):
-        self.assertRaises(ValueError, merge_intervals, {(3, 9), (9, 3)})
-
-    def test_invalid_type(self):
-        self.assertRaises(ValueError, merge_intervals, {(3, 9), (1.2, 3)})
-        self.assertRaises(ValueError, merge_intervals, {(3, 9), ("1", 3)})
-
-
-if __name__ == "__main__":
-    unittest.main()
/deltacat/{compute/metastats → tests/compute/resource_estimation}/__init__.py
File without changes

/deltacat/{compute/metastats/config → tests/compute/resource_estimation/data}/__init__.py
File without changes

{deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/LICENSE
File without changes

{deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/WHEEL
File without changes

{deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/top_level.txt
File without changes