deltacat 1.1.17__py3-none-any.whl → 1.1.19__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.
Files changed (61)
  1. deltacat/__init__.py +1 -1
  2. deltacat/aws/constants.py +0 -1
  3. deltacat/compute/compactor/model/compact_partition_params.py +76 -0
  4. deltacat/compute/compactor/model/compaction_session_audit_info.py +26 -0
  5. deltacat/compute/compactor/model/delta_annotated.py +16 -9
  6. deltacat/compute/compactor_v2/constants.py +3 -0
  7. deltacat/compute/compactor_v2/private/compaction_utils.py +9 -5
  8. deltacat/compute/compactor_v2/utils/content_type_params.py +185 -34
  9. deltacat/compute/compactor_v2/utils/io.py +28 -14
  10. deltacat/compute/compactor_v2/utils/primary_key_index.py +9 -4
  11. deltacat/compute/compactor_v2/utils/task_options.py +128 -183
  12. deltacat/compute/resource_estimation/__init__.py +27 -0
  13. deltacat/compute/resource_estimation/delta.py +271 -0
  14. deltacat/compute/resource_estimation/manifest.py +394 -0
  15. deltacat/compute/resource_estimation/model.py +165 -0
  16. deltacat/compute/resource_estimation/parquet.py +108 -0
  17. deltacat/constants.py +5 -0
  18. deltacat/exceptions.py +2 -4
  19. deltacat/logs.py +8 -0
  20. deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +77 -0
  21. deltacat/tests/compute/compact_partition_rebase_test_cases.py +308 -0
  22. deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +159 -0
  23. deltacat/tests/compute/compactor_v2/test_compaction_session.py +157 -0
  24. deltacat/tests/compute/compactor_v2/utils/test_task_options.py +3 -3
  25. deltacat/tests/compute/resource_estimation/test_delta.py +605 -0
  26. deltacat/tests/compute/resource_estimation/test_manifest.py +921 -0
  27. deltacat/tests/compute/test_compact_partition_rebase.py +13 -4
  28. deltacat/tests/compute/test_util_common.py +2 -0
  29. deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -5
  30. deltacat/tests/test_logs.py +34 -0
  31. deltacat/tests/test_utils/pyarrow.py +15 -5
  32. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/METADATA +2 -2
  33. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/RECORD +38 -54
  34. deltacat/compute/metastats/meta_stats.py +0 -479
  35. deltacat/compute/metastats/model/__init__.py +0 -0
  36. deltacat/compute/metastats/model/partition_stats_dict.py +0 -34
  37. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +0 -68
  38. deltacat/compute/metastats/stats.py +0 -182
  39. deltacat/compute/metastats/utils/__init__.py +0 -0
  40. deltacat/compute/metastats/utils/constants.py +0 -16
  41. deltacat/compute/metastats/utils/io.py +0 -223
  42. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +0 -18
  43. deltacat/compute/metastats/utils/ray_utils.py +0 -129
  44. deltacat/compute/stats/basic.py +0 -226
  45. deltacat/compute/stats/models/__init__.py +0 -0
  46. deltacat/compute/stats/models/delta_column_stats.py +0 -98
  47. deltacat/compute/stats/models/delta_stats.py +0 -233
  48. deltacat/compute/stats/models/delta_stats_cache_result.py +0 -49
  49. deltacat/compute/stats/models/manifest_entry_stats.py +0 -72
  50. deltacat/compute/stats/models/stats_result.py +0 -104
  51. deltacat/compute/stats/utils/__init__.py +0 -0
  52. deltacat/compute/stats/utils/intervals.py +0 -94
  53. deltacat/compute/stats/utils/io.py +0 -230
  54. deltacat/compute/stats/utils/manifest_stats_file.py +0 -100
  55. deltacat/tests/stats/__init__.py +0 -0
  56. deltacat/tests/stats/test_intervals.py +0 -49
  57. /deltacat/{compute/metastats → tests/compute/resource_estimation}/__init__.py +0 -0
  58. /deltacat/{compute/metastats/config → tests/compute/resource_estimation/data}/__init__.py +0 -0
  59. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/LICENSE +0 -0
  60. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/WHEEL +0 -0
  61. {deltacat-1.1.17.dist-info → deltacat-1.1.19.dist-info}/top_level.txt +0 -0
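Note that this release deletes the `deltacat.compute.stats` and `deltacat.compute.metastats` packages outright (items 34-56 above), while a new `deltacat/compute/resource_estimation` package is added. Code that still imports the removed modules will raise `ImportError` on 1.1.19. A minimal defensive sketch (the `None` fallback is an assumption for illustration, not part of the package):

```python
# Hypothetical guard for callers of the stats utilities removed in this
# release; this import path exists in deltacat <= 1.1.17 only.
try:
    from deltacat.compute.stats.utils.intervals import merge_intervals
except ImportError:
    merge_intervals = None  # removed as of 1.1.19; degrade or pin deltacat
```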
deltacat/compute/stats/models/stats_result.py
@@ -1,104 +0,0 @@
- # Allow classes to use self-referencing Type hints in Python 3.7.
- from __future__ import annotations
-
- from collections import defaultdict
- from typing import Any, Dict, List, Optional, Set
-
- from deltacat.compute.stats.types import ALL_STATS_TYPES, StatsType
-
-
- class StatsResult(dict):
-     """A generic container that holds stats for a single manifest entry file."""
-
-     @staticmethod
-     def of(
-         row_count: Optional[int] = 0, pyarrow_table_bytes: Optional[int] = 0
-     ) -> StatsResult:
-         """Static factory for building a stats result object
-
-         Args:
-             row_count: The total number of rows of a manifest entry
-             pyarrow_table_bytes: The total number of bytes when loaded into memory as a PyArrow Table
-
-         Returns:
-             A stats result object
-         """
-         sr = StatsResult()
-         sr[StatsType.ROW_COUNT.value] = row_count
-         sr[StatsType.PYARROW_TABLE_BYTES.value] = pyarrow_table_bytes
-         return sr
-
-     @property
-     def row_count(self) -> int:
-         """Represents the row count of a manifest entry file.
-
-         Returns:
-             The total number of rows of a manifest entry
-         """
-         return self[StatsType.ROW_COUNT.value]
-
-     @property
-     def pyarrow_table_bytes(self) -> int:
-         """Represents the size of a manifest entry file (in bytes) as it was loaded into a PyArrow table.
-
-         Returns:
-             The total number of bytes when loaded into memory as a PyArrow Table
-         """
-         return self[StatsType.PYARROW_TABLE_BYTES.value]
-
-     @staticmethod
-     def from_stats_types(stats_types: Dict[StatsType, Any]) -> StatsResult:
-         """A helper method that filters a dictionary by supported stats and returns a stats result object.
-
-         Args:
-             stats_types: Stats that should be included for constructing a stats result
-
-         Returns:
-             A stats result object
-         """
-         return StatsResult(
-             {
-                 k: v
-                 for k, v in stats_types.items()
-                 if k in [StatsType.ROW_COUNT, StatsType.PYARROW_TABLE_BYTES]
-             }
-         )
-
-     @staticmethod
-     def merge(
-         stats_list: List[StatsResult],
-         stat_types: Optional[Set[StatsType]] = None,
-         record_row_count_once: bool = False,
-     ) -> StatsResult:
-         """Helper method to merge any list of StatsResult objects into one.
-
-         StatsResult objects are merged by adding up their numerical stats.
-         TODO (ricmiyam): Handle non-numerical stats when they are added
-
-         Args:
-             stat_types: If provided, the calculation will only include the requested stats.
-             record_row_count_once: If optionally set to `True`, then row counts are only added
-                 from the first stats entry. One use case for this is merging table-centric stats
-                 by columns, since the row count is expected to be the same across different columns.
-
-         Returns:
-             A stats result object
-         """
-         assert isinstance(stats_list, list) and len(stats_list) > 0, (
-             f"Expected stats list: {stats_list} of type {type(stats_list)} to be a "
-             f"non-empty list of StatsResult objects."
-         )
-
-         # Fallback to all stat types if not provided
-         stats_to_collect: Set = stat_types or ALL_STATS_TYPES
-
-         merged_stats: Dict[StatsType, int] = defaultdict(int)
-         for stats_result in stats_list:
-             for stat_type in stats_to_collect:
-                 if stats_result:
-                     merged_stats[stat_type.value] += stats_result[stat_type.value]
-
-         if record_row_count_once and StatsType.ROW_COUNT in stats_to_collect:
-             merged_stats[StatsType.ROW_COUNT.value] = stats_list[0].row_count
-
-         return StatsResult.from_stats_types(merged_stats)
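For reference, a short usage sketch of the removed `StatsResult` merge semantics, valid only against deltacat <= 1.1.17 (the values are illustrative):

```python
from deltacat.compute.stats.models.stats_result import StatsResult

a = StatsResult.of(row_count=100, pyarrow_table_bytes=2_048)
b = StatsResult.of(row_count=100, pyarrow_table_bytes=4_096)

# merge() sums numerical stats across entries.
merged = StatsResult.merge([a, b])
assert merged.row_count == 200
assert merged.pyarrow_table_bytes == 6_144

# record_row_count_once=True keeps only the first entry's row count: the
# intended mode when merging per-column stats over the same rows.
by_column = StatsResult.merge([a, b], record_row_count_once=True)
assert by_column.row_count == 100
```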
deltacat/compute/stats/utils/__init__.py (file without changes)
deltacat/compute/stats/utils/intervals.py
@@ -1,94 +0,0 @@
- from typing import Iterable, List, Optional, Set, Tuple, Union
-
- DeltaPosition = Optional[int]
- NumericDeltaPosition = Union[int, float]  # float is added here to support math.inf
- DeltaRange = Tuple[DeltaPosition, DeltaPosition]
-
-
- def merge_intervals(intervals: Set[DeltaRange]) -> Set[DeltaRange]:
-     """Merges a set of N input intervals into a minimal number of output intervals.
-
-     All input intervals will be merged into a minimal number of output intervals in O(N log N) time.
-
-     Example:
-         >>> merge_intervals({(3, 9), (8, 12), (15, 19)})
-         {(3, 12), (15, 19)}
-
-     Example:
-         >>> merge_intervals({(3, 9), (None, 15), (13, 30)})
-         {(None, 30)}
-
-     Args:
-         intervals: A set of intervals with an int type representing finite, closed bounded values and
-             a None type representing infinity.
-
-     Returns:
-         A minimal number of output intervals
-     """
-     merged: Set[DeltaRange] = set()
-     intervals_list: List[DeltaRange] = list(intervals)
-
-     if len(intervals_list) == 0:
-         return merged
-
-     _to_numeric_values(intervals_list)
-     intervals_list.sort()  # sort by starting range numbers
-
-     merge_start, merge_end = None, None
-     for interval in intervals_list:
-         start, end = interval
-         if start > end:
-             raise ValueError(
-                 f"Invalid stream position range interval: ({start}, {end})"
-             )
-
-         if merge_start is None and merge_end is None:
-             merge_start, merge_end = start, end
-             continue
-
-         if merge_end < start:
-             # add current merge interval if no overlap, begin new interval
-             _add_merged_interval(merged, merge_start, merge_end)
-             merge_start, merge_end = start, end
-         elif merge_end < end:
-             merge_end = end  # expand current merge interval if there is an overlap
-
-     # add final merge interval
-     _add_merged_interval(merged, merge_start, merge_end)
-
-     return merged
-
-
- def _add_merged_interval(
-     result_set: set, start: NumericDeltaPosition, end: NumericDeltaPosition
- ):
-     start_pos: DeltaPosition = start if isinstance(start, int) else None
-     end_pos: DeltaPosition = end if isinstance(end, int) else None
-     result_set.add((start_pos, end_pos))
-
-
- def _to_numeric_values(intervals_list: List[DeltaRange]):
-     for i, interval in enumerate(intervals_list):
-         start, end = _get_validated_interval(interval)
-         if start is None:
-             start = float("-inf")
-         if end is None:
-             end = float("inf")
-
-         intervals_list[i] = (start, end)
-
-
- def _get_validated_interval(interval: DeltaRange) -> DeltaRange:
-     if not isinstance(interval, Iterable) or len(interval) != 2:
-         raise ValueError(f"Interval {interval} must be a tuple of size 2")
-
-     start, end = interval
-     if not (isinstance(start, int) or start is None) or not (
-         isinstance(end, int) or end is None
-     ):
-         raise ValueError(
-             f"Invalid stream position value types: "
-             f"({start}, {end}) - ({type(start), type(end)})"
-         )
-
-     return start, end
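A behavior sketch of the removed `merge_intervals`, matching the corrected docstring examples above (deltacat <= 1.1.17 only):

```python
from deltacat.compute.stats.utils.intervals import merge_intervals

# Overlapping closed intervals collapse; None marks an unbounded endpoint
# and is normalized to +/-inf internally before sorting.
assert merge_intervals({(3, 9), (8, 12), (15, 19)}) == {(3, 12), (15, 19)}
assert merge_intervals({(3, 9), (None, 15), (13, 30)}) == {(None, 30)}

# Reversed bounds and non-int/non-None endpoints raise ValueError.
for bad in [{(9, 3)}, {(1.2, 3)}]:
    try:
        merge_intervals(bad)
    except ValueError:
        pass
```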
deltacat/compute/stats/utils/io.py
@@ -1,230 +0,0 @@
- import logging
- from collections import defaultdict
- from typing import Any, Dict, List, Optional
-
- import pyarrow
- import ray
-
- from deltacat import LocalTable, TableType, logs
- from deltacat.compute.stats.models.delta_column_stats import DeltaColumnStats
- from deltacat.compute.stats.models.delta_stats import DeltaStats, DeltaStatsCacheMiss
- from deltacat.compute.stats.models.delta_stats_cache_result import DeltaStatsCacheResult
- from deltacat.compute.stats.models.manifest_entry_stats import ManifestEntryStats
- from deltacat.compute.stats.models.stats_result import StatsResult
- from deltacat.compute.stats.utils.intervals import DeltaRange
- from deltacat.compute.stats.utils.manifest_stats_file import (
-     read_manifest_stats_by_columns,
-     write_manifest_stats_file,
- )
- from deltacat.storage import Delta, DeltaLocator, PartitionLocator
- from deltacat.storage import interface as unimplemented_deltacat_storage
-
- logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
-
-
- @ray.remote
- def read_cached_delta_stats(
-     delta: Delta, columns_to_fetch: List[str], stat_results_s3_bucket: str
- ):
-     """Read delta stats that are cached in S3
-
-     This Ray distributed task reads delta stats from a file system (i.e. S3) based on specified columns.
-     Stats are extracted from columns that are found (cache hit), while columns that are missing
-     from the file system will record their column names and the delta locator as a cache miss.
-
-     Args:
-         delta: The delta object to look up
-         columns_to_fetch: Columns to look up for this delta
-         stat_results_s3_bucket: The S3 bucket name
-     """
-
-     delta_locator = DeltaLocator.of(delta.partition_locator, delta.stream_position)
-     column_stats_completion_info: List[
-         DeltaColumnStats
-     ] = read_manifest_stats_by_columns(
-         stat_results_s3_bucket, columns_to_fetch, delta_locator
-     )
-
-     found_columns_stats: List[DeltaColumnStats] = []
-     missed_columns: List[str] = []
-     for column_stats in column_stats_completion_info:
-         if column_stats.manifest_stats:
-             found_columns_stats.append(column_stats)
-         else:
-             missed_columns.append(column_stats.column)
-
-     found_stats: Optional[DeltaStats] = (
-         DeltaStats.of(found_columns_stats) if found_columns_stats else None
-     )
-     missed_stats: Optional[DeltaStatsCacheMiss] = (
-         DeltaStatsCacheMiss(missed_columns, delta.locator) if missed_columns else None
-     )
-
-     return DeltaStatsCacheResult.of(found_stats, missed_stats)
-
-
- @ray.remote
- def cache_delta_column_stats(
-     stat_results_s3_bucket: str, dataset_column: DeltaColumnStats
- ) -> None:
-     """Ray distributed task to cache the delta column stats into a file system (i.e. S3).
-
-     Args:
-         stat_results_s3_bucket: The S3 bucket name
-         dataset_column: Column-oriented stats for a given delta
-     """
-     write_manifest_stats_file(
-         stat_results_s3_bucket, dataset_column.column, dataset_column.manifest_stats
-     )
-
-
- @ray.remote
- def get_delta_stats(
-     delta_locator: DeltaLocator,
-     columns: Optional[List[str]] = None,
-     deltacat_storage=unimplemented_deltacat_storage,
-     deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
- ) -> DeltaStats:
-     """Ray distributed task to compute and collect stats for a requested delta.
-     If no columns are requested, stats will be computed for all columns.
-     Args:
-         delta_locator: A reference to the delta
-         columns: Column names to specify for this delta. If not provided, all columns are considered.
-         deltacat_storage: Client implementation of the DeltaCAT storage interface
-     Returns:
-         A delta wide stats container
-     """
-     if deltacat_storage_kwargs is None:
-         deltacat_storage_kwargs = {}
-     manifest = deltacat_storage.get_delta_manifest(
-         delta_locator, **deltacat_storage_kwargs
-     )
-     delta = Delta.of(delta_locator, None, None, None, manifest)
-     return _collect_stats_by_columns(
-         delta, columns, deltacat_storage, deltacat_storage_kwargs
-     )
-
-
- @ray.remote
- def get_deltas_from_range(
-     source_partition_locator: PartitionLocator,
-     start_position_inclusive: DeltaRange,
-     end_position_inclusive: DeltaRange,
-     deltacat_storage=unimplemented_deltacat_storage,
-     **kwargs,
- ) -> List[Delta]:
-     """Looks up deltas in the specified partition using Ray, given both starting and ending delta stream positions.
-
-     Args:
-         source_partition_locator: Reference to the partition locator tied to the given delta stream positions
-         start_position_inclusive: Starting stream position of a range interval.
-             Can be an int type to represent a closed bounded range, or a None type to represent unbounded infinity.
-         end_position_inclusive: Ending stream position of a range interval.
-             Can be an int type to represent a closed bounded range, or a None type to represent unbounded infinity.
-         deltacat_storage: Client implementation of the DeltaCAT storage interface
-
-     Returns:
-         a list of delta objects
-     """
-
-     namespace, partition_values = (
-         source_partition_locator.namespace,
-         source_partition_locator.partition_values,
-     )
-     table_name, table_version = (
-         source_partition_locator.table_name,
-         source_partition_locator.table_version,
-     )
-     deltas_list_result = deltacat_storage.list_deltas(
-         namespace,
-         table_name,
-         partition_values,
-         table_version,
-         start_position_inclusive,
-         end_position_inclusive,
-         ascending_order=True,
-         include_manifest=False,
-         **kwargs,
-     )
-     return deltas_list_result.all_items()
-
-
- def _collect_stats_by_columns(
-     delta: Delta,
-     columns_to_compute: Optional[List[str]] = None,
-     deltacat_storage=unimplemented_deltacat_storage,
-     deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
- ) -> DeltaStats:
-     """Materializes one manifest entry at a time to save memory usage and calculate stats from each of its columns.
-     Args:
-         delta: A delta object to calculate stats for
-         columns_to_compute: Columns to calculate stats for. If not provided, all columns are considered.
-         deltacat_storage: Client implementation of the DeltaCAT storage interface
-     Returns:
-         A delta wide stats container
-     """
-     if deltacat_storage_kwargs is None:
-         deltacat_storage_kwargs = {}
-     assert (
-         delta.manifest is not None
-     ), f"Manifest should not be missing from delta for stats calculation: {delta}"
-
-     # Mapping of column_name -> [stats_file_idx_1, stats_file_idx_2, ... stats_file_idx_n]
-     column_stats_map: Dict[str, List[Optional[StatsResult]]] = defaultdict(
-         lambda: [None] * len(delta.manifest.entries)
-     )
-
-     total_tables_size = 0
-     for file_idx, manifest in enumerate(delta.manifest.entries):
-         entry_pyarrow_table: LocalTable = (
-             deltacat_storage.download_delta_manifest_entry(
-                 delta,
-                 file_idx,
-                 TableType.PYARROW,
-                 columns_to_compute,
-                 **deltacat_storage_kwargs,
-             )
-         )
-         assert isinstance(entry_pyarrow_table, pyarrow.Table), (
-             f"Stats collection is only supported for PyArrow tables, but received a table of "
-             f"type '{type(entry_pyarrow_table)}' for manifest entry {file_idx} of delta: {delta.locator}."
-         )
-         total_tables_size += entry_pyarrow_table.nbytes
-         if not columns_to_compute:
-             columns_to_compute = entry_pyarrow_table.column_names
-
-         for column_idx, pyarrow_column in enumerate(entry_pyarrow_table.columns):
-             column_name = columns_to_compute[column_idx]
-             column_stats_map[column_name][file_idx] = StatsResult.of(
-                 len(pyarrow_column), pyarrow_column.nbytes
-             )
-
-     # Add column-wide stats for a list of tables, these will be used for caching and retrieving later
-     delta_ds_column_stats: List[DeltaColumnStats] = _to_dataset_column_stats(
-         delta.locator, columns_to_compute, column_stats_map
-     )
-
-     dataset_stats: DeltaStats = DeltaStats.of(delta_ds_column_stats)
-
-     # Quick validation for calculations
-     assert dataset_stats.stats.pyarrow_table_bytes == total_tables_size, (
-         f"Expected the size of all PyArrow tables ({total_tables_size} bytes) "
-         f"to match the sum of each of its columns ({dataset_stats.stats.pyarrow_table_bytes} bytes)"
-     )
-
-     return dataset_stats
-
-
- def _to_dataset_column_stats(
-     delta_locator: DeltaLocator,
-     column_names: List[str],
-     column_manifest_map: Dict[str, List[Optional[StatsResult]]],
- ) -> List[DeltaColumnStats]:
-     dataset_stats: List[DeltaColumnStats] = []
-     for column_name in column_names:
-         column_manifest_stats = ManifestEntryStats.of(
-             column_manifest_map[column_name], delta_locator
-         )
-         dataset_column_stats = DeltaColumnStats.of(column_name, column_manifest_stats)
-         dataset_stats.append(dataset_column_stats)
-     return dataset_stats
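The removed `_collect_stats_by_columns` above shows the memory-bounded pattern this module relied on: materialize one manifest entry at a time and accumulate per-column row counts and in-memory byte sizes. A self-contained sketch of the same accumulation using plain pyarrow (the in-memory tables stand in for downloaded manifest entries):

```python
from collections import defaultdict

import pyarrow as pa

# Stand-ins for manifest entries materialized one at a time.
entries = [
    pa.table({"id": [1, 2], "val": ["a", "b"]}),
    pa.table({"id": [3], "val": ["c"]}),
]

# column_name -> per-entry (row_count, nbytes), mirroring column_stats_map.
column_stats = defaultdict(lambda: [None] * len(entries))
for file_idx, tbl in enumerate(entries):
    for name, col in zip(tbl.column_names, tbl.columns):
        column_stats[name][file_idx] = (len(col), col.nbytes)

# Same invariant the removed code asserts: per-column byte sizes sum to the
# total table size.
total = sum(n for stats in column_stats.values() for _, n in stats)
assert total == sum(t.nbytes for t in entries)
```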
deltacat/compute/stats/utils/manifest_stats_file.py
@@ -1,100 +0,0 @@
- import json
- import logging
- from typing import List
-
- from deltacat import logs
- from deltacat.aws import s3u as s3_utils
- from deltacat.compute.stats.models.delta_column_stats import DeltaColumnStats
- from deltacat.compute.stats.models.manifest_entry_stats import ManifestEntryStats
- from deltacat.storage import DeltaLocator
- from deltacat.utils.common import sha1_hexdigest
-
- logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
-
-
- def get_manifest_stats_s3_url(
-     bucket: str, column_name: str, delta_locator: DeltaLocator
- ) -> str:
-     """Returns the S3 URL path to the column-oriented delta stats
-
-     Args:
-         bucket: The S3 bucket
-         column_name: The name of the column to look up stats for
-         delta_locator: The reference to the delta corresponding to the manifest entries
-
-     Returns:
-         An S3 URL path
-     """
-     stats_column_id = f"{delta_locator.canonical_string()}|{column_name}"
-     stats_column_hexdigest = sha1_hexdigest(stats_column_id.encode("utf-8"))
-     base_path = s3_utils.parse_s3_url(bucket).url
-     return f"{base_path}/{stats_column_hexdigest}.json"
-
-
- def read_manifest_stats_by_columns(
-     bucket: str, column_names: List[str], delta_locator: DeltaLocator
- ) -> List[DeltaColumnStats]:
-     """Fetch a list of delta column stats by reading each column-oriented delta stats file from S3
-
-     Args:
-         bucket: The S3 bucket
-         column_names: A list of column names to look up stats for
-         delta_locator: The reference to the delta corresponding to the manifest entries
-
-     Returns:
-         A list of delta column stats
-     """
-     return [
-         DeltaColumnStats.of(
-             column, read_manifest_stats_file(bucket, column, delta_locator)
-         )
-         for column in column_names
-     ]
-
-
- def read_manifest_stats_file(
-     bucket: str, column_name: str, delta_locator: DeltaLocator
- ) -> ManifestEntryStats:
-     """Read a manifest entry stats from S3
-
-     Args:
-         bucket: The S3 bucket
-         column_name: The name of the column to look up stats for
-         delta_locator: The reference to the delta corresponding to the manifest entries
-
-     Returns:
-         A container that holds a list of manifest entry stats for the given column name
-     """
-
-     stats_completion_file_url = get_manifest_stats_s3_url(
-         bucket, column_name, delta_locator
-     )
-     logger.info(f"reading stats completion file from: {stats_completion_file_url}")
-     stats_completion_info_file = None
-     result = s3_utils.download(stats_completion_file_url, fail_if_not_found=False)
-     if result:
-         json_str = result["Body"].read().decode("utf-8")
-         stats_completion_info_file = ManifestEntryStats(json.loads(json_str))
-         logger.info(f"read stats completion info: {stats_completion_info_file}")
-     return stats_completion_info_file
-
-
- def write_manifest_stats_file(
-     bucket: str, column_name: str, manifest_entry_stats: ManifestEntryStats
- ) -> None:
-     """Write a manifest entry stats into S3
-
-     Args:
-         bucket: The S3 bucket
-         column_name: The name of the column which represents this manifest entry stats
-         manifest_entry_stats: The manifest entry stats to serialize and store into S3
-     """
-     logger.info(f"writing stats completion file contents: {manifest_entry_stats}")
-     stats_completion_file_s3_url = get_manifest_stats_s3_url(
-         bucket,
-         column_name,
-         manifest_entry_stats.delta_locator,
-     )
-     logger.info(f"writing stats completion file to: {stats_completion_file_s3_url}")
-     s3_utils.upload(stats_completion_file_s3_url, str(json.dumps(manifest_entry_stats)))
-     logger.info(f"stats completion file written to: {stats_completion_file_s3_url}")
deltacat/tests/stats/__init__.py (file without changes)
deltacat/tests/stats/test_intervals.py
@@ -1,49 +0,0 @@
- import unittest
- from typing import Tuple
-
- from deltacat.compute.stats.utils.intervals import DeltaRange, merge_intervals
-
-
- class TestMergeIntervals(unittest.TestCase):
-     def test_unbounded_start_range(self):
-         intervals = sorted(merge_intervals({(3, 9), (None, 15), (13, 30)}))
-         interval: Tuple[DeltaRange, DeltaRange] = intervals[0]
-         self.assertEqual(interval[0], None)
-         self.assertEqual(interval[1], 30)
-
-     def test_unbounded_end_range(self):
-         intervals = sorted(merge_intervals({(3, 9), (2, None), (13, 30)}))
-         interval: Tuple[DeltaRange, DeltaRange] = intervals[0]
-         self.assertEqual(interval[0], 2)
-         self.assertEqual(interval[1], None)
-
-     def test_unbounded_start_end_range(self):
-         intervals = sorted(merge_intervals({(None, None)}))
-         interval: Tuple[DeltaRange, DeltaRange] = intervals[0]
-         self.assertEqual(interval[0], None)
-         self.assertEqual(interval[1], None)
-
-     def test_no_overlap_range(self):
-         intervals = sorted(merge_intervals({(3, 9), (11, 14), (19, 30)}))
-         interval1: Tuple[DeltaRange, DeltaRange] = intervals[0]
-         interval2: Tuple[DeltaRange, DeltaRange] = intervals[1]
-         interval3: Tuple[DeltaRange, DeltaRange] = intervals[2]
-         self.assertEqual(interval1, (3, 9))
-         self.assertEqual(interval2, (11, 14))
-         self.assertEqual(interval3, (19, 30))
-
-     def test_overlap_range(self):
-         intervals = sorted(merge_intervals({(3, 9), (9, 14), (14, 30)}))
-         interval1: Tuple[DeltaRange, DeltaRange] = intervals[0]
-         self.assertEqual(interval1, (3, 30))
-
-     def test_invalid_range(self):
-         self.assertRaises(ValueError, merge_intervals, {(3, 9), (9, 3)})
-
-     def test_invalid_type(self):
-         self.assertRaises(ValueError, merge_intervals, {(3, 9), (1.2, 3)})
-         self.assertRaises(ValueError, merge_intervals, {(3, 9), ("1", 3)})
-
-
- if __name__ == "__main__":
-     unittest.main()