deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. deltacat/__init__.py +41 -15
  2. deltacat/aws/clients.py +12 -31
  3. deltacat/aws/constants.py +1 -1
  4. deltacat/aws/redshift/__init__.py +7 -2
  5. deltacat/aws/redshift/model/manifest.py +54 -50
  6. deltacat/aws/s3u.py +176 -187
  7. deltacat/catalog/delegate.py +151 -185
  8. deltacat/catalog/interface.py +78 -97
  9. deltacat/catalog/model/catalog.py +21 -21
  10. deltacat/catalog/model/table_definition.py +11 -9
  11. deltacat/compute/compactor/__init__.py +12 -16
  12. deltacat/compute/compactor/compaction_session.py +237 -166
  13. deltacat/compute/compactor/model/delta_annotated.py +60 -44
  14. deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
  15. deltacat/compute/compactor/model/delta_file_locator.py +10 -8
  16. deltacat/compute/compactor/model/materialize_result.py +6 -7
  17. deltacat/compute/compactor/model/primary_key_index.py +38 -34
  18. deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
  19. deltacat/compute/compactor/model/round_completion_info.py +25 -19
  20. deltacat/compute/compactor/model/sort_key.py +18 -15
  21. deltacat/compute/compactor/steps/dedupe.py +119 -94
  22. deltacat/compute/compactor/steps/hash_bucket.py +48 -47
  23. deltacat/compute/compactor/steps/materialize.py +86 -92
  24. deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
  25. deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
  26. deltacat/compute/compactor/utils/io.py +59 -47
  27. deltacat/compute/compactor/utils/primary_key_index.py +91 -80
  28. deltacat/compute/compactor/utils/round_completion_file.py +22 -23
  29. deltacat/compute/compactor/utils/system_columns.py +33 -45
  30. deltacat/compute/metastats/meta_stats.py +235 -157
  31. deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
  32. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
  33. deltacat/compute/metastats/stats.py +95 -64
  34. deltacat/compute/metastats/utils/io.py +100 -53
  35. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
  36. deltacat/compute/metastats/utils/ray_utils.py +38 -33
  37. deltacat/compute/stats/basic.py +107 -69
  38. deltacat/compute/stats/models/delta_column_stats.py +11 -8
  39. deltacat/compute/stats/models/delta_stats.py +59 -32
  40. deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
  41. deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
  42. deltacat/compute/stats/models/stats_result.py +24 -14
  43. deltacat/compute/stats/utils/intervals.py +16 -9
  44. deltacat/compute/stats/utils/io.py +86 -51
  45. deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
  46. deltacat/constants.py +4 -13
  47. deltacat/io/__init__.py +2 -2
  48. deltacat/io/aws/redshift/redshift_datasource.py +157 -143
  49. deltacat/io/dataset.py +14 -17
  50. deltacat/io/read_api.py +36 -33
  51. deltacat/logs.py +94 -42
  52. deltacat/storage/__init__.py +18 -8
  53. deltacat/storage/interface.py +196 -213
  54. deltacat/storage/model/delta.py +45 -51
  55. deltacat/storage/model/list_result.py +12 -8
  56. deltacat/storage/model/namespace.py +4 -5
  57. deltacat/storage/model/partition.py +42 -42
  58. deltacat/storage/model/stream.py +29 -30
  59. deltacat/storage/model/table.py +14 -14
  60. deltacat/storage/model/table_version.py +32 -31
  61. deltacat/storage/model/types.py +1 -0
  62. deltacat/tests/stats/test_intervals.py +11 -24
  63. deltacat/tests/utils/__init__.py +0 -0
  64. deltacat/tests/utils/test_record_batch_tables.py +284 -0
  65. deltacat/types/media.py +3 -4
  66. deltacat/types/tables.py +31 -21
  67. deltacat/utils/common.py +5 -11
  68. deltacat/utils/numpy.py +20 -22
  69. deltacat/utils/pandas.py +73 -100
  70. deltacat/utils/performance.py +3 -9
  71. deltacat/utils/placement.py +259 -230
  72. deltacat/utils/pyarrow.py +302 -89
  73. deltacat/utils/ray_utils/collections.py +2 -1
  74. deltacat/utils/ray_utils/concurrency.py +27 -28
  75. deltacat/utils/ray_utils/dataset.py +28 -28
  76. deltacat/utils/ray_utils/performance.py +5 -9
  77. deltacat/utils/ray_utils/runtime.py +9 -10
  78. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
  79. deltacat-0.1.12.dist-info/RECORD +110 -0
  80. deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
  81. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
  82. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
  83. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
@@ -1,30 +1,33 @@
1
- import ray
2
- from typing import Dict, Set, Tuple, List, Optional
1
+ from typing import Dict, List, Optional, Set, Tuple
3
2
 
4
- from deltacat.compute.stats.models.delta_stats import DeltaStats
3
+ import ray
5
4
  from ray.types import ObjectRef
6
5
 
6
+ from deltacat.compute.stats.models.delta_stats import DeltaStats
7
7
  from deltacat.compute.stats.models.delta_stats_cache_result import DeltaStatsCacheResult
8
8
  from deltacat.compute.stats.models.stats_result import StatsResult
9
9
  from deltacat.compute.stats.types import StatsType
10
- from deltacat.compute.stats.utils.io import read_cached_delta_stats, cache_delta_column_stats, get_delta_stats, \
11
- get_deltas_from_range
12
- from deltacat.compute.stats.utils.intervals import merge_intervals, DeltaRange
13
-
14
- from deltacat.storage import PartitionLocator, DeltaLocator, Delta
10
+ from deltacat.compute.stats.utils.intervals import DeltaRange, merge_intervals
11
+ from deltacat.compute.stats.utils.io import (
12
+ cache_delta_column_stats,
13
+ get_delta_stats,
14
+ get_deltas_from_range,
15
+ read_cached_delta_stats,
16
+ )
17
+ from deltacat.storage import Delta, DeltaLocator, PartitionLocator
15
18
  from deltacat.storage import interface as unimplemented_deltacat_storage
16
19
 
17
-
18
20
  # TODO (ricmiyam): Decouple DeltaCAT from S3-based paths
19
21
  # TODO (ricmiyam): Determine cache eviction policy
20
22
 
21
23
 
22
24
  def collect(
23
- source_partition_locator: PartitionLocator,
24
- delta_stream_position_range_set: Optional[Set[DeltaRange]] = None,
25
- columns: Optional[List[str]] = None,
26
- stat_results_s3_bucket: Optional[str] = None,
27
- deltacat_storage=unimplemented_deltacat_storage) -> Dict[int, DeltaStats]:
25
+ source_partition_locator: PartitionLocator,
26
+ delta_stream_position_range_set: Optional[Set[DeltaRange]] = None,
27
+ columns: Optional[List[str]] = None,
28
+ stat_results_s3_bucket: Optional[str] = None,
29
+ deltacat_storage=unimplemented_deltacat_storage,
30
+ ) -> Dict[int, DeltaStats]:
28
31
  """Collects statistics on deltas, given a set of delta stream position ranges.
29
32
 
30
33
  Example:
@@ -52,109 +55,139 @@ def collect(
52
55
  delta_range_lookup_pending: List[ObjectRef[List[Delta]]] = []
53
56
 
54
57
  if not columns:
55
- columns = deltacat_storage.get_table_version_column_names(source_partition_locator.namespace,
56
- source_partition_locator.table_name,
57
- source_partition_locator.table_version)
58
+ columns = deltacat_storage.get_table_version_column_names(
59
+ source_partition_locator.namespace,
60
+ source_partition_locator.table_name,
61
+ source_partition_locator.table_version,
62
+ )
58
63
  for range_pair in merge_intervals(delta_stream_position_range_set):
59
64
  begin, end = range_pair
60
- promise: ObjectRef[List[Delta]] = get_deltas_from_range.remote(source_partition_locator, begin, end,
61
- deltacat_storage)
65
+ promise: ObjectRef[List[Delta]] = get_deltas_from_range.remote(
66
+ source_partition_locator, begin, end, deltacat_storage
67
+ )
62
68
  delta_range_lookup_pending.append(promise)
63
69
 
64
70
  delta_list_by_ranges: List[List[Delta]] = ray.get(delta_range_lookup_pending)
65
71
  deltas = [delta for delta_list in delta_list_by_ranges for delta in delta_list]
66
72
 
67
- delta_stats_processed_list: List[DeltaStats] = _collect_stats_from_deltas(deltas,
68
- columns,
69
- stat_results_s3_bucket,
70
- deltacat_storage)
73
+ delta_stats_processed_list: List[DeltaStats] = _collect_stats_from_deltas(
74
+ deltas, columns, stat_results_s3_bucket, deltacat_storage
75
+ )
71
76
 
72
77
  for delta_column_stats in delta_stats_processed_list:
73
- assert len(delta_column_stats.column_stats) > 0, \
74
- f"Expected columns of `{delta_column_stats}` to be non-empty"
75
- stream_position = delta_column_stats.column_stats[0].manifest_stats.delta_locator.stream_position
78
+ assert (
79
+ len(delta_column_stats.column_stats) > 0
80
+ ), f"Expected columns of `{delta_column_stats}` to be non-empty"
81
+ stream_position = delta_column_stats.column_stats[
82
+ 0
83
+ ].manifest_stats.delta_locator.stream_position
76
84
  delta_stream_range_stats[stream_position] = delta_column_stats
77
85
 
78
86
  return delta_stream_range_stats
79
87
 
80
88
 
81
89
  def collect_from_deltas(
82
- deltas: List[Delta],
83
- stat_types: Set[StatsType],
84
- columns: Optional[List[str]] = None,
85
- stat_results_s3_bucket: Optional[str] = None,
86
- deltacat_storage=unimplemented_deltacat_storage) -> StatsResult:
90
+ deltas: List[Delta],
91
+ stat_types: Set[StatsType],
92
+ columns: Optional[List[str]] = None,
93
+ stat_results_s3_bucket: Optional[str] = None,
94
+ deltacat_storage=unimplemented_deltacat_storage,
95
+ ) -> StatsResult:
87
96
  """
88
97
  Variant of the `collect` function that takes a list of deltas and computes
89
98
  the aggregate of all the delta stats.
90
99
  """
91
100
  if columns is None and deltas:
92
101
  delta_locator: DeltaLocator = deltas[0].locator
93
- columns = deltacat_storage.get_table_version_column_names(delta_locator.namespace,
94
- delta_locator.table_name,
95
- delta_locator.table_version)
102
+ columns = deltacat_storage.get_table_version_column_names(
103
+ delta_locator.namespace,
104
+ delta_locator.table_name,
105
+ delta_locator.table_version,
106
+ )
96
107
 
97
- delta_stats_processed_list: List[DeltaStats] = \
98
- _collect_stats_from_deltas(deltas, columns, stat_results_s3_bucket, deltacat_storage)
108
+ delta_stats_processed_list: List[DeltaStats] = _collect_stats_from_deltas(
109
+ deltas, columns, stat_results_s3_bucket, deltacat_storage
110
+ )
99
111
 
100
- return StatsResult.merge([delta_ds.stats for delta_ds in delta_stats_processed_list], stat_types)
112
+ return StatsResult.merge(
113
+ [delta_ds.stats for delta_ds in delta_stats_processed_list], stat_types
114
+ )
101
115
 
102
116
 
103
117
  def _collect_stats_from_deltas(
104
- deltas: List[Delta],
105
- columns: Optional[List[str]] = None,
106
- stat_results_s3_bucket: Optional[str] = None,
107
- deltacat_storage=unimplemented_deltacat_storage) -> List[DeltaStats]:
118
+ deltas: List[Delta],
119
+ columns: Optional[List[str]] = None,
120
+ stat_results_s3_bucket: Optional[str] = None,
121
+ deltacat_storage=unimplemented_deltacat_storage,
122
+ ) -> List[DeltaStats]:
108
123
  delta_cache_lookup_pending: List[ObjectRef[DeltaStatsCacheResult]] = []
109
124
  delta_stats_compute_pending: List[ObjectRef[DeltaStats]] = []
110
125
 
111
126
  for delta in deltas:
112
127
  if stat_results_s3_bucket:
113
- promise: ObjectRef[DeltaStatsCacheResult] = \
114
- read_cached_delta_stats.remote(delta, columns, stat_results_s3_bucket)
128
+ promise: ObjectRef[DeltaStatsCacheResult] = read_cached_delta_stats.remote(
129
+ delta, columns, stat_results_s3_bucket
130
+ )
115
131
  delta_cache_lookup_pending.append(promise)
116
132
  continue
117
133
 
118
- delta_stats_compute_pending.append(get_delta_stats.remote(delta.locator, columns, deltacat_storage))
134
+ delta_stats_compute_pending.append(
135
+ get_delta_stats.remote(delta.locator, columns, deltacat_storage)
136
+ )
119
137
 
120
- return _process_stats(delta_cache_lookup_pending, delta_stats_compute_pending,
121
- stat_results_s3_bucket, deltacat_storage)
138
+ return _process_stats(
139
+ delta_cache_lookup_pending,
140
+ delta_stats_compute_pending,
141
+ stat_results_s3_bucket,
142
+ deltacat_storage,
143
+ )
122
144
 
123
145
 
124
146
  def _process_stats(
125
- delta_cache_lookup_pending: List[ObjectRef[DeltaStatsCacheResult]],
126
- delta_stats_compute_pending: List[ObjectRef[DeltaStats]],
127
- stat_results_s3_bucket: Optional[str] = None,
128
- deltacat_storage=unimplemented_deltacat_storage) -> List[DeltaStats]:
147
+ delta_cache_lookup_pending: List[ObjectRef[DeltaStatsCacheResult]],
148
+ delta_stats_compute_pending: List[ObjectRef[DeltaStats]],
149
+ stat_results_s3_bucket: Optional[str] = None,
150
+ deltacat_storage=unimplemented_deltacat_storage,
151
+ ) -> List[DeltaStats]:
129
152
  if stat_results_s3_bucket:
130
- delta_stats_processed_list: List[DeltaStats] = _resolve_pending_stats_and_cache(delta_cache_lookup_pending,
131
- stat_results_s3_bucket,
132
- deltacat_storage)
153
+ delta_stats_processed_list: List[DeltaStats] = _resolve_pending_stats_and_cache(
154
+ delta_cache_lookup_pending, stat_results_s3_bucket, deltacat_storage
155
+ )
133
156
  else:
134
- delta_stats_processed_list: List[DeltaStats] = _resolve_pending_stats(delta_stats_compute_pending)
157
+ delta_stats_processed_list: List[DeltaStats] = _resolve_pending_stats(
158
+ delta_stats_compute_pending
159
+ )
135
160
 
136
161
  return delta_stats_processed_list
137
162
 
138
163
 
139
- def _resolve_pending_stats_and_cache(delta_cache_lookup_pending: List[ObjectRef[DeltaStatsCacheResult]],
140
- stat_results_s3_bucket: str,
141
- deltacat_storage) -> List[DeltaStats]:
142
- delta_stats_cached_list, delta_stats_pending_list = \
143
- _get_cached_and_pending_stats(delta_cache_lookup_pending, deltacat_storage)
144
- delta_stats_resolved_list: List[DeltaStats] = _resolve_pending_stats(delta_stats_pending_list)
164
+ def _resolve_pending_stats_and_cache(
165
+ delta_cache_lookup_pending: List[ObjectRef[DeltaStatsCacheResult]],
166
+ stat_results_s3_bucket: str,
167
+ deltacat_storage,
168
+ ) -> List[DeltaStats]:
169
+ delta_stats_cached_list, delta_stats_pending_list = _get_cached_and_pending_stats(
170
+ delta_cache_lookup_pending, deltacat_storage
171
+ )
172
+ delta_stats_resolved_list: List[DeltaStats] = _resolve_pending_stats(
173
+ delta_stats_pending_list
174
+ )
145
175
 
146
176
  # Cache the stats into the file store
147
- delta_stats_to_cache: List[ObjectRef] = [cache_delta_column_stats.remote(stat_results_s3_bucket, dcs)
148
- for dataset_stats in delta_stats_resolved_list
149
- for dcs in dataset_stats.column_stats]
177
+ delta_stats_to_cache: List[ObjectRef] = [
178
+ cache_delta_column_stats.remote(stat_results_s3_bucket, dcs)
179
+ for dataset_stats in delta_stats_resolved_list
180
+ for dcs in dataset_stats.column_stats
181
+ ]
150
182
  ray.get(delta_stats_to_cache)
151
183
 
152
184
  return [*delta_stats_cached_list, *delta_stats_resolved_list]
153
185
 
154
186
 
155
- def _get_cached_and_pending_stats(discover_deltas_pending: List[ObjectRef[DeltaStatsCacheResult]],
156
- deltacat_storage=unimplemented_deltacat_storage) \
157
- -> Tuple[List[DeltaStats], List[ObjectRef[DeltaStats]]]:
187
+ def _get_cached_and_pending_stats(
188
+ discover_deltas_pending: List[ObjectRef[DeltaStatsCacheResult]],
189
+ deltacat_storage=unimplemented_deltacat_storage,
190
+ ) -> Tuple[List[DeltaStats], List[ObjectRef[DeltaStats]]]:
158
191
  """
159
192
  Returns a tuple of a list of delta stats fetched from the cache, and a list of Ray tasks which will
160
193
  calculate the stats for deltas on cache miss.
@@ -172,13 +205,18 @@ def _get_cached_and_pending_stats(discover_deltas_pending: List[ObjectRef[DeltaS
172
205
  if cached_result.misses:
173
206
  missed_column_names: List[str] = cached_result.misses.column_names
174
207
  delta_locator: DeltaLocator = cached_result.misses.delta_locator
175
- delta_stats_pending.append(get_delta_stats.remote(delta_locator, missed_column_names, deltacat_storage))
208
+ delta_stats_pending.append(
209
+ get_delta_stats.remote(
210
+ delta_locator, missed_column_names, deltacat_storage
211
+ )
212
+ )
176
213
 
177
214
  return delta_stats_processed, delta_stats_pending
178
215
 
179
216
 
180
- def _resolve_pending_stats(delta_stats_pending_list: List[ObjectRef[DeltaStats]]) \
181
- -> List[DeltaStats]:
217
+ def _resolve_pending_stats(
218
+ delta_stats_pending_list: List[ObjectRef[DeltaStats]],
219
+ ) -> List[DeltaStats]:
182
220
  delta_stats_processed_list: List[DeltaStats] = []
183
221
  while delta_stats_pending_list:
184
222
  ready, delta_stats_pending_list = ray.wait(delta_stats_pending_list)
@@ -1,7 +1,7 @@
1
1
  # Allow classes to use self-referencing Type hints in Python 3.7.
2
2
  from __future__ import annotations
3
3
 
4
- from typing import Optional, Dict, Any, List
4
+ from typing import Any, Dict, List, Optional
5
5
 
6
6
  from deltacat.compute.stats.models.manifest_entry_stats import ManifestEntryStats
7
7
  from deltacat.compute.stats.models.stats_result import StatsResult
@@ -42,6 +42,7 @@ class DeltaColumnStats(dict):
42
42
  StatsResult([I, L]), # Manifest Entry 2
43
43
  ]))
44
44
  """
45
+
45
46
  @staticmethod
46
47
  def of(column: str, manifest_stats: ManifestEntryStats) -> DeltaColumnStats:
47
48
  """
@@ -59,13 +60,14 @@ class DeltaColumnStats(dict):
59
60
 
60
61
  @staticmethod
61
62
  def build_from_dict(delta_column_stats: List[str, Any]) -> List[DeltaColumnStats]:
62
- return DeltaColumnStats.of(delta_column_stats["column"],
63
- ManifestEntryStats.build_from_dict(delta_column_stats["manifestStats"]))
63
+ return DeltaColumnStats.of(
64
+ delta_column_stats["column"],
65
+ ManifestEntryStats.build_from_dict(delta_column_stats["manifestStats"]),
66
+ )
64
67
 
65
68
  @property
66
69
  def column(self) -> str:
67
- """Returns the column name.
68
- """
70
+ """Returns the column name."""
69
71
  return self.get("column")
70
72
 
71
73
  @property
@@ -81,8 +83,7 @@ class DeltaColumnStats(dict):
81
83
 
82
84
  @property
83
85
  def stats(self) -> Optional[StatsResult]:
84
- """ Combines the numerical stats for every manifest entry and returns it.
85
- """
86
+ """Combines the numerical stats for every manifest entry and returns it."""
86
87
  val: Dict[str, Any] = self.get("stats")
87
88
  if val is not None and not isinstance(val, StatsResult):
88
89
  self["stats"] = val = StatsResult(val)
@@ -92,4 +93,6 @@ class DeltaColumnStats(dict):
92
93
  return val
93
94
 
94
95
  def _merge_manifest_stats(self) -> StatsResult:
95
- return StatsResult.merge(self.manifest_stats.stats, {StatsType.PYARROW_TABLE_BYTES})
96
+ return StatsResult.merge(
97
+ self.manifest_stats.stats, {StatsType.PYARROW_TABLE_BYTES}
98
+ )
@@ -2,7 +2,7 @@
2
2
  from __future__ import annotations
3
3
 
4
4
  from collections import defaultdict
5
- from typing import List, Dict, Optional, Set, Any, NamedTuple
5
+ from typing import Any, Dict, List, NamedTuple, Optional, Set
6
6
 
7
7
  from deltacat.compute.stats.models.delta_column_stats import DeltaColumnStats
8
8
  from deltacat.compute.stats.models.manifest_entry_stats import ManifestEntryStats
@@ -76,8 +76,7 @@ class DeltaStats(dict):
76
76
 
77
77
  @property
78
78
  def stats(self) -> Optional[StatsResult]:
79
- """Returns a StatsResult object that represents this delta, aggregated by the column stats of this delta.
80
- """
79
+ """Returns a StatsResult object that represents this delta, aggregated by the column stats of this delta."""
81
80
  val: Dict[str, Any] = self.get("stats")
82
81
  if val is not None and not isinstance(val, StatsResult):
83
82
  self["stats"] = val = StatsResult(val)
@@ -104,8 +103,12 @@ class DeltaStats(dict):
104
103
  Returns:
105
104
  Stats for the manifest entry.
106
105
  """
107
- return StatsResult.merge(DeltaStats.get_manifest_entry_column_stats(self.column_stats, manifest_entry_idx),
108
- record_row_count_once=True)
106
+ return StatsResult.merge(
107
+ DeltaStats.get_manifest_entry_column_stats(
108
+ self.column_stats, manifest_entry_idx
109
+ ),
110
+ record_row_count_once=True,
111
+ )
109
112
 
110
113
  def manifest_entry_column_stats(self, manifest_entry_idx: int) -> List[StatsResult]:
111
114
  """Fetch a list of stats for each column in a manifest entry.
@@ -116,23 +119,34 @@ class DeltaStats(dict):
116
119
  Returns:
117
120
  A list of columnar stats for the manifest entry
118
121
  """
119
- return DeltaStats.get_manifest_entry_column_stats(self.column_stats, manifest_entry_idx)
122
+ return DeltaStats.get_manifest_entry_column_stats(
123
+ self.column_stats, manifest_entry_idx
124
+ )
120
125
 
121
126
  @staticmethod
122
- def get_manifest_entry_column_stats(columns: List[DeltaColumnStats], manifest_entry_idx: int) -> List[StatsResult]:
127
+ def get_manifest_entry_column_stats(
128
+ columns: List[DeltaColumnStats], manifest_entry_idx: int
129
+ ) -> List[StatsResult]:
123
130
  """Helper method to provide a list of columnar stats for a specific manifest entry.
124
131
 
125
132
  Returns:
126
133
  A list of columnar stats for the manifest entry
127
134
  """
128
- dataset_columnar_stats_list: List[ManifestEntryStats] = [column.manifest_stats for column in columns
129
- if column.manifest_stats is not None]
135
+ dataset_columnar_stats_list: List[ManifestEntryStats] = [
136
+ column.manifest_stats
137
+ for column in columns
138
+ if column.manifest_stats is not None
139
+ ]
130
140
  try:
131
- return [stats.stats[manifest_entry_idx] for stats in dataset_columnar_stats_list]
141
+ return [
142
+ stats.stats[manifest_entry_idx] for stats in dataset_columnar_stats_list
143
+ ]
132
144
  except IndexError:
133
145
  sci: ManifestEntryStats = dataset_columnar_stats_list[0]
134
- raise ValueError(f"Table index {manifest_entry_idx} is not present in this dataset of {sci.delta_locator} "
135
- f"with manifest table count of {len(sci.stats)}")
146
+ raise ValueError(
147
+ f"Table index {manifest_entry_idx} is not present in this dataset of {sci.delta_locator} "
148
+ f"with manifest table count of {len(sci.stats)}"
149
+ )
136
150
 
137
151
  @staticmethod
138
152
  def get_column_names(columns: List[DeltaColumnStats]) -> List[str]:
@@ -147,8 +161,9 @@ class DeltaStats(dict):
147
161
  return [column_stats.column for column_stats in columns] if columns else []
148
162
 
149
163
  @staticmethod
150
- def get_delta_stats(columns: List[DeltaColumnStats],
151
- stat_types: Optional[Set[StatsType]] = None) -> Optional[StatsResult]:
164
+ def get_delta_stats(
165
+ columns: List[DeltaColumnStats], stat_types: Optional[Set[StatsType]] = None
166
+ ) -> Optional[StatsResult]:
152
167
  """Calculate the sum of provided column stats and return it
153
168
 
154
169
  Args:
@@ -157,41 +172,52 @@ class DeltaStats(dict):
157
172
  Returns:
158
173
  Stats for the calculated sum
159
174
  """
160
- assert columns and len(columns) > 0, \
161
- f"Expected columns `{columns}` of type `{type(columns)}` " \
175
+ assert columns and len(columns) > 0, (
176
+ f"Expected columns `{columns}` of type `{type(columns)}` "
162
177
  f"to be a non-empty list of DeltaColumnStats"
178
+ )
163
179
 
164
- assert all([col.manifest_stats for col in columns]), \
165
- f"Expected stats completion info to be present in each item of {columns} "
180
+ assert all(
181
+ [col.manifest_stats for col in columns]
182
+ ), f"Expected stats completion info to be present in each item of {columns} "
166
183
 
167
184
  manifest_entry_count = len(columns[0].manifest_stats.stats)
168
- column_stats_map: Dict[str, List[Optional[StatsResult]]] = \
169
- defaultdict(lambda: [None] * manifest_entry_count)
185
+ column_stats_map: Dict[str, List[Optional[StatsResult]]] = defaultdict(
186
+ lambda: [None] * manifest_entry_count
187
+ )
170
188
 
171
189
  for column_stats in columns:
172
190
  for file_idx, entry_stats in enumerate(column_stats.manifest_stats.stats):
173
191
  column_stats_map[column_stats.column][file_idx] = entry_stats
174
192
 
175
- return DeltaStats._merge_stats_from_columns_to_dataset(DeltaStats.get_column_names(columns),
176
- column_stats_map,
177
- manifest_entry_count,
178
- stat_types)
193
+ return DeltaStats._merge_stats_from_columns_to_dataset(
194
+ DeltaStats.get_column_names(columns),
195
+ column_stats_map,
196
+ manifest_entry_count,
197
+ stat_types,
198
+ )
179
199
 
180
200
  @staticmethod
181
- def _merge_stats_from_columns_to_dataset(column_names: List[str],
182
- column_stats: Dict[str, List[Optional[StatsResult]]],
183
- manifest_entries_size: int,
184
- stat_types: Optional[Set[StatsType]] = None) -> StatsResult:
201
+ def _merge_stats_from_columns_to_dataset(
202
+ column_names: List[str],
203
+ column_stats: Dict[str, List[Optional[StatsResult]]],
204
+ manifest_entries_size: int,
205
+ stat_types: Optional[Set[StatsType]] = None,
206
+ ) -> StatsResult:
185
207
  manifest_entry_stats_summary_list: List[StatsResult] = []
186
208
  for manifest_entry_idx in range(manifest_entries_size):
187
209
  curr_manifest_entry_column_stats_list: List[StatsResult] = []
188
210
  for column_name in column_names:
189
- current_table_column_stats: StatsResult = column_stats[column_name][manifest_entry_idx]
211
+ current_table_column_stats: StatsResult = column_stats[column_name][
212
+ manifest_entry_idx
213
+ ]
190
214
  curr_manifest_entry_column_stats_list.append(current_table_column_stats)
191
215
 
192
- curr_manifest_entry_stats_summary = StatsResult.merge(curr_manifest_entry_column_stats_list,
193
- stat_types,
194
- record_row_count_once=True)
216
+ curr_manifest_entry_stats_summary = StatsResult.merge(
217
+ curr_manifest_entry_column_stats_list,
218
+ stat_types,
219
+ record_row_count_once=True,
220
+ )
195
221
  manifest_entry_stats_summary_list.append(curr_manifest_entry_stats_summary)
196
222
  return StatsResult.merge(manifest_entry_stats_summary_list, stat_types)
197
223
 
@@ -202,5 +228,6 @@ class DeltaStatsCacheMiss(NamedTuple):
202
228
  `column_names` represents missing dataset column names from the file system (ex: S3).
203
229
  delta_locator` is tied to the missing dataset columns and provided for future calculations.
204
230
  """
231
+
205
232
  column_names: List[str]
206
233
  delta_locator: DeltaLocator
@@ -12,8 +12,11 @@ class DeltaStatsCacheResult(dict):
12
12
  Stats are fetched and cached at the column level, and each column may represent one
13
13
  or more manifest entries.
14
14
  """
15
+
15
16
  @staticmethod
16
- def of(hits: Optional[DeltaStats], misses: Optional[DeltaStatsCacheMiss]) -> DeltaStatsCacheResult:
17
+ def of(
18
+ hits: Optional[DeltaStats], misses: Optional[DeltaStatsCacheMiss]
19
+ ) -> DeltaStatsCacheResult:
17
20
  cds = DeltaStatsCacheResult()
18
21
  cds["hits"] = hits
19
22
  cds["misses"] = misses
@@ -1,22 +1,24 @@
1
1
  # Allow classes to use self-referencing Type hints in Python 3.7.
2
2
  from __future__ import annotations
3
3
 
4
+ from typing import Any, Dict, List
5
+
4
6
  import pyarrow as pa
5
7
 
6
8
  from deltacat.compute.stats.models.stats_result import StatsResult
7
9
  from deltacat.storage import DeltaLocator
8
10
 
9
- from typing import Any, Dict, List
10
-
11
11
 
12
12
  class ManifestEntryStats(dict):
13
13
  """Holds computed statistics for one or more manifest entries (tables) and their corresponding delta locator.
14
14
 
15
15
  To be stored/retrieved from a file system (ex: S3).
16
16
  """
17
+
17
18
  @staticmethod
18
- def of(manifest_entries_stats: List[StatsResult],
19
- delta_locator: DeltaLocator) -> ManifestEntryStats:
19
+ def of(
20
+ manifest_entries_stats: List[StatsResult], delta_locator: DeltaLocator
21
+ ) -> ManifestEntryStats:
20
22
  """
21
23
  Creates a stats container that represents a particular manifest.
22
24
 
@@ -35,8 +37,12 @@ class ManifestEntryStats(dict):
35
37
  def build_from_dict(manifest_entries_stats: dict) -> ManifestEntryStats:
36
38
  stats_res_list = []
37
39
  for stats_res in manifest_entries_stats["stats"]:
38
- stats_res_list.append(StatsResult.of(stats_res["rowCount"], stats_res["pyarrowTableBytes"]))
39
- return ManifestEntryStats.of(stats_res_list, manifest_entries_stats["deltaLocator"])
40
+ stats_res_list.append(
41
+ StatsResult.of(stats_res["rowCount"], stats_res["pyarrowTableBytes"])
42
+ )
43
+ return ManifestEntryStats.of(
44
+ stats_res_list, manifest_entries_stats["deltaLocator"]
45
+ )
40
46
 
41
47
  @property
42
48
  def delta_locator(self) -> DeltaLocator:
@@ -1,17 +1,19 @@
1
1
  # Allow classes to use self-referencing Type hints in Python 3.7.
2
2
  from __future__ import annotations
3
3
 
4
- from typing import Optional, List, Set, Dict, Any
5
-
6
4
  from collections import defaultdict
7
- from deltacat.compute.stats.types import StatsType, ALL_STATS_TYPES
5
+ from typing import Any, Dict, List, Optional, Set
6
+
7
+ from deltacat.compute.stats.types import ALL_STATS_TYPES, StatsType
8
+
8
9
 
9
10
  class StatsResult(dict):
10
- """A generic container that holds stats for a single manifest entry file.
11
- """
11
+ """A generic container that holds stats for a single manifest entry file."""
12
+
12
13
  @staticmethod
13
- def of(row_count: Optional[int] = 0,
14
- pyarrow_table_bytes: Optional[int] = 0) -> StatsResult:
14
+ def of(
15
+ row_count: Optional[int] = 0, pyarrow_table_bytes: Optional[int] = 0
16
+ ) -> StatsResult:
15
17
  """Static factory for building a stats result object
16
18
 
17
19
  Args:
@@ -54,13 +56,20 @@ class StatsResult(dict):
54
56
  Returns:
55
57
  A stats result object
56
58
  """
57
- return StatsResult({k: v for k, v in stats_types.items()
58
- if k in [StatsType.ROW_COUNT, StatsType.PYARROW_TABLE_BYTES]})
59
+ return StatsResult(
60
+ {
61
+ k: v
62
+ for k, v in stats_types.items()
63
+ if k in [StatsType.ROW_COUNT, StatsType.PYARROW_TABLE_BYTES]
64
+ }
65
+ )
59
66
 
60
67
  @staticmethod
61
- def merge(stats_list: List[StatsResult],
62
- stat_types: Optional[Set[StatsType]] = None,
63
- record_row_count_once: bool = False) -> StatsResult:
68
+ def merge(
69
+ stats_list: List[StatsResult],
70
+ stat_types: Optional[Set[StatsType]] = None,
71
+ record_row_count_once: bool = False,
72
+ ) -> StatsResult:
64
73
  """Helper method to merge any list of StatsResult objects into one.
65
74
 
66
75
  StatsResult objects are merged by adding up their numerical stats.
@@ -75,9 +84,10 @@ class StatsResult(dict):
75
84
  Returns:
76
85
  A stats result object
77
86
  """
78
- assert isinstance(stats_list, list) and len(stats_list) > 0, \
79
- f"Expected stats list: {stats_list} of type {type(stats_list)} to be a " \
87
+ assert isinstance(stats_list, list) and len(stats_list) > 0, (
88
+ f"Expected stats list: {stats_list} of type {type(stats_list)} to be a "
80
89
  f"non-empty list of StatsResult objects."
90
+ )
81
91
 
82
92
  # Fallback to all stat types if not provided
83
93
  stats_to_collect: Set = stat_types or ALL_STATS_TYPES
@@ -1,4 +1,4 @@
1
- from typing import Set, Tuple, List, Iterable, Union, Optional
1
+ from typing import Iterable, List, Optional, Set, Tuple, Union
2
2
 
3
3
  DeltaPosition = Optional[int]
4
4
  NumericDeltaPosition = Union[int, float] # float is added here to support math.inf
@@ -38,7 +38,9 @@ def merge_intervals(intervals: Set[DeltaRange]) -> Set[DeltaRange]:
38
38
  for interval in intervals_list:
39
39
  start, end = interval
40
40
  if start > end:
41
- raise ValueError(f"Invalid stream position range interval: ({start}, {end})")
41
+ raise ValueError(
42
+ f"Invalid stream position range interval: ({start}, {end})"
43
+ )
42
44
 
43
45
  if merge_start is None and merge_end is None:
44
46
  merge_start, merge_end = start, end
@@ -57,7 +59,9 @@ def merge_intervals(intervals: Set[DeltaRange]) -> Set[DeltaRange]:
57
59
  return merged
58
60
 
59
61
 
60
- def _add_merged_interval(result_set: set, start: NumericDeltaPosition, end: NumericDeltaPosition):
62
+ def _add_merged_interval(
63
+ result_set: set, start: NumericDeltaPosition, end: NumericDeltaPosition
64
+ ):
61
65
  start_pos: DeltaPosition = start if isinstance(start, int) else None
62
66
  end_pos: DeltaPosition = end if isinstance(end, int) else None
63
67
  result_set.add((start_pos, end_pos))
@@ -67,9 +71,9 @@ def _to_numeric_values(intervals_list: List[DeltaRange]):
67
71
  for i, interval in enumerate(intervals_list):
68
72
  start, end = _get_validated_interval(interval)
69
73
  if start is None:
70
- start = float('-inf')
74
+ start = float("-inf")
71
75
  if end is None:
72
- end = float('inf')
76
+ end = float("inf")
73
77
 
74
78
  intervals_list[i] = (start, end)
75
79
 
@@ -79,9 +83,12 @@ def _get_validated_interval(interval: DeltaRange) -> DeltaRange:
79
83
  raise ValueError(f"Interval {interval} must be a tuple of size 2")
80
84
 
81
85
  start, end = interval
82
- if not (isinstance(start, int) or start is None) \
83
- or not (isinstance(end, int) or end is None):
84
- raise ValueError(f"Invalid stream position value types: "
85
- f"({start}, {end}) - ({type(start), type(end)})")
86
+ if not (isinstance(start, int) or start is None) or not (
87
+ isinstance(end, int) or end is None
88
+ ):
89
+ raise ValueError(
90
+ f"Invalid stream position value types: "
91
+ f"({start}, {end}) - ({type(start), type(end)})"
92
+ )
86
93
 
87
94
  return start, end