deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. deltacat/__init__.py +41 -15
  2. deltacat/aws/clients.py +12 -31
  3. deltacat/aws/constants.py +1 -1
  4. deltacat/aws/redshift/__init__.py +7 -2
  5. deltacat/aws/redshift/model/manifest.py +54 -50
  6. deltacat/aws/s3u.py +176 -187
  7. deltacat/catalog/delegate.py +151 -185
  8. deltacat/catalog/interface.py +78 -97
  9. deltacat/catalog/model/catalog.py +21 -21
  10. deltacat/catalog/model/table_definition.py +11 -9
  11. deltacat/compute/compactor/__init__.py +12 -16
  12. deltacat/compute/compactor/compaction_session.py +237 -166
  13. deltacat/compute/compactor/model/delta_annotated.py +60 -44
  14. deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
  15. deltacat/compute/compactor/model/delta_file_locator.py +10 -8
  16. deltacat/compute/compactor/model/materialize_result.py +6 -7
  17. deltacat/compute/compactor/model/primary_key_index.py +38 -34
  18. deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
  19. deltacat/compute/compactor/model/round_completion_info.py +25 -19
  20. deltacat/compute/compactor/model/sort_key.py +18 -15
  21. deltacat/compute/compactor/steps/dedupe.py +119 -94
  22. deltacat/compute/compactor/steps/hash_bucket.py +48 -47
  23. deltacat/compute/compactor/steps/materialize.py +86 -92
  24. deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
  25. deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
  26. deltacat/compute/compactor/utils/io.py +59 -47
  27. deltacat/compute/compactor/utils/primary_key_index.py +91 -80
  28. deltacat/compute/compactor/utils/round_completion_file.py +22 -23
  29. deltacat/compute/compactor/utils/system_columns.py +33 -45
  30. deltacat/compute/metastats/meta_stats.py +235 -157
  31. deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
  32. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
  33. deltacat/compute/metastats/stats.py +95 -64
  34. deltacat/compute/metastats/utils/io.py +100 -53
  35. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
  36. deltacat/compute/metastats/utils/ray_utils.py +38 -33
  37. deltacat/compute/stats/basic.py +107 -69
  38. deltacat/compute/stats/models/delta_column_stats.py +11 -8
  39. deltacat/compute/stats/models/delta_stats.py +59 -32
  40. deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
  41. deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
  42. deltacat/compute/stats/models/stats_result.py +24 -14
  43. deltacat/compute/stats/utils/intervals.py +16 -9
  44. deltacat/compute/stats/utils/io.py +86 -51
  45. deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
  46. deltacat/constants.py +4 -13
  47. deltacat/io/__init__.py +2 -2
  48. deltacat/io/aws/redshift/redshift_datasource.py +157 -143
  49. deltacat/io/dataset.py +14 -17
  50. deltacat/io/read_api.py +36 -33
  51. deltacat/logs.py +94 -42
  52. deltacat/storage/__init__.py +18 -8
  53. deltacat/storage/interface.py +196 -213
  54. deltacat/storage/model/delta.py +45 -51
  55. deltacat/storage/model/list_result.py +12 -8
  56. deltacat/storage/model/namespace.py +4 -5
  57. deltacat/storage/model/partition.py +42 -42
  58. deltacat/storage/model/stream.py +29 -30
  59. deltacat/storage/model/table.py +14 -14
  60. deltacat/storage/model/table_version.py +32 -31
  61. deltacat/storage/model/types.py +1 -0
  62. deltacat/tests/stats/test_intervals.py +11 -24
  63. deltacat/tests/utils/__init__.py +0 -0
  64. deltacat/tests/utils/test_record_batch_tables.py +284 -0
  65. deltacat/types/media.py +3 -4
  66. deltacat/types/tables.py +31 -21
  67. deltacat/utils/common.py +5 -11
  68. deltacat/utils/numpy.py +20 -22
  69. deltacat/utils/pandas.py +73 -100
  70. deltacat/utils/performance.py +3 -9
  71. deltacat/utils/placement.py +259 -230
  72. deltacat/utils/pyarrow.py +302 -89
  73. deltacat/utils/ray_utils/collections.py +2 -1
  74. deltacat/utils/ray_utils/concurrency.py +27 -28
  75. deltacat/utils/ray_utils/dataset.py +28 -28
  76. deltacat/utils/ray_utils/performance.py +5 -9
  77. deltacat/utils/ray_utils/runtime.py +9 -10
  78. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
  79. deltacat-0.1.12.dist-info/RECORD +110 -0
  80. deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
  81. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
  82. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
  83. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
deltacat/compute/metastats/stats.py
@@ -1,30 +1,25 @@
- import ray
  import logging
- from typing import Dict, Set, Tuple, List, Optional, Any
  from collections import defaultdict
+ from typing import Dict, List, Optional

- from deltacat.compute.stats.models.delta_stats import DeltaStats
- from deltacat.compute.stats.models.stats_result import StatsResult
+ import ray
  from ray.types import ObjectRef

  from deltacat import logs
- from deltacat.compute.stats.models.delta_stats_cache_result import DeltaStatsCacheResult
-
- from deltacat.utils.ray_utils.concurrency import invoke_parallel, \
-     round_robin_options_provider
- from deltacat.compute.metastats.utils.io import collect_stats_by_columns, cache_inflation_rate_data_for_delta_stats_ready, cache_partition_stats_to_s3
-
- from deltacat.storage import PartitionLocator, DeltaLocator, Delta
- from deltacat.storage import interface as unimplemented_deltacat_storage
-
- from deltacat.aws.clients import client_cache
  from deltacat.aws import s3u as s3_utils
-
-
- from deltacat.compute.stats.models.manifest_entry_stats import ManifestEntryStats
- from deltacat.compute.stats.models.delta_column_stats import DeltaColumnStats
-
+ from deltacat.aws.clients import client_cache
  from deltacat.compute.compactor import DeltaAnnotated
+ from deltacat.compute.metastats.utils.io import (
+     cache_inflation_rate_data_for_delta_stats_ready,
+     cache_partition_stats_to_s3,
+     collect_stats_by_columns,
+ )
+ from deltacat.compute.stats.models.delta_column_stats import DeltaColumnStats
+ from deltacat.compute.stats.models.delta_stats import DeltaStats
+ from deltacat.compute.stats.models.manifest_entry_stats import ManifestEntryStats
+ from deltacat.compute.stats.models.stats_result import StatsResult
+ from deltacat.storage import DeltaLocator, PartitionLocator
+ from deltacat.storage import interface as unimplemented_deltacat_storage

  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

@@ -32,51 +27,67 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
  DEFAULT_CPUS_STATS_CLUSTER_INSTANCE = 32


- def start_stats_collection(batched_delta_stats_compute_list: List[DeltaAnnotated],
-                            columns: List[str],
-                            stat_results_s3_bucket: Optional[str]=None,
-                            metastats_results_s3_bucket: Optional[str]=None,
-                            deltacat_storage=unimplemented_deltacat_storage) -> Dict[str, List[DeltaStats]]:
+ def start_stats_collection(
+     batched_delta_stats_compute_list: List[DeltaAnnotated],
+     columns: List[str],
+     stat_results_s3_bucket: Optional[str] = None,
+     metastats_results_s3_bucket: Optional[str] = None,
+     deltacat_storage=unimplemented_deltacat_storage,
+ ) -> Dict[str, List[DeltaStats]]:
      """Collects statistics on deltas, given a set of delta stream position ranges.
-     Example:
-         >>> collect(locator, set((1, 5), (4, 8), (13, 16)))
-         {
-             1: DeltaStats(), # DeltaStats for stream positions 1 - 8
-             13: DeltaStats() # DeltaStats for stream positions 13 - 16
-         }
-     Args:
-         source_partition_locator: Reference to the partition locator tied to the given delta stream positions
-         delta_stream_position_range_set: A set of intervals with an int type representing finite,
-             closed bounded values, and a None type representing unbounded infinity.
-         columns: Columns can be optionally included to collect stats on specific columns.
-             By default, all columns will be calculated.
-         stat_results_s3_bucket: Used as a cache file storage for computed delta stats
-         metastats_results_s3_bucket: Used as cache file storage for inflation rate meta stats
-         deltacat_storage: Client implementation of the DeltaCAT storage interface
-     Returns:
-         A mapping of stream positions to their corresponding delta stats.
+     Example:
+         >>> collect(locator, set((1, 5), (4, 8), (13, 16)))
+         {
+             1: DeltaStats(), # DeltaStats for stream positions 1 - 8
+             13: DeltaStats() # DeltaStats for stream positions 13 - 16
+         }
+     Args:
+         source_partition_locator: Reference to the partition locator tied to the given delta stream positions
+         delta_stream_position_range_set: A set of intervals with an int type representing finite,
+             closed bounded values, and a None type representing unbounded infinity.
+         columns: Columns can be optionally included to collect stats on specific columns.
+             By default, all columns will be calculated.
+         stat_results_s3_bucket: Used as a cache file storage for computed delta stats
+         metastats_results_s3_bucket: Used as cache file storage for inflation rate meta stats
+         deltacat_storage: Client implementation of the DeltaCAT storage interface
+     Returns:
+         A mapping of stream positions to their corresponding delta stats.
      """
      # TODO: Add CompactionEventDispatcher for stats collection started event
      delta_stats_compute_pending: List[ObjectRef[Dict[str, List[StatsResult, int]]]] = []

      for batched_deltas in batched_delta_stats_compute_list:
-         splitted_annotated_deltas = DeltaAnnotated.split(batched_deltas, DEFAULT_CPUS_STATS_CLUSTER_INSTANCE)
+         splitted_annotated_deltas = DeltaAnnotated.split(
+             batched_deltas, DEFAULT_CPUS_STATS_CLUSTER_INSTANCE
+         )
          for splitted_annotated_delta in splitted_annotated_deltas:
-             delta_stats_compute_pending.append(collect_stats_by_columns.remote(splitted_annotated_delta, columns, deltacat_storage))
+             delta_stats_compute_pending.append(
+                 collect_stats_by_columns.remote(
+                     splitted_annotated_delta, columns, deltacat_storage
+                 )
+             )

      column_stats_map = _process_stats(delta_stats_compute_pending)

      if not batched_delta_stats_compute_list:
          logger.info("No new delta need stats collection")
      else:
-         delta_stream_range_stats, partition_canonical_string = resolve_annotated_delta_stats_to_original_deltas_stats(column_stats_map, columns, batched_delta_stats_compute_list[0])
+         (
+             delta_stream_range_stats,
+             partition_canonical_string,
+         ) = resolve_annotated_delta_stats_to_original_deltas_stats(
+             column_stats_map, columns, batched_delta_stats_compute_list[0]
+         )

-         _cache_stats_res_to_s3(stat_results_s3_bucket, delta_stream_range_stats, partition_canonical_string)
+         _cache_stats_res_to_s3(
+             stat_results_s3_bucket, delta_stream_range_stats, partition_canonical_string
+         )

          base_path = s3_utils.parse_s3_url(metastats_results_s3_bucket).url
          inflation_rate_stats_s3_url = f"{base_path}/inflation-rates.json"
-         cache_inflation_rate_data_for_delta_stats_ready(delta_stream_range_stats, inflation_rate_stats_s3_url,
-                                                         deltacat_storage)
+         cache_inflation_rate_data_for_delta_stats_ready(
+             delta_stream_range_stats, inflation_rate_stats_s3_url, deltacat_storage
+         )
      # TODO: Add CompactionEventDispatcher for stats collection completed event
      return delta_stream_range_stats

@@ -87,13 +98,19 @@ def _get_account_id() -> str:
      return account_id


- def _process_stats(delta_stats_compute_pending: List[ObjectRef[DeltaStats]]) -> List[DeltaStats]:
-     delta_stats_processed_list: List[DeltaStats] = _resolve_pending_stats(delta_stats_compute_pending)
+ def _process_stats(
+     delta_stats_compute_pending: List[ObjectRef[DeltaStats]],
+ ) -> List[DeltaStats]:
+     delta_stats_processed_list: List[DeltaStats] = _resolve_pending_stats(
+         delta_stats_compute_pending
+     )

      return delta_stats_processed_list


- def _resolve_pending_stats(delta_stats_pending_list: List[ObjectRef[DeltaStats]]) -> List[DeltaStats]:
+ def _resolve_pending_stats(
+     delta_stats_pending_list: List[ObjectRef[DeltaStats]],
+ ) -> List[DeltaStats]:
      delta_stats_processed_list: List[DeltaStats] = []

      while delta_stats_pending_list:
@@ -104,29 +121,39 @@ def _resolve_pending_stats(delta_stats_pending_list: List[ObjectRef[DeltaStats]]
      return delta_stats_processed_list


- def _cache_stats_res_to_s3(stat_results_s3_bucket,
-                            delta_stream_range_stats,
-                            partition_canonical_string):
+ def _cache_stats_res_to_s3(
+     stat_results_s3_bucket, delta_stream_range_stats, partition_canonical_string
+ ):
      if stat_results_s3_bucket:
          # Cache the stats into the file store
-         cache_partition_stats_to_s3(stat_results_s3_bucket, delta_stream_range_stats, partition_canonical_string)
+         cache_partition_stats_to_s3(
+             stat_results_s3_bucket, delta_stream_range_stats, partition_canonical_string
+         )


- def resolve_annotated_delta_stats_to_original_deltas_stats(column_stats_map, column_names, delta_annotated) -> \
-         Dict[int, DeltaStats]:
+ def resolve_annotated_delta_stats_to_original_deltas_stats(
+     column_stats_map, column_names, delta_annotated
+ ) -> Dict[int, DeltaStats]:

-     partition_values = delta_annotated["deltaLocator"]["partitionLocator"]["partitionValues"]
+     partition_values = delta_annotated["deltaLocator"]["partitionLocator"][
+         "partitionValues"
+     ]
      partition_id = delta_annotated["deltaLocator"]["partitionLocator"]["partitionId"]
-     stream_locator = delta_annotated["deltaLocator"]["partitionLocator"]["streamLocator"]
-     partition_locator = PartitionLocator.of(stream_locator, partition_values, partition_id)
+     stream_locator = delta_annotated["deltaLocator"]["partitionLocator"][
+         "streamLocator"
+     ]
+     partition_locator = PartitionLocator.of(
+         stream_locator, partition_values, partition_id
+     )

      # Dict[stream_position: List[StatsResult]]
      manifest_column_stats_list = defaultdict(lambda: [])
      for i in range(len(column_stats_map)):
          for column_name in column_names:
              for j in range(len(column_stats_map[i][column_name])):
-                 manifest_column_stats_list[column_stats_map[i][column_name][j][1]].append(
-                     [column_stats_map[i][column_name][j][0], column_name])
+                 manifest_column_stats_list[
+                     column_stats_map[i][column_name][j][1]
+                 ].append([column_stats_map[i][column_name][j][0], column_name])

      stats_res: Dict[int, List[DeltaStats]] = {}
      for key, value in manifest_column_stats_list.items():
@@ -139,11 +166,15 @@ Dict[int, DeltaStats]:
          delta_ds_column_stats: List[DeltaColumnStats] = []
          for column_name, column_manifest_stats_list in manifest_stats_list.items():

-             column_manifest_stats = ManifestEntryStats.of(column_manifest_stats_list, delta_locator)
-             dataset_column_stats = DeltaColumnStats.of(column_name, column_manifest_stats)
+             column_manifest_stats = ManifestEntryStats.of(
+                 column_manifest_stats_list, delta_locator
+             )
+             dataset_column_stats = DeltaColumnStats.of(
+                 column_name, column_manifest_stats
+             )
              delta_ds_column_stats.append(dataset_column_stats)

          dataset_stats: DeltaStats = DeltaStats.of(delta_ds_column_stats)
          stats_res[key] = dataset_stats

-     return stats_res, partition_locator.canonical_string()
+     return stats_res, partition_locator.canonical_string()
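
For orientation, here is a minimal, hypothetical driver sketch for the reformatted start_stats_collection() entry point above. The batched DeltaAnnotated list, the storage implementation, the column names, and the S3 bucket URLs are placeholder assumptions, not values taken from this diff.

```python
# Hedged usage sketch, assuming a reachable Ray cluster and a concrete
# deltacat storage implementation; bucket names and columns are placeholders.
from typing import List

import ray

from deltacat.compute.compactor import DeltaAnnotated
from deltacat.compute.metastats.stats import start_stats_collection
from deltacat.storage import interface as my_storage  # swap in a real storage impl

ray.init(address="auto", ignore_reinit_error=True)

# Populate with the annotated deltas to profile (built elsewhere by the caller).
annotated_deltas: List[DeltaAnnotated] = []

delta_stream_range_stats = start_stats_collection(
    batched_delta_stats_compute_list=annotated_deltas,
    columns=["order_id", "order_total"],                         # columns to profile
    stat_results_s3_bucket="s3://my-delta-stats-cache",          # optional stats cache
    metastats_results_s3_bucket="s3://my-inflation-rate-cache",  # optional meta-stats cache
    deltacat_storage=my_storage,
)

# The result maps each origin delta stream position to its aggregated DeltaStats.
for stream_position, delta_stats in delta_stream_range_stats.items():
    print(stream_position, delta_stats)
```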
deltacat/compute/metastats/utils/io.py
@@ -1,28 +1,29 @@
- import logging
  import json
+ import logging
+ from collections import defaultdict
+ from typing import Any, Dict, List, Optional
+
  import pyarrow
  import ray

- from deltacat import LocalTable, TableType
- from deltacat.storage import Delta
- from deltacat.compute.compactor import DeltaAnnotated
+ from deltacat import LocalTable, TableType, logs
  from deltacat.aws import s3u as s3_utils
- from deltacat.utils.common import sha1_hexdigest
- from deltacat.storage import interface as unimplemented_deltacat_storage
+ from deltacat.compute.compactor import DeltaAnnotated
  from deltacat.compute.metastats.model.partition_stats_dict import PartitionStats
- from deltacat.compute.stats.models.delta_stats_cache_result import DeltaStatsCacheResult
  from deltacat.compute.stats.models.delta_column_stats import DeltaColumnStats
  from deltacat.compute.stats.models.delta_stats import DeltaStats, DeltaStatsCacheMiss
+ from deltacat.compute.stats.models.delta_stats_cache_result import DeltaStatsCacheResult
  from deltacat.compute.stats.models.stats_result import StatsResult
+ from deltacat.storage import Delta
+ from deltacat.storage import interface as unimplemented_deltacat_storage
+ from deltacat.utils.common import sha1_hexdigest

- from typing import Dict, List, Optional, Any
- from collections import defaultdict
- from deltacat import logs
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


- def cache_inflation_rate_data_for_delta_stats_ready(delta_stats_processed_list, inflation_rate_stats_s3_url,
-                                                     deltacat_storage):
+ def cache_inflation_rate_data_for_delta_stats_ready(
+     delta_stats_processed_list, inflation_rate_stats_s3_url, deltacat_storage
+ ):
      meta_stats_processed_list: Dict[int, int] = {}

      for key, value in delta_stats_processed_list.items():
@@ -39,17 +40,23 @@ def cache_inflation_rate_data_for_delta_stats_ready(delta_stats_processed_list,
      for key, value in delta_stats_processed_list.items():
          delta_stats_pyarrow_bytes_sum = 0
          delta_stats_row_count = 0
-         for column_stats in delta_stats_processed_list[key].column_stats[0].manifest_stats.stats:
+         for column_stats in (
+             delta_stats_processed_list[key].column_stats[0].manifest_stats.stats
+         ):
              delta_stats_row_count += column_stats.get("rowCount")
          for stats in delta_stats_processed_list[key].get("column_stats"):

              delta_stats_pyarrow_bytes_sum += stats.get("stats").get("pyarrowTableBytes")
-         cache_inflation_rate_res[key] = [meta_stats_processed_list[key], delta_stats_row_count,
-                                          delta_stats_pyarrow_bytes_sum]
+         cache_inflation_rate_res[key] = [
+             meta_stats_processed_list[key],
+             delta_stats_row_count,
+             delta_stats_pyarrow_bytes_sum,
+         ]

      if inflation_rate_stats_s3_url:
          logger.warning(
-             f"reading previous inflation rate stats from: {inflation_rate_stats_s3_url}")
+             f"reading previous inflation rate stats from: {inflation_rate_stats_s3_url}"
+         )

          result = s3_utils.download(inflation_rate_stats_s3_url, fail_if_not_found=False)

@@ -57,38 +64,57 @@ def cache_inflation_rate_data_for_delta_stats_ready(delta_stats_processed_list,
          if result:
              json_str = result["Body"].read().decode("utf-8")
              prev_inflation_rate_stats_read = json.loads(json_str)
-             prev_inflation_rate_stats = prev_inflation_rate_stats_read if prev_inflation_rate_stats_read else dict()
-             logger.debug(f"read stats completion info: {prev_inflation_rate_stats_read}")
+             prev_inflation_rate_stats = (
+                 prev_inflation_rate_stats_read
+                 if prev_inflation_rate_stats_read
+                 else dict()
+             )
+             logger.debug(
+                 f"read stats completion info: {prev_inflation_rate_stats_read}"
+             )
              logger.debug(
-                 f"writing inflation rate info to S3: {inflation_rate_stats_s3_url}")
+                 f"writing inflation rate info to S3: {inflation_rate_stats_s3_url}"
+             )
              prev_inflation_rate_stats.update(cache_inflation_rate_res)
-             logger.debug(f"writing current inflation rate info to S3: {prev_inflation_rate_stats}")
+             logger.debug(
+                 f"writing current inflation rate info to S3: {prev_inflation_rate_stats}"
+             )
              s3_utils.upload(
-                 inflation_rate_stats_s3_url,
-                 json.dumps(prev_inflation_rate_stats)
+                 inflation_rate_stats_s3_url, json.dumps(prev_inflation_rate_stats)
              )
      else:
-         logger.warning(f"No valid s3 url received to cache inflation rate stats, got {inflation_rate_stats_s3_url}")
+         logger.warning(
+             f"No valid s3 url received to cache inflation rate stats, got {inflation_rate_stats_s3_url}"
+         )


- def read_cached_partition_stats(partition_canonical_string: str, stat_results_s3_bucket: str):
-     partition_stats_url = get_partition_stats_s3_url(partition_canonical_string, stat_results_s3_bucket)
-     logger.info(
-         f"reading partition stats completion file from: {partition_stats_url}")
+ def read_cached_partition_stats(
+     partition_canonical_string: str, stat_results_s3_bucket: str
+ ):
+     partition_stats_url = get_partition_stats_s3_url(
+         partition_canonical_string, stat_results_s3_bucket
+     )
+     logger.info(f"reading partition stats completion file from: {partition_stats_url}")

      result = s3_utils.download(partition_stats_url, fail_if_not_found=False)
      delta_stats_cache_res_map: Dict[int, List[DeltaStatsCacheResult]] = {}
      if result:
          json_str = result["Body"].read().decode("utf-8")
          partition_stats_str = json.loads(json_str)
-         delta_stats_cache_res_map = get_delta_stats_from_partition_stats(partition_stats_str)
+         delta_stats_cache_res_map = get_delta_stats_from_partition_stats(
+             partition_stats_str
+         )

      return delta_stats_cache_res_map


- def get_partition_stats_s3_url(partition_canonical_string: str, stat_results_s3_bucket: str):
+ def get_partition_stats_s3_url(
+     partition_canonical_string: str, stat_results_s3_bucket: str
+ ):
      stats_partition_canonical_string = f"{partition_canonical_string}"
-     stats_partition_hexdigest = sha1_hexdigest(stats_partition_canonical_string.encode("utf-8"))
+     stats_partition_hexdigest = sha1_hexdigest(
+         stats_partition_canonical_string.encode("utf-8")
+     )
      base_path = s3_utils.parse_s3_url(stat_results_s3_bucket).url

      return f"{base_path}/{stats_partition_hexdigest}.json"
@@ -109,34 +135,43 @@ def get_delta_stats_from_partition_stats(partition_stats_str: str):
                  missed_columns.append(cs.column)

          delta_locator = delta_stats.column_stats[0].manifest_stats.delta_locator
-         found_stats: Optional[DeltaStats] = DeltaStats.of(found_columns_stats) if found_columns_stats else None
-         missed_stats: Optional[DeltaStatsCacheMiss] = DeltaStatsCacheMiss(missed_columns, delta_locator) \
-             if missed_columns else None
+         found_stats: Optional[DeltaStats] = (
+             DeltaStats.of(found_columns_stats) if found_columns_stats else None
+         )
+         missed_stats: Optional[DeltaStatsCacheMiss] = (
+             DeltaStatsCacheMiss(missed_columns, delta_locator)
+             if missed_columns
+             else None
+         )
          delta_stats_cache_res = DeltaStatsCacheResult.of(found_stats, missed_stats)
          found_columns_stats_map[int(stream_position)] = delta_stats_cache_res
      return found_columns_stats_map


- def cache_partition_stats_to_s3(stat_results_s3_bucket, delta_stream_range_stats, partition_canonical_string):
-     partition_stats = PartitionStats.of(delta_stream_range_stats, partition_canonical_string)
-     logger.info(
-         f"writing partition stats completion for {partition_canonical_string}")
+ def cache_partition_stats_to_s3(
+     stat_results_s3_bucket, delta_stream_range_stats, partition_canonical_string
+ ):
+     partition_stats = PartitionStats.of(
+         delta_stream_range_stats, partition_canonical_string
+     )
+     logger.info(f"writing partition stats completion for {partition_canonical_string}")
      partition_stats_completion_file_s3_url = get_partition_stats_s3_url(
-         partition_canonical_string,
-         stat_results_s3_bucket
+         partition_canonical_string, stat_results_s3_bucket
      )
      s3_utils.upload(
-         partition_stats_completion_file_s3_url,
-         str(json.dumps(partition_stats))
+         partition_stats_completion_file_s3_url, str(json.dumps(partition_stats))
      )
      logger.debug(
-         f"stats completion file written to: {partition_stats_completion_file_s3_url}")
+         f"stats completion file written to: {partition_stats_completion_file_s3_url}"
+     )


  @ray.remote
- def collect_stats_by_columns(delta_annotated: DeltaAnnotated,
-                              columns_to_compute: Optional[List[str]] = None,
-                              deltacat_storage=unimplemented_deltacat_storage) -> Dict[str, Any]:
+ def collect_stats_by_columns(
+     delta_annotated: DeltaAnnotated,
+     columns_to_compute: Optional[List[str]] = None,
+     deltacat_storage=unimplemented_deltacat_storage,
+ ) -> Dict[str, Any]:
      """Materializes one manifest entry at a time to save memory usage and calculate stats from each of its columns.

      Args:
@@ -150,15 +185,25 @@ def collect_stats_by_columns(delta_annotated: DeltaAnnotated,
      total_tables_size = 0

      # Mapping of column_name -> [stats_file_idx_1, stats_file_idx_2, ... stats_file_idx_n]
-     column_stats_map = defaultdict(lambda: [[None, None]] * len(delta_annotated["manifest"].get("entries")))
+     column_stats_map = defaultdict(
+         lambda: [[None, None]] * len(delta_annotated["manifest"].get("entries"))
+     )
      src_da_entries = delta_annotated["manifest"].get("entries")
      manifest_annotations = delta_annotated["annotations"]
      for file_idx, manifest in enumerate(src_da_entries):
-         entry_pyarrow_table: LocalTable = \
-             deltacat_storage.download_delta_manifest_entry(delta_annotated, file_idx, TableType.PYARROW, columns_to_compute, equivalent_table_types="uncompacted")
-         assert isinstance(entry_pyarrow_table, pyarrow.Table), \
-             f"Stats collection is only supported for PyArrow tables, but received a table of " \
+         entry_pyarrow_table: LocalTable = (
+             deltacat_storage.download_delta_manifest_entry(
+                 delta_annotated,
+                 file_idx,
+                 TableType.PYARROW,
+                 columns_to_compute,
+                 equivalent_table_types="uncompacted",
+             )
+         )
+         assert isinstance(entry_pyarrow_table, pyarrow.Table), (
+             f"Stats collection is only supported for PyArrow tables, but received a table of "
              f"type '{type(entry_pyarrow_table)}' for manifest entry {file_idx} of delta: {delta_annotated.locator}."
+         )
          total_tables_size += entry_pyarrow_table.nbytes
          if not columns_to_compute:
              columns_to_compute = entry_pyarrow_table.column_names
@@ -166,7 +211,9 @@ def collect_stats_by_columns(delta_annotated: DeltaAnnotated,
          for column_idx, pyarrow_column in enumerate(entry_pyarrow_table.columns):
              column_name = columns_to_compute[column_idx]
              origin_delta_stream_position = manifest_annotations[file_idx][-1]
-             column_stats_map[column_name][file_idx] = [StatsResult.of(len(pyarrow_column), pyarrow_column.nbytes),
-                                                        origin_delta_stream_position]
+             column_stats_map[column_name][file_idx] = [
+                 StatsResult.of(len(pyarrow_column), pyarrow_column.nbytes),
+                 origin_delta_stream_position,
+             ]

-     return column_stats_map
+     return column_stats_map
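
For reference, a small hedged sketch of how the partition-stats cache helpers above fit together; the bucket URL and the partition canonical string are invented placeholders (in practice the canonical string comes from PartitionLocator.canonical_string()).

```python
# Hedged sketch of the partition-stats cache round trip defined above; the
# bucket and partition canonical string below are placeholder values.
from deltacat.compute.metastats.utils.io import (
    get_partition_stats_s3_url,
    read_cached_partition_stats,
)

stat_results_s3_bucket = "s3://my-delta-stats-cache"    # placeholder cache bucket
partition_canonical_string = "my_ns|my_table|1|1.0|42"  # placeholder locator string

# The completion file lives at <bucket>/<sha1(partition_canonical_string)>.json.
print(get_partition_stats_s3_url(partition_canonical_string, stat_results_s3_bucket))

# Returns {} on a cache miss; on a hit, each stream position maps to a
# DeltaStatsCacheResult wrapping the found DeltaStats and any DeltaStatsCacheMiss.
cached = read_cached_partition_stats(partition_canonical_string, stat_results_s3_bucket)
for stream_position, cache_result in cached.items():
    print(stream_position, cache_result)
```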
deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py
@@ -1,6 +1,9 @@
  from deltacat.constants import PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS

- def estimation_function(content_length, content_type, content_encoding, *args, **kwargs):
+
+ def estimation_function(
+     content_length, content_type, content_encoding, *args, **kwargs
+ ):
      # TODO(zyiqin): update the estimation here to be consistent with number of required worker nodes estimate.
      # Current implementation is only a rough guess using the PYARROW_INFLATION_MULTIPLIER(content_length to pyarrow butes(all columns).
      # The full implementation logic should be:
@@ -12,4 +15,4 @@ def estimation_function(content_length, content_type, content_encoding, *args, *
      if content_length:
          return content_length * PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS
      else:
-         return 0
+         return 0
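
As a quick illustration of the estimator above: it multiplies a manifest entry's on-disk content_length by PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS to guess the in-memory PyArrow size, and ignores the content type and encoding for now. The 10 MiB input below is illustrative only.

```python
# Illustrative call into the rough size estimator defined above.
from deltacat.compute.metastats.utils.pyarrow_memory_estimation_function import (
    estimation_function,
)

content_length = 10 * 1024 * 1024  # e.g. a 10 MiB manifest entry on S3 (placeholder)

# Returns content_length * PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS, or 0 when
# content_length is falsy; content_type and content_encoding are currently unused.
estimated_pyarrow_bytes = estimation_function(
    content_length, content_type=None, content_encoding=None
)
print(estimated_pyarrow_bytes)
```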
deltacat/compute/metastats/utils/ray_utils.py
@@ -1,15 +1,18 @@
+ import errno
+ import logging
  import os
  import subprocess
+ from subprocess import run
+ from typing import Any
+
  import ray
- import errno
- import logging
+ from tenacity import RetryError, Retrying, stop_after_attempt, wait_fixed

  from deltacat import logs
- from tenacity import retry, stop_after_attempt
- from typing import Any
- from deltacat.compute.metastats.utils.constants import WORKER_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO, MAX_WORKER_MULTIPLIER
- from tenacity import Retrying, stop_after_attempt, wait_fixed, RetryError
- from subprocess import run, PIPE
+ from deltacat.compute.metastats.utils.constants import (
+     MAX_WORKER_MULTIPLIER,
+     WORKER_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO,
+ )

  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

@@ -24,13 +27,10 @@ def run_cmd_exit_code(cmd: str) -> int:

  def run_cmd_with_retry(cmd: str) -> None:
      retrying = Retrying(
-         wait=wait_fixed(2),
-         stop=stop_after_attempt(RAY_DOWN_DEFAULT_RETRY_ATTEMPTS)
+         wait=wait_fixed(2), stop=stop_after_attempt(RAY_DOWN_DEFAULT_RETRY_ATTEMPTS)
      )
      try:
-         retrying(
-             run_cmd_exit_code(cmd)
-         )
+         retrying(run_cmd_exit_code(cmd))
      except RetryError:
          logger.info(f"{cmd} failed after {RAY_DOWN_DEFAULT_RETRY_ATTEMPTS} retries.")

@@ -38,8 +38,9 @@ def run_cmd_with_retry(cmd: str) -> None:
  def run_cmd(cmd: str) -> None:
      result = run(cmd, shell=True, capture_output=True)
      exit_code = int(result.returncode)
-     assert exit_code == 0, f"`{cmd}` failed. Exit code: {exit_code} " \
-                            f"Error Trace: {result.stderr}"
+     assert exit_code == 0, (
+         f"`{cmd}` failed. Exit code: {exit_code} " f"Error Trace: {result.stderr}"
+     )


  def ray_up(cluster_cfg: str) -> None:
@@ -67,7 +68,8 @@ def get_head_node_ip(cluster_cfg: str) -> str:
          shell=True,
          capture_output=True,
          text=True,
-         check=True)
+         check=True,
+     )
      # the head node IP should be the last line printed to stdout
      head_node_ip = proc.stdout.splitlines()[-1]
      logger.info(f"Ray cluster head node IP for '{cluster_cfg}': {head_node_ip}")
@@ -83,14 +85,15 @@ def ray_init(host, port) -> Any:


  def replace_cluster_cfg_vars(
-         partition_canonical_string: str,
-         trace_id: str,
-         file_path: str,
-         min_workers: int,
-         head_type: str,
-         worker_type: str,
-         head_object_store_memory_pct: int,
-         worker_object_store_memory_pct: int) -> str:
+     partition_canonical_string: str,
+     trace_id: str,
+     file_path: str,
+     min_workers: int,
+     head_type: str,
+     worker_type: str,
+     head_object_store_memory_pct: int,
+     worker_object_store_memory_pct: int,
+ ) -> str:

      head_object_store_memory_pct = head_object_store_memory_pct if not None else 30
      worker_object_store_memory_pct = WORKER_NODE_OBJECT_STORE_MEMORY_RESERVE_RATIO * 100
@@ -98,18 +101,20 @@ def replace_cluster_cfg_vars(
      max_workers = int(min_workers * MAX_WORKER_MULTIPLIER)
      with open(file_path, "r+") as file:
          contents = file.read().replace("{{use-internal-ips}}", "True")
-         contents = contents.replace("{{partition_canonical_string}}", partition_canonical_string)
-         contents = contents.replace("{{trace_id}}", trace_id)
-         contents = contents.replace("{{min-workers}}", str(min_workers))
-         contents = contents.replace("{{max-workers}}", str(max_workers))
-         contents = contents.replace("{{head-instance-type}}", head_type)
-         contents = contents.replace("{{worker-instance-type}}", worker_type)
          contents = contents.replace(
-             "{{head-object-store-memory-pct}}",
-             str(head_object_store_memory_pct))
+             "{{partition_canonical_string}}", partition_canonical_string
+         )
+         contents = contents.replace("'{{trace_id}}'", trace_id)
+         contents = contents.replace("'{{min-workers}}'", str(min_workers))
+         contents = contents.replace("'{{max-workers}}'", str(max_workers))
+         contents = contents.replace("'{{head-instance-type}}'", head_type)
+         contents = contents.replace("'{{worker-instance-type}}'", worker_type)
          contents = contents.replace(
-             "{{worker-object-store-memory-pct}}",
-             str(worker_object_store_memory_pct))
+             "'{{head-object-store-memory-pct}}'", str(head_object_store_memory_pct)
+         )
+         contents = contents.replace(
+             "'{{worker-object-store-memory-pct}}'", str(worker_object_store_memory_pct)
+         )
          partition_id = partition_canonical_string.split("|")[-1]
          out_file_name = f"{trace_id}-{partition_id}.{os.path.basename(file_path)}"
          out_file_dir = os.path.join(os.path.dirname(file_path), "tmp")
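
One side note on the retry wrapper touched above: a minimal tenacity sketch under the same fixed two-second wait and capped attempts. A Retrying instance is normally invoked with the callable and its arguments passed separately so the wrapped command runs inside the retry loop; the attempt count and command string below are illustrative, not taken from ray_utils.py.

```python
# Minimal tenacity sketch mirroring run_cmd_with_retry() above; the retry-attempt
# constant and command are placeholders, not the module's real values.
import subprocess

from tenacity import RetryError, Retrying, stop_after_attempt, wait_fixed

RETRY_ATTEMPTS = 3  # placeholder for RAY_DOWN_DEFAULT_RETRY_ATTEMPTS


def run_cmd_exit_code(cmd: str) -> int:
    # Stand-in for the helper in ray_utils.py: run the command, return its exit code.
    return subprocess.run(cmd, shell=True, capture_output=True).returncode


retrying = Retrying(wait=wait_fixed(2), stop=stop_after_attempt(RETRY_ATTEMPTS))
try:
    # Pass the callable and its argument separately so a retry re-invokes the command.
    exit_code = retrying(run_cmd_exit_code, "ray --version")
    print(f"exit code: {exit_code}")
except RetryError:
    print(f"command still failing after {RETRY_ATTEMPTS} attempts")
```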