deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. deltacat/__init__.py +41 -15
  2. deltacat/aws/clients.py +12 -31
  3. deltacat/aws/constants.py +1 -1
  4. deltacat/aws/redshift/__init__.py +7 -2
  5. deltacat/aws/redshift/model/manifest.py +54 -50
  6. deltacat/aws/s3u.py +176 -187
  7. deltacat/catalog/delegate.py +151 -185
  8. deltacat/catalog/interface.py +78 -97
  9. deltacat/catalog/model/catalog.py +21 -21
  10. deltacat/catalog/model/table_definition.py +11 -9
  11. deltacat/compute/compactor/__init__.py +12 -16
  12. deltacat/compute/compactor/compaction_session.py +237 -166
  13. deltacat/compute/compactor/model/delta_annotated.py +60 -44
  14. deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
  15. deltacat/compute/compactor/model/delta_file_locator.py +10 -8
  16. deltacat/compute/compactor/model/materialize_result.py +6 -7
  17. deltacat/compute/compactor/model/primary_key_index.py +38 -34
  18. deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
  19. deltacat/compute/compactor/model/round_completion_info.py +25 -19
  20. deltacat/compute/compactor/model/sort_key.py +18 -15
  21. deltacat/compute/compactor/steps/dedupe.py +119 -94
  22. deltacat/compute/compactor/steps/hash_bucket.py +48 -47
  23. deltacat/compute/compactor/steps/materialize.py +86 -92
  24. deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
  25. deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
  26. deltacat/compute/compactor/utils/io.py +59 -47
  27. deltacat/compute/compactor/utils/primary_key_index.py +91 -80
  28. deltacat/compute/compactor/utils/round_completion_file.py +22 -23
  29. deltacat/compute/compactor/utils/system_columns.py +33 -45
  30. deltacat/compute/metastats/meta_stats.py +235 -157
  31. deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
  32. deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
  33. deltacat/compute/metastats/stats.py +95 -64
  34. deltacat/compute/metastats/utils/io.py +100 -53
  35. deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
  36. deltacat/compute/metastats/utils/ray_utils.py +38 -33
  37. deltacat/compute/stats/basic.py +107 -69
  38. deltacat/compute/stats/models/delta_column_stats.py +11 -8
  39. deltacat/compute/stats/models/delta_stats.py +59 -32
  40. deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
  41. deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
  42. deltacat/compute/stats/models/stats_result.py +24 -14
  43. deltacat/compute/stats/utils/intervals.py +16 -9
  44. deltacat/compute/stats/utils/io.py +86 -51
  45. deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
  46. deltacat/constants.py +4 -13
  47. deltacat/io/__init__.py +2 -2
  48. deltacat/io/aws/redshift/redshift_datasource.py +157 -143
  49. deltacat/io/dataset.py +14 -17
  50. deltacat/io/read_api.py +36 -33
  51. deltacat/logs.py +94 -42
  52. deltacat/storage/__init__.py +18 -8
  53. deltacat/storage/interface.py +196 -213
  54. deltacat/storage/model/delta.py +45 -51
  55. deltacat/storage/model/list_result.py +12 -8
  56. deltacat/storage/model/namespace.py +4 -5
  57. deltacat/storage/model/partition.py +42 -42
  58. deltacat/storage/model/stream.py +29 -30
  59. deltacat/storage/model/table.py +14 -14
  60. deltacat/storage/model/table_version.py +32 -31
  61. deltacat/storage/model/types.py +1 -0
  62. deltacat/tests/stats/test_intervals.py +11 -24
  63. deltacat/tests/utils/__init__.py +0 -0
  64. deltacat/tests/utils/test_record_batch_tables.py +284 -0
  65. deltacat/types/media.py +3 -4
  66. deltacat/types/tables.py +31 -21
  67. deltacat/utils/common.py +5 -11
  68. deltacat/utils/numpy.py +20 -22
  69. deltacat/utils/pandas.py +73 -100
  70. deltacat/utils/performance.py +3 -9
  71. deltacat/utils/placement.py +259 -230
  72. deltacat/utils/pyarrow.py +302 -89
  73. deltacat/utils/ray_utils/collections.py +2 -1
  74. deltacat/utils/ray_utils/concurrency.py +27 -28
  75. deltacat/utils/ray_utils/dataset.py +28 -28
  76. deltacat/utils/ray_utils/performance.py +5 -9
  77. deltacat/utils/ray_utils/runtime.py +9 -10
  78. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
  79. deltacat-0.1.12.dist-info/RECORD +110 -0
  80. deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
  81. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
  82. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
  83. {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor/model/delta_annotated.py
@@ -2,21 +2,26 @@
 from __future__ import annotations
 
 import logging
-from deltacat import logs
-from deltacat.storage import DeltaType, Manifest, ManifestEntry, \
-    ManifestEntryList
-from typing import List, Optional, Callable, Union
 from types import FunctionType
-from deltacat.storage import Delta
+from typing import Callable, List, Optional, Union
+
+from deltacat import logs
+from deltacat.storage import (
+    Delta,
+    DeltaType,
+    Manifest,
+    ManifestEntry,
+    ManifestEntryList,
+)
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
 class DeltaAnnotation(tuple):
     @staticmethod
-    def of(file_index: int,
-           delta_type: DeltaType,
-           stream_position: int) -> DeltaAnnotation:
+    def of(
+        file_index: int, delta_type: DeltaType, stream_position: int
+    ) -> DeltaAnnotation:
         return DeltaAnnotation((file_index, delta_type, stream_position))
 
     @property
@@ -48,17 +53,19 @@ class DeltaAnnotated(Delta):
         if entries:
             dtype = delta.type
             pos = delta.stream_position
-            _annotations = [DeltaAnnotation.of(i, dtype, pos) for i in
-                            range(len(entries))]
+            _annotations = [
+                DeltaAnnotation.of(i, dtype, pos) for i in range(len(entries))
+            ]
             delta_annotated.annotations = _annotations
         return delta_annotated
 
     @staticmethod
     def rebatch(
-            annotated_deltas: List[DeltaAnnotated],
-            min_delta_bytes,
-            min_file_counts: Optional[Union[int, float]]=float("inf"),
-            estimation_function: Optional[Callable]=None) -> List[DeltaAnnotated]:
+        annotated_deltas: List[DeltaAnnotated],
+        min_delta_bytes,
+        min_file_counts: Optional[Union[int, float]] = float("inf"),
+        estimation_function: Optional[Callable] = None,
+    ) -> List[DeltaAnnotated]:
         """
         Simple greedy algorithm to split/merge 1 or more annotated deltas into
         size-limited annotated deltas. All ordered manifest entries in the input
@@ -75,30 +82,38 @@ class DeltaAnnotated(Delta):
         for src_da in annotated_deltas:
             src_da_annotations = src_da.annotations
             src_da_entries = src_da.manifest.entries
-            assert(len(src_da_annotations) == len(src_da_entries),
-                   f"Unexpected Error: Length of delta annotations "
-                   f"({len(src_da_annotations)}) doesn't mach the length of "
-                   f"delta manifest entries ({len(src_da_entries)}).")
+            assert (
+                len(src_da_annotations) == len(src_da_entries),
+                f"Unexpected Error: Length of delta annotations "
+                f"({len(src_da_annotations)}) doesn't mach the length of "
+                f"delta manifest entries ({len(src_da_entries)}).",
+            )
             for i, src_entry in enumerate(src_da_entries):
                 DeltaAnnotated._append_annotated_entry(
-                    src_da,
-                    new_da,
-                    src_entry,
-                    src_da_annotations[i])
+                    src_da, new_da, src_entry, src_da_annotations[i]
+                )
                 # TODO: Fetch s3_obj["Size"] if entry content length undefined?
-                estimated_new_da_bytes = estimation_function(src_entry.meta.content_length) if type(
-                    estimation_function) is FunctionType else src_entry.meta.content_length
+                estimated_new_da_bytes = (
+                    estimation_function(src_entry.meta.content_length)
+                    if type(estimation_function) is FunctionType
+                    else src_entry.meta.content_length
+                )
                 new_da_bytes += estimated_new_da_bytes
                 da_group_entry_count += 1
-                if new_da_bytes >= min_delta_bytes or da_group_entry_count >= min_file_counts:
+                if (
+                    new_da_bytes >= min_delta_bytes
+                    or da_group_entry_count >= min_file_counts
+                ):
                     if new_da_bytes >= min_delta_bytes:
                         logger.info(
                             f"Appending group of {da_group_entry_count} elements "
-                            f"and {new_da_bytes} bytes to meet file size limit")
+                            f"and {new_da_bytes} bytes to meet file size limit"
+                        )
                     if da_group_entry_count >= min_file_counts:
                         logger.info(
                             f"Appending group of {da_group_entry_count} elements "
-                            f"and {da_group_entry_count} files to meet file count limit")
+                            f"and {da_group_entry_count} files to meet file count limit"
+                        )
                     groups.append(new_da)
                     new_da = DeltaAnnotated()
                     new_da_bytes = 0
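
Note: the rebatch body above is a plain greedy accumulator: each manifest entry is appended to the current group, and the group is flushed once either the byte budget or the file-count budget is reached. A minimal standalone sketch of the same pattern (the greedy_group helper and its sizes input are hypothetical, not deltacat API):

def greedy_group(sizes, min_bytes, min_files=float("inf")):
    # Hypothetical helper: `sizes` stands in for the per-entry content
    # lengths that rebatch reads (or estimates) from manifest entries.
    groups, current, current_bytes = [], [], 0
    for size in sizes:
        current.append(size)
        current_bytes += size
        # Flush once either budget is met, mirroring the `if (...)` above.
        if current_bytes >= min_bytes or len(current) >= min_files:
            groups.append(current)
            current, current_bytes = [], 0
    if current:  # flush any remainder (rebatch's tail handling is outside this hunk)
        groups.append(current)
    return groups

assert greedy_group([4, 4, 4, 4], min_bytes=8) == [[4, 4], [4, 4]]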
@@ -108,32 +123,31 @@ class DeltaAnnotated(Delta):
         return groups
 
     @staticmethod
-    def split(
-            src_da: DeltaAnnotated,
-            pieces: int) -> List[DeltaAnnotated]:
+    def split(src_da: DeltaAnnotated, pieces: int) -> List[DeltaAnnotated]:
         groups = []
         new_da = DeltaAnnotated()
         da_group_entry_count = 0
         src_da_annotations = src_da.annotations
         src_da_entries = src_da.manifest.entries
-        assert (len(src_da_annotations) == len(src_da_entries),
-                f"Unexpected Error: Length of delta annotations "
-                f"({len(src_da_annotations)}) doesn't mach the length of "
-                f"delta manifest entries ({len(src_da_entries)}).")
+        assert (
+            len(src_da_annotations) == len(src_da_entries),
+            f"Unexpected Error: Length of delta annotations "
+            f"({len(src_da_annotations)}) doesn't mach the length of "
+            f"delta manifest entries ({len(src_da_entries)}).",
+        )
         src_da_entries_length = len(src_da_entries)
         equal_length = src_da_entries_length // pieces
         for i in range(len(src_da_entries)):
             DeltaAnnotated._append_annotated_entry(
-                src_da,
-                new_da,
-                src_da_entries[i],
-                src_da_annotations[i])
+                src_da, new_da, src_da_entries[i], src_da_annotations[i]
+            )
             # TODO: Fetch s3_obj["Size"] if entry content length undefined?
             da_group_entry_count += 1
             if da_group_entry_count >= equal_length and i < equal_length * (pieces - 1):
                 logger.info(
                     f"Splitting {da_group_entry_count} manifest files "
-                    f"to {pieces} pieces of {equal_length} size.")
+                    f"to {pieces} pieces of {equal_length} size."
+                )
                 groups.append(new_da)
                 new_da = DeltaAnnotated()
                 da_group_entry_count = 0
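
Note: in both rebatch and split, the old and new code pass a parenthesized (condition, message) tuple to assert. Python tests the truthiness of the tuple itself, which is always True when non-empty, so the length check can never fail and its message (including the "doesn't mach" typo) is never rendered; CPython emits "SyntaxWarning: assertion is always true, perhaps remove parentheses?" for this form. A sketch of the pitfall and the intended two-operand form:

annotations, entries = [1, 2], [1, 2, 3]

# Pitfall (both versions above): a non-empty tuple is always truthy,
# so this "assert" never fails.
assert (len(annotations) == len(entries), "length mismatch")

# Intended two-operand form; this one raises AssertionError as expected
# (typo fixed: "mach" -> "match").
assert len(annotations) == len(entries), (
    f"Unexpected Error: Length of delta annotations ({len(annotations)}) "
    f"doesn't match the length of delta manifest entries ({len(entries)})."
)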
@@ -141,7 +155,8 @@ class DeltaAnnotated(Delta):
                groups.append(new_da)
                logger.info(
                    f"Splitting {da_group_entry_count} manifest files "
-                    f"to {pieces} pieces of {equal_length} size.")
+                    f"to {pieces} pieces of {equal_length} size."
+                )
                new_da = DeltaAnnotated()
        if new_da:
            groups.append(new_da)
@@ -157,10 +172,11 @@ class DeltaAnnotated(Delta):
 
     @staticmethod
     def _append_annotated_entry(
-            src_da: DeltaAnnotated,
-            dst_da: DeltaAnnotated,
-            src_entry: ManifestEntry,
-            src_annotation: DeltaAnnotation):
+        src_da: DeltaAnnotated,
+        dst_da: DeltaAnnotated,
+        src_entry: ManifestEntry,
+        src_annotation: DeltaAnnotation,
+    ):
 
         if not dst_da:
             # copy all extended properties from the source delta manifest impl
deltacat/compute/compactor/model/delta_file_envelope.py
@@ -2,19 +2,18 @@
 from __future__ import annotations
 
 import numpy as np
-from deltacat.storage import DeltaType, LocalTable
 
+from deltacat.storage import DeltaType, LocalTable
 
 DeltaFileEnvelopeGroups = np.ndarray
 
 
 class DeltaFileEnvelope(dict):
     @staticmethod
-    def of(stream_position: int,
-           file_index: int,
-           delta_type: DeltaType,
-           table: LocalTable) -> DeltaFileEnvelope:
-        """ Static factory builder for a Delta File Envelope
+    def of(
+        stream_position: int, file_index: int, delta_type: DeltaType, table: LocalTable
+    ) -> DeltaFileEnvelope:
+        """Static factory builder for a Delta File Envelope
         `
         Args:
             stream_position: Stream position of a delta.
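
Note: based on the of(...) signature above, constructing an envelope looks roughly like the sketch below. All values are placeholders, LocalTable is assumed to cover in-memory tables such as a pyarrow Table, and DeltaType.UPSERT is an assumed enum member:

import pyarrow as pa

from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelope
from deltacat.storage import DeltaType

# Placeholder values throughout; DeltaType.UPSERT is assumed here.
envelope = DeltaFileEnvelope.of(
    stream_position=42,  # hypothetical delta stream position
    file_index=0,        # manifest entry index within the delta
    delta_type=DeltaType.UPSERT,
    table=pa.table({"pk": [1, 2, 3]}),  # a pyarrow Table as the LocalTable
)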
deltacat/compute/compactor/model/delta_file_locator.py
@@ -8,9 +8,9 @@ from deltacat.storage import Locator
 
 class DeltaFileLocator(Locator, tuple):
     @staticmethod
-    def of(is_src_delta: np.bool_,
-           stream_position: np.int64,
-           file_index: np.int32) -> DeltaFileLocator:
+    def of(
+        is_src_delta: np.bool_, stream_position: np.int64, file_index: np.int32
+    ) -> DeltaFileLocator:
         """
         Create a Delta File Locator tuple that can be used to uniquely identify
         and retrieve a file from any compaction job run input Delta.
@@ -30,11 +30,13 @@ class DeltaFileLocator(Locator, tuple):
             delta_file_locator: The Delta File Locator Tuple as
             (is_source_delta, stream_position, file_index).
         """
-        return DeltaFileLocator((
-            is_src_delta,
-            stream_position,
-            file_index,
-        ))
+        return DeltaFileLocator(
+            (
+                is_src_delta,
+                stream_position,
+                file_index,
+            )
+        )
 
     @property
     def is_source_delta(self) -> np.bool_:
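
Note: per the docstring and factory above, a usage sketch with placeholder values (not taken from the package):

import numpy as np

from deltacat.compute.compactor.model.delta_file_locator import DeltaFileLocator

# Placeholder values; the locator is just an ordered
# (is_source_delta, stream_position, file_index) tuple subclass.
locator = DeltaFileLocator.of(np.bool_(True), np.int64(1024), np.int32(0))
assert locator.is_source_delta  # properties read back the tuple fields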
deltacat/compute/compactor/model/materialize_result.py
@@ -1,18 +1,17 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations
 
-from deltacat.storage import Delta
-from deltacat.compute.compactor.model.pyarrow_write_result import \
-    PyArrowWriteResult
-
 from typing import Any, Dict
 
+from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
+from deltacat.storage import Delta
+
 
 class MaterializeResult(dict):
     @staticmethod
-    def of(delta: Delta,
-           task_index: int,
-           pyarrow_write_result: PyArrowWriteResult) -> MaterializeResult:
+    def of(
+        delta: Delta, task_index: int, pyarrow_write_result: PyArrowWriteResult
+    ) -> MaterializeResult:
         materialize_result = MaterializeResult()
         materialize_result["delta"] = delta
         materialize_result["taskIndex"] = task_index
deltacat/compute/compactor/model/primary_key_index.py
@@ -1,19 +1,17 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations
 
+from typing import Any, Dict, List
 from uuid import uuid4
 
 from deltacat.compute.compactor.model.sort_key import SortKey
 from deltacat.storage import Locator, PartitionLocator
 from deltacat.utils.common import sha1_hexdigest
 
-from typing import Any, Dict, List
-
 
 class PrimaryKeyIndexLocator(Locator, dict):
     @staticmethod
-    def of(primary_key_index_meta: PrimaryKeyIndexMeta) \
-            -> PrimaryKeyIndexLocator:
+    def of(primary_key_index_meta: PrimaryKeyIndexMeta) -> PrimaryKeyIndexLocator:
         """
         Creates a Primary Key Index Locator from the given Primary Key
         Index Metadata. A Primary Key Index Locator consists of a Primary Key
@@ -33,13 +31,16 @@ class PrimaryKeyIndexLocator(Locator, dict):
 
     @staticmethod
     def _root_path(
-            compacted_partition_locator: PartitionLocator,
-            primary_keys: List[str],
-            sort_keys: List[SortKey],
-            primary_key_index_algorithm_version: str) -> str:
+        compacted_partition_locator: PartitionLocator,
+        primary_keys: List[str],
+        sort_keys: List[SortKey],
+        primary_key_index_algorithm_version: str,
+    ) -> str:
         pl_hexdigest = compacted_partition_locator.hexdigest()
-        pki_version_str = f"{pl_hexdigest}|{primary_keys}|{sort_keys}|" \
-                          f"{primary_key_index_algorithm_version}"
+        pki_version_str = (
+            f"{pl_hexdigest}|{primary_keys}|{sort_keys}|"
+            f"{primary_key_index_algorithm_version}"
+        )
         return sha1_hexdigest(pki_version_str.encode("utf-8"))
 
     @property
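
Note: _root_path above derives a deterministic index root by SHA-1 hashing a pipe-delimited string of the partition locator digest, primary keys, sort keys, and algorithm version. A rough standard-library equivalent (all inputs are illustrative placeholders; the real code delegates to deltacat.utils.common.sha1_hexdigest):

import hashlib

# Illustrative placeholders, not real locator values.
pl_hexdigest = "0f0c7935"  # hypothetical partition locator digest
primary_keys, sort_keys = ["pk"], []
algo_version = "1.0"
pki_version_str = f"{pl_hexdigest}|{primary_keys}|{sort_keys}|{algo_version}"
root_path = hashlib.sha1(pki_version_str.encode("utf-8")).hexdigest()
# Identical inputs always hash to the same root path, which is what makes
# the primary key index root deterministic and discoverable.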
@@ -76,10 +77,12 @@ class PrimaryKeyIndexLocator(Locator, dict):
 
 class PrimaryKeyIndexMeta(dict):
     @staticmethod
-    def of(compacted_partition_locator: PartitionLocator,
-           primary_keys: List[str],
-           sort_keys: List[SortKey],
-           primary_key_index_algo_version: str) -> PrimaryKeyIndexMeta:
+    def of(
+        compacted_partition_locator: PartitionLocator,
+        primary_keys: List[str],
+        sort_keys: List[SortKey],
+        primary_key_index_algo_version: str,
+    ) -> PrimaryKeyIndexMeta:
         """
         Creates Primary Key Index Metadata from the given compacted
         Partition Locator, primary keys, sort keys, and primary key index
@@ -114,8 +117,10 @@ class PrimaryKeyIndexMeta(dict):
 
 class PrimaryKeyIndexVersionLocator(Locator, dict):
     @staticmethod
-    def of(primary_key_index_version_meta: PrimaryKeyIndexVersionMeta,
-           pki_version_root_path: str) -> PrimaryKeyIndexVersionLocator:
+    def of(
+        primary_key_index_version_meta: PrimaryKeyIndexVersionMeta,
+        pki_version_root_path: str,
+    ) -> PrimaryKeyIndexVersionLocator:
         """
         Creates a primary key index version locator from the given primary key
         index version metadata and version root path. Note that, while this is
@@ -129,8 +134,9 @@ class PrimaryKeyIndexVersionLocator(Locator, dict):
         return pkivl
 
     @staticmethod
-    def generate(pki_version_meta: PrimaryKeyIndexVersionMeta) \
-            -> PrimaryKeyIndexVersionLocator:
+    def generate(
+        pki_version_meta: PrimaryKeyIndexVersionMeta,
+    ) -> PrimaryKeyIndexVersionLocator:
         """
         Creates a new primary key index version locator from the given primary
         key index version metadata. A primary key index version locator
@@ -142,11 +148,12 @@ class PrimaryKeyIndexVersionLocator(Locator, dict):
         deterministically from the compacted partition locator, primary keys,
         sort keys, and primary key index algorithm version.
         """
-        pki_version_root_path = PrimaryKeyIndexVersionLocator.\
-            _generate_version_root_path(
+        pki_version_root_path = (
+            PrimaryKeyIndexVersionLocator._generate_version_root_path(
                 PrimaryKeyIndexVersionLocator._pki_root_path(pki_version_meta),
                 pki_version_meta.hash_bucket_count,
             )
+        )
         pkivl = PrimaryKeyIndexVersionLocator()
         pkivl["primaryKeyIndexVersionMeta"] = pki_version_meta
         pkivl["primaryKeyIndexVersionRootPath"] = pki_version_root_path
@@ -159,17 +166,14 @@ class PrimaryKeyIndexVersionLocator(Locator, dict):
         return pki_locator.primary_key_index_root_path
 
     @staticmethod
-    def _generate_version_root_path(
-            pki_root_path: str,
-            hash_bucket_count: int) -> str:
+    def _generate_version_root_path(pki_root_path: str, hash_bucket_count: int) -> str:
         return f"{pki_root_path}/{hash_bucket_count}/{str(uuid4())}"
 
     @property
     def primary_key_index_version_meta(self) -> PrimaryKeyIndexVersionMeta:
         val: Dict[str, Any] = self.get("primaryKeyIndexVersionMeta")
         if val is not None and not isinstance(val, PrimaryKeyIndexVersionMeta):
-            self["primaryKeyIndexVersionMeta"] = val = \
-                PrimaryKeyIndexVersionMeta(val)
+            self["primaryKeyIndexVersionMeta"] = val = PrimaryKeyIndexVersionMeta(val)
         return val
 
     @property
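
Note: in contrast to the deterministic primary key index root path, _generate_version_root_path above appends a fresh UUID, so every generated version locator points at a distinct path. A toy illustration mirroring the f-string shown ("pki-root" is a placeholder):

from uuid import uuid4

def version_root_path(pki_root_path: str, hash_bucket_count: int) -> str:
    # Mirrors the return expression shown in the hunk above.
    return f"{pki_root_path}/{hash_bucket_count}/{str(uuid4())}"

# Two calls with identical inputs yield distinct paths.
assert version_root_path("pki-root", 8) != version_root_path("pki-root", 8)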
@@ -179,7 +183,8 @@ class PrimaryKeyIndexVersionLocator(Locator, dict):
         index version locator.
         """
         return PrimaryKeyIndexVersionLocator._pki_root_path(
-            self.primary_key_index_version_meta)
+            self.primary_key_index_version_meta
+        )
 
     @property
     def primary_key_index_version_root_path(self) -> str:
@@ -206,9 +211,8 @@ class PrimaryKeyIndexVersionLocator(Locator, dict):
         return f"{pkiv_root_path}/{hb_index}"
 
     def get_pkiv_hb_index_s3_url_base(
-            self,
-            s3_bucket: str,
-            hash_bucket_index: int) -> str:
+        self, s3_bucket: str, hash_bucket_index: int
+    ) -> str:
         """
         Gets the base S3 URL of a single hash bucket of the given primary key
         index version locator.
@@ -217,9 +221,8 @@ class PrimaryKeyIndexVersionLocator(Locator, dict):
         return f"s3://{s3_bucket}/{hbi_root_path}"
 
     def get_pkiv_hb_index_manifest_s3_url(
-            self,
-            s3_bucket: str,
-            hash_bucket_index: int) -> str:
+        self, s3_bucket: str, hash_bucket_index: int
+    ) -> str:
         """
         Gets the S3 URL of the manifest for a single primary key index version
         hash bucket.
@@ -241,8 +244,9 @@ class PrimaryKeyIndexVersionLocator(Locator, dict):
 
 class PrimaryKeyIndexVersionMeta(dict):
     @staticmethod
-    def of(primary_key_index_meta: PrimaryKeyIndexMeta,
-           hash_bucket_count: int) -> PrimaryKeyIndexVersionMeta:
+    def of(
+        primary_key_index_meta: PrimaryKeyIndexMeta, hash_bucket_count: int
+    ) -> PrimaryKeyIndexVersionMeta:
         """
         Creates Primary Key Index Version Metadata from the given Primary Key
         Index Metadata and hash bucket count.
deltacat/compute/compactor/model/pyarrow_write_result.py
@@ -6,10 +6,9 @@ from typing import List
 
 class PyArrowWriteResult(dict):
     @staticmethod
-    def of(file_count: int,
-           pyarrow_bytes: int,
-           file_bytes: int,
-           record_count: int) -> PyArrowWriteResult:
+    def of(
+        file_count: int, pyarrow_bytes: int, file_bytes: int, record_count: int
+    ) -> PyArrowWriteResult:
         pawr = PyArrowWriteResult()
         pawr["files"] = file_count
         pawr["paBytes"] = pyarrow_bytes
deltacat/compute/compactor/model/round_completion_info.py
@@ -1,24 +1,26 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations
 
-from deltacat.storage import DeltaLocator
-from deltacat.compute.compactor.model.pyarrow_write_result import \
-    PyArrowWriteResult
-from deltacat.compute.compactor.model.primary_key_index import \
-    PrimaryKeyIndexVersionLocator
+from typing import Any, Dict, Optional
 
-from typing import Any, Dict
+from deltacat.compute.compactor.model.primary_key_index import (
+    PrimaryKeyIndexVersionLocator,
+)
+from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
+from deltacat.storage import DeltaLocator, PartitionLocator
 
 
 class RoundCompletionInfo(dict):
     @staticmethod
-    def of(high_watermark: int,
-           compacted_delta_locator: DeltaLocator,
-           compacted_pyarrow_write_result: PyArrowWriteResult,
-           pk_index_pyarrow_write_result: PyArrowWriteResult,
-           sort_keys_bit_width: int,
-           primary_key_index_version_locator: PrimaryKeyIndexVersionLocator) \
-            -> RoundCompletionInfo:
+    def of(
+        high_watermark: int,
+        compacted_delta_locator: DeltaLocator,
+        compacted_pyarrow_write_result: PyArrowWriteResult,
+        pk_index_pyarrow_write_result: PyArrowWriteResult,
+        sort_keys_bit_width: int,
+        primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
+        rebase_source_partition_locator: Optional[PartitionLocator],
+    ) -> RoundCompletionInfo:
 
         rci = RoundCompletionInfo()
         rci["highWatermark"] = high_watermark
@@ -27,6 +29,7 @@ class RoundCompletionInfo(dict):
         rci["pkIndexPyarrowWriteResult"] = pk_index_pyarrow_write_result
         rci["sortKeysBitWidth"] = sort_keys_bit_width
         rci["primaryKeyIndexVersionLocator"] = primary_key_index_version_locator
+        rci["rebaseSourcePartitionLocator"] = rebase_source_partition_locator
         return rci
 
     @property
@@ -59,11 +62,14 @@ class RoundCompletionInfo(dict):
         return self["sortKeysBitWidth"]
 
     @property
-    def primary_key_index_version_locator(self) \
-            -> PrimaryKeyIndexVersionLocator:
+    def primary_key_index_version_locator(self) -> PrimaryKeyIndexVersionLocator:
         val: Dict[str, Any] = self.get("primaryKeyIndexVersionLocator")
-        if val is not None \
-                and not isinstance(val, PrimaryKeyIndexVersionLocator):
-            self["primaryKeyIndexVersionLocator"] = val = \
-                PrimaryKeyIndexVersionLocator(val)
+        if val is not None and not isinstance(val, PrimaryKeyIndexVersionLocator):
+            self["primaryKeyIndexVersionLocator"] = val = PrimaryKeyIndexVersionLocator(
+                val
+            )
         return val
+
+    @property
+    def rebase_source_partition_locator(self) -> Optional[PartitionLocator]:
+        return self.get("rebaseSourcePartitionLocator")
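
Note: this file carries the one functional change in the group. RoundCompletionInfo.of gains a trailing rebase_source_partition_locator parameter (typed Optional but positional, so call sites written against 0.1.10.dev0 must be updated), stored under the new "rebaseSourcePartitionLocator" key and read back by the new property. A sketch of the updated call with placeholder arguments:

from deltacat.compute.compactor.model.round_completion_info import (
    RoundCompletionInfo,
)

# Placeholder arguments (real call sites pass locator and write-result
# objects); the point is the new trailing parameter.
rci = RoundCompletionInfo.of(
    1024,  # high_watermark (hypothetical stream position)
    None,  # compacted_delta_locator
    None,  # compacted_pyarrow_write_result
    None,  # pk_index_pyarrow_write_result
    0,     # sort_keys_bit_width
    None,  # primary_key_index_version_locator
    None,  # rebase_source_partition_locator: new in 0.1.12
)
assert rci.rebase_source_partition_locator is None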
deltacat/compute/compactor/model/sort_key.py
@@ -2,14 +2,13 @@
 from __future__ import annotations
 
 import logging
-import pyarrow as pa
-
 from enum import Enum
+from typing import List
 
-from deltacat.storage import PartitionLocator
-from deltacat import logs
+import pyarrow as pa
 
-from typing import List
+from deltacat import logs
+from deltacat.storage import PartitionLocator
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
@@ -23,8 +22,7 @@ class SortOrder(str, Enum):
 
 class SortKey(tuple):
     @staticmethod
-    def of(key_name: str, sort_order: SortOrder = SortOrder.ASCENDING) \
-            -> SortKey:
+    def of(key_name: str, sort_order: SortOrder = SortOrder.ASCENDING) -> SortKey:
         """
         Create a sort key from a field name to use as the sort key, and
         the sort order for this key. If no sort order is specified, then the
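
Note: per the of signature above, the sort order defaults to ascending. A usage sketch with hypothetical field names (SortOrder.DESCENDING is assumed to be the enum's other member alongside the ASCENDING default shown):

from deltacat.compute.compactor.model.sort_key import SortKey, SortOrder

# Hypothetical field names for illustration only.
keys = [
    SortKey.of("last_updated"),
    SortKey.of("region_id", SortOrder.DESCENDING),
]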
@@ -39,9 +37,10 @@
 
     @staticmethod
     def validate_sort_keys(
-            source_partition_locator: PartitionLocator,
-            sort_keys: List[SortKey],
-            deltacat_storage) -> int:
+        source_partition_locator: PartitionLocator,
+        sort_keys: List[SortKey],
+        deltacat_storage,
+    ) -> int:
         """
         Validates the input sort keys to ensure that they are unique, are using
         a valid sort key model, are all fixed-width data types, and that the
@@ -51,8 +50,9 @@ class SortKey(tuple):
         total_sort_keys_bit_width = 0
         if sort_keys:
             sort_key_names = [key.key_name for key in sort_keys]
-            assert len(sort_key_names) == len(set(sort_key_names)), \
-                f"Sort key names must be unique: {sort_key_names}"
+            assert len(sort_key_names) == len(
+                set(sort_key_names)
+            ), f"Sort key names must be unique: {sort_key_names}"
             stream_locator = source_partition_locator.stream_locator
             table_version_schema = deltacat_storage.get_table_version_schema(
                 stream_locator.namespace,
@@ -70,19 +70,22 @@ class SortKey(tuple):
                                f"Total length of sort keys "
                                f"({total_sort_keys_bit_width}) is greater "
                                f"than the max supported bit width for all "
-                                f"sort keys ({MAX_SORT_KEYS_BIT_WIDTH})")
+                                f"sort keys ({MAX_SORT_KEYS_BIT_WIDTH})"
+                            )
                    except ValueError as e:
                        raise ValueError(
                            f"Unable to get bit width of sort key: {pa_field}. "
                            f"Please ensure that all sort keys are fixed-size "
-                            f"PyArrow data types.") from e
+                            f"PyArrow data types."
+                        ) from e
                else:
                    logger.warning(
                        f"Unable to estimate sort key bit width for schema type "
                        f"{type(table_version_schema)}. This compaction job run "
                        f"may run out of memory, run more slowly, or underutilize "
                        f"available resources. To fix this, either remove the "
-                        f"sort keys or provide a PyArrow schema.")
+                        f"sort keys or provide a PyArrow schema."
+                    )
                    total_sort_keys_bit_width = MAX_SORT_KEYS_BIT_WIDTH
        return total_sort_keys_bit_width
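
Note: the bit-width accounting in validate_sort_keys leans on a pyarrow behavior worth spelling out: fixed-width Arrow types expose a bit_width attribute, while variable-width types such as string raise ValueError on access, which the method converts into the "fixed-size PyArrow data types" error above. An illustration with hypothetical sort-key fields:

import pyarrow as pa

# Hypothetical sort-key fields: fixed-width types expose bit_width...
schema = pa.schema([("ts", pa.int64()), ("shard", pa.int32())])
total_bits = sum(schema.field(name).type.bit_width for name in ("ts", "shard"))
assert total_bits == 96

# ...while variable-width types raise ValueError on access, which
# validate_sort_keys surfaces as the "fixed-size PyArrow data types" error.
try:
    pa.string().bit_width
except ValueError:
    pass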