deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- deltacat/__init__.py +41 -15
- deltacat/aws/clients.py +12 -31
- deltacat/aws/constants.py +1 -1
- deltacat/aws/redshift/__init__.py +7 -2
- deltacat/aws/redshift/model/manifest.py +54 -50
- deltacat/aws/s3u.py +176 -187
- deltacat/catalog/delegate.py +151 -185
- deltacat/catalog/interface.py +78 -97
- deltacat/catalog/model/catalog.py +21 -21
- deltacat/catalog/model/table_definition.py +11 -9
- deltacat/compute/compactor/__init__.py +12 -16
- deltacat/compute/compactor/compaction_session.py +237 -166
- deltacat/compute/compactor/model/delta_annotated.py +60 -44
- deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
- deltacat/compute/compactor/model/delta_file_locator.py +10 -8
- deltacat/compute/compactor/model/materialize_result.py +6 -7
- deltacat/compute/compactor/model/primary_key_index.py +38 -34
- deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
- deltacat/compute/compactor/model/round_completion_info.py +25 -19
- deltacat/compute/compactor/model/sort_key.py +18 -15
- deltacat/compute/compactor/steps/dedupe.py +119 -94
- deltacat/compute/compactor/steps/hash_bucket.py +48 -47
- deltacat/compute/compactor/steps/materialize.py +86 -92
- deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
- deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
- deltacat/compute/compactor/utils/io.py +59 -47
- deltacat/compute/compactor/utils/primary_key_index.py +91 -80
- deltacat/compute/compactor/utils/round_completion_file.py +22 -23
- deltacat/compute/compactor/utils/system_columns.py +33 -45
- deltacat/compute/metastats/meta_stats.py +235 -157
- deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
- deltacat/compute/metastats/stats.py +95 -64
- deltacat/compute/metastats/utils/io.py +100 -53
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
- deltacat/compute/metastats/utils/ray_utils.py +38 -33
- deltacat/compute/stats/basic.py +107 -69
- deltacat/compute/stats/models/delta_column_stats.py +11 -8
- deltacat/compute/stats/models/delta_stats.py +59 -32
- deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
- deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
- deltacat/compute/stats/models/stats_result.py +24 -14
- deltacat/compute/stats/utils/intervals.py +16 -9
- deltacat/compute/stats/utils/io.py +86 -51
- deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
- deltacat/constants.py +4 -13
- deltacat/io/__init__.py +2 -2
- deltacat/io/aws/redshift/redshift_datasource.py +157 -143
- deltacat/io/dataset.py +14 -17
- deltacat/io/read_api.py +36 -33
- deltacat/logs.py +94 -42
- deltacat/storage/__init__.py +18 -8
- deltacat/storage/interface.py +196 -213
- deltacat/storage/model/delta.py +45 -51
- deltacat/storage/model/list_result.py +12 -8
- deltacat/storage/model/namespace.py +4 -5
- deltacat/storage/model/partition.py +42 -42
- deltacat/storage/model/stream.py +29 -30
- deltacat/storage/model/table.py +14 -14
- deltacat/storage/model/table_version.py +32 -31
- deltacat/storage/model/types.py +1 -0
- deltacat/tests/stats/test_intervals.py +11 -24
- deltacat/tests/utils/__init__.py +0 -0
- deltacat/tests/utils/test_record_batch_tables.py +284 -0
- deltacat/types/media.py +3 -4
- deltacat/types/tables.py +31 -21
- deltacat/utils/common.py +5 -11
- deltacat/utils/numpy.py +20 -22
- deltacat/utils/pandas.py +73 -100
- deltacat/utils/performance.py +3 -9
- deltacat/utils/placement.py +259 -230
- deltacat/utils/pyarrow.py +302 -89
- deltacat/utils/ray_utils/collections.py +2 -1
- deltacat/utils/ray_utils/concurrency.py +27 -28
- deltacat/utils/ray_utils/dataset.py +28 -28
- deltacat/utils/ray_utils/performance.py +5 -9
- deltacat/utils/ray_utils/runtime.py +9 -10
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
- deltacat-0.1.12.dist-info/RECORD +110 -0
- deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
Full diffs for the compactor model files in this release follow. Lines the diff viewer elided from the old version are left blank rather than reconstructed; indentation is restored from the surrounding Python structure.

deltacat/compute/compactor/model/delta_annotated.py:

```diff
@@ -2,21 +2,26 @@
 from __future__ import annotations
 
 import logging
-from deltacat import logs
-from deltacat.storage import DeltaType, Manifest, ManifestEntry, \
-    ManifestEntryList
-from typing import List, Optional, Callable, Union
 from types import FunctionType
-from
+from typing import Callable, List, Optional, Union
+
+from deltacat import logs
+from deltacat.storage import (
+    Delta,
+    DeltaType,
+    Manifest,
+    ManifestEntry,
+    ManifestEntryList,
+)
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
 class DeltaAnnotation(tuple):
     @staticmethod
-    def of(
-
-
+    def of(
+        file_index: int, delta_type: DeltaType, stream_position: int
+    ) -> DeltaAnnotation:
         return DeltaAnnotation((file_index, delta_type, stream_position))
 
     @property
@@ -48,17 +53,19 @@ class DeltaAnnotated(Delta):
         if entries:
            dtype = delta.type
            pos = delta.stream_position
-            _annotations = [
-
+            _annotations = [
+                DeltaAnnotation.of(i, dtype, pos) for i in range(len(entries))
+            ]
            delta_annotated.annotations = _annotations
         return delta_annotated
 
     @staticmethod
     def rebatch(
-
-
-
-
+        annotated_deltas: List[DeltaAnnotated],
+        min_delta_bytes,
+        min_file_counts: Optional[Union[int, float]] = float("inf"),
+        estimation_function: Optional[Callable] = None,
+    ) -> List[DeltaAnnotated]:
         """
         Simple greedy algorithm to split/merge 1 or more annotated deltas into
         size-limited annotated deltas. All ordered manifest entries in the input
@@ -75,30 +82,38 @@ class DeltaAnnotated(Delta):
         for src_da in annotated_deltas:
             src_da_annotations = src_da.annotations
             src_da_entries = src_da.manifest.entries
-            assert
-
-
-
+            assert (
+                len(src_da_annotations) == len(src_da_entries),
+                f"Unexpected Error: Length of delta annotations "
+                f"({len(src_da_annotations)}) doesn't mach the length of "
+                f"delta manifest entries ({len(src_da_entries)}).",
+            )
             for i, src_entry in enumerate(src_da_entries):
                 DeltaAnnotated._append_annotated_entry(
-                    src_da,
-
-                    src_entry,
-                    src_da_annotations[i])
+                    src_da, new_da, src_entry, src_da_annotations[i]
+                )
                 # TODO: Fetch s3_obj["Size"] if entry content length undefined?
-                estimated_new_da_bytes =
-                    estimation_function
+                estimated_new_da_bytes = (
+                    estimation_function(src_entry.meta.content_length)
+                    if type(estimation_function) is FunctionType
+                    else src_entry.meta.content_length
+                )
                 new_da_bytes += estimated_new_da_bytes
                 da_group_entry_count += 1
-                if
+                if (
+                    new_da_bytes >= min_delta_bytes
+                    or da_group_entry_count >= min_file_counts
+                ):
                     if new_da_bytes >= min_delta_bytes:
                         logger.info(
                             f"Appending group of {da_group_entry_count} elements "
-                            f"and {new_da_bytes} bytes to meet file size limit")
+                            f"and {new_da_bytes} bytes to meet file size limit"
+                        )
                     if da_group_entry_count >= min_file_counts:
                         logger.info(
                             f"Appending group of {da_group_entry_count} elements "
-                            f"and {da_group_entry_count} files to meet file count limit")
+                            f"and {da_group_entry_count} files to meet file count limit"
+                        )
                     groups.append(new_da)
                     new_da = DeltaAnnotated()
                     new_da_bytes = 0
@@ -108,32 +123,31 @@ class DeltaAnnotated(Delta):
         return groups
 
     @staticmethod
-    def split(
-            src_da: DeltaAnnotated,
-            pieces: int) -> List[DeltaAnnotated]:
+    def split(src_da: DeltaAnnotated, pieces: int) -> List[DeltaAnnotated]:
         groups = []
         new_da = DeltaAnnotated()
         da_group_entry_count = 0
         src_da_annotations = src_da.annotations
         src_da_entries = src_da.manifest.entries
-        assert (
-
-
-
+        assert (
+            len(src_da_annotations) == len(src_da_entries),
+            f"Unexpected Error: Length of delta annotations "
+            f"({len(src_da_annotations)}) doesn't mach the length of "
+            f"delta manifest entries ({len(src_da_entries)}).",
+        )
         src_da_entries_length = len(src_da_entries)
         equal_length = src_da_entries_length // pieces
         for i in range(len(src_da_entries)):
             DeltaAnnotated._append_annotated_entry(
-                src_da,
-
-                src_da_entries[i],
-                src_da_annotations[i])
+                src_da, new_da, src_da_entries[i], src_da_annotations[i]
+            )
             # TODO: Fetch s3_obj["Size"] if entry content length undefined?
             da_group_entry_count += 1
             if da_group_entry_count >= equal_length and i < equal_length * (pieces - 1):
                 logger.info(
                     f"Splitting {da_group_entry_count} manifest files "
-                    f"to {pieces} pieces of {equal_length} size.")
+                    f"to {pieces} pieces of {equal_length} size."
+                )
                 groups.append(new_da)
                 new_da = DeltaAnnotated()
                 da_group_entry_count = 0
@@ -141,7 +155,8 @@ class DeltaAnnotated(Delta):
         groups.append(new_da)
         logger.info(
             f"Splitting {da_group_entry_count} manifest files "
-            f"to {pieces} pieces of {equal_length} size.")
+            f"to {pieces} pieces of {equal_length} size."
+        )
         new_da = DeltaAnnotated()
         if new_da:
             groups.append(new_da)
@@ -157,10 +172,11 @@ class DeltaAnnotated(Delta):
 
     @staticmethod
     def _append_annotated_entry(
-
-
-
-
+        src_da: DeltaAnnotated,
+        dst_da: DeltaAnnotated,
+        src_entry: ManifestEntry,
+        src_annotation: DeltaAnnotation,
+    ):
 
         if not dst_da:
             # copy all extended properties from the source delta manifest impl
```
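The `rebatch` hunk above implements a simple greedy split/merge: entries accumulate into a group until either the byte threshold or the file-count threshold is crossed, then a new group starts. One caveat worth flagging: the parenthesized `assert` added to both `rebatch` and `split` turns the condition and message into a single non-empty tuple, and a non-empty tuple is always truthy, so that length check can never fail (CPython emits a SyntaxWarning for exactly this pattern). Below is a minimal, self-contained sketch of the greedy batching logic, using plain lists of entry byte sizes in place of `DeltaAnnotated` objects; all names and sizes are illustrative.

```python
# A minimal sketch of the greedy batching pattern rebatch() implements:
# accumulate entries into a group until either a byte-size or a file-count
# threshold is crossed, then start a new group. Entry sizes stand in for
# manifest entry content lengths.
from typing import List, Union


def greedy_rebatch(
    entry_sizes: List[int],
    min_group_bytes: int,
    max_group_files: Union[int, float] = float("inf"),
) -> List[List[int]]:
    groups: List[List[int]] = []
    group: List[int] = []
    group_bytes = 0
    for size in entry_sizes:
        group.append(size)
        group_bytes += size
        if group_bytes >= min_group_bytes or len(group) >= max_group_files:
            groups.append(group)
            group, group_bytes = [], 0
    if group:
        groups.append(group)  # flush the final, possibly undersized group
    return groups


# Three groups: [[10, 90], [120], [30, 40]]
print(greedy_rebatch([10, 90, 120, 30, 40], min_group_bytes=100))
```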
deltacat/compute/compactor/model/delta_file_envelope.py:

```diff
@@ -2,19 +2,18 @@
 from __future__ import annotations
 
 import numpy as np
-from deltacat.storage import DeltaType, LocalTable
 
+from deltacat.storage import DeltaType, LocalTable
 
 DeltaFileEnvelopeGroups = np.ndarray
 
 
 class DeltaFileEnvelope(dict):
     @staticmethod
-    def of(
-
-
-
-        """ Static factory builder for a Delta File Envelope
+    def of(
+        stream_position: int, file_index: int, delta_type: DeltaType, table: LocalTable
+    ) -> DeltaFileEnvelope:
+        """Static factory builder for a Delta File Envelope
         `
         Args:
             stream_position: Stream position of a delta.
```
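This hunk is an import reorder plus a reflow of the `of()` factory onto a black-style signature. A hypothetical usage sketch of that signature follows, assuming deltacat 0.1.12 and pyarrow are installed and that a pyarrow Table is an acceptable `LocalTable` payload:

```python
# Hypothetical usage of DeltaFileEnvelope.of() as reformatted above; the
# keyword arguments mirror the signature shown in the diff, and the choice
# of DeltaType.UPSERT is illustrative.
import pyarrow as pa

from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelope
from deltacat.storage import DeltaType

table = pa.table({"pk": [1, 2, 3], "value": ["a", "b", "c"]})
envelope = DeltaFileEnvelope.of(
    stream_position=100,
    file_index=0,
    delta_type=DeltaType.UPSERT,
    table=table,
)
```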
deltacat/compute/compactor/model/delta_file_locator.py:

```diff
@@ -8,9 +8,9 @@ from deltacat.storage import Locator
 
 class DeltaFileLocator(Locator, tuple):
     @staticmethod
-    def of(
-
-
+    def of(
+        is_src_delta: np.bool_, stream_position: np.int64, file_index: np.int32
+    ) -> DeltaFileLocator:
         """
         Create a Delta File Locator tuple that can be used to uniquely identify
         and retrieve a file from any compaction job run input Delta.
@@ -30,11 +30,13 @@ class DeltaFileLocator(Locator, tuple):
             delta_file_locator: The Delta File Locator Tuple as
                 (is_source_delta, stream_position, file_index).
         """
-        return DeltaFileLocator(
-
-
-
-
+        return DeltaFileLocator(
+            (
+                is_src_delta,
+                stream_position,
+                file_index,
+            )
+        )
 
     @property
     def is_source_delta(self) -> np.bool_:
```
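`DeltaFileLocator` stores its fields positionally in the underlying tuple and exposes them through read-only properties. A minimal sketch of that pattern, with illustrative names matching the diff:

```python
# Sketch of the tuple-subclass factory pattern used by DeltaFileLocator:
# of() packs the fields into the tuple, and read-only properties unpack
# them by position. The FileLocator class is hypothetical.
from __future__ import annotations


class FileLocator(tuple):
    @staticmethod
    def of(is_src_delta: bool, stream_position: int, file_index: int) -> FileLocator:
        return FileLocator((is_src_delta, stream_position, file_index))

    @property
    def is_source_delta(self) -> bool:
        return self[0]

    @property
    def stream_position(self) -> int:
        return self[1]

    @property
    def file_index(self) -> int:
        return self[2]


loc = FileLocator.of(True, 1024, 7)
assert loc.stream_position == 1024  # positional fields, immutable like any tuple
```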
deltacat/compute/compactor/model/materialize_result.py:

```diff
@@ -1,18 +1,17 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations
 
-from deltacat.storage import Delta
-from deltacat.compute.compactor.model.pyarrow_write_result import \
-    PyArrowWriteResult
-
 from typing import Any, Dict
 
+from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
+from deltacat.storage import Delta
+
 
 class MaterializeResult(dict):
     @staticmethod
-    def of(
-
-
+    def of(
+        delta: Delta, task_index: int, pyarrow_write_result: PyArrowWriteResult
+    ) -> MaterializeResult:
         materialize_result = MaterializeResult()
         materialize_result["delta"] = delta
         materialize_result["taskIndex"] = task_index
```
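`MaterializeResult` follows the dict-backed model convention shared by the compactor models: `of()` writes values under camelCase keys so instances serialize straight to JSON, and properties read them back. A minimal sketch with a hypothetical `TaskResult` model:

```python
# Sketch of the dict-backed model pattern: the factory stores values under
# camelCase keys, so the object is directly JSON-serializable; properties
# translate back to snake_case access. TaskResult is illustrative.
from __future__ import annotations

import json
from typing import Any


class TaskResult(dict):
    @staticmethod
    def of(task_index: int, payload: Any) -> TaskResult:
        result = TaskResult()
        result["taskIndex"] = task_index
        result["payload"] = payload
        return result

    @property
    def task_index(self) -> int:
        return self["taskIndex"]


r = TaskResult.of(3, {"files": 2})
print(json.dumps(r))  # {"taskIndex": 3, "payload": {"files": 2}}
```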
deltacat/compute/compactor/model/primary_key_index.py:

```diff
@@ -1,19 +1,17 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations
 
+from typing import Any, Dict, List
 from uuid import uuid4
 
 from deltacat.compute.compactor.model.sort_key import SortKey
 from deltacat.storage import Locator, PartitionLocator
 from deltacat.utils.common import sha1_hexdigest
 
-from typing import Any, Dict, List
-
 
 class PrimaryKeyIndexLocator(Locator, dict):
     @staticmethod
-    def of(primary_key_index_meta: PrimaryKeyIndexMeta) \
-            -> PrimaryKeyIndexLocator:
+    def of(primary_key_index_meta: PrimaryKeyIndexMeta) -> PrimaryKeyIndexLocator:
         """
         Creates a Primary Key Index Locator from the given Primary Key
         Index Metadata. A Primary Key Index Locator consists of a Primary Key
@@ -33,13 +31,16 @@ class PrimaryKeyIndexLocator(Locator, dict):
 
     @staticmethod
     def _root_path(
-
-
-
-
+        compacted_partition_locator: PartitionLocator,
+        primary_keys: List[str],
+        sort_keys: List[SortKey],
+        primary_key_index_algorithm_version: str,
+    ) -> str:
         pl_hexdigest = compacted_partition_locator.hexdigest()
-        pki_version_str =
-
+        pki_version_str = (
+            f"{pl_hexdigest}|{primary_keys}|{sort_keys}|"
+            f"{primary_key_index_algorithm_version}"
+        )
         return sha1_hexdigest(pki_version_str.encode("utf-8"))
 
     @property
@@ -76,10 +77,12 @@ class PrimaryKeyIndexLocator(Locator, dict):
 
 class PrimaryKeyIndexMeta(dict):
     @staticmethod
-    def of(
-
-
-
+    def of(
+        compacted_partition_locator: PartitionLocator,
+        primary_keys: List[str],
+        sort_keys: List[SortKey],
+        primary_key_index_algo_version: str,
+    ) -> PrimaryKeyIndexMeta:
         """
         Creates Primary Key Index Metadata from the given compacted
         Partition Locator, primary keys, sort keys, and primary key index
@@ -114,8 +117,10 @@ class PrimaryKeyIndexMeta(dict):
 
 class PrimaryKeyIndexVersionLocator(Locator, dict):
     @staticmethod
-    def of(
-
+    def of(
+        primary_key_index_version_meta: PrimaryKeyIndexVersionMeta,
+        pki_version_root_path: str,
+    ) -> PrimaryKeyIndexVersionLocator:
         """
         Creates a primary key index version locator from the given primary key
         index version metadata and version root path. Note that, while this is
@@ -129,8 +134,9 @@ class PrimaryKeyIndexVersionLocator(Locator, dict):
         return pkivl
 
     @staticmethod
-    def generate(
-
+    def generate(
+        pki_version_meta: PrimaryKeyIndexVersionMeta,
+    ) -> PrimaryKeyIndexVersionLocator:
         """
         Creates a new primary key index version locator from the given primary
         key index version metadata. A primary key index version locator
@@ -142,11 +148,12 @@ class PrimaryKeyIndexVersionLocator(Locator, dict):
         deterministically from the compacted partition locator, primary keys,
         sort keys, and primary key index algorithm version.
         """
-        pki_version_root_path =
-            _generate_version_root_path(
+        pki_version_root_path = (
+            PrimaryKeyIndexVersionLocator._generate_version_root_path(
                 PrimaryKeyIndexVersionLocator._pki_root_path(pki_version_meta),
                 pki_version_meta.hash_bucket_count,
             )
+        )
         pkivl = PrimaryKeyIndexVersionLocator()
         pkivl["primaryKeyIndexVersionMeta"] = pki_version_meta
         pkivl["primaryKeyIndexVersionRootPath"] = pki_version_root_path
@@ -159,17 +166,14 @@ class PrimaryKeyIndexVersionLocator(Locator, dict):
         return pki_locator.primary_key_index_root_path
 
     @staticmethod
-    def _generate_version_root_path(
-            pki_root_path: str,
-            hash_bucket_count: int) -> str:
+    def _generate_version_root_path(pki_root_path: str, hash_bucket_count: int) -> str:
         return f"{pki_root_path}/{hash_bucket_count}/{str(uuid4())}"
 
     @property
     def primary_key_index_version_meta(self) -> PrimaryKeyIndexVersionMeta:
         val: Dict[str, Any] = self.get("primaryKeyIndexVersionMeta")
         if val is not None and not isinstance(val, PrimaryKeyIndexVersionMeta):
-            self["primaryKeyIndexVersionMeta"] = val = \
-                PrimaryKeyIndexVersionMeta(val)
+            self["primaryKeyIndexVersionMeta"] = val = PrimaryKeyIndexVersionMeta(val)
         return val
 
     @property
@@ -179,7 +183,8 @@ class PrimaryKeyIndexVersionLocator(Locator, dict):
         index version locator.
         """
         return PrimaryKeyIndexVersionLocator._pki_root_path(
-            self.primary_key_index_version_meta)
+            self.primary_key_index_version_meta
+        )
 
     @property
     def primary_key_index_version_root_path(self) -> str:
@@ -206,9 +211,8 @@ class PrimaryKeyIndexVersionLocator(Locator, dict):
         return f"{pkiv_root_path}/{hb_index}"
 
     def get_pkiv_hb_index_s3_url_base(
-
-
-            hash_bucket_index: int) -> str:
+        self, s3_bucket: str, hash_bucket_index: int
+    ) -> str:
         """
         Gets the base S3 URL of a single hash bucket of the given primary key
         index version locator.
@@ -217,9 +221,8 @@ class PrimaryKeyIndexVersionLocator(Locator, dict):
         return f"s3://{s3_bucket}/{hbi_root_path}"
 
     def get_pkiv_hb_index_manifest_s3_url(
-
-
-            hash_bucket_index: int) -> str:
+        self, s3_bucket: str, hash_bucket_index: int
+    ) -> str:
         """
         Gets the S3 URL of the manifest for a single primary key index version
         hash bucket.
@@ -241,8 +244,9 @@ class PrimaryKeyIndexVersionLocator(Locator, dict):
 
 class PrimaryKeyIndexVersionMeta(dict):
     @staticmethod
-    def of(
-
+    def of(
+        primary_key_index_meta: PrimaryKeyIndexMeta, hash_bucket_count: int
+    ) -> PrimaryKeyIndexVersionMeta:
         """
         Creates Primary Key Index Version Metadata from the given Primary Key
         Index Metadata and hash bucket count.
```
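The `_root_path` change above only regroups an existing f-string, but it shows how primary key index root paths are derived: the partition hexdigest, primary keys, sort keys, and algorithm version are joined with `|` and SHA-1 hashed, so identical inputs always resolve to the same path. A sketch under the assumption that `sha1_hexdigest` behaves like `deltacat.utils.common.sha1_hexdigest`:

```python
# Sketch of the deterministic root-path derivation in _root_path(): join the
# locator fields with "|" and SHA-1 hash the result. Function and parameter
# names here are illustrative.
import hashlib
from typing import List


def sha1_hexdigest(payload: bytes) -> str:
    return hashlib.sha1(payload).hexdigest()


def pki_root_path(
    partition_hexdigest: str,
    primary_keys: List[str],
    sort_keys: List[str],
    algo_version: str,
) -> str:
    pki_version_str = (
        f"{partition_hexdigest}|{primary_keys}|{sort_keys}|{algo_version}"
    )
    return sha1_hexdigest(pki_version_str.encode("utf-8"))


# Same inputs always yield the same root path:
assert pki_root_path("abc123", ["pk"], [], "1") == pki_root_path("abc123", ["pk"], [], "1")
```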
deltacat/compute/compactor/model/pyarrow_write_result.py:

```diff
@@ -6,10 +6,9 @@ from typing import List
 
 class PyArrowWriteResult(dict):
     @staticmethod
-    def of(
-
-
-            record_count: int) -> PyArrowWriteResult:
+    def of(
+        file_count: int, pyarrow_bytes: int, file_bytes: int, record_count: int
+    ) -> PyArrowWriteResult:
         pawr = PyArrowWriteResult()
         pawr["files"] = file_count
         pawr["paBytes"] = pyarrow_bytes
```
deltacat/compute/compactor/model/round_completion_info.py:

```diff
@@ -1,24 +1,26 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations
 
-from
-from deltacat.compute.compactor.model.pyarrow_write_result import \
-    PyArrowWriteResult
-from deltacat.compute.compactor.model.primary_key_index import \
-    PrimaryKeyIndexVersionLocator
+from typing import Any, Dict, Optional
 
-from
+from deltacat.compute.compactor.model.primary_key_index import (
+    PrimaryKeyIndexVersionLocator,
+)
+from deltacat.compute.compactor.model.pyarrow_write_result import PyArrowWriteResult
+from deltacat.storage import DeltaLocator, PartitionLocator
 
 
 class RoundCompletionInfo(dict):
     @staticmethod
-    def of(
-
-
-
-
-
-
+    def of(
+        high_watermark: int,
+        compacted_delta_locator: DeltaLocator,
+        compacted_pyarrow_write_result: PyArrowWriteResult,
+        pk_index_pyarrow_write_result: PyArrowWriteResult,
+        sort_keys_bit_width: int,
+        primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
+        rebase_source_partition_locator: Optional[PartitionLocator],
+    ) -> RoundCompletionInfo:
 
         rci = RoundCompletionInfo()
         rci["highWatermark"] = high_watermark
@@ -27,6 +29,7 @@ class RoundCompletionInfo(dict):
         rci["pkIndexPyarrowWriteResult"] = pk_index_pyarrow_write_result
         rci["sortKeysBitWidth"] = sort_keys_bit_width
         rci["primaryKeyIndexVersionLocator"] = primary_key_index_version_locator
+        rci["rebaseSourcePartitionLocator"] = rebase_source_partition_locator
         return rci
 
     @property
@@ -59,11 +62,14 @@ class RoundCompletionInfo(dict):
         return self["sortKeysBitWidth"]
 
     @property
-    def primary_key_index_version_locator(self) \
-            -> PrimaryKeyIndexVersionLocator:
+    def primary_key_index_version_locator(self) -> PrimaryKeyIndexVersionLocator:
         val: Dict[str, Any] = self.get("primaryKeyIndexVersionLocator")
-        if val is not None
-
-
-
+        if val is not None and not isinstance(val, PrimaryKeyIndexVersionLocator):
+            self["primaryKeyIndexVersionLocator"] = val = PrimaryKeyIndexVersionLocator(
+                val
+            )
         return val
+
+    @property
+    def rebase_source_partition_locator(self) -> Optional[PartitionLocator]:
+        return self.get("rebaseSourcePartitionLocator")
```
deltacat/compute/compactor/model/sort_key.py:

```diff
@@ -2,14 +2,13 @@
 from __future__ import annotations
 
 import logging
-import pyarrow as pa
-
 from enum import Enum
+from typing import List
 
-
-from deltacat import logs
+import pyarrow as pa
 
-from
+from deltacat import logs
+from deltacat.storage import PartitionLocator
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
@@ -23,8 +22,7 @@ class SortOrder(str, Enum):
 
 class SortKey(tuple):
     @staticmethod
-    def of(key_name: str, sort_order: SortOrder = SortOrder.ASCENDING) \
-            -> SortKey:
+    def of(key_name: str, sort_order: SortOrder = SortOrder.ASCENDING) -> SortKey:
         """
         Create a sort key from a field name to use as the sort key, and
         the sort order for this key. If no sort order is specified, then the
@@ -39,9 +37,10 @@ class SortKey(tuple):
 
     @staticmethod
     def validate_sort_keys(
-
-
-
+        source_partition_locator: PartitionLocator,
+        sort_keys: List[SortKey],
+        deltacat_storage,
+    ) -> int:
         """
         Validates the input sort keys to ensure that they are unique, are using
         a valid sort key model, are all fixed-width data types, and that the
@@ -51,8 +50,9 @@ class SortKey(tuple):
         total_sort_keys_bit_width = 0
         if sort_keys:
             sort_key_names = [key.key_name for key in sort_keys]
-            assert len(sort_key_names) == len(
-
+            assert len(sort_key_names) == len(
+                set(sort_key_names)
+            ), f"Sort key names must be unique: {sort_key_names}"
             stream_locator = source_partition_locator.stream_locator
             table_version_schema = deltacat_storage.get_table_version_schema(
                 stream_locator.namespace,
@@ -70,19 +70,22 @@ class SortKey(tuple):
                             f"Total length of sort keys "
                             f"({total_sort_keys_bit_width}) is greater "
                             f"than the max supported bit width for all "
-                            f"sort keys ({MAX_SORT_KEYS_BIT_WIDTH})")
+                            f"sort keys ({MAX_SORT_KEYS_BIT_WIDTH})"
+                        )
                 except ValueError as e:
                     raise ValueError(
                         f"Unable to get bit width of sort key: {pa_field}. "
                         f"Please ensure that all sort keys are fixed-size "
-                        f"PyArrow data types.") from e
+                        f"PyArrow data types."
+                    ) from e
             else:
                 logger.warning(
                     f"Unable to estimate sort key bit width for schema type "
                     f"{type(table_version_schema)}. This compaction job run "
                     f"may run out of memory, run more slowly, or underutilize "
                     f"available resources. To fix this, either remove the "
-                    f"sort keys or provide a PyArrow schema.")
+                    f"sort keys or provide a PyArrow schema."
+                )
                total_sort_keys_bit_width = MAX_SORT_KEYS_BIT_WIDTH
         return total_sort_keys_bit_width
```
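`validate_sort_keys` relies on pyarrow's `DataType.bit_width`, which returns the width of fixed-size types and raises `ValueError` for variable-width types such as strings — the same `ValueError` the hunk above rewraps with a friendlier message. A small sketch with a made-up schema:

```python
# Sketch of the fixed-width check behind validate_sort_keys(): sum bit widths
# of fixed-size sort key columns, and treat a ValueError from bit_width as
# "not a valid fixed-width sort key". The schema and column names are made up.
import pyarrow as pa

schema = pa.schema([("id", pa.int64()), ("name", pa.string())])

total_bit_width = 0
for name in ["id"]:
    field = schema.field(name)
    total_bit_width += field.type.bit_width  # 64 for int64

try:
    _ = schema.field("name").type.bit_width  # variable-width type: raises
except ValueError as e:
    print(f"not a fixed-width sort key candidate: {e}")

print(total_bit_width)  # 64
```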