deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +41 -15
- deltacat/aws/clients.py +12 -31
- deltacat/aws/constants.py +1 -1
- deltacat/aws/redshift/__init__.py +7 -2
- deltacat/aws/redshift/model/manifest.py +54 -50
- deltacat/aws/s3u.py +176 -187
- deltacat/catalog/delegate.py +151 -185
- deltacat/catalog/interface.py +78 -97
- deltacat/catalog/model/catalog.py +21 -21
- deltacat/catalog/model/table_definition.py +11 -9
- deltacat/compute/compactor/__init__.py +12 -16
- deltacat/compute/compactor/compaction_session.py +237 -166
- deltacat/compute/compactor/model/delta_annotated.py +60 -44
- deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
- deltacat/compute/compactor/model/delta_file_locator.py +10 -8
- deltacat/compute/compactor/model/materialize_result.py +6 -7
- deltacat/compute/compactor/model/primary_key_index.py +38 -34
- deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
- deltacat/compute/compactor/model/round_completion_info.py +25 -19
- deltacat/compute/compactor/model/sort_key.py +18 -15
- deltacat/compute/compactor/steps/dedupe.py +119 -94
- deltacat/compute/compactor/steps/hash_bucket.py +48 -47
- deltacat/compute/compactor/steps/materialize.py +86 -92
- deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
- deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
- deltacat/compute/compactor/utils/io.py +59 -47
- deltacat/compute/compactor/utils/primary_key_index.py +91 -80
- deltacat/compute/compactor/utils/round_completion_file.py +22 -23
- deltacat/compute/compactor/utils/system_columns.py +33 -45
- deltacat/compute/metastats/meta_stats.py +235 -157
- deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
- deltacat/compute/metastats/stats.py +95 -64
- deltacat/compute/metastats/utils/io.py +100 -53
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
- deltacat/compute/metastats/utils/ray_utils.py +38 -33
- deltacat/compute/stats/basic.py +107 -69
- deltacat/compute/stats/models/delta_column_stats.py +11 -8
- deltacat/compute/stats/models/delta_stats.py +59 -32
- deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
- deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
- deltacat/compute/stats/models/stats_result.py +24 -14
- deltacat/compute/stats/utils/intervals.py +16 -9
- deltacat/compute/stats/utils/io.py +86 -51
- deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
- deltacat/constants.py +4 -13
- deltacat/io/__init__.py +2 -2
- deltacat/io/aws/redshift/redshift_datasource.py +157 -143
- deltacat/io/dataset.py +14 -17
- deltacat/io/read_api.py +36 -33
- deltacat/logs.py +94 -42
- deltacat/storage/__init__.py +18 -8
- deltacat/storage/interface.py +196 -213
- deltacat/storage/model/delta.py +45 -51
- deltacat/storage/model/list_result.py +12 -8
- deltacat/storage/model/namespace.py +4 -5
- deltacat/storage/model/partition.py +42 -42
- deltacat/storage/model/stream.py +29 -30
- deltacat/storage/model/table.py +14 -14
- deltacat/storage/model/table_version.py +32 -31
- deltacat/storage/model/types.py +1 -0
- deltacat/tests/stats/test_intervals.py +11 -24
- deltacat/tests/utils/__init__.py +0 -0
- deltacat/tests/utils/test_record_batch_tables.py +284 -0
- deltacat/types/media.py +3 -4
- deltacat/types/tables.py +31 -21
- deltacat/utils/common.py +5 -11
- deltacat/utils/numpy.py +20 -22
- deltacat/utils/pandas.py +73 -100
- deltacat/utils/performance.py +3 -9
- deltacat/utils/placement.py +259 -230
- deltacat/utils/pyarrow.py +302 -89
- deltacat/utils/ray_utils/collections.py +2 -1
- deltacat/utils/ray_utils/concurrency.py +27 -28
- deltacat/utils/ray_utils/dataset.py +28 -28
- deltacat/utils/ray_utils/performance.py +5 -9
- deltacat/utils/ray_utils/runtime.py +9 -10
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
- deltacat-0.1.12.dist-info/RECORD +110 -0
- deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor/utils/primary_key_index.py:

@@ -8,14 +8,8 @@ import pyarrow as pa
 import ray
 import s3fs
 from ray import cloudpickle
-from deltacat.constants import PRIMARY_KEY_INDEX_WRITE_BOTO3_CONFIG
 from ray.types import ObjectRef
 
-from deltacat.storage import Manifest, PartitionLocator
-from deltacat.utils.ray_utils.concurrency import invoke_parallel
-from deltacat.compute.compactor import PyArrowWriteResult, \
-    RoundCompletionInfo, PrimaryKeyIndexMeta, PrimaryKeyIndexLocator, \
-    PrimaryKeyIndexVersionMeta, PrimaryKeyIndexVersionLocator
 from deltacat import logs
 from deltacat.aws import s3u
 from deltacat.compute.compactor import (
@@ -30,29 +24,31 @@ from deltacat.compute.compactor.steps.rehash import rehash_bucket as rb
 from deltacat.compute.compactor.steps.rehash import rewrite_index as ri
 from deltacat.compute.compactor.utils import round_completion_file as rcf
 from deltacat.compute.compactor.utils import system_columns as sc
+from deltacat.constants import PRIMARY_KEY_INDEX_WRITE_BOTO3_CONFIG
 from deltacat.storage import Manifest, PartitionLocator
 from deltacat.types.media import ContentEncoding, ContentType
 from deltacat.types.tables import get_table_slicer, get_table_writer
 from deltacat.utils.common import ReadKwargsProvider
-from deltacat.utils.ray_utils.concurrency import
-    invoke_parallel
-)
+from deltacat.utils.ray_utils.concurrency import invoke_parallel
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
 def rehash(
-
-
-
-
-
-
-
-
-
-
-
+    options_provider: Callable[[int, Any], Dict[str, Any]],
+    s3_bucket: str,
+    source_partition_locator: PartitionLocator,
+    old_rci: RoundCompletionInfo,
+    new_hash_bucket_count: int,
+    hash_bucket_index_group_count: int,
+    records_per_primary_key_index_file: int,
+    delete_old_primary_key_index: bool,
+) -> RoundCompletionInfo:
+
+    logger.info(
+        f"Rehashing primary key index. Old round completion info: "
+        f"{old_rci}. New hash bucket count: {new_hash_bucket_count}"
+    )
 
     # collect old primary key index information
     old_pki_version_locator = old_rci.primary_key_index_version_locator
@@ -60,10 +56,12 @@ def rehash(
     old_pki_meta = old_pkiv_meta.primary_key_index_meta
     old_compacted_partition_locator = old_pki_meta.compacted_partition_locator
     if old_pkiv_meta.hash_bucket_count == new_hash_bucket_count:
-        raise ValueError(
-
-
-
+        raise ValueError(
+            f"Primary key index rehash failed. Old hash bucket "
+            f"count ({new_hash_bucket_count}) is "
+            f"equal to new hash bucket count. Partition: "
+            f"{old_compacted_partition_locator}."
+        )
 
     # generate a new unique primary key index version locator to rehash into
     new_pki_meta = PrimaryKeyIndexMeta.of(
@@ -78,7 +76,8 @@ def rehash(
         new_hash_bucket_count,
     )
     rehashed_pki_version_locator = PrimaryKeyIndexVersionLocator.generate(
-        new_pki_version_meta
+        new_pki_version_meta
+    )
 
     # launch a rehash task for each bucket of the old primary key index version
     old_hash_bucket_count = old_pkiv_meta.hash_bucket_count
@@ -124,6 +123,7 @@ def rehash(
         PyArrowWriteResult.union(pki_stats),
         old_rci.sort_keys_bit_width,
         rehashed_pki_version_locator,
+        old_rci.rebase_source_partition_locator,
     )
     rcf.write_round_completion_file(
         s3_bucket,
@@ -136,41 +136,48 @@ def rehash(
             s3_bucket,
             old_pki_version_locator,
         )
-    logger.info(
-
+    logger.info(
+        f"Rehashed primary key index. New round completion info: "
+        f"{round_completion_info}."
+    )
     return round_completion_info
 
 
 def download_hash_bucket_entries(
-
-
-
-
-
-
-    pk_index_manifest_s3_url =
-        .get_pkiv_hb_index_manifest_s3_url(
+    s3_bucket: str,
+    hash_bucket_index: int,
+    primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+) -> List[pa.Table]:
+
+    pk_index_manifest_s3_url = (
+        primary_key_index_version_locator.get_pkiv_hb_index_manifest_s3_url(
            s3_bucket,
            hash_bucket_index,
        )
+    )
     result = s3u.download(pk_index_manifest_s3_url, False)
-    logger.info(
-
-
+    logger.info(
+        f"Downloading primary key index hash bucket manifest entries: "
+        f"{pk_index_manifest_s3_url}. Primary key index version "
+        f"locator: {primary_key_index_version_locator}"
+    )
     pk_index_manifest = Manifest(json.loads(result["Body"].read().decode("utf-8")))
-    tables = s3u.download_manifest_entries(
-
+    tables = s3u.download_manifest_entries(
+        pk_index_manifest, file_reader_kwargs_provider=file_reader_kwargs_provider
+    )
     if not tables:
         logger.warning(
             f"Primary key index manifest is empty at: "
             f"{pk_index_manifest_s3_url}. Primary key index version "
-            f"locator: {primary_key_index_version_locator}"
+            f"locator: {primary_key_index_version_locator}"
+        )
     return tables
 
 
 def delete_primary_key_index_version(
-
-
+    s3_bucket: str, pki_version_locator: PrimaryKeyIndexVersionLocator
+) -> None:
 
     logger.info(f"Deleting primary key index: {pki_version_locator}")
     s3u.delete_files_by_prefix(
@@ -181,8 +188,8 @@ def delete_primary_key_index_version(
 
 
 def group_record_indices_by_hash_bucket(
-
-
+    pki_table: pa.Table, num_buckets: int
+) -> np.ndarray:
 
     hash_bucket_to_indices = np.empty([num_buckets], dtype="object")
     record_index = 0
@@ -196,11 +203,10 @@ def group_record_indices_by_hash_bucket(
 
 
 def group_hash_bucket_indices(
-
-
-        num_groups: int) -> Tuple[np.ndarray, List[ObjectRef]]:
+    hash_bucket_object_groups: np.ndarray, num_buckets: int, num_groups: int
+) -> Tuple[np.ndarray, List[ObjectRef]]:
     """
-    Groups all the ObjectRef that belongs to a particular hash bucket group and hash bucket index.
+    Groups all the ObjectRef that belongs to a particular hash bucket group and hash bucket index.
     """
 
     object_refs = []
@@ -214,8 +220,7 @@ def group_hash_bucket_indices(
         if obj:
             hb_group = hb_index % num_groups
             if hb_group_to_object[hb_group] is None:
-                hb_group_to_object[hb_group] = np.empty(
-                    [num_buckets], dtype="object")
+                hb_group_to_object[hb_group] = np.empty([num_buckets], dtype="object")
             hb_group_to_object[hb_group][hb_index] = obj
 
     for hb_group, obj in enumerate(hb_group_to_object):
@@ -225,21 +230,19 @@ def group_hash_bucket_indices(
         pickled_obj_ref = cloudpickle.dumps(obj_ref)
         object_refs.append(pickled_obj_ref)
         hash_bucket_group_to_obj_id[hb_group] = pickled_obj_ref
-        # NOTE: The cloudpickle.dumps API call creates an out of band object reference to the object_ref variable.
-        # After pickling, Ray cannot track the serialized copy of the object or determine when the ObjectRef has been deserialized
-        # (e.g., if the ObjectRef is deserialized by a non-Ray process).
-        # Thus the object_ref cannot be tracked by Ray's distributed reference counter, even if it goes out of scope.
-        # The object now has a permanent reference and the data can't be freed from Ray’s object store.
-        # Manually deleting the untrackable object references offsets these permanent references and
-        # helps to allow these objects to be garbage collected normally.
+        # NOTE: The cloudpickle.dumps API call creates an out of band object reference to the object_ref variable.
+        # After pickling, Ray cannot track the serialized copy of the object or determine when the ObjectRef has been deserialized
+        # (e.g., if the ObjectRef is deserialized by a non-Ray process).
+        # Thus the object_ref cannot be tracked by Ray's distributed reference counter, even if it goes out of scope.
+        # The object now has a permanent reference and the data can't be freed from Ray’s object store.
+        # Manually deleting the untrackable object references offsets these permanent references and
+        # helps to allow these objects to be garbage collected normally.
         del obj_ref
         del pickled_obj_ref
     return hash_bucket_group_to_obj_id, object_refs
 
 
-def pk_digest_to_hash_bucket_index(
-        digest,
-        num_buckets: int) -> int:
+def pk_digest_to_hash_bucket_index(digest, num_buckets: int) -> int:
     """
     Deterministically get the hash bucket a particular digest belongs to
    based on number of total hash buckets.
@@ -249,11 +252,12 @@ def pk_digest_to_hash_bucket_index(
 
 
 def write_primary_key_index_files(
-
-
-
-
-
+    table: pa.Table,
+    primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
+    s3_bucket: str,
+    hb_index: int,
+    records_per_index_file: int,
+) -> PyArrowWriteResult:
     """
     Writes primary key index files for the given hash bucket index out to the
     specified S3 bucket at the path identified by the given primary key index
@@ -262,19 +266,24 @@ def write_primary_key_index_files(
 
     TODO(raghumdani): Support writing primary key index to any data catalog
     """
-    logger.info(
-
-
+    logger.info(
+        f"Writing primary key index files for hash bucket {hb_index}. "
+        f"Primary key index version locator: "
+        f"{primary_key_index_version_locator}."
+    )
     s3_file_system = s3fs.S3FileSystem(
         anon=False,
         s3_additional_kwargs={
             "ContentType": ContentType.PARQUET.value,
             "ContentEncoding": ContentEncoding.IDENTITY.value,
         },
-        config_kwargs=PRIMARY_KEY_INDEX_WRITE_BOTO3_CONFIG
+        config_kwargs=PRIMARY_KEY_INDEX_WRITE_BOTO3_CONFIG,
+    )
+    pkiv_hb_index_s3_url_base = (
+        primary_key_index_version_locator.get_pkiv_hb_index_s3_url_base(
+            s3_bucket, hb_index
+        )
     )
-    pkiv_hb_index_s3_url_base = primary_key_index_version_locator\
-        .get_pkiv_hb_index_s3_url_base(s3_bucket, hb_index)
     manifest_entries = s3u.upload_sliced_table(
         table,
         pkiv_hb_index_s3_url_base,
@@ -284,19 +293,21 @@ def write_primary_key_index_files(
         get_table_slicer(table),
     )
     manifest = Manifest.of(manifest_entries)
-    pkiv_hb_index_s3_manifest_s3_url =
-        .get_pkiv_hb_index_manifest_s3_url(
-
-
-        str(json.dumps(manifest))
+    pkiv_hb_index_s3_manifest_s3_url = (
+        primary_key_index_version_locator.get_pkiv_hb_index_manifest_s3_url(
+            s3_bucket, hb_index
+        )
     )
+    s3u.upload(pkiv_hb_index_s3_manifest_s3_url, str(json.dumps(manifest)))
     result = PyArrowWriteResult.of(
         len(manifest_entries),
        table.nbytes,
        manifest.meta.content_length,
        len(table),
    )
-    logger.info(
-
-
+    logger.info(
+        f"Wrote primary key index files for hash bucket {hb_index}. "
+        f"Primary key index version locator: "
+        f"{primary_key_index_version_locator}. Result: {result}"
+    )
     return result
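
The NOTE retained in `group_hash_bucket_indices` carries the key reasoning in this file: once an `ObjectRef` is serialized out of band with `cloudpickle.dumps`, Ray's distributed reference counter can no longer see that copy, so the backing object stays pinned in the object store unless the local handles are deleted. A minimal standalone sketch of that pattern (not deltacat code; the payload and variable names are illustrative):

```python
import numpy as np
import ray
from ray import cloudpickle

ray.init(ignore_reinit_error=True)

payload = np.arange(1_000_000)
obj_ref = ray.put(payload)                    # tracked by Ray's reference counter
pickled_obj_ref = cloudpickle.dumps(obj_ref)  # out-of-band copy Ray cannot track

handoff = pickled_obj_ref  # e.g. stashed in a result dict for a later task to consume

# The serialized copy pins the object in the object store; deleting the local,
# trackable handles offsets that permanent reference so the memory can eventually
# be reclaimed, mirroring the `del obj_ref` / `del pickled_obj_ref` calls above.
del obj_ref
del pickled_obj_ref

restored_ref = cloudpickle.loads(handoff)  # a consumer rehydrates the reference
print(ray.get(restored_ref).shape)         # (1000000,)
```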
|
@@ -1,35 +1,35 @@
|
|
1
|
-
import logging
|
2
1
|
import json
|
2
|
+
import logging
|
3
3
|
|
4
|
-
from deltacat.storage import PartitionLocator
|
5
|
-
from deltacat.compute.compactor import RoundCompletionInfo
|
6
4
|
from deltacat import logs
|
5
|
+
from deltacat.compute.compactor import RoundCompletionInfo
|
6
|
+
from deltacat.storage import PartitionLocator
|
7
7
|
|
8
8
|
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
9
9
|
|
10
10
|
|
11
11
|
def get_round_completion_file_s3_url(
|
12
|
-
|
13
|
-
|
14
|
-
pki_root_path: str) -> str:
|
12
|
+
bucket: str, source_partition_locator: PartitionLocator, pki_root_path: str
|
13
|
+
) -> str:
|
15
14
|
|
16
15
|
base_url = source_partition_locator.path(f"s3://{bucket}")
|
17
16
|
return f"{base_url}/{pki_root_path}.json"
|
18
17
|
|
19
18
|
|
20
19
|
def read_round_completion_file(
|
21
|
-
|
22
|
-
|
23
|
-
|
20
|
+
bucket: str,
|
21
|
+
source_partition_locator: PartitionLocator,
|
22
|
+
primary_key_index_root_path: str,
|
23
|
+
) -> RoundCompletionInfo:
|
24
24
|
|
25
25
|
from deltacat.aws import s3u as s3_utils
|
26
|
+
|
26
27
|
round_completion_file_url = get_round_completion_file_s3_url(
|
27
28
|
bucket,
|
28
29
|
source_partition_locator,
|
29
30
|
primary_key_index_root_path,
|
30
31
|
)
|
31
|
-
logger.info(
|
32
|
-
f"reading round completion file from: {round_completion_file_url}")
|
32
|
+
logger.info(f"reading round completion file from: {round_completion_file_url}")
|
33
33
|
round_completion_info = None
|
34
34
|
result = s3_utils.download(round_completion_file_url, False)
|
35
35
|
if result:
|
@@ -40,24 +40,23 @@ def read_round_completion_file(
|
|
40
40
|
|
41
41
|
|
42
42
|
def write_round_completion_file(
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
43
|
+
bucket: str,
|
44
|
+
source_partition_locator: PartitionLocator,
|
45
|
+
primary_key_index_root_path: str,
|
46
|
+
round_completion_info: RoundCompletionInfo,
|
47
|
+
) -> str:
|
47
48
|
|
48
49
|
from deltacat.aws import s3u as s3_utils
|
49
|
-
|
50
|
-
|
50
|
+
|
51
|
+
logger.info(f"writing round completion file contents: {round_completion_info}")
|
51
52
|
round_completion_file_s3_url = get_round_completion_file_s3_url(
|
52
53
|
bucket,
|
53
54
|
source_partition_locator,
|
54
55
|
primary_key_index_root_path,
|
55
56
|
)
|
56
|
-
logger.info(
|
57
|
-
f"writing round completion file to: {round_completion_file_s3_url}")
|
57
|
+
logger.info(f"writing round completion file to: {round_completion_file_s3_url}")
|
58
58
|
s3_utils.upload(
|
59
|
-
round_completion_file_s3_url,
|
60
|
-
str(json.dumps(round_completion_info))
|
59
|
+
round_completion_file_s3_url, str(json.dumps(round_completion_info))
|
61
60
|
)
|
62
|
-
logger.info(
|
63
|
-
|
61
|
+
logger.info(f"round completion file written to: {round_completion_file_s3_url}")
|
62
|
+
return round_completion_file_s3_url
|
deltacat/compute/compactor/utils/system_columns.py:

@@ -1,10 +1,11 @@
-import pyarrow as pa
-import numpy as np
 from itertools import repeat
 from typing import Union
 
-
+import numpy as np
+import pyarrow as pa
+
 from deltacat.compute.compactor import DeltaFileEnvelope
+from deltacat.storage import DeltaType
 
 _SYS_COL_UUID = "4000f124-dfbd-48c6-885b-7b22621a6d41"
 
@@ -65,10 +66,7 @@ _IS_SOURCE_COLUMN_FIELD = pa.field(
 
 
 def get_pk_hash_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
-    return pa.array(
-        obj,
-        _PK_HASH_COLUMN_TYPE
-    )
+    return pa.array(obj, _PK_HASH_COLUMN_TYPE)
 
 
 def pk_hash_column_np(table: pa.Table) -> np.ndarray:
@@ -79,6 +77,10 @@ def pk_hash_column(table: pa.Table) -> pa.ChunkedArray:
     return table[_PK_HASH_COLUMN_NAME]
 
 
+def delta_type_column_np(table: pa.Table) -> np.ndarray:
+    return table[_DELTA_TYPE_COLUMN_NAME].to_numpy()
+
+
 def delta_type_column(table: pa.Table) -> pa.ChunkedArray:
     return table[_DELTA_TYPE_COLUMN_NAME]
 
@@ -101,8 +103,7 @@ def stream_position_column_np(table: pa.Table) -> np.ndarray:
     return table[_PARTITION_STREAM_POSITION_COLUMN_NAME].to_numpy()
 
 
-def get_file_index_column_array(obj)
-    -> Union[pa.Array, pa.ChunkedArray]:
+def get_file_index_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
     return pa.array(
         obj,
         _ORDERED_FILE_IDX_COLUMN_TYPE,
@@ -113,8 +114,7 @@ def file_index_column_np(table: pa.Table) -> np.ndarray:
     return table[_ORDERED_FILE_IDX_COLUMN_NAME].to_numpy()
 
 
-def get_record_index_column_array(obj) ->
-    Union[pa.Array, pa.ChunkedArray]:
+def get_record_index_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
     return pa.array(
         obj,
         _ORDERED_RECORD_IDX_COLUMN_TYPE,
@@ -144,7 +144,8 @@ def get_is_source_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
 
 
 def project_delta_file_metadata_on_table(
-
+    delta_file_envelope: DeltaFileEnvelope,
+) -> pa.Table:
 
     table = delta_file_envelope.table
 
@@ -181,42 +182,33 @@ def project_delta_file_metadata_on_table(
     return table
 
 
-def append_stream_position_column(
-        table: pa.Table,
-        stream_positions):
+def append_stream_position_column(table: pa.Table, stream_positions):
 
     table = table.append_column(
         _PARTITION_STREAM_POSITION_COLUMN_FIELD,
-        get_stream_position_column_array(stream_positions)
+        get_stream_position_column_array(stream_positions),
     )
     return table
 
 
-def append_file_idx_column(
-        table: pa.Table,
-        ordered_file_indices):
+def append_file_idx_column(table: pa.Table, ordered_file_indices):
 
     table = table.append_column(
         _ORDERED_FILE_IDX_COLUMN_FIELD,
-        get_file_index_column_array(ordered_file_indices)
+        get_file_index_column_array(ordered_file_indices),
     )
     return table
 
 
-def append_pk_hash_column(
-        table: pa.Table,
-        pk_hashes) -> pa.Table:
+def append_pk_hash_column(table: pa.Table, pk_hashes) -> pa.Table:
 
     table = table.append_column(
-        _PK_HASH_COLUMN_FIELD,
-        get_pk_hash_column_array(pk_hashes)
+        _PK_HASH_COLUMN_FIELD, get_pk_hash_column_array(pk_hashes)
     )
     return table
 
 
-def append_record_idx_col(
-        table: pa.Table,
-        ordered_record_indices) -> pa.Table:
+def append_record_idx_col(table: pa.Table, ordered_record_indices) -> pa.Table:
 
     table = table.append_column(
         _ORDERED_RECORD_IDX_COLUMN_FIELD,
@@ -225,9 +217,7 @@ def append_record_idx_col(
     return table
 
 
-def append_dedupe_task_idx_col(
-        table: pa.Table,
-        dedupe_task_indices) -> pa.Table:
+def append_dedupe_task_idx_col(table: pa.Table, dedupe_task_indices) -> pa.Table:
 
     table = table.append_column(
         _DEDUPE_TASK_IDX_COLUMN_FIELD,
@@ -244,9 +234,7 @@ def delta_type_from_field(delta_type_field: bool) -> DeltaType:
     return DeltaType.UPSERT if delta_type_field else DeltaType.DELETE
 
 
-def append_delta_type_col(
-        table: pa.Table,
-        delta_types) -> pa.Table:
+def append_delta_type_col(table: pa.Table, delta_types) -> pa.Table:
 
     table = table.append_column(
         _DELTA_TYPE_COLUMN_FIELD,
@@ -255,9 +243,7 @@ def append_delta_type_col(
     return table
 
 
-def append_is_source_col(
-        table: pa.Table,
-        booleans) -> pa.Table:
+def append_is_source_col(table: pa.Table, booleans) -> pa.Table:
 
     table = table.append_column(
         _IS_SOURCE_COLUMN_FIELD,
@@ -267,11 +253,13 @@ def append_is_source_col(
 
 
 def get_minimal_hb_schema() -> pa.schema:
-    return pa.schema(
-
-
-
-
-
-
-
+    return pa.schema(
+        [
+            _PK_HASH_COLUMN_FIELD,
+            _ORDERED_RECORD_IDX_COLUMN_FIELD,
+            _ORDERED_FILE_IDX_COLUMN_FIELD,
+            _PARTITION_STREAM_POSITION_COLUMN_FIELD,
+            _DELTA_TYPE_COLUMN_FIELD,
+            _IS_SOURCE_COLUMN_FIELD,
+        ]
+    )