deltacat 0.1.6__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff shows the content changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
- deltacat/__init__.py +41 -15
- deltacat/aws/clients.py +12 -31
- deltacat/aws/constants.py +1 -1
- deltacat/aws/redshift/__init__.py +7 -2
- deltacat/aws/redshift/model/manifest.py +54 -50
- deltacat/aws/s3u.py +183 -194
- deltacat/catalog/delegate.py +151 -185
- deltacat/catalog/interface.py +78 -97
- deltacat/catalog/model/catalog.py +21 -21
- deltacat/catalog/model/table_definition.py +11 -9
- deltacat/compute/compactor/__init__.py +12 -16
- deltacat/compute/compactor/compaction_session.py +249 -198
- deltacat/compute/compactor/model/delta_annotated.py +60 -44
- deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
- deltacat/compute/compactor/model/delta_file_locator.py +10 -8
- deltacat/compute/compactor/model/materialize_result.py +6 -7
- deltacat/compute/compactor/model/primary_key_index.py +38 -34
- deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
- deltacat/compute/compactor/model/round_completion_info.py +25 -19
- deltacat/compute/compactor/model/sort_key.py +18 -15
- deltacat/compute/compactor/steps/dedupe.py +153 -260
- deltacat/compute/compactor/steps/hash_bucket.py +56 -56
- deltacat/compute/compactor/steps/materialize.py +139 -100
- deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
- deltacat/compute/compactor/steps/rehash/rewrite_index.py +11 -13
- deltacat/compute/compactor/utils/io.py +59 -47
- deltacat/compute/compactor/utils/primary_key_index.py +131 -90
- deltacat/compute/compactor/utils/round_completion_file.py +22 -23
- deltacat/compute/compactor/utils/system_columns.py +33 -42
- deltacat/compute/metastats/meta_stats.py +235 -157
- deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
- deltacat/compute/metastats/stats.py +95 -64
- deltacat/compute/metastats/utils/io.py +100 -53
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
- deltacat/compute/metastats/utils/ray_utils.py +38 -33
- deltacat/compute/stats/basic.py +107 -69
- deltacat/compute/stats/models/delta_column_stats.py +11 -8
- deltacat/compute/stats/models/delta_stats.py +59 -32
- deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
- deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
- deltacat/compute/stats/models/stats_result.py +24 -14
- deltacat/compute/stats/utils/intervals.py +16 -9
- deltacat/compute/stats/utils/io.py +86 -51
- deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
- deltacat/constants.py +8 -10
- deltacat/io/__init__.py +2 -2
- deltacat/io/aws/redshift/redshift_datasource.py +157 -143
- deltacat/io/dataset.py +14 -17
- deltacat/io/read_api.py +36 -33
- deltacat/logs.py +94 -42
- deltacat/storage/__init__.py +18 -8
- deltacat/storage/interface.py +196 -213
- deltacat/storage/model/delta.py +45 -51
- deltacat/storage/model/list_result.py +12 -8
- deltacat/storage/model/namespace.py +4 -5
- deltacat/storage/model/partition.py +42 -42
- deltacat/storage/model/stream.py +29 -30
- deltacat/storage/model/table.py +14 -14
- deltacat/storage/model/table_version.py +32 -31
- deltacat/storage/model/types.py +1 -0
- deltacat/tests/stats/test_intervals.py +11 -24
- deltacat/tests/utils/__init__.py +0 -0
- deltacat/tests/utils/test_record_batch_tables.py +284 -0
- deltacat/types/media.py +3 -4
- deltacat/types/tables.py +31 -21
- deltacat/utils/common.py +5 -11
- deltacat/utils/numpy.py +20 -22
- deltacat/utils/pandas.py +73 -100
- deltacat/utils/performance.py +3 -9
- deltacat/utils/placement.py +276 -228
- deltacat/utils/pyarrow.py +302 -89
- deltacat/utils/ray_utils/collections.py +2 -1
- deltacat/utils/ray_utils/concurrency.py +36 -29
- deltacat/utils/ray_utils/dataset.py +28 -28
- deltacat/utils/ray_utils/performance.py +5 -9
- deltacat/utils/ray_utils/runtime.py +9 -10
- {deltacat-0.1.6.dist-info → deltacat-0.1.11.dist-info}/METADATA +21 -11
- deltacat-0.1.11.dist-info/RECORD +110 -0
- {deltacat-0.1.6.dist-info → deltacat-0.1.11.dist-info}/WHEEL +1 -1
- deltacat-0.1.6.dist-info/RECORD +0 -108
- {deltacat-0.1.6.dist-info → deltacat-0.1.11.dist-info}/LICENSE +0 -0
- {deltacat-0.1.6.dist-info → deltacat-0.1.11.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor/utils/io.py
@@ -1,24 +1,23 @@
 import logging
-import time
 import math
-from
-from deltacat.constants import PYARROW_INFLATION_MULTIPLIER, BYTES_PER_MEBIBYTE
+from typing import Dict, List, Optional, Tuple

-from deltacat.storage import PartitionLocator, Delta, \
-    interface as unimplemented_deltacat_storage
 from deltacat import logs
 from deltacat.compute.compactor import DeltaAnnotated
-
-from
+from deltacat.compute.stats.models.delta_stats import DeltaStats
+from deltacat.constants import BYTES_PER_MEBIBYTE, PYARROW_INFLATION_MULTIPLIER
+from deltacat.storage import Delta, PartitionLocator
+from deltacat.storage import interface as unimplemented_deltacat_storage

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


 def discover_deltas(
-
-
-
-
+    source_partition_locator: PartitionLocator,
+    start_position_exclusive: Optional[int],
+    end_position_inclusive: int,
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> List[Delta]:

     stream_locator = source_partition_locator.stream_locator
     namespace = stream_locator.namespace
@@ -36,32 +35,38 @@ def discover_deltas(
     )
     deltas = deltas_list_result.all_items()
     if not deltas:
-        raise RuntimeError(
-
-
-
-
+        raise RuntimeError(
+            f"Unexpected Error: Couldn't find any deltas to "
+            f"compact in delta stream position range "
+            f"('{start_position_exclusive}', "
+            f"'{end_position_inclusive}']. Source partition: "
+            f"{source_partition_locator}"
+        )
     if start_position_exclusive:
         first_delta = deltas.pop(0)
-        logger.info(
-
-
-
-
-
-
+        logger.info(
+            f"Removed exclusive start delta w/ expected stream "
+            f"position '{start_position_exclusive}' from deltas to "
+            f"compact: {first_delta}"
+        )
+        logger.info(
+            f"Count of deltas to compact in delta stream "
+            f"position range ('{start_position_exclusive}', "
+            f"'{end_position_inclusive}']: {len(deltas)}. Source "
+            f"partition: '{source_partition_locator}'"
+        )
     return deltas


 def limit_input_deltas(
-
-
-
-
-
-
-
-
+    input_deltas: List[Delta],
+    cluster_resources: Dict[str, float],
+    hash_bucket_count: int,
+    min_pk_index_pa_bytes: int,
+    user_hash_bucket_chunk_size: int,
+    input_deltas_stats: Dict[int, DeltaStats],
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> Tuple[List[DeltaAnnotated], int, int]:

     # TODO (pdames): when row counts are available in metadata, use them
     # instead of bytes - memory consumption depends more on number of
@@ -78,9 +83,10 @@ def limit_input_deltas(
     # )
     if min_pk_index_pa_bytes > 0:
         required_heap_mem_for_dedupe = worker_obj_store_mem - min_pk_index_pa_bytes
-        assert required_heap_mem_for_dedupe > 0,
-            f"Not enough required memory available to re-batch input deltas"
+        assert required_heap_mem_for_dedupe > 0, (
+            f"Not enough required memory available to re-batch input deltas"
             f"and initiate the dedupe step."
+        )
     # Size of batched deltas must also be reduced to have enough space for primary
     # key index files (from earlier compaction rounds) in the dedupe step, since
     # they will be loaded into worker heap memory.
@@ -88,8 +94,7 @@ def limit_input_deltas(

     logger.info(f"Total worker object store memory: {worker_obj_store_mem}")
     worker_obj_store_mem_per_task = worker_obj_store_mem / worker_cpus
-    logger.info(f"Worker object store memory/task: "
-                f"{worker_obj_store_mem_per_task}")
+    logger.info(f"Worker object store memory/task: " f"{worker_obj_store_mem_per_task}")
     worker_task_mem = cluster_resources["memory"]
     logger.info(f"Total worker memory: {worker_task_mem}")
     # TODO (pdames): ensure fixed memory per CPU in heterogenous clusters
@@ -105,8 +110,10 @@ def limit_input_deltas(
     if input_deltas_stats is None:
         input_deltas_stats = {}

-    input_deltas_stats = {
-
+    input_deltas_stats = {
+        int(stream_pos): DeltaStats(delta_stats)
+        for stream_pos, delta_stats in input_deltas_stats.items()
+    }
     for delta in input_deltas:
         manifest = deltacat_storage.get_delta_manifest(delta)
         delta.manifest = manifest
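For orientation, the comprehension shown in this hunk only normalizes the shape of the caller-supplied stats: keys are coerced to int stream positions and values are wrapped in DeltaStats. A minimal, self-contained sketch of the same key normalization, with made-up stream positions and plain dicts standing in for serialized DeltaStats payloads:

# Made-up stats keyed by stream-position strings (e.g., after a JSON round trip).
raw_stats = {
    "101": {"row_count": 10},
    "102": {"row_count": 20},
}

# Mirror of the comprehension above, minus the DeltaStats wrapper.
normalized = {int(stream_pos): delta_stats for stream_pos, delta_stats in raw_stats.items()}

assert sorted(normalized) == [101, 102]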
@@ -118,7 +125,8 @@ def limit_input_deltas(
             # TODO (pdames): ensure pyarrow object fits in per-task obj store mem
             logger.warning(
                 f"Stats are missing for delta stream position {delta.stream_position}, "
-                f"materialized delta may not fit in per-task object store memory."
+                f"materialized delta may not fit in per-task object store memory."
+            )
         manifest_entries = delta.manifest.entries
         delta_manifest_entries += len(manifest_entries)
         for entry in manifest_entries:
@@ -130,13 +138,13 @@ def limit_input_deltas(
             logger.info(
                 f"Input deltas limited to "
                 f"{len(limited_input_da_list)} by object store mem "
-                f"({delta_bytes_pyarrow} > {worker_obj_store_mem})"
+                f"({delta_bytes_pyarrow} > {worker_obj_store_mem})"
+            )
             break
         delta_annotated = DeltaAnnotated.of(delta)
         limited_input_da_list.append(delta_annotated)

-    logger.info(f"Input deltas to compact this round: "
-                f"{len(limited_input_da_list)}")
+    logger.info(f"Input deltas to compact this round: " f"{len(limited_input_da_list)}")
     logger.info(f"Input delta bytes to compact: {delta_bytes}")
     logger.info(f"Input delta files to compact: {delta_manifest_entries}")
     logger.info(f"Latest input delta stream position: {latest_stream_position}")
@@ -146,10 +154,12 @@ def limit_input_deltas(

     # TODO (pdames): determine min hash buckets from size of all deltas
     # (not just deltas for this round)
-    min_hash_bucket_count = int(
-
-
-
+    min_hash_bucket_count = int(
+        max(
+            math.ceil(delta_bytes_pyarrow / worker_obj_store_mem_per_task),
+            min(worker_cpus, 256),
+        )
+    )
     logger.info(f"Minimum recommended hash buckets: {min_hash_bucket_count}")

     if hash_bucket_count is None:
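The expression added above derives a minimum hash bucket count from the estimated PyArrow size of the input deltas divided by the object store memory available per task, with a floor of min(worker_cpus, 256). A small worked example with invented numbers (none of the byte counts or CPU counts below come from deltacat itself):

import math

# Hypothetical inputs, for illustration only.
delta_bytes_pyarrow = 2 * 1024**4             # ~2 TiB of input after PyArrow inflation
worker_obj_store_mem_per_task = 16 * 1024**3  # ~16 GiB of object store memory per task
worker_cpus = 64

min_hash_bucket_count = int(
    max(
        # Enough buckets that each task's share of the data fits in its object store slice.
        math.ceil(delta_bytes_pyarrow / worker_obj_store_mem_per_task),
        # Floor: at least min(worker_cpus, 256) buckets.
        min(worker_cpus, 256),
    )
)
assert min_hash_bucket_count == 128  # the memory term (128) dominates the CPU floor (64)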
@@ -168,7 +178,8 @@ def limit_input_deltas(
             f"resolve this problem either specify a larger number of hash "
             f"buckets when running compaction, omit a custom hash bucket "
             f"count when running compaction, or provision workers with more "
-            f"task memory per CPU."
+            f"task memory per CPU."
+        )

     hash_bucket_chunk_size = user_hash_bucket_chunk_size
     max_hash_bucket_chunk_size = math.ceil(
@@ -185,7 +196,8 @@ def limit_input_deltas(
             f"specify a smaller hash bucket chunk size when running "
             f"compaction, omit a custom hash bucket chunk size when running "
             f"compaction, or provision workers with more task and object "
-            f"store memory per CPU."
+            f"store memory per CPU."
+        )
     elif not hash_bucket_chunk_size:
         hash_bucket_chunk_size_load_balanced = max(
             math.ceil(max(delta_bytes, delta_bytes_pyarrow) / worker_cpus),
deltacat/compute/compactor/utils/primary_key_index.py
@@ -1,48 +1,54 @@
-import logging
 import json
-import
-import pyarrow as pa
-import numpy as np
-import s3fs
+import logging
 from collections import defaultdict
+from typing import Any, Callable, Dict, List, Optional, Tuple

-
+import numpy as np
+import pyarrow as pa
+import ray
+import s3fs
 from ray import cloudpickle
+from ray.types import ObjectRef

-from deltacat
-from deltacat.
-
-
-
-
+from deltacat import logs
+from deltacat.aws import s3u
+from deltacat.compute.compactor import (
+    PrimaryKeyIndexLocator,
+    PrimaryKeyIndexMeta,
+    PrimaryKeyIndexVersionLocator,
+    PrimaryKeyIndexVersionMeta,
+    PyArrowWriteResult,
+    RoundCompletionInfo,
+)
+from deltacat.compute.compactor.steps.rehash import rehash_bucket as rb
+from deltacat.compute.compactor.steps.rehash import rewrite_index as ri
 from deltacat.compute.compactor.utils import round_completion_file as rcf
 from deltacat.compute.compactor.utils import system_columns as sc
-from deltacat.
-
-from deltacat.types.
-from deltacat.types.
-from deltacat.
-from deltacat import
-
-from typing import Any, Callable, Dict, List, Optional, Tuple
-
-from ray.types import ObjectRef
+from deltacat.constants import PRIMARY_KEY_INDEX_WRITE_BOTO3_CONFIG
+from deltacat.storage import Manifest, PartitionLocator
+from deltacat.types.media import ContentEncoding, ContentType
+from deltacat.types.tables import get_table_slicer, get_table_writer
+from deltacat.utils.common import ReadKwargsProvider
+from deltacat.utils.ray_utils.concurrency import invoke_parallel

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


 def rehash(
-
-
-
-
-
-
-
-
-
-
-
+    options_provider: Callable[[int, Any], Dict[str, Any]],
+    s3_bucket: str,
+    source_partition_locator: PartitionLocator,
+    old_rci: RoundCompletionInfo,
+    new_hash_bucket_count: int,
+    hash_bucket_index_group_count: int,
+    records_per_primary_key_index_file: int,
+    delete_old_primary_key_index: bool,
+) -> RoundCompletionInfo:
+
+    logger.info(
+        f"Rehashing primary key index. Old round completion info: "
+        f"{old_rci}. New hash bucket count: {new_hash_bucket_count}"
+    )

     # collect old primary key index information
     old_pki_version_locator = old_rci.primary_key_index_version_locator
@@ -50,10 +56,12 @@ def rehash(
     old_pki_meta = old_pkiv_meta.primary_key_index_meta
     old_compacted_partition_locator = old_pki_meta.compacted_partition_locator
     if old_pkiv_meta.hash_bucket_count == new_hash_bucket_count:
-        raise ValueError(
-
-
-
+        raise ValueError(
+            f"Primary key index rehash failed. Old hash bucket "
+            f"count ({new_hash_bucket_count}) is "
+            f"equal to new hash bucket count. Partition: "
+            f"{old_compacted_partition_locator}."
+        )

     # generate a new unique primary key index version locator to rehash into
     new_pki_meta = PrimaryKeyIndexMeta.of(
@@ -68,7 +76,8 @@ def rehash(
         new_hash_bucket_count,
     )
     rehashed_pki_version_locator = PrimaryKeyIndexVersionLocator.generate(
-        new_pki_version_meta
+        new_pki_version_meta
+    )

     # launch a rehash task for each bucket of the old primary key index version
     old_hash_bucket_count = old_pkiv_meta.hash_bucket_count
@@ -114,6 +123,7 @@ def rehash(
         PyArrowWriteResult.union(pki_stats),
         old_rci.sort_keys_bit_width,
         rehashed_pki_version_locator,
+        old_rci.rebase_source_partition_locator,
     )
     rcf.write_round_completion_file(
         s3_bucket,
@@ -126,41 +136,48 @@ def rehash(
         s3_bucket,
         old_pki_version_locator,
     )
-    logger.info(
-
+    logger.info(
+        f"Rehashed primary key index. New round completion info: "
+        f"{round_completion_info}."
+    )
     return round_completion_info


 def download_hash_bucket_entries(
-
-
-
-
-
-
-    pk_index_manifest_s3_url =
-        .get_pkiv_hb_index_manifest_s3_url(
+    s3_bucket: str,
+    hash_bucket_index: int,
+    primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+) -> List[pa.Table]:
+
+    pk_index_manifest_s3_url = (
+        primary_key_index_version_locator.get_pkiv_hb_index_manifest_s3_url(
             s3_bucket,
             hash_bucket_index,
         )
+    )
     result = s3u.download(pk_index_manifest_s3_url, False)
-    logger.info(
-
-
+    logger.info(
+        f"Downloading primary key index hash bucket manifest entries: "
+        f"{pk_index_manifest_s3_url}. Primary key index version "
+        f"locator: {primary_key_index_version_locator}"
+    )
     pk_index_manifest = Manifest(json.loads(result["Body"].read().decode("utf-8")))
-    tables = s3u.download_manifest_entries(
-
+    tables = s3u.download_manifest_entries(
+        pk_index_manifest, file_reader_kwargs_provider=file_reader_kwargs_provider
+    )
     if not tables:
         logger.warning(
             f"Primary key index manifest is empty at: "
             f"{pk_index_manifest_s3_url}. Primary key index version "
-            f"locator: {primary_key_index_version_locator}"
+            f"locator: {primary_key_index_version_locator}"
+        )
     return tables


 def delete_primary_key_index_version(
-
-
+    s3_bucket: str, pki_version_locator: PrimaryKeyIndexVersionLocator
+) -> None:

     logger.info(f"Deleting primary key index: {pki_version_locator}")
     s3u.delete_files_by_prefix(
@@ -171,8 +188,8 @@ def delete_primary_key_index_version(


 def group_record_indices_by_hash_bucket(
-
-
+    pki_table: pa.Table, num_buckets: int
+) -> np.ndarray:

     hash_bucket_to_indices = np.empty([num_buckets], dtype="object")
     record_index = 0
@@ -186,9 +203,11 @@ def group_record_indices_by_hash_bucket(


 def group_hash_bucket_indices(
-
-
-
+    hash_bucket_object_groups: np.ndarray, num_buckets: int, num_groups: int
+) -> Tuple[np.ndarray, List[ObjectRef]]:
+    """
+    Groups all the ObjectRef that belongs to a particular hash bucket group and hash bucket index.
+    """

     object_refs = []
     hash_bucket_group_to_obj_id = np.empty([num_groups], dtype="object")
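For context on what these two helpers group: each record's primary key digest is mapped to a hash bucket, and buckets are in turn spread across a fixed number of hash bucket groups. A minimal sketch of that two-level mapping, using SHA-1 digests purely for illustration (deltacat's actual digest function may differ, and the key strings, bucket count, and group count are invented):

import hashlib

num_buckets = 8
num_groups = 3

for pk in ["user-1", "user-2", "user-3"]:
    digest = hashlib.sha1(pk.encode("utf-8")).digest()
    # Same bucket assignment as pk_digest_to_hash_bucket_index in the next hunk.
    hb_index = int.from_bytes(digest, "big") % num_buckets
    # Buckets are then spread across groups via `hb_index % num_groups`, as in the next hunk.
    hb_group = hb_index % num_groups
    print(f"{pk}: hash bucket {hb_index}, hash bucket group {hb_group}")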
@@ -201,50 +220,70 @@ def group_hash_bucket_indices(
         if obj:
             hb_group = hb_index % num_groups
             if hb_group_to_object[hb_group] is None:
-                hb_group_to_object[hb_group] = np.empty(
-                    [num_buckets], dtype="object")
+                hb_group_to_object[hb_group] = np.empty([num_buckets], dtype="object")
             hb_group_to_object[hb_group][hb_index] = obj

     for hb_group, obj in enumerate(hb_group_to_object):
-        if obj is
-
-
-
-
+        if obj is None:
+            continue
+        obj_ref = ray.put(obj)
+        pickled_obj_ref = cloudpickle.dumps(obj_ref)
+        object_refs.append(pickled_obj_ref)
+        hash_bucket_group_to_obj_id[hb_group] = pickled_obj_ref
+        # NOTE: The cloudpickle.dumps API call creates an out of band object reference to the object_ref variable.
+        # After pickling, Ray cannot track the serialized copy of the object or determine when the ObjectRef has been deserialized
+        # (e.g., if the ObjectRef is deserialized by a non-Ray process).
+        # Thus the object_ref cannot be tracked by Ray's distributed reference counter, even if it goes out of scope.
+        # The object now has a permanent reference and the data can't be freed from Ray's object store.
+        # Manually deleting the untrackable object references offsets these permanent references and
+        # helps to allow these objects to be garbage collected normally.
+        del obj_ref
+        del pickled_obj_ref
     return hash_bucket_group_to_obj_id, object_refs


-def pk_digest_to_hash_bucket_index(
-
-
+def pk_digest_to_hash_bucket_index(digest, num_buckets: int) -> int:
+    """
+    Deterministically get the hash bucket a particular digest belongs to
+    based on number of total hash buckets.
+    """

     return int.from_bytes(digest, "big") % num_buckets


 def write_primary_key_index_files(
-
-
-
-
-
+    table: pa.Table,
+    primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
+    s3_bucket: str,
+    hb_index: int,
+    records_per_index_file: int,
+) -> PyArrowWriteResult:
     """
     Writes primary key index files for the given hash bucket index out to the
     specified S3 bucket at the path identified by the given primary key index
     version locator. Output is written as 1 or more Parquet files with the
     given maximum number of records per file.
+
+    TODO(raghumdani): Support writing primary key index to any data catalog
     """
-    logger.info(
-
-
+    logger.info(
+        f"Writing primary key index files for hash bucket {hb_index}. "
+        f"Primary key index version locator: "
+        f"{primary_key_index_version_locator}."
+    )
     s3_file_system = s3fs.S3FileSystem(
         anon=False,
         s3_additional_kwargs={
             "ContentType": ContentType.PARQUET.value,
             "ContentEncoding": ContentEncoding.IDENTITY.value,
-        }
+        },
+        config_kwargs=PRIMARY_KEY_INDEX_WRITE_BOTO3_CONFIG,
+    )
+    pkiv_hb_index_s3_url_base = (
+        primary_key_index_version_locator.get_pkiv_hb_index_s3_url_base(
+            s3_bucket, hb_index
+        )
     )
-    pkiv_hb_index_s3_url_base = primary_key_index_version_locator\
-        .get_pkiv_hb_index_s3_url_base(s3_bucket, hb_index)
     manifest_entries = s3u.upload_sliced_table(
         table,
         pkiv_hb_index_s3_url_base,
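The NOTE block added above is the reasoning behind the two del statements: once an ObjectRef is serialized out of band with cloudpickle.dumps, Ray's distributed reference counter can no longer track that copy, so the local handles are dropped deliberately. A standalone sketch of the same put/pickle/delete pattern (the payload, its size, and the ray.init options are arbitrary):

import numpy as np
import ray
from ray import cloudpickle

ray.init(ignore_reinit_error=True)

# Arbitrary payload standing in for one hash bucket group's object array.
payload = np.arange(1_000)

obj_ref = ray.put(payload)                    # store the payload in Ray's object store
pickled_obj_ref = cloudpickle.dumps(obj_ref)  # out-of-band copy that Ray can no longer track

# ... hand pickled_obj_ref to downstream consumers, e.g. append it to a results list ...

# Drop the local handles so the only remaining references are the ones that were
# intentionally passed along; the pickled copy otherwise pins the object in the store.
del obj_ref
del pickled_obj_ref

A consumer that receives the pickled bytes can later rebuild the reference with cloudpickle.loads and fetch the data with ray.get.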
@@ -254,19 +293,21 @@ def write_primary_key_index_files(
         get_table_slicer(table),
     )
     manifest = Manifest.of(manifest_entries)
-    pkiv_hb_index_s3_manifest_s3_url =
-        .get_pkiv_hb_index_manifest_s3_url(
-
-
-        str(json.dumps(manifest))
+    pkiv_hb_index_s3_manifest_s3_url = (
+        primary_key_index_version_locator.get_pkiv_hb_index_manifest_s3_url(
+            s3_bucket, hb_index
+        )
     )
+    s3u.upload(pkiv_hb_index_s3_manifest_s3_url, str(json.dumps(manifest)))
     result = PyArrowWriteResult.of(
         len(manifest_entries),
         table.nbytes,
         manifest.meta.content_length,
         len(table),
     )
-    logger.info(
-
-
+    logger.info(
+        f"Wrote primary key index files for hash bucket {hb_index}. "
+        f"Primary key index version locator: "
+        f"{primary_key_index_version_locator}. Result: {result}"
+    )
     return result
deltacat/compute/compactor/utils/round_completion_file.py
@@ -1,35 +1,35 @@
-import logging
 import json
+import logging

-from deltacat.storage import PartitionLocator
-from deltacat.compute.compactor import RoundCompletionInfo
 from deltacat import logs
+from deltacat.compute.compactor import RoundCompletionInfo
+from deltacat.storage import PartitionLocator

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


 def get_round_completion_file_s3_url(
-
-
-        pki_root_path: str) -> str:
+    bucket: str, source_partition_locator: PartitionLocator, pki_root_path: str
+) -> str:

     base_url = source_partition_locator.path(f"s3://{bucket}")
     return f"{base_url}/{pki_root_path}.json"


 def read_round_completion_file(
-
-
-
+    bucket: str,
+    source_partition_locator: PartitionLocator,
+    primary_key_index_root_path: str,
+) -> RoundCompletionInfo:

     from deltacat.aws import s3u as s3_utils
+
     round_completion_file_url = get_round_completion_file_s3_url(
         bucket,
         source_partition_locator,
         primary_key_index_root_path,
     )
-    logger.info(
-        f"reading round completion file from: {round_completion_file_url}")
+    logger.info(f"reading round completion file from: {round_completion_file_url}")
     round_completion_info = None
     result = s3_utils.download(round_completion_file_url, False)
     if result:
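As the hunk above shows, the round completion file URL is just the source partition's path rendered under the bucket prefix, with the primary key index root path and a .json suffix appended. A minimal sketch of that composition with a stand-in locator (the class, bucket name, and paths below are invented; the real PartitionLocator.path() renders deltacat's canonical partition path):

class StubPartitionLocator:
    """Invented stand-in for deltacat's PartitionLocator, for illustration only."""

    def path(self, prefix: str) -> str:
        # The real locator renders its canonical partition path under `prefix`.
        return f"{prefix}/my_namespace/my_table/1/my_partition"


def get_round_completion_file_s3_url(bucket, source_partition_locator, pki_root_path):
    # Mirrors the helper shown in the hunk above.
    base_url = source_partition_locator.path(f"s3://{bucket}")
    return f"{base_url}/{pki_root_path}.json"


url = get_round_completion_file_s3_url("example-bucket", StubPartitionLocator(), "pki-root")
assert url == "s3://example-bucket/my_namespace/my_table/1/my_partition/pki-root.json"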
@@ -40,24 +40,23 @@ def read_round_completion_file(


 def write_round_completion_file(
-
-
-
-
+    bucket: str,
+    source_partition_locator: PartitionLocator,
+    primary_key_index_root_path: str,
+    round_completion_info: RoundCompletionInfo,
+) -> str:

     from deltacat.aws import s3u as s3_utils
-
-
+
+    logger.info(f"writing round completion file contents: {round_completion_info}")
     round_completion_file_s3_url = get_round_completion_file_s3_url(
         bucket,
         source_partition_locator,
         primary_key_index_root_path,
     )
-    logger.info(
-        f"writing round completion file to: {round_completion_file_s3_url}")
+    logger.info(f"writing round completion file to: {round_completion_file_s3_url}")
     s3_utils.upload(
-        round_completion_file_s3_url,
-        str(json.dumps(round_completion_info))
+        round_completion_file_s3_url, str(json.dumps(round_completion_info))
     )
-    logger.info(
-
+    logger.info(f"round completion file written to: {round_completion_file_s3_url}")
+    return round_completion_file_s3_url