deltacat 0.1.18b12__py3-none-any.whl → 0.1.18b13__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor/repartition_session.py +3 -1
- deltacat/compute/compactor/steps/repartition.py +4 -1
- deltacat/compute/compactor/utils/primary_key_index.py +1 -222
- deltacat/constants.py +3 -4
- deltacat/utils/placement.py +7 -2
- {deltacat-0.1.18b12.dist-info → deltacat-0.1.18b13.dist-info}/METADATA +1 -1
- {deltacat-0.1.18b12.dist-info → deltacat-0.1.18b13.dist-info}/RECORD +11 -14
- deltacat/compute/compactor/steps/rehash/__init__.py +0 -0
- deltacat/compute/compactor/steps/rehash/rehash_bucket.py +0 -57
- deltacat/compute/compactor/steps/rehash/rewrite_index.py +0 -48
- {deltacat-0.1.18b12.dist-info → deltacat-0.1.18b13.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b12.dist-info → deltacat-0.1.18b13.dist-info}/WHEEL +0 -0
- {deltacat-0.1.18b12.dist-info → deltacat-0.1.18b13.dist-info}/top_level.txt +0 -0
deltacat/__init__.py
CHANGED
deltacat/compute/compactor/repartition_session.py
CHANGED
@@ -144,7 +144,9 @@ def repartition(
     logger.info(f"repartition {repar_end - repar_start} seconds")
     logger.info(f"Got {len(ordered_deltas)} task results.")
     # ordered_deltas are ordered as [cold1, cold2, coldN, hot1, hot2, hotN]
-    merged_delta = Delta.merge_deltas(
+    merged_delta = Delta.merge_deltas(
+        ordered_deltas, stream_position=last_stream_position_to_compact
+    )
     compacted_delta = deltacat_storage.commit_delta(
         merged_delta, properties=kwargs.get("properties", {})
     )
deltacat/compute/compactor/steps/repartition.py
CHANGED
@@ -2,6 +2,7 @@ import importlib
 import logging
 from contextlib import nullcontext
 import pyarrow.compute as pc
+from deltacat.constants import SIGNED_INT64_MIN_VALUE, SIGNED_INT64_MAX_VALUE
 import pyarrow as pa
 from typing import List, Optional
 from deltacat.types.media import StorageType, ContentType
@@ -93,7 +94,9 @@ def repartition_range(
     if not all(column in table.column_names for table in tables):
         raise ValueError(f"Column {column} does not exist in the table")
     partition_ranges.sort()
-    partition_ranges =
+    partition_ranges = (
+        [SIGNED_INT64_MIN_VALUE] + partition_ranges + [SIGNED_INT64_MAX_VALUE]
+    )
     partitioned_tables_list = [[] for _ in range(len(partition_ranges) - 1)]

     total_record_count = 0
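The added sentinel bounds are easiest to see in isolation. Below is a minimal sketch (plain Python, not deltacat code; the cut points are made up) of how prepending the signed int64 minimum and appending the maximum turns N sorted cut points into N + 1 ranges, so every int64 value falls into exactly one partition:

SIGNED_INT64_MIN_VALUE = -(2**63)
SIGNED_INT64_MAX_VALUE = 2**63 - 1

# Hypothetical user-supplied cut points for the repartition column.
partition_ranges = [1_000_000, 0, 1_000]
partition_ranges.sort()
partition_ranges = [SIGNED_INT64_MIN_VALUE] + partition_ranges + [SIGNED_INT64_MAX_VALUE]

# One bucket per adjacent pair of bounds (boundary inclusivity here is only illustrative).
buckets = list(zip(partition_ranges[:-1], partition_ranges[1:]))
assert len(buckets) == len(partition_ranges) - 1  # mirrors the sizing of partitioned_tables_list
for low, high in buckets:
    print(low, "->", high)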
deltacat/compute/compactor/utils/primary_key_index.py
CHANGED
@@ -1,180 +1,21 @@
-import json
 import logging
-from
-from typing import Any, Callable, Dict, List, Optional, Tuple
+from typing import List, Optional, Tuple

 import numpy as np
 import pyarrow as pa
-import ray
-import s3fs
 from ray.types import ObjectRef

 from deltacat import logs
 from deltacat.aws import s3u
 from deltacat.compute.compactor import (
-    PrimaryKeyIndexLocator,
-    PrimaryKeyIndexMeta,
     PrimaryKeyIndexVersionLocator,
-    PrimaryKeyIndexVersionMeta,
-    PyArrowWriteResult,
-    RoundCompletionInfo,
 )
-from deltacat.compute.compactor.steps.rehash import rehash_bucket as rb
-from deltacat.compute.compactor.steps.rehash import rewrite_index as ri
-from deltacat.compute.compactor.utils import round_completion_file as rcf
 from deltacat.compute.compactor.utils import system_columns as sc
-from deltacat.constants import PRIMARY_KEY_INDEX_WRITE_BOTO3_CONFIG
-from deltacat.storage import Manifest, PartitionLocator
-from deltacat.types.media import ContentEncoding, ContentType
-from deltacat.types.tables import get_table_slicer, get_table_writer
-from deltacat.utils.common import ReadKwargsProvider
-from deltacat.utils.ray_utils.concurrency import invoke_parallel
 from deltacat.io.object_store import IObjectStore

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


-def rehash(
-    options_provider: Callable[[int, Any], Dict[str, Any]],
-    s3_bucket: str,
-    source_partition_locator: PartitionLocator,
-    old_rci: RoundCompletionInfo,
-    new_hash_bucket_count: int,
-    hash_bucket_index_group_count: int,
-    records_per_primary_key_index_file: int,
-    delete_old_primary_key_index: bool,
-) -> RoundCompletionInfo:
-
-    logger.info(
-        f"Rehashing primary key index. Old round completion info: "
-        f"{old_rci}. New hash bucket count: {new_hash_bucket_count}"
-    )
-
-    # collect old primary key index information
-    old_pki_version_locator = old_rci.primary_key_index_version_locator
-    old_pkiv_meta = old_pki_version_locator.primary_key_index_version_meta
-    old_pki_meta = old_pkiv_meta.primary_key_index_meta
-    old_compacted_partition_locator = old_pki_meta.compacted_partition_locator
-    if old_pkiv_meta.hash_bucket_count == new_hash_bucket_count:
-        raise ValueError(
-            f"Primary key index rehash failed. Old hash bucket "
-            f"count ({new_hash_bucket_count}) is "
-            f"equal to new hash bucket count. Partition: "
-            f"{old_compacted_partition_locator}."
-        )
-
-    # generate a new unique primary key index version locator to rehash into
-    new_pki_meta = PrimaryKeyIndexMeta.of(
-        old_compacted_partition_locator,
-        old_pki_meta.primary_keys,
-        old_pki_meta.sort_keys,
-        old_pki_meta.primary_key_index_algorithm_version,
-    )
-    new_pki_locator = PrimaryKeyIndexLocator.of(new_pki_meta)
-    new_pki_version_meta = PrimaryKeyIndexVersionMeta.of(
-        new_pki_meta,
-        new_hash_bucket_count,
-    )
-    rehashed_pki_version_locator = PrimaryKeyIndexVersionLocator.generate(
-        new_pki_version_meta
-    )
-
-    # launch a rehash task for each bucket of the old primary key index version
-    old_hash_bucket_count = old_pkiv_meta.hash_bucket_count
-    hb_tasks_pending = invoke_parallel(
-        items=range(old_hash_bucket_count),
-        ray_task=rb.rehash_bucket,
-        max_parallelism=None,
-        options_provider=options_provider,
-        s3_bucket=s3_bucket,
-        old_pki_version_locator=old_pki_version_locator,
-        num_buckets=new_hash_bucket_count,
-        num_groups=hash_bucket_index_group_count,
-    )
-    logger.info(f"Getting {len(hb_tasks_pending)} rehash bucket results...")
-    hb_results = ray.get([t[0] for t in hb_tasks_pending])
-    logger.info(f"Got {len(hb_results)} rehash bucket results.")
-    all_hash_group_idx_to_obj_id = defaultdict(list)
-    for hash_group_idx_to_obj_id in hb_results:
-        for hash_group_index, object_id in enumerate(hash_group_idx_to_obj_id):
-            if object_id:
-                all_hash_group_idx_to_obj_id[hash_group_index].append(object_id)
-    hash_group_count = len(all_hash_group_idx_to_obj_id)
-    logger.info(f"Rehash bucket groups created: {hash_group_count}")
-
-    # write primary key index files for each rehashed output bucket
-    pki_stats_promises = invoke_parallel(
-        items=all_hash_group_idx_to_obj_id.values(),
-        ray_task=ri.rewrite_index,
-        max_parallelism=None,
-        options_provider=options_provider,
-        s3_bucket=s3_bucket,
-        new_primary_key_index_version_locator=rehashed_pki_version_locator,
-        max_records_per_index_file=records_per_primary_key_index_file,
-    )
-    logger.info(f"Getting {len(pki_stats_promises)} rewrite index results...")
-    pki_stats = ray.get([t[0] for t in pki_stats_promises])
-    logger.info(f"Got {len(pki_stats)} rewrite index results.")
-
-    round_completion_info = RoundCompletionInfo.of(
-        old_rci.high_watermark,
-        old_rci.compacted_delta_locator,
-        old_rci.compacted_pyarrow_write_result,
-        PyArrowWriteResult.union(pki_stats),
-        old_rci.sort_keys_bit_width,
-        rehashed_pki_version_locator,
-        old_rci.rebase_source_partition_locator,
-    )
-    rcf.write_round_completion_file(
-        s3_bucket,
-        source_partition_locator,
-        new_pki_locator.primary_key_index_root_path,
-        round_completion_info,
-    )
-    if delete_old_primary_key_index:
-        delete_primary_key_index_version(
-            s3_bucket,
-            old_pki_version_locator,
-        )
-    logger.info(
-        f"Rehashed primary key index. New round completion info: "
-        f"{round_completion_info}."
-    )
-    return round_completion_info
-
-
-def download_hash_bucket_entries(
-    s3_bucket: str,
-    hash_bucket_index: int,
-    primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
-    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
-) -> List[pa.Table]:
-
-    pk_index_manifest_s3_url = (
-        primary_key_index_version_locator.get_pkiv_hb_index_manifest_s3_url(
-            s3_bucket,
-            hash_bucket_index,
-        )
-    )
-    result = s3u.download(pk_index_manifest_s3_url, False)
-    logger.info(
-        f"Downloading primary key index hash bucket manifest entries: "
-        f"{pk_index_manifest_s3_url}. Primary key index version "
-        f"locator: {primary_key_index_version_locator}"
-    )
-    pk_index_manifest = Manifest(json.loads(result["Body"].read().decode("utf-8")))
-    tables = s3u.download_manifest_entries(
-        pk_index_manifest, file_reader_kwargs_provider=file_reader_kwargs_provider
-    )
-    if not tables:
-        logger.warning(
-            f"Primary key index manifest is empty at: "
-            f"{pk_index_manifest_s3_url}. Primary key index version "
-            f"locator: {primary_key_index_version_locator}"
-        )
-    return tables
-
-
 def delete_primary_key_index_version(
     s3_bucket: str, pki_version_locator: PrimaryKeyIndexVersionLocator
 ) -> None:
@@ -243,65 +84,3 @@ def pk_digest_to_hash_bucket_index(digest, num_buckets: int) -> int:
     """

     return int.from_bytes(digest, "big") % num_buckets
-
-
-def write_primary_key_index_files(
-    table: pa.Table,
-    primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
-    s3_bucket: str,
-    hb_index: int,
-    records_per_index_file: int,
-) -> PyArrowWriteResult:
-    """
-    Writes primary key index files for the given hash bucket index out to the
-    specified S3 bucket at the path identified by the given primary key index
-    version locator. Output is written as 1 or more Parquet files with the
-    given maximum number of records per file.
-
-    TODO(raghumdani): Support writing primary key index to any data catalog
-    """
-    logger.info(
-        f"Writing primary key index files for hash bucket {hb_index}. "
-        f"Primary key index version locator: "
-        f"{primary_key_index_version_locator}."
-    )
-    s3_file_system = s3fs.S3FileSystem(
-        anon=False,
-        s3_additional_kwargs={
-            "ContentType": ContentType.PARQUET.value,
-            "ContentEncoding": ContentEncoding.IDENTITY.value,
-        },
-        config_kwargs=PRIMARY_KEY_INDEX_WRITE_BOTO3_CONFIG,
-    )
-    pkiv_hb_index_s3_url_base = (
-        primary_key_index_version_locator.get_pkiv_hb_index_s3_url_base(
-            s3_bucket, hb_index
-        )
-    )
-    manifest_entries = s3u.upload_sliced_table(
-        table,
-        pkiv_hb_index_s3_url_base,
-        s3_file_system,
-        records_per_index_file,
-        get_table_writer(table),
-        get_table_slicer(table),
-    )
-    manifest = Manifest.of(manifest_entries)
-    pkiv_hb_index_s3_manifest_s3_url = (
-        primary_key_index_version_locator.get_pkiv_hb_index_manifest_s3_url(
-            s3_bucket, hb_index
-        )
-    )
-    s3u.upload(pkiv_hb_index_s3_manifest_s3_url, str(json.dumps(manifest)))
-    result = PyArrowWriteResult.of(
-        len(manifest_entries),
-        table.nbytes,
-        manifest.meta.content_length,
-        len(table),
-    )
-    logger.info(
-        f"Wrote primary key index files for hash bucket {hb_index}. "
-        f"Primary key index version locator: "
-        f"{primary_key_index_version_locator}. Result: {result}"
-    )
-    return result
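The retained pk_digest_to_hash_bucket_index helper is small enough to exercise directly. A standalone sketch follows; the SHA-1 digest and bucket count are illustrative only and not necessarily what deltacat's compactor uses:

import hashlib


def pk_digest_to_hash_bucket_index(digest, num_buckets: int) -> int:
    # Same arithmetic as the helper kept in primary_key_index.py: interpret the
    # digest bytes as a big-endian integer and reduce it modulo the bucket count.
    return int.from_bytes(digest, "big") % num_buckets


digest = hashlib.sha1(b"some-primary-key-value").digest()  # hypothetical primary key bytes
print(pk_digest_to_hash_bucket_index(digest, num_buckets=256))  # deterministic value in [0, 256)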
deltacat/constants.py
CHANGED
@@ -36,6 +36,9 @@ BYTES_PER_GIBIBYTE = 2**30
 BYTES_PER_TEBIBYTE = 2**40
 BYTES_PER_PEBIBYTE = 2**50

+SIGNED_INT64_MIN_VALUE = -(2**63)
+SIGNED_INT64_MAX_VALUE = 2**63 - 1
+
 # Inflation multiplier from snappy-compressed parquet to pyarrow.
 # This should be kept larger than actual average inflation multipliers.
 # Note that this is a very rough guess since actual observed pyarrow
@@ -49,8 +52,4 @@ PYARROW_INFLATION_MULTIPLIER = 2.5
 # Inflation multiplier from snappy-compressed parquet to pyarrow for all columns.
 PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS = 6

-PRIMARY_KEY_INDEX_WRITE_BOTO3_CONFIG = {
-    "retries": {"max_attempts": 25, "mode": "standard"}
-}
-
 MEMORY_TO_HASH_BUCKET_COUNT_RATIO = 0.0512 * BYTES_PER_TEBIBYTE
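The two new constants are exactly the bounds of a signed 64-bit integer, i.e. the value range of an Arrow/NumPy int64 column. A quick sanity check (plain Python plus numpy, not deltacat code):

import numpy as np

SIGNED_INT64_MIN_VALUE = -(2**63)
SIGNED_INT64_MAX_VALUE = 2**63 - 1

# Matches numpy's signed 64-bit limits, so the sentinels used in
# repartition_range can never be exceeded by a value stored in an int64 column.
assert SIGNED_INT64_MIN_VALUE == np.iinfo(np.int64).min == -9223372036854775808
assert SIGNED_INT64_MAX_VALUE == np.iinfo(np.int64).max == 9223372036854775807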
deltacat/utils/placement.py
CHANGED
@@ -229,8 +229,13 @@ class PlacementGroupManager:

     def get_current_node_resource_key(self) -> str:
         # on ec2: address="172.31.34.51:6379"
-        # on
-
+        # on AWS Glue for Ray: address = "2600:1f10:4674:6815:aadb:2dc8:de61:bc8e:6379"
+        (
+            current_node_name,
+            _,
+        ) = ray.experimental.internal_kv.global_gcs_client.address.rsplit(
+            ":", 1
+        )  # using rsplit split on the last occurence of delimiter ":"
         for node in ray.nodes():
             if node["NodeName"] == current_node_name:
                 # Found the node.
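The switch to rsplit matters because the GCS address may be an IPv6 address, which itself contains ":" separators; splitting on the first ":" would keep only the leading hextet. A standalone sketch using the two address formats quoted in the comments above:

ipv4_address = "172.31.34.51:6379"
ipv6_address = "2600:1f10:4674:6815:aadb:2dc8:de61:bc8e:6379"

for address in (ipv4_address, ipv6_address):
    # rsplit(":", 1) always separates the trailing port from the host,
    # no matter how many ":" characters the host portion contains.
    node_name, port = address.rsplit(":", 1)
    print(node_name, port)
# -> 172.31.34.51 6379
# -> 2600:1f10:4674:6815:aadb:2dc8:de61:bc8e 6379

# A naive address.split(":")[0] would return just "2600" for the IPv6 case.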
{deltacat-0.1.18b12.dist-info → deltacat-0.1.18b13.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
-deltacat/__init__.py,sha256=
-deltacat/constants.py,sha256=
+deltacat/__init__.py,sha256=_t2_FxNTDhr42lxts3cV8iHgCrw_PAT3pIx7MHSA5Ro,1811
+deltacat/constants.py,sha256=_6oRI-3yp5c8J1qKGQZrt89I9-ttT_gSSvVsJ0h8Duc,1939
 deltacat/exceptions.py,sha256=x7qem7FLujXf-DzPsNcQ-XYkW3cF3A0YGIbxkcpz0Mw,146
 deltacat/logs.py,sha256=yyve_6Y4bLWAdCOnxFOPrSR9FRXwZuh68_rRoPpmg08,5633
 deltacat/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -18,7 +18,7 @@ deltacat/catalog/model/table_definition.py,sha256=tKrM1mmaQlvxqXrLt3QJVZK5BZfaJn
 deltacat/compute/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor/__init__.py,sha256=kmWC-Qnw861k7mPhLH4fQEL6CaMeBql2AipHeFqJ2uI,1127
 deltacat/compute/compactor/compaction_session.py,sha256=21Ai6esOqw9nhXIpbVQteLvROIPeiqpDg1iBsOclais,25946
-deltacat/compute/compactor/repartition_session.py,sha256=
+deltacat/compute/compactor/repartition_session.py,sha256=IYBygwvoAGAY6uftZ3C4bAW0VKPfGuKjkdbpr6_FnCo,6986
 deltacat/compute/compactor/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor/model/compact_partition_params.py,sha256=QvjH10IsA8O6ufVzwPz-mcw326BT-Zbs29wFGCcGerA,5677
 deltacat/compute/compactor/model/compaction_session_audit_info.py,sha256=TKgFFdd38cplihdMtHja-cBTwk3dflEipc8smWtZlGg,25231
@@ -37,13 +37,10 @@ deltacat/compute/compactor/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
 deltacat/compute/compactor/steps/dedupe.py,sha256=R6p43mOUWgA1t468FS8JU-Wlrr96tt0ccwa0uytuaRY,10063
 deltacat/compute/compactor/steps/hash_bucket.py,sha256=ZzJQWulSOMve7bDZX7ZRuYAl4bSC4U5SJzPhpeGpKB0,9769
 deltacat/compute/compactor/steps/materialize.py,sha256=mXxKSaPL7iYtqP-eiJlFwi8kuywFmiU5FLS2-DW5314,13964
-deltacat/compute/compactor/steps/repartition.py,sha256=
-deltacat/compute/compactor/steps/rehash/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-deltacat/compute/compactor/steps/rehash/rehash_bucket.py,sha256=yh-sBuUI3hqw2vk_nK9o-KDrgSww4oSvAz2hBxTkv8s,1765
-deltacat/compute/compactor/steps/rehash/rewrite_index.py,sha256=-HVM08pk5ROHEgDP-FVty55-a_0dsGRiSnPlNJw7C6Q,1838
+deltacat/compute/compactor/steps/repartition.py,sha256=EH843SI33fporpxQbeBmEQdvogSYmVYih6hUkxXlZ9w,9953
 deltacat/compute/compactor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor/utils/io.py,sha256=itraIfLGUFfVFrW-XHnsEEa9GNIJR4VCnav0LyjHons,16543
-deltacat/compute/compactor/utils/primary_key_index.py,sha256=
+deltacat/compute/compactor/utils/primary_key_index.py,sha256=ldcgWqnwCfnGSmUWpe68zvFO7SfOXCrytLTISQ3KwNY,2866
 deltacat/compute/compactor/utils/round_completion_file.py,sha256=DmZfHeAXlQn0DDdcsIHZROHWfyBCKTY3pNUdHzalqkE,2284
 deltacat/compute/compactor/utils/system_columns.py,sha256=I36NAEGwRegv56ouVLwTCCisyoOupDCbbaxtoFDzYTE,8121
 deltacat/compute/metastats/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -123,7 +120,7 @@ deltacat/utils/metrics.py,sha256=1CHb5f9SXvTeKljjGawK6wmyij0HN9X6ixMiTssbT_w,467
 deltacat/utils/numpy.py,sha256=ZiGREobTVT6IZXgPxkSUpLJFN2Hn8KEZcrqybLDXCIA,2027
 deltacat/utils/pandas.py,sha256=eGOpiZE1zLznTtuwoN80j4PBp1_bUV8SE4c951r0a3o,9561
 deltacat/utils/performance.py,sha256=rC3CPfroZP3T5TbRNZXB9GRBr0F9i2KUeZYL45JBgCU,610
-deltacat/utils/placement.py,sha256=
+deltacat/utils/placement.py,sha256=6ppSypvmkVH5twN-UdAmDaNLJkBaGnJ2DDMv5NmNv4o,11210
 deltacat/utils/pyarrow.py,sha256=dgAruwOpWYSlnJ5w8iJz_NWpfQoZHA_iG-F7CBDieko,18245
 deltacat/utils/resources.py,sha256=fA53NiJOd5rLMtwvuTnqTyq4g59deD6NCGDbX5yIlg8,2908
 deltacat/utils/ray_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -132,8 +129,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=AyL7hpvYjkmsz-KcpYjVgPpNsmu-x8-rl
 deltacat/utils/ray_utils/dataset.py,sha256=SIljK3UkSqQ6Ntit_iSiYt9yYjN_gGrCTX6_72XdQ3w,3244
 deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
 deltacat/utils/ray_utils/runtime.py,sha256=xOVkqL6o8qGsewGvzhMKxmCcqcFZDnNILuz5IGMgxSc,4991
-deltacat-0.1.
-deltacat-0.1.
-deltacat-0.1.
-deltacat-0.1.
-deltacat-0.1.
+deltacat-0.1.18b13.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+deltacat-0.1.18b13.dist-info/METADATA,sha256=BxNxkho94qIqKJWLh4ShgkrA1BPKKQd1so2e3YV8z5U,1558
+deltacat-0.1.18b13.dist-info/WHEEL,sha256=AtBG6SXL3KF_v0NxLf0ehyVOh0cold-JbJYXNGorC6Q,92
+deltacat-0.1.18b13.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
+deltacat-0.1.18b13.dist-info/RECORD,,
deltacat/compute/compactor/steps/rehash/__init__.py
File without changes
deltacat/compute/compactor/steps/rehash/rehash_bucket.py
DELETED
@@ -1,57 +0,0 @@
-import logging
-from typing import List, Tuple
-
-import numpy as np
-import pyarrow as pa
-import ray
-from ray.types import ObjectRef
-
-from deltacat import logs
-from deltacat.compute.compactor import PrimaryKeyIndexVersionLocator
-from deltacat.compute.compactor.utils import primary_key_index as pki
-
-logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
-
-
-def group_file_records_by_pk_hash_bucket(
-    pki_table: pa.Table, num_buckets: int
-) -> np.ndarray:
-    # generate the new table for each new hash bucket
-    hash_bucket_to_indices = pki.group_record_indices_by_hash_bucket(
-        pki_table,
-        num_buckets,
-    )
-    hash_bucket_to_table = np.empty([num_buckets], dtype="object")
-    for hash_bucket, indices in enumerate(hash_bucket_to_indices):
-        if indices:
-            hash_bucket_to_table[hash_bucket] = pki_table.take(indices)
-    return hash_bucket_to_table
-
-
-@ray.remote(num_cpus=1, num_returns=2)
-def rehash_bucket(
-    hash_bucket_index: int,
-    s3_bucket: str,
-    old_pki_version_locator: PrimaryKeyIndexVersionLocator,
-    num_buckets: int,
-    num_groups: int,
-) -> Tuple[np.ndarray, List[ObjectRef]]:
-
-    logger.info(f"Starting rehash bucket task...")
-    tables = pki.download_hash_bucket_entries(
-        s3_bucket,
-        hash_bucket_index,
-        old_pki_version_locator,
-    )
-    prior_pk_index_table = pa.concat_tables(tables)
-    hash_bucket_to_table = group_file_records_by_pk_hash_bucket(
-        prior_pk_index_table,
-        num_buckets,
-    )
-    hash_bucket_group_to_obj_id, object_refs = pki.group_hash_bucket_indices(
-        hash_bucket_to_table,
-        num_buckets,
-        num_groups,
-    )
-    logger.info(f"Finished rehash bucket task...")
-    return hash_bucket_group_to_obj_id, object_refs
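For readers tracking what was removed: the deleted group_file_records_by_pk_hash_bucket followed a common "index lists per bucket, then Table.take" pattern. A minimal sketch of that pattern with a made-up bucket assignment (plain pyarrow/numpy, not the deltacat helpers it actually called):

import numpy as np
import pyarrow as pa

num_buckets = 3
pki_table = pa.table({"pk": ["a", "b", "c", "d"]})
bucket_of_record = [0, 2, 0, 1]  # hypothetical record index -> hash bucket

hash_bucket_to_indices = [[] for _ in range(num_buckets)]
for record_index, bucket in enumerate(bucket_of_record):
    hash_bucket_to_indices[bucket].append(record_index)

# Object array of per-bucket tables; buckets with no records stay None.
hash_bucket_to_table = np.empty([num_buckets], dtype="object")
for bucket, indices in enumerate(hash_bucket_to_indices):
    if indices:
        hash_bucket_to_table[bucket] = pki_table.take(indices)

print(hash_bucket_to_table[0].to_pydict())  # {'pk': ['a', 'c']}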
deltacat/compute/compactor/steps/rehash/rewrite_index.py
DELETED
@@ -1,48 +0,0 @@
-import logging
-from collections import defaultdict
-from typing import Any, List, Tuple
-
-import pyarrow as pa
-import ray
-from ray import cloudpickle
-from ray.types import ObjectRef
-
-from deltacat import logs
-from deltacat.compute.compactor import PrimaryKeyIndexVersionLocator, PyArrowWriteResult
-from deltacat.compute.compactor.utils import primary_key_index as pki
-
-logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
-
-
-@ray.remote(num_cpus=1, num_returns=2)
-def rewrite_index(
-    object_ids: List[Any],
-    s3_bucket: str,
-    new_primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
-    max_records_per_index_file: int,
-) -> Tuple[PyArrowWriteResult, List[ObjectRef]]:
-
-    logger.info(f"Starting rewrite primary key index task...")
-    object_refs = [cloudpickle.loads(obj_id_pkl) for obj_id_pkl in object_ids]
-    logger.info(f"Getting table groups object refs...")
-    table_groups_list = ray.get(object_refs)
-    logger.info(f"Got {len(table_groups_list)} table groups object refs...")
-    hb_index_to_tables = defaultdict(list)
-    for table_groups in table_groups_list:
-        for hb_index, table in enumerate(table_groups):
-            if table is not None:
-                hb_index_to_tables[hb_index].append(table)
-    logger.info(f"Running {len(hb_index_to_tables)} rewrite index rounds...")
-    pki_stats = []
-    for hb_index, tables in hb_index_to_tables.items():
-        table = pa.concat_tables(tables)
-        hb_pki_stats = pki.write_primary_key_index_files(
-            table,
-            new_primary_key_index_version_locator,
-            s3_bucket,
-            hb_index,
-            max_records_per_index_file,
-        )
-        pki_stats.append(hb_pki_stats)
-    logger.info(f"Finished rewrite primary key index task...")
-    return PyArrowWriteResult.union(pki_stats), object_refs
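The deleted rewrite_index task received its inputs as cloudpickled Ray ObjectRefs and resolved them with ray.get. A minimal sketch of that hand-off pattern (standalone Ray, with a plain dict standing in for a pyarrow.Table):

import ray
from ray import cloudpickle

ray.init(ignore_reinit_error=True)

table_ref = ray.put({"pk": [1, 2, 3]})       # stand-in for a hash-bucketed pyarrow.Table
obj_id_pkl = cloudpickle.dumps(table_ref)    # what the producer task handed around

restored_ref = cloudpickle.loads(obj_id_pkl)  # what rewrite_index did per object id
print(ray.get(restored_ref))                  # materialize the table group

ray.shutdown()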
{deltacat-0.1.18b12.dist-info → deltacat-0.1.18b13.dist-info}/LICENSE
File without changes
{deltacat-0.1.18b12.dist-info → deltacat-0.1.18b13.dist-info}/WHEEL
File without changes
{deltacat-0.1.18b12.dist-info → deltacat-0.1.18b13.dist-info}/top_level.txt
File without changes