deltacat 0.1.18b13__py3-none-any.whl → 0.1.18b15__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- deltacat/__init__.py +3 -2
- deltacat/aws/clients.py +123 -3
- deltacat/aws/redshift/model/manifest.py +4 -0
- deltacat/aws/s3u.py +24 -1
- deltacat/benchmarking/benchmark_parquet_reads.py +53 -0
- deltacat/benchmarking/conftest.py +61 -0
- deltacat/catalog/delegate.py +1 -1
- deltacat/catalog/interface.py +1 -1
- deltacat/compute/compactor/__init__.py +0 -3
- deltacat/compute/compactor/compaction_session.py +45 -20
- deltacat/compute/compactor/model/compact_partition_params.py +287 -58
- deltacat/compute/compactor/model/compaction_session_audit_info.py +150 -9
- deltacat/compute/compactor/model/delta_annotated.py +91 -9
- deltacat/compute/compactor/model/delta_file_envelope.py +15 -3
- deltacat/compute/compactor/model/primary_key_index.py +1 -1
- deltacat/compute/compactor/model/round_completion_info.py +17 -1
- deltacat/compute/compactor/repartition_session.py +5 -3
- deltacat/compute/compactor/steps/dedupe.py +10 -8
- deltacat/compute/compactor/steps/hash_bucket.py +25 -4
- deltacat/compute/compactor/steps/materialize.py +11 -6
- deltacat/compute/compactor/steps/repartition.py +16 -1
- deltacat/compute/compactor/utils/io.py +40 -23
- deltacat/compute/compactor/utils/primary_key_index.py +1 -15
- deltacat/compute/compactor/utils/sort_key.py +57 -0
- deltacat/compute/compactor/utils/system_columns.py +43 -0
- deltacat/compute/compactor_v2/compaction_session.py +506 -0
- deltacat/compute/compactor_v2/constants.py +34 -0
- deltacat/compute/compactor_v2/model/__init__.py +0 -0
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
- deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
- deltacat/compute/compactor_v2/model/merge_input.py +127 -0
- deltacat/compute/compactor_v2/model/merge_result.py +12 -0
- deltacat/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
- deltacat/compute/compactor_v2/steps/merge.py +41 -0
- deltacat/compute/compactor_v2/utils/__init__.py +0 -0
- deltacat/compute/compactor_v2/utils/content_type_params.py +37 -0
- deltacat/compute/compactor_v2/utils/io.py +149 -0
- deltacat/compute/compactor_v2/utils/primary_key_index.py +308 -0
- deltacat/compute/compactor_v2/utils/task_options.py +228 -0
- deltacat/compute/metastats/meta_stats.py +4 -2
- deltacat/compute/metastats/stats.py +1 -0
- deltacat/compute/metastats/utils/io.py +4 -0
- deltacat/compute/stats/utils/io.py +20 -5
- deltacat/exceptions.py +4 -0
- deltacat/io/memcached_object_store.py +37 -14
- deltacat/logs.py +4 -3
- deltacat/storage/__init__.py +3 -0
- deltacat/storage/interface.py +11 -2
- deltacat/storage/model/sort_key.py +33 -0
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/types.py +2 -1
- deltacat/tests/aws/__init__.py +0 -0
- deltacat/tests/aws/test_clients.py +80 -0
- deltacat/tests/compute/__init__.py +0 -0
- deltacat/tests/compute/common.py +96 -0
- deltacat/tests/compute/compactor/__init__.py +0 -0
- deltacat/tests/compute/compactor/steps/__init__.py +0 -0
- deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +22 -8
- deltacat/tests/compute/compactor/utils/__init__.py +0 -0
- deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
- deltacat/tests/compute/compactor_v2/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +199 -0
- deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
- deltacat/tests/compute/test_compaction_session_incremental.py +348 -0
- deltacat/tests/compute/testcases.py +390 -0
- deltacat/tests/io/test_memcached_object_store.py +5 -4
- deltacat/tests/local_deltacat_storage/__init__.py +1109 -0
- deltacat/tests/test_utils/pyarrow.py +32 -0
- deltacat/tests/test_utils/utils.py +13 -0
- deltacat/tests/utils/data/__init__.py +0 -0
- deltacat/tests/utils/test_daft.py +76 -0
- deltacat/tests/utils/test_pyarrow.py +133 -0
- deltacat/tests/utils/test_resources.py +23 -20
- deltacat/types/media.py +1 -0
- deltacat/types/partial_download.py +82 -0
- deltacat/types/tables.py +1 -0
- deltacat/utils/arguments.py +26 -0
- deltacat/utils/daft.py +87 -0
- deltacat/utils/performance.py +4 -2
- deltacat/utils/placement.py +20 -3
- deltacat/utils/pyarrow.py +213 -1
- deltacat/utils/ray_utils/concurrency.py +26 -1
- deltacat/utils/resources.py +72 -1
- deltacat/utils/s3fs.py +21 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/METADATA +27 -13
- deltacat-0.1.18b15.dist-info/RECORD +176 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/model/sort_key.py +0 -98
- deltacat-0.1.18b13.dist-info/RECORD +0 -136
- /deltacat/{tests/compactor → benchmarking}/__init__.py +0 -0
- /deltacat/{tests/compactor/utils → compute/compactor_v2}/__init__.py +0 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor/utils/io.py

```diff
@@ -10,11 +10,12 @@ from deltacat.constants import (
 from deltacat.storage import (
     PartitionLocator,
     Delta,
+    ManifestEntry,
     interface as unimplemented_deltacat_storage,
 )
 from deltacat import logs
 from deltacat.compute.compactor import DeltaAnnotated
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union, Any
 from deltacat.compute.compactor import HighWatermark
 from deltacat.compute.compactor.model.compaction_session_audit_info import (
     CompactionSessionAuditInfo,
@@ -31,23 +32,30 @@ def discover_deltas(
     rebase_source_partition_locator: Optional[PartitionLocator],
     rebase_source_partition_high_watermark: Optional[int],
     deltacat_storage=unimplemented_deltacat_storage,
-
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
+    list_deltas_kwargs: Optional[Dict[str, Any]] = {},
 ) -> Tuple[List[Delta], int]:
-
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     # Source One: new deltas from uncompacted table for incremental compaction or deltas from compacted table for rebase
-    input_deltas = _discover_deltas(
-        source_partition_locator,
+    start_position_exclusive = (
         high_watermark.get(source_partition_locator)
         if isinstance(high_watermark, dict)
-        else high_watermark,
+        else high_watermark
+    )
+    input_deltas = _discover_deltas(
+        source_partition_locator,
+        start_position_exclusive,
         last_stream_position_to_compact
         if not rebase_source_partition_locator
         else deltacat_storage.get_partition(
             source_partition_locator.stream_locator,
             source_partition_locator.partition_values,
+            **deltacat_storage_kwargs,
         ).stream_position,
         deltacat_storage,
-
+        deltacat_storage_kwargs,
+        list_deltas_kwargs,
     )
 
     # Source Two: delta from compacted table for incremental compaction or new deltas from uncompacted table for rebase
```
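Note the pattern introduced here: the new `deltacat_storage_kwargs` and `list_deltas_kwargs` parameters default to a mutable `{}`, and the body still normalizes `None` back to `{}` so that callers may pass either. A minimal sketch of the same guard-and-forward pattern, using hypothetical names (`fetch_stream_position`, `storage`, `storage_kwargs`) rather than the deltacat API:

```python
from typing import Any, Dict, Optional


def fetch_stream_position(
    storage,  # any object exposing get_partition(...)
    stream_locator,
    partition_values,
    storage_kwargs: Optional[Dict[str, Any]] = None,
) -> int:
    # Normalize None to {} so an explicit storage_kwargs=None behaves
    # the same as omitting the argument (mirrors the guard in the diff).
    if storage_kwargs is None:
        storage_kwargs = {}
    # Forward the caller-supplied kwargs verbatim to the storage backend.
    partition = storage.get_partition(
        stream_locator, partition_values, **storage_kwargs
    )
    return partition.stream_position
```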
```diff
@@ -56,6 +64,7 @@ def discover_deltas(
     compacted_partition = deltacat_storage.get_partition(
         compacted_partition_locator.stream_locator,
         compacted_partition_locator.partition_values,
+        **deltacat_storage_kwargs,
     )
     previous_last_stream_position_compacted = (
         compacted_partition.stream_position if compacted_partition else -1
@@ -67,7 +76,8 @@ def discover_deltas(
         None,
         previous_last_stream_position_compacted,
         deltacat_storage,
-
+        deltacat_storage_kwargs,
+        list_deltas_kwargs,
     )
     logger.info(
         f"Length of input deltas from uncompacted table {len(input_deltas)} up to {last_stream_position_to_compact},"
@@ -80,7 +90,8 @@ def discover_deltas(
         rebase_source_partition_high_watermark,
         last_stream_position_to_compact,
         deltacat_storage,
-
+        deltacat_storage_kwargs,
+        list_deltas_kwargs,
     )
     logger.info(
         f"Length of input deltas from uncompacted table {len(input_deltas_new)} up to {last_stream_position_to_compact},"
@@ -99,6 +110,8 @@ def limit_input_deltas(
     input_deltas_stats: Dict[int, DeltaStats],
     compaction_audit: CompactionSessionAuditInfo,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> Tuple[List[DeltaAnnotated], int, HighWatermark, bool]:
     # TODO (pdames): when row counts are available in metadata, use them
     # instead of bytes - memory consumption depends more on number of
@@ -108,6 +121,8 @@ def limit_input_deltas(
     # this assumption could be removed, but we'd still need to know the max
     # resources we COULD get for this cluster, and the amount of memory
     # available per CPU should remain fixed across the cluster.
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     worker_cpus = int(cluster_resources["CPU"])
     worker_obj_store_mem = float(cluster_resources["object_store_memory"])
     logger.info(f"Total worker object store memory: {worker_obj_store_mem}")
@@ -135,7 +150,7 @@ def limit_input_deltas(
         for stream_pos, delta_stats in input_deltas_stats.items()
     }
     for delta in input_deltas:
-        manifest = deltacat_storage.get_delta_manifest(delta)
+        manifest = deltacat_storage.get_delta_manifest(delta, **deltacat_storage_kwargs)
         delta.manifest = manifest
         position = delta.stream_position
         delta_stats = input_deltas_stats.get(delta.stream_position, DeltaStats())
```
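Threading `**deltacat_storage_kwargs` into `get_delta_manifest` (and every other storage call) keeps call sites backend-agnostic: whatever connection parameters a pluggable storage implementation needs arrive as keyword arguments. A hedged sketch; the class and its `db_file_path` parameter are illustrative, not the actual deltacat storage interface:

```python
# Illustrative pluggable backend (not the real deltacat interface): the
# kwargs forwarded at each call site via **deltacat_storage_kwargs arrive
# here as ordinary keyword arguments.
class SqliteBackedStorage:
    def get_delta_manifest(self, delta, *, db_file_path=None, **kwargs):
        # A local test backend might open its database at db_file_path;
        # an S3-backed implementation would read different kwargs instead.
        print(f"reading manifest for {delta} from {db_file_path}")


storage = SqliteBackedStorage()
deltacat_storage_kwargs = {"db_file_path": "/tmp/deltacat.db"}

# The call site stays backend-agnostic:
storage.get_delta_manifest("delta-1", **deltacat_storage_kwargs)
```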
```diff
@@ -258,6 +273,8 @@ def fit_input_deltas(
     compaction_audit: CompactionSessionAuditInfo,
     hash_bucket_count: Optional[int],
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+    **kwargs,
 ) -> Tuple[List[DeltaAnnotated], int, HighWatermark, bool]:
     """
     This method tries to fit all the input deltas to run into the existing cluster. Contrary to
@@ -277,6 +294,8 @@ def fit_input_deltas(
     Tuple of list of annotated deltas, recommended hash bucket count, high watermark,
     and whether multiple rounds are required (which is always False)
     """
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     worker_cpus = int(cluster_resources["CPU"])
     total_memory = float(cluster_resources["memory"])
     high_watermark = HighWatermark()
@@ -306,8 +325,8 @@ def fit_input_deltas(
     # We assume that the cluster is capable of distributing all tasks
     # correctly. Hence, the correct in-memory size will be in the ratio of
     # in-disk size.
-    def estimate_size(content_length):
-        return (content_length * 1.0 / delta_bytes) * total_memory
+    def estimate_size(manifest_entry: ManifestEntry):
+        return (manifest_entry.meta.content_length * 1.0 / delta_bytes) * total_memory
 
     # Assuming each CPU consumes equal amount of memory
     min_delta_bytes = total_memory / worker_cpus
```
```diff
@@ -341,18 +360,16 @@ def _discover_deltas(
     start_position_exclusive: Optional[int],
     end_position_inclusive: int,
     deltacat_storage=unimplemented_deltacat_storage,
-
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = {},
+    list_deltas_kwargs: Optional[Dict[str, Any]] = {},
 ) -> List[Delta]:
-
-
-
-
-
-    deltas_list_result = deltacat_storage.list_deltas(
-
-        table_name=table_name,
-        partition_values=partition_values,
-        table_version=table_version,
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
+
+    kwargs = {**deltacat_storage_kwargs, **list_deltas_kwargs}
+
+    deltas_list_result = deltacat_storage.list_partition_deltas(
+        partition_like=source_partition_locator,
         first_stream_position=start_position_exclusive,
         last_stream_position=end_position_inclusive,
         ascending_order=True,
```
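The `{**deltacat_storage_kwargs, **list_deltas_kwargs}` merge uses standard dict-unpacking semantics: on a key collision, the right-hand dict wins. The keys below are invented for illustration:

```python
deltacat_storage_kwargs = {"timeout": 30, "region": "us-east-1"}
list_deltas_kwargs = {"timeout": 60, "include_manifest": False}

# Later unpackings override earlier ones, so "timeout" resolves to 60.
kwargs = {**deltacat_storage_kwargs, **list_deltas_kwargs}
print(kwargs)
# {'timeout': 60, 'region': 'us-east-1', 'include_manifest': False}
```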
deltacat/compute/compactor/utils/primary_key_index.py

```diff
@@ -6,26 +6,12 @@ import pyarrow as pa
 from ray.types import ObjectRef
 
 from deltacat import logs
-from deltacat.aws import s3u
-from deltacat.compute.compactor import (
-    PrimaryKeyIndexVersionLocator,
-)
 from deltacat.compute.compactor.utils import system_columns as sc
 from deltacat.io.object_store import IObjectStore
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
-
-def delete_primary_key_index_version(
-    s3_bucket: str, pki_version_locator: PrimaryKeyIndexVersionLocator
-) -> None:
-
-    logger.info(f"Deleting primary key index: {pki_version_locator}")
-    s3u.delete_files_by_prefix(
-        s3_bucket,
-        pki_version_locator.primary_key_index_version_root_path,
-    )
-    logger.info(f"Primary key index deleted: {pki_version_locator}")
+# TODO: Deprecate this module in the favor of compactor_v2
 
 
 def group_record_indices_by_hash_bucket(
```
deltacat/compute/compactor/utils/sort_key.py (new file)

```diff
@@ -0,0 +1,57 @@
+import pyarrow as pa
+from typing import List
+from deltacat.storage import PartitionLocator, SortKey
+
+MAX_SORT_KEYS_BIT_WIDTH = 256
+
+
+def validate_sort_keys(
+    source_partition_locator: PartitionLocator,
+    sort_keys: List[SortKey],
+    deltacat_storage,
+    deltacat_storage_kwargs,
+    **kwargs,
+) -> int:
+    """
+    Validates the input sort keys to ensure that they are unique, are using
+    a valid sort key model, are all fixed-width data types, and that the
+    sum of bit widths across sort key data types is less-than-or-equal-to
+    256. Returns the sum of bit widths across all sort keys.
+    """
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
+    total_sort_keys_bit_width = 0
+    if sort_keys:
+        sort_key_names = [key.key_name for key in sort_keys]
+        assert len(sort_key_names) == len(
+            set(sort_key_names)
+        ), f"Sort key names must be unique: {sort_key_names}"
+        stream_locator = source_partition_locator.stream_locator
+        table_version_schema = deltacat_storage.get_table_version_schema(
+            stream_locator.namespace,
+            stream_locator.table_name,
+            stream_locator.table_version,
+            **deltacat_storage_kwargs,
+        )
+        if isinstance(table_version_schema, pa.Schema):
+            for sort_key_name in sort_key_names:
+                pa_field: pa.Field = pa.Schema.field(sort_key_name)
+                pa_type: pa.DataType = pa_field.type
+                try:
+                    total_sort_keys_bit_width += pa_type.bit_width
+                    if total_sort_keys_bit_width > MAX_SORT_KEYS_BIT_WIDTH:
+                        raise ValueError(
+                            f"Total length of sort keys "
+                            f"({total_sort_keys_bit_width}) is greater "
+                            f"than the max supported bit width for all "
+                            f"sort keys ({MAX_SORT_KEYS_BIT_WIDTH})"
+                        )
+                except ValueError as e:
+                    raise ValueError(
+                        f"Unable to get bit width of sort key: {pa_field}. "
+                        f"Please ensure that all sort keys are fixed-size "
+                        f"PyArrow data types."
+                    ) from e
+        else:
+            total_sort_keys_bit_width = MAX_SORT_KEYS_BIT_WIDTH
+    return total_sort_keys_bit_width
```
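The `try/except ValueError` in `validate_sort_keys` exists because PyArrow defines `bit_width` only for fixed-width types; variable-width types such as `pa.string()` raise `ValueError` on access. A short demonstration (exact error text may vary by PyArrow version):

```python
import pyarrow as pa

print(pa.int64().bit_width)      # 64
print(pa.float32().bit_width)    # 32
print(4 * pa.int64().bit_width)  # 256 -> exactly the MAX_SORT_KEYS_BIT_WIDTH budget

try:
    pa.string().bit_width  # variable-width type
except ValueError as e:
    print(e)  # e.g. "Non-fixed width type"
```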
deltacat/compute/compactor/utils/system_columns.py

```diff
@@ -22,6 +22,13 @@ _PK_HASH_COLUMN_FIELD = pa.field(
     _PK_HASH_COLUMN_TYPE,
 )
 
+_PK_HASH_STRING_COLUMN_NAME = _get_sys_col_name("hash_str")
+_PK_HASH_STRING_COLUMN_TYPE = pa.string()
+_PK_HASH_STRING_COLUMN_FIELD = pa.field(
+    _PK_HASH_STRING_COLUMN_NAME,
+    _PK_HASH_STRING_COLUMN_TYPE,
+)
+
 _DEDUPE_TASK_IDX_COLUMN_NAME = _get_sys_col_name("dedupe_task_idx")
 _DEDUPE_TASK_IDX_COLUMN_TYPE = pa.int32()
 _DEDUPE_TASK_IDX_COLUMN_FIELD = pa.field(
@@ -36,6 +43,12 @@ _PARTITION_STREAM_POSITION_COLUMN_FIELD = pa.field(
     _PARTITION_STREAM_POSITION_COLUMN_TYPE,
 )
 
+_HASH_BUCKET_IDX_COLUMN_NAME = _get_sys_col_name("hash_bucket_idx")
+_HASH_BUCKET_IDX_COLUMN_TYPE = pa.int32()
+_HASH_BUCKET_IDX_COLUMN_FIELD = pa.field(
+    _HASH_BUCKET_IDX_COLUMN_NAME, _HASH_BUCKET_IDX_COLUMN_TYPE
+)
+
 _ORDERED_FILE_IDX_COLUMN_NAME = _get_sys_col_name("file_index")
 _ORDERED_FILE_IDX_COLUMN_TYPE = pa.int32()
 _ORDERED_FILE_IDX_COLUMN_FIELD = pa.field(
@@ -76,10 +89,18 @@ def get_pk_hash_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
     return pa.array(obj, _PK_HASH_COLUMN_TYPE)
 
 
+def get_pk_hash_string_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
+    return pa.array(obj, _PK_HASH_STRING_COLUMN_TYPE)
+
+
 def pk_hash_column_np(table: pa.Table) -> np.ndarray:
     return table[_PK_HASH_COLUMN_NAME].to_numpy()
 
 
+def pk_hash_string_column_np(table: pa.Table) -> np.ndarray:
+    return table[_PK_HASH_STRING_COLUMN_NAME].to_numpy()
+
+
 def pk_hash_column(table: pa.Table) -> pa.ChunkedArray:
     return table[_PK_HASH_COLUMN_NAME]
 
@@ -143,6 +164,10 @@ def get_delta_type_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
     )
 
 
+def get_hash_bucket_idx_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
+    return pa.array(obj, _HASH_BUCKET_IDX_COLUMN_TYPE)
+
+
 def get_is_source_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
     return pa.array(
         obj,
@@ -232,6 +257,24 @@ def append_pk_hash_column(table: pa.Table, pk_hashes) -> pa.Table:
     return table
 
 
+def append_pk_hash_string_column(table: pa.Table, pk_hashes) -> pa.Table:
+
+    table = table.append_column(
+        _PK_HASH_STRING_COLUMN_FIELD, get_pk_hash_string_column_array(pk_hashes)
+    )
+    return table
+
+
+def append_hash_bucket_idx_col(table: pa.Table, hash_bucket_indexes) -> pa.Table:
+
+    table = table.append_column(
+        _HASH_BUCKET_IDX_COLUMN_FIELD,
+        get_hash_bucket_idx_column_array(hash_bucket_indexes),
+    )
+
+    return table
+
+
 def append_record_idx_col(table: pa.Table, ordered_record_indices) -> pa.Table:
 
     table = table.append_column(
```