deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +41 -15
- deltacat/aws/clients.py +12 -31
- deltacat/aws/constants.py +1 -1
- deltacat/aws/redshift/__init__.py +7 -2
- deltacat/aws/redshift/model/manifest.py +54 -50
- deltacat/aws/s3u.py +176 -187
- deltacat/catalog/delegate.py +151 -185
- deltacat/catalog/interface.py +78 -97
- deltacat/catalog/model/catalog.py +21 -21
- deltacat/catalog/model/table_definition.py +11 -9
- deltacat/compute/compactor/__init__.py +12 -16
- deltacat/compute/compactor/compaction_session.py +237 -166
- deltacat/compute/compactor/model/delta_annotated.py +60 -44
- deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
- deltacat/compute/compactor/model/delta_file_locator.py +10 -8
- deltacat/compute/compactor/model/materialize_result.py +6 -7
- deltacat/compute/compactor/model/primary_key_index.py +38 -34
- deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
- deltacat/compute/compactor/model/round_completion_info.py +25 -19
- deltacat/compute/compactor/model/sort_key.py +18 -15
- deltacat/compute/compactor/steps/dedupe.py +119 -94
- deltacat/compute/compactor/steps/hash_bucket.py +48 -47
- deltacat/compute/compactor/steps/materialize.py +86 -92
- deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
- deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
- deltacat/compute/compactor/utils/io.py +59 -47
- deltacat/compute/compactor/utils/primary_key_index.py +91 -80
- deltacat/compute/compactor/utils/round_completion_file.py +22 -23
- deltacat/compute/compactor/utils/system_columns.py +33 -45
- deltacat/compute/metastats/meta_stats.py +235 -157
- deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
- deltacat/compute/metastats/stats.py +95 -64
- deltacat/compute/metastats/utils/io.py +100 -53
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
- deltacat/compute/metastats/utils/ray_utils.py +38 -33
- deltacat/compute/stats/basic.py +107 -69
- deltacat/compute/stats/models/delta_column_stats.py +11 -8
- deltacat/compute/stats/models/delta_stats.py +59 -32
- deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
- deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
- deltacat/compute/stats/models/stats_result.py +24 -14
- deltacat/compute/stats/utils/intervals.py +16 -9
- deltacat/compute/stats/utils/io.py +86 -51
- deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
- deltacat/constants.py +4 -13
- deltacat/io/__init__.py +2 -2
- deltacat/io/aws/redshift/redshift_datasource.py +157 -143
- deltacat/io/dataset.py +14 -17
- deltacat/io/read_api.py +36 -33
- deltacat/logs.py +94 -42
- deltacat/storage/__init__.py +18 -8
- deltacat/storage/interface.py +196 -213
- deltacat/storage/model/delta.py +45 -51
- deltacat/storage/model/list_result.py +12 -8
- deltacat/storage/model/namespace.py +4 -5
- deltacat/storage/model/partition.py +42 -42
- deltacat/storage/model/stream.py +29 -30
- deltacat/storage/model/table.py +14 -14
- deltacat/storage/model/table_version.py +32 -31
- deltacat/storage/model/types.py +1 -0
- deltacat/tests/stats/test_intervals.py +11 -24
- deltacat/tests/utils/__init__.py +0 -0
- deltacat/tests/utils/test_record_batch_tables.py +284 -0
- deltacat/types/media.py +3 -4
- deltacat/types/tables.py +31 -21
- deltacat/utils/common.py +5 -11
- deltacat/utils/numpy.py +20 -22
- deltacat/utils/pandas.py +73 -100
- deltacat/utils/performance.py +3 -9
- deltacat/utils/placement.py +259 -230
- deltacat/utils/pyarrow.py +302 -89
- deltacat/utils/ray_utils/collections.py +2 -1
- deltacat/utils/ray_utils/concurrency.py +27 -28
- deltacat/utils/ray_utils/dataset.py +28 -28
- deltacat/utils/ray_utils/performance.py +5 -9
- deltacat/utils/ray_utils/runtime.py +9 -10
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
- deltacat-0.1.12.dist-info/RECORD +110 -0
- deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor/steps/materialize.py

```diff
@@ -1,14 +1,11 @@
 import logging
-import
-import pyarrow as pa
-
+import time
 from collections import defaultdict
 from itertools import chain, repeat
 from typing import List, Optional, Tuple

 import pyarrow as pa
 import ray
-from pyarrow import compute as pc
 from ray import cloudpickle

 from deltacat import logs
@@ -21,14 +18,15 @@ from deltacat.compute.compactor.steps.dedupe import (
     DedupeTaskIndexWithObjectId,
     DeltaFileLocatorToRecords,
 )
-from deltacat.compute.compactor.utils import system_columns as sc
 from deltacat.storage import Delta, DeltaLocator, Partition, PartitionLocator
 from deltacat.storage import interface as unimplemented_deltacat_storage
 from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES, ContentType
 from deltacat.types.tables import TABLE_CLASS_TO_SIZE_FUNC
+from deltacat.utils.performance import timed_invocation
 from deltacat.utils.pyarrow import (
     ReadKwargsProviderPyArrowCsvPureUtf8,
     ReadKwargsProviderPyArrowSchemaOverride,
+    RecordBatchTables,
 )

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
@@ -36,47 +34,49 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

 @ray.remote
 def materialize(
-
-
-
-
-
-
-
-
-
-
-
-
-
-        compacted_tables_size = sum([TABLE_CLASS_TO_SIZE_FUNC[type(tbl)](tbl)
-                                     for tbl in compacted_tables])
-        logger.debug(f"Uploading {len(compacted_tables)} compacted tables "
-                     f"with size: {compacted_tables_size} bytes "
-                     f"and record count: {compacted_tables_record_count}")
+    source_partition_locator: PartitionLocator,
+    round_completion_info: Optional[RoundCompletionInfo],
+    partition: Partition,
+    mat_bucket_index: int,
+    dedupe_task_idx_and_obj_id_tuples: List[DedupeTaskIndexWithObjectId],
+    max_records_per_output_file: int,
+    compacted_file_content_type: ContentType,
+    schema: Optional[pa.Schema] = None,
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> MaterializeResult:
+    # TODO (rkenmi): Add docstrings for the steps in the compaction workflow
+    # https://github.com/ray-project/deltacat/issues/79
+    def _materialize(compacted_tables: List[pa.Table]) -> MaterializeResult:
         compacted_table = pa.concat_tables(compacted_tables)
+
         if compacted_file_content_type in DELIMITED_TEXT_CONTENT_TYPES:
-            # TODO (
+            # TODO (rkenmi): Investigate if we still need to convert this table to pandas DataFrame
             # TODO (pdames): compare performance to pandas-native materialize path
-            df = compacted_table.to_pandas(
-                split_blocks=True,
-                self_destruct=True,
-                zero_copy_only=True
-            )
+            df = compacted_table.to_pandas(split_blocks=True, self_destruct=True)
             compacted_table = df
-        delta =
+        delta, stage_delta_time = timed_invocation(
+            deltacat_storage.stage_delta,
             compacted_table,
             partition,
             max_records_per_entry=max_records_per_output_file,
             content_type=compacted_file_content_type,
         )
+        compacted_table_size = TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](
+            compacted_table
+        )
+        logger.debug(
+            f"Time taken for materialize task"
+            f" to upload {len(compacted_table)} records"
+            f" of size {compacted_table_size} is: {stage_delta_time}s"
+        )
         manifest = delta.manifest
         manifest_records = manifest.meta.record_count
-        assert
-
-
-
-
+        assert (
+            manifest_records == len(compacted_table),
+            f"Unexpected Error: Materialized delta manifest record count "
+            f"({manifest_records}) does not equal compacted table record count "
+            f"({len(compacted_table)})",
+        )
         materialize_result = MaterializeResult.of(
             delta,
             mat_bucket_index,
@@ -92,17 +92,20 @@ def materialize(
         logger.info(f"Materialize result: {materialize_result}")
         return materialize_result

-    logger.info(
+    logger.info(
+        f"Starting materialize task with"
+        f" materialize bucket index: {mat_bucket_index}..."
+    )
+    start = time.time()
     dedupe_task_idx_and_obj_ref_tuples = [
         (
             t1,
             cloudpickle.loads(t2),
-        )
+        )
+        for t1, t2 in dedupe_task_idx_and_obj_id_tuples
     ]
     logger.info(f"Resolved materialize task obj refs...")
-    dedupe_task_indices, obj_refs = zip(
-        *dedupe_task_idx_and_obj_ref_tuples
-    )
+    dedupe_task_indices, obj_refs = zip(*dedupe_task_idx_and_obj_ref_tuples)
     # this depends on `ray.get` result order matching input order, as per the
     # contract established in: https://github.com/ray-project/ray/pull/16763
     src_file_records_list = ray.get(list(obj_refs))
@@ -114,21 +117,23 @@ def materialize(
                 (record_numbers, repeat(dedupe_task_idx, len(record_numbers)))
             )
     manifest_cache = {}
-    compacted_tables = []
     materialized_results: List[MaterializeResult] = []
-
+    record_batch_tables = RecordBatchTables(max_records_per_output_file)
     for src_dfl in sorted(all_src_file_records.keys()):
-        record_numbers_dd_task_idx_tpl_list: List[
-
+        record_numbers_dd_task_idx_tpl_list: List[
+            Tuple[DeltaFileLocatorToRecords, repeat]
+        ] = all_src_file_records[src_dfl]
         record_numbers_tpl, dedupe_task_idx_iter_tpl = zip(
             *record_numbers_dd_task_idx_tpl_list
         )
         is_src_partition_file_np = src_dfl.is_source_delta
         src_stream_position_np = src_dfl.stream_position
         src_file_idx_np = src_dfl.file_index
-        src_file_partition_locator =
-
+        src_file_partition_locator = (
+            source_partition_locator
+            if is_src_partition_file_np
             else round_completion_info.compacted_delta_locator.partition_locator
+        )
         delta_locator = DeltaLocator.of(
             src_file_partition_locator,
             src_stream_position_np.item(),
@@ -148,65 +153,54 @@ def materialize(
         # enforce a consistent schema if provided, when reading files into PyArrow tables
         elif schema is not None:
             read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
-                schema=schema
-
+                schema=schema
+            )
+        pa_table, download_delta_manifest_entry_time = timed_invocation(
+            deltacat_storage.download_delta_manifest_entry,
             Delta.of(delta_locator, None, None, None, manifest),
             src_file_idx_np.item(),
             file_reader_kwargs_provider=read_kwargs_provider,
         )
-
-
+        logger.debug(
+            f"Time taken for materialize task"
+            f" to download delta locator {delta_locator} with entry ID {src_file_idx_np.item()}"
+            f" is: {download_delta_manifest_entry_time}s"
+        )
+        mask_pylist = list(repeat(False, len(pa_table)))
         record_numbers = chain.from_iterable(record_numbers_tpl)
         # TODO(raghumdani): reference the same file URIs while writing the files
-        # instead of copying the data over and creating new files.
+        # instead of copying the data over and creating new files.
         for record_number in record_numbers:
             mask_pylist[record_number] = True
         mask = pa.array(mask_pylist)
         pa_table = pa_table.filter(mask)
+        record_batch_tables.append(pa_table)
+        if record_batch_tables.has_batches():
+            batched_tables = record_batch_tables.evict()
+            materialized_results.append(_materialize(batched_tables))

-
-
-        # performance than repeatedly filtering the table in dedupe task index
-        # order
-        dedupe_task_indices = chain.from_iterable(dedupe_task_idx_iter_tpl)
-        pa_table = sc.append_dedupe_task_idx_col(
-            pa_table,
-            dedupe_task_indices,
-        )
-        pa_sort_keys = [(sc._DEDUPE_TASK_IDX_COLUMN_NAME, "ascending")]
-        pa_table = pa_table.take(
-            pc.sort_indices(pa_table, sort_keys=pa_sort_keys),
-        )
-        pa_table = pa_table.drop(
-            [sc._DEDUPE_TASK_IDX_COLUMN_NAME]
-        )
-
-        # Write manifests up to max_records_per_output_file
-        # TODO(raghumdani): Write exactly the same number of records into each file to
-        # produce a read-optimized view of the tables.
-        if compacted_tables and \
-                total_record_count + record_count > max_records_per_output_file:
-            materialized_results.append(_materialize(compacted_tables, total_record_count))
-            # Free up written tables in memory
-            compacted_tables.clear()
-            total_record_count = 0
-
-        total_record_count += record_count
-        compacted_tables.append(pa_table)
-
-    materialized_results.append(_materialize(compacted_tables, total_record_count))
-    # Free up written tables in memory
-    compacted_tables.clear()
+    if record_batch_tables.has_remaining():
+        materialized_results.append(_materialize(record_batch_tables.remaining))

     merged_delta = Delta.merge_deltas([mr.delta for mr in materialized_results])
-    assert
-
-
+    assert (
+        materialized_results and len(materialized_results) > 0
+    ), f"Expected at least one materialized result in materialize step."
+
+    write_results = [mr.pyarrow_write_result for mr in materialized_results]
+    logger.debug(
+        f"{len(write_results)} files written"
+        f" with records: {[wr.records for wr in write_results]}"
+    )
     # Merge all new deltas into one for this materialize bucket index
-    merged_materialize_result = MaterializeResult.of(
-
-
-
-
+    merged_materialize_result = MaterializeResult.of(
+        merged_delta,
+        materialized_results[0].task_index,
+        PyArrowWriteResult.union(
+            [mr.pyarrow_write_result for mr in materialized_results]
+        ),
+    )
     logger.info(f"Finished materialize task...")
+    end = time.time()
+    logger.info(f"Materialize task ended in {end - start}s")
     return merged_materialize_result
```
deltacat/compute/compactor/steps/rehash/rehash_bucket.py

```diff
@@ -1,22 +1,21 @@
 import logging
-import
-import pyarrow as pa
-import numpy as np
+from typing import List, Tuple

+import numpy as np
+import pyarrow as pa
+import ray
 from ray.types import ObjectRef

 from deltacat import logs
 from deltacat.compute.compactor import PrimaryKeyIndexVersionLocator
 from deltacat.compute.compactor.utils import primary_key_index as pki

-from typing import List, Tuple
-
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


 def group_file_records_by_pk_hash_bucket(
-
-
+    pki_table: pa.Table, num_buckets: int
+) -> np.ndarray:
     # generate the new table for each new hash bucket
     hash_bucket_to_indices = pki.group_record_indices_by_hash_bucket(
         pki_table,
@@ -29,13 +28,14 @@ def group_file_records_by_pk_hash_bucket(
     return hash_bucket_to_table


-@ray.remote(num_cpus=1,num_returns=2)
+@ray.remote(num_cpus=1, num_returns=2)
 def rehash_bucket(
-
-
-
-
-
+    hash_bucket_index: int,
+    s3_bucket: str,
+    old_pki_version_locator: PrimaryKeyIndexVersionLocator,
+    num_buckets: int,
+    num_groups: int,
+) -> Tuple[np.ndarray, List[ObjectRef]]:

     logger.info(f"Starting rehash bucket task...")
     tables = pki.download_hash_bucket_entries(
```
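As far as the extracted content shows, this hunk is a formatting pass: the `@ray.remote(num_cpus=1, num_returns=2)` decorator gains a space and the `rehash_bucket` signature is reflowed with one typed parameter per line (the `rewrite_index` task below carries the same decorator). For readers unfamiliar with Ray's `num_returns` option, the snippet below is a small, self-contained, hypothetical illustration of how a two-return remote task is declared and consumed; only the Ray API usage mirrors the code above, the function and data are made up.

```python
# Hypothetical example (not DeltaCAT code) showing how a Ray task declared with
# num_returns=2, like rehash_bucket above, is invoked and consumed.
import ray

ray.init(ignore_reinit_error=True)


@ray.remote(num_cpus=1, num_returns=2)
def split_evens_odds(values):
    # A task with num_returns=2 returns two values; the caller receives
    # two separate ObjectRefs, one per declared return value.
    evens = [v for v in values if v % 2 == 0]
    odds = [v for v in values if v % 2 == 1]
    return evens, odds


evens_ref, odds_ref = split_evens_odds.remote(list(range(10)))
print(ray.get(evens_ref))  # [0, 2, 4, 6, 8]
print(ray.get(odds_ref))   # [1, 3, 5, 7, 9]
```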
deltacat/compute/compactor/steps/rehash/rewrite_index.py

```diff
@@ -16,11 +16,11 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

 @ray.remote(num_cpus=1, num_returns=2)
 def rewrite_index(
-
-
-
-
-
+    object_ids: List[Any],
+    s3_bucket: str,
+    new_primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
+    max_records_per_index_file: int,
+) -> Tuple[PyArrowWriteResult, List[ObjectRef]]:

     logger.info(f"Starting rewrite primary key index task...")
     object_refs = [cloudpickle.loads(obj_id_pkl) for obj_id_pkl in object_ids]
```
deltacat/compute/compactor/utils/io.py

```diff
@@ -1,24 +1,23 @@
 import logging
-import time
 import math
-from
-from deltacat.constants import PYARROW_INFLATION_MULTIPLIER, BYTES_PER_MEBIBYTE
+from typing import Dict, List, Optional, Tuple

-from deltacat.storage import PartitionLocator, Delta, \
-    interface as unimplemented_deltacat_storage
 from deltacat import logs
 from deltacat.compute.compactor import DeltaAnnotated
-
-from
+from deltacat.compute.stats.models.delta_stats import DeltaStats
+from deltacat.constants import BYTES_PER_MEBIBYTE, PYARROW_INFLATION_MULTIPLIER
+from deltacat.storage import Delta, PartitionLocator
+from deltacat.storage import interface as unimplemented_deltacat_storage

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


 def discover_deltas(
-
-
-
-
+    source_partition_locator: PartitionLocator,
+    start_position_exclusive: Optional[int],
+    end_position_inclusive: int,
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> List[Delta]:

     stream_locator = source_partition_locator.stream_locator
     namespace = stream_locator.namespace
@@ -36,32 +35,38 @@ def discover_deltas(
     )
     deltas = deltas_list_result.all_items()
     if not deltas:
-        raise RuntimeError(
-
-
-
-
+        raise RuntimeError(
+            f"Unexpected Error: Couldn't find any deltas to "
+            f"compact in delta stream position range "
+            f"('{start_position_exclusive}', "
+            f"'{end_position_inclusive}']. Source partition: "
+            f"{source_partition_locator}"
+        )
     if start_position_exclusive:
         first_delta = deltas.pop(0)
-        logger.info(
-
-
-
-
-
-
+        logger.info(
+            f"Removed exclusive start delta w/ expected stream "
+            f"position '{start_position_exclusive}' from deltas to "
+            f"compact: {first_delta}"
+        )
+        logger.info(
+            f"Count of deltas to compact in delta stream "
+            f"position range ('{start_position_exclusive}', "
+            f"'{end_position_inclusive}']: {len(deltas)}. Source "
+            f"partition: '{source_partition_locator}'"
+        )
     return deltas


 def limit_input_deltas(
-
-
-
-
-
-
-
-
+    input_deltas: List[Delta],
+    cluster_resources: Dict[str, float],
+    hash_bucket_count: int,
+    min_pk_index_pa_bytes: int,
+    user_hash_bucket_chunk_size: int,
+    input_deltas_stats: Dict[int, DeltaStats],
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> Tuple[List[DeltaAnnotated], int, int]:

     # TODO (pdames): when row counts are available in metadata, use them
     # instead of bytes - memory consumption depends more on number of
@@ -78,9 +83,10 @@ def limit_input_deltas(
     # )
     if min_pk_index_pa_bytes > 0:
         required_heap_mem_for_dedupe = worker_obj_store_mem - min_pk_index_pa_bytes
-        assert required_heap_mem_for_dedupe > 0,
-            f"Not enough required memory available to re-batch input deltas"
+        assert required_heap_mem_for_dedupe > 0, (
+            f"Not enough required memory available to re-batch input deltas"
             f"and initiate the dedupe step."
+        )
     # Size of batched deltas must also be reduced to have enough space for primary
     # key index files (from earlier compaction rounds) in the dedupe step, since
     # they will be loaded into worker heap memory.
@@ -88,8 +94,7 @@ def limit_input_deltas(

     logger.info(f"Total worker object store memory: {worker_obj_store_mem}")
     worker_obj_store_mem_per_task = worker_obj_store_mem / worker_cpus
-    logger.info(f"Worker object store memory/task: "
-                f"{worker_obj_store_mem_per_task}")
+    logger.info(f"Worker object store memory/task: " f"{worker_obj_store_mem_per_task}")
     worker_task_mem = cluster_resources["memory"]
     logger.info(f"Total worker memory: {worker_task_mem}")
     # TODO (pdames): ensure fixed memory per CPU in heterogenous clusters
@@ -105,8 +110,10 @@ def limit_input_deltas(
     if input_deltas_stats is None:
         input_deltas_stats = {}

-    input_deltas_stats = {
-
+    input_deltas_stats = {
+        int(stream_pos): DeltaStats(delta_stats)
+        for stream_pos, delta_stats in input_deltas_stats.items()
+    }
     for delta in input_deltas:
         manifest = deltacat_storage.get_delta_manifest(delta)
         delta.manifest = manifest
@@ -118,7 +125,8 @@ def limit_input_deltas(
             # TODO (pdames): ensure pyarrow object fits in per-task obj store mem
             logger.warning(
                 f"Stats are missing for delta stream position {delta.stream_position}, "
-                f"materialized delta may not fit in per-task object store memory."
+                f"materialized delta may not fit in per-task object store memory."
+            )
         manifest_entries = delta.manifest.entries
         delta_manifest_entries += len(manifest_entries)
         for entry in manifest_entries:
@@ -130,13 +138,13 @@ def limit_input_deltas(
             logger.info(
                 f"Input deltas limited to "
                 f"{len(limited_input_da_list)} by object store mem "
-                f"({delta_bytes_pyarrow} > {worker_obj_store_mem})"
+                f"({delta_bytes_pyarrow} > {worker_obj_store_mem})"
+            )
             break
         delta_annotated = DeltaAnnotated.of(delta)
         limited_input_da_list.append(delta_annotated)

-    logger.info(f"Input deltas to compact this round: "
-                f"{len(limited_input_da_list)}")
+    logger.info(f"Input deltas to compact this round: " f"{len(limited_input_da_list)}")
     logger.info(f"Input delta bytes to compact: {delta_bytes}")
     logger.info(f"Input delta files to compact: {delta_manifest_entries}")
     logger.info(f"Latest input delta stream position: {latest_stream_position}")
@@ -146,10 +154,12 @@ def limit_input_deltas(

     # TODO (pdames): determine min hash buckets from size of all deltas
     # (not just deltas for this round)
-    min_hash_bucket_count = int(
-
-
-
+    min_hash_bucket_count = int(
+        max(
+            math.ceil(delta_bytes_pyarrow / worker_obj_store_mem_per_task),
+            min(worker_cpus, 256),
+        )
+    )
     logger.info(f"Minimum recommended hash buckets: {min_hash_bucket_count}")

     if hash_bucket_count is None:
@@ -168,7 +178,8 @@ def limit_input_deltas(
             f"resolve this problem either specify a larger number of hash "
             f"buckets when running compaction, omit a custom hash bucket "
             f"count when running compaction, or provision workers with more "
-            f"task memory per CPU."
+            f"task memory per CPU."
+        )

     hash_bucket_chunk_size = user_hash_bucket_chunk_size
     max_hash_bucket_chunk_size = math.ceil(
@@ -185,7 +196,8 @@ def limit_input_deltas(
             f"specify a smaller hash bucket chunk size when running "
             f"compaction, omit a custom hash bucket chunk size when running "
             f"compaction, or provision workers with more task and object "
-            f"store memory per CPU."
+            f"store memory per CPU."
+        )
     elif not hash_bucket_chunk_size:
         hash_bucket_chunk_size_load_balanced = max(
             math.ceil(max(delta_bytes, delta_bytes_pyarrow) / worker_cpus),
```