deltacat 1.0.2__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor/model/compact_partition_params.py +25 -0
- deltacat/compute/compactor/model/compaction_session_audit_info.py +11 -0
- deltacat/compute/compactor/model/delta_file_envelope.py +21 -3
- deltacat/compute/compactor/model/table_object_store.py +51 -0
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor_v2/compaction_session.py +80 -14
- deltacat/compute/compactor_v2/deletes/__init__.py +0 -0
- deltacat/compute/compactor_v2/deletes/delete_file_envelope.py +83 -0
- deltacat/compute/compactor_v2/deletes/delete_strategy.py +82 -0
- deltacat/compute/compactor_v2/deletes/delete_strategy_equality_delete.py +161 -0
- deltacat/compute/compactor_v2/deletes/model.py +23 -0
- deltacat/compute/compactor_v2/deletes/utils.py +164 -0
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +6 -0
- deltacat/compute/compactor_v2/model/merge_input.py +24 -1
- deltacat/compute/compactor_v2/model/merge_result.py +1 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -6
- deltacat/compute/compactor_v2/steps/merge.py +221 -50
- deltacat/compute/compactor_v2/utils/delta.py +11 -1
- deltacat/compute/compactor_v2/utils/merge.py +10 -0
- deltacat/compute/compactor_v2/utils/task_options.py +94 -8
- deltacat/io/memcached_object_store.py +20 -0
- deltacat/io/ray_plasma_object_store.py +6 -0
- deltacat/logs.py +29 -2
- deltacat/storage/__init__.py +3 -0
- deltacat/storage/interface.py +2 -0
- deltacat/storage/model/delete_parameters.py +40 -0
- deltacat/storage/model/delta.py +25 -1
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +1930 -0
- deltacat/tests/compute/compact_partition_test_cases.py +16 -822
- deltacat/tests/compute/compactor/utils/test_io.py +4 -4
- deltacat/tests/compute/test_compact_partition_incremental.py +4 -0
- deltacat/tests/compute/test_compact_partition_params.py +5 -0
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +32 -20
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +28 -10
- deltacat/tests/io/test_memcached_object_store.py +19 -0
- deltacat/tests/local_deltacat_storage/__init__.py +3 -0
- deltacat/tests/test_utils/constants.py +1 -2
- deltacat/tests/test_utils/pyarrow.py +27 -10
- deltacat/utils/pandas.py +1 -1
- deltacat/utils/ray_utils/runtime.py +3 -3
- deltacat/utils/resources.py +7 -5
- {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/METADATA +1 -1
- {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/RECORD +47 -38
- {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/LICENSE +0 -0
- {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/WHEEL +0 -0
- {deltacat-1.0.2.dist-info → deltacat-1.1.1.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor_v2/deletes/delete_strategy_equality_delete.py
@@ -0,0 +1,161 @@
+from typing import List, Optional
+import logging
+import pyarrow as pa
+from deltacat import logs
+
+from typing import Callable, Tuple
+from deltacat.compute.compactor_v2.deletes.delete_file_envelope import (
+    DeleteFileEnvelope,
+)
+from deltacat.compute.compactor_v2.deletes.delete_strategy import (
+    DeleteStrategy,
+)
+import pyarrow.compute as pc
+import numpy as np
+
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+class EqualityDeleteStrategy(DeleteStrategy):
+    """
+    A strategy for applying row-level deletes on tables during compaction based on equality conditions on one or more columns. It
+    implements the "equality delete" approach, which marks a row as deleted by one or more column values like pk1=3 or col1="foo", col2="bar".
+
+    Attributes:
+        _name (str): The name of the delete strategy.
+
+    Methods:
+        name(self) -> str:
+            Returns the name of the delete strategy.
+
+        apply_deletes(self, table, delete_file_envelope, *args, **kwargs) -> Tuple[pa.Table, int]:
+            Apply delete operations on the given table using the provided delete file envelope.
+
+        apply_many_deletes(self, table, delete_file_envelopes, *args, **kwargs) -> Tuple[pa.Table, int]:
+            Apply delete operations on the given table using all provided delete file envelopes.
+    """
+
+    _name = "EqualityDeleteStrategy"
+
+    @property
+    def name(self) -> str:
+        """
+        The name of the delete strategy.
+        """
+        return self._name
+
+    def _drop_rows(
+        self,
+        table: pa.Table,
+        delete_table: pa.Table,
+        delete_column_names: List[str],
+        equality_predicate_operation: Optional[Callable] = pa.compute.and_,
+    ) -> Tuple[pa.Table, int]:
+        """
+        Drop rows from the given table based on the provided delete table and column names.
+
+        Args:
+            table (pa.Table): The input table to drop rows from.
+            delete_table (pa.Table): The table containing the values to match for deletion.
+            delete_column_names (List[str]): A list of column names to check for equality with the delete table.
+            equality_predicate_operation (Optional[Callable], optional): The operation to combine equality predicates for multiple columns.
+                Defaults to pa.compute.and_.
+
+        Returns:
+            Tuple[pa.Table, int]: A tuple containing the updated table after dropping rows,
+                and the number of rows dropped.
+        """
+        if len(delete_column_names) < 1:
+            return table, 0
+        # all 1s -> all True so wont discard any from the curr_boolean_mask
+        prev_boolean_mask = pa.array(np.ones(len(table), dtype=bool))
+        # all 0s -> all False so if mask is never modified all rows will be kept
+        curr_boolean_mask = pa.array(np.zeros(len(table), dtype=bool))
+        for delete_column_name in delete_column_names:
+            if delete_column_name not in table.column_names:
+                logger.warning(
+                    f"Column name {delete_column_name} not in table column names. Skipping dropping rows for this column."
+                )
+                continue
+            curr_boolean_mask = pc.is_in(
+                table[delete_column_name],
+                value_set=delete_table[delete_column_name],
+            )
+            curr_boolean_mask = equality_predicate_operation(
+                prev_boolean_mask, curr_boolean_mask
+            )
+            prev_boolean_mask = curr_boolean_mask
+        number_of_rows_before_dropping = len(table)
+        logger.debug(
+            f"Number of table rows before dropping: {number_of_rows_before_dropping}."
+        )
+        table = table.filter(pc.invert(curr_boolean_mask))
+        number_of_rows_after_dropping = len(table)
+        logger.debug(
+            f"Number of table rows after dropping: {number_of_rows_after_dropping}."
+        )
+        dropped_rows = number_of_rows_before_dropping - number_of_rows_after_dropping
+        return table, dropped_rows
+
+    def apply_deletes(
+        self,
+        table: Optional[pa.Table],
+        delete_file_envelope: DeleteFileEnvelope,
+        *args,
+        **kwargs,
+    ) -> Tuple[pa.Table, int]:
+        """
+        Apply delete operations on the given table using the provided delete file envelope.
+
+        Args:
+            table (Optional[pa.Table]): The pyarrow table to apply deletes on.
+            delete_file_envelope (DeleteFileEnvelope): The delete file envelope containing delete parameters.
+
+        Returns:
+            Tuple[pa.Table, int]: A tuple containing the updated Arrow table after applying deletes,
+                and the number of rows deleted.
+        """
+        if not table or not delete_file_envelope.table:
+            logger.debug(
+                f"No table passed or no delete file envelope delete table found. DeleteFileEnvelope: {delete_file_envelope}"
+            )
+            return table, 0
+        delete_columns = delete_file_envelope.delete_columns
+        delete_table = delete_file_envelope.table
+        table, number_of_rows_dropped = self._drop_rows(
+            table, delete_table, delete_columns
+        )
+        return table, number_of_rows_dropped
+
+    def apply_many_deletes(
+        self,
+        table: Optional[pa.Table],
+        delete_file_envelopes: List[DeleteFileEnvelope],
+        *args,
+        **kwargs,
+    ) -> Tuple[pa.Table, int]:
+        """
+        Apply delete operations on the given table using all provided delete file envelopes.
+
+        Args:
+            table (Optional[pa.Table]): The pyarrow table to apply deletes on.
+            delete_file_envelopes (List[DeleteFileEnvelope]): A list of delete file envelopes containing delete parameters.
+
+        Returns:
+            Tuple[pa.Table, int]: A tuple containing the updated pyarrow table after applying all deletes,
+                and the total number of rows deleted.
+        """
+        # looking up table references are lighter than the actual table
+        if not table:
+            logger.debug("No table passed.")
+            return table, 0
+        total_dropped_rows = 0
+        for delete_file_envelope in delete_file_envelopes:
+            if delete_file_envelope.table_reference is None:
+                continue
+            table, number_of_rows_dropped = self.apply_deletes(
+                table, delete_file_envelope
+            )
+            total_dropped_rows += number_of_rows_dropped
+        return table, total_dropped_rows
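
The strategy above boils down to a per-column membership test whose masks are combined and then inverted to keep the surviving rows. A minimal, self-contained sketch of that pyarrow masking pattern (toy data and column names, not part of the wheel diff):

```python
# Illustrative only (toy data): the membership-mask pattern used by _drop_rows.
import pyarrow as pa
import pyarrow.compute as pc

table = pa.table({"pk": [1, 2, 3, 4], "col1": ["a", "b", "c", "d"]})
delete_table = pa.table({"pk": [2, 4]})

# Rows whose "pk" appears in the delete table are flagged True ...
mask = pc.is_in(table["pk"], value_set=delete_table["pk"])
# ... and dropped by filtering on the inverted mask.
kept = table.filter(pc.invert(mask))
print(kept.to_pydict())  # {'pk': [1, 3], 'col1': ['a', 'c']}
```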

deltacat/compute/compactor_v2/deletes/model.py
@@ -0,0 +1,23 @@
+from deltacat.storage import (
+    Delta,
+)
+
+from typing import List
+
+from dataclasses import dataclass, fields
+from deltacat.compute.compactor_v2.deletes.delete_file_envelope import (
+    DeleteFileEnvelope,
+)
+from deltacat.compute.compactor_v2.deletes.delete_strategy import (
+    DeleteStrategy,
+)
+
+
+@dataclass
+class PrepareDeleteResult:
+    non_delete_deltas: List[Delta]
+    delete_file_envelopes: List[DeleteFileEnvelope]
+    delete_strategy: DeleteStrategy
+
+    def __iter__(self):
+        return (getattr(self, field.name) for field in fields(self))
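
Because `__iter__` yields the fields in declaration order, callers can unpack a `PrepareDeleteResult` positionally. A small sketch with placeholder values (not part of the diff):

```python
# Illustrative only: placeholder values, not real compaction objects.
result = PrepareDeleteResult(
    non_delete_deltas=[], delete_file_envelopes=[], delete_strategy=None
)
deltas, envelopes, strategy = result  # unpacks in field declaration order
```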

deltacat/compute/compactor_v2/deletes/utils.py
@@ -0,0 +1,164 @@
+import itertools
+
+from deltacat.compute.compactor_v2.deletes.model import (
+    PrepareDeleteResult,
+    DeleteFileEnvelope,
+)
+from deltacat.storage import (
+    DeltaType,
+)
+from collections import defaultdict
+import logging
+
+from typing import Optional, List, Dict, Tuple
+from deltacat.types.media import StorageType
+from deltacat.compute.compactor.model.compact_partition_params import (
+    CompactPartitionParams,
+)
+from deltacat.compute.compactor_v2.deletes.delete_strategy_equality_delete import (
+    EqualityDeleteStrategy,
+)
+import pyarrow as pa
+from deltacat.storage import (
+    Delta,
+)
+from deltacat import logs
+
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+def _aggregate_delete_deltas(input_deltas: List[Delta]) -> Dict[int, List[Delta]]:
+    """
+    Aggregates consecutive DELETE deltas with the same delete parameters into groups.
+
+    Args:
+        input_deltas (List[Delta]): A list of Delta objects representing delete operations.
+    Returns:
+        Dict[int, List[Delta]]: A dictionary where the keys are the stream positions of the
+            earliest delta in each group of consecutive DELETE deltas with the same delete parameters,
+            and the values are lists containing those deltas.
+    """
+    start_stream_spos_to_delete_delta_sequence: Dict[int, List[Delta]] = defaultdict(
+        list
+    )
+    delete_deltas_sequence_grouped_by_delete_parameters: List[
+        Tuple[bool, List[Delta]]
+    ] = [
+        (is_delete, list(delete_delta_group))
+        for (is_delete, _), delete_delta_group in itertools.groupby(
+            input_deltas, lambda d: (d.type is DeltaType.DELETE, d.delete_parameters)
+        )
+    ]
+    for (
+        is_delete_delta_sequence,
+        delete_delta_sequence,
+    ) in delete_deltas_sequence_grouped_by_delete_parameters:
+        if not is_delete_delta_sequence:
+            continue
+        starting_stream_position_of_delete_sequence: int = delete_delta_sequence[
+            0
+        ].stream_position
+        start_stream_spos_to_delete_delta_sequence[
+            starting_stream_position_of_delete_sequence
+        ] = delete_delta_sequence
+    return start_stream_spos_to_delete_delta_sequence
+
+
+def _get_delete_file_envelopes(
+    params: CompactPartitionParams,
+    delete_spos_to_delete_deltas: Dict[int, List],
+) -> List[DeleteFileEnvelope]:
+    """
+    Create a list of DeleteFileEnvelope objects from the given dictionary of delete deltas.
+    Args:
+        params (CompactPartitionParams): compaction session parameters
+        delete_spos_to_delete_deltas (Dict[int, List]): A dictionary where the keys are the stream positions of the
+            earliest delta in each group of consecutive DELETE deltas with the same delete parameters,
+            and the values are lists containing those deltas.
+    Returns:
+        List[DeleteFileEnvelope]: A list of DeleteFileEnvelope objects.
+    """
+    delete_file_envelopes = []
+    for (
+        start_stream_position,
+        delete_delta_sequence,
+    ) in delete_spos_to_delete_deltas.items():
+        consecutive_delete_tables: List[pa.Table] = []
+        for delete_delta in delete_delta_sequence:
+            assert (
+                delete_delta.delete_parameters is not None
+            ), "Delete type deltas are required to have delete parameters defined"
+            delete_columns: Optional[
+                List[str]
+            ] = delete_delta.delete_parameters.equality_column_names
+            assert len(delete_columns) > 0, "At least 1 delete column is required"
+            # delete columns should exist in underlying table
+            delete_dataset = params.deltacat_storage.download_delta(
+                delete_delta,
+                file_reader_kwargs_provider=params.read_kwargs_provider,
+                columns=delete_columns,
+                storage_type=StorageType.LOCAL,
+                max_parallelism=1,
+                **params.deltacat_storage_kwargs,
+            )
+            consecutive_delete_tables.extend(delete_dataset)
+        delete_table: pa.Table = pa.concat_tables(consecutive_delete_tables)
+        delete_file_envelopes.append(
+            DeleteFileEnvelope.of(
+                start_stream_position,
+                delta_type=DeltaType.DELETE,
+                table=delete_table,
+                delete_columns=delete_columns,
+            )
+        )
+    return delete_file_envelopes
+
+
+def prepare_deletes(
+    params: CompactPartitionParams,
+    input_deltas: List[Delta],
+    *args,
+    **kwargs,
+) -> PrepareDeleteResult:
+    """
+    Prepares delete operations for a compaction process.
+
+    This function processes all the deltas and consolidates consecutive DELETE deltas.
+    It creates a list of these delete file envelopes.
+    Additionally, non-DELETE deltas are accumulated in a separate list.
+
+    Args:
+        params (CompactPartitionParams): Parameters for the compaction process.
+        input_deltas (List[Delta]): A list of Delta objects representing delete operations.
+
+    Returns:
+        PrepareDeleteResult:
+            - A list of Deltas excluding all DELETE operations.
+            - A list of DeleteFileEnvelope objects representing the consolidated delete operations.
+            - An instance of the EqualityDeleteStrategy class.
+
+    Raises:
+        AssertionError: If the input_deltas list is not sorted in non-decreasing order by stream_position.
+        AssertionError: If the number of delete file envelopes does not match the number of DELETE-type Delta sequences.
+    """
+    if not input_deltas:
+        return PrepareDeleteResult(input_deltas, [], None)
+    assert all(
+        input_deltas[i].stream_position <= input_deltas[i + 1].stream_position
+        for i in range(len(input_deltas) - 1)
+    ), "Uniform deltas must be in non-decreasing order by stream position"
+    start_stream_spos_to_delete_delta_sequence: Dict[
+        int, List[Delta]
+    ] = _aggregate_delete_deltas(input_deltas)
+    delete_file_envelopes: List[DeleteFileEnvelope] = _get_delete_file_envelopes(
+        params, start_stream_spos_to_delete_delta_sequence
+    )
+    assert len(start_stream_spos_to_delete_delta_sequence) == len(
+        delete_file_envelopes
+    ), "The number of delete file envelopes should match the number of DELETE-type Delta sequences"
+    return PrepareDeleteResult(
+        [delta for delta in input_deltas if delta.type is not DeltaType.DELETE],
+        delete_file_envelopes,
+        EqualityDeleteStrategy(),
+    )
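
`_aggregate_delete_deltas` keys each run of consecutive DELETE deltas by the stream position of the first delta in the run. The same `itertools.groupby` pattern on a stand-in type (`FakeDelta` and its fields are illustrative placeholders, not deltacat types):

```python
# Illustrative only: grouping consecutive "delete" runs by their first position.
import itertools
from collections import namedtuple

FakeDelta = namedtuple("FakeDelta", "stream_position is_delete params")
deltas = [
    FakeDelta(1, False, None),
    FakeDelta(2, True, ("pk",)),    # consecutive DELETEs with the same params ...
    FakeDelta(3, True, ("pk",)),    # ... collapse into one group keyed by position 2
    FakeDelta(4, False, None),
    FakeDelta(5, True, ("col1",)),  # a later DELETE run starts a new group
]

groups = {}
for (is_delete, _), run in itertools.groupby(deltas, lambda d: (d.is_delete, d.params)):
    run = list(run)
    if is_delete:
        groups[run[0].stream_position] = run

print(sorted(groups))  # [2, 5]
```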

deltacat/compute/compactor_v2/model/hash_bucket_input.py
@@ -22,6 +22,7 @@ class HashBucketInput(Dict):
         object_store: Optional[IObjectStore] = None,
         deltacat_storage=unimplemented_deltacat_storage,
         deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+        memory_logs_enabled: Optional[bool] = None,
     ) -> HashBucketInput:
 
         result = HashBucketInput()
@@ -36,6 +37,7 @@ class HashBucketInput(Dict):
         result["object_store"] = object_store
         result["deltacat_storage"] = deltacat_storage
         result["deltacat_storage_kwargs"] = deltacat_storage_kwargs or {}
+        result["memory_logs_enabled"] = memory_logs_enabled
 
         return result
 
@@ -82,3 +84,7 @@ class HashBucketInput(Dict):
     @property
     def deltacat_storage_kwargs(self) -> Optional[Dict[str, Any]]:
         return self.get("deltacat_storage_kwargs")
+
+    @property
+    def memory_logs_enabled(self) -> Optional[bool]:
+        return self.get("memory_logs_enabled")

deltacat/compute/compactor_v2/model/merge_input.py
@@ -5,6 +5,9 @@ from typing import Dict, List, Optional, Any
 from deltacat.compute.compactor_v2.model.merge_file_group import (
     MergeFileGroupsProvider,
 )
+from deltacat.compute.compactor_v2.deletes.delete_file_envelope import (
+    DeleteFileEnvelope,
+)
 from deltacat.utils.metrics import MetricsConfig
 from deltacat.utils.common import ReadKwargsProvider
 from deltacat.io.object_store import IObjectStore
@@ -17,6 +20,7 @@ from deltacat.compute.compactor_v2.constants import (
     DROP_DUPLICATES,
     MAX_RECORDS_PER_COMPACTED_FILE,
 )
+from deltacat.compute.compactor_v2.deletes.delete_strategy import DeleteStrategy
 from deltacat.types.media import ContentType
 from deltacat.compute.compactor.model.round_completion_info import RoundCompletionInfo
 
@@ -38,8 +42,11 @@ class MergeInput(Dict):
         read_kwargs_provider: Optional[ReadKwargsProvider] = None,
         round_completion_info: Optional[RoundCompletionInfo] = None,
         object_store: Optional[IObjectStore] = None,
+        delete_strategy: Optional[DeleteStrategy] = None,
+        delete_file_envelopes: Optional[List] = None,
         deltacat_storage=unimplemented_deltacat_storage,
         deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
+        memory_logs_enabled: Optional[bool] = None,
     ) -> MergeInput:
 
         result = MergeInput()
@@ -57,9 +64,11 @@ class MergeInput(Dict):
         result["read_kwargs_provider"] = read_kwargs_provider
         result["round_completion_info"] = round_completion_info
         result["object_store"] = object_store
+        result["delete_file_envelopes"] = delete_file_envelopes
+        result["delete_strategy"] = delete_strategy
         result["deltacat_storage"] = deltacat_storage
         result["deltacat_storage_kwargs"] = deltacat_storage_kwargs or {}
-
+        result["memory_logs_enabled"] = memory_logs_enabled
         return result
 
     @property
@@ -125,3 +134,17 @@ class MergeInput(Dict):
     @property
     def deltacat_storage_kwargs(self) -> Optional[Dict[str, Any]]:
         return self.get("deltacat_storage_kwargs")
+
+    @property
+    def memory_logs_enabled(self) -> Optional[bool]:
+        return self.get("memory_logs_enabled")
+
+    @property
+    def delete_file_envelopes(
+        self,
+    ) -> Optional[List[DeleteFileEnvelope]]:
+        return self.get("delete_file_envelopes")
+
+    @property
+    def delete_strategy(self) -> Optional[DeleteStrategy]:
+        return self.get("delete_strategy")
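
The new `delete_file_envelopes` and `delete_strategy` fields give merge tasks what they need to call `apply_many_deletes` from the strategy shown earlier. A hedged sketch of that wiring (the real logic lives in `compactor_v2/steps/merge.py`, whose hunks are not included in this section; the helper name here is hypothetical):

```python
# Illustrative only: consuming the two new MergeInput fields. The function name
# is hypothetical; the apply_many_deletes signature comes from the strategy above.
def apply_deletes_if_configured(table, merge_input):
    if merge_input.delete_strategy and merge_input.delete_file_envelopes:
        return merge_input.delete_strategy.apply_many_deletes(
            table, merge_input.delete_file_envelopes
        )
    return table, 0
```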

deltacat/compute/compactor_v2/model/merge_result.py
@@ -8,6 +8,7 @@ class MergeResult(NamedTuple):
     materialize_results: List[MaterializeResult]
     input_record_count: np.int64
     deduped_record_count: np.int64
+    deleted_record_count: np.int64
     peak_memory_usage_bytes: np.double
     telemetry_time_in_seconds: np.double
     task_completed_at: np.double

deltacat/compute/compactor_v2/steps/hash_bucket.py
@@ -70,14 +70,12 @@ def _group_file_records_by_pk_hash_bucket(
     hb_to_delta_file_envelopes = np.empty([num_hash_buckets], dtype="object")
     for dfe in delta_file_envelopes:
         logger.info("Grouping by pk hash bucket")
-
+        group_start = time.monotonic()
         hash_bucket_to_table = group_by_pk_hash_bucket(
-            dfe.table,
-            num_hash_buckets,
-            primary_keys,
+            dfe.table, num_hash_buckets, primary_keys
         )
         group_end = time.monotonic()
-        logger.info(f"Grouping took: {group_end -
+        logger.info(f"Grouping took: {group_end - group_start}")
         for hb, table in enumerate(hash_bucket_to_table):
             if table:
                 if hb_to_delta_file_envelopes[hb] is None:
@@ -144,7 +142,8 @@ def hash_bucket(input: HashBucketInput) -> HashBucketResult:
         f"({process_util.max_memory/BYTES_PER_GIBIBYTE} GB)"
     )
 
-
+    if input.memory_logs_enabled:
+        process_util.schedule_callback(log_peak_memory, 10)
 
     hash_bucket_result, duration = timed_invocation(
         func=_timed_hash_bucket, input=input