deltacat 0.1.6__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff compares two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registry.
- deltacat/__init__.py +41 -15
- deltacat/aws/clients.py +12 -31
- deltacat/aws/constants.py +1 -1
- deltacat/aws/redshift/__init__.py +7 -2
- deltacat/aws/redshift/model/manifest.py +54 -50
- deltacat/aws/s3u.py +183 -194
- deltacat/catalog/delegate.py +151 -185
- deltacat/catalog/interface.py +78 -97
- deltacat/catalog/model/catalog.py +21 -21
- deltacat/catalog/model/table_definition.py +11 -9
- deltacat/compute/compactor/__init__.py +12 -16
- deltacat/compute/compactor/compaction_session.py +249 -198
- deltacat/compute/compactor/model/delta_annotated.py +60 -44
- deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
- deltacat/compute/compactor/model/delta_file_locator.py +10 -8
- deltacat/compute/compactor/model/materialize_result.py +6 -7
- deltacat/compute/compactor/model/primary_key_index.py +38 -34
- deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
- deltacat/compute/compactor/model/round_completion_info.py +25 -19
- deltacat/compute/compactor/model/sort_key.py +18 -15
- deltacat/compute/compactor/steps/dedupe.py +153 -260
- deltacat/compute/compactor/steps/hash_bucket.py +56 -56
- deltacat/compute/compactor/steps/materialize.py +139 -100
- deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
- deltacat/compute/compactor/steps/rehash/rewrite_index.py +11 -13
- deltacat/compute/compactor/utils/io.py +59 -47
- deltacat/compute/compactor/utils/primary_key_index.py +131 -90
- deltacat/compute/compactor/utils/round_completion_file.py +22 -23
- deltacat/compute/compactor/utils/system_columns.py +33 -42
- deltacat/compute/metastats/meta_stats.py +235 -157
- deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
- deltacat/compute/metastats/stats.py +95 -64
- deltacat/compute/metastats/utils/io.py +100 -53
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
- deltacat/compute/metastats/utils/ray_utils.py +38 -33
- deltacat/compute/stats/basic.py +107 -69
- deltacat/compute/stats/models/delta_column_stats.py +11 -8
- deltacat/compute/stats/models/delta_stats.py +59 -32
- deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
- deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
- deltacat/compute/stats/models/stats_result.py +24 -14
- deltacat/compute/stats/utils/intervals.py +16 -9
- deltacat/compute/stats/utils/io.py +86 -51
- deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
- deltacat/constants.py +8 -10
- deltacat/io/__init__.py +2 -2
- deltacat/io/aws/redshift/redshift_datasource.py +157 -143
- deltacat/io/dataset.py +14 -17
- deltacat/io/read_api.py +36 -33
- deltacat/logs.py +94 -42
- deltacat/storage/__init__.py +18 -8
- deltacat/storage/interface.py +196 -213
- deltacat/storage/model/delta.py +45 -51
- deltacat/storage/model/list_result.py +12 -8
- deltacat/storage/model/namespace.py +4 -5
- deltacat/storage/model/partition.py +42 -42
- deltacat/storage/model/stream.py +29 -30
- deltacat/storage/model/table.py +14 -14
- deltacat/storage/model/table_version.py +32 -31
- deltacat/storage/model/types.py +1 -0
- deltacat/tests/stats/test_intervals.py +11 -24
- deltacat/tests/utils/__init__.py +0 -0
- deltacat/tests/utils/test_record_batch_tables.py +284 -0
- deltacat/types/media.py +3 -4
- deltacat/types/tables.py +31 -21
- deltacat/utils/common.py +5 -11
- deltacat/utils/numpy.py +20 -22
- deltacat/utils/pandas.py +73 -100
- deltacat/utils/performance.py +3 -9
- deltacat/utils/placement.py +276 -228
- deltacat/utils/pyarrow.py +302 -89
- deltacat/utils/ray_utils/collections.py +2 -1
- deltacat/utils/ray_utils/concurrency.py +36 -29
- deltacat/utils/ray_utils/dataset.py +28 -28
- deltacat/utils/ray_utils/performance.py +5 -9
- deltacat/utils/ray_utils/runtime.py +9 -10
- {deltacat-0.1.6.dist-info → deltacat-0.1.11.dist-info}/METADATA +21 -11
- deltacat-0.1.11.dist-info/RECORD +110 -0
- {deltacat-0.1.6.dist-info → deltacat-0.1.11.dist-info}/WHEEL +1 -1
- deltacat-0.1.6.dist-info/RECORD +0 -108
- {deltacat-0.1.6.dist-info → deltacat-0.1.11.dist-info}/LICENSE +0 -0
- {deltacat-0.1.6.dist-info → deltacat-0.1.11.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor/steps/hash_bucket.py:

@@ -1,37 +1,37 @@
-import ray
-import pyarrow as pa
-import numpy as np
 import logging
-
-from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelopeGroups
 from itertools import chain
+from typing import Generator, List, Optional, Tuple
+
+import numpy as np
+import pyarrow as pa
+import ray
+from ray.types import ObjectRef
 
 from deltacat import logs
-from deltacat.compute.compactor import DeltaAnnotated, DeltaFileEnvelope,
-
-from deltacat.compute.compactor.utils
-
+from deltacat.compute.compactor import DeltaAnnotated, DeltaFileEnvelope, SortKey
+from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelopeGroups
+from deltacat.compute.compactor.utils import system_columns as sc
+from deltacat.compute.compactor.utils.primary_key_index import (
+    group_hash_bucket_indices,
+    group_record_indices_by_hash_bucket,
+)
 from deltacat.storage import interface as unimplemented_deltacat_storage
 from deltacat.types.media import StorageType
 from deltacat.utils.common import sha1_digest
-from deltacat.compute.compactor.utils import system_columns as sc
-
-from typing import List, Optional, Generator, Tuple
-
-from ray.types import ObjectRef
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
-_PK_BYTES_DELIMITER = b
+_PK_BYTES_DELIMITER = b"L6kl7u5f"
 
 HashBucketGroupToObjectId = np.ndarray
-HashBucketResult = Tuple[
+HashBucketResult = Tuple[
+    HashBucketGroupToObjectId, List[ObjectRef[DeltaFileEnvelopeGroups]]
+]
 
 
-def
-
-
-primary_keys: List[str]) -> np.ndarray:
+def _group_by_pk_hash_bucket(
+    table: pa.Table, num_buckets: int, primary_keys: List[str]
+) -> np.ndarray:
 
     # generate the primary key digest column
     all_pk_column_fields = []
@@ -39,7 +39,7 @@ def group_by_pk_hash_bucket(
         # casting a primary key column to numpy also ensures no nulls exist
         column_fields = table[pk_name].to_numpy()
         all_pk_column_fields.append(column_fields)
-    hash_column_generator =
+    hash_column_generator = _hash_pk_bytes_generator(all_pk_column_fields)
     table = sc.append_pk_hash_column(table, hash_column_generator)
 
     # drop primary key columns to free up memory
@@ -62,26 +62,24 @@ def group_by_pk_hash_bucket(
     return hash_bucket_to_table
 
 
-def
+def _hash_pk_bytes_generator(all_column_fields) -> Generator[bytes, None, None]:
     for field_index in range(len(all_column_fields[0])):
         bytes_to_join = []
         for column_fields in all_column_fields:
-            bytes_to_join.append(
-                bytes(str(column_fields[field_index]), "utf-8")
-            )
+            bytes_to_join.append(bytes(str(column_fields[field_index]), "utf-8"))
        yield sha1_digest(_PK_BYTES_DELIMITER.join(bytes_to_join))
 
 
-def
-
-
-
-
-
-
+def _group_file_records_by_pk_hash_bucket(
+    annotated_delta: DeltaAnnotated,
+    num_hash_buckets: int,
+    primary_keys: List[str],
+    sort_key_names: List[str],
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> Optional[DeltaFileEnvelopeGroups]:
 
     # read input parquet s3 objects into a list of delta file envelopes
-    delta_file_envelopes =
+    delta_file_envelopes = _read_delta_file_envelopes(
         annotated_delta,
         primary_keys,
         sort_key_names,
@@ -93,7 +91,7 @@ def group_file_records_by_pk_hash_bucket(
     # group the data by primary key hash value
     hb_to_delta_file_envelopes = np.empty([num_hash_buckets], dtype="object")
     for dfe in delta_file_envelopes:
-        hash_bucket_to_table =
+        hash_bucket_to_table = _group_by_pk_hash_bucket(
            dfe.table,
            num_hash_buckets,
            primary_keys,
@@ -104,19 +102,18 @@ def group_file_records_by_pk_hash_bucket(
                hb_to_delta_file_envelopes[hb] = []
            hb_to_delta_file_envelopes[hb].append(
                DeltaFileEnvelope.of(
-                    dfe.stream_position,
-
-
-                    table))
+                    dfe.stream_position, dfe.file_index, dfe.delta_type, table
+                )
+            )
    return hb_to_delta_file_envelopes
 
 
-def
-
-
-
-
-
+def _read_delta_file_envelopes(
+    annotated_delta: DeltaAnnotated,
+    primary_keys: List[str],
+    sort_key_names: List[str],
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> Optional[List[DeltaFileEnvelope]]:
 
     columns_to_read = list(chain(primary_keys, sort_key_names))
     tables = deltacat_storage.download_delta(
@@ -126,10 +123,12 @@ def read_delta_file_envelopes(
         storage_type=StorageType.LOCAL,
     )
     annotations = annotated_delta.annotations
-    assert
-
-
-
+    assert (
+        len(tables) == len(annotations),
+        f"Unexpected Error: Length of downloaded delta manifest tables "
+        f"({len(tables)}) doesn't match the length of delta manifest "
+        f"annotations ({len(annotations)}).",
+    )
     if not tables:
         return None
 
@@ -145,18 +144,19 @@ def read_delta_file_envelopes(
     return delta_file_envelopes
 
 
-@ray.remote(
+@ray.remote(num_returns=2)
 def hash_bucket(
-
-
-
-
-
-
+    annotated_delta: DeltaAnnotated,
+    primary_keys: List[str],
+    sort_keys: List[SortKey],
+    num_buckets: int,
+    num_groups: int,
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> HashBucketResult:
 
     logger.info(f"Starting hash bucket task...")
     sort_key_names = [key.key_name for key in sort_keys]
-    delta_file_envelope_groups =
+    delta_file_envelope_groups = _group_file_records_by_pk_hash_bucket(
        annotated_delta,
        num_buckets,
        primary_keys,
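To make the hash-bucketing change above easier to follow: `_group_by_pk_hash_bucket` joins each row's primary-key values with `_PK_BYTES_DELIMITER`, hashes the joined bytes with SHA-1 (via `sha1_digest`), and uses the digest to assign the row to one of `num_buckets` buckets. The snippet below is a minimal, self-contained sketch of that idea only; `pk_digests`, `bucket_of`, and the modulo-based bucket assignment are illustrative stand-ins, not deltacat's actual API.

```python
import hashlib
from typing import Iterable, List

import pyarrow as pa

_PK_BYTES_DELIMITER = b"L6kl7u5f"  # same delimiter the diff introduces


def sha1_digest(payload: bytes) -> bytes:
    # stand-in for deltacat.utils.common.sha1_digest (assumed behavior)
    return hashlib.sha1(payload).digest()


def pk_digests(table: pa.Table, primary_keys: List[str]) -> Iterable[bytes]:
    # join each row's primary key values with the delimiter, then hash
    columns = [table[pk].to_pylist() for pk in primary_keys]
    for row_values in zip(*columns):
        joined = _PK_BYTES_DELIMITER.join(str(v).encode("utf-8") for v in row_values)
        yield sha1_digest(joined)


def bucket_of(digest: bytes, num_buckets: int) -> int:
    # map a digest to a stable bucket index (illustrative; not deltacat's scheme)
    return int.from_bytes(digest, "big") % num_buckets


table = pa.table({"id": [1, 2, 3], "region": ["us", "eu", "us"]})
buckets = [bucket_of(d, num_buckets=4) for d in pk_digests(table, ["id", "region"])]
print(buckets)  # rows with identical primary keys always land in the same bucket
```

The point of the bucketing is that downstream dedupe tasks only need to compare rows within the same bucket, so equal primary keys are guaranteed to meet.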
deltacat/compute/compactor/steps/materialize.py:

@@ -1,57 +1,113 @@
-import logging
-import
-import pyarrow as pa
-
+import logging
+import time
 from collections import defaultdict
-
-from deltacat.compute.compactor.steps.dedupe import DedupeTaskIndexWithObjectId, \
-    DeltaFileLocatorToRecords
 from itertools import chain, repeat
+from typing import List, Optional, Tuple
 
-
-
+import pyarrow as pa
+import ray
 from ray import cloudpickle
 
 from deltacat import logs
-from deltacat.
-
-
-    RoundCompletionInfo
-
-from deltacat.
-
-
-
-
+from deltacat.compute.compactor import (
+    MaterializeResult,
+    PyArrowWriteResult,
+    RoundCompletionInfo,
+)
+from deltacat.compute.compactor.steps.dedupe import (
+    DedupeTaskIndexWithObjectId,
+    DeltaFileLocatorToRecords,
+)
+from deltacat.storage import Delta, DeltaLocator, Partition, PartitionLocator
+from deltacat.storage import interface as unimplemented_deltacat_storage
+from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES, ContentType
 from deltacat.types.tables import TABLE_CLASS_TO_SIZE_FUNC
-from deltacat.utils.
+from deltacat.utils.performance import timed_invocation
+from deltacat.utils.pyarrow import (
+    ReadKwargsProviderPyArrowCsvPureUtf8,
+    ReadKwargsProviderPyArrowSchemaOverride,
+    RecordBatchTables,
+)
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
-@ray.remote
+@ray.remote
 def materialize(
-
-
-
-
-
-
-
-
-
+    source_partition_locator: PartitionLocator,
+    round_completion_info: Optional[RoundCompletionInfo],
+    partition: Partition,
+    mat_bucket_index: int,
+    dedupe_task_idx_and_obj_id_tuples: List[DedupeTaskIndexWithObjectId],
+    max_records_per_output_file: int,
+    compacted_file_content_type: ContentType,
+    schema: Optional[pa.Schema] = None,
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> MaterializeResult:
+    # TODO (rkenmi): Add docstrings for the steps in the compaction workflow
+    # https://github.com/ray-project/deltacat/issues/79
+    def _materialize(compacted_tables: List[pa.Table]) -> MaterializeResult:
+        compacted_table = pa.concat_tables(compacted_tables)
+
+        if compacted_file_content_type in DELIMITED_TEXT_CONTENT_TYPES:
+            # TODO (rkenmi): Investigate if we still need to convert this table to pandas DataFrame
+            # TODO (pdames): compare performance to pandas-native materialize path
+            df = compacted_table.to_pandas(
+                split_blocks=True, self_destruct=True, zero_copy_only=True
+            )
+            compacted_table = df
+        delta, stage_delta_time = timed_invocation(
+            deltacat_storage.stage_delta,
+            compacted_table,
+            partition,
+            max_records_per_entry=max_records_per_output_file,
+            content_type=compacted_file_content_type,
+        )
+        compacted_table_size = TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](
+            compacted_table
+        )
+        logger.debug(
+            f"Time taken for materialize task"
+            f" to upload {len(compacted_table)} records"
+            f" of size {compacted_table_size} is: {stage_delta_time}s"
+        )
+        manifest = delta.manifest
+        manifest_records = manifest.meta.record_count
+        assert (
+            manifest_records == len(compacted_table),
+            f"Unexpected Error: Materialized delta manifest record count "
+            f"({manifest_records}) does not equal compacted table record count "
+            f"({len(compacted_table)})",
+        )
+        materialize_result = MaterializeResult.of(
+            delta,
+            mat_bucket_index,
+            # TODO (pdames): Generalize WriteResult to contain in-memory-table-type
+            # and in-memory-table-bytes instead of tight coupling to paBytes
+            PyArrowWriteResult.of(
+                len(manifest.entries),
+                TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](compacted_table),
+                manifest.meta.content_length,
+                len(compacted_table),
+            ),
+        )
+        logger.info(f"Materialize result: {materialize_result}")
+        return materialize_result
 
-    logger.info(
+    logger.info(
+        f"Starting materialize task with"
+        f" materialize bucket index: {mat_bucket_index}..."
+    )
+    start = time.time()
     dedupe_task_idx_and_obj_ref_tuples = [
         (
-
-            cloudpickle.loads(
-        )
+            t1,
+            cloudpickle.loads(t2),
+        )
+        for t1, t2 in dedupe_task_idx_and_obj_id_tuples
     ]
     logger.info(f"Resolved materialize task obj refs...")
-    dedupe_task_indices, obj_refs = zip(
-        *dedupe_task_idx_and_obj_ref_tuples
-    )
+    dedupe_task_indices, obj_refs = zip(*dedupe_task_idx_and_obj_ref_tuples)
     # this depends on `ray.get` result order matching input order, as per the
     # contract established in: https://github.com/ray-project/ray/pull/16763
     src_file_records_list = ray.get(list(obj_refs))
@@ -63,19 +119,23 @@ def materialize(
             (record_numbers, repeat(dedupe_task_idx, len(record_numbers)))
         )
     manifest_cache = {}
-
+    materialized_results: List[MaterializeResult] = []
+    record_batch_tables = RecordBatchTables(max_records_per_output_file)
     for src_dfl in sorted(all_src_file_records.keys()):
-        record_numbers_dd_task_idx_tpl_list: List[
-
+        record_numbers_dd_task_idx_tpl_list: List[
+            Tuple[DeltaFileLocatorToRecords, repeat]
+        ] = all_src_file_records[src_dfl]
         record_numbers_tpl, dedupe_task_idx_iter_tpl = zip(
             *record_numbers_dd_task_idx_tpl_list
         )
         is_src_partition_file_np = src_dfl.is_source_delta
         src_stream_position_np = src_dfl.stream_position
         src_file_idx_np = src_dfl.file_index
-        src_file_partition_locator =
-
+        src_file_partition_locator = (
+            source_partition_locator
+            if is_src_partition_file_np
             else round_completion_info.compacted_delta_locator.partition_locator
+        )
         delta_locator = DeltaLocator.of(
             src_file_partition_locator,
             src_stream_position_np.item(),
@@ -95,75 +155,54 @@ def materialize(
         # enforce a consistent schema if provided, when reading files into PyArrow tables
         elif schema is not None:
             read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
-                schema=schema
-
+                schema=schema
+            )
+        pa_table, download_delta_manifest_entry_time = timed_invocation(
+            deltacat_storage.download_delta_manifest_entry,
             Delta.of(delta_locator, None, None, None, manifest),
             src_file_idx_np.item(),
-
+            file_reader_kwargs_provider=read_kwargs_provider,
+        )
+        logger.debug(
+            f"Time taken for materialize task"
+            f" to download delta locator {delta_locator} with entry ID {src_file_idx_np.item()}"
+            f" is: {download_delta_manifest_entry_time}s"
         )
         mask_pylist = list(repeat(False, len(pa_table)))
         record_numbers = chain.from_iterable(record_numbers_tpl)
+        # TODO(raghumdani): reference the same file URIs while writing the files
+        # instead of copying the data over and creating new files.
         for record_number in record_numbers:
             mask_pylist[record_number] = True
         mask = pa.array(mask_pylist)
-
+        pa_table = pa_table.filter(mask)
+        record_batch_tables.append(pa_table)
+        if record_batch_tables.has_batches():
+            batched_tables = record_batch_tables.evict()
+            materialized_results.append(_materialize(batched_tables))
 
-
-
-        # performance than repeatedly filtering the table in dedupe task index
-        # order
-        dedupe_task_indices = chain.from_iterable(dedupe_task_idx_iter_tpl)
-        compacted_table = sc.append_dedupe_task_idx_col(
-            compacted_table,
-            dedupe_task_indices,
-        )
-        pa_sort_keys = [(sc._DEDUPE_TASK_IDX_COLUMN_NAME, "ascending")]
-        compacted_table = compacted_table.take(
-            pc.sort_indices(compacted_table, sort_keys=pa_sort_keys),
-        )
-        compacted_table = compacted_table.drop(
-            [sc._DEDUPE_TASK_IDX_COLUMN_NAME]
-        )
-        compacted_tables.append(compacted_table)
+    if record_batch_tables.has_remaining():
+        materialized_results.append(_materialize(record_batch_tables.remaining))
 
-
-
-
-
-
-
-
-
-
-        split_blocks=True,
-        self_destruct=True,
-    )
-    del compacted_table
-    compacted_table = df
-    delta = deltacat_storage.stage_delta(
-        compacted_table,
-        partition,
-        max_records_per_entry=max_records_per_output_file,
-        content_type=compacted_file_content_type,
+    merged_delta = Delta.merge_deltas([mr.delta for mr in materialized_results])
+    assert (
+        materialized_results and len(materialized_results) > 0
+    ), f"Expected at least one materialized result in materialize step."
+
+    write_results = [mr.pyarrow_write_result for mr in materialized_results]
+    logger.debug(
+        f"{len(write_results)} files written"
+        f" with records: {[wr.records for wr in write_results]}"
     )
-
-
-
-
-
-
-    materialize_result = MaterializeResult.of(
-        delta,
-        mat_bucket_index,
-        # TODO (pdames): Generalize WriteResult to contain in-memory-table-type
-        # and in-memory-table-bytes instead of tight coupling to paBytes
-        PyArrowWriteResult.of(
-            len(manifest.entries),
-            TABLE_CLASS_TO_SIZE_FUNC[type(compacted_table)](compacted_table),
-            manifest.meta.content_length,
-            len(compacted_table),
+    # Merge all new deltas into one for this materialize bucket index
+    merged_materialize_result = MaterializeResult.of(
+        merged_delta,
+        materialized_results[0].task_index,
+        PyArrowWriteResult.union(
+            [mr.pyarrow_write_result for mr in materialized_results]
         ),
     )
-    logger.info(f"Materialize result: {materialize_result}")
     logger.info(f"Finished materialize task...")
-
+    end = time.time()
+    logger.info(f"Materialize task ended in {end - start}s")
+    return merged_materialize_result
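The core change in materialize() above is the switch from accumulating every filtered table and staging one delta at the end to streaming them through `RecordBatchTables(max_records_per_output_file)`: each filtered `pa_table` is appended, full batches are flushed via `has_batches()`/`evict()` inside the loop, and the leftover rows are flushed via `has_remaining()`/`remaining` afterward, with the per-flush results merged by `Delta.merge_deltas` and `PyArrowWriteResult.union`. The class below is a minimal stand-in that mimics only that append/evict/remaining flow; it is not deltacat's `RecordBatchTables` (which lives in `deltacat/utils/pyarrow.py`, is exercised by `tests/utils/test_record_batch_tables.py`, and additionally splits tables so evicted batches align to the configured record count).

```python
from typing import List

import pyarrow as pa


class BatchingTables:
    """Illustrative stand-in: buffer tables and release them once at least
    `min_records` rows have accumulated; expose whatever is left as `remaining`."""

    def __init__(self, min_records: int):
        self._min_records = min_records
        self._tables: List[pa.Table] = []
        self._records = 0

    def append(self, table: pa.Table) -> None:
        self._tables.append(table)
        self._records += len(table)

    def has_batches(self) -> bool:
        return self._records >= self._min_records

    def evict(self) -> List[pa.Table]:
        batched, self._tables, self._records = self._tables, [], 0
        return batched

    def has_remaining(self) -> bool:
        return self._records > 0

    @property
    def remaining(self) -> List[pa.Table]:
        return self._tables


acc = BatchingTables(min_records=4)
chunks = [pa.table({"x": [1, 2, 3]}), pa.table({"x": [4, 5]}), pa.table({"x": [6]})]
for chunk in chunks:
    acc.append(chunk)
    if acc.has_batches():
        print("flush", sum(len(t) for t in acc.evict()), "records")
if acc.has_remaining():
    print("flush remainder", sum(len(t) for t in acc.remaining), "records")
```

This keeps output files near `max_records_per_output_file` records each and bounds the memory held by any single materialize call, instead of concatenating the whole bucket before staging.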
deltacat/compute/compactor/steps/rehash/rehash_bucket.py:

@@ -1,22 +1,21 @@
 import logging
-import
-import pyarrow as pa
-import numpy as np
+from typing import List, Tuple
 
+import numpy as np
+import pyarrow as pa
+import ray
 from ray.types import ObjectRef
 
 from deltacat import logs
 from deltacat.compute.compactor import PrimaryKeyIndexVersionLocator
 from deltacat.compute.compactor.utils import primary_key_index as pki
 
-from typing import List, Tuple
-
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
 def group_file_records_by_pk_hash_bucket(
-
-
+    pki_table: pa.Table, num_buckets: int
+) -> np.ndarray:
     # generate the new table for each new hash bucket
     hash_bucket_to_indices = pki.group_record_indices_by_hash_bucket(
         pki_table,
@@ -29,13 +28,14 @@ def group_file_records_by_pk_hash_bucket(
     return hash_bucket_to_table
 
 
-@ray.remote(num_cpus=1,num_returns=2)
+@ray.remote(num_cpus=1, num_returns=2)
 def rehash_bucket(
-
-
-
-
-
+    hash_bucket_index: int,
+    s3_bucket: str,
+    old_pki_version_locator: PrimaryKeyIndexVersionLocator,
+    num_buckets: int,
+    num_groups: int,
+) -> Tuple[np.ndarray, List[ObjectRef]]:
 
     logger.info(f"Starting rehash bucket task...")
     tables = pki.download_hash_bucket_entries(
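Both `rehash_bucket` and `rewrite_index` above are declared with `num_returns=2`, so Ray gives the caller two separate object references, one per element of the returned tuple. A minimal illustration of that pattern (the `split_result` task below is hypothetical, not part of deltacat):

```python
import ray

ray.init(ignore_reinit_error=True)


@ray.remote(num_cpus=1, num_returns=2)
def split_result(values):
    # return a 2-tuple; Ray exposes it to the caller as two ObjectRefs
    evens = [v for v in values if v % 2 == 0]
    odds = [v for v in values if v % 2 == 1]
    return evens, odds


evens_ref, odds_ref = split_result.remote(list(range(10)))
print(ray.get(evens_ref))  # [0, 2, 4, 6, 8]
print(ray.get(odds_ref))   # [1, 3, 5, 7, 9]
```

Splitting the return value this way lets downstream tasks fetch only the piece they need (for example, the hash-bucket array without the pinned object references).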
deltacat/compute/compactor/steps/rehash/rewrite_index.py:

@@ -1,28 +1,26 @@
-import ray
 import logging
-import pyarrow as pa
 from collections import defaultdict
-from
-from deltacat import logs
+from typing import Any, List, Tuple
 
+import pyarrow as pa
+import ray
+from ray import cloudpickle
 from ray.types import ObjectRef
 
-from deltacat
-
+from deltacat import logs
+from deltacat.compute.compactor import PrimaryKeyIndexVersionLocator, PyArrowWriteResult
 from deltacat.compute.compactor.utils import primary_key_index as pki
 
-from typing import Any, List, Tuple
-
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
 @ray.remote(num_cpus=1, num_returns=2)
 def rewrite_index(
-
-
-
-
-
+    object_ids: List[Any],
+    s3_bucket: str,
+    new_primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
+    max_records_per_index_file: int,
+) -> Tuple[PyArrowWriteResult, List[ObjectRef]]:
 
     logger.info(f"Starting rewrite primary key index task...")
     object_refs = [cloudpickle.loads(obj_id_pkl) for obj_id_pkl in object_ids]