deltacat 0.1.18b14__py3-none-any.whl → 0.1.18b16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/clients.py +17 -6
- deltacat/aws/redshift/model/manifest.py +4 -0
- deltacat/aws/s3u.py +24 -1
- deltacat/compute/compactor/compaction_session.py +42 -18
- deltacat/compute/compactor/model/compact_partition_params.py +297 -58
- deltacat/compute/compactor/model/compaction_session_audit_info.py +163 -9
- deltacat/compute/compactor/model/delta_annotated.py +95 -9
- deltacat/compute/compactor/model/delta_file_envelope.py +14 -2
- deltacat/compute/compactor/model/round_completion_info.py +17 -1
- deltacat/compute/compactor/repartition_session.py +4 -1
- deltacat/compute/compactor/steps/dedupe.py +9 -6
- deltacat/compute/compactor/steps/hash_bucket.py +24 -3
- deltacat/compute/compactor/steps/materialize.py +11 -6
- deltacat/compute/compactor/steps/repartition.py +22 -1
- deltacat/compute/compactor/utils/io.py +40 -23
- deltacat/compute/compactor/utils/sort_key.py +5 -0
- deltacat/compute/compactor/utils/system_columns.py +43 -0
- deltacat/compute/compactor_v2/compaction_session.py +509 -0
- deltacat/compute/compactor_v2/constants.py +37 -0
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
- deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
- deltacat/compute/compactor_v2/model/merge_input.py +143 -0
- deltacat/compute/compactor_v2/model/merge_result.py +12 -0
- deltacat/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
- deltacat/compute/compactor_v2/steps/merge.py +469 -0
- deltacat/compute/compactor_v2/utils/__init__.py +0 -0
- deltacat/compute/compactor_v2/utils/content_type_params.py +66 -0
- deltacat/compute/compactor_v2/utils/dedupe.py +58 -0
- deltacat/compute/compactor_v2/utils/io.py +152 -0
- deltacat/compute/compactor_v2/utils/primary_key_index.py +341 -0
- deltacat/compute/compactor_v2/utils/task_options.py +221 -0
- deltacat/compute/metastats/meta_stats.py +4 -2
- deltacat/compute/metastats/stats.py +1 -0
- deltacat/compute/metastats/utils/io.py +4 -0
- deltacat/compute/stats/utils/io.py +20 -5
- deltacat/exceptions.py +4 -0
- deltacat/io/memcached_object_store.py +37 -14
- deltacat/logs.py +4 -3
- deltacat/storage/interface.py +8 -1
- deltacat/storage/model/types.py +2 -1
- deltacat/tests/aws/test_clients.py +16 -3
- deltacat/tests/compute/__init__.py +0 -0
- deltacat/tests/compute/common.py +96 -0
- deltacat/tests/compute/compactor/__init__.py +0 -0
- deltacat/tests/compute/compactor/steps/__init__.py +0 -0
- deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +34 -8
- deltacat/tests/compute/compactor/utils/__init__.py +0 -0
- deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
- deltacat/tests/compute/compactor_v2/__init__.py +0 -0
- deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
- deltacat/tests/compute/test_compaction_session_incremental.py +363 -0
- deltacat/tests/compute/testcases.py +395 -0
- deltacat/tests/io/test_memcached_object_store.py +5 -4
- deltacat/tests/local_deltacat_storage/__init__.py +62 -19
- deltacat/tests/test_utils/pyarrow.py +49 -0
- deltacat/tests/test_utils/utils.py +13 -0
- deltacat/tests/utils/data/__init__.py +0 -0
- deltacat/tests/utils/test_daft.py +76 -0
- deltacat/tests/utils/test_pyarrow.py +133 -0
- deltacat/tests/utils/test_resources.py +23 -20
- deltacat/types/media.py +1 -0
- deltacat/types/partial_download.py +83 -0
- deltacat/types/tables.py +6 -0
- deltacat/utils/arguments.py +25 -0
- deltacat/utils/daft.py +87 -0
- deltacat/utils/placement.py +20 -3
- deltacat/utils/pyarrow.py +218 -1
- deltacat/utils/ray_utils/concurrency.py +26 -1
- deltacat/utils/resources.py +72 -1
- deltacat/utils/s3fs.py +21 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/METADATA +17 -3
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/RECORD +79 -47
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/WHEEL +1 -1
- /deltacat/{tests/compactor → compute/compactor_v2}/__init__.py +0 -0
- /deltacat/{tests/compactor/utils → compute/compactor_v2/model}/__init__.py +0 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,143 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from ray.types import ObjectRef
|
4
|
+
from typing import Dict, List, Optional, Any
|
5
|
+
from deltacat.utils.metrics import MetricsConfig
|
6
|
+
from deltacat.utils.common import ReadKwargsProvider
|
7
|
+
from deltacat.io.object_store import IObjectStore
|
8
|
+
from deltacat.storage import (
|
9
|
+
Partition,
|
10
|
+
SortKey,
|
11
|
+
interface as unimplemented_deltacat_storage,
|
12
|
+
)
|
13
|
+
from deltacat.compute.compactor_v2.constants import (
|
14
|
+
DROP_DUPLICATES,
|
15
|
+
MAX_RECORDS_PER_COMPACTED_FILE,
|
16
|
+
)
|
17
|
+
from deltacat.types.media import ContentType
|
18
|
+
from deltacat.compute.compactor.model.round_completion_info import RoundCompletionInfo
|
19
|
+
from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelopeGroups
|
20
|
+
|
21
|
+
|
22
|
+
class MergeInput(Dict):
    """Keyword-style parameter bag consumed by a compactor v2 merge task.

    Backed by a plain ``dict`` so it stays cheaply serializable across Ray
    task boundaries; typed access is provided via read-only properties.
    Construct instances with :meth:`MergeInput.of` rather than directly.
    """

    @staticmethod
    def of(
        dfe_groups_refs: List[ObjectRef[DeltaFileEnvelopeGroups]],
        write_to_partition: Partition,
        compacted_file_content_type: ContentType,
        primary_keys: List[str],
        hash_group_index: int,
        num_hash_groups: int,
        hash_bucket_count: int,
        drop_duplicates: Optional[bool] = DROP_DUPLICATES,
        sort_keys: Optional[List[SortKey]] = None,
        merge_task_index: Optional[int] = 0,
        max_records_per_output_file: Optional[int] = MAX_RECORDS_PER_COMPACTED_FILE,
        enable_profiler: Optional[bool] = False,
        metrics_config: Optional[MetricsConfig] = None,
        s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
        read_kwargs_provider: Optional[ReadKwargsProvider] = None,
        round_completion_info: Optional[RoundCompletionInfo] = None,
        object_store: Optional[IObjectStore] = None,
        deltacat_storage=unimplemented_deltacat_storage,
        deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
    ) -> MergeInput:
        """Build a :class:`MergeInput` from the given merge-task parameters.

        Optional kwargs-style dictionaries (``s3_table_writer_kwargs`` and
        ``deltacat_storage_kwargs``) are normalized to ``{}`` so downstream
        code can unpack them without ``None`` checks.
        """
        result = MergeInput()
        result["dfe_groups_refs"] = dfe_groups_refs
        result["write_to_partition"] = write_to_partition
        result["compacted_file_content_type"] = compacted_file_content_type
        result["primary_keys"] = primary_keys
        result["hash_group_index"] = hash_group_index
        result["num_hash_groups"] = num_hash_groups
        result["hash_bucket_count"] = hash_bucket_count
        result["drop_duplicates"] = drop_duplicates
        result["sort_keys"] = sort_keys
        result["merge_task_index"] = merge_task_index
        result["max_records_per_output_file"] = max_records_per_output_file
        result["enable_profiler"] = enable_profiler
        result["metrics_config"] = metrics_config
        result["s3_table_writer_kwargs"] = s3_table_writer_kwargs or {}
        result["read_kwargs_provider"] = read_kwargs_provider
        result["round_completion_info"] = round_completion_info
        result["object_store"] = object_store
        result["deltacat_storage"] = deltacat_storage
        result["deltacat_storage_kwargs"] = deltacat_storage_kwargs or {}

        return result

    @property
    def dfe_groups_refs(self) -> List[ObjectRef[DeltaFileEnvelopeGroups]]:
        return self["dfe_groups_refs"]

    @property
    def write_to_partition(self) -> Partition:
        return self["write_to_partition"]

    @property
    def compacted_file_content_type(self) -> ContentType:
        return self["compacted_file_content_type"]

    @property
    def primary_keys(self) -> List[str]:
        return self["primary_keys"]

    @property
    def hash_group_index(self) -> int:
        return self["hash_group_index"]

    @property
    def num_hash_groups(self) -> int:
        return self["num_hash_groups"]

    @property
    def hash_bucket_count(self) -> int:
        return self["hash_bucket_count"]

    @property
    def drop_duplicates(self) -> bool:
        # BUG FIX: previously annotated "-> int"; the stored value is the
        # boolean DROP_DUPLICATES flag set in of().
        return self["drop_duplicates"]

    @property
    def sort_keys(self) -> Optional[List[SortKey]]:
        return self.get("sort_keys")

    @property
    def merge_task_index(self) -> int:
        return self.get("merge_task_index")

    @property
    def max_records_per_output_file(self) -> int:
        return self.get("max_records_per_output_file")

    @property
    def enable_profiler(self) -> bool:
        return self.get("enable_profiler")

    @property
    def metrics_config(self) -> Optional[MetricsConfig]:
        return self.get("metrics_config")

    @property
    def s3_table_writer_kwargs(self) -> Optional[Dict[str, Any]]:
        return self.get("s3_table_writer_kwargs")

    @property
    def read_kwargs_provider(self) -> Optional[ReadKwargsProvider]:
        return self.get("read_kwargs_provider")

    @property
    def round_completion_info(self) -> Optional[RoundCompletionInfo]:
        return self.get("round_completion_info")

    @property
    def object_store(self) -> Optional[IObjectStore]:
        return self.get("object_store")

    @property
    def deltacat_storage(self) -> unimplemented_deltacat_storage:
        return self["deltacat_storage"]

    @property
    def deltacat_storage_kwargs(self) -> Optional[Dict[str, Any]]:
        return self.get("deltacat_storage_kwargs")
|
@@ -0,0 +1,12 @@
|
|
1
|
+
from typing import NamedTuple, List
|
2
|
+
from deltacat.compute.compactor.model.materialize_result import MaterializeResult
|
3
|
+
|
4
|
+
import numpy as np
|
5
|
+
|
6
|
+
|
7
|
+
class MergeResult(NamedTuple):
    """Result payload returned by a compactor v2 merge task.

    Aggregated by the compaction session to build the session audit info.
    """

    # Materialize results produced for the compacted output of this task.
    materialize_results: List[MaterializeResult]
    # Count of records dropped as duplicates by this merge task.
    deduped_record_count: np.int64
    # Peak memory observed on the executing node, in bytes.
    peak_memory_usage_bytes: np.double
    # Time spent emitting telemetry (e.g. timer metrics), in seconds.
    telemetry_time_in_seconds: np.double
    # Task completion timestamp — presumably epoch seconds (time.time()),
    # matching the sibling hash bucket task; TODO(review): confirm at producer.
    task_completed_at: np.double
|
File without changes
|
@@ -0,0 +1,203 @@
|
|
1
|
+
import importlib
|
2
|
+
import logging
|
3
|
+
import time
|
4
|
+
from contextlib import nullcontext
|
5
|
+
from typing import List, Optional, Tuple
|
6
|
+
from deltacat.compute.compactor_v2.model.hash_bucket_input import HashBucketInput
|
7
|
+
import numpy as np
|
8
|
+
import pyarrow as pa
|
9
|
+
import ray
|
10
|
+
from deltacat import logs
|
11
|
+
from deltacat.compute.compactor import (
|
12
|
+
DeltaAnnotated,
|
13
|
+
DeltaFileEnvelope,
|
14
|
+
)
|
15
|
+
from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelopeGroups
|
16
|
+
from deltacat.compute.compactor_v2.model.hash_bucket_result import HashBucketResult
|
17
|
+
from deltacat.compute.compactor_v2.utils.primary_key_index import (
|
18
|
+
group_hash_bucket_indices,
|
19
|
+
group_by_pk_hash_bucket,
|
20
|
+
)
|
21
|
+
from deltacat.storage import interface as unimplemented_deltacat_storage
|
22
|
+
from deltacat.types.media import StorageType
|
23
|
+
from deltacat.utils.ray_utils.runtime import (
|
24
|
+
get_current_ray_task_id,
|
25
|
+
get_current_ray_worker_id,
|
26
|
+
)
|
27
|
+
from deltacat.utils.common import ReadKwargsProvider
|
28
|
+
from deltacat.utils.performance import timed_invocation
|
29
|
+
from deltacat.utils.metrics import emit_timer_metrics
|
30
|
+
from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes
|
31
|
+
|
32
|
+
if importlib.util.find_spec("memray"):
|
33
|
+
import memray
|
34
|
+
|
35
|
+
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
36
|
+
|
37
|
+
|
38
|
+
def _read_delta_file_envelopes(
    annotated_delta: DeltaAnnotated,
    read_kwargs_provider: Optional[ReadKwargsProvider],
    deltacat_storage=unimplemented_deltacat_storage,
    deltacat_storage_kwargs: Optional[dict] = None,
) -> Tuple[Optional[List[DeltaFileEnvelope]], int, int]:
    """Download an annotated delta and wrap its data in delta file envelopes.

    All downloaded tables are concatenated into a single envelope, so every
    annotation must share one stream position and one delta type.

    Args:
        annotated_delta: the annotated delta whose manifest entries to read.
        read_kwargs_provider: optional per-content-type file reader kwargs
            provider forwarded to the storage download.
        deltacat_storage: storage interface implementation to download with.
        deltacat_storage_kwargs: optional extra kwargs for the storage call.

    Returns:
        Tuple of (delta file envelopes, total record count, total size in
        bytes); the envelope list is ``None`` when no tables were downloaded.

    Raises:
        AssertionError: if table and annotation counts differ, or if the
            annotations disagree on stream position or delta type.
    """
    # BUG FIX: the default is None but the dict is unpacked below with **;
    # normalize so callers relying on the default don't get a TypeError.
    deltacat_storage_kwargs = deltacat_storage_kwargs or {}
    tables = deltacat_storage.download_delta(
        annotated_delta,
        max_parallelism=1,
        file_reader_kwargs_provider=read_kwargs_provider,
        storage_type=StorageType.LOCAL,
        **deltacat_storage_kwargs,
    )
    annotations = annotated_delta.annotations
    # BUG FIX: the original wrapped condition and message in one parenthesized
    # tuple — `assert (cond, msg)` — which is always truthy and never fires.
    assert len(tables) == len(annotations), (
        f"Unexpected Error: Length of downloaded delta manifest tables "
        f"({len(tables)}) doesn't match the length of delta manifest "
        f"annotations ({len(annotations)})."
    )
    if not tables:
        return None, 0, 0

    delta_stream_position = annotations[0].annotation_stream_position
    delta_type = annotations[0].annotation_delta_type

    # A single envelope is built below, so all annotations must agree.
    for annotation in annotations:
        assert annotation.annotation_stream_position == delta_stream_position, (
            f"Annotation stream position does not match - {annotation.annotation_stream_position} "
            f"!= {delta_stream_position}"
        )
        assert annotation.annotation_delta_type == delta_type, (
            f"Annotation delta type does not match - {annotation.annotation_delta_type} "
            f"!= {delta_type}"
        )

    delta_file_envelopes = []
    table = pa.concat_tables(tables)
    total_record_count = len(table)
    total_size_bytes = int(table.nbytes)

    delta_file = DeltaFileEnvelope.of(
        stream_position=delta_stream_position,
        delta_type=delta_type,
        table=table,
    )
    delta_file_envelopes.append(delta_file)
    return delta_file_envelopes, total_record_count, total_size_bytes
|
87
|
+
|
88
|
+
|
89
|
+
def _group_file_records_by_pk_hash_bucket(
    annotated_delta: DeltaAnnotated,
    num_hash_buckets: int,
    primary_keys: List[str],
    read_kwargs_provider: Optional[ReadKwargsProvider] = None,
    deltacat_storage=unimplemented_deltacat_storage,
    deltacat_storage_kwargs: Optional[dict] = None,
) -> Tuple[Optional[DeltaFileEnvelopeGroups], int, int]:
    """Download an annotated delta and bucket its records by primary-key hash.

    Returns a tuple of (hash-bucket-indexed envelope groups, total record
    count, total size in bytes); the groups are ``None`` when the delta
    yielded no data.
    """
    # Read the input manifest entries into a list of delta file envelopes.
    envelopes, record_count, size_bytes = _read_delta_file_envelopes(
        annotated_delta,
        read_kwargs_provider,
        deltacat_storage,
        deltacat_storage_kwargs,
    )

    if envelopes is None:
        return None, 0, 0

    logger.info(f"Read all delta file envelopes: {len(envelopes)}")

    # One object slot per hash bucket; a slot stays None until the first
    # non-empty table lands in it.
    grouped_envelopes = np.empty([num_hash_buckets], dtype="object")
    for envelope in envelopes:
        logger.info("Grouping by pk hash bucket")
        group_start = time.monotonic()
        bucketed_tables = group_by_pk_hash_bucket(
            envelope.table,
            num_hash_buckets,
            primary_keys,
        )
        logger.info(f"Grouping took: {time.monotonic() - group_start}")
        for bucket_index, bucket_table in enumerate(bucketed_tables):
            if not bucket_table:
                continue
            if grouped_envelopes[bucket_index] is None:
                grouped_envelopes[bucket_index] = []
            grouped_envelopes[bucket_index].append(
                DeltaFileEnvelope.of(
                    stream_position=envelope.stream_position,
                    file_index=envelope.file_index,
                    delta_type=envelope.delta_type,
                    table=bucket_table,
                )
            )
    return grouped_envelopes, record_count, size_bytes
|
139
|
+
|
140
|
+
|
141
|
+
def _timed_hash_bucket(input: HashBucketInput) -> HashBucketResult:
    """Group one annotated delta's records by primary-key hash bucket and
    stage the per-hash-group results in the object store.

    Invoked (and timed) by the ``hash_bucket`` Ray task; the telemetry slot
    of the returned result is left at 0.0 for the caller to fill in.
    """
    task_id = get_current_ray_task_id()
    worker_id = get_current_ray_worker_id()
    # When profiling is enabled, capture a memray allocation trace for this
    # task into a per-worker/per-task .bin file; otherwise use a no-op
    # context manager.
    # NOTE(review): memray is only imported at module load when importlib
    # finds it; enable_profiler=True without memray installed would raise
    # NameError here — confirm callers guard against that.
    with memray.Tracker(
        f"hash_bucket_{worker_id}_{task_id}.bin"
    ) if input.enable_profiler else nullcontext():
        (
            delta_file_envelope_groups,
            total_record_count,
            total_size_bytes,
        ) = _group_file_records_by_pk_hash_bucket(
            annotated_delta=input.annotated_delta,
            num_hash_buckets=input.num_hash_buckets,
            primary_keys=input.primary_keys,
            read_kwargs_provider=input.read_kwargs_provider,
            deltacat_storage=input.deltacat_storage,
            deltacat_storage_kwargs=input.deltacat_storage_kwargs,
        )
        # Distribute the bucketed envelopes across hash groups and persist
        # them to the object store, yielding per-group object references.
        hash_bucket_group_to_obj_id_tuple = group_hash_bucket_indices(
            hash_bucket_object_groups=delta_file_envelope_groups,
            num_buckets=input.num_hash_buckets,
            num_groups=input.num_hash_groups,
            object_store=input.object_store,
        )

        peak_memory_usage_bytes = get_current_node_peak_memory_usage_in_bytes()
        # Telemetry time is 0.0 here; the hash_bucket task replaces it with
        # the measured metrics-emission latency.
        return HashBucketResult(
            hash_bucket_group_to_obj_id_tuple,
            np.int64(total_size_bytes),
            np.int64(total_record_count),
            np.double(peak_memory_usage_bytes),
            np.double(0.0),
            np.double(time.time()),
        )
|
175
|
+
|
176
|
+
|
177
|
+
@ray.remote
def hash_bucket(input: HashBucketInput) -> HashBucketResult:
    """Ray task entry point for hash bucketing a single annotated delta.

    Wraps :func:`_timed_hash_bucket`, optionally emits a timer metric for
    the task's duration, and folds the metric-emission latency into the
    telemetry slot of the returned :class:`HashBucketResult`.
    """
    logger.info(f"Starting hash bucket task...")
    result, duration = timed_invocation(func=_timed_hash_bucket, input=input)

    metrics_emit_latency = 0.0
    if input.metrics_config:
        _, metrics_emit_latency = timed_invocation(
            func=emit_timer_metrics,
            metrics_name="hash_bucket",
            value=duration,
            metrics_config=input.metrics_config,
        )

    logger.info(f"Finished hash bucket task...")
    # Rebuild the result with the telemetry slot (index 4) replaced by the
    # measured metrics-emission latency; all other fields pass through.
    return HashBucketResult(
        *result[:4],
        np.double(metrics_emit_latency),
        result[5],
    )
|