deltacat 0.1.18b1__py3-none-any.whl → 0.1.18b3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor/compaction_session.py +62 -25
- deltacat/compute/compactor/model/delta_annotated.py +1 -1
- deltacat/compute/compactor/model/materialize_result.py +16 -2
- deltacat/compute/compactor/model/repartition_result.py +6 -0
- deltacat/compute/compactor/model/round_completion_info.py +8 -0
- deltacat/compute/compactor/repartition_session.py +174 -0
- deltacat/compute/compactor/steps/materialize.py +116 -27
- deltacat/compute/compactor/steps/repartition.py +210 -0
- deltacat/compute/compactor/utils/io.py +131 -49
- deltacat/compute/compactor/utils/round_completion_file.py +14 -16
- deltacat/constants.py +2 -0
- deltacat/storage/interface.py +1 -1
- deltacat/storage/model/types.py +10 -2
- deltacat/tests/compactor/utils/__init__.py +0 -0
- deltacat/tests/compactor/utils/test_io.py +69 -0
- deltacat/tests/test_repartition.py +193 -0
- deltacat/tests/test_utils/__init__.py +0 -0
- deltacat/tests/test_utils/constants.py +7 -0
- deltacat/tests/utils/test_resources.py +36 -0
- deltacat/utils/ray_utils/concurrency.py +2 -0
- deltacat/utils/resources.py +72 -0
- {deltacat-0.1.18b1.dist-info → deltacat-0.1.18b3.dist-info}/METADATA +2 -5
- {deltacat-0.1.18b1.dist-info → deltacat-0.1.18b3.dist-info}/RECORD +28 -18
- {deltacat-0.1.18b1.dist-info → deltacat-0.1.18b3.dist-info}/WHEEL +1 -1
- /deltacat/{utils/profiling.py → tests/compactor/__init__.py} +0 -0
- {deltacat-0.1.18b1.dist-info → deltacat-0.1.18b3.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b1.dist-info → deltacat-0.1.18b3.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor/steps/repartition.py
ADDED
@@ -0,0 +1,210 @@
+import importlib
+import logging
+from contextlib import nullcontext
+import pyarrow.compute as pc
+import pyarrow as pa
+from typing import List, Optional
+from deltacat.types.media import StorageType, ContentType
+import ray
+from deltacat import logs
+from deltacat.compute.compactor import DeltaAnnotated
+from deltacat.compute.compactor.model.repartition_result import RepartitionResult
+from deltacat.storage import interface as unimplemented_deltacat_storage
+from deltacat.storage import Partition
+from deltacat.utils.ray_utils.runtime import (
+    get_current_ray_task_id,
+    get_current_ray_worker_id,
+)
+from deltacat.utils.common import ReadKwargsProvider
+from deltacat.utils.performance import timed_invocation
+from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
+from deltacat.storage import Delta
+from enum import Enum
+
+if importlib.util.find_spec("memray"):
+    import memray
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+"""
+Similar to Spark (https://sparkbyexamples.com/spark/spark-partitioning-understanding/), where
+partition helps in localizing the data and reduce the data shuffling across the network nodes reducing network latency
+which is a major component of the transformation operation thereby reducing the time of completion.
+Deltacat with Ray can support different partitioning strategies to reduce the data movement either across network or between compute and storage
+Note that the term partition here is different from the term used in catalog
+Type of Partition:
+Range Partition: It assigns rows to partitions based on column values falling within a given range, e.g., repartition(column="last_updated", ranges=['2023-01-01', '2023-02-01', '2023-03-01']), data will be split into 4 files
+Hash Partition: Hash Partitioning attempts to spread the data evenly across various partitions based on the key, e.g., repartition(column="last_updated", num_partitions=10), data will be split into 10 files evenly
+"""
+
+
+class RepartitionType(str, Enum):
+    RANGE = "range"
+    HASH = "hash"
+
+
+def generate_unique_name(base_name: str, existing_names: List[str]):
+    counter = 1
+    while base_name + str(counter) in existing_names:
+        counter += 1
+    return base_name + str(counter)
+
+
+def repartition_range(
+    tables: List[pa.Table],
+    destination_partition: Partition,
+    repartition_args: dict,
+    max_records_per_output_file: int,
+    repartitioned_file_content_type: ContentType = ContentType.PARQUET,
+    deltacat_storage=unimplemented_deltacat_storage,
+):
+    """
+    Repartitions a list of Arrow tables based on specified ranges and stores the repartitioned tables.
+
+    Args:
+        tables (List[pa.Table]): List of tables to be repartitioned.
+        destination_partition (Partition): The partition to store the repartitioned tables.
+        repartition_args (dict): Arguments for repartitioning. Must include a "column" key for the column to partition
+            on and a "ranges" key for a list of partition range values.
+        max_records_per_output_file (int): Maximum number of records per output file.
+        repartitioned_file_content_type (ContentType, optional): The content type of the repartitioned files.
+            Defaults to ContentType.PARQUET.
+        deltacat_storage (unimplemented): Storage where to put the repartitioned data.
+
+    Raises:
+        ValueError: If no partition ranges are specified or if the column to partition on does not exist in the tables.
+
+    Returns:
+        RepartitionResult: Contains a list of the stored deltas for each partition range.
+    Examples:
+        repartition_args['ranges']= [x, y, z], The tables will be repartitioned into 4 files, i.e., (-inf, x], (x, y], (y, z], (z, inf)
+
+    Note:
+        The function assumes that the tables all share the same schema. If the column to partition on does not exist
+        in the tables, an error will be raised. For each partition range, a new file is created. This could result in
+        more output files than input files.
+    """
+    column: str = repartition_args["column"]
+    partition_ranges: List = repartition_args["ranges"]
+    if len(partition_ranges) == 0:
+        raise ValueError("No partition ranges specified")
+    # check if the column exists in the table
+    # TODO: design a better way to handle the case when the column does not exist in the table, e.g., backfill + repartition by stream position + file id
+    if not all(column in table.column_names for table in tables):
+        raise ValueError(f"Column {column} does not exist in the table")
+    partition_ranges.sort()
+    partition_ranges = [-float("Inf")] + partition_ranges + [float("Inf")]
+    partitioned_tables_list = [[] for _ in range(len(partition_ranges) - 1)]
+
+    total_record_count = 0
+    col_name_int64 = f"{column}_int64"
+    col_name_int64 = generate_unique_name(col_name_int64, tables[0].schema.names)
+    for table in tables:
+        total_record_count += len(table)
+        table_new = table.add_column(
+            0,
+            pa.field(col_name_int64, pa.int64()),
+            pc.cast(table[column], pa.int64()),
+        )
+        # Iterate over pairs of values in partition_ranges
+        for i, (lower_limit, upper_limit) in enumerate(
+            zip(partition_ranges[:-1], partition_ranges[1:]), start=0
+        ):
+            # Add the table filtered by the lower and upper limits to partitioned_tables_list
+            partitioned_tables_list[i].append(
+                table_new.filter(
+                    (pc.field(col_name_int64) > pc.scalar(lower_limit))
+                    & (pc.field(col_name_int64) <= pc.scalar(upper_limit))
+                )
+            )
+    partition_table_length = 0
+    # After re-grouping the tables by specified ranges, for each group, we need concat and stage the tables
+    partition_deltas: List[Delta] = []
+    for partition_tables in partitioned_tables_list:
+        if len(partition_tables) > 0:
+            partition_table: pa.Table = pa.concat_tables(partition_tables)
+            if len(partition_table) > 0:
+                partition_table_length += len(partition_table)
+                partition_delta: Delta = deltacat_storage.stage_delta(
+                    partition_table,
+                    destination_partition,
+                    max_records_per_entry=max_records_per_output_file,
+                    content_type=repartitioned_file_content_type,
+                )
+                partition_deltas.append(partition_delta)
+
+    assert (
+        partition_table_length == total_record_count
+    ), f"Repartitioned table should have the same number of records {partition_table_length} as the original table {total_record_count}"
+    return RepartitionResult(
+        range_deltas=partition_deltas,
+    )
+
+
+def _timed_repartition(
+    annotated_delta: DeltaAnnotated,
+    destination_partition: Partition,
+    repartition_type: RepartitionType,
+    repartition_args: dict,
+    max_records_per_output_file: int,
+    enable_profiler: bool,
+    read_kwargs_provider: Optional[ReadKwargsProvider],
+    repartitioned_file_content_type: ContentType = ContentType.PARQUET,
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> RepartitionResult:
+    task_id = get_current_ray_task_id()
+    worker_id = get_current_ray_worker_id()
+    with memray.Tracker(
+        f"repartition_{worker_id}_{task_id}.bin"
+    ) if enable_profiler else nullcontext():
+        tables: List[pa.Table] = deltacat_storage.download_delta(
+            annotated_delta,
+            storage_type=StorageType.LOCAL,
+            file_reader_kwargs_provider=read_kwargs_provider,
+        )
+        if repartition_type == RepartitionType.RANGE:
+            return repartition_range(
+                tables=tables,
+                destination_partition=destination_partition,
+                repartition_args=repartition_args,
+                max_records_per_output_file=max_records_per_output_file,
+                repartitioned_file_content_type=repartitioned_file_content_type,
+                deltacat_storage=deltacat_storage,
+            )
+        else:
+            raise NotImplementedError(
+                f"Repartition type {repartition_type} is not supported."
+            )
+
+
+@ray.remote
+def repartition(
+    annotated_delta: DeltaAnnotated,
+    destination_partition: Partition,
+    repartition_type: RepartitionType,
+    repartition_args: dict,
+    max_records_per_output_file: int,
+    enable_profiler: bool,
+    metrics_config: Optional[MetricsConfig],
+    read_kwargs_provider: Optional[ReadKwargsProvider],
+    repartitioned_file_content_type: ContentType = ContentType.PARQUET,
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> RepartitionResult:
+    logger.info(f"Starting repartition task...")
+    repartition_result, duration = timed_invocation(
+        func=_timed_repartition,
+        annotated_delta=annotated_delta,
+        destination_partition=destination_partition,
+        repartition_type=repartition_type,
+        repartition_args=repartition_args,
+        max_records_per_output_file=max_records_per_output_file,
+        enable_profiler=enable_profiler,
+        read_kwargs_provider=read_kwargs_provider,
+        repartitioned_file_content_type=repartitioned_file_content_type,
+        deltacat_storage=deltacat_storage,
+    )
+    if metrics_config:
+        emit_timer_metrics(
+            metrics_name="repartition", value=duration, metrics_config=metrics_config
+        )
+    return repartition_result
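
For orientation, the sketch below shows one way the new `repartition_range` helper could be driven with small in-memory Arrow tables. Everything outside the deltacat import is an assumption for illustration: the `FakeStorage` stub stands in for a concrete DeltaCAT storage implementation, the `last_updated` column and range boundaries are made up, and a real call would pass an actual staged `Partition` as the destination.

```python
# Illustrative sketch only: FakeStorage, the sample column, and the range values
# are assumptions, not part of the deltacat API.
import pyarrow as pa

from deltacat.compute.compactor.steps.repartition import repartition_range


class FakeStorage:
    """Stand-in for a concrete DeltaCAT storage implementation (assumption)."""

    def stage_delta(self, table, partition, max_records_per_entry=None, content_type=None):
        # A real implementation would persist the table and return a Delta;
        # echoing a summary keeps the sketch self-contained.
        return {"records": len(table), "partition": partition}


tables = [
    pa.table({"last_updated": [20230101, 20230215, 20230310], "value": [1, 2, 3]}),
]

# Boundaries [x, y] produce the buckets (-inf, x], (x, y], (y, inf).
result = repartition_range(
    tables=tables,
    destination_partition="illustrative-partition",  # a real Partition in practice
    repartition_args={"column": "last_updated", "ranges": [20230201, 20230301]},
    max_records_per_output_file=1_000_000,
    deltacat_storage=FakeStorage(),
)
# Assumes RepartitionResult exposes the staged deltas as `range_deltas`.
print(result.range_deltas)
```

The record-count assertion at the end of `repartition_range` guarantees that no rows are dropped while re-bucketing into range files.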
deltacat/compute/compactor/utils/io.py
CHANGED
@@ -1,7 +1,11 @@
 import logging
 import math
 from deltacat.compute.stats.models.delta_stats import DeltaStats
-from deltacat.constants import
+from deltacat.constants import (
+    PYARROW_INFLATION_MULTIPLIER,
+    BYTES_PER_MEBIBYTE,
+    MEMORY_TO_HASH_BUCKET_COUNT_RATIO,
+)
 
 from deltacat.storage import (
     PartitionLocator,
@@ -84,54 +88,6 @@ def discover_deltas(
     return input_deltas, previous_last_stream_position_compacted
 
 
-def _discover_deltas(
-    source_partition_locator: PartitionLocator,
-    start_position_exclusive: Optional[int],
-    end_position_inclusive: int,
-    deltacat_storage=unimplemented_deltacat_storage,
-    **kwargs,
-) -> List[Delta]:
-    stream_locator = source_partition_locator.stream_locator
-    namespace = stream_locator.namespace
-    table_name = stream_locator.table_name
-    table_version = stream_locator.table_version
-    partition_values = source_partition_locator.partition_values
-    deltas_list_result = deltacat_storage.list_deltas(
-        namespace=namespace,
-        table_name=table_name,
-        partition_values=partition_values,
-        table_version=table_version,
-        first_stream_position=start_position_exclusive,
-        last_stream_position=end_position_inclusive,
-        ascending_order=True,
-        include_manifest=True,
-        **kwargs,
-    )
-    deltas = deltas_list_result.all_items()
-    if not deltas:
-        raise RuntimeError(
-            f"Unexpected Error: Couldn't find any deltas to "
-            f"compact in delta stream position range "
-            f"('{start_position_exclusive}', "
-            f"'{end_position_inclusive}']. Source partition: "
-            f"{source_partition_locator}"
-        )
-    if start_position_exclusive == deltas[0].stream_position:
-        first_delta = deltas.pop(0)
-        logger.info(
-            f"Removed exclusive start delta w/ expected stream "
-            f"position '{start_position_exclusive}' from deltas to "
-            f"compact: {first_delta}"
-        )
-    logger.info(
-        f"Count of deltas to compact in delta stream "
-        f"position range ('{start_position_exclusive}', "
-        f"'{end_position_inclusive}']: {len(deltas)}. Source "
-        f"partition: '{source_partition_locator}'"
-    )
-    return deltas
-
-
 def limit_input_deltas(
     input_deltas: List[Delta],
     cluster_resources: Dict[str, float],
@@ -285,3 +241,129 @@ def limit_input_deltas(
     logger.info(f"Input uniform delta count: {len(rebatched_da_list)}")
 
     return rebatched_da_list, hash_bucket_count, high_watermark, require_multiple_rounds
+
+
+def fit_input_deltas(
+    input_deltas: List[Delta],
+    cluster_resources: Dict[str, float],
+    hash_bucket_count: Optional[int],
+    deltacat_storage=unimplemented_deltacat_storage,
+) -> Tuple[List[DeltaAnnotated], int, HighWatermark, bool]:
+    """
+    This method tries to fit all the input deltas to run into the existing cluster. Contrary to
+    'limit_input_deltas', it will not fail if the current cluster cannot run the compaction reliably.
+    It is the responsibility of the caller to ensure that they pass enough resources for the job to execute.
+
+    Note: There is a possibility that individual file could be very large, which makes the deltas non uniform.
+    In such scenarios, it is advisable to allocate multiple vCPUs to the tasks to ensure parallelism.
+
+    Args:
+        input_deltas: The input deltas to be normalized.
+        cluster_resources: Total available resources in the cluster.
+        hash_bucket_count: The hash bucket count.
+        deltacat_storage: An implementation of the DeltaCAT storage interface.
+
+    Returns:
+        Tuple of list of annotated deltas, recommended hash bucket count, high watermark,
+        and whether multiple rounds are required (which is always False)
+    """
+    worker_cpus = int(cluster_resources["CPU"])
+    total_memory = float(cluster_resources["memory"])
+    high_watermark = HighWatermark()
+    annotated_input_da_list = []
+    delta_bytes = 0
+    total_files = 0
+
+    if not input_deltas:
+        raise AssertionError("No input deltas found!")
+
+    for delta in input_deltas:
+        manifest_entries = delta.manifest.entries
+        position = delta.stream_position
+
+        for entry in manifest_entries:
+            delta_bytes += entry.meta.content_length
+
+        total_files += len(manifest_entries)
+
+        high_watermark.set(
+            delta.locator.partition_locator,
+            max(position, high_watermark.get(delta.locator.partition_locator)),
+        )
+        delta_annotated = DeltaAnnotated.of(delta)
+        annotated_input_da_list.append(delta_annotated)
+
+    # We assume that the cluster is capable of distributing all tasks
+    # correctly. Hence, the correct in-memory size will be in the ratio of
+    # in-disk size.
+    def estimate_size(content_length):
+        return (content_length * 1.0 / delta_bytes) * total_memory
+
+    # Assuming each CPU consumes equal amount of memory
+    min_delta_bytes = total_memory / worker_cpus
+    rebatched_da_list = DeltaAnnotated.rebatch(
+        annotated_deltas=annotated_input_da_list,
+        min_delta_bytes=min_delta_bytes,
+        estimation_function=estimate_size,
+    )
+
+    # Recommended hash buckets based on the experiments performed
+    # using S3 input for optimal throughput.
+    if hash_bucket_count is None:
+        hash_bucket_count = int(
+            math.ceil(total_memory / MEMORY_TO_HASH_BUCKET_COUNT_RATIO)
+        )
+
+    logger.info(
+        f"Input delta bytes: {delta_bytes}, Total files: {total_files}, The worker_cpus: {worker_cpus}, "
+        f" total_memory: {total_memory}, and hash_bucket_count: {hash_bucket_count}"
+    )
+    return rebatched_da_list, hash_bucket_count, high_watermark, False
+
+
+def _discover_deltas(
+    source_partition_locator: PartitionLocator,
+    start_position_exclusive: Optional[int],
+    end_position_inclusive: int,
+    deltacat_storage=unimplemented_deltacat_storage,
+    **kwargs,
+) -> List[Delta]:
+    stream_locator = source_partition_locator.stream_locator
+    namespace = stream_locator.namespace
+    table_name = stream_locator.table_name
+    table_version = stream_locator.table_version
+    partition_values = source_partition_locator.partition_values
+    deltas_list_result = deltacat_storage.list_deltas(
+        namespace=namespace,
+        table_name=table_name,
+        partition_values=partition_values,
+        table_version=table_version,
+        first_stream_position=start_position_exclusive,
+        last_stream_position=end_position_inclusive,
+        ascending_order=True,
+        include_manifest=True,
+        **kwargs,
+    )
+    deltas = deltas_list_result.all_items()
+    if not deltas:
+        raise RuntimeError(
+            f"Unexpected Error: Couldn't find any deltas to "
+            f"compact in delta stream position range "
+            f"('{start_position_exclusive}', "
+            f"'{end_position_inclusive}']. Source partition: "
+            f"{source_partition_locator}"
+        )
+    if start_position_exclusive == deltas[0].stream_position:
+        first_delta = deltas.pop(0)
+        logger.info(
+            f"Removed exclusive start delta w/ expected stream "
+            f"position '{start_position_exclusive}' from deltas to "
+            f"compact: {first_delta}"
+        )
+    logger.info(
+        f"Count of deltas to compact in delta stream "
+        f"position range ('{start_position_exclusive}', "
+        f"'{end_position_inclusive}']: {len(deltas)}. Source "
+        f"partition: '{source_partition_locator}'"
+    )
+    return deltas
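
The sizing heuristic in `fit_input_deltas` is easier to follow with concrete numbers. The cluster figures below are illustrative assumptions; the two formulas mirror the `estimate_size` closure and the `min_delta_bytes` computation shown above (in-memory usage assumed proportional to on-disk size, memory split evenly per CPU).

```python
# Worked example of the fit_input_deltas sizing math (all inputs are assumed values).
total_memory = 256 * 1024**3   # cluster_resources["memory"]: 256 GiB (assumed)
worker_cpus = 64               # cluster_resources["CPU"] (assumed)
delta_bytes = 100 * 1024**3    # sum of manifest entry content_length across input deltas


def estimate_size(content_length: int) -> float:
    # Same ratio as in fit_input_deltas: share of on-disk bytes times total memory.
    return (content_length * 1.0 / delta_bytes) * total_memory


# Each rebatched delta targets an equal slice of memory per CPU.
min_delta_bytes = total_memory / worker_cpus

print(estimate_size(1 * 1024**3) / 1024**3)  # 2.56 GiB estimated in-memory size
print(min_delta_bytes / 1024**3)             # 4.0 GiB target per rebatched delta
```

With `hash_bucket_count` left as None, the function would then recommend `ceil(total_memory / MEMORY_TO_HASH_BUCKET_COUNT_RATIO)` hash buckets.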
deltacat/compute/compactor/utils/round_completion_file.py
CHANGED
@@ -4,6 +4,8 @@ import logging
 from deltacat import logs
 from deltacat.compute.compactor import RoundCompletionInfo
 from deltacat.storage import PartitionLocator
+from deltacat.aws import s3u as s3_utils
+from typing import Optional
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
@@ -20,8 +22,6 @@ def read_round_completion_file(
     bucket: str, source_partition_locator: PartitionLocator
 ) -> RoundCompletionInfo:
 
-    from deltacat.aws import s3u as s3_utils
-
     round_completion_file_url = get_round_completion_file_s3_url(
         bucket,
         source_partition_locator,
@@ -37,21 +37,19 @@ def read_round_completion_file(
 
 
 def write_round_completion_file(
-    bucket: str,
-    source_partition_locator: PartitionLocator,
+    bucket: Optional[str],
+    source_partition_locator: Optional[PartitionLocator],
     round_completion_info: RoundCompletionInfo,
+    completion_file_s3_url: str = None,
 ) -> str:
 
-    from deltacat.aws import s3u as s3_utils
-
     logger.info(f"writing round completion file contents: {round_completion_info}")
-
-
-
-
-
-
-
-    )
-
-    return round_completion_file_s3_url
+    if completion_file_s3_url is None:
+        completion_file_s3_url = get_round_completion_file_s3_url(
+            bucket,
+            source_partition_locator,
+        )
+    logger.info(f"writing round completion file to: {completion_file_s3_url}")
+    s3_utils.upload(completion_file_s3_url, str(json.dumps(round_completion_info)))
+    logger.info(f"round completion file written to: {completion_file_s3_url}")
+    return completion_file_s3_url
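
A minimal sketch of the two call patterns the reworked `write_round_completion_file` now supports: deriving the S3 URL from a bucket and source partition locator as before, or writing straight to a caller-supplied URL, in which case bucket and locator may be passed as None. The bucket name, URL, and the `rci`/`locator` objects are placeholders assumed to already exist.

```python
# Hypothetical usage; `rci` is an existing RoundCompletionInfo, `locator` an
# existing PartitionLocator, and the bucket/URL values are placeholders.
from deltacat.compute.compactor.utils.round_completion_file import (
    write_round_completion_file,
)

# 1) Original behavior: the file URL is derived from bucket + source partition locator.
url = write_round_completion_file("my-bucket", locator, rci)

# 2) New behavior: write directly to an explicit URL; bucket/locator are not needed.
url = write_round_completion_file(
    bucket=None,
    source_partition_locator=None,
    round_completion_info=rci,
    completion_file_s3_url="s3://my-bucket/some/prefix/round-completion.json",
)
```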
deltacat/constants.py
CHANGED
deltacat/storage/interface.py
CHANGED
@@ -443,7 +443,7 @@ def get_partition(
 
 
 def stage_delta(
-    data: Union[LocalTable, LocalDataset, DistributedDataset],
+    data: Union[LocalTable, LocalDataset, DistributedDataset, Manifest],
     partition: Partition,
     delta_type: DeltaType = DeltaType.UPSERT,
     max_records_per_entry: Optional[int] = None,
deltacat/storage/model/types.py
CHANGED
@@ -1,15 +1,23 @@
 from enum import Enum
-from typing import
+from typing import List, Union, Any
 
 import numpy as np
 import pandas as pd
 import pyarrow as pa
+import pkg_resources
 from ray.data._internal.arrow_block import ArrowRow
 from ray.data.dataset import Dataset
 
 LocalTable = Union[pa.Table, pd.DataFrame, np.ndarray]
 LocalDataset = List[LocalTable]
-
+# Starting Ray 2.5.0, Dataset follows a strict mode (https://docs.ray.io/en/latest/data/faq.html#migrating-to-strict-mode),
+# and generic annotation is removed. So add a version checker to determine whether to use the old or new definition.
+ray_version = pkg_resources.parse_version(pkg_resources.get_distribution("ray").version)
+change_version = pkg_resources.parse_version("2.5.0")
+if ray_version < change_version:
+    DistributedDataset = Dataset[Union[ArrowRow, np.ndarray, Any]]
+else:
+    DistributedDataset = Dataset
 
 
 class DeltaType(str, Enum):
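
To see which `DistributedDataset` definition a given environment resolves to, the same `pkg_resources` version check can be run standalone, as in this small sketch (the printed strings are illustrative):

```python
# Mirrors the version gate added in deltacat/storage/model/types.py.
import pkg_resources

ray_version = pkg_resources.parse_version(pkg_resources.get_distribution("ray").version)
if ray_version < pkg_resources.parse_version("2.5.0"):
    print("Ray < 2.5.0: Dataset is generic, so DistributedDataset = Dataset[...]")
else:
    print("Ray >= 2.5.0 (strict mode): DistributedDataset = Dataset")
```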
deltacat/tests/compactor/utils/test_io.py
ADDED
@@ -0,0 +1,69 @@
+import unittest
+from unittest import mock
+from deltacat.tests.test_utils.constants import TEST_DELTA
+
+
+class TestFitInputDeltas(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.module_patcher = mock.patch.dict("sys.modules", {"ray": mock.MagicMock()})
+        cls.module_patcher.start()
+
+        super().setUpClass()
+
+    def test_sanity(self):
+        from deltacat.compute.compactor.utils import io
+
+        (
+            delta_list,
+            hash_bucket_count,
+            high_watermark,
+            require_multiple_rounds,
+        ) = io.fit_input_deltas([TEST_DELTA], {"CPU": 1, "memory": 20000000}, None)
+
+        self.assertIsNotNone(hash_bucket_count)
+        self.assertTrue(1, len(delta_list))
+        self.assertIsNotNone(high_watermark)
+        self.assertFalse(require_multiple_rounds)
+
+    def test_when_hash_bucket_count_overridden(self):
+        from deltacat.compute.compactor.utils import io
+
+        (
+            delta_list,
+            hash_bucket_count,
+            high_watermark,
+            require_multiple_rounds,
+        ) = io.fit_input_deltas([TEST_DELTA], {"CPU": 1, "memory": 20000000}, 20)
+
+        self.assertEqual(20, hash_bucket_count)
+        self.assertEqual(1, len(delta_list))
+        self.assertIsNotNone(high_watermark)
+        self.assertFalse(require_multiple_rounds)
+
+    def test_when_not_enough_memory_splits_manifest_entries(self):
+        from deltacat.compute.compactor.utils import io
+
+        (
+            delta_list,
+            hash_bucket_count,
+            high_watermark,
+            require_multiple_rounds,
+        ) = io.fit_input_deltas([TEST_DELTA], {"CPU": 2, "memory": 10}, 20)
+
+        self.assertIsNotNone(hash_bucket_count)
+        self.assertTrue(2, len(delta_list))
+        self.assertIsNotNone(high_watermark)
+        self.assertFalse(require_multiple_rounds)
+
+    def test_when_no_input_deltas(self):
+        from deltacat.compute.compactor.utils import io
+
+        with self.assertRaises(AssertionError):
+            io.fit_input_deltas([], {"CPU": 100, "memory": 20000.0}, None)
+
+    def test_when_cpu_resources_is_not_passed(self):
+        from deltacat.compute.compactor.utils import io
+
+        with self.assertRaises(KeyError):
+            io.fit_input_deltas([], {}, None)