deltacat 0.1.18b3__py3-none-any.whl → 0.1.18b7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor/compaction_session.py +184 -29
- deltacat/compute/compactor/model/compact_partition_params.py +153 -0
- deltacat/compute/compactor/model/compaction_session_audit_info.py +725 -0
- deltacat/compute/compactor/model/dedupe_result.py +3 -0
- deltacat/compute/compactor/model/delta_file_envelope.py +8 -0
- deltacat/compute/compactor/model/delta_file_locator.py +11 -6
- deltacat/compute/compactor/model/hash_bucket_result.py +3 -0
- deltacat/compute/compactor/model/materialize_result.py +27 -6
- deltacat/compute/compactor/model/round_completion_info.py +9 -0
- deltacat/compute/compactor/steps/dedupe.py +35 -19
- deltacat/compute/compactor/steps/hash_bucket.py +41 -16
- deltacat/compute/compactor/steps/materialize.py +73 -70
- deltacat/compute/compactor/utils/io.py +15 -0
- deltacat/compute/compactor/utils/primary_key_index.py +9 -15
- deltacat/compute/compactor/utils/round_completion_file.py +13 -4
- deltacat/compute/compactor/utils/system_columns.py +32 -0
- deltacat/io/__init__.py +0 -7
- deltacat/io/file_object_store.py +48 -0
- deltacat/io/memcached_object_store.py +121 -0
- deltacat/io/object_store.py +51 -0
- deltacat/io/ray_plasma_object_store.py +23 -0
- deltacat/io/redis_object_store.py +114 -0
- deltacat/io/s3_object_store.py +44 -0
- deltacat/storage/model/delta.py +2 -1
- deltacat/tests/compactor/test_compact_partition_params.py +237 -0
- deltacat/tests/compactor/utils/test_io.py +27 -5
- deltacat/tests/io/__init__.py +0 -0
- deltacat/tests/io/test_file_object_store.py +86 -0
- deltacat/tests/io/test_memcached_object_store.py +158 -0
- deltacat/tests/io/test_ray_plasma_object_store.py +54 -0
- deltacat/tests/io/test_redis_object_store.py +103 -0
- deltacat/tests/io/test_s3_object_store.py +59 -0
- deltacat/tests/utils/test_record_batch_tables.py +1 -1
- deltacat/tests/utils/test_resources.py +9 -0
- deltacat/utils/ray_utils/concurrency.py +0 -2
- deltacat/utils/resources.py +30 -18
- {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/METADATA +3 -1
- {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/RECORD +42 -27
- {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/WHEEL +0 -0
- {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/top_level.txt +0 -0
```diff
@@ -5,10 +5,10 @@ from uuid import uuid4
 from collections import defaultdict
 from contextlib import nullcontext
 from itertools import chain, repeat
-from typing import List, Optional, Tuple, Dict, Any
+from typing import List, Optional, Tuple, Dict, Any
 import pyarrow as pa
+import numpy as np
 import ray
-from ray import cloudpickle
 from deltacat import logs
 from deltacat.compute.compactor import (
     MaterializeResult,
```
```diff
@@ -27,15 +27,13 @@ from deltacat.storage import (
     PartitionLocator,
     Manifest,
     ManifestEntry,
-    LocalDataset,
-    LocalTable,
-    DistributedDataset,
 )
 from deltacat.storage import interface as unimplemented_deltacat_storage
 from deltacat.utils.common import ReadKwargsProvider
 from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES, ContentType
 from deltacat.types.tables import TABLE_CLASS_TO_SIZE_FUNC
 from deltacat.utils.performance import timed_invocation
+from deltacat.io.object_store import IObjectStore
 from deltacat.utils.pyarrow import (
     ReadKwargsProviderPyArrowCsvPureUtf8,
     ReadKwargsProviderPyArrowSchemaOverride,
```
```diff
@@ -46,6 +44,7 @@ from deltacat.utils.ray_utils.runtime import (
     get_current_ray_worker_id,
 )
 from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
+from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes

 if importlib.util.find_spec("memray"):
     import memray
```
```diff
@@ -62,29 +61,15 @@ def materialize(
     dedupe_task_idx_and_obj_id_tuples: List[DedupeTaskIndexWithObjectId],
     max_records_per_output_file: int,
     compacted_file_content_type: ContentType,
+    enable_manifest_entry_copy_by_reference: bool,
     enable_profiler: bool,
     metrics_config: MetricsConfig,
     schema: Optional[pa.Schema] = None,
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
     s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
+    object_store: Optional[IObjectStore] = None,
     deltacat_storage=unimplemented_deltacat_storage,
 ):
-    def _stage_delta_implementation(
-        data: Union[LocalTable, LocalDataset, DistributedDataset, Manifest],
-        partition: Partition,
-        stage_delta_from_existing_manifest: Optional[bool],
-    ) -> Delta:
-        if stage_delta_from_existing_manifest:
-            delta = Delta.of(
-                locator=DeltaLocator.of(partition.locator),
-                delta_type=DeltaType.UPSERT,
-                meta=manifest.meta,
-                manifest=data,
-                previous_stream_position=partition.stream_position,
-                properties={},
-            )
-        return delta
-
     def _stage_delta_from_manifest_entry_reference_list(
         manifest_entry_list_reference: List[ManifestEntry],
         partition: Partition,
```
```diff
@@ -94,11 +79,13 @@ def materialize(
             delta_type == DeltaType.UPSERT
         ), "Stage delta with existing manifest entries only supports UPSERT delta type!"
         manifest = Manifest.of(entries=manifest_entry_list_reference, uuid=str(uuid4()))
-        delta =
-
-            partition=partition,
+        delta = Delta.of(
+            locator=DeltaLocator.of(partition.locator),
             delta_type=delta_type,
-
+            meta=manifest.meta,
+            manifest=manifest,
+            previous_stream_position=partition.stream_position,
+            properties={},
         )
         return delta

```
```diff
@@ -160,18 +147,11 @@ def materialize(
         f"dedupe_{worker_id}_{task_id}.bin"
     ) if enable_profiler else nullcontext():
         start = time.time()
-        dedupe_task_idx_and_obj_ref_tuples = [
-            (
-                t1,
-                cloudpickle.loads(t2),
-            )
-            for t1, t2 in dedupe_task_idx_and_obj_id_tuples
-        ]
         logger.info(f"Resolved materialize task obj refs...")
-        dedupe_task_indices, obj_refs = zip(*
+        dedupe_task_indices, obj_refs = zip(*dedupe_task_idx_and_obj_id_tuples)
         # this depends on `ray.get` result order matching input order, as per the
         # contract established in: https://github.com/ray-project/ray/pull/16763
-        src_file_records_list =
+        src_file_records_list = object_store.get_many(list(obj_refs))
         all_src_file_records = defaultdict(list)
         for i, src_file_records in enumerate(src_file_records_list):
             dedupe_task_idx = dedupe_task_indices[i]
```
```diff
@@ -195,13 +175,13 @@ def materialize(
             is_src_partition_file_np = src_dfl.is_source_delta
             src_stream_position_np = src_dfl.stream_position
             src_file_idx_np = src_dfl.file_index
+            src_file_record_count = src_dfl.file_record_count.item()
             count_of_src_dfl += 1
             src_file_partition_locator = (
                 source_partition_locator
                 if is_src_partition_file_np
                 else round_completion_info.compacted_delta_locator.partition_locator
             )
-
             delta_locator = DeltaLocator.of(
                 src_file_partition_locator,
                 src_stream_position_np.item(),
```
```diff
@@ -223,43 +203,45 @@ def materialize(
                 read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
                     schema=schema
                 )
-            pa_table, download_delta_manifest_entry_time = timed_invocation(
-                deltacat_storage.download_delta_manifest_entry,
-                Delta.of(delta_locator, None, None, None, manifest),
-                src_file_idx_np.item(),
-                file_reader_kwargs_provider=read_kwargs_provider,
-            )
-            logger.debug(
-                f"Time taken for materialize task"
-                f" to download delta locator {delta_locator} with entry ID {src_file_idx_np.item()}"
-                f" is: {download_delta_manifest_entry_time}s"
-            )
             record_numbers = chain.from_iterable(record_numbers_tpl)
             record_numbers_length = 0
-            mask_pylist = list(repeat(False,
+            mask_pylist = list(repeat(False, src_file_record_count))
             for record_number in record_numbers:
                 record_numbers_length += 1
                 mask_pylist[record_number] = True
             if (
-
+                round_completion_info
+                and enable_manifest_entry_copy_by_reference
+                and record_numbers_length == src_file_record_count
                 and src_file_partition_locator
                 == round_completion_info.compacted_delta_locator.partition_locator
             ):
                 logger.debug(
                     f"Untouched manifest file found, "
                     f"record numbers length: {record_numbers_length} "
-                    f"same as downloaded table length: {
+                    f"same as downloaded table length: {src_file_record_count}"
                 )
                 untouched_src_manifest_entry = manifest.entries[src_file_idx_np.item()]
                 manifest_entry_list_reference.append(untouched_src_manifest_entry)
                 referenced_pyarrow_write_result = PyArrowWriteResult.of(
-
-
-
-
+                    1,
+                    untouched_src_manifest_entry.meta.source_content_length,
+                    untouched_src_manifest_entry.meta.content_length,
+                    src_file_record_count,
                 )
                 referenced_pyarrow_write_results.append(referenced_pyarrow_write_result)
             else:
+                pa_table, download_delta_manifest_entry_time = timed_invocation(
+                    deltacat_storage.download_delta_manifest_entry,
+                    Delta.of(delta_locator, None, None, None, manifest),
+                    src_file_idx_np.item(),
+                    file_reader_kwargs_provider=read_kwargs_provider,
+                )
+                logger.debug(
+                    f"Time taken for materialize task"
+                    f" to download delta locator {delta_locator} with entry ID {src_file_idx_np.item()}"
+                    f" is: {download_delta_manifest_entry_time}s"
+                )
             mask = pa.array(mask_pylist)
             pa_table = pa_table.filter(mask)
             record_batch_tables.append(pa_table)
```
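The hunk above adds manifest-entry copy-by-reference to the materialize step: when a previous round completion exists, the feature flag is on, every record in a source file survived dedupe, and the file already lives in the compacted partition, the task re-references the existing manifest entry instead of downloading and rewriting the file. The predicate below is a hypothetical paraphrase of that `if` condition for readability, not a deltacat helper.

```python
# Hypothetical helper (not part of deltacat): paraphrases the copy-by-reference check above.
def can_copy_by_reference(
    enable_copy_by_reference: bool,
    round_completion_info,          # prior round's RoundCompletionInfo, or None
    surviving_record_count: int,    # records kept after dedupe for this source file
    src_file_record_count: int,     # total records in the source file (new system column)
    src_file_partition_locator,     # partition the source file belongs to
) -> bool:
    """True when an untouched compacted file can be re-referenced instead of rewritten."""
    return (
        round_completion_info is not None
        and enable_copy_by_reference
        and surviving_record_count == src_file_record_count
        and src_file_partition_locator
        == round_completion_info.compacted_delta_locator.partition_locator
    )
```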
```diff
@@ -274,15 +256,11 @@ def materialize(

         referenced_manifest_delta = (
             _stage_delta_from_manifest_entry_reference_list(
-                manifest_entry_list_reference
+                manifest_entry_list_reference, partition
             )
             if manifest_entry_list_reference
             else None
         )
-        if referenced_manifest_delta:
-            logger.info(
-                f"Got delta with {len(referenced_manifest_delta.manifest.entries)} referenced manifest entries"
-            )

         merged_materialized_delta = [mr.delta for mr in materialized_results]
         merged_materialized_delta.append(referenced_manifest_delta)
```
```diff
@@ -290,33 +268,58 @@ def materialize(
             [d for d in merged_materialized_delta if d is not None]
         )

-        write_results_union = referenced_pyarrow_write_results
+        write_results_union = [*referenced_pyarrow_write_results]
         if materialized_results:
             for mr in materialized_results:
                 write_results_union.append(mr.pyarrow_write_result)
         write_result = PyArrowWriteResult.union(write_results_union)
+        referenced_write_result = PyArrowWriteResult.union(
+            referenced_pyarrow_write_results
+        )
+
+        if referenced_manifest_delta:
+            logger.info(
+                f"Got delta with {len(referenced_manifest_delta.manifest.entries)} referenced manifest entries"
+            )
+            assert referenced_write_result.files == len(
+                referenced_manifest_delta.manifest.entries
+            ), "The files referenced must match with the entries in the delta"
+
+        assert write_result.files == len(
+            merged_delta.manifest.entries
+        ), "The total number of files written by materialize must match manifest entries"

         logger.debug(
-            f"{
-            f" with records: {
-        )
-        # Merge all new deltas into one for this materialize bucket index
-        merged_materialize_result = MaterializeResult.of(
-            merged_delta,
-            mat_bucket_index,
-            write_result,
-            len(manifest_entry_list_reference),
-            count_of_src_dfl,
+            f"{write_result.files} files written"
+            f" with records: {write_result.records}"
         )

         logger.info(f"Finished materialize task...")
         end = time.time()
         duration = end - start
+
+        emit_metrics_time = 0.0
         if metrics_config:
-
+            emit_result, latency = timed_invocation(
+                func=emit_timer_metrics,
                 metrics_name="materialize",
                 value=duration,
                 metrics_config=metrics_config,
             )
+            emit_metrics_time = latency
         logger.info(f"Materialize task ended in {end - start}s")
+
+        peak_memory_usage_bytes = get_current_node_peak_memory_usage_in_bytes()
+
+        # Merge all new deltas into one for this materialize bucket index
+        merged_materialize_result = MaterializeResult.of(
+            merged_delta,
+            mat_bucket_index,
+            write_result,
+            referenced_write_result,
+            np.double(peak_memory_usage_bytes),
+            np.double(emit_metrics_time),
+            np.double(time.time()),
+        )
+
         return merged_materialize_result
```
```diff
@@ -16,6 +16,9 @@ from deltacat import logs
 from deltacat.compute.compactor import DeltaAnnotated
 from typing import Dict, List, Optional, Tuple, Union
 from deltacat.compute.compactor import HighWatermark
+from deltacat.compute.compactor.model.compaction_session_audit_info import (
+    CompactionSessionAuditInfo,
+)

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

```
```diff
@@ -94,6 +97,7 @@ def limit_input_deltas(
     hash_bucket_count: int,
     user_hash_bucket_chunk_size: int,
     input_deltas_stats: Dict[int, DeltaStats],
+    compaction_audit: CompactionSessionAuditInfo,
     deltacat_storage=unimplemented_deltacat_storage,
 ) -> Tuple[List[DeltaAnnotated], int, HighWatermark, bool]:
     # TODO (pdames): when row counts are available in metadata, use them
```
```diff
@@ -236,6 +240,11 @@ def limit_input_deltas(
         # TODO (pdames): Test and add value for min_file_counts
     )

+    compaction_audit.set_input_size_bytes(delta_bytes)
+    compaction_audit.set_input_file_count(delta_manifest_entries)
+    compaction_audit.set_total_cluster_memory_bytes(worker_task_mem)
+    compaction_audit.set_hash_bucket_count(hash_bucket_count)
+
     logger.info(f"Hash bucket chunk size: {hash_bucket_chunk_size}")
     logger.info(f"Hash bucket count: {hash_bucket_count}")
     logger.info(f"Input uniform delta count: {len(rebatched_da_list)}")
```
```diff
@@ -246,6 +255,7 @@ def limit_input_deltas(
 def fit_input_deltas(
     input_deltas: List[Delta],
     cluster_resources: Dict[str, float],
+    compaction_audit: CompactionSessionAuditInfo,
     hash_bucket_count: Optional[int],
     deltacat_storage=unimplemented_deltacat_storage,
 ) -> Tuple[List[DeltaAnnotated], int, HighWatermark, bool]:
```
```diff
@@ -314,6 +324,11 @@ def fit_input_deltas(
         math.ceil(total_memory / MEMORY_TO_HASH_BUCKET_COUNT_RATIO)
     )

+    compaction_audit.set_input_file_count(total_files)
+    compaction_audit.set_input_size_bytes(delta_bytes)
+    compaction_audit.set_total_cluster_memory_bytes(total_memory)
+    compaction_audit.set_hash_bucket_count(hash_bucket_count)
+
     logger.info(
         f"Input delta bytes: {delta_bytes}, Total files: {total_files}, The worker_cpus: {worker_cpus}, "
         f" total_memory: {total_memory}, and hash_bucket_count: {hash_bucket_count}"
```
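Both delta-sizing paths (`limit_input_deltas` and `fit_input_deltas`) now record their sizing decisions on the `compaction_audit` object they receive. The real `CompactionSessionAuditInfo` model is added elsewhere in this release (725 new lines, see the file list) and is not shown in this diff; the class below is a hypothetical stand-in that only mirrors the four setters these two call sites rely on.

```python
# Hypothetical stand-in for illustration only; the real class lives in
# deltacat/compute/compactor/model/compaction_session_audit_info.py.
from dataclasses import dataclass
from typing import Optional


@dataclass
class AuditInfoSketch:
    input_size_bytes: Optional[float] = None
    input_file_count: Optional[int] = None
    total_cluster_memory_bytes: Optional[float] = None
    hash_bucket_count: Optional[int] = None

    def set_input_size_bytes(self, value: float) -> None:
        self.input_size_bytes = value

    def set_input_file_count(self, value: int) -> None:
        self.input_file_count = value

    def set_total_cluster_memory_bytes(self, value: float) -> None:
        self.total_cluster_memory_bytes = value

    def set_hash_bucket_count(self, value: int) -> None:
        self.hash_bucket_count = value
```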
```diff
@@ -7,7 +7,6 @@ import numpy as np
 import pyarrow as pa
 import ray
 import s3fs
-from ray import cloudpickle
 from ray.types import ObjectRef

 from deltacat import logs
```
```diff
@@ -30,6 +29,7 @@ from deltacat.types.media import ContentEncoding, ContentType
 from deltacat.types.tables import get_table_slicer, get_table_writer
 from deltacat.utils.common import ReadKwargsProvider
 from deltacat.utils.ray_utils.concurrency import invoke_parallel
+from deltacat.io.object_store import IObjectStore

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

```
```diff
@@ -203,7 +203,10 @@ def group_record_indices_by_hash_bucket(


 def group_hash_bucket_indices(
-    hash_bucket_object_groups: np.ndarray,
+    hash_bucket_object_groups: np.ndarray,
+    num_buckets: int,
+    num_groups: int,
+    object_store: Optional[IObjectStore] = None,
 ) -> Tuple[np.ndarray, List[ObjectRef]]:
     """
     Groups all the ObjectRef that belongs to a particular hash bucket group and hash bucket index.
```
```diff
@@ -226,19 +229,10 @@
     for hb_group, obj in enumerate(hb_group_to_object):
         if obj is None:
             continue
-
-
-
-
-        # NOTE: The cloudpickle.dumps API call creates an out of band object reference to the object_ref variable.
-        # After pickling, Ray cannot track the serialized copy of the object or determine when the ObjectRef has been deserialized
-        # (e.g., if the ObjectRef is deserialized by a non-Ray process).
-        # Thus the object_ref cannot be tracked by Ray's distributed reference counter, even if it goes out of scope.
-        # The object now has a permanent reference and the data can't be freed from Ray’s object store.
-        # Manually deleting the untrackable object references offsets these permanent references and
-        # helps to allow these objects to be garbage collected normally.
-        del obj_ref
-        del pickled_obj_ref
+        object_ref = object_store.put(obj)
+        object_refs.append(object_ref)
+        hash_bucket_group_to_obj_id[hb_group] = object_ref
+        del object_ref
     return hash_bucket_group_to_obj_id, object_refs


```
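With this change, the hash-bucket step hands each group to a pluggable `object_store.put(obj)` call, and the materialize hunk earlier resolves those refs with `object_store.get_many(list(obj_refs))`, replacing the cloudpickle plus raw Ray `ObjectRef` plumbing. The abstract `IObjectStore` base itself (deltacat/io/object_store.py, 51 new lines) is not shown in this diff, so the sketch below is an assumed minimal in-memory implementation covering only the methods these call sites exercise.

```python
# Assumed interface sketch: an in-memory IObjectStore-like class exposing only the
# methods used by the compactor call sites in this diff (put, put_many, get, get_many).
# The real abstract base lives in deltacat/io/object_store.py and may differ.
import uuid
from typing import Any, Dict, List


class InMemoryObjectStore:
    def __init__(self) -> None:
        self._data: Dict[str, object] = {}

    def put(self, obj: object, *args, **kwargs) -> Any:
        ref = str(uuid.uuid4())
        self._data[ref] = obj
        return ref

    def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
        return [self.put(obj) for obj in objects]

    def get(self, ref: Any, *args, **kwargs) -> object:
        return self._data[ref]

    def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
        # materialize relies on result order matching the input ref order
        return [self._data[ref] for ref in refs]
```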
```diff
@@ -1,6 +1,6 @@
 import json
 import logging
-
+from typing import Dict, Any
 from deltacat import logs
 from deltacat.compute.compactor import RoundCompletionInfo
 from deltacat.storage import PartitionLocator
```
```diff
@@ -19,7 +19,9 @@ def get_round_completion_file_s3_url(


 def read_round_completion_file(
-    bucket: str,
+    bucket: str,
+    source_partition_locator: PartitionLocator,
+    **s3_client_kwargs: Optional[Dict[str, Any]],
 ) -> RoundCompletionInfo:

     round_completion_file_url = get_round_completion_file_s3_url(
```
```diff
@@ -28,7 +30,7 @@ def read_round_completion_file(
     )
     logger.info(f"reading round completion file from: {round_completion_file_url}")
     round_completion_info = None
-    result = s3_utils.download(round_completion_file_url, False)
+    result = s3_utils.download(round_completion_file_url, False, **s3_client_kwargs)
     if result:
         json_str = result["Body"].read().decode("utf-8")
         round_completion_info = RoundCompletionInfo(json.loads(json_str))
```
```diff
@@ -41,7 +43,10 @@ def write_round_completion_file(
     source_partition_locator: Optional[PartitionLocator],
     round_completion_info: RoundCompletionInfo,
     completion_file_s3_url: str = None,
+    **s3_client_kwargs: Optional[Dict[str, Any]],
 ) -> str:
+    if bucket is None and completion_file_s3_url is None:
+        raise AssertionError("Either bucket or completion_file_s3_url must be passed")

     logger.info(f"writing round completion file contents: {round_completion_info}")
     if completion_file_s3_url is None:
```
```diff
@@ -50,6 +55,10 @@ def write_round_completion_file(
            source_partition_locator,
        )
    logger.info(f"writing round completion file to: {completion_file_s3_url}")
-    s3_utils.upload(
+    s3_utils.upload(
+        completion_file_s3_url,
+        str(json.dumps(round_completion_info)),
+        **s3_client_kwargs,
+    )
    logger.info(f"round completion file written to: {completion_file_s3_url}")
    return completion_file_s3_url
```
```diff
@@ -64,6 +64,13 @@ _IS_SOURCE_COLUMN_FIELD = pa.field(
     _IS_SOURCE_COLUMN_TYPE,
 )

+_FILE_RECORD_COUNT_COLUMN_NAME = _get_sys_col_name("file_record_count")
+_FILE_RECORD_COUNT_COLUMN_TYPE = pa.int64()
+_FILE_RECORD_COUNT_COLUMN_FIELD = pa.field(
+    _FILE_RECORD_COUNT_COLUMN_NAME,
+    _FILE_RECORD_COUNT_COLUMN_TYPE,
+)
+

 def get_pk_hash_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
     return pa.array(obj, _PK_HASH_COLUMN_TYPE)
```
```diff
@@ -143,6 +150,17 @@ def get_is_source_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
     )


+def file_record_count_column_np(table: pa.Table) -> np.ndarray:
+    return table[_FILE_RECORD_COUNT_COLUMN_NAME].to_numpy()
+
+
+def get_file_record_count_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
+    return pa.array(
+        obj,
+        _FILE_RECORD_COUNT_COLUMN_TYPE,
+    )
+
+
 def project_delta_file_metadata_on_table(
     delta_file_envelope: DeltaFileEnvelope,
 ) -> pa.Table:
```
```diff
@@ -179,6 +197,12 @@ def project_delta_file_metadata_on_table(
         len(table),
     )
     table = append_is_source_col(table, is_source_iterator)
+
+    # append row count column
+    file_record_count_iterator = repeat(
+        delta_file_envelope.file_record_count, len(table)
+    )
+    table = append_file_record_count_col(table, file_record_count_iterator)
     return table


```
```diff
@@ -252,6 +276,14 @@ def append_is_source_col(table: pa.Table, booleans) -> pa.Table:
     return table


+def append_file_record_count_col(table: pa.Table, file_record_count):
+    table = table.append_column(
+        _FILE_RECORD_COUNT_COLUMN_FIELD,
+        get_file_record_count_column_array(file_record_count),
+    )
+    return table
+
+
 def get_minimal_hb_schema() -> pa.schema:
     return pa.schema(
         [
```
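The new `file_record_count` system column carries each source file's total record count from hash bucketing through dedupe to materialize, where it sizes the filter mask and drives the copy-by-reference check. The snippet below is a minimal sketch of the same append pattern using only public pyarrow calls; it uses a placeholder column name rather than the module's private `_get_sys_col_name` prefix.

```python
# Illustrative only: mirrors the append_file_record_count_col pattern above with
# public pyarrow APIs and a placeholder column name.
from itertools import repeat

import pyarrow as pa

table = pa.table({"pk": ["a", "b", "c"]})
file_record_count = 3  # total records in the originating delta file

field = pa.field("file_record_count", pa.int64())  # placeholder system column name
column = pa.array(list(repeat(file_record_count, len(table))), pa.int64())
table = table.append_column(field, column)  # every row now carries the per-file count
```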
deltacat/io/__init__.py CHANGED
```diff
@@ -0,0 +1,48 @@
+import logging
+from ray import cloudpickle
+import time
+from deltacat.io.object_store import IObjectStore
+from typing import Any, List
+from deltacat import logs
+import os
+import uuid
+from builtins import open
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+class FileObjectStore(IObjectStore):
+    """
+    An implementation of object store that uses file system.
+    """
+
+    def __init__(self, dir_path: str) -> None:
+        self.dir_path = dir_path
+        super().__init__()
+
+    def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
+        result = []
+
+        for obj in objects:
+            serialized = cloudpickle.dumps(obj)
+            ref = f"{self.dir_path}/{uuid.uuid4()}"
+            with open(ref, "xb") as f:
+                f.write(serialized)
+
+            result.append(ref)
+
+        return result
+
+    def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
+        result = []
+        start = time.monotonic()
+        for ref in refs:
+            with open(ref, "rb") as f:
+                serialized = f.read()
+                loaded = cloudpickle.loads(serialized)
+                result.append(loaded)
+            os.remove(ref)
+        end = time.monotonic()
+
+        logger.info(f"The total time taken to read all objects is: {end - start}")
+        return result
```
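A short usage sketch for the `FileObjectStore` shown above. It assumes the class is importable from `deltacat.io.file_object_store` (the path in the file list), that the target directory already exists, and that it is reachable from every worker (for example, a shared filesystem mount); the sample data is illustrative.

```python
import tempfile

from deltacat.io.file_object_store import FileObjectStore

staging_dir = tempfile.mkdtemp()
store = FileObjectStore(dir_path=staging_dir)

refs = store.put_many([{"hb_index": 0}, {"hb_index": 1}])  # refs are file paths under staging_dir
objects = store.get_many(refs)  # deserializes the files and deletes them afterwards
print(objects)  # [{'hb_index': 0}, {'hb_index': 1}]
```

Because `get_many` removes each backing file after reading it, a ref is effectively single-use.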
```diff
@@ -0,0 +1,121 @@
+import logging
+from ray import cloudpickle
+from collections import defaultdict
+import time
+from deltacat.io.object_store import IObjectStore
+from typing import Any, List
+from deltacat import logs
+import uuid
+import socket
+from pymemcache.client.base import Client
+from pymemcache.client.retrying import RetryingClient
+from pymemcache.exceptions import MemcacheUnexpectedCloseError
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+class MemcachedObjectStore(IObjectStore):
+    """
+    An implementation of object store that uses Memcached.
+    """
+
+    def __init__(self, port=11212) -> None:
+        self.client_cache = {}
+        self.current_ip = None
+        self.SEPARATOR = "_"
+        self.port = port
+        super().__init__()
+
+    def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
+        input = {}
+        result = []
+        current_ip = self._get_current_ip()
+        for obj in objects:
+            serialized = cloudpickle.dumps(obj)
+            uid = uuid.uuid4()
+            ref = self._create_ref(uid, current_ip)
+            input[uid.__str__()] = serialized
+            result.append(ref)
+
+        client = self._get_client_by_ip(current_ip)
+        if client.set_many(input, noreply=False):
+            raise RuntimeError("Unable to write few keys to cache")
+
+        return result
+
+    def put(self, obj: object, *args, **kwargs) -> Any:
+        serialized = cloudpickle.dumps(obj)
+        uid = uuid.uuid4()
+        current_ip = self._get_current_ip()
+        ref = self._create_ref(uid, current_ip)
+        client = self._get_client_by_ip(current_ip)
+
+        if client.set(uid.__str__(), serialized):
+            return ref
+        else:
+            raise RuntimeError("Unable to write to cache")
+
+    def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
+        result = []
+        uid_per_ip = defaultdict(lambda: [])
+
+        start = time.monotonic()
+        for ref in refs:
+            uid, ip = ref.split(self.SEPARATOR)
+            uid_per_ip[ip].append(uid)
+
+        for (ip, uids) in uid_per_ip.items():
+            client = self._get_client_by_ip(ip)
+            cache_result = client.get_many(uids)
+            assert len(cache_result) == len(
+                uids
+            ), f"Not all values were returned from cache as {len(cache_result)} != {len(uids)}"
+
+            values = cache_result.values()
+            total_bytes = 0
+
+            deserialize_start = time.monotonic()
+            for serialized in values:
+                deserialized = cloudpickle.loads(serialized)
+                total_bytes += len(serialized)
+                result.append(deserialized)
+
+            deserialize_end = time.monotonic()
+            logger.debug(
+                f"The time taken to deserialize {total_bytes} bytes is: {deserialize_end - deserialize_start}",
+            )
+
+        end = time.monotonic()
+
+        logger.info(f"The total time taken to read all objects is: {end - start}")
+        return result
+
+    def get(self, ref: Any, *args, **kwargs) -> object:
+        uid, ip = ref.split(self.SEPARATOR)
+        client = self._get_client_by_ip(ip)
+        serialized = client.get(uid)
+        return cloudpickle.loads(serialized)
+
+    def _create_ref(self, uid, ip) -> str:
+        return f"{uid}{self.SEPARATOR}{ip}"
+
+    def _get_client_by_ip(self, ip_address: str):
+        if ip_address in self.client_cache:
+            return self.client_cache[ip_address]
+
+        base_client = Client((ip_address, self.port))
+        client = RetryingClient(
+            base_client,
+            attempts=3,
+            retry_delay=0.01,
+            retry_for=[MemcacheUnexpectedCloseError],
+        )
+
+        self.client_cache[ip_address] = client
+        return client
+
+    def _get_current_ip(self):
+        if self.current_ip is None:
+            self.current_ip = socket.gethostbyname(socket.gethostname())
+
+        return self.current_ip
```
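A usage sketch for the `MemcachedObjectStore` shown above. It assumes the class is importable from `deltacat.io.memcached_object_store` (the path in the file list) and that a memcached server is listening on the configured port on the node that writes each object; since refs embed the writer's IP, readers must be able to reach that node's memcached instance.

```python
from deltacat.io.memcached_object_store import MemcachedObjectStore

store = MemcachedObjectStore(port=11212)  # default port per the constructor above

ref = store.put({"hb_index": 0})  # ref looks like "<uuid>_<writer-ip>"
print(store.get(ref))             # {'hb_index': 0}

refs = store.put_many(["a", "b", "c"])
values = store.get_many(refs)     # order follows memcached's get_many result,
print(values)                     # which may not match the input ref order
```

Unlike `FileObjectStore.get_many`, reads here do not delete the cached entries, so refs can be resolved more than once until memcached evicts them.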