deltacat 0.1.18b4__py3-none-any.whl → 0.1.18b7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor/compaction_session.py +46 -6
- deltacat/compute/compactor/steps/dedupe.py +9 -14
- deltacat/compute/compactor/steps/hash_bucket.py +5 -3
- deltacat/compute/compactor/steps/materialize.py +18 -37
- deltacat/compute/compactor/utils/primary_key_index.py +9 -15
- deltacat/compute/compactor/utils/round_completion_file.py +11 -4
- deltacat/io/__init__.py +0 -7
- deltacat/io/file_object_store.py +48 -0
- deltacat/io/memcached_object_store.py +121 -0
- deltacat/io/object_store.py +51 -0
- deltacat/io/ray_plasma_object_store.py +23 -0
- deltacat/io/redis_object_store.py +114 -0
- deltacat/io/s3_object_store.py +44 -0
- deltacat/tests/compactor/utils/test_io.py +4 -0
- deltacat/tests/io/__init__.py +0 -0
- deltacat/tests/io/test_file_object_store.py +86 -0
- deltacat/tests/io/test_memcached_object_store.py +158 -0
- deltacat/tests/io/test_ray_plasma_object_store.py +54 -0
- deltacat/tests/io/test_redis_object_store.py +103 -0
- deltacat/tests/io/test_s3_object_store.py +59 -0
- deltacat/tests/utils/test_resources.py +4 -0
- {deltacat-0.1.18b4.dist-info → deltacat-0.1.18b7.dist-info}/METADATA +3 -1
- {deltacat-0.1.18b4.dist-info → deltacat-0.1.18b7.dist-info}/RECORD +27 -15
- {deltacat-0.1.18b4.dist-info → deltacat-0.1.18b7.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b4.dist-info → deltacat-0.1.18b7.dist-info}/WHEEL +0 -0
- {deltacat-0.1.18b4.dist-info → deltacat-0.1.18b7.dist-info}/top_level.txt +0 -0
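The diffs below center on a new pluggable object store layer: the compaction steps now exchange intermediate hash-bucket and dedupe results through an object_store argument (defaulting to RayPlasmaObjectStore) instead of hand-pickled Ray object refs, and the S3 helpers gain an explicit s3_client_kwargs pass-through. As a non-authoritative orientation sketch only, inferred from the put/put_many/get_many calls visible in these diffs rather than copied from the deltacat source (the put-to-put_many delegation in particular is an assumption), the IObjectStore contract plausibly looks like this:

# Sketch of the IObjectStore contract inferred from calls in this diff
# (object_store.put(...), object_store.get_many(...), FileObjectStore.put_many(...)).
# Not the actual deltacat definition; put() delegating to put_many() is an assumption.
from typing import Any, List


class IObjectStore:
    def put(self, obj: object, *args, **kwargs) -> Any:
        # Store one object and return an opaque reference to it.
        return self.put_many([obj], *args, **kwargs)[0]

    def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
        # Store many objects and return one reference per input object.
        raise NotImplementedError

    def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
        # Resolve references back into objects, preserving input order
        # (the materialize step relies on results matching the order of the refs).
        raise NotImplementedError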
deltacat/compute/compactor/compaction_session.py
CHANGED
@@ -16,6 +16,8 @@ from deltacat.compute.compactor import (
 )
 from deltacat.compute.compactor.model.dedupe_result import DedupeResult
 from deltacat.compute.compactor.model.hash_bucket_result import HashBucketResult
+from deltacat.io.object_store import IObjectStore
+from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
 from deltacat.compute.compactor.model.materialize_result import MaterializeResult
 from deltacat.compute.stats.models.delta_stats import DeltaStats
 from deltacat.storage import (
@@ -112,6 +114,8 @@ def compact_partition(
     list_deltas_kwargs: Optional[Dict[str, Any]] = None,
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
     s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
+    object_store: Optional[IObjectStore] = RayPlasmaObjectStore(),
+    s3_client_kwargs: Optional[Dict[str, Any]] = None,
     deltacat_storage=unimplemented_deltacat_storage,
     **kwargs,
 ) -> Optional[str]:
@@ -151,6 +155,8 @@ def compact_partition(
             list_deltas_kwargs,
             read_kwargs_provider,
             s3_table_writer_kwargs,
+            object_store,
+            s3_client_kwargs,
             deltacat_storage,
             **kwargs,
         )
@@ -196,6 +202,8 @@ def _execute_compaction_round(
     list_deltas_kwargs: Optional[Dict[str, Any]],
     read_kwargs_provider: Optional[ReadKwargsProvider],
     s3_table_writer_kwargs: Optional[Dict[str, Any]],
+    object_store: Optional[IObjectStore],
+    s3_client_kwargs: Optional[Dict[str, Any]],
     deltacat_storage=unimplemented_deltacat_storage,
     **kwargs,
 ) -> Tuple[Optional[Partition], Optional[RoundCompletionInfo], Optional[str]]:
@@ -287,6 +295,13 @@ def _execute_compaction_round(
     )
     logger.info(f"Round completion file: {round_completion_info}")

+    enable_manifest_entry_copy_by_reference = (
+        False if rebase_source_partition_locator else True
+    )
+    logger.info(
+        f"Enable manifest entry copy by reference is set to: {enable_manifest_entry_copy_by_reference}"
+    )
+
     # discover input delta files
     # For rebase:
     # Copy the old compacted table to a new destination, plus any new deltas from rebased source
@@ -318,7 +333,11 @@
         delta_discovery_end - delta_discovery_start
     )

-    s3_utils.upload(compaction_audit.audit_url, str(json.dumps(compaction_audit)))
+    s3_utils.upload(
+        compaction_audit.audit_url,
+        str(json.dumps(compaction_audit)),
+        **s3_client_kwargs,
+    )

     if not input_deltas:
         logger.info("No input deltas found to compact.")
@@ -392,6 +411,7 @@
         enable_profiler=enable_profiler,
         metrics_config=metrics_config,
         read_kwargs_provider=read_kwargs_provider,
+        object_store=object_store,
         deltacat_storage=deltacat_storage,
     )

@@ -411,7 +431,11 @@
         hb_end - hb_start,
     )

-    s3_utils.upload(compaction_audit.audit_url, str(json.dumps(compaction_audit)))
+    s3_utils.upload(
+        compaction_audit.audit_url,
+        str(json.dumps(compaction_audit)),
+        **s3_client_kwargs,
+    )

     all_hash_group_idx_to_obj_id = defaultdict(list)
     for hb_result in hb_results:
@@ -453,11 +477,16 @@
     logger.info(f"Materialize Bucket Count: {num_materialize_buckets}")

     dedupe_start = time.monotonic()
-
+    dd_max_parallelism = int(
+        max_parallelism * kwargs.get("dd_max_parallelism_ratio", 1)
+    )
+    logger.info(
+        f"dd max_parallelism is set to {dd_max_parallelism}, max_parallelism is {max_parallelism}"
+    )
     dd_tasks_pending = invoke_parallel(
         items=all_hash_group_idx_to_obj_id.values(),
         ray_task=dd.dedupe,
-        max_parallelism=max_parallelism,
+        max_parallelism=dd_max_parallelism,
         options_provider=round_robin_opt_provider,
         kwargs_provider=lambda index, item: {
             "dedupe_task_index": index,
@@ -467,6 +496,7 @@
         num_materialize_buckets=num_materialize_buckets,
         enable_profiler=enable_profiler,
         metrics_config=metrics_config,
+        object_store=object_store,
     )

     dedupe_invoke_end = time.monotonic()
@@ -520,7 +550,11 @@
     # parallel step 3:
     # materialize records to keep by index

-    s3_utils.upload(compaction_audit.audit_url, str(json.dumps(compaction_audit)))
+    s3_utils.upload(
+        compaction_audit.audit_url,
+        str(json.dumps(compaction_audit)),
+        **s3_client_kwargs,
+    )

     materialize_start = time.monotonic()

@@ -537,12 +571,14 @@
         round_completion_info=round_completion_info,
         source_partition_locator=source_partition_locator,
         partition=partition,
+        enable_manifest_entry_copy_by_reference=enable_manifest_entry_copy_by_reference,
         max_records_per_output_file=records_per_compacted_file,
         compacted_file_content_type=compacted_file_content_type,
         enable_profiler=enable_profiler,
         metrics_config=metrics_config,
         read_kwargs_provider=read_kwargs_provider,
         s3_table_writer_kwargs=s3_table_writer_kwargs,
+        object_store=object_store,
         deltacat_storage=deltacat_storage,
     )

@@ -620,7 +656,11 @@
         mat_results, telemetry_time_hb + telemetry_time_dd + telemetry_time_materialize
     )

-    s3_utils.upload(compaction_audit.audit_url, str(json.dumps(compaction_audit)))
+    s3_utils.upload(
+        compaction_audit.audit_url,
+        str(json.dumps(compaction_audit)),
+        **s3_client_kwargs,
+    )

     new_round_completion_info = RoundCompletionInfo.of(
         last_stream_position_compacted,

deltacat/compute/compactor/steps/dedupe.py
CHANGED
@@ -1,5 +1,6 @@
 import importlib
 import logging
+from typing import Optional
 import time
 from collections import defaultdict
 from contextlib import nullcontext
@@ -8,7 +9,6 @@ import numpy as np
 import pyarrow as pa
 import pyarrow.compute as pc
 import ray
-from ray import cloudpickle

 from deltacat import logs
 from deltacat.compute.compactor import (
@@ -25,6 +25,7 @@ from deltacat.utils.ray_utils.runtime import (
 )
 from deltacat.utils.performance import timed_invocation
 from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
+from deltacat.io.object_store import IObjectStore
 from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes

 if importlib.util.find_spec("memray"):
@@ -106,6 +107,7 @@ def _timed_dedupe(
     num_materialize_buckets: int,
     dedupe_task_index: int,
     enable_profiler: bool,
+    object_store: Optional[IObjectStore],
 ):
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
@@ -114,15 +116,12 @@ def _timed_dedupe(
     ) if enable_profiler else nullcontext():
         # TODO (pdames): mitigate risk of running out of memory here in cases of
         # severe skew of primary key updates in deltas
-        src_file_records_obj_refs = [
-            cloudpickle.loads(obj_id_pkl) for obj_id_pkl in object_ids
-        ]
         logger.info(
             f"[Dedupe task {dedupe_task_index}] Getting delta file envelope "
-            f"groups for {len(src_file_records_obj_refs)} object refs..."
+            f"groups for {len(object_ids)} object refs..."
         )

-        delta_file_envelope_groups_list = ray.get(src_file_records_obj_refs)
+        delta_file_envelope_groups_list = object_store.get_many(object_ids)
         hb_index_to_delta_file_envelopes_list = defaultdict(list)
         for delta_file_envelope_groups in delta_file_envelope_groups_list:
             for hb_idx, dfes in enumerate(delta_file_envelope_groups):
@@ -201,7 +200,6 @@ def _timed_dedupe(
                 src_file_id_to_row_indices[src_dfl].append(row_idx_col[row_idx])

         logger.info(f"Finished all dedupe rounds...")
-        mat_bucket_to_src_file_record_count = defaultdict(dict)
         mat_bucket_to_src_file_records: Dict[
             MaterializeBucketIndex, DeltaFileLocatorToRecords
         ] = defaultdict(dict)
@@ -213,22 +211,17 @@ def _timed_dedupe(
             mat_bucket_to_src_file_records[mat_bucket][src_dfl] = np.array(
                 src_row_indices,
             )
-            mat_bucket_to_src_file_record_count[mat_bucket][src_dfl] = len(
-                src_row_indices
-            )

         mat_bucket_to_dd_idx_obj_id: Dict[
             MaterializeBucketIndex, DedupeTaskIndexWithObjectId
         ] = {}
         for mat_bucket, src_file_records in mat_bucket_to_src_file_records.items():
-            object_ref = ray.put(src_file_records)
-            pickled_object_ref = cloudpickle.dumps(object_ref)
+            object_ref = object_store.put(src_file_records)
             mat_bucket_to_dd_idx_obj_id[mat_bucket] = (
                 dedupe_task_index,
-                pickled_object_ref,
+                object_ref,
             )
             del object_ref
-            del pickled_object_ref
         logger.info(
             f"Count of materialize buckets with object refs: "
             f"{len(mat_bucket_to_dd_idx_obj_id)}"
@@ -253,6 +246,7 @@ def dedupe(
     dedupe_task_index: int,
     enable_profiler: bool,
     metrics_config: MetricsConfig,
+    object_store: Optional[IObjectStore],
 ) -> DedupeResult:
     logger.info(f"[Dedupe task {dedupe_task_index}] Starting dedupe task...")
     dedupe_result, duration = timed_invocation(
@@ -262,6 +256,7 @@ def dedupe(
         num_materialize_buckets=num_materialize_buckets,
         dedupe_task_index=dedupe_task_index,
         enable_profiler=enable_profiler,
+        object_store=object_store,
     )

     emit_metrics_time = 0.0

deltacat/compute/compactor/steps/hash_bucket.py
CHANGED
@@ -31,6 +31,7 @@ from deltacat.utils.ray_utils.runtime import (
 from deltacat.utils.common import ReadKwargsProvider
 from deltacat.utils.performance import timed_invocation
 from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
+from deltacat.io.object_store import IObjectStore
 from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes

 if importlib.util.find_spec("memray"):
@@ -179,6 +180,7 @@ def _timed_hash_bucket(
     num_groups: int,
     enable_profiler: bool,
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    object_store: Optional[IObjectStore] = None,
     deltacat_storage=unimplemented_deltacat_storage,
 ):
     task_id = get_current_ray_task_id()
@@ -207,9 +209,7 @@ def _timed_hash_bucket(
             deltacat_storage,
         )
         hash_bucket_group_to_obj_id, _ = group_hash_bucket_indices(
-            delta_file_envelope_groups,
-            num_buckets,
-            num_groups,
+            delta_file_envelope_groups, num_buckets, num_groups, object_store
         )

         peak_memory_usage_bytes = get_current_node_peak_memory_usage_in_bytes()
@@ -233,6 +233,7 @@ def hash_bucket(
     enable_profiler: bool,
     metrics_config: MetricsConfig,
     read_kwargs_provider: Optional[ReadKwargsProvider],
+    object_store: Optional[IObjectStore],
     deltacat_storage=unimplemented_deltacat_storage,
 ) -> HashBucketResult:

@@ -247,6 +248,7 @@ def hash_bucket(
         num_groups=num_groups,
         enable_profiler=enable_profiler,
         read_kwargs_provider=read_kwargs_provider,
+        object_store=object_store,
         deltacat_storage=deltacat_storage,
     )


deltacat/compute/compactor/steps/materialize.py
CHANGED
@@ -5,11 +5,10 @@ from uuid import uuid4
 from collections import defaultdict
 from contextlib import nullcontext
 from itertools import chain, repeat
-from typing import List, Optional, Tuple, Dict, Any, Union
+from typing import List, Optional, Tuple, Dict, Any
 import pyarrow as pa
 import numpy as np
 import ray
-from ray import cloudpickle
 from deltacat import logs
 from deltacat.compute.compactor import (
     MaterializeResult,
@@ -28,15 +27,13 @@ from deltacat.storage import (
     PartitionLocator,
     Manifest,
     ManifestEntry,
-    LocalDataset,
-    LocalTable,
-    DistributedDataset,
 )
 from deltacat.storage import interface as unimplemented_deltacat_storage
 from deltacat.utils.common import ReadKwargsProvider
 from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES, ContentType
 from deltacat.types.tables import TABLE_CLASS_TO_SIZE_FUNC
 from deltacat.utils.performance import timed_invocation
+from deltacat.io.object_store import IObjectStore
 from deltacat.utils.pyarrow import (
     ReadKwargsProviderPyArrowCsvPureUtf8,
     ReadKwargsProviderPyArrowSchemaOverride,
@@ -64,29 +61,15 @@ def materialize(
     dedupe_task_idx_and_obj_id_tuples: List[DedupeTaskIndexWithObjectId],
     max_records_per_output_file: int,
     compacted_file_content_type: ContentType,
+    enable_manifest_entry_copy_by_reference: bool,
     enable_profiler: bool,
     metrics_config: MetricsConfig,
     schema: Optional[pa.Schema] = None,
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
     s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
+    object_store: Optional[IObjectStore] = None,
     deltacat_storage=unimplemented_deltacat_storage,
 ):
-    def _stage_delta_implementation(
-        data: Union[LocalTable, LocalDataset, DistributedDataset, Manifest],
-        partition: Partition,
-        stage_delta_from_existing_manifest: Optional[bool],
-    ) -> Delta:
-        if stage_delta_from_existing_manifest:
-            delta = Delta.of(
-                locator=DeltaLocator.of(partition.locator),
-                delta_type=DeltaType.UPSERT,
-                meta=manifest.meta,
-                manifest=data,
-                previous_stream_position=partition.stream_position,
-                properties={},
-            )
-            return delta
-
     def _stage_delta_from_manifest_entry_reference_list(
         manifest_entry_list_reference: List[ManifestEntry],
         partition: Partition,
@@ -96,10 +79,13 @@ def materialize(
             delta_type == DeltaType.UPSERT
         ), "Stage delta with existing manifest entries only supports UPSERT delta type!"
         manifest = Manifest.of(entries=manifest_entry_list_reference, uuid=str(uuid4()))
-        delta = _stage_delta_implementation(
-            data=manifest,
-            partition=partition,
-            stage_delta_from_existing_manifest=True,
+        delta = Delta.of(
+            locator=DeltaLocator.of(partition.locator),
+            delta_type=delta_type,
+            meta=manifest.meta,
+            manifest=manifest,
+            previous_stream_position=partition.stream_position,
+            properties={},
         )
         return delta

@@ -161,18 +147,11 @@ def materialize(
         f"dedupe_{worker_id}_{task_id}.bin"
     ) if enable_profiler else nullcontext():
         start = time.time()
-        dedupe_task_idx_and_obj_ref_tuples = [
-            (
-                t1,
-                cloudpickle.loads(t2),
-            )
-            for t1, t2 in dedupe_task_idx_and_obj_id_tuples
-        ]
         logger.info(f"Resolved materialize task obj refs...")
-        dedupe_task_indices, obj_refs = zip(*dedupe_task_idx_and_obj_ref_tuples)
+        dedupe_task_indices, obj_refs = zip(*dedupe_task_idx_and_obj_id_tuples)
         # this depends on `ray.get` result order matching input order, as per the
         # contract established in: https://github.com/ray-project/ray/pull/16763
-        src_file_records_list = ray.get(list(obj_refs))
+        src_file_records_list = object_store.get_many(list(obj_refs))
         all_src_file_records = defaultdict(list)
         for i, src_file_records in enumerate(src_file_records_list):
             dedupe_task_idx = dedupe_task_indices[i]
@@ -231,7 +210,9 @@ def materialize(
                 record_numbers_length += 1
                 mask_pylist[record_number] = True
             if (
-                record_numbers_length == src_file_record_count
+                round_completion_info
+                and enable_manifest_entry_copy_by_reference
+                and record_numbers_length == src_file_record_count
                 and src_file_partition_locator
                 == round_completion_info.compacted_delta_locator.partition_locator
             ):
@@ -244,8 +225,8 @@ def materialize(
                 manifest_entry_list_reference.append(untouched_src_manifest_entry)
                 referenced_pyarrow_write_result = PyArrowWriteResult.of(
                     1,
-
-
+                    untouched_src_manifest_entry.meta.source_content_length,
+                    untouched_src_manifest_entry.meta.content_length,
                     src_file_record_count,
                 )
                 referenced_pyarrow_write_results.append(referenced_pyarrow_write_result)

deltacat/compute/compactor/utils/primary_key_index.py
CHANGED
@@ -7,7 +7,6 @@ import numpy as np
 import pyarrow as pa
 import ray
 import s3fs
-from ray import cloudpickle
 from ray.types import ObjectRef

 from deltacat import logs
@@ -30,6 +29,7 @@ from deltacat.types.media import ContentEncoding, ContentType
 from deltacat.types.tables import get_table_slicer, get_table_writer
 from deltacat.utils.common import ReadKwargsProvider
 from deltacat.utils.ray_utils.concurrency import invoke_parallel
+from deltacat.io.object_store import IObjectStore

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

@@ -203,7 +203,10 @@ def group_record_indices_by_hash_bucket(


 def group_hash_bucket_indices(
-    hash_bucket_object_groups: np.ndarray, num_buckets: int, num_groups: int
+    hash_bucket_object_groups: np.ndarray,
+    num_buckets: int,
+    num_groups: int,
+    object_store: Optional[IObjectStore] = None,
 ) -> Tuple[np.ndarray, List[ObjectRef]]:
     """
     Groups all the ObjectRef that belongs to a particular hash bucket group and hash bucket index.
@@ -226,19 +229,10 @@ def group_hash_bucket_indices(
     for hb_group, obj in enumerate(hb_group_to_object):
         if obj is None:
             continue
-        obj_ref = ray.put(obj)
-        pickled_obj_ref = cloudpickle.dumps(obj_ref)
-        object_refs.append(obj_ref)
-        hash_bucket_group_to_obj_id[hb_group] = pickled_obj_ref
-        # NOTE: The cloudpickle.dumps API call creates an out of band object reference to the object_ref variable.
-        # After pickling, Ray cannot track the serialized copy of the object or determine when the ObjectRef has been deserialized
-        # (e.g., if the ObjectRef is deserialized by a non-Ray process).
-        # Thus the object_ref cannot be tracked by Ray's distributed reference counter, even if it goes out of scope.
-        # The object now has a permanent reference and the data can't be freed from Ray’s object store.
-        # Manually deleting the untrackable object references offsets these permanent references and
-        # helps to allow these objects to be garbage collected normally.
-        del obj_ref
-        del pickled_obj_ref
+        object_ref = object_store.put(obj)
+        object_refs.append(object_ref)
+        hash_bucket_group_to_obj_id[hb_group] = object_ref
+        del object_ref
     return hash_bucket_group_to_obj_id, object_refs


deltacat/compute/compactor/utils/round_completion_file.py
CHANGED
@@ -1,6 +1,6 @@
 import json
 import logging
-
+from typing import Dict, Any
 from deltacat import logs
 from deltacat.compute.compactor import RoundCompletionInfo
 from deltacat.storage import PartitionLocator
@@ -19,7 +19,9 @@ def get_round_completion_file_s3_url(


 def read_round_completion_file(
-    bucket: str, source_partition_locator: PartitionLocator
+    bucket: str,
+    source_partition_locator: PartitionLocator,
+    **s3_client_kwargs: Optional[Dict[str, Any]],
 ) -> RoundCompletionInfo:

     round_completion_file_url = get_round_completion_file_s3_url(
@@ -28,7 +30,7 @@ def read_round_completion_file(
     )
     logger.info(f"reading round completion file from: {round_completion_file_url}")
     round_completion_info = None
-    result = s3_utils.download(round_completion_file_url, False)
+    result = s3_utils.download(round_completion_file_url, False, **s3_client_kwargs)
     if result:
         json_str = result["Body"].read().decode("utf-8")
         round_completion_info = RoundCompletionInfo(json.loads(json_str))
@@ -41,6 +43,7 @@ def write_round_completion_file(
     source_partition_locator: Optional[PartitionLocator],
     round_completion_info: RoundCompletionInfo,
     completion_file_s3_url: str = None,
+    **s3_client_kwargs: Optional[Dict[str, Any]],
 ) -> str:
     if bucket is None and completion_file_s3_url is None:
         raise AssertionError("Either bucket or completion_file_s3_url must be passed")
@@ -52,6 +55,10 @@ def write_round_completion_file(
             source_partition_locator,
         )
     logger.info(f"writing round completion file to: {completion_file_s3_url}")
-    s3_utils.upload(completion_file_s3_url, str(json.dumps(round_completion_info)))
+    s3_utils.upload(
+        completion_file_s3_url,
+        str(json.dumps(round_completion_info)),
+        **s3_client_kwargs,
+    )
     logger.info(f"round completion file written to: {completion_file_s3_url}")
     return completion_file_s3_url

deltacat/io/file_object_store.py
ADDED
@@ -0,0 +1,48 @@
+import logging
+from ray import cloudpickle
+import time
+from deltacat.io.object_store import IObjectStore
+from typing import Any, List
+from deltacat import logs
+import os
+import uuid
+from builtins import open
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+class FileObjectStore(IObjectStore):
+    """
+    An implementation of object store that uses file system.
+    """
+
+    def __init__(self, dir_path: str) -> None:
+        self.dir_path = dir_path
+        super().__init__()
+
+    def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
+        result = []
+
+        for obj in objects:
+            serialized = cloudpickle.dumps(obj)
+            ref = f"{self.dir_path}/{uuid.uuid4()}"
+            with open(ref, "xb") as f:
+                f.write(serialized)
+
+            result.append(ref)
+
+        return result
+
+    def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
+        result = []
+        start = time.monotonic()
+        for ref in refs:
+            with open(ref, "rb") as f:
+                serialized = f.read()
+                loaded = cloudpickle.loads(serialized)
+                result.append(loaded)
+            os.remove(ref)
+        end = time.monotonic()
+
+        logger.info(f"The total time taken to read all objects is: {end - start}")
+        return result