deltacat 0.1.18b4__tar.gz → 0.1.18b6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {deltacat-0.1.18b4/deltacat.egg-info → deltacat-0.1.18b6}/PKG-INFO +1 -1
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/__init__.py +1 -1
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/compaction_session.py +23 -2
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/dedupe.py +9 -14
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/hash_bucket.py +5 -3
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/materialize.py +18 -37
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/utils/primary_key_index.py +9 -15
- deltacat-0.1.18b6/deltacat/io/file_object_store.py +48 -0
- deltacat-0.1.18b6/deltacat/io/memcached_object_store.py +121 -0
- deltacat-0.1.18b6/deltacat/io/object_store.py +51 -0
- deltacat-0.1.18b6/deltacat/io/ray_plasma_object_store.py +23 -0
- deltacat-0.1.18b6/deltacat/io/redis_object_store.py +114 -0
- deltacat-0.1.18b6/deltacat/io/s3_object_store.py +44 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/tests/compactor/utils/test_io.py +4 -0
- deltacat-0.1.18b6/deltacat/tests/io/test_file_object_store.py +86 -0
- deltacat-0.1.18b6/deltacat/tests/io/test_memcached_object_store.py +158 -0
- deltacat-0.1.18b6/deltacat/tests/io/test_ray_plasma_object_store.py +54 -0
- deltacat-0.1.18b6/deltacat/tests/io/test_redis_object_store.py +103 -0
- deltacat-0.1.18b6/deltacat/tests/io/test_s3_object_store.py +59 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/tests/utils/test_resources.py +4 -0
- deltacat-0.1.18b6/deltacat/utils/__init__.py +0 -0
- deltacat-0.1.18b6/deltacat/utils/ray_utils/__init__.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6/deltacat.egg-info}/PKG-INFO +1 -1
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat.egg-info/SOURCES.txt +12 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat.egg-info/requires.txt +2 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/setup.py +2 -0
- deltacat-0.1.18b4/deltacat/io/__init__.py +0 -7
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/LICENSE +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/MANIFEST.in +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/README.md +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/aws/__init__.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/aws/clients.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/aws/constants.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/aws/redshift/__init__.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/aws/redshift/model/__init__.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/aws/redshift/model/manifest.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/aws/s3u.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/catalog/__init__.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/catalog/delegate.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/catalog/interface.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/catalog/model/__init__.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/catalog/model/catalog.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/catalog/model/table_definition.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/__init__.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/__init__.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/__init__.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/compact_partition_params.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/compaction_session_audit_info.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/dedupe_result.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/delta_annotated.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/delta_file_envelope.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/delta_file_locator.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/hash_bucket_result.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/materialize_result.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/primary_key_index.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/pyarrow_write_result.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/repartition_result.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/round_completion_info.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/sort_key.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/repartition_session.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/__init__.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/rehash/__init__.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/rehash/rehash_bucket.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/rehash/rewrite_index.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/repartition.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/utils/__init__.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/utils/io.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/utils/round_completion_file.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/utils/system_columns.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/__init__.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/config/__init__.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/meta_stats.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/model/__init__.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/model/partition_stats_dict.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/model/stats_cluster_size_estimator.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/stats.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/utils/__init__.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/utils/constants.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/utils/io.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/utils/ray_utils.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/__init__.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/basic.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/models/__init__.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/models/delta_column_stats.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/models/delta_stats.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/models/delta_stats_cache_result.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/models/manifest_entry_stats.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/models/stats_result.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/types.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/utils/__init__.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/utils/intervals.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/utils/io.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/utils/manifest_stats_file.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/constants.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/exceptions.py +0 -0
- {deltacat-0.1.18b4/deltacat/io/aws → deltacat-0.1.18b6/deltacat/io}/__init__.py +0 -0
- {deltacat-0.1.18b4/deltacat/io/aws/redshift → deltacat-0.1.18b6/deltacat/io/aws}/__init__.py +0 -0
- {deltacat-0.1.18b4/deltacat/storage/model → deltacat-0.1.18b6/deltacat/io/aws/redshift}/__init__.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/io/aws/redshift/redshift_datasource.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/io/dataset.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/io/read_api.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/logs.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/storage/__init__.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/storage/interface.py +0 -0
- {deltacat-0.1.18b4/deltacat/tests → deltacat-0.1.18b6/deltacat/storage/model}/__init__.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/storage/model/delta.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/storage/model/list_result.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/storage/model/locator.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/storage/model/namespace.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/storage/model/partition.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/storage/model/stream.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/storage/model/table.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/storage/model/table_version.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/storage/model/types.py +0 -0
- {deltacat-0.1.18b4/deltacat/tests/compactor → deltacat-0.1.18b6/deltacat/tests}/__init__.py +0 -0
- {deltacat-0.1.18b4/deltacat/tests/compactor/utils → deltacat-0.1.18b6/deltacat/tests/compactor}/__init__.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/tests/compactor/test_compact_partition_params.py +0 -0
- {deltacat-0.1.18b4/deltacat/tests/stats → deltacat-0.1.18b6/deltacat/tests/compactor/utils}/__init__.py +0 -0
- {deltacat-0.1.18b4/deltacat/tests/test_utils → deltacat-0.1.18b6/deltacat/tests/io}/__init__.py +0 -0
- {deltacat-0.1.18b4/deltacat/tests/utils → deltacat-0.1.18b6/deltacat/tests/stats}/__init__.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/tests/stats/test_intervals.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/tests/test_repartition.py +0 -0
- {deltacat-0.1.18b4/deltacat/types → deltacat-0.1.18b6/deltacat/tests/test_utils}/__init__.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/tests/test_utils/constants.py +0 -0
- {deltacat-0.1.18b4/deltacat → deltacat-0.1.18b6/deltacat/tests}/utils/__init__.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/tests/utils/test_record_batch_tables.py +0 -0
- {deltacat-0.1.18b4/deltacat/utils/ray_utils → deltacat-0.1.18b6/deltacat/types}/__init__.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/types/media.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/types/tables.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/common.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/metrics.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/numpy.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/pandas.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/performance.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/placement.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/pyarrow.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/ray_utils/collections.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/ray_utils/concurrency.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/ray_utils/dataset.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/ray_utils/performance.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/ray_utils/runtime.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/resources.py +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat.egg-info/dependency_links.txt +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat.egg-info/top_level.txt +0 -0
- {deltacat-0.1.18b4 → deltacat-0.1.18b6}/setup.cfg +0 -0
{deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/compaction_session.py
RENAMED
@@ -16,6 +16,8 @@ from deltacat.compute.compactor import (
 )
 from deltacat.compute.compactor.model.dedupe_result import DedupeResult
 from deltacat.compute.compactor.model.hash_bucket_result import HashBucketResult
+from deltacat.io.object_store import IObjectStore
+from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
 from deltacat.compute.compactor.model.materialize_result import MaterializeResult
 from deltacat.compute.stats.models.delta_stats import DeltaStats
 from deltacat.storage import (
@@ -112,6 +114,7 @@ def compact_partition(
     list_deltas_kwargs: Optional[Dict[str, Any]] = None,
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
     s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
+    object_store: Optional[IObjectStore] = RayPlasmaObjectStore(),
     deltacat_storage=unimplemented_deltacat_storage,
     **kwargs,
 ) -> Optional[str]:
@@ -151,6 +154,7 @@ def compact_partition(
             list_deltas_kwargs,
             read_kwargs_provider,
             s3_table_writer_kwargs,
+            object_store,
             deltacat_storage,
             **kwargs,
         )
@@ -196,6 +200,7 @@ def _execute_compaction_round(
     list_deltas_kwargs: Optional[Dict[str, Any]],
     read_kwargs_provider: Optional[ReadKwargsProvider],
     s3_table_writer_kwargs: Optional[Dict[str, Any]],
+    object_store: Optional[IObjectStore],
     deltacat_storage=unimplemented_deltacat_storage,
     **kwargs,
 ) -> Tuple[Optional[Partition], Optional[RoundCompletionInfo], Optional[str]]:
@@ -287,6 +292,13 @@ def _execute_compaction_round(
     )
     logger.info(f"Round completion file: {round_completion_info}")

+    enable_manifest_entry_copy_by_reference = (
+        False if rebase_source_partition_locator else True
+    )
+    logger.info(
+        f"Enable manifest entry copy by reference is set to: {enable_manifest_entry_copy_by_reference}"
+    )
+
     # discover input delta files
     # For rebase:
     # Copy the old compacted table to a new destination, plus any new deltas from rebased source
@@ -392,6 +404,7 @@
         enable_profiler=enable_profiler,
         metrics_config=metrics_config,
         read_kwargs_provider=read_kwargs_provider,
+        object_store=object_store,
         deltacat_storage=deltacat_storage,
     )

@@ -453,11 +466,16 @@
     logger.info(f"Materialize Bucket Count: {num_materialize_buckets}")

     dedupe_start = time.monotonic()
-
+    dd_max_parallelism = int(
+        max_parallelism * kwargs.get("dd_max_parallelism_ratio", 1)
+    )
+    logger.info(
+        f"dd max_parallelism is set to {dd_max_parallelism}, max_parallelism is {max_parallelism}"
+    )
     dd_tasks_pending = invoke_parallel(
         items=all_hash_group_idx_to_obj_id.values(),
         ray_task=dd.dedupe,
-        max_parallelism=
+        max_parallelism=dd_max_parallelism,
         options_provider=round_robin_opt_provider,
         kwargs_provider=lambda index, item: {
             "dedupe_task_index": index,
@@ -467,6 +485,7 @@
         num_materialize_buckets=num_materialize_buckets,
         enable_profiler=enable_profiler,
         metrics_config=metrics_config,
+        object_store=object_store,
     )

     dedupe_invoke_end = time.monotonic()
@@ -537,12 +556,14 @@
         round_completion_info=round_completion_info,
         source_partition_locator=source_partition_locator,
         partition=partition,
+        enable_manifest_entry_copy_by_reference=enable_manifest_entry_copy_by_reference,
         max_records_per_output_file=records_per_compacted_file,
         compacted_file_content_type=compacted_file_content_type,
         enable_profiler=enable_profiler,
         metrics_config=metrics_config,
         read_kwargs_provider=read_kwargs_provider,
         s3_table_writer_kwargs=s3_table_writer_kwargs,
+        object_store=object_store,
         deltacat_storage=deltacat_storage,
     )

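The change above makes the object store pluggable: `compact_partition` now accepts an `object_store` argument (defaulting to `RayPlasmaObjectStore()`) and threads it through `_execute_compaction_round` into the hash-bucket, dedupe, and materialize steps. A minimal sketch of overriding the default; every other argument is hidden behind a placeholder, since the full `compact_partition` signature is not shown in this diff:

```python
# Hypothetical invocation; `required_compaction_args` stands in for the usual
# compact_partition arguments, which are not part of this diff.
from deltacat.compute.compactor.compaction_session import compact_partition
from deltacat.io.file_object_store import FileObjectStore

compact_partition(
    **required_compaction_args,
    object_store=FileObjectStore(dir_path="/tmp/deltacat-objects"),  # assumed path
)
```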
{deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/dedupe.py
RENAMED
@@ -1,5 +1,6 @@
 import importlib
 import logging
+from typing import Optional
 import time
 from collections import defaultdict
 from contextlib import nullcontext
@@ -8,7 +9,6 @@ import numpy as np
 import pyarrow as pa
 import pyarrow.compute as pc
 import ray
-from ray import cloudpickle

 from deltacat import logs
 from deltacat.compute.compactor import (
@@ -25,6 +25,7 @@ from deltacat.utils.ray_utils.runtime import (
 )
 from deltacat.utils.performance import timed_invocation
 from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
+from deltacat.io.object_store import IObjectStore
 from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes

 if importlib.util.find_spec("memray"):
@@ -106,6 +107,7 @@ def _timed_dedupe(
     num_materialize_buckets: int,
     dedupe_task_index: int,
     enable_profiler: bool,
+    object_store: Optional[IObjectStore],
 ):
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
@@ -114,15 +116,12 @@ def _timed_dedupe(
     ) if enable_profiler else nullcontext():
         # TODO (pdames): mitigate risk of running out of memory here in cases of
         # severe skew of primary key updates in deltas
-        src_file_records_obj_refs = [
-            cloudpickle.loads(obj_id_pkl) for obj_id_pkl in object_ids
-        ]
         logger.info(
             f"[Dedupe task {dedupe_task_index}] Getting delta file envelope "
-            f"groups for {len(
+            f"groups for {len(object_ids)} object refs..."
         )

-        delta_file_envelope_groups_list =
+        delta_file_envelope_groups_list = object_store.get_many(object_ids)
         hb_index_to_delta_file_envelopes_list = defaultdict(list)
         for delta_file_envelope_groups in delta_file_envelope_groups_list:
             for hb_idx, dfes in enumerate(delta_file_envelope_groups):
@@ -201,7 +200,6 @@ def _timed_dedupe(
             src_file_id_to_row_indices[src_dfl].append(row_idx_col[row_idx])

     logger.info(f"Finished all dedupe rounds...")
-    mat_bucket_to_src_file_record_count = defaultdict(dict)
     mat_bucket_to_src_file_records: Dict[
         MaterializeBucketIndex, DeltaFileLocatorToRecords
     ] = defaultdict(dict)
@@ -213,22 +211,17 @@ def _timed_dedupe(
         mat_bucket_to_src_file_records[mat_bucket][src_dfl] = np.array(
             src_row_indices,
         )
-        mat_bucket_to_src_file_record_count[mat_bucket][src_dfl] = len(
-            src_row_indices
-        )

     mat_bucket_to_dd_idx_obj_id: Dict[
         MaterializeBucketIndex, DedupeTaskIndexWithObjectId
     ] = {}
     for mat_bucket, src_file_records in mat_bucket_to_src_file_records.items():
-        object_ref =
-        pickled_object_ref = cloudpickle.dumps(object_ref)
+        object_ref = object_store.put(src_file_records)
         mat_bucket_to_dd_idx_obj_id[mat_bucket] = (
             dedupe_task_index,
-
+            object_ref,
         )
         del object_ref
-        del pickled_object_ref
     logger.info(
         f"Count of materialize buckets with object refs: "
         f"{len(mat_bucket_to_dd_idx_obj_id)}"
@@ -253,6 +246,7 @@ def dedupe(
     dedupe_task_index: int,
     enable_profiler: bool,
     metrics_config: MetricsConfig,
+    object_store: Optional[IObjectStore],
 ) -> DedupeResult:
     logger.info(f"[Dedupe task {dedupe_task_index}] Starting dedupe task...")
     dedupe_result, duration = timed_invocation(
@@ -262,6 +256,7 @@ def dedupe(
         num_materialize_buckets=num_materialize_buckets,
         dedupe_task_index=dedupe_task_index,
         enable_profiler=enable_profiler,
+        object_store=object_store,
     )

     emit_metrics_time = 0.0
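With the store abstraction in place, dedupe tasks no longer `cloudpickle.loads` each pickled `ObjectRef` on the way in, and no longer `cloudpickle.dumps` the refs they produce on the way out; they call `object_store.get_many(object_ids)` and `object_store.put(src_file_records)` instead. A toy round trip through the default store, assuming a running Ray instance, illustrating the pattern the task code now relies on:

```python
# Toy illustration of the get_many/put pattern adopted above; the payloads
# here are placeholders, not real delta file envelope groups.
import ray
from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore

ray.init(ignore_reinit_error=True)
object_store = RayPlasmaObjectStore()

refs = object_store.put_many([{"hb_0": [1, 2]}, {"hb_1": [3]}])
groups = object_store.get_many(refs)  # replaces cloudpickle.loads + ray.get
assert groups == [{"hb_0": [1, 2]}, {"hb_1": [3]}]
```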
{deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/hash_bucket.py
RENAMED
@@ -31,6 +31,7 @@ from deltacat.utils.ray_utils.runtime import (
 from deltacat.utils.common import ReadKwargsProvider
 from deltacat.utils.performance import timed_invocation
 from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
+from deltacat.io.object_store import IObjectStore
 from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes

 if importlib.util.find_spec("memray"):
@@ -179,6 +180,7 @@ def _timed_hash_bucket(
     num_groups: int,
     enable_profiler: bool,
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    object_store: Optional[IObjectStore] = None,
     deltacat_storage=unimplemented_deltacat_storage,
 ):
     task_id = get_current_ray_task_id()
@@ -207,9 +209,7 @@ def _timed_hash_bucket(
         deltacat_storage,
     )
     hash_bucket_group_to_obj_id, _ = group_hash_bucket_indices(
-        delta_file_envelope_groups,
-        num_buckets,
-        num_groups,
+        delta_file_envelope_groups, num_buckets, num_groups, object_store
     )

     peak_memory_usage_bytes = get_current_node_peak_memory_usage_in_bytes()
@@ -233,6 +233,7 @@ def hash_bucket(
     enable_profiler: bool,
     metrics_config: MetricsConfig,
     read_kwargs_provider: Optional[ReadKwargsProvider],
+    object_store: Optional[IObjectStore],
     deltacat_storage=unimplemented_deltacat_storage,
 ) -> HashBucketResult:

@@ -247,6 +248,7 @@ def hash_bucket(
         num_groups=num_groups,
         enable_profiler=enable_profiler,
         read_kwargs_provider=read_kwargs_provider,
+        object_store=object_store,
         deltacat_storage=deltacat_storage,
     )

{deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/materialize.py
RENAMED
@@ -5,11 +5,10 @@ from uuid import uuid4
 from collections import defaultdict
 from contextlib import nullcontext
 from itertools import chain, repeat
-from typing import List, Optional, Tuple, Dict, Any
+from typing import List, Optional, Tuple, Dict, Any
 import pyarrow as pa
 import numpy as np
 import ray
-from ray import cloudpickle
 from deltacat import logs
 from deltacat.compute.compactor import (
     MaterializeResult,
@@ -28,15 +27,13 @@ from deltacat.storage import (
     PartitionLocator,
     Manifest,
     ManifestEntry,
-    LocalDataset,
-    LocalTable,
-    DistributedDataset,
 )
 from deltacat.storage import interface as unimplemented_deltacat_storage
 from deltacat.utils.common import ReadKwargsProvider
 from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES, ContentType
 from deltacat.types.tables import TABLE_CLASS_TO_SIZE_FUNC
 from deltacat.utils.performance import timed_invocation
+from deltacat.io.object_store import IObjectStore
 from deltacat.utils.pyarrow import (
     ReadKwargsProviderPyArrowCsvPureUtf8,
     ReadKwargsProviderPyArrowSchemaOverride,
@@ -64,29 +61,15 @@ def materialize(
     dedupe_task_idx_and_obj_id_tuples: List[DedupeTaskIndexWithObjectId],
     max_records_per_output_file: int,
     compacted_file_content_type: ContentType,
+    enable_manifest_entry_copy_by_reference: bool,
     enable_profiler: bool,
     metrics_config: MetricsConfig,
     schema: Optional[pa.Schema] = None,
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
     s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
+    object_store: Optional[IObjectStore] = None,
     deltacat_storage=unimplemented_deltacat_storage,
 ):
-    def _stage_delta_implementation(
-        data: Union[LocalTable, LocalDataset, DistributedDataset, Manifest],
-        partition: Partition,
-        stage_delta_from_existing_manifest: Optional[bool],
-    ) -> Delta:
-        if stage_delta_from_existing_manifest:
-            delta = Delta.of(
-                locator=DeltaLocator.of(partition.locator),
-                delta_type=DeltaType.UPSERT,
-                meta=manifest.meta,
-                manifest=data,
-                previous_stream_position=partition.stream_position,
-                properties={},
-            )
-        return delta
-
     def _stage_delta_from_manifest_entry_reference_list(
         manifest_entry_list_reference: List[ManifestEntry],
         partition: Partition,
@@ -96,10 +79,13 @@ def materialize(
             delta_type == DeltaType.UPSERT
         ), "Stage delta with existing manifest entries only supports UPSERT delta type!"
         manifest = Manifest.of(entries=manifest_entry_list_reference, uuid=str(uuid4()))
-        delta =
-
-
-
+        delta = Delta.of(
+            locator=DeltaLocator.of(partition.locator),
+            delta_type=delta_type,
+            meta=manifest.meta,
+            manifest=manifest,
+            previous_stream_position=partition.stream_position,
+            properties={},
         )
         return delta

@@ -161,18 +147,11 @@ def materialize(
         f"dedupe_{worker_id}_{task_id}.bin"
     ) if enable_profiler else nullcontext():
         start = time.time()
-        dedupe_task_idx_and_obj_ref_tuples = [
-            (
-                t1,
-                cloudpickle.loads(t2),
-            )
-            for t1, t2 in dedupe_task_idx_and_obj_id_tuples
-        ]
         logger.info(f"Resolved materialize task obj refs...")
-        dedupe_task_indices, obj_refs = zip(*
+        dedupe_task_indices, obj_refs = zip(*dedupe_task_idx_and_obj_id_tuples)
         # this depends on `ray.get` result order matching input order, as per the
         # contract established in: https://github.com/ray-project/ray/pull/16763
-        src_file_records_list =
+        src_file_records_list = object_store.get_many(list(obj_refs))
         all_src_file_records = defaultdict(list)
         for i, src_file_records in enumerate(src_file_records_list):
             dedupe_task_idx = dedupe_task_indices[i]
@@ -231,7 +210,9 @@ def materialize(
                 record_numbers_length += 1
                 mask_pylist[record_number] = True
             if (
-
+                round_completion_info
+                and enable_manifest_entry_copy_by_reference
+                and record_numbers_length == src_file_record_count
                 and src_file_partition_locator
                 == round_completion_info.compacted_delta_locator.partition_locator
             ):
@@ -244,8 +225,8 @@ def materialize(
                 manifest_entry_list_reference.append(untouched_src_manifest_entry)
                 referenced_pyarrow_write_result = PyArrowWriteResult.of(
                     1,
-
-
+                    untouched_src_manifest_entry.meta.source_content_length,
+                    untouched_src_manifest_entry.meta.content_length,
                     src_file_record_count,
                 )
                 referenced_pyarrow_write_results.append(referenced_pyarrow_write_result)
{deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/utils/primary_key_index.py
RENAMED
@@ -7,7 +7,6 @@ import numpy as np
 import pyarrow as pa
 import ray
 import s3fs
-from ray import cloudpickle
 from ray.types import ObjectRef

 from deltacat import logs
@@ -30,6 +29,7 @@ from deltacat.types.media import ContentEncoding, ContentType
 from deltacat.types.tables import get_table_slicer, get_table_writer
 from deltacat.utils.common import ReadKwargsProvider
 from deltacat.utils.ray_utils.concurrency import invoke_parallel
+from deltacat.io.object_store import IObjectStore

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

@@ -203,7 +203,10 @@ def group_record_indices_by_hash_bucket(


 def group_hash_bucket_indices(
-    hash_bucket_object_groups: np.ndarray,
+    hash_bucket_object_groups: np.ndarray,
+    num_buckets: int,
+    num_groups: int,
+    object_store: Optional[IObjectStore] = None,
 ) -> Tuple[np.ndarray, List[ObjectRef]]:
     """
     Groups all the ObjectRef that belongs to a particular hash bucket group and hash bucket index.
@@ -226,19 +229,10 @@ def group_hash_bucket_indices(
     for hb_group, obj in enumerate(hb_group_to_object):
         if obj is None:
             continue
-
-
-
-
-        # NOTE: The cloudpickle.dumps API call creates an out of band object reference to the object_ref variable.
-        # After pickling, Ray cannot track the serialized copy of the object or determine when the ObjectRef has been deserialized
-        # (e.g., if the ObjectRef is deserialized by a non-Ray process).
-        # Thus the object_ref cannot be tracked by Ray's distributed reference counter, even if it goes out of scope.
-        # The object now has a permanent reference and the data can't be freed from Ray's object store.
-        # Manually deleting the untrackable object references offsets these permanent references and
-        # helps to allow these objects to be garbage collected normally.
-        del obj_ref
-        del pickled_obj_ref
+        object_ref = object_store.put(obj)
+        object_refs.append(object_ref)
+        hash_bucket_group_to_obj_id[hb_group] = object_ref
+        del object_ref
     return hash_bucket_group_to_obj_id, object_refs

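`group_hash_bucket_indices` now delegates persistence to the injected store instead of pinning pickled `ObjectRef`s in plasma, which also retires the long comment about Ray's distributed reference counter. A sketch of calling it directly, with toy inputs whose shapes are assumed from the signature and docstring:

```python
# Assumed-shape example; real inputs are hash-bucketed delta file envelope
# groups produced by the hash_bucket step.
import os
import numpy as np
from deltacat.compute.compactor.utils.primary_key_index import (
    group_hash_bucket_indices,
)
from deltacat.io.file_object_store import FileObjectStore

os.makedirs("/tmp/deltacat-objects", exist_ok=True)  # assumed scratch dir
hash_bucket_object_groups = np.empty(4, dtype="object")  # toy: 4 hash buckets
hash_bucket_object_groups[0] = ["placeholder-envelope-a"]
hash_bucket_object_groups[2] = ["placeholder-envelope-b"]

group_to_ref, refs = group_hash_bucket_indices(
    hash_bucket_object_groups,
    num_buckets=4,
    num_groups=2,
    object_store=FileObjectStore(dir_path="/tmp/deltacat-objects"),
)
```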
deltacat-0.1.18b6/deltacat/io/file_object_store.py
ADDED
@@ -0,0 +1,48 @@
+import logging
+from ray import cloudpickle
+import time
+from deltacat.io.object_store import IObjectStore
+from typing import Any, List
+from deltacat import logs
+import os
+import uuid
+from builtins import open
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+class FileObjectStore(IObjectStore):
+    """
+    An implementation of object store that uses file system.
+    """
+
+    def __init__(self, dir_path: str) -> None:
+        self.dir_path = dir_path
+        super().__init__()
+
+    def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
+        result = []
+
+        for obj in objects:
+            serialized = cloudpickle.dumps(obj)
+            ref = f"{self.dir_path}/{uuid.uuid4()}"
+            with open(ref, "xb") as f:
+                f.write(serialized)
+
+            result.append(ref)
+
+        return result
+
+    def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
+        result = []
+        start = time.monotonic()
+        for ref in refs:
+            with open(ref, "rb") as f:
+                serialized = f.read()
+                loaded = cloudpickle.loads(serialized)
+                result.append(loaded)
+            os.remove(ref)
+        end = time.monotonic()
+
+        logger.info(f"The total time taken to read all objects is: {end - start}")
+        return result
deltacat-0.1.18b6/deltacat/io/memcached_object_store.py
ADDED
@@ -0,0 +1,121 @@
+import logging
+from ray import cloudpickle
+from collections import defaultdict
+import time
+from deltacat.io.object_store import IObjectStore
+from typing import Any, List
+from deltacat import logs
+import uuid
+import socket
+from pymemcache.client.base import Client
+from pymemcache.client.retrying import RetryingClient
+from pymemcache.exceptions import MemcacheUnexpectedCloseError
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+class MemcachedObjectStore(IObjectStore):
+    """
+    An implementation of object store that uses Memcached.
+    """
+
+    def __init__(self, port=11212) -> None:
+        self.client_cache = {}
+        self.current_ip = None
+        self.SEPARATOR = "_"
+        self.port = port
+        super().__init__()
+
+    def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
+        input = {}
+        result = []
+        current_ip = self._get_current_ip()
+        for obj in objects:
+            serialized = cloudpickle.dumps(obj)
+            uid = uuid.uuid4()
+            ref = self._create_ref(uid, current_ip)
+            input[uid.__str__()] = serialized
+            result.append(ref)
+
+        client = self._get_client_by_ip(current_ip)
+        if client.set_many(input, noreply=False):
+            raise RuntimeError("Unable to write few keys to cache")
+
+        return result
+
+    def put(self, obj: object, *args, **kwargs) -> Any:
+        serialized = cloudpickle.dumps(obj)
+        uid = uuid.uuid4()
+        current_ip = self._get_current_ip()
+        ref = self._create_ref(uid, current_ip)
+        client = self._get_client_by_ip(current_ip)
+
+        if client.set(uid.__str__(), serialized):
+            return ref
+        else:
+            raise RuntimeError("Unable to write to cache")
+
+    def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
+        result = []
+        uid_per_ip = defaultdict(lambda: [])
+
+        start = time.monotonic()
+        for ref in refs:
+            uid, ip = ref.split(self.SEPARATOR)
+            uid_per_ip[ip].append(uid)
+
+        for (ip, uids) in uid_per_ip.items():
+            client = self._get_client_by_ip(ip)
+            cache_result = client.get_many(uids)
+            assert len(cache_result) == len(
+                uids
+            ), f"Not all values were returned from cache as {len(cache_result)} != {len(uids)}"
+
+            values = cache_result.values()
+            total_bytes = 0
+
+            deserialize_start = time.monotonic()
+            for serialized in values:
+                deserialized = cloudpickle.loads(serialized)
+                total_bytes += len(serialized)
+                result.append(deserialized)
+
+            deserialize_end = time.monotonic()
+            logger.debug(
+                f"The time taken to deserialize {total_bytes} bytes is: {deserialize_end - deserialize_start}",
+            )
+
+        end = time.monotonic()
+
+        logger.info(f"The total time taken to read all objects is: {end - start}")
+        return result
+
+    def get(self, ref: Any, *args, **kwargs) -> object:
+        uid, ip = ref.split(self.SEPARATOR)
+        client = self._get_client_by_ip(ip)
+        serialized = client.get(uid)
+        return cloudpickle.loads(serialized)
+
+    def _create_ref(self, uid, ip) -> str:
+        return f"{uid}{self.SEPARATOR}{ip}"
+
+    def _get_client_by_ip(self, ip_address: str):
+        if ip_address in self.client_cache:
+            return self.client_cache[ip_address]
+
+        base_client = Client((ip_address, self.port))
+        client = RetryingClient(
+            base_client,
+            attempts=3,
+            retry_delay=0.01,
+            retry_for=[MemcacheUnexpectedCloseError],
+        )
+
+        self.client_cache[ip_address] = client
+        return client
+
+    def _get_current_ip(self):
+        if self.current_ip is None:
+            self.current_ip = socket.gethostbyname(socket.gethostname())
+
+        return self.current_ip
deltacat-0.1.18b6/deltacat/io/object_store.py
ADDED
@@ -0,0 +1,51 @@
+from typing import List, Any
+
+
+class IObjectStore:
+    """
+    An object store interface.
+    """
+
+    def setup(self, *args, **kwargs) -> Any:
+        ...
+
+    """
+    Sets up everything needed to run the object store.
+    """
+
+    def put(self, obj: object, *args, **kwargs) -> Any:
+        """
+        Put a single object into the object store. Depending
+        on the implementation, this method can be sync or async.
+        """
+        return self.put_many([obj])[0]
+
+    def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
+        ...
+
+    """
+    Put many objects into the object store. It would return an ordered list
+    of object references corresponding to each object in the input.
+    """
+
+    def get(self, ref: Any, *args, **kwargs) -> object:
+        """
+        Get a single object from an object store.
+        """
+        return self.get_many([ref])[0]
+
+    def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
+        ...
+
+    """
+    Get a list of objects from the object store. Use this method to
+    avoid multiple get calls. Note that depending on implementation it may
+    or may not return ordered results.
+    """
+
+    def clear(self, *args, **kwargs) -> bool:
+        ...
+
+    """
+    Clears the object store and all the associated data in it.
+    """
deltacat-0.1.18b6/deltacat/io/ray_plasma_object_store.py
ADDED
@@ -0,0 +1,23 @@
+import ray
+from ray import cloudpickle
+from deltacat.io.object_store import IObjectStore
+from typing import Any, List
+
+
+class RayPlasmaObjectStore(IObjectStore):
+    """
+    An implementation of object store that uses Ray plasma object store.
+    """
+
+    def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
+        result = []
+        for obj in objects:
+            object_ref = ray.put(obj)
+            pickled = cloudpickle.dumps(object_ref)
+            result.append(pickled)
+
+        return result
+
+    def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
+        loaded_refs = [cloudpickle.loads(obj_id) for obj_id in refs]
+        return ray.get(loaded_refs)