deltacat 0.1.18b3__py3-none-any.whl → 0.1.18b7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. deltacat/__init__.py +1 -1
  2. deltacat/compute/compactor/compaction_session.py +184 -29
  3. deltacat/compute/compactor/model/compact_partition_params.py +153 -0
  4. deltacat/compute/compactor/model/compaction_session_audit_info.py +725 -0
  5. deltacat/compute/compactor/model/dedupe_result.py +3 -0
  6. deltacat/compute/compactor/model/delta_file_envelope.py +8 -0
  7. deltacat/compute/compactor/model/delta_file_locator.py +11 -6
  8. deltacat/compute/compactor/model/hash_bucket_result.py +3 -0
  9. deltacat/compute/compactor/model/materialize_result.py +27 -6
  10. deltacat/compute/compactor/model/round_completion_info.py +9 -0
  11. deltacat/compute/compactor/steps/dedupe.py +35 -19
  12. deltacat/compute/compactor/steps/hash_bucket.py +41 -16
  13. deltacat/compute/compactor/steps/materialize.py +73 -70
  14. deltacat/compute/compactor/utils/io.py +15 -0
  15. deltacat/compute/compactor/utils/primary_key_index.py +9 -15
  16. deltacat/compute/compactor/utils/round_completion_file.py +13 -4
  17. deltacat/compute/compactor/utils/system_columns.py +32 -0
  18. deltacat/io/__init__.py +0 -7
  19. deltacat/io/file_object_store.py +48 -0
  20. deltacat/io/memcached_object_store.py +121 -0
  21. deltacat/io/object_store.py +51 -0
  22. deltacat/io/ray_plasma_object_store.py +23 -0
  23. deltacat/io/redis_object_store.py +114 -0
  24. deltacat/io/s3_object_store.py +44 -0
  25. deltacat/storage/model/delta.py +2 -1
  26. deltacat/tests/compactor/test_compact_partition_params.py +237 -0
  27. deltacat/tests/compactor/utils/test_io.py +27 -5
  28. deltacat/tests/io/__init__.py +0 -0
  29. deltacat/tests/io/test_file_object_store.py +86 -0
  30. deltacat/tests/io/test_memcached_object_store.py +158 -0
  31. deltacat/tests/io/test_ray_plasma_object_store.py +54 -0
  32. deltacat/tests/io/test_redis_object_store.py +103 -0
  33. deltacat/tests/io/test_s3_object_store.py +59 -0
  34. deltacat/tests/utils/test_record_batch_tables.py +1 -1
  35. deltacat/tests/utils/test_resources.py +9 -0
  36. deltacat/utils/ray_utils/concurrency.py +0 -2
  37. deltacat/utils/resources.py +30 -18
  38. {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/METADATA +3 -1
  39. {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/RECORD +42 -27
  40. {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/LICENSE +0 -0
  41. {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/WHEEL +0 -0
  42. {deltacat-0.1.18b3.dist-info → deltacat-0.1.18b7.dist-info}/top_level.txt +0 -0
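The common thread across these changes is that the compactor's hash bucket, dedupe, and materialize steps now exchange intermediate data through a pluggable object store (deltacat/io/object_store.py) instead of raw ray.put/cloudpickle object references. That interface file is not displayed below, so the following is only a sketch of its surface, inferred from the put/put_many/get/get_many calls made in the diffs that follow; method signatures and bodies are assumptions.

from typing import Any, List


class IObjectStore:
    """Sketch of the pluggable object store surface; details are assumed."""

    def put(self, obj: object, *args, **kwargs) -> Any:
        """Store a single object and return a reference to it."""
        raise NotImplementedError

    def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
        """Store many objects and return their references in order."""
        raise NotImplementedError

    def get(self, ref: Any, *args, **kwargs) -> object:
        """Load a single object by reference."""
        raise NotImplementedError

    def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
        """Load many objects; result order matches the input references."""
        raise NotImplementedError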
deltacat/compute/compactor/steps/materialize.py CHANGED
@@ -5,10 +5,10 @@ from uuid import uuid4
  from collections import defaultdict
  from contextlib import nullcontext
  from itertools import chain, repeat
- from typing import List, Optional, Tuple, Dict, Any, Union
+ from typing import List, Optional, Tuple, Dict, Any
  import pyarrow as pa
+ import numpy as np
  import ray
- from ray import cloudpickle
  from deltacat import logs
  from deltacat.compute.compactor import (
  MaterializeResult,
@@ -27,15 +27,13 @@ from deltacat.storage import (
  PartitionLocator,
  Manifest,
  ManifestEntry,
- LocalDataset,
- LocalTable,
- DistributedDataset,
  )
  from deltacat.storage import interface as unimplemented_deltacat_storage
  from deltacat.utils.common import ReadKwargsProvider
  from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES, ContentType
  from deltacat.types.tables import TABLE_CLASS_TO_SIZE_FUNC
  from deltacat.utils.performance import timed_invocation
+ from deltacat.io.object_store import IObjectStore
  from deltacat.utils.pyarrow import (
  ReadKwargsProviderPyArrowCsvPureUtf8,
  ReadKwargsProviderPyArrowSchemaOverride,
@@ -46,6 +44,7 @@ from deltacat.utils.ray_utils.runtime import (
  get_current_ray_worker_id,
  )
  from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
+ from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes
 
  if importlib.util.find_spec("memray"):
  import memray
@@ -62,29 +61,15 @@ def materialize(
  dedupe_task_idx_and_obj_id_tuples: List[DedupeTaskIndexWithObjectId],
  max_records_per_output_file: int,
  compacted_file_content_type: ContentType,
+ enable_manifest_entry_copy_by_reference: bool,
  enable_profiler: bool,
  metrics_config: MetricsConfig,
  schema: Optional[pa.Schema] = None,
  read_kwargs_provider: Optional[ReadKwargsProvider] = None,
  s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
+ object_store: Optional[IObjectStore] = None,
  deltacat_storage=unimplemented_deltacat_storage,
  ):
- def _stage_delta_implementation(
- data: Union[LocalTable, LocalDataset, DistributedDataset, Manifest],
- partition: Partition,
- stage_delta_from_existing_manifest: Optional[bool],
- ) -> Delta:
- if stage_delta_from_existing_manifest:
- delta = Delta.of(
- locator=DeltaLocator.of(partition.locator),
- delta_type=DeltaType.UPSERT,
- meta=manifest.meta,
- manifest=data,
- previous_stream_position=partition.stream_position,
- properties={},
- )
- return delta
-
  def _stage_delta_from_manifest_entry_reference_list(
  manifest_entry_list_reference: List[ManifestEntry],
  partition: Partition,
@@ -94,11 +79,13 @@ def materialize(
  delta_type == DeltaType.UPSERT
  ), "Stage delta with existing manifest entries only supports UPSERT delta type!"
  manifest = Manifest.of(entries=manifest_entry_list_reference, uuid=str(uuid4()))
- delta = _stage_delta_implementation(
- data=manifest,
- partition=partition,
+ delta = Delta.of(
+ locator=DeltaLocator.of(partition.locator),
  delta_type=delta_type,
- stage_delta_from_existing_manifest=True,
+ meta=manifest.meta,
+ manifest=manifest,
+ previous_stream_position=partition.stream_position,
+ properties={},
  )
  return delta
 
@@ -160,18 +147,11 @@ def materialize(
  f"dedupe_{worker_id}_{task_id}.bin"
  ) if enable_profiler else nullcontext():
  start = time.time()
- dedupe_task_idx_and_obj_ref_tuples = [
- (
- t1,
- cloudpickle.loads(t2),
- )
- for t1, t2 in dedupe_task_idx_and_obj_id_tuples
- ]
  logger.info(f"Resolved materialize task obj refs...")
- dedupe_task_indices, obj_refs = zip(*dedupe_task_idx_and_obj_ref_tuples)
+ dedupe_task_indices, obj_refs = zip(*dedupe_task_idx_and_obj_id_tuples)
  # this depends on `ray.get` result order matching input order, as per the
  # contract established in: https://github.com/ray-project/ray/pull/16763
- src_file_records_list = ray.get(list(obj_refs))
+ src_file_records_list = object_store.get_many(list(obj_refs))
  all_src_file_records = defaultdict(list)
  for i, src_file_records in enumerate(src_file_records_list):
  dedupe_task_idx = dedupe_task_indices[i]
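Before this change, dedupe handed each materialize task a list of cloudpickled ObjectRefs that had to be unpickled and resolved with ray.get; hash bucketing did the mirror image with ray.put and cloudpickle.dumps (see the primary_key_index.py hunks further down). Both sides now go through object_store.put/get_many. The packaged Ray plasma implementation (deltacat/io/ray_plasma_object_store.py, +23 lines) is not shown in this diff, so the class below is only a hypothetical reconstruction of that old pattern wrapped behind the new interface:

import ray
from ray import cloudpickle
from typing import Any, List

from deltacat.io.object_store import IObjectStore


class PlasmaObjectStoreSketch(IObjectStore):  # hypothetical name, illustration only
    """Keeps objects in Ray's object store; refs are cloudpickled ObjectRefs."""

    def put(self, obj: object, *args, **kwargs) -> Any:
        obj_ref = ray.put(obj)
        return cloudpickle.dumps(obj_ref)

    def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
        return [self.put(obj) for obj in objects]

    def get(self, ref: Any, *args, **kwargs) -> object:
        return ray.get(cloudpickle.loads(ref))

    def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
        return ray.get([cloudpickle.loads(ref) for ref in refs])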
@@ -195,13 +175,13 @@ def materialize(
  is_src_partition_file_np = src_dfl.is_source_delta
  src_stream_position_np = src_dfl.stream_position
  src_file_idx_np = src_dfl.file_index
+ src_file_record_count = src_dfl.file_record_count.item()
  count_of_src_dfl += 1
  src_file_partition_locator = (
  source_partition_locator
  if is_src_partition_file_np
  else round_completion_info.compacted_delta_locator.partition_locator
  )
-
  delta_locator = DeltaLocator.of(
  src_file_partition_locator,
  src_stream_position_np.item(),
@@ -223,43 +203,45 @@ def materialize(
  read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(
  schema=schema
  )
- pa_table, download_delta_manifest_entry_time = timed_invocation(
- deltacat_storage.download_delta_manifest_entry,
- Delta.of(delta_locator, None, None, None, manifest),
- src_file_idx_np.item(),
- file_reader_kwargs_provider=read_kwargs_provider,
- )
- logger.debug(
- f"Time taken for materialize task"
- f" to download delta locator {delta_locator} with entry ID {src_file_idx_np.item()}"
- f" is: {download_delta_manifest_entry_time}s"
- )
  record_numbers = chain.from_iterable(record_numbers_tpl)
  record_numbers_length = 0
- mask_pylist = list(repeat(False, len(pa_table)))
+ mask_pylist = list(repeat(False, src_file_record_count))
  for record_number in record_numbers:
  record_numbers_length += 1
  mask_pylist[record_number] = True
  if (
- record_numbers_length == len(pa_table)
+ round_completion_info
+ and enable_manifest_entry_copy_by_reference
+ and record_numbers_length == src_file_record_count
  and src_file_partition_locator
  == round_completion_info.compacted_delta_locator.partition_locator
  ):
  logger.debug(
  f"Untouched manifest file found, "
  f"record numbers length: {record_numbers_length} "
- f"same as downloaded table length: {len(pa_table)}"
+ f"same as downloaded table length: {src_file_record_count}"
  )
  untouched_src_manifest_entry = manifest.entries[src_file_idx_np.item()]
  manifest_entry_list_reference.append(untouched_src_manifest_entry)
  referenced_pyarrow_write_result = PyArrowWriteResult.of(
- len(untouched_src_manifest_entry.entries),
- TABLE_CLASS_TO_SIZE_FUNC[type(pa_table)](pa_table),
- manifest.meta.content_length,
- len(pa_table),
+ 1,
+ untouched_src_manifest_entry.meta.source_content_length,
+ untouched_src_manifest_entry.meta.content_length,
+ src_file_record_count,
  )
  referenced_pyarrow_write_results.append(referenced_pyarrow_write_result)
  else:
+ pa_table, download_delta_manifest_entry_time = timed_invocation(
+ deltacat_storage.download_delta_manifest_entry,
+ Delta.of(delta_locator, None, None, None, manifest),
+ src_file_idx_np.item(),
+ file_reader_kwargs_provider=read_kwargs_provider,
+ )
+ logger.debug(
+ f"Time taken for materialize task"
+ f" to download delta locator {delta_locator} with entry ID {src_file_idx_np.item()}"
+ f" is: {download_delta_manifest_entry_time}s"
+ )
  mask = pa.array(mask_pylist)
  pa_table = pa_table.filter(mask)
  record_batch_tables.append(pa_table)
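The hunk above is the heart of the new copy-by-reference path: when a source file in the previously compacted partition survives dedupe with every record intact and the feature flag is on, its existing manifest entry is appended by reference and the table download and rewrite are skipped entirely. Distilled as a standalone predicate (illustrative only, not a helper that exists in the codebase):

def can_copy_by_reference(
    round_completion_info,
    enable_manifest_entry_copy_by_reference: bool,
    kept_record_count: int,
    src_file_record_count: int,
    src_file_partition_locator,
) -> bool:
    # All records of the source file survived dedupe, the flag is enabled, and
    # the file already lives in the previously compacted partition.
    return bool(
        round_completion_info
        and enable_manifest_entry_copy_by_reference
        and kept_record_count == src_file_record_count
        and src_file_partition_locator
        == round_completion_info.compacted_delta_locator.partition_locator
    )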
@@ -274,15 +256,11 @@ def materialize(
 
  referenced_manifest_delta = (
  _stage_delta_from_manifest_entry_reference_list(
- manifest_entry_list_reference
+ manifest_entry_list_reference, partition
  )
  if manifest_entry_list_reference
  else None
  )
- if referenced_manifest_delta:
- logger.info(
- f"Got delta with {len(referenced_manifest_delta.manifest.entries)} referenced manifest entries"
- )
 
  merged_materialized_delta = [mr.delta for mr in materialized_results]
  merged_materialized_delta.append(referenced_manifest_delta)
@@ -290,33 +268,58 @@ def materialize(
  [d for d in merged_materialized_delta if d is not None]
  )
 
- write_results_union = referenced_pyarrow_write_results
+ write_results_union = [*referenced_pyarrow_write_results]
  if materialized_results:
  for mr in materialized_results:
  write_results_union.append(mr.pyarrow_write_result)
  write_result = PyArrowWriteResult.union(write_results_union)
+ referenced_write_result = PyArrowWriteResult.union(
+ referenced_pyarrow_write_results
+ )
+
+ if referenced_manifest_delta:
+ logger.info(
+ f"Got delta with {len(referenced_manifest_delta.manifest.entries)} referenced manifest entries"
+ )
+ assert referenced_write_result.files == len(
+ referenced_manifest_delta.manifest.entries
+ ), "The files referenced must match with the entries in the delta"
+
+ assert write_result.files == len(
+ merged_delta.manifest.entries
+ ), "The total number of files written by materialize must match manifest entries"
 
  logger.debug(
- f"{len(write_results_union)} files written"
- f" with records: {[wr.records for wr in write_results_union]}"
- )
- # Merge all new deltas into one for this materialize bucket index
- merged_materialize_result = MaterializeResult.of(
- merged_delta,
- mat_bucket_index,
- write_result,
- len(manifest_entry_list_reference),
- count_of_src_dfl,
+ f"{write_result.files} files written"
+ f" with records: {write_result.records}"
  )
 
  logger.info(f"Finished materialize task...")
  end = time.time()
  duration = end - start
+
+ emit_metrics_time = 0.0
  if metrics_config:
- emit_timer_metrics(
+ emit_result, latency = timed_invocation(
+ func=emit_timer_metrics,
  metrics_name="materialize",
  value=duration,
  metrics_config=metrics_config,
  )
+ emit_metrics_time = latency
  logger.info(f"Materialize task ended in {end - start}s")
+
+ peak_memory_usage_bytes = get_current_node_peak_memory_usage_in_bytes()
+
+ # Merge all new deltas into one for this materialize bucket index
+ merged_materialize_result = MaterializeResult.of(
+ merged_delta,
+ mat_bucket_index,
+ write_result,
+ referenced_write_result,
+ np.double(peak_memory_usage_bytes),
+ np.double(emit_metrics_time),
+ np.double(time.time()),
+ )
+
  return merged_materialize_result
deltacat/compute/compactor/utils/io.py CHANGED
@@ -16,6 +16,9 @@ from deltacat import logs
  from deltacat.compute.compactor import DeltaAnnotated
  from typing import Dict, List, Optional, Tuple, Union
  from deltacat.compute.compactor import HighWatermark
+ from deltacat.compute.compactor.model.compaction_session_audit_info import (
+ CompactionSessionAuditInfo,
+ )
 
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
@@ -94,6 +97,7 @@ def limit_input_deltas(
  hash_bucket_count: int,
  user_hash_bucket_chunk_size: int,
  input_deltas_stats: Dict[int, DeltaStats],
+ compaction_audit: CompactionSessionAuditInfo,
  deltacat_storage=unimplemented_deltacat_storage,
  ) -> Tuple[List[DeltaAnnotated], int, HighWatermark, bool]:
  # TODO (pdames): when row counts are available in metadata, use them
@@ -236,6 +240,11 @@ def limit_input_deltas(
  # TODO (pdames): Test and add value for min_file_counts
  )
 
+ compaction_audit.set_input_size_bytes(delta_bytes)
+ compaction_audit.set_input_file_count(delta_manifest_entries)
+ compaction_audit.set_total_cluster_memory_bytes(worker_task_mem)
+ compaction_audit.set_hash_bucket_count(hash_bucket_count)
+
  logger.info(f"Hash bucket chunk size: {hash_bucket_chunk_size}")
  logger.info(f"Hash bucket count: {hash_bucket_count}")
  logger.info(f"Input uniform delta count: {len(rebatched_da_list)}")
@@ -246,6 +255,7 @@ def limit_input_deltas(
  def fit_input_deltas(
  input_deltas: List[Delta],
  cluster_resources: Dict[str, float],
+ compaction_audit: CompactionSessionAuditInfo,
  hash_bucket_count: Optional[int],
  deltacat_storage=unimplemented_deltacat_storage,
  ) -> Tuple[List[DeltaAnnotated], int, HighWatermark, bool]:
@@ -314,6 +324,11 @@ def fit_input_deltas(
  math.ceil(total_memory / MEMORY_TO_HASH_BUCKET_COUNT_RATIO)
  )
 
+ compaction_audit.set_input_file_count(total_files)
+ compaction_audit.set_input_size_bytes(delta_bytes)
+ compaction_audit.set_total_cluster_memory_bytes(total_memory)
+ compaction_audit.set_hash_bucket_count(hash_bucket_count)
+
  logger.info(
  f"Input delta bytes: {delta_bytes}, Total files: {total_files}, The worker_cpus: {worker_cpus}, "
  f" total_memory: {total_memory}, and hash_bucket_count: {hash_bucket_count}"
deltacat/compute/compactor/utils/primary_key_index.py CHANGED
@@ -7,7 +7,6 @@ import numpy as np
  import pyarrow as pa
  import ray
  import s3fs
- from ray import cloudpickle
  from ray.types import ObjectRef
 
  from deltacat import logs
@@ -30,6 +29,7 @@ from deltacat.types.media import ContentEncoding, ContentType
  from deltacat.types.tables import get_table_slicer, get_table_writer
  from deltacat.utils.common import ReadKwargsProvider
  from deltacat.utils.ray_utils.concurrency import invoke_parallel
+ from deltacat.io.object_store import IObjectStore
 
  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
@@ -203,7 +203,10 @@ def group_record_indices_by_hash_bucket(
 
 
  def group_hash_bucket_indices(
- hash_bucket_object_groups: np.ndarray, num_buckets: int, num_groups: int
+ hash_bucket_object_groups: np.ndarray,
+ num_buckets: int,
+ num_groups: int,
+ object_store: Optional[IObjectStore] = None,
  ) -> Tuple[np.ndarray, List[ObjectRef]]:
  """
  Groups all the ObjectRef that belongs to a particular hash bucket group and hash bucket index.
@@ -226,19 +229,10 @@ def group_hash_bucket_indices(
  for hb_group, obj in enumerate(hb_group_to_object):
  if obj is None:
  continue
- obj_ref = ray.put(obj)
- pickled_obj_ref = cloudpickle.dumps(obj_ref)
- object_refs.append(pickled_obj_ref)
- hash_bucket_group_to_obj_id[hb_group] = pickled_obj_ref
- # NOTE: The cloudpickle.dumps API call creates an out of band object reference to the object_ref variable.
- # After pickling, Ray cannot track the serialized copy of the object or determine when the ObjectRef has been deserialized
- # (e.g., if the ObjectRef is deserialized by a non-Ray process).
- # Thus the object_ref cannot be tracked by Ray's distributed reference counter, even if it goes out of scope.
- # The object now has a permanent reference and the data can't be freed from Ray’s object store.
- # Manually deleting the untrackable object references offsets these permanent references and
- # helps to allow these objects to be garbage collected normally.
- del obj_ref
- del pickled_obj_ref
+ object_ref = object_store.put(obj)
+ object_refs.append(object_ref)
+ hash_bucket_group_to_obj_id[hb_group] = object_ref
+ del object_ref
  return hash_bucket_group_to_obj_id, object_refs
 
 
deltacat/compute/compactor/utils/round_completion_file.py CHANGED
@@ -1,6 +1,6 @@
  import json
  import logging
-
+ from typing import Dict, Any
  from deltacat import logs
  from deltacat.compute.compactor import RoundCompletionInfo
  from deltacat.storage import PartitionLocator
@@ -19,7 +19,9 @@ def get_round_completion_file_s3_url(
 
 
  def read_round_completion_file(
- bucket: str, source_partition_locator: PartitionLocator
+ bucket: str,
+ source_partition_locator: PartitionLocator,
+ **s3_client_kwargs: Optional[Dict[str, Any]],
  ) -> RoundCompletionInfo:
 
  round_completion_file_url = get_round_completion_file_s3_url(
@@ -28,7 +30,7 @@ def read_round_completion_file(
  )
  logger.info(f"reading round completion file from: {round_completion_file_url}")
  round_completion_info = None
- result = s3_utils.download(round_completion_file_url, False)
+ result = s3_utils.download(round_completion_file_url, False, **s3_client_kwargs)
  if result:
  json_str = result["Body"].read().decode("utf-8")
  round_completion_info = RoundCompletionInfo(json.loads(json_str))
@@ -41,7 +43,10 @@ def write_round_completion_file(
  source_partition_locator: Optional[PartitionLocator],
  round_completion_info: RoundCompletionInfo,
  completion_file_s3_url: str = None,
+ **s3_client_kwargs: Optional[Dict[str, Any]],
  ) -> str:
+ if bucket is None and completion_file_s3_url is None:
+ raise AssertionError("Either bucket or completion_file_s3_url must be passed")
 
  logger.info(f"writing round completion file contents: {round_completion_info}")
  if completion_file_s3_url is None:
@@ -50,6 +55,10 @@ def write_round_completion_file(
  source_partition_locator,
  )
  logger.info(f"writing round completion file to: {completion_file_s3_url}")
- s3_utils.upload(completion_file_s3_url, str(json.dumps(round_completion_info)))
+ s3_utils.upload(
+ completion_file_s3_url,
+ str(json.dumps(round_completion_info)),
+ **s3_client_kwargs,
+ )
  logger.info(f"round completion file written to: {completion_file_s3_url}")
  return completion_file_s3_url
deltacat/compute/compactor/utils/system_columns.py CHANGED
@@ -64,6 +64,13 @@ _IS_SOURCE_COLUMN_FIELD = pa.field(
  _IS_SOURCE_COLUMN_TYPE,
  )
 
+ _FILE_RECORD_COUNT_COLUMN_NAME = _get_sys_col_name("file_record_count")
+ _FILE_RECORD_COUNT_COLUMN_TYPE = pa.int64()
+ _FILE_RECORD_COUNT_COLUMN_FIELD = pa.field(
+ _FILE_RECORD_COUNT_COLUMN_NAME,
+ _FILE_RECORD_COUNT_COLUMN_TYPE,
+ )
+
 
  def get_pk_hash_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
  return pa.array(obj, _PK_HASH_COLUMN_TYPE)
@@ -143,6 +150,17 @@ def get_is_source_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
  )
 
 
+ def file_record_count_column_np(table: pa.Table) -> np.ndarray:
+ return table[_FILE_RECORD_COUNT_COLUMN_NAME].to_numpy()
+
+
+ def get_file_record_count_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
+ return pa.array(
+ obj,
+ _FILE_RECORD_COUNT_COLUMN_TYPE,
+ )
+
+
  def project_delta_file_metadata_on_table(
  delta_file_envelope: DeltaFileEnvelope,
  ) -> pa.Table:
@@ -179,6 +197,12 @@ def project_delta_file_metadata_on_table(
  len(table),
  )
  table = append_is_source_col(table, is_source_iterator)
+
+ # append row count column
+ file_record_count_iterator = repeat(
+ delta_file_envelope.file_record_count, len(table)
+ )
+ table = append_file_record_count_col(table, file_record_count_iterator)
  return table
 
 
@@ -252,6 +276,14 @@ def append_is_source_col(table: pa.Table, booleans) -> pa.Table:
  return table
 
 
+ def append_file_record_count_col(table: pa.Table, file_record_count):
+ table = table.append_column(
+ _FILE_RECORD_COUNT_COLUMN_FIELD,
+ get_file_record_count_column_array(file_record_count),
+ )
+ return table
+
+
  def get_minimal_hb_schema() -> pa.schema:
  return pa.schema(
  [
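The new file_record_count system column carries each source file's total row count through hash bucketing so that materialize can detect untouched files without re-downloading them. A small round trip with the helpers added above, assuming the module import path implied by the file list:

from itertools import repeat

import pyarrow as pa

from deltacat.compute.compactor.utils import system_columns as sc

table = pa.table({"pk": ["a", "b", "c"]})
# Stamp every row with the originating file's record count (3 here).
table = sc.append_file_record_count_col(table, repeat(3, len(table)))
print(sc.file_record_count_column_np(table))  # -> [3 3 3]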
deltacat/io/__init__.py CHANGED
@@ -1,7 +0,0 @@
- from deltacat.io.dataset import DeltacatDataset
- from deltacat.io.read_api import read_redshift
-
- __all__ = [
- "DeltacatDataset",
- "read_redshift",
- ]
deltacat/io/file_object_store.py ADDED
@@ -0,0 +1,48 @@
+ import logging
+ from ray import cloudpickle
+ import time
+ from deltacat.io.object_store import IObjectStore
+ from typing import Any, List
+ from deltacat import logs
+ import os
+ import uuid
+ from builtins import open
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ class FileObjectStore(IObjectStore):
+ """
+ An implementation of object store that uses file system.
+ """
+
+ def __init__(self, dir_path: str) -> None:
+ self.dir_path = dir_path
+ super().__init__()
+
+ def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
+ result = []
+
+ for obj in objects:
+ serialized = cloudpickle.dumps(obj)
+ ref = f"{self.dir_path}/{uuid.uuid4()}"
+ with open(ref, "xb") as f:
+ f.write(serialized)
+
+ result.append(ref)
+
+ return result
+
+ def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
+ result = []
+ start = time.monotonic()
+ for ref in refs:
+ with open(ref, "rb") as f:
+ serialized = f.read()
+ loaded = cloudpickle.loads(serialized)
+ result.append(loaded)
+ os.remove(ref)
+ end = time.monotonic()
+
+ logger.info(f"The total time taken to read all objects is: {end - start}")
+ return result
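FileObjectStore round-trips objects through cloudpickle files in a shared directory and deletes each file after a successful read. A quick usage sketch:

import tempfile

from deltacat.io.file_object_store import FileObjectStore

with tempfile.TemporaryDirectory() as tmp_dir:
    store = FileObjectStore(dir_path=tmp_dir)
    refs = store.put_many([{"rows": 3}, [1, 2, 3]])
    # Files are removed as they are read back.
    objects = store.get_many(refs)
    assert objects == [{"rows": 3}, [1, 2, 3]]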
deltacat/io/memcached_object_store.py ADDED
@@ -0,0 +1,121 @@
+ import logging
+ from ray import cloudpickle
+ from collections import defaultdict
+ import time
+ from deltacat.io.object_store import IObjectStore
+ from typing import Any, List
+ from deltacat import logs
+ import uuid
+ import socket
+ from pymemcache.client.base import Client
+ from pymemcache.client.retrying import RetryingClient
+ from pymemcache.exceptions import MemcacheUnexpectedCloseError
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ class MemcachedObjectStore(IObjectStore):
+ """
+ An implementation of object store that uses Memcached.
+ """
+
+ def __init__(self, port=11212) -> None:
+ self.client_cache = {}
+ self.current_ip = None
+ self.SEPARATOR = "_"
+ self.port = port
+ super().__init__()
+
+ def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
+ input = {}
+ result = []
+ current_ip = self._get_current_ip()
+ for obj in objects:
+ serialized = cloudpickle.dumps(obj)
+ uid = uuid.uuid4()
+ ref = self._create_ref(uid, current_ip)
+ input[uid.__str__()] = serialized
+ result.append(ref)
+
+ client = self._get_client_by_ip(current_ip)
+ if client.set_many(input, noreply=False):
+ raise RuntimeError("Unable to write few keys to cache")
+
+ return result
+
+ def put(self, obj: object, *args, **kwargs) -> Any:
+ serialized = cloudpickle.dumps(obj)
+ uid = uuid.uuid4()
+ current_ip = self._get_current_ip()
+ ref = self._create_ref(uid, current_ip)
+ client = self._get_client_by_ip(current_ip)
+
+ if client.set(uid.__str__(), serialized):
+ return ref
+ else:
+ raise RuntimeError("Unable to write to cache")
+
+ def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
+ result = []
+ uid_per_ip = defaultdict(lambda: [])
+
+ start = time.monotonic()
+ for ref in refs:
+ uid, ip = ref.split(self.SEPARATOR)
+ uid_per_ip[ip].append(uid)
+
+ for (ip, uids) in uid_per_ip.items():
+ client = self._get_client_by_ip(ip)
+ cache_result = client.get_many(uids)
+ assert len(cache_result) == len(
+ uids
+ ), f"Not all values were returned from cache as {len(cache_result)} != {len(uids)}"
+
+ values = cache_result.values()
+ total_bytes = 0
+
+ deserialize_start = time.monotonic()
+ for serialized in values:
+ deserialized = cloudpickle.loads(serialized)
+ total_bytes += len(serialized)
+ result.append(deserialized)
+
+ deserialize_end = time.monotonic()
+ logger.debug(
+ f"The time taken to deserialize {total_bytes} bytes is: {deserialize_end - deserialize_start}",
+ )
+
+ end = time.monotonic()
+
+ logger.info(f"The total time taken to read all objects is: {end - start}")
+ return result
+
+ def get(self, ref: Any, *args, **kwargs) -> object:
+ uid, ip = ref.split(self.SEPARATOR)
+ client = self._get_client_by_ip(ip)
+ serialized = client.get(uid)
+ return cloudpickle.loads(serialized)
+
+ def _create_ref(self, uid, ip) -> str:
+ return f"{uid}{self.SEPARATOR}{ip}"
+
+ def _get_client_by_ip(self, ip_address: str):
+ if ip_address in self.client_cache:
+ return self.client_cache[ip_address]
+
+ base_client = Client((ip_address, self.port))
+ client = RetryingClient(
+ base_client,
+ attempts=3,
+ retry_delay=0.01,
+ retry_for=[MemcacheUnexpectedCloseError],
+ )
+
+ self.client_cache[ip_address] = client
+ return client
+
+ def _get_current_ip(self):
+ if self.current_ip is None:
+ self.current_ip = socket.gethostbyname(socket.gethostname())
+
+ return self.current_ip
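MemcachedObjectStore shards writes by the writing node's IP, encodes that IP into every returned ref ("<uuid>_<node-ip>"), and fans get_many out per node. Usage, assuming a memcached daemon is reachable on each node at the default port 11212:

from deltacat.io.memcached_object_store import MemcachedObjectStore

store = MemcachedObjectStore(port=11212)

ref = store.put({"rows": 42})          # ref looks like "<uuid>_<node-ip>"
assert store.get(ref) == {"rows": 42}

refs = store.put_many(["a", "b", "c"])
# Results are grouped by node during retrieval, so order may not match the input refs.
print(store.get_many(refs))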