deltacat 0.1.18b4__tar.gz → 0.1.18b6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (146)
  1. {deltacat-0.1.18b4/deltacat.egg-info → deltacat-0.1.18b6}/PKG-INFO +1 -1
  2. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/__init__.py +1 -1
  3. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/compaction_session.py +23 -2
  4. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/dedupe.py +9 -14
  5. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/hash_bucket.py +5 -3
  6. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/materialize.py +18 -37
  7. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/utils/primary_key_index.py +9 -15
  8. deltacat-0.1.18b6/deltacat/io/file_object_store.py +48 -0
  9. deltacat-0.1.18b6/deltacat/io/memcached_object_store.py +121 -0
  10. deltacat-0.1.18b6/deltacat/io/object_store.py +51 -0
  11. deltacat-0.1.18b6/deltacat/io/ray_plasma_object_store.py +23 -0
  12. deltacat-0.1.18b6/deltacat/io/redis_object_store.py +114 -0
  13. deltacat-0.1.18b6/deltacat/io/s3_object_store.py +44 -0
  14. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/tests/compactor/utils/test_io.py +4 -0
  15. deltacat-0.1.18b6/deltacat/tests/io/test_file_object_store.py +86 -0
  16. deltacat-0.1.18b6/deltacat/tests/io/test_memcached_object_store.py +158 -0
  17. deltacat-0.1.18b6/deltacat/tests/io/test_ray_plasma_object_store.py +54 -0
  18. deltacat-0.1.18b6/deltacat/tests/io/test_redis_object_store.py +103 -0
  19. deltacat-0.1.18b6/deltacat/tests/io/test_s3_object_store.py +59 -0
  20. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/tests/utils/test_resources.py +4 -0
  21. deltacat-0.1.18b6/deltacat/utils/__init__.py +0 -0
  22. deltacat-0.1.18b6/deltacat/utils/ray_utils/__init__.py +0 -0
  23. {deltacat-0.1.18b4 → deltacat-0.1.18b6/deltacat.egg-info}/PKG-INFO +1 -1
  24. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat.egg-info/SOURCES.txt +12 -0
  25. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat.egg-info/requires.txt +2 -0
  26. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/setup.py +2 -0
  27. deltacat-0.1.18b4/deltacat/io/__init__.py +0 -7
  28. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/LICENSE +0 -0
  29. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/MANIFEST.in +0 -0
  30. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/README.md +0 -0
  31. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/aws/__init__.py +0 -0
  32. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/aws/clients.py +0 -0
  33. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/aws/constants.py +0 -0
  34. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/aws/redshift/__init__.py +0 -0
  35. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/aws/redshift/model/__init__.py +0 -0
  36. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/aws/redshift/model/manifest.py +0 -0
  37. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/aws/s3u.py +0 -0
  38. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/catalog/__init__.py +0 -0
  39. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/catalog/delegate.py +0 -0
  40. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/catalog/interface.py +0 -0
  41. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/catalog/model/__init__.py +0 -0
  42. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/catalog/model/catalog.py +0 -0
  43. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/catalog/model/table_definition.py +0 -0
  44. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/__init__.py +0 -0
  45. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/__init__.py +0 -0
  46. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/__init__.py +0 -0
  47. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/compact_partition_params.py +0 -0
  48. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/compaction_session_audit_info.py +0 -0
  49. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/dedupe_result.py +0 -0
  50. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/delta_annotated.py +0 -0
  51. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/delta_file_envelope.py +0 -0
  52. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/delta_file_locator.py +0 -0
  53. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/hash_bucket_result.py +0 -0
  54. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/materialize_result.py +0 -0
  55. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/primary_key_index.py +0 -0
  56. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/pyarrow_write_result.py +0 -0
  57. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/repartition_result.py +0 -0
  58. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/round_completion_info.py +0 -0
  59. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/model/sort_key.py +0 -0
  60. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/repartition_session.py +0 -0
  61. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/__init__.py +0 -0
  62. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/rehash/__init__.py +0 -0
  63. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/rehash/rehash_bucket.py +0 -0
  64. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/rehash/rewrite_index.py +0 -0
  65. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/steps/repartition.py +0 -0
  66. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/utils/__init__.py +0 -0
  67. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/utils/io.py +0 -0
  68. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/utils/round_completion_file.py +0 -0
  69. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/compactor/utils/system_columns.py +0 -0
  70. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/__init__.py +0 -0
  71. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/config/__init__.py +0 -0
  72. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/meta_stats.py +0 -0
  73. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/model/__init__.py +0 -0
  74. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/model/partition_stats_dict.py +0 -0
  75. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/model/stats_cluster_size_estimator.py +0 -0
  76. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/stats.py +0 -0
  77. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/utils/__init__.py +0 -0
  78. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/utils/constants.py +0 -0
  79. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/utils/io.py +0 -0
  80. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +0 -0
  81. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/metastats/utils/ray_utils.py +0 -0
  82. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/__init__.py +0 -0
  83. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/basic.py +0 -0
  84. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/models/__init__.py +0 -0
  85. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/models/delta_column_stats.py +0 -0
  86. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/models/delta_stats.py +0 -0
  87. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/models/delta_stats_cache_result.py +0 -0
  88. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/models/manifest_entry_stats.py +0 -0
  89. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/models/stats_result.py +0 -0
  90. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/types.py +0 -0
  91. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/utils/__init__.py +0 -0
  92. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/utils/intervals.py +0 -0
  93. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/utils/io.py +0 -0
  94. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/compute/stats/utils/manifest_stats_file.py +0 -0
  95. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/constants.py +0 -0
  96. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/exceptions.py +0 -0
  97. {deltacat-0.1.18b4/deltacat/io/aws → deltacat-0.1.18b6/deltacat/io}/__init__.py +0 -0
  98. {deltacat-0.1.18b4/deltacat/io/aws/redshift → deltacat-0.1.18b6/deltacat/io/aws}/__init__.py +0 -0
  99. {deltacat-0.1.18b4/deltacat/storage/model → deltacat-0.1.18b6/deltacat/io/aws/redshift}/__init__.py +0 -0
  100. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/io/aws/redshift/redshift_datasource.py +0 -0
  101. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/io/dataset.py +0 -0
  102. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/io/read_api.py +0 -0
  103. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/logs.py +0 -0
  104. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/storage/__init__.py +0 -0
  105. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/storage/interface.py +0 -0
  106. {deltacat-0.1.18b4/deltacat/tests → deltacat-0.1.18b6/deltacat/storage/model}/__init__.py +0 -0
  107. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/storage/model/delta.py +0 -0
  108. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/storage/model/list_result.py +0 -0
  109. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/storage/model/locator.py +0 -0
  110. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/storage/model/namespace.py +0 -0
  111. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/storage/model/partition.py +0 -0
  112. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/storage/model/stream.py +0 -0
  113. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/storage/model/table.py +0 -0
  114. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/storage/model/table_version.py +0 -0
  115. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/storage/model/types.py +0 -0
  116. {deltacat-0.1.18b4/deltacat/tests/compactor → deltacat-0.1.18b6/deltacat/tests}/__init__.py +0 -0
  117. {deltacat-0.1.18b4/deltacat/tests/compactor/utils → deltacat-0.1.18b6/deltacat/tests/compactor}/__init__.py +0 -0
  118. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/tests/compactor/test_compact_partition_params.py +0 -0
  119. {deltacat-0.1.18b4/deltacat/tests/stats → deltacat-0.1.18b6/deltacat/tests/compactor/utils}/__init__.py +0 -0
  120. {deltacat-0.1.18b4/deltacat/tests/test_utils → deltacat-0.1.18b6/deltacat/tests/io}/__init__.py +0 -0
  121. {deltacat-0.1.18b4/deltacat/tests/utils → deltacat-0.1.18b6/deltacat/tests/stats}/__init__.py +0 -0
  122. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/tests/stats/test_intervals.py +0 -0
  123. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/tests/test_repartition.py +0 -0
  124. {deltacat-0.1.18b4/deltacat/types → deltacat-0.1.18b6/deltacat/tests/test_utils}/__init__.py +0 -0
  125. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/tests/test_utils/constants.py +0 -0
  126. {deltacat-0.1.18b4/deltacat → deltacat-0.1.18b6/deltacat/tests}/utils/__init__.py +0 -0
  127. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/tests/utils/test_record_batch_tables.py +0 -0
  128. {deltacat-0.1.18b4/deltacat/utils/ray_utils → deltacat-0.1.18b6/deltacat/types}/__init__.py +0 -0
  129. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/types/media.py +0 -0
  130. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/types/tables.py +0 -0
  131. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/common.py +0 -0
  132. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/metrics.py +0 -0
  133. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/numpy.py +0 -0
  134. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/pandas.py +0 -0
  135. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/performance.py +0 -0
  136. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/placement.py +0 -0
  137. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/pyarrow.py +0 -0
  138. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/ray_utils/collections.py +0 -0
  139. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/ray_utils/concurrency.py +0 -0
  140. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/ray_utils/dataset.py +0 -0
  141. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/ray_utils/performance.py +0 -0
  142. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/ray_utils/runtime.py +0 -0
  143. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat/utils/resources.py +0 -0
  144. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat.egg-info/dependency_links.txt +0 -0
  145. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/deltacat.egg-info/top_level.txt +0 -0
  146. {deltacat-0.1.18b4 → deltacat-0.1.18b6}/setup.cfg +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: deltacat
- Version: 0.1.18b4
+ Version: 0.1.18b6
  Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
  Home-page: https://github.com/ray-project/deltacat
  Author: Ray Team
deltacat/__init__.py
@@ -43,7 +43,7 @@ from deltacat.types.tables import TableWriteMode

  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))

- __version__ = "0.1.18b4"
+ __version__ = "0.1.18b6"


  __all__ = [
deltacat/compute/compactor/compaction_session.py
@@ -16,6 +16,8 @@ from deltacat.compute.compactor import (
  )
  from deltacat.compute.compactor.model.dedupe_result import DedupeResult
  from deltacat.compute.compactor.model.hash_bucket_result import HashBucketResult
+ from deltacat.io.object_store import IObjectStore
+ from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
  from deltacat.compute.compactor.model.materialize_result import MaterializeResult
  from deltacat.compute.stats.models.delta_stats import DeltaStats
  from deltacat.storage import (
@@ -112,6 +114,7 @@ def compact_partition(
      list_deltas_kwargs: Optional[Dict[str, Any]] = None,
      read_kwargs_provider: Optional[ReadKwargsProvider] = None,
      s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
+     object_store: Optional[IObjectStore] = RayPlasmaObjectStore(),
      deltacat_storage=unimplemented_deltacat_storage,
      **kwargs,
  ) -> Optional[str]:
@@ -151,6 +154,7 @@ def compact_partition(
          list_deltas_kwargs,
          read_kwargs_provider,
          s3_table_writer_kwargs,
+         object_store,
          deltacat_storage,
          **kwargs,
      )
@@ -196,6 +200,7 @@ def _execute_compaction_round(
      list_deltas_kwargs: Optional[Dict[str, Any]],
      read_kwargs_provider: Optional[ReadKwargsProvider],
      s3_table_writer_kwargs: Optional[Dict[str, Any]],
+     object_store: Optional[IObjectStore],
      deltacat_storage=unimplemented_deltacat_storage,
      **kwargs,
  ) -> Tuple[Optional[Partition], Optional[RoundCompletionInfo], Optional[str]]:
@@ -287,6 +292,13 @@ def _execute_compaction_round(
      )
      logger.info(f"Round completion file: {round_completion_info}")

+     enable_manifest_entry_copy_by_reference = (
+         False if rebase_source_partition_locator else True
+     )
+     logger.info(
+         f"Enable manifest entry copy by reference is set to: {enable_manifest_entry_copy_by_reference}"
+     )
+
      # discover input delta files
      # For rebase:
      # Copy the old compacted table to a new destination, plus any new deltas from rebased source
@@ -392,6 +404,7 @@ def _execute_compaction_round(
          enable_profiler=enable_profiler,
          metrics_config=metrics_config,
          read_kwargs_provider=read_kwargs_provider,
+         object_store=object_store,
          deltacat_storage=deltacat_storage,
      )

@@ -453,11 +466,16 @@ def _execute_compaction_round(
      logger.info(f"Materialize Bucket Count: {num_materialize_buckets}")

      dedupe_start = time.monotonic()
-
+     dd_max_parallelism = int(
+         max_parallelism * kwargs.get("dd_max_parallelism_ratio", 1)
+     )
+     logger.info(
+         f"dd max_parallelism is set to {dd_max_parallelism}, max_parallelism is {max_parallelism}"
+     )
      dd_tasks_pending = invoke_parallel(
          items=all_hash_group_idx_to_obj_id.values(),
          ray_task=dd.dedupe,
-         max_parallelism=max_parallelism,
+         max_parallelism=dd_max_parallelism,
          options_provider=round_robin_opt_provider,
          kwargs_provider=lambda index, item: {
              "dedupe_task_index": index,
@@ -467,6 +485,7 @@ def _execute_compaction_round(
          num_materialize_buckets=num_materialize_buckets,
          enable_profiler=enable_profiler,
          metrics_config=metrics_config,
+         object_store=object_store,
      )

      dedupe_invoke_end = time.monotonic()
@@ -537,12 +556,14 @@ def _execute_compaction_round(
          round_completion_info=round_completion_info,
          source_partition_locator=source_partition_locator,
          partition=partition,
+         enable_manifest_entry_copy_by_reference=enable_manifest_entry_copy_by_reference,
          max_records_per_output_file=records_per_compacted_file,
          compacted_file_content_type=compacted_file_content_type,
          enable_profiler=enable_profiler,
          metrics_config=metrics_config,
          read_kwargs_provider=read_kwargs_provider,
          s3_table_writer_kwargs=s3_table_writer_kwargs,
+         object_store=object_store,
          deltacat_storage=deltacat_storage,
      )

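Taken together, the compaction_session.py changes add two tuning points: a pluggable object_store (defaulting to RayPlasmaObjectStore()) that is threaded through the hash bucket, dedupe, and materialize steps, and an optional dd_max_parallelism_ratio kwarg that caps dedupe parallelism. A minimal sketch of the ratio arithmetic added above, with illustrative values that are not from the diff:

    # Mirrors the dd_max_parallelism computation in _execute_compaction_round.
    max_parallelism = 100
    kwargs = {"dd_max_parallelism_ratio": 0.5}
    dd_max_parallelism = int(max_parallelism * kwargs.get("dd_max_parallelism_ratio", 1))
    assert dd_max_parallelism == 50  # at most 50 parallel dedupe tasks

When the kwarg is omitted, the ratio defaults to 1 and dedupe runs at the full max_parallelism, preserving the previous behavior.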
deltacat/compute/compactor/steps/dedupe.py
@@ -1,5 +1,6 @@
  import importlib
  import logging
+ from typing import Optional
  import time
  from collections import defaultdict
  from contextlib import nullcontext
@@ -8,7 +9,6 @@ import numpy as np
  import pyarrow as pa
  import pyarrow.compute as pc
  import ray
- from ray import cloudpickle

  from deltacat import logs
  from deltacat.compute.compactor import (
@@ -25,6 +25,7 @@ from deltacat.utils.ray_utils.runtime import (
  )
  from deltacat.utils.performance import timed_invocation
  from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
+ from deltacat.io.object_store import IObjectStore
  from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes

  if importlib.util.find_spec("memray"):
@@ -106,6 +107,7 @@ def _timed_dedupe(
      num_materialize_buckets: int,
      dedupe_task_index: int,
      enable_profiler: bool,
+     object_store: Optional[IObjectStore],
  ):
      task_id = get_current_ray_task_id()
      worker_id = get_current_ray_worker_id()
@@ -114,15 +116,12 @@ def _timed_dedupe(
      ) if enable_profiler else nullcontext():
          # TODO (pdames): mitigate risk of running out of memory here in cases of
          # severe skew of primary key updates in deltas
-         src_file_records_obj_refs = [
-             cloudpickle.loads(obj_id_pkl) for obj_id_pkl in object_ids
-         ]
          logger.info(
              f"[Dedupe task {dedupe_task_index}] Getting delta file envelope "
-             f"groups for {len(src_file_records_obj_refs)} object refs..."
+             f"groups for {len(object_ids)} object refs..."
          )

-         delta_file_envelope_groups_list = ray.get(src_file_records_obj_refs)
+         delta_file_envelope_groups_list = object_store.get_many(object_ids)
          hb_index_to_delta_file_envelopes_list = defaultdict(list)
          for delta_file_envelope_groups in delta_file_envelope_groups_list:
              for hb_idx, dfes in enumerate(delta_file_envelope_groups):
@@ -201,7 +200,6 @@ def _timed_dedupe(
                  src_file_id_to_row_indices[src_dfl].append(row_idx_col[row_idx])

          logger.info(f"Finished all dedupe rounds...")
-         mat_bucket_to_src_file_record_count = defaultdict(dict)
          mat_bucket_to_src_file_records: Dict[
              MaterializeBucketIndex, DeltaFileLocatorToRecords
          ] = defaultdict(dict)
@@ -213,22 +211,17 @@ def _timed_dedupe(
              mat_bucket_to_src_file_records[mat_bucket][src_dfl] = np.array(
                  src_row_indices,
              )
-             mat_bucket_to_src_file_record_count[mat_bucket][src_dfl] = len(
-                 src_row_indices
-             )

          mat_bucket_to_dd_idx_obj_id: Dict[
              MaterializeBucketIndex, DedupeTaskIndexWithObjectId
          ] = {}
          for mat_bucket, src_file_records in mat_bucket_to_src_file_records.items():
-             object_ref = ray.put(src_file_records)
-             pickled_object_ref = cloudpickle.dumps(object_ref)
+             object_ref = object_store.put(src_file_records)
              mat_bucket_to_dd_idx_obj_id[mat_bucket] = (
                  dedupe_task_index,
-                 pickled_object_ref,
+                 object_ref,
              )
              del object_ref
-             del pickled_object_ref
          logger.info(
              f"Count of materialize buckets with object refs: "
              f"{len(mat_bucket_to_dd_idx_obj_id)}"
@@ -253,6 +246,7 @@ def dedupe(
      dedupe_task_index: int,
      enable_profiler: bool,
      metrics_config: MetricsConfig,
+     object_store: Optional[IObjectStore],
  ) -> DedupeResult:
      logger.info(f"[Dedupe task {dedupe_task_index}] Starting dedupe task...")
      dedupe_result, duration = timed_invocation(
@@ -262,6 +256,7 @@ def dedupe(
          num_materialize_buckets=num_materialize_buckets,
          dedupe_task_index=dedupe_task_index,
          enable_profiler=enable_profiler,
+         object_store=object_store,
      )

      emit_metrics_time = 0.0
deltacat/compute/compactor/steps/hash_bucket.py
@@ -31,6 +31,7 @@ from deltacat.utils.ray_utils.runtime import (
  from deltacat.utils.common import ReadKwargsProvider
  from deltacat.utils.performance import timed_invocation
  from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
+ from deltacat.io.object_store import IObjectStore
  from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes

  if importlib.util.find_spec("memray"):
@@ -179,6 +180,7 @@ def _timed_hash_bucket(
      num_groups: int,
      enable_profiler: bool,
      read_kwargs_provider: Optional[ReadKwargsProvider] = None,
+     object_store: Optional[IObjectStore] = None,
      deltacat_storage=unimplemented_deltacat_storage,
  ):
      task_id = get_current_ray_task_id()
@@ -207,9 +209,7 @@ def _timed_hash_bucket(
          deltacat_storage,
      )
      hash_bucket_group_to_obj_id, _ = group_hash_bucket_indices(
-         delta_file_envelope_groups,
-         num_buckets,
-         num_groups,
+         delta_file_envelope_groups, num_buckets, num_groups, object_store
      )

      peak_memory_usage_bytes = get_current_node_peak_memory_usage_in_bytes()
@@ -233,6 +233,7 @@ def hash_bucket(
      enable_profiler: bool,
      metrics_config: MetricsConfig,
      read_kwargs_provider: Optional[ReadKwargsProvider],
+     object_store: Optional[IObjectStore],
      deltacat_storage=unimplemented_deltacat_storage,
  ) -> HashBucketResult:

@@ -247,6 +248,7 @@ def hash_bucket(
          num_groups=num_groups,
          enable_profiler=enable_profiler,
          read_kwargs_provider=read_kwargs_provider,
+         object_store=object_store,
          deltacat_storage=deltacat_storage,
      )

deltacat/compute/compactor/steps/materialize.py
@@ -5,11 +5,10 @@ from uuid import uuid4
  from collections import defaultdict
  from contextlib import nullcontext
  from itertools import chain, repeat
- from typing import List, Optional, Tuple, Dict, Any, Union
+ from typing import List, Optional, Tuple, Dict, Any
  import pyarrow as pa
  import numpy as np
  import ray
- from ray import cloudpickle
  from deltacat import logs
  from deltacat.compute.compactor import (
      MaterializeResult,
@@ -28,15 +27,13 @@ from deltacat.storage import (
      PartitionLocator,
      Manifest,
      ManifestEntry,
-     LocalDataset,
-     LocalTable,
-     DistributedDataset,
  )
  from deltacat.storage import interface as unimplemented_deltacat_storage
  from deltacat.utils.common import ReadKwargsProvider
  from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES, ContentType
  from deltacat.types.tables import TABLE_CLASS_TO_SIZE_FUNC
  from deltacat.utils.performance import timed_invocation
+ from deltacat.io.object_store import IObjectStore
  from deltacat.utils.pyarrow import (
      ReadKwargsProviderPyArrowCsvPureUtf8,
      ReadKwargsProviderPyArrowSchemaOverride,
@@ -64,29 +61,15 @@ def materialize(
      dedupe_task_idx_and_obj_id_tuples: List[DedupeTaskIndexWithObjectId],
      max_records_per_output_file: int,
      compacted_file_content_type: ContentType,
+     enable_manifest_entry_copy_by_reference: bool,
      enable_profiler: bool,
      metrics_config: MetricsConfig,
      schema: Optional[pa.Schema] = None,
      read_kwargs_provider: Optional[ReadKwargsProvider] = None,
      s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
+     object_store: Optional[IObjectStore] = None,
      deltacat_storage=unimplemented_deltacat_storage,
  ):
-     def _stage_delta_implementation(
-         data: Union[LocalTable, LocalDataset, DistributedDataset, Manifest],
-         partition: Partition,
-         stage_delta_from_existing_manifest: Optional[bool],
-     ) -> Delta:
-         if stage_delta_from_existing_manifest:
-             delta = Delta.of(
-                 locator=DeltaLocator.of(partition.locator),
-                 delta_type=DeltaType.UPSERT,
-                 meta=manifest.meta,
-                 manifest=data,
-                 previous_stream_position=partition.stream_position,
-                 properties={},
-             )
-         return delta
-
      def _stage_delta_from_manifest_entry_reference_list(
          manifest_entry_list_reference: List[ManifestEntry],
          partition: Partition,
@@ -96,10 +79,13 @@ def materialize(
              delta_type == DeltaType.UPSERT
          ), "Stage delta with existing manifest entries only supports UPSERT delta type!"
          manifest = Manifest.of(entries=manifest_entry_list_reference, uuid=str(uuid4()))
-         delta = _stage_delta_implementation(
-             data=manifest,
-             partition=partition,
-             stage_delta_from_existing_manifest=True,
+         delta = Delta.of(
+             locator=DeltaLocator.of(partition.locator),
+             delta_type=delta_type,
+             meta=manifest.meta,
+             manifest=manifest,
+             previous_stream_position=partition.stream_position,
+             properties={},
          )
          return delta

@@ -161,18 +147,11 @@ def materialize(
          f"dedupe_{worker_id}_{task_id}.bin"
      ) if enable_profiler else nullcontext():
          start = time.time()
-         dedupe_task_idx_and_obj_ref_tuples = [
-             (
-                 t1,
-                 cloudpickle.loads(t2),
-             )
-             for t1, t2 in dedupe_task_idx_and_obj_id_tuples
-         ]
          logger.info(f"Resolved materialize task obj refs...")
-         dedupe_task_indices, obj_refs = zip(*dedupe_task_idx_and_obj_ref_tuples)
+         dedupe_task_indices, obj_refs = zip(*dedupe_task_idx_and_obj_id_tuples)
          # this depends on `ray.get` result order matching input order, as per the
          # contract established in: https://github.com/ray-project/ray/pull/16763
-         src_file_records_list = ray.get(list(obj_refs))
+         src_file_records_list = object_store.get_many(list(obj_refs))
          all_src_file_records = defaultdict(list)
          for i, src_file_records in enumerate(src_file_records_list):
              dedupe_task_idx = dedupe_task_indices[i]
@@ -231,7 +210,9 @@ def materialize(
                  record_numbers_length += 1
                  mask_pylist[record_number] = True
              if (
-                 record_numbers_length == src_file_record_count
+                 round_completion_info
+                 and enable_manifest_entry_copy_by_reference
+                 and record_numbers_length == src_file_record_count
                  and src_file_partition_locator
                  == round_completion_info.compacted_delta_locator.partition_locator
              ):
@@ -244,8 +225,8 @@ def materialize(
                  manifest_entry_list_reference.append(untouched_src_manifest_entry)
                  referenced_pyarrow_write_result = PyArrowWriteResult.of(
                      1,
-                     manifest.meta.source_content_length,
-                     manifest.meta.content_length,
+                     untouched_src_manifest_entry.meta.source_content_length,
+                     untouched_src_manifest_entry.meta.content_length,
                      src_file_record_count,
                  )
                  referenced_pyarrow_write_results.append(referenced_pyarrow_write_result)
deltacat/compute/compactor/utils/primary_key_index.py
@@ -7,7 +7,6 @@ import numpy as np
  import pyarrow as pa
  import ray
  import s3fs
- from ray import cloudpickle
  from ray.types import ObjectRef

  from deltacat import logs
@@ -30,6 +29,7 @@ from deltacat.types.media import ContentEncoding, ContentType
  from deltacat.types.tables import get_table_slicer, get_table_writer
  from deltacat.utils.common import ReadKwargsProvider
  from deltacat.utils.ray_utils.concurrency import invoke_parallel
+ from deltacat.io.object_store import IObjectStore

  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

@@ -203,7 +203,10 @@ def group_record_indices_by_hash_bucket(


  def group_hash_bucket_indices(
-     hash_bucket_object_groups: np.ndarray, num_buckets: int, num_groups: int
+     hash_bucket_object_groups: np.ndarray,
+     num_buckets: int,
+     num_groups: int,
+     object_store: Optional[IObjectStore] = None,
  ) -> Tuple[np.ndarray, List[ObjectRef]]:
      """
      Groups all the ObjectRef that belongs to a particular hash bucket group and hash bucket index.
@@ -226,19 +229,10 @@ def group_hash_bucket_indices(
      for hb_group, obj in enumerate(hb_group_to_object):
          if obj is None:
              continue
-         obj_ref = ray.put(obj)
-         pickled_obj_ref = cloudpickle.dumps(obj_ref)
-         object_refs.append(pickled_obj_ref)
-         hash_bucket_group_to_obj_id[hb_group] = pickled_obj_ref
-         # NOTE: The cloudpickle.dumps API call creates an out of band object reference to the object_ref variable.
-         # After pickling, Ray cannot track the serialized copy of the object or determine when the ObjectRef has been deserialized
-         # (e.g., if the ObjectRef is deserialized by a non-Ray process).
-         # Thus the object_ref cannot be tracked by Ray's distributed reference counter, even if it goes out of scope.
-         # The object now has a permanent reference and the data can't be freed from Ray's object store.
-         # Manually deleting the untrackable object references offsets these permanent references and
-         # helps to allow these objects to be garbage collected normally.
-         del obj_ref
-         del pickled_obj_ref
+         object_ref = object_store.put(obj)
+         object_refs.append(object_ref)
+         hash_bucket_group_to_obj_id[hb_group] = object_ref
+         del object_ref
      return hash_bucket_group_to_obj_id, object_refs


deltacat/io/file_object_store.py (new)
@@ -0,0 +1,48 @@
+ import logging
+ from ray import cloudpickle
+ import time
+ from deltacat.io.object_store import IObjectStore
+ from typing import Any, List
+ from deltacat import logs
+ import os
+ import uuid
+ from builtins import open
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ class FileObjectStore(IObjectStore):
+     """
+     An implementation of object store that uses file system.
+     """
+
+     def __init__(self, dir_path: str) -> None:
+         self.dir_path = dir_path
+         super().__init__()
+
+     def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
+         result = []
+
+         for obj in objects:
+             serialized = cloudpickle.dumps(obj)
+             ref = f"{self.dir_path}/{uuid.uuid4()}"
+             with open(ref, "xb") as f:
+                 f.write(serialized)
+
+             result.append(ref)
+
+         return result
+
+     def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
+         result = []
+         start = time.monotonic()
+         for ref in refs:
+             with open(ref, "rb") as f:
+                 serialized = f.read()
+                 loaded = cloudpickle.loads(serialized)
+                 result.append(loaded)
+             os.remove(ref)
+         end = time.monotonic()
+
+         logger.info(f"The total time taken to read all objects is: {end - start}")
+         return result
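For orientation, a quick round-trip through the new FileObjectStore (this example is not part of the diff): it assumes the target directory already exists, and since get_many deletes each backing file after reading it, a reference is effectively single-use.

    import tempfile
    from deltacat.io.file_object_store import FileObjectStore

    store = FileObjectStore(dir_path=tempfile.mkdtemp())   # writable, pre-existing dir
    refs = store.put_many([{"pk": 1}, {"pk": 2}])          # one file per object
    assert store.get_many(refs) == [{"pk": 1}, {"pk": 2}]  # files are removed after the read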
deltacat/io/memcached_object_store.py (new)
@@ -0,0 +1,121 @@
+ import logging
+ from ray import cloudpickle
+ from collections import defaultdict
+ import time
+ from deltacat.io.object_store import IObjectStore
+ from typing import Any, List
+ from deltacat import logs
+ import uuid
+ import socket
+ from pymemcache.client.base import Client
+ from pymemcache.client.retrying import RetryingClient
+ from pymemcache.exceptions import MemcacheUnexpectedCloseError
+
+ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+ class MemcachedObjectStore(IObjectStore):
+     """
+     An implementation of object store that uses Memcached.
+     """
+
+     def __init__(self, port=11212) -> None:
+         self.client_cache = {}
+         self.current_ip = None
+         self.SEPARATOR = "_"
+         self.port = port
+         super().__init__()
+
+     def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
+         input = {}
+         result = []
+         current_ip = self._get_current_ip()
+         for obj in objects:
+             serialized = cloudpickle.dumps(obj)
+             uid = uuid.uuid4()
+             ref = self._create_ref(uid, current_ip)
+             input[uid.__str__()] = serialized
+             result.append(ref)
+
+         client = self._get_client_by_ip(current_ip)
+         if client.set_many(input, noreply=False):
+             raise RuntimeError("Unable to write few keys to cache")
+
+         return result
+
+     def put(self, obj: object, *args, **kwargs) -> Any:
+         serialized = cloudpickle.dumps(obj)
+         uid = uuid.uuid4()
+         current_ip = self._get_current_ip()
+         ref = self._create_ref(uid, current_ip)
+         client = self._get_client_by_ip(current_ip)
+
+         if client.set(uid.__str__(), serialized):
+             return ref
+         else:
+             raise RuntimeError("Unable to write to cache")
+
+     def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
+         result = []
+         uid_per_ip = defaultdict(lambda: [])
+
+         start = time.monotonic()
+         for ref in refs:
+             uid, ip = ref.split(self.SEPARATOR)
+             uid_per_ip[ip].append(uid)
+
+         for (ip, uids) in uid_per_ip.items():
+             client = self._get_client_by_ip(ip)
+             cache_result = client.get_many(uids)
+             assert len(cache_result) == len(
+                 uids
+             ), f"Not all values were returned from cache as {len(cache_result)} != {len(uids)}"
+
+             values = cache_result.values()
+             total_bytes = 0
+
+             deserialize_start = time.monotonic()
+             for serialized in values:
+                 deserialized = cloudpickle.loads(serialized)
+                 total_bytes += len(serialized)
+                 result.append(deserialized)
+
+             deserialize_end = time.monotonic()
+             logger.debug(
+                 f"The time taken to deserialize {total_bytes} bytes is: {deserialize_end - deserialize_start}",
+             )
+
+         end = time.monotonic()
+
+         logger.info(f"The total time taken to read all objects is: {end - start}")
+         return result
+
+     def get(self, ref: Any, *args, **kwargs) -> object:
+         uid, ip = ref.split(self.SEPARATOR)
+         client = self._get_client_by_ip(ip)
+         serialized = client.get(uid)
+         return cloudpickle.loads(serialized)
+
+     def _create_ref(self, uid, ip) -> str:
+         return f"{uid}{self.SEPARATOR}{ip}"
+
+     def _get_client_by_ip(self, ip_address: str):
+         if ip_address in self.client_cache:
+             return self.client_cache[ip_address]
+
+         base_client = Client((ip_address, self.port))
+         client = RetryingClient(
+             base_client,
+             attempts=3,
+             retry_delay=0.01,
+             retry_for=[MemcacheUnexpectedCloseError],
+         )
+
+         self.client_cache[ip_address] = client
+         return client
+
+     def _get_current_ip(self):
+         if self.current_ip is None:
+             self.current_ip = socket.gethostbyname(socket.gethostname())
+
+         return self.current_ip
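A hedged usage sketch for the new MemcachedObjectStore: it assumes a memcached server is reachable on each participating node at the non-default port 11212 used above. Because every reference embeds the writer's IP, get_many can route reads back to the node that wrote each key.

    from deltacat.io.memcached_object_store import MemcachedObjectStore

    store = MemcachedObjectStore(port=11212)  # requires memcached listening on 11212
    ref = store.put({"pk": 42})               # ref has the form "<uuid>_<writer ip>"
    assert store.get(ref) == {"pk": 42}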
deltacat/io/object_store.py (new)
@@ -0,0 +1,51 @@
+ from typing import List, Any
+
+
+ class IObjectStore:
+     """
+     An object store interface.
+     """
+
+     def setup(self, *args, **kwargs) -> Any:
+         ...
+
+     """
+     Sets up everything needed to run the object store.
+     """
+
+     def put(self, obj: object, *args, **kwargs) -> Any:
+         """
+         Put a single object into the object store. Depending
+         on the implementation, this method can be sync or async.
+         """
+         return self.put_many([obj])[0]
+
+     def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
+         ...
+
+     """
+     Put many objects into the object store. It would return an ordered list
+     of object references corresponding to each object in the input.
+     """
+
+     def get(self, ref: Any, *args, **kwargs) -> object:
+         """
+         Get a single object from an object store.
+         """
+         return self.get_many([ref])[0]
+
+     def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
+         ...
+
+     """
+     Get a list of objects from the object store. Use this method to
+     avoid multiple get calls. Note that depending on implementation it may
+     or may not return ordered results.
+     """
+
+     def clear(self, *args, **kwargs) -> bool:
+         ...
+
+     """
+     Clears the object store and all the associated data in it.
+     """
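Because the default put and get above simply delegate to put_many and get_many, a new backend only needs to implement the batched pair (plus clear, if supported). A minimal in-memory implementation, shown purely to illustrate the contract and not shipped in the package:

    import uuid
    from typing import Any, List

    from deltacat.io.object_store import IObjectStore

    class InMemoryObjectStore(IObjectStore):
        # Illustrative single-process backend: refs are plain UUID strings.
        def __init__(self) -> None:
            self.data = {}

        def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
            refs = []
            for obj in objects:
                ref = str(uuid.uuid4())
                self.data[ref] = obj
                refs.append(ref)
            return refs

        def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
            return [self.data[ref] for ref in refs]

        def clear(self, *args, **kwargs) -> bool:
            self.data.clear()
            return True

    store = InMemoryObjectStore()
    ref = store.put("hello")          # inherited put delegates to put_many
    assert store.get(ref) == "hello"  # inherited get delegates to get_many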
deltacat/io/ray_plasma_object_store.py (new)
@@ -0,0 +1,23 @@
+ import ray
+ from ray import cloudpickle
+ from deltacat.io.object_store import IObjectStore
+ from typing import Any, List
+
+
+ class RayPlasmaObjectStore(IObjectStore):
+     """
+     An implementation of object store that uses Ray plasma object store.
+     """
+
+     def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
+         result = []
+         for obj in objects:
+             object_ref = ray.put(obj)
+             pickled = cloudpickle.dumps(object_ref)
+             result.append(pickled)
+
+         return result
+
+     def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
+         loaded_refs = [cloudpickle.loads(obj_id) for obj_id in refs]
+         return ray.get(loaded_refs)
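RayPlasmaObjectStore wraps the pre-diff pattern (ray.put followed by cloudpickle.dumps of the ObjectRef) behind the new interface, so the references it returns are plain bytes that can be shipped between tasks. A round-trip, assuming an initialized Ray runtime; note that, as the comment removed from primary_key_index.py explains, pickled refs pin their objects in plasma until explicitly deleted.

    import ray
    from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore

    ray.init(ignore_reinit_error=True)
    store = RayPlasmaObjectStore()
    refs = store.put_many(["a", "b"])          # each ref is a cloudpickled ObjectRef
    assert store.get_many(refs) == ["a", "b"]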