deltacat 0.1.18b4__py3-none-any.whl → 0.1.18b7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/compute/compactor/compaction_session.py +46 -6
- deltacat/compute/compactor/steps/dedupe.py +9 -14
- deltacat/compute/compactor/steps/hash_bucket.py +5 -3
- deltacat/compute/compactor/steps/materialize.py +18 -37
- deltacat/compute/compactor/utils/primary_key_index.py +9 -15
- deltacat/compute/compactor/utils/round_completion_file.py +11 -4
- deltacat/io/__init__.py +0 -7
- deltacat/io/file_object_store.py +48 -0
- deltacat/io/memcached_object_store.py +121 -0
- deltacat/io/object_store.py +51 -0
- deltacat/io/ray_plasma_object_store.py +23 -0
- deltacat/io/redis_object_store.py +114 -0
- deltacat/io/s3_object_store.py +44 -0
- deltacat/tests/compactor/utils/test_io.py +4 -0
- deltacat/tests/io/__init__.py +0 -0
- deltacat/tests/io/test_file_object_store.py +86 -0
- deltacat/tests/io/test_memcached_object_store.py +158 -0
- deltacat/tests/io/test_ray_plasma_object_store.py +54 -0
- deltacat/tests/io/test_redis_object_store.py +103 -0
- deltacat/tests/io/test_s3_object_store.py +59 -0
- deltacat/tests/utils/test_resources.py +4 -0
- {deltacat-0.1.18b4.dist-info → deltacat-0.1.18b7.dist-info}/METADATA +3 -1
- {deltacat-0.1.18b4.dist-info → deltacat-0.1.18b7.dist-info}/RECORD +27 -15
- {deltacat-0.1.18b4.dist-info → deltacat-0.1.18b7.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b4.dist-info → deltacat-0.1.18b7.dist-info}/WHEEL +0 -0
- {deltacat-0.1.18b4.dist-info → deltacat-0.1.18b7.dist-info}/top_level.txt +0 -0
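The diffs below center on a new pluggable object store layer: the compaction steps now exchange intermediate hash-bucket and dedupe results through an object_store argument (defaulting to RayPlasmaObjectStore) instead of hand-pickled Ray object refs, and the S3 helpers gain an explicit s3_client_kwargs pass-through. As a non-authoritative orientation sketch only, inferred from the put/put_many/get_many calls visible in these diffs rather than copied from the deltacat source (the put-to-put_many delegation in particular is an assumption), the IObjectStore contract plausibly looks like this:

# Sketch of the IObjectStore contract inferred from calls in this diff
# (object_store.put(...), object_store.get_many(...), FileObjectStore.put_many(...)).
# Not the actual deltacat definition; put() delegating to put_many() is an assumption.
from typing import Any, List


class IObjectStore:
    def put(self, obj: object, *args, **kwargs) -> Any:
        # Store one object and return an opaque reference to it.
        return self.put_many([obj], *args, **kwargs)[0]

    def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
        # Store many objects and return one reference per input object.
        raise NotImplementedError

    def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
        # Resolve references back into objects, preserving input order
        # (the materialize step relies on results matching the order of the refs).
        raise NotImplementedError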
deltacat/compute/compactor/compaction_session.py
CHANGED
@@ -16,6 +16,8 @@ from deltacat.compute.compactor import (
 )
 from deltacat.compute.compactor.model.dedupe_result import DedupeResult
 from deltacat.compute.compactor.model.hash_bucket_result import HashBucketResult
+from deltacat.io.object_store import IObjectStore
+from deltacat.io.ray_plasma_object_store import RayPlasmaObjectStore
 from deltacat.compute.compactor.model.materialize_result import MaterializeResult
 from deltacat.compute.stats.models.delta_stats import DeltaStats
 from deltacat.storage import (
@@ -112,6 +114,8 @@ def compact_partition(
     list_deltas_kwargs: Optional[Dict[str, Any]] = None,
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
     s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
+    object_store: Optional[IObjectStore] = RayPlasmaObjectStore(),
+    s3_client_kwargs: Optional[Dict[str, Any]] = None,
     deltacat_storage=unimplemented_deltacat_storage,
     **kwargs,
 ) -> Optional[str]:
@@ -151,6 +155,8 @@ def compact_partition(
             list_deltas_kwargs,
             read_kwargs_provider,
             s3_table_writer_kwargs,
+            object_store,
+            s3_client_kwargs,
             deltacat_storage,
             **kwargs,
         )
@@ -196,6 +202,8 @@ def _execute_compaction_round(
     list_deltas_kwargs: Optional[Dict[str, Any]],
     read_kwargs_provider: Optional[ReadKwargsProvider],
     s3_table_writer_kwargs: Optional[Dict[str, Any]],
+    object_store: Optional[IObjectStore],
+    s3_client_kwargs: Optional[Dict[str, Any]],
     deltacat_storage=unimplemented_deltacat_storage,
     **kwargs,
 ) -> Tuple[Optional[Partition], Optional[RoundCompletionInfo], Optional[str]]:
@@ -287,6 +295,13 @@ def _execute_compaction_round(
     )
     logger.info(f"Round completion file: {round_completion_info}")

+    enable_manifest_entry_copy_by_reference = (
+        False if rebase_source_partition_locator else True
+    )
+    logger.info(
+        f"Enable manifest entry copy by reference is set to: {enable_manifest_entry_copy_by_reference}"
+    )
+
     # discover input delta files
     # For rebase:
     # Copy the old compacted table to a new destination, plus any new deltas from rebased source
@@ -318,7 +333,11 @@
         delta_discovery_end - delta_discovery_start
     )

-    s3_utils.upload(compaction_audit.audit_url, str(json.dumps(compaction_audit)))
+    s3_utils.upload(
+        compaction_audit.audit_url,
+        str(json.dumps(compaction_audit)),
+        **s3_client_kwargs,
+    )

     if not input_deltas:
         logger.info("No input deltas found to compact.")
@@ -392,6 +411,7 @@
         enable_profiler=enable_profiler,
         metrics_config=metrics_config,
         read_kwargs_provider=read_kwargs_provider,
+        object_store=object_store,
         deltacat_storage=deltacat_storage,
     )

@@ -411,7 +431,11 @@
         hb_end - hb_start,
     )

-    s3_utils.upload(compaction_audit.audit_url, str(json.dumps(compaction_audit)))
+    s3_utils.upload(
+        compaction_audit.audit_url,
+        str(json.dumps(compaction_audit)),
+        **s3_client_kwargs,
+    )

     all_hash_group_idx_to_obj_id = defaultdict(list)
     for hb_result in hb_results:
@@ -453,11 +477,16 @@
     logger.info(f"Materialize Bucket Count: {num_materialize_buckets}")

     dedupe_start = time.monotonic()
-
+    dd_max_parallelism = int(
+        max_parallelism * kwargs.get("dd_max_parallelism_ratio", 1)
+    )
+    logger.info(
+        f"dd max_parallelism is set to {dd_max_parallelism}, max_parallelism is {max_parallelism}"
+    )
     dd_tasks_pending = invoke_parallel(
         items=all_hash_group_idx_to_obj_id.values(),
         ray_task=dd.dedupe,
-        max_parallelism=max_parallelism,
+        max_parallelism=dd_max_parallelism,
         options_provider=round_robin_opt_provider,
         kwargs_provider=lambda index, item: {
             "dedupe_task_index": index,
@@ -467,6 +496,7 @@
         num_materialize_buckets=num_materialize_buckets,
         enable_profiler=enable_profiler,
         metrics_config=metrics_config,
+        object_store=object_store,
     )

     dedupe_invoke_end = time.monotonic()
@@ -520,7 +550,11 @@
     # parallel step 3:
     # materialize records to keep by index

-    s3_utils.upload(compaction_audit.audit_url, str(json.dumps(compaction_audit)))
+    s3_utils.upload(
+        compaction_audit.audit_url,
+        str(json.dumps(compaction_audit)),
+        **s3_client_kwargs,
+    )

     materialize_start = time.monotonic()

@@ -537,12 +571,14 @@
         round_completion_info=round_completion_info,
         source_partition_locator=source_partition_locator,
         partition=partition,
+        enable_manifest_entry_copy_by_reference=enable_manifest_entry_copy_by_reference,
         max_records_per_output_file=records_per_compacted_file,
         compacted_file_content_type=compacted_file_content_type,
         enable_profiler=enable_profiler,
         metrics_config=metrics_config,
         read_kwargs_provider=read_kwargs_provider,
         s3_table_writer_kwargs=s3_table_writer_kwargs,
+        object_store=object_store,
         deltacat_storage=deltacat_storage,
     )

@@ -620,7 +656,11 @@
         mat_results, telemetry_time_hb + telemetry_time_dd + telemetry_time_materialize
     )

-    s3_utils.upload(compaction_audit.audit_url, str(json.dumps(compaction_audit)))
+    s3_utils.upload(
+        compaction_audit.audit_url,
+        str(json.dumps(compaction_audit)),
+        **s3_client_kwargs,
+    )

     new_round_completion_info = RoundCompletionInfo.of(
         last_stream_position_compacted,

deltacat/compute/compactor/steps/dedupe.py
CHANGED
@@ -1,5 +1,6 @@
 import importlib
 import logging
+from typing import Optional
 import time
 from collections import defaultdict
 from contextlib import nullcontext
@@ -8,7 +9,6 @@ import numpy as np
 import pyarrow as pa
 import pyarrow.compute as pc
 import ray
-from ray import cloudpickle

 from deltacat import logs
 from deltacat.compute.compactor import (
@@ -25,6 +25,7 @@ from deltacat.utils.ray_utils.runtime import (
 )
 from deltacat.utils.performance import timed_invocation
 from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
+from deltacat.io.object_store import IObjectStore
 from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes

 if importlib.util.find_spec("memray"):
@@ -106,6 +107,7 @@ def _timed_dedupe(
     num_materialize_buckets: int,
     dedupe_task_index: int,
     enable_profiler: bool,
+    object_store: Optional[IObjectStore],
 ):
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
@@ -114,15 +116,12 @@ def _timed_dedupe(
     ) if enable_profiler else nullcontext():
         # TODO (pdames): mitigate risk of running out of memory here in cases of
         # severe skew of primary key updates in deltas
-        src_file_records_obj_refs = [
-            cloudpickle.loads(obj_id_pkl) for obj_id_pkl in object_ids
-        ]
         logger.info(
             f"[Dedupe task {dedupe_task_index}] Getting delta file envelope "
-            f"groups for {len(src_file_records_obj_refs)} object refs..."
+            f"groups for {len(object_ids)} object refs..."
         )

-        delta_file_envelope_groups_list = ray.get(src_file_records_obj_refs)
+        delta_file_envelope_groups_list = object_store.get_many(object_ids)
         hb_index_to_delta_file_envelopes_list = defaultdict(list)
         for delta_file_envelope_groups in delta_file_envelope_groups_list:
             for hb_idx, dfes in enumerate(delta_file_envelope_groups):
@@ -201,7 +200,6 @@ def _timed_dedupe(
                 src_file_id_to_row_indices[src_dfl].append(row_idx_col[row_idx])

         logger.info(f"Finished all dedupe rounds...")
-        mat_bucket_to_src_file_record_count = defaultdict(dict)
         mat_bucket_to_src_file_records: Dict[
             MaterializeBucketIndex, DeltaFileLocatorToRecords
         ] = defaultdict(dict)
@@ -213,22 +211,17 @@ def _timed_dedupe(
             mat_bucket_to_src_file_records[mat_bucket][src_dfl] = np.array(
                 src_row_indices,
             )
-            mat_bucket_to_src_file_record_count[mat_bucket][src_dfl] = len(
-                src_row_indices
-            )

         mat_bucket_to_dd_idx_obj_id: Dict[
             MaterializeBucketIndex, DedupeTaskIndexWithObjectId
         ] = {}
         for mat_bucket, src_file_records in mat_bucket_to_src_file_records.items():
-            object_ref = ray.put(src_file_records)
-            pickled_object_ref = cloudpickle.dumps(object_ref)
+            object_ref = object_store.put(src_file_records)
             mat_bucket_to_dd_idx_obj_id[mat_bucket] = (
                 dedupe_task_index,
-                pickled_object_ref,
+                object_ref,
             )
             del object_ref
-            del pickled_object_ref
         logger.info(
             f"Count of materialize buckets with object refs: "
             f"{len(mat_bucket_to_dd_idx_obj_id)}"
@@ -253,6 +246,7 @@ def dedupe(
     dedupe_task_index: int,
     enable_profiler: bool,
     metrics_config: MetricsConfig,
+    object_store: Optional[IObjectStore],
 ) -> DedupeResult:
     logger.info(f"[Dedupe task {dedupe_task_index}] Starting dedupe task...")
     dedupe_result, duration = timed_invocation(
@@ -262,6 +256,7 @@ def dedupe(
         num_materialize_buckets=num_materialize_buckets,
         dedupe_task_index=dedupe_task_index,
         enable_profiler=enable_profiler,
+        object_store=object_store,
     )

     emit_metrics_time = 0.0

deltacat/compute/compactor/steps/hash_bucket.py
CHANGED
@@ -31,6 +31,7 @@ from deltacat.utils.ray_utils.runtime import (
 from deltacat.utils.common import ReadKwargsProvider
 from deltacat.utils.performance import timed_invocation
 from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
+from deltacat.io.object_store import IObjectStore
 from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes

 if importlib.util.find_spec("memray"):
@@ -179,6 +180,7 @@ def _timed_hash_bucket(
     num_groups: int,
     enable_profiler: bool,
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    object_store: Optional[IObjectStore] = None,
     deltacat_storage=unimplemented_deltacat_storage,
 ):
     task_id = get_current_ray_task_id()
@@ -207,9 +209,7 @@ def _timed_hash_bucket(
             deltacat_storage,
         )
         hash_bucket_group_to_obj_id, _ = group_hash_bucket_indices(
-            delta_file_envelope_groups,
-            num_buckets,
-            num_groups,
+            delta_file_envelope_groups, num_buckets, num_groups, object_store
         )

         peak_memory_usage_bytes = get_current_node_peak_memory_usage_in_bytes()
@@ -233,6 +233,7 @@ def hash_bucket(
     enable_profiler: bool,
     metrics_config: MetricsConfig,
     read_kwargs_provider: Optional[ReadKwargsProvider],
+    object_store: Optional[IObjectStore],
     deltacat_storage=unimplemented_deltacat_storage,
 ) -> HashBucketResult:

@@ -247,6 +248,7 @@ def hash_bucket(
         num_groups=num_groups,
         enable_profiler=enable_profiler,
         read_kwargs_provider=read_kwargs_provider,
+        object_store=object_store,
         deltacat_storage=deltacat_storage,
     )


deltacat/compute/compactor/steps/materialize.py
CHANGED
@@ -5,11 +5,10 @@ from uuid import uuid4
 from collections import defaultdict
 from contextlib import nullcontext
 from itertools import chain, repeat
-from typing import List, Optional, Tuple, Dict, Any, Union
+from typing import List, Optional, Tuple, Dict, Any
 import pyarrow as pa
 import numpy as np
 import ray
-from ray import cloudpickle
 from deltacat import logs
 from deltacat.compute.compactor import (
     MaterializeResult,
@@ -28,15 +27,13 @@ from deltacat.storage import (
     PartitionLocator,
     Manifest,
     ManifestEntry,
-    LocalDataset,
-    LocalTable,
-    DistributedDataset,
 )
 from deltacat.storage import interface as unimplemented_deltacat_storage
 from deltacat.utils.common import ReadKwargsProvider
 from deltacat.types.media import DELIMITED_TEXT_CONTENT_TYPES, ContentType
 from deltacat.types.tables import TABLE_CLASS_TO_SIZE_FUNC
 from deltacat.utils.performance import timed_invocation
+from deltacat.io.object_store import IObjectStore
 from deltacat.utils.pyarrow import (
     ReadKwargsProviderPyArrowCsvPureUtf8,
     ReadKwargsProviderPyArrowSchemaOverride,
@@ -64,29 +61,15 @@ def materialize(
     dedupe_task_idx_and_obj_id_tuples: List[DedupeTaskIndexWithObjectId],
     max_records_per_output_file: int,
     compacted_file_content_type: ContentType,
+    enable_manifest_entry_copy_by_reference: bool,
     enable_profiler: bool,
     metrics_config: MetricsConfig,
     schema: Optional[pa.Schema] = None,
     read_kwargs_provider: Optional[ReadKwargsProvider] = None,
     s3_table_writer_kwargs: Optional[Dict[str, Any]] = None,
+    object_store: Optional[IObjectStore] = None,
     deltacat_storage=unimplemented_deltacat_storage,
 ):
-    def _stage_delta_implementation(
-        data: Union[LocalTable, LocalDataset, DistributedDataset, Manifest],
-        partition: Partition,
-        stage_delta_from_existing_manifest: Optional[bool],
-    ) -> Delta:
-        if stage_delta_from_existing_manifest:
-            delta = Delta.of(
-                locator=DeltaLocator.of(partition.locator),
-                delta_type=DeltaType.UPSERT,
-                meta=manifest.meta,
-                manifest=data,
-                previous_stream_position=partition.stream_position,
-                properties={},
-            )
-            return delta
-
     def _stage_delta_from_manifest_entry_reference_list(
         manifest_entry_list_reference: List[ManifestEntry],
         partition: Partition,
@@ -96,10 +79,13 @@ def materialize(
             delta_type == DeltaType.UPSERT
         ), "Stage delta with existing manifest entries only supports UPSERT delta type!"
         manifest = Manifest.of(entries=manifest_entry_list_reference, uuid=str(uuid4()))
-        delta = _stage_delta_implementation(
-            data=manifest,
-            partition=partition,
-            stage_delta_from_existing_manifest=True,
+        delta = Delta.of(
+            locator=DeltaLocator.of(partition.locator),
+            delta_type=delta_type,
+            meta=manifest.meta,
+            manifest=manifest,
+            previous_stream_position=partition.stream_position,
+            properties={},
         )
         return delta

@@ -161,18 +147,11 @@ def materialize(
         f"dedupe_{worker_id}_{task_id}.bin"
     ) if enable_profiler else nullcontext():
         start = time.time()
-        dedupe_task_idx_and_obj_ref_tuples = [
-            (
-                t1,
-                cloudpickle.loads(t2),
-            )
-            for t1, t2 in dedupe_task_idx_and_obj_id_tuples
-        ]
         logger.info(f"Resolved materialize task obj refs...")
-        dedupe_task_indices, obj_refs = zip(*dedupe_task_idx_and_obj_ref_tuples)
+        dedupe_task_indices, obj_refs = zip(*dedupe_task_idx_and_obj_id_tuples)
         # this depends on `ray.get` result order matching input order, as per the
         # contract established in: https://github.com/ray-project/ray/pull/16763
-        src_file_records_list = ray.get(list(obj_refs))
+        src_file_records_list = object_store.get_many(list(obj_refs))
         all_src_file_records = defaultdict(list)
         for i, src_file_records in enumerate(src_file_records_list):
             dedupe_task_idx = dedupe_task_indices[i]
@@ -231,7 +210,9 @@ def materialize(
                 record_numbers_length += 1
                 mask_pylist[record_number] = True
             if (
-                record_numbers_length == src_file_record_count
+                round_completion_info
+                and enable_manifest_entry_copy_by_reference
+                and record_numbers_length == src_file_record_count
                 and src_file_partition_locator
                 == round_completion_info.compacted_delta_locator.partition_locator
             ):
@@ -244,8 +225,8 @@ def materialize(
                 manifest_entry_list_reference.append(untouched_src_manifest_entry)
                 referenced_pyarrow_write_result = PyArrowWriteResult.of(
                     1,
-
-
+                    untouched_src_manifest_entry.meta.source_content_length,
+                    untouched_src_manifest_entry.meta.content_length,
                     src_file_record_count,
                 )
                 referenced_pyarrow_write_results.append(referenced_pyarrow_write_result)

deltacat/compute/compactor/utils/primary_key_index.py
CHANGED
@@ -7,7 +7,6 @@ import numpy as np
 import pyarrow as pa
 import ray
 import s3fs
-from ray import cloudpickle
 from ray.types import ObjectRef

 from deltacat import logs
@@ -30,6 +29,7 @@ from deltacat.types.media import ContentEncoding, ContentType
 from deltacat.types.tables import get_table_slicer, get_table_writer
 from deltacat.utils.common import ReadKwargsProvider
 from deltacat.utils.ray_utils.concurrency import invoke_parallel
+from deltacat.io.object_store import IObjectStore

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

@@ -203,7 +203,10 @@ def group_record_indices_by_hash_bucket(


 def group_hash_bucket_indices(
-    hash_bucket_object_groups: np.ndarray, num_buckets: int, num_groups: int
+    hash_bucket_object_groups: np.ndarray,
+    num_buckets: int,
+    num_groups: int,
+    object_store: Optional[IObjectStore] = None,
 ) -> Tuple[np.ndarray, List[ObjectRef]]:
     """
     Groups all the ObjectRef that belongs to a particular hash bucket group and hash bucket index.
@@ -226,19 +229,10 @@ def group_hash_bucket_indices(
     for hb_group, obj in enumerate(hb_group_to_object):
         if obj is None:
             continue
-        obj_ref = ray.put(obj)
-        pickled_obj_ref = cloudpickle.dumps(obj_ref)
-        object_refs.append(obj_ref)
-        hash_bucket_group_to_obj_id[hb_group] = pickled_obj_ref
-        # NOTE: The cloudpickle.dumps API call creates an out of band object reference to the object_ref variable.
-        # After pickling, Ray cannot track the serialized copy of the object or determine when the ObjectRef has been deserialized
-        # (e.g., if the ObjectRef is deserialized by a non-Ray process).
-        # Thus the object_ref cannot be tracked by Ray's distributed reference counter, even if it goes out of scope.
-        # The object now has a permanent reference and the data can't be freed from Ray’s object store.
-        # Manually deleting the untrackable object references offsets these permanent references and
-        # helps to allow these objects to be garbage collected normally.
-        del obj_ref
-        del pickled_obj_ref
+        object_ref = object_store.put(obj)
+        object_refs.append(object_ref)
+        hash_bucket_group_to_obj_id[hb_group] = object_ref
+        del object_ref
     return hash_bucket_group_to_obj_id, object_refs


deltacat/compute/compactor/utils/round_completion_file.py
CHANGED
@@ -1,6 +1,6 @@
 import json
 import logging
-
+from typing import Dict, Any
 from deltacat import logs
 from deltacat.compute.compactor import RoundCompletionInfo
 from deltacat.storage import PartitionLocator
@@ -19,7 +19,9 @@ def get_round_completion_file_s3_url(


 def read_round_completion_file(
-    bucket: str, source_partition_locator: PartitionLocator
+    bucket: str,
+    source_partition_locator: PartitionLocator,
+    **s3_client_kwargs: Optional[Dict[str, Any]],
 ) -> RoundCompletionInfo:

     round_completion_file_url = get_round_completion_file_s3_url(
@@ -28,7 +30,7 @@ def read_round_completion_file(
     )
     logger.info(f"reading round completion file from: {round_completion_file_url}")
     round_completion_info = None
-    result = s3_utils.download(round_completion_file_url, False)
+    result = s3_utils.download(round_completion_file_url, False, **s3_client_kwargs)
     if result:
         json_str = result["Body"].read().decode("utf-8")
         round_completion_info = RoundCompletionInfo(json.loads(json_str))
@@ -41,6 +43,7 @@ def write_round_completion_file(
     source_partition_locator: Optional[PartitionLocator],
     round_completion_info: RoundCompletionInfo,
     completion_file_s3_url: str = None,
+    **s3_client_kwargs: Optional[Dict[str, Any]],
 ) -> str:
     if bucket is None and completion_file_s3_url is None:
         raise AssertionError("Either bucket or completion_file_s3_url must be passed")
@@ -52,6 +55,10 @@ def write_round_completion_file(
             source_partition_locator,
         )
     logger.info(f"writing round completion file to: {completion_file_s3_url}")
-    s3_utils.upload(completion_file_s3_url, str(json.dumps(round_completion_info)))
+    s3_utils.upload(
+        completion_file_s3_url,
+        str(json.dumps(round_completion_info)),
+        **s3_client_kwargs,
+    )
     logger.info(f"round completion file written to: {completion_file_s3_url}")
     return completion_file_s3_url

deltacat/io/file_object_store.py
ADDED
@@ -0,0 +1,48 @@
+import logging
+from ray import cloudpickle
+import time
+from deltacat.io.object_store import IObjectStore
+from typing import Any, List
+from deltacat import logs
+import os
+import uuid
+from builtins import open
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+class FileObjectStore(IObjectStore):
+    """
+    An implementation of object store that uses file system.
+    """
+
+    def __init__(self, dir_path: str) -> None:
+        self.dir_path = dir_path
+        super().__init__()
+
+    def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
+        result = []
+
+        for obj in objects:
+            serialized = cloudpickle.dumps(obj)
+            ref = f"{self.dir_path}/{uuid.uuid4()}"
+            with open(ref, "xb") as f:
+                f.write(serialized)
+
+            result.append(ref)
+
+        return result
+
+    def get_many(self, refs: List[Any], *args, **kwargs) -> List[object]:
+        result = []
+        start = time.monotonic()
+        for ref in refs:
+            with open(ref, "rb") as f:
+                serialized = f.read()
+                loaded = cloudpickle.loads(serialized)
+                result.append(loaded)
+            os.remove(ref)
+        end = time.monotonic()
+
+        logger.info(f"The total time taken to read all objects is: {end - start}")
+        return result