deltacat 0.1.18b13__py3-none-any.whl → 0.1.18b15__py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- deltacat/__init__.py +3 -2
- deltacat/aws/clients.py +123 -3
- deltacat/aws/redshift/model/manifest.py +4 -0
- deltacat/aws/s3u.py +24 -1
- deltacat/benchmarking/benchmark_parquet_reads.py +53 -0
- deltacat/benchmarking/conftest.py +61 -0
- deltacat/catalog/delegate.py +1 -1
- deltacat/catalog/interface.py +1 -1
- deltacat/compute/compactor/__init__.py +0 -3
- deltacat/compute/compactor/compaction_session.py +45 -20
- deltacat/compute/compactor/model/compact_partition_params.py +287 -58
- deltacat/compute/compactor/model/compaction_session_audit_info.py +150 -9
- deltacat/compute/compactor/model/delta_annotated.py +91 -9
- deltacat/compute/compactor/model/delta_file_envelope.py +15 -3
- deltacat/compute/compactor/model/primary_key_index.py +1 -1
- deltacat/compute/compactor/model/round_completion_info.py +17 -1
- deltacat/compute/compactor/repartition_session.py +5 -3
- deltacat/compute/compactor/steps/dedupe.py +10 -8
- deltacat/compute/compactor/steps/hash_bucket.py +25 -4
- deltacat/compute/compactor/steps/materialize.py +11 -6
- deltacat/compute/compactor/steps/repartition.py +16 -1
- deltacat/compute/compactor/utils/io.py +40 -23
- deltacat/compute/compactor/utils/primary_key_index.py +1 -15
- deltacat/compute/compactor/utils/sort_key.py +57 -0
- deltacat/compute/compactor/utils/system_columns.py +43 -0
- deltacat/compute/compactor_v2/compaction_session.py +506 -0
- deltacat/compute/compactor_v2/constants.py +34 -0
- deltacat/compute/compactor_v2/model/__init__.py +0 -0
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
- deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
- deltacat/compute/compactor_v2/model/merge_input.py +127 -0
- deltacat/compute/compactor_v2/model/merge_result.py +12 -0
- deltacat/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
- deltacat/compute/compactor_v2/steps/merge.py +41 -0
- deltacat/compute/compactor_v2/utils/__init__.py +0 -0
- deltacat/compute/compactor_v2/utils/content_type_params.py +37 -0
- deltacat/compute/compactor_v2/utils/io.py +149 -0
- deltacat/compute/compactor_v2/utils/primary_key_index.py +308 -0
- deltacat/compute/compactor_v2/utils/task_options.py +228 -0
- deltacat/compute/metastats/meta_stats.py +4 -2
- deltacat/compute/metastats/stats.py +1 -0
- deltacat/compute/metastats/utils/io.py +4 -0
- deltacat/compute/stats/utils/io.py +20 -5
- deltacat/exceptions.py +4 -0
- deltacat/io/memcached_object_store.py +37 -14
- deltacat/logs.py +4 -3
- deltacat/storage/__init__.py +3 -0
- deltacat/storage/interface.py +11 -2
- deltacat/storage/model/sort_key.py +33 -0
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/types.py +2 -1
- deltacat/tests/aws/__init__.py +0 -0
- deltacat/tests/aws/test_clients.py +80 -0
- deltacat/tests/compute/__init__.py +0 -0
- deltacat/tests/compute/common.py +96 -0
- deltacat/tests/compute/compactor/__init__.py +0 -0
- deltacat/tests/compute/compactor/steps/__init__.py +0 -0
- deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +22 -8
- deltacat/tests/compute/compactor/utils/__init__.py +0 -0
- deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
- deltacat/tests/compute/compactor_v2/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +199 -0
- deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
- deltacat/tests/compute/test_compaction_session_incremental.py +348 -0
- deltacat/tests/compute/testcases.py +390 -0
- deltacat/tests/io/test_memcached_object_store.py +5 -4
- deltacat/tests/local_deltacat_storage/__init__.py +1109 -0
- deltacat/tests/test_utils/pyarrow.py +32 -0
- deltacat/tests/test_utils/utils.py +13 -0
- deltacat/tests/utils/data/__init__.py +0 -0
- deltacat/tests/utils/test_daft.py +76 -0
- deltacat/tests/utils/test_pyarrow.py +133 -0
- deltacat/tests/utils/test_resources.py +23 -20
- deltacat/types/media.py +1 -0
- deltacat/types/partial_download.py +82 -0
- deltacat/types/tables.py +1 -0
- deltacat/utils/arguments.py +26 -0
- deltacat/utils/daft.py +87 -0
- deltacat/utils/performance.py +4 -2
- deltacat/utils/placement.py +20 -3
- deltacat/utils/pyarrow.py +213 -1
- deltacat/utils/ray_utils/concurrency.py +26 -1
- deltacat/utils/resources.py +72 -1
- deltacat/utils/s3fs.py +21 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/METADATA +27 -13
- deltacat-0.1.18b15.dist-info/RECORD +176 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/WHEEL +1 -1
- deltacat/compute/compactor/model/sort_key.py +0 -98
- deltacat-0.1.18b13.dist-info/RECORD +0 -136
- /deltacat/{tests/compactor → benchmarking}/__init__.py +0 -0
- /deltacat/{tests/compactor/utils → compute/compactor_v2}/__init__.py +0 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/top_level.txt +0 -0
deltacat/utils/pyarrow.py
CHANGED
@@ -7,6 +7,8 @@ import io
 import logging
 from functools import partial
 from typing import Any, Callable, Dict, Iterable, List, Optional
+from pyarrow.parquet import ParquetFile
+from deltacat.exceptions import ValidationError

 import pyarrow as pa
 from fsspec import AbstractFileSystem
@@ -15,6 +17,7 @@ from pyarrow import feather as paf
 from pyarrow import json as pajson
 from pyarrow import parquet as papq
 from ray.data.datasource import BlockWritePathProvider
+from deltacat.utils.s3fs import create_s3_file_system

 from deltacat import logs
 from deltacat.types.media import (
@@ -23,12 +26,17 @@ from deltacat.types.media import (
     ContentEncoding,
     ContentType,
 )
+from deltacat.types.partial_download import (
+    PartialFileDownloadParams,
+    PartialParquetParameters,
+)
 from deltacat.utils.common import ContentTypeKwargsProvider, ReadKwargsProvider
 from deltacat.utils.performance import timed_invocation
+from deltacat.utils.daft import daft_s3_file_to_table
+from deltacat.utils.arguments import sanitize_kwargs_to_callable

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

-
 CONTENT_TYPE_TO_PA_READ_FUNC: Dict[str, Callable] = {
     ContentType.UNESCAPED_TSV.value: pacsv.read_csv,
     ContentType.TSV.value: pacsv.read_csv,
@@ -170,6 +178,7 @@ class ReadKwargsProviderPyArrowSchemaOverride(ContentTypeKwargsProvider):
         self,
         schema: Optional[pa.Schema] = None,
         pq_coerce_int96_timestamp_unit: Optional[str] = None,
+        parquet_reader_type: Optional[str] = None,
     ):
         """

@@ -182,6 +191,7 @@ class ReadKwargsProviderPyArrowSchemaOverride(ContentTypeKwargsProvider):
         """
         self.schema = schema
         self.pq_coerce_int96_timestamp_unit = pq_coerce_int96_timestamp_unit
+        self.parquet_reader_type = parquet_reader_type

     def _get_kwargs(self, content_type: str, kwargs: Dict[str, Any]) -> Dict[str, Any]:
         if content_type in DELIMITED_TEXT_CONTENT_TYPES:
@@ -201,6 +211,11 @@ class ReadKwargsProviderPyArrowSchemaOverride(ContentTypeKwargsProvider):
                 "coerce_int96_timestamp_unit"
             ] = self.pq_coerce_int96_timestamp_unit

+        if self.parquet_reader_type:
+            kwargs["reader_type"] = self.parquet_reader_type
+        else:
+            kwargs["reader_type"] = "daft"
+
         return kwargs


@@ -237,6 +252,118 @@ def _add_column_kwargs(
     )


+def _get_compatible_target_schema(
+    table_schema: pa.Schema, input_schema: pa.Schema
+) -> pa.Schema:
+    target_schema_fields = []
+
+    for field in table_schema:
+        index = input_schema.get_field_index(field.name)
+
+        if index != -1:
+            target_field = input_schema.field(index)
+            target_schema_fields.append(target_field)
+        else:
+            target_schema_fields.append(field)
+
+    target_schema = pa.schema(target_schema_fields, metadata=table_schema.metadata)
+
+    return target_schema
+
+
+def s3_partial_parquet_file_to_table(
+    s3_url: str,
+    content_type: str,
+    content_encoding: str,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    partial_file_download_params: Optional[PartialParquetParameters] = None,
+    **s3_client_kwargs,
+) -> pa.Table:
+
+    assert (
+        partial_file_download_params is not None
+    ), "Partial parquet params must not be None"
+    assert (
+        partial_file_download_params.row_groups_to_download is not None
+    ), "No row groups to download"
+
+    pq_file = s3_file_to_parquet(
+        s3_url=s3_url,
+        content_type=content_type,
+        content_encoding=content_encoding,
+        partial_file_download_params=partial_file_download_params,
+        **s3_client_kwargs,
+    )
+
+    table, latency = timed_invocation(
+        pq_file.read_row_groups,
+        partial_file_download_params.row_groups_to_download,
+        columns=include_columns or column_names,
+    )
+
+    logger.debug(f"Successfully read from s3_url={s3_url} in {latency}s")
+
+    kwargs = {}
+
+    if pa_read_func_kwargs_provider:
+        kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
+
+    # Note: ordering is not guaranteed.
+    if kwargs.get("schema") is not None:
+        input_schema = kwargs.get("schema")
+        table_schema = table.schema
+
+        target_schema = _get_compatible_target_schema(table_schema, input_schema)
+        casted_table = table.cast(target_schema)
+
+        return casted_table
+
+    return table
+
+
+def s3_parquet_file_to_table(
+    s3_url: str,
+    content_type: str,
+    content_encoding: str,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    partial_file_download_params: Optional[PartialFileDownloadParams] = None,
+    **s3_client_kwargs,
+) -> pa.Table:
+
+    logger.debug(
+        f"Reading to Parquet table using read_table for {content_type} "
+        f"and encoding: {content_encoding}"
+    )
+
+    if s3_client_kwargs is None:
+        s3_client_kwargs = {}
+
+    kwargs = {}
+
+    if s3_url.startswith("s3://"):
+        s3_file_system = create_s3_file_system(s3_client_kwargs)
+        kwargs["filesystem"] = s3_file_system
+
+    _add_column_kwargs(
+        content_type=content_type,
+        column_names=column_names,
+        include_columns=include_columns,
+        kwargs=kwargs,
+    )
+
+    if pa_read_func_kwargs_provider:
+        kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
+
+    table, latency = timed_invocation(papq.read_table, s3_url, **kwargs)
+
+    logger.debug(f"Successfully read the table from url={s3_url} in {latency}s")
+    return table
+
+
 def s3_file_to_table(
     s3_url: str,
     content_type: str,
@@ -244,6 +371,7 @@ def s3_file_to_table(
     column_names: Optional[List[str]] = None,
     include_columns: Optional[List[str]] = None,
     pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    partial_file_download_params: Optional[PartialFileDownloadParams] = None,
     **s3_client_kwargs,
 ) -> pa.Table:

@@ -253,6 +381,39 @@ def s3_file_to_table(
         f"Reading {s3_url} to PyArrow. Content type: {content_type}. "
         f"Encoding: {content_encoding}"
     )
+
+    if (
+        content_type == ContentType.PARQUET.value
+        and content_encoding == ContentEncoding.IDENTITY.value
+    ):
+        logger.debug(
+            f"Performing read using parquet reader for encoding={content_encoding} "
+            f"and content_type={content_type}"
+        )
+        kwargs = {}
+        if pa_read_func_kwargs_provider is not None:
+            kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
+
+        if kwargs.get("reader_type", "daft") == "daft":
+            parquet_reader_func = daft_s3_file_to_table
+        elif partial_file_download_params and isinstance(
+            partial_file_download_params, PartialParquetParameters
+        ):
+            parquet_reader_func = s3_partial_parquet_file_to_table
+        else:
+            parquet_reader_func = s3_parquet_file_to_table
+
+        return parquet_reader_func(
+            s3_url=s3_url,
+            content_type=content_type,
+            content_encoding=content_encoding,
+            column_names=column_names,
+            include_columns=include_columns,
+            pa_read_func_kwargs_provider=pa_read_func_kwargs_provider,
+            partial_file_download_params=partial_file_download_params,
+            **s3_client_kwargs,
+        )
+
     s3_obj = s3_utils.get_object_at_url(s3_url, **s3_client_kwargs)
     logger.debug(f"Read S3 object from {s3_url}: {s3_obj}")
     pa_read_func = CONTENT_TYPE_TO_PA_READ_FUNC[content_type]
@@ -272,6 +433,57 @@ def s3_file_to_table(
     return table


+def s3_file_to_parquet(
+    s3_url: str,
+    content_type: str,
+    content_encoding: str,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    pa_read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    partial_file_download_params: Optional[PartialFileDownloadParams] = None,
+    **s3_client_kwargs,
+) -> ParquetFile:
+    logger.debug(
+        f"Reading {s3_url} to PyArrow ParquetFile. "
+        f"Content type: {content_type}. Encoding: {content_encoding}"
+    )
+
+    if (
+        content_type != ContentType.PARQUET.value
+        or content_encoding != ContentEncoding.IDENTITY
+    ):
+        raise ValidationError(
+            f"S3 file with content type: {content_type} and "
+            f"content encoding: {content_encoding} cannot be read"
+            "into pyarrow.parquet.ParquetFile"
+        )
+
+    if s3_client_kwargs is None:
+        s3_client_kwargs = {}
+
+    kwargs = {}
+
+    if s3_url.startswith("s3://"):
+        s3_file_system = create_s3_file_system(s3_client_kwargs)
+        kwargs["filesystem"] = s3_file_system
+
+    if pa_read_func_kwargs_provider:
+        kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
+
+    logger.debug(f"Pre-sanitize kwargs for {s3_url}: {kwargs}")
+
+    kwargs = sanitize_kwargs_to_callable(ParquetFile.__init__, kwargs)
+
+    logger.debug(
+        f"Reading the file from {s3_url} into ParquetFile with kwargs: {kwargs}"
+    )
+    pqFile, latency = timed_invocation(ParquetFile, s3_url, **kwargs)
+
+    logger.debug(f"Time to get {s3_url} into parquet file: {latency}s")
+
+    return pqFile
+
+
 def table_size(table: pa.Table) -> int:
     return table.nbytes

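For context, the new Parquet read path in `s3_file_to_table` dispatches on the `reader_type` kwarg populated by `ReadKwargsProviderPyArrowSchemaOverride` (defaulting to the Daft reader) and otherwise falls back to a partial or full PyArrow read. Below is a minimal, hedged sketch of how a caller might force the plain PyArrow reader; the S3 URL is hypothetical and not part of the package:

```
from deltacat.types.media import ContentEncoding, ContentType
from deltacat.utils.pyarrow import (
    ReadKwargsProviderPyArrowSchemaOverride,
    s3_file_to_table,
)

# Hypothetical S3 object used purely for illustration.
table = s3_file_to_table(
    s3_url="s3://my-bucket/path/to/file.parquet",
    content_type=ContentType.PARQUET.value,
    content_encoding=ContentEncoding.IDENTITY.value,
    # With parquet_reader_type unset, the new code path defaults to
    # reader_type="daft"; "pyarrow" selects the papq.read_table path instead.
    pa_read_func_kwargs_provider=ReadKwargsProviderPyArrowSchemaOverride(
        parquet_reader_type="pyarrow"
    ),
)
```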
deltacat/utils/ray_utils/concurrency.py
CHANGED
@@ -1,7 +1,7 @@
 import copy
 import itertools
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
-
+from deltacat.utils.placement import PlacementGroupConfig
 import ray
 from ray._private.ray_constants import MIN_RESOURCE_GRANULARITY
 from ray.types import ObjectRef
@@ -115,3 +115,28 @@ def round_robin_options_provider(
     resource_key_index = i % len(resource_keys)
     key = resource_keys[resource_key_index]
     return {"resources": {key: resource_amount_provider(resource_key_index)}}
+
+
+def task_resource_options_provider(
+    i: int,
+    item: Any,
+    resource_amount_provider: Callable[[int, Any], Dict] = lambda x: {},
+    pg_config: Optional[PlacementGroupConfig] = None,
+    **kwargs,
+) -> Dict:
+    """
+    Return options that needs to be provided to each task.
+    """
+
+    options = resource_amount_provider(i, item, **kwargs)
+    if pg_config:
+        options_to_append = copy.deepcopy(pg_config.opts)
+        bundle_key_index = i % len(
+            options_to_append["scheduling_strategy"].placement_group.bundle_specs
+        )
+        options_to_append[
+            "scheduling_strategy"
+        ].placement_group_bundle_index = bundle_key_index
+        options = {**options, **options_to_append}
+
+    return options
deltacat/utils/resources.py
CHANGED
@@ -1,15 +1,20 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations

+from contextlib import AbstractContextManager
+from types import TracebackType
 import ray
 import sys
-
+import threading
+import time
+from typing import Dict, Any, Optional
 from dataclasses import dataclass
 from deltacat import logs
 import logging
 from resource import getrusage, RUSAGE_SELF
 import platform
 import psutil
+import schedule


 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
@@ -56,6 +61,72 @@ class ClusterUtilization:
         )


+class ClusterUtilizationOverTimeRange(AbstractContextManager):
+    """
+    This class can be used to compute the cluster utilization metrics
+    which requires us to compute it over time as they change on-demand.
+
+    For example, in an autoscaling cluster, the vCPUs keep changing and hence
+    more important metrics to capture in that scenario is vcpu-seconds.
+    """
+
+    def __init__(self) -> None:
+        self.total_vcpu_seconds = 0.0
+        self.used_vcpu_seconds = 0.0
+
+    def __enter__(self) -> Any:
+        schedule.every().second.do(self._update_vcpus)
+        self.stop_run_schedules = self._run_schedule()
+        return super().__enter__()
+
+    def __exit__(
+        self,
+        __exc_type: type[BaseException] | None,
+        __exc_value: BaseException | None,
+        __traceback: TracebackType | None,
+    ) -> bool | None:
+        if __exc_value:
+            logger.error(
+                f"Error ocurred while calculating cluster resources: {__exc_value}"
+            )
+        self.stop_run_schedules.set()
+        return super().__exit__(__exc_type, __exc_value, __traceback)
+
+    # It is not truely parallel(due to GIL Ref: https://wiki.python.org/moin/GlobalInterpreterLock)
+    # even if we are using threading library. However, it averages out and gives a very good approximation.
+    def _update_vcpus(self):
+        cluster_resources = ray.cluster_resources()
+        available_resources = ray.available_resources()
+        if "CPU" not in cluster_resources:
+            return
+
+        if "CPU" in available_resources:
+            self.used_vcpu_seconds = self.used_vcpu_seconds + float(
+                str(cluster_resources["CPU"] - available_resources["CPU"])
+            )
+            self.total_vcpu_seconds = self.total_vcpu_seconds + float(
+                str(cluster_resources["CPU"])
+            )
+        else:
+            self.total_vcpu_seconds = self.total_vcpu_seconds + float(
+                str(cluster_resources["CPU"])
+            )
+
+    def _run_schedule(self, interval: Optional[float] = 1.0):
+        cease_continuous_run = threading.Event()
+
+        class ScheduleThread(threading.Thread):
+            @classmethod
+            def run(cls):
+                while not cease_continuous_run.is_set():
+                    schedule.run_pending()
+                    time.sleep(float(str(interval)))
+
+        continuous_thread = ScheduleThread()
+        continuous_thread.start()
+        return cease_continuous_run
+
+
 def get_current_node_peak_memory_usage_in_bytes():
     """
     Returns the peak memory usage of the node in bytes. This method works across
deltacat/utils/s3fs.py
ADDED
@@ -0,0 +1,21 @@
+import s3fs
+
+
+def create_s3_file_system(s3_client_kwargs: dict) -> s3fs.S3FileSystem:
+    if not s3_client_kwargs:
+        return s3fs.S3FileSystem(anon=True)
+
+    config_kwargs = {}
+    if s3_client_kwargs.get("config") is not None:
+        boto_config = s3_client_kwargs.pop("config")
+        for key, val in boto_config.__dict__.items():
+            if not key.startswith("_") and val is not None:
+                config_kwargs[key] = val
+
+    anon = False
+    if s3_client_kwargs.get("aws_access_key_id") is None:
+        anon = True
+
+    return s3fs.S3FileSystem(
+        anon=anon, client_kwargs=s3_client_kwargs, config_kwargs=config_kwargs or None
+    )
{deltacat-0.1.18b13.dist-info → deltacat-0.1.18b15.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deltacat
-Version: 0.1.18b13
+Version: 0.1.18b15
 Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
 Home-page: https://github.com/ray-project/deltacat
 Author: Ray Team
@@ -15,17 +15,19 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Operating System :: OS Independent
 Requires-Python: >=3.7
 Description-Content-Type: text/markdown
-Requires-Dist: boto3
-Requires-Dist: numpy
-Requires-Dist: pandas
-Requires-Dist: pyarrow
-Requires-Dist: pydantic
-Requires-Dist: ray[default]
-Requires-Dist: s3fs
-Requires-Dist: tenacity
-Requires-Dist: typing-extensions
-Requires-Dist: pymemcache
-Requires-Dist: redis
+Requires-Dist: boto3 ~=1.20
+Requires-Dist: numpy ==1.21.5
+Requires-Dist: pandas ==1.3.5
+Requires-Dist: pyarrow ==12.0.1
+Requires-Dist: pydantic ==1.10.4
+Requires-Dist: ray[default] ~=2.0
+Requires-Dist: s3fs ==2022.2.0
+Requires-Dist: tenacity ==8.1.0
+Requires-Dist: typing-extensions ==4.4.0
+Requires-Dist: pymemcache ==4.0.0
+Requires-Dist: redis ==4.6.0
+Requires-Dist: getdaft ==0.1.15
+Requires-Dist: schedule ==1.2.0

 # DeltaCAT

@@ -40,10 +42,22 @@ for common table management tasks, including petabyte-scale
 change-data-capture, data consistency checks, and table repair.

 ## Getting Started
-
+
 ### Install
+
 ```
 pip install deltacat
 ```

+### Running Tests
+
+```
+pip3 install virtualenv
+virtualenv test_env
+source test_env/bin/activate
+pip3 install -r requirements.txt
+
+pytest
+```
+
