deltacat 0.1.18b14__py3-none-any.whl → 0.1.18b16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/clients.py +17 -6
- deltacat/aws/redshift/model/manifest.py +4 -0
- deltacat/aws/s3u.py +24 -1
- deltacat/compute/compactor/compaction_session.py +42 -18
- deltacat/compute/compactor/model/compact_partition_params.py +297 -58
- deltacat/compute/compactor/model/compaction_session_audit_info.py +163 -9
- deltacat/compute/compactor/model/delta_annotated.py +95 -9
- deltacat/compute/compactor/model/delta_file_envelope.py +14 -2
- deltacat/compute/compactor/model/round_completion_info.py +17 -1
- deltacat/compute/compactor/repartition_session.py +4 -1
- deltacat/compute/compactor/steps/dedupe.py +9 -6
- deltacat/compute/compactor/steps/hash_bucket.py +24 -3
- deltacat/compute/compactor/steps/materialize.py +11 -6
- deltacat/compute/compactor/steps/repartition.py +22 -1
- deltacat/compute/compactor/utils/io.py +40 -23
- deltacat/compute/compactor/utils/sort_key.py +5 -0
- deltacat/compute/compactor/utils/system_columns.py +43 -0
- deltacat/compute/compactor_v2/compaction_session.py +509 -0
- deltacat/compute/compactor_v2/constants.py +37 -0
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
- deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
- deltacat/compute/compactor_v2/model/merge_input.py +143 -0
- deltacat/compute/compactor_v2/model/merge_result.py +12 -0
- deltacat/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
- deltacat/compute/compactor_v2/steps/merge.py +469 -0
- deltacat/compute/compactor_v2/utils/__init__.py +0 -0
- deltacat/compute/compactor_v2/utils/content_type_params.py +66 -0
- deltacat/compute/compactor_v2/utils/dedupe.py +58 -0
- deltacat/compute/compactor_v2/utils/io.py +152 -0
- deltacat/compute/compactor_v2/utils/primary_key_index.py +341 -0
- deltacat/compute/compactor_v2/utils/task_options.py +221 -0
- deltacat/compute/metastats/meta_stats.py +4 -2
- deltacat/compute/metastats/stats.py +1 -0
- deltacat/compute/metastats/utils/io.py +4 -0
- deltacat/compute/stats/utils/io.py +20 -5
- deltacat/exceptions.py +4 -0
- deltacat/io/memcached_object_store.py +37 -14
- deltacat/logs.py +4 -3
- deltacat/storage/interface.py +8 -1
- deltacat/storage/model/types.py +2 -1
- deltacat/tests/aws/test_clients.py +16 -3
- deltacat/tests/compute/__init__.py +0 -0
- deltacat/tests/compute/common.py +96 -0
- deltacat/tests/compute/compactor/__init__.py +0 -0
- deltacat/tests/compute/compactor/steps/__init__.py +0 -0
- deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} +34 -8
- deltacat/tests/compute/compactor/utils/__init__.py +0 -0
- deltacat/tests/{compactor → compute/compactor}/utils/test_io.py +47 -5
- deltacat/tests/compute/compactor_v2/__init__.py +0 -0
- deltacat/tests/{compactor → compute}/test_compact_partition_params.py +14 -30
- deltacat/tests/compute/test_compaction_session_incremental.py +363 -0
- deltacat/tests/compute/testcases.py +395 -0
- deltacat/tests/io/test_memcached_object_store.py +5 -4
- deltacat/tests/local_deltacat_storage/__init__.py +62 -19
- deltacat/tests/test_utils/pyarrow.py +49 -0
- deltacat/tests/test_utils/utils.py +13 -0
- deltacat/tests/utils/data/__init__.py +0 -0
- deltacat/tests/utils/test_daft.py +76 -0
- deltacat/tests/utils/test_pyarrow.py +133 -0
- deltacat/tests/utils/test_resources.py +23 -20
- deltacat/types/media.py +1 -0
- deltacat/types/partial_download.py +83 -0
- deltacat/types/tables.py +6 -0
- deltacat/utils/arguments.py +25 -0
- deltacat/utils/daft.py +87 -0
- deltacat/utils/placement.py +20 -3
- deltacat/utils/pyarrow.py +218 -1
- deltacat/utils/ray_utils/concurrency.py +26 -1
- deltacat/utils/resources.py +72 -1
- deltacat/utils/s3fs.py +21 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/METADATA +17 -3
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/RECORD +79 -47
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/WHEEL +1 -1
- /deltacat/{tests/compactor → compute/compactor_v2}/__init__.py +0 -0
- /deltacat/{tests/compactor/utils → compute/compactor_v2/model}/__init__.py +0 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/LICENSE +0 -0
- {deltacat-0.1.18b14.dist-info → deltacat-0.1.18b16.dist-info}/top_level.txt +0 -0
deltacat/compute/metastats/meta_stats.py CHANGED

@@ -5,7 +5,7 @@ import functools
 import logging
 import os
 import pathlib
-from typing import Dict, List, Optional, Set
+from typing import Any, Dict, List, Optional, Set

 import ray
 from ray.types import ObjectRef
@@ -118,10 +118,12 @@ def collect_from_partition(
     stat_results_s3_bucket: Optional[str] = None,
     metastats_results_s3_bucket: Optional[str] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
     *args,
     **kwargs,
 ) -> ObjectRef[Dict[int, DeltaStats]]:
-
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     if not columns:
         columns = deltacat_storage.get_table_version_column_names(
             source_partition_locator.namespace,
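The `deltacat_storage_kwargs` parameter introduced above (and threaded through the stats modules below) defaults to `None` and is swapped for an empty dict inside the function. A minimal sketch of why that pattern is preferred over a `= {}` default; the function name and kwarg below are illustrative and not from deltacat:

from typing import Any, Dict, Optional


def fetch_table(path: str, storage_kwargs: Optional[Dict[str, Any]] = None) -> None:
    # A literal {} default would be created once and shared by every call,
    # so mutations would leak across calls; None plus a per-call dict avoids that.
    if storage_kwargs is None:
        storage_kwargs = {}
    storage_kwargs.setdefault("region", "us-east-1")  # illustrative kwarg only
    print(f"downloading {path} with {storage_kwargs}")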
deltacat/compute/metastats/stats.py CHANGED

@@ -33,6 +33,7 @@ def start_stats_collection(
     stat_results_s3_bucket: Optional[str] = None,
     metastats_results_s3_bucket: Optional[str] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    **kwargs,
 ) -> Dict[str, List[DeltaStats]]:
     """Collects statistics on deltas, given a set of delta stream position ranges.
     Example:
deltacat/compute/metastats/utils/io.py CHANGED

@@ -171,6 +171,7 @@ def collect_stats_by_columns(
     delta_annotated: DeltaAnnotated,
     columns_to_compute: Optional[List[str]] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
 ) -> Dict[str, Any]:
     """Materializes one manifest entry at a time to save memory usage and calculate stats from each of its columns.

@@ -182,6 +183,8 @@ def collect_stats_by_columns(
     Returns:
         A delta wide stats container
     """
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     total_tables_size = 0

     # Mapping of column_name -> [stats_file_idx_1, stats_file_idx_2, ... stats_file_idx_n]
@@ -198,6 +201,7 @@ def collect_stats_by_columns(
                 TableType.PYARROW,
                 columns_to_compute,
                 equivalent_table_types="uncompacted",
+                **deltacat_storage_kwargs,
             )
         )
         assert isinstance(entry_pyarrow_table, pyarrow.Table), (
deltacat/compute/stats/utils/io.py CHANGED

@@ -1,6 +1,6 @@
 import logging
 from collections import defaultdict
-from typing import Dict, List, Optional
+from typing import Any, Dict, List, Optional

 import pyarrow
 import ray
@@ -83,6 +83,7 @@ def get_delta_stats(
     delta_locator: DeltaLocator,
     columns: Optional[List[str]] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
 ) -> DeltaStats:
     """Ray distributed task to compute and collect stats for a requested delta.
     If no columns are requested, stats will be computed for all columns.
@@ -93,10 +94,15 @@ def get_delta_stats(
     Returns:
         A delta wide stats container
     """
-
-
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
+    manifest = deltacat_storage.get_delta_manifest(
+        delta_locator, **deltacat_storage_kwargs
+    )
     delta = Delta.of(delta_locator, None, None, None, manifest)
-    return _collect_stats_by_columns(
+    return _collect_stats_by_columns(
+        delta, columns, deltacat_storage, deltacat_storage_kwargs
+    )


 @ray.remote
@@ -105,6 +111,7 @@ def get_deltas_from_range(
     start_position_inclusive: DeltaRange,
     end_position_inclusive: DeltaRange,
     deltacat_storage=unimplemented_deltacat_storage,
+    **kwargs,
 ) -> List[Delta]:
     """Looks up deltas in the specified partition using Ray, given both starting and ending delta stream positions.

@@ -137,6 +144,7 @@ def get_deltas_from_range(
         end_position_inclusive,
         ascending_order=True,
         include_manifest=False,
+        **kwargs,
     )
     return deltas_list_result.all_items()

@@ -145,6 +153,7 @@ def _collect_stats_by_columns(
     delta: Delta,
     columns_to_compute: Optional[List[str]] = None,
     deltacat_storage=unimplemented_deltacat_storage,
+    deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
 ) -> DeltaStats:
     """Materializes one manifest entry at a time to save memory usage and calculate stats from each of its columns.
     Args:
@@ -154,6 +163,8 @@ def _collect_stats_by_columns(
     Returns:
         A delta wide stats container
     """
+    if deltacat_storage_kwargs is None:
+        deltacat_storage_kwargs = {}
     assert (
         delta.manifest is not None
     ), f"Manifest should not be missing from delta for stats calculation: {delta}"
@@ -167,7 +178,11 @@ def _collect_stats_by_columns(
     for file_idx, manifest in enumerate(delta.manifest.entries):
         entry_pyarrow_table: LocalTable = (
             deltacat_storage.download_delta_manifest_entry(
-                delta,
+                delta,
+                file_idx,
+                TableType.PYARROW,
+                columns_to_compute,
+                **deltacat_storage_kwargs,
             )
         )
         assert isinstance(entry_pyarrow_table, pyarrow.Table), (
deltacat/exceptions.py CHANGED

deltacat/io/memcached_object_store.py CHANGED

@@ -3,13 +3,14 @@ from ray import cloudpickle
 from collections import defaultdict
 import time
 from deltacat.io.object_store import IObjectStore
-from typing import Any, List
+from typing import Any, List, Optional
 from deltacat import logs
 import uuid
 import socket
 from pymemcache.client.base import Client
 from pymemcache.client.retrying import RetryingClient
 from pymemcache.exceptions import MemcacheUnexpectedCloseError
+from pymemcache.client.rendezvous import RendezvousHash

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

@@ -19,36 +20,46 @@ class MemcachedObjectStore(IObjectStore):
     An implementation of object store that uses Memcached.
     """

-    def __init__(
+    def __init__(
+        self, storage_node_ips: Optional[List[str]] = None, port: Optional[int] = 11212
+    ) -> None:
         self.client_cache = {}
         self.current_ip = None
         self.SEPARATOR = "_"
         self.port = port
+        self.storage_node_ips = storage_node_ips
+        self.hasher = None
         super().__init__()

+    def initialize_hasher(self):
+        if not self.hasher and self.storage_node_ips:
+            self.hasher = RendezvousHash()
+            for n in self.storage_node_ips:
+                self.hasher.add_node(n)
+
     def put_many(self, objects: List[object], *args, **kwargs) -> List[Any]:
-        input =
+        input = defaultdict(dict)
         result = []
-        current_ip = self._get_current_ip()
         for obj in objects:
             serialized = cloudpickle.dumps(obj)
             uid = uuid.uuid4()
-
-
+            create_ref_ip = self._get_create_ref_ip(uid.__str__())
+            ref = self._create_ref(uid, create_ref_ip)
+            input[create_ref_ip][uid.__str__()] = serialized
             result.append(ref)
-
-
-
-
+        for create_ref_ip, uid_to_object in input.items():
+            client = self._get_client_by_ip(create_ref_ip)
+            if client.set_many(uid_to_object, noreply=False):
+                raise RuntimeError("Unable to write few keys to cache")

         return result

     def put(self, obj: object, *args, **kwargs) -> Any:
         serialized = cloudpickle.dumps(obj)
         uid = uuid.uuid4()
-
-        ref = self._create_ref(uid,
-        client = self._get_client_by_ip(
+        create_ref_ip = self._get_create_ref_ip(uid.__str__())
+        ref = self._create_ref(uid, create_ref_ip)
+        client = self._get_client_by_ip(create_ref_ip)

         if client.set(uid.__str__(), serialized):
             return ref
@@ -99,6 +110,18 @@ class MemcachedObjectStore(IObjectStore):
     def _create_ref(self, uid, ip) -> str:
         return f"{uid}{self.SEPARATOR}{ip}"

+    def _get_storage_node_ip(self, key: str):
+        self.initialize_hasher()
+        storage_node_ip = self.hasher.get_node(key)
+        return storage_node_ip
+
+    def _get_create_ref_ip(self, uid: str):
+        if self.storage_node_ips:
+            create_ref_ip = self._get_storage_node_ip(uid)
+        else:
+            create_ref_ip = self._get_current_ip()
+        return create_ref_ip
+
     def _get_client_by_ip(self, ip_address: str):
         if ip_address in self.client_cache:
             return self.client_cache[ip_address]
@@ -108,7 +131,7 @@ class MemcachedObjectStore(IObjectStore):
             base_client,
             attempts=3,
             retry_delay=0.01,
-            retry_for=[MemcacheUnexpectedCloseError],
+            retry_for=[MemcacheUnexpectedCloseError, ConnectionResetError],
         )

         self.client_cache[ip_address] = client
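With `storage_node_ips` supplied, object placement now uses rendezvous hashing, so a given key is always routed to the same node. A rough standalone sketch of that routing behavior, using pymemcache's `RendezvousHash` exactly as the diff above does; the node IPs and key are made-up values:

from pymemcache.client.rendezvous import RendezvousHash

# Hypothetical storage nodes; in deltacat these come from storage_node_ips.
nodes = ["10.0.0.1", "10.0.0.2", "10.0.0.3"]

hasher = RendezvousHash()
for node in nodes:
    hasher.add_node(node)

# The same UUID always maps to the same node, which is why the node IP can be
# embedded in the object reference and recovered later on get().
key = "2f9b0d5c-2e47-4d8a-9c39-0d5b8f2f4a10"
print(hasher.get_node(key))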
deltacat/logs.py CHANGED

@@ -143,9 +143,10 @@ def _configure_logger(
         log_dir, log_base_file_name, primary_log_level
     )
     _add_logger_handler(logger, handler)
-
-
-
+    if ray.is_initialized():
+        ray_runtime_ctx = ray.get_runtime_context()
+        if ray_runtime_ctx.worker.connected:
+            logger = RayRuntimeContextLoggerAdapter(logger, ray_runtime_ctx)

     return logger

deltacat/storage/interface.py CHANGED

@@ -21,6 +21,7 @@ from deltacat.storage import (
     Table,
     TableVersion,
     SortKey,
+    PartitionLocator,
 )
 from deltacat.types.media import ContentType, StorageType, TableType
 from deltacat.utils.common import ReadKwargsProvider
@@ -105,7 +106,13 @@ def list_deltas(


 def list_partition_deltas(
-
+    partition_like: Union[Partition, PartitionLocator],
+    first_stream_position: Optional[int] = None,
+    last_stream_position: Optional[int] = None,
+    ascending_order: bool = False,
+    include_manifest: bool = False,
+    *args,
+    **kwargs
 ) -> ListResult[Delta]:
     """
     Lists a page of deltas committed to the given partition.
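A brief usage sketch of the revised `list_partition_deltas` signature. The storage object and partition below are stand-ins (MagicMock, as in the tests elsewhere in this diff), and `.all_items()` mirrors how the stats utilities above consume a `ListResult`:

from unittest.mock import MagicMock

# Hypothetical storage implementation conforming to deltacat.storage.interface;
# a MagicMock stands in here so the call shape can be shown end to end.
storage = MagicMock()
my_partition = MagicMock()  # a Partition or PartitionLocator in real use

deltas = storage.list_partition_deltas(
    my_partition,
    first_stream_position=100,
    last_stream_position=200,
    ascending_order=True,
    include_manifest=False,
).all_items()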
deltacat/storage/model/types.py CHANGED

@@ -1,6 +1,7 @@
 from enum import Enum
 from typing import List, Union, Any

+from pyarrow.parquet import ParquetFile
 import numpy as np
 import pandas as pd
 import pyarrow as pa
@@ -8,7 +9,7 @@ import pkg_resources
 from ray.data._internal.arrow_block import ArrowRow
 from ray.data.dataset import Dataset

-LocalTable = Union[pa.Table, pd.DataFrame, np.ndarray]
+LocalTable = Union[pa.Table, pd.DataFrame, np.ndarray, ParquetFile]
 LocalDataset = List[LocalTable]
 # Starting Ray 2.5.0, Dataset follows a strict mode (https://docs.ray.io/en/latest/data/faq.html#migrating-to-strict-mode),
 # and generic annotation is removed. So add a version checker to determine whether to use the old or new definition.
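Since `LocalTable` can now also be a `pyarrow.parquet.ParquetFile`, downstream readers may need to materialize it before treating it as a table. A minimal sketch of one way to normalize the union; this helper is illustrative and not part of deltacat:

import numpy as np
import pandas as pd
import pyarrow as pa
from pyarrow.parquet import ParquetFile


def to_arrow_table(table) -> pa.Table:
    # ParquetFile is lazily opened; read() materializes it into a pa.Table.
    if isinstance(table, ParquetFile):
        return table.read()
    if isinstance(table, pd.DataFrame):
        return pa.Table.from_pandas(table)
    if isinstance(table, np.ndarray):
        # Assumes a 1-D array for the sake of the sketch.
        return pa.table({"values": pa.array(table)})
    return table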
deltacat/tests/aws/test_clients.py CHANGED

@@ -1,7 +1,7 @@
 from unittest.mock import patch
 import unittest
-import json
 from http import HTTPStatus
+import requests

 HAPPY_RESPONSE = {
     "AccessKeyId": "ASIA123456789",
@@ -20,7 +20,7 @@ class MockResponse:
     """

     def __init__(self, status_code: int, text: str, reason: str = "") -> None:
-        self.status_code = status_code
+        self.status_code: requests.Response.status_code = status_code
         self.text = text
         self.reason = reason

@@ -55,7 +55,7 @@ class TestBlockUntilInstanceMetadataServiceReturnsSuccess(unittest.TestCase):
         )

         requests_mock.get.side_effect = [
-            MockResponse(HTTPStatus.OK,
+            MockResponse(HTTPStatus.OK, "foo"),
             MockResponse(HTTPStatus.TOO_MANY_REQUESTS, "foo"),
             MockResponse(HTTPStatus.INTERNAL_SERVER_ERROR, "foo"),
             MockResponse(HTTPStatus.NOT_IMPLEMENTED, "bar"),
@@ -65,3 +65,16 @@ class TestBlockUntilInstanceMetadataServiceReturnsSuccess(unittest.TestCase):
         self.assertEqual(
             block_until_instance_metadata_service_returns_success().status_code, 200
         )
+
+    @patch("deltacat.aws.clients.requests")
+    def test_retrying_status_on_shortlist_returns_early(self, requests_mock):
+        from deltacat.aws.clients import (
+            block_until_instance_metadata_service_returns_success,
+        )
+
+        requests_mock.get.side_effect = [
+            MockResponse(HTTPStatus.FORBIDDEN, "foo"),
+        ]
+        self.assertEqual(
+            block_until_instance_metadata_service_returns_success().status_code, 403
+        )
File without changes
deltacat/tests/compute/common.py ADDED

@@ -0,0 +1,96 @@
+# Allow classes to use self-referencing Type hints in Python 3.7.
+from __future__ import annotations
+from enum import Enum
+from typing import List
+import datetime as dt
+from datetime import timezone
+
+TEST_S3_RCF_BUCKET_NAME = "test-compaction-artifacts-bucket"
+# REBASE src = spark compacted table to create an initial version of ray compacted table
+BASE_TEST_SOURCE_NAMESPACE = "source_test_namespace"
+BASE_TEST_SOURCE_TABLE_NAME = "test_table"
+BASE_TEST_SOURCE_TABLE_VERSION = "1"
+
+BASE_TEST_DESTINATION_NAMESPACE = "destination_test_namespace"
+BASE_TEST_DESTINATION_TABLE_NAME = "destination_test_table_RAY"
+BASE_TEST_DESTINATION_TABLE_VERSION = "1"
+
+HASH_BUCKET_COUNT: int = 3
+
+MAX_RECORDS_PER_FILE: int = 1
+
+UTC_ISO_8601_FORMAT_WITHOUT_MILLIS = "%Y-%m-%dT%H:%M:%SZ"  # '2018-09-05T14:09:03Z'
+
+
+class PartitionKeyType(str, Enum):
+    INT = "int"
+    STRING = "string"
+    TIMESTAMP = "timestamp"
+
+
+class PartitionKey(dict):
+    @staticmethod
+    def of(key_name: str, key_type: PartitionKeyType) -> PartitionKey:
+        return PartitionKey({"keyName": key_name, "keyType": key_type.value})
+
+    @property
+    def key_name(self) -> str:
+        return self["keyName"]
+
+    @property
+    def key_type(self) -> PartitionKeyType:
+        key_type = self["keyType"]
+        return None if key_type is None else PartitionKeyType(key_type)
+
+
+def setup_sort_and_partition_keys(sort_keys_param, partition_keys_param):
+    from deltacat.storage.model.sort_key import SortKey
+
+    sort_keys, partition_keys = None, None
+    if sort_keys_param is not None:
+        sort_keys = [SortKey.of(sort_key["key_name"]) for sort_key in sort_keys_param]
+    if partition_keys_param is not None:
+        partition_keys = [
+            PartitionKey.of(
+                partition_key["key_name"], PartitionKeyType(partition_key["key_type"])
+            )
+            for partition_key in partition_keys_param
+        ]
+    return sort_keys, partition_keys
+
+
+def offer_iso8601_timestamp_list(
+    periods: int,
+    unit_of_time: str,
+    end_time=dt.datetime(2023, 5, 3, 10, 0, 0, 0, tzinfo=timezone.utc),
+) -> List[str]:
+    """
+    Returns a list of ISO 8601 timestamps, each periods units of time before the start time.
+
+    Args:
+        periods: The number of timestamps to return.
+        unit_of_time: The unit of time to use for the timestamps. Must be one of "seconds", "minutes", "hours", "days", or "weeks".
+        end_time: The end time for the timestamps. Defaults to 2023-05-03T10:00:00Z.
+
+    Returns:
+        A list of ISO 8601 timestamps, each periods units of time before the start time.
+
+    Raises:
+        ValueError: If the unit_of_time argument is not one of "seconds", "minutes", "hours", "days", or "weeks".
+    """
+    import datetime as dt
+
+    acceptable_units_of_time = ["seconds", "minutes", "hours", "days", "weeks"]
+    if unit_of_time not in acceptable_units_of_time:
+        raise ValueError(
+            f"unit_of_time {unit_of_time} is not supported. Please use one of these time units: {acceptable_units_of_time}"
+        )
+    res = []
+    for i in range(periods):
+        kwarg = {unit_of_time: i}
+        res.append(
+            (end_time - dt.timedelta(**kwarg)).strftime(
+                UTC_ISO_8601_FORMAT_WITHOUT_MILLIS
+            )
+        )
+    return res
File without changes

File without changes
deltacat/tests/{test_repartition.py → compute/compactor/steps/test_repartition.py} RENAMED

@@ -49,8 +49,10 @@ class TestRepartitionRange(unittest.TestCase):
         self.destination_partition: PartitionLocator = MagicMock()
         self.repartition_args = {"column": "last_updated", "ranges": [1678665487112747]}
         self.max_records_per_output_file = 2
+        self.s3_table_writer_kwargs = {}
         self.repartitioned_file_content_type = ContentType.PARQUET
         self.deltacat_storage = MagicMock()
+        self.deltacat_storage_kwargs = MagicMock()

     def test_repartition_range(self):
         result = repartition_range(
@@ -58,8 +60,10 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
+            self.deltacat_storage_kwargs,
         )
         # Assert that a RepartitionResult object is returned
         self.assertIsInstance(result, RepartitionResult)
@@ -83,8 +87,10 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
+            self.deltacat_storage_kwargs,
         )

     def test_empty_ranges(self):
@@ -95,8 +101,10 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
+            self.deltacat_storage_kwargs,
         )

     def test_one_value_in_ranges(self):
@@ -106,8 +114,10 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
+            self.deltacat_storage_kwargs,
         )
         self.assertEqual(len(result.range_deltas), 2)

@@ -118,8 +128,10 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
+            self.deltacat_storage_kwargs,
         )
         self.assertEqual(len(result.range_deltas), 3)

@@ -131,8 +143,10 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
+            self.deltacat_storage_kwargs,
         )
         self.assertLess(len(result.range_deltas), 2)

@@ -144,8 +158,10 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
+            self.deltacat_storage_kwargs,
         )

     def test_unsorted_ranges(self):
@@ -159,8 +175,10 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
+            self.deltacat_storage_kwargs,
         )
         self.assertEqual(len(result.range_deltas), 4)

@@ -171,22 +189,28 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
+            self.deltacat_storage_kwargs,
         )
         self.assertEqual(len(result.range_deltas), 2)

     def test_ranges_with_inf(self):
         self.repartition_args["ranges"] = [1678665487112747, float("inf")]
-
-
-
-
-
-
-
+
+        self.assertRaises(
+            pa.lib.ArrowInvalid,
+            lambda: repartition_range(
+                self.tables,
+                self.destination_partition,
+                self.repartition_args,
+                self.max_records_per_output_file,
+                self.s3_table_writer_kwargs,
+                self.repartitioned_file_content_type,
+                self.deltacat_storage,
+            ),
         )
-        self.assertEqual(len(result.range_deltas), 2)

     def test_null_rows_are_not_dropped(self):
         # Add null value to the first table
@@ -209,8 +233,10 @@ class TestRepartitionRange(unittest.TestCase):
             self.destination_partition,
             self.repartition_args,
             self.max_records_per_output_file,
+            self.s3_table_writer_kwargs,
             self.repartitioned_file_content_type,
             self.deltacat_storage,
+            self.deltacat_storage_kwargs,
         )

         # Assuming range_deltas is a list of DataFrames,
File without changes