deltacat 1.1.26__py3-none-any.whl → 1.1.28__py3-none-any.whl

deltacat/__init__.py CHANGED
@@ -44,7 +44,7 @@ from deltacat.types.tables import TableWriteMode
 
  deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
 
- __version__ = "1.1.26"
+ __version__ = "1.1.28"
 
 
  __all__ = [
@@ -584,8 +584,11 @@ def _process_merge_results(
          f"Duplicate record count ({duplicate_hash_bucket_mat_results}) is as large "
          f"as or greater than params.num_rounds, which is {params.num_rounds}"
      )
+     # ensure the start index is the first file index if the task index is the same
      hb_id_to_entry_indices_range[str(mat_result.task_index)] = (
-         file_index,
+         hb_id_to_entry_indices_range.get(str(mat_result.task_index), [file_index])[
+             0
+         ],
          file_index + mat_result.pyarrow_write_result.files,
      )
 
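The three-line replacement above is what makes multi-round bookkeeping correct: a merge task with the same task index can materialize files across several rounds, and the entry-index range recorded for its hash bucket must keep the start index from the first round while the end index advances. A minimal sketch of that accumulation pattern, with made-up task and file indices:

hb_id_to_entry_indices_range = {}

def record_round(task_index: int, file_index: int, files_written: int) -> None:
    # On the first round the dict lookup falls back to the current
    # file_index; on later rounds it returns the previously stored start,
    # so only the end of the range moves forward.
    key = str(task_index)
    start = hb_id_to_entry_indices_range.get(key, [file_index])[0]
    hb_id_to_entry_indices_range[key] = (start, file_index + files_written)

record_round(0, file_index=0, files_written=3)  # {"0": (0, 3)}
record_round(0, file_index=3, files_written=2)  # {"0": (0, 5)}: start preserved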
@@ -188,7 +188,7 @@ def _estimate_resources_required_to_process_delta_using_file_sampling(
          sampled_on_disk_size += delta.manifest.entries[entry_index].meta.content_length
          sampled_num_rows += len(tbl)
 
-     if not sampled_on_disk_size:
+     if not sampled_on_disk_size or not sampled_in_memory_size:
          return EstimatedResources.of(
              memory_bytes=0,
              statistics=Statistics.of(
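For context, the file-sampling estimator extrapolates a delta's total in-memory size from the sizes observed in the sample, so a sampled in-memory size of zero (every sampled entry downloaded as an empty table) is just as degenerate as a zero on-disk size. A hedged sketch of the failure mode the widened guard avoids; the function and formula below are illustrative, not deltacat's actual implementation:

def estimate_memory_bytes(total_on_disk, sampled_on_disk, sampled_in_memory):
    # With either sample total at zero there is no meaningful inflation
    # ratio to extrapolate from, so report zero up front instead of
    # producing a nonsense estimate (or dividing by zero elsewhere).
    if not sampled_on_disk or not sampled_in_memory:
        return 0
    inflation = sampled_in_memory / sampled_on_disk
    return int(total_on_disk * inflation)

assert estimate_memory_bytes(10_000, 2_000, 0) == 0  # the newly guarded case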
@@ -437,6 +437,43 @@ class TestEstimateResourcesRequiredToProcessDelta:
          == parquet_delta_with_manifest.meta.content_length
      )
 
+     def test_parquet_delta_when_file_sampling_and_arrow_size_zero(
+         self,
+         local_deltacat_storage_kwargs,
+         parquet_delta_with_manifest: Delta,
+         monkeypatch,
+     ):
+         params = EstimateResourcesParams.of(
+             resource_estimation_method=ResourceEstimationMethod.FILE_SAMPLING,
+             max_files_to_sample=2,
+         )
+
+         def mock_func(*args, **kwargs):
+             class MockedValue:
+                 nbytes = 0
+
+                 def __len__(self):
+                     return 0
+
+             return MockedValue()
+
+         monkeypatch.setattr(ds, "download_delta_manifest_entry", mock_func)
+
+         result = estimate_resources_required_to_process_delta(
+             delta=parquet_delta_with_manifest,
+             operation_type=OperationType.PYARROW_DOWNLOAD,
+             deltacat_storage=ds,
+             deltacat_storage_kwargs=local_deltacat_storage_kwargs,
+             estimate_resources_params=params,
+         )
+
+         assert parquet_delta_with_manifest.manifest is not None
+         assert result.memory_bytes == 0
+         assert (
+             result.statistics.on_disk_size_bytes
+             == parquet_delta_with_manifest.meta.content_length
+         )
+
      def test_delta_manifest_utsv_when_file_sampling(
          self, local_deltacat_storage_kwargs, utsv_delta_with_manifest: Delta
      ):
@@ -328,6 +328,16 @@ def test_compact_partition_incremental(
          **compaction_audit_obj
      )
 
+     # assert that the RCF covers all files
+     if compactor_version != CompactorVersion.V1.value:
+         previous_end = None
+         for start, end in round_completion_info.hb_index_to_entry_range.values():
+             assert (previous_end is None and start == 0) or start == previous_end
+             previous_end = end
+         assert (
+             previous_end == round_completion_info.compacted_pyarrow_write_result.files
+         )
+
      tables = ds.download_delta(
          compacted_delta_locator, storage_type=StorageType.LOCAL, **ds_mock_kwargs
      )
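The assertion block above (repeated in the rebase and multi-round tests that follow) encodes a tiling invariant for the round completion file (RCF): iterated in order, the per-hash-bucket entry ranges must start at 0, chain end-to-start with no gaps or overlaps, and finish at the total compacted file count. A toy illustration with made-up ranges:

# Each hash bucket maps to a half-open [start, end) range of entry indices.
hb_index_to_entry_range = {"0": (0, 3), "1": (3, 7), "2": (7, 10)}
total_files = 10  # stands in for compacted_pyarrow_write_result.files

previous_end = None
for start, end in hb_index_to_entry_range.values():
    # The first range must start at 0; each later range must begin exactly
    # where the previous one ended.
    assert (previous_end is None and start == 0) or start == previous_end
    previous_end = end
# Together the ranges must cover every compacted file.
assert previous_end == total_files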
@@ -309,6 +309,16 @@ def test_compact_partition_rebase_multiple_rounds_same_source_and_destination(
          **compaction_audit_obj
      )
 
+     # assert that the RCF covers all files
+     # the multiple rounds feature is only supported by the V2 compactor
+     previous_end = None
+     for start, end in round_completion_info.hb_index_to_entry_range.values():
+         assert (previous_end is None and start == 0) or start == previous_end
+         previous_end = end
+     assert (
+         previous_end == round_completion_info.compacted_pyarrow_write_result.files
+     )
+
      # Assert not in-place compacted
      assert (
          execute_compaction_result_spy.call_args.args[-1] is False
@@ -299,6 +299,17 @@ def test_compact_partition_rebase_same_source_and_destination(
          round_completion_info.compaction_audit_url
      )
 
+     # assert that the RCF covers all files
+     if compactor_version != CompactorVersion.V1.value:
+         previous_end = None
+         for start, end in round_completion_info.hb_index_to_entry_range.values():
+             assert (previous_end is None and start == 0) or start == previous_end
+             previous_end = end
+         assert (
+             previous_end
+             == round_completion_info.compacted_pyarrow_write_result.files
+         )
+
      compaction_audit_obj: Dict[str, Any] = read_s3_contents(
          s3_resource, audit_bucket, audit_key
      )
@@ -355,6 +355,16 @@ def test_compact_partition_rebase_then_incremental(
      compacted_delta_locator_incremental: DeltaLocator = (
          round_completion_info.compacted_delta_locator
      )
+     # assert that the RCF covers all files
+     if compactor_version != CompactorVersion.V1.value:
+         previous_end = None
+         for start, end in round_completion_info.hb_index_to_entry_range.values():
+             assert (previous_end is None and start == 0) or start == previous_end
+             previous_end = end
+         assert (
+             previous_end == round_completion_info.compacted_pyarrow_write_result.files
+         )
+
      audit_bucket, audit_key = round_completion_info.compaction_audit_url.replace(
          "s3://", ""
      ).split("/", 1)
@@ -7,7 +7,9 @@ from deltacat.utils.pyarrow import (
      s3_file_to_table,
      ReadKwargsProviderPyArrowSchemaOverride,
      RAISE_ON_EMPTY_CSV_KWARG,
+     RAISE_ON_DECIMAL_OVERFLOW,
  )
+ import decimal
  from deltacat.types.media import ContentEncoding, ContentType
  from deltacat.types.partial_download import PartialParquetParameters
  from pyarrow.parquet import ParquetFile
@@ -16,6 +18,12 @@ import pyarrow as pa
  PARQUET_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet"
  EMPTY_UTSV_PATH = "deltacat/tests/utils/data/empty.csv"
  NON_EMPTY_VALID_UTSV_PATH = "deltacat/tests/utils/data/non_empty_valid.csv"
+ OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH = (
+     "deltacat/tests/utils/data/overflowing_decimal_precision.csv"
+ )
+ OVERFLOWING_DECIMAL_SCALE_UTSV_PATH = (
+     "deltacat/tests/utils/data/overflowing_decimal_scale.csv"
+ )
  GZIP_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.gz"
  BZ2_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.bz2"
 
@@ -407,6 +415,253 @@ class TestReadCSV(TestCase):
          ),
      )
 
+     def test_read_csv_when_decimal_precision_overflows_and_raise_kwarg_specified(self):
+         schema = pa.schema(
+             [("is_active", pa.string()), ("decimal_value", pa.decimal128(4, 2))]
+         )
+         kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+         _add_column_kwargs(
+             ContentType.UNESCAPED_TSV.value,
+             ["is_active", "decimal_value"],
+             ["is_active", "decimal_value"],
+             kwargs,
+         )
+         read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+         kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+         self.assertRaises(
+             pa.lib.ArrowInvalid,
+             lambda: pyarrow_read_csv(
+                 OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH,
+                 **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+             ),
+         )
+
+     def test_read_csv_when_decimal_precision_overflows_sanity(self):
+         schema = pa.schema(
+             [("is_active", pa.string()), ("decimal_value", pa.decimal128(4, 2))]
+         )
+         kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+         _add_column_kwargs(
+             ContentType.UNESCAPED_TSV.value,
+             ["is_active", "decimal_value"],
+             ["is_active", "decimal_value"],
+             kwargs,
+         )
+
+         read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+         kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+         self.assertRaises(
+             pa.lib.ArrowInvalid,
+             lambda: pyarrow_read_csv(OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH, **kwargs),
+         )
+
+     def test_read_csv_when_decimal_scale_overflows_and_raise_kwarg_specified(self):
+         schema = pa.schema(
+             [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, 2))]
+         )
+         kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+         _add_column_kwargs(
+             ContentType.UNESCAPED_TSV.value,
+             ["is_active", "decimal_value"],
+             ["is_active", "decimal_value"],
+             kwargs,
+         )
+         read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+         kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+         self.assertRaises(
+             pa.lib.ArrowInvalid,
+             lambda: pyarrow_read_csv(
+                 OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
+                 **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+             ),
+         )
+
+     def test_read_csv_when_decimal_scale_overflows_sanity(self):
+         schema = pa.schema(
+             [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, 2))]
+         )
+         kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+         _add_column_kwargs(
+             ContentType.UNESCAPED_TSV.value,
+             ["is_active", "decimal_value"],
+             ["is_active", "decimal_value"],
+             kwargs,
+         )
+
+         read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+         kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+         result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
+
+         self.assertEqual(len(result), 3)
+         self.assertEqual(
+             result[1][0].as_py(), decimal.Decimal("322236.66")
+         )  # rounded decimal
+         self.assertEqual(result[1][1].as_py(), decimal.Decimal("32.33"))  # not rounded
+         self.assertEqual(len(result.column_names), 2)
+         result_schema = result.schema
+         self.assertEqual(result_schema.field(0).type, "string")
+         self.assertEqual(result_schema.field(1).type, pa.decimal128(20, 2))
+
+     def test_read_csv_when_decimal_scale_overflows_and_negative_scale(self):
+         schema = pa.schema(
+             [("is_active", pa.string()), ("decimal_value", pa.decimal128(20, -2))]
+         )
+         kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+         _add_column_kwargs(
+             ContentType.UNESCAPED_TSV.value,
+             ["is_active", "decimal_value"],
+             ["is_active", "decimal_value"],
+             kwargs,
+         )
+
+         read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+         kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+         result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
+
+         self.assertEqual(len(result), 3)
+         self.assertEqual(
+             result[1][0].as_py(),
+             decimal.Decimal("322200"),  # consequence of negative scale
+         )  # rounded decimal
+         self.assertEqual(result[1][1].as_py(), decimal.Decimal("00"))
+         self.assertEqual(len(result.column_names), 2)
+         result_schema = result.schema
+         self.assertEqual(result_schema.field(0).type, "string")
+         self.assertEqual(result_schema.field(1).type, pa.decimal128(20, -2))
+
+     def test_read_csv_when_decimal_scale_overflows_with_decimal256(self):
+         schema = pa.schema(
+             [("is_active", pa.string()), ("decimal_value", pa.decimal256(20, 2))]
+         )
+         kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+         _add_column_kwargs(
+             ContentType.UNESCAPED_TSV.value,
+             ["is_active", "decimal_value"],
+             ["is_active", "decimal_value"],
+             kwargs,
+         )
+
+         read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+         kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+         result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
+
+         self.assertEqual(len(result), 3)
+         self.assertEqual(
+             result[1][0].as_py(), decimal.Decimal("322236.66")
+         )  # rounded decimal
+         self.assertEqual(result[1][1].as_py(), decimal.Decimal("32.33"))  # not rounded
+         self.assertEqual(len(result.column_names), 2)
+         result_schema = result.schema
+         self.assertEqual(result_schema.field(0).type, "string")
+         self.assertEqual(result_schema.field(1).type, pa.decimal256(20, 2))
+
+     def test_read_csv_when_decimal_scale_overflows_with_decimal256_and_raise_on_overflow(
+         self,
+     ):
+         schema = pa.schema(
+             [("is_active", pa.string()), ("decimal_value", pa.decimal256(20, 2))]
+         )
+         kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+         _add_column_kwargs(
+             ContentType.UNESCAPED_TSV.value,
+             ["is_active", "decimal_value"],
+             ["is_active", "decimal_value"],
+             kwargs,
+         )
+
+         read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+         kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+         self.assertRaises(
+             pa.lib.ArrowNotImplementedError,
+             lambda: pyarrow_read_csv(
+                 OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
+                 **{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
+             ),
+         )
+
+     def test_read_csv_when_decimal_scale_overflows_without_any_schema_then_infers(self):
+         kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+         read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=None)
+         kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+         result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
+
+         # The default behavior of pyarrow is to skip invalid rows
+         self.assertEqual(len(result), 2)
+         self.assertEqual(result[1][0].as_py(), 32.33)  # rounded decimal
+         self.assertEqual(result[1][1].as_py(), 0.4)  # not rounded
+         self.assertEqual(len(result.column_names), 2)
+         result_schema = result.schema
+         self.assertEqual(result_schema.field(0).type, "string")
+         self.assertEqual(result_schema.field(1).type, pa.float64())
+
+     def test_read_csv_when_decimal_scale_and_precision_overflow_and_raise_on_overflow(
+         self,
+     ):
+         schema = pa.schema(
+             [("is_active", pa.string()), ("decimal_value", pa.decimal128(5, 2))]
+         )
+         kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+         _add_column_kwargs(
+             ContentType.UNESCAPED_TSV.value,
+             ["is_active", "decimal_value"],
+             ["is_active", "decimal_value"],
+             kwargs,
+         )
+
+         read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+         kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+         self.assertRaises(
+             pa.lib.ArrowInvalid,
+             lambda: pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs),
+         )
+
+     def test_read_csv_when_decimal_scale_overflow_and_file_like_obj_passed(self):
+         schema = pa.schema(
+             [("is_active", pa.string()), ("decimal_value", pa.decimal128(15, 2))]
+         )
+         kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
+         _add_column_kwargs(
+             ContentType.UNESCAPED_TSV.value,
+             ["is_active", "decimal_value"],
+             ["is_active", "decimal_value"],
+             kwargs,
+         )
+
+         read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
+
+         kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
+
+         with open(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, "rb") as file:
+             result = pyarrow_read_csv(file, **kwargs)
+
+         self.assertEqual(len(result), 3)
+         self.assertEqual(
+             result[1][0].as_py(), decimal.Decimal("322236.66")
+         )  # rounded decimal
+         self.assertEqual(
+             result[1][1].as_py(), decimal.Decimal("32.33")
+         )  # not rounded
+         self.assertEqual(len(result.column_names), 2)
+         result_schema = result.schema
+         self.assertEqual(result_schema.field(0).type, "string")
+         self.assertEqual(result_schema.field(1).type, pa.decimal128(15, 2))
+
 
  class TestS3FileToTable(TestCase):
      def test_s3_file_to_table_identity_sanity(self):
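One detail worth noting in the decimal256 tests above: when RAISE_ON_DECIMAL_OVERFLOW is set, the read-as-string fallback is disabled and the decimal256 column type is handed straight to pyarrow's CSV reader, which has no CSV-to-decimal256 conversion as of pyarrow 12; that is why the expected error switches from ArrowInvalid to ArrowNotImplementedError. A standalone reproduction of that pyarrow limitation, independent of deltacat:

import io

import pyarrow as pa
import pyarrow.csv as pacsv

data = io.BytesIO(b"decimal_value\n1.23\n")
try:
    pacsv.read_csv(
        data,
        convert_options=pacsv.ConvertOptions(
            column_types={"decimal_value": pa.decimal256(20, 2)}
        ),
    )
except pa.lib.ArrowNotImplementedError as e:
    print(type(e).__name__)  # CSV conversion to decimal256 is unsupported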
deltacat/utils/pyarrow.py CHANGED
@@ -1,6 +1,7 @@
  # Allow classes to use self-referencing Type hints in Python 3.7.
  from __future__ import annotations
 
+ import copy
  import bz2
  import gzip
  import io
@@ -47,6 +48,17 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
  RAISE_ON_EMPTY_CSV_KWARG = "raise_on_empty_csv"
  READER_TYPE_KWARG = "reader_type"
 
+ """
+ By default, round decimal values using the half_to_even round mode when
+ rescaling a decimal to the scale and precision given in the schema would
+ cause data loss. Setting any non-null value for this argument will result
+ in an error instead.
+ """
+ RAISE_ON_DECIMAL_OVERFLOW = "raise_on_decimal_overflow"
+ # Note the maximum from https://arrow.apache.org/docs/python/generated/pyarrow.Decimal256Type.html#pyarrow.Decimal256Type
+ DECIMAL256_DEFAULT_SCALE = 38
+ DECIMAL256_MAX_PRECISION = 76
+
 
  def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Schema:
 
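The half_to_even mode referenced in the docstring is banker's rounding, which is also pyarrow's default rounding mode, and decimal256(76, 38) is the widest type these constants describe. A small sketch of the cast-then-round rescaling those constants support, mirroring the fallback path added further down (input values are illustrative):

import pyarrow as pa
import pyarrow.compute as pc

# Parse as strings, widen to the maximal decimal256 type, round to the
# target scale (half_to_even is the default round mode), then narrow to
# the declared type.
values = pa.array(["322236.6666", "1.005"]).cast(pa.decimal256(76, 38))
rounded = pc.round(values, ndigits=2)
print(rounded.cast(pa.decimal256(20, 2)))  # [322236.67, 1.00]; ties go to even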
@@ -64,45 +76,162 @@ def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Sche
      return target_schema
 
 
- def pyarrow_read_csv(*args, **kwargs) -> pa.Table:
-     try:
-         new_kwargs = sanitize_kwargs_by_supported_kwargs(
-             ["read_options", "parse_options", "convert_options", "memory_pool"], kwargs
+ def _extract_arrow_schema_from_read_csv_kwargs(kwargs: Dict[str, Any]) -> pa.Schema:
+     schema = None
+     if (
+         "convert_options" in kwargs
+         and kwargs["convert_options"].column_types is not None
+     ):
+         schema = kwargs["convert_options"].column_types
+         if not isinstance(schema, pa.Schema):
+             schema = pa.schema(schema)
+         if kwargs["convert_options"].include_columns:
+             schema = _filter_schema_for_columns(
+                 schema, kwargs["convert_options"].include_columns
+             )
+         elif (
+             kwargs.get("read_options") is not None
+             and kwargs["read_options"].column_names
+         ):
+             schema = _filter_schema_for_columns(
+                 schema, kwargs["read_options"].column_names
+             )
+     else:
+         logger.debug(
+             "Schema not specified in the kwargs."
+             " Hence, schema could not be inferred from the empty CSV."
          )
+
+     return schema
+
+
+ def _new_schema_with_replaced_fields(
+     schema: pa.Schema, field_to_replace: Callable[[pa.Field], Optional[pa.Field]]
+ ) -> pa.Schema:
+     if schema is None:
+         return None
+
+     new_schema_fields = []
+     for field in schema:
+         new_field = field_to_replace(field)
+         if new_field is not None:
+             new_schema_fields.append(new_field)
+         else:
+             new_schema_fields.append(field)
+
+     return pa.schema(new_schema_fields, metadata=schema.metadata)
+
+
+ def _read_csv_rounding_decimal_columns_to_fit_scale(
+     schema: pa.Schema, reader_args: List[Any], reader_kwargs: Dict[str, Any]
+ ) -> pa.Table:
+     # Note: We read decimals as strings first because CSV
+     # conversion to decimal256 isn't implemented as of pyarrow==12.0.1
+     new_schema = _new_schema_with_replaced_fields(
+         schema,
+         lambda fld: pa.field(fld.name, pa.string(), metadata=fld.metadata)
+         if pa.types.is_decimal128(fld.type) or pa.types.is_decimal256(fld.type)
+         else None,
+     )
+     new_kwargs = sanitize_kwargs_by_supported_kwargs(
+         ["read_options", "parse_options", "convert_options", "memory_pool"],
+         reader_kwargs,
+     )
+     # Creating a shallow copy for efficiency
+     new_convert_options = copy.copy(new_kwargs["convert_options"])
+     new_convert_options.column_types = new_schema
+     new_reader_kwargs = {**new_kwargs, "convert_options": new_convert_options}
+     arrow_table = pacsv.read_csv(*reader_args, **new_reader_kwargs)
+
+     for column_index, field in enumerate(schema):
+         if pa.types.is_decimal128(field.type) or pa.types.is_decimal256(field.type):
+             column_array = arrow_table[field.name]
+             # We always cast to decimal256 to accommodate the fixed scale of 38
+             cast_to_type = pa.decimal256(
+                 DECIMAL256_MAX_PRECISION, DECIMAL256_DEFAULT_SCALE
+             )
+             casted_decimal_array = pc.cast(column_array, cast_to_type)
+             # Note that scale can be negative
+             rounded_column_array = pc.round(
+                 casted_decimal_array, ndigits=field.type.scale
+             )
+             final_decimal_array = pc.cast(rounded_column_array, field.type)
+             arrow_table = arrow_table.set_column(
+                 column_index,
+                 field,
+                 final_decimal_array,
+             )
+             logger.debug(
+                 f"Rounded decimal column: {field.name} to {field.type.scale} scale and"
+                 f" {field.type.precision} precision"
+             )
+
+     return arrow_table
+
+
+ def pyarrow_read_csv_default(*args, **kwargs):
+     new_kwargs = sanitize_kwargs_by_supported_kwargs(
+         ["read_options", "parse_options", "convert_options", "memory_pool"], kwargs
+     )
+
+     try:
          return pacsv.read_csv(*args, **new_kwargs)
      except pa.lib.ArrowInvalid as e:
-         if e.__str__() == "Empty CSV file" and not kwargs.get(RAISE_ON_EMPTY_CSV_KWARG):
-             schema = None
-             if (
-                 "convert_options" in kwargs
-                 and kwargs["convert_options"].column_types is not None
-             ):
-                 schema = kwargs["convert_options"].column_types
-                 if not isinstance(schema, pa.Schema):
-                     schema = pa.schema(schema)
-                 if kwargs["convert_options"].include_columns:
-                     schema = _filter_schema_for_columns(
-                         schema, kwargs["convert_options"].include_columns
-                     )
-                 elif (
-                     kwargs.get("read_options") is not None
-                     and kwargs["read_options"].column_names
+         error_str = e.__str__()
+         schema = _extract_arrow_schema_from_read_csv_kwargs(kwargs)
+
+         if error_str == "Empty CSV file" and not kwargs.get(RAISE_ON_EMPTY_CSV_KWARG):
+             logger.debug(f"Read CSV empty schema being used: {schema}")
+             return pa.Table.from_pylist([], schema=schema)
+         if not kwargs.get(RAISE_ON_DECIMAL_OVERFLOW):
+             # Note, this logic requires expensive casting. To prevent downgrading performance
+             # for happy path reads, we are handling this case in response to an error.
+             logger.warning(
+                 "Rescaling Decimal to the given scale in the schema. "
+                 f"Original error: {error_str}"
+             )
+
+             if schema is not None and "convert_options" in kwargs:
+                 if (
+                     "Rescaling Decimal" in error_str
+                     and "value would cause data loss" in error_str
                  ):
-                     schema = _filter_schema_for_columns(
-                         schema, kwargs["read_options"].column_names
+                     logger.debug(f"Checking if the file: {args[0]}...")
+                     # Since we are re-reading the file, we have to seek to the beginning
+                     if isinstance(args[0], io.IOBase) and args[0].seekable():
+                         logger.debug(f"Seeking to the beginning of the file {args[0]}")
+                         args[0].seek(0)
+                     return _read_csv_rounding_decimal_columns_to_fit_scale(
+                         schema=schema, reader_args=args, reader_kwargs=kwargs
                      )
-
              else:
                  logger.debug(
-                     "Schema not specified in the kwargs."
-                     " Hence, schema could not be inferred from the empty CSV."
+                     "Schema is None when trying to adjust decimal values. "
+                     "Hence, bubbling up exception..."
                  )
 
-             logger.debug(f"Read CSV empty schema being used: {schema}")
-             return pa.Table.from_pylist([], schema=schema)
          raise e
 
 
+ def pyarrow_read_csv(*args, **kwargs) -> pa.Table:
+     schema = _extract_arrow_schema_from_read_csv_kwargs(kwargs)
+
+     # CSV conversion to decimal256 isn't supported as of pyarrow==12.0.1
+     # Below ensures decimal256 is cast properly.
+     schema_includes_decimal256 = (
+         (True if any([pa.types.is_decimal256(x.type) for x in schema]) else False)
+         if schema is not None
+         else None
+     )
+     if schema_includes_decimal256 and not kwargs.get(RAISE_ON_DECIMAL_OVERFLOW):
+         # falling back to the expensive method of reading the CSV
+         return _read_csv_rounding_decimal_columns_to_fit_scale(
+             schema, reader_args=args, reader_kwargs=kwargs
+         )
+     else:
+         return pyarrow_read_csv_default(*args, **kwargs)
+
+
  CONTENT_TYPE_TO_PA_READ_FUNC: Dict[str, Callable] = {
      ContentType.UNESCAPED_TSV.value: pyarrow_read_csv,
      ContentType.TSV.value: pyarrow_read_csv,
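Taken together, the rewrite splits the old single function into three parts: schema extraction from the reader kwargs, a slow path that re-reads decimal columns as strings and rounds them to fit their declared scale, and a default path that only falls back to the slow path in response to a rescale error (or up front, when the schema contains decimal256). A hypothetical caller showing both behaviors; the file name and schema are invented:

import pyarrow as pa
import pyarrow.csv as pacsv

convert_options = pacsv.ConvertOptions(
    column_types=pa.schema([("amount", pa.decimal128(10, 2))])
)

# Default: an "amount" value with too many fractional digits is re-read as
# a string, rounded half_to_even to scale 2, and cast back to decimal128(10, 2).
table = pyarrow_read_csv("prices.csv", convert_options=convert_options)

# Opt out of the lossy fallback and surface the original pa.lib.ArrowInvalid:
table = pyarrow_read_csv(
    "prices.csv",
    convert_options=convert_options,
    **{RAISE_ON_DECIMAL_OVERFLOW: True},
)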
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: deltacat
- Version: 1.1.26
+ Version: 1.1.28
  Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
  Home-page: https://github.com/ray-project/deltacat
  Author: Ray Team
@@ -1,4 +1,4 @@
- deltacat/__init__.py,sha256=N7LrDYFJUaYdchJUVZ8VN_9QUJzuETzkz-oT833iEr4,1778
+ deltacat/__init__.py,sha256=GPlTQc6AW4ig_nZJ7kMVe-kbZxYfrSVGFN1YEqY8dXU,1778
  deltacat/constants.py,sha256=TUJLXUJ9xq1Ryil72yLkKR8EDH_Irp5wUg56QstbRNE,2181
  deltacat/exceptions.py,sha256=7sjk3BuMY5Oo-6OvAfHncZx_OcvtEL47BblWr2F7waE,12740
  deltacat/logs.py,sha256=EQSDin1deehzz5xlLV1_TrFJrO_IBZ9Ahp7MdL-4cK8,9363
@@ -66,7 +66,7 @@ deltacat/compute/compactor_v2/model/merge_file_group.py,sha256=1o86t9lc3K6ZvtViV
  deltacat/compute/compactor_v2/model/merge_input.py,sha256=-SxTE0e67z2V7MiMEVz5aMu4E0k8h3-vqohvUUOC0do,5659
  deltacat/compute/compactor_v2/model/merge_result.py,sha256=_IZTCStpb4UKiRCJYA3g6EhAqjrw0t9vmoDAN8kIK-Y,436
  deltacat/compute/compactor_v2/private/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- deltacat/compute/compactor_v2/private/compaction_utils.py,sha256=e8pZFobq6KBCy67ZRn2z1CAwNVjPIJnAiD4HHDmDbCk,30757
+ deltacat/compute/compactor_v2/private/compaction_utils.py,sha256=fMWXg1SCIIgjk9p_OFYrcm760dOKNbFO1Lj3_JI3GCY,30929
  deltacat/compute/compactor_v2/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  deltacat/compute/compactor_v2/steps/hash_bucket.py,sha256=1R5xLUkl7GqL1nY-apAgY1czKDEHjIVYSRi9qLOMass,6726
  deltacat/compute/compactor_v2/steps/merge.py,sha256=LpktsDPfj7Of6RgUw9w1f3Y3OBkPDjvtyXjzFaIDoSo,21771
@@ -85,7 +85,7 @@ deltacat/compute/merge_on_read/model/merge_on_read_params.py,sha256=Q51znagh8PtL
  deltacat/compute/merge_on_read/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  deltacat/compute/merge_on_read/utils/delta.py,sha256=e4BtOHa5XPpUnR4r0HqBKjXckBsTI8qBwdUWwpJfkWQ,1367
  deltacat/compute/resource_estimation/__init__.py,sha256=4bfBXcq-VAt9JCmjvj3yAmn0lEHVGdGsUCCoMGxjEqA,799
- deltacat/compute/resource_estimation/delta.py,sha256=Ei4v9UYhtcT5P-wNEMAg0E4mYl0z5FpSkaTufVoGD18,9492
+ deltacat/compute/resource_estimation/delta.py,sha256=8oRy1rgGUimwMqPB5At81AS-AsjPHdcvLHzJ9TW8RpM,9522
  deltacat/compute/resource_estimation/manifest.py,sha256=gSqOyIda-pYq3vRsKFq3IiZvwhV3mMqrWPtsmUH9dD8,13035
  deltacat/compute/resource_estimation/model.py,sha256=psyagFXdpLGt8DfDqy7c8DWiuXCacr0Swe5f0M7DdO4,5465
  deltacat/compute/resource_estimation/parquet.py,sha256=5_apma4EKbKcm-nfV73-qN2nfnCeyhFW23ZHX3jz0Kw,3158
@@ -137,11 +137,11 @@ deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py,sha256=kW
  deltacat/tests/compute/compact_partition_rebase_test_cases.py,sha256=8HVr3EIFYFqNaJoqeCuj9xIBjM4Ch2bx-mJcO4BRrLo,16839
  deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py,sha256=l_6-pAKOsRY3NbtfHsYmEaJEkq6IJueYuLsjyJxNgz4,81564
  deltacat/tests/compute/compact_partition_test_cases.py,sha256=R9eiKvxCLqcoHjAx3iOogdnXZEO9TvLbRf0wA7bcJN4,26170
- deltacat/tests/compute/test_compact_partition_incremental.py,sha256=Z0hyQGhMZjCaOn1Vk4qUbgDiS7HDhtdNeFQyG1PJhqA,14559
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py,sha256=Qw74ajnKf41C3MCMvf4bIPXA6-ucKlPj_IeEqDm8rCg,12503
+ deltacat/tests/compute/test_compact_partition_incremental.py,sha256=lkfAraOJmEmieesf7b1BqlfTS26YjYM5xXOXoTMrsos,14989
+ deltacat/tests/compute/test_compact_partition_multiple_rounds.py,sha256=xXBA66TTfARR90m5KQs31nmiokuMy9iGQt7Z9evyG7M,12950
  deltacat/tests/compute/test_compact_partition_params.py,sha256=Dm5eLyHo8oGMeO3XBbpj1rZqHtPZ1hAB7z2qvzc4Lxk,8497
- deltacat/tests/compute/test_compact_partition_rebase.py,sha256=ztSiLgC2OpU4yz81vz-4xWzvZyrLGojtzomsW4q7Bl8,12626
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py,sha256=CHHfNFEJW8S1We7NE1Gg6EaoKEWnaOMRxWrLyirrahc,14643
+ deltacat/tests/compute/test_compact_partition_rebase.py,sha256=DNcpmnBo5QoZ23BiIhJCC3zaDK0xClZLUb2-ZEEp5s4,13108
+ deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py,sha256=Rxen3QGIaxVPa8lcO7NDMRxQ0aBjrOKn46LK5ZsfQTo,15073
  deltacat/tests/compute/test_util_common.py,sha256=0mEHo38bgH64y0XZ_zgUL_aZgQMgJOSTlOYvIJxG_MM,11825
  deltacat/tests/compute/test_util_constant.py,sha256=4o-W3E7r7jhFl1A3OFLLrdKnwcF46zx4lEIDY8ONJ3c,929
  deltacat/tests/compute/test_util_create_table_deltas_repo.py,sha256=Q3HJj1fjoe2JwRUOW8KEjbTqPIIoP2o_T3ZGH6SJnCM,13244
@@ -157,7 +157,7 @@ deltacat/tests/compute/compactor_v2/test_hashlib.py,sha256=8csF2hFWtBvY2MbX3-6ip
  deltacat/tests/compute/compactor_v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  deltacat/tests/compute/compactor_v2/utils/test_task_options.py,sha256=37DkR1u_XwhedV9cGed6FFuJTC0XmuiowHJIa_Op6uA,865
  deltacat/tests/compute/resource_estimation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- deltacat/tests/compute/resource_estimation/test_delta.py,sha256=LyzRitBrasQa35Bq7rHTQInaOelSWOSoC0_dyjgpNuE,24505
+ deltacat/tests/compute/resource_estimation/test_delta.py,sha256=HCL2oUnCqm0E26T3HLJjMhoAsHTJIWPYGwIKRgM_H7E,25712
  deltacat/tests/compute/resource_estimation/test_manifest.py,sha256=yrMvqDjolExdRf6Vtg5XaKDuaKz9ok15PCZ7_aJOYrI,32893
  deltacat/tests/compute/resource_estimation/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  deltacat/tests/io/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -179,7 +179,7 @@ deltacat/tests/utils/test_cloudpickle.py,sha256=J0pnBY3-PxlUh6MamZAN1PuquKQPr2iy
  deltacat/tests/utils/test_daft.py,sha256=kY8lkXoQvyWunok8UvOsh1An297rb3jcnstTuIAyAlc,8232
  deltacat/tests/utils/test_metrics.py,sha256=Ym9nOz1EtB180pLmvugihj1sDTNDMb5opIjjr5Nmcls,16339
  deltacat/tests/utils/test_placement.py,sha256=g61wVOMkHe4YJeR9Oxg_BOVQ6bhHHbC3IBYv8YhUu94,597
- deltacat/tests/utils/test_pyarrow.py,sha256=AWx0DTsBA1PkQag48w_HeQdz7tlBzJsm9v7Nd6-dhEY,19607
+ deltacat/tests/utils/test_pyarrow.py,sha256=YDuyFYNjy6thzfA6Z2a0dOytUugsExu1uMUOhmP_aXc,29977
  deltacat/tests/utils/test_record_batch_tables.py,sha256=AkG1WyljQmjnl-AxhbFWyo5LnMIKRyLScfgC2B_ES-s,11321
  deltacat/tests/utils/test_resources.py,sha256=HtpvDrfPZQNtGDXUlsIzc_yd7Vf1cDscZ3YbN0oTvO8,2560
  deltacat/tests/utils/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -200,7 +200,7 @@ deltacat/utils/numpy.py,sha256=SpHKKvC-K8NINTWGVfTZ5-gBFTGYqaXjjgKFhsdUjwg,2049
  deltacat/utils/pandas.py,sha256=q99mlRB7tymICMcNbfGLfLqFu_C-feyPZKZm2CWJJVc,9574
  deltacat/utils/performance.py,sha256=7ZLaMkS1ehPSIhT5uOQVBHvjC70iKHzoFquFo-KL0PI,645
  deltacat/utils/placement.py,sha256=Lj20fb-eq8rgMdm_M2MBMfDLwhDM1sS1nJj2DvIK56s,12060
- deltacat/utils/pyarrow.py,sha256=nW_eD6fWAlbyHUzPj1rOOfnUbpP3RnAgNSuuVNyvhZ4,29174
+ deltacat/utils/pyarrow.py,sha256=xEZRzbTBU6uj9K4DtvngIPtQkTA8haVgQ4Y4vjwHvtM,34311
  deltacat/utils/resources.py,sha256=Ax1OgLLbZI4oYpp4Ki27OLaST-7I-AJgZwU87FVfY8g,8253
  deltacat/utils/s3fs.py,sha256=PmUJ5Fm1WmD-_zp_M6yd9VbXvIoJuBeK6ApOdJJApLE,662
  deltacat/utils/schema.py,sha256=m4Wm4ZQcpttzOUxex4dVneGlHy1_E36HspTcjNYzvVM,1564
@@ -210,8 +210,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=JDVwMiQWrmuSlyCWAoiq9ctoJ0XADEfDD
  deltacat/utils/ray_utils/dataset.py,sha256=waHdtH0c835a-2t51HYRHnulfC0_zBxx8mFSAPvPSPM,3274
  deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
  deltacat/utils/ray_utils/runtime.py,sha256=rB0A-tU9WZHz0J11LzJdANYtL397YyuemcA1l-K9dAw,5029
- deltacat-1.1.26.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- deltacat-1.1.26.dist-info/METADATA,sha256=5p2qZYAkOXBNT_rc9PyfGJ5Id3zKfbTp3KhiqZWNxas,1733
- deltacat-1.1.26.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
- deltacat-1.1.26.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
- deltacat-1.1.26.dist-info/RECORD,,
+ deltacat-1.1.28.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ deltacat-1.1.28.dist-info/METADATA,sha256=-ZnMp9C26vVxi015Il3UtBMm9pHqA8aZmTnNTgr8Tb8,1733
+ deltacat-1.1.28.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ deltacat-1.1.28.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
+ deltacat-1.1.28.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: bdist_wheel (0.44.0)
+ Generator: bdist_wheel (0.45.1)
  Root-Is-Purelib: true
  Tag: py3-none-any
 