deltacat 2.0.0.post1__py3-none-any.whl → 2.0.0.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/api.py +44 -7
- deltacat/catalog/main/impl.py +34 -110
- deltacat/examples/hello_world.py +10 -4
- deltacat/examples/indexer/indexer.py +3 -0
- deltacat/examples/indexer/job_runner.py +6 -1
- deltacat/storage/model/schema.py +17 -4
- deltacat/tests/aws/test_s3u.py +9 -1
- deltacat/tests/catalog/test_default_catalog_impl.py +198 -7
- deltacat/types/media.py +282 -0
- deltacat/types/tables.py +5 -11
- deltacat/utils/pandas.py +11 -3
- deltacat/utils/polars.py +3 -1
- deltacat/utils/pyarrow.py +7 -3
- deltacat/utils/url.py +22 -0
- {deltacat-2.0.0.post1.dist-info → deltacat-2.0.0.post2.dist-info}/METADATA +161 -47
- {deltacat-2.0.0.post1.dist-info → deltacat-2.0.0.post2.dist-info}/RECORD +20 -20
- {deltacat-2.0.0.post1.dist-info → deltacat-2.0.0.post2.dist-info}/WHEEL +0 -0
- {deltacat-2.0.0.post1.dist-info → deltacat-2.0.0.post2.dist-info}/licenses/LICENSE +0 -0
- {deltacat-2.0.0.post1.dist-info → deltacat-2.0.0.post2.dist-info}/top_level.txt +0 -0
@@ -1048,11 +1048,11 @@ class TestCopyOnWrite:
         "dataset_type",
         [
             DatasetType.PANDAS,
-            DatasetType.PYARROW,
-            DatasetType.POLARS,
-            DatasetType.DAFT,
-            DatasetType.RAY_DATASET,
-            DatasetType.NUMPY,
+            DatasetType.PYARROW,
+            DatasetType.POLARS,
+            DatasetType.DAFT,
+            DatasetType.RAY_DATASET,
+            DatasetType.NUMPY,
         ],
     )
     def test_partial_upsert_all_dataset_types(self, dataset_type):
@@ -1114,8 +1114,14 @@ class TestCopyOnWrite:
             table=table_name,
             namespace=self.test_namespace,
             catalog=self.catalog_name,
+            read_as=dataset_type,
+        )
+        table = dc.get_table(
+            table_name,
+            catalog=self.catalog_name,
+            namespace=self.test_namespace,
         )
-        result_df =
+        result_df = dc.to_pandas(result, schema=table.table_version.schema.arrow)

         # Verify results
         assert len(result_df) == 4, f"Should have 4 records ({dataset_type.value})"
@@ -1239,8 +1245,14 @@ class TestCopyOnWrite:
             table=table_name,
             namespace=self.test_namespace,
             catalog=self.catalog_name,
+            read_as=dataset_type,
+        )
+        table = dc.get_table(
+            table_name,
+            catalog=self.catalog_name,
+            namespace=self.test_namespace,
         )
-        result_df =
+        result_df = dc.to_pandas(result, schema=table.table_version.schema.arrow)

         # Verify results
         assert len(result_df) == 4, f"Should have 4 records ({dataset_type.value})"
@@ -1401,6 +1413,185 @@ class TestCopyOnWrite:

         self._verify_dataframe_contents(result, expected_final_data)

+    def test_schema_evolution_delta_manifest_schema_ids(self):
+        """
+        Test that delta manifest entries record correct schema IDs during schema evolution.
+
+        This test verifies the fix for the issue where MERGE operations with new columns
+        were recording incorrect schema IDs in delta manifest entries, causing reads
+        to use old schemas instead of evolved schemas.
+        """
+        from deltacat.storage.model.metafile import Metafile
+        from deltacat.storage.model.delta import Delta
+
+        table_name = "test_schema_evolution_manifest_ids"
+
+        # Step 1: Create table with merge keys (initial schema)
+        self._create_table_with_merge_keys(table_name)
+
+        # Step 2: Write initial data using PyArrow for an exact match with the declared schema
+        # This ensures that schema evolution isn't triggered by the first write (which would
+        # result in 2 schemas created by the first write instead of 1)
+        initial_data = pa.table(
+            {
+                "id": pa.array([1, 2, 3], type=pa.int64()),
+                "name": pa.array(["Alice", "Bob", "Charlie"], type=pa.string()),
+                "age": pa.array([25, 30, 35], type=pa.int32()),
+                "city": pa.array(["NYC", "LA", "Chicago"], type=pa.string()),
+            }
+        )
+        dc.write_to_table(
+            data=initial_data,
+            table=table_name,
+            namespace=self.test_namespace,
+            mode=TableWriteMode.MERGE,
+            content_type=ContentType.PARQUET,
+            catalog=self.catalog_name,
+        )
+
+        # Step 3: Write MERGE data with NEW COLUMNS (triggers schema evolution)
+        merge_data = pa.table(
+            {
+                "id": pa.array([1, 2, 4], type=pa.int64()),  # Update existing + add new
+                "salary": pa.array(
+                    [50000, 60000, 55000], type=pa.int64()
+                ),  # NEW COLUMN
+                "department": pa.array(
+                    ["Engineering", "Sales", "Marketing"], type=pa.string()
+                ),  # NEW COLUMN
+            }
+        )
+
+        dc.write_to_table(
+            data=merge_data,
+            table=table_name,
+            namespace=self.test_namespace,
+            mode=TableWriteMode.MERGE,
+            content_type=ContentType.PARQUET,
+            catalog=self.catalog_name,
+        )
+
+        # Writing the same data again shouldn't trigger schema evolution
+        dc.write_to_table(
+            data=merge_data,
+            table=table_name,
+            namespace=self.test_namespace,
+            mode=TableWriteMode.MERGE,
+            content_type=ContentType.PARQUET,
+            catalog=self.catalog_name,
+        )
+
+        # Step 4: Get table definition to access schema evolution history
+        table_def = dc.get_table(
+            table=table_name,
+            namespace=self.test_namespace,
+            catalog=self.catalog_name,
+        )
+
+        all_schemas = table_def.table_version.schemas
+
+        # Verify we have schema evolution (should have 2 schemas: original + evolved)
+        assert (
+            len(all_schemas) == 2
+        ), f"Expected 2 schemas after evolution, got {len(all_schemas)}"
+
+        initial_schema = all_schemas[0]  # Original schema
+        evolved_schema = all_schemas[1]  # Latest schema after evolution
+
+        initial_schema_id = initial_schema.id
+        evolved_schema_id = evolved_schema.id
+
+        # Step 5: Extract schema IDs from delta manifest entries
+        def extract_schema_ids_from_deltas(all_objects):
+            """Extract schema IDs from Delta objects by parsing manifest entries."""
+            schema_ids = []
+            for obj in all_objects:
+                obj_type = Metafile.get_class(obj)
+                if obj_type == Delta:
+                    delta_obj = obj
+                    # Access manifest entries to get schema IDs
+                    if delta_obj.manifest:
+                        manifest = delta_obj.manifest
+                        if manifest.entries:
+                            for i, entry in enumerate(manifest.entries):
+                                # Extract schema ID from manifest entry
+                                if entry.meta and entry.meta.schema_id is not None:
+                                    schema_id_value = entry.meta.schema_id
+                                    schema_ids.append(schema_id_value)
+            return schema_ids
+
+        # Use dc.list with recursive=True to find all objects for this table
+        table_url = dc.DeltaCatUrl(
+            f"dc://{self.catalog_name}/{self.test_namespace}/{table_name}"
+        )
+        all_objects = dc.list(table_url, recursive=True)
+
+        # Extract schema IDs from all delta manifest entries
+        manifest_schema_ids = extract_schema_ids_from_deltas(all_objects)
+
+        # Step 6: Verify schema ID correctness
+        # We should have exactly 4 manifest entries (1 from first write + 3 from second write + 0 from third write)
+        assert (
+            len(manifest_schema_ids) == 4
+        ), f"Expected 4 manifest entries with schema IDs, got {len(manifest_schema_ids)}"
+
+        # Check if manifest schema IDs match table schema IDs
+        table_schema_ids = {initial_schema_id, evolved_schema_id}
+        manifest_schema_ids_set = set(manifest_schema_ids)
+
+        if table_schema_ids == manifest_schema_ids_set:
+            # The first delta should use the initial schema ID
+            initial_entries = [
+                sid for sid in manifest_schema_ids if sid == initial_schema_id
+            ]
+            assert (
+                len(initial_entries) == 1
+            ), f"Expected 1 initial entry with schema ID {initial_schema_id}, but found {len(initial_entries)}"
+
+            # The second delta should use the evolved schema ID
+            evolved_entries = [
+                sid for sid in manifest_schema_ids if sid == evolved_schema_id
+            ]
+            assert (
+                len(evolved_entries) == 3
+            ), f"Expected 3 evolved entries with schema ID {evolved_schema_id}, but found {len(evolved_entries)}"
+        else:
+            # This should not happen with PyArrow tables - fail the test
+            assert (
+                False
+            ), f"Schema IDs should match. Table: {sorted(table_schema_ids)}, Manifest: {sorted(manifest_schema_ids_set)}"
+
+        # Step 7: Verify the data can be read correctly with evolved schema
+        final_data = dc.to_pandas(
+            dc.read_table(
+                table=table_name,
+                namespace=self.test_namespace,
+                catalog=self.catalog_name,
+            )
+        )
+
+        # Should have all original columns plus new columns
+        expected_columns = {"id", "name", "age", "city", "salary", "department"}
+        actual_columns = set(final_data.columns)
+        assert expected_columns.issubset(
+            actual_columns
+        ), f"Missing columns: {expected_columns - actual_columns}"
+
+        # Verify data integrity - all records should have both old and new data
+        assert (
+            len(final_data) == 4
+        ), f"Expected 4 records after merge, got {len(final_data)}"
+
+        # Check that evolved columns are properly populated
+        salary_values = final_data["salary"].dropna()
+        dept_values = final_data["department"].dropna()
+        assert (
+            len(salary_values) >= 3
+        ), f"Expected salary values for at least 3 records, got {len(salary_values)}"
+        assert (
+            len(dept_values) >= 3
+        ), f"Expected department values for at least 3 records, got {len(dept_values)}"
+
     def test_append_delta_count_compaction(self):
         """Test that compaction is triggered by appended delta count for APPEND mode writes."""
         table_name = "test_append_delta_compaction"
deltacat/types/media.py
CHANGED
@@ -1,3 +1,5 @@
+# Allow classes to use self-referencing Type hints in Python 3.7.
+from __future__ import annotations
 from enum import Enum
 from typing import Set, Dict

@@ -401,3 +403,283 @@ class DatastoreType(str, Enum):
     WARC = "warc"
     WEBDATASET = "webdataset"
     XML = "xml"
+
+    def url(self, url: str) -> str:
+        """
+        Returns a DeltaCAT URL string for this datastore type and the given base URL.
+        Typically, DeltaCAT URLs will be of the form <DatastoreType>+<URL>.
+
+        However, the following Datastore Types don't follow the <DatastoreType>+<URL> convention:
+
+        {DatastoreType.MONGO}: <mongodb_uri>?database=<db_name>&collection=<collection_name>&...
+        {DatastoreType.BIGQUERY}: bigquery://<project_id>/<dataset>?param1=val1&...
+        {DatastoreType.CLICKHOUSE}: <clickhouse_dsn>?table=<table_name>?param1=val1&...
+        {DatastoreType.DATABRICKS_TABLES}: databricks://<warehouse_id>?param1=val1&...
+        {DatastoreType.ICEBERG}: iceberg://<table_identifier>?param1=val1&...
+
+        Args:
+            url: The base URL to convert to a DeltaCAT URL.
+
+        Returns:
+            A DeltaCAT URL string for this datastore type and the given URL.
+        """
+        if self == DatastoreType.BIGQUERY:
+            raise ValueError(
+                f"No DataStore URL for BigQuery. Use a URL of the form: bigquery://<project_id>/<dataset>?param1=val1&..."
+            )
+        if self == DatastoreType.CLICKHOUSE:
+            raise ValueError(
+                f"No DataStore URL for ClickHouse. Use a URL of the form: <clickhouse_dsn>?table=<table_name>?param1=val1&..."
+            )
+        if self == DatastoreType.DATABRICKS_TABLES:
+            raise ValueError(
+                f"No DataStore URL for Databricks. Use a URL of the form: databricks://<warehouse_id>?param1=val1&..."
+            )
+        if self == DatastoreType.ICEBERG:
+            raise ValueError(
+                f"No DataStore URL for Iceberg. Use a URL of the form: iceberg://<table_identifier>?param1=val1&..."
+            )
+        if self == DatastoreType.MONGO:
+            raise ValueError(
+                f"No DataStore URL for MongoDB. Use a URL of the form: <mongodb_uri>?database=<db_name>&collection=<collection_name>&..."
+            )
+        if self in [
+            DatastoreType.DELTACAT,
+            DatastoreType.DELTACAT_NAMESPACE,
+            DatastoreType.DELTACAT_TABLE,
+            DatastoreType.DELTACAT_TABLE_VERSION,
+            DatastoreType.DELTACAT_STREAM,
+            DatastoreType.DELTACAT_PARTITION,
+            DatastoreType.DELTACAT_DELTA,
+        ]:
+            raise ValueError(
+                f"No DataStore URL for DeltaCAT. Use a URL of the form: dc://<catalog>/[namespace]/[table]/[tableversion]/[stream]/[partition]/[delta]"
+            )
+        return f"{self.value}+{url}"
+
+    @staticmethod
+    def from_url(url: str) -> DatastoreType:
+        """
+        Returns an inferred DatastoreType for the given URL.
+
+        Args:
+            url: The URL or file path to analyze for datastore type inference.
+
+        Returns:
+            An inferred DatastoreType for the given URL.
+
+        Raises:
+            ValueError: If a DatastoreType cannot be inferred from the given URL.
+        """
+        # Detect by prefix first
+        # DeltaCAT URLs
+        if url.startswith("dc://"):
+            return DatastoreType.DELTACAT
+
+        # External Datastore Types
+        if url.startswith("hudi+") or url.startswith("hudi://"):
+            return DatastoreType.HUDI
+        if url.startswith("iceberg+") or url.startswith("iceberg://"):
+            return DatastoreType.ICEBERG
+        if url.startswith("deltalake+") or url.startswith("deltalake://"):
+            return DatastoreType.DELTA_LAKE
+        if url.startswith("deltasharing+") or url.startswith("deltasharing://"):
+            return DatastoreType.DELTA_SHARING
+        if url.startswith("bigquery+") or url.startswith("bigquery://"):
+            return DatastoreType.BIGQUERY
+        if url.startswith("clickhouse+") or url.startswith("clickhouse://"):
+            return DatastoreType.CLICKHOUSE
+        if url.startswith("databricks+") or url.startswith("databricks://"):
+            return DatastoreType.DATABRICKS_TABLES
+        if url.startswith("mongodb+") or url.startswith("mongodb://"):
+            return DatastoreType.MONGO
+
+        # File Format Types
+        if url.startswith("binary+") or url.startswith("binary://"):
+            return DatastoreType.BINARY
+        if url.startswith("csv+") or url.startswith("csv://"):
+            return DatastoreType.CSV
+        if url.startswith("json+") or url.startswith("json://"):
+            return DatastoreType.JSON
+        if url.startswith("avro+") or url.startswith("avro://"):
+            return DatastoreType.AVRO
+        if url.startswith("orc+") or url.startswith("orc://"):
+            return DatastoreType.ORC
+        if url.startswith("feather+") or url.startswith("feather://"):
+            return DatastoreType.FEATHER
+        if url.startswith("numpy+") or url.startswith("numpy://"):
+            return DatastoreType.NUMPY
+        if url.startswith("parquet+") or url.startswith("parquet://"):
+            return DatastoreType.PARQUET
+        if url.startswith("hdf+") or url.startswith("hdf://"):
+            return DatastoreType.HDF
+        if url.startswith("lance+") or url.startswith("lance://"):
+            return DatastoreType.LANCE
+        if url.startswith("tfrecords+") or url.startswith("tfrecords://"):
+            return DatastoreType.TFRECORDS
+        if url.startswith("webdataset+") or url.startswith("webdataset://"):
+            return DatastoreType.WEBDATASET
+
+        # Text and Web Types
+        if url.startswith("text+") or url.startswith("text://"):
+            return DatastoreType.TEXT
+        if url.startswith("html+") or url.startswith("html://"):
+            return DatastoreType.HTML
+        if url.startswith("warc+") or url.startswith("warc://"):
+            return DatastoreType.WARC
+        if url.startswith("xml+") or url.startswith("xml://"):
+            return DatastoreType.XML
+
+        # Media Types
+        if url.startswith("audio+") or url.startswith("audio://"):
+            return DatastoreType.AUDIO
+        if url.startswith("images+") or url.startswith("images://"):
+            return DatastoreType.IMAGES
+        if url.startswith("videos+") or url.startswith("videos://"):
+            return DatastoreType.VIDEOS
+
+        extension = "." + url.split(".")[-1].lower()
+
+        # Fallback to file-extensions
+        if extension in [".parquet", ".pq"]:
+            return DatastoreType.PARQUET
+        if extension == ".csv":
+            return DatastoreType.CSV
+        if extension == ".json":
+            return DatastoreType.JSON
+        if extension == ".avro":
+            return DatastoreType.AVRO
+        if extension == ".orc":
+            return DatastoreType.ORC
+        if extension == ".feather":
+            return DatastoreType.FEATHER
+        if extension == ".npy":
+            return DatastoreType.NUMPY
+
+        # Text formats
+        if extension in [".txt", ".text", ".md"]:
+            return DatastoreType.TEXT
+
+        # Data science formats
+        if extension in [".hdf", ".h5", ".hdf5"]:
+            return DatastoreType.HDF
+        if extension == ".lance":
+            return DatastoreType.LANCE
+        if extension in [".tfrecords", ".tfrecord"]:
+            return DatastoreType.TFRECORDS
+        if extension == ".webdataset":
+            return DatastoreType.WEBDATASET
+
+        # Web formats
+        if extension in [".html", ".htm"]:
+            return DatastoreType.HTML
+        if extension == ".warc":
+            return DatastoreType.WARC
+        if extension == ".xml":
+            return DatastoreType.XML
+
+        # Binary formats
+        if extension in [".bin", ".exe", ".dll", ".so", ".dylib", ".a", ".lib"]:
+            return DatastoreType.BINARY
+
+        # Media formats - Images
+        if extension in [
+            ".jpg",
+            ".jpeg",
+            ".png",
+            ".gif",
+            ".bmp",
+            ".tiff",
+            ".tif",
+            ".ico",
+            ".webp",
+            ".svg",
+            ".heic",
+            ".heif",
+            ".jp2",
+            ".jfif",
+            ".pjpeg",
+            ".pjp",
+        ]:
+            return DatastoreType.IMAGES
+
+        # Media formats - Videos
+        if extension in [
+            ".mp4",
+            ".mov",
+            ".avi",
+            ".mkv",
+            ".webm",
+            ".flv",
+            ".wmv",
+            ".m4v",
+            ".3gp",
+            ".3g2",
+            ".f4v",
+            ".asf",
+            ".rm",
+            ".rmvb",
+            ".vob",
+            ".ogv",
+            ".drc",
+            ".mng",
+            ".qt",
+            ".yuv",
+            ".mpg",
+            ".mpeg",
+            ".m2v",
+            ".m2ts",
+            ".mts",
+            ".ts",
+        ]:
+            return DatastoreType.VIDEOS
+
+        # Media formats - Audio
+        if extension in [
+            ".mp3",
+            ".wav",
+            ".ogg",
+            ".flac",
+            ".aac",
+            ".m4a",
+            ".m4b",
+            ".m4p",
+            ".wma",
+            ".ra",
+            ".amr",
+            ".ape",
+            ".au",
+            ".gsm",
+            ".dss",
+            ".dvf",
+            ".msv",
+            ".opus",
+            ".tta",
+            ".voc",
+            ".vox",
+            ".wv",
+            ".3ga",
+            ".ac3",
+            ".adt",
+            ".adts",
+        ]:
+            return DatastoreType.AUDIO
+
+        # Default to binary
+        return DatastoreType.BINARY
+
+    @staticmethod
+    def get_url(url: str) -> str:
+        """
+        Returns a DeltaCAT URL string with an inferred datastore type for the given URL.
+
+        Args:
+            url: The URL or file path to analyze for datastore type inference.
+
+        Returns:
+            A DeltaCAT URL string for the inferred datastore type.
+
+        Raises:
+            ValueError: If a DeltaCAT URL cannot be inferred from the given URL.
+        """
+        return DatastoreType.from_url(url).url(url)
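
A hedged sketch of how the new DatastoreType URL helpers above could be used. The file and bucket paths are invented, and the expected output strings assume the enum values follow the lowercase member names visible in the diff (warc, webdataset, xml).

from deltacat.types.media import DatastoreType

# Explicit: build a DeltaCAT URL of the form <DatastoreType>+<URL>.
print(DatastoreType.PARQUET.url("s3://my-bucket/data/part-0.parquet"))
# -> parquet+s3://my-bucket/data/part-0.parquet

# Inference: prefix match first, then file extension, defaulting to BINARY.
print(DatastoreType.from_url("csv+s3://my-bucket/data.csv"))  # DatastoreType.CSV
print(DatastoreType.from_url("/tmp/logs/events.json"))        # DatastoreType.JSON
print(DatastoreType.from_url("/tmp/unknown.blob"))            # DatastoreType.BINARY

# Convenience: infer the type and return the prefixed DeltaCAT URL in one call.
print(DatastoreType.get_url("/tmp/images/cat.png"))
# -> images+/tmp/images/cat.png

# Types with their own URL schemes (BIGQUERY, CLICKHOUSE, DATABRICKS_TABLES,
# ICEBERG, MONGO, and the DELTACAT_* members) raise ValueError from .url() and
# expect their native URL form instead.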
deltacat/types/tables.py
CHANGED
@@ -690,7 +690,7 @@ def _convert_all(tables: List[LocalTable], conversion_fn: Callable, **kwargs):
     if not tables:  # Empty list
         return pd.DataFrame()

-    # Convert list elements
+    # Convert list elements to the same type
     all_tables = []
     for i, table in enumerate(tables):
         try:
@@ -699,15 +699,9 @@ def _convert_all(tables: List[LocalTable], conversion_fn: Callable, **kwargs):
         except Exception as e:
             raise ValueError(f"Failed to convert list element {i}: {e}") from e

-    # Concatenate with error handling
+    # Concatenate with error handling
     try:
-
-        if all(isinstance(table, pa.Table) for table in all_tables):
-            # Use PyArrow concatenation for PyArrow tables
-            return pa.concat_tables(all_tables, promote_options="permissive")
-        else:
-            # Use pandas concatenation for other types
-            return pd.concat(all_tables, ignore_index=True, sort=False)
+        return concat_tables(all_tables, get_dataset_type(all_tables[0]))
     except Exception as e:
         raise ValueError(f"Failed to concatenate {len(all_tables)} tables: {e}") from e

@@ -879,7 +873,7 @@ def get_table_slicer(table: Union[LocalTable, DistributedDataset]) -> Callable:
     return _get_table_function(table, TABLE_CLASS_TO_SLICER_FUNC, "slicer")


-def get_dataset_type(dataset:
+def get_dataset_type(dataset: Union[LocalTable, DistributedDataset]) -> DatasetType:
     """Get the DatasetType enum value for a given dataset object.

     Args:
@@ -1382,7 +1376,7 @@ class UuidBlockWritePathProvider(FilenameProvider):
         self.write_paths.append(write_path)
         if block is not None:
             self.blocks.append(block)
-        return
+        return filename

     def __call__(
         self,
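
A short sketch of the type-aware concatenation that _convert_all now delegates to. It assumes concat_tables, get_dataset_type, and DatasetType are importable from deltacat.types.tables, as they are used in the diff above; the sample frames are invented.

import pandas as pd
import pyarrow as pa

from deltacat.types.tables import DatasetType, concat_tables, get_dataset_type

frames = [
    pd.DataFrame({"id": [1, 2]}),
    pd.DataFrame({"id": [3]}),
]

# get_dataset_type() accepts any LocalTable or DistributedDataset and returns
# the matching DatasetType member.
print(get_dataset_type(frames[0]))              # DatasetType.PANDAS
print(get_dataset_type(pa.table({"id": [1]})))  # DatasetType.PYARROW

# Concatenation is now dispatched on the dataset type of the first element
# instead of special-casing PyArrow vs. pandas inline.
combined = concat_tables(frames, get_dataset_type(frames[0]))
print(len(combined))  # 3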
deltacat/utils/pandas.py
CHANGED
@@ -1,6 +1,7 @@
 import csv
 import logging
 import math
+import posixpath
 import bz2
 import gzip
 from functools import partial
@@ -403,12 +404,18 @@ def slice_dataframe(
     return dataframes


-def concat_dataframes(
+def concat_dataframes(
+    dataframes: List[pd.DataFrame],
+    axis: int = 0,
+    copy: bool = False,
+    ignore_index: bool = True,
+    **kwargs,
+) -> Optional[pd.DataFrame]:
     if dataframes is None or not len(dataframes):
         return None
     if len(dataframes) == 1:
         return next(iter(dataframes))
-    return pd.concat(dataframes, axis=
+    return pd.concat(dataframes, axis=axis, copy=copy, ignore_index=ignore_index)


 def append_column_to_dataframe(
@@ -807,5 +814,6 @@ def dataframe_to_file(
         f"implemented. Known content types: "
         f"{CONTENT_TYPE_TO_PD_WRITE_FUNC.keys}"
     )
-
+    filename = block_path_provider(base_path)
+    path = posixpath.join(base_path, filename)
     writer(dataframe, path, filesystem=filesystem, **writer_kwargs)
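
A quick sketch of the now-explicit concat_dataframes signature, using made-up frames; the defaults shown (axis=0, copy=False, ignore_index=True) are taken directly from the diff.

import pandas as pd

from deltacat.utils.pandas import concat_dataframes

a = pd.DataFrame({"x": [1, 2]})
b = pd.DataFrame({"x": [3]})

# Defaults now spelled out in the signature: row-wise concatenation with a
# reset index and no defensive copy.
print(concat_dataframes([a, b]))

# Edge cases preserved by the function: None or empty input returns None, and
# a single frame is returned as-is without calling pd.concat.
print(concat_dataframes(None))      # None
print(concat_dataframes([a]) is a)  # True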
deltacat/utils/polars.py
CHANGED
@@ -1,4 +1,5 @@
 import logging
+import posixpath
 import bz2
 import gzip
 from functools import partial
@@ -274,7 +275,8 @@ def dataframe_to_file(
         f"implemented. Known content types: "
         f"{CONTENT_TYPE_TO_PL_WRITE_FUNC.keys()}"
     )
-
+    filename = block_path_provider(base_path)
+    path = posixpath.join(base_path, filename)
     logger.debug(f"Writing table: {table} with kwargs: {writer_kwargs} to path: {path}")
     writer(table, path, filesystem=filesystem, **writer_kwargs)

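
The pandas, polars, and pyarrow writers now share the same path convention: the block path provider returns only a filename, which the writer joins onto the base path with posixpath. This pairs with the UuidBlockWritePathProvider change in deltacat/types/tables.py that returns the generated filename instead of None. A standalone sketch of that convention, with invented names and an illustrative filename format:

import posixpath
import uuid


def uuid_block_path_provider(base_path: str) -> str:
    # Hypothetical stand-in for UuidBlockWritePathProvider, which now returns
    # the generated filename rather than None; the ".parquet" suffix here is
    # only for illustration.
    return f"{uuid.uuid4()}.parquet"


base_path = "s3://my-bucket/my-table"       # hypothetical write destination
filename = uuid_block_path_provider(base_path)
path = posixpath.join(base_path, filename)  # full path handed to the writer
print(path)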
deltacat/utils/pyarrow.py
CHANGED
@@ -6,6 +6,7 @@ import bz2
 import gzip
 import io
 import logging
+import posixpath
 from functools import partial
 from typing import Any, Callable, Dict, Iterable, List, Optional, Union, Tuple
 from datetime import datetime, date
@@ -1027,7 +1028,8 @@ def table_to_file(
         f"implemented. Known content types: "
         f"{CONTENT_TYPE_TO_PA_WRITE_FUNC.keys}"
     )
-
+    filename = block_path_provider(base_path)
+    path = posixpath.join(base_path, filename)
     writer_kwargs = content_type_to_writer_kwargs(content_type)
     writer_kwargs.update(kwargs)
     logger.debug(f"Writing table: {table} with kwargs: {writer_kwargs} to path: {path}")
@@ -1493,7 +1495,9 @@ def file_to_parquet(


 def concat_tables(
-    tables: List[Union[pa.Table, papq.ParquetFile]]
+    tables: List[Union[pa.Table, papq.ParquetFile]],
+    promote_options: Optional[str] = "permissive",
+    **kwargs,
 ) -> Optional[Union[pa.Table, List[papq.ParquetFile]]]:
     """
     Concatenate a list of PyArrow Tables or ParquetFiles.
@@ -1525,7 +1529,7 @@ def concat_tables(
         else:
             converted_tables.append(table)

-    return pa.concat_tables(converted_tables)
+    return pa.concat_tables(converted_tables, promote_options=promote_options, **kwargs)


 def delta_manifest_to_table(
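
Finally, a sketch of the widened deltacat.utils.pyarrow.concat_tables signature, which now forwards promote_options (defaulting to "permissive") and any extra keyword arguments to pa.concat_tables. The sample tables are invented.

import pyarrow as pa

from deltacat.utils.pyarrow import concat_tables

t1 = pa.table({"id": [1, 2]})
t2 = pa.table({"id": [3], "name": ["x"]})  # extra column relative to t1

# With the default promote_options="permissive", differing-but-compatible
# schemas are unified and missing columns are filled with nulls.
merged = concat_tables([t1, t2])
print(merged.column_names)  # e.g. ['id', 'name']
print(merged.num_rows)      # 3

# Stricter behavior remains available by overriding the default, e.g.
# concat_tables([t1, t2], promote_options="none") would raise on the mismatch.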
|