deltacat-2.0.0b7-py3-none-any.whl → deltacat-2.0.0b10-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. deltacat/__init__.py +27 -6
  2. deltacat/api.py +478 -123
  3. deltacat/aws/s3u.py +2 -2
  4. deltacat/benchmarking/conftest.py +1 -1
  5. deltacat/catalog/main/impl.py +12 -6
  6. deltacat/catalog/model/catalog.py +65 -47
  7. deltacat/catalog/model/properties.py +1 -3
  8. deltacat/compute/__init__.py +14 -0
  9. deltacat/compute/converter/constants.py +5 -0
  10. deltacat/compute/converter/converter_session.py +78 -36
  11. deltacat/compute/converter/model/convert_input.py +24 -4
  12. deltacat/compute/converter/model/convert_result.py +61 -0
  13. deltacat/compute/converter/model/converter_session_params.py +52 -10
  14. deltacat/compute/converter/pyiceberg/overrides.py +181 -62
  15. deltacat/compute/converter/steps/convert.py +84 -36
  16. deltacat/compute/converter/steps/dedupe.py +25 -4
  17. deltacat/compute/converter/utils/convert_task_options.py +42 -13
  18. deltacat/compute/converter/utils/iceberg_columns.py +5 -0
  19. deltacat/compute/converter/utils/io.py +82 -11
  20. deltacat/compute/converter/utils/s3u.py +13 -4
  21. deltacat/compute/jobs/__init__.py +0 -0
  22. deltacat/compute/jobs/client.py +404 -0
  23. deltacat/constants.py +4 -4
  24. deltacat/daft/daft_scan.py +7 -3
  25. deltacat/daft/translator.py +126 -0
  26. deltacat/examples/basic_logging.py +5 -3
  27. deltacat/examples/hello_world.py +4 -2
  28. deltacat/examples/indexer/__init__.py +0 -0
  29. deltacat/examples/indexer/aws/__init__.py +0 -0
  30. deltacat/examples/indexer/gcp/__init__.py +0 -0
  31. deltacat/examples/indexer/indexer.py +163 -0
  32. deltacat/examples/indexer/job_runner.py +199 -0
  33. deltacat/io/__init__.py +13 -0
  34. deltacat/io/dataset/__init__.py +0 -0
  35. deltacat/io/dataset/deltacat_dataset.py +91 -0
  36. deltacat/io/datasink/__init__.py +0 -0
  37. deltacat/io/datasink/deltacat_datasink.py +207 -0
  38. deltacat/io/datasource/__init__.py +0 -0
  39. deltacat/io/datasource/deltacat_datasource.py +580 -0
  40. deltacat/io/reader/__init__.py +0 -0
  41. deltacat/io/reader/deltacat_read_api.py +172 -0
  42. deltacat/storage/__init__.py +2 -0
  43. deltacat/storage/model/expression/__init__.py +47 -0
  44. deltacat/storage/model/expression/expression.py +656 -0
  45. deltacat/storage/model/expression/visitor.py +248 -0
  46. deltacat/storage/model/metafile.py +74 -42
  47. deltacat/storage/model/scan/push_down.py +32 -5
  48. deltacat/storage/model/types.py +5 -3
  49. deltacat/storage/rivulet/__init__.py +4 -4
  50. deltacat/tests/_io/reader/__init__.py +0 -0
  51. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  52. deltacat/tests/compute/converter/test_convert_session.py +209 -46
  53. deltacat/tests/local_deltacat_storage/__init__.py +1 -0
  54. deltacat/tests/storage/model/test_expression.py +327 -0
  55. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +2 -1
  56. deltacat/tests/storage/rivulet/test_dataset.py +1 -1
  57. deltacat/tests/storage/rivulet/test_manifest.py +1 -1
  58. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +1 -1
  59. deltacat/tests/test_deltacat_api.py +50 -9
  60. deltacat/types/media.py +141 -43
  61. deltacat/types/tables.py +35 -7
  62. deltacat/utils/daft.py +2 -2
  63. deltacat/utils/filesystem.py +39 -9
  64. deltacat/utils/polars.py +128 -0
  65. deltacat/utils/pyarrow.py +151 -15
  66. deltacat/utils/ray_utils/concurrency.py +1 -1
  67. deltacat/utils/ray_utils/runtime.py +56 -4
  68. deltacat/utils/url.py +1284 -0
  69. {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/METADATA +9 -6
  70. {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/RECORD +73 -48
  71. {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/LICENSE +0 -0
  72. {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/WHEEL +0 -0
  73. {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
 import pytest

-from deltacat import Dataset
+from deltacat.storage.rivulet.dataset import Dataset
 from deltacat.storage.rivulet.fs.file_provider import FileProvider
 from deltacat.storage.rivulet.fs.file_store import FileStore
 from deltacat.storage.rivulet.metastore.delta import DeltacatManifestIO
@@ -1,29 +1,39 @@
 import shutil
 import tempfile
+
 import deltacat as dc
+from deltacat.constants import METAFILE_FORMAT_MSGPACK
+from deltacat import Namespace, DeltaCatUrl, DatasetType
+from deltacat.storage import Metafile
+
+from deltacat.io import (
+    METAFILE_TYPE_COLUMN_NAME,
+    METAFILE_DATA_COLUMN_NAME,
+)


 class TestDeltaCAT:
     @classmethod
-    def setup_class(cls):
+    def setup_method(cls):
         cls.temp_dir_1 = tempfile.mkdtemp()
         cls.temp_dir_2 = tempfile.mkdtemp()
         # Initialize DeltaCAT with two local catalogs.
-        dc.put("test_catalog_1", root=cls.temp_dir_1)
-        dc.put("test_catalog_2", root=cls.temp_dir_2)
+        dc.init()
+        dc.put(DeltaCatUrl("dc://test_catalog_1"), root=cls.temp_dir_1)
+        dc.put(DeltaCatUrl("dc://test_catalog_2"), root=cls.temp_dir_2)

     @classmethod
-    def teardown_class(cls):
+    def teardown_method(cls):
         shutil.rmtree(cls.temp_dir_1)
         shutil.rmtree(cls.temp_dir_2)

     def test_cross_catalog_namespace_copy(self):
         # Given two empty DeltaCAT catalogs.
         # When a namespace is copied across catalogs.
-        namespace_src = dc.put("test_catalog_1/test_namespace")
+        namespace_src = dc.put(DeltaCatUrl("dc://test_catalog_1/test_namespace"))
         namespace_dst = dc.copy(
-            "test_catalog_1/test_namespace",
-            "test_catalog_2",
+            DeltaCatUrl("dc://test_catalog_1/test_namespace"),
+            DeltaCatUrl("dc://test_catalog_2/test_namespace"),
         )
         # Expect the catalog namespace created in each catalog
         # method to be equivalent and equal to the source namespace.
@@ -33,7 +43,38 @@ class TestDeltaCAT:
         # When each catalog namespace is fetched explicitly
         # Expect them to be equivalent but not equal
         # (due to different metafile IDs).
-        actual_namespace_src = dc.get("test_catalog_1/test_namespace")
-        actual_namespace_dst = dc.get("test_catalog_2/test_namespace")
+        actual_namespace_src = dc.get(DeltaCatUrl("dc://test_catalog_1/test_namespace"))
+        actual_namespace_dst = dc.get(DeltaCatUrl("dc://test_catalog_2/test_namespace"))
         assert actual_namespace_src.equivalent_to(actual_namespace_dst)
         assert not actual_namespace_src == actual_namespace_dst
+
+    def test_catalog_listing_shallow_local_metafiles(self):
+        # Given two empty DeltaCAT catalogs.
+        # When a namespace is put in the catalog.
+        namespace_src: Namespace = dc.put(
+            DeltaCatUrl("dc://test_catalog_1/test_namespace")
+        )
+        # Expect the namespace to be listed.
+        assert any(
+            namespace_src.equivalent_to(other)
+            for other in dc.list(DeltaCatUrl("dc://test_catalog_1"))
+        )
+
+    def test_catalog_listing_shallow_ray_dataset(self):
+        # Given two empty DeltaCAT catalogs.
+        # When a namespace is put in the catalog.
+        namespace_src: Namespace = dc.put(
+            DeltaCatUrl("dc://test_catalog_1/test_namespace")
+        )
+        # Expect the namespace to be listed.
+        dataset = dc.list(
+            DeltaCatUrl("dc://test_catalog_1"),
+            dataset_type=DatasetType.RAY_DATASET,
+        )
+        actual_namespace = Metafile.deserialize(
+            serialized=dataset.take(1)[0][METAFILE_DATA_COLUMN_NAME],
+            meta_format=METAFILE_FORMAT_MSGPACK,
+        )
+        assert actual_namespace.equivalent_to(namespace_src)
+        namespace_type = dataset.take(1)[0][METAFILE_TYPE_COLUMN_NAME]
+        assert namespace_type == "Namespace"
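The rewritten tests above also serve as a reference for the new URL-addressed catalog API (dc.init, DeltaCatUrl, and URL-based put/get/copy/list). A minimal sketch of the same calls outside of pytest, reusing only functions shown in the hunk; the catalog and namespace names are illustrative:

import tempfile

import deltacat as dc
from deltacat import DeltaCatUrl

# Register a local catalog rooted at a temporary directory.
dc.init()
dc.put(DeltaCatUrl("dc://demo_catalog"), root=tempfile.mkdtemp())

# Create a namespace, fetch it back, and list the catalog's children.
namespace = dc.put(DeltaCatUrl("dc://demo_catalog/demo_namespace"))
fetched = dc.get(DeltaCatUrl("dc://demo_catalog/demo_namespace"))
assert fetched.equivalent_to(namespace)
for metafile in dc.list(DeltaCatUrl("dc://demo_catalog")):
    print(metafile)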
deltacat/types/media.py CHANGED
@@ -1,30 +1,48 @@
 from enum import Enum
-from typing import Dict, Set
+from typing import Set


 class ContentType(str, Enum):
-    # See also:
-    # https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.17
-    # https://www.iana.org/assignments/media-types/media-types.xhtml
+    """
+    Enumeration used to resolve the entity-body Media Type (formerly known as
+    MIME type) in an HTTP request.
+
+    https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.17
+
+    https://www.iana.org/assignments/media-types/media-types.xhtml
+    """

     # IANA registered types
+    AVRO = "application/avro"
+    BINARY = "application/octet-stream"
     CSV = "text/csv"
+    HDF = "application/x-hdf"
+    HTML = "text/html"
     JSON = "application/json"
+    TEXT = "text/plain"
+    WEBDATASET = "application/x-web-dataset"
+    XML = "text/xml"

     # unregistered types
-    TSV = "text/tsv"
-    PSV = "text/psv"
-    PARQUET = "application/parquet"
-    ORC = "application/orc"
     FEATHER = "application/feather"
-    UNESCAPED_TSV = "application/x-amzn-unescaped-tsv"
     ION = "application/x-amzn-ion"
+    ORC = "application/orc"
+    PARQUET = "application/parquet"
+    PSV = "text/psv"
+    TSV = "text/tsv"
+    UNESCAPED_TSV = "application/x-amzn-unescaped-tsv"


 class ContentEncoding(str, Enum):
-    # See also:
-    # https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.11
-    # http://www.iana.org/assignments/http-parameters/http-parameters.xhtml#content-coding
+    """
+    Enumeration used as a modifier for :class:`deltacat.types.media.ContentType`
+    to indicate that additional encodings have been applied to the entity-body
+    Media Type in an HTTP request.
+
+    https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.11
+
+    http://www.iana.org/assignments/http-parameters/http-parameters.xhtml#content-coding
+    """

     # IANA registered encodings
     GZIP = "gzip"
@@ -37,27 +55,6 @@ class ContentEncoding(str, Enum):
     SNAPPY = "snappy"


-class TableType(str, Enum):
-    PYARROW = "pyarrow"
-    PANDAS = "pandas"
-    NUMPY = "numpy"
-    PYARROW_PARQUET = "pyarrow_parquet"
-
-
-class DistributedDatasetType(str, Enum):
-    DAFT = "daft"
-    RAY_DATASET = "ray_dataset"
-
-
-class SchemaType(str, Enum):
-    ARROW = "arrow"
-
-
-class StorageType(str, Enum):
-    LOCAL = "local"
-    DISTRIBUTED = "distributed"
-
-
 DELIMITED_TEXT_CONTENT_TYPES: Set[str] = {
     ContentType.UNESCAPED_TSV.value,
     ContentType.TSV.value,
@@ -73,6 +70,7 @@ TABULAR_CONTENT_TYPES: Set[str] = {
     ContentType.PARQUET.value,
     ContentType.ORC.value,
     ContentType.FEATHER.value,
+    ContentType.AVRO.value,
 }

 EXPLICIT_COMPRESSION_CONTENT_TYPES: Set[str] = {
@@ -83,13 +81,113 @@ EXPLICIT_COMPRESSION_CONTENT_TYPES: Set[str] = {
     ContentType.JSON.value,
 }

-CONTENT_TYPE_TO_USER_KWARGS_KEY: Dict[str, str] = {
-    ContentType.UNESCAPED_TSV.value: "unescaped_tsv",
-    ContentType.TSV.value: "csv",
-    ContentType.CSV.value: "csv",
-    ContentType.PSV.value: "csv",
-    ContentType.PARQUET.value: "parquet",
-    ContentType.FEATHER.value: "feather",
-    ContentType.ORC.value: "orc",
-    ContentType.JSON.value: "json",
-}
+
+class DatasetType(str, Enum):
+    """
+    Enumeration used to identify the in-memory local or distributed dataset
+    to be used for file IO, queries, and data transformation. Typically used
+    together with :class:`deltacat.types.media.DatastoreType` to resolve the
+    compute layer that will be responsible for reading, transforming, and
+    writing data to a given datastore.
+    """
+
+    # local
+    NUMPY = "numpy"  # numpy.ndarray
+    PANDAS = "pandas"  # pandas.DataFrame
+    POLARS = "polars"  # polars.DataFrame
+    PYARROW = "pyarrow"  # pyarrow.Table
+    PYARROW_PARQUET = "pyarrow_parquet"  # pyarrow.parquet.ParquetFile
+
+    # distributed
+    DAFT = "daft"  # daft.DataFrame
+    RAY_DATASET = "ray_dataset"  # ray.data.Dataset
+
+    @staticmethod
+    def distributed():
+        return {
+            DatasetType.DAFT,
+            DatasetType.RAY_DATASET,
+        }
+
+    @staticmethod
+    def local():
+        return {
+            DatasetType.NUMPY,
+            DatasetType.PANDAS,
+            DatasetType.POLARS,
+            DatasetType.PYARROW,
+            DatasetType.PYARROW_PARQUET,
+        }
+
+
+# deprecated by DatasetType - populated dynamically for backwards compatibility
+TableType = Enum(
+    "TableType",
+    {d.name: d.value for d in DatasetType.local()},
+)
+
+# deprecated by DatasetType - populated dynamically for backwards compatibility
+DistributedDatasetType = Enum(
+    "DistributedDatasetType",
+    {d.name: d.value for d in DatasetType.distributed()},
+)
+
+
+# deprecated by DatasetType.local() and DatasetType.distributed()
+# kept for backwards compatibility
+class StorageType(str, Enum):
+    LOCAL = "local"
+    DISTRIBUTED = "distributed"
+
+
+class DatastoreType(str, Enum):
+    """
+    Enumeration used to identify the type of reader required to connect to and
+    correctly interpret data stored at a given path. Typically used together
+    with :class:`deltacat.types.media.DatasetType` to resolve a reader or
+    writer for that data store. Note that, although some overlap exists between
+    enum values here and in :class:`deltacat.types.media.ContentType`, each
+    enum serves a different purpose. The purpose of
+    :class:`deltacat.types.media.ContentType` is to resolve the MIME type for
+    specific types of files, and may be used together with multi-content-type
+    datastore types to describe the specific file types read/written to that
+    datastore (e.g., Iceberg, Hudi, Delta Lake, Audio, Images, Video, etc.)
+    """
+
+    # DeltaCAT Catalog Datasets
+    DELTACAT = "dc"
+    DELTACAT_NAMESPACE = "namespace"
+    DELTACAT_TABLE = "table"
+    DELTACAT_TABLE_VERSION = "tableversion"
+    DELTACAT_STREAM = "stream"
+    DELTACAT_PARTITION = "partition"
+    DELTACAT_DELTA = "delta"
+
+    # External Datasets
+    AUDIO = "audio"
+    AVRO = "avro"
+    BIGQUERY = "bigquery"
+    BINARY = "binary"
+    CSV = "csv"
+    CLICKHOUSE = "clickhouse"
+    DATABRICKS_TABLES = "databricks"
+    DELTA_LAKE = "deltalake"
+    DELTA_SHARING = "deltasharing"
+    FEATHER = "feather"
+    HDF = "hdf"
+    HTML = "html"
+    HUDI = "hudi"
+    ICEBERG = "iceberg"
+    IMAGES = "images"
+    JSON = "json"
+    LANCE = "lance"
+    MONGO = "mongodb"
+    NUMPY = "numpy"
+    ORC = "orc"
+    PARQUET = "parquet"
+    TEXT = "text"
+    TFRECORDS = "tfrecords"
+    VIDEOS = "videos"
+    WARC = "warc"
+    WEBDATASET = "webdataset"
+    XML = "xml"
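The reworked media.py above removes the old TableType, DistributedDatasetType, SchemaType, and StorageType definitions, introduces DatasetType and DatastoreType, and rebuilds TableType, DistributedDatasetType, and StorageType for backwards compatibility. A minimal sketch of how the new and legacy names relate, assuming only the exports shown in the hunk:

from deltacat.types.media import (
    DatasetType,
    DatastoreType,
    TableType,  # deprecated alias, rebuilt from DatasetType.local()
)

# DatasetType distinguishes local from distributed in-memory datasets.
assert DatasetType.POLARS in DatasetType.local()
assert DatasetType.RAY_DATASET in DatasetType.distributed()

# Value-based comparisons against the deprecated enum keep working.
assert TableType.PYARROW.value == DatasetType.PYARROW.value == "pyarrow"

# DatastoreType names where the data lives, not how it is held in memory.
assert DatastoreType.ICEBERG.value == "iceberg"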
deltacat/types/tables.py CHANGED
@@ -3,9 +3,10 @@ from typing import Callable, Dict, Type, Union

 import numpy as np
 import pandas as pd
+import polars as pl
 import pyarrow as pa
 import pyarrow.parquet as papq
-from ray.data.dataset import Dataset
+from ray.data.dataset import Dataset as RayDataset
 from ray.data.read_api import (
     from_arrow,
     from_arrow_refs,
@@ -18,11 +19,12 @@ import deltacat.storage as dcs
 from deltacat.types.media import TableType, DistributedDatasetType
 from deltacat.utils import numpy as np_utils
 from deltacat.utils import pandas as pd_utils
+from deltacat.utils import polars as pl_utils
 from deltacat.utils import pyarrow as pa_utils
 from deltacat.utils import daft as daft_utils
 from deltacat.utils.ray_utils import dataset as ds_utils

-TABLE_TYPE_TO_READER_FUNC: Dict[int, Callable] = {
+TABLE_TYPE_TO_S3_READER_FUNC: Dict[int, Callable] = {
     TableType.PYARROW_PARQUET.value: pa_utils.s3_file_to_parquet,
     TableType.PYARROW.value: pa_utils.s3_file_to_table,
     TableType.PANDAS.value: pd_utils.s3_file_to_dataframe,
@@ -34,8 +36,9 @@ TABLE_CLASS_TO_WRITER_FUNC: Dict[
 ] = {
     pa.Table: pa_utils.table_to_file,
     pd.DataFrame: pd_utils.dataframe_to_file,
+    pl.DataFrame: pl_utils.dataframe_to_file,
     np.ndarray: np_utils.ndarray_to_file,
-    Dataset: ds_utils.dataset_to_file,
+    RayDataset: ds_utils.dataset_to_file,
 }

 TABLE_CLASS_TO_SLICER_FUNC: Dict[
@@ -43,8 +46,9 @@ TABLE_CLASS_TO_SLICER_FUNC: Dict[
 ] = {
     pa.Table: pa_utils.slice_table,
     pd.DataFrame: pd_utils.slice_dataframe,
+    pl.DataFrame: pl_utils.slice_table,
     np.ndarray: np_utils.slice_ndarray,
-    Dataset: ds_utils.slice_dataset,
+    RayDataset: ds_utils.slice_dataset,
 }

 TABLE_CLASS_TO_SIZE_FUNC: Dict[
@@ -53,13 +57,27 @@ TABLE_CLASS_TO_SIZE_FUNC: Dict[
     pa.Table: pa_utils.table_size,
     papq.ParquetFile: pa_utils.parquet_file_size,
     pd.DataFrame: pd_utils.dataframe_size,
+    pl.DataFrame: pl_utils.dataframe_size,
     np.ndarray: np_utils.ndarray_size,
-    Dataset: ds_utils.dataset_size,
+    RayDataset: ds_utils.dataset_size,
+}
+
+TABLE_CLASS_TO_PYARROW_FUNC: Dict[
+    Type[Union[dcs.LocalTable, dcs.DistributedDataset]], Callable
+] = {
+    pa.Table: lambda table, **kwargs: table,
+    papq.ParquetFile: lambda table, **kwargs: table.read(**kwargs),
+    pd.DataFrame: lambda table, **kwargs: pa.Table.from_pandas(table, **kwargs),
+    pl.DataFrame: lambda table, **kwargs: pl.DataFrame.to_arrow(table, **kwargs),
+    np.ndarray: lambda table, **kwargs: pa.Table.from_arrays(
+        [pa.array(table[:, i]) for i in range(table.shape[1])]
+    ),
 }

 TABLE_CLASS_TO_TABLE_TYPE: Dict[Type[dcs.LocalTable], str] = {
     pa.Table: TableType.PYARROW.value,
     papq.ParquetFile: TableType.PYARROW_PARQUET.value,
+    pl.DataFrame: TableType.POLARS.value,
     pd.DataFrame: TableType.PANDAS.value,
     np.ndarray: TableType.NUMPY.value,
 }
@@ -78,7 +96,6 @@ TABLE_TYPE_TO_DATASET_CREATE_FUNC_REFS: Dict[str, Callable] = {
     TableType.PANDAS.value: from_pandas_refs,
 }

-
 DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC: Dict[int, Callable] = {
     DistributedDatasetType.DAFT.value: daft_utils.s3_files_to_dataframe
 }
@@ -106,7 +123,18 @@ class TableWriteMode(str, Enum):


 def get_table_length(table: Union[dcs.LocalTable, dcs.DistributedDataset]) -> int:
-    return len(table) if not isinstance(table, Dataset) else table.count()
+    return len(table) if not isinstance(table, RayDataset) else table.count()
+
+
+def get_table_size(table: Union[dcs.LocalTable, dcs.DistributedDataset]) -> int:
+    table_size_func = TABLE_CLASS_TO_SIZE_FUNC.get(type(table))
+    if table_size_func is None:
+        msg = (
+            f"No size function found for table type: {type(table)}.\n"
+            f"Known table types: {TABLE_CLASS_TO_SIZE_FUNC.keys}"
+        )
+        raise ValueError(msg)
+    return table_size_func(table)


 def get_table_writer(table: Union[dcs.LocalTable, dcs.DistributedDataset]) -> Callable:
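deltacat/types/tables.py gains Polars support, a TABLE_CLASS_TO_PYARROW_FUNC conversion map, and a get_table_size helper that dispatches on the concrete table class. A minimal usage sketch, assuming the names added above are importable from deltacat.types.tables:

import pandas as pd
import pyarrow as pa

from deltacat.types.tables import TABLE_CLASS_TO_PYARROW_FUNC, get_table_size

df = pd.DataFrame({"id": [1, 2, 3], "value": ["a", "b", "c"]})

# Dispatch on the concrete table class to get an Arrow view of the data.
to_arrow = TABLE_CLASS_TO_PYARROW_FUNC[type(df)]
arrow_table = to_arrow(df)
assert isinstance(arrow_table, pa.Table)

# get_table_size raises ValueError for unregistered table types; here it
# routes to the pandas size function registered in TABLE_CLASS_TO_SIZE_FUNC.
print(get_table_size(df))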
deltacat/utils/daft.py CHANGED
@@ -2,8 +2,8 @@ import logging
 from typing import Optional, List, Any, Dict, Callable
 import daft
 import ray
-from daft.recordbatch import read_parquet_into_pyarrow
 from daft import TimeUnit, DataFrame
+from daft.recordbatch import read_parquet_into_pyarrow
 from daft.io import IOConfig, S3Config
 import pyarrow as pa

@@ -51,7 +51,7 @@ def s3_files_to_dataframe(
     ), f"daft native reader currently only supports identity encoding, got {content_encoding}"

     if not ray.is_initialized():
-        ray.init(address="auto", ignore_reinit_error=True, **ray_init_options)
+        ray.init(ignore_reinit_error=True, **ray_init_options)

     daft.context.set_runner_ray(noop_if_initialized=True)
@@ -2,12 +2,12 @@ from __future__ import annotations

 import re
 from typing import Optional, Tuple, Union, List
+from datetime import timedelta

 import sys
 import urllib
 import pathlib

-import pyarrow
 import pyarrow as pa
 from pyarrow.fs import (
     _resolve_filesystem_and_path,
@@ -17,6 +17,7 @@ from pyarrow.fs import (
     FileSystem,
     FSSpecHandler,
     PyFileSystem,
+    GcsFileSystem,
 )

 _LOCAL_SCHEME = "local"
@@ -24,8 +25,8 @@ _LOCAL_SCHEME = "local"

 def resolve_paths_and_filesystem(
     paths: Union[str, List[str]],
-    filesystem: pyarrow.fs.FileSystem = None,
-) -> Tuple[List[str], pyarrow.fs.FileSystem]:
+    filesystem: FileSystem = None,
+) -> Tuple[List[str], FileSystem]:
     """
     Resolves and normalizes all provided paths, infers a filesystem from the
     paths or validates the provided filesystem against the paths and ensures
@@ -113,19 +114,26 @@ def resolve_paths_and_filesystem(
            else:
                raise
        if filesystem is None:
-            filesystem = resolved_filesystem
+            if isinstance(resolved_filesystem, GcsFileSystem):
+                # Configure a retry time limit for GcsFileSystem so that it
+                # doesn't hang forever trying to get file info (e.g., when
+                # trying to get a public file w/o anonymous=True).
+                filesystem = GcsFileSystem(
+                    retry_time_limit=timedelta(seconds=60),
+                )
+            else:
+                filesystem = resolved_filesystem
        elif need_unwrap_path_protocol:
            resolved_path = _unwrap_protocol(resolved_path)
        resolved_path = filesystem.normalize_path(resolved_path)
        resolved_paths.append(resolved_path)
-
    return resolved_paths, filesystem


 def resolve_path_and_filesystem(
     path: str,
-    filesystem: Optional[pyarrow.fs.FileSystem] = None,
-) -> Tuple[str, pyarrow.fs.FileSystem]:
+    filesystem: Optional[FileSystem] = None,
+) -> Tuple[str, FileSystem]:
     """
     Resolves and normalizes the provided path, infers a filesystem from the
     path or validates the provided filesystem against the path.
@@ -148,7 +156,7 @@ def resolve_path_and_filesystem(

 def list_directory(
     path: str,
-    filesystem: pyarrow.fs.FileSystem,
+    filesystem: FileSystem,
     exclude_prefixes: Optional[List[str]] = None,
     ignore_missing_path: bool = False,
     recursive: bool = False,
@@ -199,7 +207,7 @@ def list_directory(

 def get_file_info(
     path: str,
-    filesystem: pyarrow.fs.FileSystem,
+    filesystem: FileSystem,
     ignore_missing_path: bool = False,
 ) -> FileInfo:
     """Get the file info for the provided path."""
@@ -227,6 +235,9 @@ def _handle_read_os_error(
        r"(?:(.*)AWS Error ACCESS_DENIED during HeadObject operation: No response "
        r"body\.(.*))$"
    )
+    gcp_error_pattern = (
+        r"^(?:(.*)google::cloud::Status\(UNAVAILABLE:(.*?)Couldn't resolve host name)"
+    )
    if re.match(aws_error_pattern, str(error)):
        # Specially handle AWS error when reading files, to give a clearer error
        # message to avoid confusing users. The real issue is most likely that the AWS
@@ -243,9 +254,28 @@ def _handle_read_os_error(
                "You can also run AWS CLI command to get more detailed error message "
                "(e.g., aws s3 ls <file-name>). "
                "See https://awscli.amazonaws.com/v2/documentation/api/latest/reference/s3/index.html "  # noqa
+                "and https://arrow.apache.org/docs/python/generated/pyarrow.fs.S3FileSystem.html "
                "for more information."
            )
        )
+    elif re.match(gcp_error_pattern, str(error)):
+        # Special handling for GCP errors (e.g., handling the special case of
+        # requiring the filesystem to be instantiated with anonymous access to
+        # read public files).
+        if isinstance(paths, str):
+            paths = f'"{paths}"'
+        raise OSError(
+            (
+                f"Failing to read GCP GS file(s): {paths}. "
+                "Please check that file exists and has properly configured access. "
+                "If this is a public file, please instantiate a filesystem with "
+                "anonymous access via `pyarrow.fs.GcsFileSystem(anonymous=True)` "
+                "to read it. See https://google.aip.dev/auth/4110 and "
+                "https://arrow.apache.org/docs/python/generated/pyarrow.fs.GcsFileSystem.html"  # noqa
+                "for more information."
+            )
+        )
+
    else:
        raise error

@@ -0,0 +1,128 @@
+import logging
+from typing import Optional, List, Dict, Callable, Union
+
+import polars as pl
+
+from fsspec import AbstractFileSystem
+from ray.data.datasource import FilenameProvider
+
+from deltacat import logs
+
+from deltacat.types.media import ContentType
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+def write_json(
+    table: pl.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[AbstractFileSystem] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
+) -> None:
+    if not filesystem:
+        table.write_ndjson(path, **write_kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            table.write_ndjson(f, **write_kwargs)
+
+
+def write_csv(
+    table: pl.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[AbstractFileSystem] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
+) -> None:
+    if not filesystem:
+        table.write_csv(path, **write_kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            table.write_csv(f, **write_kwargs)
+
+
+def write_avro(
+    table: pl.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[AbstractFileSystem] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
+) -> None:
+    if not filesystem:
+        table.write_avro(path, **write_kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            table.write_avro(f, **write_kwargs)
+
+
+def write_parquet(
+    table: pl.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[AbstractFileSystem] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
+) -> None:
+    if not filesystem:
+        table.write_parquet(path, **write_kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            table.write_parquet(f, **write_kwargs)
+
+
+CONTENT_TYPE_TO_PL_WRITE_FUNC: Dict[str, Callable] = {
+    # TODO (pdames): add support for other delimited text content types as
+    # pyarrow adds support for custom delimiters, escaping, and None value
+    # representations to pyarrow.csv.WriteOptions.
+    ContentType.AVRO.value: write_avro,
+    ContentType.CSV.value: write_csv,
+    ContentType.PARQUET.value: write_parquet,
+    ContentType.JSON.value: write_json,
+}
+
+
+def slice_table(table: pl.DataFrame, max_len: Optional[int]) -> List[pl.DataFrame]:
+    """
+    Iteratively create 0-copy table slices.
+    """
+    if max_len is None:
+        return [table]
+    tables = []
+    offset = 0
+    records_remaining = len(table)
+    while records_remaining > 0:
+        records_this_entry = min(max_len, records_remaining)
+        tables.append(table.slice(offset, records_this_entry))
+        records_remaining -= records_this_entry
+        offset += records_this_entry
+    return tables
+
+
+def dataframe_size(table: pl.DataFrame) -> int:
+    return table.estimated_size()
+
+
+def dataframe_to_file(
+    table: pl.DataFrame,
+    base_path: str,
+    file_system: Optional[AbstractFileSystem],
+    block_path_provider: Union[Callable, FilenameProvider],
+    content_type: str = ContentType.PARQUET.value,
+    **kwargs,
+) -> None:
+    """
+    Writes the given Pyarrow Table to a file.
+    """
+    writer = CONTENT_TYPE_TO_PL_WRITE_FUNC.get(content_type)
+    if not writer:
+        raise NotImplementedError(
+            f"Pyarrow writer for content type '{content_type}' not "
+            f"implemented. Known content types: "
+            f"{CONTENT_TYPE_TO_PL_WRITE_FUNC.keys}"
+        )
+    path = block_path_provider(base_path)
+    logger.debug(f"Writing table: {table} with kwargs: {kwargs} to path: {path}")
+    writer(table, path, filesystem=file_system, **kwargs)
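The new polars utility module above mirrors the existing pandas and pyarrow helpers. A short usage sketch of its slicing and writing functions, assuming the module is importable as deltacat.utils.polars; the output path is illustrative:

import polars as pl

from deltacat.utils.polars import slice_table, write_parquet

df = pl.DataFrame({"id": [1, 2, 3, 4, 5], "value": ["a", "b", "c", "d", "e"]})

# Split into zero-copy slices of at most two rows each.
chunks = slice_table(df, max_len=2)
assert [len(c) for c in chunks] == [2, 2, 1]

# Write one slice to a local Parquet file; passing an fsspec filesystem
# instead routes the write through filesystem.open().
write_parquet(chunks[0], "/tmp/deltacat_polars_example.parquet")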