polars-runtime-compat 1.34.0b3-cp39-abi3-macosx_11_0_arm64.whl → 1.34.0b4-cp39-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of polars-runtime-compat might be problematic.

Files changed (203)
  1. _polars_runtime_compat/_polars_runtime_compat.abi3.so +0 -0
  2. {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b4.dist-info}/METADATA +1 -1
  3. polars_runtime_compat-1.34.0b4.dist-info/RECORD +6 -0
  4. polars/__init__.py +0 -528
  5. polars/_cpu_check.py +0 -265
  6. polars/_dependencies.py +0 -355
  7. polars/_plr.py +0 -99
  8. polars/_plr.pyi +0 -2496
  9. polars/_reexport.py +0 -23
  10. polars/_typing.py +0 -478
  11. polars/_utils/__init__.py +0 -37
  12. polars/_utils/async_.py +0 -102
  13. polars/_utils/cache.py +0 -176
  14. polars/_utils/cloud.py +0 -40
  15. polars/_utils/constants.py +0 -29
  16. polars/_utils/construction/__init__.py +0 -46
  17. polars/_utils/construction/dataframe.py +0 -1397
  18. polars/_utils/construction/other.py +0 -72
  19. polars/_utils/construction/series.py +0 -560
  20. polars/_utils/construction/utils.py +0 -118
  21. polars/_utils/convert.py +0 -224
  22. polars/_utils/deprecation.py +0 -406
  23. polars/_utils/getitem.py +0 -457
  24. polars/_utils/logging.py +0 -11
  25. polars/_utils/nest_asyncio.py +0 -264
  26. polars/_utils/parquet.py +0 -15
  27. polars/_utils/parse/__init__.py +0 -12
  28. polars/_utils/parse/expr.py +0 -242
  29. polars/_utils/polars_version.py +0 -19
  30. polars/_utils/pycapsule.py +0 -53
  31. polars/_utils/scan.py +0 -27
  32. polars/_utils/serde.py +0 -63
  33. polars/_utils/slice.py +0 -215
  34. polars/_utils/udfs.py +0 -1251
  35. polars/_utils/unstable.py +0 -63
  36. polars/_utils/various.py +0 -782
  37. polars/_utils/wrap.py +0 -25
  38. polars/api.py +0 -370
  39. polars/catalog/__init__.py +0 -0
  40. polars/catalog/unity/__init__.py +0 -19
  41. polars/catalog/unity/client.py +0 -733
  42. polars/catalog/unity/models.py +0 -152
  43. polars/config.py +0 -1571
  44. polars/convert/__init__.py +0 -25
  45. polars/convert/general.py +0 -1046
  46. polars/convert/normalize.py +0 -261
  47. polars/dataframe/__init__.py +0 -5
  48. polars/dataframe/_html.py +0 -186
  49. polars/dataframe/frame.py +0 -12582
  50. polars/dataframe/group_by.py +0 -1067
  51. polars/dataframe/plotting.py +0 -257
  52. polars/datatype_expr/__init__.py +0 -5
  53. polars/datatype_expr/array.py +0 -56
  54. polars/datatype_expr/datatype_expr.py +0 -304
  55. polars/datatype_expr/list.py +0 -18
  56. polars/datatype_expr/struct.py +0 -69
  57. polars/datatypes/__init__.py +0 -122
  58. polars/datatypes/_parse.py +0 -195
  59. polars/datatypes/_utils.py +0 -48
  60. polars/datatypes/classes.py +0 -1213
  61. polars/datatypes/constants.py +0 -11
  62. polars/datatypes/constructor.py +0 -172
  63. polars/datatypes/convert.py +0 -366
  64. polars/datatypes/group.py +0 -130
  65. polars/exceptions.py +0 -230
  66. polars/expr/__init__.py +0 -7
  67. polars/expr/array.py +0 -964
  68. polars/expr/binary.py +0 -346
  69. polars/expr/categorical.py +0 -306
  70. polars/expr/datetime.py +0 -2620
  71. polars/expr/expr.py +0 -11272
  72. polars/expr/list.py +0 -1408
  73. polars/expr/meta.py +0 -444
  74. polars/expr/name.py +0 -321
  75. polars/expr/string.py +0 -3045
  76. polars/expr/struct.py +0 -357
  77. polars/expr/whenthen.py +0 -185
  78. polars/functions/__init__.py +0 -193
  79. polars/functions/aggregation/__init__.py +0 -33
  80. polars/functions/aggregation/horizontal.py +0 -298
  81. polars/functions/aggregation/vertical.py +0 -341
  82. polars/functions/as_datatype.py +0 -848
  83. polars/functions/business.py +0 -138
  84. polars/functions/col.py +0 -384
  85. polars/functions/datatype.py +0 -121
  86. polars/functions/eager.py +0 -524
  87. polars/functions/escape_regex.py +0 -29
  88. polars/functions/lazy.py +0 -2751
  89. polars/functions/len.py +0 -68
  90. polars/functions/lit.py +0 -210
  91. polars/functions/random.py +0 -22
  92. polars/functions/range/__init__.py +0 -19
  93. polars/functions/range/_utils.py +0 -15
  94. polars/functions/range/date_range.py +0 -303
  95. polars/functions/range/datetime_range.py +0 -370
  96. polars/functions/range/int_range.py +0 -348
  97. polars/functions/range/linear_space.py +0 -311
  98. polars/functions/range/time_range.py +0 -287
  99. polars/functions/repeat.py +0 -301
  100. polars/functions/whenthen.py +0 -353
  101. polars/interchange/__init__.py +0 -10
  102. polars/interchange/buffer.py +0 -77
  103. polars/interchange/column.py +0 -190
  104. polars/interchange/dataframe.py +0 -230
  105. polars/interchange/from_dataframe.py +0 -328
  106. polars/interchange/protocol.py +0 -303
  107. polars/interchange/utils.py +0 -170
  108. polars/io/__init__.py +0 -64
  109. polars/io/_utils.py +0 -317
  110. polars/io/avro.py +0 -49
  111. polars/io/clipboard.py +0 -36
  112. polars/io/cloud/__init__.py +0 -17
  113. polars/io/cloud/_utils.py +0 -80
  114. polars/io/cloud/credential_provider/__init__.py +0 -17
  115. polars/io/cloud/credential_provider/_builder.py +0 -520
  116. polars/io/cloud/credential_provider/_providers.py +0 -618
  117. polars/io/csv/__init__.py +0 -9
  118. polars/io/csv/_utils.py +0 -38
  119. polars/io/csv/batched_reader.py +0 -142
  120. polars/io/csv/functions.py +0 -1495
  121. polars/io/database/__init__.py +0 -6
  122. polars/io/database/_arrow_registry.py +0 -70
  123. polars/io/database/_cursor_proxies.py +0 -147
  124. polars/io/database/_executor.py +0 -578
  125. polars/io/database/_inference.py +0 -314
  126. polars/io/database/_utils.py +0 -144
  127. polars/io/database/functions.py +0 -516
  128. polars/io/delta.py +0 -499
  129. polars/io/iceberg/__init__.py +0 -3
  130. polars/io/iceberg/_utils.py +0 -697
  131. polars/io/iceberg/dataset.py +0 -556
  132. polars/io/iceberg/functions.py +0 -151
  133. polars/io/ipc/__init__.py +0 -8
  134. polars/io/ipc/functions.py +0 -514
  135. polars/io/json/__init__.py +0 -3
  136. polars/io/json/read.py +0 -101
  137. polars/io/ndjson.py +0 -332
  138. polars/io/parquet/__init__.py +0 -17
  139. polars/io/parquet/field_overwrites.py +0 -140
  140. polars/io/parquet/functions.py +0 -722
  141. polars/io/partition.py +0 -491
  142. polars/io/plugins.py +0 -187
  143. polars/io/pyarrow_dataset/__init__.py +0 -5
  144. polars/io/pyarrow_dataset/anonymous_scan.py +0 -109
  145. polars/io/pyarrow_dataset/functions.py +0 -79
  146. polars/io/scan_options/__init__.py +0 -5
  147. polars/io/scan_options/_options.py +0 -59
  148. polars/io/scan_options/cast_options.py +0 -126
  149. polars/io/spreadsheet/__init__.py +0 -6
  150. polars/io/spreadsheet/_utils.py +0 -52
  151. polars/io/spreadsheet/_write_utils.py +0 -647
  152. polars/io/spreadsheet/functions.py +0 -1323
  153. polars/lazyframe/__init__.py +0 -9
  154. polars/lazyframe/engine_config.py +0 -61
  155. polars/lazyframe/frame.py +0 -8564
  156. polars/lazyframe/group_by.py +0 -669
  157. polars/lazyframe/in_process.py +0 -42
  158. polars/lazyframe/opt_flags.py +0 -333
  159. polars/meta/__init__.py +0 -14
  160. polars/meta/build.py +0 -33
  161. polars/meta/index_type.py +0 -27
  162. polars/meta/thread_pool.py +0 -50
  163. polars/meta/versions.py +0 -120
  164. polars/ml/__init__.py +0 -0
  165. polars/ml/torch.py +0 -213
  166. polars/ml/utilities.py +0 -30
  167. polars/plugins.py +0 -155
  168. polars/py.typed +0 -0
  169. polars/pyproject.toml +0 -103
  170. polars/schema.py +0 -265
  171. polars/selectors.py +0 -3117
  172. polars/series/__init__.py +0 -5
  173. polars/series/array.py +0 -776
  174. polars/series/binary.py +0 -254
  175. polars/series/categorical.py +0 -246
  176. polars/series/datetime.py +0 -2275
  177. polars/series/list.py +0 -1087
  178. polars/series/plotting.py +0 -191
  179. polars/series/series.py +0 -9197
  180. polars/series/string.py +0 -2367
  181. polars/series/struct.py +0 -154
  182. polars/series/utils.py +0 -191
  183. polars/sql/__init__.py +0 -7
  184. polars/sql/context.py +0 -677
  185. polars/sql/functions.py +0 -139
  186. polars/string_cache.py +0 -185
  187. polars/testing/__init__.py +0 -13
  188. polars/testing/asserts/__init__.py +0 -9
  189. polars/testing/asserts/frame.py +0 -231
  190. polars/testing/asserts/series.py +0 -219
  191. polars/testing/asserts/utils.py +0 -12
  192. polars/testing/parametric/__init__.py +0 -33
  193. polars/testing/parametric/profiles.py +0 -107
  194. polars/testing/parametric/strategies/__init__.py +0 -22
  195. polars/testing/parametric/strategies/_utils.py +0 -14
  196. polars/testing/parametric/strategies/core.py +0 -615
  197. polars/testing/parametric/strategies/data.py +0 -452
  198. polars/testing/parametric/strategies/dtype.py +0 -436
  199. polars/testing/parametric/strategies/legacy.py +0 -169
  200. polars/type_aliases.py +0 -24
  201. polars_runtime_compat-1.34.0b3.dist-info/RECORD +0 -203
  202. {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b4.dist-info}/WHEEL +0 -0
  203. {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b4.dist-info}/licenses/LICENSE +0 -0
polars/io/iceberg/dataset.py
@@ -1,556 +0,0 @@
- from __future__ import annotations
-
- import os
- from abc import ABC, abstractmethod
- from dataclasses import dataclass
- from functools import partial
- from time import perf_counter
- from typing import TYPE_CHECKING, Any, Literal
-
- import polars._reexport as pl
- from polars._utils.logging import eprint, verbose
- from polars.exceptions import ComputeError
- from polars.io.iceberg._utils import (
-     IcebergStatisticsLoader,
-     IdentityTransformedPartitionValuesBuilder,
-     _scan_pyarrow_dataset_impl,
- )
- from polars.io.scan_options.cast_options import ScanCastOptions
-
- if TYPE_CHECKING:
-     import pyarrow as pa
-     import pyiceberg.schema
-     from pyiceberg.table import Table
-
-     from polars.lazyframe.frame import LazyFrame
-
-
- class IcebergDataset:
-     """Dataset interface for PyIceberg."""
-
-     def __init__(
-         self,
-         source: str | Table,
-         *,
-         snapshot_id: int | None = None,
-         iceberg_storage_properties: dict[str, Any] | None = None,
-         reader_override: Literal["native", "pyiceberg"] | None = None,
-         use_metadata_statistics: bool = True,
-     ) -> None:
-         self._metadata_path = None
-         self._table = None
-         self._snapshot_id = snapshot_id
-         self._iceberg_storage_properties = iceberg_storage_properties
-         self._reader_override: Literal["native", "pyiceberg"] | None = reader_override
-         self._use_metadata_statistics = use_metadata_statistics
-
-         # Accept either a path or a table object. The one we don't have is
-         # lazily initialized when needed.
-
-         if isinstance(source, str):
-             self._metadata_path = source
-         else:
-             self._table = source
-
-     #
-     # PythonDatasetProvider interface functions
-     #
-
-     def schema(self) -> pa.schema:
-         """Fetch the schema of the table."""
-         return self.arrow_schema()
-
-     def arrow_schema(self) -> pa.schema:
-         """Fetch the arrow schema of the table."""
-         from pyiceberg.io.pyarrow import schema_to_pyarrow
-
-         return schema_to_pyarrow(self.table().schema())
-
-     def to_dataset_scan(
-         self,
-         *,
-         existing_resolved_version_key: str | None = None,
-         limit: int | None = None,
-         projection: list[str] | None = None,
-         filter_columns: list[str] | None = None,
-     ) -> tuple[LazyFrame, str] | None:
-         """Construct a LazyFrame scan."""
-         if (
-             scan_data := self._to_dataset_scan_impl(
-                 existing_resolved_version_key=existing_resolved_version_key,
-                 limit=limit,
-                 projection=projection,
-                 filter_columns=filter_columns,
-             )
-         ) is None:
-             return None
-
-         return scan_data.to_lazyframe(), scan_data.snapshot_id_key
-
-     def _to_dataset_scan_impl(
-         self,
-         *,
-         existing_resolved_version_key: str | None = None,
-         limit: int | None = None,
-         projection: list[str] | None = None,
-         filter_columns: list[str] | None = None,
-     ) -> _NativeIcebergScanData | _PyIcebergScanData | None:
-         from pyiceberg.io.pyarrow import schema_to_pyarrow
-
-         import polars._utils.logging
-
-         verbose = polars._utils.logging.verbose()
-
-         if verbose:
-             eprint(
-                 "IcebergDataset: to_dataset_scan(): "
-                 f"snapshot ID: {self._snapshot_id}, "
-                 f"limit: {limit}, "
-                 f"projection: {projection}, "
-                 f"filter_columns: {filter_columns}, "
-                 f"self._use_metadata_statistics: {self._use_metadata_statistics}"
-             )
-
-         tbl = self.table()
-
-         if verbose:
-             eprint(
-                 "IcebergDataset: to_dataset_scan(): "
-                 f"tbl.metadata.current_snapshot_id: {tbl.metadata.current_snapshot_id}"
-             )
-
-         snapshot_id = self._snapshot_id
-         schema_id = None
-
-         if snapshot_id is not None:
-             snapshot = tbl.snapshot_by_id(snapshot_id)
-
-             if snapshot is None:
-                 msg = f"iceberg snapshot ID not found: {snapshot_id}"
-                 raise ValueError(msg)
-
-             schema_id = snapshot.schema_id
-
-             if schema_id is None:
-                 msg = (
-                     f"IcebergDataset: requested snapshot {snapshot_id} "
-                     "did not contain a schema ID"
-                 )
-                 raise ValueError(msg)
-
-             iceberg_schema = tbl.schemas()[schema_id]
-             snapshot_id_key = f"{snapshot.snapshot_id}"
-         else:
-             iceberg_schema = tbl.schema()
-             schema_id = tbl.metadata.current_schema_id
-
-             snapshot_id_key = (
-                 f"{v.snapshot_id}" if (v := tbl.current_snapshot()) is not None else ""
-             )
-
-         if (
-             existing_resolved_version_key is not None
-             and existing_resolved_version_key == snapshot_id_key
-         ):
-             if verbose:
-                 eprint(
-                     "IcebergDataset: to_dataset_scan(): early return "
-                     f"({snapshot_id_key = })"
-                 )
-
-             return None
-
-         # Take from parameter first then envvar
-         reader_override = self._reader_override or os.getenv(
-             "POLARS_ICEBERG_READER_OVERRIDE"
-         )
-
-         if reader_override and reader_override not in ["native", "pyiceberg"]:
-             msg = (
-                 "iceberg: unknown value for reader_override: "
-                 f"'{reader_override}', expected one of ('native', 'pyiceberg')"
-             )
-             raise ValueError(msg)
-
-         fallback_reason = (
-             "forced reader_override='pyiceberg'"
-             if reader_override == "pyiceberg"
-             else f"unsupported table format version: {tbl.format_version}"
-             if not tbl.format_version <= 2
-             else None
-         )
-
-         selected_fields = ("*",) if projection is None else tuple(projection)
-
-         projected_iceberg_schema = (
-             iceberg_schema
-             if selected_fields == ("*",)
-             else iceberg_schema.select(*selected_fields)
-         )
-
-         sources = []
-         missing_field_defaults = IdentityTransformedPartitionValuesBuilder(
-             tbl,
-             projected_iceberg_schema,
-         )
-         statistics_loader: IcebergStatisticsLoader | None = (
-             IcebergStatisticsLoader(tbl, iceberg_schema.select(*filter_columns))
-             if self._use_metadata_statistics and filter_columns is not None
-             else None
-         )
-         deletion_files: dict[int, list[str]] = {}
-
-         if reader_override != "pyiceberg" and not fallback_reason:
-             from pyiceberg.manifest import DataFileContent, FileFormat
-
-             if verbose:
-                 eprint("IcebergDataset: to_dataset_scan(): begin path expansion")
-
-             start_time = perf_counter()
-
-             scan = tbl.scan(
-                 snapshot_id=snapshot_id,
-                 limit=limit,
-                 selected_fields=selected_fields,
-             )
-
-             total_deletion_files = 0
-
-             for i, file_info in enumerate(scan.plan_files()):
-                 if file_info.file.file_format != FileFormat.PARQUET:
-                     fallback_reason = (
-                         f"non-parquet format: {file_info.file.file_format}"
-                     )
-                     break
-
-                 if file_info.delete_files:
-                     deletion_files[i] = []
-
-                     for deletion_file in file_info.delete_files:
-                         if deletion_file.content != DataFileContent.POSITION_DELETES:
-                             fallback_reason = (
-                                 "unsupported deletion file type: "
-                                 f"{deletion_file.content}"
-                             )
-                             break
-
-                         if deletion_file.file_format != FileFormat.PARQUET:
-                             fallback_reason = (
-                                 "unsupported deletion file format: "
-                                 f"{deletion_file.file_format}"
-                             )
-                             break
-
-                         deletion_files[i].append(deletion_file.file_path)
-                         total_deletion_files += 1
-
-                 if fallback_reason:
-                     break
-
-                 missing_field_defaults.push_partition_values(
-                     current_index=i,
-                     partition_spec_id=file_info.file.spec_id,
-                     partition_values=file_info.file.partition,
-                 )
-
-                 if statistics_loader is not None:
-                     statistics_loader.push_file_statistics(file_info.file)
-
-                 sources.append(file_info.file.file_path)
-
-             if verbose:
-                 elapsed = perf_counter() - start_time
-                 eprint(
-                     "IcebergDataset: to_dataset_scan(): "
-                     f"finish path expansion ({elapsed:.3f}s)"
-                 )
-
-             if not fallback_reason:
-                 if verbose:
-                     s = "" if len(sources) == 1 else "s"
-                     s2 = "" if total_deletion_files == 1 else "s"
-
-                     eprint(
-                         "IcebergDataset: to_dataset_scan(): "
-                         f"native scan_parquet(): "
-                         f"{len(sources)} source{s}, "
-                         f"snapshot ID: {snapshot_id}, "
-                         f"schema ID: {schema_id}, "
-                         f"{total_deletion_files} deletion file{s2}"
-                     )
-
-                 # The arrow schema returned by `schema_to_pyarrow` will contain
-                 # 'PARQUET:field_id'
-                 column_mapping = schema_to_pyarrow(iceberg_schema)
-
-                 identity_transformed_values = missing_field_defaults.finish()
-
-                 min_max_statistics = (
-                     statistics_loader.finish(len(sources), identity_transformed_values)
-                     if statistics_loader is not None
-                     else None
-                 )
-
-                 storage_options = (
-                     _convert_iceberg_to_object_store_storage_options(
-                         self._iceberg_storage_properties
-                     )
-                     if self._iceberg_storage_properties is not None
-                     else None
-                 )
-
-                 return _NativeIcebergScanData(
-                     sources=sources,
-                     projected_iceberg_schema=projected_iceberg_schema,
-                     column_mapping=column_mapping,
-                     default_values=identity_transformed_values,
-                     deletion_files=deletion_files,
-                     min_max_statistics=min_max_statistics,
-                     statistics_loader=statistics_loader,
-                     storage_options=storage_options,
-                     _snapshot_id_key=snapshot_id_key,
-                 )
-
-             elif reader_override == "native":
-                 msg = f"iceberg reader_override='native' failed: {fallback_reason}"
-                 raise ComputeError(msg)
-
-         if verbose:
-             eprint(
-                 "IcebergDataset: to_dataset_scan(): "
-                 f"fallback to python[pyiceberg] scan: {fallback_reason}"
-             )
-
-         func = partial(
-             _scan_pyarrow_dataset_impl,
-             tbl,
-             snapshot_id=snapshot_id,
-             n_rows=limit,
-             with_columns=projection,
-         )
-
-         arrow_schema = schema_to_pyarrow(tbl.schema())
-
-         lf = pl.LazyFrame._scan_python_function(
-             arrow_schema,
-             func,
-             pyarrow=True,
-             is_pure=True,
-         )
-
-         return _PyIcebergScanData(lf=lf, _snapshot_id_key=snapshot_id_key)
-
-     #
-     # Accessors
-     #
-
-     def metadata_path(self) -> str:
-         """Fetch the metadata path."""
-         if self._metadata_path is None:
-             if self._table is None:
-                 msg = "impl error: both metadata_path and table are None"
-                 raise ValueError(msg)
-
-             self._metadata_path = self.table().metadata_location
-
-         return self._metadata_path
-
-     def table(self) -> Table:
-         """Fetch the PyIceberg Table object."""
-         if self._table is None:
-             if self._metadata_path is None:
-                 msg = "impl error: both metadata_path and table are None"
-                 raise ValueError(msg)
-
-             if verbose():
-                 eprint(f"IcebergDataset: construct table from {self._metadata_path = }")
-
-             from pyiceberg.table import StaticTable
-
-             self._table = StaticTable.from_metadata(
-                 metadata_location=self._metadata_path,
-                 properties=self._iceberg_storage_properties or {},
-             )
-
-         return self._table
-
-     #
-     # Serialization functions
-     #
-     # We don't serialize the iceberg table object - the remote machine should
-     # use their own permissions to reconstruct the table object from the path.
-     #
-
-     def __getstate__(self) -> dict[str, Any]:
-         state = {
-             "metadata_path": self.metadata_path(),
-             "snapshot_id": self._snapshot_id,
-             "iceberg_storage_properties": self._iceberg_storage_properties,
-             "reader_override": self._reader_override,
-         }
-
-         if verbose():
-             path_repr = state["metadata_path"]
-             snapshot_id = f"'{v}'" if (v := state["snapshot_id"]) is not None else None
-             keys_repr = _redact_dict_values(state["iceberg_storage_properties"])
-             reader_override = state["reader_override"]
-
-             eprint(
-                 "IcebergDataset: getstate(): "
-                 f"path: '{path_repr}', "
-                 f"snapshot_id: {snapshot_id}, "
-                 f"iceberg_storage_properties: {keys_repr}, "
-                 f"reader_override: {reader_override}"
-             )
-
-         return state
-
-     def __setstate__(self, state: dict[str, Any]) -> None:
-         if verbose():
-             path_repr = state["metadata_path"]
-             snapshot_id = state["snapshot_id"]
-             keys_repr = _redact_dict_values(state["iceberg_storage_properties"])
-             reader_override = state["reader_override"]
-
-             eprint(
-                 "IcebergDataset: getstate(): "
-                 f"path: '{path_repr}', "
-                 f"snapshot_id: '{snapshot_id}', "
-                 f"iceberg_storage_properties: {keys_repr}, "
-                 f"reader_override: {reader_override}"
-             )
-
-         IcebergDataset.__init__(
-             self,
-             state["metadata_path"],
-             snapshot_id=state["snapshot_id"],
-             iceberg_storage_properties=state["iceberg_storage_properties"],
-             reader_override=state["reader_override"],
-         )
-
-
- class _ResolvedScanDataBase(ABC):
-     @abstractmethod
-     def to_lazyframe(self) -> pl.LazyFrame: ...
-
-     @property
-     @abstractmethod
-     def snapshot_id_key(self) -> str: ...
-
-
- @dataclass
- class _NativeIcebergScanData(_ResolvedScanDataBase):
-     """Resolved parameters for a native Iceberg scan."""
-
-     sources: list[str]
-     projected_iceberg_schema: pyiceberg.schema.Schema
-     column_mapping: pa.Schema
-     default_values: dict[int, pl.Series | str]
-     deletion_files: dict[int, list[str]]
-     min_max_statistics: pl.DataFrame | None
-     # This is here for test purposes, as the `min_max_statistics` on this
-     # dataclass contain coalesced values from `default_values`, a test may
-     # access the statistics loader directly to inspect the values before
-     # coalescing.
-     statistics_loader: IcebergStatisticsLoader | None
-     storage_options: dict[str, str] | None
-     _snapshot_id_key: str
-
-     def to_lazyframe(self) -> pl.LazyFrame:
-         from polars.io.parquet.functions import scan_parquet
-
-         return scan_parquet(
-             self.sources,
-             cast_options=ScanCastOptions._default_iceberg(),
-             missing_columns="insert",
-             extra_columns="ignore",
-             storage_options=self.storage_options,
-             _column_mapping=("iceberg-column-mapping", self.column_mapping),
-             _default_values=("iceberg", self.default_values),
-             _deletion_files=("iceberg-position-delete", self.deletion_files),
-             _table_statistics=self.min_max_statistics,
-         )
-
-     @property
-     def snapshot_id_key(self) -> str:
-         return self._snapshot_id_key
-
-
- @dataclass
- class _PyIcebergScanData(_ResolvedScanDataBase):
-     """Resolved parameters for reading via PyIceberg."""
-
-     # We're not interested in inspecting anything for the pyiceberg scan, so
-     # this class is just a wrapper.
-     lf: pl.LazyFrame
-     _snapshot_id_key: str
-
-     def to_lazyframe(self) -> pl.LazyFrame:
-         return self.lf
-
-     @property
-     def snapshot_id_key(self) -> str:
-         return self._snapshot_id_key
-
-
- def _redact_dict_values(obj: Any) -> Any:
-     return (
-         {k: "REDACTED" for k in obj.keys()}  # noqa: SIM118
-         if isinstance(obj, dict)
-         else f"<{type(obj).__name__} object>"
-         if obj is not None
-         else "None"
-     )
-
-
- def _convert_iceberg_to_object_store_storage_options(
-     iceberg_storage_properties: dict[str, str],
- ) -> dict[str, str]:
-     storage_options = {}
-
-     for k, v in iceberg_storage_properties.items():
-         if (
-             translated_key := ICEBERG_TO_OBJECT_STORE_CONFIG_KEY_MAP.get(k)
-         ) is not None:
-             storage_options[translated_key] = v
-         elif "." not in k:
-             # Pass-through non-Iceberg config keys, as they may be native config
-             # keys. We identify Iceberg keys by checking for a dot - from
-             # observation nearly all Iceberg config keys contain dots, whereas
-             # native config keys do not contain them.
-             storage_options[k] = v
-
-     # Otherwise, unknown keys are ignored / not passed. This is to avoid
-     # interfering with credential provider auto-init, which bails on
-     # unknown keys.
-
-     return storage_options
-
-
- # https://py.iceberg.apache.org/configuration/#fileio
- # This does not contain all keys - some have no object-store equivalent.
- ICEBERG_TO_OBJECT_STORE_CONFIG_KEY_MAP: dict[str, str] = {
-     # S3
-     "s3.endpoint": "aws_endpoint_url",
-     "s3.access-key-id": "aws_access_key_id",
-     "s3.secret-access-key": "aws_secret_access_key",
-     "s3.session-token": "aws_session_token",
-     "s3.region": "aws_region",
-     "s3.proxy-uri": "proxy_url",
-     "s3.connect-timeout": "connect_timeout",
-     "s3.request-timeout": "timeout",
-     "s3.force-virtual-addressing": "aws_virtual_hosted_style_request",
-     # Azure
-     "adls.account-name": "azure_storage_account_name",
-     "adls.account-key": "azure_storage_account_key",
-     "adls.sas-token": "azure_storage_sas_key",
-     "adls.tenant-id": "azure_storage_tenant_id",
-     "adls.client-id": "azure_storage_client_id",
-     "adls.client-secret": "azure_storage_client_secret",
-     "adls.account-host": "azure_storage_authority_host",
-     "adls.token": "azure_storage_token",
-     # Google storage
-     "gcs.oauth2.token": "bearer_token",
-     # HuggingFace
-     "hf.token": "token",
- }
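
The config-key translation at the end of the removed dataset.py can be exercised in isolation. The following is a minimal, standalone sketch of that behaviour, not code from the package: the abbreviated mapping, the translate() helper name, and the demo values are illustrative.

# Standalone sketch of the translation performed by
# _convert_iceberg_to_object_store_storage_options above.
# Abbreviated mapping; names and values are illustrative only.
ICEBERG_TO_OBJECT_STORE = {
    "s3.region": "aws_region",
    "s3.access-key-id": "aws_access_key_id",
    "s3.secret-access-key": "aws_secret_access_key",
}

def translate(iceberg_props: dict[str, str]) -> dict[str, str]:
    out: dict[str, str] = {}
    for k, v in iceberg_props.items():
        if (mapped := ICEBERG_TO_OBJECT_STORE.get(k)) is not None:
            out[mapped] = v  # known Iceberg key, renamed to the object-store key
        elif "." not in k:
            out[k] = v  # dot-less keys pass through as possible native options
        # any other dotted key is silently dropped
    return out

print(translate({
    "s3.region": "eu-central-1",
    "aws_endpoint_url": "http://localhost:9000",
    "s3.unknown-option": "ignored",
}))
# {'aws_region': 'eu-central-1', 'aws_endpoint_url': 'http://localhost:9000'}

Dropping unrecognized dotted keys matches the comment in the removed code: it avoids interfering with credential-provider auto-initialization, which bails on unknown keys.
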
polars/io/iceberg/functions.py
@@ -1,151 +0,0 @@
- from __future__ import annotations
-
- from typing import TYPE_CHECKING, Any, Literal
-
- from polars._utils.unstable import issue_unstable_warning
- from polars._utils.wrap import wrap_ldf
- from polars.io.iceberg.dataset import IcebergDataset
-
- if TYPE_CHECKING:
-     from pyiceberg.table import Table
-
-     from polars.lazyframe.frame import LazyFrame
-
-
- def scan_iceberg(
-     source: str | Table,
-     *,
-     snapshot_id: int | None = None,
-     storage_options: dict[str, Any] | None = None,
-     reader_override: Literal["native", "pyiceberg"] | None = None,
-     use_metadata_statistics: bool = True,
- ) -> LazyFrame:
-     """
-     Lazily read from an Apache Iceberg table.
-
-     Parameters
-     ----------
-     source
-         A PyIceberg table, or a direct path to the metadata.
-
-         Note: For Local filesystem, absolute and relative paths are supported but
-         for the supported object storages - GCS, Azure and S3 full URI must be provided.
-     snapshot_id
-         The snapshot ID to scan from.
-     storage_options
-         Extra options for the storage backends supported by `pyiceberg`.
-         For cloud storages, this may include configurations for authentication etc.
-
-         More info is available `here <https://py.iceberg.apache.org/configuration/>`__.
-     reader_override
-         Overrides the reader used to read the data.
-
-         .. warning::
-             This functionality is considered **unstable**. It may be changed
-             at any point without it being considered a breaking change.
-
-         Note that this parameter should not be necessary outside of testing, as
-         polars will by default automatically select the best reader.
-
-         Available options:
-
-         * native: Uses polars native reader. This allows for more optimizations to
-           improve performance.
-         * pyiceberg: Uses PyIceberg, which may support more features.
-     use_metadata_statistics
-         Load and use min/max statistics from Iceberg metadata files when a filter
-         is present. This allows the reader to potentially skip loading metadata
-         from the underlying data files.
-
-         .. warning::
-             This functionality is considered **unstable**. It may be changed
-             at any point without it being considered a breaking change.
-
-     Returns
-     -------
-     LazyFrame
-
-     Examples
-     --------
-     Creates a scan for an Iceberg table from local filesystem, or object store.
-
-     >>> table_path = "file:/path/to/iceberg-table/metadata.json"
-     >>> pl.scan_iceberg(table_path).collect()  # doctest: +SKIP
-
-     Creates a scan for an Iceberg table from S3.
-     See a list of supported storage options for S3 `here
-     <https://py.iceberg.apache.org/configuration/#fileio>`__.
-
-     >>> table_path = "s3://bucket/path/to/iceberg-table/metadata.json"
-     >>> storage_options = {
-     ...     "s3.region": "eu-central-1",
-     ...     "s3.access-key-id": "THE_AWS_ACCESS_KEY_ID",
-     ...     "s3.secret-access-key": "THE_AWS_SECRET_ACCESS_KEY",
-     ... }
-     >>> pl.scan_iceberg(
-     ...     table_path, storage_options=storage_options
-     ... ).collect()  # doctest: +SKIP
-
-     Creates a scan for an Iceberg table from Azure.
-     Supported options for Azure are available `here
-     <https://py.iceberg.apache.org/configuration/#azure-data-lake>`__.
-
-     Following type of table paths are supported:
-
-     * az://<container>/<path>/metadata.json
-     * adl://<container>/<path>/metadata.json
-     * abfs[s]://<container>/<path>/metadata.json
-
-     >>> table_path = "az://container/path/to/iceberg-table/metadata.json"
-     >>> storage_options = {
-     ...     "adlfs.account-name": "AZURE_STORAGE_ACCOUNT_NAME",
-     ...     "adlfs.account-key": "AZURE_STORAGE_ACCOUNT_KEY",
-     ... }
-     >>> pl.scan_iceberg(
-     ...     table_path, storage_options=storage_options
-     ... ).collect()  # doctest: +SKIP
-
-     Creates a scan for an Iceberg table from Google Cloud Storage.
-     Supported options for GCS are available `here
-     <https://py.iceberg.apache.org/configuration/#google-cloud-storage>`__.
-
-     >>> table_path = "s3://bucket/path/to/iceberg-table/metadata.json"
-     >>> storage_options = {
-     ...     "gcs.project-id": "my-gcp-project",
-     ...     "gcs.oauth.token": "ya29.dr.AfM...",
-     ... }
-     >>> pl.scan_iceberg(
-     ...     table_path, storage_options=storage_options
-     ... ).collect()  # doctest: +SKIP
-
-     Creates a scan for an Iceberg table with additional options.
-     In the below example, `without_files` option is used which loads the table without
-     file tracking information.
-
-     >>> table_path = "/path/to/iceberg-table/metadata.json"
-     >>> storage_options = {"py-io-impl": "pyiceberg.io.fsspec.FsspecFileIO"}
-     >>> pl.scan_iceberg(
-     ...     table_path, storage_options=storage_options
-     ... ).collect()  # doctest: +SKIP
-
-     Creates a scan for an Iceberg table using a specific snapshot ID.
-
-     >>> table_path = "/path/to/iceberg-table/metadata.json"
-     >>> snapshot_id = 7051579356916758811
-     >>> pl.scan_iceberg(table_path, snapshot_id=snapshot_id).collect()  # doctest: +SKIP
-     """
-     from polars._plr import PyLazyFrame
-
-     if reader_override is not None:
-         msg = "the `reader_override` parameter of `scan_iceberg()` is considered unstable."
-         issue_unstable_warning(msg)
-
-     dataset = IcebergDataset(
-         source,
-         snapshot_id=snapshot_id,
-         iceberg_storage_properties=storage_options,
-         reader_override=reader_override,
-         use_metadata_statistics=use_metadata_statistics,
-     )
-
-     return wrap_ldf(PyLazyFrame.new_from_dataset_object(dataset))
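
For reference, a hedged usage sketch of the scan_iceberg() entry point removed above; the table path, snapshot ID, and option values are placeholders, and running it requires pyiceberg plus access to a real table.

# Illustrative only: placeholder path, snapshot ID, and storage options.
import polars as pl

lf = pl.scan_iceberg(
    "s3://bucket/path/to/iceberg-table/metadata.json",
    snapshot_id=7051579356916758811,
    reader_override="native",  # unstable parameter; emits an unstable-feature warning
    storage_options={"s3.region": "eu-central-1"},
)
print(lf.collect())

When reader_override is not passed, dataset.py falls back to the POLARS_ICEBERG_READER_OVERRIDE environment variable before letting polars pick the reader automatically.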