snowflake-ml-python 1.8.1__py3-none-any.whl → 1.8.3__py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (170)
  1. snowflake/cortex/_classify_text.py +3 -3
  2. snowflake/cortex/_complete.py +64 -31
  3. snowflake/cortex/_embed_text_1024.py +4 -4
  4. snowflake/cortex/_embed_text_768.py +4 -4
  5. snowflake/cortex/_finetune.py +8 -8
  6. snowflake/cortex/_util.py +8 -12
  7. snowflake/ml/_internal/env.py +4 -3
  8. snowflake/ml/_internal/env_utils.py +63 -34
  9. snowflake/ml/_internal/file_utils.py +10 -21
  10. snowflake/ml/_internal/human_readable_id/hrid_generator_base.py +5 -7
  11. snowflake/ml/_internal/init_utils.py +2 -3
  12. snowflake/ml/_internal/lineage/lineage_utils.py +6 -6
  13. snowflake/ml/_internal/platform_capabilities.py +41 -5
  14. snowflake/ml/_internal/telemetry.py +39 -52
  15. snowflake/ml/_internal/type_utils.py +3 -3
  16. snowflake/ml/_internal/utils/db_utils.py +2 -2
  17. snowflake/ml/_internal/utils/identifier.py +8 -8
  18. snowflake/ml/_internal/utils/import_utils.py +2 -2
  19. snowflake/ml/_internal/utils/parallelize.py +7 -7
  20. snowflake/ml/_internal/utils/pkg_version_utils.py +11 -11
  21. snowflake/ml/_internal/utils/query_result_checker.py +4 -4
  22. snowflake/ml/_internal/utils/snowflake_env.py +28 -6
  23. snowflake/ml/_internal/utils/snowpark_dataframe_utils.py +2 -2
  24. snowflake/ml/_internal/utils/sql_identifier.py +3 -3
  25. snowflake/ml/_internal/utils/table_manager.py +9 -9
  26. snowflake/ml/data/_internal/arrow_ingestor.py +7 -7
  27. snowflake/ml/data/data_connector.py +40 -36
  28. snowflake/ml/data/data_ingestor.py +4 -15
  29. snowflake/ml/data/data_source.py +2 -2
  30. snowflake/ml/data/ingestor_utils.py +3 -3
  31. snowflake/ml/data/torch_utils.py +5 -5
  32. snowflake/ml/dataset/dataset.py +11 -11
  33. snowflake/ml/dataset/dataset_metadata.py +8 -8
  34. snowflake/ml/dataset/dataset_reader.py +12 -8
  35. snowflake/ml/feature_store/__init__.py +1 -1
  36. snowflake/ml/feature_store/access_manager.py +7 -7
  37. snowflake/ml/feature_store/entity.py +6 -6
  38. snowflake/ml/feature_store/examples/airline_features/entities.py +1 -3
  39. snowflake/ml/feature_store/examples/airline_features/features/plane_features.py +1 -3
  40. snowflake/ml/feature_store/examples/airline_features/features/weather_features.py +1 -3
  41. snowflake/ml/feature_store/examples/citibike_trip_features/entities.py +1 -3
  42. snowflake/ml/feature_store/examples/citibike_trip_features/features/station_feature.py +1 -3
  43. snowflake/ml/feature_store/examples/citibike_trip_features/features/trip_feature.py +1 -3
  44. snowflake/ml/feature_store/examples/example_helper.py +16 -16
  45. snowflake/ml/feature_store/examples/new_york_taxi_features/entities.py +1 -3
  46. snowflake/ml/feature_store/examples/new_york_taxi_features/features/location_features.py +1 -3
  47. snowflake/ml/feature_store/examples/new_york_taxi_features/features/trip_features.py +1 -3
  48. snowflake/ml/feature_store/examples/wine_quality_features/entities.py +1 -3
  49. snowflake/ml/feature_store/examples/wine_quality_features/features/managed_wine_features.py +1 -3
  50. snowflake/ml/feature_store/examples/wine_quality_features/features/static_wine_features.py +1 -3
  51. snowflake/ml/feature_store/feature_store.py +52 -64
  52. snowflake/ml/feature_store/feature_view.py +24 -24
  53. snowflake/ml/fileset/embedded_stage_fs.py +5 -5
  54. snowflake/ml/fileset/fileset.py +5 -5
  55. snowflake/ml/fileset/sfcfs.py +13 -13
  56. snowflake/ml/fileset/stage_fs.py +15 -15
  57. snowflake/ml/jobs/_utils/constants.py +2 -4
  58. snowflake/ml/jobs/_utils/interop_utils.py +442 -0
  59. snowflake/ml/jobs/_utils/payload_utils.py +86 -62
  60. snowflake/ml/jobs/_utils/scripts/constants.py +4 -0
  61. snowflake/ml/jobs/_utils/scripts/get_instance_ip.py +136 -0
  62. snowflake/ml/jobs/_utils/scripts/mljob_launcher.py +181 -0
  63. snowflake/ml/jobs/_utils/scripts/signal_workers.py +203 -0
  64. snowflake/ml/jobs/_utils/scripts/worker_shutdown_listener.py +242 -0
  65. snowflake/ml/jobs/_utils/spec_utils.py +22 -36
  66. snowflake/ml/jobs/_utils/types.py +8 -2
  67. snowflake/ml/jobs/decorators.py +7 -8
  68. snowflake/ml/jobs/job.py +158 -26
  69. snowflake/ml/jobs/manager.py +78 -30
  70. snowflake/ml/lineage/lineage_node.py +5 -5
  71. snowflake/ml/model/_client/model/model_impl.py +3 -3
  72. snowflake/ml/model/_client/model/model_version_impl.py +103 -35
  73. snowflake/ml/model/_client/ops/metadata_ops.py +7 -7
  74. snowflake/ml/model/_client/ops/model_ops.py +41 -41
  75. snowflake/ml/model/_client/ops/service_ops.py +230 -50
  76. snowflake/ml/model/_client/service/model_deployment_spec.py +175 -48
  77. snowflake/ml/model/_client/service/model_deployment_spec_schema.py +44 -24
  78. snowflake/ml/model/_client/sql/model.py +8 -8
  79. snowflake/ml/model/_client/sql/model_version.py +26 -26
  80. snowflake/ml/model/_client/sql/service.py +22 -18
  81. snowflake/ml/model/_client/sql/stage.py +2 -2
  82. snowflake/ml/model/_client/sql/tag.py +6 -6
  83. snowflake/ml/model/_model_composer/model_composer.py +46 -25
  84. snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +20 -16
  85. snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +14 -13
  86. snowflake/ml/model/_model_composer/model_method/model_method.py +3 -3
  87. snowflake/ml/model/_packager/model_env/model_env.py +35 -26
  88. snowflake/ml/model/_packager/model_handler.py +4 -4
  89. snowflake/ml/model/_packager/model_handlers/_base.py +2 -2
  90. snowflake/ml/model/_packager/model_handlers/_utils.py +15 -3
  91. snowflake/ml/model/_packager/model_handlers/catboost.py +5 -5
  92. snowflake/ml/model/_packager/model_handlers/custom.py +8 -4
  93. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +7 -21
  94. snowflake/ml/model/_packager/model_handlers/keras.py +4 -4
  95. snowflake/ml/model/_packager/model_handlers/lightgbm.py +4 -14
  96. snowflake/ml/model/_packager/model_handlers/mlflow.py +3 -3
  97. snowflake/ml/model/_packager/model_handlers/pytorch.py +4 -4
  98. snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +5 -5
  99. snowflake/ml/model/_packager/model_handlers/sklearn.py +5 -6
  100. snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +3 -3
  101. snowflake/ml/model/_packager/model_handlers/tensorflow.py +4 -4
  102. snowflake/ml/model/_packager/model_handlers/torchscript.py +4 -4
  103. snowflake/ml/model/_packager/model_handlers/xgboost.py +5 -15
  104. snowflake/ml/model/_packager/model_meta/model_blob_meta.py +2 -2
  105. snowflake/ml/model/_packager/model_meta/model_meta.py +42 -37
  106. snowflake/ml/model/_packager/model_meta/model_meta_schema.py +13 -11
  107. snowflake/ml/model/_packager/model_meta_migrator/base_migrator.py +3 -3
  108. snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -3
  109. snowflake/ml/model/_packager/model_meta_migrator/migrator_v1.py +4 -4
  110. snowflake/ml/model/_packager/model_packager.py +12 -8
  111. snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +32 -1
  112. snowflake/ml/model/_packager/model_runtime/model_runtime.py +4 -2
  113. snowflake/ml/model/_signatures/core.py +16 -24
  114. snowflake/ml/model/_signatures/dmatrix_handler.py +2 -2
  115. snowflake/ml/model/_signatures/utils.py +6 -6
  116. snowflake/ml/model/custom_model.py +8 -8
  117. snowflake/ml/model/model_signature.py +9 -20
  118. snowflake/ml/model/models/huggingface_pipeline.py +7 -4
  119. snowflake/ml/model/type_hints.py +5 -3
  120. snowflake/ml/modeling/_internal/estimator_utils.py +7 -7
  121. snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +6 -6
  122. snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +7 -7
  123. snowflake/ml/modeling/_internal/model_specifications.py +8 -10
  124. snowflake/ml/modeling/_internal/model_trainer.py +5 -5
  125. snowflake/ml/modeling/_internal/model_trainer_builder.py +6 -6
  126. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +30 -30
  127. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +13 -13
  128. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +31 -31
  129. snowflake/ml/modeling/_internal/snowpark_implementations/xgboost_external_memory_trainer.py +19 -19
  130. snowflake/ml/modeling/_internal/transformer_protocols.py +17 -17
  131. snowflake/ml/modeling/framework/_utils.py +10 -10
  132. snowflake/ml/modeling/framework/base.py +32 -32
  133. snowflake/ml/modeling/impute/__init__.py +1 -1
  134. snowflake/ml/modeling/impute/simple_imputer.py +5 -5
  135. snowflake/ml/modeling/metrics/__init__.py +1 -1
  136. snowflake/ml/modeling/metrics/classification.py +39 -39
  137. snowflake/ml/modeling/metrics/metrics_utils.py +12 -12
  138. snowflake/ml/modeling/metrics/ranking.py +7 -7
  139. snowflake/ml/modeling/metrics/regression.py +13 -13
  140. snowflake/ml/modeling/model_selection/__init__.py +1 -1
  141. snowflake/ml/modeling/model_selection/grid_search_cv.py +7 -7
  142. snowflake/ml/modeling/model_selection/randomized_search_cv.py +7 -7
  143. snowflake/ml/modeling/pipeline/__init__.py +1 -1
  144. snowflake/ml/modeling/pipeline/pipeline.py +18 -18
  145. snowflake/ml/modeling/preprocessing/__init__.py +1 -1
  146. snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +13 -13
  147. snowflake/ml/modeling/preprocessing/max_abs_scaler.py +4 -4
  148. snowflake/ml/modeling/preprocessing/min_max_scaler.py +8 -8
  149. snowflake/ml/modeling/preprocessing/normalizer.py +0 -1
  150. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +28 -28
  151. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +9 -9
  152. snowflake/ml/modeling/preprocessing/robust_scaler.py +7 -7
  153. snowflake/ml/modeling/preprocessing/standard_scaler.py +5 -5
  154. snowflake/ml/monitoring/_client/model_monitor_sql_client.py +26 -26
  155. snowflake/ml/monitoring/_manager/model_monitor_manager.py +5 -5
  156. snowflake/ml/monitoring/entities/model_monitor_config.py +6 -6
  157. snowflake/ml/registry/_manager/model_manager.py +50 -29
  158. snowflake/ml/registry/registry.py +34 -23
  159. snowflake/ml/utils/authentication.py +2 -2
  160. snowflake/ml/utils/connection_params.py +5 -5
  161. snowflake/ml/utils/sparse.py +5 -4
  162. snowflake/ml/utils/sql_client.py +1 -2
  163. snowflake/ml/version.py +2 -1
  164. {snowflake_ml_python-1.8.1.dist-info → snowflake_ml_python-1.8.3.dist-info}/METADATA +46 -6
  165. {snowflake_ml_python-1.8.1.dist-info → snowflake_ml_python-1.8.3.dist-info}/RECORD +168 -164
  166. {snowflake_ml_python-1.8.1.dist-info → snowflake_ml_python-1.8.3.dist-info}/WHEEL +1 -1
  167. snowflake/ml/model/_packager/model_meta/_packaging_requirements.py +0 -1
  168. snowflake/ml/modeling/_internal/constants.py +0 -2
  169. {snowflake_ml_python-1.8.1.dist-info → snowflake_ml_python-1.8.3.dist-info}/licenses/LICENSE.txt +0 -0
  170. {snowflake_ml_python-1.8.1.dist-info → snowflake_ml_python-1.8.3.dist-info}/top_level.txt +0 -0

snowflake/ml/_internal/utils/table_manager.py +9 -9

@@ -1,4 +1,4 @@
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Optional
 
 from snowflake import snowpark
 from snowflake.ml._internal.utils import formatting, identifier, query_result_checker
@@ -24,8 +24,8 @@ def create_single_table(
     database_name: str,
     schema_name: str,
     table_name: str,
-    table_schema: List[Tuple[str, str]],
-    statement_params: Optional[Dict[str, Any]] = None,
+    table_schema: list[tuple[str, str]],
+    statement_params: Optional[dict[str, Any]] = None,
 ) -> str:
     """Creates a single table for registry and returns the fully qualified name of the table.
 
@@ -55,7 +55,7 @@ def create_single_table(
     return fully_qualified_table_name
 
 
-def insert_table_entry(session: snowpark.Session, table: str, columns: Dict[str, Any]) -> List[snowpark.Row]:
+def insert_table_entry(session: snowpark.Session, table: str, columns: dict[str, Any]) -> list[snowpark.Row]:
     """Insert an entry into an internal Model Registry table.
 
     Args:
@@ -99,9 +99,9 @@ def validate_table_exist(session: snowpark.Session, table: str, qualified_schema
     return len(tables) == 1
 
 
-def get_table_schema(session: snowpark.Session, table_name: str, qualified_schema_name: str) -> Dict[str, str]:
+def get_table_schema(session: snowpark.Session, table_name: str, qualified_schema_name: str) -> dict[str, str]:
     result = session.sql(f"DESC TABLE {qualified_schema_name}.{table_name}").collect()
-    schema_dict: Dict[str, str] = {}
+    schema_dict: dict[str, str] = {}
     for row in result:
         schema_dict[row["name"]] = row["type"]
     return schema_dict
@@ -112,13 +112,13 @@ def get_table_schema_types(
     database: str,
     schema: str,
     table_name: str,
-) -> Dict[str, types.DataType]:
+) -> dict[str, types.DataType]:
     fully_qualified_table_name = identifier.get_schema_level_object_identifier(
         db=database, schema=schema, object_name=table_name
     )
-    struct_fields: List[types.StructField] = session.table(fully_qualified_table_name).schema.fields
+    struct_fields: list[types.StructField] = session.table(fully_qualified_table_name).schema.fields
 
-    schema_dict: Dict[str, types.DataType] = {}
+    schema_dict: dict[str, types.DataType] = {}
     for field in struct_fields:
         schema_dict[field.name] = field.datatype
     return schema_dict
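
Note: most of the small per-file changes in this release follow the pattern visible above, replacing the deprecated typing aliases (Dict, List, Tuple, Type) with the PEP 585 built-in generics (dict, list, tuple, type), which are valid in annotations on Python 3.9 and later. A minimal sketch of the annotation style the package moves to; the helper below is illustrative, not part of the package:

from typing import Any

# 1.8.1 style: Dict[str, str], List[Tuple[str, str]]
# 1.8.3 style (PEP 585): dict[str, str], list[tuple[str, str]]
def describe_columns(rows: list[dict[str, Any]]) -> dict[str, str]:
    """Collapse DESC TABLE-style rows into a column-name -> type mapping."""
    return {row["name"]: row["type"] for row in rows}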

snowflake/ml/data/_internal/arrow_ingestor.py +7 -7

@@ -2,7 +2,7 @@ import collections
 import logging
 import os
 import time
-from typing import Any, Deque, Dict, Iterator, List, Optional, Sequence, Union
+from typing import Any, Deque, Iterator, Optional, Sequence, Union
 
 import numpy as np
 import numpy.typing as npt
@@ -71,7 +71,7 @@ class ArrowIngestor(data_ingestor.DataIngestor):
         return cls(session, sources)
 
     @property
-    def data_sources(self) -> List[data_source.DataSource]:
+    def data_sources(self) -> list[data_source.DataSource]:
         return self._data_sources
 
     def to_batches(
@@ -79,7 +79,7 @@ class ArrowIngestor(data_ingestor.DataIngestor):
         batch_size: int,
         shuffle: bool = True,
         drop_last_batch: bool = True,
-    ) -> Iterator[Dict[str, npt.NDArray[Any]]]:
+    ) -> Iterator[dict[str, npt.NDArray[Any]]]:
         """Iterate through PyArrow Dataset to generate batches whose length equals to expected batch size.
 
         As we are generating batches with the exactly same length, the last few rows in each file might get left as they
@@ -120,7 +120,7 @@ class ArrowIngestor(data_ingestor.DataIngestor):
 
     def _get_dataset(self, shuffle: bool) -> pds.Dataset:
         format = self._format
-        sources: List[Any] = []
+        sources: list[Any] = []
         source_format = None
         for source in self._data_sources:
             if isinstance(source, str):
@@ -155,7 +155,7 @@ class ArrowIngestor(data_ingestor.DataIngestor):
         pa_dataset: pds.Dataset = pds.dataset(sources, format=format, **self._kwargs)
         return pa_dataset
 
-    def _get_batches_from_buffer(self, batch_size: int) -> Dict[str, npt.NDArray[Any]]:
+    def _get_batches_from_buffer(self, batch_size: int) -> dict[str, npt.NDArray[Any]]:
         """Generate new batches from the existing record batch buffer."""
         cnt_rbs_num_rows = 0
         candidates = []
@@ -180,7 +180,7 @@ class ArrowIngestor(data_ingestor.DataIngestor):
         return _record_batch_to_arrays(res)
 
 
-def _merge_record_batches(record_batches: List[pa.RecordBatch]) -> pa.RecordBatch:
+def _merge_record_batches(record_batches: list[pa.RecordBatch]) -> pa.RecordBatch:
     """Merge a list of arrow RecordBatches into one. Similar to MergeTables."""
     if not record_batches:
         return _EMPTY_RECORD_BATCH
@@ -192,7 +192,7 @@ def _merge_record_batches(record_batches: List[pa.RecordBatch]) -> pa.RecordBatc
     return batches[0]
 
 
-def _record_batch_to_arrays(rb: pa.RecordBatch) -> Dict[str, npt.NDArray[Any]]:
+def _record_batch_to_arrays(rb: pa.RecordBatch) -> dict[str, npt.NDArray[Any]]:
     """Transform the record batch to a (string, numpy array) dict."""
     batch_dict = {}
     for column, column_schema in zip(rb, rb.schema):

snowflake/ml/data/data_connector.py +40 -36

@@ -1,32 +1,18 @@
 import os
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Dict,
-    Generator,
-    List,
-    Optional,
-    Sequence,
-    Type,
-    TypeVar,
-    cast,
-)
+from typing import TYPE_CHECKING, Any, Generator, Optional, Sequence, TypeVar
 
 import numpy.typing as npt
 from typing_extensions import deprecated
 
 from snowflake import snowpark
-from snowflake.ml._internal import telemetry
+from snowflake.ml._internal import env, telemetry
 from snowflake.ml.data import data_ingestor, data_source
 from snowflake.ml.data._internal.arrow_ingestor import ArrowIngestor
-from snowflake.ml.modeling._internal.constants import (
-    IN_ML_RUNTIME_ENV_VAR,
-    USE_OPTIMIZED_DATA_INGESTOR,
-)
 from snowflake.snowpark import context as sf_context
 
 if TYPE_CHECKING:
     import pandas as pd
+    import ray
     import tensorflow as tf
     from torch.utils import data as torch_data
 
@@ -42,7 +28,7 @@ DataConnectorType = TypeVar("DataConnectorType", bound="DataConnector")
 class DataConnector:
     """Snowflake data reader which provides application integration connectors"""
 
-    DEFAULT_INGESTOR_CLASS: Type[data_ingestor.DataIngestor] = ArrowIngestor
+    DEFAULT_INGESTOR_CLASS: type[data_ingestor.DataIngestor] = ArrowIngestor
 
     def __init__(
         self,
@@ -53,27 +39,22 @@ class DataConnector:
         self._kwargs = kwargs
 
     @classmethod
-    @snowpark._internal.utils.private_preview(version="1.6.0")
     def from_dataframe(
-        cls: Type[DataConnectorType],
+        cls: type[DataConnectorType],
         df: snowpark.DataFrame,
-        ingestor_class: Optional[Type[data_ingestor.DataIngestor]] = None,
+        ingestor_class: Optional[type[data_ingestor.DataIngestor]] = None,
         **kwargs: Any,
     ) -> DataConnectorType:
         if len(df.queries["queries"]) != 1 or len(df.queries["post_actions"]) != 0:
             raise ValueError("DataFrames with multiple queries and/or post-actions not supported")
-        return cast(
-            DataConnectorType,
-            cls.from_sql(df.queries["queries"][0], session=df._session, ingestor_class=ingestor_class, **kwargs),
-        )
+        return cls.from_sql(df.queries["queries"][0], session=df._session, ingestor_class=ingestor_class, **kwargs)
 
     @classmethod
-    @snowpark._internal.utils.private_preview(version="1.7.3")
     def from_sql(
-        cls: Type[DataConnectorType],
+        cls: type[DataConnectorType],
         query: str,
         session: Optional[snowpark.Session] = None,
-        ingestor_class: Optional[Type[data_ingestor.DataIngestor]] = None,
+        ingestor_class: Optional[type[data_ingestor.DataIngestor]] = None,
         **kwargs: Any,
     ) -> DataConnectorType:
         session = session or sf_context.get_active_session()
@@ -82,9 +63,9 @@ class DataConnector:
 
     @classmethod
     def from_dataset(
-        cls: Type[DataConnectorType],
+        cls: type[DataConnectorType],
         ds: "dataset.Dataset",
-        ingestor_class: Optional[Type[data_ingestor.DataIngestor]] = None,
+        ingestor_class: Optional[type[data_ingestor.DataIngestor]] = None,
         **kwargs: Any,
     ) -> DataConnectorType:
         dsv = ds.selected_version
@@ -101,10 +82,10 @@ class DataConnector:
         func_params_to_log=["sources", "ingestor_class"],
     )
     def from_sources(
-        cls: Type[DataConnectorType],
+        cls: type[DataConnectorType],
         session: snowpark.Session,
         sources: Sequence[data_source.DataSource],
-        ingestor_class: Optional[Type[data_ingestor.DataIngestor]] = None,
+        ingestor_class: Optional[type[data_ingestor.DataIngestor]] = None,
         **kwargs: Any,
     ) -> DataConnectorType:
         ingestor_class = ingestor_class or cls.DEFAULT_INGESTOR_CLASS
@@ -112,7 +93,7 @@ class DataConnector:
         return cls(ingestor, **kwargs)
 
     @property
-    def data_sources(self) -> List[data_source.DataSource]:
+    def data_sources(self) -> list[data_source.DataSource]:
         return self._ingestor.data_sources
 
     @telemetry.send_api_usage_telemetry(
@@ -138,7 +119,7 @@ class DataConnector:
         """
         import tensorflow as tf
 
-        def generator() -> Generator[Dict[str, npt.NDArray[Any]], None, None]:
+        def generator() -> Generator[dict[str, npt.NDArray[Any]], None, None]:
             yield from self._ingestor.to_batches(batch_size, shuffle, drop_last_batch)
 
         # Derive TensorFlow signature
@@ -241,14 +222,37 @@ class DataConnector:
         """
         return self._ingestor.to_pandas(limit)
 
+    @telemetry.send_api_usage_telemetry(
+        project=_PROJECT,
+        subproject_extractor=lambda self: type(self).__name__,
+        func_params_to_log=["limit"],
+    )
+    def to_ray_dataset(self) -> "ray.data.Dataset":
+        """Retrieve the Snowflake data as a Ray Dataset.
+
+        Returns:
+            A Ray Dataset.
+
+        Raises:
+            ImportError: If Ray is not installed in the local environment.
+        """
+        if hasattr(self._ingestor, "to_ray_dataset"):
+            return self._ingestor.to_ray_dataset()
+
+        try:
+            import ray
+
+            return ray.data.from_pandas(self._ingestor.to_pandas())
+        except ImportError as e:
+            raise ImportError("Ray is not installed, please install ray in your local environment.") from e
+
 
 # Switch to use Runtime's Data Ingester if running in ML runtime
 # Fail silently if the data ingester is not found
-if os.getenv(IN_ML_RUNTIME_ENV_VAR) and os.getenv(USE_OPTIMIZED_DATA_INGESTOR):
+if env.IN_ML_RUNTIME and os.getenv(env.USE_OPTIMIZED_DATA_INGESTOR):
     try:
         from runtime_external_entities import get_ingester_class
 
        DataConnector.DEFAULT_INGESTOR_CLASS = get_ingester_class()
     except ImportError:
         """Runtime Default Ingester not found, ignore"""
-        pass
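
Note: to_ray_dataset() is new in this version; it delegates to the ingestor when a native implementation exists and otherwise falls back to ray.data.from_pandas on the pandas conversion. A rough usage sketch, assuming an already-configured Snowflake connection, a placeholder table name, and Ray installed locally:

from snowflake.ml.data.data_connector import DataConnector
from snowflake.snowpark import Session

session = Session.builder.getOrCreate()  # assumes a default Snowflake connection is configured
df = session.table("MY_DB.MY_SCHEMA.MY_TRAINING_TABLE")  # placeholder table name

connector = DataConnector.from_dataframe(df)
ray_ds = connector.to_ray_dataset()  # ray.data.Dataset; raises ImportError if Ray is missing
print(ray_ds.take(5))  # peek at a few rows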

snowflake/ml/data/data_ingestor.py +4 -15

@@ -1,15 +1,4 @@
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Dict,
-    Iterator,
-    List,
-    Optional,
-    Protocol,
-    Sequence,
-    Type,
-    TypeVar,
-)
+from typing import TYPE_CHECKING, Any, Iterator, Optional, Protocol, Sequence, TypeVar
 
 from numpy import typing as npt
 
@@ -26,12 +15,12 @@ DataIngestorType = TypeVar("DataIngestorType", bound="DataIngestor")
 class DataIngestor(Protocol):
     @classmethod
     def from_sources(
-        cls: Type[DataIngestorType], session: snowpark.Session, sources: Sequence[data_source.DataSource]
+        cls: type[DataIngestorType], session: snowpark.Session, sources: Sequence[data_source.DataSource]
     ) -> DataIngestorType:
         raise NotImplementedError
 
     @property
-    def data_sources(self) -> List[data_source.DataSource]:
+    def data_sources(self) -> list[data_source.DataSource]:
         raise NotImplementedError
 
     def to_batches(
@@ -39,7 +28,7 @@ class DataIngestor(Protocol):
         batch_size: int,
         shuffle: bool = True,
         drop_last_batch: bool = True,
-    ) -> Iterator[Dict[str, npt.NDArray[Any]]]:
+    ) -> Iterator[dict[str, npt.NDArray[Any]]]:
         raise NotImplementedError
 
     def to_pandas(self, limit: Optional[int] = None) -> "pd.DataFrame":
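
Note: DataIngestor is a typing Protocol, so alternative ingestors (such as the optimized runtime ingestor swapped in at the bottom of data_connector.py above) only need to expose the same surface. A minimal in-memory sketch of that surface follows; the class is illustrative, not part of the package, and a real from_sources would materialize the sources through the session:

from typing import Any, Iterator, Optional, Sequence

import numpy.typing as npt
import pandas as pd

from snowflake import snowpark
from snowflake.ml.data import data_source


class PandasIngestor:
    """Illustrative DataIngestor that serves batches from an in-memory DataFrame."""

    def __init__(self, df: pd.DataFrame, sources: Sequence[data_source.DataSource]) -> None:
        self._df = df
        self._sources = list(sources)

    @classmethod
    def from_sources(cls, session: snowpark.Session, sources: Sequence[data_source.DataSource]) -> "PandasIngestor":
        # A real implementation would load the source queries/files via the session.
        raise NotImplementedError

    @property
    def data_sources(self) -> list[data_source.DataSource]:
        return self._sources

    def to_batches(
        self, batch_size: int, shuffle: bool = True, drop_last_batch: bool = True
    ) -> Iterator[dict[str, npt.NDArray[Any]]]:
        df = self._df.sample(frac=1) if shuffle else self._df
        stop = len(df) - (len(df) % batch_size) if drop_last_batch else len(df)
        for start in range(0, stop, batch_size):
            chunk = df.iloc[start : start + batch_size]
            yield {col: chunk[col].to_numpy() for col in chunk.columns}

    def to_pandas(self, limit: Optional[int] = None) -> pd.DataFrame:
        return self._df.head(limit) if limit is not None else self._df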

snowflake/ml/data/data_source.py +2 -2

@@ -1,5 +1,5 @@
 import dataclasses
-from typing import List, Optional, Union
+from typing import Optional, Union
 
 
 @dataclasses.dataclass(frozen=True)
@@ -17,7 +17,7 @@ class DatasetInfo:
     fully_qualified_name: str
     version: str
     url: Optional[str] = None
-    exclude_cols: Optional[List[str]] = None
+    exclude_cols: Optional[list[str]] = None
 
 
 DataSource = Union[DataFrameInfo, DatasetInfo, str]

snowflake/ml/data/ingestor_utils.py +3 -3

@@ -1,4 +1,4 @@
-from typing import List, Optional
+from typing import Optional
 
 import fsspec
 import pyarrow as pa
@@ -33,7 +33,7 @@ def _get_dataframe_cursor(session: snowpark.Session, df_info: data_source.DataFr
 
 def get_dataframe_result_batches(
     session: snowpark.Session, df_info: data_source.DataFrameInfo
-) -> List[result_batch.ResultBatch]:
+) -> list[result_batch.ResultBatch]:
     """Retrieve the ResultBatches for a given query"""
     cursor = _get_dataframe_cursor(session, df_info)
     batches = cursor.get_result_batches()
@@ -63,7 +63,7 @@ def get_dataset_filesystem(
 
 def get_dataset_files(
     session: snowpark.Session, ds_info: data_source.DatasetInfo, filesystem: Optional[fsspec.AbstractFileSystem] = None
-) -> List[str]:
+) -> list[str]:
     """Get the list of files in a given Dataset"""
     if filesystem is None:
         filesystem = get_dataset_filesystem(session, ds_info)

snowflake/ml/data/torch_utils.py +5 -5

@@ -1,4 +1,4 @@
-from typing import Any, Dict, Iterator, List, Optional, Union
+from typing import Any, Iterator, Optional, Union
 
 import numpy as np
 import numpy.typing as npt
@@ -7,7 +7,7 @@ import torch.utils.data
 from snowflake.ml.data import data_ingestor
 
 
-class TorchDatasetWrapper(torch.utils.data.IterableDataset[Dict[str, Any]]):
+class TorchDatasetWrapper(torch.utils.data.IterableDataset[dict[str, Any]]):
     """Wrap a DataIngestor into a PyTorch IterableDataset"""
 
     def __init__(
@@ -32,7 +32,7 @@ class TorchDatasetWrapper(torch.utils.data.IterableDataset[Dict[str, Any]]):
         self._squeeze_outputs = squeeze
         self._expand_dims = expand_dims
 
-    def __iter__(self) -> Iterator[Dict[str, Union[npt.NDArray[Any], List[Any]]]]:
+    def __iter__(self) -> Iterator[dict[str, Union[npt.NDArray[Any], list[Any]]]]:
         max_idx = 0
         filter_idx = 0
         worker_info = torch.utils.data.get_worker_info()
@@ -59,7 +59,7 @@ class TorchDatasetWrapper(torch.utils.data.IterableDataset[Dict[str, Any]]):
             counter = 0
 
 
-class TorchDataPipeWrapper(TorchDatasetWrapper, torch.utils.data.IterDataPipe[Dict[str, Any]]):
+class TorchDataPipeWrapper(TorchDatasetWrapper, torch.utils.data.IterDataPipe[dict[str, Any]]):
     """Wrap a DataIngestor into a PyTorch IterDataPipe"""
 
     def __init__(
@@ -77,7 +77,7 @@ class TorchDataPipeWrapper(TorchDatasetWrapper, torch.utils.data.IterDataPipe[Di
 
 def _preprocess_array(
     arr: npt.NDArray[Any], squeeze: bool = False, expand_dims: bool = True
-) -> Union[npt.NDArray[Any], List[np.object_]]:
+) -> Union[npt.NDArray[Any], list[np.object_]]:
     """Preprocesses batch column values."""
     single_dimensional = arr.ndim < 2 and not arr.dtype == np.object_
 

snowflake/ml/dataset/dataset.py +11 -11

@@ -1,7 +1,7 @@
 import json
 import warnings
 from datetime import datetime
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Optional, Union
 
 from snowflake import snowpark
 from snowflake.ml._internal import telemetry
@@ -46,8 +46,8 @@ class DatasetVersion:
         self._version = version
         self._session: snowpark.Session = self._parent._session
 
-        self._properties: Optional[Dict[str, Any]] = None
-        self._raw_metadata: Optional[Dict[str, Any]] = None
+        self._properties: Optional[dict[str, Any]] = None
+        self._raw_metadata: Optional[dict[str, Any]] = None
         self._metadata: Optional[dataset_metadata.DatasetMetadata] = None
 
     @property
@@ -66,14 +66,14 @@ class DatasetVersion:
         return comment
 
     @property
-    def label_cols(self) -> List[str]:
+    def label_cols(self) -> list[str]:
         metadata = self._get_metadata()
         if metadata is None or metadata.label_cols is None:
             return []
         return metadata.label_cols
 
     @property
-    def exclude_cols(self) -> List[str]:
+    def exclude_cols(self) -> list[str]:
         metadata = self._get_metadata()
         if metadata is None or metadata.exclude_cols is None:
             return []
@@ -115,7 +115,7 @@ class DatasetVersion:
         return path
 
     @telemetry.send_api_usage_telemetry(project=_PROJECT)
-    def list_files(self, subdir: Optional[str] = None) -> List[snowpark.Row]:
+    def list_files(self, subdir: Optional[str] = None) -> list[snowpark.Row]:
         """Get the list of remote file paths for the current DatasetVersion."""
         return self._session.sql(f"LIST {self.url()}{subdir or ''}").collect(
             statement_params=_TELEMETRY_STATEMENT_PARAMS
@@ -244,7 +244,7 @@ class Dataset(lineage_node.LineageNode):
             raise
 
     @telemetry.send_api_usage_telemetry(project=_PROJECT)
-    def list_versions(self, detailed: bool = False) -> Union[List[str], List[snowpark.Row]]:
+    def list_versions(self, detailed: bool = False) -> Union[list[str], list[snowpark.Row]]:
         """Return list of versions"""
         versions = self._list_versions()
         versions.sort(key=lambda r: r[_DATASET_VERSION_NAME_COL])
@@ -271,8 +271,8 @@ class Dataset(lineage_node.LineageNode):
         version: str,
         input_dataframe: snowpark.DataFrame,
         shuffle: bool = False,
-        exclude_cols: Optional[List[str]] = None,
-        label_cols: Optional[List[str]] = None,
+        exclude_cols: Optional[list[str]] = None,
+        label_cols: Optional[list[str]] = None,
         properties: Optional[dataset_metadata.DatasetPropertiesType] = None,
         partition_by: Optional[str] = None,
         comment: Optional[str] = None,
@@ -423,7 +423,7 @@ class Dataset(lineage_node.LineageNode):
             statement_params=_TELEMETRY_STATEMENT_PARAMS
         )
 
-    def _list_versions(self, pattern: Optional[str] = None) -> List[snowpark.Row]:
+    def _list_versions(self, pattern: Optional[str] = None) -> list[snowpark.Row]:
         """Return list of versions"""
         try:
             pattern_clause = f" LIKE '{pattern}'" if pattern else ""
@@ -469,7 +469,7 @@ lineage_node.DOMAIN_LINEAGE_REGISTRY["dataset"] = Dataset
 # Utility methods
 
 
-def _get_schema_level_identifier(session: snowpark.Session, dataset_name: str) -> Tuple[str, str, str]:
+def _get_schema_level_identifier(session: snowpark.Session, dataset_name: str) -> tuple[str, str, str]:
     """Resolve a dataset name into a validated schema-level location identifier"""
     db, schema, object_name = identifier.parse_schema_level_object_identifier(dataset_name)
     db = db or session.get_current_database()

snowflake/ml/dataset/dataset_metadata.py +8 -8

@@ -1,7 +1,7 @@
 import dataclasses
 import json
 import typing
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Optional, Union
 
 _PROPERTY_TYPE_KEY = "$proptype$"
 DATASET_SCHEMA_VERSION = "1"
@@ -20,15 +20,15 @@ class FeatureStoreMetadata:
     """
 
     spine_query: str
-    serialized_feature_views: Optional[List[str]] = None
-    compact_feature_views: Optional[List[str]] = None
+    serialized_feature_views: Optional[list[str]] = None
+    compact_feature_views: Optional[list[str]] = None
     spine_timestamp_col: Optional[str] = None
 
    def to_json(self) -> str:
        return json.dumps(dataclasses.asdict(self))

    @classmethod
-    def from_json(cls, input_json: Union[Dict[str, Any], str, bytes]) -> "FeatureStoreMetadata":
+    def from_json(cls, input_json: Union[dict[str, Any], str, bytes]) -> "FeatureStoreMetadata":
        if isinstance(input_json, dict):
            return cls(**input_json)
        return cls(**json.loads(input_json))
@@ -61,8 +61,8 @@ class DatasetMetadata:
 
     source_query: str
     owner: str
-    exclude_cols: Optional[List[str]] = None
-    label_cols: Optional[List[str]] = None
+    exclude_cols: Optional[list[str]] = None
+    label_cols: Optional[list[str]] = None
     properties: Optional[DatasetPropertiesType] = None
     schema_version: str = dataclasses.field(default=DATASET_SCHEMA_VERSION, init=False)
 
@@ -78,11 +78,11 @@ class DatasetMetadata:
         return json.dumps(state_dict)
 
     @classmethod
-    def from_json(cls, input_json: Union[Dict[str, Any], str, bytes]) -> "DatasetMetadata":
+    def from_json(cls, input_json: Union[dict[str, Any], str, bytes]) -> "DatasetMetadata":
         if not input_json:
             raise ValueError("json_str was empty or None")
         try:
-            state_dict: Dict[str, Any] = (
+            state_dict: dict[str, Any] = (
                 input_json if isinstance(input_json, dict) else json.loads(input_json, strict=False)
             )
 

snowflake/ml/dataset/dataset_reader.py +12 -8

@@ -1,10 +1,11 @@
-from typing import Any, List, Optional, Type
+from typing import Any, Optional
 
 from snowflake import snowpark
 from snowflake.ml._internal import telemetry
 from snowflake.ml._internal.lineage import lineage_utils
 from snowflake.ml.data import data_connector, data_ingestor, data_source, ingestor_utils
 from snowflake.ml.fileset import snowfs
+from snowflake.snowpark._internal import utils as snowpark_utils
 
 _PROJECT = "Dataset"
 _SUBPROJECT = "DatasetReader"
@@ -24,21 +25,21 @@ class DatasetReader(data_connector.DataConnector):
 
         self._session: snowpark.Session = snowpark_session
         self._fs: snowfs.SnowFileSystem = ingestor_utils.get_dataset_filesystem(self._session)
-        self._files: Optional[List[str]] = None
+        self._files: Optional[list[str]] = None
 
     @classmethod
     def from_dataframe(
-        cls, df: snowpark.DataFrame, ingestor_class: Optional[Type[data_ingestor.DataIngestor]] = None, **kwargs: Any
+        cls, df: snowpark.DataFrame, ingestor_class: Optional[type[data_ingestor.DataIngestor]] = None, **kwargs: Any
     ) -> "DatasetReader":
         # Block superclass constructor from Snowpark DataFrames
         raise RuntimeError("Creating DatasetReader from DataFrames not supported")
 
-    def _list_files(self) -> List[str]:
+    def _list_files(self) -> list[str]:
         """Private helper function that lists all files in this DatasetVersion and caches the results."""
         if self._files:
             return self._files
 
-        files: List[str] = []
+        files: list[str] = []
         for source in self.data_sources:
             assert isinstance(source, data_source.DatasetInfo)
             files.extend(ingestor_utils.get_dataset_files(self._session, source, filesystem=self._fs))
@@ -48,7 +49,7 @@ class DatasetReader(data_connector.DataConnector):
         return self._files
 
     @telemetry.send_api_usage_telemetry(project=_PROJECT, subproject=_SUBPROJECT)
-    def files(self) -> List[str]:
+    def files(self) -> list[str]:
         """Get the list of remote file paths for the current DatasetVersion.
 
         The file paths follows the snow protocol.
@@ -91,10 +92,13 @@ class DatasetReader(data_connector.DataConnector):
         For example, an OBJECT column may be scanned back as a STRING column.
         """
         file_path_pattern = ".*data_.*[.]parquet"
-        dfs: List[snowpark.DataFrame] = []
+        dfs: list[snowpark.DataFrame] = []
         for source in self.data_sources:
             assert isinstance(source, data_source.DatasetInfo) and source.url is not None
-            df = self._session.read.option("pattern", file_path_pattern).parquet(source.url)
+            stage_reader = self._session.read.option("pattern", file_path_pattern)
+            if "INFER_SCHEMA_OPTIONS" in snowpark_utils.NON_FORMAT_TYPE_OPTIONS:
+                stage_reader = stage_reader.option("INFER_SCHEMA_OPTIONS", {"MAX_FILE_COUNT": 1})
+            df = stage_reader.parquet(source.url)
             if only_feature_cols and source.exclude_cols:
                 df = df.drop(source.exclude_cols)
             dfs.append(df)
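
Note: the INFER_SCHEMA_OPTIONS reader option is only passed when the installed snowpark-python build recognizes it (the NON_FORMAT_TYPE_OPTIONS check above), so the reader keeps working against older Snowpark releases. The same guard, extracted as a standalone sketch with an illustrative helper name:

from snowflake.snowpark import Session
from snowflake.snowpark._internal import utils as snowpark_utils


def parquet_reader(session: Session, pattern: str):
    """Build a stage reader; limit schema inference to one file when the option is supported."""
    reader = session.read.option("pattern", pattern)
    if "INFER_SCHEMA_OPTIONS" in snowpark_utils.NON_FORMAT_TYPE_OPTIONS:
        reader = reader.option("INFER_SCHEMA_OPTIONS", {"MAX_FILE_COUNT": 1})
    return reader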

snowflake/ml/feature_store/__init__.py +1 -1

@@ -4,7 +4,7 @@ from snowflake.ml._internal import init_utils
 
 from .access_manager import setup_feature_store
 
-pkg_dir = os.path.dirname(os.path.abspath(__file__))
+pkg_dir = os.path.dirname(__file__)
 pkg_name = __name__
 exportable_classes = init_utils.fetch_classes_from_modules_in_pkg_dir(pkg_dir=pkg_dir, pkg_name=pkg_name)
 for k, v in exportable_classes.items():

snowflake/ml/feature_store/access_manager.py +7 -7

@@ -1,6 +1,6 @@
 from dataclasses import asdict, dataclass
 from enum import Enum
-from typing import Dict, List, Optional
+from typing import Optional
 from warnings import warn
 
 from snowflake.ml._internal import telemetry
@@ -28,7 +28,7 @@ class _FeatureStoreRole(Enum):
 class _Privilege:
     object_type: str
     object_name: str
-    privileges: List[str]
+    privileges: list[str]
     scope: Optional[str] = None
     optional: bool = False
 
@@ -41,7 +41,7 @@ class _SessionInfo:
 
 
 # Lists of permissions as tuples of (OBJECT_TYPE, [PRIVILEGES, ...])
-_PRE_INIT_PRIVILEGES: Dict[_FeatureStoreRole, List[_Privilege]] = {
+_PRE_INIT_PRIVILEGES: dict[_FeatureStoreRole, list[_Privilege]] = {
     _FeatureStoreRole.PRODUCER: [
         _Privilege("DATABASE", "{database}", ["USAGE"]),
         _Privilege("SCHEMA", "{database}.{schema}", ["USAGE"]),
@@ -78,7 +78,7 @@ _PRE_INIT_PRIVILEGES: Dict[_FeatureStoreRole, List[_Privilege]] = {
     _FeatureStoreRole.NONE: [],
 }
 
-_POST_INIT_PRIVILEGES: Dict[_FeatureStoreRole, List[_Privilege]] = {
+_POST_INIT_PRIVILEGES: dict[_FeatureStoreRole, list[_Privilege]] = {
     _FeatureStoreRole.PRODUCER: [
         _Privilege("TAG", f"{{database}}.{{schema}}.{_FEATURE_VIEW_METADATA_TAG}", ["APPLY"]),
         _Privilege("TAG", f"{{database}}.{{schema}}.{_FEATURE_STORE_OBJECT_TAG}", ["APPLY"]),
@@ -89,7 +89,7 @@ _POST_INIT_PRIVILEGES: Dict[_FeatureStoreRole, List[_Privilege]] = {
 
 
 def _grant_privileges(
-    session: Session, role_name: str, privileges: List[_Privilege], session_info: _SessionInfo
+    session: Session, role_name: str, privileges: list[_Privilege], session_info: _SessionInfo
 ) -> None:
     session_info_dict = asdict(session_info)
     for p in privileges:
@@ -129,7 +129,7 @@ def _grant_privileges(
 def _configure_pre_init_privileges(
     session: Session,
     session_info: _SessionInfo,
-    roles_to_create: Dict[_FeatureStoreRole, str],
+    roles_to_create: dict[_FeatureStoreRole, str],
 ) -> None:
     """
     Configure Feature Store role privileges. Must be run with ACCOUNTADMIN
@@ -172,7 +172,7 @@ def _configure_pre_init_privileges(
 def _configure_post_init_privileges(
     session: Session,
     session_info: _SessionInfo,
-    roles_to_create: Dict[_FeatureStoreRole, str],
+    roles_to_create: dict[_FeatureStoreRole, str],
 ) -> None:
     for role_type, role in roles_to_create.items():
         _grant_privileges(session, role, _POST_INIT_PRIVILEGES[role_type], session_info)