snowflake-ml-python 1.6.0__py3-none-any.whl → 1.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. snowflake/cortex/_complete.py +7 -33
  2. snowflake/ml/_internal/env_utils.py +11 -5
  3. snowflake/ml/_internal/exceptions/modeling_error_messages.py +4 -1
  4. snowflake/ml/_internal/telemetry.py +14 -0
  5. snowflake/ml/_internal/utils/pkg_version_utils.py +8 -22
  6. snowflake/ml/data/_internal/arrow_ingestor.py +66 -10
  7. snowflake/ml/data/data_connector.py +59 -6
  8. snowflake/ml/data/data_ingestor.py +18 -1
  9. snowflake/ml/data/{_internal/ingestor_utils.py → ingestor_utils.py} +5 -1
  10. snowflake/ml/data/torch_dataset.py +33 -0
  11. snowflake/ml/dataset/dataset_metadata.py +3 -1
  12. snowflake/ml/dataset/dataset_reader.py +9 -3
  13. snowflake/ml/feature_store/examples/airline_features/entities.py +16 -0
  14. snowflake/ml/feature_store/examples/airline_features/features/plane_features.py +31 -0
  15. snowflake/ml/feature_store/examples/airline_features/features/weather_features.py +42 -0
  16. snowflake/ml/feature_store/examples/airline_features/source.yaml +7 -0
  17. snowflake/ml/feature_store/examples/citibike_trip_features/features/station_feature.py +10 -4
  18. snowflake/ml/feature_store/examples/citibike_trip_features/features/trip_feature.py +6 -0
  19. snowflake/ml/feature_store/examples/citibike_trip_features/source.yaml +3 -0
  20. snowflake/ml/feature_store/examples/example_helper.py +69 -31
  21. snowflake/ml/feature_store/examples/new_york_taxi_features/entities.py +3 -3
  22. snowflake/ml/feature_store/examples/new_york_taxi_features/features/{dropoff_features.py → location_features.py} +14 -9
  23. snowflake/ml/feature_store/examples/new_york_taxi_features/features/trip_features.py +36 -0
  24. snowflake/ml/feature_store/examples/new_york_taxi_features/source.yaml +5 -1
  25. snowflake/ml/feature_store/examples/source_data/airline.yaml +4 -0
  26. snowflake/ml/feature_store/examples/source_data/citibike_trips.yaml +1 -1
  27. snowflake/ml/feature_store/examples/wine_quality_features/entities.py +3 -3
  28. snowflake/ml/feature_store/examples/wine_quality_features/features/managed_wine_features.py +13 -6
  29. snowflake/ml/feature_store/examples/wine_quality_features/features/static_wine_features.py +8 -5
  30. snowflake/ml/feature_store/examples/wine_quality_features/source.yaml +3 -0
  31. snowflake/ml/feature_store/feature_store.py +59 -24
  32. snowflake/ml/feature_store/feature_view.py +148 -4
  33. snowflake/ml/model/_client/model/model_impl.py +11 -2
  34. snowflake/ml/model/_client/model/model_version_impl.py +171 -20
  35. snowflake/ml/model/_client/ops/model_ops.py +105 -27
  36. snowflake/ml/model/_client/ops/service_ops.py +121 -0
  37. snowflake/ml/model/_client/service/model_deployment_spec.py +95 -0
  38. snowflake/ml/model/_client/service/model_deployment_spec_schema.py +31 -0
  39. snowflake/ml/model/_client/sql/model_version.py +13 -4
  40. snowflake/ml/model/_client/sql/service.py +129 -0
  41. snowflake/ml/model/_model_composer/model_composer.py +3 -0
  42. snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +10 -2
  43. snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +3 -0
  44. snowflake/ml/model/_packager/model_env/model_env.py +7 -2
  45. snowflake/ml/model/_packager/model_handlers/_base.py +29 -12
  46. snowflake/ml/model/_packager/model_handlers/catboost.py +19 -12
  47. snowflake/ml/model/_packager/model_handlers/custom.py +6 -2
  48. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +9 -5
  49. snowflake/ml/model/_packager/model_handlers/lightgbm.py +27 -18
  50. snowflake/ml/model/_packager/model_handlers/llm.py +7 -3
  51. snowflake/ml/model/_packager/model_handlers/mlflow.py +8 -3
  52. snowflake/ml/model/_packager/model_handlers/pytorch.py +8 -3
  53. snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +8 -3
  54. snowflake/ml/model/_packager/model_handlers/sklearn.py +87 -4
  55. snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +7 -2
  56. snowflake/ml/model/_packager/model_handlers/tensorflow.py +9 -4
  57. snowflake/ml/model/_packager/model_handlers/torchscript.py +8 -3
  58. snowflake/ml/model/_packager/model_handlers/xgboost.py +25 -16
  59. snowflake/ml/model/_packager/model_meta/model_meta.py +32 -2
  60. snowflake/ml/model/_packager/model_meta/model_meta_schema.py +19 -0
  61. snowflake/ml/model/_packager/model_packager.py +2 -1
  62. snowflake/ml/model/_packager/model_runtime/model_runtime.py +4 -2
  63. snowflake/ml/model/type_hints.py +1 -3
  64. snowflake/ml/modeling/framework/base.py +28 -19
  65. snowflake/ml/modeling/pipeline/pipeline.py +3 -0
  66. snowflake/ml/registry/_manager/model_manager.py +16 -2
  67. snowflake/ml/utils/sql_client.py +22 -0
  68. snowflake/ml/version.py +1 -1
  69. {snowflake_ml_python-1.6.0.dist-info → snowflake_ml_python-1.6.1.dist-info}/METADATA +35 -2
  70. {snowflake_ml_python-1.6.0.dist-info → snowflake_ml_python-1.6.1.dist-info}/RECORD +73 -62
  71. snowflake/ml/feature_store/examples/new_york_taxi_features/features/pickup_features.py +0 -58
  72. {snowflake_ml_python-1.6.0.dist-info → snowflake_ml_python-1.6.1.dist-info}/LICENSE.txt +0 -0
  73. {snowflake_ml_python-1.6.0.dist-info → snowflake_ml_python-1.6.1.dist-info}/WHEEL +0 -0
  74. {snowflake_ml_python-1.6.0.dist-info → snowflake_ml_python-1.6.1.dist-info}/top_level.txt +0 -0
snowflake/cortex/_complete.py

@@ -90,7 +90,6 @@ def _call_complete_rest(
     prompt: Union[str, List[ConversationMessage]],
     options: Optional[CompleteOptions] = None,
     session: Optional[snowpark.Session] = None,
-    stream: bool = False,
 ) -> requests.Response:
     session = session or context.get_active_session()
     if session is None:
@@ -121,7 +120,7 @@ def _call_complete_rest(

     data = {
         "model": model,
-        "stream": stream,
+        "stream": True,
     }
     if isinstance(prompt, List):
         data["messages"] = prompt
@@ -137,32 +136,15 @@ def _call_complete_rest(
     if "top_p" in options:
         data["top_p"] = options["top_p"]

-    logger.debug(f"making POST request to {url} (model={model}, stream={stream})")
+    logger.debug(f"making POST request to {url} (model={model})")
     return requests.post(
         url,
         json=data,
         headers=headers,
-        stream=stream,
+        stream=True,
     )


-def _process_rest_response(
-    response: requests.Response,
-    stream: bool = False,
-    deadline: Optional[float] = None,
-) -> Union[str, Iterator[str]]:
-    if stream:
-        return _return_stream_response(response, deadline)
-
-    try:
-        content = response.json()["choices"][0]["message"]["content"]
-        assert isinstance(content, str)
-        return content
-    except (KeyError, IndexError, AssertionError) as e:
-        # Unlike the streaming case, errors are not ignored because a message must be returned.
-        raise ResponseParseException("Failed to parse message from response.") from e
-
-
 def _return_stream_response(response: requests.Response, deadline: Optional[float]) -> Iterator[str]:
     client = SSEClient(response)
     for event in client.events():
@@ -243,7 +225,6 @@ def _complete_impl(
     prompt: Union[str, List[ConversationMessage], snowpark.Column],
     options: Optional[CompleteOptions] = None,
     session: Optional[snowpark.Session] = None,
-    use_rest_api_experimental: bool = False,
     stream: bool = False,
     function: str = "snowflake.cortex.complete",
     timeout: Optional[float] = None,
@@ -253,16 +234,14 @@ def _complete_impl(
         raise ValueError('only one of "timeout" and "deadline" must be set')
     if timeout is not None:
         deadline = time.time() + timeout
-    if use_rest_api_experimental:
+    if stream:
         if not isinstance(model, str):
             raise ValueError("in REST mode, 'model' must be a string")
         if not isinstance(prompt, str) and not isinstance(prompt, List):
             raise ValueError("in REST mode, 'prompt' must be a string or a list of ConversationMessage")
-        response = _call_complete_rest(model, prompt, options, session=session, stream=stream, deadline=deadline)
+        response = _call_complete_rest(model, prompt, options, session=session, deadline=deadline)
         assert response.status_code >= 200 and response.status_code < 300
-        return _process_rest_response(response, stream=stream)
-    if stream is True:
-        raise ValueError("streaming can only be enabled in REST mode, set use_rest_api_experimental=True")
+        return _return_stream_response(response, deadline)
     return _complete_sql_impl(function, model, prompt, options, session)


@@ -275,7 +254,6 @@ def Complete(
     *,
     options: Optional[CompleteOptions] = None,
     session: Optional[snowpark.Session] = None,
-    use_rest_api_experimental: bool = False,
     stream: bool = False,
     timeout: Optional[float] = None,
     deadline: Optional[float] = None,
@@ -287,16 +265,13 @@ def Complete(
         prompt: A Column of prompts to send to the LLM.
         options: A instance of snowflake.cortex.CompleteOptions
         session: The snowpark session to use. Will be inferred by context if not specified.
-        use_rest_api_experimental (bool): Toggles between the use of SQL and REST implementation. This feature is
-            experimental and can be removed at any time.
         stream (bool): Enables streaming. When enabled, a generator function is returned that provides the streaming
            output as it is received. Each update is a string containing the new text content since the previous update.
-            The use of streaming requires the experimental use_rest_api_experimental flag to be enabled.
         timeout (float): Timeout in seconds to retry failed REST requests.
         deadline (float): Time in seconds since the epoch (as returned by time.time()) to retry failed REST requests.

     Raises:
-        ValueError: If `stream` is set to True and `use_rest_api_experimental` is set to False.
+        ValueError: incorrect argument.

     Returns:
         A column of string responses.
@@ -307,7 +282,6 @@ def Complete(
         prompt,
         options=options,
         session=session,
-        use_rest_api_experimental=use_rest_api_experimental,
         stream=stream,
         timeout=timeout,
         deadline=deadline,
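
In 1.6.1 the REST code path is selected by stream=True alone, so streaming no longer needs the removed use_rest_api_experimental flag; a minimal usage sketch (the model name is illustrative, and an active Snowpark session is assumed to be inferable from context):

    from snowflake.cortex import Complete

    # stream=True routes the call through the REST endpoint and yields incremental text chunks.
    for delta in Complete("mistral-large", "Explain Snowpark in one sentence.", stream=True):
        print(delta, end="")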
@@ -27,7 +27,6 @@ class CONDA_OS(Enum):
27
27
  NO_ARCH = "noarch"
28
28
 
29
29
 
30
- _SNOWFLAKE_CONDA_CHANNEL_URL = "https://repo.anaconda.com/pkgs/snowflake"
31
30
  _NODEFAULTS = "nodefaults"
32
31
  _SNOWFLAKE_INFO_SCHEMA_PACKAGE_CACHE: Dict[str, List[version.Version]] = {}
33
32
  _SNOWFLAKE_CONDA_PACKAGE_CACHE: Dict[str, List[version.Version]] = {}
@@ -36,6 +35,7 @@ _SUPPORTED_PACKAGE_SPEC_OPS = ["==", ">=", "<=", ">", "<"]
36
35
  DEFAULT_CHANNEL_NAME = ""
37
36
  SNOWML_SPROC_ENV = "IN_SNOWML_SPROC"
38
37
  SNOWPARK_ML_PKG_NAME = "snowflake-ml-python"
38
+ SNOWFLAKE_CONDA_CHANNEL_URL = "https://repo.anaconda.com/pkgs/snowflake"
39
39
 
40
40
 
41
41
  def _validate_pip_requirement_string(req_str: str) -> requirements.Requirement:
@@ -370,7 +370,7 @@ def get_matched_package_versions_in_snowflake_conda_channel(
370
370
 
371
371
  assert not snowpark_utils.is_in_stored_procedure() # type: ignore[no-untyped-call]
372
372
 
373
- url = f"{_SNOWFLAKE_CONDA_CHANNEL_URL}/{conda_os.value}/repodata.json"
373
+ url = f"{SNOWFLAKE_CONDA_CHANNEL_URL}/{conda_os.value}/repodata.json"
374
374
 
375
375
  if req.name not in _SNOWFLAKE_CONDA_PACKAGE_CACHE:
376
376
  try:
@@ -477,6 +477,7 @@ def save_conda_env_file(
477
477
  path: pathlib.Path,
478
478
  conda_chan_deps: DefaultDict[str, List[requirements.Requirement]],
479
479
  python_version: str,
480
+ default_channel_override: str = SNOWFLAKE_CONDA_CHANNEL_URL,
480
481
  ) -> None:
481
482
  """Generate conda.yml file given a dict of dependencies after validation.
482
483
  The channels part of conda.yml file will contains Snowflake Anaconda Channel, nodefaults and all channel names
@@ -489,6 +490,7 @@ def save_conda_env_file(
489
490
  path: Path to the conda.yml file.
490
491
  conda_chan_deps: Dict of conda dependencies after validated.
491
492
  python_version: A string 'major.minor' showing python version relate to model.
493
+ default_channel_override: The default channel to be put in the first place of the channels section.
492
494
  """
493
495
  assert path.suffix in [".yml", ".yaml"], "Conda environment file should have extension of yml or yaml."
494
496
  path.parent.mkdir(parents=True, exist_ok=True)
@@ -499,7 +501,11 @@ def save_conda_env_file(
499
501
  channels = list(dict(sorted(conda_chan_deps.items(), key=lambda item: len(item[1]), reverse=True)).keys())
500
502
  if DEFAULT_CHANNEL_NAME in channels:
501
503
  channels.remove(DEFAULT_CHANNEL_NAME)
502
- env["channels"] = [_SNOWFLAKE_CONDA_CHANNEL_URL] + channels + [_NODEFAULTS]
504
+
505
+ if default_channel_override in channels:
506
+ channels.remove(default_channel_override)
507
+
508
+ env["channels"] = [default_channel_override] + channels + [_NODEFAULTS]
503
509
  env["dependencies"] = [f"python=={python_version}.*"]
504
510
  for chan, reqs in conda_chan_deps.items():
505
511
  env["dependencies"].extend(
@@ -567,8 +573,8 @@ def load_conda_env_file(
567
573
  python_version = None
568
574
 
569
575
  channels = env.get("channels", [])
570
- if _SNOWFLAKE_CONDA_CHANNEL_URL in channels:
571
- channels.remove(_SNOWFLAKE_CONDA_CHANNEL_URL)
576
+ if len(channels) >= 1:
577
+ channels = channels[1:] # Skip the first channel which is the default channel
572
578
  if _NODEFAULTS in channels:
573
579
  channels.remove(_NODEFAULTS)
574
580
 
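The new default_channel_override parameter controls which channel ends up first in the generated conda.yml; a minimal sketch of the internal helper (the dependency and override URL are illustrative assumptions, not part of this diff):

    import collections
    import pathlib

    from packaging import requirements

    from snowflake.ml._internal import env_utils

    deps = collections.defaultdict(list)
    deps[env_utils.DEFAULT_CHANNEL_NAME].append(requirements.Requirement("xgboost==1.7.6"))

    # conda.yml will list the override first, then any named channels, then "nodefaults".
    env_utils.save_conda_env_file(
        pathlib.Path("conda.yml"),
        deps,
        python_version="3.9",
        default_channel_override="https://conda.anaconda.org/conda-forge",
    )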
snowflake/ml/_internal/exceptions/modeling_error_messages.py

@@ -4,7 +4,10 @@ ATTRIBUTE_NOT_SET = (
    "-differences."
 )
 SIZE_MISMATCH = "Size mismatch: {}={}, {}={}."
-INVALID_MODEL_PARAM = "Invalid parameter {} for model {}. Valid parameters: {}."
+INVALID_MODEL_PARAM = (
+    "Invalid parameter {} for model {}. Valid parameters: {}."
+    "Note: Scikit learn params cannot be set until the model has been fit."
+)
 UNSUPPORTED_MODEL_CONVERSION = "Object doesn't support {}. Please use {}."
 INCOMPATIBLE_NEW_SKLEARN_PARAM = "Incompatible scikit-learn version: {} requires scikit-learn>={}. Installed: {}."
 REMOVED_SKLEARN_PARAM = "Incompatible scikit-learn version: {} is removed in scikit-learn>={}. Installed: {}."
snowflake/ml/_internal/telemetry.py

@@ -44,6 +44,20 @@ _Args = ParamSpec("_Args")
 _ReturnValue = TypeVar("_ReturnValue")


+@enum.unique
+class TelemetryProject(enum.Enum):
+    MLOPS = "MLOps"
+    MODELING = "ModelDevelopment"
+    # TODO: Update with remaining projects.
+
+
+@enum.unique
+class TelemetrySubProject(enum.Enum):
+    MONITORING = "Monitoring"
+    REGISTRY = "ModelManagement"
+    # TODO: Update with remaining subprojects.
+
+
 @enum.unique
 class TelemetryField(enum.Enum):
     # constants
snowflake/ml/_internal/utils/pkg_version_utils.py

@@ -26,30 +26,11 @@ def get_valid_pkg_versions_supported_in_snowflake_conda_channel(
     pkg_versions: List[str], session: Session, subproject: Optional[str] = None
 ) -> List[str]:
     if snowpark_utils.is_in_stored_procedure():  # type: ignore[no-untyped-call]
-        return _get_valid_pkg_versions_supported_in_snowflake_conda_channel_sync(pkg_versions, session, subproject)
+        return pkg_versions
     else:
         return _get_valid_pkg_versions_supported_in_snowflake_conda_channel_async(pkg_versions, session, subproject)


-def _get_valid_pkg_versions_supported_in_snowflake_conda_channel_sync(
-    pkg_versions: List[str], session: Session, subproject: Optional[str] = None
-) -> List[str]:
-    for pkg_version in pkg_versions:
-        if pkg_version not in cache:
-            pkg_version_list = _query_pkg_version_supported_in_snowflake_conda_channel(
-                pkg_version=pkg_version, session=session, block=True, subproject=subproject
-            )
-            assert isinstance(pkg_version_list, list)  # keep mypy happy
-            try:
-                cache[pkg_version] = pkg_version_list[0]["VERSION"]
-            except IndexError:
-                cache[pkg_version] = None
-
-    pkg_version_conda_list = _get_conda_packages_and_emit_warnings(pkg_versions)
-
-    return pkg_version_conda_list
-
-
 def _get_valid_pkg_versions_supported_in_snowflake_conda_channel_async(
     pkg_versions: List[str], session: Session, subproject: Optional[str] = None
@@ -60,7 +41,11 @@ def _get_valid_pkg_versions_supported_in_snowflake_conda_channel_async(
         async_job = _query_pkg_version_supported_in_snowflake_conda_channel(
             pkg_version=pkg_version, session=session, block=False, subproject=subproject
         )
-        assert isinstance(async_job, AsyncJob)
+        if isinstance(async_job, list):
+            raise RuntimeError(
+                "Async job was expected, executed query was returned. Please contact Snowflake support."
+            )
+
         pkg_version_async_job_list.append((pkg_version, async_job))

     # Populate the cache.
@@ -143,7 +128,8 @@ def _get_conda_packages_and_emit_warnings(pkg_versions: List[str]) -> List[str]:
         warnings.warn(
             f"Package {', '.join([pkg[0] for pkg in pkg_version_warning_list])} is not supported "
             f"in snowflake conda channel for python runtime "
-            f"{', '.join([pkg[1] for pkg in pkg_version_warning_list])}."
+            f"{', '.join([pkg[1] for pkg in pkg_version_warning_list])}.",
+            stacklevel=1,
         )

     return pkg_version_conda_list
snowflake/ml/data/_internal/arrow_ingestor.py

@@ -2,17 +2,17 @@ import collections
 import logging
 import os
 import time
-from typing import Any, Deque, Dict, Iterator, List, Optional
+from typing import Any, Deque, Dict, Iterator, List, Optional, Union

 import numpy as np
 import numpy.typing as npt
 import pandas as pd
 import pyarrow as pa
-import pyarrow.dataset as ds
+import pyarrow.dataset as pds

 from snowflake import snowpark
-from snowflake.ml.data import data_ingestor, data_source
-from snowflake.ml.data._internal import ingestor_utils
+from snowflake.connector import result_batch
+from snowflake.ml.data import data_ingestor, data_source, ingestor_utils

 _EMPTY_RECORD_BATCH = pa.RecordBatch.from_arrays([], [])

@@ -67,6 +67,10 @@ class ArrowIngestor(data_ingestor.DataIngestor):

         self._schema: Optional[pa.Schema] = None

+    @classmethod
+    def from_sources(cls, session: snowpark.Session, sources: List[data_source.DataSource]) -> "ArrowIngestor":
+        return cls(session, sources)
+
     @property
     def data_sources(self) -> List[data_source.DataSource]:
         return self._data_sources
@@ -115,9 +119,9 @@ class ArrowIngestor(data_ingestor.DataIngestor):
         table = ds.to_table() if limit is None else ds.head(num_rows=limit)
         return table.to_pandas()

-    def _get_dataset(self, shuffle: bool) -> ds.Dataset:
+    def _get_dataset(self, shuffle: bool) -> pds.Dataset:
         format = self._format
-        sources = []
+        sources: List[Any] = []
         source_format = None
         for source in self._data_sources:
             if isinstance(source, str):
@@ -137,8 +141,16 @@ class ArrowIngestor(data_ingestor.DataIngestor):
                 # in-memory (first batch) and file URLs (subsequent batches) and creating a
                 # union dataset.
                 result_batches = ingestor_utils.get_dataframe_result_batches(self._session, source)
-                sources.extend(b.to_arrow() for b in result_batches)
-                source_format = "arrow"
+                sources.extend(
+                    b.to_arrow(self._session.connection)
+                    if isinstance(b, result_batch.ArrowResultBatch)
+                    else b.to_arrow()
+                    for b in result_batches
+                )
+                # HACK: Mitigate typing inconsistencies in Snowpark results
+                if len(sources) > 0:
+                    sources = [_cast_if_needed(s, sources[-1].schema) for s in sources]
+                source_format = None  # Arrow Dataset expects "None" for in-memory datasets
             else:
                 raise RuntimeError(f"Unsupported data source type: {type(source)}")

@@ -150,7 +162,7 @@ class ArrowIngestor(data_ingestor.DataIngestor):
         # Re-shuffle input files on each iteration start
         if shuffle:
             np.random.shuffle(sources)
-        pa_dataset: ds.Dataset = ds.dataset(sources, format=format, **self._kwargs)
+        pa_dataset: pds.Dataset = pds.dataset(sources, format=format, **self._kwargs)
         return pa_dataset

     def _get_batches_from_buffer(self, batch_size: int) -> Dict[str, npt.NDArray[Any]]:
@@ -201,7 +213,7 @@ def _record_batch_to_arrays(rb: pa.RecordBatch) -> Dict[str, npt.NDArray[Any]]:


 def _retryable_batches(
-    dataset: ds.Dataset, batch_size: int, max_retries: int = 3, delay: int = 0
+    dataset: pds.Dataset, batch_size: int, max_retries: int = 3, delay: int = 0
 ) -> Iterator[pa.RecordBatch]:
     """Make the Dataset to_batches retryable."""
     retries = 0
@@ -226,3 +238,47 @@
                 time.sleep(delay)
             else:
                 raise e
+
+
+def _cast_if_needed(
+    batch: Union[pa.Table, pa.RecordBatch], schema: Optional[pa.Schema] = None
+) -> Union[pa.Table, pa.RecordBatch]:
+    """
+    Cast the batch to be compatible with downstream frameworks. Returns original batch if cast is not necessary.
+    Besides casting types to match `schema` (if provided), this function also applies the following casting:
+    - Decimal (fixed-point) types: Convert to float or integer types based on scale and byte length
+
+    Args:
+        batch: The PyArrow batch to cast if needed
+        schema: Optional schema the batch should be casted to match. Note that compatibility type casting takes
+            precedence over the provided schema, e.g. if the schema has decimal types the result will be further
+            cast into integer/float types.
+
+    Returns:
+        The type-casted PyArrow batch, or the original batch if casting was not necessary
+    """
+    schema = schema or batch.schema
+    assert len(batch.schema) == len(schema)
+    fields = []
+    cast_needed = False
+    for field, target in zip(batch.schema, schema):
+        # Need to convert decimal types to supported types. This behavior supersedes target schema data types
+        if pa.types.is_decimal(target.type):
+            byte_length = int(target.metadata.get(b"byteLength", 8))
+            if int(target.metadata.get(b"scale", 0)) > 0:
+                target = target.with_type(pa.float32() if byte_length == 4 else pa.float64())
+            else:
+                if byte_length == 2:
+                    target = target.with_type(pa.int16())
+                elif byte_length == 4:
+                    target = target.with_type(pa.int32())
+                else:  # Cap out at 64-bit
+                    target = target.with_type(pa.int64())
+        if not field.equals(target):
+            cast_needed = True
+            field = target
+        fields.append(field)
+
+    if cast_needed:
+        return batch.cast(pa.schema(fields))
+    return batch
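
The decimal handling above means Snowflake NUMBER columns come out as native integer/float Arrow types rather than decimals; a rough sketch of the effect (the column name and metadata values are illustrative assumptions, and calling the private helper directly is only for demonstration):

    from decimal import Decimal

    import pyarrow as pa

    from snowflake.ml.data._internal.arrow_ingestor import _cast_if_needed

    # Snowflake Arrow result batches tag NUMBER columns with scale/byteLength metadata;
    # scale 0 with a 4-byte length maps to int32 in _cast_if_needed.
    field = pa.field("ID", pa.decimal128(10, 0), metadata={b"scale": b"0", b"byteLength": b"4"})
    table = pa.Table.from_arrays(
        [pa.array([Decimal("1"), Decimal("2")], type=pa.decimal128(10, 0))],
        schema=pa.schema([field]),
    )

    print(_cast_if_needed(table).schema)  # ID: int32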
snowflake/ml/data/data_connector.py

@@ -1,11 +1,12 @@
 from typing import TYPE_CHECKING, Any, Dict, Generator, List, Optional, Type, TypeVar

 import numpy.typing as npt
+from typing_extensions import deprecated

 from snowflake import snowpark
 from snowflake.ml._internal import telemetry
 from snowflake.ml.data import data_ingestor, data_source
-from snowflake.ml.data._internal.arrow_ingestor import ArrowIngestor as DefaultIngestor
+from snowflake.ml.data._internal.arrow_ingestor import ArrowIngestor

 if TYPE_CHECKING:
     import pandas as pd
@@ -24,6 +25,8 @@ DataConnectorType = TypeVar("DataConnectorType", bound="DataConnector")
 class DataConnector:
     """Snowflake data reader which provides application integration connectors"""

+    DEFAULT_INGESTOR_CLASS: Type[data_ingestor.DataIngestor] = ArrowIngestor
+
     def __init__(
         self,
         ingestor: data_ingestor.DataIngestor,
@@ -31,22 +34,48 @@ class DataConnector:
         self._ingestor = ingestor

     @classmethod
-    def from_dataframe(cls: Type[DataConnectorType], df: snowpark.DataFrame, **kwargs: Any) -> DataConnectorType:
+    @snowpark._internal.utils.private_preview(version="1.6.0")
+    def from_dataframe(
+        cls: Type[DataConnectorType],
+        df: snowpark.DataFrame,
+        ingestor_class: Optional[Type[data_ingestor.DataIngestor]] = None,
+        **kwargs: Any
+    ) -> DataConnectorType:
         if len(df.queries["queries"]) != 1 or len(df.queries["post_actions"]) != 0:
             raise ValueError("DataFrames with multiple queries and/or post-actions not supported")
         source = data_source.DataFrameInfo(df.queries["queries"][0])
         assert df._session is not None
-        ingestor = DefaultIngestor(df._session, [source])
-        return cls(ingestor, **kwargs)
+        return cls.from_sources(df._session, [source], ingestor_class=ingestor_class, **kwargs)

     @classmethod
-    def from_dataset(cls: Type[DataConnectorType], ds: "dataset.Dataset", **kwargs: Any) -> DataConnectorType:
+    def from_dataset(
+        cls: Type[DataConnectorType],
+        ds: "dataset.Dataset",
+        ingestor_class: Optional[Type[data_ingestor.DataIngestor]] = None,
+        **kwargs: Any
+    ) -> DataConnectorType:
         dsv = ds.selected_version
         assert dsv is not None
         source = data_source.DatasetInfo(
             ds.fully_qualified_name, dsv.name, dsv.url(), exclude_cols=(dsv.label_cols + dsv.exclude_cols)
         )
-        ingestor = DefaultIngestor(ds._session, [source])
+        return cls.from_sources(ds._session, [source], ingestor_class=ingestor_class, **kwargs)
+
+    @classmethod
+    @telemetry.send_api_usage_telemetry(
+        project=_PROJECT,
+        subproject_extractor=lambda cls: cls.__name__,
+        func_params_to_log=["sources", "ingestor_class"],
+    )
+    def from_sources(
+        cls: Type[DataConnectorType],
+        session: snowpark.Session,
+        sources: List[data_source.DataSource],
+        ingestor_class: Optional[Type[data_ingestor.DataIngestor]] = None,
+        **kwargs: Any
+    ) -> DataConnectorType:
+        ingestor_class = ingestor_class or cls.DEFAULT_INGESTOR_CLASS
+        ingestor = ingestor_class.from_sources(session, sources)
         return cls(ingestor, **kwargs)

     @property
@@ -87,6 +116,9 @@ class DataConnector:

         return tf.data.Dataset.from_generator(generator, output_signature=tf_signature)

+    @deprecated(
+        "to_torch_datapipe() is deprecated and will be removed in a future release. Use to_torch_dataset() instead"
+    )
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
         subproject_extractor=lambda self: type(self).__name__,
@@ -116,6 +148,27 @@ class DataConnector:
             self._ingestor.to_batches(batch_size, shuffle, drop_last_batch)
         )

+    @telemetry.send_api_usage_telemetry(
+        project=_PROJECT,
+        subproject_extractor=lambda self: type(self).__name__,
+        func_params_to_log=["shuffle"],
+    )
+    def to_torch_dataset(self, *, shuffle: bool = False) -> "torch_data.IterableDataset":  # type: ignore[type-arg]
+        """Transform the Snowflake data into a PyTorch Iterable Dataset to be used with a DataLoader.
+
+        Return a PyTorch Dataset which iterates on rows of data.
+
+        Args:
+            shuffle: It specifies whether the data will be shuffled. If True, files will be shuffled, and
+                rows in each file will also be shuffled.
+
+        Returns:
+            A PyTorch Iterable Dataset that yields data.
+        """
+        from snowflake.ml.data import torch_dataset
+
+        return torch_dataset.TorchDataset(self._ingestor, shuffle)
+
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
         subproject_extractor=lambda self: type(self).__name__,
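
DataConnector construction is now funneled through the new from_sources() classmethod, with ingestor_class as an optional hook for swapping out the default ArrowIngestor; a minimal sketch (the query text is an illustrative assumption, and an active Snowpark session named `session` is assumed):

    from snowflake.ml.data import data_source
    from snowflake.ml.data.data_connector import DataConnector

    # Roughly equivalent to DataConnector.from_dataframe(session.sql(query)); ingestor_class
    # defaults to DataConnector.DEFAULT_INGESTOR_CLASS (ArrowIngestor) when omitted.
    dc = DataConnector.from_sources(
        session,
        [data_source.DataFrameInfo('SELECT * FROM "MY_DB"."MY_SCHEMA"."MY_TABLE"')],
    )
    pandas_df = dc.to_pandas()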
snowflake/ml/data/data_ingestor.py

@@ -1,7 +1,18 @@
-from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Protocol, TypeVar
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Dict,
+    Iterator,
+    List,
+    Optional,
+    Protocol,
+    Type,
+    TypeVar,
+)

 from numpy import typing as npt

+from snowflake import snowpark
 from snowflake.ml.data import data_source

 if TYPE_CHECKING:
@@ -12,6 +23,12 @@ DataIngestorType = TypeVar("DataIngestorType", bound="DataIngestor")


 class DataIngestor(Protocol):
+    @classmethod
+    def from_sources(
+        cls: Type[DataIngestorType], session: snowpark.Session, sources: List[data_source.DataSource]
+    ) -> DataIngestorType:
+        raise NotImplementedError
+
     @property
     def data_sources(self) -> List[data_source.DataSource]:
         raise NotImplementedError
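
The DataIngestor protocol now includes a from_sources() constructor, which is what DataConnector.from_sources() invokes on whichever ingestor_class it receives; a rough sketch of a conforming custom ingestor (the subclass name and print statement are illustrative assumptions):

    from typing import List

    from snowflake import snowpark
    from snowflake.ml.data import data_source
    from snowflake.ml.data._internal.arrow_ingestor import ArrowIngestor

    class LoggingArrowIngestor(ArrowIngestor):
        """ArrowIngestor variant that reports which sources it was built from."""

        @classmethod
        def from_sources(
            cls, session: snowpark.Session, sources: List[data_source.DataSource]
        ) -> "LoggingArrowIngestor":
            print(f"building ingestor over {len(sources)} source(s)")
            return cls(session, sources)

    # Plugged in through the new hook, e.g.:
    # DataConnector.from_dataframe(df, ingestor_class=LoggingArrowIngestor)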
snowflake/ml/data/{_internal/ingestor_utils.py → ingestor_utils.py}

@@ -13,6 +13,7 @@ _TARGET_FILE_SIZE = 32 * 2**20  # The max file size for data loading.
 def get_dataframe_result_batches(
     session: snowpark.Session, df_info: data_source.DataFrameInfo
 ) -> List[result_batch.ResultBatch]:
+    """Retrieve the ResultBatches for a given query"""
     cursor = session._conn._cursor

     if df_info.query_id:
@@ -39,6 +40,7 @@
 def get_dataset_filesystem(
     session: snowpark.Session, ds_info: Optional[data_source.DatasetInfo] = None
 ) -> fsspec.AbstractFileSystem:
+    """Get the fsspec filesystem for a given Dataset"""
     # We can't directly load the Dataset to avoid a circular dependency
     # Dataset -> DatasetReader -> DataConnector -> DataIngestor -> (?) ingestor_utils -> Dataset
     # TODO: Automatically pick appropriate fsspec implementation based on protocol in URL
@@ -52,7 +54,9 @@
 def get_dataset_files(
     session: snowpark.Session, ds_info: data_source.DatasetInfo, filesystem: Optional[fsspec.AbstractFileSystem] = None
 ) -> List[str]:
+    """Get the list of files in a given Dataset"""
     if filesystem is None:
         filesystem = get_dataset_filesystem(session, ds_info)
     assert bool(ds_info.url)  # Not null or empty
-    return sorted(filesystem.ls(ds_info.url))
+    files = sorted(filesystem.ls(ds_info.url))
+    return [filesystem.unstrip_protocol(f) for f in files]
snowflake/ml/data/torch_dataset.py

@@ -0,0 +1,33 @@
+from typing import Any, Dict, Iterator
+
+import torch.utils.data
+
+from snowflake.ml.data import data_ingestor
+
+
+class TorchDataset(torch.utils.data.IterableDataset[Dict[str, Any]]):
+    """Implementation of PyTorch IterableDataset"""
+
+    def __init__(self, ingestor: data_ingestor.DataIngestor, shuffle: bool = False) -> None:
+        """Not intended for direct usage. Use DataConnector.to_torch_dataset() instead"""
+        self._ingestor = ingestor
+        self._shuffle = shuffle
+
+    def __iter__(self) -> Iterator[Dict[str, Any]]:
+        max_idx = 0
+        filter_idx = 0
+        worker_info = torch.utils.data.get_worker_info()
+        if worker_info is not None:
+            max_idx = worker_info.num_workers - 1
+            filter_idx = worker_info.id
+
+        counter = 0
+        for batch in self._ingestor.to_batches(batch_size=1, shuffle=self._shuffle, drop_last_batch=False):
+            # Skip indices during multi-process data loading to prevent data duplication
+            if counter == filter_idx:
+                yield {k: v.item() for k, v in batch.items()}
+
+            if counter < max_idx:
+                counter += 1
+            else:
+                counter = 0
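
to_torch_dataset() wraps this class; a minimal usage sketch with a DataLoader (the table name is an illustrative assumption, and an active Snowpark session named `session` is assumed):

    import torch.utils.data

    from snowflake.ml.data.data_connector import DataConnector

    dc = DataConnector.from_dataframe(session.table("MY_DB.MY_SCHEMA.TRAIN"))
    ds = dc.to_torch_dataset(shuffle=True)

    # TorchDataset yields one row at a time as a dict of scalars; batching is done by the
    # DataLoader, and the worker_info bookkeeping above de-duplicates rows when num_workers > 1.
    loader = torch.utils.data.DataLoader(ds, batch_size=32, num_workers=2)
    for batch in loader:
        ...  # batch maps column names to tensors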
snowflake/ml/dataset/dataset_metadata.py

@@ -15,11 +15,13 @@ class FeatureStoreMetadata:
     Properties:
         spine_query: The input query on source table which will be joined with features.
         serialized_feature_views: A list of serialized feature objects in the feature store.
+        compact_feature_views: A compact representation of a FeatureView or FeatureViewSlice.
         spine_timestamp_col: Timestamp column which was used for point-in-time correct feature lookup.
     """

     spine_query: str
-    serialized_feature_views: List[str]
+    serialized_feature_views: Optional[List[str]] = None
+    compact_feature_views: Optional[List[str]] = None
     spine_timestamp_col: Optional[str] = None

     def to_json(self) -> str:
snowflake/ml/dataset/dataset_reader.py

@@ -1,10 +1,9 @@
-from typing import List, Optional
+from typing import Any, List, Optional, Type

 from snowflake import snowpark
 from snowflake.ml._internal import telemetry
 from snowflake.ml._internal.lineage import lineage_utils
-from snowflake.ml.data import data_connector, data_ingestor, data_source
-from snowflake.ml.data._internal import ingestor_utils
+from snowflake.ml.data import data_connector, data_ingestor, data_source, ingestor_utils
 from snowflake.ml.fileset import snowfs

 _PROJECT = "Dataset"
@@ -27,6 +26,13 @@ class DatasetReader(data_connector.DataConnector):
         self._fs: snowfs.SnowFileSystem = ingestor_utils.get_dataset_filesystem(self._session)
         self._files: Optional[List[str]] = None

+    @classmethod
+    def from_dataframe(
+        cls, df: snowpark.DataFrame, ingestor_class: Optional[Type[data_ingestor.DataIngestor]] = None, **kwargs: Any
+    ) -> "DatasetReader":
+        # Block superclass constructor from Snowpark DataFrames
+        raise RuntimeError("Creating DatasetReader from DataFrames not supported")
+
     def _list_files(self) -> List[str]:
         """Private helper function that lists all files in this DatasetVersion and caches the results."""
         if self._files:
snowflake/ml/feature_store/examples/airline_features/entities.py

@@ -0,0 +1,16 @@
+from typing import List
+
+from snowflake.ml.feature_store import Entity
+
+zipcode_entity = Entity(
+    name="AIRPORT_ZIP_CODE",
+    join_keys=["AIRPORT_ZIP_CODE"],
+    desc="Zip code of the airport.",
+)
+
+plane_entity = Entity(name="PLANE_MODEL", join_keys=["PLANE_MODEL"], desc="The model of an airplane.")
+
+
+# This will be invoked by example_helper.py. Do not change function name.
+def get_all_entities() -> List[Entity]:
+    return [zipcode_entity, plane_entity]