snowflake-ml-python 1.6.0__py3-none-any.whl → 1.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/cortex/_complete.py +7 -33
- snowflake/ml/_internal/env_utils.py +11 -5
- snowflake/ml/_internal/exceptions/modeling_error_messages.py +4 -1
- snowflake/ml/_internal/telemetry.py +156 -20
- snowflake/ml/_internal/utils/identifier.py +48 -11
- snowflake/ml/_internal/utils/pkg_version_utils.py +8 -22
- snowflake/ml/_internal/utils/snowflake_env.py +23 -13
- snowflake/ml/_internal/utils/sql_identifier.py +1 -1
- snowflake/ml/_internal/utils/table_manager.py +19 -1
- snowflake/ml/_internal/utils/uri.py +2 -2
- snowflake/ml/data/_internal/arrow_ingestor.py +66 -10
- snowflake/ml/data/data_connector.py +88 -9
- snowflake/ml/data/data_ingestor.py +18 -1
- snowflake/ml/data/{_internal/ingestor_utils.py → ingestor_utils.py} +5 -1
- snowflake/ml/data/torch_utils.py +68 -0
- snowflake/ml/dataset/dataset.py +1 -3
- snowflake/ml/dataset/dataset_metadata.py +3 -1
- snowflake/ml/dataset/dataset_reader.py +9 -3
- snowflake/ml/feature_store/examples/airline_features/entities.py +16 -0
- snowflake/ml/feature_store/examples/airline_features/features/plane_features.py +31 -0
- snowflake/ml/feature_store/examples/airline_features/features/weather_features.py +42 -0
- snowflake/ml/feature_store/examples/airline_features/source.yaml +7 -0
- snowflake/ml/feature_store/examples/citibike_trip_features/features/station_feature.py +10 -4
- snowflake/ml/feature_store/examples/citibike_trip_features/features/trip_feature.py +6 -0
- snowflake/ml/feature_store/examples/citibike_trip_features/source.yaml +3 -0
- snowflake/ml/feature_store/examples/example_helper.py +69 -31
- snowflake/ml/feature_store/examples/new_york_taxi_features/entities.py +3 -3
- snowflake/ml/feature_store/examples/new_york_taxi_features/features/{dropoff_features.py → location_features.py} +14 -9
- snowflake/ml/feature_store/examples/new_york_taxi_features/features/trip_features.py +36 -0
- snowflake/ml/feature_store/examples/new_york_taxi_features/source.yaml +5 -1
- snowflake/ml/feature_store/examples/source_data/airline.yaml +4 -0
- snowflake/ml/feature_store/examples/source_data/citibike_trips.yaml +1 -1
- snowflake/ml/feature_store/examples/wine_quality_features/entities.py +3 -3
- snowflake/ml/feature_store/examples/wine_quality_features/features/managed_wine_features.py +13 -6
- snowflake/ml/feature_store/examples/wine_quality_features/features/static_wine_features.py +8 -5
- snowflake/ml/feature_store/examples/wine_quality_features/source.yaml +3 -0
- snowflake/ml/feature_store/feature_store.py +100 -41
- snowflake/ml/feature_store/feature_view.py +149 -5
- snowflake/ml/fileset/embedded_stage_fs.py +1 -1
- snowflake/ml/fileset/fileset.py +1 -1
- snowflake/ml/fileset/sfcfs.py +9 -3
- snowflake/ml/model/_client/model/model_impl.py +11 -2
- snowflake/ml/model/_client/model/model_version_impl.py +186 -20
- snowflake/ml/model/_client/ops/model_ops.py +144 -30
- snowflake/ml/model/_client/ops/service_ops.py +312 -0
- snowflake/ml/model/_client/service/model_deployment_spec.py +94 -0
- snowflake/ml/model/_client/service/model_deployment_spec_schema.py +30 -0
- snowflake/ml/model/_client/sql/model_version.py +13 -4
- snowflake/ml/model/_client/sql/service.py +196 -0
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +1 -1
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +3 -3
- snowflake/ml/model/_model_composer/model_composer.py +5 -0
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +13 -10
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +3 -0
- snowflake/ml/model/_packager/model_env/model_env.py +7 -2
- snowflake/ml/model/_packager/model_handlers/_base.py +29 -12
- snowflake/ml/model/_packager/model_handlers/_utils.py +46 -14
- snowflake/ml/model/_packager/model_handlers/catboost.py +25 -16
- snowflake/ml/model/_packager/model_handlers/custom.py +6 -2
- snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +32 -20
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +23 -56
- snowflake/ml/model/_packager/model_handlers/llm.py +11 -5
- snowflake/ml/model/_packager/model_handlers/mlflow.py +8 -3
- snowflake/ml/model/_packager/model_handlers/model_objective_utils.py +116 -0
- snowflake/ml/model/_packager/model_handlers/pytorch.py +8 -3
- snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +8 -3
- snowflake/ml/model/_packager/model_handlers/sklearn.py +99 -4
- snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +123 -5
- snowflake/ml/model/_packager/model_handlers/tensorflow.py +9 -4
- snowflake/ml/model/_packager/model_handlers/torchscript.py +10 -5
- snowflake/ml/model/_packager/model_handlers/xgboost.py +56 -47
- snowflake/ml/model/_packager/model_meta/model_meta.py +35 -2
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +11 -0
- snowflake/ml/model/_packager/model_packager.py +4 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +4 -2
- snowflake/ml/model/_signatures/pytorch_handler.py +1 -1
- snowflake/ml/model/_signatures/utils.py +9 -0
- snowflake/ml/model/models/llm.py +3 -1
- snowflake/ml/model/type_hints.py +10 -4
- snowflake/ml/modeling/_internal/constants.py +1 -0
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +5 -5
- snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +9 -6
- snowflake/ml/modeling/_internal/model_specifications.py +2 -0
- snowflake/ml/modeling/_internal/model_trainer.py +1 -0
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +2 -2
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +113 -160
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +60 -21
- snowflake/ml/modeling/cluster/affinity_propagation.py +60 -21
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +60 -21
- snowflake/ml/modeling/cluster/birch.py +60 -21
- snowflake/ml/modeling/cluster/bisecting_k_means.py +60 -21
- snowflake/ml/modeling/cluster/dbscan.py +60 -21
- snowflake/ml/modeling/cluster/feature_agglomeration.py +60 -21
- snowflake/ml/modeling/cluster/k_means.py +60 -21
- snowflake/ml/modeling/cluster/mean_shift.py +60 -21
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +60 -21
- snowflake/ml/modeling/cluster/optics.py +60 -21
- snowflake/ml/modeling/cluster/spectral_biclustering.py +60 -21
- snowflake/ml/modeling/cluster/spectral_clustering.py +60 -21
- snowflake/ml/modeling/cluster/spectral_coclustering.py +60 -21
- snowflake/ml/modeling/compose/column_transformer.py +60 -21
- snowflake/ml/modeling/compose/transformed_target_regressor.py +60 -21
- snowflake/ml/modeling/covariance/elliptic_envelope.py +60 -21
- snowflake/ml/modeling/covariance/empirical_covariance.py +60 -21
- snowflake/ml/modeling/covariance/graphical_lasso.py +60 -21
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +60 -21
- snowflake/ml/modeling/covariance/ledoit_wolf.py +60 -21
- snowflake/ml/modeling/covariance/min_cov_det.py +60 -21
- snowflake/ml/modeling/covariance/oas.py +60 -21
- snowflake/ml/modeling/covariance/shrunk_covariance.py +60 -21
- snowflake/ml/modeling/decomposition/dictionary_learning.py +60 -21
- snowflake/ml/modeling/decomposition/factor_analysis.py +60 -21
- snowflake/ml/modeling/decomposition/fast_ica.py +60 -21
- snowflake/ml/modeling/decomposition/incremental_pca.py +60 -21
- snowflake/ml/modeling/decomposition/kernel_pca.py +60 -21
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +60 -21
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +60 -21
- snowflake/ml/modeling/decomposition/pca.py +60 -21
- snowflake/ml/modeling/decomposition/sparse_pca.py +60 -21
- snowflake/ml/modeling/decomposition/truncated_svd.py +60 -21
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +60 -21
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +60 -21
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/bagging_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/bagging_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/isolation_forest.py +60 -21
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/stacking_regressor.py +60 -21
- snowflake/ml/modeling/ensemble/voting_classifier.py +60 -21
- snowflake/ml/modeling/ensemble/voting_regressor.py +60 -21
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +60 -21
- snowflake/ml/modeling/feature_selection/select_fdr.py +60 -21
- snowflake/ml/modeling/feature_selection/select_fpr.py +60 -21
- snowflake/ml/modeling/feature_selection/select_fwe.py +60 -21
- snowflake/ml/modeling/feature_selection/select_k_best.py +60 -21
- snowflake/ml/modeling/feature_selection/select_percentile.py +60 -21
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +60 -21
- snowflake/ml/modeling/feature_selection/variance_threshold.py +60 -21
- snowflake/ml/modeling/framework/base.py +28 -19
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +60 -21
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +60 -21
- snowflake/ml/modeling/impute/iterative_imputer.py +60 -21
- snowflake/ml/modeling/impute/knn_imputer.py +60 -21
- snowflake/ml/modeling/impute/missing_indicator.py +60 -21
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +60 -21
- snowflake/ml/modeling/kernel_approximation/nystroem.py +60 -21
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +60 -21
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +60 -21
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +60 -21
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +60 -21
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +60 -21
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/ard_regression.py +60 -21
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +60 -21
- snowflake/ml/modeling/linear_model/elastic_net.py +60 -21
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +60 -21
- snowflake/ml/modeling/linear_model/gamma_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/huber_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/lars.py +60 -21
- snowflake/ml/modeling/linear_model/lars_cv.py +60 -21
- snowflake/ml/modeling/linear_model/lasso.py +60 -21
- snowflake/ml/modeling/linear_model/lasso_cv.py +60 -21
- snowflake/ml/modeling/linear_model/lasso_lars.py +60 -21
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +60 -21
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +60 -21
- snowflake/ml/modeling/linear_model/linear_regression.py +60 -21
- snowflake/ml/modeling/linear_model/logistic_regression.py +60 -21
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +60 -21
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +60 -21
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +60 -21
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +60 -21
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +60 -21
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +60 -21
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +60 -21
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/perceptron.py +60 -21
- snowflake/ml/modeling/linear_model/poisson_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/ransac_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/ridge.py +60 -21
- snowflake/ml/modeling/linear_model/ridge_classifier.py +60 -21
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +60 -21
- snowflake/ml/modeling/linear_model/ridge_cv.py +60 -21
- snowflake/ml/modeling/linear_model/sgd_classifier.py +60 -21
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +60 -21
- snowflake/ml/modeling/linear_model/sgd_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +60 -21
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +60 -21
- snowflake/ml/modeling/manifold/isomap.py +60 -21
- snowflake/ml/modeling/manifold/mds.py +60 -21
- snowflake/ml/modeling/manifold/spectral_embedding.py +60 -21
- snowflake/ml/modeling/manifold/tsne.py +60 -21
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +60 -21
- snowflake/ml/modeling/mixture/gaussian_mixture.py +60 -21
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +60 -21
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +60 -21
- snowflake/ml/modeling/multiclass/output_code_classifier.py +60 -21
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +60 -21
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +60 -21
- snowflake/ml/modeling/naive_bayes/complement_nb.py +60 -21
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +60 -21
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +60 -21
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +60 -21
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +60 -21
- snowflake/ml/modeling/neighbors/kernel_density.py +60 -21
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +60 -21
- snowflake/ml/modeling/neighbors/nearest_centroid.py +60 -21
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +60 -21
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +60 -21
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +60 -21
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +60 -21
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +60 -21
- snowflake/ml/modeling/neural_network/mlp_classifier.py +60 -21
- snowflake/ml/modeling/neural_network/mlp_regressor.py +60 -21
- snowflake/ml/modeling/parameters/disable_model_tracer.py +5 -0
- snowflake/ml/modeling/pipeline/pipeline.py +4 -12
- snowflake/ml/modeling/preprocessing/polynomial_features.py +60 -21
- snowflake/ml/modeling/semi_supervised/label_propagation.py +60 -21
- snowflake/ml/modeling/semi_supervised/label_spreading.py +60 -21
- snowflake/ml/modeling/svm/linear_svc.py +60 -21
- snowflake/ml/modeling/svm/linear_svr.py +60 -21
- snowflake/ml/modeling/svm/nu_svc.py +60 -21
- snowflake/ml/modeling/svm/nu_svr.py +60 -21
- snowflake/ml/modeling/svm/svc.py +60 -21
- snowflake/ml/modeling/svm/svr.py +60 -21
- snowflake/ml/modeling/tree/decision_tree_classifier.py +60 -21
- snowflake/ml/modeling/tree/decision_tree_regressor.py +60 -21
- snowflake/ml/modeling/tree/extra_tree_classifier.py +60 -21
- snowflake/ml/modeling/tree/extra_tree_regressor.py +60 -21
- snowflake/ml/modeling/xgboost/xgb_classifier.py +63 -23
- snowflake/ml/modeling/xgboost/xgb_regressor.py +63 -23
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +63 -23
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +63 -23
- snowflake/ml/registry/_manager/model_manager.py +20 -2
- snowflake/ml/registry/model_registry.py +1 -1
- snowflake/ml/registry/registry.py +1 -2
- snowflake/ml/utils/sql_client.py +22 -0
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.6.0.dist-info → snowflake_ml_python-1.6.2.dist-info}/METADATA +55 -3
- {snowflake_ml_python-1.6.0.dist-info → snowflake_ml_python-1.6.2.dist-info}/RECORD +251 -238
- {snowflake_ml_python-1.6.0.dist-info → snowflake_ml_python-1.6.2.dist-info}/WHEEL +1 -1
- snowflake/ml/feature_store/examples/new_york_taxi_features/features/pickup_features.py +0 -58
- {snowflake_ml_python-1.6.0.dist-info → snowflake_ml_python-1.6.2.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.6.0.dist-info → snowflake_ml_python-1.6.2.dist-info}/top_level.txt +0 -0
snowflake/ml/_internal/utils/table_manager.py
CHANGED
@@ -1,7 +1,8 @@
 from typing import Any, Dict, List, Optional, Tuple
 
 from snowflake import snowpark
-from snowflake.ml._internal.utils import formatting, query_result_checker
+from snowflake.ml._internal.utils import formatting, identifier, query_result_checker
+from snowflake.snowpark import types
 
 """Table_manager is a set of utils that helps create tables.
 
@@ -104,3 +105,20 @@ def get_table_schema(session: snowpark.Session, table_name: str, qualified_schem
     for row in result:
         schema_dict[row["name"]] = row["type"]
     return schema_dict
+
+
+def get_table_schema_types(
+    session: snowpark.Session,
+    database: str,
+    schema: str,
+    table_name: str,
+) -> Dict[str, types.DataType]:
+    fully_qualified_table_name = identifier.get_schema_level_object_identifier(
+        db=database, schema=schema, object_name=table_name
+    )
+    struct_fields: List[types.StructField] = session.table(fully_qualified_table_name).schema.fields
+
+    schema_dict: Dict[str, types.DataType] = {}
+    for field in struct_fields:
+        schema_dict[field.name] = field.datatype
+    return schema_dict
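The new `get_table_schema_types()` helper returns Snowpark `DataType` objects instead of the raw type strings produced by `get_table_schema()`, so callers can branch on type classes. A minimal usage sketch, assuming an existing `session`; the database/schema/table names are illustrative, not from the diff:

```python
from snowflake.ml._internal.utils import table_manager
from snowflake.snowpark import types

# `session` is assumed to be an existing snowpark.Session
schema_types = table_manager.get_table_schema_types(
    session, database="ML_DB", schema="PUBLIC", table_name="MY_TABLE"
)
for col, dtype in schema_types.items():
    # Each value is a snowflake.snowpark.types.DataType, e.g. DecimalType(38, 0)
    print(col, isinstance(dtype, types.DecimalType))
```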
snowflake/ml/_internal/utils/uri.py
CHANGED
@@ -53,7 +53,7 @@ def get_uri_scheme(uri: str) -> str:
 def get_uri_from_snowflake_stage_path(stage_path: str) -> str:
     """Generates a URI from Snowflake stage path."""
     assert stage_path.startswith("@")
-    (db, schema, stage, path) = identifier.
+    (db, schema, stage, path) = identifier.parse_snowflake_stage_path(
         posixpath.normpath(identifier.remove_prefix(stage_path, "@"))
     )
     return urlunparse(
@@ -70,7 +70,7 @@ def get_uri_from_snowflake_stage_path(stage_path: str) -> str:
 
 def get_stage_and_path(stage_path: str) -> Tuple[str, str]:
     assert stage_path.startswith("@"), f"stage path should start with @, actual: {stage_path}"
-    (db, schema, stage, path) = identifier.
+    (db, schema, stage, path) = identifier.parse_snowflake_stage_path(
         posixpath.normpath(identifier.remove_prefix(stage_path, "@"))
     )
     full_qualified_stage = "@" + identifier.get_schema_level_object_identifier(db, schema, stage)
snowflake/ml/data/_internal/arrow_ingestor.py
CHANGED
@@ -2,17 +2,17 @@ import collections
 import logging
 import os
 import time
-from typing import Any, Deque, Dict, Iterator, List, Optional
+from typing import Any, Deque, Dict, Iterator, List, Optional, Union
 
 import numpy as np
 import numpy.typing as npt
 import pandas as pd
 import pyarrow as pa
-import pyarrow.dataset as
+import pyarrow.dataset as pds
 
 from snowflake import snowpark
-from snowflake.
-from snowflake.ml.data
+from snowflake.connector import result_batch
+from snowflake.ml.data import data_ingestor, data_source, ingestor_utils
 
 _EMPTY_RECORD_BATCH = pa.RecordBatch.from_arrays([], [])
 
@@ -67,6 +67,10 @@ class ArrowIngestor(data_ingestor.DataIngestor):
 
         self._schema: Optional[pa.Schema] = None
 
+    @classmethod
+    def from_sources(cls, session: snowpark.Session, sources: List[data_source.DataSource]) -> "ArrowIngestor":
+        return cls(session, sources)
+
     @property
     def data_sources(self) -> List[data_source.DataSource]:
         return self._data_sources
@@ -115,9 +119,9 @@ class ArrowIngestor(data_ingestor.DataIngestor):
         table = ds.to_table() if limit is None else ds.head(num_rows=limit)
         return table.to_pandas()
 
-    def _get_dataset(self, shuffle: bool) ->
+    def _get_dataset(self, shuffle: bool) -> pds.Dataset:
         format = self._format
-        sources = []
+        sources: List[Any] = []
         source_format = None
         for source in self._data_sources:
             if isinstance(source, str):
@@ -137,8 +141,16 @@ class ArrowIngestor(data_ingestor.DataIngestor):
                 # in-memory (first batch) and file URLs (subsequent batches) and creating a
                 # union dataset.
                 result_batches = ingestor_utils.get_dataframe_result_batches(self._session, source)
-                sources.extend(
-
+                sources.extend(
+                    b.to_arrow(self._session.connection)
+                    if isinstance(b, result_batch.ArrowResultBatch)
+                    else b.to_arrow()
+                    for b in result_batches
+                )
+                # HACK: Mitigate typing inconsistencies in Snowpark results
+                if len(sources) > 0:
+                    sources = [_cast_if_needed(s, sources[-1].schema) for s in sources]
+                source_format = None  # Arrow Dataset expects "None" for in-memory datasets
             else:
                 raise RuntimeError(f"Unsupported data source type: {type(source)}")
 
@@ -150,7 +162,7 @@ class ArrowIngestor(data_ingestor.DataIngestor):
         # Re-shuffle input files on each iteration start
         if shuffle:
             np.random.shuffle(sources)
-        pa_dataset:
+        pa_dataset: pds.Dataset = pds.dataset(sources, format=format, **self._kwargs)
         return pa_dataset
 
     def _get_batches_from_buffer(self, batch_size: int) -> Dict[str, npt.NDArray[Any]]:
@@ -201,7 +213,7 @@ def _record_batch_to_arrays(rb: pa.RecordBatch) -> Dict[str, npt.NDArray[Any]]:
 
 
 def _retryable_batches(
-    dataset:
+    dataset: pds.Dataset, batch_size: int, max_retries: int = 3, delay: int = 0
 ) -> Iterator[pa.RecordBatch]:
     """Make the Dataset to_batches retryable."""
     retries = 0
@@ -226,3 +238,47 @@ def _retryable_batches(
                 time.sleep(delay)
             else:
                 raise e
+
+
+def _cast_if_needed(
+    batch: Union[pa.Table, pa.RecordBatch], schema: Optional[pa.Schema] = None
+) -> Union[pa.Table, pa.RecordBatch]:
+    """
+    Cast the batch to be compatible with downstream frameworks. Returns original batch if cast is not necessary.
+    Besides casting types to match `schema` (if provided), this function also applies the following casting:
+    - Decimal (fixed-point) types: Convert to float or integer types based on scale and byte length
+
+    Args:
+        batch: The PyArrow batch to cast if needed
+        schema: Optional schema the batch should be casted to match. Note that compatibility type casting takes
+            precedence over the provided schema, e.g. if the schema has decimal types the result will be further
+            cast into integer/float types.
+
+    Returns:
+        The type-casted PyArrow batch, or the original batch if casting was not necessary
+    """
+    schema = schema or batch.schema
+    assert len(batch.schema) == len(schema)
+    fields = []
+    cast_needed = False
+    for field, target in zip(batch.schema, schema):
+        # Need to convert decimal types to supported types. This behavior supersedes target schema data types
+        if pa.types.is_decimal(target.type):
+            byte_length = int(target.metadata.get(b"byteLength", 8))
+            if int(target.metadata.get(b"scale", 0)) > 0:
+                target = target.with_type(pa.float32() if byte_length == 4 else pa.float64())
+            else:
+                if byte_length == 2:
+                    target = target.with_type(pa.int16())
+                elif byte_length == 4:
+                    target = target.with_type(pa.int32())
+                else:  # Cap out at 64-bit
+                    target = target.with_type(pa.int64())
+        if not field.equals(target):
+            cast_needed = True
+            field = target
+        fields.append(field)
+
+    if cast_needed:
+        return batch.cast(pa.schema(fields))
+    return batch
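The new `_cast_if_needed()` helper normalizes Snowflake DECIMAL columns into framework-friendly Arrow types, driven by the `byteLength`/`scale` field metadata that Snowflake attaches to Arrow result batches. A minimal standalone sketch of just that mapping rule; the metadata values are assumptions for illustration, and the real helper also reconciles against a target schema:

```python
from decimal import Decimal

import pyarrow as pa

# A DECIMAL column the way Snowflake result batches deliver it, with
# "byteLength"/"scale" field metadata (values assumed for illustration)
tbl = pa.table({"ID": pa.array([Decimal(1), Decimal(2)], type=pa.decimal128(38, 0))})
field = tbl.schema.field("ID").with_metadata({b"byteLength": b"4", b"scale": b"0"})

byte_length = int(field.metadata[b"byteLength"])
scale = int(field.metadata[b"scale"])
# Rule from the diff: scale > 0 -> float32/float64; scale == 0 -> int16/int32/int64
target_type = (
    (pa.float32() if byte_length == 4 else pa.float64())
    if scale > 0
    else {2: pa.int16(), 4: pa.int32()}.get(byte_length, pa.int64())
)
print(tbl.cast(pa.schema([pa.field("ID", target_type)])).schema)  # ID: int32
```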
snowflake/ml/data/data_connector.py
CHANGED
@@ -1,11 +1,17 @@
+import os
 from typing import TYPE_CHECKING, Any, Dict, Generator, List, Optional, Type, TypeVar
 
 import numpy.typing as npt
+from typing_extensions import deprecated
 
 from snowflake import snowpark
 from snowflake.ml._internal import telemetry
 from snowflake.ml.data import data_ingestor, data_source
-from snowflake.ml.data._internal.arrow_ingestor import ArrowIngestor
+from snowflake.ml.data._internal.arrow_ingestor import ArrowIngestor
+from snowflake.ml.modeling._internal.constants import (
+    IN_ML_RUNTIME_ENV_VAR,
+    USE_OPTIMIZED_DATA_INGESTOR,
+)
 
 if TYPE_CHECKING:
     import pandas as pd
@@ -24,6 +30,8 @@ DataConnectorType = TypeVar("DataConnectorType", bound="DataConnector")
 class DataConnector:
     """Snowflake data reader which provides application integration connectors"""
 
+    DEFAULT_INGESTOR_CLASS: Type[data_ingestor.DataIngestor] = ArrowIngestor
+
     def __init__(
         self,
         ingestor: data_ingestor.DataIngestor,
@@ -31,22 +39,48 @@ class DataConnector:
         self._ingestor = ingestor
 
     @classmethod
-
+    @snowpark._internal.utils.private_preview(version="1.6.0")
+    def from_dataframe(
+        cls: Type[DataConnectorType],
+        df: snowpark.DataFrame,
+        ingestor_class: Optional[Type[data_ingestor.DataIngestor]] = None,
+        **kwargs: Any
+    ) -> DataConnectorType:
         if len(df.queries["queries"]) != 1 or len(df.queries["post_actions"]) != 0:
             raise ValueError("DataFrames with multiple queries and/or post-actions not supported")
         source = data_source.DataFrameInfo(df.queries["queries"][0])
         assert df._session is not None
-
-        return cls(ingestor, **kwargs)
+        return cls.from_sources(df._session, [source], ingestor_class=ingestor_class, **kwargs)
 
     @classmethod
-    def from_dataset(
+    def from_dataset(
+        cls: Type[DataConnectorType],
+        ds: "dataset.Dataset",
+        ingestor_class: Optional[Type[data_ingestor.DataIngestor]] = None,
+        **kwargs: Any
+    ) -> DataConnectorType:
         dsv = ds.selected_version
         assert dsv is not None
         source = data_source.DatasetInfo(
             ds.fully_qualified_name, dsv.name, dsv.url(), exclude_cols=(dsv.label_cols + dsv.exclude_cols)
         )
-
+        return cls.from_sources(ds._session, [source], ingestor_class=ingestor_class, **kwargs)
+
+    @classmethod
+    @telemetry.send_api_usage_telemetry(
+        project=_PROJECT,
+        subproject_extractor=lambda cls: cls.__name__,
+        func_params_to_log=["sources", "ingestor_class"],
+    )
+    def from_sources(
+        cls: Type[DataConnectorType],
+        session: snowpark.Session,
+        sources: List[data_source.DataSource],
+        ingestor_class: Optional[Type[data_ingestor.DataIngestor]] = None,
+        **kwargs: Any
+    ) -> DataConnectorType:
+        ingestor_class = ingestor_class or cls.DEFAULT_INGESTOR_CLASS
+        ingestor = ingestor_class.from_sources(session, sources)
         return cls(ingestor, **kwargs)
 
     @property
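Construction now funnels through a single path: `from_dataframe()` and `from_dataset()` both delegate to the new `from_sources()`, which resolves the ingestor class (`DEFAULT_INGESTOR_CLASS` unless `ingestor_class` is passed). A hedged usage sketch, assuming an existing `session` and a single-query Snowpark DataFrame `df`:

```python
from snowflake.ml.data import data_connector, data_source

# High level: one-query Snowpark DataFrame -> DataConnector
# (from_dataframe is marked private preview in this release)
dc = data_connector.DataConnector.from_dataframe(df)

# Equivalent lower-level form through the new from_sources() entry point
source = data_source.DataFrameInfo(df.queries["queries"][0])
dc2 = data_connector.DataConnector.from_sources(session, [source])
```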
@@ -87,6 +121,9 @@ class DataConnector:
 
         return tf.data.Dataset.from_generator(generator, output_signature=tf_signature)
 
+    @deprecated(
+        "to_torch_datapipe() is deprecated and will be removed in a future release. Use to_torch_dataset() instead"
+    )
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
         subproject_extractor=lambda self: type(self).__name__,
@@ -110,10 +147,40 @@ class DataConnector:
         Returns:
             A Pytorch iterable datapipe that yield data.
         """
-        from
+        from snowflake.ml.data import torch_utils
 
-        return
-            self._ingestor
+        return torch_utils.TorchDataPipeWrapper(
+            self._ingestor, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last_batch
+        )
+
+    @telemetry.send_api_usage_telemetry(
+        project=_PROJECT,
+        subproject_extractor=lambda self: type(self).__name__,
+        func_params_to_log=["batch_size", "shuffle", "drop_last_batch"],
+    )
+    def to_torch_dataset(
+        self, *, batch_size: int = 1, shuffle: bool = False, drop_last_batch: bool = True
+    ) -> "torch_data.IterableDataset":  # type: ignore[type-arg]
+        """Transform the Snowflake data into a PyTorch Iterable Dataset to be used with a DataLoader.
+
+        Return a PyTorch Dataset which iterates on rows of data.
+
+        Args:
+            batch_size: It specifies the size of each data batch which will be yielded in the result dataset.
+                Batching is pushed down to data ingestion level which may be more performant than DataLoader
+                batching.
+            shuffle: It specifies whether the data will be shuffled. If True, files will be shuffled, and
+                rows in each file will also be shuffled.
+            drop_last_batch: Whether the last batch of data should be dropped. If set to be true,
+                then the last batch will get dropped if its size is smaller than the given batch_size.
+
+        Returns:
+            A PyTorch Iterable Dataset that yields data.
+        """
+        from snowflake.ml.data import torch_utils
+
+        return torch_utils.TorchDatasetWrapper(
+            self._ingestor, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last_batch
         )
 
     @telemetry.send_api_usage_telemetry(
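A sketch of consuming the new `to_torch_dataset()` with a `DataLoader`. Because batching is pushed down into the ingestor, a reasonable pattern (an assumption, not prescribed by the diff) is to create the `DataLoader` with `batch_size=None` so it does not re-batch the already-batched dicts; `dc` is a `DataConnector` as in the sketch above:

```python
import torch.utils.data as torch_data

ds = dc.to_torch_dataset(batch_size=32, shuffle=True, drop_last_batch=True)
loader = torch_data.DataLoader(ds, batch_size=None)  # batching already done upstream
for batch in loader:
    # `batch` is a dict mapping column names to 32 values each
    ...
```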
@@ -131,3 +198,15 @@ class DataConnector:
             A Pandas DataFrame.
         """
         return self._ingestor.to_pandas(limit)
+
+
+# Switch to use Runtime's Data Ingester if running in ML runtime
+# Fail silently if the data ingester is not found
+if os.getenv(IN_ML_RUNTIME_ENV_VAR) and os.getenv(USE_OPTIMIZED_DATA_INGESTOR):
+    try:
+        from runtime_external_entities import get_ingester_class
+
+        DataConnector.DEFAULT_INGESTOR_CLASS = get_ingester_class()
+    except ImportError:
+        """Runtime Default Ingester not found, ignore"""
+        pass
snowflake/ml/data/data_ingestor.py
CHANGED
@@ -1,7 +1,18 @@
-from typing import
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Dict,
+    Iterator,
+    List,
+    Optional,
+    Protocol,
+    Type,
+    TypeVar,
+)
 
 from numpy import typing as npt
 
+from snowflake import snowpark
 from snowflake.ml.data import data_source
 
 if TYPE_CHECKING:
@@ -12,6 +23,12 @@ DataIngestorType = TypeVar("DataIngestorType", bound="DataIngestor")
 
 
 class DataIngestor(Protocol):
+    @classmethod
+    def from_sources(
+        cls: Type[DataIngestorType], session: snowpark.Session, sources: List[data_source.DataSource]
+    ) -> DataIngestorType:
+        raise NotImplementedError
+
     @property
     def data_sources(self) -> List[data_source.DataSource]:
         raise NotImplementedError
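With `from_sources()` added to the `DataIngestor` protocol, a custom ingestor becomes pluggable via `DataConnector.from_sources(..., ingestor_class=...)` by structural typing alone. A hedged skeleton; the `to_batches()`/`to_pandas()` signatures below are inferred from how the rest of this diff consumes ingestors, not copied from the protocol definition:

```python
from typing import Any, Dict, Iterator, List, Optional

import numpy.typing as npt
import pandas as pd

from snowflake import snowpark
from snowflake.ml.data import data_source


class MyIngestor:
    """Structurally satisfies the DataIngestor Protocol."""

    def __init__(self, session: snowpark.Session, sources: List[data_source.DataSource]) -> None:
        self._session = session
        self._sources = sources

    @classmethod
    def from_sources(cls, session: snowpark.Session, sources: List[data_source.DataSource]) -> "MyIngestor":
        return cls(session, sources)

    @property
    def data_sources(self) -> List[data_source.DataSource]:
        return self._sources

    def to_batches(
        self, batch_size: int, shuffle: bool = True, drop_last_batch: bool = True
    ) -> Iterator[Dict[str, npt.NDArray[Any]]]:
        raise NotImplementedError  # yield column-name -> ndarray batches here

    def to_pandas(self, limit: Optional[int] = None) -> pd.DataFrame:
        raise NotImplementedError
```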
snowflake/ml/data/{_internal/ingestor_utils.py → ingestor_utils.py}
RENAMED
@@ -13,6 +13,7 @@ _TARGET_FILE_SIZE = 32 * 2**20  # The max file size for data loading.
 def get_dataframe_result_batches(
     session: snowpark.Session, df_info: data_source.DataFrameInfo
 ) -> List[result_batch.ResultBatch]:
+    """Retrieve the ResultBatches for a given query"""
     cursor = session._conn._cursor
 
     if df_info.query_id:
@@ -39,6 +40,7 @@ def get_dataframe_result_batches(
 def get_dataset_filesystem(
     session: snowpark.Session, ds_info: Optional[data_source.DatasetInfo] = None
 ) -> fsspec.AbstractFileSystem:
+    """Get the fsspec filesystem for a given Dataset"""
     # We can't directly load the Dataset to avoid a circular dependency
     # Dataset -> DatasetReader -> DataConnector -> DataIngestor -> (?) ingestor_utils -> Dataset
     # TODO: Automatically pick appropriate fsspec implementation based on protocol in URL
@@ -52,7 +54,9 @@ def get_dataset_filesystem(
 def get_dataset_files(
     session: snowpark.Session, ds_info: data_source.DatasetInfo, filesystem: Optional[fsspec.AbstractFileSystem] = None
 ) -> List[str]:
+    """Get the list of files in a given Dataset"""
     if filesystem is None:
         filesystem = get_dataset_filesystem(session, ds_info)
     assert bool(ds_info.url)  # Not null or empty
-
+    files = sorted(filesystem.ls(ds_info.url))
+    return [filesystem.unstrip_protocol(f) for f in files]
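`get_dataset_files()` now wraps each `ls()` result with fsspec's `unstrip_protocol()`, which re-attaches the filesystem's protocol prefix so downstream consumers receive fully qualified URLs. A small sketch with fsspec's in-memory filesystem (standing in for the Snowflake one) showing the effect:

```python
import fsspec

fs = fsspec.filesystem("memory")
with fs.open("/ds/part-000.parquet", "wb") as f:
    f.write(b"stub")  # placeholder bytes, just to create the file

bare = sorted(fs.ls("/ds", detail=False))      # e.g. ['/ds/part-000.parquet']
full = [fs.unstrip_protocol(p) for p in bare]  # e.g. ['memory:///ds/part-000.parquet']
print(full)
```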
snowflake/ml/data/torch_utils.py
ADDED
@@ -0,0 +1,68 @@
+from typing import Any, Dict, Iterator, List, Union
+
+import numpy as np
+import numpy.typing as npt
+import torch.utils.data
+
+from snowflake.ml.data import data_ingestor
+
+
+class TorchDatasetWrapper(torch.utils.data.IterableDataset[Dict[str, Any]]):
+    """Wrap a DataIngestor into a PyTorch IterableDataset"""
+
+    def __init__(
+        self,
+        ingestor: data_ingestor.DataIngestor,
+        *,
+        batch_size: int,
+        shuffle: bool = False,
+        drop_last: bool = False,
+        squeeze_outputs: bool = True
+    ) -> None:
+        """Not intended for direct usage. Use DataConnector.to_torch_dataset() instead"""
+        self._ingestor = ingestor
+        self._batch_size = batch_size
+        self._shuffle = shuffle
+        self._drop_last = drop_last
+        self._squeeze_outputs = squeeze_outputs
+
+    def __iter__(self) -> Iterator[Dict[str, Union[npt.NDArray[Any], List[Any]]]]:
+        max_idx = 0
+        filter_idx = 0
+        worker_info = torch.utils.data.get_worker_info()
+        if worker_info is not None:
+            max_idx = worker_info.num_workers - 1
+            filter_idx = worker_info.id
+
+        if self._shuffle and worker_info is not None:
+            raise RuntimeError("Dataset shuffling not currently supported with multithreading")
+
+        counter = 0
+        for batch in self._ingestor.to_batches(
+            batch_size=self._batch_size, shuffle=self._shuffle, drop_last_batch=self._drop_last
+        ):
+            # Skip indices during multi-process data loading to prevent data duplication
+            if counter == filter_idx:
+                # Basic preprocessing on batch values: squeeze away extra dimensions
+                # and convert object arrays (e.g. strings) to lists
+                if self._squeeze_outputs:
+                    yield {
+                        k: (v.squeeze().tolist() if v.dtype == np.object_ else v.squeeze()) for k, v in batch.items()
+                    }
+                else:
+                    yield batch  # type: ignore[misc]
+
+            if counter < max_idx:
+                counter += 1
+            else:
+                counter = 0
+
+
+class TorchDataPipeWrapper(TorchDatasetWrapper, torch.utils.data.IterDataPipe[Dict[str, Any]]):
+    """Wrap a DataIngestor into a PyTorch IterDataPipe"""
+
+    def __init__(
+        self, ingestor: data_ingestor.DataIngestor, *, batch_size: int, shuffle: bool = False, drop_last: bool = False
+    ) -> None:
+        """Not intended for direct usage. Use DataConnector.to_torch_datapipe() instead"""
+        super().__init__(ingestor, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, squeeze_outputs=False)
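Under multi-worker loading, `TorchDatasetWrapper.__iter__` assigns batches to workers round-robin: worker `i` keeps every `num_workers`-th batch, which is what prevents duplicate rows across `DataLoader` workers. The counter/`filter_idx` bookkeeping above is equivalent to this sketch:

```python
from typing import Iterable, Iterator


def shard(batches: Iterable[int], worker_id: int, num_workers: int) -> Iterator[int]:
    # Same effect as the counter/filter_idx loop in TorchDatasetWrapper
    for i, batch in enumerate(batches):
        if i % num_workers == worker_id:
            yield batch


print(list(shard(range(10), worker_id=1, num_workers=3)))  # [1, 4, 7]
```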
snowflake/ml/dataset/dataset.py
CHANGED
@@ -472,9 +472,7 @@ lineage_node.DOMAIN_LINEAGE_REGISTRY["dataset"] = Dataset
 
 def _get_schema_level_identifier(session: snowpark.Session, dataset_name: str) -> Tuple[str, str, str]:
     """Resolve a dataset name into a validated schema-level location identifier"""
-    db, schema, object_name
-    if others:
-        raise ValueError(f"Invalid identifier: unexpected '{others}'")
+    db, schema, object_name = identifier.parse_schema_level_object_identifier(dataset_name)
     db = db or session.get_current_database()
     schema = schema or session.get_current_schema()
     return str(db), str(schema), str(object_name)
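The simplification above relies on the updated `identifier.parse_schema_level_object_identifier()`, which now returns the `(db, schema, object)` triple directly instead of a 4-tuple whose trailing `others` component the caller had to reject. A hedged sketch (the identifier value is illustrative):

```python
from snowflake.ml._internal.utils import identifier

db, schema, name = identifier.parse_schema_level_object_identifier("MY_DB.PUBLIC.MY_DATASET")
print(db, schema, name)  # MY_DB PUBLIC MY_DATASET
```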
snowflake/ml/dataset/dataset_metadata.py
CHANGED
@@ -15,11 +15,13 @@ class FeatureStoreMetadata:
     Properties:
         spine_query: The input query on source table which will be joined with features.
         serialized_feature_views: A list of serialized feature objects in the feature store.
+        compact_feature_views: A compact representation of a FeatureView or FeatureViewSlice.
         spine_timestamp_col: Timestamp column which was used for point-in-time correct feature lookup.
     """
 
     spine_query: str
-    serialized_feature_views: List[str]
+    serialized_feature_views: Optional[List[str]] = None
+    compact_feature_views: Optional[List[str]] = None
     spine_timestamp_col: Optional[str] = None
 
     def to_json(self) -> str:
snowflake/ml/dataset/dataset_reader.py
CHANGED
@@ -1,10 +1,9 @@
-from typing import List, Optional
+from typing import Any, List, Optional, Type
 
 from snowflake import snowpark
 from snowflake.ml._internal import telemetry
 from snowflake.ml._internal.lineage import lineage_utils
-from snowflake.ml.data import data_connector, data_ingestor, data_source
-from snowflake.ml.data._internal import ingestor_utils
+from snowflake.ml.data import data_connector, data_ingestor, data_source, ingestor_utils
 from snowflake.ml.fileset import snowfs
 
 _PROJECT = "Dataset"
@@ -27,6 +26,13 @@ class DatasetReader(data_connector.DataConnector):
         self._fs: snowfs.SnowFileSystem = ingestor_utils.get_dataset_filesystem(self._session)
         self._files: Optional[List[str]] = None
 
+    @classmethod
+    def from_dataframe(
+        cls, df: snowpark.DataFrame, ingestor_class: Optional[Type[data_ingestor.DataIngestor]] = None, **kwargs: Any
+    ) -> "DatasetReader":
+        # Block superclass constructor from Snowpark DataFrames
+        raise RuntimeError("Creating DatasetReader from DataFrames not supported")
+
     def _list_files(self) -> List[str]:
         """Private helper function that lists all files in this DatasetVersion and caches the results."""
         if self._files:
snowflake/ml/feature_store/examples/airline_features/entities.py
ADDED
@@ -0,0 +1,16 @@
+from typing import List
+
+from snowflake.ml.feature_store import Entity
+
+zipcode_entity = Entity(
+    name="AIRPORT_ZIP_CODE",
+    join_keys=["AIRPORT_ZIP_CODE"],
+    desc="Zip code of the airport.",
+)
+
+plane_entity = Entity(name="PLANE_MODEL", join_keys=["PLANE_MODEL"], desc="The model of an airplane.")
+
+
+# This will be invoked by example_helper.py. Do not change function name.
+def get_all_entities() -> List[Entity]:
+    return [zipcode_entity, plane_entity]
snowflake/ml/feature_store/examples/airline_features/features/plane_features.py
ADDED
@@ -0,0 +1,31 @@
+from typing import List
+
+from snowflake.ml.feature_store import FeatureView
+from snowflake.ml.feature_store.examples.airline_features.entities import plane_entity
+from snowflake.snowpark import DataFrame, Session
+
+
+# This function will be invoked by example_helper.py. Do not change the name.
+def create_draft_feature_view(session: Session, source_dfs: List[DataFrame], source_tables: List[str]) -> FeatureView:
+    """Create a feature view about airplane model."""
+    query = session.sql(
+        """
+        select
+            PLANE_MODEL,
+            SEATING_CAPACITY
+        from
+            PLANE_MODEL_ATTRIBUTES
+        """
+    )
+
+    return FeatureView(
+        name="f_plane",  # name of feature view
+        entities=[plane_entity],  # entities
+        feature_df=query,  # definition query
+        refresh_freq=None,  # refresh frequency
+        desc="Plane features never refresh.",
+    ).attach_feature_desc(
+        {
+            "SEATING_CAPACITY": "The seating capacity of a plane.",
+        }
+    )
snowflake/ml/feature_store/examples/airline_features/features/weather_features.py
ADDED
@@ -0,0 +1,42 @@
+from typing import List
+
+from snowflake.ml.feature_store import FeatureView
+from snowflake.ml.feature_store.examples.airline_features.entities import zipcode_entity
+from snowflake.snowpark import DataFrame, Session
+
+
+# This function will be invoked by example_helper.py. Do not change the name.
+def create_draft_feature_view(session: Session, source_dfs: List[DataFrame], source_tables: List[str]) -> FeatureView:
+    """Create a feature view about airport weather."""
+    query = session.sql(
+        """
+        select
+            DATETIME_UTC AS TS,
+            AIRPORT_ZIP_CODE,
+            sum(RAIN_MM_H) over (
+                partition by AIRPORT_ZIP_CODE
+                order by DATETIME_UTC
+                range between interval '30 minutes' preceding and current row
+            ) RAIN_SUM_30M,
+            sum(RAIN_MM_H) over (
+                partition by AIRPORT_ZIP_CODE
+                order by DATETIME_UTC
+                range between interval '1 day' preceding and current row
+            ) RAIN_SUM_60M
+        from AIRPORT_WEATHER_STATION
+        """
+    )
+
+    return FeatureView(
+        name="f_weather",  # name of feature view
+        entities=[zipcode_entity],  # entities
+        feature_df=query,  # definition query
+        timestamp_col="TS",  # timestamp column
+        refresh_freq="1d",  # refresh frequency
+        desc="Airport weather features refreshed every day.",
+    ).attach_feature_desc(
+        {
+            "RAIN_SUM_30M": "The sum of rain fall over past 30 minutes for one zipcode.",
+            "RAIN_SUM_60M": "The sum of rain fall over past 1 day for one zipcode.",
+        }
+    )
snowflake/ml/feature_store/examples/citibike_trip_features/features/station_feature.py
CHANGED
@@ -14,18 +14,24 @@ def create_draft_feature_view(session: Session, source_dfs: List[DataFrame], sou
         f"""
         select
             end_station_id,
-            count(end_station_id) as
-            avg(end_station_latitude) as
-            avg(end_station_longitude) as
+            count(end_station_id) as f_count,
+            avg(end_station_latitude) as f_avg_latitude,
+            avg(end_station_longitude) as f_avg_longtitude
        from {source_tables[0]}
         group by end_station_id
         """
     )
 
     return FeatureView(
-        name="
+        name="f_station",  # name of feature view
         entities=[end_station_id],  # entities
         feature_df=query,  # definition query
         refresh_freq="1d",  # refresh frequency. '1d' means it refreshes everyday
         desc="Station features refreshed every day.",
+    ).attach_feature_desc(
+        {
+            "f_count": "How many times this station appears in 1 day.",
+            "f_avg_latitude": "Averaged latitude of a station.",
+            "f_avg_longtitude": "Averaged longtitude of a station.",
+        }
     )
snowflake/ml/feature_store/examples/citibike_trip_features/features/trip_feature.py
CHANGED
@@ -21,4 +21,10 @@ def create_draft_feature_view(session: Session, source_dfs: List[DataFrame], sou
         feature_df=feature_df,  # definition query
         refresh_freq=None,  # refresh frequency. None indicates it never refresh
         desc="Static trip features",
+    ).attach_feature_desc(
+        {
+            "f_birth_year": "The birth year of a trip passenger.",
+            "f_gender": "The gender of a trip passenger.",
+            "f_bikeid": "The bike id of a trip passenger.",
+        }
     )