snowflake-ml-python 1.5.4__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- snowflake/cortex/__init__.py +2 -0
- snowflake/cortex/_classify_text.py +36 -0
- snowflake/cortex/_complete.py +67 -10
- snowflake/cortex/_util.py +4 -4
- snowflake/ml/_internal/lineage/lineage_utils.py +4 -4
- snowflake/ml/_internal/telemetry.py +12 -2
- snowflake/ml/data/_internal/arrow_ingestor.py +228 -0
- snowflake/ml/data/_internal/ingestor_utils.py +58 -0
- snowflake/ml/data/data_connector.py +133 -0
- snowflake/ml/data/data_ingestor.py +28 -0
- snowflake/ml/data/data_source.py +23 -0
- snowflake/ml/dataset/dataset.py +1 -13
- snowflake/ml/dataset/dataset_reader.py +18 -118
- snowflake/ml/feature_store/access_manager.py +7 -1
- snowflake/ml/feature_store/entity.py +19 -2
- snowflake/ml/feature_store/examples/citibike_trip_features/entities.py +20 -0
- snowflake/ml/feature_store/examples/citibike_trip_features/features/station_feature.py +31 -0
- snowflake/ml/feature_store/examples/citibike_trip_features/features/trip_feature.py +24 -0
- snowflake/ml/feature_store/examples/citibike_trip_features/source.yaml +4 -0
- snowflake/ml/feature_store/examples/example_helper.py +240 -0
- snowflake/ml/feature_store/examples/new_york_taxi_features/entities.py +12 -0
- snowflake/ml/feature_store/examples/new_york_taxi_features/features/dropoff_features.py +39 -0
- snowflake/ml/feature_store/examples/new_york_taxi_features/features/pickup_features.py +58 -0
- snowflake/ml/feature_store/examples/new_york_taxi_features/source.yaml +5 -0
- snowflake/ml/feature_store/examples/source_data/citibike_trips.yaml +36 -0
- snowflake/ml/feature_store/examples/source_data/fraud_transactions.yaml +29 -0
- snowflake/ml/feature_store/examples/source_data/nyc_yellow_trips.yaml +4 -0
- snowflake/ml/feature_store/examples/source_data/winequality_red.yaml +32 -0
- snowflake/ml/feature_store/examples/wine_quality_features/entities.py +14 -0
- snowflake/ml/feature_store/examples/wine_quality_features/features/managed_wine_features.py +29 -0
- snowflake/ml/feature_store/examples/wine_quality_features/features/static_wine_features.py +21 -0
- snowflake/ml/feature_store/examples/wine_quality_features/source.yaml +5 -0
- snowflake/ml/feature_store/feature_store.py +579 -53
- snowflake/ml/feature_store/feature_view.py +168 -5
- snowflake/ml/fileset/stage_fs.py +18 -10
- snowflake/ml/lineage/lineage_node.py +1 -1
- snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +2 -3
- snowflake/ml/model/_model_composer/model_composer.py +11 -14
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +24 -16
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +2 -1
- snowflake/ml/model/_model_composer/model_method/function_generator.py +3 -3
- snowflake/ml/model/_model_composer/model_method/infer_function.py_template +3 -32
- snowflake/ml/model/_model_composer/model_method/infer_partitioned.py_template +3 -27
- snowflake/ml/model/_model_composer/model_method/infer_table_function.py_template +3 -32
- snowflake/ml/model/_model_composer/model_method/model_method.py +5 -2
- snowflake/ml/model/_packager/model_handlers/_base.py +11 -1
- snowflake/ml/model/_packager/model_handlers/_utils.py +58 -1
- snowflake/ml/model/_packager/model_handlers/catboost.py +42 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +68 -0
- snowflake/ml/model/_packager/model_handlers/xgboost.py +59 -0
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +3 -5
- snowflake/ml/model/model_signature.py +4 -4
- snowflake/ml/model/type_hints.py +4 -0
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +1 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_search_udf_file.py +13 -1
- snowflake/ml/modeling/impute/simple_imputer.py +26 -0
- snowflake/ml/modeling/pipeline/pipeline.py +4 -4
- snowflake/ml/registry/registry.py +100 -13
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.5.4.dist-info → snowflake_ml_python-1.6.0.dist-info}/METADATA +48 -2
- {snowflake_ml_python-1.5.4.dist-info → snowflake_ml_python-1.6.0.dist-info}/RECORD +64 -42
- {snowflake_ml_python-1.5.4.dist-info → snowflake_ml_python-1.6.0.dist-info}/WHEEL +1 -1
- snowflake/ml/_internal/lineage/data_source.py +0 -10
- {snowflake_ml_python-1.5.4.dist-info → snowflake_ml_python-1.6.0.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.5.4.dist-info → snowflake_ml_python-1.6.0.dist-info}/top_level.txt +0 -0
snowflake/ml/feature_store/examples/example_helper.py (new file)

```diff
@@ -0,0 +1,240 @@
+import importlib
+import logging
+import os
+import re
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+import yaml
+
+from snowflake.ml._internal.utils import identifier, sql_identifier
+from snowflake.ml.feature_store import Entity, FeatureView  # type: ignore[attr-defined]
+from snowflake.snowpark import DataFrame, Session, functions as F
+from snowflake.snowpark.types import TimestampType
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+class ExampleHelper:
+    def __init__(self, session: Session, database_name: str, dataset_schema: str) -> None:
+        """A helper class to run Feature Store examples.
+
+        Args:
+            session: A Snowpark session object.
+            database_name: Database where dataset and Feature Store lives.
+            dataset_schema: Schema where destination dataset table lives.
+        """
+        self._session = session
+        self._database_name = database_name
+        self._dataset_schema = dataset_schema
+        self._selected_example = None
+        self._source_tables: List[str] = []
+        self._source_dfs: List[DataFrame] = []
+        self._excluded_columns: List[sql_identifier.SqlIdentifier] = []
+        self._label_columns: List[sql_identifier.SqlIdentifier] = []
+        self._timestamp_column: Optional[sql_identifier.SqlIdentifier] = None
+        self._epoch_to_timestamp_cols: List[str] = []
+        self._add_id_column: Optional[sql_identifier.SqlIdentifier] = None
+
+    def list_examples(self) -> List[str]:
+        """Return a list of examples."""
+        root_dir = Path(__file__).parent
+        result = []
+        for f_name in os.listdir(root_dir):
+            if os.path.isdir(os.path.join(root_dir, f_name)) and f_name[0].isalpha() and f_name != "source_data":
+                result.append(f_name)
+        return result
+
+    def load_draft_feature_views(self) -> List[FeatureView]:
+        """Return all feature views in an example.
+
+        Returns:
+            A list of FeatureView object.
+        """
+        fvs = []
+        root_dir = Path(__file__).parent.joinpath(f"{self._selected_example}/features")
+        for f_name in os.listdir(root_dir):
+            if not f_name[0].isalpha():
+                # skip folders like __pycache__
+                continue
+            mod_path = f"{__package__}.{self._selected_example}.features.{f_name.rstrip('.py')}"
+            mod = importlib.import_module(mod_path)
+            fv = mod.create_draft_feature_view(self._session, self._source_dfs, self._source_tables)
+            fvs.append(fv)
+
+        return fvs
+
+    def load_entities(self) -> List[Entity]:
+        """Return all entities in an example.
+
+        Returns:
+            A list of Entity object.
+        """
+        current_module = f"{__package__}.{self._selected_example}.entities"
+        mod = importlib.import_module(current_module)
+        return mod.get_all_entities()  # type: ignore[no-any-return]
+
+    def _read_yaml(self, file_path: str) -> Any:
+        with open(file_path) as fs:
+            return yaml.safe_load(fs)
+
+    def _create_file_format(self, format_dict: Dict[str, str], format_name: str) -> None:
+        """Create a file name with given name."""
+        self._session.sql(
+            f"""
+            create or replace file format {format_name}
+                type = '{format_dict['type']}'
+                compression = '{format_dict['compression']}'
+                field_delimiter = '{format_dict['field_delimiter']}'
+                record_delimiter = '{format_dict['record_delimiter']}'
+                skip_header = {format_dict['skip_header']}
+                field_optionally_enclosed_by = '{format_dict['field_optionally_enclosed_by']}'
+                trim_space = {format_dict['trim_space']}
+                error_on_column_count_mismatch = {format_dict['error_on_column_count_mismatch']}
+                escape = '{format_dict['escape']}'
+                escape_unenclosed_field = '{format_dict['escape_unenclosed_field']}'
+                date_format = '{format_dict['date_format']}'
+                timestamp_format = '{format_dict['timestamp_format']}'
+                null_if = {format_dict['null_if']}
+                comment = '{format_dict['comment']}'
+            """
+        ).collect()
+
+    def _load_csv(self, schema_dict: Dict[str, str], destination_table: str, temp_stage_name: str) -> None:
+        # create temp file format
+        file_format_name = f"{self._database_name}.{self._dataset_schema}.feature_store_temp_format"
+        format_str = ""
+        if "format" in schema_dict:
+            self._create_file_format(schema_dict["format"], file_format_name)  # type: ignore[arg-type]
+            format_str = f"file_format = {file_format_name}"
+
+        # create destination table
+        cols_type_str = ",".join([f"{k} {v}" for k, v in schema_dict["columns"].items()])  # type: ignore[attr-defined]
+        cols_name_str = ",".join(schema_dict["columns"].keys())  # type: ignore[attr-defined]
+        if self._add_id_column:
+            cols_type_str = (
+                f"{self._add_id_column.resolved()} number autoincrement start 1 increment 1, " + cols_type_str
+            )
+        self._session.sql(
+            f"""
+            create or replace table {destination_table} ({cols_type_str})
+            """
+        ).collect()
+
+        # copy dataset on stage into destination table
+        self._session.sql(
+            f"""
+            copy into {destination_table} ({cols_name_str}) from
+                @{temp_stage_name}
+                {format_str}
+                pattern = '{schema_dict['load_files_pattern']}'
+            """
+        ).collect()
+
+    def _load_parquet(self, schema_dict: Dict[str, str], destination_table: str, temp_stage_name: str) -> None:
+        regex_pattern = schema_dict["load_files_pattern"]
+        all_files = self._session.sql(f"list @{temp_stage_name}").collect()
+        filtered_files = [item["name"] for item in all_files if re.match(regex_pattern, item["name"])]
+        assert len(filtered_files) == 1, "Current code only works for one file"
+        file_name = filtered_files[0].rsplit("/", 1)[-1]
+
+        df = self._session.read.parquet(f"@{temp_stage_name}/{file_name}")
+        for old_col_name in df.columns:
+            df = df.with_column_renamed(old_col_name, identifier.get_unescaped_names(old_col_name))
+
+        for ts_col in self._epoch_to_timestamp_cols:
+            if "timestamp" != dict(df.dtypes)[ts_col]:
+                df = df.with_column(f"{ts_col}_NEW", F.cast(df[ts_col] / 1000000, TimestampType()))
+                df = df.drop(ts_col).rename(f"{ts_col}_NEW", ts_col)
+
+        df.write.mode("overwrite").save_as_table(destination_table)
+
+    def _load_source_data(self, schema_yaml_file: str) -> str:
+        """Parse a yaml schema file and load data into Snowflake.
+
+        Args:
+            schema_yaml_file: the path to a yaml schema file.
+
+        Returns:
+            Return a destination table name.
+        """
+        # load schema file
+        schema_dict = self._read_yaml(schema_yaml_file)
+        temp_stage_name = f"{self._database_name}.{self._dataset_schema}.feature_store_temp_stage"
+        destination_table = f"{self._database_name}.{self._dataset_schema}.{schema_dict['destination_table_name']}"
+
+        # create a temp stage from S3 URL
+        self._session.sql(f"create or replace stage {temp_stage_name} url = '{schema_dict['s3_url']}'").collect()
+
+        # load csv or parquet
+        # TODO: this could be more flexible and robust.
+        if "parquet" in schema_dict["load_files_pattern"]:
+            self._load_parquet(schema_dict, destination_table, temp_stage_name)
+        else:
+            self._load_csv(schema_dict, destination_table, temp_stage_name)
+
+        return destination_table
+
+    def load_example(self, example_name: str) -> List[str]:
+        """Select the active example and load its datasets to Snowflake.
+
+        Args:
+            example_name: The folder name under feature_store/examples.
+                For example, 'citibike_trip_features'.
+
+        Returns:
+            Returns a list of table names with populated datasets.
+        """
+        self._selected_example = example_name  # type: ignore[assignment]
+
+        # load source yaml file
+        root_dir = Path(__file__).parent
+        source_file_path = root_dir.joinpath(f"{self._selected_example}/source.yaml")
+        source_dict = self._read_yaml(str(source_file_path))
+        self._source_tables = []
+        self._source_dfs = []
+
+        source_ymal_data = source_dict["source_data"]
+        if "excluded_columns" in source_dict:
+            self._excluded_columns = sql_identifier.to_sql_identifiers(source_dict["excluded_columns"].split(","))
+        if "label_columns" in source_dict:
+            self._label_columns = sql_identifier.to_sql_identifiers(source_dict["label_columns"].split(","))
+        if "timestamp_column" in source_dict:
+            self._timestamp_column = sql_identifier.SqlIdentifier(source_dict["timestamp_column"])
+        if "epoch_to_timestamp_cols" in source_dict:
+            self._epoch_to_timestamp_cols = source_dict["epoch_to_timestamp_cols"].split(",")
+        if "add_id_column" in source_dict:
+            self._add_id_column = sql_identifier.SqlIdentifier(source_dict["add_id_column"])
+
+        return self.load_source_data(source_ymal_data)
+
+    def load_source_data(self, source_data_name: str) -> List[str]:
+        """Load source data into Snowflake.
+
+        Args:
+            source_data_name: The name of source data located in examples/source_data/.
+
+        Returns:
+            Return a list of Snowflake tables.
+        """
+        root_dir = Path(__file__).parent
+        schema_file = root_dir.joinpath(f"source_data/{source_data_name}.yaml")
+        destination_table = self._load_source_data(str(schema_file))
+        source_df = self._session.table(destination_table)
+        self._source_tables.append(destination_table)
+        self._source_dfs.append(source_df)
+        logger.info(f"source data {source_data_name} has been successfully loaded into table {destination_table}.")
+        return self._source_tables
+
+    def get_current_schema(self) -> str:
+        return self._dataset_schema
+
+    def get_label_cols(self) -> List[str]:
+        return [item.resolved() for item in self._label_columns]
+
+    def get_excluded_cols(self) -> List[str]:
+        return [item.resolved() for item in self._excluded_columns]
+
+    def get_training_data_timestamp_col(self) -> Optional[str]:
+        return self._timestamp_column.resolved() if self._timestamp_column is not None else None
```
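The helper above boils down to a three-step flow: pick an example, load its source data into tables, then pull out the entities and draft feature views the example defines. A minimal usage sketch, assuming an existing Snowpark `Session` and a pre-created database and schema (`ML_DB` and `PUBLIC` are placeholder names, not part of this diff):

```python
from snowflake.ml.feature_store.examples.example_helper import ExampleHelper

# `session` is an existing snowflake.snowpark.Session; names are placeholders.
helper = ExampleHelper(session, database_name="ML_DB", dataset_schema="PUBLIC")

print(helper.list_examples())  # e.g. ['citibike_trip_features', 'new_york_taxi_features', ...]

source_tables = helper.load_example("new_york_taxi_features")  # loads S3 data into tables
entities = helper.load_entities()              # Entity objects from the example's entities.py
draft_fvs = helper.load_draft_feature_views()  # unregistered FeatureView drafts
```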
snowflake/ml/feature_store/examples/new_york_taxi_features/entities.py (new file)

```diff
@@ -0,0 +1,12 @@
+from typing import List
+
+from snowflake.ml.feature_store import Entity
+
+trip_pickup = Entity(name="TRIP_PICKUP", join_keys=["PULOCATIONID"], desc="Trip pickup entity.")
+
+trip_dropoff = Entity(name="TRIP_DROPOFF", join_keys=["DOLOCATIONID"], desc="Trip dropoff entity.")
+
+
+# This will be invoked by example_helper.py. Do not change function name.
+def get_all_entities() -> List[Entity]:
+    return [trip_pickup, trip_dropoff]
```
snowflake/ml/feature_store/examples/new_york_taxi_features/features/dropoff_features.py (new file)

```diff
@@ -0,0 +1,39 @@
+from typing import List
+
+from snowflake.ml.feature_store import FeatureView
+from snowflake.ml.feature_store.examples.new_york_taxi_features.entities import (
+    trip_dropoff,
+)
+from snowflake.snowpark import DataFrame, Session
+
+
+# This function will be invoked by example_helper.py. Do not change the name.
+def create_draft_feature_view(session: Session, source_dfs: List[DataFrame], source_tables: List[str]) -> FeatureView:
+    """Create a draft feature view."""
+    feature_df = session.sql(
+        f"""
+        select
+            TPEP_DROPOFF_DATETIME as TS,
+            DOLOCATIONID,
+            count(FARE_AMOUNT) over (
+                partition by DOLOCATIONID
+                order by TPEP_DROPOFF_DATETIME
+                range between interval '1 hours' preceding and current row
+            ) TRIP_COUNT_1H,
+            count(FARE_AMOUNT) over (
+                partition by DOLOCATIONID
+                order by TPEP_DROPOFF_DATETIME
+                range between interval '5 hours' preceding and current row
+            ) TRIP_COUNT_5H
+        from {source_tables[0]}
+        """
+    )
+
+    return FeatureView(
+        name="f_trip_dropoff",  # name of feature view
+        entities=[trip_dropoff],  # entities
+        feature_df=feature_df,  # definition query
+        refresh_freq="12h",  # the frequency this feature view re-compute
+        timestamp_col="TS",  # timestamp column. Used when generate training data
+        desc="Managed feature view trip dropoff refreshed every 12 hours.",
+    )
```
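A draft view like the one returned here is only a definition; nothing is materialized until it is registered. A hedged sketch of that step (the `FeatureStore` handle and the database, schema, and warehouse names are assumptions for illustration, not part of this diff):

```python
from snowflake.ml.feature_store import CreationMode, FeatureStore

# Placeholder names; assumes session, database and warehouse already exist.
fs = FeatureStore(
    session=session,
    database="ML_DB",
    name="FS_SCHEMA",
    default_warehouse="ML_WH",
    creation_mode=CreationMode.CREATE_IF_NOT_EXIST,
)

fs.register_entity(trip_dropoff)  # entities must be registered before their views
draft_fv = create_draft_feature_view(session, source_dfs, source_tables)
registered_fv = fs.register_feature_view(feature_view=draft_fv, version="1")
```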
snowflake/ml/feature_store/examples/new_york_taxi_features/features/pickup_features.py (new file)

```diff
@@ -0,0 +1,58 @@
+from typing import List
+
+from snowflake.ml.feature_store import FeatureView
+from snowflake.ml.feature_store.examples.new_york_taxi_features.entities import (
+    trip_pickup,
+)
+from snowflake.snowpark import DataFrame, Session
+
+
+# This function will be invoked by example_helper.py. Do not change the name.
+def create_draft_feature_view(session: Session, source_dfs: List[DataFrame], source_tables: List[str]) -> FeatureView:
+    """Create a draft feature view."""
+    feature_df = session.sql(
+        f"""
+        with
+            cte_1 (TS, PULOCATIONID, TRIP_COUNT_2H, TRIP_COUNT_5H, TRIP_FARE_SUM_2H, TRIP_FARE_SUM_5H) as (
+                select
+                    TPEP_PICKUP_DATETIME as TS,
+                    PULOCATIONID,
+                    count(FARE_AMOUNT) over (
+                        partition by PULOCATIONID
+                        order by TPEP_PICKUP_DATETIME
+                        range between interval '2 hours' preceding and current row
+                    ) TRIP_COUNT_2H,
+                    count(FARE_AMOUNT) over (
+                        partition by PULOCATIONID
+                        order by TPEP_PICKUP_DATETIME
+                        range between interval '5 hours' preceding and current row
+                    ) TRIP_COUNT_5H,
+                    sum(FARE_AMOUNT) over (
+                        partition by PULOCATIONID
+                        order by TPEP_PICKUP_DATETIME
+                        range between interval '2 hours' preceding and current row
+                    ) TRIP_FARE_SUM_2H,
+                    count(FARE_AMOUNT) over (
+                        partition by PULOCATIONID
+                        order by TPEP_PICKUP_DATETIME
+                        range between interval '5 hours' preceding and current row
+                    ) TRIP_FARE_SUM_5H
+                from {source_tables[0]}
+            )
+        select
+            TS,
+            PULOCATIONID,
+            TRIP_FARE_SUM_2H / TRIP_COUNT_2H as MEAN_FARE_2H,
+            TRIP_FARE_SUM_5H / TRIP_COUNT_5H as MEAN_FARE_5H,
+        from cte_1
+        """
+    )
+
+    return FeatureView(
+        name="f_trip_pickup",  # name of feature view
+        entities=[trip_pickup],  # entities
+        feature_df=feature_df,  # definition query
+        refresh_freq="1d",  # the frequency this feature view re-compute
+        timestamp_col="TS",  # timestamp column. Used when generate training data
+        desc="Managed feature view trip pickup refreshed everyday.",
+    )
```
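The `timestamp_col="TS"` argument is what enables point-in-time-correct joins when these features are retrieved against a spine of training examples. A sketch of such a retrieval, reusing the hypothetical `fs` handle and `registered_fv` from the earlier sketch (the spine DataFrame and its column names are assumptions):

```python
# spine_df: one row per training example, carrying the entity join key and an
# event timestamp; both the data and the column names here are hypothetical.
spine_df = session.create_dataframe(
    [(48, "2016-01-05 12:00:00")], schema=["PULOCATIONID", "EVENT_TS"]
)

training_df = fs.retrieve_feature_values(
    spine_df=spine_df,
    features=[registered_fv],        # the registered pickup feature view
    spine_timestamp_col="EVENT_TS",  # join feature values as of this time
)
```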
snowflake/ml/feature_store/examples/source_data/citibike_trips.yaml (new file)

```diff
@@ -0,0 +1,36 @@
+---
+s3_url: s3://snowflake-workshop-lab/citibike-trips-csv/
+destination_table_name: citibike_trips
+load_files_pattern: trips_2013_6_4_0.csv.gz
+format:
+  type: csv
+  compression: auto
+  field_delimiter: ','
+  record_delimiter: \n
+  skip_header: 0
+  field_optionally_enclosed_by: \042
+  trim_space: 'true'
+  error_on_column_count_mismatch: 'false'
+  escape: none
+  escape_unenclosed_field: \134
+  date_format: auto
+  timestamp_format: auto
+  null_if: ('')
+  comment: file format for citibike trips data
+columns:
+  tripduration: integer
+  starttime: timestamp
+  stoptime: timestamp
+  start_station_id: integer
+  start_station_name: string
+  start_station_latitude: float
+  start_station_longitude: float
+  end_station_id: integer
+  end_station_name: string
+  end_station_latitude: float
+  end_station_longitude: float
+  bikeid: integer
+  membership_type: string
+  usertype: string
+  birth_year: integer
+  gender: integer
```
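Schema files like this one are consumed by `ExampleHelper._load_source_data` in example_helper.py: `s3_url` becomes a temporary stage, the `format:` block is interpolated field by field into a `create or replace file format` statement, and the `columns:` map supplies both the destination table DDL and the `copy into` column list. A sketch of driving that path directly, reusing the hypothetical `helper` from the first sketch:

```python
# Loads the citibike CSV from S3 into ML_DB.PUBLIC.CITIBIKE_TRIPS via the
# temp stage and file format described above; returns all tables loaded so far.
tables = helper.load_source_data("citibike_trips")
session.table(tables[-1]).select("TRIPDURATION", "STARTTIME", "STOPTIME").show(5)
```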
snowflake/ml/feature_store/examples/source_data/fraud_transactions.yaml (new file)

```diff
@@ -0,0 +1,29 @@
+---
+s3_url: s3://sfquickstarts/misc/demos/fraud_transactions.csv
+destination_table_name: fraud_transactions
+load_files_pattern: .*
+format:
+  type: csv
+  compression: auto
+  field_delimiter: ','
+  record_delimiter: \n
+  skip_header: 1
+  field_optionally_enclosed_by: none
+  trim_space: 'false'
+  error_on_column_count_mismatch: 'false'
+  escape: none
+  escape_unenclosed_field: none
+  date_format: auto
+  timestamp_format: auto
+  null_if: ('')
+  comment: file format for winequality data
+columns:
+  TRANSACTION_ID: integer
+  TX_DATETIME: datetime
+  CUSTOMER_ID: integer
+  TERMINAL_ID: integer
+  TX_AMOUNT: float
+  TX_TIME_SECONDS: integer
+  TX_TIME_DAYS: integer
+  TX_FRAUD: integer
+  TX_FRAUD_SCENARIO: integer
```
snowflake/ml/feature_store/examples/source_data/winequality_red.yaml (new file)

```diff
@@ -0,0 +1,32 @@
+---
+s3_url: s3://sfquickstarts/misc/demos/winequality-red.csv
+destination_table_name: winedata
+load_files_pattern: .*
+format:
+  type: csv
+  compression: auto
+  field_delimiter: ;
+  record_delimiter: \n
+  skip_header: 1
+  field_optionally_enclosed_by: none
+  trim_space: 'false'
+  error_on_column_count_mismatch: 'false'
+  escape: none
+  escape_unenclosed_field: none
+  date_format: auto
+  timestamp_format: auto
+  null_if: ('')
+  comment: file format for winequality data
+columns:
+  fixed_acidity: float
+  volatile_acidity: float
+  citric_acid: float
+  residual_sugar: float
+  chlorides: float
+  free_sulfur_dioxide: integer
+  total_sulfur_dioxide: integer
+  density: float
+  pH: float
+  sulphates: float
+  alcohol: float
+  quality: integer
```
snowflake/ml/feature_store/examples/wine_quality_features/entities.py (new file)

```diff
@@ -0,0 +1,14 @@
+from typing import List
+
+from snowflake.ml.feature_store import Entity
+
+wine_entity = Entity(
+    name="WINE",
+    join_keys=["WINE_ID"],
+    desc="Wine ID column.",
+)
+
+
+# This will be invoked by example_helper.py. Do not change function name.
+def get_all_entities() -> List[Entity]:
+    return [wine_entity]
```
snowflake/ml/feature_store/examples/wine_quality_features/features/managed_wine_features.py (new file)

```diff
@@ -0,0 +1,29 @@
+from typing import List
+
+from snowflake.ml.feature_store import FeatureView
+from snowflake.ml.feature_store.examples.wine_quality_features.entities import (
+    wine_entity,
+)
+from snowflake.snowpark import DataFrame, Session, functions as F
+
+
+# This function will be invoked by example_helper.py. Do not change the name.
+def create_draft_feature_view(session: Session, source_dfs: List[DataFrame], source_tables: List[str]) -> FeatureView:
+    """Create a feature view about trip station."""
+    feature_df = source_dfs[0].select(
+        "WINE_ID",
+        "FIXED_ACIDITY",
+        "CITRIC_ACID",
+        "CHLORIDES",
+        "TOTAL_SULFUR_DIOXIDE",
+        "PH",
+        (F.col("FIXED_ACIDITY") * F.col("CITRIC_ACID")).alias("MY_NEW_FEATURE"),
+    )
+
+    return FeatureView(
+        name="WINE_FEATURES",  # name of feature view
+        entities=[wine_entity],  # entities
+        feature_df=feature_df,  # definition dataframe
+        refresh_freq="1d",  # refresh frequency. '1d' means it refreshes everyday
+        desc="Managed feature view about wine quality which refreshes everyday.",
+    )
```
snowflake/ml/feature_store/examples/wine_quality_features/features/static_wine_features.py (new file)

```diff
@@ -0,0 +1,21 @@
+from typing import List
+
+from snowflake.ml.feature_store import FeatureView
+from snowflake.ml.feature_store.examples.wine_quality_features.entities import (
+    wine_entity,
+)
+from snowflake.snowpark import DataFrame, Session
+
+
+# This function will be invoked by example_helper.py. Do not change the name.
+def create_draft_feature_view(session: Session, source_dfs: List[DataFrame], source_tables: List[str]) -> FeatureView:
+    """Create a feature view about trip station."""
+    feature_df = source_dfs[0].select("WINE_ID", "SULPHATES", "ALCOHOL")
+
+    return FeatureView(
+        name="EXTRA_WINE_FEATURES",  # name of feature view
+        entities=[wine_entity],  # entities
+        feature_df=feature_df,  # feature dataframe
+        refresh_freq=None,  # refresh frequency. None means it never refresh
+        desc="Static feature view about wine quality which never refresh.",
+    )
```
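Side by side, the two wine feature views illustrate the managed/static split: `refresh_freq="1d"` produces a view whose results are recomputed on a schedule, while `refresh_freq=None` produces a static view computed at query time. Reading a registered view back might look like this sketch (again assuming the hypothetical `fs` handle from the earlier sketches; the version string is a placeholder):

```python
# Fetch the registered view by name/version and materialize its rows.
fv = fs.get_feature_view(name="WINE_FEATURES", version="1")
fs.read_feature_view(fv).show(5)
```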