snowflake-ml-python 1.5.4__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff compares the published contents of two publicly available package versions as they appear in their public registry. It is provided for informational purposes only.
Files changed (65)
  1. snowflake/cortex/__init__.py +2 -0
  2. snowflake/cortex/_classify_text.py +36 -0
  3. snowflake/cortex/_complete.py +67 -10
  4. snowflake/cortex/_util.py +4 -4
  5. snowflake/ml/_internal/lineage/lineage_utils.py +4 -4
  6. snowflake/ml/_internal/telemetry.py +12 -2
  7. snowflake/ml/data/_internal/arrow_ingestor.py +228 -0
  8. snowflake/ml/data/_internal/ingestor_utils.py +58 -0
  9. snowflake/ml/data/data_connector.py +133 -0
  10. snowflake/ml/data/data_ingestor.py +28 -0
  11. snowflake/ml/data/data_source.py +23 -0
  12. snowflake/ml/dataset/dataset.py +1 -13
  13. snowflake/ml/dataset/dataset_reader.py +18 -118
  14. snowflake/ml/feature_store/access_manager.py +7 -1
  15. snowflake/ml/feature_store/entity.py +19 -2
  16. snowflake/ml/feature_store/examples/citibike_trip_features/entities.py +20 -0
  17. snowflake/ml/feature_store/examples/citibike_trip_features/features/station_feature.py +31 -0
  18. snowflake/ml/feature_store/examples/citibike_trip_features/features/trip_feature.py +24 -0
  19. snowflake/ml/feature_store/examples/citibike_trip_features/source.yaml +4 -0
  20. snowflake/ml/feature_store/examples/example_helper.py +240 -0
  21. snowflake/ml/feature_store/examples/new_york_taxi_features/entities.py +12 -0
  22. snowflake/ml/feature_store/examples/new_york_taxi_features/features/dropoff_features.py +39 -0
  23. snowflake/ml/feature_store/examples/new_york_taxi_features/features/pickup_features.py +58 -0
  24. snowflake/ml/feature_store/examples/new_york_taxi_features/source.yaml +5 -0
  25. snowflake/ml/feature_store/examples/source_data/citibike_trips.yaml +36 -0
  26. snowflake/ml/feature_store/examples/source_data/fraud_transactions.yaml +29 -0
  27. snowflake/ml/feature_store/examples/source_data/nyc_yellow_trips.yaml +4 -0
  28. snowflake/ml/feature_store/examples/source_data/winequality_red.yaml +32 -0
  29. snowflake/ml/feature_store/examples/wine_quality_features/entities.py +14 -0
  30. snowflake/ml/feature_store/examples/wine_quality_features/features/managed_wine_features.py +29 -0
  31. snowflake/ml/feature_store/examples/wine_quality_features/features/static_wine_features.py +21 -0
  32. snowflake/ml/feature_store/examples/wine_quality_features/source.yaml +5 -0
  33. snowflake/ml/feature_store/feature_store.py +579 -53
  34. snowflake/ml/feature_store/feature_view.py +168 -5
  35. snowflake/ml/fileset/stage_fs.py +18 -10
  36. snowflake/ml/lineage/lineage_node.py +1 -1
  37. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +2 -3
  38. snowflake/ml/model/_model_composer/model_composer.py +11 -14
  39. snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +24 -16
  40. snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +2 -1
  41. snowflake/ml/model/_model_composer/model_method/function_generator.py +3 -3
  42. snowflake/ml/model/_model_composer/model_method/infer_function.py_template +3 -32
  43. snowflake/ml/model/_model_composer/model_method/infer_partitioned.py_template +3 -27
  44. snowflake/ml/model/_model_composer/model_method/infer_table_function.py_template +3 -32
  45. snowflake/ml/model/_model_composer/model_method/model_method.py +5 -2
  46. snowflake/ml/model/_packager/model_handlers/_base.py +11 -1
  47. snowflake/ml/model/_packager/model_handlers/_utils.py +58 -1
  48. snowflake/ml/model/_packager/model_handlers/catboost.py +42 -0
  49. snowflake/ml/model/_packager/model_handlers/lightgbm.py +68 -0
  50. snowflake/ml/model/_packager/model_handlers/xgboost.py +59 -0
  51. snowflake/ml/model/_packager/model_runtime/model_runtime.py +3 -5
  52. snowflake/ml/model/model_signature.py +4 -4
  53. snowflake/ml/model/type_hints.py +4 -0
  54. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +1 -1
  55. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_search_udf_file.py +13 -1
  56. snowflake/ml/modeling/impute/simple_imputer.py +26 -0
  57. snowflake/ml/modeling/pipeline/pipeline.py +4 -4
  58. snowflake/ml/registry/registry.py +100 -13
  59. snowflake/ml/version.py +1 -1
  60. {snowflake_ml_python-1.5.4.dist-info → snowflake_ml_python-1.6.0.dist-info}/METADATA +48 -2
  61. {snowflake_ml_python-1.5.4.dist-info → snowflake_ml_python-1.6.0.dist-info}/RECORD +64 -42
  62. {snowflake_ml_python-1.5.4.dist-info → snowflake_ml_python-1.6.0.dist-info}/WHEEL +1 -1
  63. snowflake/ml/_internal/lineage/data_source.py +0 -10
  64. {snowflake_ml_python-1.5.4.dist-info → snowflake_ml_python-1.6.0.dist-info}/LICENSE.txt +0 -0
  65. {snowflake_ml_python-1.5.4.dist-info → snowflake_ml_python-1.6.0.dist-info}/top_level.txt +0 -0
snowflake/ml/feature_store/examples/example_helper.py
@@ -0,0 +1,240 @@
+ import importlib
+ import logging
+ import os
+ import re
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional
+
+ import yaml
+
+ from snowflake.ml._internal.utils import identifier, sql_identifier
+ from snowflake.ml.feature_store import Entity, FeatureView  # type: ignore[attr-defined]
+ from snowflake.snowpark import DataFrame, Session, functions as F
+ from snowflake.snowpark.types import TimestampType
+
+ logger = logging.getLogger(__name__)
+ logger.setLevel(logging.INFO)
+
+
+ class ExampleHelper:
+     def __init__(self, session: Session, database_name: str, dataset_schema: str) -> None:
+         """A helper class to run Feature Store examples.
+
+         Args:
+             session: A Snowpark session object.
+             database_name: Database where dataset and Feature Store lives.
+             dataset_schema: Schema where destination dataset table lives.
+         """
+         self._session = session
+         self._database_name = database_name
+         self._dataset_schema = dataset_schema
+         self._selected_example = None
+         self._source_tables: List[str] = []
+         self._source_dfs: List[DataFrame] = []
+         self._excluded_columns: List[sql_identifier.SqlIdentifier] = []
+         self._label_columns: List[sql_identifier.SqlIdentifier] = []
+         self._timestamp_column: Optional[sql_identifier.SqlIdentifier] = None
+         self._epoch_to_timestamp_cols: List[str] = []
+         self._add_id_column: Optional[sql_identifier.SqlIdentifier] = None
+
+     def list_examples(self) -> List[str]:
+         """Return a list of examples."""
+         root_dir = Path(__file__).parent
+         result = []
+         for f_name in os.listdir(root_dir):
+             if os.path.isdir(os.path.join(root_dir, f_name)) and f_name[0].isalpha() and f_name != "source_data":
+                 result.append(f_name)
+         return result
+
+     def load_draft_feature_views(self) -> List[FeatureView]:
+         """Return all feature views in an example.
+
+         Returns:
+             A list of FeatureView object.
+         """
+         fvs = []
+         root_dir = Path(__file__).parent.joinpath(f"{self._selected_example}/features")
+         for f_name in os.listdir(root_dir):
+             if not f_name[0].isalpha():
+                 # skip folders like __pycache__
+                 continue
+             mod_path = f"{__package__}.{self._selected_example}.features.{f_name.rstrip('.py')}"
+             mod = importlib.import_module(mod_path)
+             fv = mod.create_draft_feature_view(self._session, self._source_dfs, self._source_tables)
+             fvs.append(fv)
+
+         return fvs
+
+     def load_entities(self) -> List[Entity]:
+         """Return all entities in an example.
+
+         Returns:
+             A list of Entity object.
+         """
+         current_module = f"{__package__}.{self._selected_example}.entities"
+         mod = importlib.import_module(current_module)
+         return mod.get_all_entities()  # type: ignore[no-any-return]
+
+     def _read_yaml(self, file_path: str) -> Any:
+         with open(file_path) as fs:
+             return yaml.safe_load(fs)
+
+     def _create_file_format(self, format_dict: Dict[str, str], format_name: str) -> None:
+         """Create a file name with given name."""
+         self._session.sql(
+             f"""
+             create or replace file format {format_name}
+                 type = '{format_dict['type']}'
+                 compression = '{format_dict['compression']}'
+                 field_delimiter = '{format_dict['field_delimiter']}'
+                 record_delimiter = '{format_dict['record_delimiter']}'
+                 skip_header = {format_dict['skip_header']}
+                 field_optionally_enclosed_by = '{format_dict['field_optionally_enclosed_by']}'
+                 trim_space = {format_dict['trim_space']}
+                 error_on_column_count_mismatch = {format_dict['error_on_column_count_mismatch']}
+                 escape = '{format_dict['escape']}'
+                 escape_unenclosed_field = '{format_dict['escape_unenclosed_field']}'
+                 date_format = '{format_dict['date_format']}'
+                 timestamp_format = '{format_dict['timestamp_format']}'
+                 null_if = {format_dict['null_if']}
+                 comment = '{format_dict['comment']}'
+             """
+         ).collect()
+
+     def _load_csv(self, schema_dict: Dict[str, str], destination_table: str, temp_stage_name: str) -> None:
+         # create temp file format
+         file_format_name = f"{self._database_name}.{self._dataset_schema}.feature_store_temp_format"
+         format_str = ""
+         if "format" in schema_dict:
+             self._create_file_format(schema_dict["format"], file_format_name)  # type: ignore[arg-type]
+             format_str = f"file_format = {file_format_name}"
+
+         # create destination table
+         cols_type_str = ",".join([f"{k} {v}" for k, v in schema_dict["columns"].items()])  # type: ignore[attr-defined]
+         cols_name_str = ",".join(schema_dict["columns"].keys())  # type: ignore[attr-defined]
+         if self._add_id_column:
+             cols_type_str = (
+                 f"{self._add_id_column.resolved()} number autoincrement start 1 increment 1, " + cols_type_str
+             )
+         self._session.sql(
+             f"""
+             create or replace table {destination_table} ({cols_type_str})
+             """
+         ).collect()
+
+         # copy dataset on stage into destination table
+         self._session.sql(
+             f"""
+             copy into {destination_table} ({cols_name_str}) from
+                 @{temp_stage_name}
+                 {format_str}
+                 pattern = '{schema_dict['load_files_pattern']}'
+             """
+         ).collect()
+
+     def _load_parquet(self, schema_dict: Dict[str, str], destination_table: str, temp_stage_name: str) -> None:
+         regex_pattern = schema_dict["load_files_pattern"]
+         all_files = self._session.sql(f"list @{temp_stage_name}").collect()
+         filtered_files = [item["name"] for item in all_files if re.match(regex_pattern, item["name"])]
+         assert len(filtered_files) == 1, "Current code only works for one file"
+         file_name = filtered_files[0].rsplit("/", 1)[-1]
+
+         df = self._session.read.parquet(f"@{temp_stage_name}/{file_name}")
+         for old_col_name in df.columns:
+             df = df.with_column_renamed(old_col_name, identifier.get_unescaped_names(old_col_name))
+
+         for ts_col in self._epoch_to_timestamp_cols:
+             if "timestamp" != dict(df.dtypes)[ts_col]:
+                 df = df.with_column(f"{ts_col}_NEW", F.cast(df[ts_col] / 1000000, TimestampType()))
+                 df = df.drop(ts_col).rename(f"{ts_col}_NEW", ts_col)
+
+         df.write.mode("overwrite").save_as_table(destination_table)
+
+     def _load_source_data(self, schema_yaml_file: str) -> str:
+         """Parse a yaml schema file and load data into Snowflake.
+
+         Args:
+             schema_yaml_file: the path to a yaml schema file.
+
+         Returns:
+             Return a destination table name.
+         """
+         # load schema file
+         schema_dict = self._read_yaml(schema_yaml_file)
+         temp_stage_name = f"{self._database_name}.{self._dataset_schema}.feature_store_temp_stage"
+         destination_table = f"{self._database_name}.{self._dataset_schema}.{schema_dict['destination_table_name']}"
+
+         # create a temp stage from S3 URL
+         self._session.sql(f"create or replace stage {temp_stage_name} url = '{schema_dict['s3_url']}'").collect()
+
+         # load csv or parquet
+         # TODO: this could be more flexible and robust.
+         if "parquet" in schema_dict["load_files_pattern"]:
+             self._load_parquet(schema_dict, destination_table, temp_stage_name)
+         else:
+             self._load_csv(schema_dict, destination_table, temp_stage_name)
+
+         return destination_table
+
+     def load_example(self, example_name: str) -> List[str]:
+         """Select the active example and load its datasets to Snowflake.
+
+         Args:
+             example_name: The folder name under feature_store/examples.
+                 For example, 'citibike_trip_features'.
+
+         Returns:
+             Returns a list of table names with populated datasets.
+         """
+         self._selected_example = example_name  # type: ignore[assignment]
+
+         # load source yaml file
+         root_dir = Path(__file__).parent
+         source_file_path = root_dir.joinpath(f"{self._selected_example}/source.yaml")
+         source_dict = self._read_yaml(str(source_file_path))
+         self._source_tables = []
+         self._source_dfs = []
+
+         source_ymal_data = source_dict["source_data"]
+         if "excluded_columns" in source_dict:
+             self._excluded_columns = sql_identifier.to_sql_identifiers(source_dict["excluded_columns"].split(","))
+         if "label_columns" in source_dict:
+             self._label_columns = sql_identifier.to_sql_identifiers(source_dict["label_columns"].split(","))
+         if "timestamp_column" in source_dict:
+             self._timestamp_column = sql_identifier.SqlIdentifier(source_dict["timestamp_column"])
+         if "epoch_to_timestamp_cols" in source_dict:
+             self._epoch_to_timestamp_cols = source_dict["epoch_to_timestamp_cols"].split(",")
+         if "add_id_column" in source_dict:
+             self._add_id_column = sql_identifier.SqlIdentifier(source_dict["add_id_column"])
+
+         return self.load_source_data(source_ymal_data)
+
+     def load_source_data(self, source_data_name: str) -> List[str]:
+         """Load source data into Snowflake.
+
+         Args:
+             source_data_name: The name of source data located in examples/source_data/.
+
+         Returns:
+             Return a list of Snowflake tables.
+         """
+         root_dir = Path(__file__).parent
+         schema_file = root_dir.joinpath(f"source_data/{source_data_name}.yaml")
+         destination_table = self._load_source_data(str(schema_file))
+         source_df = self._session.table(destination_table)
+         self._source_tables.append(destination_table)
+         self._source_dfs.append(source_df)
+         logger.info(f"source data {source_data_name} has been successfully loaded into table {destination_table}.")
+         return self._source_tables
+
+     def get_current_schema(self) -> str:
+         return self._dataset_schema
+
+     def get_label_cols(self) -> List[str]:
+         return [item.resolved() for item in self._label_columns]
+
+     def get_excluded_cols(self) -> List[str]:
+         return [item.resolved() for item in self._excluded_columns]
+
+     def get_training_data_timestamp_col(self) -> Optional[str]:
+         return self._timestamp_column.resolved() if self._timestamp_column is not None else None
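Taken together, `ExampleHelper` is the entry point for these new examples: `load_example` stages the S3 source data into Snowflake tables, while `load_entities` and `load_draft_feature_views` import the per-example modules shown below. A minimal usage sketch; the connection parameters, database, and schema names are illustrative assumptions, not part of this diff:

```python
from snowflake.ml.feature_store.examples.example_helper import ExampleHelper
from snowflake.snowpark import Session

# Hypothetical connection parameters; substitute your own account details.
connection_parameters = {"account": "<account>", "user": "<user>", "password": "<password>"}
session = Session.builder.configs(connection_parameters).create()

# "ML_DB" / "EXAMPLES" are assumed names; any writable database/schema works.
helper = ExampleHelper(session, database_name="ML_DB", dataset_schema="EXAMPLES")
print(helper.list_examples())  # e.g. ['citibike_trip_features', 'new_york_taxi_features', ...]

tables = helper.load_example("new_york_taxi_features")  # copies S3 data into tables
entities = helper.load_entities()  # Entity objects from the example's entities.py
draft_views = helper.load_draft_feature_views()  # FeatureView objects from features/*.py
```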
snowflake/ml/feature_store/examples/new_york_taxi_features/entities.py
@@ -0,0 +1,12 @@
+ from typing import List
+
+ from snowflake.ml.feature_store import Entity
+
+ trip_pickup = Entity(name="TRIP_PICKUP", join_keys=["PULOCATIONID"], desc="Trip pickup entity.")
+
+ trip_dropoff = Entity(name="TRIP_DROPOFF", join_keys=["DOLOCATIONID"], desc="Trip dropoff entity.")
+
+
+ # This will be invoked by example_helper.py. Do not change function name.
+ def get_all_entities() -> List[Entity]:
+     return [trip_pickup, trip_dropoff]
snowflake/ml/feature_store/examples/new_york_taxi_features/features/dropoff_features.py
@@ -0,0 +1,39 @@
+ from typing import List
+
+ from snowflake.ml.feature_store import FeatureView
+ from snowflake.ml.feature_store.examples.new_york_taxi_features.entities import (
+     trip_dropoff,
+ )
+ from snowflake.snowpark import DataFrame, Session
+
+
+ # This function will be invoked by example_helper.py. Do not change the name.
+ def create_draft_feature_view(session: Session, source_dfs: List[DataFrame], source_tables: List[str]) -> FeatureView:
+     """Create a draft feature view."""
+     feature_df = session.sql(
+         f"""
+         select
+             TPEP_DROPOFF_DATETIME as TS,
+             DOLOCATIONID,
+             count(FARE_AMOUNT) over (
+                 partition by DOLOCATIONID
+                 order by TPEP_DROPOFF_DATETIME
+                 range between interval '1 hours' preceding and current row
+             ) TRIP_COUNT_1H,
+             count(FARE_AMOUNT) over (
+                 partition by DOLOCATIONID
+                 order by TPEP_DROPOFF_DATETIME
+                 range between interval '5 hours' preceding and current row
+             ) TRIP_COUNT_5H
+         from {source_tables[0]}
+         """
+     )
+
+     return FeatureView(
+         name="f_trip_dropoff",  # name of feature view
+         entities=[trip_dropoff],  # entities
+         feature_df=feature_df,  # definition query
+         refresh_freq="12h",  # the frequency this feature view re-compute
+         timestamp_col="TS",  # timestamp column. Used when generate training data
+         desc="Managed feature view trip dropoff refreshed every 12 hours.",
+     )
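A draft view returned by `create_draft_feature_view` is only a local `FeatureView` definition; its `refresh_freq` takes effect once the view is registered in a feature store. A hedged sketch of that step, reusing `session`, `entities`, and `draft_views` from the sketch above; the feature store, database, and warehouse names are assumptions:

```python
from snowflake.ml.feature_store import CreationMode, FeatureStore

fs = FeatureStore(
    session=session,
    database="ML_DB",
    name="FS_DEMO",  # hypothetical feature store schema
    default_warehouse="PUBLIC_WH",  # hypothetical warehouse
    creation_mode=CreationMode.CREATE_IF_NOT_EXIST,
)

for entity in entities:
    fs.register_entity(entity)

for draft in draft_views:
    # Registration materializes the draft; refresh_freq (e.g. "12h" above)
    # becomes the refresh schedule of the managed feature view.
    fs.register_feature_view(feature_view=draft, version="V1")
```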
snowflake/ml/feature_store/examples/new_york_taxi_features/features/pickup_features.py
@@ -0,0 +1,58 @@
+ from typing import List
+
+ from snowflake.ml.feature_store import FeatureView
+ from snowflake.ml.feature_store.examples.new_york_taxi_features.entities import (
+     trip_pickup,
+ )
+ from snowflake.snowpark import DataFrame, Session
+
+
+ # This function will be invoked by example_helper.py. Do not change the name.
+ def create_draft_feature_view(session: Session, source_dfs: List[DataFrame], source_tables: List[str]) -> FeatureView:
+     """Create a draft feature view."""
+     feature_df = session.sql(
+         f"""
+         with
+             cte_1 (TS, PULOCATIONID, TRIP_COUNT_2H, TRIP_COUNT_5H, TRIP_FARE_SUM_2H, TRIP_FARE_SUM_5H) as (
+                 select
+                     TPEP_PICKUP_DATETIME as TS,
+                     PULOCATIONID,
+                     count(FARE_AMOUNT) over (
+                         partition by PULOCATIONID
+                         order by TPEP_PICKUP_DATETIME
+                         range between interval '2 hours' preceding and current row
+                     ) TRIP_COUNT_2H,
+                     count(FARE_AMOUNT) over (
+                         partition by PULOCATIONID
+                         order by TPEP_PICKUP_DATETIME
+                         range between interval '5 hours' preceding and current row
+                     ) TRIP_COUNT_5H,
+                     sum(FARE_AMOUNT) over (
+                         partition by PULOCATIONID
+                         order by TPEP_PICKUP_DATETIME
+                         range between interval '2 hours' preceding and current row
+                     ) TRIP_FARE_SUM_2H,
+                     count(FARE_AMOUNT) over (
+                         partition by PULOCATIONID
+                         order by TPEP_PICKUP_DATETIME
+                         range between interval '5 hours' preceding and current row
+                     ) TRIP_FARE_SUM_5H
+                 from {source_tables[0]}
+             )
+         select
+             TS,
+             PULOCATIONID,
+             TRIP_FARE_SUM_2H / TRIP_COUNT_2H as MEAN_FARE_2H,
+             TRIP_FARE_SUM_5H / TRIP_COUNT_5H as MEAN_FARE_5H,
+         from cte_1
+         """
+     )
+
+     return FeatureView(
+         name="f_trip_pickup",  # name of feature view
+         entities=[trip_pickup],  # entities
+         feature_df=feature_df,  # definition query
+         refresh_freq="1d",  # the frequency this feature view re-compute
+         timestamp_col="TS",  # timestamp column. Used when generate training data
+         desc="Managed feature view trip pickup refreshed everyday.",
+     )
snowflake/ml/feature_store/examples/new_york_taxi_features/source.yaml
@@ -0,0 +1,5 @@
+ ---
+ source_data: nyc_yellow_trips
+ label_columns: FARE_AMOUNT
+ timestamp_column: TPEP_PICKUP_DATETIME
+ epoch_to_timestamp_cols: TPEP_PICKUP_DATETIME,TPEP_DROPOFF_DATETIME
snowflake/ml/feature_store/examples/source_data/citibike_trips.yaml
@@ -0,0 +1,36 @@
+ ---
+ s3_url: s3://snowflake-workshop-lab/citibike-trips-csv/
+ destination_table_name: citibike_trips
+ load_files_pattern: trips_2013_6_4_0.csv.gz
+ format:
+   type: csv
+   compression: auto
+   field_delimiter: ','
+   record_delimiter: \n
+   skip_header: 0
+   field_optionally_enclosed_by: \042
+   trim_space: 'true'
+   error_on_column_count_mismatch: 'false'
+   escape: none
+   escape_unenclosed_field: \134
+   date_format: auto
+   timestamp_format: auto
+   null_if: ('')
+   comment: file format for citibike trips data
+ columns:
+   tripduration: integer
+   starttime: timestamp
+   stoptime: timestamp
+   start_station_id: integer
+   start_station_name: string
+   start_station_latitude: float
+   start_station_longitude: float
+   end_station_id: integer
+   end_station_name: string
+   end_station_latitude: float
+   end_station_longitude: float
+   bikeid: integer
+   membership_type: string
+   usertype: string
+   birth_year: integer
+   gender: integer
snowflake/ml/feature_store/examples/source_data/fraud_transactions.yaml
@@ -0,0 +1,29 @@
+ ---
+ s3_url: s3://sfquickstarts/misc/demos/fraud_transactions.csv
+ destination_table_name: fraud_transactions
+ load_files_pattern: .*
+ format:
+   type: csv
+   compression: auto
+   field_delimiter: ','
+   record_delimiter: \n
+   skip_header: 1
+   field_optionally_enclosed_by: none
+   trim_space: 'false'
+   error_on_column_count_mismatch: 'false'
+   escape: none
+   escape_unenclosed_field: none
+   date_format: auto
+   timestamp_format: auto
+   null_if: ('')
+   comment: file format for winequality data
+ columns:
+   TRANSACTION_ID: integer
+   TX_DATETIME: datetime
+   CUSTOMER_ID: integer
+   TERMINAL_ID: integer
+   TX_AMOUNT: float
+   TX_TIME_SECONDS: integer
+   TX_TIME_DAYS: integer
+   TX_FRAUD: integer
+   TX_FRAUD_SCENARIO: integer
snowflake/ml/feature_store/examples/source_data/nyc_yellow_trips.yaml
@@ -0,0 +1,4 @@
+ ---
+ s3_url: s3://sfquickstarts/misc/demos/nyc_yellow_trips/
+ destination_table_name: nyc_yellow_trips
+ load_files_pattern: .*_2016-01[.]parquet
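Note that `_load_parquet` in example_helper.py applies `re.match` of this `load_files_pattern` against the file names returned by `list @stage`, then asserts that exactly one file matches. A small self-contained illustration of that filtering; the staged file names here are made up:

```python
import re

pattern = r".*_2016-01[.]parquet"  # load_files_pattern above
staged = [
    "nyc_yellow_trips/yellow_tripdata_2016-01.parquet",  # hypothetical names
    "nyc_yellow_trips/yellow_tripdata_2016-02.parquet",
]
matches = [name for name in staged if re.match(pattern, name)]
assert matches == ["nyc_yellow_trips/yellow_tripdata_2016-01.parquet"]
```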
snowflake/ml/feature_store/examples/source_data/winequality_red.yaml
@@ -0,0 +1,32 @@
+ ---
+ s3_url: s3://sfquickstarts/misc/demos/winequality-red.csv
+ destination_table_name: winedata
+ load_files_pattern: .*
+ format:
+   type: csv
+   compression: auto
+   field_delimiter: ;
+   record_delimiter: \n
+   skip_header: 1
+   field_optionally_enclosed_by: none
+   trim_space: 'false'
+   error_on_column_count_mismatch: 'false'
+   escape: none
+   escape_unenclosed_field: none
+   date_format: auto
+   timestamp_format: auto
+   null_if: ('')
+   comment: file format for winequality data
+ columns:
+   fixed_acidity: float
+   volatile_acidity: float
+   citric_acid: float
+   residual_sugar: float
+   chlorides: float
+   free_sulfur_dioxide: integer
+   total_sulfur_dioxide: integer
+   density: float
+   pH: float
+   sulphates: float
+   alcohol: float
+   quality: integer
snowflake/ml/feature_store/examples/wine_quality_features/entities.py
@@ -0,0 +1,14 @@
+ from typing import List
+
+ from snowflake.ml.feature_store import Entity
+
+ wine_entity = Entity(
+     name="WINE",
+     join_keys=["WINE_ID"],
+     desc="Wine ID column.",
+ )
+
+
+ # This will be invoked by example_helper.py. Do not change function name.
+ def get_all_entities() -> List[Entity]:
+     return [wine_entity]
snowflake/ml/feature_store/examples/wine_quality_features/features/managed_wine_features.py
@@ -0,0 +1,29 @@
+ from typing import List
+
+ from snowflake.ml.feature_store import FeatureView
+ from snowflake.ml.feature_store.examples.wine_quality_features.entities import (
+     wine_entity,
+ )
+ from snowflake.snowpark import DataFrame, Session, functions as F
+
+
+ # This function will be invoked by example_helper.py. Do not change the name.
+ def create_draft_feature_view(session: Session, source_dfs: List[DataFrame], source_tables: List[str]) -> FeatureView:
+     """Create a feature view about trip station."""
+     feature_df = source_dfs[0].select(
+         "WINE_ID",
+         "FIXED_ACIDITY",
+         "CITRIC_ACID",
+         "CHLORIDES",
+         "TOTAL_SULFUR_DIOXIDE",
+         "PH",
+         (F.col("FIXED_ACIDITY") * F.col("CITRIC_ACID")).alias("MY_NEW_FEATURE"),
+     )
+
+     return FeatureView(
+         name="WINE_FEATURES",  # name of feature view
+         entities=[wine_entity],  # entities
+         feature_df=feature_df,  # definition dataframe
+         refresh_freq="1d",  # refresh frequency. '1d' means it refreshes everyday
+         desc="Managed feature view about wine quality which refreshes everyday.",
+     )
snowflake/ml/feature_store/examples/wine_quality_features/features/static_wine_features.py
@@ -0,0 +1,21 @@
+ from typing import List
+
+ from snowflake.ml.feature_store import FeatureView
+ from snowflake.ml.feature_store.examples.wine_quality_features.entities import (
+     wine_entity,
+ )
+ from snowflake.snowpark import DataFrame, Session
+
+
+ # This function will be invoked by example_helper.py. Do not change the name.
+ def create_draft_feature_view(session: Session, source_dfs: List[DataFrame], source_tables: List[str]) -> FeatureView:
+     """Create a feature view about trip station."""
+     feature_df = source_dfs[0].select("WINE_ID", "SULPHATES", "ALCOHOL")
+
+     return FeatureView(
+         name="EXTRA_WINE_FEATURES",  # name of feature view
+         entities=[wine_entity],  # entities
+         feature_df=feature_df,  # feature dataframe
+         refresh_freq=None,  # refresh frequency. None means it never refresh
+         desc="Static feature view about wine quality which never refresh.",
+     )
snowflake/ml/feature_store/examples/wine_quality_features/source.yaml
@@ -0,0 +1,5 @@
+ ---
+ source_data: winequality_red
+ add_id_column: wine_id
+ label_columns: quality
+ excluded_columns: wine_id
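These keys line up with the branches in `ExampleHelper.load_example`: `add_id_column` makes `_load_csv` prepend an autoincrementing `WINE_ID` to the destination table, while `label_columns` and `excluded_columns` surface through the getter methods. A short sketch, reusing the `helper` from the earlier example; the printed values assume unquoted identifiers resolve to uppercase:

```python
helper.load_example("wine_quality_features")  # loads winequality_red into a table
print(helper.get_label_cols())  # ['QUALITY'] - from label_columns
print(helper.get_excluded_cols())  # ['WINE_ID'] - from excluded_columns
print(helper.get_training_data_timestamp_col())  # None - no timestamp_column key
```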