snowflake-ml-python 1.5.4__py3-none-any.whl → 1.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. snowflake/cortex/__init__.py +2 -0
  2. snowflake/cortex/_classify_text.py +36 -0
  3. snowflake/cortex/_complete.py +66 -35
  4. snowflake/cortex/_util.py +4 -4
  5. snowflake/ml/_internal/env_utils.py +11 -5
  6. snowflake/ml/_internal/exceptions/modeling_error_messages.py +4 -1
  7. snowflake/ml/_internal/lineage/lineage_utils.py +4 -4
  8. snowflake/ml/_internal/telemetry.py +26 -2
  9. snowflake/ml/_internal/utils/pkg_version_utils.py +8 -22
  10. snowflake/ml/data/_internal/arrow_ingestor.py +284 -0
  11. snowflake/ml/data/data_connector.py +186 -0
  12. snowflake/ml/data/data_ingestor.py +45 -0
  13. snowflake/ml/data/data_source.py +23 -0
  14. snowflake/ml/data/ingestor_utils.py +62 -0
  15. snowflake/ml/data/torch_dataset.py +33 -0
  16. snowflake/ml/dataset/dataset.py +1 -13
  17. snowflake/ml/dataset/dataset_metadata.py +3 -1
  18. snowflake/ml/dataset/dataset_reader.py +23 -117
  19. snowflake/ml/feature_store/access_manager.py +7 -1
  20. snowflake/ml/feature_store/entity.py +19 -2
  21. snowflake/ml/feature_store/examples/airline_features/entities.py +16 -0
  22. snowflake/ml/feature_store/examples/airline_features/features/plane_features.py +31 -0
  23. snowflake/ml/feature_store/examples/airline_features/features/weather_features.py +42 -0
  24. snowflake/ml/feature_store/examples/airline_features/source.yaml +7 -0
  25. snowflake/ml/feature_store/examples/citibike_trip_features/entities.py +20 -0
  26. snowflake/ml/feature_store/examples/citibike_trip_features/features/station_feature.py +37 -0
  27. snowflake/ml/feature_store/examples/citibike_trip_features/features/trip_feature.py +30 -0
  28. snowflake/ml/feature_store/examples/citibike_trip_features/source.yaml +7 -0
  29. snowflake/ml/feature_store/examples/example_helper.py +278 -0
  30. snowflake/ml/feature_store/examples/new_york_taxi_features/entities.py +12 -0
  31. snowflake/ml/feature_store/examples/new_york_taxi_features/features/location_features.py +44 -0
  32. snowflake/ml/feature_store/examples/new_york_taxi_features/features/trip_features.py +36 -0
  33. snowflake/ml/feature_store/examples/new_york_taxi_features/source.yaml +9 -0
  34. snowflake/ml/feature_store/examples/source_data/airline.yaml +4 -0
  35. snowflake/ml/feature_store/examples/source_data/citibike_trips.yaml +36 -0
  36. snowflake/ml/feature_store/examples/source_data/fraud_transactions.yaml +29 -0
  37. snowflake/ml/feature_store/examples/source_data/nyc_yellow_trips.yaml +4 -0
  38. snowflake/ml/feature_store/examples/source_data/winequality_red.yaml +32 -0
  39. snowflake/ml/feature_store/examples/wine_quality_features/entities.py +14 -0
  40. snowflake/ml/feature_store/examples/wine_quality_features/features/managed_wine_features.py +36 -0
  41. snowflake/ml/feature_store/examples/wine_quality_features/features/static_wine_features.py +24 -0
  42. snowflake/ml/feature_store/examples/wine_quality_features/source.yaml +8 -0
  43. snowflake/ml/feature_store/feature_store.py +637 -76
  44. snowflake/ml/feature_store/feature_view.py +316 -9
  45. snowflake/ml/fileset/stage_fs.py +18 -10
  46. snowflake/ml/lineage/lineage_node.py +1 -1
  47. snowflake/ml/model/_client/model/model_impl.py +11 -2
  48. snowflake/ml/model/_client/model/model_version_impl.py +171 -20
  49. snowflake/ml/model/_client/ops/model_ops.py +105 -27
  50. snowflake/ml/model/_client/ops/service_ops.py +121 -0
  51. snowflake/ml/model/_client/service/model_deployment_spec.py +95 -0
  52. snowflake/ml/model/_client/service/model_deployment_spec_schema.py +31 -0
  53. snowflake/ml/model/_client/sql/model_version.py +13 -4
  54. snowflake/ml/model/_client/sql/service.py +129 -0
  55. snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +2 -3
  56. snowflake/ml/model/_model_composer/model_composer.py +14 -14
  57. snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +33 -17
  58. snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +5 -1
  59. snowflake/ml/model/_model_composer/model_method/function_generator.py +3 -3
  60. snowflake/ml/model/_model_composer/model_method/infer_function.py_template +3 -32
  61. snowflake/ml/model/_model_composer/model_method/infer_partitioned.py_template +3 -27
  62. snowflake/ml/model/_model_composer/model_method/infer_table_function.py_template +3 -32
  63. snowflake/ml/model/_model_composer/model_method/model_method.py +5 -2
  64. snowflake/ml/model/_packager/model_env/model_env.py +7 -2
  65. snowflake/ml/model/_packager/model_handlers/_base.py +30 -3
  66. snowflake/ml/model/_packager/model_handlers/_utils.py +58 -1
  67. snowflake/ml/model/_packager/model_handlers/catboost.py +52 -3
  68. snowflake/ml/model/_packager/model_handlers/custom.py +6 -2
  69. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +9 -5
  70. snowflake/ml/model/_packager/model_handlers/lightgbm.py +80 -3
  71. snowflake/ml/model/_packager/model_handlers/llm.py +7 -3
  72. snowflake/ml/model/_packager/model_handlers/mlflow.py +8 -3
  73. snowflake/ml/model/_packager/model_handlers/pytorch.py +8 -3
  74. snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +8 -3
  75. snowflake/ml/model/_packager/model_handlers/sklearn.py +87 -4
  76. snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +7 -2
  77. snowflake/ml/model/_packager/model_handlers/tensorflow.py +9 -4
  78. snowflake/ml/model/_packager/model_handlers/torchscript.py +8 -3
  79. snowflake/ml/model/_packager/model_handlers/xgboost.py +71 -3
  80. snowflake/ml/model/_packager/model_meta/model_meta.py +32 -2
  81. snowflake/ml/model/_packager/model_meta/model_meta_schema.py +19 -0
  82. snowflake/ml/model/_packager/model_packager.py +2 -1
  83. snowflake/ml/model/_packager/model_runtime/model_runtime.py +7 -7
  84. snowflake/ml/model/model_signature.py +4 -4
  85. snowflake/ml/model/type_hints.py +2 -0
  86. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +1 -1
  87. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_search_udf_file.py +13 -1
  88. snowflake/ml/modeling/framework/base.py +28 -19
  89. snowflake/ml/modeling/impute/simple_imputer.py +26 -0
  90. snowflake/ml/modeling/pipeline/pipeline.py +7 -4
  91. snowflake/ml/registry/_manager/model_manager.py +16 -2
  92. snowflake/ml/registry/registry.py +100 -13
  93. snowflake/ml/utils/sql_client.py +22 -0
  94. snowflake/ml/version.py +1 -1
  95. {snowflake_ml_python-1.5.4.dist-info → snowflake_ml_python-1.6.1.dist-info}/METADATA +81 -2
  96. {snowflake_ml_python-1.5.4.dist-info → snowflake_ml_python-1.6.1.dist-info}/RECORD +99 -66
  97. {snowflake_ml_python-1.5.4.dist-info → snowflake_ml_python-1.6.1.dist-info}/WHEEL +1 -1
  98. snowflake/ml/_internal/lineage/data_source.py +0 -10
  99. {snowflake_ml_python-1.5.4.dist-info → snowflake_ml_python-1.6.1.dist-info}/LICENSE.txt +0 -0
  100. {snowflake_ml_python-1.5.4.dist-info → snowflake_ml_python-1.6.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,278 @@
1
+ import importlib
2
+ import logging
3
+ import os
4
+ import re
5
+ from pathlib import Path
6
+ from typing import Any, Dict, List, Optional
7
+
8
+ import yaml
9
+
10
+ from snowflake.ml._internal.utils import identifier, sql_identifier
11
+ from snowflake.ml.feature_store import Entity, FeatureView # type: ignore[attr-defined]
12
+ from snowflake.snowpark import DataFrame, Session, functions as F
13
+ from snowflake.snowpark.types import TimestampTimeZone, TimestampType
14
+
15
+ logger = logging.getLogger(__name__)
16
+ logger.setLevel(logging.INFO)
17
+
18
+
19
class ExampleHelper:
    """Load the bundled Feature Store examples and their source datasets into Snowflake.

    Every example is a sibling folder of this module that contains a ``source.yaml``
    descriptor, an ``entities`` module and a ``features`` folder. Dataset schemas are
    described by the YAML files under ``source_data/``.
    """

    def __init__(self, session: Session, database_name: str, dataset_schema: str) -> None:
        """A helper class to run Feature Store examples.

        Args:
            session: A Snowpark session object.
            database_name: Database where dataset and Feature Store lives.
            dataset_schema: Schema where destination dataset table lives.
        """
        self._session = session
        self._database_name = database_name
        self._dataset_schema = dataset_schema
        self._clear()

    def _clear(self) -> None:
        """Reset all per-example state so that another example can be loaded."""
        self._selected_example: Optional[str] = None
        self._source_tables: List[str] = []
        self._source_dfs: List[DataFrame] = []
        self._excluded_columns: List[sql_identifier.SqlIdentifier] = []
        self._label_columns: List[sql_identifier.SqlIdentifier] = []
        self._timestamp_column: Optional[sql_identifier.SqlIdentifier] = None
        self._epoch_to_timestamp_cols: List[str] = []
        self._add_id_column: Optional[sql_identifier.SqlIdentifier] = None
        self._training_spine_table: str = ""

    def list_examples(self) -> Optional[DataFrame]:
        """Return a dataframe object about descriptions of all examples.

        Every sub-folder of this module's folder whose name starts with a letter
        (except ``source_data``) is treated as an example, and its ``source.yaml``
        supplies the MODEL_CATEGORY, DESC and LABEL_COLS values.
        """
        root_dir = Path(__file__).parent
        rows = []
        for f_name in os.listdir(root_dir):
            if os.path.isdir(os.path.join(root_dir, f_name)) and f_name[0].isalpha() and f_name != "source_data":
                source_file_path = root_dir.joinpath(f"{f_name}/source.yaml")
                source_dict = self._read_yaml(str(source_file_path))
                rows.append((f_name, source_dict["model_category"], source_dict["desc"], source_dict["label_columns"]))
        return self._session.create_dataframe(rows, schema=["NAME", "MODEL_CATEGORY", "DESC", "LABEL_COLS"])

    @staticmethod
    def _feature_module_name(f_name: str) -> str:
        """Return the module name for a feature file name by dropping a trailing ``.py``."""
        # str.rstrip(".py") would strip any trailing '.', 'p' or 'y' characters
        # (e.g. "happy.py" -> "ha"), so remove the exact suffix instead.
        suffix = ".py"
        return f_name[: -len(suffix)] if f_name.endswith(suffix) else f_name

    def load_draft_feature_views(self) -> List[FeatureView]:
        """Return all feature views in an example.

        Each entry of the selected example's ``features`` folder whose name starts
        with a letter is imported as a module and its ``create_draft_feature_view``
        function is called with the session, source dataframes and source tables.

        Returns:
            A list of FeatureView object.
        """
        fvs = []
        root_dir = Path(__file__).parent.joinpath(f"{self._selected_example}/features")
        for f_name in os.listdir(root_dir):
            if not f_name[0].isalpha():
                # skip folders like __pycache__
                continue
            mod_path = f"{__package__}.{self._selected_example}.features.{self._feature_module_name(f_name)}"
            mod = importlib.import_module(mod_path)
            fv = mod.create_draft_feature_view(self._session, self._source_dfs, self._source_tables)
            fvs.append(fv)

        return fvs

    def load_entities(self) -> List[Entity]:
        """Return all entities in an example.

        Returns:
            A list of Entity object.
        """
        current_module = f"{__package__}.{self._selected_example}.entities"
        mod = importlib.import_module(current_module)
        return mod.get_all_entities()  # type: ignore[no-any-return]

    def _read_yaml(self, file_path: str) -> Any:
        """Parse the YAML file at ``file_path`` with the safe loader and return its content."""
        with open(file_path, encoding="utf-8") as fs:
            return yaml.safe_load(fs)

    def _create_file_format(self, format_dict: Dict[str, str], format_name: str) -> None:
        """Create a file format with the given name.

        Args:
            format_dict: File format options read from the ``format`` section of a schema YAML file.
            format_name: Name of the file format to create or replace.
        """
        # The option values are interpolated directly into the statement.
        self._session.sql(
            f"""
            create or replace file format {format_name}
                type = '{format_dict['type']}'
                compression = '{format_dict['compression']}'
                field_delimiter = '{format_dict['field_delimiter']}'
                record_delimiter = '{format_dict['record_delimiter']}'
                skip_header = {format_dict['skip_header']}
                field_optionally_enclosed_by = '{format_dict['field_optionally_enclosed_by']}'
                trim_space = {format_dict['trim_space']}
                error_on_column_count_mismatch = {format_dict['error_on_column_count_mismatch']}
                escape = '{format_dict['escape']}'
                escape_unenclosed_field = '{format_dict['escape_unenclosed_field']}'
                date_format = '{format_dict['date_format']}'
                timestamp_format = '{format_dict['timestamp_format']}'
                null_if = {format_dict['null_if']}
                comment = '{format_dict['comment']}'
            """
        ).collect()

    def _load_csv(self, schema_dict: Dict[str, str], temp_stage_name: str) -> List[str]:
        """Create the destination table and copy the staged CSV files into it.

        Args:
            schema_dict: Parsed schema YAML. Reads ``columns``, ``destination_table_name``,
                ``load_files_pattern`` and, when present, ``format``.
            temp_stage_name: Name of the stage that holds the source files.

        Returns:
            A single-element list with the fully qualified destination table name.
        """
        # create temp file format
        file_format_name = f"{self._database_name}.{self._dataset_schema}.feature_store_temp_format"
        format_str = ""
        if "format" in schema_dict:
            self._create_file_format(schema_dict["format"], file_format_name)  # type: ignore[arg-type]
            format_str = f"file_format = {file_format_name}"

        # create destination table
        cols_type_str = ",".join([f"{k} {v}" for k, v in schema_dict["columns"].items()])  # type: ignore[attr-defined]
        cols_name_str = ",".join(schema_dict["columns"].keys())  # type: ignore[attr-defined]
        if self._add_id_column:
            # The id column is only part of the table definition; the copy statement
            # below lists the YAML columns only.
            cols_type_str = (
                f"{self._add_id_column.resolved()} number autoincrement start 1 increment 1, " + cols_type_str
            )

        destination_table = f"{self._database_name}.{self._dataset_schema}.{schema_dict['destination_table_name']}"
        self._session.sql(
            f"""
            create or replace table {destination_table} ({cols_type_str})
            """
        ).collect()

        # copy dataset on stage into destination table
        self._session.sql(
            f"""
            copy into {destination_table} ({cols_name_str}) from
                @{temp_stage_name}
                {format_str}
                pattern = '{schema_dict['load_files_pattern']}'
            """
        ).collect()

        return [destination_table]

    def _load_parquet(self, schema_dict: Dict[str, str], temp_stage_name: str) -> List[str]:
        """Load every matching staged parquet file into its own table.

        Args:
            schema_dict: Parsed schema YAML. Reads ``load_files_pattern`` and
                ``destination_table_name``.
            temp_stage_name: Name of the stage that holds the source files.

        Returns:
            The fully qualified names of the tables that were written.

        Raises:
            AttributeError: If several files match and a file name does not match the
                ``destination_table_name`` regular expression.
        """
        files_pattern = schema_dict["load_files_pattern"]
        all_files = self._session.sql(f"list @{temp_stage_name}").collect()
        filtered_files = [item["name"] for item in all_files if re.match(files_pattern, item["name"])]
        file_count = len(filtered_files)
        result = []

        for file in filtered_files:
            file_name = file.rsplit("/", 1)[-1]

            df = self._session.read.parquet(f"@{temp_stage_name}/{file_name}")
            for old_col_name in df.columns:
                df = df.with_column_renamed(old_col_name, identifier.get_unescaped_names(old_col_name))

            # convert timestamp column to ntz
            for col_name, col_type in dict(df.dtypes).items():
                if col_type == "timestamp":
                    df = df.with_column(col_name, F.to_timestamp_ntz(col_name))

            # convert epoch column to ntz timestamp
            # NOTE(review): dividing by 1000000 presumably converts microseconds to seconds - confirm.
            for ts_col in self._epoch_to_timestamp_cols:
                if "timestamp" != dict(df.dtypes)[ts_col]:
                    df = df.with_column(ts_col, F.cast(df[ts_col] / 1000000, TimestampType(TimestampTimeZone.NTZ)))

            if self._add_id_column:
                df = df.withColumn(self._add_id_column, F.monotonically_increasing_id())

            if file_count == 1:
                dest_table_name = (
                    f"{self._database_name}.{self._dataset_schema}.{schema_dict['destination_table_name']}"
                )
            else:
                # With several files, destination_table_name is a regular expression whose
                # "table_name" group extracts the table name from each file name.
                table_name_pattern = schema_dict["destination_table_name"]
                dest_table_name = re.match(table_name_pattern, file_name).group(  # type: ignore[union-attr]
                    "table_name"
                )
                dest_table_name = f"{self._database_name}.{self._dataset_schema}.{dest_table_name}"

            df.write.mode("overwrite").save_as_table(dest_table_name)
            result.append(dest_table_name)

        return result

    def _load_source_data(self, schema_yaml_file: str) -> List[str]:
        """Parse a yaml schema file and load data into Snowflake.

        Args:
            schema_yaml_file: the path to a yaml schema file.

        Returns:
            Return a list of destination table names.
        """
        # load schema file
        schema_dict = self._read_yaml(schema_yaml_file)
        temp_stage_name = f"{self._database_name}.{self._dataset_schema}.feature_store_temp_stage"

        # create a temp stage from S3 URL
        self._session.sql(f"create or replace stage {temp_stage_name} url = '{schema_dict['s3_url']}'").collect()

        # load csv or parquet
        # TODO: this could be more flexible and robust.
        if "parquet" in schema_dict["load_files_pattern"]:
            return self._load_parquet(schema_dict, temp_stage_name)
        else:
            return self._load_csv(schema_dict, temp_stage_name)

    def load_example(self, example_name: str) -> List[str]:
        """Select the active example and load its datasets to Snowflake.

        Args:
            example_name: The folder name under feature_store/examples.
                For example, 'citibike_trip_features'.

        Returns:
            Returns a list of table names with populated datasets.
        """
        self._clear()
        self._selected_example = example_name

        # load source yaml file
        root_dir = Path(__file__).parent
        source_file_path = root_dir.joinpath(f"{self._selected_example}/source.yaml")
        source_dict = self._read_yaml(str(source_file_path))
        self._source_tables = []
        self._source_dfs = []

        source_yaml_data = source_dict["source_data"]
        if "excluded_columns" in source_dict:
            self._excluded_columns = sql_identifier.to_sql_identifiers(source_dict["excluded_columns"].split(","))
        if "label_columns" in source_dict:
            self._label_columns = sql_identifier.to_sql_identifiers(source_dict["label_columns"].split(","))
        if "timestamp_column" in source_dict:
            self._timestamp_column = sql_identifier.SqlIdentifier(source_dict["timestamp_column"])
        if "epoch_to_timestamp_cols" in source_dict:
            self._epoch_to_timestamp_cols = source_dict["epoch_to_timestamp_cols"].split(",")
        if "add_id_column" in source_dict:
            self._add_id_column = sql_identifier.SqlIdentifier(source_dict["add_id_column"])
        self._training_spine_table = (
            f"{self._database_name}.{self._dataset_schema}.{source_dict['training_spine_table']}"
        )

        return self.load_source_data(source_yaml_data)

    def load_source_data(self, source_data_name: str) -> List[str]:
        """Load source data into Snowflake.

        Args:
            source_data_name: The name of source data located in examples/source_data/.

        Returns:
            Return a list of Snowflake tables.
        """
        root_dir = Path(__file__).parent
        schema_file = root_dir.joinpath(f"source_data/{source_data_name}.yaml")
        destination_tables = self._load_source_data(str(schema_file))
        for dest_table in destination_tables:
            source_df = self._session.table(dest_table)
            self._source_tables.append(dest_table)
            self._source_dfs.append(source_df)
            logger.info(f"{dest_table} has been created successfully.")
        return self._source_tables

    def get_current_schema(self) -> str:
        """Return the schema where the destination dataset tables live."""
        return self._dataset_schema

    def get_label_cols(self) -> List[str]:
        """Return the resolved label column names of the selected example."""
        return [item.resolved() for item in self._label_columns]

    def get_excluded_cols(self) -> List[str]:
        """Return the resolved excluded column names of the selected example."""
        return [item.resolved() for item in self._excluded_columns]

    def get_training_data_timestamp_col(self) -> Optional[str]:
        """Return the resolved timestamp column name, or None when the example defines none."""
        return self._timestamp_column.resolved() if self._timestamp_column is not None else None

    def get_training_spine_table(self) -> str:
        """Return the fully qualified training spine table name of the selected example."""
        return self._training_spine_table
@@ -0,0 +1,12 @@
1
+ from typing import List
2
+
3
+ from snowflake.ml.feature_store import Entity
4
+
5
+ trip_id = Entity(name="TRIP_ID", join_keys=["TRIP_ID"], desc="Trip id.")
6
+
7
+ location_id = Entity(name="DOLOCATIONID", join_keys=["DOLOCATIONID"], desc="Drop off location id.")
8
+
9
+
10
+ # This will be invoked by example_helper.py. Do not change function name.
11
def get_all_entities() -> List[Entity]:
    """Return all entities of this example: the trip id and the location id entity."""
    return [trip_id, location_id]
@@ -0,0 +1,44 @@
1
+ from typing import List
2
+
3
+ from snowflake.ml.feature_store import FeatureView
4
+ from snowflake.ml.feature_store.examples.new_york_taxi_features.entities import (
5
+ location_id,
6
+ )
7
+ from snowflake.snowpark import DataFrame, Session
8
+
9
+
10
+ # This function will be invoked by example_helper.py. Do not change the name.
11
def create_draft_feature_view(session: Session, source_dfs: List[DataFrame], source_tables: List[str]) -> FeatureView:
    """Create a draft feature view of average fares per drop-off location.

    The features are averages of FARE_AMOUNT per DOLOCATIONID over the preceding
    1 hour and 10 hours, ordered by TPEP_DROPOFF_DATETIME.

    Args:
        session: Snowpark session used to build the feature definition query.
        source_dfs: Source dataframes of the example. Not used by this feature view.
        source_tables: Source table names of the example. Only the first table is queried.

    Returns:
        A FeatureView named "f_location" with feature descriptions attached.
    """
    # NOTE(review): the second alias in the query is spelled AVG_FARE_10h while its
    # description key below is AVG_FARE_10H; presumably both resolve to the same
    # identifier because the alias is unquoted - confirm.
    feature_df = session.sql(
        f"""
        select
            TPEP_DROPOFF_DATETIME as TS,
            DOLOCATIONID,
            avg(FARE_AMOUNT) over (
                partition by DOLOCATIONID
                order by TPEP_DROPOFF_DATETIME
                range between interval '1 hours' preceding and current row
            ) AVG_FARE_1H,
            avg(FARE_AMOUNT) over (
                partition by DOLOCATIONID
                order by TPEP_DROPOFF_DATETIME
                range between interval '10 hours' preceding and current row
            ) AVG_FARE_10h
        from {source_tables[0]}
        """
    )

    return FeatureView(
        name="f_location",  # name of feature view
        entities=[location_id],  # entities
        feature_df=feature_df,  # definition query
        refresh_freq="12h",  # how often this feature view is recomputed
        timestamp_col="TS",  # timestamp column, used when generating training data
        desc="Features aggregated by location id and refreshed every 12 hours.",
    ).attach_feature_desc(
        {
            "AVG_FARE_1H": "Averaged fare in past 1 hour window aggregated by location.",
            "AVG_FARE_10H": "Averaged fare in past 10 hours aggregated by location.",
        }
    )
@@ -0,0 +1,36 @@
1
+ from typing import List
2
+
3
+ from snowflake.ml.feature_store import FeatureView
4
+ from snowflake.ml.feature_store.examples.new_york_taxi_features.entities import trip_id
5
+ from snowflake.snowpark import DataFrame, Session
6
+
7
+
8
+ # This function will be invoked by example_helper.py. Do not change the name.
9
def create_draft_feature_view(session: Session, source_dfs: List[DataFrame], source_tables: List[str]) -> FeatureView:
    """Create a draft feature view of per-trip features.

    Args:
        session: Snowpark session used to build the feature definition query.
        source_dfs: Source dataframes of the example. Not used by this feature view.
        source_tables: Source table names of the example. Only the first table is queried.

    Returns:
        A FeatureView named "f_trip" with feature descriptions attached.
    """
    feature_df = session.sql(
        f"""
        select
            TRIP_ID,
            PASSENGER_COUNT,
            TRIP_DISTANCE,
            FARE_AMOUNT
        from
            {source_tables[0]}
        """
    )

    return FeatureView(
        name="f_trip",  # name of feature view
        entities=[trip_id],  # entities
        feature_df=feature_df,  # definition query
        refresh_freq="1d",  # how often this feature view is recomputed
        timestamp_col=None,  # timestamp column, used when generating training data; none here
        desc="Features per trip refreshed every day.",
    ).attach_feature_desc(
        {
            "PASSENGER_COUNT": "The count of passenger of a trip.",
            "TRIP_DISTANCE": "The distance of a trip.",
            "FARE_AMOUNT": "The fare of a trip.",
        }
    )
@@ -0,0 +1,9 @@
1
+ ---
2
+ source_data: nyc_yellow_trips
3
+ training_spine_table: nyc_yellow_trips
4
+ label_columns: TOTAL_AMOUNT
5
+ add_id_column: TRIP_ID
6
+ timestamp_column: TPEP_PICKUP_DATETIME
7
+ epoch_to_timestamp_cols: TPEP_PICKUP_DATETIME,TPEP_DROPOFF_DATETIME
8
+ desc: Features using taxi trip data trying to predict the total fare of a trip.
9
+ model_category: regression
@@ -0,0 +1,4 @@
1
+ ---
2
+ s3_url: s3://sfquickstarts/misc/demos/airline/
3
+ load_files_pattern: .*[.]parquet
4
+ destination_table_name: (?P<table_name>.*)_0_0_0[.]snappy[.]parquet
@@ -0,0 +1,36 @@
1
+ ---
2
+ s3_url: s3://snowflake-workshop-lab/citibike-trips-csv/
3
+ destination_table_name: citibike_trips
4
+ load_files_pattern: .*trips_2013_6_.*[.]csv[.]gz
5
+ format:
6
+ type: csv
7
+ compression: auto
8
+ field_delimiter: ','
9
+ record_delimiter: \n
10
+ skip_header: 0
11
+ field_optionally_enclosed_by: \042
12
+ trim_space: 'true'
13
+ error_on_column_count_mismatch: 'false'
14
+ escape: none
15
+ escape_unenclosed_field: \134
16
+ date_format: auto
17
+ timestamp_format: auto
18
+ null_if: ('')
19
+ comment: file format for citibike trips data
20
+ columns:
21
+ tripduration: integer
22
+ starttime: timestamp
23
+ stoptime: timestamp
24
+ start_station_id: integer
25
+ start_station_name: string
26
+ start_station_latitude: float
27
+ start_station_longitude: float
28
+ end_station_id: integer
29
+ end_station_name: string
30
+ end_station_latitude: float
31
+ end_station_longitude: float
32
+ bikeid: integer
33
+ membership_type: string
34
+ usertype: string
35
+ birth_year: integer
36
+ gender: integer
@@ -0,0 +1,29 @@
1
+ ---
2
+ s3_url: s3://sfquickstarts/misc/demos/fraud_transactions.csv
3
+ destination_table_name: fraud_transactions
4
+ load_files_pattern: .*
5
+ format:
6
+ type: csv
7
+ compression: auto
8
+ field_delimiter: ','
9
+ record_delimiter: \n
10
+ skip_header: 1
11
+ field_optionally_enclosed_by: none
12
+ trim_space: 'false'
13
+ error_on_column_count_mismatch: 'false'
14
+ escape: none
15
+ escape_unenclosed_field: none
16
+ date_format: auto
17
+ timestamp_format: auto
18
+ null_if: ('')
19
+ comment: file format for fraud transactions data
20
+ columns:
21
+ TRANSACTION_ID: integer
22
+ TX_DATETIME: datetime
23
+ CUSTOMER_ID: integer
24
+ TERMINAL_ID: integer
25
+ TX_AMOUNT: float
26
+ TX_TIME_SECONDS: integer
27
+ TX_TIME_DAYS: integer
28
+ TX_FRAUD: integer
29
+ TX_FRAUD_SCENARIO: integer
@@ -0,0 +1,4 @@
1
+ ---
2
+ s3_url: s3://sfquickstarts/misc/demos/nyc_yellow_trips/
3
+ destination_table_name: nyc_yellow_trips
4
+ load_files_pattern: .*_2016-01[.]parquet
@@ -0,0 +1,32 @@
1
+ ---
2
+ s3_url: s3://sfquickstarts/misc/demos/winequality-red.csv
3
+ destination_table_name: winedata
4
+ load_files_pattern: .*
5
+ format:
6
+ type: csv
7
+ compression: auto
8
+ field_delimiter: ;
9
+ record_delimiter: \n
10
+ skip_header: 1
11
+ field_optionally_enclosed_by: none
12
+ trim_space: 'false'
13
+ error_on_column_count_mismatch: 'false'
14
+ escape: none
15
+ escape_unenclosed_field: none
16
+ date_format: auto
17
+ timestamp_format: auto
18
+ null_if: ('')
19
+ comment: file format for winequality data
20
+ columns:
21
+ fixed_acidity: float
22
+ volatile_acidity: float
23
+ citric_acid: float
24
+ residual_sugar: float
25
+ chlorides: float
26
+ free_sulfur_dioxide: integer
27
+ total_sulfur_dioxide: integer
28
+ density: float
29
+ pH: float
30
+ sulphates: float
31
+ alcohol: float
32
+ quality: integer
@@ -0,0 +1,14 @@
1
+ from typing import List
2
+
3
+ from snowflake.ml.feature_store import Entity
4
+
5
+ wine_id = Entity(
6
+ name="WINE",
7
+ join_keys=["WINE_ID"],
8
+ desc="Wine ID.",
9
+ )
10
+
11
+
12
+ # This will be invoked by example_helper.py. Do not change function name.
13
def get_all_entities() -> List[Entity]:
    """Return all entities of this example: the wine id entity."""
    return [wine_id]
@@ -0,0 +1,36 @@
1
+ from typing import List
2
+
3
+ from snowflake.ml.feature_store import FeatureView
4
+ from snowflake.ml.feature_store.examples.wine_quality_features.entities import wine_id
5
+ from snowflake.snowpark import DataFrame, Session, functions as F
6
+
7
+
8
+ # This function will be invoked by example_helper.py. Do not change the name.
9
def create_draft_feature_view(session: Session, source_dfs: List[DataFrame], source_tables: List[str]) -> FeatureView:
    """Create a draft feature view of managed wine features.

    Selects a subset of columns from the first source dataframe and adds
    HYBRID_ACID, the product of FIXED_ACIDITY and CITRIC_ACID.

    Args:
        session: Snowpark session. Not used by this feature view.
        source_dfs: Source dataframes of the example. Only the first dataframe is used.
        source_tables: Source table names of the example. Not used by this feature view.

    Returns:
        A FeatureView named "WINE_FEATURES" with feature descriptions attached.
    """
    feature_df = source_dfs[0].select(
        "WINE_ID",
        "FIXED_ACIDITY",
        "CITRIC_ACID",
        "CHLORIDES",
        "TOTAL_SULFUR_DIOXIDE",
        "PH",
        (F.col("FIXED_ACIDITY") * F.col("CITRIC_ACID")).alias("HYBRID_ACID"),
    )

    return FeatureView(
        name="WINE_FEATURES",  # name of feature view
        entities=[wine_id],  # entities
        feature_df=feature_df,  # definition dataframe
        refresh_freq="1d",  # refresh frequency. '1d' means it refreshes every day
        desc="Managed features about wine quality which refreshes everyday.",
    ).attach_feature_desc(
        {
            "FIXED_ACIDITY": "Fixed acidity.",
            "CITRIC_ACID": "Citric acid.",
            "CHLORIDES": "Chlorides",
            "TOTAL_SULFUR_DIOXIDE": "Total sulfur dioxide.",
            "PH": "PH.",
            "HYBRID_ACID": "Hybrid acid generated by a production of fixed and citric acid.",
        }
    )
@@ -0,0 +1,24 @@
1
+ from typing import List
2
+
3
+ from snowflake.ml.feature_store import FeatureView
4
+ from snowflake.ml.feature_store.examples.wine_quality_features.entities import wine_id
5
+ from snowflake.snowpark import DataFrame, Session
6
+
7
+
8
+ # This function will be invoked by example_helper.py. Do not change the name.
9
def create_draft_feature_view(session: Session, source_dfs: List[DataFrame], source_tables: List[str]) -> FeatureView:
    """Create a draft feature view of static wine features.

    Args:
        session: Snowpark session. Not used by this feature view.
        source_dfs: Source dataframes of the example. Only the first dataframe is used.
        source_tables: Source table names of the example. Not used by this feature view.

    Returns:
        A FeatureView named "EXTRA_WINE_FEATURES" with feature descriptions attached.
    """
    feature_df = source_dfs[0].select("WINE_ID", "SULPHATES", "ALCOHOL")

    return FeatureView(
        name="EXTRA_WINE_FEATURES",  # name of feature view
        entities=[wine_id],  # entities
        feature_df=feature_df,  # feature dataframe
        refresh_freq=None,  # refresh frequency. None means it never refreshes
        desc="Static features about wine quality which never refresh.",
    ).attach_feature_desc(
        {
            "SULPHATES": "Sulphates.",
            "ALCOHOL": "Alcohol.",
        }
    )
@@ -0,0 +1,8 @@
1
+ ---
2
+ source_data: winequality_red
3
+ training_spine_table: winedata
4
+ add_id_column: wine_id
5
+ label_columns: quality
6
+ excluded_columns: wine_id
7
+ desc: Features using wine quality data trying to predict the quality of wine.
8
+ model_category: regression