snowflake-ml-python 1.5.0__py3-none-any.whl → 1.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +6 -0
- snowflake/ml/_internal/lineage/lineage_utils.py +95 -0
- snowflake/ml/_internal/telemetry.py +1 -0
- snowflake/ml/_internal/utils/identifier.py +1 -1
- snowflake/ml/_internal/utils/sql_identifier.py +14 -1
- snowflake/ml/dataset/__init__.py +2 -1
- snowflake/ml/dataset/dataset.py +4 -3
- snowflake/ml/dataset/dataset_reader.py +5 -8
- snowflake/ml/feature_store/__init__.py +6 -0
- snowflake/ml/feature_store/access_manager.py +279 -0
- snowflake/ml/feature_store/feature_store.py +159 -99
- snowflake/ml/feature_store/feature_view.py +18 -8
- snowflake/ml/fileset/embedded_stage_fs.py +15 -12
- snowflake/ml/fileset/snowfs.py +3 -2
- snowflake/ml/fileset/stage_fs.py +25 -7
- snowflake/ml/model/_client/model/model_impl.py +46 -39
- snowflake/ml/model/_client/model/model_version_impl.py +24 -2
- snowflake/ml/model/_client/ops/metadata_ops.py +27 -4
- snowflake/ml/model/_client/ops/model_ops.py +131 -16
- snowflake/ml/model/_client/sql/_base.py +34 -0
- snowflake/ml/model/_client/sql/model.py +32 -39
- snowflake/ml/model/_client/sql/model_version.py +60 -43
- snowflake/ml/model/_client/sql/stage.py +6 -32
- snowflake/ml/model/_client/sql/tag.py +32 -56
- snowflake/ml/model/_model_composer/model_composer.py +2 -2
- snowflake/ml/model/_packager/model_handlers/mlflow.py +2 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +50 -21
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +81 -3
- snowflake/ml/modeling/framework/base.py +4 -3
- snowflake/ml/modeling/pipeline/pipeline.py +27 -7
- snowflake/ml/registry/_manager/model_manager.py +36 -7
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.5.0.dist-info → snowflake_ml_python-1.5.1.dist-info}/METADATA +54 -10
- {snowflake_ml_python-1.5.0.dist-info → snowflake_ml_python-1.5.1.dist-info}/RECORD +37 -35
- snowflake/ml/_internal/lineage/dataset_dataframe.py +0 -44
- {snowflake_ml_python-1.5.0.dist-info → snowflake_ml_python-1.5.1.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.5.0.dist-info → snowflake_ml_python-1.5.1.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.5.0.dist-info → snowflake_ml_python-1.5.1.dist-info}/top_level.txt +0 -0
snowflake/ml/feature_store/feature_store.py
CHANGED

@@ -8,7 +8,19 @@ import re
 import warnings
 from dataclasses import dataclass
 from enum import Enum
-from typing import
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    List,
+    Literal,
+    Optional,
+    Tuple,
+    TypeVar,
+    Union,
+    cast,
+    overload,
+)
 
 import packaging.version as pkg_version
 import snowflake.ml.version as snowml_version
@@ -32,7 +44,7 @@ from snowflake.ml.feature_store.entity import _ENTITY_NAME_LENGTH_LIMIT, Entity
 from snowflake.ml.feature_store.feature_view import (
     _FEATURE_OBJ_TYPE,
     _FEATURE_VIEW_NAME_DELIMITER,
-    [1 line not captured]
+    _LEGACY_TIMESTAMP_COL_PLACEHOLDER_VALS,
     FeatureView,
     FeatureViewSlice,
     FeatureViewStatus,
@@ -242,23 +254,16 @@ class FeatureStore:
 
         else:
             try:
-                [6 lines not captured]
-                    ]
-                ):
+                # Explicitly check if schema exists first since we may not have CREATE SCHEMA privilege
+                if len(self._find_object("SCHEMAS", self._config.schema)) == 0:
+                    self._session.sql(f"CREATE SCHEMA IF NOT EXISTS {self._config.full_schema_path}").collect(
+                        statement_params=self._telemetry_stmp
+                    )
+                for tag in to_sql_identifiers([_FEATURE_VIEW_METADATA_TAG, _FEATURE_STORE_OBJECT_TAG]):
                     self._session.sql(f"CREATE TAG IF NOT EXISTS {self._get_fully_qualified_name(tag)}").collect(
                         statement_params=self._telemetry_stmp
                     )
-
-                self._session.sql(
-                    f"CREATE TAG IF NOT EXISTS {self._get_fully_qualified_name(_FEATURE_STORE_OBJECT_TAG)}"
-                ).collect(statement_params=self._telemetry_stmp)
             except Exception as e:
-                self.clear()
                 raise snowml_exceptions.SnowflakeMLException(
                     error_code=error_codes.INTERNAL_SNOWPARK_ERROR,
                     original_exception=RuntimeError(f"Failed to create feature store {name}: {e}."),
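The new guard matters for least-privilege roles: `CREATE SCHEMA IF NOT EXISTS` fails when the role lacks the CREATE SCHEMA privilege, even if the schema already exists, so the code now probes for the schema before trying to create it. A minimal standalone sketch of the same pattern, assuming a Snowpark `Session`; `ensure_schema` is a hypothetical helper, not part of the package:

```python
from snowflake.snowpark import Session

def ensure_schema(session: Session, database: str, schema: str) -> None:
    # SHOW SCHEMAS needs only USAGE on the database, so a role without
    # CREATE SCHEMA can still no-op when the schema is already present.
    rows = session.sql(f"SHOW SCHEMAS LIKE '{schema}' IN DATABASE {database}").collect()
    if len(rows) == 0:
        session.sql(f"CREATE SCHEMA IF NOT EXISTS {database}.{schema}").collect()
```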
@@ -750,7 +755,7 @@ class FeatureStore:
         except Exception as e:
             raise snowml_exceptions.SnowflakeMLException(
                 error_code=error_codes.INTERNAL_SNOWPARK_ERROR,
-                original_exception=RuntimeError(f"Failed to
+                original_exception=RuntimeError(f"Failed to delete entity: {e}."),
             ) from e
         logger.info(f"Deleted Entity {name}.")
 
@@ -802,7 +807,7 @@ class FeatureStore:
 
         return df
 
-    @
+    @overload
     def generate_dataset(
         self,
         name: str,
@@ -814,7 +819,40 @@ class FeatureStore:
         exclude_columns: Optional[List[str]] = None,
         include_feature_view_timestamp_col: bool = False,
         desc: str = "",
+        output_type: Literal["dataset"] = "dataset",
     ) -> dataset.Dataset:
+        ...
+
+    @overload
+    def generate_dataset(
+        self,
+        name: str,
+        spine_df: DataFrame,
+        features: List[Union[FeatureView, FeatureViewSlice]],
+        output_type: Literal["table"],
+        version: Optional[str] = None,
+        spine_timestamp_col: Optional[str] = None,
+        spine_label_cols: Optional[List[str]] = None,
+        exclude_columns: Optional[List[str]] = None,
+        include_feature_view_timestamp_col: bool = False,
+        desc: str = "",
+    ) -> DataFrame:
+        ...
+
+    @dispatch_decorator()  # type: ignore[misc]
+    def generate_dataset(
+        self,
+        name: str,
+        spine_df: DataFrame,
+        features: List[Union[FeatureView, FeatureViewSlice]],
+        version: Optional[str] = None,
+        spine_timestamp_col: Optional[str] = None,
+        spine_label_cols: Optional[List[str]] = None,
+        exclude_columns: Optional[List[str]] = None,
+        include_feature_view_timestamp_col: bool = False,
+        desc: str = "",
+        output_type: Literal["dataset", "table"] = "dataset",
+    ) -> Union[dataset.Dataset, DataFrame]:
         """
         Generate dataset by given source table and feature views.
 
@@ -834,30 +872,29 @@ class FeatureStore:
             include_feature_view_timestamp_col: Generated dataset will include timestamp column of feature view
                 (if feature view has timestamp column) if set true. Default to false.
             desc: A description about this dataset.
+            output_type: The type of Snowflake storage to use for the generated training data.
 
         Returns:
-            [1 line not captured]
+            If output_type is "dataset" (default), returns a Dataset object.
+            If output_type is "table", returns a Snowpark DataFrame representing the table.
 
         Raises:
-            SnowflakeMLException: [ValueError] spine_df contains more than one query.
             SnowflakeMLException: [ValueError] Dataset name/version already exists
             SnowflakeMLException: [ValueError] Snapshot creation failed.
+            SnowflakeMLException: [ValueError] Invalid output_type specified.
             SnowflakeMLException: [RuntimeError] Failed to create clone from table.
             SnowflakeMLException: [RuntimeError] Failed to find resources.
         """
+        if output_type not in {"table", "dataset"}:
+            raise snowml_exceptions.SnowflakeMLException(
+                error_code=error_codes.INVALID_ARGUMENT,
+                original_exception=ValueError(f"Invalid output_type: {output_type}."),
+            )
         if spine_timestamp_col is not None:
             spine_timestamp_col = SqlIdentifier(spine_timestamp_col)
         if spine_label_cols is not None:
             spine_label_cols = to_sql_identifiers(spine_label_cols)  # type: ignore[assignment]
 
-        if len(spine_df.queries["queries"]) != 1:
-            raise snowml_exceptions.SnowflakeMLException(
-                error_code=error_codes.INVALID_ARGUMENT,
-                original_exception=ValueError(
-                    f"spine_df must contain only one query. Got: {spine_df.queries['queries']}"
-                ),
-            )
-
         result_df, join_keys = self._join_features(
             spine_df, features, spine_timestamp_col, include_feature_view_timestamp_col
         )
@@ -875,33 +912,49 @@ class FeatureStore:
             result_df = self._exclude_columns(result_df, exclude_columns)
 
         fs_meta = FeatureStoreMetadata(
-            spine_query=spine_df.queries["queries"][
+            spine_query=spine_df.queries["queries"][-1],
             serialized_feature_views=[fv.to_json() for fv in features],
             spine_timestamp_col=spine_timestamp_col,
         )
 
         try:
-            [11 lines not captured]
+            if output_type == "table":
+                table_name = f"{name}_{version}"
+                result_df.write.mode("errorifexists").save_as_table(table_name)  # type: ignore[call-overload]
+                ds_df = self._session.table(table_name)
+                return ds_df
+            else:
+                assert output_type == "dataset"
+                if not self._is_dataset_enabled():
+                    raise snowml_exceptions.SnowflakeMLException(
+                        error_code=error_codes.SNOWML_CREATE_FAILED,
+                        original_exception=RuntimeError(
+                            "Dataset is not enabled in your account. Ask your account admin to set"
+                            ' FEATURE_DATASET=ENABLED or set output_type="table" to generate the data'
+                            " as a Snowflake Table instead."
+                        ),
+                    )
+                ds: dataset.Dataset = dataset.create_from_dataframe(
+                    self._session,
+                    name,
+                    version,
+                    input_dataframe=result_df,
+                    exclude_cols=[spine_timestamp_col],
+                    label_cols=spine_label_cols,
+                    properties=fs_meta,
+                    comment=desc,
+                )
+                return ds
 
         except dataset_errors.DatasetExistError as e:
             raise snowml_exceptions.SnowflakeMLException(
                 error_code=error_codes.OBJECT_ALREADY_EXISTS,
-                original_exception=
+                original_exception=RuntimeError(str(e)),
             ) from e
         except SnowparkSQLException as e:
             raise snowml_exceptions.SnowflakeMLException(
                 error_code=error_codes.INTERNAL_SNOWPARK_ERROR,
-                original_exception=RuntimeError(f"An error occurred during
+                original_exception=RuntimeError(f"An error occurred during dataset generation: {e}."),
             ) from e
 
     @dispatch_decorator()
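Read together, the three definitions above give `generate_dataset` a typed switch between Dataset and table output. A hedged usage sketch — `fs`, `spine_df`, and `my_fv` stand in for an existing `FeatureStore`, spine DataFrame, and registered feature view:

```python
# Default path: materialize a versioned snowflake.ml.dataset.Dataset.
ds = fs.generate_dataset(
    name="TRAINING_DATA",
    spine_df=spine_df,
    features=[my_fv],
    version="V1",
    output_type="dataset",
)

# Fallback for accounts without FEATURE_DATASET: write a plain table named
# f"{name}_{version}" and get back a Snowpark DataFrame over it.
df = fs.generate_dataset(
    name="TRAINING_DATA",
    spine_df=spine_df,
    features=[my_fv],
    version="V2",
    output_type="table",
)
```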
@@ -930,52 +983,47 @@ class FeatureStore:
         return self._load_serialized_feature_objects(source_meta.properties.serialized_feature_views)
 
     @dispatch_decorator()
-    def
+    def _clear(self, dryrun: bool = True) -> None:
         """
-        Clear all feature
-        [1 line not captured]
+        Clear all feature views and entities. Note Feature Store schema and metadata will NOT be purged
+        together. Use SQL to delete schema and metadata instead.
 
-        [2 lines not captured]
+        Args:
+            dryrun: Print a list of objects will be deleted but not actually perform the deletion when true.
         """
-        [32 lines not captured]
+        warnings.warn(
+            "It will clear ALL feature views and entities in this Feature Store. Make sure your role"
+            " has sufficient access to all feature views and entities. Insufficient access to some feature"
+            " views or entities will leave Feature Store in an incomplete state.",
+            stacklevel=2,
+            category=UserWarning,
+        )
+
+        all_fvs_df = self.list_feature_views()
+        all_entities_df = self.list_entities()
+        all_fvs_rows = all_fvs_df.collect()
+        all_entities_rows = all_entities_df.collect()
+
+        if dryrun:
+            logger.info(
+                "Following feature views and entities will be deleted."
+                + " Set 'dryrun=False' to perform the actual deletion."
+            )
+            logger.info(f"Total {len(all_fvs_rows)} Feature views to be deleted:")
+            all_fvs_df.show(n=len(all_fvs_rows))
+            logger.info(f"\nTotal {len(all_entities_rows)} entities to be deleted:")
+            all_entities_df.show(n=len(all_entities_rows))
+            return
+
+        for fv_row in all_fvs_rows:
+            fv = self.get_feature_view(
+                SqlIdentifier(fv_row["NAME"], case_sensitive=True).identifier(), fv_row["VERSION"]
+            )
+            self.delete_feature_view(fv)
+
+        for entity_row in all_entities_rows:
+            self.delete_entity(SqlIdentifier(entity_row["NAME"], case_sensitive=True).identifier())
 
-        except Exception as e:
-            raise snowml_exceptions.SnowflakeMLException(
-                error_code=error_codes.INTERNAL_SNOWPARK_ERROR,
-                original_exception=RuntimeError(f"Failed to clear feature store {self._config.full_schema_path}: {e}."),
-            ) from e
         logger.info(f"Feature store {self._config.full_schema_path} has been cleared.")
 
     def _get_feature_view_if_exists(self, name: str, version: str) -> FeatureView:
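Note that the public `clear()` has become the private `_clear()` and now defaults to a dry run. Assuming an existing `FeatureStore` instance `fs`, the new flow would look roughly like:

```python
fs._clear()              # dryrun=True: only logs the feature views and entities that would go
fs._clear(dryrun=False)  # deletes every feature view first, then every entity
```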
@@ -1093,14 +1141,6 @@ class FeatureStore:
         spine_timestamp_col: Optional[SqlIdentifier],
         include_feature_view_timestamp_col: bool,
     ) -> Tuple[DataFrame, List[SqlIdentifier]]:
-        if len(spine_df.queries["queries"]) != 1:
-            raise snowml_exceptions.SnowflakeMLException(
-                error_code=error_codes.INVALID_ARGUMENT,
-                original_exception=ValueError(
-                    f"spine_df must contain only one query. Got: {spine_df.queries['queries']}"
-                ),
-            )
-
         for f in features:
             f = f.feature_view_ref if isinstance(f, FeatureViewSlice) else f
             if f.status == FeatureViewStatus.DRAFT:
@@ -1122,7 +1162,7 @@ class FeatureStore:
         self._asof_join_enabled = self._is_asof_join_enabled()
 
         # TODO: leverage Snowpark dataframe for more concise syntax once it supports AsOfJoin
-        query = spine_df.queries["queries"][
+        query = spine_df.queries["queries"][-1]
         layer = 0
         for f in features:
             if isinstance(f, FeatureViewSlice):
@@ -1180,7 +1220,15 @@ class FeatureStore:
             """
             layer += 1
 
-        [1 line not captured]
+        # TODO: construct result dataframe with datframe APIs once ASOF join is supported natively.
+        # Below code manually construct result dataframe from private members of spine dataframe, which
+        # likely will cause unintentional issues. This setp is needed because spine_df might contains
+        # prerequisite queries and post actions that must be carried over to result dataframe.
+        result_df = self._session.sql(query)
+        result_df._plan.queries = spine_df._plan.queries[:-1] + result_df._plan.queries
+        result_df._plan.post_actions = spine_df._plan.post_actions
+
+        return result_df, join_keys
 
     def _check_database_exists_or_throw(self) -> None:
         resolved_db_name = self._config.database.resolved()
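This replaces the old single-query restriction on `spine_df`: instead of rejecting multi-query DataFrames, the ASOF-join result now inherits the spine's prerequisite queries and post actions. A small sketch of what is being carried over, assuming a Snowpark `session`; the exact query split is an implementation detail of Snowpark:

```python
# A Snowpark DataFrame can be backed by several queries (e.g. a CREATE TEMP
# TABLE prerequisite plus the final SELECT) and cleanup post actions.
df = session.create_dataframe([(1, "a"), (2, "b")] * 1000, schema=["id", "val"])
print(df.queries["queries"])       # prerequisite statements + final SELECT
print(df.queries["post_actions"])  # e.g. DROP of the temp table
# The feature store re-runs only the last SELECT through session.sql(...),
# then grafts the remaining queries and post actions onto the new plan.
```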
@@ -1517,6 +1565,9 @@ class FeatureStore:
                 original_exception=RuntimeError(f"Failed to parse query text for FeatureView {name}/{version}: {row}."),
             )
 
+        fv_name = FeatureView._get_physical_name(name, version)
+        infer_schema_df = self._session.sql(f"SELECT * FROM {self._get_fully_qualified_name(fv_name)}")
+
         if m.group("obj_type") == "DYNAMIC TABLE":
             query = m.group("query")
             df = self._session.sql(query)
@@ -1524,7 +1575,7 @@ class FeatureStore:
             fv_metadata = _FeatureViewMetadata.from_json(m.group("fv_metadata"))
             entities = [find_and_compose_entity(n) for n in fv_metadata.entities]
             ts_col = fv_metadata.timestamp_col
-            timestamp_col = ts_col if ts_col
+            timestamp_col = ts_col if ts_col not in _LEGACY_TIMESTAMP_COL_PLACEHOLDER_VALS else None
 
             fv = FeatureView._construct_feature_view(
                 name=name,
@@ -1534,9 +1585,7 @@ class FeatureStore:
                 desc=desc,
                 version=version,
                 status=FeatureViewStatus(row["scheduling_state"]),
-                feature_descs=self._fetch_column_descs(
-                    "DYNAMIC TABLE", SqlIdentifier(row["name"], case_sensitive=True)
-                ),
+                feature_descs=self._fetch_column_descs("DYNAMIC TABLE", fv_name),
                 refresh_freq=row["target_lag"],
                 database=self._config.database.identifier(),
                 schema=self._config.schema.identifier(),
@@ -1544,6 +1593,7 @@ class FeatureStore:
                 refresh_mode=row["refresh_mode"],
                 refresh_mode_reason=row["refresh_mode_reason"],
                 owner=row["owner"],
+                infer_schema_df=infer_schema_df,
             )
             return fv
         else:
@@ -1553,7 +1603,7 @@ class FeatureStore:
             fv_metadata = _FeatureViewMetadata.from_json(m.group("fv_metadata"))
             entities = [find_and_compose_entity(n) for n in fv_metadata.entities]
             ts_col = fv_metadata.timestamp_col
-            timestamp_col = ts_col if ts_col
+            timestamp_col = ts_col if ts_col not in _LEGACY_TIMESTAMP_COL_PLACEHOLDER_VALS else None
 
             fv = FeatureView._construct_feature_view(
                 name=name,
@@ -1563,7 +1613,7 @@ class FeatureStore:
                 desc=desc,
                 version=version,
                 status=FeatureViewStatus.STATIC,
-                feature_descs=self._fetch_column_descs("VIEW",
+                feature_descs=self._fetch_column_descs("VIEW", fv_name),
                 refresh_freq=None,
                 database=self._config.database.identifier(),
                 schema=self._config.schema.identifier(),
@@ -1571,6 +1621,7 @@ class FeatureStore:
                 refresh_mode=None,
                 refresh_mode_reason=None,
                 owner=row["owner"],
+                infer_schema_df=infer_schema_df,
             )
             return fv
 
@@ -1720,6 +1771,15 @@ class FeatureStore:
         except Exception:
             return False
 
+    def _is_dataset_enabled(self) -> bool:
+        try:
+            self._session.sql(f"SHOW DATASETS IN SCHEMA {self._config.full_schema_path}").collect()
+            return True
+        except SnowparkSQLException as e:
+            if "'DATASETS' does not exist" in e.message:
+                return False
+            raise
+
     def _check_feature_store_object_versions(self) -> None:
         versions = self._collapse_object_versions()
         if len(versions) > 0 and pkg_version.parse(snowml_version.VERSION) < versions[0]:
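The probe works because `SHOW DATASETS` only parses on accounts where the Dataset feature is on; everywhere else Snowflake rejects the unknown object type. A standalone sketch of the same check, assuming only a Snowpark `Session` (`is_dataset_enabled` here is a hypothetical helper, not package API):

```python
from snowflake.snowpark import Session
from snowflake.snowpark.exceptions import SnowparkSQLException

def is_dataset_enabled(session: Session, schema_path: str) -> bool:
    try:
        session.sql(f"SHOW DATASETS IN SCHEMA {schema_path}").collect()
        return True
    except SnowparkSQLException as e:
        # Accounts without FEATURE_DATASET fail with an "unknown object type" error.
        if "'DATASETS' does not exist" in e.message:
            return False
        raise  # anything else is a real failure worth surfacing
```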
snowflake/ml/feature_store/feature_view.py
CHANGED

@@ -5,7 +5,7 @@ import re
 from collections import OrderedDict
 from dataclasses import asdict, dataclass
 from enum import Enum
-from typing import Dict, List, Optional
+from typing import Any, Dict, List, Optional
 
 from snowflake.ml._internal.exceptions import (
     error_codes,
@@ -27,7 +27,8 @@ from snowflake.snowpark.types import (
 )
 
 _FEATURE_VIEW_NAME_DELIMITER = "$"
-[1 line not captured]
+_LEGACY_TIMESTAMP_COL_PLACEHOLDER_VALS = ["FS_TIMESTAMP_COL_PLACEHOLDER_VAL", "NULL"]
+_TIMESTAMP_COL_PLACEHOLDER = "NULL"
 _FEATURE_OBJ_TYPE = "FEATURE_OBJ_TYPE"
 # Feature view version rule is aligned with dataset version rule in SQL.
 _FEATURE_VIEW_VERSION_RE = re.compile(r"^[a-zA-Z0-9][a-zA-Z0-9_.\-]*$")
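The two constants separate what older clients serialized (`FS_TIMESTAMP_COL_PLACEHOLDER_VAL`) from what new clients write (`"NULL"`), so deserialization must treat either as "no timestamp column". A minimal sketch of that mapping, mirroring the `timestamp_col` handling in the feature_store.py hunks above:

```python
from typing import Optional

_LEGACY_TIMESTAMP_COL_PLACEHOLDER_VALS = ["FS_TIMESTAMP_COL_PLACEHOLDER_VAL", "NULL"]

def resolve_timestamp_col(ts_col: str) -> Optional[str]:
    # Placeholders written by any client version collapse back to None.
    return None if ts_col in _LEGACY_TIMESTAMP_COL_PLACEHOLDER_VALS else ts_col

assert resolve_timestamp_col("EVENT_TS") == "EVENT_TS"
assert resolve_timestamp_col("NULL") is None
assert resolve_timestamp_col("FS_TIMESTAMP_COL_PLACEHOLDER_VAL") is None
```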
@@ -121,12 +122,13 @@ class FeatureView:
         timestamp_col: Optional[str] = None,
         refresh_freq: Optional[str] = None,
         desc: str = "",
+        **_kwargs: Any,
     ) -> None:
         """
         Create a FeatureView instance.
 
         Args:
-            name: name of the FeatureView. NOTE:
+            name: name of the FeatureView. NOTE: following Snowflake identifier rule
             entities: entities that the FeatureView is associated with.
             feature_df: Snowpark DataFrame containing data source and all feature feature_df logics.
                 Final projection of the DataFrame should contain feature names, join keys and timestamp(if applicable).
@@ -140,6 +142,7 @@ class FeatureView:
                 NOTE: If refresh_freq is not provided, then FeatureView will be registered as View on Snowflake backend
                     and there won't be extra storage cost.
             desc: description of the FeatureView.
+            _kwargs: reserved kwargs for system generated args. NOTE: DO NOT USE.
         """
 
         self._name: SqlIdentifier = SqlIdentifier(name)
@@ -149,6 +152,7 @@ class FeatureView:
             SqlIdentifier(timestamp_col) if timestamp_col is not None else None
         )
         self._desc: str = desc
+        self._infer_schema_df: DataFrame = _kwargs.get("_infer_schema_df", self._feature_df)
         self._query: str = self._get_query()
         self._version: Optional[FeatureViewVersion] = None
         self._status: FeatureViewStatus = FeatureViewStatus.DRAFT
@@ -295,7 +299,7 @@ class FeatureView:
 
     @property
     def output_schema(self) -> StructType:
-        return self.
+        return self._infer_schema_df.schema
 
     @property
     def refresh_mode(self) -> Optional[str]:
@@ -329,7 +333,7 @@ Got {len(self._feature_df.queries['queries'])}: {self._feature_df.queries['queri
                 f"FeatureView name `{self._name}` contains invalid character `{_FEATURE_VIEW_NAME_DELIMITER}`."
             )
 
-        unescaped_df_cols = to_sql_identifiers(self.
+        unescaped_df_cols = to_sql_identifiers(self._infer_schema_df.columns)
         for e in self._entities:
             for k in e.join_keys:
                 if k not in unescaped_df_cols:
@@ -341,17 +345,17 @@ Got {len(self._feature_df.queries['queries'])}: {self._feature_df.queries['queri
         ts_col = self._timestamp_col
         if ts_col == SqlIdentifier(_TIMESTAMP_COL_PLACEHOLDER):
             raise ValueError(f"Invalid timestamp_col name, cannot be {_TIMESTAMP_COL_PLACEHOLDER}.")
-        if ts_col not in to_sql_identifiers(self.
+        if ts_col not in to_sql_identifiers(self._infer_schema_df.columns):
             raise ValueError(f"timestamp_col {ts_col} is not found in input dataframe.")
 
-        col_type = self.
+        col_type = self._infer_schema_df.schema[ts_col].datatype
         if not isinstance(col_type, (DateType, TimeType, TimestampType, _NumericType)):
             raise ValueError(f"Invalid data type for timestamp_col {ts_col}: {col_type}.")
 
     def _get_feature_names(self) -> List[SqlIdentifier]:
         join_keys = [k for e in self._entities for k in e.join_keys]
         ts_col = [self._timestamp_col] if self._timestamp_col is not None else []
-        feature_names = to_sql_identifiers(self.
+        feature_names = to_sql_identifiers(self._infer_schema_df.columns, case_sensitive=False)
         return [c for c in feature_names if c not in join_keys + ts_col]
 
     def __repr__(self) -> str:
@@ -384,6 +388,9 @@ Got {len(self._feature_df.queries['queries'])}: {self._feature_df.queries['queri
         fv_dict = self.__dict__.copy()
         if "_feature_df" in fv_dict:
             fv_dict.pop("_feature_df")
+        if "_infer_schema_df" in fv_dict:
+            infer_schema_df = fv_dict.pop("_infer_schema_df")
+            fv_dict["_infer_schema_query"] = infer_schema_df.queries["queries"][0]
         fv_dict["_entities"] = [e._to_dict() for e in self._entities]
         fv_dict["_status"] = str(self._status)
         fv_dict["_name"] = str(self._name) if self._name is not None else None
@@ -440,6 +447,7 @@ Got {len(self._feature_df.queries['queries'])}: {self._feature_df.queries['queri
             refresh_mode=json_dict["_refresh_mode"],
             refresh_mode_reason=json_dict["_refresh_mode_reason"],
             owner=json_dict["_owner"],
+            infer_schema_df=session.sql(json_dict.get("_infer_schema_query", None)),
         )
 
     @staticmethod
@@ -471,6 +479,7 @@ Got {len(self._feature_df.queries['queries'])}: {self._feature_df.queries['queri
         refresh_mode: Optional[str],
         refresh_mode_reason: Optional[str],
         owner: Optional[str],
+        infer_schema_df: Optional[DataFrame],
     ) -> FeatureView:
         fv = FeatureView(
             name=name,
@@ -478,6 +487,7 @@ Got {len(self._feature_df.queries['queries'])}: {self._feature_df.queries['queri
             feature_df=feature_df,
             timestamp_col=timestamp_col,
             desc=desc,
+            _infer_schema_df=infer_schema_df,
         )
         fv._version = FeatureViewVersion(version) if version is not None else None
         fv._status = status
snowflake/ml/fileset/embedded_stage_fs.py
CHANGED

@@ -78,22 +78,26 @@ class SFEmbeddedStageFileSystem(stage_fs.SFStageFileSystem):
             match = _SNOWURL_PATH_RE.fullmatch(file)
             assert match is not None and match.group("filepath") is not None
             versions_dict[match.group("version")].append(match.group("filepath"))
-        presigned_urls: List[Tuple[str, str]] = []
         try:
+            async_jobs: List[snowpark.AsyncJob] = []
             for version, version_files in versions_dict.items():
                 for file in version_files:
                     stage_loc = f"{self.stage_name}/versions/{version}"
-                    [9 lines not captured]
-                    )
+                    query_result = self._session.sql(
+                        f"select '{version}/{file}' as name,"
+                        f" get_presigned_url('{stage_loc}', '{file}', {url_lifetime}) as url"
+                    ).collect(
+                        block=False,
+                        statement_params=telemetry.get_function_usage_statement_params(
+                            project=stage_fs._PROJECT,
+                            api_calls=[snowpark.DataFrame.collect],
+                        ),
                     )
+                    async_jobs.append(query_result)
+            presigned_urls: List[Tuple[str, str]] = [
+                (r["NAME"], r["URL"]) for job in async_jobs for r in stage_fs._resolve_async_job(job)
+            ]
+            return presigned_urls
         except snowpark_exceptions.SnowparkClientException as e:
             if e.message.startswith(fileset_errors.ERRNO_DOMAIN_NOT_EXIST) or e.message.startswith(
                 fileset_errors.ERRNO_STAGE_NOT_EXIST
@@ -109,7 +113,6 @@ class SFEmbeddedStageFileSystem(stage_fs.SFStageFileSystem):
                 error_code=error_codes.INTERNAL_SNOWML_ERROR,
                 original_exception=fileset_errors.FileSetError(str(e)),
             )
-        return presigned_urls
 
     @classmethod
     def _parent(cls, path: str) -> str:
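The change converts N sequential round trips into N concurrent queries: each `collect(block=False)` returns immediately with an `AsyncJob`, and results are drained afterwards. A simplified sketch of the fan-out/fan-in shape, assuming a Snowpark `Session` and omitting the error handling the package wraps around `_resolve_async_job`:

```python
from typing import List, Tuple
from snowflake.snowpark import AsyncJob, Session

def fetch_presigned_urls(
    session: Session, stage_loc: str, files: List[str], url_lifetime: int = 3600
) -> List[Tuple[str, str]]:
    # Fan out: submit one non-blocking GET_PRESIGNED_URL query per file.
    jobs: List[AsyncJob] = [
        session.sql(
            f"select '{f}' as name, get_presigned_url('{stage_loc}', '{f}', {url_lifetime}) as url"
        ).collect(block=False)
        for f in files
    ]
    # Fan in: block once per job and flatten the row results.
    return [(row["NAME"], row["URL"]) for job in jobs for row in job.result()]
```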
snowflake/ml/fileset/snowfs.py
CHANGED
@@ -104,7 +104,8 @@ class SnowFileSystem(sfcfs.SFFileSystem):
         if self._IS_BUGGED_VERSION:
             match = _SNOWURL_PATTERN.fullmatch(abs_path)
             assert match is not None
-            [1 line not captured]
+            if match.group("relpath"):
+                abs_path = abs_path.replace(match.group("relpath"), match.group("relpath").lstrip("/"))
         return abs_path
 
     @classmethod
@@ -145,7 +146,7 @@ class SnowFileSystem(sfcfs.SFFileSystem):
         logging.debug(f"Parsed snow URL: {snowurl_match.groups()}")
         # FIXME(dhung): Temporary fix for bug in GS version 8.17
         if cls._IS_BUGGED_VERSION:
-            filepath =
+            filepath = f"versions/{version}//{relative_path}"
         return _SFFileEntityPath(
             domain=domain, name=name, version=version, relative_path=relative_path, filepath=filepath
         )
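Both snowfs.py changes compensate for the same server quirk: the affected GS build reports stage paths as `versions/<version>//<relpath>` with a doubled slash, so the client strips the leading slash from the captured `relpath` group when normalizing. A toy illustration with a stand-in regex (the real `_SNOWURL_PATTERN` is more involved):

```python
import re

# Stand-in for _SNOWURL_PATTERN; only the "relpath" group behavior matters here.
_PATTERN = re.compile(r"versions/(?P<version>[^/]+)/(?P<relpath>/.*)?$")

abs_path = "snow://dataset/my_ds/versions/V1//data/part-000.parquet"
match = _PATTERN.search(abs_path)
assert match is not None
if match.group("relpath"):
    # "/data/..." keeps its place in the path once the extra slash is dropped.
    abs_path = abs_path.replace(match.group("relpath"), match.group("relpath").lstrip("/"))
print(abs_path)  # snow://dataset/my_ds/versions/V1/data/part-000.parquet
```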
|