snowflake-ml-python 1.4.1__py3-none-any.whl → 1.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +72 -31
- snowflake/ml/_internal/exceptions/dataset_error_messages.py +5 -0
- snowflake/ml/_internal/exceptions/dataset_errors.py +24 -0
- snowflake/ml/_internal/exceptions/error_codes.py +3 -0
- snowflake/ml/_internal/lineage/data_source.py +10 -0
- snowflake/ml/_internal/lineage/lineage_utils.py +95 -0
- snowflake/ml/_internal/telemetry.py +1 -0
- snowflake/ml/_internal/utils/identifier.py +1 -1
- snowflake/ml/_internal/utils/sql_identifier.py +14 -1
- snowflake/ml/dataset/__init__.py +11 -0
- snowflake/ml/dataset/dataset.py +455 -129
- snowflake/ml/dataset/dataset_factory.py +53 -0
- snowflake/ml/dataset/dataset_metadata.py +103 -0
- snowflake/ml/dataset/dataset_reader.py +199 -0
- snowflake/ml/feature_store/__init__.py +6 -0
- snowflake/ml/feature_store/access_manager.py +279 -0
- snowflake/ml/feature_store/feature_store.py +544 -358
- snowflake/ml/feature_store/feature_view.py +55 -16
- snowflake/ml/fileset/embedded_stage_fs.py +149 -0
- snowflake/ml/fileset/sfcfs.py +0 -4
- snowflake/ml/fileset/snowfs.py +160 -0
- snowflake/ml/fileset/stage_fs.py +25 -10
- snowflake/ml/model/__init__.py +2 -2
- snowflake/ml/model/_api.py +16 -1
- snowflake/ml/model/_client/model/model_impl.py +65 -31
- snowflake/ml/model/_client/model/model_version_impl.py +159 -2
- snowflake/ml/model/_client/ops/metadata_ops.py +27 -4
- snowflake/ml/model/_client/ops/model_ops.py +268 -83
- snowflake/ml/model/_client/sql/_base.py +34 -0
- snowflake/ml/model/_client/sql/model.py +42 -47
- snowflake/ml/model/_client/sql/model_version.py +164 -39
- snowflake/ml/model/_client/sql/stage.py +6 -32
- snowflake/ml/model/_client/sql/tag.py +32 -56
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +5 -1
- snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +1 -0
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +2 -0
- snowflake/ml/model/_deploy_client/utils/constants.py +0 -5
- snowflake/ml/model/_deploy_client/utils/snowservice_client.py +21 -50
- snowflake/ml/model/_model_composer/model_composer.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +22 -0
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +11 -0
- snowflake/ml/model/_packager/model_env/model_env.py +41 -0
- snowflake/ml/model/_packager/model_handlers/mlflow.py +2 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +1 -5
- snowflake/ml/model/_packager/model_packager.py +0 -3
- snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +55 -3
- snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +34 -18
- snowflake/ml/modeling/_internal/model_trainer.py +7 -0
- snowflake/ml/modeling/_internal/model_trainer_builder.py +42 -9
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +50 -21
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +24 -2
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +340 -17
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +51 -52
- snowflake/ml/modeling/cluster/affinity_propagation.py +51 -52
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +51 -52
- snowflake/ml/modeling/cluster/birch.py +53 -52
- snowflake/ml/modeling/cluster/bisecting_k_means.py +53 -52
- snowflake/ml/modeling/cluster/dbscan.py +51 -52
- snowflake/ml/modeling/cluster/feature_agglomeration.py +53 -52
- snowflake/ml/modeling/cluster/k_means.py +53 -52
- snowflake/ml/modeling/cluster/mean_shift.py +51 -52
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +53 -52
- snowflake/ml/modeling/cluster/optics.py +51 -52
- snowflake/ml/modeling/cluster/spectral_biclustering.py +51 -52
- snowflake/ml/modeling/cluster/spectral_clustering.py +51 -52
- snowflake/ml/modeling/cluster/spectral_coclustering.py +51 -52
- snowflake/ml/modeling/compose/column_transformer.py +53 -52
- snowflake/ml/modeling/compose/transformed_target_regressor.py +51 -52
- snowflake/ml/modeling/covariance/elliptic_envelope.py +51 -52
- snowflake/ml/modeling/covariance/empirical_covariance.py +51 -52
- snowflake/ml/modeling/covariance/graphical_lasso.py +51 -52
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +51 -52
- snowflake/ml/modeling/covariance/ledoit_wolf.py +51 -52
- snowflake/ml/modeling/covariance/min_cov_det.py +51 -52
- snowflake/ml/modeling/covariance/oas.py +51 -52
- snowflake/ml/modeling/covariance/shrunk_covariance.py +51 -52
- snowflake/ml/modeling/decomposition/dictionary_learning.py +53 -52
- snowflake/ml/modeling/decomposition/factor_analysis.py +53 -52
- snowflake/ml/modeling/decomposition/fast_ica.py +53 -52
- snowflake/ml/modeling/decomposition/incremental_pca.py +53 -52
- snowflake/ml/modeling/decomposition/kernel_pca.py +53 -52
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +53 -52
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +53 -52
- snowflake/ml/modeling/decomposition/pca.py +53 -52
- snowflake/ml/modeling/decomposition/sparse_pca.py +53 -52
- snowflake/ml/modeling/decomposition/truncated_svd.py +53 -52
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +53 -52
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +51 -52
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +51 -52
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +51 -52
- snowflake/ml/modeling/ensemble/bagging_classifier.py +51 -52
- snowflake/ml/modeling/ensemble/bagging_regressor.py +51 -52
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +51 -52
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +51 -52
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +51 -52
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +51 -52
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +51 -52
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +51 -52
- snowflake/ml/modeling/ensemble/isolation_forest.py +51 -52
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +51 -52
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +51 -52
- snowflake/ml/modeling/ensemble/stacking_regressor.py +53 -52
- snowflake/ml/modeling/ensemble/voting_classifier.py +53 -52
- snowflake/ml/modeling/ensemble/voting_regressor.py +53 -52
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +53 -52
- snowflake/ml/modeling/feature_selection/select_fdr.py +53 -52
- snowflake/ml/modeling/feature_selection/select_fpr.py +53 -52
- snowflake/ml/modeling/feature_selection/select_fwe.py +53 -52
- snowflake/ml/modeling/feature_selection/select_k_best.py +53 -52
- snowflake/ml/modeling/feature_selection/select_percentile.py +53 -52
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +53 -52
- snowflake/ml/modeling/feature_selection/variance_threshold.py +53 -52
- snowflake/ml/modeling/framework/base.py +64 -36
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +51 -52
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +51 -52
- snowflake/ml/modeling/impute/iterative_imputer.py +53 -52
- snowflake/ml/modeling/impute/knn_imputer.py +53 -52
- snowflake/ml/modeling/impute/missing_indicator.py +53 -52
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +53 -52
- snowflake/ml/modeling/kernel_approximation/nystroem.py +53 -52
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +53 -52
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +53 -52
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +53 -52
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +51 -52
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +51 -52
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +51 -52
- snowflake/ml/modeling/linear_model/ard_regression.py +51 -52
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +51 -52
- snowflake/ml/modeling/linear_model/elastic_net.py +51 -52
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +51 -52
- snowflake/ml/modeling/linear_model/gamma_regressor.py +51 -52
- snowflake/ml/modeling/linear_model/huber_regressor.py +51 -52
- snowflake/ml/modeling/linear_model/lars.py +51 -52
- snowflake/ml/modeling/linear_model/lars_cv.py +51 -52
- snowflake/ml/modeling/linear_model/lasso.py +51 -52
- snowflake/ml/modeling/linear_model/lasso_cv.py +51 -52
- snowflake/ml/modeling/linear_model/lasso_lars.py +51 -52
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +51 -52
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +51 -52
- snowflake/ml/modeling/linear_model/linear_regression.py +51 -52
- snowflake/ml/modeling/linear_model/logistic_regression.py +51 -52
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +51 -52
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +51 -52
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +51 -52
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +51 -52
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +51 -52
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +51 -52
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +51 -52
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +51 -52
- snowflake/ml/modeling/linear_model/perceptron.py +51 -52
- snowflake/ml/modeling/linear_model/poisson_regressor.py +51 -52
- snowflake/ml/modeling/linear_model/ransac_regressor.py +51 -52
- snowflake/ml/modeling/linear_model/ridge.py +51 -52
- snowflake/ml/modeling/linear_model/ridge_classifier.py +51 -52
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +51 -52
- snowflake/ml/modeling/linear_model/ridge_cv.py +51 -52
- snowflake/ml/modeling/linear_model/sgd_classifier.py +51 -52
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +51 -52
- snowflake/ml/modeling/linear_model/sgd_regressor.py +51 -52
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +51 -52
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +51 -52
- snowflake/ml/modeling/manifold/isomap.py +53 -52
- snowflake/ml/modeling/manifold/mds.py +53 -52
- snowflake/ml/modeling/manifold/spectral_embedding.py +53 -52
- snowflake/ml/modeling/manifold/tsne.py +53 -52
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +51 -52
- snowflake/ml/modeling/mixture/gaussian_mixture.py +51 -52
- snowflake/ml/modeling/model_selection/grid_search_cv.py +21 -23
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +38 -20
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +51 -52
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +51 -52
- snowflake/ml/modeling/multiclass/output_code_classifier.py +51 -52
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +51 -52
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +51 -52
- snowflake/ml/modeling/naive_bayes/complement_nb.py +51 -52
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +51 -52
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +51 -52
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +51 -52
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +51 -52
- snowflake/ml/modeling/neighbors/kernel_density.py +51 -52
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +51 -52
- snowflake/ml/modeling/neighbors/nearest_centroid.py +51 -52
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +51 -52
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +53 -52
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +51 -52
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +51 -52
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +53 -52
- snowflake/ml/modeling/neural_network/mlp_classifier.py +51 -52
- snowflake/ml/modeling/neural_network/mlp_regressor.py +51 -52
- snowflake/ml/modeling/pipeline/pipeline.py +538 -36
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +12 -0
- snowflake/ml/modeling/preprocessing/polynomial_features.py +53 -52
- snowflake/ml/modeling/semi_supervised/label_propagation.py +51 -52
- snowflake/ml/modeling/semi_supervised/label_spreading.py +51 -52
- snowflake/ml/modeling/svm/linear_svc.py +51 -52
- snowflake/ml/modeling/svm/linear_svr.py +51 -52
- snowflake/ml/modeling/svm/nu_svc.py +51 -52
- snowflake/ml/modeling/svm/nu_svr.py +51 -52
- snowflake/ml/modeling/svm/svc.py +51 -52
- snowflake/ml/modeling/svm/svr.py +51 -52
- snowflake/ml/modeling/tree/decision_tree_classifier.py +51 -52
- snowflake/ml/modeling/tree/decision_tree_regressor.py +51 -52
- snowflake/ml/modeling/tree/extra_tree_classifier.py +51 -52
- snowflake/ml/modeling/tree/extra_tree_regressor.py +51 -52
- snowflake/ml/modeling/xgboost/xgb_classifier.py +51 -52
- snowflake/ml/modeling/xgboost/xgb_regressor.py +51 -52
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +51 -52
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +51 -52
- snowflake/ml/registry/_manager/model_manager.py +36 -7
- snowflake/ml/registry/model_registry.py +3 -149
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.1.dist-info → snowflake_ml_python-1.5.1.dist-info}/METADATA +112 -7
- {snowflake_ml_python-1.4.1.dist-info → snowflake_ml_python-1.5.1.dist-info}/RECORD +216 -206
- snowflake/ml/registry/_artifact_manager.py +0 -156
- snowflake/ml/registry/artifact.py +0 -46
- {snowflake_ml_python-1.4.1.dist-info → snowflake_ml_python-1.5.1.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.1.dist-info → snowflake_ml_python-1.5.1.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.1.dist-info → snowflake_ml_python-1.5.1.dist-info}/top_level.txt +0 -0
snowflake/ml/feature_store/feature_store.py

@@ -8,13 +8,29 @@ import re
 import warnings
 from dataclasses import dataclass
 from enum import Enum
-from typing import
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    List,
+    Literal,
+    Optional,
+    Tuple,
+    TypeVar,
+    Union,
+    cast,
+    overload,
+)

+import packaging.version as pkg_version
+import snowflake.ml.version as snowml_version
 from pytimeparse.timeparse import timeparse
 from typing_extensions import Concatenate, ParamSpec

+from snowflake.ml import dataset
 from snowflake.ml._internal import telemetry
 from snowflake.ml._internal.exceptions import (
+    dataset_errors,
     error_codes,
     exceptions as snowml_exceptions,
 )

@@ -23,25 +39,27 @@ from snowflake.ml._internal.utils.sql_identifier import (
     SqlIdentifier,
     to_sql_identifiers,
 )
-from snowflake.ml.dataset.
-from snowflake.ml.feature_store.entity import
-    _ENTITY_NAME_LENGTH_LIMIT,
-    _FEATURE_VIEW_ENTITY_TAG_DELIMITER,
-    Entity,
-)
+from snowflake.ml.dataset.dataset_metadata import FeatureStoreMetadata
+from snowflake.ml.feature_store.entity import _ENTITY_NAME_LENGTH_LIMIT, Entity
 from snowflake.ml.feature_store.feature_view import (
     _FEATURE_OBJ_TYPE,
     _FEATURE_VIEW_NAME_DELIMITER,
-
+    _LEGACY_TIMESTAMP_COL_PLACEHOLDER_VALS,
     FeatureView,
     FeatureViewSlice,
     FeatureViewStatus,
     FeatureViewVersion,
+    _FeatureViewMetadata,
 )
 from snowflake.snowpark import DataFrame, Row, Session, functions as F
-from snowflake.snowpark._internal import type_utils, utils as snowpark_utils
 from snowflake.snowpark.exceptions import SnowparkSQLException
-from snowflake.snowpark.types import
+from snowflake.snowpark.types import (
+    ArrayType,
+    StringType,
+    StructField,
+    StructType,
+    TimestampType,
+)

 _Args = ParamSpec("_Args")
 _RT = TypeVar("_RT")

@@ -49,38 +67,80 @@ _RT = TypeVar("_RT")
 logger = logging.getLogger(__name__)

 _ENTITY_TAG_PREFIX = "SNOWML_FEATURE_STORE_ENTITY_"
-_FEATURE_VIEW_ENTITY_TAG = "SNOWML_FEATURE_STORE_FV_ENTITIES"
-_FEATURE_VIEW_TS_COL_TAG = "SNOWML_FEATURE_STORE_FV_TS_COL"
 _FEATURE_STORE_OBJECT_TAG = "SNOWML_FEATURE_STORE_OBJECT"
+_FEATURE_VIEW_METADATA_TAG = "SNOWML_FEATURE_VIEW_METADATA"
+
+
+@dataclass(frozen=True)
+class _FeatureStoreObjInfo:
+    type: _FeatureStoreObjTypes
+    pkg_version: str
+
+    def to_json(self) -> str:
+        state_dict = self.__dict__.copy()
+        state_dict["type"] = state_dict["type"].value
+        return json.dumps(state_dict)
+
+    @classmethod
+    def from_json(cls, json_str: str) -> _FeatureStoreObjInfo:
+        json_dict = json.loads(json_str)
+        # since we may introduce new fields in the json blob in the future,
+        # in order to guarantee compatibility, we need to select ones that can be
+        # decoded in the current version
+        state_dict = {}
+        state_dict["type"] = _FeatureStoreObjTypes.parse(json_dict["type"])
+        state_dict["pkg_version"] = json_dict["pkg_version"]
+        return cls(**state_dict)  # type: ignore[arg-type]


 # TODO: remove "" after dataset is updated
 class _FeatureStoreObjTypes(Enum):
-
+    UNKNOWN = "UNKNOWN"  # for forward compatibility
+    MANAGED_FEATURE_VIEW = "MANAGED_FEATURE_VIEW"
+    EXTERNAL_FEATURE_VIEW = "EXTERNAL_FEATURE_VIEW"
     FEATURE_VIEW_REFRESH_TASK = "FEATURE_VIEW_REFRESH_TASK"
     TRAINING_DATA = ""

+    @classmethod
+    def parse(cls, val: str) -> _FeatureStoreObjTypes:
+        try:
+            return cls(val)
+        except ValueError:
+            return cls.UNKNOWN
+

 _PROJECT = "FeatureStore"
 _DT_OR_VIEW_QUERY_PATTERN = re.compile(
     r"""CREATE\ (OR\ REPLACE\ )?(?P<obj_type>(DYNAMIC\ TABLE|VIEW))\ .*
     COMMENT\ =\ '(?P<comment>.*)'\s*
-    TAG.*?{
-    .*?{ts_col_tag}\ =\ '(?P<ts_col>.*?)',?.*?
+    TAG.*?{fv_metadata_tag}\ =\ '(?P<fv_metadata>.*?)',?.*?
     AS\ (?P<query>.*)
     """.format(
-
+        fv_metadata_tag=_FEATURE_VIEW_METADATA_TAG,
     ),
     flags=re.DOTALL | re.IGNORECASE | re.X,
 )

+_LIST_FEATURE_VIEW_SCHEMA = StructType(
+    [
+        StructField("name", StringType()),
+        StructField("version", StringType()),
+        StructField("database_name", StringType()),
+        StructField("schema_name", StringType()),
+        StructField("created_on", TimestampType()),
+        StructField("owner", StringType()),
+        StructField("desc", StringType()),
+        StructField("entities", ArrayType(StringType())),
+    ]
+)
+

 class CreationMode(Enum):
     FAIL_IF_NOT_EXIST = 1
     CREATE_IF_NOT_EXIST = 2


-@dataclass
+@dataclass(frozen=True)
 class _FeatureStoreConfig:
     database: SqlIdentifier
     schema: SqlIdentifier
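
The _FEATURE_STORE_OBJECT_TAG value is now a JSON payload carrying both the object type and the producing package version, instead of a bare enum string. A minimal round-trip sketch using the private helpers added above (private API, subject to change; the "1.5.1" version string is a placeholder and the exact payload on a live account may differ):

    # Sketch only: these names are private to snowflake.ml.feature_store.feature_store.
    from snowflake.ml.feature_store.feature_store import (
        _FeatureStoreObjInfo,
        _FeatureStoreObjTypes,
    )

    info = _FeatureStoreObjInfo(_FeatureStoreObjTypes.MANAGED_FEATURE_VIEW, "1.5.1")
    blob = info.to_json()  # '{"type": "MANAGED_FEATURE_VIEW", "pkg_version": "1.5.1"}'
    assert _FeatureStoreObjInfo.from_json(blob).type is _FeatureStoreObjTypes.MANAGED_FEATURE_VIEW
    # Unrecognized type strings parse to UNKNOWN instead of raising, which keeps
    # older clients forward compatible with tags written by newer package versions:
    assert _FeatureStoreObjTypes.parse("SOME_FUTURE_TYPE") is _FeatureStoreObjTypes.UNKNOWN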
@@ -111,14 +171,14 @@ def switch_warehouse(
     return wrapper


-def dispatch_decorator(
-
-
+def dispatch_decorator() -> Callable[
+    [Callable[Concatenate[FeatureStore, _Args], _RT]],
+    Callable[Concatenate[FeatureStore, _Args], _RT],
+]:
     def decorator(
         f: Callable[Concatenate[FeatureStore, _Args], _RT]
     ) -> Callable[Concatenate[FeatureStore, _Args], _RT]:
         @telemetry.send_api_usage_telemetry(project=_PROJECT)
-        @snowpark_utils.private_preview(version=prpr_version)
         @switch_warehouse
         @functools.wraps(f)
         def wrap(self: FeatureStore, /, *args: _Args.args, **kargs: _Args.kwargs) -> _RT:

@@ -135,7 +195,6 @@ class FeatureStore:
     """

     @telemetry.send_api_usage_telemetry(project=_PROJECT)
-    @snowpark_utils.private_preview(version="1.0.8")
     def __init__(
         self,
         session: Session,

@@ -178,7 +237,7 @@ class FeatureStore:
         # search space used in query "SHOW <object_TYPE> LIKE <object_name> IN <search_space>"
         # object domain used in query "TAG_REFERENCE(<object_name>, <object_domain>)"
         self._obj_search_spaces = {
-            "
+            "DATASETS": (self._config.full_schema_path, "DATASET"),
             "DYNAMIC TABLES": (self._config.full_schema_path, "TABLE"),
             "VIEWS": (self._config.full_schema_path, "TABLE"),
             "SCHEMAS": (f"DATABASE {self._config.database}", "SCHEMA"),

@@ -195,34 +254,27 @@ class FeatureStore:

         else:
             try:
-
-
-
-
-
-
-                    _FEATURE_VIEW_TS_COL_TAG,
-                ]
-            ):
+                # Explicitly check if schema exists first since we may not have CREATE SCHEMA privilege
+                if len(self._find_object("SCHEMAS", self._config.schema)) == 0:
+                    self._session.sql(f"CREATE SCHEMA IF NOT EXISTS {self._config.full_schema_path}").collect(
+                        statement_params=self._telemetry_stmp
+                    )
+                for tag in to_sql_identifiers([_FEATURE_VIEW_METADATA_TAG, _FEATURE_STORE_OBJECT_TAG]):
                     self._session.sql(f"CREATE TAG IF NOT EXISTS {self._get_fully_qualified_name(tag)}").collect(
                         statement_params=self._telemetry_stmp
                     )
-
-                self._session.sql(
-                    f"""CREATE TAG IF NOT EXISTS {self._get_fully_qualified_name(_FEATURE_STORE_OBJECT_TAG)}
-                    ALLOWED_VALUES {','.join([f"'{v.value}'" for v in _FeatureStoreObjTypes])}"""
-                ).collect(statement_params=self._telemetry_stmp)
             except Exception as e:
-                self.clear()
                 raise snowml_exceptions.SnowflakeMLException(
                     error_code=error_codes.INTERNAL_SNOWPARK_ERROR,
                     original_exception=RuntimeError(f"Failed to create feature store {name}: {e}."),
                 )

+        # TODO: remove this after tag_ref_internal rollout
+        self._use_optimized_tag_ref = self._tag_ref_internal_enabled()
+        self._check_feature_store_object_versions()
         logger.info(f"Successfully connected to feature store: {self._config.full_schema_path}.")

     @telemetry.send_api_usage_telemetry(project=_PROJECT)
-    @snowpark_utils.private_preview(version="1.0.12")
     def update_default_warehouse(self, warehouse_name: str) -> None:
         """Update default warehouse for feature store.

@@ -242,7 +294,7 @@ class FeatureStore:

         self._default_warehouse = warehouse

-    @dispatch_decorator(
+    @dispatch_decorator()
     def register_entity(self, entity: Entity) -> Entity:
         """
         Register Entity in the FeatureStore.

@@ -268,13 +320,13 @@ class FeatureStore:
             return entity

         # allowed_values will add double-quotes around each value, thus use resolved str here.
-        join_keys = [f"
+        join_keys = [f"{key.resolved()}" for key in entity.join_keys]
         join_keys_str = ",".join(join_keys)
         full_tag_name = self._get_fully_qualified_name(tag_name)
         try:
             self._session.sql(
                 f"""CREATE TAG IF NOT EXISTS {full_tag_name}
-                ALLOWED_VALUES {join_keys_str}
+                ALLOWED_VALUES '{join_keys_str}'
                 COMMENT = '{entity.desc}'
                 """
             ).collect(statement_params=self._telemetry_stmp)

@@ -289,7 +341,7 @@ class FeatureStore:
         return self.get_entity(entity.name)

     # TODO: add support to update column desc once SNOW-894249 is fixed
-    @dispatch_decorator(
+    @dispatch_decorator()
     def register_feature_view(
         self,
         feature_view: FeatureView,

@@ -342,7 +394,6 @@ class FeatureStore:
             ),
         )

-        # TODO: ideally we should move this to FeatureView creation time
         for e in feature_view.entities:
             if not self._validate_entity_exists(e.name):
                 raise snowml_exceptions.SnowflakeMLException(

@@ -358,12 +409,23 @@ class FeatureStore:
                 pass

         fully_qualified_name = self._get_fully_qualified_name(feature_view_name)
-
-
-
-
-
-
+        refresh_freq = feature_view.refresh_freq
+
+        if refresh_freq is not None:
+            obj_info = _FeatureStoreObjInfo(_FeatureStoreObjTypes.MANAGED_FEATURE_VIEW, snowml_version.VERSION)
+        else:
+            obj_info = _FeatureStoreObjInfo(_FeatureStoreObjTypes.EXTERNAL_FEATURE_VIEW, snowml_version.VERSION)
+
+        tagging_clause = [
+            f"{self._get_fully_qualified_name(_FEATURE_STORE_OBJECT_TAG)} = '{obj_info.to_json()}'",
+            f"{self._get_fully_qualified_name(_FEATURE_VIEW_METADATA_TAG)} = '{feature_view._metadata().to_json()}'",
+        ]
+        for e in feature_view.entities:
+            join_keys = [f"{key.resolved()}" for key in e.join_keys]
+            tagging_clause.append(
+                f"{self._get_fully_qualified_name(self._get_entity_name(e.name))} = '{','.join(join_keys)}'"
+            )
+        tagging_clause_str = ",\n".join(tagging_clause)

         def create_col_desc(col: StructField) -> str:
             desc = feature_view.feature_descs.get(SqlIdentifier(col.name), None)

@@ -371,7 +433,6 @@ class FeatureStore:
             return f"{col.name} {desc}"

         column_descs = ", ".join([f"{create_col_desc(col)}" for col in feature_view.output_schema.fields])
-        refresh_freq = feature_view.refresh_freq

         if refresh_freq is not None:
             schedule_task = refresh_freq != "DOWNSTREAM" and timeparse(refresh_freq) is None

@@ -380,10 +441,9 @@ class FeatureStore:
                 feature_view,
                 fully_qualified_name,
                 column_descs,
-
+                tagging_clause_str,
                 schedule_task,
                 self._default_warehouse,
-                timestamp_col,
                 block,
                 overwrite,
             )

@@ -393,9 +453,7 @@ class FeatureStore:
             query = f"""CREATE{overwrite_clause} VIEW {fully_qualified_name} ({column_descs})
                 COMMENT = '{feature_view.desc}'
                 TAG (
-                    {
-                    {_FEATURE_VIEW_TS_COL_TAG} = '{timestamp_col}',
-                    {_FEATURE_STORE_OBJECT_TAG} = '{_FeatureStoreObjTypes.FEATURE_VIEW.value}'
+                    {tagging_clause_str}
                 )
                 AS {feature_view.query}
             """

@@ -406,10 +464,10 @@ class FeatureStore:
                 original_exception=RuntimeError(f"Create view {fully_qualified_name} [\n{query}\n] failed: {e}"),
             ) from e

-        logger.info(f"Registered FeatureView {feature_view.name}/{version}.")
+        logger.info(f"Registered FeatureView {feature_view.name}/{version} successfully.")
         return self.get_feature_view(feature_view.name, str(version))

-    @dispatch_decorator(
+    @dispatch_decorator()
     def update_feature_view(
         self, name: str, version: str, refresh_freq: Optional[str] = None, warehouse: Optional[str] = None
     ) -> FeatureView:

@@ -456,7 +514,7 @@ class FeatureStore:
             ) from e
         return self.get_feature_view(name=name, version=version)

-    @dispatch_decorator(
+    @dispatch_decorator()
     def read_feature_view(self, feature_view: FeatureView) -> DataFrame:
         """
         Read FeatureView data.

@@ -478,13 +536,12 @@ class FeatureStore:

         return self._session.sql(f"SELECT * FROM {feature_view.fully_qualified_name()}")

-    @dispatch_decorator(
+    @dispatch_decorator()
     def list_feature_views(
         self,
         entity_name: Optional[str] = None,
         feature_view_name: Optional[str] = None,
-
-    ) -> Union[Optional[DataFrame], List[FeatureView]]:
+    ) -> DataFrame:
         """
         List FeatureViews in the FeatureStore.
         If entity_name is specified, FeatureViews associated with that Entity will be listed.

@@ -493,34 +550,26 @@ class FeatureStore:
         Args:
             entity_name: Entity name.
             feature_view_name: FeatureView name.
-            as_dataframe: whether the return type should be a DataFrame.

         Returns:
-
+            FeatureViews information as a Snowpark DataFrame.
         """
-        if entity_name is not None:
-            entity_name = SqlIdentifier(entity_name)
         if feature_view_name is not None:
             feature_view_name = SqlIdentifier(feature_view_name)

         if entity_name is not None:
-
+            entity_name = SqlIdentifier(entity_name)
+            if self._use_optimized_tag_ref:
+                return self._optimized_find_feature_views(entity_name, feature_view_name)
+            else:
+                return self._find_feature_views(entity_name, feature_view_name)
         else:
-
-            entities = self.list_entities().collect()
+            output_values: List[List[Any]] = []
             for row in self._get_fv_backend_representations(feature_view_name, prefix_match=True):
-
-
-            if as_dataframe:
-                result = None
-                for fv in fvs:
-                    fv_df = fv.to_df(self._session)
-                    result = fv_df if result is None else result.union(fv_df)  # type: ignore[attr-defined]
-                return result
-            else:
-                return fvs
+                self._extract_feature_view_info(row, output_values)
+            return self._session.create_dataframe(output_values, schema=_LIST_FEATURE_VIEW_SCHEMA)

-    @dispatch_decorator(
+    @dispatch_decorator()
     def get_feature_view(self, name: str, version: str) -> FeatureView:
         """
         Retrieve previously registered FeatureView.
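
Note the return-type change above: list_feature_views now always returns a Snowpark DataFrame with the _LIST_FEATURE_VIEW_SCHEMA columns, instead of optionally returning a list of FeatureView objects. A minimal sketch of the new call pattern, assuming an already-connected FeatureStore instance fs and an entity name that exists in it:

    # Sketch only: `fs` is an existing FeatureStore; "CUSTOMER" is a placeholder entity name.
    fv_df = fs.list_feature_views(entity_name="CUSTOMER")  # Snowpark DataFrame
    fv_df.show()  # name, version, database_name, schema_name, created_on, owner, desc, entities
    registered = [(row["NAME"], row["VERSION"]) for row in fv_df.collect()]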
@@ -549,7 +598,7 @@ class FeatureStore:

         return self._compose_feature_view(results[0], self.list_entities().collect())

-    @dispatch_decorator(
+    @dispatch_decorator()
     def resume_feature_view(self, feature_view: FeatureView) -> FeatureView:
         """
         Resume a previously suspended FeatureView.

@@ -562,7 +611,7 @@ class FeatureStore:
         """
         return self._update_feature_view_status(feature_view, "RESUME")

-    @dispatch_decorator(
+    @dispatch_decorator()
     def suspend_feature_view(self, feature_view: FeatureView) -> FeatureView:
         """
         Suspend an active FeatureView.

@@ -575,7 +624,7 @@ class FeatureStore:
         """
         return self._update_feature_view_status(feature_view, "SUSPEND")

-    @dispatch_decorator(
+    @dispatch_decorator()
     def delete_feature_view(self, feature_view: FeatureView) -> None:
         """
         Delete a FeatureView.

@@ -586,6 +635,8 @@ class FeatureStore:
         Raises:
             SnowflakeMLException: [ValueError] FeatureView is not registered.
         """
+        # TODO: we should leverage lineage graph to check downstream deps, and block the deletion
+        # if there're other FVs depending on this
         if feature_view.status == FeatureViewStatus.DRAFT or feature_view.version is None:
             raise snowml_exceptions.SnowflakeMLException(
                 error_code=error_codes.NOT_FOUND,

@@ -608,7 +659,7 @@ class FeatureStore:

         logger.info(f"Deleted FeatureView {feature_view.name}/{feature_view.version}.")

-    @dispatch_decorator(
+    @dispatch_decorator()
     def list_entities(self) -> DataFrame:
         """
         List all Entities in the FeatureStore.

@@ -629,7 +680,7 @@ class FeatureStore:
             ),
         )

-    @dispatch_decorator(
+    @dispatch_decorator()
     def get_entity(self, name: str) -> Entity:
         """
         Retrieve previously registered Entity object.

@@ -659,8 +710,7 @@ class FeatureStore:
                 original_exception=ValueError(f"Cannot find Entity with name: {name}."),
             )

-
-        join_keys = raw_join_keys.strip("[]").split(",")
+        join_keys = self._recompose_join_keys(result[0]["JOIN_KEYS"])

         return Entity._construct_entity(
             name=SqlIdentifier(result[0]["NAME"], case_sensitive=True).identifier(),

@@ -669,7 +719,7 @@ class FeatureStore:
             owner=result[0]["OWNER"],
         )

-    @dispatch_decorator(
+    @dispatch_decorator()
     def delete_entity(self, name: str) -> None:
         """
         Delete a previously registered Entity.

@@ -690,13 +740,13 @@ class FeatureStore:
                 original_exception=ValueError(f"Entity {name} does not exist."),
             )

-        active_feature_views =
+        active_feature_views = self.list_feature_views(entity_name=name).collect(statement_params=self._telemetry_stmp)
+
         if len(active_feature_views) > 0:
+            active_fvs = [r["NAME"] for r in active_feature_views]
             raise snowml_exceptions.SnowflakeMLException(
                 error_code=error_codes.SNOWML_DELETE_FAILED,
-                original_exception=ValueError(
-                    f"Cannot delete Entity {name} due to active FeatureViews: {[f.name for f in active_feature_views]}."
-                ),
+                original_exception=ValueError(f"Cannot delete Entity {name} due to active FeatureViews: {active_fvs}."),
             )

         tag_name = self._get_fully_qualified_name(self._get_entity_name(name))

@@ -705,11 +755,11 @@ class FeatureStore:
         except Exception as e:
             raise snowml_exceptions.SnowflakeMLException(
                 error_code=error_codes.INTERNAL_SNOWPARK_ERROR,
-                original_exception=RuntimeError(f"Failed to
+                original_exception=RuntimeError(f"Failed to delete entity: {e}."),
             ) from e
         logger.info(f"Deleted Entity {name}.")

-    @dispatch_decorator(
+    @dispatch_decorator()
     def retrieve_feature_values(
         self,
         spine_df: DataFrame,
@@ -757,145 +807,163 @@ class FeatureStore:

         return df

-    @
+    @overload
+    def generate_dataset(
+        self,
+        name: str,
+        spine_df: DataFrame,
+        features: List[Union[FeatureView, FeatureViewSlice]],
+        version: Optional[str] = None,
+        spine_timestamp_col: Optional[str] = None,
+        spine_label_cols: Optional[List[str]] = None,
+        exclude_columns: Optional[List[str]] = None,
+        include_feature_view_timestamp_col: bool = False,
+        desc: str = "",
+        output_type: Literal["dataset"] = "dataset",
+    ) -> dataset.Dataset:
+        ...
+
+    @overload
     def generate_dataset(
         self,
+        name: str,
         spine_df: DataFrame,
         features: List[Union[FeatureView, FeatureViewSlice]],
-
+        output_type: Literal["table"],
+        version: Optional[str] = None,
         spine_timestamp_col: Optional[str] = None,
         spine_label_cols: Optional[List[str]] = None,
         exclude_columns: Optional[List[str]] = None,
-        save_mode: str = "errorifexists",
         include_feature_view_timestamp_col: bool = False,
         desc: str = "",
-    ) ->
+    ) -> DataFrame:
+        ...
+
+    @dispatch_decorator()  # type: ignore[misc]
+    def generate_dataset(
+        self,
+        name: str,
+        spine_df: DataFrame,
+        features: List[Union[FeatureView, FeatureViewSlice]],
+        version: Optional[str] = None,
+        spine_timestamp_col: Optional[str] = None,
+        spine_label_cols: Optional[List[str]] = None,
+        exclude_columns: Optional[List[str]] = None,
+        include_feature_view_timestamp_col: bool = False,
+        desc: str = "",
+        output_type: Literal["dataset", "table"] = "dataset",
+    ) -> Union[dataset.Dataset, DataFrame]:
         """
         Generate dataset by given source table and feature views.

         Args:
+            name: The name of the Dataset to be generated. Datasets are uniquely identified within a schema
+                by their name and version.
             spine_df: The fact table contains the raw dataset.
             features: A list of FeatureView or FeatureViewSlice which contains features to be joined.
-
-
-                the provided table. Note result dataset will be a snowflake clone of registered table.
-                New data can append on same registered table and previously generated dataset won't be affected.
-                Default result table name will be a concatenation of materialized_table name and current timestamp.
+            version: The version of the Dataset to be generated. If none specified, the current timestamp
+                will be used instead.
             spine_timestamp_col: Name of timestamp column in spine_df that will be used to join
                 time-series features. If spine_timestamp_col is not none, the input features also must have
                 timestamp_col.
             spine_label_cols: Name of column(s) in spine_df that contains labels.
             exclude_columns: Column names to exclude from the result dataframe.
                 The underlying storage will still contain the columns.
-            save_mode: How new data is saved. currently support:
-                errorifexists: Raise error if registered table already exists.
-                merge: Merge new data if registered table already exists.
             include_feature_view_timestamp_col: Generated dataset will include timestamp column of feature view
                 (if feature view has timestamp column) if set true. Default to false.
             desc: A description about this dataset.
+            output_type: The type of Snowflake storage to use for the generated training data.

         Returns:
-
+            If output_type is "dataset" (default), returns a Dataset object.
+            If output_type is "table", returns a Snowpark DataFrame representing the table.

         Raises:
-            SnowflakeMLException: [ValueError]
-            SnowflakeMLException: [ValueError] spine_df contains more than one query.
-            SnowflakeMLException: [ValueError] Materialized_table contains invalid char `.`.
-            SnowflakeMLException: [ValueError] Materialized_table already exists with save_mode `errorifexists`.
+            SnowflakeMLException: [ValueError] Dataset name/version already exists
             SnowflakeMLException: [ValueError] Snapshot creation failed.
+            SnowflakeMLException: [ValueError] Invalid output_type specified.
             SnowflakeMLException: [RuntimeError] Failed to create clone from table.
             SnowflakeMLException: [RuntimeError] Failed to find resources.
         """
+        if output_type not in {"table", "dataset"}:
+            raise snowml_exceptions.SnowflakeMLException(
+                error_code=error_codes.INVALID_ARGUMENT,
+                original_exception=ValueError(f"Invalid output_type: {output_type}."),
+            )
         if spine_timestamp_col is not None:
             spine_timestamp_col = SqlIdentifier(spine_timestamp_col)
         if spine_label_cols is not None:
             spine_label_cols = to_sql_identifiers(spine_label_cols)  # type: ignore[assignment]

-        allowed_save_mode = {"errorifexists", "merge"}
-        if save_mode.lower() not in allowed_save_mode:
-            raise snowml_exceptions.SnowflakeMLException(
-                error_code=error_codes.INVALID_ARGUMENT,
-                original_exception=ValueError(
-                    f"'{save_mode}' is not supported. Current supported save modes: {','.join(allowed_save_mode)}"
-                ),
-            )
-
-        if len(spine_df.queries["queries"]) != 1:
-            raise snowml_exceptions.SnowflakeMLException(
-                error_code=error_codes.INVALID_ARGUMENT,
-                original_exception=ValueError(
-                    f"spine_df must contain only one query. Got: {spine_df.queries['queries']}"
-                ),
-            )
-
         result_df, join_keys = self._join_features(
             spine_df, features, spine_timestamp_col, include_feature_view_timestamp_col
         )

-
-
-
-
-
-
-
-
-        # TODO (wezhou) change materialized_table to SqlIdentifier
-        found_rows = self._find_object("TABLES", SqlIdentifier(materialized_table))
-        if save_mode.lower() == "errorifexists" and len(found_rows) > 0:
-            raise snowml_exceptions.SnowflakeMLException(
-                error_code=error_codes.OBJECT_ALREADY_EXISTS,
-                original_exception=ValueError(f"Dataset table {materialized_table} already exists."),
-            )
-
-        self._dump_dataset(result_df, materialized_table, join_keys, spine_timestamp_col)
-
-        snapshot_table = f"{materialized_table}_{datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')}"
-        snapshot_table = self._get_fully_qualified_name(snapshot_table)
-        materialized_table = self._get_fully_qualified_name(materialized_table)
-
-        try:
-            self._session.sql(f"CREATE TABLE {snapshot_table} CLONE {materialized_table}").collect(
-                statement_params=self._telemetry_stmp
-            )
-        except Exception as e:
-            raise snowml_exceptions.SnowflakeMLException(
-                error_code=error_codes.INTERNAL_SNOWPARK_ERROR,
-                original_exception=RuntimeError(
-                    f"Failed to create clone {materialized_table} from table {snapshot_table}: {e}."
-                ),
-            ) from e
-
-        result_df = self._session.sql(f"SELECT * FROM {snapshot_table}")
+        # Convert name to fully qualified name if not already fully qualified
+        db_name, schema_name, object_name, _ = identifier.parse_schema_level_object_identifier(name)
+        name = "{}.{}.{}".format(
+            db_name or self._config.database,
+            schema_name or self._config.schema,
+            object_name,
+        )
+        version = version or datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")

         if exclude_columns is not None:
             result_df = self._exclude_columns(result_df, exclude_columns)

         fs_meta = FeatureStoreMetadata(
-            spine_query=spine_df.queries["queries"][
-
-
+            spine_query=spine_df.queries["queries"][-1],
+            serialized_feature_views=[fv.to_json() for fv in features],
+            spine_timestamp_col=spine_timestamp_col,
         )

-
-
-
-
-
-
-
-
-
-
-
+        try:
+            if output_type == "table":
+                table_name = f"{name}_{version}"
+                result_df.write.mode("errorifexists").save_as_table(table_name)  # type: ignore[call-overload]
+                ds_df = self._session.table(table_name)
+                return ds_df
+            else:
+                assert output_type == "dataset"
+                if not self._is_dataset_enabled():
+                    raise snowml_exceptions.SnowflakeMLException(
+                        error_code=error_codes.SNOWML_CREATE_FAILED,
+                        original_exception=RuntimeError(
+                            "Dataset is not enabled in your account. Ask your account admin to set"
+                            ' FEATURE_DATASET=ENABLED or set output_type="table" to generate the data'
+                            " as a Snowflake Table instead."
+                        ),
+                    )
+                ds: dataset.Dataset = dataset.create_from_dataframe(
+                    self._session,
+                    name,
+                    version,
+                    input_dataframe=result_df,
+                    exclude_cols=[spine_timestamp_col],
+                    label_cols=spine_label_cols,
+                    properties=fs_meta,
+                    comment=desc,
+                )
+                return ds
+
+        except dataset_errors.DatasetExistError as e:
+            raise snowml_exceptions.SnowflakeMLException(
+                error_code=error_codes.OBJECT_ALREADY_EXISTS,
+                original_exception=RuntimeError(str(e)),
+            ) from e
+        except SnowparkSQLException as e:
+            raise snowml_exceptions.SnowflakeMLException(
+                error_code=error_codes.INTERNAL_SNOWPARK_ERROR,
+                original_exception=RuntimeError(f"An error occurred during dataset generation: {e}."),
+            ) from e

-    @dispatch_decorator(
-    def load_feature_views_from_dataset(self,
+    @dispatch_decorator()
+    def load_feature_views_from_dataset(self, ds: dataset.Dataset) -> List[Union[FeatureView, FeatureViewSlice]]:
         """
         Retrieve FeatureViews used during Dataset construction.

         Args:
-
+            ds: Dataset object created from feature store.

         Returns:
             List of FeatureViews used during Dataset construction.
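
The materialized-table/save_mode flow is replaced above by a Dataset-first API keyed on a name/version pair, with an output_type escape hatch for accounts where Datasets are not enabled. A minimal usage sketch of the new signature; fs (a connected FeatureStore), spine_df, and a registered FeatureView fv are assumed, and MY_DB/MY_SCHEMA/LABEL are placeholder names:

    # Sketch only: `fs`, `spine_df`, and registered FeatureView `fv` are assumed to exist.
    ds = fs.generate_dataset(
        name="MY_DB.MY_SCHEMA.CUSTOMER_TRAINING",  # unqualified names resolve to the store's db/schema
        spine_df=spine_df,
        features=[fv],
        version="V1",                 # omitted -> a timestamp is used
        spine_label_cols=["LABEL"],
    )                                  # returns snowflake.ml.dataset.Dataset

    # Fall back to a plain table when FEATURE_DATASET is not enabled on the account:
    table_df = fs.generate_dataset(
        name="CUSTOMER_TRAINING",
        spine_df=spine_df,
        features=[fv],
        output_type="table",           # returns a Snowpark DataFrame over "<name>_<version>"
    )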
@@ -903,56 +971,59 @@ class FeatureStore:
|
|
903
971
|
Raises:
|
904
972
|
ValueError: if dataset object is not generated from feature store.
|
905
973
|
"""
|
906
|
-
|
907
|
-
|
908
|
-
|
974
|
+
assert ds.selected_version is not None
|
975
|
+
source_meta = ds.selected_version._get_metadata()
|
976
|
+
if (
|
977
|
+
source_meta is None
|
978
|
+
or not isinstance(source_meta.properties, FeatureStoreMetadata)
|
979
|
+
or source_meta.properties.serialized_feature_views is None
|
980
|
+
):
|
981
|
+
raise ValueError(f"Dataset {ds} does not contain valid feature view information.")
|
909
982
|
|
910
|
-
return self._load_serialized_feature_objects(
|
983
|
+
return self._load_serialized_feature_objects(source_meta.properties.serialized_feature_views)
|
911
984
|
|
912
|
-
@dispatch_decorator(
|
913
|
-
def
|
985
|
+
@dispatch_decorator()
|
986
|
+
def _clear(self, dryrun: bool = True) -> None:
|
914
987
|
"""
|
915
|
-
Clear all feature
|
916
|
-
|
988
|
+
Clear all feature views and entities. Note Feature Store schema and metadata will NOT be purged
|
989
|
+
together. Use SQL to delete schema and metadata instead.
|
917
990
|
|
918
|
-
|
919
|
-
|
991
|
+
Args:
|
992
|
+
dryrun: Print a list of objects will be deleted but not actually perform the deletion when true.
|
920
993
|
"""
|
921
|
-
|
922
|
-
|
923
|
-
|
924
|
-
|
925
|
-
|
926
|
-
|
927
|
-
|
928
|
-
|
929
|
-
|
930
|
-
|
931
|
-
|
932
|
-
|
933
|
-
|
934
|
-
|
935
|
-
|
936
|
-
|
937
|
-
|
938
|
-
|
939
|
-
|
940
|
-
|
941
|
-
|
942
|
-
|
943
|
-
|
944
|
-
|
945
|
-
|
946
|
-
|
947
|
-
|
948
|
-
|
949
|
-
|
994
|
+
warnings.warn(
|
995
|
+
"It will clear ALL feature views and entities in this Feature Store. Make sure your role"
|
996
|
+
" has sufficient access to all feature views and entities. Insufficient access to some feature"
|
997
|
+
" views or entities will leave Feature Store in an incomplete state.",
|
998
|
+
stacklevel=2,
|
999
|
+
category=UserWarning,
|
1000
|
+
)
|
1001
|
+
|
1002
|
+
all_fvs_df = self.list_feature_views()
|
1003
|
+
all_entities_df = self.list_entities()
|
1004
|
+
all_fvs_rows = all_fvs_df.collect()
|
1005
|
+
all_entities_rows = all_entities_df.collect()
|
1006
|
+
|
1007
|
+
if dryrun:
|
1008
|
+
logger.info(
|
1009
|
+
"Following feature views and entities will be deleted."
|
1010
|
+
+ " Set 'dryrun=False' to perform the actual deletion."
|
1011
|
+
)
|
1012
|
+
logger.info(f"Total {len(all_fvs_rows)} Feature views to be deleted:")
|
1013
|
+
all_fvs_df.show(n=len(all_fvs_rows))
|
1014
|
+
logger.info(f"\nTotal {len(all_entities_rows)} entities to be deleted:")
|
1015
|
+
all_entities_df.show(n=len(all_entities_rows))
|
1016
|
+
return
|
1017
|
+
|
1018
|
+
for fv_row in all_fvs_rows:
|
1019
|
+
fv = self.get_feature_view(
|
1020
|
+
SqlIdentifier(fv_row["NAME"], case_sensitive=True).identifier(), fv_row["VERSION"]
|
1021
|
+
)
|
1022
|
+
self.delete_feature_view(fv)
|
1023
|
+
|
1024
|
+
for entity_row in all_entities_rows:
|
1025
|
+
self.delete_entity(SqlIdentifier(entity_row["NAME"], case_sensitive=True).identifier())
|
950
1026
|
|
951
|
-
except Exception as e:
|
952
|
-
raise snowml_exceptions.SnowflakeMLException(
|
953
|
-
error_code=error_codes.INTERNAL_SNOWPARK_ERROR,
|
954
|
-
original_exception=RuntimeError(f"Failed to clear feature store {self._config.full_schema_path}: {e}."),
|
955
|
-
) from e
|
956
1027
|
logger.info(f"Feature store {self._config.full_schema_path} has been cleared.")
|
957
1028
|
|
958
1029
|
def _get_feature_view_if_exists(self, name: str, version: str) -> FeatureView:
|
@@ -965,37 +1036,47 @@ class FeatureStore:
|
|
965
1036
|
)
|
966
1037
|
return existing_fv
|
967
1038
|
|
1039
|
+
def _recompose_join_keys(self, join_key: str) -> List[str]:
|
1040
|
+
# ALLOWED_VALUES in TAG will follow format ["key_1,key2,..."]
|
1041
|
+
# since keys are already resolved following the SQL identifier rule on the write path,
|
1042
|
+
# we simply parse the keys back and wrap them with quotes to preserve cases
|
1043
|
+
# Example join_key repr from TAG value: "[key1,key2,key3]"
|
1044
|
+
join_keys = join_key[2:-2].split(",")
|
1045
|
+
res = []
|
1046
|
+
for k in join_keys:
|
1047
|
+
res.append(f'"{k}"')
|
1048
|
+
return res
|
1049
|
+
|
968
1050
|
def _create_dynamic_table(
|
969
1051
|
self,
|
970
1052
|
feature_view_name: SqlIdentifier,
|
971
1053
|
feature_view: FeatureView,
|
972
1054
|
fully_qualified_name: str,
|
973
1055
|
column_descs: str,
|
974
|
-
|
1056
|
+
tagging_clause: str,
|
975
1057
|
schedule_task: bool,
|
976
1058
|
warehouse: SqlIdentifier,
|
977
|
-
timestamp_col: SqlIdentifier,
|
978
1059
|
block: bool,
|
979
1060
|
override: bool,
|
980
1061
|
) -> None:
|
981
1062
|
# TODO: cluster by join keys once DT supports that
|
982
|
-
override_clause = " OR REPLACE" if override else ""
|
983
|
-
query = f"""CREATE{override_clause} DYNAMIC TABLE {fully_qualified_name} ({column_descs})
|
984
|
-
TARGET_LAG = '{'DOWNSTREAM' if schedule_task else feature_view.refresh_freq}'
|
985
|
-
COMMENT = '{feature_view.desc}'
|
986
|
-
TAG (
|
987
|
-
{self._get_fully_qualified_name(_FEATURE_VIEW_ENTITY_TAG)} = '{entities}',
|
988
|
-
{self._get_fully_qualified_name(_FEATURE_VIEW_TS_COL_TAG)} = '{timestamp_col}',
|
989
|
-
{self._get_fully_qualified_name(_FEATURE_STORE_OBJECT_TAG)} =
|
990
|
-
'{_FeatureStoreObjTypes.FEATURE_VIEW.value}'
|
991
|
-
)
|
992
|
-
WAREHOUSE = {warehouse}
|
993
|
-
AS {feature_view.query}
|
994
|
-
"""
|
995
1063
|
try:
|
1064
|
+
override_clause = " OR REPLACE" if override else ""
|
1065
|
+
query = f"""CREATE{override_clause} DYNAMIC TABLE {fully_qualified_name} ({column_descs})
|
1066
|
+
TARGET_LAG = '{'DOWNSTREAM' if schedule_task else feature_view.refresh_freq}'
|
1067
|
+
COMMENT = '{feature_view.desc}'
|
1068
|
+
TAG (
|
1069
|
+
{tagging_clause}
|
1070
|
+
)
|
1071
|
+
WAREHOUSE = {warehouse}
|
1072
|
+
AS {feature_view.query}
|
1073
|
+
"""
|
996
1074
|
self._session.sql(query).collect(block=block, statement_params=self._telemetry_stmp)
|
997
1075
|
|
998
1076
|
if schedule_task:
|
1077
|
+
task_obj_info = _FeatureStoreObjInfo(
|
1078
|
+
_FeatureStoreObjTypes.FEATURE_VIEW_REFRESH_TASK, snowml_version.VERSION
|
1079
|
+
)
|
999
1080
|
try:
|
1000
1081
|
self._session.sql(
|
1001
1082
|
f"""CREATE{override_clause} TASK {fully_qualified_name}
|
@@ -1007,8 +1088,7 @@ class FeatureStore:
|
|
1007
1088
|
self._session.sql(
|
1008
1089
|
f"""
|
1009
1090
|
ALTER TASK {fully_qualified_name}
|
1010
|
-
SET TAG {self._get_fully_qualified_name(_FEATURE_STORE_OBJECT_TAG)}
|
1011
|
-
='{_FeatureStoreObjTypes.FEATURE_VIEW_REFRESH_TASK.value}'
|
1091
|
+
SET TAG {self._get_fully_qualified_name(_FEATURE_STORE_OBJECT_TAG)}='{task_obj_info.to_json()}'
|
1012
1092
|
"""
|
1013
1093
|
).collect(statement_params=self._telemetry_stmp)
|
1014
1094
|
self._session.sql(f"ALTER TASK {fully_qualified_name} RESUME").collect(
|
@@ -1049,57 +1129,6 @@ class FeatureStore:
|
|
1049
1129
|
category=UserWarning,
|
1050
1130
|
)
|
1051
1131
|
|
1052
|
-
def _dump_dataset(
|
1053
|
-
self,
|
1054
|
-
df: DataFrame,
|
1055
|
-
table_name: str,
|
1056
|
-
join_keys: List[SqlIdentifier],
|
1057
|
-
spine_timestamp_col: Optional[SqlIdentifier] = None,
|
1058
|
-
) -> None:
|
1059
|
-
if len(df.queries["queries"]) != 1:
|
1060
|
-
raise snowml_exceptions.SnowflakeMLException(
|
1061
|
-
error_code=error_codes.INVALID_ARGUMENT,
|
1062
|
-
original_exception=ValueError(f"Dataset df must contain only one query. Got: {df.queries['queries']}"),
|
1063
|
-
)
|
1064
|
-
schema = ", ".join([f"{c.name} {type_utils.convert_sp_to_sf_type(c.datatype)}" for c in df.schema.fields])
|
1065
|
-
fully_qualified_name = self._get_fully_qualified_name(table_name)
|
1066
|
-
|
1067
|
-
try:
|
1068
|
-
self._session.sql(
|
1069
|
-
f"""CREATE TABLE IF NOT EXISTS {fully_qualified_name} ({schema})
|
1070
|
-
CLUSTER BY ({', '.join(join_keys)})
|
1071
|
-
TAG ({self._get_fully_qualified_name(_FEATURE_STORE_OBJECT_TAG)} = '')
|
1072
|
-
"""
|
1073
|
-
).collect(block=True, statement_params=self._telemetry_stmp)
|
1074
|
-
except Exception as e:
|
1075
|
-
raise snowml_exceptions.SnowflakeMLException(
|
1076
|
-
error_code=error_codes.INTERNAL_SNOWPARK_ERROR,
|
1077
|
-
original_exception=RuntimeError(f"Failed to create table {fully_qualified_name}: {e}."),
|
1078
|
-
) from e
|
1079
|
-
|
1080
|
-
source_query = df.queries["queries"][0]
|
1081
|
-
|
1082
|
-
if spine_timestamp_col is not None:
|
1083
|
-
join_keys.append(spine_timestamp_col)
|
1084
|
-
|
1085
|
-
_, _, dest_alias, _ = identifier.parse_schema_level_object_identifier(fully_qualified_name)
|
1086
|
-
source_alias = f"{dest_alias}_source"
|
1087
|
-
join_cond = " AND ".join([f"{dest_alias}.{k} = {source_alias}.{k}" for k in join_keys])
|
1088
|
-
update_clause = ", ".join([f"{dest_alias}.{c} = {source_alias}.{c}" for c in df.columns])
|
1089
|
-
insert_clause = ", ".join([f"{source_alias}.{c}" for c in df.columns])
|
1090
|
-
query = f"""
|
1091
|
-
MERGE INTO {fully_qualified_name} USING ({source_query}) {source_alias} ON {join_cond}
|
1092
|
-
WHEN MATCHED THEN UPDATE SET {update_clause}
|
1093
|
-
WHEN NOT MATCHED THEN INSERT ({', '.join(df.columns)}) VALUES ({insert_clause})
|
1094
|
-
"""
|
1095
|
-
try:
|
1096
|
-
self._session.sql(query).collect(block=True, statement_params=self._telemetry_stmp)
|
1097
|
-
except Exception as e:
|
1098
|
-
raise snowml_exceptions.SnowflakeMLException(
|
1099
|
-
error_code=error_codes.INTERNAL_SNOWPARK_ERROR,
|
1100
|
-
original_exception=RuntimeError(f"Failed to create dataset {fully_qualified_name} with merge: {e}."),
|
1101
|
-
) from e
|
1102
|
-
|
1103
1132
|
def _validate_entity_exists(self, name: SqlIdentifier) -> bool:
|
1104
1133
|
full_entity_tag_name = self._get_entity_name(name)
|
1105
1134
|
found_rows = self._find_object("TAGS", full_entity_tag_name)
|
@@ -1112,14 +1141,6 @@ class FeatureStore:
|
|
1112
1141
|
spine_timestamp_col: Optional[SqlIdentifier],
|
1113
1142
|
include_feature_view_timestamp_col: bool,
|
1114
1143
|
) -> Tuple[DataFrame, List[SqlIdentifier]]:
|
1115
|
-
if len(spine_df.queries["queries"]) != 1:
|
1116
|
-
raise snowml_exceptions.SnowflakeMLException(
|
1117
|
-
error_code=error_codes.INVALID_ARGUMENT,
|
1118
|
-
original_exception=ValueError(
|
1119
|
-
f"spine_df must contain only one query. Got: {spine_df.queries['queries']}"
|
1120
|
-
),
|
1121
|
-
)
|
1122
|
-
|
1123
1144
|
for f in features:
|
1124
1145
|
f = f.feature_view_ref if isinstance(f, FeatureViewSlice) else f
|
1125
1146
|
if f.status == FeatureViewStatus.DRAFT:
|
@@ -1141,7 +1162,7 @@ class FeatureStore:
         self._asof_join_enabled = self._is_asof_join_enabled()

         # TODO: leverage Snowpark dataframe for more concise syntax once it supports AsOfJoin
-        query = spine_df.queries["queries"][0]
+        query = spine_df.queries["queries"][-1]
         layer = 0
         for f in features:
             if isinstance(f, FeatureViewSlice):
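Switching from `queries["queries"][0]` (guarded by the removed single-query check) to `queries["queries"][-1]` matters because a Snowpark DataFrame plan can carry prerequisite statements before its final SELECT. A rough illustration of the assumed shape of `DataFrame.queries`, using a plain dict so it runs anywhere:

```python
# Assumed shape of DataFrame.queries for a spine_df backed by a temp table:
# earlier entries are prerequisites, the last entry is the final SELECT.
spine_queries = {
    "queries": [
        'CREATE TEMP TABLE "SPINE_TMP" AS SELECT USER_ID, TS FROM EVENTS',  # prerequisite
        'SELECT * FROM "SPINE_TMP"',                                        # final query
    ],
    "post_actions": ['DROP TABLE IF EXISTS "SPINE_TMP"'],
}

final_select = spine_queries["queries"][-1]    # what the join SQL is built from
prerequisites = spine_queries["queries"][:-1]  # must run before the join query
print(final_select)
print(prerequisites)
```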
@@ -1150,7 +1171,7 @@ class FeatureStore:
             else:
                 cols = f.feature_names

-            join_keys = [k for e in f.entities for k in e.join_keys]
+            join_keys = list({k for e in f.entities for k in e.join_keys})
             join_keys_str = ", ".join(join_keys)
             assert f.version is not None
             join_table_name = f.fully_qualified_name()
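The new `join_keys` expression de-duplicates keys shared across entities; a set comprehension does not preserve order, which is harmless here since the keys are only joined into SQL text. A tiny sketch with hypothetical entities:

```python
# Two entities sharing a join key; the set comprehension keeps one copy of USER_ID.
class Entity:  # hypothetical stand-in with just the attribute used here
    def __init__(self, join_keys):
        self.join_keys = join_keys

entities = [Entity(["USER_ID"]), Entity(["USER_ID", "REGION"])]
join_keys = list({k for e in entities for k in e.join_keys})
print(sorted(join_keys))  # ['REGION', 'USER_ID'] -- set order itself is unspecified
```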
@@ -1199,7 +1220,15 @@ class FeatureStore:
             """
             layer += 1

-        return self._session.sql(query), join_keys
+        # TODO: construct result dataframe with dataframe APIs once ASOF join is supported natively.
+        # The code below manually constructs the result dataframe from private members of the spine
+        # dataframe, which may cause unintended issues. This step is needed because spine_df might
+        # contain prerequisite queries and post actions that must be carried over to the result dataframe.
+        result_df = self._session.sql(query)
+        result_df._plan.queries = spine_df._plan.queries[:-1] + result_df._plan.queries
+        result_df._plan.post_actions = spine_df._plan.post_actions
+
+        return result_df, join_keys

     def _check_database_exists_or_throw(self) -> None:
         resolved_db_name = self._config.database.resolved()
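The plan stitching above prepends every spine query except the final SELECT onto the result plan, and carries the spine's post actions over. Sketched with plain lists (the real code reaches into the private `_plan` of Snowpark DataFrames, as the TODO warns):

```python
# Sketch of the plan stitching: spine prerequisites + result queries, with the
# spine's post actions carried over. Plain lists, no Snowpark involved.
spine_plan_queries = [
    "CREATE TEMP TABLE SPINE_TMP AS SELECT ...",  # prerequisite
    "SELECT * FROM SPINE_TMP",                    # final spine SELECT (subsumed by the join query)
]
result_plan_queries = ["SELECT /* ASOF-style join built from the spine SELECT */ ..."]

stitched_queries = spine_plan_queries[:-1] + result_plan_queries
stitched_post_actions = ["DROP TABLE IF EXISTS SPINE_TMP"]  # from the spine plan

print(stitched_queries)       # prerequisite first, then the join query
print(stitched_post_actions)  # cleanup still runs after the result is consumed
```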
@@ -1227,8 +1256,7 @@ class FeatureStore:
         for tag_name in to_sql_identifiers(
             [
                 _FEATURE_STORE_OBJECT_TAG,
-                _FEATURE_VIEW_ENTITY_TAG,
-                _FEATURE_VIEW_TS_COL_TAG,
+                _FEATURE_VIEW_METADATA_TAG,
             ]
         ):
             tag_result = self._find_object("TAGS", tag_name)
@@ -1340,7 +1368,8 @@ class FeatureStore:
 
         # Part 4: join original spine table with window table
         prefix_f_only_cols = to_sql_identifiers(
-            [f"{temp_prefix}{name.resolved()}" for name in f_only_cols], case_sensitive=True
+            [f"{temp_prefix}{name.resolved()}" for name in f_only_cols],
+            case_sensitive=True,
         )
         last_select = f"""
             SELECT
@@ -1373,7 +1402,10 @@ class FeatureStore:
         return dynamic_table_results + view_results

     def _update_feature_view_status(self, feature_view: FeatureView, operation: str) -> FeatureView:
-        assert operation in ["RESUME", "SUSPEND"], f"Operation: {operation} not supported"
+        assert operation in [
+            "RESUME",
+            "SUSPEND",
+        ], f"Operation: {operation} not supported"
         if feature_view.status == FeatureViewStatus.DRAFT or feature_view.version is None:
             raise snowml_exceptions.SnowflakeMLException(
                 error_code=error_codes.NOT_FOUND,
@@ -1397,17 +1429,76 @@ class FeatureStore:
         logger.info(f"Successfully {operation} FeatureView {feature_view.name}/{feature_view.version}.")
         return self.get_feature_view(feature_view.name, feature_view.version)

-    def _find_feature_views(
+    def _optimized_find_feature_views(
         self, entity_name: SqlIdentifier, feature_view_name: Optional[SqlIdentifier]
-    ) -> List[FeatureView]:
+    ) -> DataFrame:
         if not self._validate_entity_exists(entity_name):
-            return []
+            return self._session.create_dataframe([], schema=_LIST_FEATURE_VIEW_SCHEMA)

+        # TODO: this can be optimized further by directly getting all possible FVs and filtering by tag;
+        # it will be easier to rewrite this code once we can remove the tag_reference path.
         all_fvs = self._get_fv_backend_representations(object_name=None)
         fv_maps = {SqlIdentifier(r["name"], case_sensitive=True): r for r in all_fvs}

         if len(fv_maps.keys()) == 0:
-            return []
+            return self._session.create_dataframe([], schema=_LIST_FEATURE_VIEW_SCHEMA)
+
+        filter_clause = f"WHERE OBJECT_NAME LIKE '{feature_view_name.resolved()}%'" if feature_view_name else ""
+        try:
+            res = self._session.sql(
+                f"""
+                SELECT
+                    OBJECT_NAME
+                FROM TABLE(
+                    {self._config.database}.INFORMATION_SCHEMA.TAG_REFERENCES_INTERNAL(
+                        TAG_NAME => '{self._get_fully_qualified_name(self._get_entity_name(entity_name))}'
+                    )
+                ) {filter_clause}"""
+            ).collect(statement_params=self._telemetry_stmp)
+        except Exception as e:
+            raise snowml_exceptions.SnowflakeMLException(
+                error_code=error_codes.INTERNAL_SNOWPARK_ERROR,
+                original_exception=RuntimeError(f"Failed to find feature views by entity {entity_name}: {e}"),
+            ) from e
+
+        output_values: List[List[Any]] = []
+        for r in res:
+            row = fv_maps[SqlIdentifier(r["OBJECT_NAME"], case_sensitive=True)]
+            self._extract_feature_view_info(row, output_values)
+
+        return self._session.create_dataframe(output_values, schema=_LIST_FEATURE_VIEW_SCHEMA)
+
+    def _extract_feature_view_info(self, row: Row, output_values: List[List[Any]]) -> None:
+        name, version = row["name"].split(_FEATURE_VIEW_NAME_DELIMITER)
+        m = re.match(_DT_OR_VIEW_QUERY_PATTERN, row["text"])
+        if m is None:
+            raise snowml_exceptions.SnowflakeMLException(
+                error_code=error_codes.INTERNAL_SNOWML_ERROR,
+                original_exception=RuntimeError(f"Failed to parse query text for FeatureView {name}/{version}: {row}."),
+            )
+
+        fv_metadata = _FeatureViewMetadata.from_json(m.group("fv_metadata"))
+
+        values: List[Any] = []
+        values.append(name)
+        values.append(version)
+        values.append(row["database_name"])
+        values.append(row["schema_name"])
+        values.append(row["created_on"])
+        values.append(row["owner"])
+        values.append(row["comment"])
+        values.append(fv_metadata.entities)
+        output_values.append(values)
+
+    def _find_feature_views(self, entity_name: SqlIdentifier, feature_view_name: Optional[SqlIdentifier]) -> DataFrame:
+        if not self._validate_entity_exists(entity_name):
+            return self._session.create_dataframe([], schema=_LIST_FEATURE_VIEW_SCHEMA)
+
+        all_fvs = self._get_fv_backend_representations(object_name=None)
+        fv_maps = {SqlIdentifier(r["name"], case_sensitive=True): r for r in all_fvs}
+
+        if len(fv_maps.keys()) == 0:
+            return self._session.create_dataframe([], schema=_LIST_FEATURE_VIEW_SCHEMA)

         # NOTE: querying INFORMATION_SCHEMA for Entity lineage can be expensive depending on how many active
         # FeatureViews there are. If this ever becomes an issue, consider exploring improvements.
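Both lookup paths now return a Snowpark DataFrame assembled from flat rows against `_LIST_FEATURE_VIEW_SCHEMA`, rather than eagerly composed `FeatureView` objects. A minimal sketch of the row-assembly pattern (hypothetical schema and field names; `create_dataframe` itself requires a live session):

```python
from typing import Any, List

# Hypothetical stand-in for _LIST_FEATURE_VIEW_SCHEMA in the diff.
LIST_FEATURE_VIEW_SCHEMA = [
    "NAME", "VERSION", "DATABASE_NAME", "SCHEMA_NAME",
    "CREATED_ON", "OWNER", "DESC", "ENTITIES",
]

def extract_row(name: str, version: str, meta: dict) -> List[Any]:
    # Mirrors _extract_feature_view_info: one flat row per feature view.
    return [name, version, meta["db"], meta["schema"],
            meta["created_on"], meta["owner"], meta["comment"], meta["entities"]]

output_values: List[List[Any]] = [
    extract_row("FV", "V1", {"db": "DB", "schema": "FS", "created_on": "2024-04-01",
                             "owner": "ADMIN", "comment": "", "entities": ["USER"]}),
]
# With a live session this would become:
#   session.create_dataframe(output_values, schema=LIST_FEATURE_VIEW_SCHEMA)
print(dict(zip(LIST_FEATURE_VIEW_SCHEMA, output_values[0])))
```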
@@ -1424,7 +1515,7 @@ class FeatureStore:
                     )
                 )
                 WHERE LEVEL = 'TABLE'
-                AND TAG_NAME = '{_FEATURE_VIEW_ENTITY_TAG}'
+                AND TAG_NAME = '{_FEATURE_VIEW_METADATA_TAG}'
             """
             for fv_name in fv_maps.keys()
         ]
@@ -1436,21 +1527,22 @@ class FeatureStore:
                 original_exception=RuntimeError(f"Failed to retrieve feature views' information: {e}"),
             ) from e

-
-        outputs = []
+        output_values: List[List[Any]] = []
         for r in results:
-            if entity_name == SqlIdentifier(r["TAG_VALUE"], case_sensitive=True):
-                fv_name, _ = r["OBJECT_NAME"].split(_FEATURE_VIEW_NAME_DELIMITER)
-                fv_name = SqlIdentifier(fv_name, case_sensitive=True)
-                obj_name = SqlIdentifier(r["OBJECT_NAME"], case_sensitive=True)
-                if feature_view_name is not None:
-                    if fv_name == feature_view_name:
-                        outputs.append(self._compose_feature_view(fv_maps[obj_name], entities))
+            fv_metadata = _FeatureViewMetadata.from_json(r["TAG_VALUE"])
+            for retrieved_entity in fv_metadata.entities:
+                if entity_name == SqlIdentifier(retrieved_entity, case_sensitive=True):
+                    fv_name, _ = r["OBJECT_NAME"].split(_FEATURE_VIEW_NAME_DELIMITER)
+                    fv_name = SqlIdentifier(fv_name, case_sensitive=True)
+                    obj_name = SqlIdentifier(r["OBJECT_NAME"], case_sensitive=True)
+                    if feature_view_name is not None:
+                        if fv_name == feature_view_name:
+                            self._extract_feature_view_info(fv_maps[obj_name], output_values)
+                        else:
+                            continue
                     else:
-                        continue
-                else:
-                    outputs.append(self._compose_feature_view(fv_maps[obj_name], entities))
-        return outputs
+                        self._extract_feature_view_info(fv_maps[obj_name], output_values)
+        return self._session.create_dataframe(output_values, schema=_LIST_FEATURE_VIEW_SCHEMA)

     def _compose_feature_view(self, row: Row, entity_list: List[Row]) -> FeatureView:
         def find_and_compose_entity(name: str) -> Entity:
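The legacy lineage path now parses each `TAG_VALUE` as JSON metadata and matches the requested entity against every entity recorded on a feature view. A compact sketch of that filter with plain strings standing in for `SqlIdentifier` (assumed tag-value shape):

```python
import json

# One tag row per feature view; TAG_VALUE holds JSON metadata (assumed shape).
results = [
    {"OBJECT_NAME": "FV$V1", "TAG_VALUE": json.dumps({"entities": ["USER", "ITEM"]})},
    {"OBJECT_NAME": "OTHER$V1", "TAG_VALUE": json.dumps({"entities": ["ORDER"]})},
]

entity_name = "USER"  # compared case-sensitively via SqlIdentifier in the real code
matches = [
    r["OBJECT_NAME"]
    for r in results
    if entity_name in json.loads(r["TAG_VALUE"])["entities"]
]
print(matches)  # ['FV$V1']
```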
@@ -1459,7 +1551,7 @@ class FeatureStore:
             if e["NAME"] == name:
                 return Entity(
                     name=SqlIdentifier(e["NAME"], case_sensitive=True).identifier(),
-                    join_keys=e["JOIN_KEYS"].split(_ENTITY_JOIN_KEY_DELIMITER),
+                    join_keys=self._recompose_join_keys(e["JOIN_KEYS"]),
                     desc=e["DESC"],
                 )
         raise RuntimeError(f"Cannot find entity {name} from retrieved entity list: {entity_list}")
@@ -1473,14 +1565,17 @@ class FeatureStore:
                 original_exception=RuntimeError(f"Failed to parse query text for FeatureView {name}/{version}: {row}."),
             )

+        fv_name = FeatureView._get_physical_name(name, version)
+        infer_schema_df = self._session.sql(f"SELECT * FROM {self._get_fully_qualified_name(fv_name)}")
+
         if m.group("obj_type") == "DYNAMIC TABLE":
             query = m.group("query")
             df = self._session.sql(query)
             desc = m.group("comment")
-            entity_names = m.group("entities")
-            entities = [find_and_compose_entity(n) for n in entity_names.split(_FEATURE_VIEW_ENTITY_TAG_DELIMITER)]
-            ts_col = m.group("ts_col")
-            timestamp_col = ts_col if ts_col != _TIMESTAMP_COL_PLACEHOLDER else None
+            fv_metadata = _FeatureViewMetadata.from_json(m.group("fv_metadata"))
+            entities = [find_and_compose_entity(n) for n in fv_metadata.entities]
+            ts_col = fv_metadata.timestamp_col
+            timestamp_col = ts_col if ts_col not in _LEGACY_TIMESTAMP_COL_PLACEHOLDER_VALS else None

             fv = FeatureView._construct_feature_view(
                 name=name,
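FeatureView metadata now travels as a single JSON tag value, and legacy timestamp placeholders are mapped back to `None` on read. A rough round-trip sketch; the field names and the placeholder set are assumptions based on the identifiers in this diff, not the library's actual definitions:

```python
import json
from dataclasses import dataclass
from typing import List, Optional

# Assumed legacy placeholder set; the diff references _LEGACY_TIMESTAMP_COL_PLACEHOLDER_VALS.
LEGACY_TS_PLACEHOLDERS = {"FS_TIMESTAMP_COL_PLACEHOLDER_VAL", "NULL"}

@dataclass
class FeatureViewMetadata:  # simplified stand-in for _FeatureViewMetadata
    entities: List[str]
    timestamp_col: str

    def to_json(self) -> str:
        return json.dumps({"entities": self.entities, "timestamp_col": self.timestamp_col})

    @classmethod
    def from_json(cls, s: str) -> "FeatureViewMetadata":
        d = json.loads(s)
        return cls(entities=d["entities"], timestamp_col=d["timestamp_col"])

meta = FeatureViewMetadata.from_json('{"entities": ["USER"], "timestamp_col": "NULL"}')
ts: Optional[str] = meta.timestamp_col if meta.timestamp_col not in LEGACY_TS_PLACEHOLDERS else None
print(meta.entities, ts)  # ['USER'] None
```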
@@ -1490,9 +1585,7 @@ class FeatureStore:
                 desc=desc,
                 version=version,
                 status=FeatureViewStatus(row["scheduling_state"]),
-                feature_descs=self._fetch_column_descs(
-                    "DYNAMIC TABLE", SqlIdentifier(row["name"], case_sensitive=True)
-                ),
+                feature_descs=self._fetch_column_descs("DYNAMIC TABLE", fv_name),
                 refresh_freq=row["target_lag"],
                 database=self._config.database.identifier(),
                 schema=self._config.schema.identifier(),
@@ -1500,16 +1593,17 @@ class FeatureStore:
                 refresh_mode=row["refresh_mode"],
                 refresh_mode_reason=row["refresh_mode_reason"],
                 owner=row["owner"],
+                infer_schema_df=infer_schema_df,
             )
             return fv
         else:
             query = m.group("query")
             df = self._session.sql(query)
             desc = m.group("comment")
-            entity_names = m.group("entities")
-            entities = [find_and_compose_entity(n) for n in entity_names.split(_FEATURE_VIEW_ENTITY_TAG_DELIMITER)]
-            ts_col = m.group("ts_col")
-            timestamp_col = ts_col if ts_col != _TIMESTAMP_COL_PLACEHOLDER else None
+            fv_metadata = _FeatureViewMetadata.from_json(m.group("fv_metadata"))
+            entities = [find_and_compose_entity(n) for n in fv_metadata.entities]
+            ts_col = fv_metadata.timestamp_col
+            timestamp_col = ts_col if ts_col not in _LEGACY_TIMESTAMP_COL_PLACEHOLDER_VALS else None

             fv = FeatureView._construct_feature_view(
                 name=name,
@@ -1519,7 +1613,7 @@ class FeatureStore:
                 desc=desc,
                 version=version,
                 status=FeatureViewStatus.STATIC,
-                feature_descs=self._fetch_column_descs("VIEW", SqlIdentifier(row["name"], case_sensitive=True)),
+                feature_descs=self._fetch_column_descs("VIEW", fv_name),
                 refresh_freq=None,
                 database=self._config.database.identifier(),
                 schema=self._config.schema.identifier(),
@@ -1527,6 +1621,7 @@ class FeatureStore:
                 refresh_mode=None,
                 refresh_mode_reason=None,
                 owner=row["owner"],
+                infer_schema_df=infer_schema_df,
             )
             return fv
 
@@ -1542,7 +1637,10 @@ class FeatureStore:
         return descs

     def _find_object(
-        self, object_type: str, object_name: Optional[SqlIdentifier], prefix_match: bool = False
+        self,
+        object_type: str,
+        object_name: Optional[SqlIdentifier],
+        prefix_match: bool = False,
     ) -> List[Row]:
         """Try to find an object by given type and name pattern.
 
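`_find_object` keeps its name-pattern matching but now spells out `prefix_match` as its own parameter. A hedged sketch of how a prefix pattern for `SHOW ... LIKE` might be assembled (hypothetical helper, not the library's code):

```python
# Hypothetical sketch: build a SHOW ... LIKE pattern from a resolved identifier.
# SHOW LIKE patterns are SQL LIKE patterns; '%' matches any suffix.
def build_match_name(object_name: str, prefix_match: bool) -> str:
    return f"{object_name}%" if prefix_match else object_name

print(build_match_name("MY_FV", prefix_match=True))   # MY_FV%
print(build_match_name("MY_FV", prefix_match=False))  # MY_FV
```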
@@ -1569,7 +1667,7 @@ class FeatureStore:
         search_space, obj_domain = self._obj_search_spaces[object_type]
         all_rows = []
         fs_tag_objects = []
-        tag_free_object_types = ["TAGS", "SCHEMAS", "WAREHOUSES"]
+        tag_free_object_types = ["TAGS", "SCHEMAS", "WAREHOUSES", "DATASETS"]
         try:
             search_scope = f"IN {search_space}" if search_space is not None else ""
             all_rows = self._session.sql(f"SHOW {object_type} LIKE '{match_name}' {search_scope}").collect(
@@ -1577,25 +1675,41 @@ class FeatureStore:
             )
             # There could be none-FS objects under FS schema, thus filter on objects with FS special tag.
             if object_type not in tag_free_object_types and len(all_rows) > 0:
-                # Note: <object_name> in TAG_REFERENCES(<object_name>) is case insensitive,
-                # use double quotes to make it case-sensitive.
-                queries = [
-                    f"""
-                        SELECT OBJECT_NAME
-                        FROM TABLE(
-                            {self._config.database}.INFORMATION_SCHEMA.TAG_REFERENCES(
-                                '{self._get_fully_qualified_name(SqlIdentifier(row['name'], case_sensitive=True))}',
-                                '{obj_domain}'
-                            )
+                if self._use_optimized_tag_ref:
+                    fs_obj_rows = self._session.sql(
+                        f"""
+                        SELECT
+                            OBJECT_NAME
+                        FROM TABLE(
+                            {self._config.database}.INFORMATION_SCHEMA.TAG_REFERENCES_INTERNAL(
+                                TAG_NAME => '{self._get_fully_qualified_name(_FEATURE_STORE_OBJECT_TAG)}'
+                            )
                         )
-                        WHERE TAG_NAME = '{_FEATURE_STORE_OBJECT_TAG}'
-                        AND TAG_SCHEMA = '{self._config.schema.resolved()}'
-                    """
-                    for row in all_rows
-                ]
-                fs_obj_rows = self._session.sql("\nUNION\n".join(queries)).collect(
-                    statement_params=self._telemetry_stmp
-                )
+                        WHERE DOMAIN='{obj_domain}'
+                        """
+                    ).collect(statement_params=self._telemetry_stmp)
+                else:
+                    # TODO: remove this after tag_ref_internal rollout
+                    # Note: <object_name> in TAG_REFERENCES(<object_name>) is case insensitive,
+                    # use double quotes to make it case-sensitive.
+                    queries = [
+                        f"""
+                        SELECT OBJECT_NAME
+                        FROM TABLE(
+                            {self._config.database}.INFORMATION_SCHEMA.TAG_REFERENCES(
+                                '{self._get_fully_qualified_name(SqlIdentifier(row['name'], case_sensitive=True))}',
+                                '{obj_domain}'
+                            )
+                        )
+                        WHERE TAG_NAME = '{_FEATURE_STORE_OBJECT_TAG}'
+                        AND TAG_SCHEMA = '{self._config.schema.resolved()}'
+                        """
+                        for row in all_rows
+                    ]
+                    fs_obj_rows = self._session.sql("\nUNION\n".join(queries)).collect(
+                        statement_params=self._telemetry_stmp
+                    )
+
             fs_tag_objects = [row["OBJECT_NAME"] for row in fs_obj_rows]
         except Exception as e:
             raise snowml_exceptions.SnowflakeMLException(
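When `TAG_REFERENCES_INTERNAL` is unavailable, the fallback issues one `TAG_REFERENCES` probe per object and stitches them together with `UNION`. The string assembly, sketched standalone with hypothetical names (real execution needs a session):

```python
# Sketch of the fallback path: one TAG_REFERENCES query per object, UNIONed.
object_names = ["FV$V1", "FV$V2"]  # hypothetical names from a SHOW ... result
queries = [
    f"""
    SELECT OBJECT_NAME
    FROM TABLE(DB.INFORMATION_SCHEMA.TAG_REFERENCES('DB.FS."{name}"', 'TABLE'))
    WHERE TAG_NAME = 'FEATURE_STORE_OBJECT'
    """
    for name in object_names
]
sql = "\nUNION\n".join(queries)
print(sql)
```

One query per object keeps each probe case-sensitive (hence the double quotes), at the cost of query size growing linearly with the number of objects, which is what the internal table function avoids.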
@@ -1641,3 +1755,75 @@ class FeatureStore:
             ),
         )
         return cast(DataFrame, df.drop(exclude_columns))
+
+    def _tag_ref_internal_enabled(self) -> bool:
+        try:
+            self._session.sql(
+                f"""
+                SELECT * FROM TABLE(
+                    INFORMATION_SCHEMA.TAG_REFERENCES_INTERNAL(
+                        TAG_NAME => '{_FEATURE_STORE_OBJECT_TAG}'
+                    )
+                ) LIMIT 1;
+                """
+            ).collect()
+            return True
+        except Exception:
+            return False
+
+    def _is_dataset_enabled(self) -> bool:
+        try:
+            self._session.sql(f"SHOW DATASETS IN SCHEMA {self._config.full_schema_path}").collect()
+            return True
+        except SnowparkSQLException as e:
+            if "'DATASETS' does not exist" in e.message:
+                return False
+            raise
+
+    def _check_feature_store_object_versions(self) -> None:
+        versions = self._collapse_object_versions()
+        if len(versions) > 0 and pkg_version.parse(snowml_version.VERSION) < versions[0]:
+            warnings.warn(
+                "The current snowflake-ml-python version is out of date; a package upgrade is recommended "
+                + f"(current={snowml_version.VERSION}, recommended>={str(versions[0])})",
+                stacklevel=2,
+                category=UserWarning,
+            )
+
+    def _collapse_object_versions(self) -> List[pkg_version.Version]:
+        if not self._use_optimized_tag_ref:
+            return []
+
+        query = f"""
+            SELECT
+                TAG_VALUE
+            FROM TABLE(
+                {self._config.database}.INFORMATION_SCHEMA.TAG_REFERENCES_INTERNAL(
+                    TAG_NAME => '{self._get_fully_qualified_name(_FEATURE_STORE_OBJECT_TAG)}'
+                )
+            )
+        """
+        try:
+            res = self._session.sql(query).collect(statement_params=self._telemetry_stmp)
+        except Exception:
+            # Since this is a best-effort warning to upgrade package versions,
+            # failures are treated as benign.
+            return []
+        versions = set()
+        compatibility_breakage_detected = False
+        for r in res:
+            info = _FeatureStoreObjInfo.from_json(r["TAG_VALUE"])
+            if info.type == _FeatureStoreObjTypes.UNKNOWN:
+                compatibility_breakage_detected = True
+            versions.add(pkg_version.parse(info.pkg_version))
+
+        sorted_versions = sorted(versions, reverse=True)
+        if compatibility_breakage_detected:
+            raise snowml_exceptions.SnowflakeMLException(
+                error_code=error_codes.SNOWML_PACKAGE_OUTDATED,
+                original_exception=RuntimeError(
+                    f"The current snowflake-ml-python version {snowml_version.VERSION} is out of date; "
+                    + f"please upgrade to at least {sorted_versions[0]}."
+                ),
+            )
+        return sorted_versions
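The new version check compares the installed package against the highest version recorded on feature-store objects via `packaging.version`. A small self-contained sketch of that comparison and warning:

```python
import warnings
from packaging import version as pkg_version

CURRENT = "1.5.1"  # stand-in for snowml_version.VERSION
# Versions collected from object tags, newest first (as _collapse_object_versions returns).
recorded = sorted({pkg_version.parse("1.5.0"), pkg_version.parse("1.6.0")}, reverse=True)

if recorded and pkg_version.parse(CURRENT) < recorded[0]:
    warnings.warn(
        f"snowflake-ml-python is out of date (current={CURRENT}, recommended>={recorded[0]})",
        category=UserWarning,
        stacklevel=2,
    )
```

Parsing with `packaging.version` rather than comparing strings keeps orderings like `1.10.0 > 1.9.0` correct.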