snowflake-ml-python 1.5.0__py3-none-any.whl → 1.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +6 -0
- snowflake/ml/_internal/lineage/lineage_utils.py +95 -0
- snowflake/ml/_internal/telemetry.py +1 -0
- snowflake/ml/_internal/utils/identifier.py +1 -1
- snowflake/ml/_internal/utils/sql_identifier.py +14 -1
- snowflake/ml/dataset/__init__.py +2 -1
- snowflake/ml/dataset/dataset.py +4 -3
- snowflake/ml/dataset/dataset_reader.py +5 -8
- snowflake/ml/feature_store/__init__.py +6 -0
- snowflake/ml/feature_store/access_manager.py +279 -0
- snowflake/ml/feature_store/feature_store.py +159 -99
- snowflake/ml/feature_store/feature_view.py +18 -8
- snowflake/ml/fileset/embedded_stage_fs.py +15 -12
- snowflake/ml/fileset/snowfs.py +3 -2
- snowflake/ml/fileset/stage_fs.py +25 -7
- snowflake/ml/model/_client/model/model_impl.py +46 -39
- snowflake/ml/model/_client/model/model_version_impl.py +24 -2
- snowflake/ml/model/_client/ops/metadata_ops.py +27 -4
- snowflake/ml/model/_client/ops/model_ops.py +131 -16
- snowflake/ml/model/_client/sql/_base.py +34 -0
- snowflake/ml/model/_client/sql/model.py +32 -39
- snowflake/ml/model/_client/sql/model_version.py +60 -43
- snowflake/ml/model/_client/sql/stage.py +6 -32
- snowflake/ml/model/_client/sql/tag.py +32 -56
- snowflake/ml/model/_model_composer/model_composer.py +2 -2
- snowflake/ml/model/_packager/model_handlers/mlflow.py +2 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +50 -21
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +81 -3
- snowflake/ml/modeling/framework/base.py +4 -3
- snowflake/ml/modeling/pipeline/pipeline.py +27 -7
- snowflake/ml/registry/_manager/model_manager.py +36 -7
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.5.0.dist-info → snowflake_ml_python-1.5.1.dist-info}/METADATA +54 -10
- {snowflake_ml_python-1.5.0.dist-info → snowflake_ml_python-1.5.1.dist-info}/RECORD +37 -35
- snowflake/ml/_internal/lineage/dataset_dataframe.py +0 -44
- {snowflake_ml_python-1.5.0.dist-info → snowflake_ml_python-1.5.1.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.5.0.dist-info → snowflake_ml_python-1.5.1.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.5.0.dist-info → snowflake_ml_python-1.5.1.dist-info}/top_level.txt +0 -0
snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py CHANGED
@@ -4,7 +4,7 @@ import io
 import os
 import posixpath
 import sys
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Set, Tuple, Union
 
 import cloudpickle as cp
 import numpy as np
@@ -154,7 +154,7 @@ def construct_cv_results(
     return multimetric, estimator._format_results(param_grid, n_split, out)
 
 
-def construct_cv_results_new_implementation(
+def construct_cv_results_memory_efficient_version(
     estimator: Union[GridSearchCV, RandomizedSearchCV],
     n_split: int,
     param_grid: List[Dict[str, Any]],
@@ -205,12 +205,35 @@ def construct_cv_results_new_implementation(
     with io.BytesIO(hex_str) as f_reload:
         out = cp.load(f_reload)
         all_out.extend(out)
+
+    # because original SearchCV is ranked by parameter first and cv second,
+    # to make the memory efficient, we implemented by fitting on cv first and parameter second
+    # when retrieving the results back, the ordering should revert back to remain the same result as original SearchCV
+    def generate_the_order_by_parameter_index(all_combination_length: int) -> List[int]:
+        pattern = []
+        for i in range(all_combination_length):
+            if i % parameter_grid_length == 0:
+                pattern.append(i)
+        for i in range(1, parameter_grid_length):
+            for j in range(all_combination_length):
+                if j % parameter_grid_length == i:
+                    pattern.append(j)
+        return pattern
+
+    def rerank_array(original_array: List[Any], pattern: List[int]) -> List[Any]:
+        reranked_array = []
+        for index in pattern:
+            reranked_array.append(original_array[index])
+        return reranked_array
+
+    pattern = generate_the_order_by_parameter_index(len(all_out))
+    reranked_all_out = rerank_array(all_out, pattern)
     first_test_score = all_out[0]["test_scores"]
-    return first_test_score, estimator._format_results(param_grid, n_split, all_out)
+    return first_test_score, estimator._format_results(param_grid, n_split, reranked_all_out)
 
 
 cp.register_pickle_by_value(inspect.getmodule(construct_cv_results))
-cp.register_pickle_by_value(inspect.getmodule(construct_cv_results_new_implementation))
+cp.register_pickle_by_value(inspect.getmodule(construct_cv_results_memory_efficient_version))
 
 
 class DistributedHPOTrainer(SnowparkModelTrainer):
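The reordering above is the heart of the memory-efficient path. Below is a standalone sketch (toy data; `parameter_grid_length` is passed explicitly here, whereas the package closes over it) showing that the generated pattern turns cv-major results back into scikit-learn's parameter-major order:

```python
from itertools import product
from typing import Any, List

def order_by_parameter_index(n_total: int, parameter_grid_length: int) -> List[int]:
    # Same pattern as above: indices of parameter 0 across all folds,
    # then parameter 1 across all folds, and so on.
    pattern = [i for i in range(n_total) if i % parameter_grid_length == 0]
    for p in range(1, parameter_grid_length):
        pattern.extend(j for j in range(n_total) if j % parameter_grid_length == p)
    return pattern

n_cv, n_params = 3, 2
# cv-major order: how the UDTF emits results (fold 0 for every parameter first)
cv_major: List[Any] = [(cv, param) for cv, param in product(range(n_cv), range(n_params))]

pattern = order_by_parameter_index(len(cv_major), n_params)
reranked = [cv_major[i] for i in pattern]

# parameter-major order: how the original SearchCV produced them
assert reranked == [(cv, param) for param, cv in product(range(n_params), range(n_cv))]
```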
@@ -661,7 +684,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
 
         return fit_estimator
 
-    def
+    def fit_search_snowpark_enable_efficient_memory_usage(
         self,
         param_grid: Union[model_selection.ParameterGrid, model_selection.ParameterSampler],
         dataset: DataFrame,
@@ -718,7 +741,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
                 inspect.currentframe(), self.__class__.__name__
             ),
             api_calls=[udtf],
-            custom_tags=dict([("
+            custom_tags=dict([("hpo_memory_efficient", True)]),
         )
 
         # Put locally serialized estimator on stage.
@@ -960,22 +983,26 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
         self.base_estimator = base_estimator
         self.fit_and_score_kwargs = fit_and_score_kwargs
         self.fit_score_params: List[Any] = []
-        self.
-        # Calculate the full index here to avoid duplicate calculation (which consumes a lot of memory)
-        full_index = np.arange(DATA_LENGTH)
-        for i in range(n_splits):
-            self.cached_train_test_indices.extend(
-                [[np.setdiff1d(full_index, self.test_indices[i]), self.test_indices[i]]]
-            )
+        self.cv_indices_set: Set[int] = set()
 
     def process(self, idx: int, params_idx: int, cv_idx: int) -> None:
         self.fit_score_params.extend([[idx, params_idx, cv_idx]])
+        self.cv_indices_set.add(cv_idx)
 
     def end_partition(self) -> Iterator[Tuple[int, str]]:
         from sklearn.base import clone
         from sklearn.model_selection._validation import _fit_and_score
         from sklearn.utils.parallel import Parallel, delayed
 
+        cached_train_test_indices = {}
+        # Calculate the full index here to avoid duplicate calculation (which consumes a lot of memory)
+        full_index = np.arange(DATA_LENGTH)
+        for i in self.cv_indices_set:
+            cached_train_test_indices[i] = [
+                np.setdiff1d(full_index, self.test_indices[i]),
+                self.test_indices[i],
+            ]
+
         parallel = Parallel(n_jobs=_N_JOBS, pre_dispatch=_PRE_DISPATCH)
 
         out = parallel(
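A toy illustration (hypothetical data, not package code) of the change above: train/test index pairs are now derived lazily in `end_partition`, and only for the folds this partition actually saw, instead of being precomputed for every fold in `__init__`:

```python
import numpy as np

DATA_LENGTH = 10
test_indices = [np.array([0, 1, 2]), np.array([3, 4, 5]), np.array([6, 7, 8, 9])]
cv_indices_set = {0, 2}  # folds routed to this partition via process()

full_index = np.arange(DATA_LENGTH)  # computed once, as in the diff
cached_train_test_indices = {
    i: [np.setdiff1d(full_index, test_indices[i]), test_indices[i]]
    for i in cv_indices_set
}
print(cached_train_test_indices[2][0].tolist())  # train indices: [0, 1, 2, 3, 4, 5]
```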
@@ -983,8 +1010,8 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
                     clone(self.base_estimator),
                     self.X,
                     self.y,
-                    train=self.cached_train_test_indices[split_idx][0],
-                    test=self.cached_train_test_indices[split_idx][1],
+                    train=cached_train_test_indices[split_idx][0],
+                    test=cached_train_test_indices[split_idx][1],
                     parameters=self.params_to_evaluate[cand_idx],
                     split_progress=(split_idx, n_splits),
                     candidate_progress=(cand_idx, n_candidates),
@@ -1005,7 +1032,9 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
 
         session.udtf.register(
             SearchCV,
-            output_schema=StructType(
+            output_schema=StructType(
+                [StructField("FIRST_IDX", IntegerType()), StructField("EACH_CV_RESULTS", StringType())]
+            ),
             input_types=[IntegerType(), IntegerType(), IntegerType()],
             name=random_udtf_name,
             packages=required_deps,  # type: ignore[arg-type]
|
@@ -1020,8 +1049,8 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
|
|
1020
1049
|
# param_indices is for the index for each parameter grid;
|
1021
1050
|
# cv_indices is for the index for each cross_validator's fold;
|
1022
1051
|
# param_cv_indices is for the index for the product of (len(param_indices) * len(cv_indices))
|
1023
|
-
|
1024
|
-
*product(range(
|
1052
|
+
cv_indices, param_indices = zip(
|
1053
|
+
*product(range(cross_validator_indices_length), range(parameter_grid_length))
|
1025
1054
|
)
|
1026
1055
|
|
1027
1056
|
indices_info_pandas = pd.DataFrame(
|
@@ -1042,11 +1071,11 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
             ),
         )
 
-        first_test_score, cv_results_ = construct_cv_results_new_implementation(
+        first_test_score, cv_results_ = construct_cv_results_memory_efficient_version(
             estimator,
             n_splits,
             list(param_grid),
-            HP_raw_results.select("
+            HP_raw_results.select("EACH_CV_RESULTS").sort(F.col("FIRST_IDX")).collect(),
             cross_validator_indices_length,
             parameter_grid_length,
         )
@@ -1163,7 +1192,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
             pkg_versions=model_spec.pkgDependencies, session=self.session
         )
         if ENABLE_EFFICIENT_MEMORY_USAGE:
-            return self.
+            return self.fit_search_snowpark_enable_efficient_memory_usage(
                 param_grid=param_grid,
                 dataset=self.dataset,
                 session=self.session,
snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py CHANGED
@@ -45,6 +45,7 @@ cp.register_pickle_by_value(inspect.getmodule(identifier.get_inferred_name))
 cp.register_pickle_by_value(inspect.getmodule(handle_inference_result))
 
 _PROJECT = "ModelDevelopment"
+_ENABLE_ANONYMOUS_SPROC = False
 
 
 class SnowparkModelTrainer:
@@ -251,6 +252,27 @@ class SnowparkModelTrainer:
 
         return fit_wrapper_function
 
+    def _get_fit_wrapper_sproc_anonymous(self, statement_params: Dict[str, str]) -> StoredProcedure:
+        model_spec = ModelSpecificationsBuilder.build(model=self.estimator)
+        fit_sproc_name = random_name_for_temp_object(TempObjectType.PROCEDURE)
+
+        relaxed_dependencies = pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel(
+            pkg_versions=model_spec.pkgDependencies, session=self.session
+        )
+
+        fit_wrapper_sproc = self.session.sproc.register(
+            func=self._build_fit_wrapper_sproc(model_spec=model_spec),
+            is_permanent=False,
+            name=fit_sproc_name,
+            packages=["snowflake-snowpark-python"] + relaxed_dependencies,  # type: ignore[arg-type]
+            replace=True,
+            session=self.session,
+            statement_params=statement_params,
+            anonymous=True,
+        )
+
+        return fit_wrapper_sproc
+
     def _get_fit_wrapper_sproc(self, statement_params: Dict[str, str]) -> StoredProcedure:
         # If the sproc already exists, don't register.
         if not hasattr(self.session, "_FIT_WRAPPER_SPROCS"):
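Per the 1.5.1 release notes this anonymous path is opt-in. A hedged usage sketch (assuming an existing Snowpark session and a feature DataFrame `df`):

```python
# Importing this module flips the _ENABLE_ANONYMOUS_SPROC switch shown above.
import snowflake.ml.modeling.parameters.enable_anonymous_sproc  # noqa: F401

from snowflake.ml.modeling.linear_model import LinearRegression

lr = LinearRegression(input_cols=["X1", "X2"], label_cols=["Y"])
lr.fit(df)  # fit() now runs through a temporary anonymous stored procedure
```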
@@ -510,6 +532,28 @@ class SnowparkModelTrainer:
 
         return fit_transform_wrapper_function
 
+    def _get_fit_predict_wrapper_sproc_anonymous(self, statement_params: Dict[str, str]) -> StoredProcedure:
+        model_spec = ModelSpecificationsBuilder.build(model=self.estimator)
+
+        fit_predict_sproc_name = random_name_for_temp_object(TempObjectType.PROCEDURE)
+
+        relaxed_dependencies = pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel(
+            pkg_versions=model_spec.pkgDependencies, session=self.session
+        )
+
+        fit_predict_wrapper_sproc = self.session.sproc.register(
+            func=self._build_fit_predict_wrapper_sproc(model_spec=model_spec),
+            is_permanent=False,
+            name=fit_predict_sproc_name,
+            packages=["snowflake-snowpark-python"] + relaxed_dependencies,  # type: ignore[arg-type]
+            replace=True,
+            session=self.session,
+            statement_params=statement_params,
+            anonymous=True,
+        )
+
+        return fit_predict_wrapper_sproc
+
     def _get_fit_predict_wrapper_sproc(self, statement_params: Dict[str, str]) -> StoredProcedure:
         # If the sproc already exists, don't register.
         if not hasattr(self.session, "_FIT_WRAPPER_SPROCS"):
@@ -545,6 +589,27 @@ class SnowparkModelTrainer:
 
         return fit_predict_wrapper_sproc
 
+    def _get_fit_transform_wrapper_sproc_anonymous(self, statement_params: Dict[str, str]) -> StoredProcedure:
+        model_spec = ModelSpecificationsBuilder.build(model=self.estimator)
+
+        fit_transform_sproc_name = random_name_for_temp_object(TempObjectType.PROCEDURE)
+
+        relaxed_dependencies = pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel(
+            pkg_versions=model_spec.pkgDependencies, session=self.session
+        )
+
+        fit_transform_wrapper_sproc = self.session.sproc.register(
+            func=self._build_fit_transform_wrapper_sproc(model_spec=model_spec),
+            is_permanent=False,
+            name=fit_transform_sproc_name,
+            packages=["snowflake-snowpark-python"] + relaxed_dependencies,  # type: ignore[arg-type]
+            replace=True,
+            session=self.session,
+            statement_params=statement_params,
+            anonymous=True,
+        )
+        return fit_transform_wrapper_sproc
+
     def _get_fit_transform_wrapper_sproc(self, statement_params: Dict[str, str]) -> StoredProcedure:
         # If the sproc already exists, don't register.
         if not hasattr(self.session, "_FIT_WRAPPER_SPROCS"):
@@ -612,7 +677,10 @@ class SnowparkModelTrainer:
             custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
         )
 
-        fit_wrapper_sproc = self._get_fit_wrapper_sproc(statement_params=statement_params)
+        if _ENABLE_ANONYMOUS_SPROC:
+            fit_wrapper_sproc = self._get_fit_wrapper_sproc_anonymous(statement_params=statement_params)
+        else:
+            fit_wrapper_sproc = self._get_fit_wrapper_sproc(statement_params=statement_params)
 
         try:
             sproc_export_file_name: str = fit_wrapper_sproc(
@@ -680,7 +748,11 @@ class SnowparkModelTrainer:
             custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
         )
 
-        fit_predict_wrapper_sproc = self._get_fit_predict_wrapper_sproc(statement_params=statement_params)
+        if _ENABLE_ANONYMOUS_SPROC:
+            fit_predict_wrapper_sproc = self._get_fit_predict_wrapper_sproc_anonymous(statement_params=statement_params)
+        else:
+            fit_predict_wrapper_sproc = self._get_fit_predict_wrapper_sproc(statement_params=statement_params)
+
         fit_predict_result_name = random_name_for_temp_object(TempObjectType.TABLE)
 
         sproc_export_file_name: str = fit_predict_wrapper_sproc(
@@ -741,7 +813,13 @@ class SnowparkModelTrainer:
             custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
         )
 
-        fit_transform_wrapper_sproc = self._get_fit_transform_wrapper_sproc(statement_params=statement_params)
+        if _ENABLE_ANONYMOUS_SPROC:
+            fit_transform_wrapper_sproc = self._get_fit_transform_wrapper_sproc_anonymous(
+                statement_params=statement_params
+            )
+        else:
+            fit_transform_wrapper_sproc = self._get_fit_transform_wrapper_sproc(statement_params=statement_params)
+
         fit_transform_result_name = random_name_for_temp_object(TempObjectType.TABLE)
 
         sproc_export_file_name: str = fit_transform_wrapper_sproc(
snowflake/ml/modeling/framework/base.py CHANGED
@@ -16,7 +16,7 @@ from snowflake.ml._internal.exceptions import (
     exceptions,
     modeling_error_messages,
 )
-from snowflake.ml._internal.lineage import data_source,
+from snowflake.ml._internal.lineage import data_source, lineage_utils
 from snowflake.ml._internal.utils import identifier, parallelize
 from snowflake.ml.modeling.framework import _utils
 from snowflake.snowpark import functions as F
@@ -430,8 +430,9 @@ class BaseEstimator(Base):
         )
     def fit(self, dataset: Union[snowpark.DataFrame, pd.DataFrame]) -> "BaseEstimator":
         """Runs universal logics for all fit implementations."""
-
-
+        self._data_sources = getattr(dataset, lineage_utils.DATA_SOURCES_ATTR, None)
+        if self._data_sources:
+            assert all(isinstance(ds, data_source.DataSource) for ds in self._data_sources)
         return self._fit(dataset)
 
     @abstractmethod
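In other words, `fit()` now picks up whatever data sources the lineage utilities attached to the incoming DataFrame. A minimal sketch of that hand-off (assuming `df` is a `snowpark.DataFrame` created from a Dataset, so the attribute is present):

```python
from snowflake.ml._internal.lineage import data_source, lineage_utils

sources = getattr(df, lineage_utils.DATA_SOURCES_ATTR, None)
if sources:
    # Each entry records an originating source, mirroring the assert in fit()
    assert all(isinstance(s, data_source.DataSource) for s in sources)
```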
snowflake/ml/modeling/pipeline/pipeline.py CHANGED
@@ -115,7 +115,7 @@ class Pipeline(base.BaseTransformer):
         self._feature_names_in: List[np.ndarray[Any, np.dtype[Any]]] = []
         self._n_features_in: List[int] = []
         self._transformers_to_input_indices: Dict[str, List[int]] = {}
-        self.
+        self._modifies_label_or_sample_weight = True
 
         self._model_signature_dict: Optional[Dict[str, ModelSignature]] = None
 
@@ -126,6 +126,9 @@ class Pipeline(base.BaseTransformer):
         self._deps = list(deps)
         self._sklearn_object = None
         self.label_cols = self._get_label_cols()
+        self._is_convertible_to_sklearn = self._is_convertible_to_sklearn_object()
+
+        self._send_pipeline_configuration_telemetry()
 
     @staticmethod
     def _is_estimator(obj: object) -> bool:
@@ -228,7 +231,7 @@ class Pipeline(base.BaseTransformer):
         return [c for c in columns if c not in target_cols]
 
     def _append_step_feature_consumption_info(self, step_name: str, all_cols: List[str], input_cols: List[str]) -> None:
-        if self.
+        if self._modifies_label_or_sample_weight:
             all_cols = self._get_sanitized_list_of_columns(all_cols)
             self._feature_names_in.append(np.asarray(all_cols, dtype=object))
             self._n_features_in.append(len(all_cols))
@@ -248,7 +251,7 @@ class Pipeline(base.BaseTransformer):
         self, dataset: Union[snowpark.DataFrame, pd.DataFrame]
     ) -> Union[snowpark.DataFrame, pd.DataFrame]:
         self._reset()
-        self.
+        self._modifies_label_or_sample_weight = not self._is_pipeline_modifying_label_or_sample_weight()
         transformed_dataset = dataset
         for name, trans in self._get_transformers():
             self._append_step_feature_consumption_info(
@@ -425,7 +428,7 @@ class Pipeline(base.BaseTransformer):
         )
 
         if self._can_be_trained_in_ml_runtime(dataset):
-            if not self._is_convertible_to_sklearn_object():
+            if not self._is_convertible_to_sklearn:
                 raise ValueError("This pipeline cannot be converted to an sklearn pipeline.")
             self._fit_ml_runtime(dataset)
 
@@ -947,7 +950,7 @@ class Pipeline(base.BaseTransformer):
         if not os.environ.get(IN_ML_RUNTIME_ENV_VAR):
             return False
 
-        return self._is_convertible_to_sklearn_object()
+        return self._is_convertible_to_sklearn
 
     @staticmethod
     def _wrap_transformer_in_column_transformer(
@@ -1003,7 +1006,7 @@ class Pipeline(base.BaseTransformer):
         if not self._is_fitted:
             return self._create_unfitted_sklearn_object()
 
-        if not self.
+        if not self._modifies_label_or_sample_weight:
             raise exceptions.SnowflakeMLException(
                 error_code=error_codes.METHOD_NOT_ALLOWED,
                 original_exception=ValueError(
@@ -1109,7 +1112,24 @@ class Pipeline(base.BaseTransformer):
             else:
                 return self._create_sklearn_object()
         else:
-            if self._is_convertible_to_sklearn_object():
+            if self._is_convertible_to_sklearn:
                 return self._create_unfitted_sklearn_object()
             else:
                 raise ValueError("This pipeline can not be converted to an sklearn pipeline.")
+
+    def _send_pipeline_configuration_telemetry(self) -> None:
+        """Track information about the pipeline setup. Currently, we want to track:
+        - Whether the pipeline is convertible to an sklearn pipeline
+        - Whether the pipeline is being used in the SPCS ml runtime.
+        """
+
+        telemetry_data = {
+            "pipeline_is_convertible_to_sklearn": self._is_convertible_to_sklearn,
+            "in_spcs_ml_runtime": bool(os.environ.get(IN_ML_RUNTIME_ENV_VAR)),
+        }
+        telemetry.send_custom_usage(
+            project=_PROJECT,
+            subproject=_SUBPROJECT,
+            telemetry_type=telemetry.TelemetryField.TYPE_SNOWML_PIPELINE_USAGE.value,
+            data=telemetry_data,
+        )
snowflake/ml/registry/_manager/model_manager.py CHANGED
@@ -48,20 +48,29 @@ class ModelManager:
         options: Optional[model_types.ModelSaveOption] = None,
         statement_params: Optional[Dict[str, Any]] = None,
     ) -> model_version_impl.ModelVersion:
-        model_name_id = sql_identifier.SqlIdentifier(model_name)
+        database_name_id, schema_name_id, model_name_id = sql_identifier.parse_fully_qualified_name(model_name)
 
         if not version_name:
             version_name = self._hrid_generator.generate()[1]
         version_name_id = sql_identifier.SqlIdentifier(version_name)
 
         if self._model_ops.validate_existence(
-
+            database_name=database_name_id,
+            schema_name=schema_name_id,
+            model_name=model_name_id,
+            statement_params=statement_params,
         ) and self._model_ops.validate_existence(
-
+            database_name=database_name_id,
+            schema_name=schema_name_id,
+            model_name=model_name_id,
+            version_name=version_name_id,
+            statement_params=statement_params,
         ):
             raise ValueError(f"Model {model_name} version {version_name} already existed.")
 
         stage_path = self._model_ops.prepare_model_stage_path(
+            database_name=database_name_id,
+            schema_name=schema_name_id,
             statement_params=statement_params,
         )
 
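This parsing is what enables the fully-qualified-name support called out in the 1.5.1 release notes. A hedged usage sketch (assuming an existing `session` and a fitted `model`):

```python
from snowflake.ml.registry import Registry

reg = Registry(session=session)

# Database and schema qualifiers are split out by
# sql_identifier.parse_fully_qualified_name(), as shown above.
mv = reg.log_model(model, model_name="MY_DB.MY_SCHEMA.MY_MODEL", version_name="V1")
m = reg.get_model("MY_DB.MY_SCHEMA.MY_MODEL")
reg.delete_model("MY_DB.MY_SCHEMA.MY_MODEL")
```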
@@ -85,13 +94,19 @@ class ModelManager:
 
         self._model_ops.create_from_stage(
             composed_model=mc,
+            database_name=database_name_id,
+            schema_name=schema_name_id,
             model_name=model_name_id,
             version_name=version_name_id,
             statement_params=statement_params,
         )
 
         mv = model_version_impl.ModelVersion._ref(
-            self._model_ops,
+            model_ops.ModelOperator(
+                self._model_ops._session,
+                database_name=database_name_id or self._database_name,
+                schema_name=schema_name_id or self._schema_name,
+            ),
             model_name=model_name_id,
             version_name=version_name_id,
         )
@@ -102,6 +117,8 @@ class ModelManager:
         if metrics:
             self._model_ops._metadata_ops.save(
                 metadata_ops.ModelVersionMetadataSchema(metrics=metrics),
+                database_name=database_name_id,
+                schema_name=schema_name_id,
                 model_name=model_name_id,
                 version_name=version_name_id,
                 statement_params=statement_params,
@@ -115,13 +132,19 @@ class ModelManager:
         *,
         statement_params: Optional[Dict[str, Any]] = None,
     ) -> model_impl.Model:
-        model_name_id = sql_identifier.SqlIdentifier(model_name)
+        database_name_id, schema_name_id, model_name_id = sql_identifier.parse_fully_qualified_name(model_name)
         if self._model_ops.validate_existence(
+            database_name=database_name_id,
+            schema_name=schema_name_id,
             model_name=model_name_id,
             statement_params=statement_params,
         ):
             return model_impl.Model._ref(
-                self._model_ops,
+                model_ops.ModelOperator(
+                    self._model_ops._session,
+                    database_name=database_name_id or self._database_name,
+                    schema_name=schema_name_id or self._schema_name,
+                ),
                 model_name=model_name_id,
             )
         else:
@@ -133,6 +156,8 @@ class ModelManager:
         statement_params: Optional[Dict[str, Any]] = None,
     ) -> List[model_impl.Model]:
         model_names = self._model_ops.list_models_or_versions(
+            database_name=None,
+            schema_name=None,
             statement_params=statement_params,
         )
         return [
@@ -149,6 +174,8 @@ class ModelManager:
         statement_params: Optional[Dict[str, Any]] = None,
     ) -> pd.DataFrame:
         rows = self._model_ops.show_models_or_versions(
+            database_name=None,
+            schema_name=None,
             statement_params=statement_params,
         )
         return pd.DataFrame([row.as_dict() for row in rows])
@@ -159,9 +186,11 @@ class ModelManager:
         *,
         statement_params: Optional[Dict[str, Any]] = None,
     ) -> None:
-        model_name_id = sql_identifier.SqlIdentifier(model_name)
+        database_name_id, schema_name_id, model_name_id = sql_identifier.parse_fully_qualified_name(model_name)
 
         self._model_ops.delete_model_or_version(
+            database_name=database_name_id,
+            schema_name=schema_name_id,
             model_name=model_name_id,
             statement_params=statement_params,
         )
snowflake/ml/version.py CHANGED
@@ -1 +1 @@
-VERSION="1.5.0"
+VERSION="1.5.1"
{snowflake_ml_python-1.5.0.dist-info → snowflake_ml_python-1.5.1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: snowflake-ml-python
-Version: 1.5.0
+Version: 1.5.1
 Summary: The machine learning client library that is used for interacting with Snowflake to build machine learning solutions.
 Author-email: "Snowflake, Inc" <support@snowflake.com>
 License:
@@ -236,7 +236,6 @@ License-File: LICENSE.txt
 Requires-Dist: absl-py <2,>=0.15
 Requires-Dist: anyio <4,>=3.5.0
 Requires-Dist: cachetools <6,>=3.1.1
-Requires-Dist: catboost <1.3,>=1.2.0
 Requires-Dist: cloudpickle >=2.0.0
 Requires-Dist: fsspec[http] <2024,>=2022.11
 Requires-Dist: importlib-resources <7,>=6.1.1
@@ -256,19 +255,22 @@ Requires-Dist: sqlparse <1,>=0.4
 Requires-Dist: typing-extensions <5,>=4.1.0
 Requires-Dist: xgboost <2,>=1.7.3
 Provides-Extra: all
-Requires-Dist:
+Requires-Dist: catboost <2,>=1.2.0 ; extra == 'all'
+Requires-Dist: lightgbm <5,>=3.3.5 ; extra == 'all'
 Requires-Dist: mlflow <2.4,>=2.1.0 ; extra == 'all'
 Requires-Dist: peft <1,>=0.5.0 ; extra == 'all'
 Requires-Dist: sentence-transformers <3,>=2.2.2 ; extra == 'all'
-Requires-Dist: sentencepiece <
+Requires-Dist: sentencepiece <1,>=0.1.95 ; extra == 'all'
 Requires-Dist: shap ==0.42.1 ; extra == 'all'
 Requires-Dist: tensorflow <3,>=2.10 ; extra == 'all'
 Requires-Dist: tokenizers <1,>=0.10 ; extra == 'all'
 Requires-Dist: torch <3,>=2.0.1 ; extra == 'all'
 Requires-Dist: torchdata <1,>=0.4 ; extra == 'all'
 Requires-Dist: transformers <5,>=4.32.1 ; extra == 'all'
+Provides-Extra: catboost
+Requires-Dist: catboost <2,>=1.2.0 ; extra == 'catboost'
 Provides-Extra: lightgbm
-Requires-Dist: lightgbm <
+Requires-Dist: lightgbm <5,>=3.3.5 ; extra == 'lightgbm'
 Provides-Extra: llm
 Requires-Dist: peft <1,>=0.5.0 ; extra == 'llm'
 Provides-Extra: mlflow
@@ -282,7 +284,7 @@ Requires-Dist: torch <3,>=2.0.1 ; extra == 'torch'
 Requires-Dist: torchdata <1,>=0.4 ; extra == 'torch'
 Provides-Extra: transformers
 Requires-Dist: sentence-transformers <3,>=2.2.2 ; extra == 'transformers'
-Requires-Dist: sentencepiece <
+Requires-Dist: sentencepiece <1,>=0.1.95 ; extra == 'transformers'
 Requires-Dist: tokenizers <1,>=0.10 ; extra == 'transformers'
 Requires-Dist: transformers <5,>=4.32.1 ; extra == 'transformers'
 
@@ -371,6 +373,31 @@ be compatibility issues. Server-side functionality that `snowflake-ml-python` de
 
 # Release History
 
+## 1.5.1
+
+### Bug Fixes
+
+- Dataset: Fix `snowflake.connector.errors.DataError: Query Result did not match expected number of rows` when accessing
+  DatasetVersion properties when a case-insensitive `SHOW VERSIONS IN DATASET` check matches multiple version names.
+- Dataset: Fix a bug in SnowFS bulk file reads when used with DuckDB.
+- Registry: Fix a bug when loading old models.
+- Lineage: Fix Dataset source lineage propagation through `snowpark.DataFrame` transformations.
+
+### Behavior Changes
+
+- Feature Store: Converted `clear()` into a private function; it now deletes only feature views and entities.
+- Feature Store: Use NULL as the default value for the timestamp tag value.
+
+### New Features
+
+- Feature Store: Added a new `snowflake.ml.feature_store.setup_feature_store()` API to assist Feature Store RBAC setup.
+- Feature Store: Added an `output_type` argument to `FeatureStore.generate_dataset()` to allow generating data snapshots
+  as Datasets or Tables.
+- Registry: `log_model`, `get_model`, and `delete_model` now support fully qualified names.
+- Modeling: Support anonymous stored procedures during fit calls so that modeling does not require permissions to
+  create named objects in the schema. Enable it with
+  `import snowflake.ml.modeling.parameters.enable_anonymous_sproc  # noqa: F401`.
+
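A hedged sketch of the new `output_type` switch (assuming an existing `FeatureStore` instance `fs`, a spine DataFrame `spine_df`, and a registered feature view `fv`; only `output_type` is new in 1.5.1):

```python
ds = fs.generate_dataset(
    name="MY_DATASET",
    spine_df=spine_df,
    features=[fv],
    output_type="dataset",  # new in 1.5.1; "table" materializes a plain snapshot table
)
```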
 ## 1.5.0
 
 ### Bug Fixes
@@ -411,12 +438,19 @@ be compatibility issues. Server-side functionality that `snowflake-ml-python` de
 
 #### Feature Store (PrPr)
 
-`FeatureStore.generate_dataset` argument list has been changed to match the new
+- `FeatureStore.generate_dataset` argument list has been changed to match the new
   `snowflake.ml.dataset.Dataset` definition
 
-- `materialized_table` has been removed and replaced with `name` and `version`.
-- `name` moved to first positional argument
-- `save_mode` has been removed as `merge` behavior is no longer supported. The new behavior is always `errorifexists`.
+  - `materialized_table` has been removed and replaced with `name` and `version`.
+  - `name` moved to first positional argument
+  - `save_mode` has been removed as `merge` behavior is no longer supported. The new behavior is always `errorifexists`.
+
+- Change the feature view version type from str to `FeatureViewVersion`. It is a restricted string literal.
+
+- Remove the `as_dataframe` argument from `FeatureStore.list_feature_views()`; it now always returns results as a DataFrame.
+
+- Combine several metadata tags into a new tag: `SNOWML_FEATURE_VIEW_METADATA`. This makes previously created feature views
+  unreadable by the new SDK.
 
 ### New Features
 
|
@@ -432,6 +466,10 @@ be compatibility issues. Server-side functionality that `snowflake-ml-python` de
|
|
432
466
|
and `Dataset.read.to_tf_dataset()` respectively.
|
433
467
|
- Added `fsspec` style file integration using `Dataset.read.files()` and `Dataset.read.filesystem()`
|
434
468
|
|
469
|
+
#### Feature Store
|
470
|
+
|
471
|
+
- use new tag_reference_internal to speed up metadata lookup.
|
472
|
+
|
435
473
|
## 1.4.1 (2024-04-18)
|
436
474
|
|
437
475
|
### New Features
|
@@ -443,6 +481,10 @@ be compatibility issues. Server-side functionality that `snowflake-ml-python` de
 
 - Registry: Fix a bug that caused the `relax_version` option to not work.
 
+### Behavior Changes
+
+- Feature Store: `update_feature_view` takes `refresh_freq` and `warehouse` as arguments.
+
 ## 1.4.0 (2024-04-08)
 
 ### Bug Fixes
## 1.4.0 (2024-04-08)
|
447
489
|
|
448
490
|
### Bug Fixes
|
@@ -464,6 +506,8 @@ be compatibility issues. Server-side functionality that `snowflake-ml-python` de
|
|
464
506
|
|
465
507
|
- Registry: `apply` method is no longer by default logged when logging a xgboost model. If that is required, it could
|
466
508
|
be specified manually when logging the model by `log_model(..., options={"target_methods": ["apply", ...]})`.
|
509
|
+
- Feature Store: register_entity returns an entity object.
|
510
|
+
- Feature Store: register_feature_view `block=true` becomes default.
|
467
511
|
|
468
512
|
### New Features
|
469
513
|
|