snowflake-ml-python 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +11 -1
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/feature_store/feature_store.py +151 -78
- snowflake/ml/feature_store/feature_view.py +12 -24
- snowflake/ml/fileset/sfcfs.py +56 -50
- snowflake/ml/fileset/stage_fs.py +48 -13
- snowflake/ml/model/_client/model/model_version_impl.py +2 -50
- snowflake/ml/model/_client/ops/model_ops.py +78 -29
- snowflake/ml/model/_client/sql/model.py +23 -2
- snowflake/ml/model/_client/sql/model_version.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +19 -54
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +8 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +36 -6
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -2
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +195 -123
- snowflake/ml/modeling/cluster/affinity_propagation.py +195 -123
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +195 -123
- snowflake/ml/modeling/cluster/birch.py +195 -123
- snowflake/ml/modeling/cluster/bisecting_k_means.py +195 -123
- snowflake/ml/modeling/cluster/dbscan.py +195 -123
- snowflake/ml/modeling/cluster/feature_agglomeration.py +195 -123
- snowflake/ml/modeling/cluster/k_means.py +195 -123
- snowflake/ml/modeling/cluster/mean_shift.py +195 -123
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +195 -123
- snowflake/ml/modeling/cluster/optics.py +195 -123
- snowflake/ml/modeling/cluster/spectral_biclustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_clustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_coclustering.py +195 -123
- snowflake/ml/modeling/compose/column_transformer.py +195 -123
- snowflake/ml/modeling/compose/transformed_target_regressor.py +195 -123
- snowflake/ml/modeling/covariance/elliptic_envelope.py +195 -123
- snowflake/ml/modeling/covariance/empirical_covariance.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +195 -123
- snowflake/ml/modeling/covariance/ledoit_wolf.py +195 -123
- snowflake/ml/modeling/covariance/min_cov_det.py +195 -123
- snowflake/ml/modeling/covariance/oas.py +195 -123
- snowflake/ml/modeling/covariance/shrunk_covariance.py +195 -123
- snowflake/ml/modeling/decomposition/dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/factor_analysis.py +195 -123
- snowflake/ml/modeling/decomposition/fast_ica.py +195 -123
- snowflake/ml/modeling/decomposition/incremental_pca.py +195 -123
- snowflake/ml/modeling/decomposition/kernel_pca.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/pca.py +195 -123
- snowflake/ml/modeling/decomposition/sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/truncated_svd.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/isolation_forest.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/stacking_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/voting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/voting_regressor.py +195 -123
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fdr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fpr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fwe.py +195 -123
- snowflake/ml/modeling/feature_selection/select_k_best.py +195 -123
- snowflake/ml/modeling/feature_selection/select_percentile.py +195 -123
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +195 -123
- snowflake/ml/modeling/feature_selection/variance_threshold.py +195 -123
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +9 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +195 -123
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +195 -123
- snowflake/ml/modeling/impute/iterative_imputer.py +195 -123
- snowflake/ml/modeling/impute/knn_imputer.py +195 -123
- snowflake/ml/modeling/impute/missing_indicator.py +195 -123
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/nystroem.py +195 -123
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +195 -123
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ard_regression.py +195 -123
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/gamma_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/huber_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/lars.py +195 -123
- snowflake/ml/modeling/linear_model/lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +195 -123
- snowflake/ml/modeling/linear_model/linear_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/perceptron.py +195 -123
- snowflake/ml/modeling/linear_model/poisson_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ransac_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ridge.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_cv.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +195 -123
- snowflake/ml/modeling/manifold/isomap.py +195 -123
- snowflake/ml/modeling/manifold/mds.py +195 -123
- snowflake/ml/modeling/manifold/spectral_embedding.py +195 -123
- snowflake/ml/modeling/manifold/tsne.py +195 -123
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +195 -123
- snowflake/ml/modeling/mixture/gaussian_mixture.py +195 -123
- snowflake/ml/modeling/model_selection/grid_search_cv.py +42 -18
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +42 -18
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/output_code_classifier.py +195 -123
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/complement_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neighbors/kernel_density.py +195 -123
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_centroid.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +195 -123
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_classifier.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_regressor.py +195 -123
- snowflake/ml/modeling/pipeline/pipeline.py +4 -4
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +195 -123
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +195 -123
- snowflake/ml/modeling/semi_supervised/label_spreading.py +195 -123
- snowflake/ml/modeling/svm/linear_svc.py +195 -123
- snowflake/ml/modeling/svm/linear_svr.py +195 -123
- snowflake/ml/modeling/svm/nu_svc.py +195 -123
- snowflake/ml/modeling/svm/nu_svr.py +195 -123
- snowflake/ml/modeling/svm/svc.py +195 -123
- snowflake/ml/modeling/svm/svr.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_regressor.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +195 -123
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/METADATA +68 -57
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/RECORD +202 -200
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
|
|
33
33
|
BatchInferenceKwargsTypedDict,
|
34
34
|
ScoreKwargsTypedDict
|
35
35
|
)
|
36
|
+
from snowflake.ml.model._signatures import utils as model_signature_utils
|
37
|
+
from snowflake.ml.model.model_signature import (
|
38
|
+
BaseFeatureSpec,
|
39
|
+
DataType,
|
40
|
+
FeatureSpec,
|
41
|
+
ModelSignature,
|
42
|
+
_infer_signature,
|
43
|
+
_rename_signature_with_snowflake_identifiers,
|
44
|
+
)
|
36
45
|
|
37
46
|
from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
|
38
47
|
|
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
|
|
43
52
|
validate_sklearn_args,
|
44
53
|
)
|
45
54
|
|
46
|
-
from snowflake.ml.model.model_signature import (
|
47
|
-
DataType,
|
48
|
-
FeatureSpec,
|
49
|
-
ModelSignature,
|
50
|
-
_infer_signature,
|
51
|
-
_rename_signature_with_snowflake_identifiers,
|
52
|
-
BaseFeatureSpec,
|
53
|
-
)
|
54
|
-
from snowflake.ml.model._signatures import utils as model_signature_utils
|
55
|
-
|
56
55
|
_PROJECT = "ModelDevelopment"
|
57
56
|
# Derive subproject from module name by removing "sklearn"
|
58
57
|
# and converting module name from underscore to CamelCase
|
@@ -303,12 +302,7 @@ class MiniBatchKMeans(BaseTransformer):
|
|
303
302
|
)
|
304
303
|
return selected_cols
|
305
304
|
|
306
|
-
|
307
|
-
project=_PROJECT,
|
308
|
-
subproject=_SUBPROJECT,
|
309
|
-
custom_tags=dict([("autogen", True)]),
|
310
|
-
)
|
311
|
-
def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "MiniBatchKMeans":
|
305
|
+
def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "MiniBatchKMeans":
|
312
306
|
"""Compute the centroids on X by chunking it into mini-batches
|
313
307
|
For more details on this function, see [sklearn.cluster.MiniBatchKMeans.fit]
|
314
308
|
(https://scikit-learn.org/stable/modules/generated/sklearn.cluster.MiniBatchKMeans.html#sklearn.cluster.MiniBatchKMeans.fit)
|
@@ -335,12 +329,14 @@ class MiniBatchKMeans(BaseTransformer):
|
|
335
329
|
|
336
330
|
self._snowpark_cols = dataset.select(self.input_cols).columns
|
337
331
|
|
338
|
-
|
332
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
339
333
|
if SNOWML_SPROC_ENV in os.environ:
|
340
334
|
statement_params = telemetry.get_function_usage_statement_params(
|
341
335
|
project=_PROJECT,
|
342
336
|
subproject=_SUBPROJECT,
|
343
|
-
function_name=telemetry.get_statement_params_full_func_name(
|
337
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
338
|
+
inspect.currentframe(), MiniBatchKMeans.__class__.__name__
|
339
|
+
),
|
344
340
|
api_calls=[Session.call],
|
345
341
|
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
346
342
|
)
|
@@ -361,7 +357,7 @@ class MiniBatchKMeans(BaseTransformer):
|
|
361
357
|
)
|
362
358
|
self._sklearn_object = model_trainer.train()
|
363
359
|
self._is_fitted = True
|
364
|
-
self.
|
360
|
+
self._generate_model_signatures(dataset)
|
365
361
|
return self
|
366
362
|
|
367
363
|
def _batch_inference_validate_snowpark(
|
@@ -437,7 +433,9 @@ class MiniBatchKMeans(BaseTransformer):
|
|
437
433
|
# when it is classifier, infer the datatype from label columns
|
438
434
|
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
439
435
|
# Batch inference takes a single expected output column type. Use the first columns type for now.
|
440
|
-
label_cols_signatures = [
|
436
|
+
label_cols_signatures = [
|
437
|
+
row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
|
438
|
+
]
|
441
439
|
if len(label_cols_signatures) == 0:
|
442
440
|
error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
|
443
441
|
raise exceptions.SnowflakeMLException(
|
@@ -445,25 +443,22 @@ class MiniBatchKMeans(BaseTransformer):
|
|
445
443
|
original_exception=ValueError(error_str),
|
446
444
|
)
|
447
445
|
|
448
|
-
expected_type_inferred = convert_sp_to_sf_type(
|
449
|
-
label_cols_signatures[0].as_snowpark_type()
|
450
|
-
)
|
446
|
+
expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
|
451
447
|
|
452
448
|
self._deps = self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
453
|
-
assert isinstance(
|
449
|
+
assert isinstance(
|
450
|
+
dataset._session, Session
|
451
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
454
452
|
|
455
453
|
transform_kwargs = dict(
|
456
|
-
session
|
457
|
-
dependencies
|
458
|
-
drop_input_cols
|
459
|
-
expected_output_cols_type
|
454
|
+
session=dataset._session,
|
455
|
+
dependencies=self._deps,
|
456
|
+
drop_input_cols=self._drop_input_cols,
|
457
|
+
expected_output_cols_type=expected_type_inferred,
|
460
458
|
)
|
461
459
|
|
462
460
|
elif isinstance(dataset, pd.DataFrame):
|
463
|
-
transform_kwargs = dict(
|
464
|
-
snowpark_input_cols = self._snowpark_cols,
|
465
|
-
drop_input_cols = self._drop_input_cols
|
466
|
-
)
|
461
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
467
462
|
|
468
463
|
transform_handlers = ModelTransformerBuilder.build(
|
469
464
|
dataset=dataset,
|
@@ -505,7 +500,7 @@ class MiniBatchKMeans(BaseTransformer):
|
|
505
500
|
Transformed dataset.
|
506
501
|
"""
|
507
502
|
super()._check_dataset_type(dataset)
|
508
|
-
inference_method="transform"
|
503
|
+
inference_method = "transform"
|
509
504
|
|
510
505
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
511
506
|
# are specific to the type of dataset used.
|
@@ -542,17 +537,14 @@ class MiniBatchKMeans(BaseTransformer):
|
|
542
537
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
543
538
|
|
544
539
|
transform_kwargs = dict(
|
545
|
-
session
|
546
|
-
dependencies
|
547
|
-
drop_input_cols
|
548
|
-
expected_output_cols_type
|
540
|
+
session=dataset._session,
|
541
|
+
dependencies=self._deps,
|
542
|
+
drop_input_cols=self._drop_input_cols,
|
543
|
+
expected_output_cols_type=expected_dtype,
|
549
544
|
)
|
550
545
|
|
551
546
|
elif isinstance(dataset, pd.DataFrame):
|
552
|
-
transform_kwargs = dict(
|
553
|
-
snowpark_input_cols = self._snowpark_cols,
|
554
|
-
drop_input_cols = self._drop_input_cols
|
555
|
-
)
|
547
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
556
548
|
|
557
549
|
transform_handlers = ModelTransformerBuilder.build(
|
558
550
|
dataset=dataset,
|
@@ -571,7 +563,11 @@ class MiniBatchKMeans(BaseTransformer):
|
|
571
563
|
return output_df
|
572
564
|
|
573
565
|
@available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
|
574
|
-
def fit_predict(
|
566
|
+
def fit_predict(
|
567
|
+
self,
|
568
|
+
dataset: Union[DataFrame, pd.DataFrame],
|
569
|
+
output_cols_prefix: str = "fit_predict_",
|
570
|
+
) -> Union[DataFrame, pd.DataFrame]:
|
575
571
|
""" Compute cluster centers and predict cluster index for each sample
|
576
572
|
For more details on this function, see [sklearn.cluster.MiniBatchKMeans.fit_predict]
|
577
573
|
(https://scikit-learn.org/stable/modules/generated/sklearn.cluster.MiniBatchKMeans.html#sklearn.cluster.MiniBatchKMeans.fit_predict)
|
@@ -598,7 +594,9 @@ class MiniBatchKMeans(BaseTransformer):
|
|
598
594
|
)
|
599
595
|
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
600
596
|
drop_input_cols=self._drop_input_cols,
|
601
|
-
expected_output_cols_list=
|
597
|
+
expected_output_cols_list=(
|
598
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
599
|
+
),
|
602
600
|
)
|
603
601
|
self._sklearn_object = fitted_estimator
|
604
602
|
self._is_fitted = True
|
@@ -615,6 +613,62 @@ class MiniBatchKMeans(BaseTransformer):
|
|
615
613
|
assert self._sklearn_object is not None
|
616
614
|
return self._sklearn_object.embedding_
|
617
615
|
|
616
|
+
|
617
|
+
def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
|
618
|
+
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
619
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
620
|
+
"""
|
621
|
+
output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
|
622
|
+
# The following condition is introduced for kneighbors methods, and not used in other methods
|
623
|
+
if output_cols:
|
624
|
+
output_cols = [
|
625
|
+
identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
|
626
|
+
for c in output_cols
|
627
|
+
]
|
628
|
+
elif getattr(self._sklearn_object, "classes_", None) is None:
|
629
|
+
output_cols = [output_cols_prefix]
|
630
|
+
elif self._sklearn_object is not None:
|
631
|
+
classes = self._sklearn_object.classes_
|
632
|
+
if isinstance(classes, numpy.ndarray):
|
633
|
+
output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
|
634
|
+
elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
|
635
|
+
# If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
|
636
|
+
output_cols = []
|
637
|
+
for i, cl in enumerate(classes):
|
638
|
+
# For binary classification, there is only one output column for each class
|
639
|
+
# ndarray as the two classes are complementary.
|
640
|
+
if len(cl) == 2:
|
641
|
+
output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
|
642
|
+
else:
|
643
|
+
output_cols.extend([
|
644
|
+
f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
|
645
|
+
])
|
646
|
+
else:
|
647
|
+
output_cols = []
|
648
|
+
|
649
|
+
# Make sure column names are valid snowflake identifiers.
|
650
|
+
assert output_cols is not None # Make MyPy happy
|
651
|
+
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
652
|
+
|
653
|
+
return rv
|
654
|
+
|
655
|
+
def _align_expected_output_names(
|
656
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
657
|
+
) -> List[str]:
|
658
|
+
# in case the inferred output column names dimension is different
|
659
|
+
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
660
|
+
output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
|
661
|
+
output_df_columns = list(output_df_pd.columns)
|
662
|
+
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
663
|
+
if self.sample_weight_col:
|
664
|
+
output_df_columns_set -= set(self.sample_weight_col)
|
665
|
+
# if the dimension of inferred output column names is correct; use it
|
666
|
+
if len(expected_output_cols_list) == len(output_df_columns_set):
|
667
|
+
return expected_output_cols_list
|
668
|
+
# otherwise, use the sklearn estimator's output
|
669
|
+
else:
|
670
|
+
return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
671
|
+
|
618
672
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
619
673
|
@telemetry.send_api_usage_telemetry(
|
620
674
|
project=_PROJECT,
|
@@ -645,24 +699,28 @@ class MiniBatchKMeans(BaseTransformer):
|
|
645
699
|
# are specific to the type of dataset used.
|
646
700
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
647
701
|
|
702
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
703
|
+
|
648
704
|
if isinstance(dataset, DataFrame):
|
649
705
|
self._deps = self._batch_inference_validate_snowpark(
|
650
706
|
dataset=dataset,
|
651
707
|
inference_method=inference_method,
|
652
708
|
)
|
653
|
-
assert isinstance(
|
709
|
+
assert isinstance(
|
710
|
+
dataset._session, Session
|
711
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
654
712
|
transform_kwargs = dict(
|
655
713
|
session=dataset._session,
|
656
714
|
dependencies=self._deps,
|
657
|
-
drop_input_cols
|
715
|
+
drop_input_cols=self._drop_input_cols,
|
658
716
|
expected_output_cols_type="float",
|
659
717
|
)
|
718
|
+
expected_output_cols = self._align_expected_output_names(
|
719
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
720
|
+
)
|
660
721
|
|
661
722
|
elif isinstance(dataset, pd.DataFrame):
|
662
|
-
transform_kwargs = dict(
|
663
|
-
snowpark_input_cols = self._snowpark_cols,
|
664
|
-
drop_input_cols = self._drop_input_cols
|
665
|
-
)
|
723
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
666
724
|
|
667
725
|
transform_handlers = ModelTransformerBuilder.build(
|
668
726
|
dataset=dataset,
|
@@ -674,7 +732,7 @@ class MiniBatchKMeans(BaseTransformer):
|
|
674
732
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
675
733
|
inference_method=inference_method,
|
676
734
|
input_cols=self.input_cols,
|
677
|
-
expected_output_cols=
|
735
|
+
expected_output_cols=expected_output_cols,
|
678
736
|
**transform_kwargs
|
679
737
|
)
|
680
738
|
return output_df
|
@@ -704,7 +762,8 @@ class MiniBatchKMeans(BaseTransformer):
|
|
704
762
|
Output dataset with log probability of the sample for each class in the model.
|
705
763
|
"""
|
706
764
|
super()._check_dataset_type(dataset)
|
707
|
-
inference_method="predict_log_proba"
|
765
|
+
inference_method = "predict_log_proba"
|
766
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
708
767
|
|
709
768
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
710
769
|
# are specific to the type of dataset used.
|
@@ -715,18 +774,20 @@ class MiniBatchKMeans(BaseTransformer):
|
|
715
774
|
dataset=dataset,
|
716
775
|
inference_method=inference_method,
|
717
776
|
)
|
718
|
-
assert isinstance(
|
777
|
+
assert isinstance(
|
778
|
+
dataset._session, Session
|
779
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
719
780
|
transform_kwargs = dict(
|
720
781
|
session=dataset._session,
|
721
782
|
dependencies=self._deps,
|
722
|
-
drop_input_cols
|
783
|
+
drop_input_cols=self._drop_input_cols,
|
723
784
|
expected_output_cols_type="float",
|
724
785
|
)
|
786
|
+
expected_output_cols = self._align_expected_output_names(
|
787
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
788
|
+
)
|
725
789
|
elif isinstance(dataset, pd.DataFrame):
|
726
|
-
transform_kwargs = dict(
|
727
|
-
snowpark_input_cols = self._snowpark_cols,
|
728
|
-
drop_input_cols = self._drop_input_cols
|
729
|
-
)
|
790
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
730
791
|
|
731
792
|
transform_handlers = ModelTransformerBuilder.build(
|
732
793
|
dataset=dataset,
|
@@ -739,7 +800,7 @@ class MiniBatchKMeans(BaseTransformer):
|
|
739
800
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
740
801
|
inference_method=inference_method,
|
741
802
|
input_cols=self.input_cols,
|
742
|
-
expected_output_cols=
|
803
|
+
expected_output_cols=expected_output_cols,
|
743
804
|
**transform_kwargs
|
744
805
|
)
|
745
806
|
return output_df
|
@@ -765,30 +826,34 @@ class MiniBatchKMeans(BaseTransformer):
|
|
765
826
|
Output dataset with results of the decision function for the samples in input dataset.
|
766
827
|
"""
|
767
828
|
super()._check_dataset_type(dataset)
|
768
|
-
inference_method="decision_function"
|
829
|
+
inference_method = "decision_function"
|
769
830
|
|
770
831
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
771
832
|
# are specific to the type of dataset used.
|
772
833
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
773
834
|
|
835
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
836
|
+
|
774
837
|
if isinstance(dataset, DataFrame):
|
775
838
|
self._deps = self._batch_inference_validate_snowpark(
|
776
839
|
dataset=dataset,
|
777
840
|
inference_method=inference_method,
|
778
841
|
)
|
779
|
-
assert isinstance(
|
842
|
+
assert isinstance(
|
843
|
+
dataset._session, Session
|
844
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
780
845
|
transform_kwargs = dict(
|
781
846
|
session=dataset._session,
|
782
847
|
dependencies=self._deps,
|
783
|
-
drop_input_cols
|
848
|
+
drop_input_cols=self._drop_input_cols,
|
784
849
|
expected_output_cols_type="float",
|
785
850
|
)
|
851
|
+
expected_output_cols = self._align_expected_output_names(
|
852
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
853
|
+
)
|
786
854
|
|
787
855
|
elif isinstance(dataset, pd.DataFrame):
|
788
|
-
transform_kwargs = dict(
|
789
|
-
snowpark_input_cols = self._snowpark_cols,
|
790
|
-
drop_input_cols = self._drop_input_cols
|
791
|
-
)
|
856
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
792
857
|
|
793
858
|
transform_handlers = ModelTransformerBuilder.build(
|
794
859
|
dataset=dataset,
|
@@ -801,7 +866,7 @@ class MiniBatchKMeans(BaseTransformer):
|
|
801
866
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
802
867
|
inference_method=inference_method,
|
803
868
|
input_cols=self.input_cols,
|
804
|
-
expected_output_cols=
|
869
|
+
expected_output_cols=expected_output_cols,
|
805
870
|
**transform_kwargs
|
806
871
|
)
|
807
872
|
return output_df
|
@@ -830,12 +895,14 @@ class MiniBatchKMeans(BaseTransformer):
|
|
830
895
|
Output dataset with probability of the sample for each class in the model.
|
831
896
|
"""
|
832
897
|
super()._check_dataset_type(dataset)
|
833
|
-
inference_method="score_samples"
|
898
|
+
inference_method = "score_samples"
|
834
899
|
|
835
900
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
836
901
|
# are specific to the type of dataset used.
|
837
902
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
838
903
|
|
904
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
905
|
+
|
839
906
|
if isinstance(dataset, DataFrame):
|
840
907
|
self._deps = self._batch_inference_validate_snowpark(
|
841
908
|
dataset=dataset,
|
@@ -848,6 +915,9 @@ class MiniBatchKMeans(BaseTransformer):
|
|
848
915
|
drop_input_cols = self._drop_input_cols,
|
849
916
|
expected_output_cols_type="float",
|
850
917
|
)
|
918
|
+
expected_output_cols = self._align_expected_output_names(
|
919
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
920
|
+
)
|
851
921
|
|
852
922
|
elif isinstance(dataset, pd.DataFrame):
|
853
923
|
transform_kwargs = dict(
|
@@ -866,7 +936,7 @@ class MiniBatchKMeans(BaseTransformer):
|
|
866
936
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
867
937
|
inference_method=inference_method,
|
868
938
|
input_cols=self.input_cols,
|
869
|
-
expected_output_cols=
|
939
|
+
expected_output_cols=expected_output_cols,
|
870
940
|
**transform_kwargs
|
871
941
|
)
|
872
942
|
return output_df
|
@@ -1013,50 +1083,84 @@ class MiniBatchKMeans(BaseTransformer):
|
|
1013
1083
|
)
|
1014
1084
|
return output_df
|
1015
1085
|
|
1086
|
+
|
1087
|
+
|
1088
|
+
def to_sklearn(self) -> Any:
|
1089
|
+
"""Get sklearn.cluster.MiniBatchKMeans object.
|
1090
|
+
"""
|
1091
|
+
if self._sklearn_object is None:
|
1092
|
+
self._sklearn_object = self._create_sklearn_object()
|
1093
|
+
return self._sklearn_object
|
1094
|
+
|
1095
|
+
def to_xgboost(self) -> Any:
|
1096
|
+
raise exceptions.SnowflakeMLException(
|
1097
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1098
|
+
original_exception=AttributeError(
|
1099
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1100
|
+
"to_xgboost()",
|
1101
|
+
"to_sklearn()"
|
1102
|
+
)
|
1103
|
+
),
|
1104
|
+
)
|
1105
|
+
|
1106
|
+
def to_lightgbm(self) -> Any:
|
1107
|
+
raise exceptions.SnowflakeMLException(
|
1108
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1109
|
+
original_exception=AttributeError(
|
1110
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1111
|
+
"to_lightgbm()",
|
1112
|
+
"to_sklearn()"
|
1113
|
+
)
|
1114
|
+
),
|
1115
|
+
)
|
1016
1116
|
|
1017
|
-
def
|
1117
|
+
def _get_dependencies(self) -> List[str]:
|
1118
|
+
return self._deps
|
1119
|
+
|
1120
|
+
|
1121
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
1018
1122
|
self._model_signature_dict = dict()
|
1019
1123
|
|
1020
1124
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
1021
1125
|
|
1022
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1126
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
1023
1127
|
outputs: List[BaseFeatureSpec] = []
|
1024
1128
|
if hasattr(self, "predict"):
|
1025
1129
|
# keep mypy happy
|
1026
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1130
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1027
1131
|
# For classifier, the type of predict is the same as the type of label
|
1028
|
-
if self._sklearn_object._estimator_type ==
|
1029
|
-
|
1132
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1133
|
+
# label columns is the desired type for output
|
1030
1134
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
1031
1135
|
# rename the output columns
|
1032
1136
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
1033
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1034
|
-
|
1035
|
-
|
1137
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1138
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1139
|
+
)
|
1036
1140
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
1037
1141
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
1038
|
-
# Clusterer returns int64 cluster labels.
|
1142
|
+
# Clusterer returns int64 cluster labels.
|
1039
1143
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
1040
1144
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
1041
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1042
|
-
|
1043
|
-
|
1044
|
-
|
1145
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1146
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1147
|
+
)
|
1148
|
+
|
1045
1149
|
# For regressor, the type of predict is float64
|
1046
|
-
elif self._sklearn_object._estimator_type ==
|
1150
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
1047
1151
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
1048
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
1049
|
-
|
1050
|
-
|
1051
|
-
|
1152
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1153
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1154
|
+
)
|
1155
|
+
|
1052
1156
|
for prob_func in PROB_FUNCTIONS:
|
1053
1157
|
if hasattr(self, prob_func):
|
1054
1158
|
output_cols_prefix: str = f"{prob_func}_"
|
1055
1159
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
1056
1160
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
1057
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
1058
|
-
|
1059
|
-
|
1161
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1162
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1163
|
+
)
|
1060
1164
|
|
1061
1165
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
1062
1166
|
items = list(self._model_signature_dict.items())
|
@@ -1069,10 +1173,10 @@ class MiniBatchKMeans(BaseTransformer):
|
|
1069
1173
|
"""Returns model signature of current class.
|
1070
1174
|
|
1071
1175
|
Raises:
|
1072
|
-
|
1176
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
1073
1177
|
|
1074
1178
|
Returns:
|
1075
|
-
Dict
|
1179
|
+
Dict with each method and its input output signature
|
1076
1180
|
"""
|
1077
1181
|
if self._model_signature_dict is None:
|
1078
1182
|
raise exceptions.SnowflakeMLException(
|
@@ -1080,35 +1184,3 @@ class MiniBatchKMeans(BaseTransformer):
|
|
1080
1184
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
1081
1185
|
)
|
1082
1186
|
return self._model_signature_dict
|
1083
|
-
|
1084
|
-
def to_sklearn(self) -> Any:
|
1085
|
-
"""Get sklearn.cluster.MiniBatchKMeans object.
|
1086
|
-
"""
|
1087
|
-
if self._sklearn_object is None:
|
1088
|
-
self._sklearn_object = self._create_sklearn_object()
|
1089
|
-
return self._sklearn_object
|
1090
|
-
|
1091
|
-
def to_xgboost(self) -> Any:
|
1092
|
-
raise exceptions.SnowflakeMLException(
|
1093
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1094
|
-
original_exception=AttributeError(
|
1095
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1096
|
-
"to_xgboost()",
|
1097
|
-
"to_sklearn()"
|
1098
|
-
)
|
1099
|
-
),
|
1100
|
-
)
|
1101
|
-
|
1102
|
-
def to_lightgbm(self) -> Any:
|
1103
|
-
raise exceptions.SnowflakeMLException(
|
1104
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1105
|
-
original_exception=AttributeError(
|
1106
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1107
|
-
"to_lightgbm()",
|
1108
|
-
"to_sklearn()"
|
1109
|
-
)
|
1110
|
-
),
|
1111
|
-
)
|
1112
|
-
|
1113
|
-
def _get_dependencies(self) -> List[str]:
|
1114
|
-
return self._deps
|