snowflake-ml-python 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +11 -1
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/feature_store/feature_store.py +151 -78
- snowflake/ml/feature_store/feature_view.py +12 -24
- snowflake/ml/fileset/sfcfs.py +56 -50
- snowflake/ml/fileset/stage_fs.py +48 -13
- snowflake/ml/model/_client/model/model_version_impl.py +2 -50
- snowflake/ml/model/_client/ops/model_ops.py +78 -29
- snowflake/ml/model/_client/sql/model.py +23 -2
- snowflake/ml/model/_client/sql/model_version.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +19 -54
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +8 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +36 -6
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -2
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +195 -123
- snowflake/ml/modeling/cluster/affinity_propagation.py +195 -123
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +195 -123
- snowflake/ml/modeling/cluster/birch.py +195 -123
- snowflake/ml/modeling/cluster/bisecting_k_means.py +195 -123
- snowflake/ml/modeling/cluster/dbscan.py +195 -123
- snowflake/ml/modeling/cluster/feature_agglomeration.py +195 -123
- snowflake/ml/modeling/cluster/k_means.py +195 -123
- snowflake/ml/modeling/cluster/mean_shift.py +195 -123
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +195 -123
- snowflake/ml/modeling/cluster/optics.py +195 -123
- snowflake/ml/modeling/cluster/spectral_biclustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_clustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_coclustering.py +195 -123
- snowflake/ml/modeling/compose/column_transformer.py +195 -123
- snowflake/ml/modeling/compose/transformed_target_regressor.py +195 -123
- snowflake/ml/modeling/covariance/elliptic_envelope.py +195 -123
- snowflake/ml/modeling/covariance/empirical_covariance.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +195 -123
- snowflake/ml/modeling/covariance/ledoit_wolf.py +195 -123
- snowflake/ml/modeling/covariance/min_cov_det.py +195 -123
- snowflake/ml/modeling/covariance/oas.py +195 -123
- snowflake/ml/modeling/covariance/shrunk_covariance.py +195 -123
- snowflake/ml/modeling/decomposition/dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/factor_analysis.py +195 -123
- snowflake/ml/modeling/decomposition/fast_ica.py +195 -123
- snowflake/ml/modeling/decomposition/incremental_pca.py +195 -123
- snowflake/ml/modeling/decomposition/kernel_pca.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/pca.py +195 -123
- snowflake/ml/modeling/decomposition/sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/truncated_svd.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/isolation_forest.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/stacking_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/voting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/voting_regressor.py +195 -123
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fdr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fpr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fwe.py +195 -123
- snowflake/ml/modeling/feature_selection/select_k_best.py +195 -123
- snowflake/ml/modeling/feature_selection/select_percentile.py +195 -123
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +195 -123
- snowflake/ml/modeling/feature_selection/variance_threshold.py +195 -123
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +9 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +195 -123
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +195 -123
- snowflake/ml/modeling/impute/iterative_imputer.py +195 -123
- snowflake/ml/modeling/impute/knn_imputer.py +195 -123
- snowflake/ml/modeling/impute/missing_indicator.py +195 -123
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/nystroem.py +195 -123
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +195 -123
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ard_regression.py +195 -123
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/gamma_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/huber_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/lars.py +195 -123
- snowflake/ml/modeling/linear_model/lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +195 -123
- snowflake/ml/modeling/linear_model/linear_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/perceptron.py +195 -123
- snowflake/ml/modeling/linear_model/poisson_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ransac_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ridge.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_cv.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +195 -123
- snowflake/ml/modeling/manifold/isomap.py +195 -123
- snowflake/ml/modeling/manifold/mds.py +195 -123
- snowflake/ml/modeling/manifold/spectral_embedding.py +195 -123
- snowflake/ml/modeling/manifold/tsne.py +195 -123
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +195 -123
- snowflake/ml/modeling/mixture/gaussian_mixture.py +195 -123
- snowflake/ml/modeling/model_selection/grid_search_cv.py +42 -18
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +42 -18
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/output_code_classifier.py +195 -123
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/complement_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neighbors/kernel_density.py +195 -123
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_centroid.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +195 -123
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_classifier.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_regressor.py +195 -123
- snowflake/ml/modeling/pipeline/pipeline.py +4 -4
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +195 -123
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +195 -123
- snowflake/ml/modeling/semi_supervised/label_spreading.py +195 -123
- snowflake/ml/modeling/svm/linear_svc.py +195 -123
- snowflake/ml/modeling/svm/linear_svr.py +195 -123
- snowflake/ml/modeling/svm/nu_svc.py +195 -123
- snowflake/ml/modeling/svm/nu_svr.py +195 -123
- snowflake/ml/modeling/svm/svc.py +195 -123
- snowflake/ml/modeling/svm/svr.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_regressor.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +195 -123
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/METADATA +68 -57
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/RECORD +202 -200
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/top_level.txt +0 -0
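
Two of the largest additions, snowflake/ml/model/_packager/model_handlers/catboost.py (+206) and lightgbm.py (+218), indicate that 1.4.1 introduces dedicated packaging handlers for CatBoost and LightGBM models alongside the existing sklearn and xgboost handlers. A minimal sketch of how such a model would reach the Model Registry, assuming an existing Snowpark `session` (the model and version names are placeholders):

```python
# Sketch only: assumes snowflake-ml-python 1.4.1 and an already-created
# snowflake.snowpark.Session named `session`; MY_LGBM_MODEL / V1 are placeholders.
import lightgbm as lgb
import pandas as pd
from sklearn.datasets import make_classification
from snowflake.ml.registry import Registry

X, y = make_classification(n_samples=100, n_features=4, random_state=0)
X = pd.DataFrame(X, columns=[f"F{i}" for i in range(4)])
model = lgb.LGBMClassifier(n_estimators=10).fit(X, y)

reg = Registry(session=session)
mv = reg.log_model(
    model,
    model_name="MY_LGBM_MODEL",
    version_name="V1",
    sample_input_data=X.head(10),  # used to infer the model signature
)
print(mv.show_functions())  # the logged version should expose predict / predict_proba
```

The remainder of the page is the rendered diff for snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py, which looks representative of the autogenerated estimators listed above with identical +195/-123 counts.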
```diff
--- a/snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py
+++ b/snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py
@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
     BatchInferenceKwargsTypedDict,
     ScoreKwargsTypedDict
 )
+from snowflake.ml.model._signatures import utils as model_signature_utils
+from snowflake.ml.model.model_signature import (
+    BaseFeatureSpec,
+    DataType,
+    FeatureSpec,
+    ModelSignature,
+    _infer_signature,
+    _rename_signature_with_snowflake_identifiers,
+)
 
 from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
 
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
     validate_sklearn_args,
 )
 
-from snowflake.ml.model.model_signature import (
-    DataType,
-    FeatureSpec,
-    ModelSignature,
-    _infer_signature,
-    _rename_signature_with_snowflake_identifiers,
-    BaseFeatureSpec,
-)
-from snowflake.ml.model._signatures import utils as model_signature_utils
-
 _PROJECT = "ModelDevelopment"
 # Derive subproject from module name by removing "sklearn"
 # and converting module name from underscore to CamelCase
@@ -339,12 +338,7 @@ class MiniBatchDictionaryLearning(BaseTransformer):
         )
         return selected_cols
 
-    @telemetry.send_api_usage_telemetry(
-        project=_PROJECT,
-        subproject=_SUBPROJECT,
-        custom_tags=dict([("autogen", True)]),
-    )
-    def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "MiniBatchDictionaryLearning":
+    def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "MiniBatchDictionaryLearning":
         """Fit the model from data in X
         For more details on this function, see [sklearn.decomposition.MiniBatchDictionaryLearning.fit]
         (https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.MiniBatchDictionaryLearning.html#sklearn.decomposition.MiniBatchDictionaryLearning.fit)
@@ -371,12 +365,14 @@ class MiniBatchDictionaryLearning(BaseTransformer):
 
         self._snowpark_cols = dataset.select(self.input_cols).columns
 
-
+        # If we are already in a stored procedure, no need to kick off another one.
         if SNOWML_SPROC_ENV in os.environ:
             statement_params = telemetry.get_function_usage_statement_params(
                 project=_PROJECT,
                 subproject=_SUBPROJECT,
-                function_name=telemetry.get_statement_params_full_func_name(inspect.currentframe(), MiniBatchDictionaryLearning.__class__.__name__),
+                function_name=telemetry.get_statement_params_full_func_name(
+                    inspect.currentframe(), MiniBatchDictionaryLearning.__class__.__name__
+                ),
                 api_calls=[Session.call],
                 custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
             )
@@ -397,7 +393,7 @@ class MiniBatchDictionaryLearning(BaseTransformer):
         )
         self._sklearn_object = model_trainer.train()
         self._is_fitted = True
-        self._get_model_signatures(dataset)
+        self._generate_model_signatures(dataset)
         return self
 
     def _batch_inference_validate_snowpark(
@@ -471,7 +467,9 @@ class MiniBatchDictionaryLearning(BaseTransformer):
             # when it is classifier, infer the datatype from label columns
             if expected_type_inferred == "" and 'predict' in self.model_signatures:
                 # Batch inference takes a single expected output column type. Use the first columns type for now.
-                label_cols_signatures = [row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols]
+                label_cols_signatures = [
+                    row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
+                ]
                 if len(label_cols_signatures) == 0:
                     error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
                     raise exceptions.SnowflakeMLException(
@@ -479,25 +477,22 @@ class MiniBatchDictionaryLearning(BaseTransformer):
                         original_exception=ValueError(error_str),
                     )
 
-                expected_type_inferred = convert_sp_to_sf_type(
-                    label_cols_signatures[0].as_snowpark_type()
-                )
+                expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
 
             self._deps = self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
-            assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
+            assert isinstance(
+                dataset._session, Session
+            ) # mypy does not recognize the check in _batch_inference_validate_snowpark()
 
             transform_kwargs = dict(
-                session = dataset._session,
-                dependencies = self._deps,
-                drop_input_cols = self._drop_input_cols,
-                expected_output_cols_type = expected_type_inferred,
+                session=dataset._session,
+                dependencies=self._deps,
+                drop_input_cols=self._drop_input_cols,
+                expected_output_cols_type=expected_type_inferred,
             )
 
         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
 
         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
@@ -539,7 +534,7 @@ class MiniBatchDictionaryLearning(BaseTransformer):
             Transformed dataset.
         """
         super()._check_dataset_type(dataset)
-        inference_method="transform"
+        inference_method = "transform"
 
         # This dictionary contains optional kwargs for batch inference. These kwargs
         # are specific to the type of dataset used.
@@ -576,17 +571,14 @@ class MiniBatchDictionaryLearning(BaseTransformer):
             assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
 
             transform_kwargs = dict(
-                session = dataset._session,
-                dependencies = self._deps,
-                drop_input_cols = self._drop_input_cols,
-                expected_output_cols_type = expected_dtype,
+                session=dataset._session,
+                dependencies=self._deps,
+                drop_input_cols=self._drop_input_cols,
+                expected_output_cols_type=expected_dtype,
             )
 
         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
 
         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
@@ -605,7 +597,11 @@ class MiniBatchDictionaryLearning(BaseTransformer):
         return output_df
 
     @available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
-    def fit_predict(self, dataset: Union[DataFrame, pd.DataFrame], output_cols_prefix: str = "fit_predict_",) -> Union[DataFrame, pd.DataFrame]:
+    def fit_predict(
+        self,
+        dataset: Union[DataFrame, pd.DataFrame],
+        output_cols_prefix: str = "fit_predict_",
+    ) -> Union[DataFrame, pd.DataFrame]:
         """ Method not supported for this class.
 
 
@@ -630,7 +626,9 @@ class MiniBatchDictionaryLearning(BaseTransformer):
         )
         output_result, fitted_estimator = model_trainer.train_fit_predict(
             drop_input_cols=self._drop_input_cols,
-            expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
+            expected_output_cols_list=(
+                self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
+            ),
         )
         self._sklearn_object = fitted_estimator
         self._is_fitted = True
@@ -647,6 +645,62 @@ class MiniBatchDictionaryLearning(BaseTransformer):
         assert self._sklearn_object is not None
         return self._sklearn_object.embedding_
 
+
+    def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
+        """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
+        """
+        output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
+        # The following condition is introduced for kneighbors methods, and not used in other methods
+        if output_cols:
+            output_cols = [
+                identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
+                for c in output_cols
+            ]
+        elif getattr(self._sklearn_object, "classes_", None) is None:
+            output_cols = [output_cols_prefix]
+        elif self._sklearn_object is not None:
+            classes = self._sklearn_object.classes_
+            if isinstance(classes, numpy.ndarray):
+                output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
+            elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
+                # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
+                output_cols = []
+                for i, cl in enumerate(classes):
+                    # For binary classification, there is only one output column for each class
+                    # ndarray as the two classes are complementary.
+                    if len(cl) == 2:
+                        output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
+                    else:
+                        output_cols.extend([
+                            f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
+                        ])
+        else:
+            output_cols = []
+
+        # Make sure column names are valid snowflake identifiers.
+        assert output_cols is not None # Make MyPy happy
+        rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
+
+        return rv
+
+    def _align_expected_output_names(
+        self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
+    ) -> List[str]:
+        # in case the inferred output column names dimension is different
+        # we use one line of snowpark dataframe and put it into sklearn estimator using pandas
+        output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
+        output_df_columns = list(output_df_pd.columns)
+        output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
+        if self.sample_weight_col:
+            output_df_columns_set -= set(self.sample_weight_col)
+        # if the dimension of inferred output column names is correct; use it
+        if len(expected_output_cols_list) == len(output_df_columns_set):
+            return expected_output_cols_list
+        # otherwise, use the sklearn estimator's output
+        else:
+            return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
+
     @available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
```
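The hunk above adds two helpers: `_get_output_column_names`, which derives the output column names for `predict_proba()`, `decision_function()`, and similar methods from the estimator's `classes_` attribute, and `_align_expected_output_names`, which runs the method on a single row through pandas and falls back to the sklearn-produced column names whenever the inferred dimension disagrees. A standalone sketch of the naming rule (hypothetical prefix and class values; the Snowflake identifier resolution steps are omitted):

```python
import numpy

# Mirrors the naming logic of the added _get_output_column_names, simplified:
# no identifier.resolve_identifier / rename_to_valid_snowflake_identifier calls.
def output_column_names(prefix, classes):
    if classes is None:  # non-classifier: a single column named after the prefix
        return [prefix]
    if isinstance(classes, numpy.ndarray):  # plain classifier: one column per class
        return [f"{prefix}{c}" for c in classes.tolist()]
    # multioutput classifier: classes is a list of ndarrays, one per output
    cols = []
    for i, cl in enumerate(classes):
        if len(cl) == 2:  # binary output i: one column, the two classes are complementary
            cols.append(f"{prefix}{i}_{cl[0]}")
        else:
            cols.extend(f"{prefix}{i}_{c}" for c in cl.tolist())
    return cols

print(output_column_names("predict_proba_", numpy.array([0, 1, 2])))
# ['predict_proba_0', 'predict_proba_1', 'predict_proba_2']
print(output_column_names("predict_proba_", [numpy.array([0, 1]), numpy.array([0, 1, 2])]))
# ['predict_proba_0_0', 'predict_proba_1_0', 'predict_proba_1_1', 'predict_proba_1_2']
```

The remaining hunks wire these helpers into each inference method: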
```diff
@@ -677,24 +731,28 @@ class MiniBatchDictionaryLearning(BaseTransformer):
         # are specific to the type of dataset used.
         transform_kwargs: BatchInferenceKwargsTypedDict = dict()
 
+        expected_output_cols = self._get_output_column_names(output_cols_prefix)
+
         if isinstance(dataset, DataFrame):
             self._deps = self._batch_inference_validate_snowpark(
                 dataset=dataset,
                 inference_method=inference_method,
             )
-            assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
+            assert isinstance(
+                dataset._session, Session
+            ) # mypy does not recognize the check in _batch_inference_validate_snowpark()
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                drop_input_cols = self._drop_input_cols,
+                drop_input_cols=self._drop_input_cols,
                 expected_output_cols_type="float",
             )
+            expected_output_cols = self._align_expected_output_names(
+                inference_method, dataset, expected_output_cols, output_cols_prefix
+            )
 
         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
 
         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
@@ -706,7 +764,7 @@ class MiniBatchDictionaryLearning(BaseTransformer):
         output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
             inference_method=inference_method,
             input_cols=self.input_cols,
-            expected_output_cols=self._get_output_column_names(output_cols_prefix),
+            expected_output_cols=expected_output_cols,
             **transform_kwargs
         )
         return output_df
@@ -736,7 +794,8 @@ class MiniBatchDictionaryLearning(BaseTransformer):
             Output dataset with log probability of the sample for each class in the model.
         """
         super()._check_dataset_type(dataset)
-        inference_method="predict_log_proba"
+        inference_method = "predict_log_proba"
+        expected_output_cols = self._get_output_column_names(output_cols_prefix)
 
         # This dictionary contains optional kwargs for batch inference. These kwargs
         # are specific to the type of dataset used.
@@ -747,18 +806,20 @@ class MiniBatchDictionaryLearning(BaseTransformer):
                 dataset=dataset,
                 inference_method=inference_method,
             )
-            assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
+            assert isinstance(
+                dataset._session, Session
+            ) # mypy does not recognize the check in _batch_inference_validate_snowpark()
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                drop_input_cols = self._drop_input_cols,
+                drop_input_cols=self._drop_input_cols,
                 expected_output_cols_type="float",
             )
+            expected_output_cols = self._align_expected_output_names(
+                inference_method, dataset, expected_output_cols, output_cols_prefix
+            )
         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
 
         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
@@ -771,7 +832,7 @@ class MiniBatchDictionaryLearning(BaseTransformer):
         output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
             inference_method=inference_method,
             input_cols=self.input_cols,
-            expected_output_cols=self._get_output_column_names(output_cols_prefix),
+            expected_output_cols=expected_output_cols,
             **transform_kwargs
         )
         return output_df
@@ -797,30 +858,34 @@ class MiniBatchDictionaryLearning(BaseTransformer):
             Output dataset with results of the decision function for the samples in input dataset.
         """
         super()._check_dataset_type(dataset)
-        inference_method="decision_function"
+        inference_method = "decision_function"
 
         # This dictionary contains optional kwargs for batch inference. These kwargs
         # are specific to the type of dataset used.
         transform_kwargs: BatchInferenceKwargsTypedDict = dict()
 
+        expected_output_cols = self._get_output_column_names(output_cols_prefix)
+
         if isinstance(dataset, DataFrame):
             self._deps = self._batch_inference_validate_snowpark(
                 dataset=dataset,
                 inference_method=inference_method,
             )
-            assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
+            assert isinstance(
+                dataset._session, Session
+            ) # mypy does not recognize the check in _batch_inference_validate_snowpark()
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-                drop_input_cols = self._drop_input_cols,
+                drop_input_cols=self._drop_input_cols,
                 expected_output_cols_type="float",
             )
+            expected_output_cols = self._align_expected_output_names(
+                inference_method, dataset, expected_output_cols, output_cols_prefix
+            )
 
         elif isinstance(dataset, pd.DataFrame):
-            transform_kwargs = dict(
-                snowpark_input_cols = self._snowpark_cols,
-                drop_input_cols = self._drop_input_cols
-            )
+            transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
 
         transform_handlers = ModelTransformerBuilder.build(
             dataset=dataset,
@@ -833,7 +898,7 @@ class MiniBatchDictionaryLearning(BaseTransformer):
         output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
             inference_method=inference_method,
             input_cols=self.input_cols,
-            expected_output_cols=self._get_output_column_names(output_cols_prefix),
+            expected_output_cols=expected_output_cols,
             **transform_kwargs
         )
         return output_df
@@ -862,12 +927,14 @@ class MiniBatchDictionaryLearning(BaseTransformer):
             Output dataset with probability of the sample for each class in the model.
         """
         super()._check_dataset_type(dataset)
-        inference_method="score_samples"
+        inference_method = "score_samples"
 
         # This dictionary contains optional kwargs for batch inference. These kwargs
         # are specific to the type of dataset used.
         transform_kwargs: BatchInferenceKwargsTypedDict = dict()
 
+        expected_output_cols = self._get_output_column_names(output_cols_prefix)
+
         if isinstance(dataset, DataFrame):
             self._deps = self._batch_inference_validate_snowpark(
                 dataset=dataset,
@@ -880,6 +947,9 @@ class MiniBatchDictionaryLearning(BaseTransformer):
                 drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
+            expected_output_cols = self._align_expected_output_names(
+                inference_method, dataset, expected_output_cols, output_cols_prefix
+            )
 
         elif isinstance(dataset, pd.DataFrame):
             transform_kwargs = dict(
@@ -898,7 +968,7 @@ class MiniBatchDictionaryLearning(BaseTransformer):
         output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
             inference_method=inference_method,
             input_cols=self.input_cols,
-            expected_output_cols=self._get_output_column_names(output_cols_prefix),
+            expected_output_cols=expected_output_cols,
             **transform_kwargs
         )
         return output_df
@@ -1043,50 +1113,84 @@ class MiniBatchDictionaryLearning(BaseTransformer):
         )
         return output_df
 
+
+
+    def to_sklearn(self) -> Any:
+        """Get sklearn.decomposition.MiniBatchDictionaryLearning object.
+        """
+        if self._sklearn_object is None:
+            self._sklearn_object = self._create_sklearn_object()
+        return self._sklearn_object
+
+    def to_xgboost(self) -> Any:
+        raise exceptions.SnowflakeMLException(
+            error_code=error_codes.METHOD_NOT_ALLOWED,
+            original_exception=AttributeError(
+                modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
+                    "to_xgboost()",
+                    "to_sklearn()"
+                )
+            ),
+        )
+
+    def to_lightgbm(self) -> Any:
+        raise exceptions.SnowflakeMLException(
+            error_code=error_codes.METHOD_NOT_ALLOWED,
+            original_exception=AttributeError(
+                modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
+                    "to_lightgbm()",
+                    "to_sklearn()"
+                )
+            ),
+        )
 
-    def _get_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
+    def _get_dependencies(self) -> List[str]:
+        return self._deps
+
+
+    def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         self._model_signature_dict = dict()
 
         PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
 
-        inputs = list(_infer_signature(dataset[self.input_cols], "input"))
+        inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
         outputs: List[BaseFeatureSpec] = []
         if hasattr(self, "predict"):
             # keep mypy happy
-            assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
+            assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
             # For classifier, the type of predict is the same as the type of label
-            if self._sklearn_object._estimator_type == 'classifier':
-                # label columns is the desired type for output
+            if self._sklearn_object._estimator_type == "classifier":
+                # label columns is the desired type for output
                 outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
                 # rename the output columns
                 outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
-                self._model_signature_dict["predict"] = ModelSignature(inputs,
-                                                    ([] if self._drop_input_cols else inputs)
-                                                    + outputs)
+                self._model_signature_dict["predict"] = ModelSignature(
+                    inputs, ([] if self._drop_input_cols else inputs) + outputs
+                )
             # For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
             # For outlier models, returns -1 for outliers and 1 for inliers.
-            # Clusterer returns int64 cluster labels.
+            # Clusterer returns int64 cluster labels.
             elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
                 outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
-                self._model_signature_dict["predict"] = ModelSignature(inputs,
-                                                    ([] if self._drop_input_cols else inputs)
-                                                    + outputs)
-
+                self._model_signature_dict["predict"] = ModelSignature(
+                    inputs, ([] if self._drop_input_cols else inputs) + outputs
+                )
+
             # For regressor, the type of predict is float64
-            elif self._sklearn_object._estimator_type == 'regressor':
+            elif self._sklearn_object._estimator_type == "regressor":
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-                self._model_signature_dict["predict"] = ModelSignature(inputs,
-                                                    ([] if self._drop_input_cols else inputs)
-                                                    + outputs)
-
+                self._model_signature_dict["predict"] = ModelSignature(
+                    inputs, ([] if self._drop_input_cols else inputs) + outputs
+                )
+
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs,
-                                                    ([] if self._drop_input_cols else inputs)
-                                                    + outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(
+                    inputs, ([] if self._drop_input_cols else inputs) + outputs
+                )
 
         # Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
         items = list(self._model_signature_dict.items())
```
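The `_generate_model_signatures` hunk above also switches input inference to Snowflake identifiers (`use_snowflake_identifiers=True`) and, unless `drop_input_cols` is set, prepends the inputs to every signature's outputs. A hedged sketch of what this yields for a simple classifier (column names and exact identifier quoting are illustrative):

```python
# Sketch only: assumes snowflake-ml-python 1.4.x; fit runs locally on a pandas frame.
import pandas as pd
from snowflake.ml.modeling.linear_model import LogisticRegression

df = pd.DataFrame({"X1": [0.0, 1.0, 2.0, 3.0], "Y": [0, 0, 1, 1]})
clf = LogisticRegression(input_cols=["X1"], label_cols=["Y"], output_cols=["PRED"])
clf.fit(df)

for method, sig in clf.model_signatures.items():
    print(method, "->", [f.name for f in sig.outputs])
# roughly: predict -> ['X1', 'PRED'],
#          predict_proba -> ['X1', 'predict_proba_0', 'predict_proba_1'], ...
```

The docstring cleanup and the relocation of the conversion helpers close out the file: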
```diff
@@ -1099,10 +1203,10 @@ class MiniBatchDictionaryLearning(BaseTransformer):
         """Returns model signature of current class.
 
         Raises:
-
+            SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
 
         Returns:
-            Dict
+            Dict with each method and its input output signature
         """
         if self._model_signature_dict is None:
             raise exceptions.SnowflakeMLException(
@@ -1110,35 +1214,3 @@ class MiniBatchDictionaryLearning(BaseTransformer):
                 original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
             )
         return self._model_signature_dict
-
-    def to_sklearn(self) -> Any:
-        """Get sklearn.decomposition.MiniBatchDictionaryLearning object.
-        """
-        if self._sklearn_object is None:
-            self._sklearn_object = self._create_sklearn_object()
-        return self._sklearn_object
-
-    def to_xgboost(self) -> Any:
-        raise exceptions.SnowflakeMLException(
-            error_code=error_codes.METHOD_NOT_ALLOWED,
-            original_exception=AttributeError(
-                modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
-                    "to_xgboost()",
-                    "to_sklearn()"
-                )
-            ),
-        )
-
-    def to_lightgbm(self) -> Any:
-        raise exceptions.SnowflakeMLException(
-            error_code=error_codes.METHOD_NOT_ALLOWED,
-            original_exception=AttributeError(
-                modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
-                    "to_lightgbm()",
-                    "to_sklearn()"
-                )
-            ),
-        )
-
-    def _get_dependencies(self) -> List[str]:
-        return self._deps
```