snowflake-ml-python 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +11 -1
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/feature_store/feature_store.py +151 -78
- snowflake/ml/feature_store/feature_view.py +12 -24
- snowflake/ml/fileset/sfcfs.py +56 -50
- snowflake/ml/fileset/stage_fs.py +48 -13
- snowflake/ml/model/_client/model/model_version_impl.py +2 -50
- snowflake/ml/model/_client/ops/model_ops.py +78 -29
- snowflake/ml/model/_client/sql/model.py +23 -2
- snowflake/ml/model/_client/sql/model_version.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +19 -54
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +8 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +36 -6
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -2
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +195 -123
- snowflake/ml/modeling/cluster/affinity_propagation.py +195 -123
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +195 -123
- snowflake/ml/modeling/cluster/birch.py +195 -123
- snowflake/ml/modeling/cluster/bisecting_k_means.py +195 -123
- snowflake/ml/modeling/cluster/dbscan.py +195 -123
- snowflake/ml/modeling/cluster/feature_agglomeration.py +195 -123
- snowflake/ml/modeling/cluster/k_means.py +195 -123
- snowflake/ml/modeling/cluster/mean_shift.py +195 -123
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +195 -123
- snowflake/ml/modeling/cluster/optics.py +195 -123
- snowflake/ml/modeling/cluster/spectral_biclustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_clustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_coclustering.py +195 -123
- snowflake/ml/modeling/compose/column_transformer.py +195 -123
- snowflake/ml/modeling/compose/transformed_target_regressor.py +195 -123
- snowflake/ml/modeling/covariance/elliptic_envelope.py +195 -123
- snowflake/ml/modeling/covariance/empirical_covariance.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +195 -123
- snowflake/ml/modeling/covariance/ledoit_wolf.py +195 -123
- snowflake/ml/modeling/covariance/min_cov_det.py +195 -123
- snowflake/ml/modeling/covariance/oas.py +195 -123
- snowflake/ml/modeling/covariance/shrunk_covariance.py +195 -123
- snowflake/ml/modeling/decomposition/dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/factor_analysis.py +195 -123
- snowflake/ml/modeling/decomposition/fast_ica.py +195 -123
- snowflake/ml/modeling/decomposition/incremental_pca.py +195 -123
- snowflake/ml/modeling/decomposition/kernel_pca.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/pca.py +195 -123
- snowflake/ml/modeling/decomposition/sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/truncated_svd.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/isolation_forest.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/stacking_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/voting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/voting_regressor.py +195 -123
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fdr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fpr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fwe.py +195 -123
- snowflake/ml/modeling/feature_selection/select_k_best.py +195 -123
- snowflake/ml/modeling/feature_selection/select_percentile.py +195 -123
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +195 -123
- snowflake/ml/modeling/feature_selection/variance_threshold.py +195 -123
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +9 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +195 -123
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +195 -123
- snowflake/ml/modeling/impute/iterative_imputer.py +195 -123
- snowflake/ml/modeling/impute/knn_imputer.py +195 -123
- snowflake/ml/modeling/impute/missing_indicator.py +195 -123
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/nystroem.py +195 -123
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +195 -123
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ard_regression.py +195 -123
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/gamma_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/huber_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/lars.py +195 -123
- snowflake/ml/modeling/linear_model/lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +195 -123
- snowflake/ml/modeling/linear_model/linear_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/perceptron.py +195 -123
- snowflake/ml/modeling/linear_model/poisson_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ransac_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ridge.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_cv.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +195 -123
- snowflake/ml/modeling/manifold/isomap.py +195 -123
- snowflake/ml/modeling/manifold/mds.py +195 -123
- snowflake/ml/modeling/manifold/spectral_embedding.py +195 -123
- snowflake/ml/modeling/manifold/tsne.py +195 -123
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +195 -123
- snowflake/ml/modeling/mixture/gaussian_mixture.py +195 -123
- snowflake/ml/modeling/model_selection/grid_search_cv.py +42 -18
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +42 -18
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/output_code_classifier.py +195 -123
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/complement_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neighbors/kernel_density.py +195 -123
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_centroid.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +195 -123
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_classifier.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_regressor.py +195 -123
- snowflake/ml/modeling/pipeline/pipeline.py +4 -4
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +195 -123
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +195 -123
- snowflake/ml/modeling/semi_supervised/label_spreading.py +195 -123
- snowflake/ml/modeling/svm/linear_svc.py +195 -123
- snowflake/ml/modeling/svm/linear_svr.py +195 -123
- snowflake/ml/modeling/svm/nu_svc.py +195 -123
- snowflake/ml/modeling/svm/nu_svr.py +195 -123
- snowflake/ml/modeling/svm/svc.py +195 -123
- snowflake/ml/modeling/svm/svr.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_regressor.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +195 -123
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/METADATA +68 -57
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/RECORD +202 -200
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
|
|
33
33
|
BatchInferenceKwargsTypedDict,
|
34
34
|
ScoreKwargsTypedDict
|
35
35
|
)
|
36
|
+
from snowflake.ml.model._signatures import utils as model_signature_utils
|
37
|
+
from snowflake.ml.model.model_signature import (
|
38
|
+
BaseFeatureSpec,
|
39
|
+
DataType,
|
40
|
+
FeatureSpec,
|
41
|
+
ModelSignature,
|
42
|
+
_infer_signature,
|
43
|
+
_rename_signature_with_snowflake_identifiers,
|
44
|
+
)
|
36
45
|
|
37
46
|
from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
|
38
47
|
|
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
|
|
43
52
|
validate_sklearn_args,
|
44
53
|
)
|
45
54
|
|
46
|
-
from snowflake.ml.model.model_signature import (
|
47
|
-
DataType,
|
48
|
-
FeatureSpec,
|
49
|
-
ModelSignature,
|
50
|
-
_infer_signature,
|
51
|
-
_rename_signature_with_snowflake_identifiers,
|
52
|
-
BaseFeatureSpec,
|
53
|
-
)
|
54
|
-
from snowflake.ml.model._signatures import utils as model_signature_utils
|
55
|
-
|
56
55
|
_PROJECT = "ModelDevelopment"
|
57
56
|
# Derive subproject from module name by removing "sklearn"
|
58
57
|
# and converting module name from underscore to CamelCase
|
@@ -240,12 +239,7 @@ class SpectralCoclustering(BaseTransformer):
|
|
240
239
|
)
|
241
240
|
return selected_cols
|
242
241
|
|
243
|
-
|
244
|
-
project=_PROJECT,
|
245
|
-
subproject=_SUBPROJECT,
|
246
|
-
custom_tags=dict([("autogen", True)]),
|
247
|
-
)
|
248
|
-
def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "SpectralCoclustering":
|
242
|
+
def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "SpectralCoclustering":
|
249
243
|
"""Create a biclustering for X
|
250
244
|
For more details on this function, see [sklearn.cluster.SpectralCoclustering.fit]
|
251
245
|
(https://scikit-learn.org/stable/modules/generated/sklearn.cluster.SpectralCoclustering.html#sklearn.cluster.SpectralCoclustering.fit)
|
@@ -272,12 +266,14 @@ class SpectralCoclustering(BaseTransformer):
|
|
272
266
|
|
273
267
|
self._snowpark_cols = dataset.select(self.input_cols).columns
|
274
268
|
|
275
|
-
|
269
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
276
270
|
if SNOWML_SPROC_ENV in os.environ:
|
277
271
|
statement_params = telemetry.get_function_usage_statement_params(
|
278
272
|
project=_PROJECT,
|
279
273
|
subproject=_SUBPROJECT,
|
280
|
-
function_name=telemetry.get_statement_params_full_func_name(
|
274
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
275
|
+
inspect.currentframe(), SpectralCoclustering.__class__.__name__
|
276
|
+
),
|
281
277
|
api_calls=[Session.call],
|
282
278
|
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
283
279
|
)
|
@@ -298,7 +294,7 @@ class SpectralCoclustering(BaseTransformer):
|
|
298
294
|
)
|
299
295
|
self._sklearn_object = model_trainer.train()
|
300
296
|
self._is_fitted = True
|
301
|
-
self.
|
297
|
+
self._generate_model_signatures(dataset)
|
302
298
|
return self
|
303
299
|
|
304
300
|
def _batch_inference_validate_snowpark(
|
@@ -372,7 +368,9 @@ class SpectralCoclustering(BaseTransformer):
|
|
372
368
|
# when it is classifier, infer the datatype from label columns
|
373
369
|
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
374
370
|
# Batch inference takes a single expected output column type. Use the first columns type for now.
|
375
|
-
label_cols_signatures = [
|
371
|
+
label_cols_signatures = [
|
372
|
+
row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
|
373
|
+
]
|
376
374
|
if len(label_cols_signatures) == 0:
|
377
375
|
error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
|
378
376
|
raise exceptions.SnowflakeMLException(
|
@@ -380,25 +378,22 @@ class SpectralCoclustering(BaseTransformer):
|
|
380
378
|
original_exception=ValueError(error_str),
|
381
379
|
)
|
382
380
|
|
383
|
-
expected_type_inferred = convert_sp_to_sf_type(
|
384
|
-
label_cols_signatures[0].as_snowpark_type()
|
385
|
-
)
|
381
|
+
expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
|
386
382
|
|
387
383
|
self._deps = self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
388
|
-
assert isinstance(
|
384
|
+
assert isinstance(
|
385
|
+
dataset._session, Session
|
386
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
389
387
|
|
390
388
|
transform_kwargs = dict(
|
391
|
-
session
|
392
|
-
dependencies
|
393
|
-
drop_input_cols
|
394
|
-
expected_output_cols_type
|
389
|
+
session=dataset._session,
|
390
|
+
dependencies=self._deps,
|
391
|
+
drop_input_cols=self._drop_input_cols,
|
392
|
+
expected_output_cols_type=expected_type_inferred,
|
395
393
|
)
|
396
394
|
|
397
395
|
elif isinstance(dataset, pd.DataFrame):
|
398
|
-
transform_kwargs = dict(
|
399
|
-
snowpark_input_cols = self._snowpark_cols,
|
400
|
-
drop_input_cols = self._drop_input_cols
|
401
|
-
)
|
396
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
402
397
|
|
403
398
|
transform_handlers = ModelTransformerBuilder.build(
|
404
399
|
dataset=dataset,
|
@@ -438,7 +433,7 @@ class SpectralCoclustering(BaseTransformer):
|
|
438
433
|
Transformed dataset.
|
439
434
|
"""
|
440
435
|
super()._check_dataset_type(dataset)
|
441
|
-
inference_method="transform"
|
436
|
+
inference_method = "transform"
|
442
437
|
|
443
438
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
444
439
|
# are specific to the type of dataset used.
|
@@ -475,17 +470,14 @@ class SpectralCoclustering(BaseTransformer):
|
|
475
470
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
476
471
|
|
477
472
|
transform_kwargs = dict(
|
478
|
-
session
|
479
|
-
dependencies
|
480
|
-
drop_input_cols
|
481
|
-
expected_output_cols_type
|
473
|
+
session=dataset._session,
|
474
|
+
dependencies=self._deps,
|
475
|
+
drop_input_cols=self._drop_input_cols,
|
476
|
+
expected_output_cols_type=expected_dtype,
|
482
477
|
)
|
483
478
|
|
484
479
|
elif isinstance(dataset, pd.DataFrame):
|
485
|
-
transform_kwargs = dict(
|
486
|
-
snowpark_input_cols = self._snowpark_cols,
|
487
|
-
drop_input_cols = self._drop_input_cols
|
488
|
-
)
|
480
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
489
481
|
|
490
482
|
transform_handlers = ModelTransformerBuilder.build(
|
491
483
|
dataset=dataset,
|
@@ -504,7 +496,11 @@ class SpectralCoclustering(BaseTransformer):
|
|
504
496
|
return output_df
|
505
497
|
|
506
498
|
@available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
|
507
|
-
def fit_predict(
|
499
|
+
def fit_predict(
|
500
|
+
self,
|
501
|
+
dataset: Union[DataFrame, pd.DataFrame],
|
502
|
+
output_cols_prefix: str = "fit_predict_",
|
503
|
+
) -> Union[DataFrame, pd.DataFrame]:
|
508
504
|
""" Method not supported for this class.
|
509
505
|
|
510
506
|
|
@@ -529,7 +525,9 @@ class SpectralCoclustering(BaseTransformer):
|
|
529
525
|
)
|
530
526
|
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
531
527
|
drop_input_cols=self._drop_input_cols,
|
532
|
-
expected_output_cols_list=
|
528
|
+
expected_output_cols_list=(
|
529
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
530
|
+
),
|
533
531
|
)
|
534
532
|
self._sklearn_object = fitted_estimator
|
535
533
|
self._is_fitted = True
|
@@ -546,6 +544,62 @@ class SpectralCoclustering(BaseTransformer):
|
|
546
544
|
assert self._sklearn_object is not None
|
547
545
|
return self._sklearn_object.embedding_
|
548
546
|
|
547
|
+
|
548
|
+
def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
|
549
|
+
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
550
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
551
|
+
"""
|
552
|
+
output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
|
553
|
+
# The following condition is introduced for kneighbors methods, and not used in other methods
|
554
|
+
if output_cols:
|
555
|
+
output_cols = [
|
556
|
+
identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
|
557
|
+
for c in output_cols
|
558
|
+
]
|
559
|
+
elif getattr(self._sklearn_object, "classes_", None) is None:
|
560
|
+
output_cols = [output_cols_prefix]
|
561
|
+
elif self._sklearn_object is not None:
|
562
|
+
classes = self._sklearn_object.classes_
|
563
|
+
if isinstance(classes, numpy.ndarray):
|
564
|
+
output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
|
565
|
+
elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
|
566
|
+
# If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
|
567
|
+
output_cols = []
|
568
|
+
for i, cl in enumerate(classes):
|
569
|
+
# For binary classification, there is only one output column for each class
|
570
|
+
# ndarray as the two classes are complementary.
|
571
|
+
if len(cl) == 2:
|
572
|
+
output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
|
573
|
+
else:
|
574
|
+
output_cols.extend([
|
575
|
+
f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
|
576
|
+
])
|
577
|
+
else:
|
578
|
+
output_cols = []
|
579
|
+
|
580
|
+
# Make sure column names are valid snowflake identifiers.
|
581
|
+
assert output_cols is not None # Make MyPy happy
|
582
|
+
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
583
|
+
|
584
|
+
return rv
|
585
|
+
|
586
|
+
def _align_expected_output_names(
|
587
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
588
|
+
) -> List[str]:
|
589
|
+
# in case the inferred output column names dimension is different
|
590
|
+
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
591
|
+
output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
|
592
|
+
output_df_columns = list(output_df_pd.columns)
|
593
|
+
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
594
|
+
if self.sample_weight_col:
|
595
|
+
output_df_columns_set -= set(self.sample_weight_col)
|
596
|
+
# if the dimension of inferred output column names is correct; use it
|
597
|
+
if len(expected_output_cols_list) == len(output_df_columns_set):
|
598
|
+
return expected_output_cols_list
|
599
|
+
# otherwise, use the sklearn estimator's output
|
600
|
+
else:
|
601
|
+
return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
602
|
+
|
549
603
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
550
604
|
@telemetry.send_api_usage_telemetry(
|
551
605
|
project=_PROJECT,
|
@@ -576,24 +630,28 @@ class SpectralCoclustering(BaseTransformer):
|
|
576
630
|
# are specific to the type of dataset used.
|
577
631
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
578
632
|
|
633
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
634
|
+
|
579
635
|
if isinstance(dataset, DataFrame):
|
580
636
|
self._deps = self._batch_inference_validate_snowpark(
|
581
637
|
dataset=dataset,
|
582
638
|
inference_method=inference_method,
|
583
639
|
)
|
584
|
-
assert isinstance(
|
640
|
+
assert isinstance(
|
641
|
+
dataset._session, Session
|
642
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
585
643
|
transform_kwargs = dict(
|
586
644
|
session=dataset._session,
|
587
645
|
dependencies=self._deps,
|
588
|
-
drop_input_cols
|
646
|
+
drop_input_cols=self._drop_input_cols,
|
589
647
|
expected_output_cols_type="float",
|
590
648
|
)
|
649
|
+
expected_output_cols = self._align_expected_output_names(
|
650
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
651
|
+
)
|
591
652
|
|
592
653
|
elif isinstance(dataset, pd.DataFrame):
|
593
|
-
transform_kwargs = dict(
|
594
|
-
snowpark_input_cols = self._snowpark_cols,
|
595
|
-
drop_input_cols = self._drop_input_cols
|
596
|
-
)
|
654
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
597
655
|
|
598
656
|
transform_handlers = ModelTransformerBuilder.build(
|
599
657
|
dataset=dataset,
|
@@ -605,7 +663,7 @@ class SpectralCoclustering(BaseTransformer):
|
|
605
663
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
606
664
|
inference_method=inference_method,
|
607
665
|
input_cols=self.input_cols,
|
608
|
-
expected_output_cols=
|
666
|
+
expected_output_cols=expected_output_cols,
|
609
667
|
**transform_kwargs
|
610
668
|
)
|
611
669
|
return output_df
|
@@ -635,7 +693,8 @@ class SpectralCoclustering(BaseTransformer):
|
|
635
693
|
Output dataset with log probability of the sample for each class in the model.
|
636
694
|
"""
|
637
695
|
super()._check_dataset_type(dataset)
|
638
|
-
inference_method="predict_log_proba"
|
696
|
+
inference_method = "predict_log_proba"
|
697
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
639
698
|
|
640
699
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
641
700
|
# are specific to the type of dataset used.
|
@@ -646,18 +705,20 @@ class SpectralCoclustering(BaseTransformer):
|
|
646
705
|
dataset=dataset,
|
647
706
|
inference_method=inference_method,
|
648
707
|
)
|
649
|
-
assert isinstance(
|
708
|
+
assert isinstance(
|
709
|
+
dataset._session, Session
|
710
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
650
711
|
transform_kwargs = dict(
|
651
712
|
session=dataset._session,
|
652
713
|
dependencies=self._deps,
|
653
|
-
drop_input_cols
|
714
|
+
drop_input_cols=self._drop_input_cols,
|
654
715
|
expected_output_cols_type="float",
|
655
716
|
)
|
717
|
+
expected_output_cols = self._align_expected_output_names(
|
718
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
719
|
+
)
|
656
720
|
elif isinstance(dataset, pd.DataFrame):
|
657
|
-
transform_kwargs = dict(
|
658
|
-
snowpark_input_cols = self._snowpark_cols,
|
659
|
-
drop_input_cols = self._drop_input_cols
|
660
|
-
)
|
721
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
661
722
|
|
662
723
|
transform_handlers = ModelTransformerBuilder.build(
|
663
724
|
dataset=dataset,
|
@@ -670,7 +731,7 @@ class SpectralCoclustering(BaseTransformer):
|
|
670
731
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
671
732
|
inference_method=inference_method,
|
672
733
|
input_cols=self.input_cols,
|
673
|
-
expected_output_cols=
|
734
|
+
expected_output_cols=expected_output_cols,
|
674
735
|
**transform_kwargs
|
675
736
|
)
|
676
737
|
return output_df
|
@@ -696,30 +757,34 @@ class SpectralCoclustering(BaseTransformer):
|
|
696
757
|
Output dataset with results of the decision function for the samples in input dataset.
|
697
758
|
"""
|
698
759
|
super()._check_dataset_type(dataset)
|
699
|
-
inference_method="decision_function"
|
760
|
+
inference_method = "decision_function"
|
700
761
|
|
701
762
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
702
763
|
# are specific to the type of dataset used.
|
703
764
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
704
765
|
|
766
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
767
|
+
|
705
768
|
if isinstance(dataset, DataFrame):
|
706
769
|
self._deps = self._batch_inference_validate_snowpark(
|
707
770
|
dataset=dataset,
|
708
771
|
inference_method=inference_method,
|
709
772
|
)
|
710
|
-
assert isinstance(
|
773
|
+
assert isinstance(
|
774
|
+
dataset._session, Session
|
775
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
711
776
|
transform_kwargs = dict(
|
712
777
|
session=dataset._session,
|
713
778
|
dependencies=self._deps,
|
714
|
-
drop_input_cols
|
779
|
+
drop_input_cols=self._drop_input_cols,
|
715
780
|
expected_output_cols_type="float",
|
716
781
|
)
|
782
|
+
expected_output_cols = self._align_expected_output_names(
|
783
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
784
|
+
)
|
717
785
|
|
718
786
|
elif isinstance(dataset, pd.DataFrame):
|
719
|
-
transform_kwargs = dict(
|
720
|
-
snowpark_input_cols = self._snowpark_cols,
|
721
|
-
drop_input_cols = self._drop_input_cols
|
722
|
-
)
|
787
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
723
788
|
|
724
789
|
transform_handlers = ModelTransformerBuilder.build(
|
725
790
|
dataset=dataset,
|
@@ -732,7 +797,7 @@ class SpectralCoclustering(BaseTransformer):
|
|
732
797
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
733
798
|
inference_method=inference_method,
|
734
799
|
input_cols=self.input_cols,
|
735
|
-
expected_output_cols=
|
800
|
+
expected_output_cols=expected_output_cols,
|
736
801
|
**transform_kwargs
|
737
802
|
)
|
738
803
|
return output_df
|
@@ -761,12 +826,14 @@ class SpectralCoclustering(BaseTransformer):
|
|
761
826
|
Output dataset with probability of the sample for each class in the model.
|
762
827
|
"""
|
763
828
|
super()._check_dataset_type(dataset)
|
764
|
-
inference_method="score_samples"
|
829
|
+
inference_method = "score_samples"
|
765
830
|
|
766
831
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
767
832
|
# are specific to the type of dataset used.
|
768
833
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
769
834
|
|
835
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
836
|
+
|
770
837
|
if isinstance(dataset, DataFrame):
|
771
838
|
self._deps = self._batch_inference_validate_snowpark(
|
772
839
|
dataset=dataset,
|
@@ -779,6 +846,9 @@ class SpectralCoclustering(BaseTransformer):
|
|
779
846
|
drop_input_cols = self._drop_input_cols,
|
780
847
|
expected_output_cols_type="float",
|
781
848
|
)
|
849
|
+
expected_output_cols = self._align_expected_output_names(
|
850
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
851
|
+
)
|
782
852
|
|
783
853
|
elif isinstance(dataset, pd.DataFrame):
|
784
854
|
transform_kwargs = dict(
|
@@ -797,7 +867,7 @@ class SpectralCoclustering(BaseTransformer):
|
|
797
867
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
798
868
|
inference_method=inference_method,
|
799
869
|
input_cols=self.input_cols,
|
800
|
-
expected_output_cols=
|
870
|
+
expected_output_cols=expected_output_cols,
|
801
871
|
**transform_kwargs
|
802
872
|
)
|
803
873
|
return output_df
|
@@ -942,50 +1012,84 @@ class SpectralCoclustering(BaseTransformer):
|
|
942
1012
|
)
|
943
1013
|
return output_df
|
944
1014
|
|
1015
|
+
|
1016
|
+
|
1017
|
+
def to_sklearn(self) -> Any:
|
1018
|
+
"""Get sklearn.cluster.SpectralCoclustering object.
|
1019
|
+
"""
|
1020
|
+
if self._sklearn_object is None:
|
1021
|
+
self._sklearn_object = self._create_sklearn_object()
|
1022
|
+
return self._sklearn_object
|
1023
|
+
|
1024
|
+
def to_xgboost(self) -> Any:
|
1025
|
+
raise exceptions.SnowflakeMLException(
|
1026
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1027
|
+
original_exception=AttributeError(
|
1028
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1029
|
+
"to_xgboost()",
|
1030
|
+
"to_sklearn()"
|
1031
|
+
)
|
1032
|
+
),
|
1033
|
+
)
|
1034
|
+
|
1035
|
+
def to_lightgbm(self) -> Any:
|
1036
|
+
raise exceptions.SnowflakeMLException(
|
1037
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1038
|
+
original_exception=AttributeError(
|
1039
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1040
|
+
"to_lightgbm()",
|
1041
|
+
"to_sklearn()"
|
1042
|
+
)
|
1043
|
+
),
|
1044
|
+
)
|
945
1045
|
|
946
|
-
def
|
1046
|
+
def _get_dependencies(self) -> List[str]:
|
1047
|
+
return self._deps
|
1048
|
+
|
1049
|
+
|
1050
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
947
1051
|
self._model_signature_dict = dict()
|
948
1052
|
|
949
1053
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
950
1054
|
|
951
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1055
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
952
1056
|
outputs: List[BaseFeatureSpec] = []
|
953
1057
|
if hasattr(self, "predict"):
|
954
1058
|
# keep mypy happy
|
955
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1059
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
956
1060
|
# For classifier, the type of predict is the same as the type of label
|
957
|
-
if self._sklearn_object._estimator_type ==
|
958
|
-
|
1061
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1062
|
+
# label columns is the desired type for output
|
959
1063
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
960
1064
|
# rename the output columns
|
961
1065
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
962
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
963
|
-
|
964
|
-
|
1066
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1067
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1068
|
+
)
|
965
1069
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
966
1070
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
967
|
-
# Clusterer returns int64 cluster labels.
|
1071
|
+
# Clusterer returns int64 cluster labels.
|
968
1072
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
969
1073
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
970
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
971
|
-
|
972
|
-
|
973
|
-
|
1074
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1075
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1076
|
+
)
|
1077
|
+
|
974
1078
|
# For regressor, the type of predict is float64
|
975
|
-
elif self._sklearn_object._estimator_type ==
|
1079
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
976
1080
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
977
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
978
|
-
|
979
|
-
|
980
|
-
|
1081
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1082
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1083
|
+
)
|
1084
|
+
|
981
1085
|
for prob_func in PROB_FUNCTIONS:
|
982
1086
|
if hasattr(self, prob_func):
|
983
1087
|
output_cols_prefix: str = f"{prob_func}_"
|
984
1088
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
985
1089
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
986
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
987
|
-
|
988
|
-
|
1090
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1091
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1092
|
+
)
|
989
1093
|
|
990
1094
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
991
1095
|
items = list(self._model_signature_dict.items())
|
@@ -998,10 +1102,10 @@ class SpectralCoclustering(BaseTransformer):
|
|
998
1102
|
"""Returns model signature of current class.
|
999
1103
|
|
1000
1104
|
Raises:
|
1001
|
-
|
1105
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
1002
1106
|
|
1003
1107
|
Returns:
|
1004
|
-
Dict
|
1108
|
+
Dict with each method and its input output signature
|
1005
1109
|
"""
|
1006
1110
|
if self._model_signature_dict is None:
|
1007
1111
|
raise exceptions.SnowflakeMLException(
|
@@ -1009,35 +1113,3 @@ class SpectralCoclustering(BaseTransformer):
|
|
1009
1113
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
1010
1114
|
)
|
1011
1115
|
return self._model_signature_dict
|
1012
|
-
|
1013
|
-
def to_sklearn(self) -> Any:
|
1014
|
-
"""Get sklearn.cluster.SpectralCoclustering object.
|
1015
|
-
"""
|
1016
|
-
if self._sklearn_object is None:
|
1017
|
-
self._sklearn_object = self._create_sklearn_object()
|
1018
|
-
return self._sklearn_object
|
1019
|
-
|
1020
|
-
def to_xgboost(self) -> Any:
|
1021
|
-
raise exceptions.SnowflakeMLException(
|
1022
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1023
|
-
original_exception=AttributeError(
|
1024
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1025
|
-
"to_xgboost()",
|
1026
|
-
"to_sklearn()"
|
1027
|
-
)
|
1028
|
-
),
|
1029
|
-
)
|
1030
|
-
|
1031
|
-
def to_lightgbm(self) -> Any:
|
1032
|
-
raise exceptions.SnowflakeMLException(
|
1033
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1034
|
-
original_exception=AttributeError(
|
1035
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1036
|
-
"to_lightgbm()",
|
1037
|
-
"to_sklearn()"
|
1038
|
-
)
|
1039
|
-
),
|
1040
|
-
)
|
1041
|
-
|
1042
|
-
def _get_dependencies(self) -> List[str]:
|
1043
|
-
return self._deps
|