snowflake-ml-python 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/_internal/env_utils.py +11 -1
- snowflake/ml/_internal/utils/identifier.py +3 -1
- snowflake/ml/_internal/utils/sql_identifier.py +2 -6
- snowflake/ml/feature_store/feature_store.py +151 -78
- snowflake/ml/feature_store/feature_view.py +12 -24
- snowflake/ml/fileset/sfcfs.py +56 -50
- snowflake/ml/fileset/stage_fs.py +48 -13
- snowflake/ml/model/_client/model/model_version_impl.py +2 -50
- snowflake/ml/model/_client/ops/model_ops.py +78 -29
- snowflake/ml/model/_client/sql/model.py +23 -2
- snowflake/ml/model/_client/sql/model_version.py +22 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +19 -54
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +8 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +6 -10
- snowflake/ml/model/_packager/model_handlers/catboost.py +206 -0
- snowflake/ml/model/_packager/model_handlers/lightgbm.py +218 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -0
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_meta/model_meta.py +36 -6
- snowflake/ml/model/_packager/model_meta/model_meta_schema.py +20 -1
- snowflake/ml/model/_packager/model_meta_migrator/migrator_plans.py +3 -1
- snowflake/ml/model/_packager/model_packager.py +2 -2
- snowflake/ml/model/{_model_composer/model_runtime/_runtime_requirements.py → _packager/model_runtime/_snowml_inference_alternative_requirements.py} +1 -1
- snowflake/ml/model/_packager/model_runtime/model_runtime.py +137 -0
- snowflake/ml/model/type_hints.py +21 -2
- snowflake/ml/modeling/_internal/estimator_utils.py +16 -11
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +13 -14
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +5 -5
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +195 -123
- snowflake/ml/modeling/cluster/affinity_propagation.py +195 -123
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +195 -123
- snowflake/ml/modeling/cluster/birch.py +195 -123
- snowflake/ml/modeling/cluster/bisecting_k_means.py +195 -123
- snowflake/ml/modeling/cluster/dbscan.py +195 -123
- snowflake/ml/modeling/cluster/feature_agglomeration.py +195 -123
- snowflake/ml/modeling/cluster/k_means.py +195 -123
- snowflake/ml/modeling/cluster/mean_shift.py +195 -123
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +195 -123
- snowflake/ml/modeling/cluster/optics.py +195 -123
- snowflake/ml/modeling/cluster/spectral_biclustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_clustering.py +195 -123
- snowflake/ml/modeling/cluster/spectral_coclustering.py +195 -123
- snowflake/ml/modeling/compose/column_transformer.py +195 -123
- snowflake/ml/modeling/compose/transformed_target_regressor.py +195 -123
- snowflake/ml/modeling/covariance/elliptic_envelope.py +195 -123
- snowflake/ml/modeling/covariance/empirical_covariance.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso.py +195 -123
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +195 -123
- snowflake/ml/modeling/covariance/ledoit_wolf.py +195 -123
- snowflake/ml/modeling/covariance/min_cov_det.py +195 -123
- snowflake/ml/modeling/covariance/oas.py +195 -123
- snowflake/ml/modeling/covariance/shrunk_covariance.py +195 -123
- snowflake/ml/modeling/decomposition/dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/factor_analysis.py +195 -123
- snowflake/ml/modeling/decomposition/fast_ica.py +195 -123
- snowflake/ml/modeling/decomposition/incremental_pca.py +195 -123
- snowflake/ml/modeling/decomposition/kernel_pca.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +195 -123
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/pca.py +195 -123
- snowflake/ml/modeling/decomposition/sparse_pca.py +195 -123
- snowflake/ml/modeling/decomposition/truncated_svd.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/bagging_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/isolation_forest.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/stacking_regressor.py +195 -123
- snowflake/ml/modeling/ensemble/voting_classifier.py +195 -123
- snowflake/ml/modeling/ensemble/voting_regressor.py +195 -123
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fdr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fpr.py +195 -123
- snowflake/ml/modeling/feature_selection/select_fwe.py +195 -123
- snowflake/ml/modeling/feature_selection/select_k_best.py +195 -123
- snowflake/ml/modeling/feature_selection/select_percentile.py +195 -123
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +195 -123
- snowflake/ml/modeling/feature_selection/variance_threshold.py +195 -123
- snowflake/ml/modeling/framework/_utils.py +8 -1
- snowflake/ml/modeling/framework/base.py +9 -1
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +195 -123
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +195 -123
- snowflake/ml/modeling/impute/iterative_imputer.py +195 -123
- snowflake/ml/modeling/impute/knn_imputer.py +195 -123
- snowflake/ml/modeling/impute/missing_indicator.py +195 -123
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/nystroem.py +195 -123
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +195 -123
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +195 -123
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +195 -123
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +195 -123
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ard_regression.py +195 -123
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/gamma_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/huber_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/lars.py +195 -123
- snowflake/ml/modeling/linear_model/lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +195 -123
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +195 -123
- snowflake/ml/modeling/linear_model/linear_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression.py +195 -123
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +195 -123
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +195 -123
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/perceptron.py +195 -123
- snowflake/ml/modeling/linear_model/poisson_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ransac_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/ridge.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +195 -123
- snowflake/ml/modeling/linear_model/ridge_cv.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_classifier.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +195 -123
- snowflake/ml/modeling/linear_model/sgd_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +195 -123
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +195 -123
- snowflake/ml/modeling/manifold/isomap.py +195 -123
- snowflake/ml/modeling/manifold/mds.py +195 -123
- snowflake/ml/modeling/manifold/spectral_embedding.py +195 -123
- snowflake/ml/modeling/manifold/tsne.py +195 -123
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +195 -123
- snowflake/ml/modeling/mixture/gaussian_mixture.py +195 -123
- snowflake/ml/modeling/model_selection/grid_search_cv.py +42 -18
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +42 -18
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +195 -123
- snowflake/ml/modeling/multiclass/output_code_classifier.py +195 -123
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/complement_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +195 -123
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neighbors/kernel_density.py +195 -123
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_centroid.py +195 -123
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +195 -123
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +195 -123
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +195 -123
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_classifier.py +195 -123
- snowflake/ml/modeling/neural_network/mlp_regressor.py +195 -123
- snowflake/ml/modeling/pipeline/pipeline.py +4 -4
- snowflake/ml/modeling/preprocessing/binarizer.py +1 -5
- snowflake/ml/modeling/preprocessing/k_bins_discretizer.py +1 -5
- snowflake/ml/modeling/preprocessing/label_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/max_abs_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +10 -12
- snowflake/ml/modeling/preprocessing/normalizer.py +1 -5
- snowflake/ml/modeling/preprocessing/one_hot_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/ordinal_encoder.py +1 -5
- snowflake/ml/modeling/preprocessing/polynomial_features.py +195 -123
- snowflake/ml/modeling/preprocessing/robust_scaler.py +1 -5
- snowflake/ml/modeling/preprocessing/standard_scaler.py +11 -11
- snowflake/ml/modeling/semi_supervised/label_propagation.py +195 -123
- snowflake/ml/modeling/semi_supervised/label_spreading.py +195 -123
- snowflake/ml/modeling/svm/linear_svc.py +195 -123
- snowflake/ml/modeling/svm/linear_svr.py +195 -123
- snowflake/ml/modeling/svm/nu_svc.py +195 -123
- snowflake/ml/modeling/svm/nu_svr.py +195 -123
- snowflake/ml/modeling/svm/svc.py +195 -123
- snowflake/ml/modeling/svm/svr.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/decision_tree_regressor.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_classifier.py +195 -123
- snowflake/ml/modeling/tree/extra_tree_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgb_regressor.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +195 -123
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +195 -123
- snowflake/ml/registry/registry.py +1 -1
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/METADATA +68 -57
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/RECORD +202 -200
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +0 -97
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.4.0.dist-info → snowflake_ml_python-1.4.1.dist-info}/top_level.txt +0 -0
@@ -33,6 +33,15 @@ from snowflake.ml.modeling._internal.transformer_protocols import (
|
|
33
33
|
BatchInferenceKwargsTypedDict,
|
34
34
|
ScoreKwargsTypedDict
|
35
35
|
)
|
36
|
+
from snowflake.ml.model._signatures import utils as model_signature_utils
|
37
|
+
from snowflake.ml.model.model_signature import (
|
38
|
+
BaseFeatureSpec,
|
39
|
+
DataType,
|
40
|
+
FeatureSpec,
|
41
|
+
ModelSignature,
|
42
|
+
_infer_signature,
|
43
|
+
_rename_signature_with_snowflake_identifiers,
|
44
|
+
)
|
36
45
|
|
37
46
|
from snowflake.ml.modeling._internal.model_transformer_builder import ModelTransformerBuilder
|
38
47
|
|
@@ -43,16 +52,6 @@ from snowflake.ml.modeling._internal.estimator_utils import (
|
|
43
52
|
validate_sklearn_args,
|
44
53
|
)
|
45
54
|
|
46
|
-
from snowflake.ml.model.model_signature import (
|
47
|
-
DataType,
|
48
|
-
FeatureSpec,
|
49
|
-
ModelSignature,
|
50
|
-
_infer_signature,
|
51
|
-
_rename_signature_with_snowflake_identifiers,
|
52
|
-
BaseFeatureSpec,
|
53
|
-
)
|
54
|
-
from snowflake.ml.model._signatures import utils as model_signature_utils
|
55
|
-
|
56
55
|
_PROJECT = "ModelDevelopment"
|
57
56
|
# Derive subproject from module name by removing "sklearn"
|
58
57
|
# and converting module name from underscore to CamelCase
|
@@ -227,12 +226,7 @@ class OrthogonalMatchingPursuit(BaseTransformer):
|
|
227
226
|
)
|
228
227
|
return selected_cols
|
229
228
|
|
230
|
-
|
231
|
-
project=_PROJECT,
|
232
|
-
subproject=_SUBPROJECT,
|
233
|
-
custom_tags=dict([("autogen", True)]),
|
234
|
-
)
|
235
|
-
def fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "OrthogonalMatchingPursuit":
|
229
|
+
def _fit(self, dataset: Union[DataFrame, pd.DataFrame]) -> "OrthogonalMatchingPursuit":
|
236
230
|
"""Fit the model using X, y as training data
|
237
231
|
For more details on this function, see [sklearn.linear_model.OrthogonalMatchingPursuit.fit]
|
238
232
|
(https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.OrthogonalMatchingPursuit.html#sklearn.linear_model.OrthogonalMatchingPursuit.fit)
|
@@ -259,12 +253,14 @@ class OrthogonalMatchingPursuit(BaseTransformer):
|
|
259
253
|
|
260
254
|
self._snowpark_cols = dataset.select(self.input_cols).columns
|
261
255
|
|
262
|
-
|
256
|
+
# If we are already in a stored procedure, no need to kick off another one.
|
263
257
|
if SNOWML_SPROC_ENV in os.environ:
|
264
258
|
statement_params = telemetry.get_function_usage_statement_params(
|
265
259
|
project=_PROJECT,
|
266
260
|
subproject=_SUBPROJECT,
|
267
|
-
function_name=telemetry.get_statement_params_full_func_name(
|
261
|
+
function_name=telemetry.get_statement_params_full_func_name(
|
262
|
+
inspect.currentframe(), OrthogonalMatchingPursuit.__class__.__name__
|
263
|
+
),
|
268
264
|
api_calls=[Session.call],
|
269
265
|
custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
|
270
266
|
)
|
@@ -285,7 +281,7 @@ class OrthogonalMatchingPursuit(BaseTransformer):
|
|
285
281
|
)
|
286
282
|
self._sklearn_object = model_trainer.train()
|
287
283
|
self._is_fitted = True
|
288
|
-
self.
|
284
|
+
self._generate_model_signatures(dataset)
|
289
285
|
return self
|
290
286
|
|
291
287
|
def _batch_inference_validate_snowpark(
|
@@ -361,7 +357,9 @@ class OrthogonalMatchingPursuit(BaseTransformer):
|
|
361
357
|
# when it is classifier, infer the datatype from label columns
|
362
358
|
if expected_type_inferred == "" and 'predict' in self.model_signatures:
|
363
359
|
# Batch inference takes a single expected output column type. Use the first columns type for now.
|
364
|
-
label_cols_signatures = [
|
360
|
+
label_cols_signatures = [
|
361
|
+
row for row in self.model_signatures['predict'].outputs if row.name in self.output_cols
|
362
|
+
]
|
365
363
|
if len(label_cols_signatures) == 0:
|
366
364
|
error_str = f"Output columns {self.output_cols} do not match model signatures {self.model_signatures['predict'].outputs}."
|
367
365
|
raise exceptions.SnowflakeMLException(
|
@@ -369,25 +367,22 @@ class OrthogonalMatchingPursuit(BaseTransformer):
|
|
369
367
|
original_exception=ValueError(error_str),
|
370
368
|
)
|
371
369
|
|
372
|
-
expected_type_inferred = convert_sp_to_sf_type(
|
373
|
-
label_cols_signatures[0].as_snowpark_type()
|
374
|
-
)
|
370
|
+
expected_type_inferred = convert_sp_to_sf_type(label_cols_signatures[0].as_snowpark_type())
|
375
371
|
|
376
372
|
self._deps = self._batch_inference_validate_snowpark(dataset=dataset, inference_method=inference_method)
|
377
|
-
assert isinstance(
|
373
|
+
assert isinstance(
|
374
|
+
dataset._session, Session
|
375
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
378
376
|
|
379
377
|
transform_kwargs = dict(
|
380
|
-
session
|
381
|
-
dependencies
|
382
|
-
drop_input_cols
|
383
|
-
expected_output_cols_type
|
378
|
+
session=dataset._session,
|
379
|
+
dependencies=self._deps,
|
380
|
+
drop_input_cols=self._drop_input_cols,
|
381
|
+
expected_output_cols_type=expected_type_inferred,
|
384
382
|
)
|
385
383
|
|
386
384
|
elif isinstance(dataset, pd.DataFrame):
|
387
|
-
transform_kwargs = dict(
|
388
|
-
snowpark_input_cols = self._snowpark_cols,
|
389
|
-
drop_input_cols = self._drop_input_cols
|
390
|
-
)
|
385
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
391
386
|
|
392
387
|
transform_handlers = ModelTransformerBuilder.build(
|
393
388
|
dataset=dataset,
|
@@ -427,7 +422,7 @@ class OrthogonalMatchingPursuit(BaseTransformer):
|
|
427
422
|
Transformed dataset.
|
428
423
|
"""
|
429
424
|
super()._check_dataset_type(dataset)
|
430
|
-
inference_method="transform"
|
425
|
+
inference_method = "transform"
|
431
426
|
|
432
427
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
433
428
|
# are specific to the type of dataset used.
|
@@ -464,17 +459,14 @@ class OrthogonalMatchingPursuit(BaseTransformer):
|
|
464
459
|
assert isinstance(dataset._session, Session) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
465
460
|
|
466
461
|
transform_kwargs = dict(
|
467
|
-
session
|
468
|
-
dependencies
|
469
|
-
drop_input_cols
|
470
|
-
expected_output_cols_type
|
462
|
+
session=dataset._session,
|
463
|
+
dependencies=self._deps,
|
464
|
+
drop_input_cols=self._drop_input_cols,
|
465
|
+
expected_output_cols_type=expected_dtype,
|
471
466
|
)
|
472
467
|
|
473
468
|
elif isinstance(dataset, pd.DataFrame):
|
474
|
-
transform_kwargs = dict(
|
475
|
-
snowpark_input_cols = self._snowpark_cols,
|
476
|
-
drop_input_cols = self._drop_input_cols
|
477
|
-
)
|
469
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
478
470
|
|
479
471
|
transform_handlers = ModelTransformerBuilder.build(
|
480
472
|
dataset=dataset,
|
@@ -493,7 +485,11 @@ class OrthogonalMatchingPursuit(BaseTransformer):
|
|
493
485
|
return output_df
|
494
486
|
|
495
487
|
@available_if(original_estimator_has_callable("fit_predict")) # type: ignore[misc]
|
496
|
-
def fit_predict(
|
488
|
+
def fit_predict(
|
489
|
+
self,
|
490
|
+
dataset: Union[DataFrame, pd.DataFrame],
|
491
|
+
output_cols_prefix: str = "fit_predict_",
|
492
|
+
) -> Union[DataFrame, pd.DataFrame]:
|
497
493
|
""" Method not supported for this class.
|
498
494
|
|
499
495
|
|
@@ -518,7 +514,9 @@ class OrthogonalMatchingPursuit(BaseTransformer):
|
|
518
514
|
)
|
519
515
|
output_result, fitted_estimator = model_trainer.train_fit_predict(
|
520
516
|
drop_input_cols=self._drop_input_cols,
|
521
|
-
expected_output_cols_list=
|
517
|
+
expected_output_cols_list=(
|
518
|
+
self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix)
|
519
|
+
),
|
522
520
|
)
|
523
521
|
self._sklearn_object = fitted_estimator
|
524
522
|
self._is_fitted = True
|
@@ -535,6 +533,62 @@ class OrthogonalMatchingPursuit(BaseTransformer):
|
|
535
533
|
assert self._sklearn_object is not None
|
536
534
|
return self._sklearn_object.embedding_
|
537
535
|
|
536
|
+
|
537
|
+
def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
|
538
|
+
""" Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
|
539
|
+
Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
|
540
|
+
"""
|
541
|
+
output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
|
542
|
+
# The following condition is introduced for kneighbors methods, and not used in other methods
|
543
|
+
if output_cols:
|
544
|
+
output_cols = [
|
545
|
+
identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
|
546
|
+
for c in output_cols
|
547
|
+
]
|
548
|
+
elif getattr(self._sklearn_object, "classes_", None) is None:
|
549
|
+
output_cols = [output_cols_prefix]
|
550
|
+
elif self._sklearn_object is not None:
|
551
|
+
classes = self._sklearn_object.classes_
|
552
|
+
if isinstance(classes, numpy.ndarray):
|
553
|
+
output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
|
554
|
+
elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
|
555
|
+
# If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
|
556
|
+
output_cols = []
|
557
|
+
for i, cl in enumerate(classes):
|
558
|
+
# For binary classification, there is only one output column for each class
|
559
|
+
# ndarray as the two classes are complementary.
|
560
|
+
if len(cl) == 2:
|
561
|
+
output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
|
562
|
+
else:
|
563
|
+
output_cols.extend([
|
564
|
+
f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
|
565
|
+
])
|
566
|
+
else:
|
567
|
+
output_cols = []
|
568
|
+
|
569
|
+
# Make sure column names are valid snowflake identifiers.
|
570
|
+
assert output_cols is not None # Make MyPy happy
|
571
|
+
rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
|
572
|
+
|
573
|
+
return rv
|
574
|
+
|
575
|
+
def _align_expected_output_names(
|
576
|
+
self, method: str, dataset: DataFrame, expected_output_cols_list: List[str], output_cols_prefix: str
|
577
|
+
) -> List[str]:
|
578
|
+
# in case the inferred output column names dimension is different
|
579
|
+
# we use one line of snowpark dataframe and put it into sklearn estimator using pandas
|
580
|
+
output_df_pd = getattr(self, method)(dataset.limit(1).to_pandas(), output_cols_prefix)
|
581
|
+
output_df_columns = list(output_df_pd.columns)
|
582
|
+
output_df_columns_set: Set[str] = set(output_df_columns) - set(dataset.columns)
|
583
|
+
if self.sample_weight_col:
|
584
|
+
output_df_columns_set -= set(self.sample_weight_col)
|
585
|
+
# if the dimension of inferred output column names is correct; use it
|
586
|
+
if len(expected_output_cols_list) == len(output_df_columns_set):
|
587
|
+
return expected_output_cols_list
|
588
|
+
# otherwise, use the sklearn estimator's output
|
589
|
+
else:
|
590
|
+
return sorted(list(output_df_columns_set), key=lambda x: output_df_columns.index(x))
|
591
|
+
|
538
592
|
@available_if(original_estimator_has_callable("predict_proba")) # type: ignore[misc]
|
539
593
|
@telemetry.send_api_usage_telemetry(
|
540
594
|
project=_PROJECT,
|
@@ -565,24 +619,28 @@ class OrthogonalMatchingPursuit(BaseTransformer):
|
|
565
619
|
# are specific to the type of dataset used.
|
566
620
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
567
621
|
|
622
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
623
|
+
|
568
624
|
if isinstance(dataset, DataFrame):
|
569
625
|
self._deps = self._batch_inference_validate_snowpark(
|
570
626
|
dataset=dataset,
|
571
627
|
inference_method=inference_method,
|
572
628
|
)
|
573
|
-
assert isinstance(
|
629
|
+
assert isinstance(
|
630
|
+
dataset._session, Session
|
631
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
574
632
|
transform_kwargs = dict(
|
575
633
|
session=dataset._session,
|
576
634
|
dependencies=self._deps,
|
577
|
-
drop_input_cols
|
635
|
+
drop_input_cols=self._drop_input_cols,
|
578
636
|
expected_output_cols_type="float",
|
579
637
|
)
|
638
|
+
expected_output_cols = self._align_expected_output_names(
|
639
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
640
|
+
)
|
580
641
|
|
581
642
|
elif isinstance(dataset, pd.DataFrame):
|
582
|
-
transform_kwargs = dict(
|
583
|
-
snowpark_input_cols = self._snowpark_cols,
|
584
|
-
drop_input_cols = self._drop_input_cols
|
585
|
-
)
|
643
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
586
644
|
|
587
645
|
transform_handlers = ModelTransformerBuilder.build(
|
588
646
|
dataset=dataset,
|
@@ -594,7 +652,7 @@ class OrthogonalMatchingPursuit(BaseTransformer):
|
|
594
652
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
595
653
|
inference_method=inference_method,
|
596
654
|
input_cols=self.input_cols,
|
597
|
-
expected_output_cols=
|
655
|
+
expected_output_cols=expected_output_cols,
|
598
656
|
**transform_kwargs
|
599
657
|
)
|
600
658
|
return output_df
|
@@ -624,7 +682,8 @@ class OrthogonalMatchingPursuit(BaseTransformer):
|
|
624
682
|
Output dataset with log probability of the sample for each class in the model.
|
625
683
|
"""
|
626
684
|
super()._check_dataset_type(dataset)
|
627
|
-
inference_method="predict_log_proba"
|
685
|
+
inference_method = "predict_log_proba"
|
686
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
628
687
|
|
629
688
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
630
689
|
# are specific to the type of dataset used.
|
@@ -635,18 +694,20 @@ class OrthogonalMatchingPursuit(BaseTransformer):
|
|
635
694
|
dataset=dataset,
|
636
695
|
inference_method=inference_method,
|
637
696
|
)
|
638
|
-
assert isinstance(
|
697
|
+
assert isinstance(
|
698
|
+
dataset._session, Session
|
699
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
639
700
|
transform_kwargs = dict(
|
640
701
|
session=dataset._session,
|
641
702
|
dependencies=self._deps,
|
642
|
-
drop_input_cols
|
703
|
+
drop_input_cols=self._drop_input_cols,
|
643
704
|
expected_output_cols_type="float",
|
644
705
|
)
|
706
|
+
expected_output_cols = self._align_expected_output_names(
|
707
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
708
|
+
)
|
645
709
|
elif isinstance(dataset, pd.DataFrame):
|
646
|
-
transform_kwargs = dict(
|
647
|
-
snowpark_input_cols = self._snowpark_cols,
|
648
|
-
drop_input_cols = self._drop_input_cols
|
649
|
-
)
|
710
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
650
711
|
|
651
712
|
transform_handlers = ModelTransformerBuilder.build(
|
652
713
|
dataset=dataset,
|
@@ -659,7 +720,7 @@ class OrthogonalMatchingPursuit(BaseTransformer):
|
|
659
720
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
660
721
|
inference_method=inference_method,
|
661
722
|
input_cols=self.input_cols,
|
662
|
-
expected_output_cols=
|
723
|
+
expected_output_cols=expected_output_cols,
|
663
724
|
**transform_kwargs
|
664
725
|
)
|
665
726
|
return output_df
|
@@ -685,30 +746,34 @@ class OrthogonalMatchingPursuit(BaseTransformer):
|
|
685
746
|
Output dataset with results of the decision function for the samples in input dataset.
|
686
747
|
"""
|
687
748
|
super()._check_dataset_type(dataset)
|
688
|
-
inference_method="decision_function"
|
749
|
+
inference_method = "decision_function"
|
689
750
|
|
690
751
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
691
752
|
# are specific to the type of dataset used.
|
692
753
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
693
754
|
|
755
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
756
|
+
|
694
757
|
if isinstance(dataset, DataFrame):
|
695
758
|
self._deps = self._batch_inference_validate_snowpark(
|
696
759
|
dataset=dataset,
|
697
760
|
inference_method=inference_method,
|
698
761
|
)
|
699
|
-
assert isinstance(
|
762
|
+
assert isinstance(
|
763
|
+
dataset._session, Session
|
764
|
+
) # mypy does not recognize the check in _batch_inference_validate_snowpark()
|
700
765
|
transform_kwargs = dict(
|
701
766
|
session=dataset._session,
|
702
767
|
dependencies=self._deps,
|
703
|
-
drop_input_cols
|
768
|
+
drop_input_cols=self._drop_input_cols,
|
704
769
|
expected_output_cols_type="float",
|
705
770
|
)
|
771
|
+
expected_output_cols = self._align_expected_output_names(
|
772
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
773
|
+
)
|
706
774
|
|
707
775
|
elif isinstance(dataset, pd.DataFrame):
|
708
|
-
transform_kwargs = dict(
|
709
|
-
snowpark_input_cols = self._snowpark_cols,
|
710
|
-
drop_input_cols = self._drop_input_cols
|
711
|
-
)
|
776
|
+
transform_kwargs = dict(snowpark_input_cols=self._snowpark_cols, drop_input_cols=self._drop_input_cols)
|
712
777
|
|
713
778
|
transform_handlers = ModelTransformerBuilder.build(
|
714
779
|
dataset=dataset,
|
@@ -721,7 +786,7 @@ class OrthogonalMatchingPursuit(BaseTransformer):
|
|
721
786
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
722
787
|
inference_method=inference_method,
|
723
788
|
input_cols=self.input_cols,
|
724
|
-
expected_output_cols=
|
789
|
+
expected_output_cols=expected_output_cols,
|
725
790
|
**transform_kwargs
|
726
791
|
)
|
727
792
|
return output_df
|
@@ -750,12 +815,14 @@ class OrthogonalMatchingPursuit(BaseTransformer):
|
|
750
815
|
Output dataset with probability of the sample for each class in the model.
|
751
816
|
"""
|
752
817
|
super()._check_dataset_type(dataset)
|
753
|
-
inference_method="score_samples"
|
818
|
+
inference_method = "score_samples"
|
754
819
|
|
755
820
|
# This dictionary contains optional kwargs for batch inference. These kwargs
|
756
821
|
# are specific to the type of dataset used.
|
757
822
|
transform_kwargs: BatchInferenceKwargsTypedDict = dict()
|
758
823
|
|
824
|
+
expected_output_cols = self._get_output_column_names(output_cols_prefix)
|
825
|
+
|
759
826
|
if isinstance(dataset, DataFrame):
|
760
827
|
self._deps = self._batch_inference_validate_snowpark(
|
761
828
|
dataset=dataset,
|
@@ -768,6 +835,9 @@ class OrthogonalMatchingPursuit(BaseTransformer):
|
|
768
835
|
drop_input_cols = self._drop_input_cols,
|
769
836
|
expected_output_cols_type="float",
|
770
837
|
)
|
838
|
+
expected_output_cols = self._align_expected_output_names(
|
839
|
+
inference_method, dataset, expected_output_cols, output_cols_prefix
|
840
|
+
)
|
771
841
|
|
772
842
|
elif isinstance(dataset, pd.DataFrame):
|
773
843
|
transform_kwargs = dict(
|
@@ -786,7 +856,7 @@ class OrthogonalMatchingPursuit(BaseTransformer):
|
|
786
856
|
output_df: DATAFRAME_TYPE = transform_handlers.batch_inference(
|
787
857
|
inference_method=inference_method,
|
788
858
|
input_cols=self.input_cols,
|
789
|
-
expected_output_cols=
|
859
|
+
expected_output_cols=expected_output_cols,
|
790
860
|
**transform_kwargs
|
791
861
|
)
|
792
862
|
return output_df
|
@@ -933,50 +1003,84 @@ class OrthogonalMatchingPursuit(BaseTransformer):
|
|
933
1003
|
)
|
934
1004
|
return output_df
|
935
1005
|
|
1006
|
+
|
1007
|
+
|
1008
|
+
def to_sklearn(self) -> Any:
|
1009
|
+
"""Get sklearn.linear_model.OrthogonalMatchingPursuit object.
|
1010
|
+
"""
|
1011
|
+
if self._sklearn_object is None:
|
1012
|
+
self._sklearn_object = self._create_sklearn_object()
|
1013
|
+
return self._sklearn_object
|
1014
|
+
|
1015
|
+
def to_xgboost(self) -> Any:
|
1016
|
+
raise exceptions.SnowflakeMLException(
|
1017
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1018
|
+
original_exception=AttributeError(
|
1019
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1020
|
+
"to_xgboost()",
|
1021
|
+
"to_sklearn()"
|
1022
|
+
)
|
1023
|
+
),
|
1024
|
+
)
|
1025
|
+
|
1026
|
+
def to_lightgbm(self) -> Any:
|
1027
|
+
raise exceptions.SnowflakeMLException(
|
1028
|
+
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1029
|
+
original_exception=AttributeError(
|
1030
|
+
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1031
|
+
"to_lightgbm()",
|
1032
|
+
"to_sklearn()"
|
1033
|
+
)
|
1034
|
+
),
|
1035
|
+
)
|
936
1036
|
|
937
|
-
def
|
1037
|
+
def _get_dependencies(self) -> List[str]:
|
1038
|
+
return self._deps
|
1039
|
+
|
1040
|
+
|
1041
|
+
def _generate_model_signatures(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
|
938
1042
|
self._model_signature_dict = dict()
|
939
1043
|
|
940
1044
|
PROB_FUNCTIONS = ["predict_log_proba", "predict_proba", "decision_function"]
|
941
1045
|
|
942
|
-
inputs = list(_infer_signature(dataset[self.input_cols], "input"))
|
1046
|
+
inputs = list(_infer_signature(dataset[self.input_cols], "input", use_snowflake_identifiers=True))
|
943
1047
|
outputs: List[BaseFeatureSpec] = []
|
944
1048
|
if hasattr(self, "predict"):
|
945
1049
|
# keep mypy happy
|
946
|
-
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
1050
|
+
assert self._sklearn_object is not None and hasattr(self._sklearn_object, "_estimator_type")
|
947
1051
|
# For classifier, the type of predict is the same as the type of label
|
948
|
-
if self._sklearn_object._estimator_type ==
|
949
|
-
|
1052
|
+
if self._sklearn_object._estimator_type == "classifier":
|
1053
|
+
# label columns is the desired type for output
|
950
1054
|
outputs = list(_infer_signature(dataset[self.label_cols], "output", use_snowflake_identifiers=True))
|
951
1055
|
# rename the output columns
|
952
1056
|
outputs = list(model_signature_utils.rename_features(outputs, self.output_cols))
|
953
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
954
|
-
|
955
|
-
|
1057
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1058
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1059
|
+
)
|
956
1060
|
# For mixture models that use the density mixin, `predict` returns the argmax of the log prob.
|
957
1061
|
# For outlier models, returns -1 for outliers and 1 for inliers.
|
958
|
-
# Clusterer returns int64 cluster labels.
|
1062
|
+
# Clusterer returns int64 cluster labels.
|
959
1063
|
elif self._sklearn_object._estimator_type in ["DensityEstimator", "clusterer", "outlier_detector"]:
|
960
1064
|
outputs = [FeatureSpec(dtype=DataType.INT64, name=c) for c in self.output_cols]
|
961
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
962
|
-
|
963
|
-
|
964
|
-
|
1065
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1066
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1067
|
+
)
|
1068
|
+
|
965
1069
|
# For regressor, the type of predict is float64
|
966
|
-
elif self._sklearn_object._estimator_type ==
|
1070
|
+
elif self._sklearn_object._estimator_type == "regressor":
|
967
1071
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
|
968
|
-
self._model_signature_dict["predict"] = ModelSignature(
|
969
|
-
|
970
|
-
|
971
|
-
|
1072
|
+
self._model_signature_dict["predict"] = ModelSignature(
|
1073
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1074
|
+
)
|
1075
|
+
|
972
1076
|
for prob_func in PROB_FUNCTIONS:
|
973
1077
|
if hasattr(self, prob_func):
|
974
1078
|
output_cols_prefix: str = f"{prob_func}_"
|
975
1079
|
output_column_names = self._get_output_column_names(output_cols_prefix)
|
976
1080
|
outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
|
977
|
-
self._model_signature_dict[prob_func] = ModelSignature(
|
978
|
-
|
979
|
-
|
1081
|
+
self._model_signature_dict[prob_func] = ModelSignature(
|
1082
|
+
inputs, ([] if self._drop_input_cols else inputs) + outputs
|
1083
|
+
)
|
980
1084
|
|
981
1085
|
# Output signature names may still need to be renamed, since they were not created with `_infer_signature`.
|
982
1086
|
items = list(self._model_signature_dict.items())
|
@@ -989,10 +1093,10 @@ class OrthogonalMatchingPursuit(BaseTransformer):
|
|
989
1093
|
"""Returns model signature of current class.
|
990
1094
|
|
991
1095
|
Raises:
|
992
|
-
|
1096
|
+
SnowflakeMLException: If estimator is not fitted, then model signature cannot be inferred
|
993
1097
|
|
994
1098
|
Returns:
|
995
|
-
Dict
|
1099
|
+
Dict with each method and its input output signature
|
996
1100
|
"""
|
997
1101
|
if self._model_signature_dict is None:
|
998
1102
|
raise exceptions.SnowflakeMLException(
|
@@ -1000,35 +1104,3 @@ class OrthogonalMatchingPursuit(BaseTransformer):
|
|
1000
1104
|
original_exception=RuntimeError("Estimator not fitted before accessing property model_signatures!"),
|
1001
1105
|
)
|
1002
1106
|
return self._model_signature_dict
|
1003
|
-
|
1004
|
-
def to_sklearn(self) -> Any:
|
1005
|
-
"""Get sklearn.linear_model.OrthogonalMatchingPursuit object.
|
1006
|
-
"""
|
1007
|
-
if self._sklearn_object is None:
|
1008
|
-
self._sklearn_object = self._create_sklearn_object()
|
1009
|
-
return self._sklearn_object
|
1010
|
-
|
1011
|
-
def to_xgboost(self) -> Any:
|
1012
|
-
raise exceptions.SnowflakeMLException(
|
1013
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1014
|
-
original_exception=AttributeError(
|
1015
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1016
|
-
"to_xgboost()",
|
1017
|
-
"to_sklearn()"
|
1018
|
-
)
|
1019
|
-
),
|
1020
|
-
)
|
1021
|
-
|
1022
|
-
def to_lightgbm(self) -> Any:
|
1023
|
-
raise exceptions.SnowflakeMLException(
|
1024
|
-
error_code=error_codes.METHOD_NOT_ALLOWED,
|
1025
|
-
original_exception=AttributeError(
|
1026
|
-
modeling_error_messages.UNSUPPORTED_MODEL_CONVERSION.format(
|
1027
|
-
"to_lightgbm()",
|
1028
|
-
"to_sklearn()"
|
1029
|
-
)
|
1030
|
-
),
|
1031
|
-
)
|
1032
|
-
|
1033
|
-
def _get_dependencies(self) -> List[str]:
|
1034
|
-
return self._deps
|