PyPI - snowflake-ml-python - Versions diffs - 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl - Mend

snowflake-ml-python 1.0.1py3-none-any.whl → 1.0.3py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (196) hide show

snowflake/ml/_internal/env_utils.py +2 -1
snowflake/ml/_internal/file_utils.py +35 -40
snowflake/ml/_internal/telemetry.py +5 -8
snowflake/ml/_internal/utils/identifier.py +74 -7
snowflake/ml/_internal/utils/uri.py +7 -2
snowflake/ml/model/_core_requirements.py +1 -1
snowflake/ml/model/_deploy_client/image_builds/base_image_builder.py +15 -0
snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +259 -0
snowflake/ml/model/_deploy_client/image_builds/docker_context.py +89 -0
snowflake/ml/model/_deploy_client/image_builds/gunicorn_run.sh +24 -0
snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +118 -0
snowflake/ml/model/_deploy_client/image_builds/templates/dockerfile_template +40 -0
snowflake/ml/model/_deploy_client/snowservice/deploy.py +199 -0
snowflake/ml/model/_deploy_client/snowservice/deploy_options.py +88 -0
snowflake/ml/model/_deploy_client/snowservice/templates/service_spec_template +24 -0
snowflake/ml/model/_deploy_client/utils/constants.py +47 -0
snowflake/ml/model/_deploy_client/utils/snowservice_client.py +178 -0
snowflake/ml/model/_deploy_client/warehouse/deploy.py +25 -28
snowflake/ml/model/_deploy_client/warehouse/infer_template.py +7 -4
snowflake/ml/model/_deployer.py +14 -27
snowflake/ml/model/_env.py +4 -4
snowflake/ml/model/_handlers/_base.py +3 -1
snowflake/ml/model/_handlers/custom.py +14 -2
snowflake/ml/model/_handlers/pytorch.py +186 -0
snowflake/ml/model/_handlers/sklearn.py +14 -8
snowflake/ml/model/_handlers/snowmlmodel.py +14 -9
snowflake/ml/model/_handlers/torchscript.py +180 -0
snowflake/ml/model/_handlers/xgboost.py +19 -9
snowflake/ml/model/_model.py +27 -21
snowflake/ml/model/_model_meta.py +33 -19
snowflake/ml/model/model_signature.py +446 -66
snowflake/ml/model/type_hints.py +28 -15
snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +79 -43
snowflake/ml/modeling/cluster/affinity_propagation.py +79 -43
snowflake/ml/modeling/cluster/agglomerative_clustering.py +79 -43
snowflake/ml/modeling/cluster/birch.py +79 -43
snowflake/ml/modeling/cluster/bisecting_k_means.py +79 -43
snowflake/ml/modeling/cluster/dbscan.py +79 -43
snowflake/ml/modeling/cluster/feature_agglomeration.py +79 -43
snowflake/ml/modeling/cluster/k_means.py +79 -43
snowflake/ml/modeling/cluster/mean_shift.py +79 -43
snowflake/ml/modeling/cluster/mini_batch_k_means.py +79 -43
snowflake/ml/modeling/cluster/optics.py +79 -43
snowflake/ml/modeling/cluster/spectral_biclustering.py +79 -43
snowflake/ml/modeling/cluster/spectral_clustering.py +79 -43
snowflake/ml/modeling/cluster/spectral_coclustering.py +79 -43
snowflake/ml/modeling/compose/column_transformer.py +79 -43
snowflake/ml/modeling/compose/transformed_target_regressor.py +79 -43
snowflake/ml/modeling/covariance/elliptic_envelope.py +79 -43
snowflake/ml/modeling/covariance/empirical_covariance.py +79 -43
snowflake/ml/modeling/covariance/graphical_lasso.py +79 -43
snowflake/ml/modeling/covariance/graphical_lasso_cv.py +79 -43
snowflake/ml/modeling/covariance/ledoit_wolf.py +79 -43
snowflake/ml/modeling/covariance/min_cov_det.py +79 -43
snowflake/ml/modeling/covariance/oas.py +79 -43
snowflake/ml/modeling/covariance/shrunk_covariance.py +79 -43
snowflake/ml/modeling/decomposition/dictionary_learning.py +79 -43
snowflake/ml/modeling/decomposition/factor_analysis.py +79 -43
snowflake/ml/modeling/decomposition/fast_ica.py +79 -43
snowflake/ml/modeling/decomposition/incremental_pca.py +79 -43
snowflake/ml/modeling/decomposition/kernel_pca.py +79 -43
snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +79 -43
snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +79 -43
snowflake/ml/modeling/decomposition/pca.py +79 -43
snowflake/ml/modeling/decomposition/sparse_pca.py +79 -43
snowflake/ml/modeling/decomposition/truncated_svd.py +79 -43
snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +79 -43
snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +79 -43
snowflake/ml/modeling/ensemble/ada_boost_classifier.py +79 -43
snowflake/ml/modeling/ensemble/ada_boost_regressor.py +79 -43
snowflake/ml/modeling/ensemble/bagging_classifier.py +79 -43
snowflake/ml/modeling/ensemble/bagging_regressor.py +79 -43
snowflake/ml/modeling/ensemble/extra_trees_classifier.py +79 -43
snowflake/ml/modeling/ensemble/extra_trees_regressor.py +79 -43
snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +79 -43
snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +79 -43
snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +79 -43
snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +79 -43
snowflake/ml/modeling/ensemble/isolation_forest.py +79 -43
snowflake/ml/modeling/ensemble/random_forest_classifier.py +79 -43
snowflake/ml/modeling/ensemble/random_forest_regressor.py +79 -43
snowflake/ml/modeling/ensemble/stacking_regressor.py +79 -43
snowflake/ml/modeling/ensemble/voting_classifier.py +79 -43
snowflake/ml/modeling/ensemble/voting_regressor.py +79 -43
snowflake/ml/modeling/feature_selection/generic_univariate_select.py +79 -43
snowflake/ml/modeling/feature_selection/select_fdr.py +79 -43
snowflake/ml/modeling/feature_selection/select_fpr.py +79 -43
snowflake/ml/modeling/feature_selection/select_fwe.py +79 -43
snowflake/ml/modeling/feature_selection/select_k_best.py +79 -43
snowflake/ml/modeling/feature_selection/select_percentile.py +79 -43
snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +79 -43
snowflake/ml/modeling/feature_selection/variance_threshold.py +79 -43
snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +79 -43
snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +79 -43
snowflake/ml/modeling/impute/iterative_imputer.py +79 -43
snowflake/ml/modeling/impute/knn_imputer.py +79 -43
snowflake/ml/modeling/impute/missing_indicator.py +79 -43
snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +79 -43
snowflake/ml/modeling/kernel_approximation/nystroem.py +79 -43
snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +79 -43
snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +79 -43
snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +79 -43
snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +79 -43
snowflake/ml/modeling/lightgbm/lgbm_classifier.py +79 -43
snowflake/ml/modeling/lightgbm/lgbm_regressor.py +79 -43
snowflake/ml/modeling/linear_model/ard_regression.py +79 -43
snowflake/ml/modeling/linear_model/bayesian_ridge.py +79 -43
snowflake/ml/modeling/linear_model/elastic_net.py +79 -43
snowflake/ml/modeling/linear_model/elastic_net_cv.py +79 -43
snowflake/ml/modeling/linear_model/gamma_regressor.py +79 -43
snowflake/ml/modeling/linear_model/huber_regressor.py +79 -43
snowflake/ml/modeling/linear_model/lars.py +79 -43
snowflake/ml/modeling/linear_model/lars_cv.py +79 -43
snowflake/ml/modeling/linear_model/lasso.py +79 -43
snowflake/ml/modeling/linear_model/lasso_cv.py +79 -43
snowflake/ml/modeling/linear_model/lasso_lars.py +79 -43
snowflake/ml/modeling/linear_model/lasso_lars_cv.py +79 -43
snowflake/ml/modeling/linear_model/lasso_lars_ic.py +79 -43
snowflake/ml/modeling/linear_model/linear_regression.py +79 -43
snowflake/ml/modeling/linear_model/logistic_regression.py +79 -43
snowflake/ml/modeling/linear_model/logistic_regression_cv.py +79 -43
snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +79 -43
snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +79 -43
snowflake/ml/modeling/linear_model/multi_task_lasso.py +79 -43
snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +79 -43
snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +79 -43
snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +79 -43
snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +79 -43
snowflake/ml/modeling/linear_model/perceptron.py +79 -43
snowflake/ml/modeling/linear_model/poisson_regressor.py +79 -43
snowflake/ml/modeling/linear_model/ransac_regressor.py +79 -43
snowflake/ml/modeling/linear_model/ridge.py +79 -43
snowflake/ml/modeling/linear_model/ridge_classifier.py +79 -43
snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +79 -43
snowflake/ml/modeling/linear_model/ridge_cv.py +79 -43
snowflake/ml/modeling/linear_model/sgd_classifier.py +79 -43
snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +79 -43
snowflake/ml/modeling/linear_model/sgd_regressor.py +79 -43
snowflake/ml/modeling/linear_model/theil_sen_regressor.py +79 -43
snowflake/ml/modeling/linear_model/tweedie_regressor.py +79 -43
snowflake/ml/modeling/manifold/isomap.py +79 -43
snowflake/ml/modeling/manifold/mds.py +79 -43
snowflake/ml/modeling/manifold/spectral_embedding.py +79 -43
snowflake/ml/modeling/manifold/tsne.py +79 -43
snowflake/ml/modeling/metrics/classification.py +6 -1
snowflake/ml/modeling/metrics/regression.py +517 -9
snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +79 -43
snowflake/ml/modeling/mixture/gaussian_mixture.py +79 -43
snowflake/ml/modeling/model_selection/grid_search_cv.py +79 -43
snowflake/ml/modeling/model_selection/randomized_search_cv.py +79 -43
snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +79 -43
snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +79 -43
snowflake/ml/modeling/multiclass/output_code_classifier.py +79 -43
snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +79 -43
snowflake/ml/modeling/naive_bayes/categorical_nb.py +79 -43
snowflake/ml/modeling/naive_bayes/complement_nb.py +79 -43
snowflake/ml/modeling/naive_bayes/gaussian_nb.py +79 -43
snowflake/ml/modeling/naive_bayes/multinomial_nb.py +79 -43
snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +79 -43
snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +79 -43
snowflake/ml/modeling/neighbors/kernel_density.py +79 -43
snowflake/ml/modeling/neighbors/local_outlier_factor.py +79 -43
snowflake/ml/modeling/neighbors/nearest_centroid.py +79 -43
snowflake/ml/modeling/neighbors/nearest_neighbors.py +79 -43
snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +79 -43
snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +79 -43
snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +79 -43
snowflake/ml/modeling/neural_network/bernoulli_rbm.py +79 -43
snowflake/ml/modeling/neural_network/mlp_classifier.py +79 -43
snowflake/ml/modeling/neural_network/mlp_regressor.py +79 -43
snowflake/ml/modeling/pipeline/pipeline.py +24 -0
snowflake/ml/modeling/preprocessing/one_hot_encoder.py +18 -19
snowflake/ml/modeling/preprocessing/ordinal_encoder.py +2 -0
snowflake/ml/modeling/preprocessing/polynomial_features.py +79 -43
snowflake/ml/modeling/semi_supervised/label_propagation.py +79 -43
snowflake/ml/modeling/semi_supervised/label_spreading.py +79 -43
snowflake/ml/modeling/svm/linear_svc.py +79 -43
snowflake/ml/modeling/svm/linear_svr.py +79 -43
snowflake/ml/modeling/svm/nu_svc.py +79 -43
snowflake/ml/modeling/svm/nu_svr.py +79 -43
snowflake/ml/modeling/svm/svc.py +79 -43
snowflake/ml/modeling/svm/svr.py +79 -43
snowflake/ml/modeling/tree/decision_tree_classifier.py +79 -43
snowflake/ml/modeling/tree/decision_tree_regressor.py +79 -43
snowflake/ml/modeling/tree/extra_tree_classifier.py +79 -43
snowflake/ml/modeling/tree/extra_tree_regressor.py +79 -43
snowflake/ml/modeling/xgboost/xgb_classifier.py +79 -43
snowflake/ml/modeling/xgboost/xgb_regressor.py +79 -43
snowflake/ml/modeling/xgboost/xgbrf_classifier.py +79 -43
snowflake/ml/modeling/xgboost/xgbrf_regressor.py +79 -43
snowflake/ml/registry/model_registry.py +123 -121
snowflake/ml/version.py +1 -1
{snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/METADATA +50 -8
snowflake_ml_python-1.0.3.dist-info/RECORD +259 -0
snowflake_ml_python-1.0.1.dist-info/RECORD +0 -246
{snowflake_ml_python-1.0.1.dist-info → snowflake_ml_python-1.0.3.dist-info}/WHEEL +0 -0

snowflake/ml/modeling/ensemble/voting_regressor.py CHANGED Viewed

@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4
@@ -27,6 +28,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -203,7 +205,6 @@ class VotingRegressor(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
         deps = deps | _gather_dependencies(estimators)
         self._deps = list(deps)
@@ -226,6 +227,15 @@ class VotingRegressor(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)
+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -304,7 +314,7 @@ class VotingRegressor(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)
         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -317,11 +327,12 @@ class VotingRegressor(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()
-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -347,6 +358,7 @@ class VotingRegressor(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -355,7 +367,8 @@ class VotingRegressor(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -422,15 +435,15 @@ class VotingRegressor(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
         if "|" in sproc_export_file_name:
@@ -440,7 +453,7 @@ class VotingRegressor(BaseTransformer):
                 print("\n".join(fields[1:]))
         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -486,7 +499,7 @@ class VotingRegressor(BaseTransformer):
         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-                safe_id=self.id, method=inference_method)
+                safe_id=self._get_rand_id(), method=inference_method)
         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -578,7 +591,7 @@ class VotingRegressor(BaseTransformer):
             return transformed_pandas_df.to_dict("records")
         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )
         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -634,26 +647,37 @@ class VotingRegressor(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
         estimator = self._sklearn_object
-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator =  getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                    i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+                ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator
         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
@@ -734,11 +758,18 @@ class VotingRegressor(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = "float"
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="float",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
@@ -811,10 +842,10 @@ class VotingRegressor(BaseTransformer):
     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]
         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1039,7 +1070,7 @@ class VotingRegressor(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)
         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1053,8 +1084,9 @@ class VotingRegressor(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()
-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1080,6 +1112,7 @@ class VotingRegressor(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1087,7 +1120,8 @@ class VotingRegressor(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1137,14 +1171,14 @@ class VotingRegressor(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
         cleanup_temp_files([local_score_file_name])
@@ -1162,18 +1196,20 @@ class VotingRegressor(BaseTransformer):
             if self._sklearn_object._estimator_type == 'classifier':
                 outputs = _infer_signature(dataset[self.label_cols], "output")  # label columns is the desired type for output
                 outputs = _rename_features(outputs, self.output_cols)  # rename the output columns
-                self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+                self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)
             # For regressor, the type of predict is float64
             elif self._sklearn_object._estimator_type == 'regressor':
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-                self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+                self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)
     @property
     def model_signatures(self) -> Dict[str, ModelSignature]:

snowflake/ml/modeling/feature_selection/generic_univariate_select.py CHANGED Viewed

@@ -7,6 +7,7 @@
 #
 import inspect
 import os
+import posixpath
 from typing import Iterable, Optional, Union, List, Any, Dict, Callable, Set
 from uuid import uuid4
@@ -28,6 +29,7 @@ from snowflake.ml._internal.utils.temp_file_utils import cleanup_temp_files, get
 from snowflake.snowpark import DataFrame, Session
 from snowflake.snowpark.functions import pandas_udf, sproc
 from snowflake.snowpark.types import PandasSeries
+from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 from snowflake.ml.model.model_signature import (
     DataType,
@@ -194,7 +196,6 @@ class GenericUnivariateSelect(BaseTransformer):
         sample_weight_col: Optional[str] = None,
     ) -> None:
         super().__init__()
-        self.id = str(uuid4()).replace("-", "_").upper()
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
         self._deps = list(deps)
@@ -216,6 +217,15 @@ class GenericUnivariateSelect(BaseTransformer):
         self.set_drop_input_cols(drop_input_cols)
         self.set_sample_weight_col(sample_weight_col)
+    def _get_rand_id(self) -> str:
+        """
+        Generate random id to be used in sproc and stage names.
+        Returns:
+            Random id string usable in sproc, table, and stage names.
+        """
+        return str(uuid4()).replace("-", "_").upper()
     def _infer_input_output_cols(self, dataset: Union[DataFrame, pd.DataFrame]) -> None:
         """
         Infer `self.input_cols` and `self.output_cols` if they are not explicitly set.
@@ -294,7 +304,7 @@ class GenericUnivariateSelect(BaseTransformer):
             cp.dump(self._sklearn_object, local_transform_file)
         # Create temp stage to run fit.
-        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self.id)
+        transform_stage_name = "SNOWML_TRANSFORM_{safe_id}".format(safe_id=self._get_rand_id())
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {transform_stage_name};"
         SqlResultValidator(
             session=session,
@@ -307,11 +317,12 @@ class GenericUnivariateSelect(BaseTransformer):
             expected_value=f"Stage area {transform_stage_name} successfully created."
         ).validate()
-        stage_transform_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        # Use posixpath to construct stage paths
+        stage_transform_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
+        stage_result_file_name = posixpath.join(transform_stage_name, os.path.basename(local_transform_file_name))
         local_result_file_name = get_temp_file_path()
-        stage_result_file_name = os.path.join(transform_stage_name, os.path.basename(local_transform_file_name))
-        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self.id)
+        fit_sproc_name = "SNOWML_FIT_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -337,6 +348,7 @@ class GenericUnivariateSelect(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def fit_wrapper_sproc(
             session: Session,
@@ -345,7 +357,8 @@ class GenericUnivariateSelect(BaseTransformer):
             stage_result_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> str:
             import cloudpickle as cp
             import numpy as np
@@ -412,15 +425,15 @@ class GenericUnivariateSelect(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        sproc_export_file_name = session.call(
-            fit_sproc_name,
+        sproc_export_file_name = fit_wrapper_sproc(
+            session,
             query,
             stage_transform_file_name,
             stage_result_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
         if "|" in sproc_export_file_name:
@@ -430,7 +443,7 @@ class GenericUnivariateSelect(BaseTransformer):
                 print("\n".join(fields[1:]))
         session.file.get(
-            os.path.join(stage_result_file_name, sproc_export_file_name),
+            posixpath.join(stage_result_file_name, sproc_export_file_name),
             local_result_file_name,
             statement_params=statement_params
         )
@@ -476,7 +489,7 @@ class GenericUnivariateSelect(BaseTransformer):
         # Register vectorized UDF for batch inference
         batch_inference_udf_name = "SNOWML_BATCH_INFERENCE_{safe_id}_{method}".format(
-                safe_id=self.id, method=inference_method)
+                safe_id=self._get_rand_id(), method=inference_method)
         # Need to do this since if we use self._sklearn_object directly in the UDF, Snowpark
         # will try to pickle all of self which fails.
@@ -568,7 +581,7 @@ class GenericUnivariateSelect(BaseTransformer):
             return transformed_pandas_df.to_dict("records")
         batch_inference_table_name = "SNOWML_BATCH_INFERENCE_INPUT_TABLE_{safe_id}".format(
-            safe_id=self.id
+            safe_id=self._get_rand_id()
         )
         pass_through_columns = self._get_pass_through_columns(dataset)
@@ -624,26 +637,37 @@ class GenericUnivariateSelect(BaseTransformer):
         # input cols need to match unquoted / quoted
         input_cols = self.input_cols
         unquoted_input_cols = identifier.get_unescaped_names(self.input_cols)
+        quoted_input_cols = identifier.get_escaped_names(unquoted_input_cols)
         estimator = self._sklearn_object
-        input_df = dataset[input_cols] # Select input columns with quoted column names.
-        if hasattr(estimator, "feature_names_in_"):
-            missing_features = []
-            for i, f in enumerate(getattr(estimator, "feature_names_in_")):
-                if i >= len(input_cols) or (input_cols[i] != f and unquoted_input_cols[i] != f):
-                    missing_features.append(f)
-            if len(missing_features) > 0:
-                raise ValueError(
-                    "The feature names should match with those that were passed during fit.\n"
-                    f"Features seen during fit call but not present in the input: {missing_features}\n"
-                    f"Features in the input dataframe : {input_cols}\n"
-                )
-            input_df.columns = getattr(estimator, "feature_names_in_")
-        else:
-            # Just rename the column names to unquoted identifiers.
-            input_df.columns = unquoted_input_cols # Replace the quoted columns identifier with unquoted column ids.
+        features_required_by_estimator =  getattr(estimator, "feature_names_in_") if hasattr(estimator, "feature_names_in_") else unquoted_input_cols
+        missing_features = []
+        features_in_dataset = set(dataset.columns)
+        columns_to_select = []
+        for i, f in enumerate(features_required_by_estimator):
+            if (
+                    i >= len(input_cols)
+                or (input_cols[i] != f and unquoted_input_cols[i] != f and quoted_input_cols[i] != f)
+                or (input_cols[i] not in features_in_dataset and unquoted_input_cols[i] not in features_in_dataset
+                    and quoted_input_cols[i] not in features_in_dataset)
+                ):
+                missing_features.append(f)
+            elif input_cols[i] in features_in_dataset:
+                columns_to_select.append(input_cols[i])
+            elif unquoted_input_cols[i] in features_in_dataset:
+                columns_to_select.append(unquoted_input_cols[i])
+            else:
+                columns_to_select.append(quoted_input_cols[i])
+        if len(missing_features) > 0:
+            raise ValueError(
+                "The feature names should match with those that were passed during fit.\n"
+                f"Features seen during fit call but not present in the input: {missing_features}\n"
+                f"Features in the input dataframe : {input_cols}\n"
+            )
+        input_df = dataset[columns_to_select]
+        input_df.columns = features_required_by_estimator
         transformed_numpy_array = getattr(estimator, inference_method)(
             input_df
@@ -722,11 +746,18 @@ class GenericUnivariateSelect(BaseTransformer):
             Transformed dataset.
         """
         if isinstance(dataset, DataFrame):
+            expected_type_inferred = ""
+            # when it is classifier, infer the datatype from label columns
+            if expected_type_inferred == "" and 'predict' in self.model_signatures:
+                expected_type_inferred = convert_sp_to_sf_type(
+                    self.model_signatures['predict'].outputs[0].as_snowpark_type()
+                )
             output_df = self._batch_inference(
                 dataset=dataset,
                 inference_method="predict",
                 expected_output_cols_list=self.output_cols,
-                expected_output_cols_type="",
+                expected_output_cols_type=expected_type_inferred,
             )
         elif isinstance(dataset, pd.DataFrame):
             output_df = self._sklearn_inference(
@@ -799,10 +830,10 @@ class GenericUnivariateSelect(BaseTransformer):
     def _get_output_column_names(self, output_cols_prefix: str) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns an empty list if current object is not a classifier or not yet fitted.
+        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
         """
         if getattr(self._sklearn_object, "classes_", None) is None:
-            return []
+            return [output_cols_prefix]
         classes = self._sklearn_object.classes_
         if isinstance(classes, numpy.ndarray):
@@ -1027,7 +1058,7 @@ class GenericUnivariateSelect(BaseTransformer):
             cp.dump(self._sklearn_object, local_score_file)
         # Create temp stage to run score.
-        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        score_stage_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         session = dataset._session
         stage_creation_query = f"CREATE OR REPLACE TEMPORARY STAGE {score_stage_name};"
         SqlResultValidator(
@@ -1041,8 +1072,9 @@ class GenericUnivariateSelect(BaseTransformer):
             expected_value=f"Stage area {score_stage_name} successfully created."
         ).validate()
-        stage_score_file_name = os.path.join(score_stage_name, os.path.basename(local_score_file_name))
-        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self.id)
+        # Use posixpath to construct stage paths
+        stage_score_file_name = posixpath.join(score_stage_name, os.path.basename(local_score_file_name))
+        score_sproc_name = "SNOWML_SCORE_{safe_id}".format(safe_id=self._get_rand_id())
         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=_SUBPROJECT,
@@ -1068,6 +1100,7 @@ class GenericUnivariateSelect(BaseTransformer):
             replace=True,
             session=session,
             statement_params=statement_params,
+            anonymous=True
         )
         def score_wrapper_sproc(
             session: Session,
@@ -1075,7 +1108,8 @@ class GenericUnivariateSelect(BaseTransformer):
             stage_score_file_name: str,
             input_cols: List[str],
             label_cols: List[str],
-            sample_weight_col: Optional[str]
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str]
         ) -> float:
             import cloudpickle as cp
             import numpy as np
@@ -1125,14 +1159,14 @@ class GenericUnivariateSelect(BaseTransformer):
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]),
         )
-        score = session.call(
-            score_sproc_name,
+        score = score_wrapper_sproc(
+            session,
             query,
             stage_score_file_name,
             identifier.get_unescaped_names(self.input_cols),
             identifier.get_unescaped_names(self.label_cols),
             identifier.get_unescaped_names(self.sample_weight_col),
-            statement_params=statement_params,
+            statement_params,
         )
         cleanup_temp_files([local_score_file_name])
@@ -1150,18 +1184,20 @@ class GenericUnivariateSelect(BaseTransformer):
             if self._sklearn_object._estimator_type == 'classifier':
                 outputs = _infer_signature(dataset[self.label_cols], "output")  # label columns is the desired type for output
                 outputs = _rename_features(outputs, self.output_cols)  # rename the output columns
-                self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+                self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)
             # For regressor, the type of predict is float64
             elif self._sklearn_object._estimator_type == 'regressor':
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in self.output_cols]
-                self._model_signature_dict["predict"] = ModelSignature(inputs, outputs)
+                self._model_signature_dict["predict"] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)
         for prob_func in PROB_FUNCTIONS:
             if hasattr(self, prob_func):
                 output_cols_prefix: str = f"{prob_func}_"
                 output_column_names = self._get_output_column_names(output_cols_prefix)
                 outputs = [FeatureSpec(dtype=DataType.DOUBLE, name=c) for c in output_column_names]
-                self._model_signature_dict[prob_func] = ModelSignature(inputs, outputs)
+                self._model_signature_dict[prob_func] = ModelSignature(inputs,
+                                                                       ([] if self._drop_input_cols else inputs) + outputs)
     @property
     def model_signatures(self) -> Dict[str, ModelSignature]:

snowflake-ml-python 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

snowflake-ml-python 1.0.1py3-none-any.whl → 1.0.3py3-none-any.whl