snowflake-ml-python 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
- snowflake/ml/_internal/file_utils.py +3 -3
- snowflake/ml/_internal/human_readable_id/adjectives.txt +128 -0
- snowflake/ml/_internal/human_readable_id/animals.txt +128 -0
- snowflake/ml/_internal/human_readable_id/hrid_generator.py +40 -0
- snowflake/ml/_internal/human_readable_id/hrid_generator_base.py +135 -0
- snowflake/ml/_internal/telemetry.py +11 -2
- snowflake/ml/_internal/utils/formatting.py +1 -1
- snowflake/ml/feature_store/feature_store.py +15 -106
- snowflake/ml/fileset/sfcfs.py +4 -3
- snowflake/ml/fileset/stage_fs.py +18 -0
- snowflake/ml/model/_api.py +9 -9
- snowflake/ml/model/_client/model/model_version_impl.py +20 -15
- snowflake/ml/model/_deploy_client/image_builds/docker_context.py +3 -9
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +3 -5
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +7 -6
- snowflake/ml/model/_model_composer/model_composer.py +10 -8
- snowflake/ml/model/_model_composer/model_method/function_generator.py +1 -1
- snowflake/ml/model/_model_composer/model_method/infer_table_function.py_template +2 -1
- snowflake/ml/model/_model_composer/model_method/model_method.py +2 -2
- snowflake/ml/model/_model_composer/model_runtime/_runtime_requirements.py +1 -1
- snowflake/ml/model/_packager/model_handlers/_base.py +2 -2
- snowflake/ml/model/_packager/model_handlers/_utils.py +5 -5
- snowflake/ml/model/_packager/model_handlers/custom.py +7 -7
- snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +2 -2
- snowflake/ml/model/_packager/model_handlers/llm.py +1 -1
- snowflake/ml/model/_packager/model_handlers/mlflow.py +1 -1
- snowflake/ml/model/_packager/model_handlers/pytorch.py +13 -10
- snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +214 -0
- snowflake/ml/model/_packager/model_handlers/sklearn.py +6 -6
- snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +15 -3
- snowflake/ml/model/_packager/model_handlers/tensorflow.py +8 -8
- snowflake/ml/model/_packager/model_handlers/torchscript.py +7 -7
- snowflake/ml/model/_packager/model_handlers/xgboost.py +8 -8
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +1 -1
- snowflake/ml/model/_packager/model_packager.py +8 -6
- snowflake/ml/model/custom_model.py +3 -1
- snowflake/ml/model/type_hints.py +13 -0
- snowflake/ml/modeling/_internal/estimator_utils.py +61 -1
- snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +4 -43
- snowflake/ml/modeling/_internal/local_implementations/pandas_trainer.py +4 -4
- snowflake/ml/modeling/_internal/ml_runtime_implementations/ml_runtime_handlers.py +21 -17
- snowflake/ml/modeling/_internal/model_specifications.py +3 -1
- snowflake/ml/modeling/_internal/model_trainer.py +2 -2
- snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +547 -1
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +67 -114
- snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +9 -9
- snowflake/ml/modeling/_internal/transformer_protocols.py +2 -3
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +33 -61
- snowflake/ml/modeling/cluster/affinity_propagation.py +33 -61
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +33 -61
- snowflake/ml/modeling/cluster/birch.py +33 -61
- snowflake/ml/modeling/cluster/bisecting_k_means.py +33 -61
- snowflake/ml/modeling/cluster/dbscan.py +33 -61
- snowflake/ml/modeling/cluster/feature_agglomeration.py +33 -61
- snowflake/ml/modeling/cluster/k_means.py +33 -61
- snowflake/ml/modeling/cluster/mean_shift.py +33 -61
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +33 -61
- snowflake/ml/modeling/cluster/optics.py +33 -61
- snowflake/ml/modeling/cluster/spectral_biclustering.py +33 -61
- snowflake/ml/modeling/cluster/spectral_clustering.py +33 -61
- snowflake/ml/modeling/cluster/spectral_coclustering.py +33 -61
- snowflake/ml/modeling/compose/column_transformer.py +33 -61
- snowflake/ml/modeling/compose/transformed_target_regressor.py +33 -61
- snowflake/ml/modeling/covariance/elliptic_envelope.py +33 -61
- snowflake/ml/modeling/covariance/empirical_covariance.py +33 -61
- snowflake/ml/modeling/covariance/graphical_lasso.py +33 -61
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +33 -61
- snowflake/ml/modeling/covariance/ledoit_wolf.py +33 -61
- snowflake/ml/modeling/covariance/min_cov_det.py +33 -61
- snowflake/ml/modeling/covariance/oas.py +33 -61
- snowflake/ml/modeling/covariance/shrunk_covariance.py +33 -61
- snowflake/ml/modeling/decomposition/dictionary_learning.py +33 -61
- snowflake/ml/modeling/decomposition/factor_analysis.py +33 -61
- snowflake/ml/modeling/decomposition/fast_ica.py +33 -61
- snowflake/ml/modeling/decomposition/incremental_pca.py +33 -61
- snowflake/ml/modeling/decomposition/kernel_pca.py +33 -61
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +33 -61
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +33 -61
- snowflake/ml/modeling/decomposition/pca.py +33 -61
- snowflake/ml/modeling/decomposition/sparse_pca.py +33 -61
- snowflake/ml/modeling/decomposition/truncated_svd.py +33 -61
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +33 -61
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +33 -61
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +33 -61
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +33 -61
- snowflake/ml/modeling/ensemble/bagging_classifier.py +33 -61
- snowflake/ml/modeling/ensemble/bagging_regressor.py +33 -61
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +33 -61
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +33 -61
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +33 -61
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +33 -61
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +33 -61
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +33 -61
- snowflake/ml/modeling/ensemble/isolation_forest.py +33 -61
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +33 -61
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +33 -61
- snowflake/ml/modeling/ensemble/stacking_regressor.py +33 -61
- snowflake/ml/modeling/ensemble/voting_classifier.py +33 -61
- snowflake/ml/modeling/ensemble/voting_regressor.py +33 -61
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +33 -61
- snowflake/ml/modeling/feature_selection/select_fdr.py +33 -61
- snowflake/ml/modeling/feature_selection/select_fpr.py +33 -61
- snowflake/ml/modeling/feature_selection/select_fwe.py +33 -61
- snowflake/ml/modeling/feature_selection/select_k_best.py +33 -61
- snowflake/ml/modeling/feature_selection/select_percentile.py +33 -61
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +33 -61
- snowflake/ml/modeling/feature_selection/variance_threshold.py +33 -61
- snowflake/ml/modeling/framework/base.py +55 -5
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +33 -61
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +33 -61
- snowflake/ml/modeling/impute/iterative_imputer.py +33 -61
- snowflake/ml/modeling/impute/knn_imputer.py +33 -61
- snowflake/ml/modeling/impute/missing_indicator.py +33 -61
- snowflake/ml/modeling/impute/simple_imputer.py +4 -15
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +33 -61
- snowflake/ml/modeling/kernel_approximation/nystroem.py +33 -61
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +33 -61
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +33 -61
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +33 -61
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +33 -61
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +36 -63
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +36 -63
- snowflake/ml/modeling/linear_model/ard_regression.py +33 -61
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +33 -61
- snowflake/ml/modeling/linear_model/elastic_net.py +33 -61
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +33 -61
- snowflake/ml/modeling/linear_model/gamma_regressor.py +33 -61
- snowflake/ml/modeling/linear_model/huber_regressor.py +33 -61
- snowflake/ml/modeling/linear_model/lars.py +33 -61
- snowflake/ml/modeling/linear_model/lars_cv.py +33 -61
- snowflake/ml/modeling/linear_model/lasso.py +33 -61
- snowflake/ml/modeling/linear_model/lasso_cv.py +33 -61
- snowflake/ml/modeling/linear_model/lasso_lars.py +33 -61
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +33 -61
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +33 -61
- snowflake/ml/modeling/linear_model/linear_regression.py +33 -61
- snowflake/ml/modeling/linear_model/logistic_regression.py +33 -61
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +33 -61
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +33 -61
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +33 -61
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +33 -61
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +33 -61
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +33 -61
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +33 -61
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +33 -61
- snowflake/ml/modeling/linear_model/perceptron.py +33 -61
- snowflake/ml/modeling/linear_model/poisson_regressor.py +33 -61
- snowflake/ml/modeling/linear_model/ransac_regressor.py +33 -61
- snowflake/ml/modeling/linear_model/ridge.py +33 -61
- snowflake/ml/modeling/linear_model/ridge_classifier.py +33 -61
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +33 -61
- snowflake/ml/modeling/linear_model/ridge_cv.py +33 -61
- snowflake/ml/modeling/linear_model/sgd_classifier.py +33 -61
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +33 -61
- snowflake/ml/modeling/linear_model/sgd_regressor.py +33 -61
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +33 -61
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +33 -61
- snowflake/ml/modeling/manifold/isomap.py +33 -61
- snowflake/ml/modeling/manifold/mds.py +33 -61
- snowflake/ml/modeling/manifold/spectral_embedding.py +33 -61
- snowflake/ml/modeling/manifold/tsne.py +33 -61
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +33 -61
- snowflake/ml/modeling/mixture/gaussian_mixture.py +33 -61
- snowflake/ml/modeling/model_selection/grid_search_cv.py +39 -57
- snowflake/ml/modeling/model_selection/randomized_search_cv.py +26 -57
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +33 -61
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +33 -61
- snowflake/ml/modeling/multiclass/output_code_classifier.py +33 -61
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +33 -61
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +33 -61
- snowflake/ml/modeling/naive_bayes/complement_nb.py +33 -61
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +33 -61
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +33 -61
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +33 -61
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +33 -61
- snowflake/ml/modeling/neighbors/kernel_density.py +33 -61
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +33 -61
- snowflake/ml/modeling/neighbors/nearest_centroid.py +33 -61
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +33 -61
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +33 -61
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +33 -61
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +33 -61
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +33 -61
- snowflake/ml/modeling/neural_network/mlp_classifier.py +33 -61
- snowflake/ml/modeling/neural_network/mlp_regressor.py +33 -61
- snowflake/ml/modeling/preprocessing/polynomial_features.py +33 -61
- snowflake/ml/modeling/semi_supervised/label_propagation.py +33 -61
- snowflake/ml/modeling/semi_supervised/label_spreading.py +33 -61
- snowflake/ml/modeling/svm/linear_svc.py +33 -61
- snowflake/ml/modeling/svm/linear_svr.py +33 -61
- snowflake/ml/modeling/svm/nu_svc.py +33 -61
- snowflake/ml/modeling/svm/nu_svr.py +33 -61
- snowflake/ml/modeling/svm/svc.py +33 -61
- snowflake/ml/modeling/svm/svr.py +33 -61
- snowflake/ml/modeling/tree/decision_tree_classifier.py +33 -61
- snowflake/ml/modeling/tree/decision_tree_regressor.py +33 -61
- snowflake/ml/modeling/tree/extra_tree_classifier.py +33 -61
- snowflake/ml/modeling/tree/extra_tree_regressor.py +33 -61
- snowflake/ml/modeling/xgboost/xgb_classifier.py +33 -61
- snowflake/ml/modeling/xgboost/xgb_regressor.py +33 -61
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +33 -61
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +33 -61
- snowflake/ml/registry/_manager/model_manager.py +6 -2
- snowflake/ml/registry/model_registry.py +100 -27
- snowflake/ml/registry/registry.py +6 -2
- snowflake/ml/version.py +1 -1
- {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/METADATA +43 -7
- {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/RECORD +211 -206
- {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/LICENSE.txt +0 -0
- {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/WHEEL +0 -0
- {snowflake_ml_python-1.3.0.dist-info → snowflake_ml_python-1.4.0.dist-info}/top_level.txt +0 -0
snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py

@@ -15,16 +15,16 @@ from snowflake.ml._internal.utils.temp_file_utils import (
     cleanup_temp_files,
     get_temp_file_path,
 )
-from snowflake.
+from snowflake.ml.modeling._internal.estimator_utils import handle_inference_result
+from snowflake.snowpark import DataFrame, Session, functions as F, types as T
 from snowflake.snowpark._internal.utils import (
     TempObjectType,
     random_name_for_temp_object,
 )
-from snowflake.snowpark.functions import pandas_udf, sproc
-from snowflake.snowpark.types import PandasSeries

 cp.register_pickle_by_value(inspect.getmodule(get_temp_file_path))
 cp.register_pickle_by_value(inspect.getmodule(identifier.get_inferred_name))
+cp.register_pickle_by_value(inspect.getmodule(handle_inference_result))

 _PROJECT = "ModelDevelopment"

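Why the new registration matters: cloudpickle's register_pickle_by_value makes the module that defines handle_inference_result serialize by value, so the helper's code travels inside the pickled payload and is usable in the UDF/sproc sandbox without being installed there. A minimal, self-contained sketch of the mechanism (the stub below is illustrative, not the library's helper):

import inspect

import cloudpickle as cp

def handle_inference_result_stub(res):
    # Stand-in for snowflake.ml.modeling._internal.estimator_utils.handle_inference_result.
    return res

# Ship the defining module's code inside the pickle rather than by reference.
cp.register_pickle_by_value(inspect.getmodule(handle_inference_result_stub))
restored = cp.loads(cp.dumps(handle_inference_result_stub))
assert restored([1, 2]) == [1, 2]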
@@ -67,9 +67,9 @@ class SnowparkTransformHandlers:
         inference_method: str,
         input_cols: List[str],
         expected_output_cols: List[str],
-        pass_through_cols: List[str],
         session: Session,
         dependencies: List[str],
+        drop_input_cols: Optional[bool] = False,
         expected_output_cols_type: Optional[str] = "",
         *args: Any,
         **kwargs: Any,
@@ -81,8 +81,8 @@ class SnowparkTransformHandlers:
             dependencies: List of dependencies for the transformer.
             inference_method: the name of the method used by `estimator` to run inference.
             input_cols: List of feature columns for inference.
-            pass_through_cols: columns in the dataset not used in inference.
             expected_output_cols: column names (in order) of the output dataset.
+            drop_input_cols: Boolean to determine whether to drop the input columns from the output dataset.
             expected_output_cols_type: Expected type of the output columns.
             args: additional positional arguments.
             kwargs: additional keyword args.
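Taken together, the two hunks above swap the caller-computed pass_through_cols list for a single drop_input_cols flag. A hypothetical call site under the new signature (the method name batch_inference and the handler object are assumptions inferred from BatchInferenceKwargsTypedDict below, not confirmed by this diff):

from typing import Any

from snowflake.snowpark import Session

def run_batch_inference(handlers: Any, session: Session) -> Any:
    # Callers no longer enumerate the columns to carry through; they only say
    # whether the feature columns should be dropped from the result.
    return handlers.batch_inference(
        inference_method="predict",
        input_cols=["SEPAL_LENGTH", "SEPAL_WIDTH"],
        expected_output_cols=["OUTPUT_PREDICTION"],
        session=session,
        dependencies=["scikit-learn", "pandas"],
        drop_input_cols=False,  # replaces pass_through_cols=[...]
        expected_output_cols_type="float",
    )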
@@ -95,141 +95,94 @@ class SnowparkTransformHandlers:
         estimator = self.estimator
         # Register vectorized UDF for batch inference
         batch_inference_udf_name = random_name_for_temp_object(TempObjectType.FUNCTION)
-
+
         dataset = snowpark_dataframe_utils.cast_snowpark_dataframe_column_types(dataset)
+        # Align the input_cols with snowpark dataframe's column name
+        # This step also makes sure that the every col in input_cols exists in the current dataset
+        snowpark_cols = dataset.select(input_cols).columns
+
+        # Infer the datatype from input dataset's schema for batch inference
+        # This is required before registering the UDTF
+        fields = dataset.select(input_cols).schema.fields
+        input_datatypes = []
+        for field in fields:
+            input_datatypes.append(field.datatype)

         statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=self._subproject,
             function_name=telemetry.get_statement_params_full_func_name(inspect.currentframe(), self._class_name),
-            api_calls=[pandas_udf],
+            api_calls=[F.pandas_udf],
             custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
         )

-        @pandas_udf(  # type: ignore[arg-type, misc]
+        @F.pandas_udf(  # type: ignore[arg-type, misc]
             is_permanent=False,
             name=batch_inference_udf_name,
             packages=dependencies,  # type: ignore[arg-type]
             replace=True,
             session=session,
             statement_params=statement_params,
+            input_types=[T.PandasDataFrameType(input_datatypes)],
         )
-        def vec_batch_infer(
-            import
-
-            import numpy as np
+        def vec_batch_infer(input_df: pd.DataFrame) -> T.PandasSeries[dict]:  # type: ignore[type-arg]
+            import numpy as np  # noqa: F401
             import pandas as pd

-            input_df =
-
-
-
-
-            # Model expects exact same columns names in the input df for predict call.
-
-            input_df = input_df[input_cols]  # Select input columns with quoted column names.
-            if hasattr(estimator, "feature_names_in_"):
-                missing_features = []
-                for i, f in enumerate(getattr(estimator, "feature_names_in_", {})):
-                    if i >= len(input_cols) or (input_cols[i] != f and snowpark_cols[i] != f):
-                        missing_features.append(f)
-
-                if len(missing_features) > 0:
-                    raise ValueError(
-                        "The feature names should match with those that were passed during fit.\n"
-                        f"Features seen during fit call but not present in the input: {missing_features}\n"
-                        f"Features in the input dataframe : {input_cols}\n"
-                    )
-                input_df.columns = getattr(estimator, "feature_names_in_", {})
-            else:
-                # Just rename the column names to unquoted identifiers.
-                input_df.columns = snowpark_cols  # Replace the quoted columns identifier with unquoted column ids.
+            input_df.columns = snowpark_cols
+
+            if hasattr(estimator, "n_jobs"):
+                # Vectorized UDF cannot handle joblib multiprocessing right now, deactivate the n_jobs
+                estimator.n_jobs = 1
             inference_res = getattr(estimator, inference_method)(input_df, *args, **kwargs)
-            if isinstance(inference_res, list) and len(inference_res) > 0 and isinstance(inference_res[0], np.ndarray):
-                # In case of multioutput estimators, predict_proba, decision_function etc., functions return a list of
-                # ndarrays. We need to concatenate them.
-                transformed_numpy_array = np.concatenate(inference_res, axis=1)
-            elif (
-                isinstance(inference_res, tuple) and len(inference_res) > 0 and isinstance(inference_res[0], np.ndarray)
-            ):
-                # In case of kneighbors, functions return a tuple of ndarrays.
-                transformed_numpy_array = np.stack(inference_res, axis=1)
-            elif isinstance(inference_res, numbers.Number):
-                # In case of BernoulliRBM, functions return a float
-                transformed_numpy_array = np.array([inference_res])
-            else:
-                transformed_numpy_array = inference_res

-
-
-
-
-
+            transformed_numpy_array, output_cols = handle_inference_result(
+                inference_res=inference_res,
+                output_cols=expected_output_cols,
+                inference_method=inference_method,
+                within_udf=True,
+            )

             if len(transformed_numpy_array.shape) > 1:
-                if transformed_numpy_array.shape[1] != len(
-                # HeterogeneousEnsemble's transform method produce results with variying shapes
-                # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes).
-                # It is hard to predict the response shape without using fragile introspection logic.
-                # So, to avoid that we are packing the results into a dataframe of shape (n_samples, 1) with
-                # each element being a list.
-                if len(expected_output_cols) != 1:
-                    raise TypeError(
-                        "expected_output_cols must be same length as transformed array or " "should be of length 1"
-                    )
+                if transformed_numpy_array.shape[1] != len(output_cols):
                     series = pd.Series(transformed_numpy_array.tolist())
-                    transformed_pandas_df = pd.DataFrame(series, columns=
+                    transformed_pandas_df = pd.DataFrame(series, columns=output_cols)
                 else:
-                    transformed_pandas_df = pd.DataFrame(transformed_numpy_array.tolist(), columns=
+                    transformed_pandas_df = pd.DataFrame(transformed_numpy_array.tolist(), columns=output_cols)
             else:
-                transformed_pandas_df = pd.DataFrame(transformed_numpy_array, columns=
+                transformed_pandas_df = pd.DataFrame(transformed_numpy_array, columns=output_cols)

             return transformed_pandas_df.to_dict("records")  # type: ignore[no-any-return]

-
-
-        #
-
-
-
-
-
-
-
-
-
-
-
-
-
-        ]
-
-
-
-
-
-
-
-
-
-
-
-
-            SELECT
-                {outer_select_stmt}
-            FROM (
-                SELECT
-                    {inner_select_stmt}
-                FROM {input_table_name}
-            )
-            """.format(
-                input_table_name=batch_inference_table_name,
-                query=query_from_df,
-                outer_select_stmt=", ".join(outer_select_list),
-                inner_select_stmt=", ".join(inner_select_list),
-            )
-
-        return session.sql(sql)
+        # Run Transform and get intermediate result
+        INTERMEDIATE_OBJ_NAME = "tmp_result"
+        # Use snowpark_cols can make sure the name ordering of the input dataframe
+        # and only select those columns to put into vectorized udf
+        output_obj = F.call_udf(batch_inference_udf_name, [F.col(col_name) for col_name in snowpark_cols])
+        df_res: DataFrame = dataset.with_column(INTERMEDIATE_OBJ_NAME, output_obj)
+
+        # Prepare the output
+        output_cols = []
+        output_col_names = []
+        # When there is no expected_output_cols_type, default set it as StringType
+        # snowpark cannot handle empty string, so this step give "string" value
+        if expected_output_cols_type == "":
+            expected_output_cols_type = "string"
+        assert expected_output_cols_type is not None
+        for output_feature in expected_output_cols:
+            output_cols.append(F.col(INTERMEDIATE_OBJ_NAME)[output_feature].astype(expected_output_cols_type))
+            output_col_names.append(identifier.get_inferred_name(output_feature))
+
+        # Extract output from INTERMEDIATE_OBJ_NAME and drop that column
+        df_res = df_res.with_columns(
+            output_col_names,
+            output_cols,
+        ).drop(INTERMEDIATE_OBJ_NAME)
+
+        if drop_input_cols:
+            df_res = df_res.drop(*input_cols)
+
+        return df_res

     def score(
         self,
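The rewritten body above is the heart of the 1.4.0 change: inference results are packed into one dict per row by a vectorized UDF whose input is explicitly typed with PandasDataFrameType, and typed output columns are then extracted from the intermediate dict column. A condensed sketch of that pattern, assuming a live Snowpark session (the echo "inference" and column names are placeholders, not the library's code):

from typing import List

import pandas as pd
from snowflake.snowpark import DataFrame, Session, functions as F, types as T

def demo_vectorized_inference(session: Session, dataset: DataFrame, input_cols: List[str]) -> DataFrame:
    # Infer input datatypes from the schema, as the new code does before registration.
    input_datatypes = [f.datatype for f in dataset.select(input_cols).schema.fields]

    @F.pandas_udf(
        is_permanent=False,
        replace=True,
        session=session,
        packages=["pandas"],
        input_types=[T.PandasDataFrameType(input_datatypes)],
    )
    def vec_infer(df: pd.DataFrame) -> T.PandasSeries[dict]:  # type: ignore[type-arg]
        # Stand-in inference: emit the first input column as the per-row prediction dict.
        return pd.Series(pd.DataFrame({"OUTPUT_PREDICTION": df.iloc[:, 0]}).to_dict("records"))

    # Intermediate dict column, then typed extraction and cleanup.
    tmp = dataset.with_column("TMP_RESULT", vec_infer(*[F.col(c) for c in input_cols]))
    return tmp.with_column(
        "OUTPUT_PREDICTION", F.col("TMP_RESULT")["OUTPUT_PREDICTION"].astype("float")
    ).drop("TMP_RESULT")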
@@ -287,7 +240,7 @@ class SnowparkTransformHandlers:
             function_name=telemetry.get_statement_params_full_func_name(
                 inspect.currentframe(), self.__class__.__name__
             ),
-            api_calls=[sproc],
+            api_calls=[F.sproc],
             custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
         )
         # Put locally serialized score on stage.
@@ -299,7 +252,7 @@ class SnowparkTransformHandlers:
             statement_params=statement_params,
         )

-        @sproc(  # type: ignore[misc]
+        @F.sproc(  # type: ignore[misc]
             is_permanent=False,
             name=score_sproc_name,
             packages=dependencies,  # type: ignore[arg-type]
snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py

@@ -279,7 +279,7 @@ class SnowparkModelTrainer:
     def _build_fit_predict_wrapper_sproc(
         self,
         model_spec: ModelSpecifications,
-    ) -> Callable[[Session, List[str], str, str, List[str], Dict[str, str],
+    ) -> Callable[[Session, List[str], str, str, List[str], Dict[str, str], bool, List[str], str], str]:
         """
         Constructs and returns a python stored procedure function to be used for training model.

@@ -299,7 +299,7 @@ class SnowparkModelTrainer:
             stage_result_file_name: str,
             input_cols: List[str],
             statement_params: Dict[str, str],
-
+            drop_input_cols: bool,
             expected_output_cols_list: List[str],
             fit_predict_result_name: str,
         ) -> str:
@@ -345,12 +345,12 @@ class SnowparkModelTrainer:
             )

             # store the predict output
-            if
-                df = df.copy()
+            if drop_input_cols:
                 fit_predict_result_pd = pd.DataFrame(data=fit_predict_result, columns=expected_output_cols_list)
-                fit_predict_result_pd = pd.concat([df, fit_predict_result_pd], axis=1)
             else:
+                df = df.copy()
                 fit_predict_result_pd = pd.DataFrame(data=fit_predict_result, columns=expected_output_cols_list)
+                fit_predict_result_pd = pd.concat([df, fit_predict_result_pd], axis=1)

             # write into a temp table in sproc and load the table from outside
             session.write_pandas(
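The corrected branch logic reads more clearly in isolation. A standalone pandas sketch of the two paths (df and fit_predict_result are stand-ins for the sproc's local variables):

import numpy as np
import pandas as pd

df = pd.DataFrame({"X1": [1.0, 2.0]})
fit_predict_result = np.array([[0], [1]])
expected_output_cols_list = ["OUTPUT_LABEL"]

for drop_input_cols in (True, False):
    if drop_input_cols:
        # Only the prediction columns survive.
        out = pd.DataFrame(data=fit_predict_result, columns=expected_output_cols_list)
    else:
        # Predictions are appended to a copy of the input frame.
        out = pd.concat(
            [df.copy(), pd.DataFrame(data=fit_predict_result, columns=expected_output_cols_list)],
            axis=1,
        )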
@@ -463,18 +463,18 @@ class SnowparkModelTrainer:

     def train_fit_predict(
         self,
-        pass_through_columns: List[str],
         expected_output_cols_list: List[str],
+        drop_input_cols: Optional[bool] = False,
     ) -> Tuple[Union[DataFrame, pd.DataFrame], object]:
         """Trains the model by pushing down the compute into Snowflake using stored procedures.
         This API is different from fit itself because it would also provide the predict
         output.

         Args:
-            pass_through_columns (List[str]): The column names that would
-                display in the returned dataset.
             expected_output_cols_list (List[str]): The output columns
                 name as a list. Defaults to None.
+            drop_input_cols (Optional[bool]): Boolean to determine drop
+                the input columns from the output dataset or not

         Returns:
             Tuple[Union[DataFrame, pd.DataFrame], object]: [predicted dataset, estimator]
@@ -508,7 +508,7 @@ class SnowparkModelTrainer:
             stage_result_file_name,
             self.input_cols,
             statement_params,
-
+            drop_input_cols,
             expected_output_cols_list,
             fit_predict_result_name,
         )
snowflake/ml/modeling/_internal/transformer_protocols.py

@@ -107,9 +107,9 @@ class RemoteModelTransformHandlers(Protocol):
         inference_method: str,
         input_cols: List[str],
         expected_output_cols: List[str],
-        pass_through_cols: List[str],
         session: snowpark.Session,
         dependencies: List[str],
+        drop_input_cols: Optional[bool] = False,
         expected_output_cols_type: Optional[str] = "",
         *args: Any,
         **kwargs: Any,
@@ -121,9 +121,9 @@ class RemoteModelTransformHandlers(Protocol):
             dependencies: List of dependencies for the transformer.
             inference_method: the name of the method used by `estimator` to run inference.
             input_cols: List of feature columns for inference.
-            pass_through_cols: columns in the dataset not used in inference.
             expected_output_cols: column names (in order) of the output dataset.
             expected_output_cols_type: Expected type of the output columns.
+            drop_input_cols: Boolean to determine drop the input columns from the output dataset or not
             args: additional positional arguments.
             kwargs: additional keyword args.

@@ -175,7 +175,6 @@ class BatchInferenceKwargsTypedDict(TypedDict, total=False):

     snowpark_input_cols: Optional[List[str]]
     drop_input_cols: Optional[bool]
-    pass_through_cols: List[str]
     session: snowpark.Session
     dependencies: List[str]
     expected_output_cols_type: str
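Because the TypedDict is declared with total=False, every key stays optional; dropping pass_through_cols means drop_input_cols alone now controls the output shape. A minimal sketch of the same declaration style (abbreviated to keys shown in this hunk; the class name is shortened for the sketch):

from typing import List, Optional, TypedDict

class BatchInferenceKwargs(TypedDict, total=False):
    snowpark_input_cols: Optional[List[str]]
    drop_input_cols: Optional[bool]
    dependencies: List[str]
    expected_output_cols_type: str

kwargs: BatchInferenceKwargs = {"drop_input_cols": True, "expected_output_cols_type": "float"}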
snowflake/ml/modeling/calibration/calibrated_classifier_cv.py (the same +33/-61 change is repeated across the generated estimators listed above)

@@ -328,18 +328,24 @@ class CalibratedClassifierCV(BaseTransformer):
         self._get_model_signatures(dataset)
         return self

-    def _get_pass_through_columns(self, dataset: DataFrame) -> List[str]:
-        if self._drop_input_cols:
-            return []
-        else:
-            return list(set(dataset.columns) - set(self.output_cols))
-
     def _batch_inference_validate_snowpark(
         self,
         dataset: DataFrame,
         inference_method: str,
     ) -> List[str]:
-        """Util method to run validate that batch inference can be run on a snowpark dataframe
+        """Util method to run validate that batch inference can be run on a snowpark dataframe and
+        return the available package that exists in the snowflake anaconda channel
+
+        Args:
+            dataset: snowpark dataframe
+            inference_method: the inference method such as predict, score...
+
+        Raises:
+            SnowflakeMLException: If the estimator is not fitted, raise error
+            SnowflakeMLException: If the session is None, raise error
+
+        Returns:
+            A list of available package that exists in the snowflake anaconda channel
         """
         if not self._is_fitted:
             raise exceptions.SnowflakeMLException(
@@ -413,7 +419,7 @@ class CalibratedClassifierCV(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_type_inferred,
             )

@@ -473,16 +479,16 @@ class CalibratedClassifierCV(BaseTransformer):
             # from (n_samples, n_estimators) to (n_samples, n_estimators * n_classes) (and everything in between)
             # based on init param values. We will convert that to pandas dataframe of shape (n_samples, 1) with
             # each row containing a list of values.
-            expected_dtype = "
+            expected_dtype = "array"

             # If we were unable to assign a type to this transform in the factory, infer the type here.
             if expected_dtype == "":
-                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "array"
                 if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
-                    expected_dtype = "
-                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "
+                    expected_dtype = "array"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "array"
                 elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
-                    expected_dtype = "
+                    expected_dtype = "array"
                 else:
                     output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
                     # We can only infer the output types from the input types if the following two statemetns are true:
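The restored comments spell out the rule: when a clustering or decomposition transformer emits a different number of values per row than there are output columns, results are packed into a single "array" column rather than one column per value. A self-contained sketch of that check (sklearn's KMeans stands in for self._sklearn_object; the function name is illustrative):

from sklearn.cluster import KMeans

def infer_expected_dtype(sklearn_object, output_cols):
    # Mirrors the n_clusters / n_components comparison in the hunk above.
    if hasattr(sklearn_object, "n_clusters") and sklearn_object.n_clusters != len(output_cols):
        return "array"
    if hasattr(sklearn_object, "n_components") and getattr(sklearn_object, "n_components") != len(output_cols):
        return "array"
    return ""

assert infer_expected_dtype(KMeans(n_clusters=8), ["OUTPUT_0"]) == "array"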
@@ -500,7 +506,7 @@ class CalibratedClassifierCV(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type = expected_dtype,
             )

@@ -551,7 +557,7 @@ class CalibratedClassifierCV(BaseTransformer):
             subproject=_SUBPROJECT,
         )
         output_result, fitted_estimator = model_trainer.train_fit_predict(
-
+            drop_input_cols=self._drop_input_cols,
             expected_output_cols_list=self.output_cols if self.output_cols else self._get_output_column_names(output_cols_prefix),
         )
         self._sklearn_object = fitted_estimator
@@ -569,44 +575,6 @@ class CalibratedClassifierCV(BaseTransformer):
         assert self._sklearn_object is not None
         return self._sklearn_object.embedding_

-
-    def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
-        """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
-        Returns a list with output_cols_prefix as the only element if the estimator is not a classifier.
-        """
-        output_cols_prefix = identifier.resolve_identifier(output_cols_prefix)
-        if output_cols:
-            output_cols = [
-                identifier.concat_names([output_cols_prefix, identifier.resolve_identifier(c)])
-                for c in output_cols
-            ]
-        elif getattr(self._sklearn_object, "classes_", None) is None:
-            output_cols = [output_cols_prefix]
-        elif self._sklearn_object is not None:
-            classes = self._sklearn_object.classes_
-            if isinstance(classes, numpy.ndarray):
-                output_cols = [f'{output_cols_prefix}{str(c)}' for c in classes.tolist()]
-            elif isinstance(classes, list) and len(classes) > 0 and isinstance(classes[0], numpy.ndarray):
-                # If the estimator is a multioutput estimator, classes_ will be a list of ndarrays.
-                output_cols = []
-                for i, cl in enumerate(classes):
-                    # For binary classification, there is only one output column for each class
-                    # ndarray as the two classes are complementary.
-                    if len(cl) == 2:
-                        output_cols.append(f'{output_cols_prefix}{i}_{cl[0]}')
-                    else:
-                        output_cols.extend([
-                            f'{output_cols_prefix}{i}_{c}' for c in cl.tolist()
-                        ])
-        else:
-            output_cols = []
-
-        # Make sure column names are valid snowflake identifiers.
-        assert output_cols is not None  # Make MyPy happy
-        rv = [identifier.rename_to_valid_snowflake_identifier(c) for c in output_cols]
-
-        return rv
-
     @available_if(original_estimator_has_callable("predict_proba"))  # type: ignore[misc]
     @telemetry.send_api_usage_telemetry(
         project=_PROJECT,
@@ -648,7 +616,7 @@ class CalibratedClassifierCV(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )

@@ -715,7 +683,7 @@ class CalibratedClassifierCV(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )
         elif isinstance(dataset, pd.DataFrame):
@@ -776,7 +744,7 @@ class CalibratedClassifierCV(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )

@@ -841,7 +809,7 @@ class CalibratedClassifierCV(BaseTransformer):
             transform_kwargs = dict(
                 session=dataset._session,
                 dependencies=self._deps,
-
+                drop_input_cols = self._drop_input_cols,
                 expected_output_cols_type="float",
             )

@@ -897,13 +865,17 @@ class CalibratedClassifierCV(BaseTransformer):
         transform_kwargs: ScoreKwargsTypedDict = dict()

         if isinstance(dataset, DataFrame):
+            self._deps = self._batch_inference_validate_snowpark(
+                dataset=dataset,
+                inference_method="score",
+            )
             selected_cols = self._get_active_columns()
             if len(selected_cols) > 0:
                 dataset = dataset.select(selected_cols)
             assert isinstance(dataset._session, Session)  # keep mypy happy
             transform_kwargs = dict(
                 session=dataset._session,
-                dependencies=["snowflake-snowpark-python"] + self.
+                dependencies=["snowflake-snowpark-python"] + self._deps,
                 score_sproc_imports=['sklearn'],
             )
         elif isinstance(dataset, pd.DataFrame):
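score() now validates the Snowpark path up front and reuses the validation result: the returned list of Anaconda-channel packages becomes the dependency set for the score sproc, prepended with snowflake-snowpark-python. A sketch of that flow under assumed names (the estimator argument stands in for self; the function is illustrative, not the library's API):

from typing import Any, Dict, List

def build_score_kwargs(estimator: Any, dataset: Any) -> Dict[str, Any]:
    # Validation doubles as dependency resolution against the Snowflake Anaconda channel.
    deps: List[str] = estimator._batch_inference_validate_snowpark(
        dataset=dataset, inference_method="score"
    )
    return dict(
        session=dataset._session,
        dependencies=["snowflake-snowpark-python"] + deps,
        score_sproc_imports=["sklearn"],
    )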
@@ -977,9 +949,9 @@ class CalibratedClassifierCV(BaseTransformer):
             transform_kwargs = dict(
                 session = dataset._session,
                 dependencies = self._deps,
-
-                expected_output_cols_type
-                n_neighbors =
+                drop_input_cols = self._drop_input_cols,
+                expected_output_cols_type="array",
+                n_neighbors = n_neighbors,
                 return_distance = return_distance
             )
         elif isinstance(dataset, pd.DataFrame):
|