snowflake-ml-python 1.1.2__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as published to their public registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in those registries.
- snowflake/ml/{model/_deploy_client/utils → _internal/container_services/image_registry}/imagelib.py +3 -1
- snowflake/ml/{model/_deploy_client/utils/image_registry_client.py → _internal/container_services/image_registry/registry_client.py} +4 -2
- snowflake/ml/_internal/env_utils.py +31 -52
- snowflake/ml/_internal/file_utils.py +17 -0
- snowflake/ml/_internal/telemetry.py +19 -0
- snowflake/ml/_internal/utils/query_result_checker.py +8 -5
- snowflake/ml/_internal/utils/snowflake_env.py +95 -0
- snowflake/ml/fileset/parquet_parser.py +31 -1
- snowflake/ml/model/__init__.py +6 -0
- snowflake/ml/model/_client/model/model_impl.py +172 -13
- snowflake/ml/model/_client/model/model_version_impl.py +96 -52
- snowflake/ml/model/_client/ops/metadata_ops.py +1 -3
- snowflake/ml/model/_client/ops/model_ops.py +155 -9
- snowflake/ml/model/_client/sql/model.py +55 -10
- snowflake/ml/model/_client/sql/model_version.py +72 -61
- snowflake/ml/model/_client/sql/stage.py +10 -4
- snowflake/ml/model/_client/sql/tag.py +118 -0
- snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +2 -2
- snowflake/ml/model/_deploy_client/image_builds/docker_context.py +8 -8
- snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +4 -6
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +6 -7
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +4 -5
- snowflake/ml/model/_deploy_client/snowservice/instance_types.py +9 -1
- snowflake/ml/model/_deploy_client/warehouse/deploy.py +20 -11
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +45 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +30 -0
- snowflake/ml/model/_model_composer/model_method/function_generator.py +2 -1
- snowflake/ml/model/_model_composer/model_runtime/_runtime_requirements.py +10 -1
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +10 -7
- snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +1 -1
- snowflake/ml/model/_packager/model_handlers/xgboost.py +13 -2
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +11 -1
- snowflake/ml/model/_packager/model_meta/_packaging_requirements.py +3 -0
- snowflake/ml/model/_packager/model_meta/model_meta.py +17 -3
- snowflake/ml/model/_signatures/core.py +20 -17
- snowflake/ml/model/custom_model.py +30 -27
- snowflake/ml/model/model_signature.py +16 -17
- snowflake/ml/model/type_hints.py +3 -0
- snowflake/ml/modeling/_internal/distributed_hpo_trainer.py +185 -98
- snowflake/ml/modeling/_internal/estimator_utils.py +21 -0
- snowflake/ml/modeling/_internal/model_specifications.py +3 -10
- snowflake/ml/modeling/_internal/model_trainer_builder.py +55 -11
- snowflake/ml/modeling/_internal/snowpark_handlers.py +9 -6
- snowflake/ml/modeling/_internal/snowpark_trainer.py +10 -2
- snowflake/ml/modeling/_internal/xgboost_external_memory_trainer.py +444 -0
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +51 -16
- snowflake/ml/modeling/cluster/affinity_propagation.py +51 -16
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +51 -16
- snowflake/ml/modeling/cluster/birch.py +51 -16
- snowflake/ml/modeling/cluster/bisecting_k_means.py +51 -16
- snowflake/ml/modeling/cluster/dbscan.py +51 -16
- snowflake/ml/modeling/cluster/feature_agglomeration.py +51 -16
- snowflake/ml/modeling/cluster/k_means.py +51 -16
- snowflake/ml/modeling/cluster/mean_shift.py +51 -16
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +51 -16
- snowflake/ml/modeling/cluster/optics.py +51 -16
- snowflake/ml/modeling/cluster/spectral_biclustering.py +51 -16
- snowflake/ml/modeling/cluster/spectral_clustering.py +51 -16
- snowflake/ml/modeling/cluster/spectral_coclustering.py +51 -16
- snowflake/ml/modeling/compose/column_transformer.py +51 -16
- snowflake/ml/modeling/compose/transformed_target_regressor.py +51 -16
- snowflake/ml/modeling/covariance/elliptic_envelope.py +51 -16
- snowflake/ml/modeling/covariance/empirical_covariance.py +51 -16
- snowflake/ml/modeling/covariance/graphical_lasso.py +51 -16
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +51 -16
- snowflake/ml/modeling/covariance/ledoit_wolf.py +51 -16
- snowflake/ml/modeling/covariance/min_cov_det.py +51 -16
- snowflake/ml/modeling/covariance/oas.py +51 -16
- snowflake/ml/modeling/covariance/shrunk_covariance.py +51 -16
- snowflake/ml/modeling/decomposition/dictionary_learning.py +51 -16
- snowflake/ml/modeling/decomposition/factor_analysis.py +51 -16
- snowflake/ml/modeling/decomposition/fast_ica.py +51 -16
- snowflake/ml/modeling/decomposition/incremental_pca.py +51 -16
- snowflake/ml/modeling/decomposition/kernel_pca.py +51 -16
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +51 -16
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +51 -16
- snowflake/ml/modeling/decomposition/pca.py +51 -16
- snowflake/ml/modeling/decomposition/sparse_pca.py +51 -16
- snowflake/ml/modeling/decomposition/truncated_svd.py +51 -16
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +51 -16
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +51 -16
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +51 -16
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +51 -16
- snowflake/ml/modeling/ensemble/bagging_classifier.py +51 -16
- snowflake/ml/modeling/ensemble/bagging_regressor.py +51 -16
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +51 -16
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +51 -16
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +51 -16
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +51 -16
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +51 -16
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +51 -16
- snowflake/ml/modeling/ensemble/isolation_forest.py +51 -16
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +51 -16
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +51 -16
- snowflake/ml/modeling/ensemble/stacking_regressor.py +51 -16
- snowflake/ml/modeling/ensemble/voting_classifier.py +51 -16
- snowflake/ml/modeling/ensemble/voting_regressor.py +51 -16
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +51 -16
- snowflake/ml/modeling/feature_selection/select_fdr.py +51 -16
- snowflake/ml/modeling/feature_selection/select_fpr.py +51 -16
- snowflake/ml/modeling/feature_selection/select_fwe.py +51 -16
- snowflake/ml/modeling/feature_selection/select_k_best.py +51 -16
- snowflake/ml/modeling/feature_selection/select_percentile.py +51 -16
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +51 -16
- snowflake/ml/modeling/feature_selection/variance_threshold.py +51 -16
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +51 -16
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +51 -16
- snowflake/ml/modeling/impute/iterative_imputer.py +51 -16
- snowflake/ml/modeling/impute/knn_imputer.py +51 -16
- snowflake/ml/modeling/impute/missing_indicator.py +51 -16
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +51 -16
- snowflake/ml/modeling/kernel_approximation/nystroem.py +51 -16
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +51 -16
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +51 -16
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +51 -16
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +51 -16
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +51 -16
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +51 -16
- snowflake/ml/modeling/linear_model/ard_regression.py +51 -16
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +51 -16
- snowflake/ml/modeling/linear_model/elastic_net.py +51 -16
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +51 -16
- snowflake/ml/modeling/linear_model/gamma_regressor.py +51 -16
- snowflake/ml/modeling/linear_model/huber_regressor.py +51 -16
- snowflake/ml/modeling/linear_model/lars.py +51 -16
- snowflake/ml/modeling/linear_model/lars_cv.py +51 -16
- snowflake/ml/modeling/linear_model/lasso.py +51 -16
- snowflake/ml/modeling/linear_model/lasso_cv.py +51 -16
- snowflake/ml/modeling/linear_model/lasso_lars.py +51 -16
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +51 -16
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +51 -16
- snowflake/ml/modeling/linear_model/linear_regression.py +51 -16
- snowflake/ml/modeling/linear_model/logistic_regression.py +51 -16
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +51 -16
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +51 -16
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +51 -16
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +51 -16
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +51 -16
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +51 -16
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +51 -16
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +51 -16
- snowflake/ml/modeling/linear_model/perceptron.py +51 -16
- snowflake/ml/modeling/linear_model/poisson_regressor.py +51 -16
- snowflake/ml/modeling/linear_model/ransac_regressor.py +51 -16
- snowflake/ml/modeling/linear_model/ridge.py +51 -16
- snowflake/ml/modeling/linear_model/ridge_classifier.py +51 -16
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +51 -16
- snowflake/ml/modeling/linear_model/ridge_cv.py +51 -16
- snowflake/ml/modeling/linear_model/sgd_classifier.py +51 -16
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +51 -16
- snowflake/ml/modeling/linear_model/sgd_regressor.py +51 -16
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +51 -16
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +51 -16
- snowflake/ml/modeling/manifold/isomap.py +51 -16
- snowflake/ml/modeling/manifold/mds.py +51 -16
- snowflake/ml/modeling/manifold/spectral_embedding.py +51 -16
- snowflake/ml/modeling/manifold/tsne.py +51 -16
- snowflake/ml/modeling/metrics/classification.py +5 -6
- snowflake/ml/modeling/metrics/metrics_utils.py +5 -3
- snowflake/ml/modeling/metrics/ranking.py +7 -3
- snowflake/ml/modeling/metrics/regression.py +6 -3
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +51 -16
- snowflake/ml/modeling/mixture/gaussian_mixture.py +51 -16
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +51 -16
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +51 -16
- snowflake/ml/modeling/multiclass/output_code_classifier.py +51 -16
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +51 -16
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +51 -16
- snowflake/ml/modeling/naive_bayes/complement_nb.py +51 -16
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +51 -16
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +51 -16
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +51 -16
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +51 -16
- snowflake/ml/modeling/neighbors/kernel_density.py +51 -16
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +51 -16
- snowflake/ml/modeling/neighbors/nearest_centroid.py +51 -16
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +51 -16
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +51 -16
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +51 -16
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +51 -16
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +51 -16
- snowflake/ml/modeling/neural_network/mlp_classifier.py +51 -16
- snowflake/ml/modeling/neural_network/mlp_regressor.py +51 -16
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +15 -1
- snowflake/ml/modeling/preprocessing/polynomial_features.py +51 -16
- snowflake/ml/modeling/semi_supervised/label_propagation.py +51 -16
- snowflake/ml/modeling/semi_supervised/label_spreading.py +51 -16
- snowflake/ml/modeling/svm/linear_svc.py +51 -16
- snowflake/ml/modeling/svm/linear_svr.py +51 -16
- snowflake/ml/modeling/svm/nu_svc.py +51 -16
- snowflake/ml/modeling/svm/nu_svr.py +51 -16
- snowflake/ml/modeling/svm/svc.py +51 -16
- snowflake/ml/modeling/svm/svr.py +51 -16
- snowflake/ml/modeling/tree/decision_tree_classifier.py +51 -16
- snowflake/ml/modeling/tree/decision_tree_regressor.py +51 -16
- snowflake/ml/modeling/tree/extra_tree_classifier.py +51 -16
- snowflake/ml/modeling/tree/extra_tree_regressor.py +51 -16
- snowflake/ml/modeling/xgboost/xgb_classifier.py +69 -16
- snowflake/ml/modeling/xgboost/xgb_regressor.py +69 -16
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +69 -16
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +69 -16
- snowflake/ml/registry/__init__.py +3 -0
- snowflake/ml/registry/_manager/model_manager.py +163 -0
- snowflake/ml/registry/model_registry.py +12 -0
- snowflake/ml/registry/registry.py +100 -90
- snowflake/ml/version.py +1 -1
- snowflake_ml_python-1.2.1.dist-info/LICENSE.txt +202 -0
- {snowflake_ml_python-1.1.2.dist-info → snowflake_ml_python-1.2.1.dist-info}/METADATA +295 -60
- snowflake_ml_python-1.2.1.dist-info/RECORD +355 -0
- {snowflake_ml_python-1.1.2.dist-info → snowflake_ml_python-1.2.1.dist-info}/WHEEL +2 -1
- snowflake_ml_python-1.2.1.dist-info/top_level.txt +1 -0
- snowflake/ml/model/_client/model/model_method_info.py +0 -19
- snowflake_ml_python-1.1.2.dist-info/RECORD +0 -347
- /snowflake/ml/_internal/{utils/spcs_image_registry.py → container_services/image_registry/credential.py} +0 -0
- /snowflake/ml/_internal/{utils/image_registry_http_client.py → container_services/image_registry/http_client.py} +0 -0
snowflake/ml/modeling/_internal/distributed_hpo_trainer.py:

@@ -8,11 +8,15 @@ from typing import Any, Dict, List, Optional, Tuple, Union
 
 import cloudpickle as cp
 import numpy as np
-from scipy.stats import rankdata
 from sklearn import model_selection
+from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
 
 from snowflake.ml._internal import telemetry
-from snowflake.ml._internal.utils import
+from snowflake.ml._internal.utils import (
+    identifier,
+    pkg_version_utils,
+    snowpark_dataframe_utils,
+)
 from snowflake.ml._internal.utils.temp_file_utils import (
     cleanup_temp_files,
     get_temp_file_path,
@@ -26,7 +30,8 @@ from snowflake.snowpark._internal.utils import (
     TempObjectType,
     random_name_for_temp_object,
 )
-from snowflake.snowpark.functions import
+from snowflake.snowpark.functions import sproc, udtf
+from snowflake.snowpark.row import Row
 from snowflake.snowpark.types import IntegerType, StringType, StructField, StructType
 
 cp.register_pickle_by_value(inspect.getmodule(get_temp_file_path))
@@ -36,6 +41,117 @@ _PROJECT = "ModelDevelopment"
 DEFAULT_UDTF_NJOBS = 3
 
 
+def construct_cv_results(
+    estimator: Union[GridSearchCV, RandomizedSearchCV],
+    n_split: int,
+    param_grid: List[Dict[str, Any]],
+    cv_results_raw_hex: List[Row],
+    cross_validator_indices_length: int,
+    parameter_grid_length: int,
+) -> Tuple[bool, Dict[str, Any]]:
+    """Construct the cross validation result from the UDF. Because we accelerate the process
+    by the number of cross validation number, and the combination of parameter grids.
+    Therefore, we need to stick them back together instead of returning the raw result
+    to align with original sklearn result.
+
+    Args:
+        estimator (Union[GridSearchCV, RandomizedSearchCV]): The sklearn object of estimator
+            GridSearchCV or RandomizedSearchCV
+        n_split (int): The number of split, which is determined by build_cross_validator.get_n_splits(X, y, groups)
+        param_grid (List[Dict[str, Any]]): the list of parameter grid or parameter sampler
+        cv_results_raw_hex (List[Row]): the list of cv_results from each cv and parameter grid combination.
+            Because UDxF can only return string, and numpy array/masked arrays cannot be encoded in a
+            json format. Each cv_result is encoded into hex string.
+        cross_validator_indices_length (int): the length of cross validator indices
+        parameter_grid_length (int): the length of parameter grid combination
+
+    Raises:
+        ValueError: Retrieved empty cross validation results
+        ValueError: Cross validator index length is 0
+        ValueError: Parameter index length is 0
+        ValueError: Retrieved incorrect dataframe dimension from Snowpark's UDTF.
+        RuntimeError: Cross validation results are unexpectedly empty for one fold.
+
+    Returns:
+        Tuple[bool, Dict[str, Any]]: returns multimetric, cv_results_
+    """
+    # Filter corner cases: either the snowpark dataframe result is empty; or index length is empty
+    if len(cv_results_raw_hex) == 0:
+        raise ValueError(
+            "Retrieved empty cross validation results from snowpark. Please retry or contact snowflake support."
+        )
+    if cross_validator_indices_length == 0:
+        raise ValueError("Cross validator index length is 0. Was the CV iterator empty? ")
+    if parameter_grid_length == 0:
+        raise ValueError("Parameter index length is 0. Were there no candidates?")
+
+    # cv_result maintains the original order
+    multimetric = False
+    # retrieve the cv_results from udtf table; results are encoded by hex and cloudpickle;
+    # We are constructing the raw information back to original form
+    if len(cv_results_raw_hex) != cross_validator_indices_length * parameter_grid_length:
+        raise ValueError(
+            "Retrieved incorrect dataframe dimension from Snowpark's UDTF."
+            f"Expected {cross_validator_indices_length * parameter_grid_length}, got {len(cv_results_raw_hex)}. "
+            "Please retry or contact snowflake support."
+        )
+
+    out = []
+
+    for each_cv_result_hex in cv_results_raw_hex:
+        # convert the hex string back to cv_results_
+        hex_str = bytes.fromhex(each_cv_result_hex[0])
+        with io.BytesIO(hex_str) as f_reload:
+            each_cv_result = cp.load(f_reload)
+            if not each_cv_result:
+                raise RuntimeError(
+                    "Cross validation response is empty. This issue may be temporary - please try again."
+                )
+            temp_dict = dict()
+            """
+            This dictionary has the following keys
+            train_scores : dict of scorer name -> float
+                Score on training set (for all the scorers),
+                returned only if `return_train_score` is `True`.
+            test_scores : dict of scorer name -> float
+                Score on testing set (for all the scorers).
+            fit_time : float
+                Time spent for fitting in seconds.
+            score_time : float
+                Time spent for scoring in seconds.
+            """
+            if estimator.return_train_score:
+                if each_cv_result.get("split0_train_score", None):
+                    # for single scorer, the split0_train_score only contains an array with one value
+                    temp_dict["train_scores"] = each_cv_result["split0_train_score"][0]
+                else:
+                    # if multimetric situation, the format would be
+                    # {metric_name1: value, metric_name2: value, ...}
+                    temp_dict["train_scores"] = {}
+                    # For multi-metric evaluation, the scores for all the scorers are available in the
+                    # cv_results_ dict at the keys ending with that scorer's name ('_<scorer_name>')
+                    # instead of '_score'.
+                    for k, v in each_cv_result.items():
+                        if "split0_train_" in k:
+                            temp_dict["train_scores"][k[len("split0_train_") :]] = v
+            if isinstance(each_cv_result.get("split0_test_score"), np.ndarray):
+                temp_dict["test_scores"] = each_cv_result["split0_test_score"][0]
+            else:
+                temp_dict["test_scores"] = {}
+                for k, v in each_cv_result.items():
+                    if "split0_test_" in k:
+                        temp_dict["test_scores"][k[len("split0_test_") :]] = v
+            temp_dict["fit_time"] = each_cv_result["mean_fit_time"][0]
+            temp_dict["score_time"] = each_cv_result["mean_score_time"][0]
+            out.append(temp_dict)
+    first_test_score = out[0]["test_scores"]
+    multimetric = isinstance(first_test_score, dict)
+    return multimetric, estimator._format_results(param_grid, n_split, out)
+
+
+cp.register_pickle_by_value(inspect.getmodule(construct_cv_results))
+
+
 class DistributedHPOTrainer(SnowparkModelTrainer):
     """
     A class for performing distributed hyperparameter optimization (HPO) using Snowpark.
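A UDTF can only emit strings, and `cv_results_` holds numpy arrays that JSON cannot carry, so each fold's results travel as a hex-encoded cloudpickle payload that `construct_cv_results` decodes. A minimal standalone sketch of that round trip:

```python
import io

import cloudpickle as cp
import numpy as np

# A stand-in for one fold's cv_results_ (real payloads hold numpy arrays).
cv_results = {"mean_fit_time": np.array([0.12]), "split0_test_score": np.array([0.9])}

# Encode: cloudpickle to bytes, then to a hex string the UDTF can emit.
with io.BytesIO() as f:
    cp.dump(cv_results, f)
    payload_hex = f.getvalue().hex()

# Decode: the same steps construct_cv_results performs per returned row.
with io.BytesIO(bytes.fromhex(payload_hex)) as f_reload:
    restored = cp.load(f_reload)

assert np.array_equal(restored["split0_test_score"], cv_results["split0_test_score"])
```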
snowflake/ml/modeling/_internal/distributed_hpo_trainer.py (continued):

@@ -105,7 +221,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
         temp_stage_creation_query = f"CREATE OR REPLACE TEMP STAGE {temp_stage_name};"
         session.sql(temp_stage_creation_query).collect()
 
-        # Stage data
+        # Stage data as parquet file
         dataset = snowpark_dataframe_utils.cast_snowpark_dataframe(dataset)
         remote_file_path = f"{temp_stage_name}/{temp_stage_name}.parquet"
         dataset.write.copy_into_location(  # type:ignore[call-overload]
@@ -114,6 +230,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
         imports = [f"@{row.name}" for row in session.sql(f"LIST @{temp_stage_name}").collect()]
 
         # Store GridSearchCV's refit variable. If user set it as False, we don't need to refit it again
+        # refit variable can be boolean, string or callable
         original_refit = estimator.refit
 
         # Create a temp file and dump the estimator to that file.
@@ -136,7 +253,6 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
                 inspect.currentframe(), self.__class__.__name__
             ),
             api_calls=[sproc],
-            custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
         )
         udtf_statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
@@ -145,7 +261,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
                 inspect.currentframe(), self.__class__.__name__
             ),
             api_calls=[udtf],
-            custom_tags=dict([("
+            custom_tags=dict([("hpo_udtf", True)]),
         )
 
         # Put locally serialized estimator on stage.
@@ -208,7 +324,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
                 for file_name in data_files
             ]
             df = pd.concat(partial_df, ignore_index=True)
-            df.columns = [identifier.get_inferred_name(
+            df.columns = [identifier.get_inferred_name(col_) for col_ in df.columns]
 
             X = df[input_cols]
             y = df[label_cols].squeeze() if label_cols else None
@@ -222,11 +338,16 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
             with open(local_estimator_file_path, mode="r+b") as local_estimator_file_obj:
                 estimator = cp.load(local_estimator_file_obj)["estimator"]
 
-
-
+            build_cross_validator = check_cv(estimator.cv, y, classifier=is_classifier(estimator.estimator))
+            from sklearn.utils.validation import indexable
+
+            X, y, _ = indexable(X, y, None)
+            n_splits = build_cross_validator.get_n_splits(X, y, None)
+            # store the cross_validator's test indices only to save space
+            cross_validator_indices = [test for _, test in build_cross_validator.split(X, y, None)]
             local_indices_file_name = get_temp_file_path()
             with open(local_indices_file_name, mode="w+b") as local_indices_file_obj:
-                cp.dump(
+                cp.dump(cross_validator_indices, local_indices_file_obj)
 
             # Put locally serialized indices on stage.
             put_result = session.file.put(
@@ -237,7 +358,8 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
             )
             indices_location = put_result[0].target
             imports.append(f"@{temp_stage_name}/{indices_location}")
-
+            cross_validator_indices_length = int(len(cross_validator_indices))
+            parameter_grid_length = len(param_grid)
 
             assert estimator is not None
 
@@ -261,7 +383,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
                 for file_name in data_files
             ]
             df = pd.concat(partial_df, ignore_index=True)
-            df.columns = [identifier.get_inferred_name(
+            df.columns = [identifier.get_inferred_name(col_) for col_ in df.columns]
 
             # load estimator
             local_estimator_file_path = os.path.join(
@@ -299,16 +421,30 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
                 self.data_length = data_length
                 self.params_to_evaluate = params_to_evaluate
 
-            def process(self, params_idx: int,
+            def process(self, params_idx: int, cv_idx: int) -> Iterator[Tuple[str]]:
+                # Assign parameter to GridSearchCV
                 if hasattr(estimator, "param_grid"):
                     self.estimator.param_grid = self.params_to_evaluate[params_idx]
+                # Assign parameter to RandomizedSearchCV
                 else:
                     self.estimator.param_distributions = self.params_to_evaluate[params_idx]
+                # cross validator's indices: we stored test indices only (to save space);
+                # use the full indices to re-construct the train indices back.
                 full_indices = np.array([i for i in range(self.data_length)])
-                test_indice = self.indices[
+                test_indice = self.indices[cv_idx]
                 train_indice = np.setdiff1d(full_indices, test_indice)
+                # assign the tuple of train and test indices to estimator's original cross validator
                 self.estimator.cv = [(train_indice, test_indice)]
                 self.estimator.fit(**self.args)
+                # If the cv_results_ is empty, then the udtf table will have different number of output rows
+                # from the input rows. Raise ValueError.
+                if not self.estimator.cv_results_:
+                    raise RuntimeError(
+                        """Cross validation results are unexpectedly empty for one fold.
+                        This issue may be temporary - please try again."""
+                    )
+                # Encode the dictionary of cv_results_ as binary (in hex format) to send it back
+                # because udtf doesn't allow numpy within json file
                 binary_cv_results = None
                 with io.BytesIO() as f:
                     cp.dump(self.estimator.cv_results_, f)
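Only each fold's test indices are staged (per the "store the cross_validator's test indices only to save space" comment above); inside the UDTF the train indices are rebuilt as the complement over the full row range. A standalone sketch:

```python
import numpy as np

data_length = 10                   # rows in the staged dataset
test_indice = np.array([2, 5, 7])  # one fold's stored test indices

# Rebuild the train indices as the complement of the test indices.
full_indices = np.arange(data_length)
train_indice = np.setdiff1d(full_indices, test_indice)

print(train_indice)  # [0 1 3 4 6 8 9]
```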
snowflake/ml/modeling/_internal/distributed_hpo_trainer.py (continued):

@@ -333,96 +469,44 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
 
         HP_TUNING = F.table_function(random_udtf_name)
 
-
-
-
-        param_indices,
-        for param_idx, cv_idx in product(
+        # param_indices is for the index for each parameter grid;
+        # cv_indices is for the index for each cross_validator's fold;
+        # param_cv_indices is for the index for the product of (len(param_indices) * len(cv_indices))
+        param_indices, cv_indices = [], []
+        for param_idx, cv_idx in product(
+            [param_index for param_index in range(parameter_grid_length)],
+            [cv_index for cv_index in range(cross_validator_indices_length)],
+        ):
             param_indices.append(param_idx)
-
+            cv_indices.append(cv_idx)
 
-
+        indices_info_pandas = pd.DataFrame(
             {
-                "
-                "
-                "
+                "PARAM_IND": param_indices,
+                "CV_IND": cv_indices,
+                "PARAM_CV_IND": [i for i in range(cross_validator_indices_length * parameter_grid_length)],
             }
         )
-
-
-
-        (
+        indices_info_sp = session.create_dataframe(indices_info_pandas)
+        # execute udtf by querying HP_TUNING table
+        HP_raw_results = indices_info_sp.select(
+            F.cast(indices_info_sp["PARAM_CV_IND"], IntegerType()).as_("PARAM_CV_IND"),
+            (
+                HP_TUNING(indices_info_sp["PARAM_IND"], indices_info_sp["CV_IND"]).over(
+                    partition_by=indices_info_sp["PARAM_CV_IND"]
+                )
+            ),
         )
-
-
-
-
-
-
-
-
-            hex_str = bytes.fromhex(val[0])
-            with io.BytesIO(hex_str) as f_reload:
-                each_cv_result = cp.load(f_reload)
-                for k, v in each_cv_result.items():
-                    cur_cv = i % idx_length
-                    key = k
-                    if "split0_test_" in k:
-                        # For multi-metric evaluation, the scores for all the scorers are available in the
-                        # cv_results_ dict at the keys ending with that scorer's name ('_<scorer_name>')
-                        # instead of '_score'.
-                        scorers.add(k[len("split0_test_") :])
-                        key = k.replace("split0_test", f"split{cur_cv}_test")
-                    elif k.startswith("param"):
-                        if cur_cv != 0:
-                            key = False
-                    if key:
-                        if key not in cv_results_:
-                            cv_results_[key] = v
-                        else:
-                            cv_results_[key] = np.concatenate([cv_results_[key], v])
-
-        multimetric = len(scorers) > 1
-        # Use numpy to re-calculate all the information in cv_results_ again
-        # Generally speaking, reshape all the results into the (scorers+2, idx_length, params_length) shape,
-        # and average them by the idx_length;
-        # idx_length is the number of cv folds; params_length is the number of parameter combinations
-        scores = [
-            np.reshape(
-                np.concatenate([cv_results_[f"split{cur_cv}_test_{score}"] for cur_cv in range(idx_length)]),
-                (idx_length, -1),
-            )
-            for score in scorers
-        ]
-
-        fit_score_test_matrix = np.stack(
-            [
-                np.reshape(cv_results_["mean_fit_time"], (idx_length, -1)),
-                np.reshape(cv_results_["mean_score_time"], (idx_length, -1)),
-            ]
-            + scores
+        # multimetric, cv_results_, best_param_index, scorers
+        multimetric, cv_results_ = construct_cv_results(
+            estimator,
+            n_splits,
+            list(param_grid),
+            HP_raw_results.select("CV_RESULTS").sort(F.col("PARAM_CV_IND")).collect(),
+            cross_validator_indices_length,
+            parameter_grid_length,
         )
 
-        mean_fit_score_test_matrix = np.mean(fit_score_test_matrix, axis=1)
-        std_fit_score_test_matrix = np.std(fit_score_test_matrix, axis=1)
-        cv_results_["std_fit_time"] = std_fit_score_test_matrix[0]
-        cv_results_["mean_fit_time"] = mean_fit_score_test_matrix[0]
-        cv_results_["std_score_time"] = std_fit_score_test_matrix[1]
-        cv_results_["mean_score_time"] = mean_fit_score_test_matrix[1]
-        for idx, score in enumerate(scorers):
-            cv_results_[f"std_test_{score}"] = std_fit_score_test_matrix[idx + 2]
-            cv_results_[f"mean_test_{score}"] = mean_fit_score_test_matrix[idx + 2]
-            # re-compute the ranking again with mean_test_<score>.
-            cv_results_[f"rank_test_{score}"] = rankdata(-cv_results_[f"mean_test_{score}"], method="min")
-        # The best param is the highest ranking (which is 1) and we choose the first time ranking 1 appeared.
-        # If all scores are `nan`, `rankdata` will also produce an array of `nan` values.
-        # In that case, default to first index.
-        best_param_index = (
-            np.where(cv_results_[f"rank_test_{score}"] == 1)[0][0]
-            if not np.isnan(cv_results_[f"rank_test_{score}"]).all()
-            else 0
-        )
-
         estimator.cv_results_ = cv_results_
         estimator.multimetric_ = multimetric
 
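Each (parameter combination, fold) pair becomes one row of the driver dataframe, and partitioning the UDTF call by `PARAM_CV_IND` gives every partition exactly one fit. The index bookkeeping alone, as a standalone sketch:

```python
from itertools import product

import pandas as pd

parameter_grid_length = 3           # parameter combinations
cross_validator_indices_length = 5  # CV folds

# One row per (parameter combination, fold); PARAM_CV_IND enumerates the product.
param_indices, cv_indices = zip(
    *product(range(parameter_grid_length), range(cross_validator_indices_length))
)
indices_info = pd.DataFrame(
    {
        "PARAM_IND": param_indices,
        "CV_IND": cv_indices,
        "PARAM_CV_IND": range(parameter_grid_length * cross_validator_indices_length),
    }
)
print(len(indices_info))  # 15 rows -> 15 UDTF partitions, one fit each
```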
snowflake/ml/modeling/_internal/distributed_hpo_trainer.py (continued):

@@ -452,7 +536,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
         # With a non-custom callable, we can select the best score
         # based on the best index
         estimator.best_score_ = cv_results_[f"mean_test_{refit_metric}"][estimator.best_index_]
-        estimator.best_params_ = cv_results_["params"][
+        estimator.best_params_ = cv_results_["params"][estimator.best_index_]
 
         if original_refit:
             estimator.best_estimator_ = clone(estimator.estimator).set_params(
@@ -541,12 +625,15 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
                 n_iter=self.estimator.n_iter,
                 random_state=self.estimator.random_state,
             )
+        relaxed_dependencies = pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel(
+            pkg_versions=model_spec.pkgDependencies, session=self.session
+        )
         return self.fit_search_snowpark(
             param_grid=param_grid,
             dataset=self.dataset,
             session=self.session,
             estimator=self.estimator,
-            dependencies=
+            dependencies=relaxed_dependencies,
             udf_imports=["sklearn"],
             input_cols=self.input_cols,
             label_cols=self.label_cols,
snowflake/ml/modeling/_internal/estimator_utils.py:

@@ -132,3 +132,24 @@ def is_single_node(session: Session) -> bool:
     # If current session cannot retrieve the warehouse name back,
     # Default as True; Let HPO fall back to stored procedure implementation
     return True
+
+
+def get_module_name(model: object) -> str:
+    """Returns the source module of the given object.
+
+    Args:
+        model: Object to inspect.
+
+    Returns:
+        Source module of the given object.
+
+    Raises:
+        SnowflakeMLException: If the source module of the given object is not found.
+    """
+    module = inspect.getmodule(model)
+    if module is None:
+        raise exceptions.SnowflakeMLException(
+            error_code=error_codes.INVALID_TYPE,
+            original_exception=ValueError(f"Unable to infer the source module of the given object {model}."),
+        )
+    return module.__name__
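A short usage sketch of the new helper (assuming scikit-learn is installed); the dispatch on the root package mirrors how `model_specifications.py` and `model_trainer_builder.py` consume it below:

```python
from sklearn.linear_model import LinearRegression

from snowflake.ml.modeling._internal.estimator_utils import get_module_name

model = LinearRegression()

# The dotted module path's first component identifies the library the
# estimator came from; callers dispatch on it ("sklearn", "xgboost", ...).
module_name = get_module_name(model=model)    # e.g. "sklearn.linear_model._base"
root_module_name = module_name.split(".")[0]  # "sklearn"
assert root_module_name == "sklearn"
```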
snowflake/ml/modeling/_internal/model_specifications.py:

@@ -1,10 +1,9 @@
-import inspect
 from typing import List
 
 import cloudpickle as cp
 import numpy as np
 
-from snowflake.ml._internal.
+from snowflake.ml.modeling._internal.estimator_utils import get_module_name
 
 
 class ModelSpecifications:
@@ -120,16 +119,10 @@ class ModelSpecificationsBuilder:
             Appropriate ModelSpecification object
 
         Raises:
-            SnowflakeMLException: Raises an exception the module of given model can't be determined.
             TypeError: Raises the exception for unsupported modules.
         """
-
-
-        raise exceptions.SnowflakeMLException(
-            error_code=error_codes.INVALID_TYPE,
-            original_exception=ValueError("Unable to infer model type of the given native model object."),
-        )
-        root_module_name = module.__name__.split(".")[0]
+        module_name = get_module_name(model=model)
+        root_module_name = module_name.split(".")[0]
         if root_module_name == "sklearn":
             from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
 
snowflake/ml/modeling/_internal/model_trainer_builder.py:

@@ -3,13 +3,20 @@ from typing import List, Optional, Union
 import pandas as pd
 from sklearn import model_selection
 
+from snowflake.ml._internal.exceptions import error_codes, exceptions
 from snowflake.ml.modeling._internal.distributed_hpo_trainer import (
     DistributedHPOTrainer,
 )
-from snowflake.ml.modeling._internal.estimator_utils import
+from snowflake.ml.modeling._internal.estimator_utils import (
+    get_module_name,
+    is_single_node,
+)
 from snowflake.ml.modeling._internal.model_trainer import ModelTrainer
 from snowflake.ml.modeling._internal.pandas_trainer import PandasModelTrainer
 from snowflake.ml.modeling._internal.snowpark_trainer import SnowparkModelTrainer
+from snowflake.ml.modeling._internal.xgboost_external_memory_trainer import (
+    XGBoostExternalMemoryTrainer,
+)
 from snowflake.snowpark import DataFrame, Session
 
 _PROJECT = "ModelDevelopment"
@@ -30,6 +37,31 @@ class ModelTrainerBuilder:
     def _check_if_distributed_hpo_enabled(cls, session: Session) -> bool:
         return not is_single_node(session) and ModelTrainerBuilder._ENABLE_DISTRIBUTED is True
 
+    @classmethod
+    def _validate_external_memory_params(cls, estimator: object, batch_size: int) -> None:
+        """
+        Validate the params are set appropriately for external memory training.
+
+        Args:
+            estimator: Model object
+            batch_size: Number of rows in each batch of data processed during training.
+
+        Raises:
+            SnowflakeMLException: If the params are not appropriate for the external memory training feature.
+        """
+        module_name = get_module_name(model=estimator)
+        root_module_name = module_name.split(".")[0]
+        if root_module_name != "xgboost":
+            raise exceptions.SnowflakeMLException(
+                error_code=error_codes.INVALID_ARGUMENT,
+                original_exception=RuntimeError("External memory training is only supported for XGBoost models."),
+            )
+        if batch_size <= 0:
+            raise exceptions.SnowflakeMLException(
+                error_code=error_codes.INVALID_ARGUMENT,
+                original_exception=RuntimeError("Batch size must be >= 0 when using external memory training feature."),
+            )
+
     @classmethod
     def build(
         cls,
@@ -40,6 +72,8 @@ class ModelTrainerBuilder:
         sample_weight_col: Optional[str] = None,
         autogenerated: bool = False,
         subproject: str = "",
+        use_external_memory_version: bool = False,
+        batch_size: int = -1,
     ) -> ModelTrainer:
         """
         Builder method that creates an approproiate ModelTrainer instance based on the given params.
@@ -55,22 +89,32 @@ class ModelTrainerBuilder:
             )
         elif isinstance(dataset, DataFrame):
             trainer_klass = SnowparkModelTrainer
+            init_args = {
+                "estimator": estimator,
+                "dataset": dataset,
+                "session": dataset._session,
+                "input_cols": input_cols,
+                "label_cols": label_cols,
+                "sample_weight_col": sample_weight_col,
+                "autogenerated": autogenerated,
+                "subproject": subproject,
+            }
+
             assert dataset._session is not None  # Make MyPy happpy
             if isinstance(estimator, model_selection.GridSearchCV) or isinstance(
                 estimator, model_selection.RandomizedSearchCV
             ):
                 if ModelTrainerBuilder._check_if_distributed_hpo_enabled(session=dataset._session):
                     trainer_klass = DistributedHPOTrainer
-
-
-
-
-
-
-
-
-
-            )
+            elif use_external_memory_version:
+                ModelTrainerBuilder._validate_external_memory_params(
+                    estimator=estimator,
+                    batch_size=batch_size,
+                )
+                trainer_klass = XGBoostExternalMemoryTrainer
+                init_args["batch_size"] = batch_size
+
+            return trainer_klass(**init_args)  # type: ignore[arg-type]
         else:
             raise TypeError(
                 f"Unexpected dataset type: {type(dataset)}."
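A hypothetical call into the extended builder; the leading parameter names match the `init_args` keys in the hunk above, and `snowpark_df` stands in for a Snowpark DataFrame obtained elsewhere:

```python
from snowflake.snowpark import DataFrame
from xgboost import XGBClassifier

from snowflake.ml.modeling._internal.model_trainer_builder import ModelTrainerBuilder


def build_external_memory_trainer(snowpark_df: DataFrame) -> object:
    # use_external_memory_version routes to XGBoostExternalMemoryTrainer after
    # _validate_external_memory_params checks the estimator type and batch size.
    return ModelTrainerBuilder.build(
        estimator=XGBClassifier(),
        dataset=snowpark_df,
        input_cols=["FEATURE_1", "FEATURE_2"],
        label_cols=["LABEL"],
        use_external_memory_version=True,  # only XGBoost estimators pass validation
        batch_size=10_000,                 # must be positive
    )
```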
snowflake/ml/modeling/_internal/snowpark_handlers.py:

@@ -306,7 +306,7 @@ class SnowparkHandlers:
             input_cols: List[str],
             label_cols: List[str],
             sample_weight_col: Optional[str],
-
+            score_statement_params: Dict[str, str],
         ) -> float:
             import inspect
             import os
@@ -317,13 +317,13 @@ class SnowparkHandlers:
                 importlib.import_module(import_name)
 
             for query in sql_queries[:-1]:
-                _ = session.sql(query).collect(statement_params=
+                _ = session.sql(query).collect(statement_params=score_statement_params)
             sp_df = session.sql(sql_queries[-1])
-            df: pd.DataFrame = sp_df.to_pandas(statement_params=
+            df: pd.DataFrame = sp_df.to_pandas(statement_params=score_statement_params)
             df.columns = sp_df.columns
 
             local_score_file_name = get_temp_file_path()
-            session.file.get(stage_score_file_name, local_score_file_name, statement_params=
+            session.file.get(stage_score_file_name, local_score_file_name, statement_params=score_statement_params)
 
             local_score_file_name_path = os.path.join(local_score_file_name, os.listdir(local_score_file_name)[0])
             with open(local_score_file_name_path, mode="r+b") as local_score_file_obj:
@@ -348,7 +348,7 @@ class SnowparkHandlers:
             return result
 
         # Call score sproc
-
+        score_statement_params = telemetry.get_function_usage_statement_params(
             project=_PROJECT,
             subproject=self._subproject,
             function_name=telemetry.get_statement_params_full_func_name(
@@ -357,6 +357,8 @@ class SnowparkHandlers:
             api_calls=[Session.call],
             custom_tags=dict([("autogen", True)]) if self._autogenerated else None,
         )
+
+        kwargs = telemetry.get_sproc_statement_params_kwargs(score_wrapper_sproc, score_statement_params)
         score: float = score_wrapper_sproc(
             session,
             queries,
@@ -364,7 +366,8 @@ class SnowparkHandlers:
             input_cols,
             label_cols,
             sample_weight_col,
-
+            score_statement_params,
+            **kwargs,
         )
 
         cleanup_temp_files([local_score_file_name])
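The renamed `score_statement_params` dict now reaches every Snowpark call inside the scoring sproc, so each query carries the telemetry tags. A minimal sketch of the pattern, assuming an existing session:

```python
from typing import Dict

from snowflake.snowpark import Session


def run_tagged_queries(session: Session, statement_params: Dict[str, str]) -> None:
    # Executing methods accept statement_params, so the tags built by
    # telemetry.get_function_usage_statement_params ride along with each query.
    rows = session.sql("SELECT 1 AS X").collect(statement_params=statement_params)
    frame = session.sql("SELECT 1 AS X").to_pandas(statement_params=statement_params)
    print(rows, frame.shape)
```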
snowflake/ml/modeling/_internal/snowpark_trainer.py:

@@ -12,7 +12,11 @@ from snowflake.ml._internal.exceptions import (
     exceptions,
     modeling_error_messages,
 )
-from snowflake.ml._internal.utils import
+from snowflake.ml._internal.utils import (
+    identifier,
+    pkg_version_utils,
+    snowpark_dataframe_utils,
+)
 from snowflake.ml._internal.utils.query_result_checker import SqlResultValidator
 from snowflake.ml._internal.utils.temp_file_utils import (
     cleanup_temp_files,
@@ -253,11 +257,15 @@ class SnowparkModelTrainer:
 
         fit_sproc_name = random_name_for_temp_object(TempObjectType.PROCEDURE)
 
+        relaxed_dependencies = pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel(
+            pkg_versions=model_spec.pkgDependencies, session=self.session
+        )
+
         fit_wrapper_sproc = self.session.sproc.register(
             func=self._build_fit_wrapper_sproc(model_spec=model_spec),
             is_permanent=False,
             name=fit_sproc_name,
-            packages=["snowflake-snowpark-python"] +
+            packages=["snowflake-snowpark-python"] + relaxed_dependencies,  # type: ignore[arg-type]
             replace=True,
             session=self.session,
             statement_params=statement_params,
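Both trainers now pass the model's pinned dependencies through `pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel` before registering the sproc. The sketch below illustrates the idea of relaxing pins against what a channel actually serves; it is not the library's implementation, and the fallback behavior shown is an assumption:

```python
from typing import Dict, List, Set

# Versions a hypothetical channel serves; the real resolver queries Snowflake.
AVAILABLE: Dict[str, Set[str]] = {
    "scikit-learn": {"1.2.2", "1.3.0"},
    "xgboost": {"1.7.6"},
}


def relax(pkg_versions: List[str]) -> List[str]:
    # Keep a pin only if the channel serves it; otherwise fall back to the
    # bare package name so the server can pick a compatible version.
    out = []
    for spec in pkg_versions:
        name, _, version = spec.partition("==")
        out.append(spec if version in AVAILABLE.get(name, set()) else name)
    return out


print(relax(["scikit-learn==1.3.2", "xgboost==1.7.6"]))
# ['scikit-learn', 'xgboost==1.7.6']
```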