snowflake-ml-python 1.1.2__py3-none-any.whl → 1.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/ml/{model/_deploy_client/utils → _internal/container_services/image_registry}/imagelib.py +3 -1
- snowflake/ml/{model/_deploy_client/utils/image_registry_client.py → _internal/container_services/image_registry/registry_client.py} +4 -2
- snowflake/ml/_internal/env_utils.py +31 -52
- snowflake/ml/_internal/file_utils.py +17 -0
- snowflake/ml/_internal/telemetry.py +19 -0
- snowflake/ml/_internal/utils/query_result_checker.py +8 -5
- snowflake/ml/_internal/utils/snowflake_env.py +95 -0
- snowflake/ml/fileset/parquet_parser.py +31 -1
- snowflake/ml/model/__init__.py +6 -0
- snowflake/ml/model/_client/model/model_impl.py +172 -13
- snowflake/ml/model/_client/model/model_version_impl.py +96 -52
- snowflake/ml/model/_client/ops/metadata_ops.py +1 -3
- snowflake/ml/model/_client/ops/model_ops.py +155 -9
- snowflake/ml/model/_client/sql/model.py +55 -10
- snowflake/ml/model/_client/sql/model_version.py +72 -61
- snowflake/ml/model/_client/sql/stage.py +10 -4
- snowflake/ml/model/_client/sql/tag.py +118 -0
- snowflake/ml/model/_deploy_client/image_builds/client_image_builder.py +2 -2
- snowflake/ml/model/_deploy_client/image_builds/docker_context.py +8 -8
- snowflake/ml/model/_deploy_client/image_builds/inference_server/main.py +4 -6
- snowflake/ml/model/_deploy_client/image_builds/server_image_builder.py +6 -7
- snowflake/ml/model/_deploy_client/snowservice/deploy.py +4 -5
- snowflake/ml/model/_deploy_client/snowservice/instance_types.py +9 -1
- snowflake/ml/model/_deploy_client/warehouse/deploy.py +20 -11
- snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +45 -1
- snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +30 -0
- snowflake/ml/model/_model_composer/model_method/function_generator.py +2 -1
- snowflake/ml/model/_model_composer/model_runtime/_runtime_requirements.py +10 -1
- snowflake/ml/model/_model_composer/model_runtime/model_runtime.py +10 -7
- snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +1 -1
- snowflake/ml/model/_packager/model_handlers/xgboost.py +13 -2
- snowflake/ml/model/_packager/model_meta/_core_requirements.py +11 -1
- snowflake/ml/model/_packager/model_meta/_packaging_requirements.py +3 -0
- snowflake/ml/model/_packager/model_meta/model_meta.py +17 -3
- snowflake/ml/model/_signatures/core.py +20 -17
- snowflake/ml/model/custom_model.py +30 -27
- snowflake/ml/model/model_signature.py +16 -17
- snowflake/ml/model/type_hints.py +3 -0
- snowflake/ml/modeling/_internal/distributed_hpo_trainer.py +185 -98
- snowflake/ml/modeling/_internal/estimator_utils.py +21 -0
- snowflake/ml/modeling/_internal/model_specifications.py +3 -10
- snowflake/ml/modeling/_internal/model_trainer_builder.py +55 -11
- snowflake/ml/modeling/_internal/snowpark_handlers.py +9 -6
- snowflake/ml/modeling/_internal/snowpark_trainer.py +10 -2
- snowflake/ml/modeling/_internal/xgboost_external_memory_trainer.py +444 -0
- snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +51 -16
- snowflake/ml/modeling/cluster/affinity_propagation.py +51 -16
- snowflake/ml/modeling/cluster/agglomerative_clustering.py +51 -16
- snowflake/ml/modeling/cluster/birch.py +51 -16
- snowflake/ml/modeling/cluster/bisecting_k_means.py +51 -16
- snowflake/ml/modeling/cluster/dbscan.py +51 -16
- snowflake/ml/modeling/cluster/feature_agglomeration.py +51 -16
- snowflake/ml/modeling/cluster/k_means.py +51 -16
- snowflake/ml/modeling/cluster/mean_shift.py +51 -16
- snowflake/ml/modeling/cluster/mini_batch_k_means.py +51 -16
- snowflake/ml/modeling/cluster/optics.py +51 -16
- snowflake/ml/modeling/cluster/spectral_biclustering.py +51 -16
- snowflake/ml/modeling/cluster/spectral_clustering.py +51 -16
- snowflake/ml/modeling/cluster/spectral_coclustering.py +51 -16
- snowflake/ml/modeling/compose/column_transformer.py +51 -16
- snowflake/ml/modeling/compose/transformed_target_regressor.py +51 -16
- snowflake/ml/modeling/covariance/elliptic_envelope.py +51 -16
- snowflake/ml/modeling/covariance/empirical_covariance.py +51 -16
- snowflake/ml/modeling/covariance/graphical_lasso.py +51 -16
- snowflake/ml/modeling/covariance/graphical_lasso_cv.py +51 -16
- snowflake/ml/modeling/covariance/ledoit_wolf.py +51 -16
- snowflake/ml/modeling/covariance/min_cov_det.py +51 -16
- snowflake/ml/modeling/covariance/oas.py +51 -16
- snowflake/ml/modeling/covariance/shrunk_covariance.py +51 -16
- snowflake/ml/modeling/decomposition/dictionary_learning.py +51 -16
- snowflake/ml/modeling/decomposition/factor_analysis.py +51 -16
- snowflake/ml/modeling/decomposition/fast_ica.py +51 -16
- snowflake/ml/modeling/decomposition/incremental_pca.py +51 -16
- snowflake/ml/modeling/decomposition/kernel_pca.py +51 -16
- snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +51 -16
- snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +51 -16
- snowflake/ml/modeling/decomposition/pca.py +51 -16
- snowflake/ml/modeling/decomposition/sparse_pca.py +51 -16
- snowflake/ml/modeling/decomposition/truncated_svd.py +51 -16
- snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +51 -16
- snowflake/ml/modeling/discriminant_analysis/quadratic_discriminant_analysis.py +51 -16
- snowflake/ml/modeling/ensemble/ada_boost_classifier.py +51 -16
- snowflake/ml/modeling/ensemble/ada_boost_regressor.py +51 -16
- snowflake/ml/modeling/ensemble/bagging_classifier.py +51 -16
- snowflake/ml/modeling/ensemble/bagging_regressor.py +51 -16
- snowflake/ml/modeling/ensemble/extra_trees_classifier.py +51 -16
- snowflake/ml/modeling/ensemble/extra_trees_regressor.py +51 -16
- snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +51 -16
- snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +51 -16
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +51 -16
- snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +51 -16
- snowflake/ml/modeling/ensemble/isolation_forest.py +51 -16
- snowflake/ml/modeling/ensemble/random_forest_classifier.py +51 -16
- snowflake/ml/modeling/ensemble/random_forest_regressor.py +51 -16
- snowflake/ml/modeling/ensemble/stacking_regressor.py +51 -16
- snowflake/ml/modeling/ensemble/voting_classifier.py +51 -16
- snowflake/ml/modeling/ensemble/voting_regressor.py +51 -16
- snowflake/ml/modeling/feature_selection/generic_univariate_select.py +51 -16
- snowflake/ml/modeling/feature_selection/select_fdr.py +51 -16
- snowflake/ml/modeling/feature_selection/select_fpr.py +51 -16
- snowflake/ml/modeling/feature_selection/select_fwe.py +51 -16
- snowflake/ml/modeling/feature_selection/select_k_best.py +51 -16
- snowflake/ml/modeling/feature_selection/select_percentile.py +51 -16
- snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +51 -16
- snowflake/ml/modeling/feature_selection/variance_threshold.py +51 -16
- snowflake/ml/modeling/gaussian_process/gaussian_process_classifier.py +51 -16
- snowflake/ml/modeling/gaussian_process/gaussian_process_regressor.py +51 -16
- snowflake/ml/modeling/impute/iterative_imputer.py +51 -16
- snowflake/ml/modeling/impute/knn_imputer.py +51 -16
- snowflake/ml/modeling/impute/missing_indicator.py +51 -16
- snowflake/ml/modeling/kernel_approximation/additive_chi2_sampler.py +51 -16
- snowflake/ml/modeling/kernel_approximation/nystroem.py +51 -16
- snowflake/ml/modeling/kernel_approximation/polynomial_count_sketch.py +51 -16
- snowflake/ml/modeling/kernel_approximation/rbf_sampler.py +51 -16
- snowflake/ml/modeling/kernel_approximation/skewed_chi2_sampler.py +51 -16
- snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +51 -16
- snowflake/ml/modeling/lightgbm/lgbm_classifier.py +51 -16
- snowflake/ml/modeling/lightgbm/lgbm_regressor.py +51 -16
- snowflake/ml/modeling/linear_model/ard_regression.py +51 -16
- snowflake/ml/modeling/linear_model/bayesian_ridge.py +51 -16
- snowflake/ml/modeling/linear_model/elastic_net.py +51 -16
- snowflake/ml/modeling/linear_model/elastic_net_cv.py +51 -16
- snowflake/ml/modeling/linear_model/gamma_regressor.py +51 -16
- snowflake/ml/modeling/linear_model/huber_regressor.py +51 -16
- snowflake/ml/modeling/linear_model/lars.py +51 -16
- snowflake/ml/modeling/linear_model/lars_cv.py +51 -16
- snowflake/ml/modeling/linear_model/lasso.py +51 -16
- snowflake/ml/modeling/linear_model/lasso_cv.py +51 -16
- snowflake/ml/modeling/linear_model/lasso_lars.py +51 -16
- snowflake/ml/modeling/linear_model/lasso_lars_cv.py +51 -16
- snowflake/ml/modeling/linear_model/lasso_lars_ic.py +51 -16
- snowflake/ml/modeling/linear_model/linear_regression.py +51 -16
- snowflake/ml/modeling/linear_model/logistic_regression.py +51 -16
- snowflake/ml/modeling/linear_model/logistic_regression_cv.py +51 -16
- snowflake/ml/modeling/linear_model/multi_task_elastic_net.py +51 -16
- snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +51 -16
- snowflake/ml/modeling/linear_model/multi_task_lasso.py +51 -16
- snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +51 -16
- snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +51 -16
- snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +51 -16
- snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +51 -16
- snowflake/ml/modeling/linear_model/perceptron.py +51 -16
- snowflake/ml/modeling/linear_model/poisson_regressor.py +51 -16
- snowflake/ml/modeling/linear_model/ransac_regressor.py +51 -16
- snowflake/ml/modeling/linear_model/ridge.py +51 -16
- snowflake/ml/modeling/linear_model/ridge_classifier.py +51 -16
- snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +51 -16
- snowflake/ml/modeling/linear_model/ridge_cv.py +51 -16
- snowflake/ml/modeling/linear_model/sgd_classifier.py +51 -16
- snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +51 -16
- snowflake/ml/modeling/linear_model/sgd_regressor.py +51 -16
- snowflake/ml/modeling/linear_model/theil_sen_regressor.py +51 -16
- snowflake/ml/modeling/linear_model/tweedie_regressor.py +51 -16
- snowflake/ml/modeling/manifold/isomap.py +51 -16
- snowflake/ml/modeling/manifold/mds.py +51 -16
- snowflake/ml/modeling/manifold/spectral_embedding.py +51 -16
- snowflake/ml/modeling/manifold/tsne.py +51 -16
- snowflake/ml/modeling/metrics/classification.py +5 -6
- snowflake/ml/modeling/metrics/metrics_utils.py +5 -3
- snowflake/ml/modeling/metrics/ranking.py +7 -3
- snowflake/ml/modeling/metrics/regression.py +6 -3
- snowflake/ml/modeling/mixture/bayesian_gaussian_mixture.py +51 -16
- snowflake/ml/modeling/mixture/gaussian_mixture.py +51 -16
- snowflake/ml/modeling/multiclass/one_vs_one_classifier.py +51 -16
- snowflake/ml/modeling/multiclass/one_vs_rest_classifier.py +51 -16
- snowflake/ml/modeling/multiclass/output_code_classifier.py +51 -16
- snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +51 -16
- snowflake/ml/modeling/naive_bayes/categorical_nb.py +51 -16
- snowflake/ml/modeling/naive_bayes/complement_nb.py +51 -16
- snowflake/ml/modeling/naive_bayes/gaussian_nb.py +51 -16
- snowflake/ml/modeling/naive_bayes/multinomial_nb.py +51 -16
- snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +51 -16
- snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +51 -16
- snowflake/ml/modeling/neighbors/kernel_density.py +51 -16
- snowflake/ml/modeling/neighbors/local_outlier_factor.py +51 -16
- snowflake/ml/modeling/neighbors/nearest_centroid.py +51 -16
- snowflake/ml/modeling/neighbors/nearest_neighbors.py +51 -16
- snowflake/ml/modeling/neighbors/neighborhood_components_analysis.py +51 -16
- snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +51 -16
- snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +51 -16
- snowflake/ml/modeling/neural_network/bernoulli_rbm.py +51 -16
- snowflake/ml/modeling/neural_network/mlp_classifier.py +51 -16
- snowflake/ml/modeling/neural_network/mlp_regressor.py +51 -16
- snowflake/ml/modeling/preprocessing/min_max_scaler.py +15 -1
- snowflake/ml/modeling/preprocessing/polynomial_features.py +51 -16
- snowflake/ml/modeling/semi_supervised/label_propagation.py +51 -16
- snowflake/ml/modeling/semi_supervised/label_spreading.py +51 -16
- snowflake/ml/modeling/svm/linear_svc.py +51 -16
- snowflake/ml/modeling/svm/linear_svr.py +51 -16
- snowflake/ml/modeling/svm/nu_svc.py +51 -16
- snowflake/ml/modeling/svm/nu_svr.py +51 -16
- snowflake/ml/modeling/svm/svc.py +51 -16
- snowflake/ml/modeling/svm/svr.py +51 -16
- snowflake/ml/modeling/tree/decision_tree_classifier.py +51 -16
- snowflake/ml/modeling/tree/decision_tree_regressor.py +51 -16
- snowflake/ml/modeling/tree/extra_tree_classifier.py +51 -16
- snowflake/ml/modeling/tree/extra_tree_regressor.py +51 -16
- snowflake/ml/modeling/xgboost/xgb_classifier.py +69 -16
- snowflake/ml/modeling/xgboost/xgb_regressor.py +69 -16
- snowflake/ml/modeling/xgboost/xgbrf_classifier.py +69 -16
- snowflake/ml/modeling/xgboost/xgbrf_regressor.py +69 -16
- snowflake/ml/registry/__init__.py +3 -0
- snowflake/ml/registry/_manager/model_manager.py +163 -0
- snowflake/ml/registry/model_registry.py +12 -0
- snowflake/ml/registry/registry.py +100 -90
- snowflake/ml/version.py +1 -1
- snowflake_ml_python-1.2.1.dist-info/LICENSE.txt +202 -0
- {snowflake_ml_python-1.1.2.dist-info → snowflake_ml_python-1.2.1.dist-info}/METADATA +295 -60
- snowflake_ml_python-1.2.1.dist-info/RECORD +355 -0
- {snowflake_ml_python-1.1.2.dist-info → snowflake_ml_python-1.2.1.dist-info}/WHEEL +2 -1
- snowflake_ml_python-1.2.1.dist-info/top_level.txt +1 -0
- snowflake/ml/model/_client/model/model_method_info.py +0 -19
- snowflake_ml_python-1.1.2.dist-info/RECORD +0 -347
- /snowflake/ml/_internal/{utils/spcs_image_registry.py → container_services/image_registry/credential.py} +0 -0
- /snowflake/ml/_internal/{utils/image_registry_http_client.py → container_services/image_registry/http_client.py} +0 -0
snowflake/ml/modeling/_internal/xgboost_external_memory_trainer.py (new file)
@@ -0,0 +1,444 @@
+import inspect
+import os
+import tempfile
+from typing import Any, Dict, List, Optional
+
+import cloudpickle as cp
+import pandas as pd
+import pyarrow.parquet as pq
+
+from snowflake.ml._internal import telemetry
+from snowflake.ml._internal.exceptions import (
+    error_codes,
+    exceptions,
+    modeling_error_messages,
+)
+from snowflake.ml._internal.utils import pkg_version_utils
+from snowflake.ml._internal.utils.query_result_checker import ResultValidator
+from snowflake.ml._internal.utils.snowpark_dataframe_utils import (
+    cast_snowpark_dataframe,
+)
+from snowflake.ml._internal.utils.temp_file_utils import get_temp_file_path
+from snowflake.ml.modeling._internal.model_specifications import (
+    ModelSpecifications,
+    ModelSpecificationsBuilder,
+)
+from snowflake.ml.modeling._internal.snowpark_trainer import SnowparkModelTrainer
+from snowflake.snowpark import (
+    DataFrame,
+    Session,
+    exceptions as snowpark_exceptions,
+    functions as F,
+)
+from snowflake.snowpark._internal.utils import (
+    TempObjectType,
+    random_name_for_temp_object,
+)
+
+_PROJECT = "ModelDevelopment"
+
+
+def get_data_iterator(
+    file_paths: List[str],
+    batch_size: int,
+    input_cols: List[str],
+    label_cols: List[str],
+    sample_weight_col: Optional[str] = None,
+) -> Any:
+    from typing import List, Optional
+
+    import xgboost
+
+    class ParquetDataIterator(xgboost.DataIter):
+        """
+        This iterator reads parquet data stored in a specified files and returns
+        deserialized data, enabling seamless integration with the xgboost framework for
+        machine learning tasks.
+        """
+
+        def __init__(
+            self,
+            file_paths: List[str],
+            batch_size: int,
+            input_cols: List[str],
+            label_cols: List[str],
+            sample_weight_col: Optional[str] = None,
+        ) -> None:
+            """
+            Initialize the DataIterator.
+
+            Args:
+                file_paths: List of file paths containing the data.
+                batch_size: Target number of rows in each batch.
+                input_cols: The name(s) of one or more columns in a DataFrame containing a feature to be used for
+                    training.
+                label_cols: The name(s) of one or more columns in a DataFrame representing the target variable(s)
+                    to learn.
+                sample_weight_col: The column name representing the weight of training examples.
+            """
+            self._file_paths = file_paths
+            self._batch_size = batch_size
+            self._input_cols = input_cols
+            self._label_cols = label_cols
+            self._sample_weight_col = sample_weight_col
+
+            # File index
+            self._it = 0
+            # Pandas dataframe containing temp data
+            self._df = None
+            # XGBoost will generate some cache files under current directory with the prefix
+            # "cache"
+            cache_dir_name = tempfile.mkdtemp()
+            super().__init__(cache_prefix=os.path.join(cache_dir_name, "cache"))
+
+        def next(self, batch_consumer_fn) -> int: # type: ignore[no-untyped-def]
+            """Advance the iterator by 1 step and pass the data to XGBoost's batch_consumer_fn.
+            This function is called by XGBoost during the construction of ``DMatrix``
+
+            Args:
+                batch_consumer_fn: batch consumer function
+
+            Returns:
+                0 if there is no more data, else 1.
+            """
+            while (self._df is None) or (self._df.shape[0] < self._batch_size):
+                # Read files and append data to temp df until batch size is reached.
+                if self._it == len(self._file_paths):
+                    break
+                new_df = pq.read_table(self._file_paths[self._it]).to_pandas()
+                self._it += 1
+
+                if self._df is None:
+                    self._df = new_df
+                else:
+                    self._df = pd.concat([self._df, new_df], ignore_index=True)
+
+            if (self._df is None) or (self._df.shape[0] == 0):
+                # No more data
+                return 0
+
+            # Slice the temp df and save the remainder in the temp df
+            batch_end_index = min(self._batch_size, self._df.shape[0])
+            batch_df = self._df.iloc[:batch_end_index]
+            self._df = self._df.truncate(before=batch_end_index).reset_index(drop=True)
+
+            # TODO(snandamuri): Make it proper to support categorical features, etc.
+            func_args = {
+                "data": batch_df[self._input_cols],
+                "label": batch_df[self._label_cols].squeeze(),
+            }
+            if self._sample_weight_col is not None:
+                func_args["weight"] = batch_df[self._sample_weight_col].squeeze()
+
+            batch_consumer_fn(**func_args)
+            # Return 1 to let XGBoost know we haven't seen all the files yet.
+            return 1
+
+        def reset(self) -> None:
+            """Reset the iterator to its beginning"""
+            self._it = 0
+
+    return ParquetDataIterator(
+        file_paths=file_paths,
+        batch_size=batch_size,
+        input_cols=input_cols,
+        label_cols=label_cols,
+        sample_weight_col=sample_weight_col,
+    )
+
+
+def train_xgboost_model(
+    estimator: object,
+    file_paths: List[str],
+    batch_size: int,
+    input_cols: List[str],
+    label_cols: List[str],
+    sample_weight_col: Optional[str] = None,
+) -> object:
+    """
+    Function to train XGBoost models using the external memory version of XGBoost.
+    """
+    import xgboost
+
+    def _objective_decorator(func): # type: ignore[no-untyped-def]
+        def inner(preds, dmatrix): # type: ignore[no-untyped-def]
+            """internal function"""
+            labels = dmatrix.get_label()
+            return func(labels, preds)
+
+        return inner
+
+    assert isinstance(estimator, xgboost.XGBModel)
+    params = estimator.get_xgb_params()
+    obj = None
+
+    if isinstance(estimator, xgboost.XGBClassifier):
+        # TODO (snandamuri): Find better way to get expected_classes
+        # Set: self.classes_, self.n_classes_
+        expected_classes = pd.unique(pq.read_table(file_paths[0]).to_pandas()[label_cols].squeeze())
+        estimator.n_classes_ = len(expected_classes)
+        if callable(estimator.objective):
+            obj = _objective_decorator(estimator.objective) # type: ignore[no-untyped-call]
+            # Use default value. Is it really not used ?
+            params["objective"] = "binary:logistic"
+
+        if len(expected_classes) > 2:
+            # Switch to using a multiclass objective in the underlying XGB instance
+            if params.get("objective", None) != "multi:softmax":
+                params["objective"] = "multi:softprob"
+            params["num_class"] = len(expected_classes)
+
+    if "tree_method" not in params.keys() or params["tree_method"] is None or params["tree_method"].lower() == "exact":
+        params["tree_method"] = "hist"
+
+    if (
+        "grow_policy" not in params.keys()
+        or params["grow_policy"] is None
+        or params["grow_policy"].lower() != "depthwise"
+    ):
+        params["grow_policy"] = "depthwise"
+
+    it = get_data_iterator(
+        file_paths=file_paths,
+        batch_size=batch_size,
+        input_cols=input_cols,
+        label_cols=label_cols,
+        sample_weight_col=sample_weight_col,
+    )
+    Xy = xgboost.DMatrix(it)
+    estimator._Booster = xgboost.train(
+        params,
+        Xy,
+        estimator.get_num_boosting_rounds(),
+        evals=[],
+        early_stopping_rounds=estimator.early_stopping_rounds,
+        evals_result=None,
+        obj=obj,
+        custom_metric=estimator.eval_metric,
+        verbose_eval=None,
+        xgb_model=None,
+        callbacks=None,
+    )
+    return estimator
+
+
+cp.register_pickle_by_value(inspect.getmodule(get_data_iterator))
+cp.register_pickle_by_value(inspect.getmodule(train_xgboost_model))
+
+
+class XGBoostExternalMemoryTrainer(SnowparkModelTrainer):
+    """
+    When working with large datasets, training XGBoost models traditionally requires loading the entire dataset into
+    memory, which can be costly and sometimes infeasible due to memory constraints. To solve this problem, XGBoost
+    provides support for loading data from external memory using a built-in data parser. With this feature enabled,
+    the training process occurs in a two-step approach:
+        Preprocessing Step: Input data is read and parsed into an internal format, such as CSR, CSC, or sorted CSC.
+            Processed state is appended to an in-memory buffer. Once the buffer reaches a predefined size, it is
+            written out to disk as a page.
+        Tree Construction Step: During the tree construction phase, the data pages stored on disk are streamed via
+            a multi-threaded pre-fetcher, allowing the model to efficiently access and process the data without
+            overloading memory.
+    """
+
+    def __init__(
+        self,
+        estimator: object,
+        dataset: DataFrame,
+        session: Session,
+        input_cols: List[str],
+        label_cols: Optional[List[str]],
+        sample_weight_col: Optional[str],
+        autogenerated: bool = False,
+        subproject: str = "",
+        batch_size: int = 10000,
+    ) -> None:
+        """
+        Initializes the XGBoostExternalMemoryTrainer with a model, a Snowpark DataFrame, feature, and label column
+        names, etc.
+
+        Args:
+            estimator: SKLearn compatible estimator or transformer object.
+            dataset: The dataset used for training the model.
+            session: Snowflake session object to be used for training.
+            input_cols: The name(s) of one or more columns in a DataFrame containing a feature to be used for training.
+            label_cols: The name(s) of one or more columns in a DataFrame representing the target variable(s) to learn.
+            sample_weight_col: The column name representing the weight of training examples.
+            autogenerated: A boolean denoting if the trainer is being used by autogenerated code or not.
+            subproject: subproject name to be used in telemetry.
+            batch_size: Number of the rows in the each batch processed during training.
+        """
+        super().__init__(
+            estimator=estimator,
+            dataset=dataset,
+            session=session,
+            input_cols=input_cols,
+            label_cols=label_cols,
+            sample_weight_col=sample_weight_col,
+            autogenerated=autogenerated,
+            subproject=subproject,
+        )
+        self._batch_size = batch_size
+
+    def _get_xgb_external_memory_fit_wrapper_sproc(
+        self,
+        model_spec: ModelSpecifications,
+        session: Session,
+        statement_params: Dict[str, str],
+        import_file_paths: List[str],
+    ) -> Any:
+        fit_sproc_name = random_name_for_temp_object(TempObjectType.PROCEDURE)
+
+        relaxed_dependencies = pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel(
+            pkg_versions=model_spec.pkgDependencies, session=self.session
+        )
+
+        @F.sproc(
+            is_permanent=False,
+            name=fit_sproc_name,
+            packages=list(["snowflake-snowpark-python"] + relaxed_dependencies),
+            replace=True,
+            session=session,
+            statement_params=statement_params,
+            anonymous=True,
+            imports=list(import_file_paths),
+        ) # type: ignore[misc]
+        def fit_wrapper_sproc(
+            session: Session,
+            stage_transform_file_name: str,
+            stage_result_file_name: str,
+            dataset_stage_name: str,
+            batch_size: int,
+            input_cols: List[str],
+            label_cols: List[str],
+            sample_weight_col: Optional[str],
+            statement_params: Dict[str, str],
+        ) -> str:
+            import os
+            import sys
+
+            import cloudpickle as cp
+
+            local_transform_file_name = get_temp_file_path()
+
+            session.file.get(stage_transform_file_name, local_transform_file_name, statement_params=statement_params)
+
+            local_transform_file_path = os.path.join(
+                local_transform_file_name, os.listdir(local_transform_file_name)[0]
+            )
+            with open(local_transform_file_path, mode="r+b") as local_transform_file_obj:
+                estimator = cp.load(local_transform_file_obj)
+
+            data_files = [
+                os.path.join(sys._xoptions["snowflake_import_directory"], filename)
+                for filename in os.listdir(sys._xoptions["snowflake_import_directory"])
+                if filename.startswith(dataset_stage_name)
+            ]
+
+            estimator = train_xgboost_model(
+                estimator=estimator,
+                file_paths=data_files,
+                batch_size=batch_size,
+                input_cols=input_cols,
+                label_cols=label_cols,
+                sample_weight_col=sample_weight_col,
+            )
+
+            local_result_file_name = get_temp_file_path()
+            with open(local_result_file_name, mode="w+b") as local_result_file_obj:
+                cp.dump(estimator, local_result_file_obj)
+
+            session.file.put(
+                local_result_file_name,
+                stage_result_file_name,
+                auto_compress=False,
+                overwrite=True,
+                statement_params=statement_params,
+            )
+
+            # Note: you can add something like + "|" + str(df) to the return string
+            # to pass debug information to the caller.
+            return str(os.path.basename(local_result_file_name))
+
+        return fit_wrapper_sproc
+
+    def _write_training_data_to_stage(self, dataset_stage_name: str) -> List[str]:
+        """
+        Materializes the training to the specified stage and returns the list of stage file paths.
+
+        Args:
+            dataset_stage_name: Target stage to materialize training data.
+
+        Returns:
+            List of stage file paths that contain the materialized data.
+        """
+        # Stage data.
+        dataset = cast_snowpark_dataframe(self.dataset)
+        remote_file_path = f"{dataset_stage_name}/{dataset_stage_name}.parquet"
+        copy_response = dataset.write.copy_into_location( # type:ignore[call-overload]
+            remote_file_path, file_format_type="parquet", header=True, overwrite=True
+        )
+        ResultValidator(result=copy_response).has_dimensions(expected_rows=1).validate()
+        data_file_paths = [f"@{row.name}" for row in self.session.sql(f"LIST @{dataset_stage_name}").collect()]
+        return data_file_paths
+
+    def train(self) -> object:
+        """
+        Runs hyper parameter optimization by distributing the tasks across warehouse.
+
+        Returns:
+            Trained model
+
+        Raises:
+            SnowflakeMLException: For known types of user and system errors.
+            e: For every unexpected exception from SnowflakeClient.
+        """
+        temp_stage_name = self._create_temp_stage()
+        (stage_transform_file_name, stage_result_file_name) = self._upload_model_to_stage(stage_name=temp_stage_name)
+        data_file_paths = self._write_training_data_to_stage(dataset_stage_name=temp_stage_name)
+
+        # Call fit sproc
+        statement_params = telemetry.get_function_usage_statement_params(
+            project=_PROJECT,
+            subproject=self._subproject,
+            function_name=telemetry.get_statement_params_full_func_name(inspect.currentframe(), self._class_name),
+            api_calls=[Session.call],
+            custom_tags=None,
+        )
+
+        model_spec = ModelSpecificationsBuilder.build(model=self.estimator)
+        fit_wrapper = self._get_xgb_external_memory_fit_wrapper_sproc(
+            model_spec=model_spec,
+            session=self.session,
+            statement_params=statement_params,
+            import_file_paths=data_file_paths,
+        )
+
+        try:
+            sproc_export_file_name = fit_wrapper(
+                self.session,
+                stage_transform_file_name,
+                stage_result_file_name,
+                temp_stage_name,
+                self._batch_size,
+                self.input_cols,
+                self.label_cols,
+                self.sample_weight_col,
+                statement_params,
+            )
+        except snowpark_exceptions.SnowparkClientException as e:
+            if "fit() missing 1 required positional argument: 'y'" in str(e):
+                raise exceptions.SnowflakeMLException(
+                    error_code=error_codes.NOT_FOUND,
+                    original_exception=RuntimeError(modeling_error_messages.ATTRIBUTE_NOT_SET.format("label_cols")),
+                ) from e
+            raise e
+
+        if "|" in sproc_export_file_name:
+            fields = sproc_export_file_name.strip().split("|")
+            sproc_export_file_name = fields[0]
+
+        return self._fetch_model_from_stage(
+            dir_path=stage_result_file_name,
+            file_name=sproc_export_file_name,
+            statement_params=statement_params,
+        )
snowflake/ml/modeling/calibration/calibrated_classifier_cv.py
@@ -54,6 +54,18 @@ _PROJECT = "ModelDevelopment"
 _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.calibration".replace("sklearn.", "").split("_")])
 
 
+def _is_fit_predict_method_enabled() -> Callable[[Any], bool]:
+    def check(self: BaseTransformer) -> TypeGuard[Callable[..., object]]:
+        return False and callable(getattr(self._sklearn_object, "fit_predict", None))
+    return check
+
+
+def _is_fit_transform_method_enabled() -> Callable[[Any], bool]:
+    def check(self: BaseTransformer) -> TypeGuard[Callable[..., object]]:
+        return False and callable(getattr(self._sklearn_object, "fit_transform", None))
+    return check
+
+
 class CalibratedClassifierCV(BaseTransformer):
     r"""Probability calibration with isotonic regression or logistic regression
     For more details on this class, see [sklearn.calibration.CalibratedClassifierCV]
@@ -192,7 +204,9 @@ class CalibratedClassifierCV(BaseTransformer):
         self.set_label_cols(label_cols)
         self.set_passthrough_cols(passthrough_cols)
         self.set_drop_input_cols(drop_input_cols)
-        self.set_sample_weight_col(sample_weight_col)
+        self.set_sample_weight_col(sample_weight_col)
+        self._use_external_memory_version = False
+        self._batch_size = -1
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
         deps = deps | gather_dependencies(estimator)
         deps = deps | gather_dependencies(base_estimator)
@@ -275,11 +289,6 @@ class CalibratedClassifierCV(BaseTransformer):
         if isinstance(dataset, DataFrame):
             session = dataset._session
             assert session is not None # keep mypy happy
-            # Validate that key package version in user workspace are supported in snowflake conda channel
-            # If customer doesn't have package in conda channel, replace the ones have the closest versions
-            self._deps = pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel(
-                pkg_versions=self._get_dependencies(), session=session, subproject=_SUBPROJECT)
-
             # Specify input columns so column pruning will be enforced
             selected_cols = self._get_active_columns()
             if len(selected_cols) > 0:
@@ -307,7 +316,9 @@ class CalibratedClassifierCV(BaseTransformer):
             label_cols=self.label_cols,
             sample_weight_col=self.sample_weight_col,
             autogenerated=self._autogenerated,
-            subproject=_SUBPROJECT
+            subproject=_SUBPROJECT,
+            use_external_memory_version=self._use_external_memory_version,
+            batch_size=self._batch_size,
         )
         self._sklearn_object = model_trainer.train()
         self._is_fitted = True
@@ -578,6 +589,22 @@ class CalibratedClassifierCV(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
         output_df = self._batch_inference(
             dataset=dataset,
             inference_method="transform",
@@ -593,8 +620,8 @@ class CalibratedClassifierCV(BaseTransformer):
 
         return output_df
 
-    @available_if(
-    def fit_predict(self, dataset: Union[DataFrame, pd.DataFrame]) -> npt.NDArray[Any]:
+    @available_if(_is_fit_predict_method_enabled()) # type: ignore[misc]
+    def fit_predict(self, dataset: Union[DataFrame, pd.DataFrame]) -> Union[Any, npt.NDArray[Any]]:
         """ Method not supported for this class.
 
 
@@ -607,13 +634,21 @@ class CalibratedClassifierCV(BaseTransformer):
         Returns:
             Predicted dataset.
         """
-
-
-
-
-
-
-
+        self.fit(dataset)
+        assert self._sklearn_object is not None
+        return self._sklearn_object.labels_
+
+
+    @available_if(_is_fit_transform_method_enabled()) # type: ignore[misc]
+    def fit_transform(self, dataset: Union[DataFrame, pd.DataFrame]) -> Union[Any, npt.NDArray[Any]]:
+        """
+        Returns:
+            Transformed dataset.
+        """
+        self.fit(dataset)
+        assert self._sklearn_object is not None
+        return self._sklearn_object.embedding_
+
 
     def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
snowflake/ml/modeling/cluster/affinity_propagation.py
@@ -54,6 +54,18 @@ _PROJECT = "ModelDevelopment"
 _SUBPROJECT = "".join([s.capitalize() for s in "sklearn.cluster".replace("sklearn.", "").split("_")])
 
 
+def _is_fit_predict_method_enabled() -> Callable[[Any], bool]:
+    def check(self: BaseTransformer) -> TypeGuard[Callable[..., object]]:
+        return True and callable(getattr(self._sklearn_object, "fit_predict", None))
+    return check
+
+
+def _is_fit_transform_method_enabled() -> Callable[[Any], bool]:
+    def check(self: BaseTransformer) -> TypeGuard[Callable[..., object]]:
+        return False and callable(getattr(self._sklearn_object, "fit_transform", None))
+    return check
+
+
 class AffinityPropagation(BaseTransformer):
     r"""Perform Affinity Propagation Clustering of data
     For more details on this class, see [sklearn.cluster.AffinityPropagation]
@@ -167,7 +179,9 @@ class AffinityPropagation(BaseTransformer):
         self.set_label_cols(label_cols)
         self.set_passthrough_cols(passthrough_cols)
         self.set_drop_input_cols(drop_input_cols)
-        self.set_sample_weight_col(sample_weight_col)
+        self.set_sample_weight_col(sample_weight_col)
+        self._use_external_memory_version = False
+        self._batch_size = -1
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
 
         self._deps = list(deps)
@@ -250,11 +264,6 @@ class AffinityPropagation(BaseTransformer):
         if isinstance(dataset, DataFrame):
             session = dataset._session
             assert session is not None # keep mypy happy
-            # Validate that key package version in user workspace are supported in snowflake conda channel
-            # If customer doesn't have package in conda channel, replace the ones have the closest versions
-            self._deps = pkg_version_utils.get_valid_pkg_versions_supported_in_snowflake_conda_channel(
-                pkg_versions=self._get_dependencies(), session=session, subproject=_SUBPROJECT)
-
             # Specify input columns so column pruning will be enforced
             selected_cols = self._get_active_columns()
             if len(selected_cols) > 0:
@@ -282,7 +291,9 @@ class AffinityPropagation(BaseTransformer):
             label_cols=self.label_cols,
             sample_weight_col=self.sample_weight_col,
            autogenerated=self._autogenerated,
-            subproject=_SUBPROJECT
+            subproject=_SUBPROJECT,
+            use_external_memory_version=self._use_external_memory_version,
+            batch_size=self._batch_size,
         )
         self._sklearn_object = model_trainer.train()
         self._is_fitted = True
@@ -553,6 +564,22 @@ class AffinityPropagation(BaseTransformer):
                 # each row containing a list of values.
                 expected_dtype = "ARRAY"
 
+            # If we were unable to assign a type to this transform in the factory, infer the type here.
+            if expected_dtype == "":
+                # If this is a clustering transformer, if the number of output columns does not equal the number of clusters the response will be an "ARRAY"
+                if hasattr(self._sklearn_object, "n_clusters") and getattr(self._sklearn_object, "n_clusters") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                # If this is a decomposition transformer, if the number of output columns does not equal the number of components the response will be an "ARRAY"
+                elif hasattr(self._sklearn_object, "n_components") and getattr(self._sklearn_object, "n_components") != len(self.output_cols):
+                    expected_dtype = "ARRAY"
+                else:
+                    output_types = [signature.as_snowpark_type() for signature in _infer_signature(dataset[self.input_cols], "output", use_snowflake_identifiers=True)]
+                    # We can only infer the output types from the input types if the following two statemetns are true:
+                    # 1) All of the output types are the same. Otherwise, we still have to fall back to variant because `_sklearn_inference` only accepts one type.
+                    # 2) The length of the input columns equals the length of the output columns. Otherwise the transform will likely result in an `ARRAY`.
+                    if all(x == output_types[0] for x in output_types) and len(output_types) == len(self.output_cols):
+                        expected_dtype = convert_sp_to_sf_type(output_types[0])
+
         output_df = self._batch_inference(
             dataset=dataset,
             inference_method="transform",
@@ -568,8 +595,8 @@ class AffinityPropagation(BaseTransformer):
 
         return output_df
 
-    @available_if(
-    def fit_predict(self, dataset: Union[DataFrame, pd.DataFrame]) -> npt.NDArray[Any]:
+    @available_if(_is_fit_predict_method_enabled()) # type: ignore[misc]
+    def fit_predict(self, dataset: Union[DataFrame, pd.DataFrame]) -> Union[Any, npt.NDArray[Any]]:
         """ Fit clustering from features/affinity matrix; return cluster labels
         For more details on this function, see [sklearn.cluster.AffinityPropagation.fit_predict]
         (https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AffinityPropagation.html#sklearn.cluster.AffinityPropagation.fit_predict)
@@ -584,13 +611,21 @@ class AffinityPropagation(BaseTransformer):
         Returns:
             Predicted dataset.
         """
-
-
-
-
-
-
-
+        self.fit(dataset)
+        assert self._sklearn_object is not None
+        return self._sklearn_object.labels_
+
+
+    @available_if(_is_fit_transform_method_enabled()) # type: ignore[misc]
+    def fit_transform(self, dataset: Union[DataFrame, pd.DataFrame]) -> Union[Any, npt.NDArray[Any]]:
+        """
+        Returns:
+            Transformed dataset.
+        """
+        self.fit(dataset)
+        assert self._sklearn_object is not None
+        return self._sklearn_object.embedding_
+
 
     def _get_output_column_names(self, output_cols_prefix: str, output_cols: Optional[List[str]] = None) -> List[str]:
         """ Returns the list of output columns for predict_proba(), decision_function(), etc.. functions.
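
Both autogenerated estimator files above gate `fit_predict`/`fit_transform` behind module-level check factories (`_is_fit_predict_method_enabled`, `_is_fit_transform_method_enabled`) whose hard-coded `True`/`False` flag decides, per estimator, whether scikit-learn's `available_if` exposes the method on the wrapper at all (for example `True` for `AffinityPropagation.fit_predict`, `False` for both on `CalibratedClassifierCV`), replacing the previous `@available_if(...)` decorators shown truncated in the removed lines. A rough sketch of how that gating behaves is below; the wrapper class and its `_sklearn_object` attribute are hypothetical stand-ins, not the package's actual base class.

```python
# Sketch of available_if gating as used by the generated wrappers (illustration only).
from typing import Any, Callable

from sklearn.cluster import AffinityPropagation
from sklearn.utils.metaestimators import available_if


def _is_fit_predict_method_enabled(enabled: bool) -> Callable[[Any], bool]:
    # The generated code hard-codes the enabled flag per estimator instead of taking a parameter.
    def check(self: "WrapperSketch") -> bool:
        return enabled and callable(getattr(self._sklearn_object, "fit_predict", None))

    return check


class WrapperSketch:
    def __init__(self, sklearn_object: Any) -> None:
        self._sklearn_object = sklearn_object

    @available_if(_is_fit_predict_method_enabled(enabled=True))
    def fit_predict(self, X: Any) -> Any:
        # Reachable only when the check passes; otherwise attribute access raises
        # AttributeError, so hasattr(wrapper, "fit_predict") reports False.
        return self._sklearn_object.fit_predict(X)


# hasattr(WrapperSketch(AffinityPropagation()), "fit_predict") -> True
# With enabled=False the same attribute lookup raises AttributeError instead.
```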