snowflake-ml-python 1.6.3__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between the two package versions.
Files changed (150)
  1. snowflake/ml/_internal/telemetry.py +4 -2
  2. snowflake/ml/_internal/utils/import_utils.py +31 -0
  3. snowflake/ml/_internal/utils/snowpark_dataframe_utils.py +13 -0
  4. snowflake/ml/data/_internal/arrow_ingestor.py +8 -0
  5. snowflake/ml/data/data_connector.py +1 -1
  6. snowflake/ml/data/torch_utils.py +33 -14
  7. snowflake/ml/feature_store/examples/airline_features/features/plane_features.py +5 -3
  8. snowflake/ml/feature_store/examples/airline_features/features/weather_features.py +7 -5
  9. snowflake/ml/feature_store/examples/citibike_trip_features/features/station_feature.py +4 -2
  10. snowflake/ml/feature_store/examples/citibike_trip_features/features/trip_feature.py +3 -1
  11. snowflake/ml/feature_store/examples/example_helper.py +6 -3
  12. snowflake/ml/feature_store/examples/new_york_taxi_features/features/location_features.py +4 -2
  13. snowflake/ml/feature_store/examples/new_york_taxi_features/features/trip_features.py +4 -2
  14. snowflake/ml/feature_store/examples/wine_quality_features/features/managed_wine_features.py +3 -1
  15. snowflake/ml/feature_store/examples/wine_quality_features/features/static_wine_features.py +3 -1
  16. snowflake/ml/feature_store/feature_store.py +1 -2
  17. snowflake/ml/feature_store/feature_view.py +5 -1
  18. snowflake/ml/model/_client/model/model_version_impl.py +144 -10
  19. snowflake/ml/model/_client/ops/model_ops.py +25 -6
  20. snowflake/ml/model/_client/ops/service_ops.py +33 -28
  21. snowflake/ml/model/_client/service/model_deployment_spec.py +19 -8
  22. snowflake/ml/model/_client/service/model_deployment_spec_schema.py +3 -1
  23. snowflake/ml/model/_client/sql/model.py +14 -0
  24. snowflake/ml/model/_client/sql/service.py +6 -18
  25. snowflake/ml/model/_model_composer/model_composer.py +2 -0
  26. snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +4 -0
  27. snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +1 -0
  28. snowflake/ml/model/_model_composer/model_method/model_method.py +1 -1
  29. snowflake/ml/model/_packager/model_handlers/_utils.py +5 -1
  30. snowflake/ml/model/_packager/model_handlers/catboost.py +3 -6
  31. snowflake/ml/model/_packager/model_handlers/custom.py +2 -0
  32. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +10 -1
  33. snowflake/ml/model/_packager/model_handlers/lightgbm.py +3 -6
  34. snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +8 -1
  35. snowflake/ml/model/_packager/model_handlers/sklearn.py +3 -6
  36. snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +7 -65
  37. snowflake/ml/model/_packager/model_handlers/xgboost.py +10 -40
  38. snowflake/ml/model/_packager/model_packager.py +0 -11
  39. snowflake/ml/model/_packager/{model_handlers/model_objective_utils.py → model_task/model_task_utils.py} +13 -25
  40. snowflake/ml/model/_signatures/pandas_handler.py +16 -0
  41. snowflake/ml/model/custom_model.py +47 -7
  42. snowflake/ml/model/model_signature.py +2 -0
  43. snowflake/ml/model/type_hints.py +8 -0
  44. snowflake/ml/modeling/_internal/estimator_utils.py +13 -0
  45. snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +7 -2
  46. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +16 -5
  47. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +8 -2
  48. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +9 -3
  49. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +1 -8
  50. snowflake/ml/modeling/cluster/agglomerative_clustering.py +17 -19
  51. snowflake/ml/modeling/cluster/dbscan.py +5 -2
  52. snowflake/ml/modeling/cluster/feature_agglomeration.py +7 -19
  53. snowflake/ml/modeling/cluster/k_means.py +14 -19
  54. snowflake/ml/modeling/cluster/mini_batch_k_means.py +3 -3
  55. snowflake/ml/modeling/cluster/optics.py +6 -6
  56. snowflake/ml/modeling/cluster/spectral_clustering.py +4 -3
  57. snowflake/ml/modeling/compose/column_transformer.py +15 -5
  58. snowflake/ml/modeling/compose/transformed_target_regressor.py +7 -6
  59. snowflake/ml/modeling/covariance/elliptic_envelope.py +1 -1
  60. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +1 -1
  61. snowflake/ml/modeling/covariance/min_cov_det.py +2 -2
  62. snowflake/ml/modeling/covariance/oas.py +1 -1
  63. snowflake/ml/modeling/decomposition/kernel_pca.py +2 -2
  64. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +5 -12
  65. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +5 -12
  66. snowflake/ml/modeling/decomposition/pca.py +28 -15
  67. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +6 -0
  68. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +1 -12
  69. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +1 -11
  70. snowflake/ml/modeling/ensemble/bagging_classifier.py +1 -8
  71. snowflake/ml/modeling/ensemble/bagging_regressor.py +1 -8
  72. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +21 -2
  73. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +18 -2
  74. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +2 -0
  75. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +2 -0
  76. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +21 -8
  77. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +21 -11
  78. snowflake/ml/modeling/ensemble/random_forest_classifier.py +21 -2
  79. snowflake/ml/modeling/ensemble/random_forest_regressor.py +18 -2
  80. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +2 -1
  81. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +5 -3
  82. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +2 -2
  83. snowflake/ml/modeling/linear_model/ard_regression.py +5 -10
  84. snowflake/ml/modeling/linear_model/bayesian_ridge.py +5 -11
  85. snowflake/ml/modeling/linear_model/elastic_net.py +3 -0
  86. snowflake/ml/modeling/linear_model/elastic_net_cv.py +1 -1
  87. snowflake/ml/modeling/linear_model/lars.py +0 -10
  88. snowflake/ml/modeling/linear_model/lars_cv.py +1 -11
  89. snowflake/ml/modeling/linear_model/lasso_cv.py +1 -1
  90. snowflake/ml/modeling/linear_model/lasso_lars.py +0 -10
  91. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +1 -11
  92. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +0 -10
  93. snowflake/ml/modeling/linear_model/logistic_regression.py +28 -22
  94. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +30 -24
  95. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +1 -1
  96. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +1 -1
  97. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +4 -13
  98. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +4 -4
  99. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +1 -1
  100. snowflake/ml/modeling/linear_model/perceptron.py +3 -3
  101. snowflake/ml/modeling/linear_model/ransac_regressor.py +3 -2
  102. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +14 -6
  103. snowflake/ml/modeling/linear_model/ridge_cv.py +17 -11
  104. snowflake/ml/modeling/linear_model/sgd_classifier.py +2 -2
  105. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +5 -1
  106. snowflake/ml/modeling/linear_model/sgd_regressor.py +12 -3
  107. snowflake/ml/modeling/manifold/isomap.py +1 -1
  108. snowflake/ml/modeling/manifold/mds.py +3 -3
  109. snowflake/ml/modeling/manifold/tsne.py +10 -4
  110. snowflake/ml/modeling/metrics/classification.py +12 -16
  111. snowflake/ml/modeling/metrics/ranking.py +3 -3
  112. snowflake/ml/modeling/metrics/regression.py +3 -3
  113. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +3 -3
  114. snowflake/ml/modeling/naive_bayes/categorical_nb.py +3 -3
  115. snowflake/ml/modeling/naive_bayes/complement_nb.py +3 -3
  116. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +3 -3
  117. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +10 -4
  118. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +5 -2
  119. snowflake/ml/modeling/neighbors/local_outlier_factor.py +2 -2
  120. snowflake/ml/modeling/neighbors/nearest_centroid.py +7 -14
  121. snowflake/ml/modeling/neighbors/nearest_neighbors.py +1 -1
  122. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +6 -1
  123. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +1 -1
  124. snowflake/ml/modeling/neural_network/mlp_classifier.py +7 -1
  125. snowflake/ml/modeling/neural_network/mlp_regressor.py +3 -0
  126. snowflake/ml/modeling/pipeline/pipeline.py +16 -14
  127. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +8 -4
  128. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +9 -7
  129. snowflake/ml/modeling/svm/linear_svc.py +25 -16
  130. snowflake/ml/modeling/svm/linear_svr.py +23 -17
  131. snowflake/ml/modeling/svm/nu_svc.py +5 -3
  132. snowflake/ml/modeling/svm/nu_svr.py +3 -1
  133. snowflake/ml/modeling/svm/svc.py +9 -5
  134. snowflake/ml/modeling/svm/svr.py +3 -1
  135. snowflake/ml/modeling/tree/decision_tree_classifier.py +21 -2
  136. snowflake/ml/modeling/tree/decision_tree_regressor.py +18 -2
  137. snowflake/ml/modeling/tree/extra_tree_classifier.py +28 -9
  138. snowflake/ml/modeling/tree/extra_tree_regressor.py +18 -2
  139. snowflake/ml/monitoring/_client/{monitor_sql_client.py → model_monitor_sql_client.py} +1 -1
  140. snowflake/ml/monitoring/{_client → _manager}/model_monitor_manager.py +9 -8
  141. snowflake/ml/monitoring/{_client/model_monitor.py → model_monitor.py} +3 -3
  142. snowflake/ml/registry/_manager/model_manager.py +15 -1
  143. snowflake/ml/registry/registry.py +15 -8
  144. snowflake/ml/version.py +1 -1
  145. {snowflake_ml_python-1.6.3.dist-info → snowflake_ml_python-1.7.0.dist-info}/METADATA +81 -9
  146. {snowflake_ml_python-1.6.3.dist-info → snowflake_ml_python-1.7.0.dist-info}/RECORD +150 -150
  147. {snowflake_ml_python-1.6.3.dist-info → snowflake_ml_python-1.7.0.dist-info}/WHEEL +1 -1
  148. /snowflake/ml/monitoring/{_client/model_monitor_version.py → model_monitor_version.py} +0 -0
  149. {snowflake_ml_python-1.6.3.dist-info → snowflake_ml_python-1.7.0.dist-info}/LICENSE.txt +0 -0
  150. {snowflake_ml_python-1.6.3.dist-info → snowflake_ml_python-1.7.0.dist-info}/top_level.txt +0 -0
snowflake/ml/model/_packager/model_handlers/xgboost.py

@@ -1,7 +1,6 @@
 # mypy: disable-error-code="import"
 import os
 import warnings
-from importlib import metadata as importlib_metadata
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -16,23 +15,19 @@ from typing import (
 
 import numpy as np
 import pandas as pd
-from packaging import version
 from typing_extensions import TypeGuard, Unpack
 
 from snowflake.ml._internal import type_utils
 from snowflake.ml.model import custom_model, model_signature, type_hints as model_types
 from snowflake.ml.model._packager.model_env import model_env
-from snowflake.ml.model._packager.model_handlers import (
-    _base,
-    _utils as handlers_utils,
-    model_objective_utils,
-)
+from snowflake.ml.model._packager.model_handlers import _base, _utils as handlers_utils
 from snowflake.ml.model._packager.model_handlers_migrator import base_migrator
 from snowflake.ml.model._packager.model_meta import (
     model_blob_meta,
     model_meta as model_meta_api,
     model_meta_schema,
 )
+from snowflake.ml.model._packager.model_task import model_task_utils
 from snowflake.ml.model._signatures import numpy_handler, utils as model_signature_utils
 
 if TYPE_CHECKING:
@@ -94,23 +89,6 @@ class XGBModelHandler(_base.BaseModelHandler[Union["xgboost.Booster", "xgboost.X
 
         assert isinstance(model, xgboost.Booster) or isinstance(model, xgboost.XGBModel)
 
-        local_xgb_version = None
-
-        try:
-            local_dist = importlib_metadata.distribution("xgboost")
-            local_xgb_version = version.parse(local_dist.version)
-        except importlib_metadata.PackageNotFoundError:
-            pass
-
-        if local_xgb_version and local_xgb_version >= version.parse("2.1.0") and enable_explainability:
-            warnings.warn(
-                f"This version of xgboost {local_xgb_version} does not work with shap 0.42.1."
-                + "If you want model explanations, lower the xgboost version to <2.1.0.",
-                category=UserWarning,
-                stacklevel=1,
-            )
-            enable_explainability = False
-
         if not is_sub_model:
             target_methods = handlers_utils.get_target_methods(
                 model=model,
@@ -139,7 +117,7 @@ class XGBModelHandler(_base.BaseModelHandler[Union["xgboost.Booster", "xgboost.X
                 sample_input_data=sample_input_data,
                 get_prediction_fn=get_prediction,
             )
-        model_task_and_output = model_objective_utils.get_model_task_and_output_type(model)
+        model_task_and_output = model_task_utils.get_model_task_and_output_type(model)
         model_meta.task = handlers_utils.validate_model_task(model_meta.task, model_task_and_output.task)
         if enable_explainability:
             model_meta = handlers_utils.add_explain_method_signature(
@@ -187,23 +165,15 @@ class XGBModelHandler(_base.BaseModelHandler[Union["xgboost.Booster", "xgboost.X
             ],
             check_local_version=True,
         )
-        if local_xgb_version and local_xgb_version >= version.parse("2.0.0") and enable_explainability:
-            model_meta.env.include_if_absent(
-                [
-                    model_env.ModelDependency(requirement="xgboost==2.0.*", pip_name="xgboost"),
-                ],
-                check_local_version=False,
-            )
-        else:
-            model_meta.env.include_if_absent(
-                [
-                    model_env.ModelDependency(requirement="xgboost", pip_name="xgboost"),
-                ],
-                check_local_version=True,
-            )
+        model_meta.env.include_if_absent(
+            [
+                model_env.ModelDependency(requirement="xgboost", pip_name="xgboost"),
+            ],
+            check_local_version=True,
+        )
 
         if enable_explainability:
-            model_meta.env.include_if_absent([model_env.ModelDependency(requirement="shap", pip_name="shap")])
+            model_meta.env.include_if_absent([model_env.ModelDependency(requirement="shap>=0.46.0", pip_name="shap")])
             model_meta.explain_algorithm = model_meta_schema.ModelExplainAlgorithm.SHAP
         model_meta.env.cuda_version = kwargs.get("cuda_version", model_env.DEFAULT_CUDA_VERSION)
 
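The removed blocks above show that 1.7.0 drops the local-xgboost version gating (which silently disabled explainability on xgboost >= 2.1.0 and pinned xgboost==2.0.* in the model environment) and instead requires shap>=0.46.0, which supports current xgboost releases. A minimal sketch of what this enables, assuming the public registry API; session, X_train, and y_train are placeholders:

    import xgboost
    from snowflake.ml.registry import Registry

    model = xgboost.XGBClassifier().fit(X_train, y_train)
    reg = Registry(session=session)
    mv = reg.log_model(
        model,
        model_name="xgb_with_explanations",
        version_name="v1",
        sample_input_data=X_train,
        # No longer forced off on xgboost>=2.1.0; shap>=0.46.0 is added instead.
        options={"enable_explainability": True},
    )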
snowflake/ml/model/_packager/model_packager.py

@@ -61,17 +61,6 @@ class ModelPackager:
         if not options:
             options = model_types.BaseModelSaveOption()
 
-        # here handling the case of enable_explainability is False/None
-        enable_explainability = options.get("enable_explainability", None)
-        if enable_explainability is False or enable_explainability is None:
-            if (signatures is not None) and (sample_input_data is not None):
-                raise snowml_exceptions.SnowflakeMLException(
-                    error_code=error_codes.INVALID_ARGUMENT,
-                    original_exception=ValueError(
-                        "Signatures and sample_input_data both cannot be specified at the same time."
-                    ),
-                )
-
         handler = model_handler.find_handler(model)
         if handler is None:
             raise snowml_exceptions.SnowflakeMLException(
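With this guard removed, signatures and sample_input_data are no longer mutually exclusive at packaging time, presumably so that sample data can still serve other purposes (such as explanation background data) when explicit signatures are given. A hedged sketch against the public log_model surface; reg, model, and X_train are placeholders carried over from the previous sketch:

    from snowflake.ml.model import model_signature

    sig = model_signature.infer_signature(input_data=X_train, output_data=model.predict(X_train))
    mv = reg.log_model(
        model,
        model_name="my_model",
        version_name="v2",
        signatures={"predict": sig},  # explicit signatures...
        sample_input_data=X_train,    # ...no longer rejected alongside sample data
    )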
snowflake/ml/model/_packager/{model_handlers/model_objective_utils.py → model_task/model_task_utils.py}

@@ -128,42 +128,30 @@ def get_model_task_xgb(model: Union["xgboost.Booster", "xgboost.XGBModel"]) -> t
     return type_hints.Task.UNKNOWN
 
 
-def get_model_task_and_output_type(model: Any) -> ModelTaskAndOutputType:
+def _get_model_task(model: Any) -> type_hints.Task:
     if type_utils.LazyType("xgboost.Booster").isinstance(model) or type_utils.LazyType("xgboost.XGBModel").isinstance(
         model
     ):
-        task = get_model_task_xgb(model)
-        output_type = model_signature.DataType.DOUBLE
-        if task == type_hints.Task.TABULAR_MULTI_CLASSIFICATION:
-            output_type = model_signature.DataType.STRING
-        return ModelTaskAndOutputType(task=task, output_type=output_type)
+        return get_model_task_xgb(model)
 
     if type_utils.LazyType("lightgbm.Booster").isinstance(model) or type_utils.LazyType(
         "lightgbm.LGBMModel"
     ).isinstance(model):
-        task = get_model_task_lightgbm(model)
-        output_type = model_signature.DataType.DOUBLE
-        if task in [
-            type_hints.Task.TABULAR_BINARY_CLASSIFICATION,
-            type_hints.Task.TABULAR_MULTI_CLASSIFICATION,
-        ]:
-            output_type = model_signature.DataType.STRING
-        return ModelTaskAndOutputType(task=task, output_type=output_type)
+        return get_model_task_lightgbm(model)
 
     if type_utils.LazyType("catboost.CatBoost").isinstance(model):
-        task = get_model_task_catboost(model)
-        output_type = model_signature.DataType.DOUBLE
-        if task == type_hints.Task.TABULAR_MULTI_CLASSIFICATION:
-            output_type = model_signature.DataType.STRING
-        return ModelTaskAndOutputType(task=task, output_type=output_type)
+        return get_model_task_catboost(model)
 
     if type_utils.LazyType("sklearn.base.BaseEstimator").isinstance(model) or type_utils.LazyType(
         "sklearn.pipeline.Pipeline"
     ).isinstance(model):
-        task = get_task_skl(model)
-        output_type = model_signature.DataType.DOUBLE
-        if task == type_hints.Task.TABULAR_MULTI_CLASSIFICATION:
-            output_type = model_signature.DataType.STRING
-        return ModelTaskAndOutputType(task=task, output_type=output_type)
-
+        return get_task_skl(model)
     raise ValueError(f"Model type {type(model)} is not supported")
+
+
+def get_model_task_and_output_type(model: Any) -> ModelTaskAndOutputType:
+    task = _get_model_task(model)
+    output_type = model_signature.DataType.DOUBLE
+    if task == type_hints.Task.TABULAR_MULTI_CLASSIFICATION:
+        output_type = model_signature.DataType.STRING
+    return ModelTaskAndOutputType(task=task, output_type=output_type)
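Alongside the file's move from model_handlers/model_objective_utils.py to model_task/model_task_utils.py, this refactor collapses the per-framework branches into one task-to-output-type mapping. One behavior change is visible in the hunk: the old lightgbm branch mapped TABULAR_BINARY_CLASSIFICATION to a STRING output type as well, while the consolidated helper returns STRING only for TABULAR_MULTI_CLASSIFICATION. A small sketch of the internal helper as it now stands (model is any supported estimator):

    from snowflake.ml.model._packager.model_task import model_task_utils

    task_and_output = model_task_utils.get_model_task_and_output_type(model)
    task_and_output.task         # e.g. Task.TABULAR_REGRESSION
    task_and_output.output_type  # DataType.DOUBLE, or DataType.STRING for multi-class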
snowflake/ml/model/_signatures/pandas_handler.py

@@ -147,6 +147,22 @@ class PandasDataFrameHandler(base_handler.BaseDataHandler[pd.DataFrame]):
                 specs.append(core.FeatureSpec(dtype=core.DataType.STRING, name=ft_name))
             elif isinstance(data[df_col].iloc[0], bytes):
                 specs.append(core.FeatureSpec(dtype=core.DataType.BYTES, name=ft_name))
+            elif isinstance(df_col_dtype, pd.CategoricalDtype):
+                category_dtype = df_col_dtype.categories.dtype
+                if category_dtype == np.dtype("O"):
+                    if isinstance(df_col_dtype.categories[0], str):
+                        specs.append(core.FeatureSpec(dtype=core.DataType.STRING, name=ft_name))
+                    elif isinstance(df_col_dtype.categories[0], bytes):
+                        specs.append(core.FeatureSpec(dtype=core.DataType.BYTES, name=ft_name))
+                    else:
+                        raise snowml_exceptions.SnowflakeMLException(
+                            error_code=error_codes.INVALID_DATA,
+                            original_exception=ValueError(
+                                f"Data Validation Error: Unsupported type confronted in {df_col_dtype.categories[0]}"
+                            ),
+                        )
+                else:
+                    specs.append(core.FeatureSpec(dtype=core.DataType.from_numpy_type(category_dtype), name=ft_name))
             elif isinstance(data[df_col].iloc[0], np.datetime64):
                 specs.append(core.FeatureSpec(dtype=core.DataType.TIMESTAMP_NTZ, name=ft_name))
             else:
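Together with the _validate_pandas_df change further down, this adds signature support for pd.Categorical columns: the FeatureSpec is derived from the dtype of the categories instead of falling through to the unsupported-type error. A hedged sketch of the resulting behavior; infer_signature is the public entry point and the inferred types follow the branches above:

    import pandas as pd
    from snowflake.ml.model import model_signature

    df = pd.DataFrame(
        {
            "color": pd.Categorical(["red", "green", "red"]),  # object categories -> STRING
            "bucket": pd.Categorical([1, 2, 1]),               # numeric categories -> from_numpy_type
        }
    )
    sig = model_signature.infer_signature(input_data=df, output_data=[0, 1, 0])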
snowflake/ml/model/custom_model.py

@@ -1,6 +1,6 @@
 import functools
 import inspect
-from typing import Any, Callable, Coroutine, Dict, Generator, List, Optional
+from typing import Any, Callable, Coroutine, Dict, Generator, List, Optional, Union
 
 import anyio
 import pandas as pd
@@ -104,19 +104,53 @@ class ModelContext:
     def __init__(
         self,
         *,
-        artifacts: Optional[Dict[str, str]] = None,
-        models: Optional[Dict[str, model_types.SupportedModelType]] = None,
+        artifacts: Optional[Union[Dict[str, str], str, model_types.SupportedModelType]] = None,
+        models: Optional[Union[Dict[str, model_types.SupportedModelType], str, model_types.SupportedModelType]] = None,
+        **kwargs: Optional[Union[str, model_types.SupportedModelType]],
     ) -> None:
         """Initialize the model context.
 
         Args:
             artifacts: A dictionary mapping the name of the artifact to its currently available path. Defaults to None.
             models: A dictionary mapping the name of the sub-model to the corresponding model object. Defaults to None.
+            **kwargs: Additional keyword arguments to be used as artifacts or models.
+
+        Raises:
+            ValueError: Raised when the keyword argument is used as artifacts or models.
+            ValueError: Raised when the artifact name is duplicated.
+            ValueError: Raised when the model name is duplicated.
         """
-        self.artifacts: Dict[str, str] = artifacts if artifacts else dict()
-        self.model_refs: Dict[str, ModelRef] = (
-            {name: ModelRef(name, model) for name, model in models.items()} if models else dict()
-        )
+
+        self.artifacts: Dict[str, str] = dict()
+        self.model_refs: Dict[str, ModelRef] = dict()
+
+        # In case that artifacts is a dictionary, assume the original usage,
+        # which is to pass in a dictionary of artifacts.
+        # In other scenarios, (str or supported model types) we will try to parse the arguments as artifacts or models.
+        if isinstance(artifacts, dict):
+            self.artifacts = artifacts
+        elif isinstance(artifacts, str):
+            self.artifacts["artifacts"] = artifacts
+        elif artifacts is not None:
+            self.model_refs["artifacts"] = ModelRef("artifacts", artifacts)
+
+        if isinstance(models, dict):
+            self.model_refs = {name: ModelRef(name, model) for name, model in models.items()} if models else dict()
+        elif isinstance(models, str):
+            self.artifacts["models"] = models
+        elif models is not None:
+            self.model_refs["models"] = ModelRef("models", models)
+
+        # Handle any new arguments passed via kwargs
+        for key, value in kwargs.items():
+            if isinstance(value, str):
+                if key in self.artifacts:
+                    raise ValueError(f"Duplicate artifact name: {key}")
+                self.artifacts[key] = value
+            else:
+                if key in self.model_refs:
+                    raise ValueError(f"Duplicate model name: {key}")
+                self.model_refs[key] = ModelRef(key, value)
 
     def path(self, key: str) -> str:
         """Get the actual path to a specific artifact. This could be used when defining a Custom Model to retrieve
@@ -141,6 +175,12 @@ class ModelContext:
         """
         return self.model_refs[name]
 
+    def __getitem__(self, key: str) -> Union[str, ModelRef]:
+        combined: Dict[str, Union[str, ModelRef]] = {**self.artifacts, **self.model_refs}
+        if key not in combined:
+            raise KeyError(f"Key {key} not found in the kwargs, current available keys are: {combined.keys()}")
+        return combined[key]
+
 
 class CustomModel:
     """Abstract class for user defined custom model.
snowflake/ml/model/model_signature.py

@@ -214,6 +214,8 @@ def _validate_pandas_df(data: pd.DataFrame, features: Sequence[core.BaseFeatureS
         assert isinstance(feature, core.FeatureSpec)  # assert for mypy.
         ft_type = feature._dtype
         ft_shape = feature._shape
+        if isinstance(df_col_dtype, pd.CategoricalDtype):
+            df_col_dtype = df_col_dtype.categories.dtype
         if df_col_dtype != np.dtype("O"):
             if not _validate_numpy_array(data_col.to_numpy(), ft_type, strict=strict):
                 raise snowml_exceptions.SnowflakeMLException(
snowflake/ml/model/type_hints.py

@@ -298,3 +298,11 @@ class Task(Enum):
     TABULAR_MULTI_CLASSIFICATION = "TABULAR_MULTI_CLASSIFICATION"
     TABULAR_REGRESSION = "TABULAR_REGRESSION"
     TABULAR_RANKING = "TABULAR_RANKING"
+
+
+class TargetPlatform(Enum):
+    WAREHOUSE = "WAREHOUSE"
+    SNOWPARK_CONTAINER_SERVICES = "SNOWPARK_CONTAINER_SERVICES"
+
+
+SupportedTargetPlatformType = Union[TargetPlatform, str]
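These additions give deployment targets a typed spelling. registry.py also changes in this release; a plausible use is a target_platforms argument on log_model, but that hunk is not shown here, so treat the argument name as an assumption in this sketch:

    from snowflake.ml.model import type_hints

    # Either the enum or its string form should satisfy SupportedTargetPlatformType.
    platforms = [type_hints.TargetPlatform.WAREHOUSE, "SNOWPARK_CONTAINER_SERVICES"]
    mv = reg.log_model(model, model_name="m", version_name="v1", target_platforms=platforms)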
snowflake/ml/modeling/_internal/estimator_utils.py

@@ -275,3 +275,16 @@ def upload_model_to_stage(
 
     temp_file_utils.cleanup_temp_files([local_transform_file_name])
     return os.path.basename(local_transform_file_name)
+
+
+def should_include_sample_weight(estimator: object, method_name: str) -> bool:
+    # If this is a Grid Search or Randomized Search estimator, check the underlying estimator.
+    underlying_estimator = (
+        estimator.estimator if ("_search" in estimator.__module__ and hasattr(estimator, "estimator")) else estimator
+    )
+    method = getattr(underlying_estimator, method_name)
+    underlying_estimator_params = inspect.signature(method).parameters
+    if "sample_weight" in underlying_estimator_params:
+        return True
+
+    return False
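The point of the helper: search estimators such as GridSearchCV accept fit parameters only via **fit_params, so a plain signature check for sample_weight always fails even when the wrapped estimator supports it. A runnable sketch of the difference:

    import inspect

    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV

    from snowflake.ml.modeling._internal.estimator_utils import should_include_sample_weight

    search = GridSearchCV(LogisticRegression(), {"C": [0.1, 1.0]})
    print("sample_weight" in inspect.signature(search.fit).parameters)  # False
    print(should_include_sample_weight(search, "fit"))                  # True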
snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py

@@ -4,7 +4,10 @@ from typing import Any, List, Optional
 import pandas as pd
 
 from snowflake.ml._internal.exceptions import error_codes, exceptions
-from snowflake.ml.modeling._internal.estimator_utils import handle_inference_result
+from snowflake.ml.modeling._internal.estimator_utils import (
+    handle_inference_result,
+    should_include_sample_weight,
+)
 
 
 class PandasTransformHandlers:
@@ -166,6 +169,7 @@ class PandasTransformHandlers:
             SnowflakeMLException: The input column list does not have one of `X` and `X_test`.
         """
         assert hasattr(self.estimator, "score")  # make type checker happy
+
         params = inspect.signature(self.estimator.score).parameters
         if "X" in params:
             score_args = {"X": self.dataset[input_cols]}
@@ -181,7 +185,8 @@ class PandasTransformHandlers:
             label_arg_name = "Y" if "Y" in params else "y"
             score_args[label_arg_name] = self.dataset[label_cols].squeeze()
 
-        if sample_weight_col is not None and "sample_weight" in params:
+        # Sample weight is not included in search estimators parameters, check the underlying estimator.
+        if sample_weight_col is not None and should_include_sample_weight(self.estimator, "score"):
             score_args["sample_weight"] = self.dataset[sample_weight_col].squeeze()
 
         score = self.estimator.score(**score_args)
snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py

@@ -19,6 +19,7 @@ from snowflake.ml._internal.utils import (
     snowpark_dataframe_utils,
     temp_file_utils,
 )
+from snowflake.ml.modeling._internal.estimator_utils import should_include_sample_weight
 from snowflake.ml.modeling._internal.model_specifications import (
     ModelSpecificationsBuilder,
 )
@@ -38,6 +39,7 @@ from snowflake.snowpark.udtf import UDTFRegistration
 cp.register_pickle_by_value(inspect.getmodule(temp_file_utils.get_temp_file_path))
 cp.register_pickle_by_value(inspect.getmodule(identifier.get_inferred_name))
 cp.register_pickle_by_value(inspect.getmodule(snowpark_dataframe_utils.cast_snowpark_dataframe))
+cp.register_pickle_by_value(inspect.getmodule(should_include_sample_weight))
 
 _PROJECT = "ModelDevelopment"
 DEFAULT_UDTF_NJOBS = 3
@@ -393,7 +395,10 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
         import pandas as pd
         import pyarrow.parquet as pq
         from sklearn.metrics import check_scoring
-        from sklearn.metrics._scorer import _check_multimetric_scoring
+        from sklearn.metrics._scorer import (
+            _check_multimetric_scoring,
+            _MultimetricScorer,
+        )
 
         for import_name in udf_imports:
             importlib.import_module(import_name)
@@ -606,6 +611,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
             scorers = _check_multimetric_scoring(estimator.estimator, estimator.scoring)
             estimator._check_refit_for_multimetric(scorers)
             refit_metric = original_refit
+            scorers = _MultimetricScorer(scorers=scorers)
 
         estimator.scorer_ = scorers
 
@@ -638,7 +644,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
             if label_cols:
                 label_arg_name = "Y" if "Y" in argspec.args else "y"
                 args[label_arg_name] = y
-            if sample_weight_col is not None and "sample_weight" in argspec.args:
+            if sample_weight_col is not None and should_include_sample_weight(estimator, "fit"):
                 args["sample_weight"] = df[sample_weight_col].squeeze()
             estimator.refit = original_refit
             refit_start_time = time.time()
@@ -797,8 +803,11 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
         import pandas as pd
         import pyarrow.parquet as pq
         from sklearn.metrics import check_scoring
-        from sklearn.metrics._scorer import _check_multimetric_scoring
-        from sklearn.utils.validation import _check_fit_params, indexable
+        from sklearn.metrics._scorer import (
+            _check_multimetric_scoring,
+            _MultimetricScorer,
+        )
+        from sklearn.utils.validation import _check_method_params, indexable
 
         # import packages in sproc
         for import_name in udf_imports:
@@ -846,11 +855,12 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
             scorers = _check_multimetric_scoring(estimator.estimator, estimator.scoring)
             estimator._check_refit_for_multimetric(scorers)
             refit_metric = estimator.refit
+            scorers = _MultimetricScorer(scorers=scorers)
 
         # preprocess the attributes - (2) check fit_params
         groups = None
         X, y, _ = indexable(X, y, groups)
-        fit_params = _check_fit_params(X, fit_params)
+        fit_params = _check_method_params(X, fit_params)
 
         # preprocess the attributes - (3) safe clone base estimator
         base_estimator = clone(estimator.estimator)
@@ -863,6 +873,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
         fit_and_score_kwargs = dict(
             scorer=scorers,
             fit_params=fit_params,
+            score_params=None,
             return_train_score=estimator.return_train_score,
             return_n_test_samples=True,
             return_times=True,
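These hunks appear to track private scikit-learn API changes around 1.4: _check_fit_params was renamed _check_method_params, _fit_and_score grew a score_params argument, and a dict of scorers must be wrapped in _MultimetricScorer before use. A minimal sketch of the wrapping step, with the caveat that these are internal sklearn symbols and may move again between releases:

    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics._scorer import _check_multimetric_scoring, _MultimetricScorer

    scorers = _check_multimetric_scoring(LogisticRegression(), ["accuracy", "f1"])
    scorers = _MultimetricScorer(scorers=scorers)  # callable wrapper expected downstream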
snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py

@@ -18,7 +18,10 @@ from snowflake.ml._internal.utils import (
 )
 from snowflake.ml._internal.utils.query_result_checker import SqlResultValidator
 from snowflake.ml.modeling._internal import estimator_utils
-from snowflake.ml.modeling._internal.estimator_utils import handle_inference_result
+from snowflake.ml.modeling._internal.estimator_utils import (
+    handle_inference_result,
+    should_include_sample_weight,
+)
 from snowflake.snowpark import DataFrame, Session, functions as F, types as T
 from snowflake.snowpark._internal.utils import (
     TempObjectType,
@@ -28,6 +31,8 @@ from snowflake.snowpark._internal.utils import (
 cp.register_pickle_by_value(inspect.getmodule(temp_file_utils.get_temp_file_path))
 cp.register_pickle_by_value(inspect.getmodule(identifier.get_inferred_name))
 cp.register_pickle_by_value(inspect.getmodule(handle_inference_result))
+cp.register_pickle_by_value(inspect.getmodule(should_include_sample_weight))
+
 
 _PROJECT = "ModelDevelopment"
@@ -330,7 +335,8 @@ class SnowparkTransformHandlers:
             label_arg_name = "Y" if "Y" in params else "y"
             args[label_arg_name] = df[label_cols].squeeze()
 
-        if sample_weight_col is not None and "sample_weight" in params:
+        # Sample weight is not included in search estimators parameters, check the underlying estimator.
+        if sample_weight_col is not None and should_include_sample_weight(estimator, "score"):
             args["sample_weight"] = df[sample_weight_col].squeeze()
 
         result: float = estimator.score(**args)
snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py

@@ -20,7 +20,10 @@ from snowflake.ml._internal.utils import (
     temp_file_utils,
 )
 from snowflake.ml.modeling._internal import estimator_utils
-from snowflake.ml.modeling._internal.estimator_utils import handle_inference_result
+from snowflake.ml.modeling._internal.estimator_utils import (
+    handle_inference_result,
+    should_include_sample_weight,
+)
 from snowflake.ml.modeling._internal.model_specifications import (
     ModelSpecifications,
     ModelSpecificationsBuilder,
@@ -32,6 +35,7 @@ from snowflake.snowpark.stored_procedure import StoredProcedure
 cp.register_pickle_by_value(inspect.getmodule(temp_file_utils.get_temp_file_path))
 cp.register_pickle_by_value(inspect.getmodule(identifier.get_inferred_name))
 cp.register_pickle_by_value(inspect.getmodule(handle_inference_result))
+cp.register_pickle_by_value(inspect.getmodule(should_include_sample_weight))
 
 _PROJECT = "ModelDevelopment"
 _ENABLE_ANONYMOUS_SPROC = False
@@ -170,12 +174,14 @@ class SnowparkModelTrainer:
             estimator = cp.load(local_transform_file_obj)
 
             params = inspect.signature(estimator.fit).parameters
+
             args = {"X": df[input_cols]}
             if label_cols:
                 label_arg_name = "Y" if "Y" in params else "y"
                 args[label_arg_name] = df[label_cols].squeeze()
 
-            if sample_weight_col is not None and "sample_weight" in params:
+            # Sample weight is not included in search estimators parameters, check the underlying estimator.
+            if sample_weight_col is not None and should_include_sample_weight(estimator, "fit"):
                 args["sample_weight"] = df[sample_weight_col].squeeze()
 
             estimator.fit(**args)
@@ -412,7 +418,7 @@ class SnowparkModelTrainer:
                 label_arg_name = "Y" if "Y" in params else "y"
                 args[label_arg_name] = df[label_cols].squeeze()
 
-            if sample_weight_col is not None and "sample_weight" in params:
+            if sample_weight_col is not None and should_include_sample_weight(estimator, "fit"):
                 args["sample_weight"] = df[sample_weight_col].squeeze()
 
             fit_transform_result = estimator.fit_transform(**args)
snowflake/ml/modeling/calibration/calibrated_classifier_cv.py

@@ -167,9 +167,6 @@ class CalibratedClassifierCV(BaseTransformer):
        `estimator` trained on all the data.
        Note that this method is also internally implemented in
        :mod:`sklearn.svm` estimators with the `probabilities=True` parameter.
-
-    base_estimator: estimator instance
-        This parameter is deprecated. Use `estimator` instead.
     """
 
     def __init__(  # type: ignore[no-untyped-def]
@@ -180,7 +177,6 @@ class CalibratedClassifierCV(BaseTransformer):
         cv=None,
         n_jobs=None,
         ensemble=True,
-        base_estimator="deprecated",
         input_cols: Optional[Union[str, Iterable[str]]] = None,
         output_cols: Optional[Union[str, Iterable[str]]] = None,
         label_cols: Optional[Union[str, Iterable[str]]] = None,
@@ -200,16 +196,13 @@ class CalibratedClassifierCV(BaseTransformer):
         self._batch_size = -1
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
         deps = deps | gather_dependencies(estimator)
-        deps = deps | gather_dependencies(base_estimator)
         self._deps = list(deps)
         estimator = transform_snowml_obj_to_sklearn_obj(estimator)
-        base_estimator = transform_snowml_obj_to_sklearn_obj(base_estimator)
         init_args = {'estimator':(estimator, None, False),
                      'method':(method, "sigmoid", False),
                      'cv':(cv, None, False),
                      'n_jobs':(n_jobs, None, False),
-                     'ensemble':(ensemble, True, False),
-                     'base_estimator':(base_estimator, "deprecated", False),}
+                     'ensemble':(ensemble, True, False),}
         cleaned_up_init_args = validate_sklearn_args(
             args=init_args,
             klass=sklearn.calibration.CalibratedClassifierCV
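The wrapper follows scikit-learn, which deprecated base_estimator in 1.2 and removed it in 1.4; callers must switch to estimator. A hedged sketch with placeholder column names:

    from snowflake.ml.modeling.calibration import CalibratedClassifierCV
    from snowflake.ml.modeling.linear_model import LogisticRegression

    clf = CalibratedClassifierCV(
        estimator=LogisticRegression(),  # was: base_estimator=...
        method="sigmoid",
        input_cols=["F1", "F2"],
        label_cols=["LABEL"],
        output_cols=["PREDICTION"],
    )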
snowflake/ml/modeling/cluster/agglomerative_clustering.py

@@ -113,28 +113,18 @@ class AgglomerativeClustering(BaseTransformer):
        The number of clusters to find. It must be ``None`` if
        ``distance_threshold`` is not ``None``.
 
-    affinity: str or callable, default='euclidean'
-        The metric to use when calculating distance between instances in a
-        feature array. If metric is a string or callable, it must be one of
-        the options allowed by :func:`sklearn.metrics.pairwise_distances` for
-        its metric parameter.
-        If linkage is "ward", only "euclidean" is accepted.
-        If "precomputed", a distance matrix (instead of a similarity matrix)
-        is needed as input for the fit method.
-
-    metric: str or callable, default=None
+    metric: str or callable, default="euclidean"
         Metric used to compute the linkage. Can be "euclidean", "l1", "l2",
-        "manhattan", "cosine", or "precomputed". If set to `None` then
-        "euclidean" is used. If linkage is "ward", only "euclidean" is
-        accepted. If "precomputed", a distance matrix is needed as input for
-        the fit method.
+        "manhattan", "cosine", or "precomputed". If linkage is "ward", only
+        "euclidean" is accepted. If "precomputed", a distance matrix is needed
+        as input for the fit method.
 
     memory: str or object with the joblib.Memory interface, default=None
         Used to cache the output of the computation of the tree.
         By default, no caching is done. If a string is given, it is the
         path to the caching directory.
 
-    connectivity: array-like or callable, default=None
+    connectivity: array-like, sparse matrix, or callable, default=None
         Connectivity matrix. Defines for each sample the neighboring
         samples following a given structure of the data.
         This can be a connectivity matrix itself or a callable that transforms
@@ -142,6 +132,10 @@ class AgglomerativeClustering(BaseTransformer):
         `kneighbors_graph`. Default is ``None``, i.e, the
         hierarchical clustering algorithm is unstructured.
 
+        For an example of connectivity matrix using
+        :class:`~sklearn.neighbors.kneighbors_graph`, see
+        :ref:`sphx_glr_auto_examples_cluster_plot_agglomerative_clustering.py`.
+
     compute_full_tree: 'auto' or bool, default='auto'
         Stop early the construction of the tree at ``n_clusters``. This is
         useful to decrease computation time if the number of clusters is not
@@ -167,6 +161,9 @@ class AgglomerativeClustering(BaseTransformer):
         - 'single' uses the minimum of the distances between all observations
           of the two sets.
 
+        For examples comparing different `linkage` criteria, see
+        :ref:`sphx_glr_auto_examples_cluster_plot_linkage_comparison.py`.
+
     distance_threshold: float, default=None
         The linkage distance threshold at or above which clusters will not be
         merged. If not ``None``, ``n_clusters`` must be ``None`` and
@@ -176,14 +173,16 @@ class AgglomerativeClustering(BaseTransformer):
         Computes distances between clusters even if `distance_threshold` is not
         used. This can be used to make dendrogram visualization, but introduces
         a computational and memory overhead.
+
+        For an example of dendrogram visualization, see
+        :ref:`sphx_glr_auto_examples_cluster_plot_agglomerative_dendrogram.py`.
     """
 
     def __init__(  # type: ignore[no-untyped-def]
         self,
         *,
         n_clusters=2,
-        affinity="deprecated",
-        metric=None,
+        metric="euclidean",
         memory=None,
         connectivity=None,
         compute_full_tree="auto",
@@ -212,8 +211,7 @@ class AgglomerativeClustering(BaseTransformer):
         self._deps = list(deps)
 
         init_args = {'n_clusters':(n_clusters, 2, False),
-                     'affinity':(affinity, "deprecated", False),
-                     'metric':(metric, None, False),
+                     'metric':(metric, "euclidean", False),
                      'memory':(memory, None, False),
                      'connectivity':(connectivity, None, False),
                      'compute_full_tree':(compute_full_tree, "auto", False),
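Same scikit-learn alignment here: the deprecated affinity parameter is gone and metric (now defaulting to "euclidean") takes over. A hedged sketch; linkage is assumed to be mirrored from scikit-learn like the other constructor parameters:

    from snowflake.ml.modeling.cluster import AgglomerativeClustering

    clus = AgglomerativeClustering(
        n_clusters=3,
        metric="manhattan",  # was: affinity="manhattan"
        linkage="average",
    )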
snowflake/ml/modeling/cluster/dbscan.py

@@ -117,8 +117,11 @@ class DBSCAN(BaseTransformer):
        and distance function.
 
     min_samples: int, default=5
-        The number of samples (or total weight) in a neighborhood for a point
-        to be considered as a core point. This includes the point itself.
+        The number of samples (or total weight) in a neighborhood for a point to
+        be considered as a core point. This includes the point itself. If
+        `min_samples` is set to a higher value, DBSCAN will find denser clusters,
+        whereas if it is set to a lower value, the found clusters will be more
+        sparse.
 
     metric: str, or callable, default='euclidean'
         The metric to use when calculating distance between instances in a