snowflake-ml-python 1.6.4__py3-none-any.whl → 1.7.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (176)
  1. snowflake/cortex/__init__.py +4 -0
  2. snowflake/cortex/_complete.py +107 -64
  3. snowflake/cortex/_finetune.py +273 -0
  4. snowflake/cortex/_sse_client.py +91 -28
  5. snowflake/cortex/_util.py +30 -1
  6. snowflake/ml/_internal/telemetry.py +4 -2
  7. snowflake/ml/_internal/type_utils.py +3 -3
  8. snowflake/ml/_internal/utils/import_utils.py +31 -0
  9. snowflake/ml/_internal/utils/snowpark_dataframe_utils.py +13 -0
  10. snowflake/ml/data/__init__.py +5 -0
  11. snowflake/ml/data/_internal/arrow_ingestor.py +8 -0
  12. snowflake/ml/data/data_connector.py +1 -1
  13. snowflake/ml/data/torch_utils.py +33 -14
  14. snowflake/ml/feature_store/examples/airline_features/features/plane_features.py +5 -3
  15. snowflake/ml/feature_store/examples/airline_features/features/weather_features.py +7 -5
  16. snowflake/ml/feature_store/examples/citibike_trip_features/features/station_feature.py +4 -2
  17. snowflake/ml/feature_store/examples/citibike_trip_features/features/trip_feature.py +3 -1
  18. snowflake/ml/feature_store/examples/example_helper.py +6 -3
  19. snowflake/ml/feature_store/examples/new_york_taxi_features/features/location_features.py +4 -2
  20. snowflake/ml/feature_store/examples/new_york_taxi_features/features/trip_features.py +4 -2
  21. snowflake/ml/feature_store/examples/wine_quality_features/features/managed_wine_features.py +3 -1
  22. snowflake/ml/feature_store/examples/wine_quality_features/features/static_wine_features.py +3 -1
  23. snowflake/ml/feature_store/feature_store.py +1 -2
  24. snowflake/ml/feature_store/feature_view.py +5 -1
  25. snowflake/ml/model/_client/model/model_version_impl.py +145 -11
  26. snowflake/ml/model/_client/ops/model_ops.py +56 -16
  27. snowflake/ml/model/_client/ops/service_ops.py +46 -30
  28. snowflake/ml/model/_client/service/model_deployment_spec.py +19 -8
  29. snowflake/ml/model/_client/service/model_deployment_spec_schema.py +3 -1
  30. snowflake/ml/model/_client/sql/service.py +25 -1
  31. snowflake/ml/model/_model_composer/model_composer.py +2 -0
  32. snowflake/ml/model/_model_composer/model_manifest/model_manifest.py +4 -0
  33. snowflake/ml/model/_model_composer/model_manifest/model_manifest_schema.py +1 -0
  34. snowflake/ml/model/_model_composer/model_method/infer_function.py_template +2 -1
  35. snowflake/ml/model/_model_composer/model_method/model_method.py +1 -1
  36. snowflake/ml/model/_packager/model_env/model_env.py +12 -0
  37. snowflake/ml/model/_packager/model_handlers/_utils.py +6 -2
  38. snowflake/ml/model/_packager/model_handlers/catboost.py +4 -7
  39. snowflake/ml/model/_packager/model_handlers/custom.py +5 -1
  40. snowflake/ml/model/_packager/model_handlers/huggingface_pipeline.py +10 -1
  41. snowflake/ml/model/_packager/model_handlers/lightgbm.py +5 -7
  42. snowflake/ml/model/_packager/model_handlers/sentence_transformers.py +8 -1
  43. snowflake/ml/model/_packager/model_handlers/sklearn.py +51 -7
  44. snowflake/ml/model/_packager/model_handlers/snowmlmodel.py +8 -66
  45. snowflake/ml/model/_packager/model_handlers/tensorflow.py +23 -6
  46. snowflake/ml/model/_packager/model_handlers/torchscript.py +14 -14
  47. snowflake/ml/model/_packager/model_handlers/xgboost.py +10 -40
  48. snowflake/ml/model/_packager/model_meta/_packaging_requirements.py +2 -3
  49. snowflake/ml/model/_packager/model_meta/model_meta_schema.py +5 -0
  50. snowflake/ml/model/_packager/model_packager.py +0 -11
  51. snowflake/ml/model/_packager/model_runtime/_snowml_inference_alternative_requirements.py +2 -10
  52. snowflake/ml/model/_packager/model_runtime/model_runtime.py +4 -9
  53. snowflake/ml/model/_packager/{model_handlers/model_objective_utils.py → model_task/model_task_utils.py} +14 -26
  54. snowflake/ml/model/_signatures/core.py +63 -16
  55. snowflake/ml/model/_signatures/pandas_handler.py +87 -27
  56. snowflake/ml/model/_signatures/pytorch_handler.py +2 -2
  57. snowflake/ml/model/_signatures/snowpark_handler.py +2 -1
  58. snowflake/ml/model/_signatures/tensorflow_handler.py +2 -2
  59. snowflake/ml/model/_signatures/utils.py +4 -0
  60. snowflake/ml/model/custom_model.py +47 -7
  61. snowflake/ml/model/model_signature.py +40 -9
  62. snowflake/ml/model/type_hints.py +9 -1
  63. snowflake/ml/modeling/_internal/estimator_utils.py +13 -0
  64. snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py +7 -2
  65. snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py +16 -5
  66. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py +8 -2
  67. snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py +9 -3
  68. snowflake/ml/modeling/calibration/calibrated_classifier_cv.py +1 -8
  69. snowflake/ml/modeling/cluster/agglomerative_clustering.py +17 -19
  70. snowflake/ml/modeling/cluster/dbscan.py +5 -2
  71. snowflake/ml/modeling/cluster/feature_agglomeration.py +7 -19
  72. snowflake/ml/modeling/cluster/k_means.py +14 -19
  73. snowflake/ml/modeling/cluster/mini_batch_k_means.py +3 -3
  74. snowflake/ml/modeling/cluster/optics.py +6 -6
  75. snowflake/ml/modeling/cluster/spectral_clustering.py +4 -3
  76. snowflake/ml/modeling/compose/column_transformer.py +15 -5
  77. snowflake/ml/modeling/compose/transformed_target_regressor.py +7 -6
  78. snowflake/ml/modeling/covariance/elliptic_envelope.py +1 -1
  79. snowflake/ml/modeling/covariance/graphical_lasso_cv.py +1 -1
  80. snowflake/ml/modeling/covariance/min_cov_det.py +2 -2
  81. snowflake/ml/modeling/covariance/oas.py +1 -1
  82. snowflake/ml/modeling/decomposition/kernel_pca.py +2 -2
  83. snowflake/ml/modeling/decomposition/mini_batch_dictionary_learning.py +5 -12
  84. snowflake/ml/modeling/decomposition/mini_batch_sparse_pca.py +5 -12
  85. snowflake/ml/modeling/decomposition/pca.py +28 -15
  86. snowflake/ml/modeling/discriminant_analysis/linear_discriminant_analysis.py +6 -0
  87. snowflake/ml/modeling/ensemble/ada_boost_classifier.py +1 -12
  88. snowflake/ml/modeling/ensemble/ada_boost_regressor.py +1 -11
  89. snowflake/ml/modeling/ensemble/bagging_classifier.py +1 -8
  90. snowflake/ml/modeling/ensemble/bagging_regressor.py +1 -8
  91. snowflake/ml/modeling/ensemble/extra_trees_classifier.py +21 -2
  92. snowflake/ml/modeling/ensemble/extra_trees_regressor.py +18 -2
  93. snowflake/ml/modeling/ensemble/gradient_boosting_classifier.py +2 -0
  94. snowflake/ml/modeling/ensemble/gradient_boosting_regressor.py +2 -0
  95. snowflake/ml/modeling/ensemble/hist_gradient_boosting_classifier.py +21 -8
  96. snowflake/ml/modeling/ensemble/hist_gradient_boosting_regressor.py +21 -11
  97. snowflake/ml/modeling/ensemble/random_forest_classifier.py +21 -2
  98. snowflake/ml/modeling/ensemble/random_forest_regressor.py +18 -2
  99. snowflake/ml/modeling/feature_selection/generic_univariate_select.py +2 -1
  100. snowflake/ml/modeling/feature_selection/sequential_feature_selector.py +5 -3
  101. snowflake/ml/modeling/kernel_ridge/kernel_ridge.py +2 -2
  102. snowflake/ml/modeling/lightgbm/lgbm_classifier.py +2 -4
  103. snowflake/ml/modeling/lightgbm/lgbm_regressor.py +2 -4
  104. snowflake/ml/modeling/linear_model/ard_regression.py +5 -10
  105. snowflake/ml/modeling/linear_model/bayesian_ridge.py +5 -11
  106. snowflake/ml/modeling/linear_model/elastic_net.py +3 -0
  107. snowflake/ml/modeling/linear_model/elastic_net_cv.py +1 -1
  108. snowflake/ml/modeling/linear_model/lars.py +0 -10
  109. snowflake/ml/modeling/linear_model/lars_cv.py +1 -11
  110. snowflake/ml/modeling/linear_model/lasso_cv.py +1 -1
  111. snowflake/ml/modeling/linear_model/lasso_lars.py +0 -10
  112. snowflake/ml/modeling/linear_model/lasso_lars_cv.py +1 -11
  113. snowflake/ml/modeling/linear_model/lasso_lars_ic.py +0 -10
  114. snowflake/ml/modeling/linear_model/logistic_regression.py +28 -22
  115. snowflake/ml/modeling/linear_model/logistic_regression_cv.py +30 -24
  116. snowflake/ml/modeling/linear_model/multi_task_elastic_net_cv.py +1 -1
  117. snowflake/ml/modeling/linear_model/multi_task_lasso_cv.py +1 -1
  118. snowflake/ml/modeling/linear_model/orthogonal_matching_pursuit.py +4 -13
  119. snowflake/ml/modeling/linear_model/passive_aggressive_classifier.py +4 -4
  120. snowflake/ml/modeling/linear_model/passive_aggressive_regressor.py +1 -1
  121. snowflake/ml/modeling/linear_model/perceptron.py +3 -3
  122. snowflake/ml/modeling/linear_model/ransac_regressor.py +3 -2
  123. snowflake/ml/modeling/linear_model/ridge_classifier_cv.py +14 -6
  124. snowflake/ml/modeling/linear_model/ridge_cv.py +17 -11
  125. snowflake/ml/modeling/linear_model/sgd_classifier.py +2 -2
  126. snowflake/ml/modeling/linear_model/sgd_one_class_svm.py +5 -1
  127. snowflake/ml/modeling/linear_model/sgd_regressor.py +12 -3
  128. snowflake/ml/modeling/manifold/isomap.py +1 -1
  129. snowflake/ml/modeling/manifold/mds.py +3 -3
  130. snowflake/ml/modeling/manifold/tsne.py +10 -4
  131. snowflake/ml/modeling/metrics/classification.py +12 -16
  132. snowflake/ml/modeling/metrics/ranking.py +3 -3
  133. snowflake/ml/modeling/metrics/regression.py +3 -3
  134. snowflake/ml/modeling/naive_bayes/bernoulli_nb.py +3 -3
  135. snowflake/ml/modeling/naive_bayes/categorical_nb.py +3 -3
  136. snowflake/ml/modeling/naive_bayes/complement_nb.py +3 -3
  137. snowflake/ml/modeling/naive_bayes/multinomial_nb.py +3 -3
  138. snowflake/ml/modeling/neighbors/k_neighbors_classifier.py +10 -4
  139. snowflake/ml/modeling/neighbors/k_neighbors_regressor.py +5 -2
  140. snowflake/ml/modeling/neighbors/local_outlier_factor.py +2 -2
  141. snowflake/ml/modeling/neighbors/nearest_centroid.py +7 -14
  142. snowflake/ml/modeling/neighbors/nearest_neighbors.py +1 -1
  143. snowflake/ml/modeling/neighbors/radius_neighbors_classifier.py +6 -1
  144. snowflake/ml/modeling/neighbors/radius_neighbors_regressor.py +1 -1
  145. snowflake/ml/modeling/neural_network/mlp_classifier.py +7 -1
  146. snowflake/ml/modeling/neural_network/mlp_regressor.py +3 -0
  147. snowflake/ml/modeling/pipeline/pipeline.py +16 -14
  148. snowflake/ml/modeling/preprocessing/one_hot_encoder.py +8 -4
  149. snowflake/ml/modeling/preprocessing/ordinal_encoder.py +9 -7
  150. snowflake/ml/modeling/svm/linear_svc.py +25 -16
  151. snowflake/ml/modeling/svm/linear_svr.py +23 -17
  152. snowflake/ml/modeling/svm/nu_svc.py +5 -3
  153. snowflake/ml/modeling/svm/nu_svr.py +3 -1
  154. snowflake/ml/modeling/svm/svc.py +9 -5
  155. snowflake/ml/modeling/svm/svr.py +3 -1
  156. snowflake/ml/modeling/tree/decision_tree_classifier.py +21 -2
  157. snowflake/ml/modeling/tree/decision_tree_regressor.py +18 -2
  158. snowflake/ml/modeling/tree/extra_tree_classifier.py +28 -9
  159. snowflake/ml/modeling/tree/extra_tree_regressor.py +18 -2
  160. snowflake/ml/monitoring/_client/model_monitor_sql_client.py +448 -0
  161. snowflake/ml/monitoring/_manager/model_monitor_manager.py +238 -0
  162. snowflake/ml/monitoring/entities/model_monitor_config.py +10 -10
  163. snowflake/ml/monitoring/model_monitor.py +37 -0
  164. snowflake/ml/registry/_manager/model_manager.py +15 -1
  165. snowflake/ml/registry/registry.py +32 -37
  166. snowflake/ml/version.py +1 -1
  167. {snowflake_ml_python-1.6.4.dist-info → snowflake_ml_python-1.7.1.dist-info}/METADATA +104 -12
  168. {snowflake_ml_python-1.6.4.dist-info → snowflake_ml_python-1.7.1.dist-info}/RECORD +172 -171
  169. {snowflake_ml_python-1.6.4.dist-info → snowflake_ml_python-1.7.1.dist-info}/WHEEL +1 -1
  170. snowflake/ml/monitoring/_client/model_monitor.py +0 -126
  171. snowflake/ml/monitoring/_client/model_monitor_manager.py +0 -361
  172. snowflake/ml/monitoring/_client/monitor_sql_client.py +0 -1335
  173. snowflake/ml/monitoring/entities/model_monitor_interval.py +0 -46
  174. /snowflake/ml/monitoring/{_client/model_monitor_version.py → model_monitor_version.py} +0 -0
  175. {snowflake_ml_python-1.6.4.dist-info → snowflake_ml_python-1.7.1.dist-info}/LICENSE.txt +0 -0
  176. {snowflake_ml_python-1.6.4.dist-info → snowflake_ml_python-1.7.1.dist-info}/top_level.txt +0 -0
snowflake/ml/model/_signatures/tensorflow_handler.py

@@ -109,10 +109,10 @@ class SeqOfTensorflowTensorHandler(
             dtype = core.DataType.from_numpy_type(data_col.dtype.as_numpy_dtype)
             ft_name = f"{role_prefix}{feature_prefix}{i}"
             if len(data_col.shape) == 1:
-                features.append(core.FeatureSpec(dtype=dtype, name=ft_name))
+                features.append(core.FeatureSpec(dtype=dtype, name=ft_name, nullable=False))
             else:
                 ft_shape = tuple(data_col.shape[1:])
-                features.append(core.FeatureSpec(dtype=dtype, name=ft_name, shape=ft_shape))
+                features.append(core.FeatureSpec(dtype=dtype, name=ft_name, shape=ft_shape, nullable=False))
         return features

     @staticmethod
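Signatures inferred from TensorFlow tensors now mark each feature as non-nullable, since dense tensors cannot hold missing values. A minimal sketch of the resulting spec (the feature name is illustrative):

    from snowflake.ml.model._signatures import core

    # A 1-D float tensor column now infers to a non-nullable feature spec.
    spec = core.FeatureSpec(dtype=core.DataType.FLOAT, name="input_feature_0", nullable=False)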
snowflake/ml/model/_signatures/utils.py

@@ -297,3 +297,7 @@ def huggingface_pipeline_signature_auto_infer(task: str, params: Dict[str, Any])
     )

     return None
+
+
+def series_dropna(series: pd.Series) -> pd.Series:
+    return series.dropna(inplace=False).reset_index(drop=True).convert_dtypes()
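The new series_dropna helper strips missing values before the dtype checks in signature validation. Its behavior on a plain pandas Series, for illustration:

    import pandas as pd

    s = pd.Series([1.5, None, 3.0])
    cleaned = s.dropna(inplace=False).reset_index(drop=True).convert_dtypes()
    # cleaned holds [1.5, 3.0] with a fresh 0-based index and the pandas
    # nullable Float64 dtype chosen by convert_dtypes().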
snowflake/ml/model/custom_model.py

@@ -1,6 +1,6 @@
 import functools
 import inspect
-from typing import Any, Callable, Coroutine, Dict, Generator, List, Optional
+from typing import Any, Callable, Coroutine, Dict, Generator, List, Optional, Union

 import anyio
 import pandas as pd
@@ -104,19 +104,53 @@ class ModelContext:
     def __init__(
         self,
         *,
-        artifacts: Optional[Dict[str, str]] = None,
-        models: Optional[Dict[str, model_types.SupportedModelType]] = None,
+        artifacts: Optional[Union[Dict[str, str], str, model_types.SupportedModelType]] = None,
+        models: Optional[Union[Dict[str, model_types.SupportedModelType], str, model_types.SupportedModelType]] = None,
+        **kwargs: Optional[Union[str, model_types.SupportedModelType]],
     ) -> None:
         """Initialize the model context.

         Args:
             artifacts: A dictionary mapping the name of the artifact to its currently available path. Defaults to None.
             models: A dictionary mapping the name of the sub-model to the corresponding model object. Defaults to None.
+            **kwargs: Additional keyword arguments to be used as artifacts or models.
+
+        Raises:
+            ValueError: Raised when the keyword argument is used as artifacts or models.
+            ValueError: Raised when the artifact name is duplicated.
+            ValueError: Raised when the model name is duplicated.
         """
-        self.artifacts: Dict[str, str] = artifacts if artifacts else dict()
-        self.model_refs: Dict[str, ModelRef] = (
-            {name: ModelRef(name, model) for name, model in models.items()} if models else dict()
-        )
+
+        self.artifacts: Dict[str, str] = dict()
+        self.model_refs: Dict[str, ModelRef] = dict()
+
+        # In case that artifacts is a dictionary, assume the original usage,
+        # which is to pass in a dictionary of artifacts.
+        # In other scenarios, (str or supported model types) we will try to parse the arguments as artifacts or models.
+        if isinstance(artifacts, dict):
+            self.artifacts = artifacts
+        elif isinstance(artifacts, str):
+            self.artifacts["artifacts"] = artifacts
+        elif artifacts is not None:
+            self.model_refs["artifacts"] = ModelRef("artifacts", artifacts)
+
+        if isinstance(models, dict):
+            self.model_refs = {name: ModelRef(name, model) for name, model in models.items()} if models else dict()
+        elif isinstance(models, str):
+            self.artifacts["models"] = models
+        elif models is not None:
+            self.model_refs["models"] = ModelRef("models", models)
+
+        # Handle any new arguments passed via kwargs
+        for key, value in kwargs.items():
+            if isinstance(value, str):
+                if key in self.artifacts:
+                    raise ValueError(f"Duplicate artifact name: {key}")
+                self.artifacts[key] = value
+            else:
+                if key in self.model_refs:
+                    raise ValueError(f"Duplicate model name: {key}")
+                self.model_refs[key] = ModelRef(key, value)

     def path(self, key: str) -> str:
         """Get the actual path to a specific artifact. This could be used when defining a Custom Model to retrieve
@@ -141,6 +175,12 @@ class ModelContext:
         """
         return self.model_refs[name]

+    def __getitem__(self, key: str) -> Union[str, ModelRef]:
+        combined: Dict[str, Union[str, ModelRef]] = {**self.artifacts, **self.model_refs}
+        if key not in combined:
+            raise KeyError(f"Key {key} not found in the kwargs, current available keys are: {combined.keys()}")
+        return combined[key]
+

 class CustomModel:
     """Abstract class for user defined custom model.
snowflake/ml/model/model_signature.py

@@ -139,9 +139,32 @@ def _rename_signature_with_snowflake_identifiers(
     return signature


-def _validate_numpy_array(
-    arr: model_types._SupportedNumpyArray, feature_type: core.DataType, strict: bool = False
+def _validate_array_or_series_type(
+    arr: Union[model_types._SupportedNumpyArray, pd.Series], feature_type: core.DataType, strict: bool = False
 ) -> bool:
+    original_dtype = arr.dtype
+    dtype = arr.dtype
+    if isinstance(
+        dtype,
+        (
+            pd.Int8Dtype,
+            pd.Int16Dtype,
+            pd.Int32Dtype,
+            pd.Int64Dtype,
+            pd.UInt8Dtype,
+            pd.UInt16Dtype,
+            pd.UInt32Dtype,
+            pd.UInt64Dtype,
+            pd.Float32Dtype,
+            pd.Float64Dtype,
+            pd.BooleanDtype,
+        ),
+    ):
+        dtype = dtype.type
+    elif isinstance(dtype, pd.CategoricalDtype):
+        dtype = dtype.categories.dtype
+    elif isinstance(dtype, pd.StringDtype):
+        dtype = np.str_
     if feature_type in [
         core.DataType.INT8,
         core.DataType.INT16,
@@ -152,14 +175,17 @@ def _validate_numpy_array(
         core.DataType.UINT32,
         core.DataType.UINT64,
     ]:
-        if not (np.issubdtype(arr.dtype, np.integer)):
+        if not (np.issubdtype(dtype, np.integer)):
             return False
         if not strict:
             return True
-        min_v, max_v = arr.min(), arr.max()
+        if isinstance(original_dtype, pd.CategoricalDtype):
+            min_v, max_v = arr.cat.as_ordered().min(), arr.cat.as_ordered().max()  # type: ignore[union-attr]
+        else:
+            min_v, max_v = arr.min(), arr.max()
         return bool(max_v <= np.iinfo(feature_type._numpy_type).max and min_v >= np.iinfo(feature_type._numpy_type).min)
     elif feature_type in [core.DataType.FLOAT, core.DataType.DOUBLE]:
-        if not (np.issubdtype(arr.dtype, np.integer) or np.issubdtype(arr.dtype, np.floating)):
+        if not (np.issubdtype(dtype, np.integer) or np.issubdtype(dtype, np.floating)):
             return False
         if not strict:
             return True
@@ -171,7 +197,7 @@ def _validate_numpy_array(
     elif feature_type in [core.DataType.TIMESTAMP_NTZ]:
         return np.issubdtype(arr.dtype, np.datetime64)
     else:
-        return np.can_cast(arr.dtype, feature_type._numpy_type, casting="no")
+        return np.can_cast(dtype, feature_type._numpy_type, casting="no")


 def _validate_pandas_df(data: pd.DataFrame, features: Sequence[core.BaseFeatureSpec], strict: bool = False) -> None:
@@ -204,7 +230,10 @@ def _validate_pandas_df(data: pd.DataFrame, features: Sequence[core.BaseFeatureS
                 original_exception=ValueError(f"Data Validation Error: feature {ft_name} does not exist in data."),
             )

+        if data_col.isnull().any():
+            data_col = utils.series_dropna(data_col)
         df_col_dtype = data_col.dtype
+
         if isinstance(feature, core.FeatureGroupSpec):
             raise snowml_exceptions.SnowflakeMLException(
                 error_code=error_codes.NOT_IMPLEMENTED,
@@ -214,8 +243,10 @@ def _validate_pandas_df(data: pd.DataFrame, features: Sequence[core.BaseFeatureS
         assert isinstance(feature, core.FeatureSpec)  # assert for mypy.
         ft_type = feature._dtype
         ft_shape = feature._shape
+        if isinstance(df_col_dtype, pd.CategoricalDtype):
+            df_col_dtype = df_col_dtype.categories.dtype
         if df_col_dtype != np.dtype("O"):
-            if not _validate_numpy_array(data_col.to_numpy(), ft_type, strict=strict):
+            if not _validate_array_or_series_type(data_col, ft_type, strict=strict):
                 raise snowml_exceptions.SnowflakeMLException(
                     error_code=error_codes.INVALID_DATA,
                     original_exception=ValueError(
@@ -245,7 +276,7 @@ def _validate_pandas_df(data: pd.DataFrame, features: Sequence[core.BaseFeatureS
             converted_data_list = [utils.convert_list_to_ndarray(data_row) for data_row in data_col]

            if not all(
-                _validate_numpy_array(converted_data, ft_type, strict=strict)
+                _validate_array_or_series_type(converted_data, ft_type, strict=strict)
                 for converted_data in converted_data_list
             ):
                 raise snowml_exceptions.SnowflakeMLException(
@@ -276,7 +307,7 @@ def _validate_pandas_df(data: pd.DataFrame, features: Sequence[core.BaseFeatureS
                 ),
             )

-            if not all(_validate_numpy_array(data_row, ft_type, strict=strict) for data_row in data_col):
+            if not all(_validate_array_or_series_type(data_row, ft_type, strict=strict) for data_row in data_col):
                 raise snowml_exceptions.SnowflakeMLException(
                     error_code=error_codes.INVALID_DATA,
                     original_exception=ValueError(
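The validator now unwraps pandas extension dtypes to their numpy equivalents before type checks, so columns using nullable Int*/Float*/boolean, categorical, or string dtypes validate like their plain numpy counterparts. A small sketch of the unwrapping, using only pandas and numpy:

    import numpy as np
    import pandas as pd

    s = pd.Series([1, None, 3], dtype="Int64")       # pandas nullable integer
    assert np.issubdtype(s.dtype.type, np.integer)   # Int64Dtype.type is numpy.int64

    c = pd.Series(["a", "b"], dtype="category")
    underlying = c.dtype.categories.dtype            # dtype of the category values (object here)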
snowflake/ml/model/type_hints.py

@@ -66,7 +66,7 @@ SupportedRequireSignatureModelType = Union[
     "xgboost.XGBModel",
     "xgboost.Booster",
     "torch.nn.Module",
-    "torch.jit.ScriptModule",  # type:ignore[name-defined]
+    "torch.jit.ScriptModule",
     "tensorflow.Module",
 ]

@@ -298,3 +298,11 @@ class Task(Enum):
     TABULAR_MULTI_CLASSIFICATION = "TABULAR_MULTI_CLASSIFICATION"
     TABULAR_REGRESSION = "TABULAR_REGRESSION"
     TABULAR_RANKING = "TABULAR_RANKING"
+
+
+class TargetPlatform(Enum):
+    WAREHOUSE = "WAREHOUSE"
+    SNOWPARK_CONTAINER_SERVICES = "SNOWPARK_CONTAINER_SERVICES"
+
+
+SupportedTargetPlatformType = Union[TargetPlatform, str]
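The new TargetPlatform enum identifies where a logged model can run, and SupportedTargetPlatformType accepts either the enum or its string value. A minimal sketch; whether a given registry call takes a target_platforms argument depends on the registry API version:

    from snowflake.ml.model.type_hints import TargetPlatform

    platforms = [TargetPlatform.SNOWPARK_CONTAINER_SERVICES]
    platforms = ["WAREHOUSE"]  # equivalent, since plain strings are also accepted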
snowflake/ml/modeling/_internal/estimator_utils.py

@@ -275,3 +275,16 @@ def upload_model_to_stage(

     temp_file_utils.cleanup_temp_files([local_transform_file_name])
     return os.path.basename(local_transform_file_name)
+
+
+def should_include_sample_weight(estimator: object, method_name: str) -> bool:
+    # If this is a Grid Search or Randomized Search estimator, check the underlying estimator.
+    underlying_estimator = (
+        estimator.estimator if ("_search" in estimator.__module__ and hasattr(estimator, "estimator")) else estimator
+    )
+    method = getattr(underlying_estimator, method_name)
+    underlying_estimator_params = inspect.signature(method).parameters
+    if "sample_weight" in underlying_estimator_params:
+        return True
+
+    return False
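should_include_sample_weight exists because search estimators such as GridSearchCV accept fit parameters only via **fit_params, so "sample_weight" never appears in their own fit/score signatures; the check has to look at the wrapped estimator instead. Illustrated against plain scikit-learn:

    import inspect

    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV

    search = GridSearchCV(LogisticRegression(), {"C": [0.1, 1.0]})
    "sample_weight" in inspect.signature(search.fit).parameters            # False
    "sample_weight" in inspect.signature(search.estimator.fit).parameters  # True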
snowflake/ml/modeling/_internal/local_implementations/pandas_handlers.py

@@ -4,7 +4,10 @@ from typing import Any, List, Optional
 import pandas as pd

 from snowflake.ml._internal.exceptions import error_codes, exceptions
-from snowflake.ml.modeling._internal.estimator_utils import handle_inference_result
+from snowflake.ml.modeling._internal.estimator_utils import (
+    handle_inference_result,
+    should_include_sample_weight,
+)


 class PandasTransformHandlers:
@@ -166,6 +169,7 @@ class PandasTransformHandlers:
             SnowflakeMLException: The input column list does not have one of `X` and `X_test`.
         """
         assert hasattr(self.estimator, "score")  # make type checker happy
+
         params = inspect.signature(self.estimator.score).parameters
         if "X" in params:
             score_args = {"X": self.dataset[input_cols]}
@@ -181,7 +185,8 @@ class PandasTransformHandlers:
             label_arg_name = "Y" if "Y" in params else "y"
             score_args[label_arg_name] = self.dataset[label_cols].squeeze()

-        if sample_weight_col is not None and "sample_weight" in params:
+        # Sample weight is not included in search estimators parameters, check the underlying estimator.
+        if sample_weight_col is not None and should_include_sample_weight(self.estimator, "score"):
             score_args["sample_weight"] = self.dataset[sample_weight_col].squeeze()

         score = self.estimator.score(**score_args)
snowflake/ml/modeling/_internal/snowpark_implementations/distributed_hpo_trainer.py

@@ -19,6 +19,7 @@ from snowflake.ml._internal.utils import (
     snowpark_dataframe_utils,
     temp_file_utils,
 )
+from snowflake.ml.modeling._internal.estimator_utils import should_include_sample_weight
 from snowflake.ml.modeling._internal.model_specifications import (
     ModelSpecificationsBuilder,
 )
@@ -38,6 +39,7 @@ from snowflake.snowpark.udtf import UDTFRegistration
 cp.register_pickle_by_value(inspect.getmodule(temp_file_utils.get_temp_file_path))
 cp.register_pickle_by_value(inspect.getmodule(identifier.get_inferred_name))
 cp.register_pickle_by_value(inspect.getmodule(snowpark_dataframe_utils.cast_snowpark_dataframe))
+cp.register_pickle_by_value(inspect.getmodule(should_include_sample_weight))

 _PROJECT = "ModelDevelopment"
 DEFAULT_UDTF_NJOBS = 3
@@ -393,7 +395,10 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
         import pandas as pd
         import pyarrow.parquet as pq
         from sklearn.metrics import check_scoring
-        from sklearn.metrics._scorer import _check_multimetric_scoring
+        from sklearn.metrics._scorer import (
+            _check_multimetric_scoring,
+            _MultimetricScorer,
+        )

         for import_name in udf_imports:
             importlib.import_module(import_name)
@@ -606,6 +611,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
                 scorers = _check_multimetric_scoring(estimator.estimator, estimator.scoring)
                 estimator._check_refit_for_multimetric(scorers)
                 refit_metric = original_refit
+                scorers = _MultimetricScorer(scorers=scorers)

             estimator.scorer_ = scorers

@@ -638,7 +644,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
             if label_cols:
                 label_arg_name = "Y" if "Y" in argspec.args else "y"
                 args[label_arg_name] = y
-            if sample_weight_col is not None and "sample_weight" in argspec.args:
+            if sample_weight_col is not None and should_include_sample_weight(estimator, "fit"):
                 args["sample_weight"] = df[sample_weight_col].squeeze()
             estimator.refit = original_refit
             refit_start_time = time.time()
@@ -797,8 +803,11 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
         import pandas as pd
         import pyarrow.parquet as pq
         from sklearn.metrics import check_scoring
-        from sklearn.metrics._scorer import _check_multimetric_scoring
-        from sklearn.utils.validation import _check_fit_params, indexable
+        from sklearn.metrics._scorer import (
+            _check_multimetric_scoring,
+            _MultimetricScorer,
+        )
+        from sklearn.utils.validation import _check_method_params, indexable

         # import packages in sproc
         for import_name in udf_imports:
@@ -846,11 +855,12 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
             scorers = _check_multimetric_scoring(estimator.estimator, estimator.scoring)
             estimator._check_refit_for_multimetric(scorers)
             refit_metric = estimator.refit
+            scorers = _MultimetricScorer(scorers=scorers)

         # preprocess the attributes - (2) check fit_params
         groups = None
         X, y, _ = indexable(X, y, groups)
-        fit_params = _check_fit_params(X, fit_params)
+        fit_params = _check_method_params(X, fit_params)

         # preprocess the attributes - (3) safe clone base estimator
         base_estimator = clone(estimator.estimator)
@@ -863,6 +873,7 @@ class DistributedHPOTrainer(SnowparkModelTrainer):
         fit_and_score_kwargs = dict(
             scorer=scorers,
             fit_params=fit_params,
+            score_params=None,
             return_train_score=estimator.return_train_score,
             return_n_test_samples=True,
             return_times=True,
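Several of these edits track scikit-learn's private-API changes: multimetric scorer dicts are now wrapped in _MultimetricScorer before use, and _check_fit_params was renamed to _check_method_params in newer scikit-learn releases. Code that has to span both generations often guards the import; a hedged sketch:

    try:
        # newer scikit-learn exposes the renamed helper
        from sklearn.utils.validation import _check_method_params
    except ImportError:
        # older releases used the previous name
        from sklearn.utils.validation import _check_fit_params as _check_method_params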
snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_handlers.py

@@ -18,7 +18,10 @@ from snowflake.ml._internal.utils import (
 )
 from snowflake.ml._internal.utils.query_result_checker import SqlResultValidator
 from snowflake.ml.modeling._internal import estimator_utils
-from snowflake.ml.modeling._internal.estimator_utils import handle_inference_result
+from snowflake.ml.modeling._internal.estimator_utils import (
+    handle_inference_result,
+    should_include_sample_weight,
+)
 from snowflake.snowpark import DataFrame, Session, functions as F, types as T
 from snowflake.snowpark._internal.utils import (
     TempObjectType,
@@ -28,6 +31,8 @@ from snowflake.snowpark._internal.utils import (
 cp.register_pickle_by_value(inspect.getmodule(temp_file_utils.get_temp_file_path))
 cp.register_pickle_by_value(inspect.getmodule(identifier.get_inferred_name))
 cp.register_pickle_by_value(inspect.getmodule(handle_inference_result))
+cp.register_pickle_by_value(inspect.getmodule(should_include_sample_weight))
+

 _PROJECT = "ModelDevelopment"

@@ -330,7 +335,8 @@ class SnowparkTransformHandlers:
             label_arg_name = "Y" if "Y" in params else "y"
             args[label_arg_name] = df[label_cols].squeeze()

-        if sample_weight_col is not None and "sample_weight" in params:
+        # Sample weight is not included in search estimators parameters, check the underlying estimator.
+        if sample_weight_col is not None and should_include_sample_weight(estimator, "score"):
             args["sample_weight"] = df[sample_weight_col].squeeze()

         result: float = estimator.score(**args)
snowflake/ml/modeling/_internal/snowpark_implementations/snowpark_trainer.py

@@ -20,7 +20,10 @@ from snowflake.ml._internal.utils import (
     temp_file_utils,
 )
 from snowflake.ml.modeling._internal import estimator_utils
-from snowflake.ml.modeling._internal.estimator_utils import handle_inference_result
+from snowflake.ml.modeling._internal.estimator_utils import (
+    handle_inference_result,
+    should_include_sample_weight,
+)
 from snowflake.ml.modeling._internal.model_specifications import (
     ModelSpecifications,
     ModelSpecificationsBuilder,
@@ -32,6 +35,7 @@ from snowflake.snowpark.stored_procedure import StoredProcedure
 cp.register_pickle_by_value(inspect.getmodule(temp_file_utils.get_temp_file_path))
 cp.register_pickle_by_value(inspect.getmodule(identifier.get_inferred_name))
 cp.register_pickle_by_value(inspect.getmodule(handle_inference_result))
+cp.register_pickle_by_value(inspect.getmodule(should_include_sample_weight))

 _PROJECT = "ModelDevelopment"
 _ENABLE_ANONYMOUS_SPROC = False
@@ -170,12 +174,14 @@ class SnowparkModelTrainer:
             estimator = cp.load(local_transform_file_obj)

         params = inspect.signature(estimator.fit).parameters
+
         args = {"X": df[input_cols]}
         if label_cols:
             label_arg_name = "Y" if "Y" in params else "y"
             args[label_arg_name] = df[label_cols].squeeze()

-        if sample_weight_col is not None and "sample_weight" in params:
+        # Sample weight is not included in search estimators parameters, check the underlying estimator.
+        if sample_weight_col is not None and should_include_sample_weight(estimator, "fit"):
             args["sample_weight"] = df[sample_weight_col].squeeze()

         estimator.fit(**args)
@@ -412,7 +418,7 @@ class SnowparkModelTrainer:
             label_arg_name = "Y" if "Y" in params else "y"
             args[label_arg_name] = df[label_cols].squeeze()

-        if sample_weight_col is not None and "sample_weight" in params:
+        if sample_weight_col is not None and should_include_sample_weight(estimator, "fit"):
             args["sample_weight"] = df[sample_weight_col].squeeze()

         fit_transform_result = estimator.fit_transform(**args)
snowflake/ml/modeling/calibration/calibrated_classifier_cv.py

@@ -167,9 +167,6 @@ class CalibratedClassifierCV(BaseTransformer):
         `estimator` trained on all the data.
         Note that this method is also internally implemented in
         :mod:`sklearn.svm` estimators with the `probabilities=True` parameter.
-
-    base_estimator: estimator instance
-        This parameter is deprecated. Use `estimator` instead.
     """

     def __init__(  # type: ignore[no-untyped-def]
@@ -180,7 +177,6 @@ class CalibratedClassifierCV(BaseTransformer):
         cv=None,
         n_jobs=None,
         ensemble=True,
-        base_estimator="deprecated",
         input_cols: Optional[Union[str, Iterable[str]]] = None,
         output_cols: Optional[Union[str, Iterable[str]]] = None,
         label_cols: Optional[Union[str, Iterable[str]]] = None,
@@ -200,16 +196,13 @@ class CalibratedClassifierCV(BaseTransformer):
         self._batch_size = -1
         deps: Set[str] = set([f'numpy=={np.__version__}', f'scikit-learn=={sklearn.__version__}', f'cloudpickle=={cp.__version__}'])
         deps = deps | gather_dependencies(estimator)
-        deps = deps | gather_dependencies(base_estimator)
         self._deps = list(deps)
         estimator = transform_snowml_obj_to_sklearn_obj(estimator)
-        base_estimator = transform_snowml_obj_to_sklearn_obj(base_estimator)
         init_args = {'estimator':(estimator, None, False),
                      'method':(method, "sigmoid", False),
                      'cv':(cv, None, False),
                      'n_jobs':(n_jobs, None, False),
-                     'ensemble':(ensemble, True, False),
-                     'base_estimator':(base_estimator, "deprecated", False),}
+                     'ensemble':(ensemble, True, False),}
         cleaned_up_init_args = validate_sklearn_args(
             args=init_args,
             klass=sklearn.calibration.CalibratedClassifierCV
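This follows upstream scikit-learn, which removed CalibratedClassifierCV's deprecated base_estimator parameter in favor of estimator. A migration sketch against the wrapper (column arguments omitted):

    from snowflake.ml.modeling.calibration import CalibratedClassifierCV
    from snowflake.ml.modeling.svm import LinearSVC

    # Before (1.6.x): CalibratedClassifierCV(base_estimator=LinearSVC(), ...)
    clf = CalibratedClassifierCV(estimator=LinearSVC(), method="sigmoid")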
snowflake/ml/modeling/cluster/agglomerative_clustering.py

@@ -113,28 +113,18 @@ class AgglomerativeClustering(BaseTransformer):
         The number of clusters to find. It must be ``None`` if
         ``distance_threshold`` is not ``None``.

-    affinity: str or callable, default='euclidean'
-        The metric to use when calculating distance between instances in a
-        feature array. If metric is a string or callable, it must be one of
-        the options allowed by :func:`sklearn.metrics.pairwise_distances` for
-        its metric parameter.
-        If linkage is "ward", only "euclidean" is accepted.
-        If "precomputed", a distance matrix (instead of a similarity matrix)
-        is needed as input for the fit method.
-
-    metric: str or callable, default=None
+    metric: str or callable, default="euclidean"
         Metric used to compute the linkage. Can be "euclidean", "l1", "l2",
-        "manhattan", "cosine", or "precomputed". If set to `None` then
-        "euclidean" is used. If linkage is "ward", only "euclidean" is
-        accepted. If "precomputed", a distance matrix is needed as input for
-        the fit method.
+        "manhattan", "cosine", or "precomputed". If linkage is "ward", only
+        "euclidean" is accepted. If "precomputed", a distance matrix is needed
+        as input for the fit method.

     memory: str or object with the joblib.Memory interface, default=None
         Used to cache the output of the computation of the tree.
         By default, no caching is done. If a string is given, it is the
         path to the caching directory.

-    connectivity: array-like or callable, default=None
+    connectivity: array-like, sparse matrix, or callable, default=None
         Connectivity matrix. Defines for each sample the neighboring
         samples following a given structure of the data.
         This can be a connectivity matrix itself or a callable that transforms
@@ -142,6 +132,10 @@ class AgglomerativeClustering(BaseTransformer):
         `kneighbors_graph`. Default is ``None``, i.e, the
         hierarchical clustering algorithm is unstructured.

+        For an example of connectivity matrix using
+        :class:`~sklearn.neighbors.kneighbors_graph`, see
+        :ref:`sphx_glr_auto_examples_cluster_plot_agglomerative_clustering.py`.
+
     compute_full_tree: 'auto' or bool, default='auto'
         Stop early the construction of the tree at ``n_clusters``. This is
         useful to decrease computation time if the number of clusters is not
@@ -167,6 +161,9 @@ class AgglomerativeClustering(BaseTransformer):
         - 'single' uses the minimum of the distances between all observations
           of the two sets.

+        For examples comparing different `linkage` criteria, see
+        :ref:`sphx_glr_auto_examples_cluster_plot_linkage_comparison.py`.
+
     distance_threshold: float, default=None
         The linkage distance threshold at or above which clusters will not be
         merged. If not ``None``, ``n_clusters`` must be ``None`` and
@@ -176,14 +173,16 @@ class AgglomerativeClustering(BaseTransformer):
         Computes distances between clusters even if `distance_threshold` is not
         used. This can be used to make dendrogram visualization, but introduces
         a computational and memory overhead.
+
+        For an example of dendrogram visualization, see
+        :ref:`sphx_glr_auto_examples_cluster_plot_agglomerative_dendrogram.py`.
     """

     def __init__(  # type: ignore[no-untyped-def]
         self,
         *,
         n_clusters=2,
-        affinity="deprecated",
-        metric=None,
+        metric="euclidean",
         memory=None,
         connectivity=None,
         compute_full_tree="auto",
@@ -212,8 +211,7 @@ class AgglomerativeClustering(BaseTransformer):
         self._deps = list(deps)

         init_args = {'n_clusters':(n_clusters, 2, False),
-                     'affinity':(affinity, "deprecated", False),
-                     'metric':(metric, None, False),
+                     'metric':(metric, "euclidean", False),
                      'memory':(memory, None, False),
                      'connectivity':(connectivity, None, False),
                      'compute_full_tree':(compute_full_tree, "auto", False),
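As in upstream scikit-learn, the deprecated affinity parameter is gone and metric (default "euclidean") replaces it, here and in FeatureAgglomeration below. A migration sketch:

    from snowflake.ml.modeling.cluster import AgglomerativeClustering

    # Before (1.6.x): AgglomerativeClustering(affinity="manhattan", linkage="average", ...)
    clus = AgglomerativeClustering(n_clusters=3, metric="manhattan", linkage="average")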
snowflake/ml/modeling/cluster/dbscan.py

@@ -117,8 +117,11 @@ class DBSCAN(BaseTransformer):
         and distance function.

     min_samples: int, default=5
-        The number of samples (or total weight) in a neighborhood for a point
-        to be considered as a core point. This includes the point itself.
+        The number of samples (or total weight) in a neighborhood for a point to
+        be considered as a core point. This includes the point itself. If
+        `min_samples` is set to a higher value, DBSCAN will find denser clusters,
+        whereas if it is set to a lower value, the found clusters will be more
+        sparse.

     metric: str, or callable, default='euclidean'
         The metric to use when calculating distance between instances in a
snowflake/ml/modeling/cluster/feature_agglomeration.py

@@ -113,28 +113,18 @@ class FeatureAgglomeration(BaseTransformer):
         The number of clusters to find. It must be ``None`` if
         ``distance_threshold`` is not ``None``.

-    affinity: str or callable, default='euclidean'
-        The metric to use when calculating distance between instances in a
-        feature array. If metric is a string or callable, it must be one of
-        the options allowed by :func:`sklearn.metrics.pairwise_distances` for
-        its metric parameter.
-        If linkage is "ward", only "euclidean" is accepted.
-        If "precomputed", a distance matrix (instead of a similarity matrix)
-        is needed as input for the fit method.
-
-    metric: str or callable, default=None
+    metric: str or callable, default="euclidean"
         Metric used to compute the linkage. Can be "euclidean", "l1", "l2",
-        "manhattan", "cosine", or "precomputed". If set to `None` then
-        "euclidean" is used. If linkage is "ward", only "euclidean" is
-        accepted. If "precomputed", a distance matrix is needed as input for
-        the fit method.
+        "manhattan", "cosine", or "precomputed". If linkage is "ward", only
+        "euclidean" is accepted. If "precomputed", a distance matrix is needed
+        as input for the fit method.

     memory: str or object with the joblib.Memory interface, default=None
         Used to cache the output of the computation of the tree.
         By default, no caching is done. If a string is given, it is the
         path to the caching directory.

-    connectivity: array-like or callable, default=None
+    connectivity: array-like, sparse matrix, or callable, default=None
         Connectivity matrix. Defines for each feature the neighboring
         features following a given structure of the data.
         This can be a connectivity matrix itself or a callable that transforms
@@ -187,8 +177,7 @@ class FeatureAgglomeration(BaseTransformer):
         self,
         *,
         n_clusters=2,
-        affinity="deprecated",
-        metric=None,
+        metric="euclidean",
         memory=None,
         connectivity=None,
         compute_full_tree="auto",
@@ -218,8 +207,7 @@ class FeatureAgglomeration(BaseTransformer):
         self._deps = list(deps)

         init_args = {'n_clusters':(n_clusters, 2, False),
-                     'affinity':(affinity, "deprecated", False),
-                     'metric':(metric, None, False),
+                     'metric':(metric, "euclidean", False),
                      'memory':(memory, None, False),
                      'connectivity':(connectivity, None, False),
                      'compute_full_tree':(compute_full_tree, "auto", False),