PyPI - distclassipy - Versions diffs - 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl - Mend

distclassipy 0.1.0py3-none-any.whl → 0.1.2py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

distclassipy/classifier.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import numpy as np
 import pandas as pd
-from scipy.spatial import distance
+import scipy
 from .distances import Distance
 from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
@@ -11,11 +11,68 @@ from typing import Callable
 class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
     """
-    Implement a distance metric classifier based on scikit-learn.
-    This classifier uses a specified distance metric to classify data points based on their distance to a training template. The training template is created using a specified statistical measure (e.g., median or mean). The classifier can be scaled in terms of standard deviations.
-    Attributes:
+    A distance-based classifier that supports the use of various distance metrics.
+    The distance metric classifier determines the similarity between features in a dataset by leveraging the use of different distance metrics to. A specified distance metric is used to compute the distance between a given object and a centroid for every training class in the feature space. The classifier supports the use of different statistical measures for constructing the centroid and scaling the computed distance. Additionally, the distance metric classifier also optionally provides an estimate of the confidence of the classifier's predictions.
+    Parameters
+    ----------
+    metric : str or callable, default="euclidean"
+        The distance metric to use for calculating the distance between features.
+    scale : bool, default=True
+        Whether to scale the distance between the test object and the centroid for a class in the feature space. If True, the data will be scaled based on the specified dispersion statistic.
+    central_stat : {"mean", "median"}, default="median"
+        The statistic used to calculate the central tendency of the data to construct the feature-space centroid. Supported statistics are "mean" and "median".
+    dispersion_stat : {"std", "iqr"}, default="std"
+        The statistic used to calculate the dispersion of the data for scaling the distance. Supported  statistics are "std" for standard deviation and "iqr" for inter-quartile range.
+        .. versionadded:: 0.1.0
+    calculate_kde : bool, default=False
+        Whether to calculate a kernel density estimate based confidence parameter.
+    calculate_1d_dist : bool, default=False
+        Whether to calculate the 1-dimensional distance based confidence parameter.
+    Attributes
+    ----------
+    metric : str or callable
+        The distance metric used for classification.
+    scale : bool
+        Indicates whether the data is scaled.
+    central_stat : str
+        The statistic used for calculating central tendency.
+    dispersion_stat : str
+        The statistic used for calculating dispersion.
+    calculate_kde : bool
+        Indicates whether a kernel density estimate is calculated.
+    calculate_1d_dist : bool
+        Indicates whether 1-dimensional distances are calculated.
+    See Also
+    --------
+    scipy.spatial.dist : Other distance metrics provided in SciPy
+    distclassipy.Distance : Distance metrics included with DistClassiPy
+    Notes
+    -----
+    If using distance metrics supported by SciPy, it is desirable to pass a string, which allows SciPy to use an optimized C version of the code instead of the slower Python version.
+    References
+    ----------
+    .. [1] "Light Curve Classification with DistClassiPy: a new distance-based classifier"
+    Examples
+    --------
+    >>> import distclassipy as dcpy
+    >>> from sklearn.datasets import make_classification
+    >>> X, y = make_classification(n_samples=1000, n_features=4,
+    ...                            n_informative=2, n_redundant=0,
+    ...                            random_state=0, shuffle=False)
+    >>> clf = dcpy.DistanceMetricClassifier(metric="canberra")
+    >>> clf.fit(X, y)
+    DistanceMetricClassifier(...)
+    >>> print(clf.predict([[0, 0, 0, 0]]))
+    [0]
     """
     def __init__(
@@ -29,21 +86,6 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
     ):
         """
         Initialize the classifier with specified parameters.
-        Parameters
-        ----------
-        metric : str or callable, optional
-            The distance metric to use. Default is 'euclidean'.
-        scale : bool, optional
-            If True, classifier is scaled in terms of standard deviations. Default is True.
-        central_stat : str, optional
-            The statistical measure to calculate the central tendency of the training template('median' or 'mean')
-        dispersion_stat : str, optional
-            The statistical measure to calculate the dispersion of the training template ('iqr' or 'std').
-        calculate_kde : bool, optional
-            If True, calculate the kernel density estimate. Default is True.
-        calculate_1d_dist : bool, optional
-            If True, calculate the 1-dimensional distance. Default is True.
         """
         self.metric = metric
         self.scale = scale
@@ -52,24 +94,68 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
         self.calculate_kde = calculate_kde
         self.calculate_1d_dist = calculate_1d_dist
+        # Hardcoded source packages to check for distance metrics.
+        self.metric_sources_ = {
+            "scipy.spatial.distance": scipy.spatial.distance,
+            "distances.Distance": Distance(),
+        }
+    def set_metric_fn_(self):
+        """
+        Set the metric function based on the provided metric.
+        If the metric is a string, the function will look for a corresponding function in scipy.spatial.distance or distances.Distance. If the metric is a function, it will be used directly.
+        """
+        if callable(self.metric):
+            self.metric_fn_ = self.metric
+            self.metric_arg_ = self.metric
+        elif isinstance(self.metric, str):
+            metric_str_lowercase = self.metric.lower()
+            metric_found = False
+            for package_str, source in self.metric_sources_.items():
+                if hasattr(source, metric_str_lowercase):
+                    self.metric_fn_ = getattr(source, metric_str_lowercase)
+                    metric_found = True
+                    if package_str == "scipy.spatial.distance":
+                        # Use the string as an argument if it belongs to scipy as it is optimized
+                        self.metric_arg_ = self.metric
+                    else:
+                        self.metric_arg_ = self.metric_fn_
+                    break
+            if not metric_found:
+                raise ValueError(
+                    f"{self.metric} metric not found. Please pass a string of the name of a metric in scipy.spatial.distance or distances.Distance, or pass a metric function directly. For a list of available metrics, see: https://sidchaini.github.io/DistClassiPy/distances.html or https://docs.scipy.org/doc/scipy/reference/spatial.distance.html"
+                )
     def fit(self, X: np.array, y: np.array, feat_labels: list[str] = None):
-        """Fit the classifier to the data.
+        """
+        Calculate the feature space centroid for all classes in the training set (X,y) using the central statistic. If scaling is enabled, also calculate the appropriate dispersion statistic.
-        This involves creating the training template and optionally calculating the kernel density estimate and 1-dimensional distance.
+        This involves computing the centroid for every class in the feature space and optionally calculating the kernel density estimate and 1-dimensional distance.
         Parameters
         ----------
         X : array-like of shape (n_samples, n_features)
             The training input samples.
         y : array-like of shape (n_samples,)
-            The target values.
-        feat_labels : list of str, optional
+            The target values (class labels).
+        feat_labels : list of str, optional, default=None
             The feature labels. If not provided, default labels representing feature number will be used.
+        Returns
+        -------
+        self : object
+            Fitted estimator.
         """
         X, y = check_X_y(X, y)
         self.classes_ = unique_labels(y)
         self.n_features_in_ = X.shape[1]
+        self.set_metric_fn_()
         if feat_labels is None:
             feat_labels = [f"Feature_{x}" for x in range(X.shape[1])]
@@ -112,7 +198,6 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
             self.df_iqr_ = df_iqr
         if self.calculate_kde:
-            self.set_metric_fn()
             self.kde_dict_ = {}
             for cl in self.classes_:
@@ -133,21 +218,26 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
         return self
     def predict(self, X: np.array):
-        """Predict the class labels for the provided data.
+        """Predict the class labels for the provided X.
-        The prediction is based on the distance of each data point to the training template.
+        The prediction is based on the distance of each data point in the input sample to the centroid for each class in the feature space. The predicted class is the one whose centroid is the closest to the input sample.
         Parameters
         ----------
         X : array-like of shape (n_samples, n_features)
             The input samples.
+        Returns
+        -------
+        y : ndarray of shape (n_samples,)
+            The predicted classes.
         """
         check_is_fitted(self, "is_fitted_")
         X = check_array(X)
         if not self.scale:
-            dist_arr = distance.cdist(
-                XA=X, XB=self.df_centroid_.to_numpy(), metric=self.metric
+            dist_arr = scipy.spatial.distance.cdist(
+                XA=X, XB=self.df_centroid_.to_numpy(), metric=self.metric_arg_
             )
         else:
@@ -164,48 +254,39 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
                 w = wtdf.loc[cl].to_numpy()  # 1/std dev
                 XB = XB * w  # w is for this class only
                 XA = X * w  # w is for this class only
-                cl_dist = distance.cdist(XA=XA, XB=XB, metric=self.metric)
+                cl_dist = scipy.spatial.distance.cdist(
+                    XA=XA, XB=XB, metric=self.metric_arg_
+                )
                 dist_arr_list.append(cl_dist)
             dist_arr = np.column_stack(dist_arr_list)
         y_pred = self.classes_[dist_arr.argmin(axis=1)]
         return y_pred
-    def set_metric_fn(self):
-        """
-        Set the metric function based on the specified metric.
-        If the metric is a string, the function will look for a corresponding function in scipy.spatial.distance or distances.Distance. If the metric is a function, it will be used directly.
-        """
-        if not callable(self.metric) or isinstance(self.metric, str):
-            if hasattr(distance, self.metric):
-                self.metric_fn_ = getattr(distance, self.metric)
-            elif hasattr(Distance(), self.metric):
-                self.metric_fn_ = getattr(Distance(), self.metric)
-            else:
-                raise ValueError(
-                    f"{self.metric} metric not found. Please pass a string of the name of a metric in scipy.spatial.distance or distances.Distance, or pass a metric function directly. For a list of available metrics, see: https://sidchaini.github.io/DistClassiPy/distances.html or https://docs.scipy.org/doc/scipy/reference/spatial.distance.html"
-                )
-        else:
-            self.metric_fn_ = self.metric
     def predict_and_analyse(self, X: np.array):
         """
-        Predict the class labels for the provided data and perform analysis.
+        Predict the class labels for the provided X and perform analysis.
+        The prediction is based on the distance of each data point in the input sample to the centroid for each class in the feature space. The predicted class is the one whose centroid is the closest to the input sample.
-        The analysis includes calculating the distance of each data point to the training template, and optionally calculating the kernel density estimate and 1-dimensional distance.
+        The analysis involves saving all calculated distances and confidences as an attribute for inspection and analysis later.
         Parameters
         ----------
         X : array-like of shape (n_samples, n_features)
             The input samples.
+        Returns
+        -------
+        y : ndarray of shape (n_samples,)
+            The predicted classes.
         """
         check_is_fitted(self, "is_fitted_")
         X = check_array(X)
         if not self.scale:
-            dist_arr = distance.cdist(
-                XA=X, XB=self.df_centroid_.to_numpy(), metric=self.metric
+            dist_arr = scipy.spatial.distance.cdist(
+                XA=X, XB=self.df_centroid_.to_numpy(), metric=self.metric_arg_
             )
         else:
@@ -222,7 +303,9 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
                 w = wtdf.loc[cl].to_numpy()  # 1/std dev
                 XB = XB * w  # w is for this class only
                 XA = X * w  # w is for this class only
-                cl_dist = distance.cdist(XA=XA, XB=XB, metric=self.metric)
+                cl_dist = scipy.spatial.distance.cdist(
+                    XA=XA, XB=XB, metric=self.metric_arg_
+                )
                 dist_arr_list.append(cl_dist)
             dist_arr = np.column_stack(dist_arr_list)
@@ -262,12 +345,12 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
             for cl in self.classes_:
                 sum_1d_dists = np.zeros(shape=(len(Xdf_temp)))
                 for feat in Xdf_temp.columns:
-                    dists = distance.cdist(
+                    dists = scipy.spatial.distance.cdist(
                         XA=np.zeros(shape=(1, 1)),
                         XB=(self.df_centroid_.loc[cl] - Xdf_temp)[feat]
                         .to_numpy()
                         .reshape(-1, 1),
-                        metric=self.metric,
+                        metric=self.metric_arg_,
                     ).ravel()
                     if self.scale and self.dispersion_stat == "std":
                         sum_1d_dists = sum_1d_dists + dists / self.df_std_.loc[cl, feat]
@@ -290,11 +373,11 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
         """
         Calculate the confidence for each prediction.
-        The confidence is calculated based on the distance of each data point to the training template, and optionally the kernel density estimate and 1-dimensional distance.
+        The confidence is calculated based on either the distance of each data point to the centroids of the training data, optionally the kernel density estimate or 1-dimensional distance.
         Parameters
         ----------
-        method : str, optional
+        method : {"distance_inverse", "1d_distance_inverse", "kde_likelihood"}, default="distance_inverse"
             The method to use for calculating confidence. Default is 'distance_inverse'.
         """
         check_is_fitted(self, "is_fitted_")

distclassipy 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

distclassipy 0.1.0py3-none-any.whl → 0.1.2py3-none-any.whl