distclassipy 0.1.5__tar.gz → 0.2.0a0__tar.gz

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as published.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: distclassipy
- Version: 0.1.5
+ Version: 0.2.0a0
  Summary: A python package for a distance-based classifier which can use several different distance metrics.
  Author-email: Siddharth Chaini <sidchaini@gmail.com>
  License: GNU GENERAL PUBLIC LICENSE
@@ -694,7 +694,7 @@ Requires-Python: >=3.10
  Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: joblib>=1.3.2
- Requires-Dist: numpy<2,>=1.25.2
+ Requires-Dist: numpy>=1.25.2
  Requires-Dist: pandas>=2.0.3
  Requires-Dist: scikit-learn>=1.2.2

@@ -740,9 +740,9 @@ X, y = make_classification(
      random_state=0,
      shuffle=False,
  )
- clf = dcpy.DistanceMetricClassifier(metric="canberra")
+ clf = dcpy.DistanceMetricClassifier()
  clf.fit(X, y)
- print(clf.predict([[0, 0, 0, 0]]))
+ print(clf.predict([[0, 0, 0, 0]], metric="canberra"))
  ```

  ## Features
@@ -765,27 +765,30 @@ DistClassiPy is released under the [GNU General Public License v3.0](https://www
  ## Citation

  If you use DistClassiPy in your research or project, please consider citing the paper:
- > Chaini, S., Mahabal, A., Kembhavi, A., & Bianco, F. B. (2024). Light Curve Classification with DistClassiPy: a new distance-based classifier. arXiv. https://doi.org/10.48550/arXiv.2403.12120
+ > Chaini, S., Mahabal, A., Kembhavi, A., & Bianco, F. B. (2024). Light Curve Classification with DistClassiPy: a new distance-based classifier. Astronomy and Computing. https://doi.org/10.1016/j.ascom.2024.100850.

  ### Bibtex


  ```bibtex
- @ARTICLE{chaini2024light,
-        author = {{Chaini}, Siddharth and {Mahabal}, Ashish and {Kembhavi}, Ajit and {Bianco}, Federica B.},
-         title = "{Light Curve Classification with DistClassiPy: a new distance-based classifier}",
-       journal = {arXiv e-prints},
-      keywords = {Astrophysics - Instrumentation and Methods for Astrophysics, Astrophysics - Solar and Stellar Astrophysics, Computer Science - Machine Learning},
-          year = 2024,
-         month = mar,
-           eid = {arXiv:2403.12120},
-         pages = {arXiv:2403.12120},
- archivePrefix = {arXiv},
+ @ARTICLE{2024A&C....4800850C,
+        author = {{Chaini}, S. and {Mahabal}, A. and {Kembhavi}, A. and {Bianco}, F.~B.},
+         title = "{Light curve classification with DistClassiPy: A new distance-based classifier}",
+       journal = {Astronomy and Computing},
+      keywords = {Variable stars (1761), Astronomy data analysis (1858), Open source software (1866), Astrostatistics (1882), Classification (1907), Light curve classification (1954), Astrophysics - Instrumentation and Methods for Astrophysics, Astrophysics - Solar and Stellar Astrophysics, Computer Science - Machine Learning},
+          year = 2024,
+         month = jul,
+        volume = {48},
+           eid = {100850},
+         pages = {100850},
+           doi = {10.1016/j.ascom.2024.100850},
+ archivePrefix = {arXiv},
         eprint = {2403.12120},
-  primaryClass = {astro-ph.IM},
-        adsurl = {https://ui.adsabs.harvard.edu/abs/2024arXiv240312120C},
-       adsnote = {Provided by the SAO/NASA Astrophysics Data System}
+  primaryClass = {astro-ph.IM},
+        adsurl = {https://ui.adsabs.harvard.edu/abs/2024A&C....4800850C},
+       adsnote = {Provided by the SAO/NASA Astrophysics Data System}
  }
+
  ```


@@ -40,9 +40,9 @@ X, y = make_classification(
      random_state=0,
      shuffle=False,
  )
- clf = dcpy.DistanceMetricClassifier(metric="canberra")
+ clf = dcpy.DistanceMetricClassifier()
  clf.fit(X, y)
- print(clf.predict([[0, 0, 0, 0]]))
+ print(clf.predict([[0, 0, 0, 0]], metric="canberra"))
  ```

  ## Features
@@ -65,27 +65,30 @@ DistClassiPy is released under the [GNU General Public License v3.0](https://www
  ## Citation

  If you use DistClassiPy in your research or project, please consider citing the paper:
- > Chaini, S., Mahabal, A., Kembhavi, A., & Bianco, F. B. (2024). Light Curve Classification with DistClassiPy: a new distance-based classifier. arXiv. https://doi.org/10.48550/arXiv.2403.12120
+ > Chaini, S., Mahabal, A., Kembhavi, A., & Bianco, F. B. (2024). Light Curve Classification with DistClassiPy: a new distance-based classifier. Astronomy and Computing. https://doi.org/10.1016/j.ascom.2024.100850.

  ### Bibtex


  ```bibtex
- @ARTICLE{chaini2024light,
-        author = {{Chaini}, Siddharth and {Mahabal}, Ashish and {Kembhavi}, Ajit and {Bianco}, Federica B.},
-         title = "{Light Curve Classification with DistClassiPy: a new distance-based classifier}",
-       journal = {arXiv e-prints},
-      keywords = {Astrophysics - Instrumentation and Methods for Astrophysics, Astrophysics - Solar and Stellar Astrophysics, Computer Science - Machine Learning},
-          year = 2024,
-         month = mar,
-           eid = {arXiv:2403.12120},
-         pages = {arXiv:2403.12120},
- archivePrefix = {arXiv},
+ @ARTICLE{2024A&C....4800850C,
+        author = {{Chaini}, S. and {Mahabal}, A. and {Kembhavi}, A. and {Bianco}, F.~B.},
+         title = "{Light curve classification with DistClassiPy: A new distance-based classifier}",
+       journal = {Astronomy and Computing},
+      keywords = {Variable stars (1761), Astronomy data analysis (1858), Open source software (1866), Astrostatistics (1882), Classification (1907), Light curve classification (1954), Astrophysics - Instrumentation and Methods for Astrophysics, Astrophysics - Solar and Stellar Astrophysics, Computer Science - Machine Learning},
+          year = 2024,
+         month = jul,
+        volume = {48},
+           eid = {100850},
+         pages = {100850},
+           doi = {10.1016/j.ascom.2024.100850},
+ archivePrefix = {arXiv},
         eprint = {2403.12120},
-  primaryClass = {astro-ph.IM},
-        adsurl = {https://ui.adsabs.harvard.edu/abs/2024arXiv240312120C},
-       adsnote = {Provided by the SAO/NASA Astrophysics Data System}
+  primaryClass = {astro-ph.IM},
+        adsurl = {https://ui.adsabs.harvard.edu/abs/2024A&C....4800850C},
+       adsnote = {Provided by the SAO/NASA Astrophysics Data System}
  }
+
  ```


@@ -25,4 +25,4 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
  from .classifier import DistanceMetricClassifier  # noqa
  from .distances import Distance  # noqa

- __version__ = "0.1.5"
+ __version__ = "0.2.0a0"
@@ -28,7 +28,6 @@ import pandas as pd
  import scipy

  from sklearn.base import BaseEstimator, ClassifierMixin
- from sklearn.neighbors import KernelDensity
  from sklearn.utils.multiclass import unique_labels
  from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

@@ -41,6 +40,52 @@ METRIC_SOURCES_ = {
  }


+ def initialize_metric_function(metric):
+     """Set the metric function based on the provided metric.
+
+     If the metric is a string, the function will look for a corresponding
+     function in scipy.spatial.distance or distances.Distance. If the metric
+     is a function, it will be used directly.
+     """
+     if callable(metric):
+         metric_fn_ = metric
+         metric_arg_ = metric
+
+     elif isinstance(metric, str):
+         metric_str_lowercase = metric.lower()
+         metric_found = False
+         for package_str, source in METRIC_SOURCES_.items():
+
+             # Don't use scipy for jaccard as their implementation only works with
+             # booleans - use custom jaccard instead
+             if (
+                 package_str == "scipy.spatial.distance"
+                 and metric_str_lowercase == "jaccard"
+             ):
+                 continue
+
+             if hasattr(source, metric_str_lowercase):
+                 metric_fn_ = getattr(source, metric_str_lowercase)
+                 metric_found = True
+
+                 # Use the string as an argument if it belongs to scipy as it is
+                 # optimized
+                 metric_arg_ = (
+                     metric if package_str == "scipy.spatial.distance" else metric_fn_
+                 )
+                 break
+         if not metric_found:
+             raise ValueError(
+                 f"{metric} metric not found. Please pass a string of the "
+                 "name of a metric in scipy.spatial.distance or "
+                 "distances.Distance, or pass a metric function directly. For a "
+                 "list of available metrics, see: "
+                 "https://sidchaini.github.io/DistClassiPy/distances.html or "
+                 "https://docs.scipy.org/doc/scipy/reference/spatial.distance.html"
+             )
+     return metric_fn_, metric_arg_
+
+
  class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
      """A distance-based classifier that supports different distance metrics.

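The hunk above introduces `initialize_metric_function` as a module-level helper (the old instance method is deleted in a later hunk). A minimal sketch of how it resolves a metric, assuming the helper is importable as `distclassipy.classifier.initialize_metric_function` and that `METRIC_SOURCES_` maps `scipy.spatial.distance` alongside distclassipy's own `Distance` collection, as the hunk's context suggests:

```python
import scipy.spatial.distance as ssd

# Assumed import path; the diff only shows the function being added to the
# module that defines DistanceMetricClassifier.
from distclassipy.classifier import initialize_metric_function

# A SciPy-backed name resolves to the string itself, so cdist can later
# dispatch to SciPy's optimized C implementation.
fn, arg = initialize_metric_function("cityblock")
print(callable(fn), arg)  # True cityblock

# "jaccard" is special-cased: SciPy's version only handles booleans, so the
# custom distclassipy implementation (a callable) is returned instead.
fn, arg = initialize_metric_function("jaccard")
print(arg is fn)  # True

# A callable is passed through unchanged.
fn, arg = initialize_metric_function(ssd.canberra)
print(fn is ssd.canberra)  # True

# Unknown names raise ValueError with pointers to the available metrics:
# initialize_metric_function("no_such_metric")
```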
@@ -54,8 +99,6 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):

      Parameters
      ----------
-     metric : str or callable, default="euclidean"
-         The distance metric to use for calculating the distance between features.
      scale : bool, default=True
          Whether to scale the distance between the test object and the centroid for a
          class in the feature space. If True, the data will be scaled based on the
@@ -70,36 +113,15 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):

      .. versionadded:: 0.1.0

-     calculate_kde : bool, default=False
-         Whether to calculate a kernel density estimate based confidence parameter.
-     calculate_1d_dist : bool, default=False
-         Whether to calculate the 1-dimensional distance based confidence parameter.

      Attributes
      ----------
-     metric : str or callable
-         The distance metric used for classification.
      scale : bool
          Indicates whether the data is scaled.
      central_stat : str
          The statistic used for calculating central tendency.
      dispersion_stat : str
          The statistic used for calculating dispersion.
-     calculate_kde : bool
-         Indicates whether a kernel density estimate is calculated.
-     calculate_1d_dist : bool
-         Indicates whether 1-dimensional distances are calculated.
-
-     See Also
-     --------
-     scipy.spatial.dist : Other distance metrics provided in SciPy
-     distclassipy.Distance : Distance metrics included with DistClassiPy
-
-     Notes
-     -----
-     If using distance metrics supported by SciPy, it is desirable to pass a string,
-     which allows SciPy to use an optimized C version of the code instead of the slower
-     Python version.

      References
      ----------
@@ -122,66 +144,14 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):

      def __init__(
          self,
-         metric: str | Callable = "euclidean",
          scale: bool = True,
          central_stat: str = "median",
          dispersion_stat: str = "std",
-         calculate_kde: bool = True,
-         calculate_1d_dist: bool = True,
      ):
          """Initialize the classifier with specified parameters."""
-         self.metric = metric
          self.scale = scale
          self.central_stat = central_stat
          self.dispersion_stat = dispersion_stat
-         self.calculate_kde = calculate_kde
-         self.calculate_1d_dist = calculate_1d_dist
-
-     def initialize_metric_function(self):
-         """Set the metric function based on the provided metric.
-
-         If the metric is a string, the function will look for a corresponding
-         function in scipy.spatial.distance or distances.Distance. If the metric
-         is a function, it will be used directly.
-         """
-         if callable(self.metric):
-             self.metric_fn_ = self.metric
-             self.metric_arg_ = self.metric
-
-         elif isinstance(self.metric, str):
-             metric_str_lowercase = self.metric.lower()
-             metric_found = False
-             for package_str, source in METRIC_SOURCES_.items():
-
-                 # Don't use scipy for jaccard as their implementation only works with
-                 # booleans - use custom jaccard instead
-                 if (
-                     package_str == "scipy.spatial.distance"
-                     and metric_str_lowercase == "jaccard"
-                 ):
-                     continue
-
-                 if hasattr(source, metric_str_lowercase):
-                     self.metric_fn_ = getattr(source, metric_str_lowercase)
-                     metric_found = True
-
-                     # Use the string as an argument if it belongs to scipy as it is
-                     # optimized
-                     self.metric_arg_ = (
-                         self.metric
-                         if package_str == "scipy.spatial.distance"
-                         else self.metric_fn_
-                     )
-                     break
-             if not metric_found:
-                 raise ValueError(
-                     f"{self.metric} metric not found. Please pass a string of the "
-                     "name of a metric in scipy.spatial.distance or "
-                     "distances.Distance, or pass a metric function directly. For a "
-                     "list of available metrics, see: "
-                     "https://sidchaini.github.io/DistClassiPy/distances.html or "
-                     "https://docs.scipy.org/doc/scipy/reference/spatial.distance.html"
-                 )

      def fit(self, X: np.array, y: np.array, feat_labels: list[str] = None):
          """Calculate the feature space centroid for all classes.
@@ -209,9 +179,9 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
          """
          X, y = check_X_y(X, y)
          self.classes_ = unique_labels(y)
-         self.n_features_in_ = X.shape[1]
-
-         self.initialize_metric_function()
+         self.n_features_in_ = X.shape[
+             1
+         ]  # Number of features seen during fit - required for sklearn compatibility.

          if feat_labels is None:
              feat_labels = [f"Feature_{x}" for x in range(X.shape[1])]
@@ -256,27 +226,15 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
          )
          self.df_iqr_ = df_iqr

-         if self.calculate_kde:
-             self.kde_dict_ = {}
-
-             for cl in self.classes_:
-                 subX = X[y == cl]
-                 # Implement the following in an if-else to save computational time.
-                 # kde = KernelDensity(bandwidth='scott', metric=self.metric)
-                 # kde.fit(subX)
-                 kde = KernelDensity(
-                     bandwidth="scott",
-                     metric="pyfunc",
-                     metric_params={"func": self.metric_fn_},
-                 )
-                 kde.fit(subX)
-                 self.kde_dict_[cl] = kde
-
          self.is_fitted_ = True

          return self

-     def predict(self, X: np.array):
+     def predict(
+         self,
+         X: np.array,
+         metric: str | Callable = "euclidean",
+     ):
          """Predict the class labels for the provided X.

          The prediction is based on the distance of each data point in the input sample
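Since `predict` now resolves the metric on every call, a custom callable can also be supplied. Per the docstring Notes added in the next hunk, a string name remains preferable for SciPy metrics because `cdist` then runs the optimized C path. A sketch reusing `clf` and `X` from the previous snippet; `my_canberra` is an illustrative name, not part of the library:

```python
import numpy as np

def my_canberra(u, v):
    # Canberra distance: sum(|u - v| / (|u| + |v|)), treating 0/0 terms as 0.
    denom = np.abs(u) + np.abs(v)
    return np.sum(np.where(denom > 0, np.abs(u - v) / denom, 0.0))

labels_callable = clf.predict(X[:5], metric=my_canberra)  # slower Python path
labels_string = clf.predict(X[:5], metric="canberra")     # optimized C path
print((labels_callable == labels_string).all())           # expected: True
```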
@@ -287,18 +245,33 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
          ----------
          X : array-like of shape (n_samples, n_features)
              The input samples.
+         metric : str or callable, default="euclidean"
+             The distance metric to use for calculating the distance between features.

          Returns
          -------
          y : ndarray of shape (n_samples,)
              The predicted classes.
+
+         See Also
+         --------
+         scipy.spatial.dist : Other distance metrics provided in SciPy
+         distclassipy.Distance : Distance metrics included with DistClassiPy
+
+         Notes
+         -----
+         If using distance metrics supported by SciPy, it is desirable to pass a string,
+         which allows SciPy to use an optimized C version of the code instead of the
+         slower Python version.
          """
          check_is_fitted(self, "is_fitted_")
          X = check_array(X)

+         metric_fn_, metric_arg_ = initialize_metric_function(metric)
+
          if not self.scale:
              dist_arr = scipy.spatial.distance.cdist(
-                 XA=X, XB=self.df_centroid_.to_numpy(), metric=self.metric_arg_
+                 XA=X, XB=self.df_centroid_.to_numpy(), metric=metric_arg_
              )

          else:
@@ -315,16 +288,18 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
                  w = wtdf.loc[cl].to_numpy()  # 1/std dev
                  XB = XB * w  # w is for this class only
                  XA = X * w  # w is for this class only
-                 cl_dist = scipy.spatial.distance.cdist(
-                     XA=XA, XB=XB, metric=self.metric_arg_
-                 )
+                 cl_dist = scipy.spatial.distance.cdist(XA=XA, XB=XB, metric=metric_arg_)
                  dist_arr_list.append(cl_dist)
              dist_arr = np.column_stack(dist_arr_list)

          y_pred = self.classes_[dist_arr.argmin(axis=1)]
          return y_pred

-     def predict_and_analyse(self, X: np.array):
+     def predict_and_analyse(
+         self,
+         X: np.array,
+         metric: str | Callable = "euclidean",
+     ):
          """Predict the class labels for the provided X and perform analysis.

          The prediction is based on the distance of each data point in the input sample
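For reference, the scaled branch shown in context above weights each feature by the inverse of the class's dispersion before calling `cdist`, so features with a large spread within a class contribute less to the distance. A standalone NumPy sketch of that step (all names here are illustrative):

```python
import numpy as np
import scipy.spatial.distance

rng = np.random.default_rng(0)
X_test = rng.normal(size=(5, 4))      # test samples
centroid = rng.normal(size=(1, 4))    # centroid of one class
std = rng.uniform(0.5, 2.0, size=4)   # per-feature dispersion of that class

w = 1.0 / std                         # inverse-dispersion weights
cl_dist = scipy.spatial.distance.cdist(
    XA=X_test * w, XB=centroid * w, metric="euclidean"
)
print(cl_dist.ravel())                # one scaled distance per test sample
```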
@@ -338,18 +313,35 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
          ----------
          X : array-like of shape (n_samples, n_features)
              The input samples.
+         metric : str or callable, default="euclidean"
+             The distance metric to use for calculating the distance between features.
+

          Returns
          -------
          y : ndarray of shape (n_samples,)
              The predicted classes.
+
+         See Also
+         --------
+         scipy.spatial.dist : Other distance metrics provided in SciPy
+         distclassipy.Distance : Distance metrics included with DistClassiPy
+
+         Notes
+         -----
+         If using distance metrics supported by SciPy, it is desirable to pass a string,
+         which allows SciPy to use an optimized C version of the code instead
+         of the slower Python version.
+
          """
          check_is_fitted(self, "is_fitted_")
          X = check_array(X)

+         metric_fn_, metric_arg_ = initialize_metric_function(metric)
+
          if not self.scale:
              dist_arr = scipy.spatial.distance.cdist(
-                 XA=X, XB=self.df_centroid_.to_numpy(), metric=self.metric_arg_
+                 XA=X, XB=self.df_centroid_.to_numpy(), metric=metric_arg_
              )

          else:
@@ -366,9 +358,7 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
                  w = wtdf.loc[cl].to_numpy()  # 1/std dev
                  XB = XB * w  # w is for this class only
                  XA = X * w  # w is for this class only
-                 cl_dist = scipy.spatial.distance.cdist(
-                     XA=XA, XB=XB, metric=self.metric_arg_
-                 )
+                 cl_dist = scipy.spatial.distance.cdist(XA=XA, XB=XB, metric=metric_arg_)
                  dist_arr_list.append(cl_dist)
              dist_arr = np.column_stack(dist_arr_list)

@@ -381,68 +371,15 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):

          y_pred = self.classes_[dist_arr.argmin(axis=1)]

-         if self.calculate_kde:
-             # NEW: Rescale in terms of median likelihoods - calculate here
-             scale_factors = np.exp(
-                 [
-                     self.kde_dict_[cl].score_samples(
-                         self.df_centroid_.loc[cl].to_numpy().reshape(1, -1)
-                     )[0]
-                     for cl in self.classes_
-                 ]
-             )
-
-             likelihood_arr = []
-             for k in self.kde_dict_.keys():
-                 log_pdf = self.kde_dict_[k].score_samples(X)
-                 likelihood_val = np.exp(log_pdf)
-                 likelihood_arr.append(likelihood_val)
-             self.likelihood_arr_ = np.array(likelihood_arr).T
-
-             # NEW: Rescale in terms of median likelihoods - rescale here
-             self.likelihood_arr_ = self.likelihood_arr_ / scale_factors
-
-         if self.calculate_1d_dist:
-             conf_cl = []
-             Xdf_temp = pd.DataFrame(data=X, columns=self.df_centroid_.columns)
-             for cl in self.classes_:
-                 sum_1d_dists = np.zeros(shape=(len(Xdf_temp)))
-                 for feat in Xdf_temp.columns:
-                     dists = scipy.spatial.distance.cdist(
-                         XA=np.zeros(shape=(1, 1)),
-                         XB=(self.df_centroid_.loc[cl] - Xdf_temp)[feat]
-                         .to_numpy()
-                         .reshape(-1, 1),
-                         metric=self.metric_arg_,
-                     ).ravel()
-                     if self.scale and self.dispersion_stat == "std":
-                         sum_1d_dists = sum_1d_dists + dists / self.df_std_.loc[cl, feat]
-                     elif self.scale and self.dispersion_stat == "std":
-                         sum_1d_dists = sum_1d_dists + dists / self.df_iqr_.loc[cl, feat]
-                     else:
-                         sum_1d_dists = sum_1d_dists + dists
-                 confs = 1 / np.clip(sum_1d_dists, a_min=np.finfo(float).eps, a_max=None)
-                 conf_cl.append(confs)
-             conf_cl = np.array(conf_cl)
-             self.conf_cl_ = conf_cl
-
          self.analyis_ = True

          return y_pred

-     def calculate_confidence(self, method: str = "distance_inverse"):
+     def calculate_confidence(self):
          """Calculate the confidence for each prediction.

-         The confidence is calculated based on either the distance of each data point to
-         the centroids of the training data, optionally the kernel density estimate or
-         1-dimensional distance.
-
-         Parameters
-         ----------
-         method : {"distance_inverse", "1d_distance_inverse","kde_likelihood"},
-             default="distance_inverse"
-             The method to use for calculating confidence. Default is
-             'distance_inverse'.
+         The confidence is calculated as the inverse of the distance of each data point
+         to the centroids of the training data.
          """
          check_is_fitted(self, "is_fitted_")
          if not hasattr(self, "analyis_"):
@@ -452,34 +389,11 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
              )

          # Calculate confidence for each prediction
-         if method == "distance_inverse":
-             self.confidence_df_ = 1 / np.clip(
-                 self.centroid_dist_df_, a_min=np.finfo(float).eps, a_max=None
-             )
-             self.confidence_df_.columns = [
-                 x.replace("_dist", "_conf") for x in self.confidence_df_.columns
-             ]
-
-         elif method == "1d_distance_inverse":
-             if not self.calculate_1d_dist:
-                 raise ValueError(
-                     "method='1d_distance_inverse' is only valid if calculate_1d_dist "
-                     "is set to True"
-                 )
-             self.confidence_df_ = pd.DataFrame(
-                 data=self.conf_cl_.T, columns=[f"{x}_conf" for x in self.classes_]
-             )
-
-         elif method == "kde_likelihood":
-             if not self.calculate_kde:
-                 raise ValueError(
-                     "method='kde_likelihood' is only valid if calculate_kde is set "
-                     "to True"
-                 )
-
-             self.confidence_df_ = pd.DataFrame(
-                 data=self.likelihood_arr_,
-                 columns=[f"{x}_conf" for x in self.kde_dict_.keys()],
-             )
+         self.confidence_df_ = 1 / np.clip(
+             self.centroid_dist_df_, a_min=np.finfo(float).eps, a_max=None
+         )
+         self.confidence_df_.columns = [
+             x.replace("_dist", "_conf") for x in self.confidence_df_.columns
+         ]

          return self.confidence_df_.to_numpy()
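With the KDE and 1-d options removed, confidence in 0.2.0a0 is simply the inverse of the centroid distance, clipped away from zero. A sketch reusing `clf` and `X` from the earlier snippets; `calculate_confidence` requires a prior call to `predict_and_analyse`, and the final check assumes the confidence columns follow the order of `clf.classes_`:

```python
labels = clf.predict_and_analyse(X, metric="canberra")
conf = clf.calculate_confidence()  # ndarray of shape (n_samples, n_classes)

# conf = 1 / max(distance, eps), so the nearest (i.e. predicted) class is
# also the most confident one.
print((clf.classes_[conf.argmax(axis=1)] == labels).all())  # expected: True
```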