distclassipy 0.2.0a0__tar.gz → 0.2.1__tar.gz

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: distclassipy
3
- Version: 0.2.0a0
3
+ Version: 0.2.1
4
4
  Summary: A python package for a distance-based classifier which can use several different distance metrics.
5
5
  Author-email: Siddharth Chaini <sidchaini@gmail.com>
6
6
  License: GNU GENERAL PUBLIC LICENSE
@@ -740,17 +740,25 @@ X, y = make_classification(
740
740
  random_state=0,
741
741
  shuffle=False,
742
742
  )
743
+ # Example usage of DistanceMetricClassifier
743
744
  clf = dcpy.DistanceMetricClassifier()
744
745
  clf.fit(X, y)
745
- print(clf.predict([[0, 0, 0, 0]]), metric="canberra")
746
+ print(clf.predict([[0, 0, 0, 0]], metric="canberra"))
747
+
748
+ # Example usage of EnsembleDistanceClassifier
749
+ ensemble_clf = dcpy.EnsembleDistanceClassifier(feat_idx=0)
750
+ ensemble_clf.fit(X, y)
751
+ print(ensemble_clf.predict(X))
746
752
  ```
747
753
 
748
754
  ## Features
749
755
  - **Distance Metric-Based Classification**: Utilizes a variety of distance metrics for classification.
750
756
  - **Customizable for Scientific Goals**: Allows fine-tuning based on scientific objectives by selecting appropriate distance metrics and features, enhancing both computational efficiency and model performance.
751
757
  - **Interpretable Results**: Offers improved interpretability of classification outcomes by directly using distance metrics and feature importance, making it ideal for scientific applications.
752
- - **Efficient and Scalable**: Demonstrates lower computational requirements compared to traditional methods like Random Forests, making it suitable for large datasets
753
- - **Open Source and Accessible**: Available as an open-source Python package on PyPI, encouraging broad application in astronomy and beyond
758
+ - **Efficient and Scalable**: Demonstrates lower computational requirements compared to traditional methods like Random Forests, making it suitable for large datasets.
759
+ - **Open Source and Accessible**: Available as an open-source Python package on PyPI, encouraging broad application in astronomy and beyond.
760
+ - **(NEW) Ensemble Distance Classification**: Leverages an ensemble approach to use different distance metrics for each quantile, improving classification performance across diverse data distributions.
761
+ - **(NEW) Expanded Distance Metrics**: DistClassiPy now offers 43 built-in distance metrics, an increase from the previous 18. Additionally, users can still define and use custom distance metrics as needed.
754
762
 
755
763
  ## Documentation
756
764
 
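The quick-start snippet added in the hunk above is shown without its imports or the full `make_classification` call. Assembled into a self-contained form (assuming the README's `dcpy` alias comes from `import distclassipy as dcpy`, and reusing the generator arguments from the classifier's own docstring example), it would look roughly like this:

```python
# Hedged sketch assembled from the README snippet above; assumes distclassipy is installed.
import distclassipy as dcpy
from sklearn.datasets import make_classification

X, y = make_classification(
    n_samples=1000, n_features=4, n_informative=2, n_redundant=0,
    random_state=0, shuffle=False,
)

# DistanceMetricClassifier: the distance metric is chosen per predict() call.
clf = dcpy.DistanceMetricClassifier()
clf.fit(X, y)
print(clf.predict([[0, 0, 0, 0]], metric="canberra"))

# EnsembleDistanceClassifier: picks the best metric per quantile of the chosen feature.
ensemble_clf = dcpy.EnsembleDistanceClassifier(feat_idx=0)
ensemble_clf.fit(X, y)
print(ensemble_clf.predict(X))
```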
@@ -40,17 +40,25 @@ X, y = make_classification(
40
40
  random_state=0,
41
41
  shuffle=False,
42
42
  )
43
+ # Example usage of DistanceMetricClassifier
43
44
  clf = dcpy.DistanceMetricClassifier()
44
45
  clf.fit(X, y)
45
- print(clf.predict([[0, 0, 0, 0]]), metric="canberra")
46
+ print(clf.predict([[0, 0, 0, 0]], metric="canberra"))
47
+
48
+ # Example usage of EnsembleDistanceClassifier
49
+ ensemble_clf = dcpy.EnsembleDistanceClassifier(feat_idx=0)
50
+ ensemble_clf.fit(X, y)
51
+ print(ensemble_clf.predict(X))
46
52
  ```
47
53
 
48
54
  ## Features
49
55
  - **Distance Metric-Based Classification**: Utilizes a variety of distance metrics for classification.
50
56
  - **Customizable for Scientific Goals**: Allows fine-tuning based on scientific objectives by selecting appropriate distance metrics and features, enhancing both computational efficiency and model performance.
51
57
  - **Interpretable Results**: Offers improved interpretability of classification outcomes by directly using distance metrics and feature importance, making it ideal for scientific applications.
52
- - **Efficient and Scalable**: Demonstrates lower computational requirements compared to traditional methods like Random Forests, making it suitable for large datasets
53
- - **Open Source and Accessible**: Available as an open-source Python package on PyPI, encouraging broad application in astronomy and beyond
58
+ - **Efficient and Scalable**: Demonstrates lower computational requirements compared to traditional methods like Random Forests, making it suitable for large datasets.
59
+ - **Open Source and Accessible**: Available as an open-source Python package on PyPI, encouraging broad application in astronomy and beyond.
60
+ - **(NEW) Ensemble Distance Classification**: Leverages an ensemble approach to use different distance metrics for each quantile, improving classification performance across diverse data distributions.
61
+ - **(NEW) Expanded Distance Metrics**: DistClassiPy now offers 43 built-in distance metrics, an increase from the previous 18. Additionally, users can still define and use custom distance metrics as needed.
54
62
 
55
63
  ## Documentation
56
64
 
@@ -22,7 +22,17 @@ You should have received a copy of the GNU General Public License
22
22
  along with this program. If not, see <https://www.gnu.org/licenses/>.
23
23
  """
24
24
 
25
- from .classifier import DistanceMetricClassifier # noqa
26
- from .distances import Distance # noqa
25
+ from .classifier import (
26
+ DistanceMetricClassifier,
27
+ EnsembleDistanceClassifier,
28
+ )
29
+ from .distances import Distance, _ALL_METRICS
27
30
 
28
- __version__ = "0.2.0a0"
31
+ __version__ = "0.2.1"
32
+
33
+ __all__ = [
34
+ "DistanceMetricClassifier",
35
+ "EnsembleDistanceClassifier",
36
+ "Distance",
37
+ "_ALL_METRICS",
38
+ ]
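A quick check of the new public surface defined by this `__init__.py` (a sketch; the printed values are only what the hunk above implies):

```python
import distclassipy

print(distclassipy.__version__)  # "0.2.1" per the hunk above
print(distclassipy.__all__)      # both classifiers, Distance, and _ALL_METRICS

# Both classifiers are now importable from the package root.
from distclassipy import DistanceMetricClassifier, EnsembleDistanceClassifier
```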
@@ -3,6 +3,21 @@
3
3
  This module contains the DistanceMetricClassifier introduced by Chaini et al. (2024)
4
4
  in "Light Curve Classification with DistClassiPy: a new distance-based classifier"
5
5
 
6
+
7
+ .. autoclass:: distclassipy.classifier.DistanceMetricClassifier
8
+ :members:
9
+ :inherited-members:
10
+ :exclude-members: set_fit_request, set_predict_request
11
+
12
+ .. autoclass:: distclassipy.classifier.EnsembleDistanceClassifier
13
+ :members:
14
+ :inherited-members:
15
+ :exclude-members: set_fit_request, set_predict_request
16
+
17
+ .. doctest-skip::
18
+
19
+ .. skip::
20
+
6
21
  Copyright (C) 2024 Siddharth Chaini
7
22
  -----
8
23
  This program is free software: you can redistribute it and/or modify
@@ -19,7 +34,7 @@ You should have received a copy of the GNU General Public License
19
34
  along with this program. If not, see <https://www.gnu.org/licenses/>.
20
35
  """
21
36
 
22
- from typing import Callable
37
+ from typing import Callable, Tuple
23
38
 
24
39
  import numpy as np
25
40
 
@@ -28,10 +43,12 @@ import pandas as pd
28
43
  import scipy
29
44
 
30
45
  from sklearn.base import BaseEstimator, ClassifierMixin
46
+ from sklearn.metrics import accuracy_score
47
+ from sklearn.model_selection import train_test_split
31
48
  from sklearn.utils.multiclass import unique_labels
32
49
  from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
33
50
 
34
- from .distances import Distance
51
+ from .distances import Distance, _ALL_METRICS
35
52
 
36
53
  # Hardcoded source packages to check for distance metrics.
37
54
  METRIC_SOURCES_ = {
@@ -113,16 +130,6 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
113
130
 
114
131
  .. versionadded:: 0.1.0
115
132
 
116
-
117
- Attributes
118
- ----------
119
- scale : bool
120
- Indicates whether the data is scaled.
121
- central_stat : str
122
- The statistic used for calculating central tendency.
123
- dispersion_stat : str
124
- The statistic used for calculating dispersion.
125
-
126
133
  References
127
134
  ----------
128
135
  .. [1] "Light Curve Classification with DistClassiPy: a new distance-based
@@ -135,10 +142,10 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
135
142
  >>> X, y = make_classification(n_samples=1000, n_features=4,
136
143
  ... n_informative=2, n_redundant=0,
137
144
  ... random_state=0, shuffle=False)
138
- >>> clf = dcpy.DistanceMetricClassifier(metric="canberra")
145
+ >>> clf = dcpy.DistanceMetricClassifier()
139
146
  >>> clf.fit(X, y)
140
147
  DistanceMetricClassifier(...)
141
- >>> print(clf.predict([[0, 0, 0, 0]]))
148
+ >>> print(clf.predict([[0, 0, 0, 0]], metric="canberra"))
142
149
  [0]
143
150
  """
144
151
 
@@ -147,13 +154,15 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
147
154
  scale: bool = True,
148
155
  central_stat: str = "median",
149
156
  dispersion_stat: str = "std",
150
- ):
157
+ ) -> None:
151
158
  """Initialize the classifier with specified parameters."""
152
159
  self.scale = scale
153
160
  self.central_stat = central_stat
154
161
  self.dispersion_stat = dispersion_stat
155
162
 
156
- def fit(self, X: np.array, y: np.array, feat_labels: list[str] = None):
163
+ def fit(
164
+ self, X: np.array, y: np.array, feat_labels: list[str] = None
165
+ ) -> "DistanceMetricClassifier":
157
166
  """Calculate the feature space centroid for all classes.
158
167
 
159
168
  This function calculates the feature space centroid in the training
@@ -234,7 +243,7 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
234
243
  self,
235
244
  X: np.array,
236
245
  metric: str | Callable = "euclidean",
237
- ):
246
+ ) -> np.ndarray:
238
247
  """Predict the class labels for the provided X.
239
248
 
240
249
  The prediction is based on the distance of each data point in the input sample
@@ -248,6 +257,10 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
248
257
  metric : str or callable, default="euclidean"
249
258
  The distance metric to use for calculating the distance between features.
250
259
 
260
+ .. versionchanged:: 0.2.0
261
+ The metric is now specified at prediction time rather
262
+ than during initialization, providing greater flexibility.
263
+
251
264
  Returns
252
265
  -------
253
266
  y : ndarray of shape (n_samples,)
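Because the metric now travels with `predict` rather than `__init__`, a single fitted classifier can be queried with several metrics without refitting. A minimal sketch of that pattern:

```python
import distclassipy as dcpy
from sklearn.datasets import make_classification

X, y = make_classification(
    n_samples=1000, n_features=4, n_informative=2, n_redundant=0,
    random_state=0, shuffle=False,
)

clf = dcpy.DistanceMetricClassifier()
clf.fit(X, y)  # fit once

# Reuse the same fitted centroids with different distance metrics.
for metric in ["euclidean", "canberra", "braycurtis"]:
    print(metric, clf.predict([[0, 0, 0, 0]], metric=metric))
```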
@@ -266,9 +279,7 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
266
279
  """
267
280
  check_is_fitted(self, "is_fitted_")
268
281
  X = check_array(X)
269
-
270
282
  metric_fn_, metric_arg_ = initialize_metric_function(metric)
271
-
272
283
  if not self.scale:
273
284
  dist_arr = scipy.spatial.distance.cdist(
274
285
  XA=X, XB=self.df_centroid_.to_numpy(), metric=metric_arg_
@@ -299,7 +310,7 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
299
310
  self,
300
311
  X: np.array,
301
312
  metric: str | Callable = "euclidean",
302
- ):
313
+ ) -> np.ndarray:
303
314
  """Predict the class labels for the provided X and perform analysis.
304
315
 
305
316
  The prediction is based on the distance of each data point in the input sample
@@ -397,3 +408,289 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
397
408
  ]
398
409
 
399
410
  return self.confidence_df_.to_numpy()
411
+
412
+ def score(self, X, y, metric: str | Callable = "euclidean") -> float:
413
+ """Return the mean accuracy on the given test data and labels.
414
+
415
+ Parameters
416
+ ----------
417
+ X : array-like of shape (n_samples, n_features)
418
+ Test samples.
419
+ y : array-like of shape (n_samples,)
420
+ True labels for X.
421
+ metric : str or callable, default="euclidean"
422
+ The distance metric to use for calculating the distance between features.
423
+
424
+ Returns
425
+ -------
426
+ score : float
427
+ Mean accuracy of self.predict(X) wrt. y.
428
+ """
429
+ y_pred = self.predict(X, metric=metric)
430
+ return accuracy_score(y, y_pred)
431
+
432
+
433
+ class EnsembleDistanceClassifier(BaseEstimator, ClassifierMixin):
434
+ """An ensemble classifier that uses different metrics for each quantile.
435
+
436
+ This classifier splits the data into quantiles based on a specified
437
+ feature and uses different distance metrics for each quantile to
438
+ construct an ensemble classifier for each quantile, generally leading
439
+ to better performance.
440
+ Note, however, this involves fitting the training set for each metric
441
+ to evaluate performance, making this more computationally expensive.
442
+
443
+ .. versionadded:: 0.2.0
444
+ """
445
+
446
+ def __init__(
447
+ self,
448
+ feat_idx: int,
449
+ scale: bool = True,
450
+ central_stat: str = "median",
451
+ dispersion_stat: str = "std",
452
+ metrics_to_consider: list[str] = None,
453
+ random_state: int = None,
454
+ ) -> None:
455
+ """Initialize the classifier with specified parameters.
456
+
457
+ Parameters
458
+ ----------
459
+ feat_idx : int
460
+ The index of the feature to be used for quantile splitting.
461
+ scale : bool, default=True
462
+ Whether to scale the distance between the test object and the centroid.
463
+ central_stat : str, default="median"
464
+ The statistic used to calculate the central tendency of the data.
465
+ dispersion_stat : str, default="std"
466
+ The statistic used to calculate the dispersion of the data.
467
+ metrics_to_consider : list of str, optional
468
+ A list of distance metrics to evaluate. If None, all available
469
+ metrics within DistClassiPy will be considered.
470
+ random_state : int, RandomState instance or None, optional (default=None)
471
+ Controls the randomness of the estimator. Pass an int for reproducible
472
+ output across multiple function calls.
473
+
474
+ .. versionadded:: 0.2.1
475
+ """
476
+ self.feat_idx = feat_idx
477
+ self.scale = scale
478
+ self.central_stat = central_stat
479
+ self.dispersion_stat = dispersion_stat
480
+ self.metrics_to_consider = metrics_to_consider
481
+ self.random_state = random_state
482
+
483
+ def fit(
484
+ self, X: np.ndarray, y: np.ndarray, n_quantiles: int = 4
485
+ ) -> "EnsembleDistanceClassifier":
486
+ """Fit the ensemble classifier using the best metrics for each quantile.
487
+
488
+ Parameters
489
+ ----------
490
+ X : np.ndarray
491
+ The input feature matrix.
492
+ y : np.ndarray
493
+ The target labels.
494
+ n_quantiles : int, default=4
495
+ The number of quantiles to split the data into.
496
+
497
+ Returns
498
+ -------
499
+ self : object
500
+ Fitted estimator.
501
+ """
502
+ self.clf_ = DistanceMetricClassifier(
503
+ scale=self.scale,
504
+ central_stat=self.central_stat,
505
+ dispersion_stat=self.dispersion_stat,
506
+ )
507
+
508
+ # Find best metrics based on training set quantiles
509
+ self.quantile_scores_df_, self.best_metrics_per_quantile_, self.group_bins = (
510
+ self.evaluate_metrics(X, y, n_quantiles)
511
+ )
512
+
513
+ # Ensure the bins work with values outside of training data
514
+ self.group_bins[0] = -np.inf
515
+ self.group_bins[-1] = np.inf
516
+
517
+ self.group_labels = [f"Quantile {i+1}" for i in range(n_quantiles)]
518
+ self.clf_.fit(X, y)
519
+ self.is_fitted_ = True
520
+ return self
521
+
522
+ def predict(self, X: np.ndarray) -> np.ndarray:
523
+ """Predict class labels using the best metric for each quantile.
524
+
525
+ Parameters
526
+ ----------
527
+ X : np.ndarray
528
+ The input samples.
529
+
530
+ Returns
531
+ -------
532
+ predictions : np.ndarray
533
+ The predicted class labels.
534
+ """
535
+ check_is_fitted(self, "is_fitted_")
536
+ X = check_array(X)
537
+
538
+ # notes for pred during best:
539
+ # option 1:
540
+ # loop through each metric, merge quantiles for each metric
541
+ # pred on this
542
+ # option 2, easier, but slower:
543
+ # loop through each quantile, and append pred
544
+
545
+ quantiles = pd.cut(
546
+ X[:, self.feat_idx], bins=self.group_bins, labels=self.group_labels
547
+ )
548
+ grouped_data = pd.DataFrame(X).groupby(quantiles, observed=False)
549
+ # quantile_indices = quantiles.codes # Get integer codes for quantiles
550
+ predictions = np.empty(X.shape[0], dtype=object) # Change dtype to object
551
+ for i, (lim, subdf) in enumerate(grouped_data):
552
+ best_metric = self.best_metrics_per_quantile_.loc[self.group_labels[i]]
553
+ preds = self.clf_.predict(subdf.to_numpy(), metric=best_metric)
554
+ predictions[subdf.index] = preds
555
+ # # Precompute predictions for each quantile
556
+ # quantile_predictions = {}
557
+ # for i, label in enumerate(self.group_labels):
558
+ # best_metric = self.best_metrics_per_quantile_.loc[label]
559
+ # quantile_data = X[quantile_indices == i]
560
+ # if quantile_data.size > 0:
561
+ # quantile_predictions[i] = self.clf_.predict(
562
+ # quantile_data, metric=best_metric
563
+ # )
564
+
565
+ # Assign predictions to the corresponding indices
566
+ # for i, preds in quantile_predictions.items():
567
+ # predictions[quantile_indices == i] = preds
568
+
569
+ return predictions
570
+
571
+ def evaluate_metrics(
572
+ self, X: np.ndarray, y: np.ndarray, n_quantiles: int = 4
573
+ ) -> Tuple[pd.DataFrame, pd.Series, np.ndarray]:
574
+ """Evaluate and find the best distance metrics for the specified feature.
575
+
576
+ This method uses the standalone `find_best_metrics` function to evaluate
577
+ different distance metrics and determine the best-performing ones for
578
+ each quantile.
579
+
580
+ Parameters
581
+ ----------
582
+ X : np.ndarray
583
+ The input feature matrix.
584
+ y : np.ndarray
585
+ The target labels.
586
+ n_quantiles : int, default=4
587
+ The number of quantiles to split the data into.
588
+
589
+ Returns
590
+ -------
591
+ quantile_scores_df : pd.DataFrame
592
+ A DataFrame containing the accuracy scores for each metric across
593
+ different quantiles.
594
+ best_metrics_per_quantile : pd.Series
595
+ A Series indicating the best-performing metric for each quantile.
596
+ group_bins : np.ndarray
597
+ The bins used for quantile splitting.
598
+ """
599
+ return find_best_metrics(
600
+ self.clf_,
601
+ X,
602
+ y,
603
+ self.feat_idx,
604
+ n_quantiles,
605
+ self.metrics_to_consider,
606
+ self.random_state,
607
+ )
608
+
609
+
610
+ def find_best_metrics(
611
+ clf: "DistanceMetricClassifier",
612
+ X: np.ndarray,
613
+ y: np.ndarray,
614
+ feat_idx: int,
615
+ n_quantiles: int = 4,
616
+ metrics_to_consider: list[str] = None,
617
+ random_state: int = None,
618
+ ) -> Tuple[pd.DataFrame, pd.Series, np.ndarray]:
619
+ """Evaluate and find the best distance metrics for a given feature.
620
+
621
+ This function evaluates different distance metrics to determine which
622
+ performs best for a specific feature in the dataset. It splits the data
623
+ into quantiles based on the specified feature and calculates the accuracy
624
+ of the classifier for each metric within these quantiles.
625
+
626
+ .. versionadded:: 0.2.0
627
+
628
+ Parameters
629
+ ----------
630
+ clf : DistanceMetricClassifier
631
+ The classifier instance to be used for evaluation.
632
+ X : np.ndarray
633
+ The input feature matrix.
634
+ y : np.ndarray
635
+ The target labels.
636
+ feat_idx : int
637
+ The index of the feature to be used for quantile splitting.
638
+ n_quantiles : int, default=4
639
+ The number of quantiles to split the data into.
640
+ metrics_to_consider : list of str, optional
641
+ A list of distance metrics to evaluate. If None, all available
642
+ metrics within DistClassiPy will be considered.
643
+ random_state : int, RandomState instance or None, optional (default=None)
644
+ Controls the randomness of the estimator. Pass an int for reproducible
645
+ output across multiple function calls.
646
+
647
+ .. versionadded:: 0.2.1
648
+
649
+ Returns
650
+ -------
651
+ quantile_scores_df : pd.DataFrame
652
+ A DataFrame containing the accuracy scores for each metric across
653
+ different quantiles.
654
+ best_metrics_per_quantile : pd.Series
655
+ A Series indicating the best-performing metric for each quantile.
656
+ group_bins : np.ndarray
657
+ The bins used for quantile splitting.
658
+ """
659
+ X = check_array(X)
660
+ feature_labels = [f"Feature_{i}" for i in range(X.shape[1])]
661
+ feature_name = f"Feature_{feat_idx}"
662
+
663
+ if metrics_to_consider is None:
664
+ metrics_to_consider = _ALL_METRICS
665
+
666
+ X_df = pd.DataFrame(X, columns=feature_labels)
667
+ y_df = pd.DataFrame(y, columns=["Target"])
668
+ quantiles, group_bins = pd.qcut(X_df[feature_name], q=n_quantiles, retbins=True)
669
+
670
+ X_train, X_test, y_train, y_test = train_test_split(
671
+ X_df, y_df, test_size=0.25, stratify=quantiles, random_state=random_state
672
+ )
673
+
674
+ clf.fit(X_train, y_train.to_numpy().ravel())
675
+ grouped_test_data = X_test.groupby(quantiles, observed=False)
676
+
677
+ quantile_scores = []
678
+ for metric in metrics_to_consider:
679
+ scores_for_metric = [
680
+ accuracy_score(
681
+ y_test.loc[subdf.index], clf.predict(subdf.to_numpy(), metric=metric)
682
+ )
683
+ for _, subdf in grouped_test_data
684
+ ]
685
+ quantile_scores.append(scores_for_metric)
686
+
687
+ quantile_scores = np.array(quantile_scores) * 100
688
+ quantile_scores_df = pd.DataFrame(
689
+ data=quantile_scores,
690
+ index=metrics_to_consider,
691
+ columns=[f"Quantile {i+1}" for i in range(n_quantiles)],
692
+ )
693
+
694
+ best_metrics_per_quantile = quantile_scores_df.idxmax()
695
+
696
+ return quantile_scores_df, best_metrics_per_quantile, group_bins
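Taken together, the additions in this hunk expose a per-quantile metric search plus a metric-aware `score`. A hedged usage sketch, based only on the signatures and fitted attributes defined above:

```python
import distclassipy as dcpy
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(
    n_samples=1000, n_features=4, n_informative=2, n_redundant=0,
    random_state=0, shuffle=False,
)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Ensemble: fit() evaluates metrics per quantile of feature 0 and stores the results.
ens = dcpy.EnsembleDistanceClassifier(feat_idx=0, random_state=0)
ens.fit(X_train, y_train, n_quantiles=4)
print(ens.quantile_scores_df_)          # accuracy (in %) per metric and quantile
print(ens.best_metrics_per_quantile_)   # winning metric for each quantile

# Plain classifier: the new score() helper also takes the metric per call.
clf = dcpy.DistanceMetricClassifier().fit(X_train, y_train)
print(clf.score(X_test, y_test, metric="canberra"))
```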
@@ -48,6 +48,52 @@ import numpy as np
48
48
 
49
49
  import scipy
50
50
 
51
+ _ALL_METRICS = [
52
+ "euclidean",
53
+ "braycurtis",
54
+ "canberra",
55
+ "cityblock",
56
+ "chebyshev",
57
+ "clark",
58
+ "correlation",
59
+ "cosine",
60
+ "hellinger",
61
+ "jaccard",
62
+ "lorentzian",
63
+ "marylandbridge",
64
+ "meehl",
65
+ "motyka",
66
+ "soergel",
67
+ "wave_hedges",
68
+ "kulczynski",
69
+ "add_chisq",
70
+ "acc",
71
+ "chebyshev_min",
72
+ "czekanowski",
73
+ "dice",
74
+ "divergence",
75
+ "google",
76
+ "gower",
77
+ "jeffreys",
78
+ "jensenshannon_divergence",
79
+ "jensen_difference",
80
+ "kumarjohnson",
81
+ "matusita",
82
+ "minkowski",
83
+ "penroseshape",
84
+ "prob_chisq",
85
+ "ruzicka",
86
+ "sorensen",
87
+ "squared_chisq",
88
+ "squaredchord",
89
+ "squared_euclidean",
90
+ "taneja",
91
+ "tanimoto",
92
+ "topsoe",
93
+ "vicis_symmetric_chisq",
94
+ "vicis_wave_hedges",
95
+ ]
96
+
51
97
 
52
98
  class Distance:
53
99
  """A class to calculate various distance metrics between vectors.
@@ -352,7 +398,11 @@ class Distance:
352
398
  1(4), 300-307.
353
399
  """
354
400
  u, v = np.asarray(u), np.asarray(v)
355
- return np.sqrt(2 * np.sum((np.sqrt(u) - np.sqrt(v)) ** 2))
401
+ # Clip negative values to zero for valid sqrt
402
+ with np.errstate(divide="ignore", invalid="ignore"):
403
+ u = np.clip(u, a_min=0, a_max=None)
404
+ v = np.clip(v, a_min=0, a_max=None)
405
+ return np.sqrt(2 * np.sum((np.sqrt(u) - np.sqrt(v)) ** 2))
356
406
 
357
407
  def jaccard(self, u, v):
358
408
  """Calculate the Jaccard distance between two vectors.
@@ -402,7 +452,8 @@ class Distance:
402
452
  eschew the log of zero.
403
453
  """
404
454
  u, v = np.asarray(u), np.asarray(v)
405
- return np.sum(np.log(np.abs(u - v) + 1))
455
+ with np.errstate(divide="ignore", invalid="ignore"):
456
+ return np.sum(np.log(np.abs(u - v) + 1))
406
457
 
407
458
  def marylandbridge(self, u, v):
408
459
  """Calculate the Maryland Bridge distance between two vectors.
@@ -633,7 +684,8 @@ class Distance:
633
684
  # 3. https://en.wikipedia.org/wiki/Bhattacharyya_distance
634
685
  # """
635
686
  # u, v = np.asarray(u), np.asarray(v)
636
- # return -np.log(np.sum(np.sqrt(u * v)))
687
+ # with np.errstate(divide="ignore", invalid="ignore"):
688
+ # return -np.log(np.sum(np.sqrt(u * v)))
637
689
 
638
690
  def chebyshev_min(self, u, v):
639
691
  """Calculate the minimum value distance between two vectors.
@@ -808,9 +860,12 @@ class Distance:
808
860
  # vectors could be ignored or masked (see below).
809
861
  # u = ma.masked_where(u == 0, u)
810
862
  # v = ma.masked_where(v == 0, u)
811
- u = np.where(u == 0, self.epsilon, u)
812
- v = np.where(v == 0, self.epsilon, v)
813
- return np.sum((u - v) * np.log(u / v))
863
+ with np.errstate(divide="ignore", invalid="ignore"):
864
+ u[u == 0] = self.epsilon
865
+ v[v == 0] = self.epsilon
866
+ # Clip negative values to zero for valid log
867
+ udivv = np.clip(u / v, a_min=self.epsilon, a_max=None)
868
+ return np.sum((u - v) * np.log(udivv))
814
869
 
815
870
  def jensenshannon_divergence(self, u, v):
816
871
  """Calculate the Jensen-Shannon divergence between two vectors.
@@ -844,11 +899,17 @@ class Distance:
844
899
  return np.sum(el1 - el2 * el3)
845
900
  """
846
901
  u, v = np.asarray(u), np.asarray(v)
847
- u = np.where(u == 0, self.epsilon, u)
848
- v = np.where(v == 0, self.epsilon, v)
849
- dl = u * np.log(2 * u / (u + v))
850
- dr = v * np.log(2 * v / (u + v))
851
- return (np.sum(dl) + np.sum(dr)) / 2
902
+ with np.errstate(divide="ignore", invalid="ignore"):
903
+ # Clip negative values to zero for valid log
904
+ u[u == 0] = self.epsilon
905
+ v[v == 0] = self.epsilon
906
+
907
+ term1 = np.clip(2 * u / (u + v), a_min=self.epsilon, a_max=None)
908
+ term2 = np.clip(2 * v / (u + v), a_min=self.epsilon, a_max=None)
909
+
910
+ dl = u * np.log(term1)
911
+ dr = v * np.log(term2)
912
+ return (np.sum(dl) + np.sum(dr)) / 2
852
913
 
853
914
  def jensen_difference(self, u, v):
854
915
  """Calculate the Jensen difference between two vectors.
@@ -877,11 +938,14 @@ class Distance:
877
938
  1(4), 300-307.
878
939
  """
879
940
  u, v = np.asarray(u), np.asarray(v)
880
- u = np.where(u == 0, self.epsilon, u)
881
- v = np.where(v == 0, self.epsilon, v)
882
- el1 = (u * np.log(u) + v * np.log(v)) / 2
883
- el2 = (u + v) / 2
884
- return np.sum(el1 - el2 * np.log(el2))
941
+
942
+ with np.errstate(divide="ignore", invalid="ignore"):
943
+ # Clip negative values to eps for valid log
944
+ u = np.clip(u, self.epsilon, None)
945
+ v = np.clip(v, self.epsilon, None)
946
+ el1 = (u * np.log(u) + v * np.log(v)) / 2
947
+ el2 = np.clip((u + v) / 2, a_min=self.epsilon, a_max=None)
948
+ return np.sum(el1 - el2 * np.log(el2))
885
949
 
886
950
  def kumarjohnson(self, u, v):
887
951
  """Calculate the Kumar-Johnson distance between two vectors.
@@ -934,7 +998,8 @@ class Distance:
934
998
  Equals square root of Squared-chord distance.
935
999
  """
936
1000
  u, v = np.asarray(u), np.asarray(v)
937
- return np.sqrt(np.sum((np.sqrt(u) - np.sqrt(v)) ** 2))
1001
+ with np.errstate(divide="ignore", invalid="ignore"):
1002
+ return np.sqrt(np.sum((np.sqrt(u) - np.sqrt(v)) ** 2))
938
1003
 
939
1004
  def minkowski(self, u, v, p=2):
940
1005
  """Calculate the Minkowski distance between two vectors.
@@ -981,7 +1046,8 @@ class Distance:
981
1046
  u, v = np.asarray(u), np.asarray(v)
982
1047
  umu = np.mean(u)
983
1048
  vmu = np.mean(v)
984
- return np.sqrt(np.sum(((u - umu) - (v - vmu)) ** 2))
1049
+ with np.errstate(divide="ignore", invalid="ignore"):
1050
+ return np.sqrt(np.sum(((u - umu) - (v - vmu)) ** 2))
985
1051
 
986
1052
  def prob_chisq(self, u, v):
987
1053
  """Calculate the Probabilistic chi-square distance between two vectors.
@@ -1093,7 +1159,8 @@ class Distance:
1093
1159
  Equals to squared Matusita distance.
1094
1160
  """
1095
1161
  u, v = np.asarray(u), np.asarray(v)
1096
- return np.sum((np.sqrt(u) - np.sqrt(v)) ** 2)
1162
+ with np.errstate(divide="ignore", invalid="ignore"):
1163
+ return np.sum((np.sqrt(u) - np.sqrt(v)) ** 2)
1097
1164
 
1098
1165
  def squared_euclidean(self, u, v):
1099
1166
  """Calculate the Squared Euclidean distance between two vectors.
@@ -1145,10 +1212,14 @@ class Distance:
1145
1212
  1(4), 300-307.
1146
1213
  """
1147
1214
  u, v = np.asarray(u), np.asarray(v)
1148
- u = np.where(u == 0, self.epsilon, u)
1149
- v = np.where(v == 0, self.epsilon, v)
1150
- uvsum = u + v
1151
- return np.sum((uvsum / 2) * np.log(uvsum / (2 * np.sqrt(u * v))))
1215
+ with np.errstate(divide="ignore", invalid="ignore"):
1216
+ u[u == 0] = self.epsilon
1217
+ v[v == 0] = self.epsilon
1218
+ uvsum = u + v
1219
+ logarg = np.clip(
1220
+ uvsum / (2 * np.sqrt(u * v)), a_min=self.epsilon, a_max=None
1221
+ )
1222
+ return np.sum((uvsum / 2) * np.log(logarg))
1152
1223
 
1153
1224
  def tanimoto(self, u, v):
1154
1225
  """Calculate the Tanimoto distance between two vectors.
@@ -1202,11 +1273,14 @@ class Distance:
1202
1273
  Equals two times Jensen-Shannon divergence.
1203
1274
  """
1204
1275
  u, v = np.asarray(u), np.asarray(v)
1205
- u = np.where(u == 0, self.epsilon, u)
1206
- v = np.where(v == 0, self.epsilon, v)
1207
- dl = u * np.log(2 * u / (u + v))
1208
- dr = v * np.log(2 * v / (u + v))
1209
- return np.sum(dl + dr)
1276
+ with np.errstate(divide="ignore", invalid="ignore"):
1277
+ u[u == 0] = self.epsilon
1278
+ v[v == 0] = self.epsilon
1279
+ logarg1 = np.clip(2 * u / (u + v), a_min=self.epsilon, a_max=None)
1280
+ logarg2 = np.clip(2 * v / (u + v), a_min=self.epsilon, a_max=None)
1281
+ dl = u * np.log(logarg1)
1282
+ dr = v * np.log(logarg2)
1283
+ return np.sum(dl + dr)
1210
1284
 
1211
1285
  def vicis_symmetric_chisq(self, u, v):
1212
1286
  """Calculate the Vicis Symmetric chi-square distance.
@@ -1330,9 +1404,10 @@ class Distance:
1330
1404
  # 1(4), 300-307.
1331
1405
  # """
1332
1406
  # u, v = np.asarray(u), np.asarray(v)
1333
- # u = np.where(u == 0, self.epsilon, u)
1334
- # v = np.where(v == 0, self.epsilon, v)
1335
- # return np.sum(u * np.log(2 * u / (u + v)))
1407
+ # u[u == 0] = self.epsilon
1408
+ # v[v == 0] = self.epsilon
1409
+ # with np.errstate(divide="ignore", invalid="ignore"):
1410
+ # return np.sum(u * np.log(2 * u / (u + v)))
1336
1411
 
1337
1412
  # def kl_divergence(self, u, v):
1338
1413
  # """Calculate the Kullback-Leibler divergence between two vectors.
@@ -1358,9 +1433,10 @@ class Distance:
1358
1433
  # 1(4):300-307.
1359
1434
  # """
1360
1435
  # u, v = np.asarray(u), np.asarray(v)
1361
- # u = np.where(u == 0, self.epsilon, u)
1362
- # v = np.where(v == 0, self.epsilon, v)
1363
- # return np.sum(u * np.log(u / v))
1436
+ # u[u == 0] = self.epsilon
1437
+ # v[v == 0] = self.epsilon
1438
+ # with np.errstate(divide="ignore", invalid="ignore"):
1439
+ # return np.sum(u * np.log(u / v))
1364
1440
 
1365
1441
  # def max_symmetric_chisq(self, u, v):
1366
1442
  # """Calculate the maximum symmetric chi-square distance.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: distclassipy
3
- Version: 0.2.0a0
3
+ Version: 0.2.1
4
4
  Summary: A python package for a distance-based classifier which can use several different distance metrics.
5
5
  Author-email: Siddharth Chaini <sidchaini@gmail.com>
6
6
  License: GNU GENERAL PUBLIC LICENSE
@@ -740,17 +740,25 @@ X, y = make_classification(
740
740
  random_state=0,
741
741
  shuffle=False,
742
742
  )
743
+ # Example usage of DistanceMetricClassifier
743
744
  clf = dcpy.DistanceMetricClassifier()
744
745
  clf.fit(X, y)
745
- print(clf.predict([[0, 0, 0, 0]]), metric="canberra")
746
+ print(clf.predict([[0, 0, 0, 0]], metric="canberra"))
747
+
748
+ # Example usage of EnsembleDistanceClassifier
749
+ ensemble_clf = dcpy.EnsembleDistanceClassifier(feat_idx=0)
750
+ ensemble_clf.fit(X, y)
751
+ print(ensemble_clf.predict(X))
746
752
  ```
747
753
 
748
754
  ## Features
749
755
  - **Distance Metric-Based Classification**: Utilizes a variety of distance metrics for classification.
750
756
  - **Customizable for Scientific Goals**: Allows fine-tuning based on scientific objectives by selecting appropriate distance metrics and features, enhancing both computational efficiency and model performance.
751
757
  - **Interpretable Results**: Offers improved interpretability of classification outcomes by directly using distance metrics and feature importance, making it ideal for scientific applications.
752
- - **Efficient and Scalable**: Demonstrates lower computational requirements compared to traditional methods like Random Forests, making it suitable for large datasets
753
- - **Open Source and Accessible**: Available as an open-source Python package on PyPI, encouraging broad application in astronomy and beyond
758
+ - **Efficient and Scalable**: Demonstrates lower computational requirements compared to traditional methods like Random Forests, making it suitable for large datasets.
759
+ - **Open Source and Accessible**: Available as an open-source Python package on PyPI, encouraging broad application in astronomy and beyond.
760
+ - **(NEW) Ensemble Distance Classification**: Leverages an ensemble approach to use different distance metrics for each quantile, improving classification performance across diverse data distributions.
761
+ - **(NEW) Expanded Distance Metrics**: DistClassiPy now offers 43 built-in distance metrics, an increase from the previous 18. Additionally, users can still define and use custom distance metrics as needed.
754
762
 
755
763
  ## Documentation
756
764
 
@@ -1,9 +1,13 @@
1
- from distclassipy.classifier import DistanceMetricClassifier
1
+ from distclassipy.classifier import (
2
+ DistanceMetricClassifier,
3
+ EnsembleDistanceClassifier,
4
+ )
2
5
 
3
6
  import numpy as np
4
7
 
5
8
  import pytest
6
9
 
10
+ from sklearn.datasets import make_classification
7
11
  from sklearn.utils.estimator_checks import check_estimator
8
12
 
9
13
 
@@ -28,7 +32,7 @@ def test_fit():
28
32
 
29
33
 
30
34
  # Test making predictions with the classifier
31
- def test_predict():
35
+ def test_dcpy():
32
36
  X = np.array([[1, 2], [3, 4], [5, 6]]) # Sample feature set
33
37
  y = np.array([0, 1, 0]) # Sample target values
34
38
  clf = DistanceMetricClassifier()
@@ -59,7 +63,7 @@ def test_metric_scipy():
59
63
 
60
64
 
61
65
  # Test using different distance metrics - from distclassipy
62
- def test_metric_dcpy():
66
+ def test_metric_pred():
63
67
  X = np.array([[1, 2], [3, 4], [5, 6]]) # Sample feature set
64
68
  y = np.array([0, 1, 0]) # Sample target values
65
69
  clf = DistanceMetricClassifier()
@@ -134,3 +138,18 @@ def test_confidence_calculation():
134
138
  clf.predict_and_analyse(X)
135
139
  distance_confidence = clf.calculate_confidence()
136
140
  assert distance_confidence.shape == (3, len(np.unique(y)))
141
+
142
+
143
+ # Test basic functionality of EnsembleDistanceClassifier
144
+ def test_ensemble_distance_classifier():
145
+ X, y = make_classification(
146
+ n_samples=1000,
147
+ n_features=4,
148
+ n_informative=2,
149
+ shuffle=True,
150
+ )
151
+ clf = EnsembleDistanceClassifier(feat_idx=0)
152
+ clf.fit(X, y)
153
+ predictions = clf.predict(X)
154
+ assert len(predictions) == len(y)
155
+ assert set(predictions).issubset(set(y))
@@ -1,6 +1,6 @@
1
1
  import math
2
2
 
3
- from distclassipy.distances import Distance
3
+ from distclassipy.distances import Distance, _ALL_METRICS
4
4
 
5
5
  from hypothesis import given, strategies as st
6
6
 
@@ -38,51 +38,6 @@ arrays = st.integers(min_value=1, max_value=20).flatmap(
38
38
  ).map(np.array),
39
39
  )
40
40
  )
41
- # List of all distance metrics
42
- _ALL_METRICS = [
43
- "euclidean",
44
- "braycurtis",
45
- "canberra",
46
- "cityblock",
47
- "chebyshev",
48
- "clark",
49
- "correlation",
50
- "cosine",
51
- "hellinger",
52
- "jaccard",
53
- "lorentzian",
54
- "marylandbridge",
55
- "meehl",
56
- "motyka",
57
- "soergel",
58
- "wave_hedges",
59
- "kulczynski",
60
- "add_chisq",
61
- "acc",
62
- "chebyshev_min",
63
- "czekanowski",
64
- "dice",
65
- "divergence",
66
- "google",
67
- "gower",
68
- "jeffreys",
69
- "jensenshannon_divergence",
70
- "jensen_difference",
71
- "kumarjohnson",
72
- "matusita",
73
- "minkowski",
74
- "penroseshape",
75
- "prob_chisq",
76
- "ruzicka",
77
- "sorensen",
78
- "squared_chisq",
79
- "squaredchord",
80
- "squared_euclidean",
81
- "taneja",
82
- "tanimoto",
83
- "topsoe",
84
- "vicis_symmetric_chisq",
85
- ]
86
41
 
87
42
 
88
43
  @pytest.mark.parametrize(
3 files without changes