distclassipy 0.2.0a0__py3-none-any.whl → 0.2.2a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
distclassipy/__init__.py CHANGED
@@ -22,7 +22,17 @@ You should have received a copy of the GNU General Public License
22
22
  along with this program. If not, see <https://www.gnu.org/licenses/>.
23
23
  """
24
24
 
25
- from .classifier import DistanceMetricClassifier # noqa
26
- from .distances import Distance # noqa
25
+ from .classifier import (
26
+ DistanceMetricClassifier,
27
+ EnsembleDistanceClassifier,
28
+ )
29
+ from .distances import Distance, _ALL_METRICS
27
30
 
28
- __version__ = "0.2.0a0"
31
+ __version__ = "0.2.2a1"
32
+
33
+ __all__ = [
34
+ "DistanceMetricClassifier",
35
+ "EnsembleDistanceClassifier",
36
+ "Distance",
37
+ "_ALL_METRICS",
38
+ ]
@@ -3,6 +3,21 @@
3
3
  This module contains the DistanceMetricClassifier introduced by Chaini et al. (2024)
4
4
  in "Light Curve Classification with DistClassiPy: a new distance-based classifier"
5
5
 
6
+
7
+ .. autoclass:: distclassipy.classifier.DistanceMetricClassifier
8
+ :members:
9
+ :inherited-members:
10
+ :exclude-members: set_fit_request, set_predict_request
11
+
12
+ .. autoclass:: distclassipy.classifier.EnsembleDistanceClassifier
13
+ :members:
14
+ :inherited-members:
15
+ :exclude-members: set_fit_request, set_predict_request
16
+
17
+ .. doctest-skip::
18
+
19
+ .. skip::
20
+
6
21
  Copyright (C) 2024 Siddharth Chaini
7
22
  -----
8
23
  This program is free software: you can redistribute it and/or modify
@@ -19,7 +34,7 @@ You should have received a copy of the GNU General Public License
19
34
  along with this program. If not, see <https://www.gnu.org/licenses/>.
20
35
  """
21
36
 
22
- from typing import Callable
37
+ from typing import Callable, Tuple
23
38
 
24
39
  import numpy as np
25
40
 
@@ -28,15 +43,18 @@ import pandas as pd
28
43
  import scipy
29
44
 
30
45
  from sklearn.base import BaseEstimator, ClassifierMixin
46
+ from sklearn.metrics import accuracy_score
47
+ from sklearn.model_selection import train_test_split
31
48
  from sklearn.utils.multiclass import unique_labels
32
- from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
49
+ from sklearn.utils.validation import check_is_fitted, check_array
33
50
 
34
- from .distances import Distance
51
+ from . import distances
52
+ from .distances import _ALL_METRICS
35
53
 
36
54
  # Hardcoded source packages to check for distance metrics.
37
55
  METRIC_SOURCES_ = {
38
56
  "scipy.spatial.distance": scipy.spatial.distance,
39
- "distances.Distance": Distance(),
57
+ "distclassipy.distances": distances,
40
58
  }
41
59
 
42
60
 
@@ -44,7 +62,7 @@ def initialize_metric_function(metric):
44
62
  """Set the metric function based on the provided metric.
45
63
 
46
64
  If the metric is a string, the function will look for a corresponding
47
- function in scipy.spatial.distance or distances.Distance. If the metric
65
+ function in scipy.spatial.distance or distclassipy.distances. If the metric
48
66
  is a function, it will be used directly.
49
67
  """
50
68
  if callable(metric):
@@ -78,7 +96,7 @@ def initialize_metric_function(metric):
78
96
  raise ValueError(
79
97
  f"{metric} metric not found. Please pass a string of the "
80
98
  "name of a metric in scipy.spatial.distance or "
81
- "distances.Distance, or pass a metric function directly. For a "
99
+ "distclassipy.distances, or pass a metric function directly. For a "
82
100
  "list of available metrics, see: "
83
101
  "https://sidchaini.github.io/DistClassiPy/distances.html or "
84
102
  "https://docs.scipy.org/doc/scipy/reference/spatial.distance.html"
@@ -86,7 +104,7 @@ def initialize_metric_function(metric):
86
104
  return metric_fn_, metric_arg_
87
105
 
88
106
 
89
- class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
107
+ class DistanceMetricClassifier(ClassifierMixin, BaseEstimator):
90
108
  """A distance-based classifier that supports different distance metrics.
91
109
 
92
110
  The distance metric classifier determines the similarity between features in a
@@ -113,16 +131,6 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
113
131
 
114
132
  .. versionadded:: 0.1.0
115
133
 
116
-
117
- Attributes
118
- ----------
119
- scale : bool
120
- Indicates whether the data is scaled.
121
- central_stat : str
122
- The statistic used for calculating central tendency.
123
- dispersion_stat : str
124
- The statistic used for calculating dispersion.
125
-
126
134
  References
127
135
  ----------
128
136
  .. [1] "Light Curve Classification with DistClassiPy: a new distance-based
@@ -135,25 +143,29 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
135
143
  >>> X, y = make_classification(n_samples=1000, n_features=4,
136
144
  ... n_informative=2, n_redundant=0,
137
145
  ... random_state=0, shuffle=False)
138
- >>> clf = dcpy.DistanceMetricClassifier(metric="canberra")
146
+ >>> clf = dcpy.DistanceMetricClassifier()
139
147
  >>> clf.fit(X, y)
140
148
  DistanceMetricClassifier(...)
141
- >>> print(clf.predict([[0, 0, 0, 0]]))
149
+ >>> print(clf.predict([[0, 0, 0, 0]], metric="canberra"))
142
150
  [0]
143
151
  """
144
152
 
145
153
  def __init__(
146
154
  self,
155
+ metric: str | Callable = None,
147
156
  scale: bool = True,
148
157
  central_stat: str = "median",
149
158
  dispersion_stat: str = "std",
150
- ):
159
+ ) -> None:
151
160
  """Initialize the classifier with specified parameters."""
161
+ self.metric = metric
152
162
  self.scale = scale
153
163
  self.central_stat = central_stat
154
164
  self.dispersion_stat = dispersion_stat
155
165
 
156
- def fit(self, X: np.array, y: np.array, feat_labels: list[str] = None):
166
+ def fit(
167
+ self, X: np.array, y: np.array, feat_labels: list[str] = None
168
+ ) -> "DistanceMetricClassifier":
157
169
  """Calculate the feature space centroid for all classes.
158
170
 
159
171
  This function calculates the feature space centroid in the training
@@ -177,11 +189,8 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
177
189
  self : object
178
190
  Fitted estimator.
179
191
  """
180
- X, y = check_X_y(X, y)
192
+ X, y = self._validate_data(X, y)
181
193
  self.classes_ = unique_labels(y)
182
- self.n_features_in_ = X.shape[
183
- 1
184
- ] # Number of features seen during fit - required for sklearn compatibility.
185
194
 
186
195
  if feat_labels is None:
187
196
  feat_labels = [f"Feature_{x}" for x in range(X.shape[1])]
@@ -233,8 +242,8 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
233
242
  def predict(
234
243
  self,
235
244
  X: np.array,
236
- metric: str | Callable = "euclidean",
237
- ):
245
+ metric: str | Callable = None,
246
+ ) -> np.ndarray:
238
247
  """Predict the class labels for the provided X.
239
248
 
240
249
  The prediction is based on the distance of each data point in the input sample
@@ -248,6 +257,10 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
248
257
  metric : str or callable, default="euclidean"
249
258
  The distance metric to use for calculating the distance between features.
250
259
 
260
+ .. versionchanged:: 0.2.0
261
+ The metric is now specified at prediction time rather
262
+ than during initialization, providing greater flexibility.
263
+
251
264
  Returns
252
265
  -------
253
266
  y : ndarray of shape (n_samples,)
@@ -256,7 +269,7 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
256
269
  See Also
257
270
  --------
258
271
  scipy.spatial.dist : Other distance metrics provided in SciPy
259
- distclassipy.Distance : Distance metrics included with DistClassiPy
272
+ distclassipy.distances : Distance metrics included with DistClassiPy
260
273
 
261
274
  Notes
262
275
  -----
@@ -264,10 +277,14 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
264
277
  which allows SciPy to use an optimized C version of the code instead of the
265
278
  slower Python version.
266
279
  """
267
- check_is_fitted(self, "is_fitted_")
268
- X = check_array(X)
280
+ check_is_fitted(self)
281
+ X = self._validate_data(X, reset=False)
269
282
 
270
- metric_fn_, metric_arg_ = initialize_metric_function(metric)
283
+ metric_to_use = metric if metric is not None else self.metric
284
+ if metric_to_use is None:
285
+ # defaults to euclidean
286
+ metric_to_use = "euclidean"
287
+ metric_fn_, metric_arg_ = initialize_metric_function(metric_to_use)
271
288
 
272
289
  if not self.scale:
273
290
  dist_arr = scipy.spatial.distance.cdist(
@@ -298,8 +315,8 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
298
315
  def predict_and_analyse(
299
316
  self,
300
317
  X: np.array,
301
- metric: str | Callable = "euclidean",
302
- ):
318
+ metric: str | Callable = None,
319
+ ) -> np.ndarray:
303
320
  """Predict the class labels for the provided X and perform analysis.
304
321
 
305
322
  The prediction is based on the distance of each data point in the input sample
@@ -325,7 +342,7 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
325
342
  See Also
326
343
  --------
327
344
  scipy.spatial.dist : Other distance metrics provided in SciPy
328
- distclassipy.Distance : Distance metrics included with DistClassiPy
345
+ distclassipy.distances : Distance metrics included with DistClassiPy
329
346
 
330
347
  Notes
331
348
  -----
@@ -334,10 +351,14 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
334
351
  of the slower Python version.
335
352
 
336
353
  """
337
- check_is_fitted(self, "is_fitted_")
338
- X = check_array(X)
354
+ check_is_fitted(self)
355
+ X = self._validate_data(X, reset=False)
339
356
 
340
- metric_fn_, metric_arg_ = initialize_metric_function(metric)
357
+ metric_to_use = metric if metric is not None else self.metric
358
+ if metric_to_use is None:
359
+ # defaults to euclidean
360
+ metric_to_use = "euclidean"
361
+ metric_fn_, metric_arg_ = initialize_metric_function(metric_to_use)
341
362
 
342
363
  if not self.scale:
343
364
  dist_arr = scipy.spatial.distance.cdist(
@@ -397,3 +418,290 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
397
418
  ]
398
419
 
399
420
  return self.confidence_df_.to_numpy()
421
+
422
+ def score(self, X, y, metric: str | Callable = None) -> float:
423
+ """Return the mean accuracy on the given test data and labels.
424
+
425
+ Parameters
426
+ ----------
427
+ X : array-like of shape (n_samples, n_features)
428
+ Test samples.
429
+ y : array-like of shape (n_samples,)
430
+ True labels for X.
431
+ metric : str or callable, default=None (falls back to self.metric, then "euclidean")
432
+ The distance metric to use for calculating the distance between features.
433
+
434
+ Returns
435
+ -------
436
+ score : float
437
+ Mean accuracy of self.predict(X) wrt. y.
438
+ """
439
+ metric_to_use = metric if metric is not None else self.metric
440
+ y_pred = self.predict(X, metric=metric_to_use)
441
+ return accuracy_score(y, y_pred)
442
+
443
+
444
+ class EnsembleDistanceClassifier(ClassifierMixin, BaseEstimator):
445
+ """An ensemble classifier that uses different metrics for each quantile.
446
+
447
+ This classifier splits the data into quantiles based on a specified
448
+ feature and uses different distance metrics for each quantile to
449
+ construct an ensemble classifier for each quantile, generally leading
450
+ to better performance.
451
+ Note, however, that this involves fitting the training set for each metric
452
+ to evaluate performance, making this more computationally expensive.
453
+
454
+ .. versionadded:: 0.2.0
455
+ """
456
+
457
+ def __init__(
458
+ self,
459
+ feat_idx: int,
460
+ scale: bool = True,
461
+ central_stat: str = "median",
462
+ dispersion_stat: str = "std",
463
+ metrics_to_consider: list[str] = None,
464
+ random_state: int = None,
465
+ ) -> None:
466
+ """Initialize the classifier with specified parameters.
467
+
468
+ Parameters
469
+ ----------
470
+ feat_idx : int
471
+ The index of the feature to be used for quantile splitting.
472
+ scale : bool, default=True
473
+ Whether to scale the distance between the test object and the centroid.
474
+ central_stat : str, default="median"
475
+ The statistic used to calculate the central tendency of the data.
476
+ dispersion_stat : str, default="std"
477
+ The statistic used to calculate the dispersion of the data.
478
+ metrics_to_consider : list of str, optional
479
+ A list of distance metrics to evaluate. If None, all available
480
+ metrics within DistClassiPy will be considered.
481
+ random_state : int, RandomState instance or None, optional (default=None)
482
+ Controls the randomness of the estimator. Pass an int for reproducible
483
+ output across multiple function calls.
484
+
485
+ .. versionadded:: 0.2.1
486
+ """
487
+ self.feat_idx = feat_idx
488
+ self.scale = scale
489
+ self.central_stat = central_stat
490
+ self.dispersion_stat = dispersion_stat
491
+ self.metrics_to_consider = metrics_to_consider
492
+ self.random_state = random_state
493
+
494
+ def fit(
495
+ self, X: np.ndarray, y: np.ndarray, n_quantiles: int = 4
496
+ ) -> "EnsembleDistanceClassifier":
497
+ """Fit the ensemble classifier using the best metrics for each quantile.
498
+
499
+ Parameters
500
+ ----------
501
+ X : np.ndarray
502
+ The input feature matrix.
503
+ y : np.ndarray
504
+ The target labels.
505
+ n_quantiles : int, default=4
506
+ The number of quantiles to split the data into.
507
+
508
+ Returns
509
+ -------
510
+ self : object
511
+ Fitted estimator.
512
+ """
513
+ self.clf_ = DistanceMetricClassifier(
514
+ scale=self.scale,
515
+ central_stat=self.central_stat,
516
+ dispersion_stat=self.dispersion_stat,
517
+ )
518
+
519
+ # Find best metrics based on training set quantiles
520
+ self.quantile_scores_df_, self.best_metrics_per_quantile_, self.group_bins = (
521
+ self.evaluate_metrics(X, y, n_quantiles)
522
+ )
523
+
524
+ # Ensure the bins work with values outside of training data
525
+ self.group_bins[0] = -np.inf
526
+ self.group_bins[-1] = np.inf
527
+
528
+ self.group_labels = [f"Quantile {i+1}" for i in range(n_quantiles)]
529
+ self.clf_.fit(X, y)
530
+ self.is_fitted_ = True
531
+ return self
532
+
533
+ def predict(self, X: np.ndarray) -> np.ndarray:
534
+ """Predict class labels using the best metric for each quantile.
535
+
536
+ Parameters
537
+ ----------
538
+ X : np.ndarray
539
+ The input samples.
540
+
541
+ Returns
542
+ -------
543
+ predictions : np.ndarray
544
+ The predicted class labels.
545
+ """
546
+ check_is_fitted(self)
547
+ X = self._validate_data(X, reset=False)
548
+
549
+ # notes for pred during best:
550
+ # option 1:
551
+ # loop through each metric, merge quantiles for each metric
552
+ # pred on this
553
+ # option 2, easier, but slower:
554
+ # loop through each quantile, and append pred
555
+
556
+ quantiles = pd.cut(
557
+ X[:, self.feat_idx], bins=self.group_bins, labels=self.group_labels
558
+ )
559
+ grouped_data = pd.DataFrame(X).groupby(quantiles, observed=False)
560
+ # quantile_indices = quantiles.codes # Get integer codes for quantiles
561
+ predictions = np.empty(X.shape[0], dtype=object) # Change dtype to object
562
+ for i, (lim, subdf) in enumerate(grouped_data):
563
+ best_metric = self.best_metrics_per_quantile_.loc[self.group_labels[i]]
564
+ preds = self.clf_.predict(subdf.to_numpy(), metric=best_metric)
565
+ predictions[subdf.index] = preds
566
+ # # Precompute predictions for each quantile
567
+ # quantile_predictions = {}
568
+ # for i, label in enumerate(self.group_labels):
569
+ # best_metric = self.best_metrics_per_quantile_.loc[label]
570
+ # quantile_data = X[quantile_indices == i]
571
+ # if quantile_data.size > 0:
572
+ # quantile_predictions[i] = self.clf_.predict(
573
+ # quantile_data, metric=best_metric
574
+ # )
575
+
576
+ # Assign predictions to the corresponding indices
577
+ # for i, preds in quantile_predictions.items():
578
+ # predictions[quantile_indices == i] = preds
579
+
580
+ return predictions
581
+
582
+ def evaluate_metrics(
583
+ self, X: np.ndarray, y: np.ndarray, n_quantiles: int = 4
584
+ ) -> Tuple[pd.DataFrame, pd.Series, np.ndarray]:
585
+ """Evaluate and find the best distance metrics for the specified feature.
586
+
587
+ This method uses the standalone `find_best_metrics` function to evaluate
588
+ different distance metrics and determine the best-performing ones for
589
+ each quantile.
590
+
591
+ Parameters
592
+ ----------
593
+ X : np.ndarray
594
+ The input feature matrix.
595
+ y : np.ndarray
596
+ The target labels.
597
+ n_quantiles : int, default=4
598
+ The number of quantiles to split the data into.
599
+
600
+ Returns
601
+ -------
602
+ quantile_scores_df : pd.DataFrame
603
+ A DataFrame containing the accuracy scores for each metric across
604
+ different quantiles.
605
+ best_metrics_per_quantile : pd.Series
606
+ A Series indicating the best-performing metric for each quantile.
607
+ group_bins : np.ndarray
608
+ The bins used for quantile splitting.
609
+ """
610
+ return find_best_metrics(
611
+ self.clf_,
612
+ X,
613
+ y,
614
+ self.feat_idx,
615
+ n_quantiles,
616
+ self.metrics_to_consider,
617
+ self.random_state,
618
+ )
619
+
620
+
621
+ def find_best_metrics(
622
+ clf: "DistanceMetricClassifier",
623
+ X: np.ndarray,
624
+ y: np.ndarray,
625
+ feat_idx: int,
626
+ n_quantiles: int = 4,
627
+ metrics_to_consider: list[str] = None,
628
+ random_state: int = None,
629
+ ) -> Tuple[pd.DataFrame, pd.Series, np.ndarray]:
630
+ """Evaluate and find the best distance metrics for a given feature.
631
+
632
+ This function evaluates different distance metrics to determine which
633
+ performs best for a specific feature in the dataset. It splits the data
634
+ into quantiles based on the specified feature and calculates the accuracy
635
+ of the classifier for each metric within these quantiles.
636
+
637
+ .. versionadded:: 0.2.0
638
+
639
+ Parameters
640
+ ----------
641
+ clf : DistanceMetricClassifier
642
+ The classifier instance to be used for evaluation.
643
+ X : np.ndarray
644
+ The input feature matrix.
645
+ y : np.ndarray
646
+ The target labels.
647
+ feat_idx : int
648
+ The index of the feature to be used for quantile splitting.
649
+ n_quantiles : int, default=4
650
+ The number of quantiles to split the data into.
651
+ metrics_to_consider : list of str, optional
652
+ A list of distance metrics to evaluate. If None, all available
653
+ metrics within DistClassiPy will be considered.
654
+ random_state : int, RandomState instance or None, optional (default=None)
655
+ Controls the randomness of the estimator. Pass an int for reproducible
656
+ output across multiple function calls.
657
+
658
+ .. versionadded:: 0.2.1
659
+
660
+ Returns
661
+ -------
662
+ quantile_scores_df : pd.DataFrame
663
+ A DataFrame containing the accuracy scores for each metric across
664
+ different quantiles.
665
+ best_metrics_per_quantile : pd.Series
666
+ A Series indicating the best-performing metric for each quantile.
667
+ group_bins : np.ndarray
668
+ The bins used for quantile splitting.
669
+ """
670
+ X = check_array(X)
671
+ feature_labels = [f"Feature_{i}" for i in range(X.shape[1])]
672
+ feature_name = f"Feature_{feat_idx}"
673
+
674
+ if metrics_to_consider is None:
675
+ metrics_to_consider = _ALL_METRICS
676
+
677
+ X_df = pd.DataFrame(X, columns=feature_labels)
678
+ y_df = pd.DataFrame(y, columns=["Target"])
679
+ quantiles, group_bins = pd.qcut(X_df[feature_name], q=n_quantiles, retbins=True)
680
+
681
+ X_train, X_test, y_train, y_test = train_test_split(
682
+ X_df, y_df, test_size=0.25, stratify=quantiles, random_state=random_state
683
+ )
684
+
685
+ clf.fit(X_train, y_train.to_numpy().ravel())
686
+ grouped_test_data = X_test.groupby(quantiles, observed=False)
687
+
688
+ quantile_scores = []
689
+ for metric in metrics_to_consider:
690
+ scores_for_metric = [
691
+ accuracy_score(
692
+ y_test.loc[subdf.index], clf.predict(subdf.to_numpy(), metric=metric)
693
+ )
694
+ for _, subdf in grouped_test_data
695
+ ]
696
+ quantile_scores.append(scores_for_metric)
697
+
698
+ quantile_scores = np.array(quantile_scores) * 100
699
+ quantile_scores_df = pd.DataFrame(
700
+ data=quantile_scores,
701
+ index=metrics_to_consider,
702
+ columns=[f"Quantile {i+1}" for i in range(n_quantiles)],
703
+ )
704
+
705
+ best_metrics_per_quantile = quantile_scores_df.idxmax()
706
+
707
+ return quantile_scores_df, best_metrics_per_quantile, group_bins