distclassipy 0.1.6a0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,6 +3,15 @@
3
3
  This module contains the DistanceMetricClassifier introduced by Chaini et al. (2024)
4
4
  in "Light Curve Classification with DistClassiPy: a new distance-based classifier"
5
5
 
6
+
7
+ .. autoclass:: distclassipy.classifier.DistanceMetricClassifier
8
+ :members:
9
+ :exclude-members: set_fit_request, set_predict_request
10
+
11
+ .. doctest-skip::
12
+
13
+ .. skip::
14
+
6
15
  Copyright (C) 2024 Siddharth Chaini
7
16
  -----
8
17
  This program is free software: you can redistribute it and/or modify
@@ -19,8 +28,7 @@ You should have received a copy of the GNU General Public License
19
28
  along with this program. If not, see <https://www.gnu.org/licenses/>.
20
29
  """
21
30
 
22
- import warnings
23
- from typing import Callable
31
+ from typing import Callable, Tuple
24
32
 
25
33
  import numpy as np
26
34
 
@@ -29,11 +37,12 @@ import pandas as pd
29
37
  import scipy
30
38
 
31
39
  from sklearn.base import BaseEstimator, ClassifierMixin
32
- from sklearn.neighbors import KernelDensity
40
+ from sklearn.metrics import accuracy_score
41
+ from sklearn.model_selection import train_test_split
33
42
  from sklearn.utils.multiclass import unique_labels
34
43
  from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
35
44
 
36
- from .distances import Distance
45
+ from .distances import Distance, _ALL_METRICS
37
46
 
38
47
  # Hardcoded source packages to check for distance metrics.
39
48
  METRIC_SOURCES_ = {
@@ -42,6 +51,52 @@ METRIC_SOURCES_ = {
42
51
  }
43
52
 
44
53
 
def initialize_metric_function(metric):
    """Resolve ``metric`` into a metric function and a ``cdist`` argument.

    Parameters
    ----------
    metric : str or callable
        If callable, it is used directly. If a string, its lowercase form is
        looked up as an attribute on each source in ``METRIC_SOURCES_`` (in
        iteration order) and the first match wins.

    Returns
    -------
    metric_fn_ : callable
        The resolved metric function.
    metric_arg_ : str or callable
        The value to pass as ``metric`` to ``scipy.spatial.distance.cdist``.
        This is the original string when the metric was found in
        ``scipy.spatial.distance`` (so SciPy can use its optimized
        implementation), otherwise the resolved function itself.

    Raises
    ------
    TypeError
        If ``metric`` is neither a string nor a callable.
    ValueError
        If ``metric`` is a string that does not name a known metric.
    """
    if callable(metric):
        return metric, metric

    # Previously a non-str, non-callable argument fell through both branches
    # and crashed with an UnboundLocalError at the return statement.
    if not isinstance(metric, str):
        raise TypeError(
            "metric must be a string or a callable, got "
            f"{type(metric).__name__} instead."
        )

    metric_str_lowercase = metric.lower()
    for package_str, source in METRIC_SOURCES_.items():
        from_scipy = package_str == "scipy.spatial.distance"

        # Don't use scipy for jaccard as their implementation only works with
        # booleans - use custom jaccard instead
        if from_scipy and metric_str_lowercase == "jaccard":
            continue

        if hasattr(source, metric_str_lowercase):
            metric_fn_ = getattr(source, metric_str_lowercase)
            # Use the string as an argument if it belongs to scipy as it is
            # optimized
            metric_arg_ = metric if from_scipy else metric_fn_
            return metric_fn_, metric_arg_

    raise ValueError(
        f"{metric} metric not found. Please pass a string of the "
        "name of a metric in scipy.spatial.distance or "
        "distances.Distance, or pass a metric function directly. For a "
        "list of available metrics, see: "
        "https://sidchaini.github.io/DistClassiPy/distances.html or "
        "https://docs.scipy.org/doc/scipy/reference/spatial.distance.html"
    )
98
+
99
+
45
100
  class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
46
101
  """A distance-based classifier that supports different distance metrics.
47
102
 
@@ -55,8 +110,6 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
55
110
 
56
111
  Parameters
57
112
  ----------
58
- metric : str or callable, default="euclidean"
59
- The distance metric to use for calculating the distance between features.
60
113
  scale : bool, default=True
61
114
  Whether to scale the distance between the test object and the centroid for a
62
115
  class in the feature space. If True, the data will be scaled based on the
@@ -71,47 +124,15 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
71
124
 
72
125
  .. versionadded:: 0.1.0
73
126
 
74
- calculate_kde : bool, default=False
75
- Whether to calculate a kernel density estimate based confidence parameter.
76
- .. deprecated:: 0.2.0
77
- This parameter will be removed in a future version and only the
78
- distance confidence parameter will be available.
79
- calculate_1d_dist : bool, default=False
80
- Whether to calculate the 1-dimensional distance based confidence parameter.
81
- .. deprecated:: 0.2.0
82
- This parameter will be removed in a future version and only the
83
- distance confidence parameter will be available.
84
- Whether to calculate the 1-dimensional distance based confidence parameter.
85
127
 
86
128
  Attributes
87
129
  ----------
88
- metric : str or callable
89
- The distance metric used for classification.
90
130
  scale : bool
91
131
  Indicates whether the data is scaled.
92
132
  central_stat : str
93
133
  The statistic used for calculating central tendency.
94
134
  dispersion_stat : str
95
135
  The statistic used for calculating dispersion.
96
- calculate_kde : bool
97
- Indicates whether a kernel density estimate is calculated.
98
- .. deprecated:: 0.2.0
99
- This parameter will be removed in a future version.
100
- calculate_1d_dist : bool
101
- Indicates whether 1-dimensional distances are calculated.
102
- .. deprecated:: 0.2.0
103
- This parameter will be removed in a future version.
104
-
105
- See Also
106
- --------
107
- scipy.spatial.dist : Other distance metrics provided in SciPy
108
- distclassipy.Distance : Distance metrics included with DistClassiPy
109
-
110
- Notes
111
- -----
112
- If using distance metrics supported by SciPy, it is desirable to pass a string,
113
- which allows SciPy to use an optimized C version of the code instead of the slower
114
- Python version.
115
136
 
116
137
  References
117
138
  ----------
@@ -125,88 +146,27 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
125
146
  >>> X, y = make_classification(n_samples=1000, n_features=4,
126
147
  ... n_informative=2, n_redundant=0,
127
148
  ... random_state=0, shuffle=False)
128
- >>> clf = dcpy.DistanceMetricClassifier(metric="canberra")
149
+ >>> clf = dcpy.DistanceMetricClassifier()
129
150
  >>> clf.fit(X, y)
130
151
  DistanceMetricClassifier(...)
131
- >>> print(clf.predict([[0, 0, 0, 0]]))
152
+ >>> print(clf.predict([[0, 0, 0, 0]], metric="canberra"))
132
153
  [0]
133
154
  """
134
155
 
135
156
  def __init__(
136
157
  self,
137
- metric: str | Callable = "euclidean",
138
158
  scale: bool = True,
139
159
  central_stat: str = "median",
140
160
  dispersion_stat: str = "std",
141
- calculate_kde: bool = True, # deprecated in 0.2.0
142
- calculate_1d_dist: bool = True, # deprecated in 0.2.0
143
- ):
161
+ ) -> None:
144
162
  """Initialize the classifier with specified parameters."""
145
- self.metric = metric
146
163
  self.scale = scale
147
164
  self.central_stat = central_stat
148
165
  self.dispersion_stat = dispersion_stat
149
- if calculate_kde:
150
- warnings.warn(
151
- "calculate_kde is deprecated and will be removed in version 0.2.0",
152
- DeprecationWarning,
153
- )
154
- self.calculate_kde = calculate_kde
155
-
156
- if calculate_1d_dist:
157
- warnings.warn(
158
- "calculate_1d_dist is deprecated and will be removed in version 0.2.0",
159
- DeprecationWarning,
160
- )
161
- self.calculate_1d_dist = calculate_1d_dist
162
-
163
- def initialize_metric_function(self):
164
- """Set the metric function based on the provided metric.
165
166
 
166
- If the metric is a string, the function will look for a corresponding
167
- function in scipy.spatial.distance or distances.Distance. If the metric
168
- is a function, it will be used directly.
169
- """
170
- if callable(self.metric):
171
- self.metric_fn_ = self.metric
172
- self.metric_arg_ = self.metric
173
-
174
- elif isinstance(self.metric, str):
175
- metric_str_lowercase = self.metric.lower()
176
- metric_found = False
177
- for package_str, source in METRIC_SOURCES_.items():
178
-
179
- # Don't use scipy for jaccard as their implementation only works with
180
- # booleans - use custom jaccard instead
181
- if (
182
- package_str == "scipy.spatial.distance"
183
- and metric_str_lowercase == "jaccard"
184
- ):
185
- continue
186
-
187
- if hasattr(source, metric_str_lowercase):
188
- self.metric_fn_ = getattr(source, metric_str_lowercase)
189
- metric_found = True
190
-
191
- # Use the string as an argument if it belongs to scipy as it is
192
- # optimized
193
- self.metric_arg_ = (
194
- self.metric
195
- if package_str == "scipy.spatial.distance"
196
- else self.metric_fn_
197
- )
198
- break
199
- if not metric_found:
200
- raise ValueError(
201
- f"{self.metric} metric not found. Please pass a string of the "
202
- "name of a metric in scipy.spatial.distance or "
203
- "distances.Distance, or pass a metric function directly. For a "
204
- "list of available metrics, see: "
205
- "https://sidchaini.github.io/DistClassiPy/distances.html or "
206
- "https://docs.scipy.org/doc/scipy/reference/spatial.distance.html"
207
- )
208
-
209
- def fit(self, X: np.array, y: np.array, feat_labels: list[str] = None):
167
+ def fit(
168
+ self, X: np.array, y: np.array, feat_labels: list[str] = None
169
+ ) -> "DistanceMetricClassifier":
210
170
  """Calculate the feature space centroid for all classes.
211
171
 
212
172
  This function calculates the feature space centroid in the training
@@ -236,8 +196,6 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
236
196
  1
237
197
  ] # Number of features seen during fit - required for sklearn compatibility.
238
198
 
239
- self.initialize_metric_function()
240
-
241
199
  if feat_labels is None:
242
200
  feat_labels = [f"Feature_{x}" for x in range(X.shape[1])]
243
201
 
@@ -281,30 +239,15 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
281
239
  )
282
240
  self.df_iqr_ = df_iqr
283
241
 
284
- if self.calculate_kde:
285
- warnings.warn(
286
- "KDE calculation is deprecated and will be removed in version 0.2.0",
287
- DeprecationWarning,
288
- )
289
- self.kde_dict_ = {}
290
-
291
- for cl in self.classes_:
292
- subX = X[y == cl]
293
- # Implement the following in an if-else to save computational time.
294
- # kde = KernelDensity(bandwidth='scott', metric=self.metric)
295
- # kde.fit(subX)
296
- kde = KernelDensity(
297
- bandwidth="scott",
298
- metric="pyfunc",
299
- metric_params={"func": self.metric_fn_},
300
- )
301
- kde.fit(subX)
302
- self.kde_dict_[cl] = kde
303
242
  self.is_fitted_ = True
304
243
 
305
244
  return self
306
245
 
307
- def predict(self, X: np.array):
246
+ def predict(
247
+ self,
248
+ X: np.array,
249
+ metric: str | Callable = "euclidean",
250
+ ) -> np.ndarray:
308
251
  """Predict the class labels for the provided X.
309
252
 
310
253
  The prediction is based on the distance of each data point in the input sample
@@ -315,18 +258,35 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
315
258
  ----------
316
259
  X : array-like of shape (n_samples, n_features)
317
260
  The input samples.
261
+ metric : str or callable, default="euclidean"
262
+ The distance metric to use for calculating the distance between features.
263
+
264
+ .. versionchanged:: 0.2.0
265
+ The metric is now specified at prediction time rather
266
+ than during initialization, providing greater flexibility.
318
267
 
319
268
  Returns
320
269
  -------
321
270
  y : ndarray of shape (n_samples,)
322
271
  The predicted classes.
272
+
273
+ See Also
274
+ --------
275
+ scipy.spatial.dist : Other distance metrics provided in SciPy
276
+ distclassipy.Distance : Distance metrics included with DistClassiPy
277
+
278
+ Notes
279
+ -----
280
+ If using distance metrics supported by SciPy, it is desirable to pass a string,
281
+ which allows SciPy to use an optimized C version of the code instead of the
282
+ slower Python version.
323
283
  """
324
284
  check_is_fitted(self, "is_fitted_")
325
285
  X = check_array(X)
326
-
286
+ metric_fn_, metric_arg_ = initialize_metric_function(metric)
327
287
  if not self.scale:
328
288
  dist_arr = scipy.spatial.distance.cdist(
329
- XA=X, XB=self.df_centroid_.to_numpy(), metric=self.metric_arg_
289
+ XA=X, XB=self.df_centroid_.to_numpy(), metric=metric_arg_
330
290
  )
331
291
 
332
292
  else:
@@ -343,16 +303,18 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
343
303
  w = wtdf.loc[cl].to_numpy() # 1/std dev
344
304
  XB = XB * w # w is for this class only
345
305
  XA = X * w # w is for this class only
346
- cl_dist = scipy.spatial.distance.cdist(
347
- XA=XA, XB=XB, metric=self.metric_arg_
348
- )
306
+ cl_dist = scipy.spatial.distance.cdist(XA=XA, XB=XB, metric=metric_arg_)
349
307
  dist_arr_list.append(cl_dist)
350
308
  dist_arr = np.column_stack(dist_arr_list)
351
309
 
352
310
  y_pred = self.classes_[dist_arr.argmin(axis=1)]
353
311
  return y_pred
354
312
 
355
- def predict_and_analyse(self, X: np.array):
313
+ def predict_and_analyse(
314
+ self,
315
+ X: np.array,
316
+ metric: str | Callable = "euclidean",
317
+ ) -> np.ndarray:
356
318
  """Predict the class labels for the provided X and perform analysis.
357
319
 
358
320
  The prediction is based on the distance of each data point in the input sample
@@ -366,18 +328,35 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
366
328
  ----------
367
329
  X : array-like of shape (n_samples, n_features)
368
330
  The input samples.
331
+ metric : str or callable, default="euclidean"
332
+ The distance metric to use for calculating the distance between features.
333
+
369
334
 
370
335
  Returns
371
336
  -------
372
337
  y : ndarray of shape (n_samples,)
373
338
  The predicted classes.
339
+
340
+ See Also
341
+ --------
342
+ scipy.spatial.dist : Other distance metrics provided in SciPy
343
+ distclassipy.Distance : Distance metrics included with DistClassiPy
344
+
345
+ Notes
346
+ -----
347
+ If using distance metrics supported by SciPy, it is desirable to pass a string,
348
+ which allows SciPy to use an optimized C version of the code instead
349
+ of the slower Python version.
350
+
374
351
  """
375
352
  check_is_fitted(self, "is_fitted_")
376
353
  X = check_array(X)
377
354
 
355
+ metric_fn_, metric_arg_ = initialize_metric_function(metric)
356
+
378
357
  if not self.scale:
379
358
  dist_arr = scipy.spatial.distance.cdist(
380
- XA=X, XB=self.df_centroid_.to_numpy(), metric=self.metric_arg_
359
+ XA=X, XB=self.df_centroid_.to_numpy(), metric=metric_arg_
381
360
  )
382
361
 
383
362
  else:
@@ -394,9 +373,7 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
394
373
  w = wtdf.loc[cl].to_numpy() # 1/std dev
395
374
  XB = XB * w # w is for this class only
396
375
  XA = X * w # w is for this class only
397
- cl_dist = scipy.spatial.distance.cdist(
398
- XA=XA, XB=XB, metric=self.metric_arg_
399
- )
376
+ cl_dist = scipy.spatial.distance.cdist(XA=XA, XB=XB, metric=metric_arg_)
400
377
  dist_arr_list.append(cl_dist)
401
378
  dist_arr = np.column_stack(dist_arr_list)
402
379
 
@@ -409,78 +386,15 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
409
386
 
410
387
  y_pred = self.classes_[dist_arr.argmin(axis=1)]
411
388
 
412
- if self.calculate_kde:
413
- warnings.warn(
414
- "KDE calculation in predict_and_analyse is deprecated "
415
- "and will be removed in version 0.2.0",
416
- DeprecationWarning,
417
- )
418
- # NEW: Rescale in terms of median likelihoods - calculate here
419
- scale_factors = np.exp(
420
- [
421
- self.kde_dict_[cl].score_samples(
422
- self.df_centroid_.loc[cl].to_numpy().reshape(1, -1)
423
- )[0]
424
- for cl in self.classes_
425
- ]
426
- )
427
-
428
- likelihood_arr = []
429
- for k in self.kde_dict_.keys():
430
- log_pdf = self.kde_dict_[k].score_samples(X)
431
- likelihood_val = np.exp(log_pdf)
432
- likelihood_arr.append(likelihood_val)
433
- self.likelihood_arr_ = np.array(likelihood_arr).T
434
-
435
- # NEW: Rescale in terms of median likelihoods - rescale here
436
- self.likelihood_arr_ = self.likelihood_arr_ / scale_factors
437
- if self.calculate_1d_dist:
438
- warnings.warn(
439
- "calculate_1d_dist is deprecated and will be removed in version 0.2.0",
440
- DeprecationWarning,
441
- )
442
- conf_cl = []
443
- Xdf_temp = pd.DataFrame(data=X, columns=self.df_centroid_.columns)
444
- for cl in self.classes_:
445
- sum_1d_dists = np.zeros(shape=(len(Xdf_temp)))
446
- for feat in Xdf_temp.columns:
447
- dists = scipy.spatial.distance.cdist(
448
- XA=np.zeros(shape=(1, 1)),
449
- XB=(self.df_centroid_.loc[cl] - Xdf_temp)[feat]
450
- .to_numpy()
451
- .reshape(-1, 1),
452
- metric=self.metric_arg_,
453
- ).ravel()
454
- if self.scale and self.dispersion_stat == "std":
455
- sum_1d_dists = sum_1d_dists + dists / self.df_std_.loc[cl, feat]
456
- elif self.scale and self.dispersion_stat == "std":
457
- sum_1d_dists = sum_1d_dists + dists / self.df_iqr_.loc[cl, feat]
458
- else:
459
- sum_1d_dists = sum_1d_dists + dists
460
- confs = 1 / np.clip(sum_1d_dists, a_min=np.finfo(float).eps, a_max=None)
461
- conf_cl.append(confs)
462
- conf_cl = np.array(conf_cl)
463
- self.conf_cl_ = conf_cl
464
389
  self.analyis_ = True
465
390
 
466
391
  return y_pred
467
392
 
468
- def calculate_confidence(self, method: str = "distance_inverse"):
393
+ def calculate_confidence(self):
469
394
  """Calculate the confidence for each prediction.
470
395
 
471
- The confidence is calculated based on either the distance of each data point to
472
- the centroids of the training data, optionally the kernel density estimate or
473
- 1-dimensional distance.
474
-
475
- Parameters
476
- ----------
477
- method : {"distance_inverse", "1d_distance_inverse", "kde_likelihood"},
478
- default="distance_inverse"
479
- The method to use for calculating confidence. Default is
480
- 'distance_inverse'.
481
- .. deprecated:: 0.2.0
482
- The methods '1d_distance_inverse' and
483
- 'kde_likelihood' will be removed in version 0.2.0.
396
+ The confidence is calculated as the inverse of the distance of each data point
397
+ to the centroids of the training data.
484
398
  """
485
399
  check_is_fitted(self, "is_fitted_")
486
400
  if not hasattr(self, "analyis_"):
@@ -490,44 +404,278 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
490
404
  )
491
405
 
492
406
  # Calculate confidence for each prediction
493
- if method == "distance_inverse":
494
- self.confidence_df_ = 1 / np.clip(
495
- self.centroid_dist_df_, a_min=np.finfo(float).eps, a_max=None
496
- )
497
- self.confidence_df_.columns = [
498
- x.replace("_dist", "_conf") for x in self.confidence_df_.columns
499
- ]
500
-
501
- elif method == "1d_distance_inverse":
502
- warnings.warn(
503
- "The '1d_distance_inverse' method is deprecated "
504
- "and will be removed in version 0.2.0",
505
- DeprecationWarning,
506
- )
507
- if not self.calculate_1d_dist:
508
- raise ValueError(
509
- "method='1d_distance_inverse' is only valid if calculate_1d_dist "
510
- "is set to True"
511
- )
512
- self.confidence_df_ = pd.DataFrame(
513
- data=self.conf_cl_.T, columns=[f"{x}_conf" for x in self.classes_]
514
- )
407
+ self.confidence_df_ = 1 / np.clip(
408
+ self.centroid_dist_df_, a_min=np.finfo(float).eps, a_max=None
409
+ )
410
+ self.confidence_df_.columns = [
411
+ x.replace("_dist", "_conf") for x in self.confidence_df_.columns
412
+ ]
413
+
414
+ return self.confidence_df_.to_numpy()
415
+
416
+ def score(self, X, y, metric: str | Callable = "euclidean") -> float:
417
+ """Return the mean accuracy on the given test data and labels.
418
+
419
+ Parameters
420
+ ----------
421
+ X : array-like of shape (n_samples, n_features)
422
+ Test samples.
423
+ y : array-like of shape (n_samples,)
424
+ True labels for X.
425
+ metric : str or callable, default="euclidean"
426
+ The distance metric to use for calculating the distance between features.
427
+
428
+ Returns
429
+ -------
430
+ score : float
431
+ Mean accuracy of self.predict(X) wrt. y.
432
+ """
433
+ y_pred = self.predict(X, metric=metric)
434
+ return accuracy_score(y, y_pred)
435
+
436
+
def find_best_metrics(
    clf: "DistanceMetricClassifier",
    X: np.ndarray,
    y: np.ndarray,
    feat_idx: int,
    n_quantiles: int = 4,
    metrics_to_consider: list[str] | None = None,
    random_state=None,
) -> Tuple[pd.DataFrame, pd.Series, np.ndarray]:
    """Evaluate and find the best distance metrics for a given feature.

    The data is binned into quantiles of the feature at ``feat_idx``, split
    75/25 into train and test sets (stratified on the quantile bin), and
    ``clf`` is fitted on the training part. Each metric is then scored by
    accuracy on the test samples of every quantile.

    .. versionadded:: 0.2.0

    Parameters
    ----------
    clf : DistanceMetricClassifier
        The classifier instance to be used for evaluation. It is fitted in
        place on the training split.
    X : np.ndarray
        The input feature matrix.
    y : np.ndarray
        The target labels.
    feat_idx : int
        The index of the feature to be used for quantile splitting.
    n_quantiles : int, default=4
        The number of quantiles to split the data into.
    metrics_to_consider : list of str, optional
        A list of distance metrics to evaluate. If None, all available
        metrics within DistClassiPy will be considered.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``train_test_split`` so the train/test split (and hence
        the scores) can be made reproducible. ``None`` keeps the previous
        unseeded behaviour.

    Returns
    -------
    quantile_scores_df : pd.DataFrame
        Accuracy scores in percent, one row per metric and one column per
        quantile.
    best_metrics_per_quantile : pd.Series
        The best-performing metric for each quantile.
    group_bins : np.ndarray
        The bin edges used for quantile splitting.
    """
    X = check_array(X)
    feature_labels = [f"Feature_{i}" for i in range(X.shape[1])]
    feature_name = f"Feature_{feat_idx}"

    if metrics_to_consider is None:
        metrics_to_consider = _ALL_METRICS

    X_df = pd.DataFrame(X, columns=feature_labels)
    # Coerce to a plain 1-d array first: building the frame directly from a
    # pandas Series would align on its name/index and could yield NaNs.
    y_df = pd.DataFrame({"Target": np.asarray(y).ravel()})
    quantiles, group_bins = pd.qcut(X_df[feature_name], q=n_quantiles, retbins=True)

    X_train, X_test, y_train, y_test = train_test_split(
        X_df,
        y_df,
        test_size=0.25,
        stratify=quantiles,
        random_state=random_state,
    )

    clf.fit(X_train, y_train.to_numpy().ravel())
    # observed=False keeps every quantile bin as a group.
    grouped_test_data = X_test.groupby(quantiles, observed=False)

    quantile_scores = [
        [
            accuracy_score(
                y_test.loc[subdf.index],
                clf.predict(subdf.to_numpy(), metric=metric),
            )
            for _, subdf in grouped_test_data
        ]
        for metric in metrics_to_consider
    ]

    # Convert accuracy fractions to percentages.
    quantile_scores = np.array(quantile_scores) * 100
    quantile_scores_df = pd.DataFrame(
        data=quantile_scores,
        index=metrics_to_consider,
        columns=[f"Quantile {i+1}" for i in range(n_quantiles)],
    )

    best_metrics_per_quantile = quantile_scores_df.idxmax()

    return quantile_scores_df, best_metrics_per_quantile, group_bins
518
+
519
+
class EnsembleDistanceClassifier(BaseEstimator, ClassifierMixin):
    """An ensemble classifier that uses different metrics for each quantile.

    This classifier splits the data into quantiles based on a specified
    feature and uses the best-scoring distance metric for each quantile when
    predicting, generally leading to better performance.
    Note, however, that this involves evaluating every candidate metric on a
    held-out part of the training set, making it more computationally
    expensive.

    .. versionadded:: 0.2.0
    """

    def __init__(
        self,
        feat_idx: int,
        scale: bool = True,
        central_stat: str = "median",
        dispersion_stat: str = "std",
        metrics_to_consider: list[str] | None = None,
    ) -> None:
        """Initialize the classifier with specified parameters.

        Parameters
        ----------
        feat_idx : int
            The index of the feature to be used for quantile splitting.
        scale : bool, default=True
            Whether to scale the distance between the test object and the centroid.
        central_stat : str, default="median"
            The statistic used to calculate the central tendency of the data.
        dispersion_stat : str, default="std"
            The statistic used to calculate the dispersion of the data.
        metrics_to_consider : list of str, optional
            A list of distance metrics to evaluate. If None, all available
            metrics within DistClassiPy will be considered.
        """
        self.feat_idx = feat_idx
        self.scale = scale
        self.central_stat = central_stat
        self.dispersion_stat = dispersion_stat
        self.metrics_to_consider = metrics_to_consider

    def fit(
        self, X: np.ndarray, y: np.ndarray, n_quantiles: int = 4
    ) -> "EnsembleDistanceClassifier":
        """Fit the ensemble classifier using the best metrics for each quantile.

        Parameters
        ----------
        X : np.ndarray
            The input feature matrix.
        y : np.ndarray
            The target labels.
        n_quantiles : int, default=4
            The number of quantiles to split the data into.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        self.clf_ = DistanceMetricClassifier(
            scale=self.scale,
            central_stat=self.central_stat,
            dispersion_stat=self.dispersion_stat,
        )

        # Find best metrics based on training set quantiles
        self.quantile_scores_df_, self.best_metrics_per_quantile_, self.group_bins = (
            self.evaluate_metrics(X, y, n_quantiles)
        )

        # Ensure the bins work with values outside of training data
        self.group_bins[0] = -np.inf
        self.group_bins[-1] = np.inf

        self.group_labels = [f"Quantile {i+1}" for i in range(n_quantiles)]
        # evaluate_metrics fitted clf_ on a training subset only; refit on
        # the full data for the final model.
        self.clf_.fit(X, y)
        self.is_fitted_ = True
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict class labels using the best metric for each quantile.

        Parameters
        ----------
        X : np.ndarray
            The input samples.

        Returns
        -------
        predictions : np.ndarray
            The predicted class labels, with the same dtype as the fitted
            classifier's ``classes_``.
        """
        check_is_fitted(self, "is_fitted_")
        X = check_array(X)

        quantiles = pd.cut(
            X[:, self.feat_idx], bins=self.group_bins, labels=self.group_labels
        )
        quantile_indices = quantiles.codes  # Get integer codes for quantiles

        # Allocate with the label dtype rather than a hard-coded int so that
        # string (or other non-integer) class labels are preserved.
        predictions = np.empty(X.shape[0], dtype=self.clf_.classes_.dtype)

        for i, label in enumerate(self.group_labels):
            in_quantile = quantile_indices == i
            if not in_quantile.any():
                continue
            best_metric = self.best_metrics_per_quantile_.loc[label]
            predictions[in_quantile] = self.clf_.predict(
                X[in_quantile], metric=best_metric
            )

        return predictions

    def evaluate_metrics(
        self, X: np.ndarray, y: np.ndarray, n_quantiles: int = 4
    ) -> Tuple[pd.DataFrame, pd.Series, np.ndarray]:
        """Evaluate and find the best distance metrics for the specified feature.

        This method uses the standalone `find_best_metrics` function to evaluate
        different distance metrics and determine the best-performing ones for
        each quantile. It operates on ``self.clf_``, which is created by
        ``fit``.

        Parameters
        ----------
        X : np.ndarray
            The input feature matrix.
        y : np.ndarray
            The target labels.
        n_quantiles : int, default=4
            The number of quantiles to split the data into.

        Returns
        -------
        quantile_scores_df : pd.DataFrame
            A DataFrame containing the accuracy scores for each metric across
            different quantiles.
        best_metrics_per_quantile : pd.Series
            A Series indicating the best-performing metric for each quantile.
        group_bins : np.ndarray
            The bins used for quantile splitting.
        """
        return find_best_metrics(
            self.clf_, X, y, self.feat_idx, n_quantiles, self.metrics_to_consider
        )
+ )