distclassipy 0.1.5__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,6 +3,15 @@
 This module contains the DistanceMetricClassifier introduced by Chaini et al. (2024)
 in "Light Curve Classification with DistClassiPy: a new distance-based classifier"
 
+
+.. autoclass:: distclassipy.classifier.DistanceMetricClassifier
+    :members:
+    :exclude-members: set_fit_request, set_predict_request
+
+.. doctest-skip::
+
+.. skip::
+
 Copyright (C) 2024 Siddharth Chaini
 -----
 This program is free software: you can redistribute it and/or modify
@@ -19,7 +28,7 @@ You should have received a copy of the GNU General Public License
 along with this program. If not, see <https://www.gnu.org/licenses/>.
 """
 
-from typing import Callable
+from typing import Callable, Tuple
 
 import numpy as np
 
@@ -28,11 +37,12 @@ import pandas as pd
 import scipy
 
 from sklearn.base import BaseEstimator, ClassifierMixin
-from sklearn.neighbors import KernelDensity
+from sklearn.metrics import accuracy_score
+from sklearn.model_selection import train_test_split
 from sklearn.utils.multiclass import unique_labels
 from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
 
-from .distances import Distance
+from .distances import Distance, _ALL_METRICS
 
 # Hardcoded source packages to check for distance metrics.
 METRIC_SOURCES_ = {
@@ -41,6 +51,52 @@ METRIC_SOURCES_ = {
 }
 
 
+def initialize_metric_function(metric):
+    """Set the metric function based on the provided metric.
+
+    If the metric is a string, the function will look for a corresponding
+    function in scipy.spatial.distance or distances.Distance. If the metric
+    is a function, it will be used directly.
+    """
+    if callable(metric):
+        metric_fn_ = metric
+        metric_arg_ = metric
+
+    elif isinstance(metric, str):
+        metric_str_lowercase = metric.lower()
+        metric_found = False
+        for package_str, source in METRIC_SOURCES_.items():
+
+            # Don't use scipy for jaccard as their implementation only works with
+            # booleans - use custom jaccard instead
+            if (
+                package_str == "scipy.spatial.distance"
+                and metric_str_lowercase == "jaccard"
+            ):
+                continue
+
+            if hasattr(source, metric_str_lowercase):
+                metric_fn_ = getattr(source, metric_str_lowercase)
+                metric_found = True
+
+                # Use the string as an argument if it belongs to scipy as it is
+                # optimized
+                metric_arg_ = (
+                    metric if package_str == "scipy.spatial.distance" else metric_fn_
+                )
+                break
+        if not metric_found:
+            raise ValueError(
+                f"{metric} metric not found. Please pass a string of the "
+                "name of a metric in scipy.spatial.distance or "
+                "distances.Distance, or pass a metric function directly. For a "
+                "list of available metrics, see: "
+                "https://sidchaini.github.io/DistClassiPy/distances.html or "
+                "https://docs.scipy.org/doc/scipy/reference/spatial.distance.html"
+            )
+    return metric_fn_, metric_arg_
+
+
 class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
     """A distance-based classifier that supports different distance metrics.
 
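The metric-resolution logic that used to live on the classifier is now the module-level helper added in this hunk. A minimal sketch of how it behaves, assuming distclassipy 0.2.0 is installed (printed values are illustrative):

```python
import numpy as np
from distclassipy.classifier import initialize_metric_function

# A SciPy-backed name resolves to the SciPy function, but the string itself is
# returned as the cdist argument so SciPy can use its optimized C path.
fn, arg = initialize_metric_function("cityblock")
print(fn.__name__, arg)  # cityblock cityblock

# A custom callable is passed through unchanged for both return values.
fn, arg = initialize_metric_function(lambda u, v: np.abs(u - v).sum())
print(fn is arg)  # True

# An unrecognized name raises ValueError pointing at the available metric lists.
```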
@@ -54,8 +110,6 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
 
     Parameters
     ----------
-    metric : str or callable, default="euclidean"
-        The distance metric to use for calculating the distance between features.
     scale : bool, default=True
         Whether to scale the distance between the test object and the centroid for a
         class in the feature space. If True, the data will be scaled based on the
@@ -70,36 +124,15 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
 
         .. versionadded:: 0.1.0
 
-    calculate_kde : bool, default=False
-        Whether to calculate a kernel density estimate based confidence parameter.
-    calculate_1d_dist : bool, default=False
-        Whether to calculate the 1-dimensional distance based confidence parameter.
 
     Attributes
     ----------
-    metric : str or callable
-        The distance metric used for classification.
     scale : bool
         Indicates whether the data is scaled.
     central_stat : str
         The statistic used for calculating central tendency.
     dispersion_stat : str
         The statistic used for calculating dispersion.
-    calculate_kde : bool
-        Indicates whether a kernel density estimate is calculated.
-    calculate_1d_dist : bool
-        Indicates whether 1-dimensional distances are calculated.
-
-    See Also
-    --------
-    scipy.spatial.dist : Other distance metrics provided in SciPy
-    distclassipy.Distance : Distance metrics included with DistClassiPy
-
-    Notes
-    -----
-    If using distance metrics supported by SciPy, it is desirable to pass a string,
-    which allows SciPy to use an optimized C version of the code instead of the slower
-    Python version.
 
     References
     ----------
@@ -113,77 +146,27 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
     >>> X, y = make_classification(n_samples=1000, n_features=4,
     ...                            n_informative=2, n_redundant=0,
     ...                            random_state=0, shuffle=False)
-    >>> clf = dcpy.DistanceMetricClassifier(metric="canberra")
+    >>> clf = dcpy.DistanceMetricClassifier()
     >>> clf.fit(X, y)
     DistanceMetricClassifier(...)
-    >>> print(clf.predict([[0, 0, 0, 0]]))
+    >>> print(clf.predict([[0, 0, 0, 0]], metric="canberra"))
     [0]
     """
 
     def __init__(
         self,
-        metric: str | Callable = "euclidean",
         scale: bool = True,
         central_stat: str = "median",
         dispersion_stat: str = "std",
-        calculate_kde: bool = True,
-        calculate_1d_dist: bool = True,
-    ):
+    ) -> None:
         """Initialize the classifier with specified parameters."""
-        self.metric = metric
         self.scale = scale
         self.central_stat = central_stat
         self.dispersion_stat = dispersion_stat
-        self.calculate_kde = calculate_kde
-        self.calculate_1d_dist = calculate_1d_dist
-
-    def initialize_metric_function(self):
-        """Set the metric function based on the provided metric.
-
-        If the metric is a string, the function will look for a corresponding
-        function in scipy.spatial.distance or distances.Distance. If the metric
-        is a function, it will be used directly.
-        """
-        if callable(self.metric):
-            self.metric_fn_ = self.metric
-            self.metric_arg_ = self.metric
-
-        elif isinstance(self.metric, str):
-            metric_str_lowercase = self.metric.lower()
-            metric_found = False
-            for package_str, source in METRIC_SOURCES_.items():
-
-                # Don't use scipy for jaccard as their implementation only works with
-                # booleans - use custom jaccard instead
-                if (
-                    package_str == "scipy.spatial.distance"
-                    and metric_str_lowercase == "jaccard"
-                ):
-                    continue
-
-                if hasattr(source, metric_str_lowercase):
-                    self.metric_fn_ = getattr(source, metric_str_lowercase)
-                    metric_found = True
-
-                    # Use the string as an argument if it belongs to scipy as it is
-                    # optimized
-                    self.metric_arg_ = (
-                        self.metric
-                        if package_str == "scipy.spatial.distance"
-                        else self.metric_fn_
-                    )
-                    break
-            if not metric_found:
-                raise ValueError(
-                    f"{self.metric} metric not found. Please pass a string of the "
-                    "name of a metric in scipy.spatial.distance or "
-                    "distances.Distance, or pass a metric function directly. For a "
-                    "list of available metrics, see: "
-                    "https://sidchaini.github.io/DistClassiPy/distances.html or "
-                    "https://docs.scipy.org/doc/scipy/reference/spatial.distance.html"
-                )
 
-    def fit(self, X: np.array, y: np.array, feat_labels: list[str] = None):
+    def fit(
+        self, X: np.array, y: np.array, feat_labels: list[str] = None
+    ) -> "DistanceMetricClassifier":
         """Calculate the feature space centroid for all classes.
 
         This function calculates the feature space centroid in the training
@@ -209,9 +192,9 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
         """
         X, y = check_X_y(X, y)
         self.classes_ = unique_labels(y)
-        self.n_features_in_ = X.shape[1]
-
-        self.initialize_metric_function()
+        self.n_features_in_ = X.shape[
+            1
+        ]  # Number of features seen during fit - required for sklearn compatibility.
 
         if feat_labels is None:
             feat_labels = [f"Feature_{x}" for x in range(X.shape[1])]
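Because fit() no longer resolves a metric, one fitted classifier can serve predictions under many metrics. A hedged migration sketch, using the API shown in the docstring above (the dataset is illustrative):

```python
import distclassipy as dcpy
from sklearn.datasets import make_classification

X, y = make_classification(
    n_samples=1000, n_features=4, n_informative=2, n_redundant=0,
    random_state=0, shuffle=False,
)

# 0.1.x fixed the metric at construction time:
#   clf = dcpy.DistanceMetricClassifier(metric="canberra")
# 0.2.0 fits once, then chooses the metric per call:
clf = dcpy.DistanceMetricClassifier()
clf.fit(X, y)
print(clf.predict(X[:5], metric="canberra"))
print(clf.predict(X[:5], metric="euclidean"))  # no refit needed
```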
@@ -256,27 +239,15 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
         )
         self.df_iqr_ = df_iqr
 
-        if self.calculate_kde:
-            self.kde_dict_ = {}
-
-            for cl in self.classes_:
-                subX = X[y == cl]
-                # Implement the following in an if-else to save computational time.
-                # kde = KernelDensity(bandwidth='scott', metric=self.metric)
-                # kde.fit(subX)
-                kde = KernelDensity(
-                    bandwidth="scott",
-                    metric="pyfunc",
-                    metric_params={"func": self.metric_fn_},
-                )
-                kde.fit(subX)
-                self.kde_dict_[cl] = kde
-
         self.is_fitted_ = True
 
         return self
 
-    def predict(self, X: np.array):
+    def predict(
+        self,
+        X: np.array,
+        metric: str | Callable = "euclidean",
+    ) -> np.ndarray:
         """Predict the class labels for the provided X.
 
         The prediction is based on the distance of each data point in the input sample
@@ -287,18 +258,35 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
         ----------
         X : array-like of shape (n_samples, n_features)
             The input samples.
+        metric : str or callable, default="euclidean"
+            The distance metric to use for calculating the distance between features.
+
+            .. versionchanged:: 0.2.0
+                The metric is now specified at prediction time rather
+                than during initialization, providing greater flexibility.
 
         Returns
         -------
         y : ndarray of shape (n_samples,)
             The predicted classes.
+
+        See Also
+        --------
+        scipy.spatial.dist : Other distance metrics provided in SciPy
+        distclassipy.Distance : Distance metrics included with DistClassiPy
+
+        Notes
+        -----
+        If using distance metrics supported by SciPy, it is desirable to pass a string,
+        which allows SciPy to use an optimized C version of the code instead of the
+        slower Python version.
         """
         check_is_fitted(self, "is_fitted_")
         X = check_array(X)
-
+        metric_fn_, metric_arg_ = initialize_metric_function(metric)
         if not self.scale:
             dist_arr = scipy.spatial.distance.cdist(
-                XA=X, XB=self.df_centroid_.to_numpy(), metric=self.metric_arg_
+                XA=X, XB=self.df_centroid_.to_numpy(), metric=metric_arg_
             )
 
         else:
@@ -315,16 +303,18 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
                 w = wtdf.loc[cl].to_numpy()  # 1/std dev
                 XB = XB * w  # w is for this class only
                 XA = X * w  # w is for this class only
-                cl_dist = scipy.spatial.distance.cdist(
-                    XA=XA, XB=XB, metric=self.metric_arg_
-                )
+                cl_dist = scipy.spatial.distance.cdist(XA=XA, XB=XB, metric=metric_arg_)
                 dist_arr_list.append(cl_dist)
             dist_arr = np.column_stack(dist_arr_list)
 
         y_pred = self.classes_[dist_arr.argmin(axis=1)]
         return y_pred
 
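Taken together, predict() is a nearest-centroid rule: one cdist call against the per-class centroids (or one call per class in the scaled branch), then an argmin across classes. A standalone sketch of the unscaled case on toy data, not the library's internals verbatim:

```python
import numpy as np
import scipy.spatial.distance

centroids = np.array([[0.0, 0.0], [5.0, 5.0]])  # one row per class
classes = np.array([0, 1])
X = np.array([[1.0, 1.0], [4.0, 6.0]])

# Distance of every sample to every class centroid; the nearest class wins.
dist_arr = scipy.spatial.distance.cdist(XA=X, XB=centroids, metric="euclidean")
y_pred = classes[dist_arr.argmin(axis=1)]
print(y_pred)  # [0 1]
```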
-    def predict_and_analyse(self, X: np.array):
+    def predict_and_analyse(
+        self,
+        X: np.array,
+        metric: str | Callable = "euclidean",
+    ) -> np.ndarray:
         """Predict the class labels for the provided X and perform analysis.
 
         The prediction is based on the distance of each data point in the input sample
@@ -338,18 +328,35 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
         ----------
         X : array-like of shape (n_samples, n_features)
             The input samples.
+        metric : str or callable, default="euclidean"
+            The distance metric to use for calculating the distance between features.
+
 
         Returns
         -------
         y : ndarray of shape (n_samples,)
             The predicted classes.
+
+        See Also
+        --------
+        scipy.spatial.dist : Other distance metrics provided in SciPy
+        distclassipy.Distance : Distance metrics included with DistClassiPy
+
+        Notes
+        -----
+        If using distance metrics supported by SciPy, it is desirable to pass a string,
+        which allows SciPy to use an optimized C version of the code instead
+        of the slower Python version.
+
 
         """
         check_is_fitted(self, "is_fitted_")
         X = check_array(X)
+        metric_fn_, metric_arg_ = initialize_metric_function(metric)
+
         if not self.scale:
             dist_arr = scipy.spatial.distance.cdist(
-                XA=X, XB=self.df_centroid_.to_numpy(), metric=self.metric_arg_
+                XA=X, XB=self.df_centroid_.to_numpy(), metric=metric_arg_
             )
 
         else:
@@ -366,9 +373,7 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
                 w = wtdf.loc[cl].to_numpy()  # 1/std dev
                 XB = XB * w  # w is for this class only
                 XA = X * w  # w is for this class only
-                cl_dist = scipy.spatial.distance.cdist(
-                    XA=XA, XB=XB, metric=self.metric_arg_
-                )
+                cl_dist = scipy.spatial.distance.cdist(XA=XA, XB=XB, metric=metric_arg_)
                 dist_arr_list.append(cl_dist)
             dist_arr = np.column_stack(dist_arr_list)
 
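The scaled branch applies per-class weights w = 1/dispersion to both the test points and that class's centroid before each cdist call, exactly as in the loop above. A toy sketch of that weighting with illustrative values:

```python
import numpy as np
import scipy.spatial.distance

X = np.array([[1.0, 10.0]])
centroids = {0: np.array([0.0, 0.0]), 1: np.array([2.0, 20.0])}
stds = {0: np.array([1.0, 5.0]), 1: np.array([1.0, 10.0])}

dist_cols = []
for cl in (0, 1):
    w = 1.0 / stds[cl]  # inverse dispersion for this class only
    cl_dist = scipy.spatial.distance.cdist(
        XA=X * w, XB=(centroids[cl] * w)[None, :], metric="euclidean"
    )
    dist_cols.append(cl_dist)
dist_arr = np.column_stack(dist_cols)
print(dist_arr.argmin(axis=1))  # [1]: class 1 is closer in scaled space
```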
@@ -381,68 +386,15 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
 
         y_pred = self.classes_[dist_arr.argmin(axis=1)]
 
-        if self.calculate_kde:
-            # NEW: Rescale in terms of median likelihoods - calculate here
-            scale_factors = np.exp(
-                [
-                    self.kde_dict_[cl].score_samples(
-                        self.df_centroid_.loc[cl].to_numpy().reshape(1, -1)
-                    )[0]
-                    for cl in self.classes_
-                ]
-            )
-
-            likelihood_arr = []
-            for k in self.kde_dict_.keys():
-                log_pdf = self.kde_dict_[k].score_samples(X)
-                likelihood_val = np.exp(log_pdf)
-                likelihood_arr.append(likelihood_val)
-            self.likelihood_arr_ = np.array(likelihood_arr).T
-
-            # NEW: Rescale in terms of median likelihoods - rescale here
-            self.likelihood_arr_ = self.likelihood_arr_ / scale_factors
-
-        if self.calculate_1d_dist:
-            conf_cl = []
-            Xdf_temp = pd.DataFrame(data=X, columns=self.df_centroid_.columns)
-            for cl in self.classes_:
-                sum_1d_dists = np.zeros(shape=(len(Xdf_temp)))
-                for feat in Xdf_temp.columns:
-                    dists = scipy.spatial.distance.cdist(
-                        XA=np.zeros(shape=(1, 1)),
-                        XB=(self.df_centroid_.loc[cl] - Xdf_temp)[feat]
-                        .to_numpy()
-                        .reshape(-1, 1),
-                        metric=self.metric_arg_,
-                    ).ravel()
-                    if self.scale and self.dispersion_stat == "std":
-                        sum_1d_dists = sum_1d_dists + dists / self.df_std_.loc[cl, feat]
-                    elif self.scale and self.dispersion_stat == "std":
-                        sum_1d_dists = sum_1d_dists + dists / self.df_iqr_.loc[cl, feat]
-                    else:
-                        sum_1d_dists = sum_1d_dists + dists
-                confs = 1 / np.clip(sum_1d_dists, a_min=np.finfo(float).eps, a_max=None)
-                conf_cl.append(confs)
-            conf_cl = np.array(conf_cl)
-            self.conf_cl_ = conf_cl
-
         self.analyis_ = True
 
         return y_pred
 
-    def calculate_confidence(self, method: str = "distance_inverse"):
+    def calculate_confidence(self):
         """Calculate the confidence for each prediction.
 
-        The confidence is calculated based on either the distance of each data point to
-        the centroids of the training data, optionally the kernel density estimate or
-        1-dimensional distance.
-
-        Parameters
-        ----------
-        method : {"distance_inverse", "1d_distance_inverse","kde_likelihood"},
-            default="distance_inverse"
-            The method to use for calculating confidence. Default is
-            'distance_inverse'.
+        The confidence is calculated as the inverse of the distance of each data point
+        to the centroids of the training data.
         """
         check_is_fitted(self, "is_fitted_")
         if not hasattr(self, "analyis_"):
@@ -452,34 +404,278 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
             )
 
         # Calculate confidence for each prediction
-        if method == "distance_inverse":
-            self.confidence_df_ = 1 / np.clip(
-                self.centroid_dist_df_, a_min=np.finfo(float).eps, a_max=None
-            )
-            self.confidence_df_.columns = [
-                x.replace("_dist", "_conf") for x in self.confidence_df_.columns
-            ]
-
-        elif method == "1d_distance_inverse":
-            if not self.calculate_1d_dist:
-                raise ValueError(
-                    "method='1d_distance_inverse' is only valid if calculate_1d_dist "
-                    "is set to True"
-                )
-            self.confidence_df_ = pd.DataFrame(
-                data=self.conf_cl_.T, columns=[f"{x}_conf" for x in self.classes_]
+        self.confidence_df_ = 1 / np.clip(
+            self.centroid_dist_df_, a_min=np.finfo(float).eps, a_max=None
+        )
+        self.confidence_df_.columns = [
+            x.replace("_dist", "_conf") for x in self.confidence_df_.columns
+        ]
+
+        return self.confidence_df_.to_numpy()
+
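With the KDE and 1-D options removed, confidence reduces to the inverse of the centroid distances computed by predict_and_analyse(), with an eps clip guarding against division by zero. In miniature:

```python
import numpy as np

dists = np.array([[0.5, 2.0],
                  [4.0, 1.0]])  # rows: samples, columns: classes
conf = 1 / np.clip(dists, a_min=np.finfo(float).eps, a_max=None)
print(conf)  # [[2.   0.5 ], [0.25 1.  ]] - smaller distance, larger confidence
```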
+    def score(self, X, y, metric: str | Callable = "euclidean") -> float:
+        """Return the mean accuracy on the given test data and labels.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Test samples.
+        y : array-like of shape (n_samples,)
+            True labels for X.
+        metric : str or callable, default="euclidean"
+            The distance metric to use for calculating the distance between features.
+
+        Returns
+        -------
+        score : float
+            Mean accuracy of self.predict(X) wrt. y.
+        """
+        y_pred = self.predict(X, metric=metric)
+        return accuracy_score(y, y_pred)
+
+
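Since score() forwards the metric to predict(), comparing metrics becomes a short loop. A sketch on a toy dataset (accuracies will vary):

```python
import distclassipy as dcpy
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, n_features=4, random_state=0)
clf = dcpy.DistanceMetricClassifier().fit(X, y)

for m in ("euclidean", "cityblock", "canberra"):
    print(m, clf.score(X, y, metric=m))  # mean accuracy under each metric
```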
+def find_best_metrics(
+    clf: "DistanceMetricClassifier",
+    X: np.ndarray,
+    y: np.ndarray,
+    feat_idx: int,
+    n_quantiles: int = 4,
+    metrics_to_consider: list[str] = None,
+) -> Tuple[pd.DataFrame, pd.Series, np.ndarray]:
+    """Evaluate and find the best distance metrics for a given feature.
+
+    This function evaluates different distance metrics to determine which
+    performs best for a specific feature in the dataset. It splits the data
+    into quantiles based on the specified feature and calculates the accuracy
+    of the classifier for each metric within these quantiles.
+
+    .. versionadded:: 0.2.0
+
+    Parameters
+    ----------
+    clf : DistanceMetricClassifier
+        The classifier instance to be used for evaluation.
+    X : np.ndarray
+        The input feature matrix.
+    y : np.ndarray
+        The target labels.
+    feat_idx : int
+        The index of the feature to be used for quantile splitting.
+    n_quantiles : int, default=4
+        The number of quantiles to split the data into.
+    metrics_to_consider : list of str, optional
+        A list of distance metrics to evaluate. If None, all available
+        metrics within DistClassiPy will be considered.
+
+    Returns
+    -------
+    quantile_scores_df : pd.DataFrame
+        A DataFrame containing the accuracy scores for each metric across
+        different quantiles.
+    best_metrics_per_quantile : pd.Series
+        A Series indicating the best-performing metric for each quantile.
+    group_bins : np.ndarray
+        The bins used for quantile splitting.
+    """
+    X = check_array(X)
+    feature_labels = [f"Feature_{i}" for i in range(X.shape[1])]
+    feature_name = f"Feature_{feat_idx}"
+
+    if metrics_to_consider is None:
+        metrics_to_consider = _ALL_METRICS
+
+    X_df = pd.DataFrame(X, columns=feature_labels)
+    y_df = pd.DataFrame(y, columns=["Target"])
+    quantiles, group_bins = pd.qcut(X_df[feature_name], q=n_quantiles, retbins=True)
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        X_df, y_df, test_size=0.25, stratify=quantiles
+    )
+
+    clf.fit(X_train, y_train.to_numpy().ravel())
+    grouped_test_data = X_test.groupby(quantiles, observed=False)
+
+    quantile_scores = []
+    for metric in metrics_to_consider:
+        scores_for_metric = [
+            accuracy_score(
+                y_test.loc[subdf.index], clf.predict(subdf.to_numpy(), metric=metric)
             )
+            for _, subdf in grouped_test_data
+        ]
+        quantile_scores.append(scores_for_metric)
+
+    quantile_scores = np.array(quantile_scores) * 100
+    quantile_scores_df = pd.DataFrame(
+        data=quantile_scores,
+        index=metrics_to_consider,
+        columns=[f"Quantile {i+1}" for i in range(n_quantiles)],
+    )
+
+    best_metrics_per_quantile = quantile_scores_df.idxmax()
+
+    return quantile_scores_df, best_metrics_per_quantile, group_bins
+
+
+ class EnsembleDistanceClassifier(BaseEstimator, ClassifierMixin):
521
+ """An ensemble classifier that uses different metrics for each quantile.
522
+
523
+ This classifier splits the data into quantiles based on a specified
524
+ feature and uses different distance metrics for each quantile to
525
+ construct an ensemble classifier for each quantile, generally leading
526
+ to better performance.
527
+ Note, however, this involves fitting the training set for each metric
528
+ to evaluate performance, making this more computationally expensive.
529
+
530
+ .. versionadded:: 0.2.0
531
+ """
532
+
533
+ def __init__(
534
+ self,
535
+ feat_idx: int,
536
+ scale: bool = True,
537
+ central_stat: str = "median",
538
+ dispersion_stat: str = "std",
539
+ metrics_to_consider: list[str] = None,
540
+ ) -> None:
541
+ """Initialize the classifier with specified parameters.
542
+
543
+ Parameters
544
+ ----------
545
+ feat_idx : int
546
+ The index of the feature to be used for quantile splitting.
547
+ scale : bool, default=True
548
+ Whether to scale the distance between the test object and the centroid.
549
+ central_stat : str, default="median"
550
+ The statistic used to calculate the central tendency of the data.
551
+ dispersion_stat : str, default="std"
552
+ The statistic used to calculate the dispersion of the data.
553
+ metrics_to_consider : list of str, optional
554
+ A list of distance metrics to evaluate. If None, all available
555
+ metrics within DistClassiPy will be considered.
556
+ """
557
+ self.feat_idx = feat_idx
558
+ self.scale = scale
559
+ self.central_stat = central_stat
560
+ self.dispersion_stat = dispersion_stat
561
+ self.metrics_to_consider = metrics_to_consider
562
+
563
+ def fit(
564
+ self, X: np.ndarray, y: np.ndarray, n_quantiles: int = 4
565
+ ) -> "EnsembleDistanceClassifier":
566
+ """Fit the ensemble classifier using the best metrics for each quantile.
567
+
568
+ Parameters
569
+ ----------
570
+ X : np.ndarray
571
+ The input feature matrix.
572
+ y : np.ndarray
573
+ The target labels.
574
+ n_quantiles : int, default=4
575
+ The number of quantiles to split the data into.
576
+
577
+ Returns
578
+ -------
579
+ self : object
580
+ Fitted estimator.
581
+ """
582
+ self.clf_ = DistanceMetricClassifier(
583
+ scale=self.scale,
584
+ central_stat=self.central_stat,
585
+ dispersion_stat=self.dispersion_stat,
586
+ )
587
+
588
+ # Find best metrics based on training set quantiles
589
+ self.quantile_scores_df_, self.best_metrics_per_quantile_, self.group_bins = (
590
+ self.evaluate_metrics(X, y, n_quantiles)
591
+ )
592
+
593
+ # Ensure the bins work with values outside of training data
594
+ self.group_bins[0] = -np.inf
595
+ self.group_bins[-1] = np.inf
596
+
597
+ self.group_labels = [f"Quantile {i+1}" for i in range(n_quantiles)]
598
+ self.clf_.fit(X, y)
599
+ self.is_fitted_ = True
600
+ return self
601
+
602
+ def predict(self, X: np.ndarray) -> np.ndarray:
603
+ """Predict class labels using the best metric for each quantile.
472
604
 
473
- elif method == "kde_likelihood":
474
- if not self.calculate_kde:
475
- raise ValueError(
476
- "method='kde_likelihood' is only valid if calculate_kde is set "
477
- "to True"
605
+ Parameters
606
+ ----------
607
+ X : np.ndarray
608
+ The input samples.
609
+
610
+ Returns
611
+ -------
612
+ predictions : np.ndarray
613
+ The predicted class labels.
614
+ """
615
+ check_is_fitted(self, "is_fitted_")
616
+ X = check_array(X)
617
+
618
+ # notes for pred during best:
619
+ # option 1:
620
+ # loop through each metric, merge quantiles for each metric
621
+ # pred on this
622
+ # option 2, easier, but slower:
623
+ # loop through each quantile, and append pred
624
+
625
+ quantiles = pd.cut(
626
+ X[:, self.feat_idx], bins=self.group_bins, labels=self.group_labels
627
+ )
628
+ # grouped_data = pd.DataFrame(X).groupby(quantiles, observed=False)
629
+ quantile_indices = quantiles.codes # Get integer codes for quantiles
630
+ predictions = np.empty(X.shape[0], dtype=int)
631
+ # for i, (lim, subdf) in enumerate(grouped_data):
632
+ # best_metric = self.best_metrics_per_quantile_.loc[self.group_labels[i]]
633
+ # preds = self.clf_.predict(subdf.to_numpy(), metric=best_metric)
634
+ # predictions[subdf.index] = preds
635
+ # Precompute predictions for each quantile
636
+ quantile_predictions = {}
637
+ for i, label in enumerate(self.group_labels):
638
+ best_metric = self.best_metrics_per_quantile_.loc[label]
639
+ quantile_data = X[quantile_indices == i]
640
+ if quantile_data.size > 0:
641
+ quantile_predictions[i] = self.clf_.predict(
642
+ quantile_data, metric=best_metric
478
643
  )
479
644
 
480
- self.confidence_df_ = pd.DataFrame(
481
- data=self.likelihood_arr_,
482
- columns=[f"{x}_conf" for x in self.kde_dict_.keys()],
483
- )
645
+ # Assign predictions to the corresponding indices
646
+ for i, preds in quantile_predictions.items():
647
+ predictions[quantile_indices == i] = preds
484
648
 
485
- return self.confidence_df_.to_numpy()
649
+ return predictions
650
+
651
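The routing step above relies on pd.cut with the outer bin edges widened to ±inf at fit time, so values outside the training range still fall into the first or last quantile. In isolation:

```python
import numpy as np
import pandas as pd

bins = np.array([-np.inf, 0.0, 1.0, np.inf])  # widened edges, as in fit()
labels = ["Quantile 1", "Quantile 2", "Quantile 3"]
quantiles = pd.cut(np.array([-5.0, 0.5, 99.0]), bins=bins, labels=labels)
print(quantiles.codes)  # [0 1 2] - integer quantile index per sample
```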
+    def evaluate_metrics(
+        self, X: np.ndarray, y: np.ndarray, n_quantiles: int = 4
+    ) -> Tuple[pd.DataFrame, pd.Series, np.ndarray]:
+        """Evaluate and find the best distance metrics for the specified feature.
+
+        This method uses the standalone `find_best_metrics` function to evaluate
+        different distance metrics and determine the best-performing ones for
+        each quantile.
+
+        Parameters
+        ----------
+        X : np.ndarray
+            The input feature matrix.
+        y : np.ndarray
+            The target labels.
+        n_quantiles : int, default=4
+            The number of quantiles to split the data into.
+
+        Returns
+        -------
+        quantile_scores_df : pd.DataFrame
+            A DataFrame containing the accuracy scores for each metric across
+            different quantiles.
+        best_metrics_per_quantile : pd.Series
+            A Series indicating the best-performing metric for each quantile.
+        group_bins : np.ndarray
+            The bins used for quantile splitting.
+        """
+        return find_best_metrics(
+            self.clf_, X, y, self.feat_idx, n_quantiles, self.metrics_to_consider
+        )
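Finally, an end-to-end sketch of the new ensemble class. The import path is assumed to be distclassipy.classifier; the dataset and metric list are illustrative:

```python
from distclassipy.classifier import EnsembleDistanceClassifier
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, n_features=4, random_state=0)

ens = EnsembleDistanceClassifier(
    feat_idx=0, metrics_to_consider=["euclidean", "cityblock", "canberra"]
)
ens.fit(X, y, n_quantiles=4)  # selects the best metric per quantile internally
print(ens.predict(X[:10]))
print(ens.best_metrics_per_quantile_)
```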