distclassipy 0.1.6a0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- distclassipy/__init__.py +13 -3
- distclassipy/classifier.py +387 -239
- distclassipy/distances.py +981 -905
- {distclassipy-0.1.6a0.dist-info → distclassipy-0.2.0.dist-info}/METADATA +14 -6
- distclassipy-0.2.0.dist-info/RECORD +8 -0
- {distclassipy-0.1.6a0.dist-info → distclassipy-0.2.0.dist-info}/WHEEL +1 -1
- distclassipy-0.1.6a0.dist-info/RECORD +0 -8
- {distclassipy-0.1.6a0.dist-info → distclassipy-0.2.0.dist-info}/LICENSE +0 -0
- {distclassipy-0.1.6a0.dist-info → distclassipy-0.2.0.dist-info}/top_level.txt +0 -0
distclassipy/classifier.py
CHANGED
@@ -3,6 +3,15 @@
 This module contains the DistanceMetricClassifier introduced by Chaini et al. (2024)
 in "Light Curve Classification with DistClassiPy: a new distance-based classifier"
 
+
+.. autoclass:: distclassipy.classifier.DistanceMetricClassifier
+    :members:
+    :exclude-members: set_fit_request, set_predict_request
+
+.. doctest-skip::
+
+.. skip::
+
 Copyright (C) 2024 Siddharth Chaini
 -----
 This program is free software: you can redistribute it and/or modify
@@ -19,8 +28,7 @@ You should have received a copy of the GNU General Public License
 along with this program. If not, see <https://www.gnu.org/licenses/>.
 """
 
-import warnings
-from typing import Callable
+from typing import Callable, Tuple
 
 import numpy as np
 
@@ -29,11 +37,12 @@ import pandas as pd
 import scipy
 
 from sklearn.base import BaseEstimator, ClassifierMixin
-from sklearn.neighbors import KernelDensity
+from sklearn.metrics import accuracy_score
+from sklearn.model_selection import train_test_split
 from sklearn.utils.multiclass import unique_labels
 from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
 
-from .distances import Distance
+from .distances import Distance, _ALL_METRICS
 
 # Hardcoded source packages to check for distance metrics.
 METRIC_SOURCES_ = {
@@ -42,6 +51,52 @@ METRIC_SOURCES_ = {
 }
 
 
+def initialize_metric_function(metric):
+    """Set the metric function based on the provided metric.
+
+    If the metric is a string, the function will look for a corresponding
+    function in scipy.spatial.distance or distances.Distance. If the metric
+    is a function, it will be used directly.
+    """
+    if callable(metric):
+        metric_fn_ = metric
+        metric_arg_ = metric
+
+    elif isinstance(metric, str):
+        metric_str_lowercase = metric.lower()
+        metric_found = False
+        for package_str, source in METRIC_SOURCES_.items():
+
+            # Don't use scipy for jaccard as their implementation only works with
+            # booleans - use custom jaccard instead
+            if (
+                package_str == "scipy.spatial.distance"
+                and metric_str_lowercase == "jaccard"
+            ):
+                continue
+
+            if hasattr(source, metric_str_lowercase):
+                metric_fn_ = getattr(source, metric_str_lowercase)
+                metric_found = True
+
+                # Use the string as an argument if it belongs to scipy as it is
+                # optimized
+                metric_arg_ = (
+                    metric if package_str == "scipy.spatial.distance" else metric_fn_
+                )
+                break
+        if not metric_found:
+            raise ValueError(
+                f"{metric} metric not found. Please pass a string of the "
+                "name of a metric in scipy.spatial.distance or "
+                "distances.Distance, or pass a metric function directly. For a "
+                "list of available metrics, see: "
+                "https://sidchaini.github.io/DistClassiPy/distances.html or "
+                "https://docs.scipy.org/doc/scipy/reference/spatial.distance.html"
+            )
+    return metric_fn_, metric_arg_
+
+
 class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
     """A distance-based classifier that supports different distance metrics.
 
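The new module-level resolver returns a pair: the resolved metric function and the argument that will actually be handed to scipy.spatial.distance.cdist. A minimal illustrative sketch of the three code paths above (the asserts restate the helper's logic, not captured output):

```python
from distclassipy.classifier import initialize_metric_function

# SciPy-backed name: the function is resolved, but the *string* is kept as the
# cdist argument so SciPy can dispatch to its optimized C implementation.
fn, arg = initialize_metric_function("cityblock")
assert arg == "cityblock"

# "jaccard" is special-cased: SciPy's boolean-only version is skipped in favor
# of the custom implementation in distances.Distance, so the argument is the
# resolved Python function rather than the string.
fn, arg = initialize_metric_function("jaccard")
assert callable(arg)

# A user-supplied callable is passed through unchanged.
fn, arg = initialize_metric_function(lambda u, v: abs(u - v).sum())
assert fn is arg
```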
@@ -55,8 +110,6 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
 
     Parameters
     ----------
-    metric : str or callable, default="euclidean"
-        The distance metric to use for calculating the distance between features.
     scale : bool, default=True
         Whether to scale the distance between the test object and the centroid for a
         class in the feature space. If True, the data will be scaled based on the
@@ -71,47 +124,15 @@
 
     .. versionadded:: 0.1.0
 
-    calculate_kde : bool, default=False
-        Whether to calculate a kernel density estimate based confidence parameter.
-        .. deprecated:: 0.2.0
-            This parameter will be removed in a future version and only the
-            distance confidence parameter will be available.
-    calculate_1d_dist : bool, default=False
-        Whether to calculate the 1-dimensional distance based confidence parameter.
-        .. deprecated:: 0.2.0
-            This parameter will be removed in a future version and only the
-            distance confidence parameter will be available.
-        Whether to calculate the 1-dimensional distance based confidence parameter.
 
     Attributes
     ----------
-    metric : str or callable
-        The distance metric used for classification.
     scale : bool
         Indicates whether the data is scaled.
     central_stat : str
         The statistic used for calculating central tendency.
     dispersion_stat : str
         The statistic used for calculating dispersion.
-    calculate_kde : bool
-        Indicates whether a kernel density estimate is calculated.
-        .. deprecated:: 0.2.0
-            This parameter will be removed in a future version.
-    calculate_1d_dist : bool
-        Indicates whether 1-dimensional distances are calculated.
-        .. deprecated:: 0.2.0
-            This parameter will be removed in a future version.
-
-    See Also
-    --------
-    scipy.spatial.dist : Other distance metrics provided in SciPy
-    distclassipy.Distance : Distance metrics included with DistClassiPy
-
-    Notes
-    -----
-    If using distance metrics supported by SciPy, it is desirable to pass a string,
-    which allows SciPy to use an optimized C version of the code instead of the slower
-    Python version.
 
     References
     ----------
@@ -125,88 +146,27 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
     >>> X, y = make_classification(n_samples=1000, n_features=4,
     ...                            n_informative=2, n_redundant=0,
     ...                            random_state=0, shuffle=False)
-    >>> clf = dcpy.DistanceMetricClassifier(metric="canberra")
+    >>> clf = dcpy.DistanceMetricClassifier()
     >>> clf.fit(X, y)
     DistanceMetricClassifier(...)
-    >>> print(clf.predict([[0, 0, 0, 0]]))
+    >>> print(clf.predict([[0, 0, 0, 0]], metric="canberra"))
     [0]
     """
 
     def __init__(
         self,
-        metric: str | Callable = "euclidean",
         scale: bool = True,
         central_stat: str = "median",
         dispersion_stat: str = "std",
-        calculate_kde: bool = True,  # deprecated in 0.2.0
-        calculate_1d_dist: bool = True,  # deprecated in 0.2.0
-    ):
+    ) -> None:
         """Initialize the classifier with specified parameters."""
-        self.metric = metric
         self.scale = scale
         self.central_stat = central_stat
         self.dispersion_stat = dispersion_stat
-        if calculate_kde:
-            warnings.warn(
-                "calculate_kde is deprecated and will be removed in version 0.2.0",
-                DeprecationWarning,
-            )
-        self.calculate_kde = calculate_kde
-
-        if calculate_1d_dist:
-            warnings.warn(
-                "calculate_1d_dist is deprecated and will be removed in version 0.2.0",
-                DeprecationWarning,
-            )
-        self.calculate_1d_dist = calculate_1d_dist
-
-    def initialize_metric_function(self):
-        """Set the metric function based on the provided metric.
 
-        If the metric is a string, the function will look for a corresponding
-        function in scipy.spatial.distance or distances.Distance. If the metric
-        is a function, it will be used directly.
-        """
-        if callable(self.metric):
-            self.metric_fn_ = self.metric
-            self.metric_arg_ = self.metric
-
-        elif isinstance(self.metric, str):
-            metric_str_lowercase = self.metric.lower()
-            metric_found = False
-            for package_str, source in METRIC_SOURCES_.items():
-
-                # Don't use scipy for jaccard as their implementation only works with
-                # booleans - use custom jaccard instead
-                if (
-                    package_str == "scipy.spatial.distance"
-                    and metric_str_lowercase == "jaccard"
-                ):
-                    continue
-
-                if hasattr(source, metric_str_lowercase):
-                    self.metric_fn_ = getattr(source, metric_str_lowercase)
-                    metric_found = True
-
-                    # Use the string as an argument if it belongs to scipy as it is
-                    # optimized
-                    self.metric_arg_ = (
-                        self.metric
-                        if package_str == "scipy.spatial.distance"
-                        else self.metric_fn_
-                    )
-                    break
-            if not metric_found:
-                raise ValueError(
-                    f"{self.metric} metric not found. Please pass a string of the "
-                    "name of a metric in scipy.spatial.distance or "
-                    "distances.Distance, or pass a metric function directly. For a "
-                    "list of available metrics, see: "
-                    "https://sidchaini.github.io/DistClassiPy/distances.html or "
-                    "https://docs.scipy.org/doc/scipy/reference/spatial.distance.html"
-                )
-
-    def fit(self, X: np.array, y: np.array, feat_labels: list[str] = None):
+    def fit(
+        self, X: np.array, y: np.array, feat_labels: list[str] = None
+    ) -> "DistanceMetricClassifier":
         """Calculate the feature space centroid for all classes.
 
         This function calculates the feature space centroid in the training
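Taken together, this hunk is the release's central API change: the metric moves from the constructor to the prediction call, and the resolver becomes the module-level function shown earlier. A before/after sketch based on the doctest above (the 0.1.x line is shown commented out):

```python
import distclassipy as dcpy
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, n_features=4, n_informative=2,
                           n_redundant=0, random_state=0, shuffle=False)

# 0.1.x (removed API): the metric was fixed when the instance was built.
# clf = dcpy.DistanceMetricClassifier(metric="canberra")

# 0.2.0: construct once, choose the metric per call.
clf = dcpy.DistanceMetricClassifier()
clf.fit(X, y)
print(clf.predict([[0, 0, 0, 0]], metric="canberra"))  # [0]
```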
@@ -236,8 +196,6 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
             1
         ]  # Number of features seen during fit - required for sklearn compatibility.
 
-        self.initialize_metric_function()
-
         if feat_labels is None:
             feat_labels = [f"Feature_{x}" for x in range(X.shape[1])]
 
@@ -281,30 +239,15 @@
         )
         self.df_iqr_ = df_iqr
 
-        if self.calculate_kde:
-            warnings.warn(
-                "KDE calculation is deprecated and will be removed in version 0.2.0",
-                DeprecationWarning,
-            )
-            self.kde_dict_ = {}
-
-            for cl in self.classes_:
-                subX = X[y == cl]
-                # Implement the following in an if-else to save computational time.
-                # kde = KernelDensity(bandwidth='scott', metric=self.metric)
-                # kde.fit(subX)
-                kde = KernelDensity(
-                    bandwidth="scott",
-                    metric="pyfunc",
-                    metric_params={"func": self.metric_fn_},
-                )
-                kde.fit(subX)
-                self.kde_dict_[cl] = kde
         self.is_fitted_ = True
 
         return self
 
-    def predict(self, X: np.array):
+    def predict(
+        self,
+        X: np.array,
+        metric: str | Callable = "euclidean",
+    ) -> np.ndarray:
         """Predict the class labels for the provided X.
 
         The prediction is based on the distance of each data point in the input sample
@@ -315,18 +258,35 @@
         ----------
         X : array-like of shape (n_samples, n_features)
             The input samples.
+        metric : str or callable, default="euclidean"
+            The distance metric to use for calculating the distance between features.
+
+            .. versionchanged:: 0.2.0
+                The metric is now specified at prediction time rather
+                than during initialization, providing greater flexibility.
 
         Returns
         -------
         y : ndarray of shape (n_samples,)
             The predicted classes.
+
+        See Also
+        --------
+        scipy.spatial.dist : Other distance metrics provided in SciPy
+        distclassipy.Distance : Distance metrics included with DistClassiPy
+
+        Notes
+        -----
+        If using distance metrics supported by SciPy, it is desirable to pass a string,
+        which allows SciPy to use an optimized C version of the code instead of the
+        slower Python version.
         """
         check_is_fitted(self, "is_fitted_")
         X = check_array(X)
-
+        metric_fn_, metric_arg_ = initialize_metric_function(metric)
         if not self.scale:
             dist_arr = scipy.spatial.distance.cdist(
-                XA=X, XB=self.df_centroid_.to_numpy(), metric=self.metric_arg_
+                XA=X, XB=self.df_centroid_.to_numpy(), metric=metric_arg_
             )
 
         else:
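Because fit() stores only per-class centroids and dispersion statistics, which do not depend on the metric, one fitted instance can now be queried under several metrics without refitting. A short sketch, reusing clf and X from the previous example:

```python
# Compare metrics on the same fitted classifier; only predict() is re-run.
for m in ["euclidean", "cityblock", "canberra"]:
    print(m, clf.predict(X[:3], metric=m))
```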
@@ -343,16 +303,18 @@
                 w = wtdf.loc[cl].to_numpy()  # 1/std dev
                 XB = XB * w  # w is for this class only
                 XA = X * w  # w is for this class only
-                cl_dist = scipy.spatial.distance.cdist(
-                    XA=XA, XB=XB, metric=self.metric_arg_
-                )
+                cl_dist = scipy.spatial.distance.cdist(XA=XA, XB=XB, metric=metric_arg_)
                 dist_arr_list.append(cl_dist)
             dist_arr = np.column_stack(dist_arr_list)
 
         y_pred = self.classes_[dist_arr.argmin(axis=1)]
         return y_pred
 
-    def predict_and_analyse(self, X: np.array):
+    def predict_and_analyse(
+        self,
+        X: np.array,
+        metric: str | Callable = "euclidean",
+    ) -> np.ndarray:
         """Predict the class labels for the provided X and perform analysis.
 
         The prediction is based on the distance of each data point in the input sample
@@ -366,18 +328,35 @@
         ----------
         X : array-like of shape (n_samples, n_features)
             The input samples.
+        metric : str or callable, default="euclidean"
+            The distance metric to use for calculating the distance between features.
+
 
         Returns
         -------
         y : ndarray of shape (n_samples,)
             The predicted classes.
+
+        See Also
+        --------
+        scipy.spatial.dist : Other distance metrics provided in SciPy
+        distclassipy.Distance : Distance metrics included with DistClassiPy
+
+        Notes
+        -----
+        If using distance metrics supported by SciPy, it is desirable to pass a string,
+        which allows SciPy to use an optimized C version of the code instead
+        of the slower Python version.
+
         """
         check_is_fitted(self, "is_fitted_")
         X = check_array(X)
 
+        metric_fn_, metric_arg_ = initialize_metric_function(metric)
+
         if not self.scale:
             dist_arr = scipy.spatial.distance.cdist(
-                XA=X, XB=self.df_centroid_.to_numpy(), metric=self.metric_arg_
+                XA=X, XB=self.df_centroid_.to_numpy(), metric=metric_arg_
             )
 
         else:
@@ -394,9 +373,7 @@
                 w = wtdf.loc[cl].to_numpy()  # 1/std dev
                 XB = XB * w  # w is for this class only
                 XA = X * w  # w is for this class only
-                cl_dist = scipy.spatial.distance.cdist(
-                    XA=XA, XB=XB, metric=self.metric_arg_
-                )
+                cl_dist = scipy.spatial.distance.cdist(XA=XA, XB=XB, metric=metric_arg_)
                 dist_arr_list.append(cl_dist)
             dist_arr = np.column_stack(dist_arr_list)
 
@@ -409,78 +386,15 @@
 
         y_pred = self.classes_[dist_arr.argmin(axis=1)]
 
-        if self.calculate_kde:
-            warnings.warn(
-                "KDE calculation in predict_and_analyse is deprecated "
-                "and will be removed in version 0.2.0",
-                DeprecationWarning,
-            )
-            # NEW: Rescale in terms of median likelihoods - calculate here
-            scale_factors = np.exp(
-                [
-                    self.kde_dict_[cl].score_samples(
-                        self.df_centroid_.loc[cl].to_numpy().reshape(1, -1)
-                    )[0]
-                    for cl in self.classes_
-                ]
-            )
-
-            likelihood_arr = []
-            for k in self.kde_dict_.keys():
-                log_pdf = self.kde_dict_[k].score_samples(X)
-                likelihood_val = np.exp(log_pdf)
-                likelihood_arr.append(likelihood_val)
-            self.likelihood_arr_ = np.array(likelihood_arr).T
-
-            # NEW: Rescale in terms of median likelihoods - rescale here
-            self.likelihood_arr_ = self.likelihood_arr_ / scale_factors
-        if self.calculate_1d_dist:
-            warnings.warn(
-                "calculate_1d_dist is deprecated and will be removed in version 0.2.0",
-                DeprecationWarning,
-            )
-            conf_cl = []
-            Xdf_temp = pd.DataFrame(data=X, columns=self.df_centroid_.columns)
-            for cl in self.classes_:
-                sum_1d_dists = np.zeros(shape=(len(Xdf_temp)))
-                for feat in Xdf_temp.columns:
-                    dists = scipy.spatial.distance.cdist(
-                        XA=np.zeros(shape=(1, 1)),
-                        XB=(self.df_centroid_.loc[cl] - Xdf_temp)[feat]
-                        .to_numpy()
-                        .reshape(-1, 1),
-                        metric=self.metric_arg_,
-                    ).ravel()
-                    if self.scale and self.dispersion_stat == "std":
-                        sum_1d_dists = sum_1d_dists + dists / self.df_std_.loc[cl, feat]
-                    elif self.scale and self.dispersion_stat == "std":
-                        sum_1d_dists = sum_1d_dists + dists / self.df_iqr_.loc[cl, feat]
-                    else:
-                        sum_1d_dists = sum_1d_dists + dists
-                confs = 1 / np.clip(sum_1d_dists, a_min=np.finfo(float).eps, a_max=None)
-                conf_cl.append(confs)
-            conf_cl = np.array(conf_cl)
-            self.conf_cl_ = conf_cl
         self.analyis_ = True
 
         return y_pred
 
-    def calculate_confidence(self, method="distance_inverse"):
+    def calculate_confidence(self):
         """Calculate the confidence for each prediction.
 
-        The confidence is calculated
-        the centroids of the training data
-        1-dimensional distance.
-
-        Parameters
-        ----------
-        method : {"distance_inverse", "1d_distance_inverse", "kde_likelihood"},
-            default="distance_inverse"
-            The method to use for calculating confidence. Default is
-            'distance_inverse'.
-            .. deprecated:: 0.2.0
-                The methods '1d_distance_inverse' and
-                'kde_likelihood' will be removed in version 0.2.0.
+        The confidence is calculated as the inverse of the distance of each data point
+        to the centroids of the training data.
         """
         check_is_fitted(self, "is_fitted_")
         if not hasattr(self, "analyis_"):
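With the KDE and 1-D distance branches removed, the only remaining confidence measure is the inverse centroid distance implemented in the next hunk. A sketch of the surviving workflow, reusing clf and X from the earlier example (the column-renaming detail follows the added code below; the stored centroid distances come from predict_and_analyse):

```python
# predict_and_analyse() stores per-class centroid distances; confidence is
# then 1 / distance, clipped away from zero to avoid division errors.
y_pred = clf.predict_and_analyse(X[:5], metric="euclidean")
conf = clf.calculate_confidence()  # ndarray of shape (5, n_classes)

# The stored confidence_df_ keeps one column per class, renamed "*_dist" ->
# "*_conf"; the highest-confidence column matches the predicted class.
print(conf.argmax(axis=1))
```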
@@ -490,44 +404,278 @@
             )
 
         # Calculate confidence for each prediction
-
-        self.
-
-
-        self.confidence_df_.columns
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        self.confidence_df_ = 1 / np.clip(
+            self.centroid_dist_df_, a_min=np.finfo(float).eps, a_max=None
+        )
+        self.confidence_df_.columns = [
+            x.replace("_dist", "_conf") for x in self.confidence_df_.columns
+        ]
+
+        return self.confidence_df_.to_numpy()
+
+    def score(self, X, y, metric: str | Callable = "euclidean") -> float:
+        """Return the mean accuracy on the given test data and labels.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Test samples.
+        y : array-like of shape (n_samples,)
+            True labels for X.
+        metric : str or callable, default="euclidean"
+            The distance metric to use for calculating the distance between features.
+
+        Returns
+        -------
+        score : float
+            Mean accuracy of self.predict(X) wrt. y.
+        """
+        y_pred = self.predict(X, metric=metric)
+        return accuracy_score(y, y_pred)
+
+
+def find_best_metrics(
+    clf: "DistanceMetricClassifier",
+    X: np.ndarray,
+    y: np.ndarray,
+    feat_idx: int,
+    n_quantiles: int = 4,
+    metrics_to_consider: list[str] = None,
+) -> Tuple[pd.DataFrame, pd.Series, np.ndarray]:
+    """Evaluate and find the best distance metrics for a given feature.
+
+    This function evaluates different distance metrics to determine which
+    performs best for a specific feature in the dataset. It splits the data
+    into quantiles based on the specified feature and calculates the accuracy
+    of the classifier for each metric within these quantiles.
+
+    .. versionadded:: 0.2.0
+
+    Parameters
+    ----------
+    clf : DistanceMetricClassifier
+        The classifier instance to be used for evaluation.
+    X : np.ndarray
+        The input feature matrix.
+    y : np.ndarray
+        The target labels.
+    feat_idx : int
+        The index of the feature to be used for quantile splitting.
+    n_quantiles : int, default=4
+        The number of quantiles to split the data into.
+    metrics_to_consider : list of str, optional
+        A list of distance metrics to evaluate. If None, all available
+        metrics within DistClassiPy will be considered.
+
+    Returns
+    -------
+    quantile_scores_df : pd.DataFrame
+        A DataFrame containing the accuracy scores for each metric across
+        different quantiles.
+    best_metrics_per_quantile : pd.Series
+        A Series indicating the best-performing metric for each quantile.
+    group_bins : np.ndarray
+        The bins used for quantile splitting.
+    """
+    X = check_array(X)
+    feature_labels = [f"Feature_{i}" for i in range(X.shape[1])]
+    feature_name = f"Feature_{feat_idx}"
+
+    if metrics_to_consider is None:
+        metrics_to_consider = _ALL_METRICS
+
+    X_df = pd.DataFrame(X, columns=feature_labels)
+    y_df = pd.DataFrame(y, columns=["Target"])
+    quantiles, group_bins = pd.qcut(X_df[feature_name], q=n_quantiles, retbins=True)
 
-
-
-
-
-
+    X_train, X_test, y_train, y_test = train_test_split(
+        X_df, y_df, test_size=0.25, stratify=quantiles
+    )
+
+    clf.fit(X_train, y_train.to_numpy().ravel())
+    grouped_test_data = X_test.groupby(quantiles, observed=False)
+
+    quantile_scores = []
+    for metric in metrics_to_consider:
+        scores_for_metric = [
+            accuracy_score(
+                y_test.loc[subdf.index], clf.predict(subdf.to_numpy(), metric=metric)
             )
-
-
-
-
+            for _, subdf in grouped_test_data
+        ]
+        quantile_scores.append(scores_for_metric)
+
+    quantile_scores = np.array(quantile_scores) * 100
+    quantile_scores_df = pd.DataFrame(
+        data=quantile_scores,
+        index=metrics_to_consider,
+        columns=[f"Quantile {i+1}" for i in range(n_quantiles)],
+    )
+
+    best_metrics_per_quantile = quantile_scores_df.idxmax()
+
+    return quantile_scores_df, best_metrics_per_quantile, group_bins
+
+
+class EnsembleDistanceClassifier(BaseEstimator, ClassifierMixin):
+    """An ensemble classifier that uses different metrics for each quantile.
+
+    This classifier splits the data into quantiles based on a specified
+    feature and uses different distance metrics for each quantile to
+    construct an ensemble classifier for each quantile, generally leading
+    to better performance.
+    Note, however, this involves fitting the training set for each metric
+    to evaluate performance, making this more computationally expensive.
+
+    .. versionadded:: 0.2.0
+    """
+
+    def __init__(
+        self,
+        feat_idx: int,
+        scale: bool = True,
+        central_stat: str = "median",
+        dispersion_stat: str = "std",
+        metrics_to_consider: list[str] = None,
+    ) -> None:
+        """Initialize the classifier with specified parameters.
+
+        Parameters
+        ----------
+        feat_idx : int
+            The index of the feature to be used for quantile splitting.
+        scale : bool, default=True
+            Whether to scale the distance between the test object and the centroid.
+        central_stat : str, default="median"
+            The statistic used to calculate the central tendency of the data.
+        dispersion_stat : str, default="std"
+            The statistic used to calculate the dispersion of the data.
+        metrics_to_consider : list of str, optional
+            A list of distance metrics to evaluate. If None, all available
+            metrics within DistClassiPy will be considered.
+        """
+        self.feat_idx = feat_idx
+        self.scale = scale
+        self.central_stat = central_stat
+        self.dispersion_stat = dispersion_stat
+        self.metrics_to_consider = metrics_to_consider
+
+    def fit(
+        self, X: np.ndarray, y: np.ndarray, n_quantiles: int = 4
+    ) -> "EnsembleDistanceClassifier":
+        """Fit the ensemble classifier using the best metrics for each quantile.
+
+        Parameters
+        ----------
+        X : np.ndarray
+            The input feature matrix.
+        y : np.ndarray
+            The target labels.
+        n_quantiles : int, default=4
+            The number of quantiles to split the data into.
+
+        Returns
+        -------
+        self : object
+            Fitted estimator.
+        """
+        self.clf_ = DistanceMetricClassifier(
+            scale=self.scale,
+            central_stat=self.central_stat,
+            dispersion_stat=self.dispersion_stat,
+        )
+
+        # Find best metrics based on training set quantiles
+        self.quantile_scores_df_, self.best_metrics_per_quantile_, self.group_bins = (
+            self.evaluate_metrics(X, y, n_quantiles)
+        )
+
+        # Ensure the bins work with values outside of training data
+        self.group_bins[0] = -np.inf
+        self.group_bins[-1] = np.inf
+
+        self.group_labels = [f"Quantile {i+1}" for i in range(n_quantiles)]
+        self.clf_.fit(X, y)
+        self.is_fitted_ = True
+        return self
+
+    def predict(self, X: np.ndarray) -> np.ndarray:
+        """Predict class labels using the best metric for each quantile.
+
+        Parameters
+        ----------
+        X : np.ndarray
+            The input samples.
+
+        Returns
+        -------
+        predictions : np.ndarray
+            The predicted class labels.
+        """
+        check_is_fitted(self, "is_fitted_")
+        X = check_array(X)
+
+        # notes for pred during best:
+        # option 1:
+        # loop through each metric, merge quantiles for each metric
+        # pred on this
+        # option 2, easier, but slower:
+        # loop through each quantile, and append pred
+
+        quantiles = pd.cut(
+            X[:, self.feat_idx], bins=self.group_bins, labels=self.group_labels
+        )
+        # grouped_data = pd.DataFrame(X).groupby(quantiles, observed=False)
+        quantile_indices = quantiles.codes  # Get integer codes for quantiles
+        predictions = np.empty(X.shape[0], dtype=int)
+        # for i, (lim, subdf) in enumerate(grouped_data):
+        #     best_metric = self.best_metrics_per_quantile_.loc[self.group_labels[i]]
+        #     preds = self.clf_.predict(subdf.to_numpy(), metric=best_metric)
+        #     predictions[subdf.index] = preds
+        # Precompute predictions for each quantile
+        quantile_predictions = {}
+        for i, label in enumerate(self.group_labels):
+            best_metric = self.best_metrics_per_quantile_.loc[label]
+            quantile_data = X[quantile_indices == i]
+            if quantile_data.size > 0:
+                quantile_predictions[i] = self.clf_.predict(
+                    quantile_data, metric=best_metric
                 )
 
-
-
-
-        )
+        # Assign predictions to the corresponding indices
+        for i, preds in quantile_predictions.items():
+            predictions[quantile_indices == i] = preds
 
-        return
+        return predictions
+
+    def evaluate_metrics(
+        self, X: np.ndarray, y: np.ndarray, n_quantiles: int = 4
+    ) -> Tuple[pd.DataFrame, pd.Series, np.ndarray]:
+        """Evaluate and find the best distance metrics for the specified feature.
+
+        This method uses the standalone `find_best_metrics` function to evaluate
+        different distance metrics and determine the best-performing ones for
+        each quantile.
+
+        Parameters
+        ----------
+        X : np.ndarray
+            The input feature matrix.
+        y : np.ndarray
+            The target labels.
+        n_quantiles : int, default=4
+            The number of quantiles to split the data into.
+
+        Returns
+        -------
+        quantile_scores_df : pd.DataFrame
+            A DataFrame containing the accuracy scores for each metric across
+            different quantiles.
+        best_metrics_per_quantile : pd.Series
+            A Series indicating the best-performing metric for each quantile.
+        group_bins : np.ndarray
+            The bins used for quantile splitting.
+        """
+        return find_best_metrics(
+            self.clf_, X, y, self.feat_idx, n_quantiles, self.metrics_to_consider
+        )
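The two additions in this final hunk work as a pair: find_best_metrics is the model-selection helper, and EnsembleDistanceClassifier wires it into a standard fit/predict estimator. A usage sketch under the same synthetic-data assumptions as the earlier examples (both names are imported from the classifier module here; whether the package also re-exports them at the top level depends on the accompanying __init__.py changes):

```python
import distclassipy as dcpy
from distclassipy.classifier import EnsembleDistanceClassifier, find_best_metrics
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, n_features=4, random_state=0)

# Standalone helper: score candidate metrics per quantile of feature 0.
clf = dcpy.DistanceMetricClassifier()
scores_df, best_per_quantile, bins = find_best_metrics(
    clf, X, y, feat_idx=0, n_quantiles=4,
    metrics_to_consider=["euclidean", "cityblock", "canberra"],
)
print(best_per_quantile)  # best-scoring metric name for each quantile

# Ensemble: fit() runs the same search internally, then predict() routes each
# sample to the metric chosen for its quantile of feature 0.
ens = EnsembleDistanceClassifier(
    feat_idx=0, metrics_to_consider=["euclidean", "cityblock", "canberra"]
)
ens.fit(X, y)
print(ens.predict(X[:5]))
```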