distclassipy 0.1.6a0__tar.gz → 0.2.0a0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {distclassipy-0.1.6a0 → distclassipy-0.2.0a0}/PKG-INFO +4 -4
- {distclassipy-0.1.6a0 → distclassipy-0.2.0a0}/README.md +2 -2
- {distclassipy-0.1.6a0 → distclassipy-0.2.0a0}/distclassipy/__init__.py +1 -1
- {distclassipy-0.1.6a0 → distclassipy-0.2.0a0}/distclassipy/classifier.py +101 -235
- distclassipy-0.2.0a0/distclassipy/distances.py +1484 -0
- {distclassipy-0.1.6a0 → distclassipy-0.2.0a0}/distclassipy.egg-info/PKG-INFO +4 -4
- {distclassipy-0.1.6a0 → distclassipy-0.2.0a0}/distclassipy.egg-info/requires.txt +1 -1
- {distclassipy-0.1.6a0 → distclassipy-0.2.0a0}/pyproject.toml +1 -1
- {distclassipy-0.1.6a0 → distclassipy-0.2.0a0}/tests/test_classifier.py +16 -48
- {distclassipy-0.1.6a0 → distclassipy-0.2.0a0}/tests/test_distances_prop.py +30 -2
- distclassipy-0.1.6a0/distclassipy/distances.py +0 -1484
- {distclassipy-0.1.6a0 → distclassipy-0.2.0a0}/LICENSE +0 -0
- {distclassipy-0.1.6a0 → distclassipy-0.2.0a0}/distclassipy.egg-info/SOURCES.txt +0 -0
- {distclassipy-0.1.6a0 → distclassipy-0.2.0a0}/distclassipy.egg-info/dependency_links.txt +0 -0
- {distclassipy-0.1.6a0 → distclassipy-0.2.0a0}/distclassipy.egg-info/top_level.txt +0 -0
- {distclassipy-0.1.6a0 → distclassipy-0.2.0a0}/setup.cfg +0 -0
- {distclassipy-0.1.6a0 → distclassipy-0.2.0a0}/setup.py +0 -0
- {distclassipy-0.1.6a0 → distclassipy-0.2.0a0}/tests/test_distances.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: distclassipy
|
|
3
|
-
Version: 0.1.6a0
|
|
3
|
+
Version: 0.2.0a0
|
|
4
4
|
Summary: A python package for a distance-based classifier which can use several different distance metrics.
|
|
5
5
|
Author-email: Siddharth Chaini <sidchaini@gmail.com>
|
|
6
6
|
License: GNU GENERAL PUBLIC LICENSE
|
|
@@ -694,7 +694,7 @@ Requires-Python: >=3.10
|
|
|
694
694
|
Description-Content-Type: text/markdown
|
|
695
695
|
License-File: LICENSE
|
|
696
696
|
Requires-Dist: joblib>=1.3.2
|
|
697
|
-
Requires-Dist: numpy
|
|
697
|
+
Requires-Dist: numpy>=1.25.2
|
|
698
698
|
Requires-Dist: pandas>=2.0.3
|
|
699
699
|
Requires-Dist: scikit-learn>=1.2.2
|
|
700
700
|
|
|
@@ -740,9 +740,9 @@ X, y = make_classification(
|
|
|
740
740
|
random_state=0,
|
|
741
741
|
shuffle=False,
|
|
742
742
|
)
|
|
743
|
-
clf = dcpy.DistanceMetricClassifier(metric="canberra")
|
|
743
|
+
clf = dcpy.DistanceMetricClassifier()
|
|
744
744
|
clf.fit(X, y)
|
|
745
|
-
print(clf.predict([[0, 0, 0, 0]]))
|
|
745
|
+
print(clf.predict([[0, 0, 0, 0]], metric="canberra"))
|
|
746
746
|
```
|
|
747
747
|
|
|
748
748
|
## Features
|
|
@@ -40,9 +40,9 @@ X, y = make_classification(
|
|
|
40
40
|
random_state=0,
|
|
41
41
|
shuffle=False,
|
|
42
42
|
)
|
|
43
|
-
clf = dcpy.DistanceMetricClassifier(metric="canberra")
|
|
43
|
+
clf = dcpy.DistanceMetricClassifier()
|
|
44
44
|
clf.fit(X, y)
|
|
45
|
-
print(clf.predict([[0, 0, 0, 0]]))
|
|
45
|
+
print(clf.predict([[0, 0, 0, 0]], metric="canberra"))
|
|
46
46
|
```
|
|
47
47
|
|
|
48
48
|
## Features
|
|
@@ -19,7 +19,6 @@ You should have received a copy of the GNU General Public License
|
|
|
19
19
|
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
20
20
|
"""
|
|
21
21
|
|
|
22
|
-
import warnings
|
|
23
22
|
from typing import Callable
|
|
24
23
|
|
|
25
24
|
import numpy as np
|
|
@@ -29,7 +28,6 @@ import pandas as pd
|
|
|
29
28
|
import scipy
|
|
30
29
|
|
|
31
30
|
from sklearn.base import BaseEstimator, ClassifierMixin
|
|
32
|
-
from sklearn.neighbors import KernelDensity
|
|
33
31
|
from sklearn.utils.multiclass import unique_labels
|
|
34
32
|
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
|
|
35
33
|
|
|
@@ -42,6 +40,52 @@ METRIC_SOURCES_ = {
|
|
|
42
40
|
}
|
|
43
41
|
|
|
44
42
|
|
|
43
|
+
def initialize_metric_function(metric):
|
|
44
|
+
"""Set the metric function based on the provided metric.
|
|
45
|
+
|
|
46
|
+
If the metric is a string, the function will look for a corresponding
|
|
47
|
+
function in scipy.spatial.distance or distances.Distance. If the metric
|
|
48
|
+
is a function, it will be used directly.
|
|
49
|
+
"""
|
|
50
|
+
if callable(metric):
|
|
51
|
+
metric_fn_ = metric
|
|
52
|
+
metric_arg_ = metric
|
|
53
|
+
|
|
54
|
+
elif isinstance(metric, str):
|
|
55
|
+
metric_str_lowercase = metric.lower()
|
|
56
|
+
metric_found = False
|
|
57
|
+
for package_str, source in METRIC_SOURCES_.items():
|
|
58
|
+
|
|
59
|
+
# Don't use scipy for jaccard as their implementation only works with
|
|
60
|
+
# booleans - use custom jaccard instead
|
|
61
|
+
if (
|
|
62
|
+
package_str == "scipy.spatial.distance"
|
|
63
|
+
and metric_str_lowercase == "jaccard"
|
|
64
|
+
):
|
|
65
|
+
continue
|
|
66
|
+
|
|
67
|
+
if hasattr(source, metric_str_lowercase):
|
|
68
|
+
metric_fn_ = getattr(source, metric_str_lowercase)
|
|
69
|
+
metric_found = True
|
|
70
|
+
|
|
71
|
+
# Use the string as an argument if it belongs to scipy as it is
|
|
72
|
+
# optimized
|
|
73
|
+
metric_arg_ = (
|
|
74
|
+
metric if package_str == "scipy.spatial.distance" else metric_fn_
|
|
75
|
+
)
|
|
76
|
+
break
|
|
77
|
+
if not metric_found:
|
|
78
|
+
raise ValueError(
|
|
79
|
+
f"{metric} metric not found. Please pass a string of the "
|
|
80
|
+
"name of a metric in scipy.spatial.distance or "
|
|
81
|
+
"distances.Distance, or pass a metric function directly. For a "
|
|
82
|
+
"list of available metrics, see: "
|
|
83
|
+
"https://sidchaini.github.io/DistClassiPy/distances.html or "
|
|
84
|
+
"https://docs.scipy.org/doc/scipy/reference/spatial.distance.html"
|
|
85
|
+
)
|
|
86
|
+
return metric_fn_, metric_arg_
|
|
87
|
+
|
|
88
|
+
|
|
45
89
|
class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
46
90
|
"""A distance-based classifier that supports different distance metrics.
|
|
47
91
|
|
|
@@ -55,8 +99,6 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
55
99
|
|
|
56
100
|
Parameters
|
|
57
101
|
----------
|
|
58
|
-
metric : str or callable, default="euclidean"
|
|
59
|
-
The distance metric to use for calculating the distance between features.
|
|
60
102
|
scale : bool, default=True
|
|
61
103
|
Whether to scale the distance between the test object and the centroid for a
|
|
62
104
|
class in the feature space. If True, the data will be scaled based on the
|
|
@@ -71,47 +113,15 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
71
113
|
|
|
72
114
|
.. versionadded:: 0.1.0
|
|
73
115
|
|
|
74
|
-
calculate_kde : bool, default=False
|
|
75
|
-
Whether to calculate a kernel density estimate based confidence parameter.
|
|
76
|
-
.. deprecated:: 0.2.0
|
|
77
|
-
This parameter will be removed in a future version and only the
|
|
78
|
-
distance confidence parameter will be available.
|
|
79
|
-
calculate_1d_dist : bool, default=False
|
|
80
|
-
Whether to calculate the 1-dimensional distance based confidence parameter.
|
|
81
|
-
.. deprecated:: 0.2.0
|
|
82
|
-
This parameter will be removed in a future version and only the
|
|
83
|
-
distance confidence parameter will be available.
|
|
84
|
-
Whether to calculate the 1-dimensional distance based confidence parameter.
|
|
85
116
|
|
|
86
117
|
Attributes
|
|
87
118
|
----------
|
|
88
|
-
metric : str or callable
|
|
89
|
-
The distance metric used for classification.
|
|
90
119
|
scale : bool
|
|
91
120
|
Indicates whether the data is scaled.
|
|
92
121
|
central_stat : str
|
|
93
122
|
The statistic used for calculating central tendency.
|
|
94
123
|
dispersion_stat : str
|
|
95
124
|
The statistic used for calculating dispersion.
|
|
96
|
-
calculate_kde : bool
|
|
97
|
-
Indicates whether a kernel density estimate is calculated.
|
|
98
|
-
.. deprecated:: 0.2.0
|
|
99
|
-
This parameter will be removed in a future version.
|
|
100
|
-
calculate_1d_dist : bool
|
|
101
|
-
Indicates whether 1-dimensional distances are calculated.
|
|
102
|
-
.. deprecated:: 0.2.0
|
|
103
|
-
This parameter will be removed in a future version.
|
|
104
|
-
|
|
105
|
-
See Also
|
|
106
|
-
--------
|
|
107
|
-
scipy.spatial.dist : Other distance metrics provided in SciPy
|
|
108
|
-
distclassipy.Distance : Distance metrics included with DistClassiPy
|
|
109
|
-
|
|
110
|
-
Notes
|
|
111
|
-
-----
|
|
112
|
-
If using distance metrics supported by SciPy, it is desirable to pass a string,
|
|
113
|
-
which allows SciPy to use an optimized C version of the code instead of the slower
|
|
114
|
-
Python version.
|
|
115
125
|
|
|
116
126
|
References
|
|
117
127
|
----------
|
|
@@ -134,77 +144,14 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
134
144
|
|
|
135
145
|
def __init__(
|
|
136
146
|
self,
|
|
137
|
-
metric: str | Callable = "euclidean",
|
|
138
147
|
scale: bool = True,
|
|
139
148
|
central_stat: str = "median",
|
|
140
149
|
dispersion_stat: str = "std",
|
|
141
|
-
calculate_kde: bool = True, # deprecated in 0.2.0
|
|
142
|
-
calculate_1d_dist: bool = True, # deprecated in 0.2.0
|
|
143
150
|
):
|
|
144
151
|
"""Initialize the classifier with specified parameters."""
|
|
145
|
-
self.metric = metric
|
|
146
152
|
self.scale = scale
|
|
147
153
|
self.central_stat = central_stat
|
|
148
154
|
self.dispersion_stat = dispersion_stat
|
|
149
|
-
if calculate_kde:
|
|
150
|
-
warnings.warn(
|
|
151
|
-
"calculate_kde is deprecated and will be removed in version 0.2.0",
|
|
152
|
-
DeprecationWarning,
|
|
153
|
-
)
|
|
154
|
-
self.calculate_kde = calculate_kde
|
|
155
|
-
|
|
156
|
-
if calculate_1d_dist:
|
|
157
|
-
warnings.warn(
|
|
158
|
-
"calculate_1d_dist is deprecated and will be removed in version 0.2.0",
|
|
159
|
-
DeprecationWarning,
|
|
160
|
-
)
|
|
161
|
-
self.calculate_1d_dist = calculate_1d_dist
|
|
162
|
-
|
|
163
|
-
def initialize_metric_function(self):
|
|
164
|
-
"""Set the metric function based on the provided metric.
|
|
165
|
-
|
|
166
|
-
If the metric is a string, the function will look for a corresponding
|
|
167
|
-
function in scipy.spatial.distance or distances.Distance. If the metric
|
|
168
|
-
is a function, it will be used directly.
|
|
169
|
-
"""
|
|
170
|
-
if callable(self.metric):
|
|
171
|
-
self.metric_fn_ = self.metric
|
|
172
|
-
self.metric_arg_ = self.metric
|
|
173
|
-
|
|
174
|
-
elif isinstance(self.metric, str):
|
|
175
|
-
metric_str_lowercase = self.metric.lower()
|
|
176
|
-
metric_found = False
|
|
177
|
-
for package_str, source in METRIC_SOURCES_.items():
|
|
178
|
-
|
|
179
|
-
# Don't use scipy for jaccard as their implementation only works with
|
|
180
|
-
# booleans - use custom jaccard instead
|
|
181
|
-
if (
|
|
182
|
-
package_str == "scipy.spatial.distance"
|
|
183
|
-
and metric_str_lowercase == "jaccard"
|
|
184
|
-
):
|
|
185
|
-
continue
|
|
186
|
-
|
|
187
|
-
if hasattr(source, metric_str_lowercase):
|
|
188
|
-
self.metric_fn_ = getattr(source, metric_str_lowercase)
|
|
189
|
-
metric_found = True
|
|
190
|
-
|
|
191
|
-
# Use the string as an argument if it belongs to scipy as it is
|
|
192
|
-
# optimized
|
|
193
|
-
self.metric_arg_ = (
|
|
194
|
-
self.metric
|
|
195
|
-
if package_str == "scipy.spatial.distance"
|
|
196
|
-
else self.metric_fn_
|
|
197
|
-
)
|
|
198
|
-
break
|
|
199
|
-
if not metric_found:
|
|
200
|
-
raise ValueError(
|
|
201
|
-
f"{self.metric} metric not found. Please pass a string of the "
|
|
202
|
-
"name of a metric in scipy.spatial.distance or "
|
|
203
|
-
"distances.Distance, or pass a metric function directly. For a "
|
|
204
|
-
"list of available metrics, see: "
|
|
205
|
-
"https://sidchaini.github.io/DistClassiPy/distances.html or "
|
|
206
|
-
"https://docs.scipy.org/doc/scipy/reference/spatial.distance.html"
|
|
207
|
-
)
|
|
208
155
|
|
|
209
156
|
def fit(self, X: np.array, y: np.array, feat_labels: list[str] = None):
|
|
210
157
|
"""Calculate the feature space centroid for all classes.
|
|
@@ -236,8 +183,6 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
236
183
|
1
|
|
237
184
|
] # Number of features seen during fit - required for sklearn compatibility.
|
|
238
185
|
|
|
239
|
-
self.initialize_metric_function()
|
|
240
|
-
|
|
241
186
|
if feat_labels is None:
|
|
242
187
|
feat_labels = [f"Feature_{x}" for x in range(X.shape[1])]
|
|
243
188
|
|
|
@@ -281,30 +226,15 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
281
226
|
)
|
|
282
227
|
self.df_iqr_ = df_iqr
|
|
283
228
|
|
|
284
|
-
if self.calculate_kde:
|
|
285
|
-
warnings.warn(
|
|
286
|
-
"KDE calculation is deprecated and will be removed in version 0.2.0",
|
|
287
|
-
DeprecationWarning,
|
|
288
|
-
)
|
|
289
|
-
self.kde_dict_ = {}
|
|
290
|
-
|
|
291
|
-
for cl in self.classes_:
|
|
292
|
-
subX = X[y == cl]
|
|
293
|
-
# Implement the following in an if-else to save computational time.
|
|
294
|
-
# kde = KernelDensity(bandwidth='scott', metric=self.metric)
|
|
295
|
-
# kde.fit(subX)
|
|
296
|
-
kde = KernelDensity(
|
|
297
|
-
bandwidth="scott",
|
|
298
|
-
metric="pyfunc",
|
|
299
|
-
metric_params={"func": self.metric_fn_},
|
|
300
|
-
)
|
|
301
|
-
kde.fit(subX)
|
|
302
|
-
self.kde_dict_[cl] = kde
|
|
303
229
|
self.is_fitted_ = True
|
|
304
230
|
|
|
305
231
|
return self
|
|
306
232
|
|
|
307
|
-
def predict(self, X: np.array):
|
|
233
|
+
def predict(
|
|
234
|
+
self,
|
|
235
|
+
X: np.array,
|
|
236
|
+
metric: str | Callable = "euclidean",
|
|
237
|
+
):
|
|
308
238
|
"""Predict the class labels for the provided X.
|
|
309
239
|
|
|
310
240
|
The prediction is based on the distance of each data point in the input sample
|
|
@@ -315,18 +245,33 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
315
245
|
----------
|
|
316
246
|
X : array-like of shape (n_samples, n_features)
|
|
317
247
|
The input samples.
|
|
248
|
+
metric : str or callable, default="euclidean"
|
|
249
|
+
The distance metric to use for calculating the distance between features.
|
|
318
250
|
|
|
319
251
|
Returns
|
|
320
252
|
-------
|
|
321
253
|
y : ndarray of shape (n_samples,)
|
|
322
254
|
The predicted classes.
|
|
255
|
+
|
|
256
|
+
See Also
|
|
257
|
+
--------
|
|
258
|
+
scipy.spatial.distance : Other distance metrics provided in SciPy
|
|
259
|
+
distclassipy.Distance : Distance metrics included with DistClassiPy
|
|
260
|
+
|
|
261
|
+
Notes
|
|
262
|
+
-----
|
|
263
|
+
If using distance metrics supported by SciPy, it is desirable to pass a string,
|
|
264
|
+
which allows SciPy to use an optimized C version of the code instead of the
|
|
265
|
+
slower Python version.
|
|
323
266
|
"""
|
|
324
267
|
check_is_fitted(self, "is_fitted_")
|
|
325
268
|
X = check_array(X)
|
|
326
269
|
|
|
270
|
+
metric_fn_, metric_arg_ = initialize_metric_function(metric)
|
|
271
|
+
|
|
327
272
|
if not self.scale:
|
|
328
273
|
dist_arr = scipy.spatial.distance.cdist(
|
|
329
|
-
XA=X, XB=self.df_centroid_.to_numpy(), metric=self.metric_arg_
|
|
274
|
+
XA=X, XB=self.df_centroid_.to_numpy(), metric=metric_arg_
|
|
330
275
|
)
|
|
331
276
|
|
|
332
277
|
else:
|
|
@@ -343,16 +288,18 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
343
288
|
w = wtdf.loc[cl].to_numpy() # 1/std dev
|
|
344
289
|
XB = XB * w # w is for this class only
|
|
345
290
|
XA = X * w # w is for this class only
|
|
346
|
-
cl_dist = scipy.spatial.distance.cdist(
|
|
347
|
-
XA=XA, XB=XB, metric=self.metric_arg_
|
|
348
|
-
)
|
|
291
|
+
cl_dist = scipy.spatial.distance.cdist(XA=XA, XB=XB, metric=metric_arg_)
|
|
349
292
|
dist_arr_list.append(cl_dist)
|
|
350
293
|
dist_arr = np.column_stack(dist_arr_list)
|
|
351
294
|
|
|
352
295
|
y_pred = self.classes_[dist_arr.argmin(axis=1)]
|
|
353
296
|
return y_pred
|
|
354
297
|
|
|
355
|
-
def predict_and_analyse(self, X: np.array):
|
|
298
|
+
def predict_and_analyse(
|
|
299
|
+
self,
|
|
300
|
+
X: np.array,
|
|
301
|
+
metric: str | Callable = "euclidean",
|
|
302
|
+
):
|
|
356
303
|
"""Predict the class labels for the provided X and perform analysis.
|
|
357
304
|
|
|
358
305
|
The prediction is based on the distance of each data point in the input sample
|
|
@@ -366,18 +313,35 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
366
313
|
----------
|
|
367
314
|
X : array-like of shape (n_samples, n_features)
|
|
368
315
|
The input samples.
|
|
316
|
+
metric : str or callable, default="euclidean"
|
|
317
|
+
The distance metric to use for calculating the distance between features.
|
|
318
|
+
|
|
369
319
|
|
|
370
320
|
Returns
|
|
371
321
|
-------
|
|
372
322
|
y : ndarray of shape (n_samples,)
|
|
373
323
|
The predicted classes.
|
|
324
|
+
|
|
325
|
+
See Also
|
|
326
|
+
--------
|
|
327
|
+
scipy.spatial.distance : Other distance metrics provided in SciPy
|
|
328
|
+
distclassipy.Distance : Distance metrics included with DistClassiPy
|
|
329
|
+
|
|
330
|
+
Notes
|
|
331
|
+
-----
|
|
332
|
+
If using distance metrics supported by SciPy, it is desirable to pass a string,
|
|
333
|
+
which allows SciPy to use an optimized C version of the code instead
|
|
334
|
+
of the slower Python version.
|
|
335
|
+
|
|
374
336
|
"""
|
|
375
337
|
check_is_fitted(self, "is_fitted_")
|
|
376
338
|
X = check_array(X)
|
|
377
339
|
|
|
340
|
+
metric_fn_, metric_arg_ = initialize_metric_function(metric)
|
|
341
|
+
|
|
378
342
|
if not self.scale:
|
|
379
343
|
dist_arr = scipy.spatial.distance.cdist(
|
|
380
|
-
XA=X, XB=self.df_centroid_.to_numpy(), metric=self.metric_arg_
|
|
344
|
+
XA=X, XB=self.df_centroid_.to_numpy(), metric=metric_arg_
|
|
381
345
|
)
|
|
382
346
|
|
|
383
347
|
else:
|
|
@@ -394,9 +358,7 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
394
358
|
w = wtdf.loc[cl].to_numpy() # 1/std dev
|
|
395
359
|
XB = XB * w # w is for this class only
|
|
396
360
|
XA = X * w # w is for this class only
|
|
397
|
-
cl_dist = scipy.spatial.distance.cdist(
|
|
398
|
-
XA=XA, XB=XB, metric=self.metric_arg_
|
|
399
|
-
)
|
|
361
|
+
cl_dist = scipy.spatial.distance.cdist(XA=XA, XB=XB, metric=metric_arg_)
|
|
400
362
|
dist_arr_list.append(cl_dist)
|
|
401
363
|
dist_arr = np.column_stack(dist_arr_list)
|
|
402
364
|
|
|
@@ -409,78 +371,15 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
409
371
|
|
|
410
372
|
y_pred = self.classes_[dist_arr.argmin(axis=1)]
|
|
411
373
|
|
|
412
|
-
if self.calculate_kde:
|
|
413
|
-
warnings.warn(
|
|
414
|
-
"KDE calculation in predict_and_analyse is deprecated "
|
|
415
|
-
"and will be removed in version 0.2.0",
|
|
416
|
-
DeprecationWarning,
|
|
417
|
-
)
|
|
418
|
-
# NEW: Rescale in terms of median likelihoods - calculate here
|
|
419
|
-
scale_factors = np.exp(
|
|
420
|
-
[
|
|
421
|
-
self.kde_dict_[cl].score_samples(
|
|
422
|
-
self.df_centroid_.loc[cl].to_numpy().reshape(1, -1)
|
|
423
|
-
)[0]
|
|
424
|
-
for cl in self.classes_
|
|
425
|
-
]
|
|
426
|
-
)
|
|
427
|
-
|
|
428
|
-
likelihood_arr = []
|
|
429
|
-
for k in self.kde_dict_.keys():
|
|
430
|
-
log_pdf = self.kde_dict_[k].score_samples(X)
|
|
431
|
-
likelihood_val = np.exp(log_pdf)
|
|
432
|
-
likelihood_arr.append(likelihood_val)
|
|
433
|
-
self.likelihood_arr_ = np.array(likelihood_arr).T
|
|
434
|
-
|
|
435
|
-
# NEW: Rescale in terms of median likelihoods - rescale here
|
|
436
|
-
self.likelihood_arr_ = self.likelihood_arr_ / scale_factors
|
|
437
|
-
if self.calculate_1d_dist:
|
|
438
|
-
warnings.warn(
|
|
439
|
-
"calculate_1d_dist is deprecated and will be removed in version 0.2.0",
|
|
440
|
-
DeprecationWarning,
|
|
441
|
-
)
|
|
442
|
-
conf_cl = []
|
|
443
|
-
Xdf_temp = pd.DataFrame(data=X, columns=self.df_centroid_.columns)
|
|
444
|
-
for cl in self.classes_:
|
|
445
|
-
sum_1d_dists = np.zeros(shape=(len(Xdf_temp)))
|
|
446
|
-
for feat in Xdf_temp.columns:
|
|
447
|
-
dists = scipy.spatial.distance.cdist(
|
|
448
|
-
XA=np.zeros(shape=(1, 1)),
|
|
449
|
-
XB=(self.df_centroid_.loc[cl] - Xdf_temp)[feat]
|
|
450
|
-
.to_numpy()
|
|
451
|
-
.reshape(-1, 1),
|
|
452
|
-
metric=self.metric_arg_,
|
|
453
|
-
).ravel()
|
|
454
|
-
if self.scale and self.dispersion_stat == "std":
|
|
455
|
-
sum_1d_dists = sum_1d_dists + dists / self.df_std_.loc[cl, feat]
|
|
456
|
-
elif self.scale and self.dispersion_stat == "std":
|
|
457
|
-
sum_1d_dists = sum_1d_dists + dists / self.df_iqr_.loc[cl, feat]
|
|
458
|
-
else:
|
|
459
|
-
sum_1d_dists = sum_1d_dists + dists
|
|
460
|
-
confs = 1 / np.clip(sum_1d_dists, a_min=np.finfo(float).eps, a_max=None)
|
|
461
|
-
conf_cl.append(confs)
|
|
462
|
-
conf_cl = np.array(conf_cl)
|
|
463
|
-
self.conf_cl_ = conf_cl
|
|
464
374
|
self.analyis_ = True
|
|
465
375
|
|
|
466
376
|
return y_pred
|
|
467
377
|
|
|
468
|
-
def calculate_confidence(self
|
|
378
|
+
def calculate_confidence(self):
|
|
469
379
|
"""Calculate the confidence for each prediction.
|
|
470
380
|
|
|
471
|
-
The confidence is calculated
|
|
472
|
-
the centroids of the training data
|
|
473
|
-
1-dimensional distance.
|
|
474
|
-
|
|
475
|
-
Parameters
|
|
476
|
-
----------
|
|
477
|
-
method : {"distance_inverse", "1d_distance_inverse", "kde_likelihood"},
|
|
478
|
-
default="distance_inverse"
|
|
479
|
-
The method to use for calculating confidence. Default is
|
|
480
|
-
'distance_inverse'.
|
|
481
|
-
.. deprecated:: 0.2.0
|
|
482
|
-
The methods '1d_distance_inverse' and
|
|
483
|
-
'kde_likelihood' will be removed in version 0.2.0.
|
|
381
|
+
The confidence is calculated as the inverse of the distance of each data point
|
|
382
|
+
to the centroids of the training data.
|
|
484
383
|
"""
|
|
485
384
|
check_is_fitted(self, "is_fitted_")
|
|
486
385
|
if not hasattr(self, "analyis_"):
|
|
@@ -490,44 +389,11 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
490
389
|
)
|
|
491
390
|
|
|
492
391
|
# Calculate confidence for each prediction
|
|
493
|
-
|
|
494
|
-
self.
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
self.confidence_df_.columns
|
|
498
|
-
|
|
499
|
-
]
|
|
500
|
-
|
|
501
|
-
elif method == "1d_distance_inverse":
|
|
502
|
-
warnings.warn(
|
|
503
|
-
"The '1d_distance_inverse' method is deprecated "
|
|
504
|
-
"and will be removed in version 0.2.0",
|
|
505
|
-
DeprecationWarning,
|
|
506
|
-
)
|
|
507
|
-
if not self.calculate_1d_dist:
|
|
508
|
-
raise ValueError(
|
|
509
|
-
"method='1d_distance_inverse' is only valid if calculate_1d_dist "
|
|
510
|
-
"is set to True"
|
|
511
|
-
)
|
|
512
|
-
self.confidence_df_ = pd.DataFrame(
|
|
513
|
-
data=self.conf_cl_.T, columns=[f"{x}_conf" for x in self.classes_]
|
|
514
|
-
)
|
|
515
|
-
|
|
516
|
-
elif method == "kde_likelihood":
|
|
517
|
-
warnings.warn(
|
|
518
|
-
"The 'kde_likelihood' method is deprecated and will be "
|
|
519
|
-
"removed in version 0.2.0",
|
|
520
|
-
DeprecationWarning,
|
|
521
|
-
)
|
|
522
|
-
if not self.calculate_kde:
|
|
523
|
-
raise ValueError(
|
|
524
|
-
"method='kde_likelihood' is only valid if calculate_kde is set "
|
|
525
|
-
"to True"
|
|
526
|
-
)
|
|
527
|
-
|
|
528
|
-
self.confidence_df_ = pd.DataFrame(
|
|
529
|
-
data=self.likelihood_arr_,
|
|
530
|
-
columns=[f"{x}_conf" for x in self.kde_dict_.keys()],
|
|
531
|
-
)
|
|
392
|
+
self.confidence_df_ = 1 / np.clip(
|
|
393
|
+
self.centroid_dist_df_, a_min=np.finfo(float).eps, a_max=None
|
|
394
|
+
)
|
|
395
|
+
self.confidence_df_.columns = [
|
|
396
|
+
x.replace("_dist", "_conf") for x in self.confidence_df_.columns
|
|
397
|
+
]
|
|
532
398
|
|
|
533
399
|
return self.confidence_df_.to_numpy()
|