distclassipy 0.0.2__py3-none-any.whl → 0.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- distclassipy/__init__.py +14 -2
- distclassipy/classifier.py +121 -95
- distclassipy/distances.py +759 -401
- distclassipy-0.0.5.dist-info/METADATA +702 -0
- distclassipy-0.0.5.dist-info/RECORD +11 -0
- {distclassipy-0.0.2.dist-info → distclassipy-0.0.5.dist-info}/top_level.txt +0 -1
- distclassipy-0.0.2.dist-info/METADATA +0 -25
- distclassipy-0.0.2.dist-info/RECORD +0 -11
- {distclassipy-0.0.2.dist-info → distclassipy-0.0.5.dist-info}/LICENSE +0 -0
- {distclassipy-0.0.2.dist-info → distclassipy-0.0.5.dist-info}/WHEEL +0 -0
distclassipy/__init__.py
CHANGED

@@ -1,2 +1,14 @@
-
-
+"""
+A module for using distance metrics for classification.
+
+Classes:
+    DistanceMetricClassifier - A classifier that uses a specified distance metric for classification.
+    Distance - A class that provides various distance metrics for use in classification.
+"""
+
+from .classifier import (
+    DistanceMetricClassifier,
+)  # Importing the DistanceMetricClassifier from the classifier module
+from .distances import (
+    Distance,
+)  # Importing the Distance class from the distances module
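
For orientation, the two exported names can be exercised end to end. A minimal sketch, assuming an installed distclassipy matching this diff; the synthetic data, seed, and calculate_kde=False choice are illustrative, not taken from the package:

    import numpy as np
    from distclassipy import DistanceMetricClassifier

    rng = np.random.default_rng(42)
    X = rng.normal(size=(60, 4))     # 60 synthetic samples, 4 features
    y = rng.integers(0, 3, size=60)  # 3 synthetic classes

    # calculate_kde=False skips the KernelDensity fitting step for a quick check
    clf = DistanceMetricClassifier(metric="euclidean", calculate_kde=False)
    clf.fit(X, y)
    print(clf.predict(X[:5]))  # label of the nearest per-class template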
distclassipy/classifier.py
CHANGED

@@ -5,28 +5,31 @@ from .distances import Distance
 from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
 from sklearn.utils.multiclass import unique_labels
-from sklearn.preprocessing import StandardScaler, MinMaxScaler
 from sklearn.neighbors import KernelDensity
+from typing import Callable
+
 
 class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
     """
-
-
-
-
-
-        The statistical measure to use for creating the training template. Default is 'median'.
-    metric : str or callable, optional
-        The distance metric to use. Default is 'euclidean'.
-    scale_std : bool, optional
-        If True, classifier is scaled in terms of standard deviations. Default is True.
+    Implement a distance metric classifier based on scikit-learn.
+
+    This classifier uses a specified distance metric to classify data points based on their distance to a training template. The training template is created using a specified statistical measure (e.g., median or mean). The classifier can be scaled in terms of standard deviations.
+
+    Attributes:
     """
-
-
-
+
+    def __init__(
+        self,
+        metric: str | Callable = "euclidean",
+        scale_std: bool = True,
+        canonical_stat: str = "median",
+        calculate_kde: bool = True,
+        calculate_1d_dist: bool = True,
+        n_jobs: int = -1,
+    ):
         """
-        Initialize the classifier with
-
+        Initialize the classifier with specified parameters.
+
         Parameters
         ----------
         metric : str or callable, optional
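
The constructor's options map directly onto the classifier's behavior. A sketch of the two ways this signature accepts metric (by name, or as a callable forwarded to scipy.spatial.distance.cdist); the manhattan helper is hypothetical:

    import numpy as np
    from distclassipy import DistanceMetricClassifier

    def manhattan(u, v):
        # hypothetical callable metric: any f(u, v) -> float that cdist accepts
        return float(np.abs(u - v).sum())

    clf_named = DistanceMetricClassifier(metric="cityblock")  # resolved by name
    clf_custom = DistanceMetricClassifier(
        metric=manhattan,         # used directly as the distance function
        scale_std=True,           # weight features by 1/std of each class
        canonical_stat="median",  # per-class template: median (or "mean")
        calculate_kde=False,      # skip per-class kernel density estimates
        calculate_1d_dist=False,  # skip per-feature 1-D distance analysis
        n_jobs=-1,
    )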
@@ -50,10 +53,11 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
         self.n_jobs = n_jobs
         self.distance_calculator = Distance()
 
-    def fit(self, X: np.array, y: np.array, feat_labels: list[str]=None):
-        """
-
-
+    def fit(self, X: np.array, y: np.array, feat_labels: list[str] = None):
+        """Fit the classifier to the data.
+
+        This involves creating the training template and optionally calculating the kernel density estimate and 1-dimensional distance.
+
         Parameters
         ----------
         X : array-like of shape (n_samples, n_features)
@@ -66,10 +70,10 @@
         X, y = check_X_y(X, y)
         self.classes_ = unique_labels(y)
         self.n_features_in_ = X.shape[1]
-
+
         if feat_labels is None:
             feat_labels = [f"Feature_{x}" for x in range(X.shape[1])]
-
+
         canonical_list = []
         for cur_class in self.classes_:
             cur_X = X[np.argwhere(y == cur_class)]
@@ -78,9 +82,8 @@
             elif self.canonical_stat == "mean":
                 canonical_list.append(np.mean(cur_X, axis=0).ravel())
         df_canonical = pd.DataFrame(
-            data=np.array(canonical_list),
-
-            columns=feat_labels)
+            data=np.array(canonical_list), index=self.classes_, columns=feat_labels
+        )
         self.df_canonical_ = df_canonical
 
         if self.scale_std:
@@ -88,48 +91,50 @@
             for cur_class in self.classes_:
                 cur_X = X[y == cur_class]
                 # Note we're using ddof=1 because we're dealing with a sample. See more: https://stackoverflow.com/a/46083501/10743245
-                std_list.append(np.std(cur_X, axis=0, ddof=1).ravel())
+                std_list.append(np.std(cur_X, axis=0, ddof=1).ravel())
             df_std = pd.DataFrame(
-
-
-                columns=feat_labels)
+                data=np.array(std_list), index=self.classes_, columns=feat_labels
+            )
             self.df_std_ = df_std
-
+
         if self.calculate_kde:
             self.set_metric_fn()
             self.kde_dict_ = {}
-
-
+
             for cl in self.classes_:
                 subX = X[y == cl]
                 # Implement the following in an if-else to save computational time.
                 # kde = KernelDensity(bandwidth='scott', metric=self.metric)
                 # kde.fit(subX)
-                kde = KernelDensity(
+                kde = KernelDensity(
+                    bandwidth="scott",
+                    metric="pyfunc",
+                    metric_params={"func": self.metric_fn_},
+                )
                 kde.fit(subX)
                 self.kde_dict_[cl] = kde
 
         self.is_fitted_ = True
-
+
         return self
 
     def predict(self, X: np.array):
-        """
-
-
+        """Predict the class labels for the provided data.
+
+        The prediction is based on the distance of each data point to the training template.
+
         Parameters
         ----------
         X : array-like of shape (n_samples, n_features)
             The input samples.
         """
-        check_is_fitted(self,
+        check_is_fitted(self, "is_fitted_")
         X = check_array(X)
-
+
         if not self.scale_std:
             dist_arr = distance.cdist(
-
-
-                metric=self.metric)
+                XA=X, XB=self.df_canonical_.to_numpy(), metric=self.metric
+            )
 
         else:
             dist_arr_list = []
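
The per-class KDE fitting above relies on scikit-learn's "pyfunc" metric hook to plug an arbitrary distance function into KernelDensity. A standalone sketch of the same pattern with a hypothetical metric; note that bandwidth="scott" requires a recent scikit-learn (pass a float such as 0.5 on older versions):

    import numpy as np
    from sklearn.neighbors import KernelDensity

    def manhattan(u, v):
        # hypothetical custom metric standing in for self.metric_fn_
        return np.abs(u - v).sum()

    rng = np.random.default_rng(0)
    X = rng.normal(size=(50, 3))
    y = rng.integers(0, 2, size=50)

    kde_dict = {}
    for cl in np.unique(y):
        kde = KernelDensity(
            bandwidth="scott",                  # data-driven bandwidth rule
            metric="pyfunc",                    # tree calls a Python function
            metric_params={"func": manhattan},  # the actual distance function
        )
        kde.fit(X[y == cl])                     # one density estimate per class
        kde_dict[cl] = kde

    print(kde_dict[0].score_samples(X[:3]))     # log-density under class 0's KDE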
@@ -142,49 +147,50 @@
                 w = wtdf.loc[cl].to_numpy()  # 1/std dev
                 XB = XB * w  # w is for this class only
                 XA = X * w  # w is for this class only
-                cl_dist = distance.cdist(XA=XA,
-                                         XB=XB,
-                                         metric=self.metric)
+                cl_dist = distance.cdist(XA=XA, XB=XB, metric=self.metric)
                 dist_arr_list.append(cl_dist)
             dist_arr = np.column_stack(dist_arr_list)
 
         y_pred = self.classes_[dist_arr.argmin(axis=1)]
         return y_pred
-
+
     def set_metric_fn(self):
         """
-        Set the metric function
+        Set the metric function based on the specified metric.
+
+        If the metric is a string, the function will look for a corresponding function in scipy.spatial.distance or distances.Distance. If the metric is a function, it will be used directly.
         """
         if not callable(self.metric) or isinstance(self.metric, str):
             if hasattr(distance, self.metric):
                 self.metric_fn_ = getattr(distance, self.metric)
-            elif hasattr(self.distance_calculator, self.metric):
+            elif hasattr(self.distance_calculator, self.metric):
                 self.metric_fn_ = getattr(self.distance_calculator, self.metric)
             else:
-                raise ValueError(
+                raise ValueError(
+                    f"{self.metric} metric not found. Either pass a string of the name of a metric in scipy.spatial.distance or distances.Distance, or, pass a metric function directly."
+                )
 
         else:
             self.metric_fn_ = self.metric
 
-
-
     def predict_and_analyse(self, X: np.array):
         """
-        Predict the class labels for the provided data and perform analysis.
-
+        Predict the class labels for the provided data and perform analysis.
+
+        The analysis includes calculating the distance of each data point to the training template, and optionally calculating the kernel density estimate and 1-dimensional distance.
+
         Parameters
         ----------
         X : array-like of shape (n_samples, n_features)
             The input samples.
         """
-        check_is_fitted(self,
+        check_is_fitted(self, "is_fitted_")
        X = check_array(X)
-
+
         if not self.scale_std:
             dist_arr = distance.cdist(
-
-
-                metric=self.metric)
+                XA=X, XB=self.df_canonical_.to_numpy(), metric=self.metric
+            )
 
         else:
             dist_arr_list = []
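
The scaled branch of predict() computes, for each class, distances in a feature space weighted by that class's 1/std, then picks the nearest template. A self-contained numeric sketch of that computation (toy values, Euclidean metric):

    import numpy as np
    from scipy.spatial import distance

    X = np.array([[1.0, 2.0], [3.0, 1.0]])          # samples to classify
    canonical = np.array([[1.0, 2.0], [4.0, 0.0]])  # one template row per class
    stds = np.array([[0.5, 1.0], [1.0, 2.0]])       # per-class feature std devs
    classes = np.array(["a", "b"])

    dist_cols = []
    for i in range(len(classes)):
        w = 1.0 / stds[i]  # weight = 1/std for this class only
        cl_dist = distance.cdist(
            XA=X * w, XB=(canonical[i] * w).reshape(1, -1), metric="euclidean"
        )
        dist_cols.append(cl_dist)
    dist_arr = np.column_stack(dist_cols)      # shape (n_samples, n_classes)
    y_pred = classes[dist_arr.argmin(axis=1)]  # nearest template wins
    print(y_pred)                              # ['a' 'b']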
@@ -198,30 +204,37 @@
                 w = wtdf.loc[cl].to_numpy()  # 1/std dev
                 XB = XB * w  # w is for this class only
                 XA = X * w  # w is for this class only
-                cl_dist = distance.cdist(XA=XA,
-                                         XB=XB,
-                                         metric=self.metric)
+                cl_dist = distance.cdist(XA=XA, XB=XB, metric=self.metric)
                 dist_arr_list.append(cl_dist)
             dist_arr = np.column_stack(dist_arr_list)
 
-        self.canonical_dist_df_ = pd.DataFrame(
-
-
-        self.canonical_dist_df_.columns = [
-
+        self.canonical_dist_df_ = pd.DataFrame(
+            data=dist_arr, index=np.arange(X.shape[0]), columns=self.classes_
+        )
+        self.canonical_dist_df_.columns = [
+            f"{ind}_dist" for ind in self.canonical_dist_df_.columns
+        ]
+
         y_pred = self.classes_[dist_arr.argmin(axis=1)]
-
+
         if self.calculate_kde:
             # NEW: Rescale in terms of median likelihoods - calculate here
-            scale_factors = np.exp(
-
+            scale_factors = np.exp(
+                [
+                    self.kde_dict_[cl].score_samples(
+                        self.df_canonical_.loc[cl].to_numpy().reshape(1, -1)
+                    )[0]
+                    for cl in self.classes_
+                ]
+            )
+
             likelihood_arr = []
             for k in self.kde_dict_.keys():
                 log_pdf = self.kde_dict_[k].score_samples(X)
                 likelihood_val = np.exp(log_pdf)
                 likelihood_arr.append(likelihood_val)
             self.likelihood_arr_ = np.array(likelihood_arr).T
-
+
             # NEW: Rescale in terms of median likelihoods - rescale here
             self.likelihood_arr_ = self.likelihood_arr_ / scale_factors
 
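
The rescaling above normalizes each class's likelihood by the likelihood at that class's own template, so a value near 1 means "about as likely as the template point itself". A minimal standalone sketch of that normalization for a single class (synthetic data, plain Gaussian KDE):

    import numpy as np
    from sklearn.neighbors import KernelDensity

    rng = np.random.default_rng(1)
    X_cl = rng.normal(loc=2.0, size=(40, 2))           # samples of one class
    template = np.median(X_cl, axis=0).reshape(1, -1)  # class template (median)

    kde = KernelDensity(bandwidth=0.5).fit(X_cl)
    scale_factor = np.exp(kde.score_samples(template)[0])  # likelihood at template

    X_new = rng.normal(loc=2.0, size=(5, 2))
    rescaled = np.exp(kde.score_samples(X_new)) / scale_factor  # ~1 near template
    print(rescaled)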
@@ -232,51 +245,64 @@
             sum_1d_dists = np.zeros(shape=(len(Xdf_temp)))
             for feat in Xdf_temp.columns:
                 dists = distance.cdist(
-
-
-
+                    XA=np.zeros(shape=(1, 1)),
+                    XB=(self.df_canonical_.loc[cl] - Xdf_temp)[feat]
+                    .to_numpy()
+                    .reshape(-1, 1),
+                    metric=self.metric,
+                ).ravel()
                 sum_1d_dists = sum_1d_dists + dists / self.df_std_.loc[cl, feat]
             confs = 1 / sum_1d_dists
             conf_cl.append(confs)
         conf_cl = np.array(conf_cl)
         self.conf_cl_ = conf_cl
 
-
         self.analyis_ = True
-
+
         return y_pred
 
-
-    def calculate_confidence(self, method: str="distance_inverse"):
+    def calculate_confidence(self, method: str = "distance_inverse"):
         """
-        Calculate the confidence for each prediction.
-
+        Calculate the confidence for each prediction.
+
+        The confidence is calculated based on the distance of each data point to the training template, and optionally the kernel density estimate and 1-dimensional distance.
+
         Parameters
         ----------
         method : str, optional
             The method to use for calculating confidence. Default is 'distance_inverse'.
         """
-        check_is_fitted(self,
-        if not hasattr(self,
-            raise ValueError(
-
+        check_is_fitted(self, "is_fitted_")
+        if not hasattr(self, "analyis_"):
+            raise ValueError(
+                "Use predict_and_analyse() instead of predict() for confidence calculation."
+            )
+
         # Calculate confidence for each prediction
         if method == "distance_inverse":
             self.confidence_df_ = 1 / self.canonical_dist_df_
-            self.confidence_df_.columns = [
-
+            self.confidence_df_.columns = [
+                x.replace("_dist", "_conf") for x in self.confidence_df_.columns
+            ]
+
         elif method == "1d_distance_inverse":
             if not self.calculate_1d_dist:
-                raise ValueError(
+                raise ValueError(
+                    "method='1d_distance_inverse' is only valid if calculate_1d_dist is set to True"
+                )
             self.confidence_df_ = pd.DataFrame(
-                data=self.conf_cl_.T,
-
-
+                data=self.conf_cl_.T, columns=[f"{x}_conf" for x in self.classes_]
+            )
+
         elif method == "kde_likelihood":
             if not self.calculate_kde:
-                raise ValueError(
-
-
-
-
-
+                raise ValueError(
+                    "method='kde_likelihood' is only valid if calculate_kde is set to True"
+                )
+
+            self.confidence_df_ = pd.DataFrame(
+                data=self.likelihood_arr_,
+                columns=[f"{x}_conf" for x in self.kde_dict_.keys()],
+            )
+
+        return self.confidence_df_.to_numpy()
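
Putting the pieces together, the intended flow appears to be fit, then predict_and_analyse (which stores the distance columns the confidence methods read), then calculate_confidence. A sketch under that assumption, with synthetic two-class data; parameter choices are illustrative:

    import numpy as np
    from distclassipy import DistanceMetricClassifier

    rng = np.random.default_rng(7)
    X = np.vstack([rng.normal(0.0, 1.0, (30, 3)), rng.normal(4.0, 1.0, (30, 3))])
    y = np.array([0] * 30 + [1] * 30)

    clf = DistanceMetricClassifier(metric="euclidean", calculate_kde=False)
    clf.fit(X, y)
    y_pred = clf.predict_and_analyse(X)  # also populates canonical_dist_df_ etc.

    conf = clf.calculate_confidence(method="distance_inverse")  # 1 / distance
    print(conf[:3])  # one "<class>_conf" column per class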