distclassipy 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- distclassipy/__init__.py +2 -0
- distclassipy/classifier.py +282 -0
- distclassipy/distances.py +926 -0
- distclassipy-0.0.1.dist-info/LICENSE +674 -0
- distclassipy-0.0.1.dist-info/METADATA +21 -0
- distclassipy-0.0.1.dist-info/RECORD +11 -0
- distclassipy-0.0.1.dist-info/WHEEL +5 -0
- distclassipy-0.0.1.dist-info/top_level.txt +2 -0
- tests/__init__.py +0 -0
- tests/test_classifier.py +60 -0
- tests/test_distances.py +2 -0
distclassipy/classifier.py
ADDED
@@ -0,0 +1,282 @@
import numpy as np
import pandas as pd
from scipy.spatial import distance
from .distances import Distance
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neighbors import KernelDensity


class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
    """
    This class implements a distance metric classifier based on scikit-learn. The classifier uses a specified distance metric to classify data points based on their distance to a training template. The training template is created using a specified statistical measure (e.g., median or mean). The classifier can be scaled in terms of standard deviations.

    Parameters
    ----------
    canonical_stat : str, optional
        The statistical measure to use for creating the training template. Default is 'median'.
    metric : str or callable, optional
        The distance metric to use. Default is 'euclidean'.
    scale_std : bool, optional
        If True, classifier is scaled in terms of standard deviations. Default is True.
    """
    def __init__(self, metric: str or callable="euclidean", scale_std: bool=True,
                 canonical_stat: str="median", calculate_kde: bool=True,
                 calculate_1d_dist: bool=True, n_jobs: int=-1):
        """
        Initialize the classifier with the given parameters.

        Parameters
        ----------
        metric : str or callable, optional
            The distance metric to use. Default is 'euclidean'.
        scale_std : bool, optional
            If True, classifier is scaled in terms of standard deviations. Default is True.
        canonical_stat : str, optional
            The statistical measure to use for creating the training template. Default is 'median'.
        calculate_kde : bool, optional
            If True, calculate the kernel density estimate. Default is True.
        calculate_1d_dist : bool, optional
            If True, calculate the 1-dimensional distance. Default is True.
        n_jobs : int, optional
            The number of jobs to run in parallel. Default is -1 (use all processors).
        """
        self.metric = metric
        self.scale_std = scale_std
        self.canonical_stat = canonical_stat
        self.calculate_kde = calculate_kde
        self.calculate_1d_dist = calculate_1d_dist
        self.n_jobs = n_jobs
        self.distance_calculator = Distance()

    def fit(self, X: np.array, y: np.array, feat_labels: list[str]=None):
        """
        Fit the classifier to the data. This involves creating the training template and optionally calculating the kernel density estimate and 1-dimensional distance.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The training input samples.
        y : array-like of shape (n_samples,)
            The target values.
        feat_labels : list of str, optional
            The feature labels. If not provided, default labels representing feature number will be used.
        """
        X, y = check_X_y(X, y)
        self.classes_ = unique_labels(y)
        self.n_features_in_ = X.shape[1]

        if feat_labels is None:
            feat_labels = [f"Feature_{x}" for x in range(X.shape[1])]

        canonical_list = []
        for cur_class in self.classes_:
            cur_X = X[np.argwhere(y == cur_class)]
            if self.canonical_stat == "median":
                canonical_list.append(np.median(cur_X, axis=0).ravel())
            elif self.canonical_stat == "mean":
                canonical_list.append(np.mean(cur_X, axis=0).ravel())
        df_canonical = pd.DataFrame(
            data=np.array(canonical_list),
            index=self.classes_,
            columns=feat_labels)
        self.df_canonical_ = df_canonical

        if self.scale_std:
            std_list = []
            for cur_class in self.classes_:
                cur_X = X[y == cur_class]
                # Note we're using ddof=1 because we're dealing with a sample. See more: https://stackoverflow.com/a/46083501/10743245
                std_list.append(np.std(cur_X, axis=0, ddof=1).ravel())
            df_std = pd.DataFrame(
                data=np.array(std_list),
                index=self.classes_,
                columns=feat_labels)
            self.df_std_ = df_std

        if self.calculate_kde:
            self.set_metric_fn()
            self.kde_dict_ = {}

            for cl in self.classes_:
                subX = X[y == cl]
                # Implement the following in an if-else to save computational time.
                # kde = KernelDensity(bandwidth='scott', metric=self.metric)
                # kde.fit(subX)
                kde = KernelDensity(bandwidth='scott', metric='pyfunc', metric_params={"func": self.metric_fn_})
                kde.fit(subX)
                self.kde_dict_[cl] = kde

        self.is_fitted_ = True

        return self
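
For context, the template-building step in fit() amounts to a per-class aggregation: one row of medians (or means) and one row of sample standard deviations per class. A minimal standalone sketch of that same computation on made-up data (the arrays and labels below are illustrative, not from the package):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
X_toy = rng.normal(size=(6, 2))                    # 6 samples, 2 features (toy data)
y_toy = np.array(["a", "a", "a", "b", "b", "b"])   # two classes

df = pd.DataFrame(X_toy, columns=["Feature_0", "Feature_1"])
templates = df.groupby(y_toy).median()    # per-class template, analogous to df_canonical_
spreads = df.groupby(y_toy).std(ddof=1)   # per-class sample std, analogous to df_std_
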
    def predict(self, X: np.array):
        """
        Predict the class labels for the provided data. The prediction is based on the distance of each data point to the training template.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.
        """
        check_is_fitted(self, 'is_fitted_')
        X = check_array(X)

        if not self.scale_std:
            dist_arr = distance.cdist(
                XA=X,
                XB=self.df_canonical_.to_numpy(),
                metric=self.metric)

        else:
            dist_arr_list = []
            wtdf = 1 / self.df_std_
            wtdf = wtdf.replace([np.inf, -np.inf], np.nan)
            wtdf = wtdf.fillna(0)

            for cl in self.classes_:
                XB = self.df_canonical_.loc[cl].to_numpy().reshape(1, -1)
                w = wtdf.loc[cl].to_numpy()  # 1/std dev
                XB = XB * w  # w is for this class only
                XA = X * w  # w is for this class only
                cl_dist = distance.cdist(XA=XA,
                                         XB=XB,
                                         metric=self.metric)
                dist_arr_list.append(cl_dist)
            dist_arr = np.column_stack(dist_arr_list)

        y_pred = self.classes_[dist_arr.argmin(axis=1)]
        return y_pred
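
The scaled branch above multiplies both the query points and the class template by 1/std before applying the metric; for the default Euclidean metric this is the same as scipy's standardized Euclidean ('seuclidean') distance with V set to the per-class variances. A small sketch of that equivalence with made-up numbers (not package output):

import numpy as np
from scipy.spatial import distance

x = np.array([[1.0, 10.0]])        # one query point (toy values)
template = np.array([[0.0, 8.0]])  # one class template (toy values)
std = np.array([0.5, 4.0])         # per-class standard deviations (toy values)

w = 1.0 / std
d_weighted = distance.cdist(x * w, template * w, metric="euclidean")
d_seuclidean = distance.cdist(x, template, metric="seuclidean", V=std ** 2)
# both give the same standardized distance for this class
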
    def set_metric_fn(self):
        """
        Set the metric function. If the metric is a string, the function will look for a corresponding function in scipy.spatial.distance or distances.Distance. If the metric is a function, it will be used directly.
        """
        if not callable(self.metric) or isinstance(self.metric, str):
            if hasattr(distance, self.metric):
                self.metric_fn_ = getattr(distance, self.metric)
            elif hasattr(self.distance_calculator, self.metric):
                self.metric_fn_ = getattr(self.distance_calculator, self.metric)
            else:
                raise ValueError(f"{self.metric} metric not found. Either pass a string of the name of a metric in scipy.spatial.distance or distances.Distance, or, pass a metric function directly.")

        else:
            self.metric_fn_ = self.metric
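
The string-to-function lookup above is a plain getattr dispatch on scipy.spatial.distance (falling back to the package's Distance class). The same pattern, shown standalone with a metric name that exists in scipy (illustrative only):

from scipy.spatial import distance

metric_name = "cityblock"  # any function name defined in scipy.spatial.distance
if hasattr(distance, metric_name):
    metric_fn = getattr(distance, metric_name)
else:
    raise ValueError(f"{metric_name} metric not found")

print(metric_fn([0, 0], [1, 2]))  # Manhattan distance -> 3.0
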
    def predict_and_analyse(self, X: np.array):
        """
        Predict the class labels for the provided data and perform analysis. The analysis includes calculating the distance of each data point to the training template, and optionally calculating the kernel density estimate and 1-dimensional distance.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.
        """
        check_is_fitted(self, 'is_fitted_')
        X = check_array(X)

        if not self.scale_std:
            dist_arr = distance.cdist(
                XA=X,
                XB=self.df_canonical_.to_numpy(),
                metric=self.metric)

        else:
            dist_arr_list = []
            wtdf = 1 / self.df_std_
            wtdf = wtdf.replace([np.inf, -np.inf], np.nan)
            wtdf = wtdf.fillna(0)
            self.wtdf_ = wtdf

            for cl in self.classes_:
                XB = self.df_canonical_.loc[cl].to_numpy().reshape(1, -1)
                w = wtdf.loc[cl].to_numpy()  # 1/std dev
                XB = XB * w  # w is for this class only
                XA = X * w  # w is for this class only
                cl_dist = distance.cdist(XA=XA,
                                         XB=XB,
                                         metric=self.metric)
                dist_arr_list.append(cl_dist)
            dist_arr = np.column_stack(dist_arr_list)

        self.canonical_dist_df_ = pd.DataFrame(data=dist_arr,
                                               index=np.arange(X.shape[0]),
                                               columns=self.classes_)
        self.canonical_dist_df_.columns = [f"{ind}_dist" for ind in self.canonical_dist_df_.columns]

        y_pred = self.classes_[dist_arr.argmin(axis=1)]

        if self.calculate_kde:
            # NEW: Rescale in terms of median likelihoods - calculate here
            scale_factors = np.exp([self.kde_dict_[cl].score_samples(self.df_canonical_.loc[cl].to_numpy().reshape(1, -1))[0] for cl in self.classes_])

            likelihood_arr = []
            for k in self.kde_dict_.keys():
                log_pdf = self.kde_dict_[k].score_samples(X)
                likelihood_val = np.exp(log_pdf)
                likelihood_arr.append(likelihood_val)
            self.likelihood_arr_ = np.array(likelihood_arr).T

            # NEW: Rescale in terms of median likelihoods - rescale here
            self.likelihood_arr_ = self.likelihood_arr_ / scale_factors

        if self.calculate_1d_dist:
            conf_cl = []
            Xdf_temp = pd.DataFrame(data=X, columns=self.df_canonical_.columns)
            for cl in self.classes_:
                sum_1d_dists = np.zeros(shape=(len(Xdf_temp)))
                for feat in Xdf_temp.columns:
                    dists = distance.cdist(
                        XA=np.zeros(shape=(1, 1)),
                        XB=(self.df_canonical_.loc[cl] - Xdf_temp)[feat].to_numpy().reshape(-1, 1),
                        metric=self.metric).ravel()
                    sum_1d_dists = sum_1d_dists + dists / self.df_std_.loc[cl, feat]
                confs = 1 / sum_1d_dists
                conf_cl.append(confs)
            conf_cl = np.array(conf_cl)
            self.conf_cl_ = conf_cl

        self.analyis_ = True

        return y_pred
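
One step worth unpacking: the KDE likelihoods above are divided by scale_factors, the density of each class's KDE evaluated at that class's own template, so a rescaled value of 1.0 means "as dense as at the template itself". A toy sketch of just that normalisation (the numbers are invented, not produced by the package):

import numpy as np

# invented KDE densities at each class template (one per class)
scale_factors = np.array([0.80, 0.20])
# invented densities of three samples under each class KDE (rows = samples, columns = classes)
likelihood_arr = np.array([[0.40, 0.010],
                           [0.08, 0.150],
                           [0.60, 0.002]])

rescaled = likelihood_arr / scale_factors  # 1.0 = as dense as at the class template
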
    def calculate_confidence(self, method: str="distance_inverse"):
        """
        Calculate the confidence for each prediction. The confidence is calculated based on the distance of each data point to the training template, and optionally the kernel density estimate and 1-dimensional distance.

        Parameters
        ----------
        method : str, optional
            The method to use for calculating confidence. Default is 'distance_inverse'.
        """
        check_is_fitted(self, 'is_fitted_')
        if not hasattr(self, 'analyis_'):
            raise ValueError("Use predict_and_analyse() instead of predict() for confidence calculation.")

        # Calculate confidence for each prediction
        if method == "distance_inverse":
            self.confidence_df_ = 1 / self.canonical_dist_df_
            self.confidence_df_.columns = [x.replace("_dist", "_conf") for x in self.confidence_df_.columns]

        elif method == "1d_distance_inverse":
            if not self.calculate_1d_dist:
                raise ValueError("method='1d_distance_inverse' is only valid if calculate_1d_dist is set to True")
            self.confidence_df_ = pd.DataFrame(
                data=self.conf_cl_.T,
                columns=[f"{x}_conf" for x in self.classes_])

        elif method == "kde_likelihood":
            if not self.calculate_kde:
                raise ValueError("method='kde_likelihood' is only valid if calculate_kde is set to True")

            self.confidence_df_ = pd.DataFrame(data=self.likelihood_arr_,
                                               columns=[f"{x}_conf" for x in self.kde_dict_.keys()])

        return self.confidence_df_.to_numpy()
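
Taken together, a minimal end-to-end usage sketch of the class defined above. The synthetic data and parameter choices are illustrative only; calculate_kde is switched off here so the sketch does not depend on the KDE configuration, and the classifier is imported directly from distclassipy.classifier:

import numpy as np
from distclassipy.classifier import DistanceMetricClassifier

rng = np.random.default_rng(42)
# two well-separated synthetic classes with 3 features each
X_train = np.vstack([rng.normal(0, 1, size=(50, 3)),
                     rng.normal(3, 1, size=(50, 3))])
y_train = np.array([0] * 50 + [1] * 50)
X_test = rng.normal(1.5, 1, size=(10, 3))

clf = DistanceMetricClassifier(metric="cityblock", scale_std=True, calculate_kde=False)
clf.fit(X_train, y_train)

y_pred = clf.predict_and_analyse(X_test)            # per-class distances stored in clf.canonical_dist_df_
conf = clf.calculate_confidence(method="distance_inverse")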