distclassipy 0.0.1__py3-none-any.whl

--- /dev/null
+++ distclassipy/__init__.py
@@ -0,0 +1,2 @@
+from .classifier import DistanceMetricClassifier  # Import the classifier from the classifier module
+from .distances import Distance  # Import the Distance class from the distances module
--- /dev/null
+++ distclassipy/classifier.py
@@ -0,0 +1,282 @@
+from typing import Callable, Union
+
+import numpy as np
+import pandas as pd
+from scipy.spatial import distance
+from .distances import Distance
+from sklearn.base import BaseEstimator, ClassifierMixin
+from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
+from sklearn.utils.multiclass import unique_labels
+from sklearn.preprocessing import StandardScaler, MinMaxScaler
+from sklearn.neighbors import KernelDensity
+
+class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
+    """A scikit-learn compatible distance metric classifier.
+
+    The classifier assigns each sample to the class whose training
+    template it is closest to under the chosen distance metric. The
+    template for each class is built from a statistical measure of the
+    training data (e.g. the feature-wise median or mean), and distances
+    can optionally be scaled in terms of standard deviations.
+
+    Parameters
+    ----------
+    metric : str or callable, optional
+        The distance metric to use. Default is 'euclidean'.
+    scale_std : bool, optional
+        If True, distances are scaled in terms of standard deviations.
+        Default is True.
+    canonical_stat : str, optional
+        The statistical measure ('median' or 'mean') used to build the
+        training template. Default is 'median'.
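+
+    Examples
+    --------
+    A minimal, illustrative sketch on synthetic data (the data and the
+    printed outputs are assumptions for demonstration, not outputs taken
+    from the package's own tests):
+
+    >>> import numpy as np
+    >>> from distclassipy import DistanceMetricClassifier
+    >>> X = np.array([[1.0, 2.0], [1.2, 1.8], [9.0, 8.0], [9.2, 8.4]])
+    >>> y = np.array(["a", "a", "b", "b"])
+    >>> clf = DistanceMetricClassifier(metric="euclidean")
+    >>> clf = clf.fit(X, y)
+    >>> clf.predict(np.array([[1.1, 1.9]]))
+    array(['a'], dtype='<U1')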
+ """
+    def __init__(self, metric: Union[str, Callable] = "euclidean",
+                 scale_std: bool = True, canonical_stat: str = "median",
+                 calculate_kde: bool = True, calculate_1d_dist: bool = True,
+                 n_jobs: int = -1):
+        """Initialize the classifier with the given parameters.
+
+        Parameters
+        ----------
+        metric : str or callable, optional
+            The distance metric to use. Default is 'euclidean'.
+        scale_std : bool, optional
+            If True, distances are scaled in terms of standard deviations.
+            Default is True.
+        canonical_stat : str, optional
+            The statistical measure ('median' or 'mean') used to build the
+            training template. Default is 'median'.
+        calculate_kde : bool, optional
+            If True, fit a per-class kernel density estimate. Default is
+            True.
+        calculate_1d_dist : bool, optional
+            If True, calculate per-feature (1-dimensional) distances.
+            Default is True.
+        n_jobs : int, optional
+            The number of jobs to run in parallel. Default is -1 (use all
+            processors).
+        """
+        self.metric = metric
+        self.scale_std = scale_std
+        self.canonical_stat = canonical_stat
+        self.calculate_kde = calculate_kde
+        self.calculate_1d_dist = calculate_1d_dist
+        self.n_jobs = n_jobs
+        self.distance_calculator = Distance()
+
+    def fit(self, X: np.ndarray, y: np.ndarray, feat_labels: list[str] = None):
+        """Fit the classifier to the training data.
+
+        Fitting builds the per-class training template and, optionally,
+        fits a kernel density estimate and prepares the per-feature
+        (1-dimensional) distance calculation.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The training input samples.
+        y : array-like of shape (n_samples,)
+            The target values.
+        feat_labels : list of str, optional
+            The feature labels. If not provided, default labels of the
+            form 'Feature_<i>' are used.
+
+        Returns
+        -------
+        self : DistanceMetricClassifier
+            The fitted classifier.
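+
+        Examples
+        --------
+        An illustrative sketch (assumes numpy as np and this class are in
+        scope; the exact values shown are indicative):
+
+        >>> X = np.array([[1.0, 2.0], [1.2, 1.8], [9.0, 8.0], [9.2, 8.4]])
+        >>> y = np.array(["a", "a", "b", "b"])
+        >>> clf = DistanceMetricClassifier().fit(X, y)
+        >>> clf.df_canonical_.loc["a"].tolist()  # class 'a' feature medians
+        [1.1, 1.9]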
+ """
+        X, y = check_X_y(X, y)
+        self.classes_ = unique_labels(y)
+        self.n_features_in_ = X.shape[1]
+
+        if feat_labels is None:
+            feat_labels = [f"Feature_{x}" for x in range(X.shape[1])]
+
+        # Build the per-class training template from the chosen statistic.
+        canonical_list = []
+        for cur_class in self.classes_:
+            cur_X = X[y == cur_class]
+            if self.canonical_stat == "median":
+                canonical_list.append(np.median(cur_X, axis=0).ravel())
+            elif self.canonical_stat == "mean":
+                canonical_list.append(np.mean(cur_X, axis=0).ravel())
+        self.df_canonical_ = pd.DataFrame(
+            data=np.array(canonical_list),
+            index=self.classes_,
+            columns=feat_labels)
+
+        # Per-class standard deviations are needed both for scaled
+        # distances and for the per-feature (1-d) distances.
+        if self.scale_std or self.calculate_1d_dist:
+            std_list = []
+            for cur_class in self.classes_:
+                cur_X = X[y == cur_class]
+                # ddof=1 because we are dealing with a sample.
+                # See: https://stackoverflow.com/a/46083501/10743245
+                std_list.append(np.std(cur_X, axis=0, ddof=1).ravel())
+            self.df_std_ = pd.DataFrame(
+                data=np.array(std_list),
+                index=self.classes_,
+                columns=feat_labels)
+
+        if self.calculate_kde:
+            self.set_metric_fn()
+            self.kde_dict_ = {}
+
+            for cl in self.classes_:
+                subX = X[y == cl]
+                # TODO: when the metric is natively supported by
+                # KernelDensity, pass it directly (metric=self.metric) to
+                # save computational time; the pyfunc wrapper below also
+                # supports metrics from distances.Distance.
+                kde = KernelDensity(bandwidth='scott', metric='pyfunc',
+                                    metric_params={"func": self.metric_fn_})
+                kde.fit(subX)
+                self.kde_dict_[cl] = kde
+
+        self.is_fitted_ = True
+
+        return self
+
+    def predict(self, X: np.ndarray):
+        """Predict class labels for the provided data.
+
+        Each sample is assigned the class whose training template is
+        nearest under the configured distance metric.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The input samples.
+
+        Returns
+        -------
+        y_pred : ndarray of shape (n_samples,)
+            The predicted class labels.
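+
+        Examples
+        --------
+        An illustrative sketch (assumes numpy as np and this class are in
+        scope; output indicative):
+
+        >>> X = np.array([[1.0, 2.0], [1.2, 1.8], [9.0, 8.0], [9.2, 8.4]])
+        >>> y = np.array(["a", "a", "b", "b"])
+        >>> clf = DistanceMetricClassifier().fit(X, y)
+        >>> clf.predict(np.array([[8.9, 8.1]]))
+        array(['b'], dtype='<U1')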
+ """
+        check_is_fitted(self, 'is_fitted_')
+        X = check_array(X)
+
+        if not self.scale_std:
+            dist_arr = distance.cdist(
+                XA=X,
+                XB=self.df_canonical_.to_numpy(),
+                metric=self.metric)
+        else:
+            dist_arr_list = []
+            # Weight each feature by the inverse of its per-class standard
+            # deviation; zero-spread features get zero weight.
+            wtdf = 1 / self.df_std_
+            wtdf = wtdf.replace([np.inf, -np.inf], np.nan)
+            wtdf = wtdf.fillna(0)
+
+            for cl in self.classes_:
+                XB = self.df_canonical_.loc[cl].to_numpy().reshape(1, -1)
+                w = wtdf.loc[cl].to_numpy()  # 1 / (std dev of this class)
+                XB = XB * w  # w is for this class only
+                XA = X * w  # w is for this class only
+                cl_dist = distance.cdist(XA=XA, XB=XB, metric=self.metric)
+                dist_arr_list.append(cl_dist)
+            dist_arr = np.column_stack(dist_arr_list)
+
+        y_pred = self.classes_[dist_arr.argmin(axis=1)]
+        return y_pred
+
+    def set_metric_fn(self):
+        """Resolve the configured metric to a callable.
+
+        If the metric is a string, look up a function of that name first
+        in scipy.spatial.distance and then in distances.Distance. If the
+        metric is already a callable, use it directly.
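+
+        Examples
+        --------
+        An illustrative sketch of a custom callable metric (assumed
+        signature f(u, v) -> float, following scipy.spatial.distance
+        conventions):
+
+        >>> import numpy as np
+        >>> def manhattan(u, v):
+        ...     return float(np.abs(np.asarray(u) - np.asarray(v)).sum())
+        >>> clf = DistanceMetricClassifier(metric=manhattan)
+        >>> clf.set_metric_fn()
+        >>> clf.metric_fn_ is manhattan
+        True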
+ """
+        if isinstance(self.metric, str):
+            if hasattr(distance, self.metric):
+                self.metric_fn_ = getattr(distance, self.metric)
+            elif hasattr(self.distance_calculator, self.metric):
+                self.metric_fn_ = getattr(self.distance_calculator, self.metric)
+            else:
+                raise ValueError(
+                    f"{self.metric} metric not found. Either pass the name "
+                    "of a metric in scipy.spatial.distance or "
+                    "distances.Distance, or pass a metric function directly.")
+        else:
+            self.metric_fn_ = self.metric
+
+    def predict_and_analyse(self, X: np.ndarray):
+        """Predict class labels and store analysis artefacts.
+
+        In addition to predicting, this computes the distance of each
+        sample to every training template and, optionally, the kernel
+        density likelihoods and per-feature (1-dimensional) distances
+        used by calculate_confidence().
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The input samples.
+
+        Returns
+        -------
+        y_pred : ndarray of shape (n_samples,)
+            The predicted class labels.
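+
+        Examples
+        --------
+        An illustrative sketch (assumes numpy as np and this class are in
+        scope; column names follow the '<class>_dist' pattern built below):
+
+        >>> X = np.array([[1.0, 2.0], [1.2, 1.8], [9.0, 8.0], [9.2, 8.4]])
+        >>> y = np.array(["a", "a", "b", "b"])
+        >>> clf = DistanceMetricClassifier().fit(X, y)
+        >>> y_pred = clf.predict_and_analyse(np.array([[1.0, 2.0]]))
+        >>> list(clf.canonical_dist_df_.columns)
+        ['a_dist', 'b_dist']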
+ """
+        check_is_fitted(self, 'is_fitted_')
+        X = check_array(X)
+
+        if not self.scale_std:
+            dist_arr = distance.cdist(
+                XA=X,
+                XB=self.df_canonical_.to_numpy(),
+                metric=self.metric)
+        else:
+            dist_arr_list = []
+            wtdf = 1 / self.df_std_
+            wtdf = wtdf.replace([np.inf, -np.inf], np.nan)
+            wtdf = wtdf.fillna(0)
+            self.wtdf_ = wtdf
+
+            for cl in self.classes_:
+                XB = self.df_canonical_.loc[cl].to_numpy().reshape(1, -1)
+                w = wtdf.loc[cl].to_numpy()  # 1 / (std dev of this class)
+                XB = XB * w  # w is for this class only
+                XA = X * w  # w is for this class only
+                cl_dist = distance.cdist(XA=XA, XB=XB, metric=self.metric)
+                dist_arr_list.append(cl_dist)
+            dist_arr = np.column_stack(dist_arr_list)
+
+        self.canonical_dist_df_ = pd.DataFrame(
+            data=dist_arr,
+            index=np.arange(X.shape[0]),
+            columns=[f"{cl}_dist" for cl in self.classes_])
+
+        y_pred = self.classes_[dist_arr.argmin(axis=1)]
+
+        if self.calculate_kde:
+            # Rescale likelihoods so that each class's likelihood is 1 at
+            # its own training template, making values comparable across
+            # classes: first evaluate each KDE at its own template ...
+            scale_factors = np.exp([
+                self.kde_dict_[cl].score_samples(
+                    self.df_canonical_.loc[cl].to_numpy().reshape(1, -1))[0]
+                for cl in self.classes_])
+
+            likelihood_arr = []
+            for k in self.kde_dict_.keys():
+                log_pdf = self.kde_dict_[k].score_samples(X)
+                likelihood_val = np.exp(log_pdf)
+                likelihood_arr.append(likelihood_val)
+            self.likelihood_arr_ = np.array(likelihood_arr).T
+
+            # ... then divide each column by its template likelihood.
+            self.likelihood_arr_ = self.likelihood_arr_ / scale_factors
+
+        if self.calculate_1d_dist:
+            conf_cl = []
+            Xdf_temp = pd.DataFrame(data=X, columns=self.df_canonical_.columns)
+            for cl in self.classes_:
+                # Sum the per-feature distances to the template, each
+                # scaled by that feature's standard deviation for this class.
+                sum_1d_dists = np.zeros(shape=(len(Xdf_temp)))
+                for feat in Xdf_temp.columns:
+                    dists = distance.cdist(
+                        XA=np.zeros(shape=(1, 1)),
+                        XB=(self.df_canonical_.loc[cl] - Xdf_temp)[feat].to_numpy().reshape(-1, 1),
+                        metric=self.metric).ravel()
+                    sum_1d_dists = sum_1d_dists + dists / self.df_std_.loc[cl, feat]
+                confs = 1 / sum_1d_dists
+                conf_cl.append(confs)
+            self.conf_cl_ = np.array(conf_cl)
+
+        self.analysis_ = True
+
+        return y_pred
+
+    def calculate_confidence(self, method: str = "distance_inverse"):
+        """Calculate a confidence value for each prediction.
+
+        The confidence is derived from the stored analysis: the inverse
+        distance to each training template, the inverse of the summed
+        per-feature distances, or the kernel density likelihood.
+
+        Parameters
+        ----------
+        method : str, optional
+            One of 'distance_inverse', '1d_distance_inverse' or
+            'kde_likelihood'. Default is 'distance_inverse'.
+
+        Returns
+        -------
+        ndarray of shape (n_samples, n_classes)
+            The per-class confidence values.
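+
+        Examples
+        --------
+        An illustrative sketch (assumes numpy as np and this class are in
+        scope; predict_and_analyse() must be called first):
+
+        >>> X = np.array([[1.0, 2.0], [1.2, 1.8], [9.0, 8.0], [9.2, 8.4]])
+        >>> y = np.array(["a", "a", "b", "b"])
+        >>> clf = DistanceMetricClassifier().fit(X, y)
+        >>> _ = clf.predict_and_analyse(np.array([[1.0, 2.0]]))
+        >>> clf.calculate_confidence(method="distance_inverse").shape
+        (1, 2)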
+ """
+        check_is_fitted(self, 'is_fitted_')
+        if not hasattr(self, 'analysis_'):
+            raise ValueError("Use predict_and_analyse() instead of predict() "
+                             "for confidence calculation.")
+
+        # Calculate confidence for each prediction.
+        if method == "distance_inverse":
+            self.confidence_df_ = 1 / self.canonical_dist_df_
+            self.confidence_df_.columns = [
+                x.replace("_dist", "_conf") for x in self.confidence_df_.columns]
+
+        elif method == "1d_distance_inverse":
+            if not self.calculate_1d_dist:
+                raise ValueError("method='1d_distance_inverse' is only valid "
+                                 "if calculate_1d_dist is set to True")
+            self.confidence_df_ = pd.DataFrame(
+                data=self.conf_cl_.T,
+                columns=[f"{x}_conf" for x in self.classes_])
+
+        elif method == "kde_likelihood":
+            if not self.calculate_kde:
+                raise ValueError("method='kde_likelihood' is only valid if "
+                                 "calculate_kde is set to True")
+            self.confidence_df_ = pd.DataFrame(
+                data=self.likelihood_arr_,
+                columns=[f"{x}_conf" for x in self.kde_dict_.keys()])
+
+        else:
+            raise ValueError(f"Unknown confidence method: {method}")
+
+        return self.confidence_df_.to_numpy()