distclassipy 0.0.3__tar.gz → 0.0.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: distclassipy
- Version: 0.0.3
+ Version: 0.0.5
  Summary: A python package for a distance-based classifier which can use several different distance metrics.
  Author-email: Siddharth Chaini <sidchaini@gmail.com>
  License: GNU GENERAL PUBLIC LICENSE
@@ -685,13 +685,12 @@ Classifier: Intended Audience :: Developers
  Classifier: Intended Audience :: Education
  Classifier: Intended Audience :: Science/Research
  Classifier: License :: OSI Approved :: MIT License
- Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
  Classifier: Programming Language :: Python :: 3.13
  Classifier: Operating System :: OS Independent
- Requires-Python: >=3.9
+ Requires-Python: >=3.10
  Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: joblib>=1.3.2
@@ -0,0 +1,14 @@
+ """
+ A module for using distance metrics for classification.
+
+ Classes:
+     DistanceMetricClassifier - A classifier that uses a specified distance metric for classification.
+     Distance - A class that provides various distance metrics for use in classification.
+ """
+
+ from .classifier import (
+     DistanceMetricClassifier,
+ )  # Importing the DistanceMetricClassifier from the classifier module
+ from .distances import (
+     Distance,
+ )  # Importing the Distance class from the distances module
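The hunk above appears to be the package's new top-level __init__.py: both public classes are now re-exported at the package root, so downstream code can import them directly. A minimal sketch, assuming only the package name from the metadata above:

    # Names re-exported by the new __init__.py shown above (distclassipy 0.0.5)
    from distclassipy import Distance, DistanceMetricClassifier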
@@ -5,28 +5,31 @@ from .distances import Distance
  from sklearn.base import BaseEstimator, ClassifierMixin
  from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
  from sklearn.utils.multiclass import unique_labels
- from sklearn.preprocessing import StandardScaler, MinMaxScaler
  from sklearn.neighbors import KernelDensity
+ from typing import Callable
+

  class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
      """
-     This class implements a distance metric classifier based on scikit-learn. The classifier uses a specified distance metric to classify data points based on their distance to a training template. The training template is created using a specified statistical measure (e.g., median or mean). The classifier can be scaled in terms of standard deviations.
-
-     Parameters
-     ----------
-     canonical_stat : str, optional
-         The statistical measure to use for creating the training template. Default is 'median'.
-     metric : str or callable, optional
-         The distance metric to use. Default is 'euclidean'.
-     scale_std : bool, optional
-         If True, classifier is scaled in terms of standard deviations. Default is True.
+     Implement a distance metric classifier based on scikit-learn.
+
+     This classifier uses a specified distance metric to classify data points based on their distance to a training template. The training template is created using a specified statistical measure (e.g., median or mean). The classifier can be scaled in terms of standard deviations.
+
+     Attributes:
      """
-     def __init__(self, metric: str or callable="euclidean", scale_std: bool=True,
-                  canonical_stat: str="median", calculate_kde: bool=True,
-                  calculate_1d_dist: bool=True, n_jobs: int=-1):
+
+     def __init__(
+         self,
+         metric: str | Callable = "euclidean",
+         scale_std: bool = True,
+         canonical_stat: str = "median",
+         calculate_kde: bool = True,
+         calculate_1d_dist: bool = True,
+         n_jobs: int = -1,
+     ):
          """
-         Initialize the classifier with the given parameters.
-
+         Initialize the classifier with specified parameters.
+
          Parameters
          ----------
          metric : str or callable, optional
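Beyond the formatting changes, this hunk fixes a genuine typing bug: the 0.0.3 annotation "metric: str or callable" is evaluated as a boolean expression at function-definition time, and "str or callable" is simply str, so the callable option was never part of the type. The 0.0.5 replacement "str | Callable" is a real PEP 604 union, which requires Python 3.10 and thus matches the Requires-Python bump from >=3.9 to >=3.10 in the metadata hunk above. A standalone sketch of the difference (the name MetricType is illustrative, not from the package):

    from typing import Callable

    # "or" on two truthy objects returns the first operand, so this is just str:
    assert (str or callable) is str

    # PEP 604 union syntax (Python >= 3.10) builds a genuine union type:
    MetricType = str | Callable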
@@ -50,10 +53,11 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
          self.n_jobs = n_jobs
          self.distance_calculator = Distance()

-     def fit(self, X: np.array, y: np.array, feat_labels: list[str]=None):
-         """
-         Fit the classifier to the data. This involves creating the training template and optionally calculating the kernel density estimate and 1-dimensional distance.
-
+     def fit(self, X: np.array, y: np.array, feat_labels: list[str] = None):
+         """Fit the classifier to the data.
+
+         This involves creating the training template and optionally calculating the kernel density estimate and 1-dimensional distance.
+
          Parameters
          ----------
          X : array-like of shape (n_samples, n_features)
@@ -66,10 +70,10 @@
          X, y = check_X_y(X, y)
          self.classes_ = unique_labels(y)
          self.n_features_in_ = X.shape[1]
-
+
          if feat_labels is None:
              feat_labels = [f"Feature_{x}" for x in range(X.shape[1])]
-
+
          canonical_list = []
          for cur_class in self.classes_:
              cur_X = X[np.argwhere(y == cur_class)]
@@ -78,9 +82,8 @@
              elif self.canonical_stat == "mean":
                  canonical_list.append(np.mean(cur_X, axis=0).ravel())
          df_canonical = pd.DataFrame(
-             data=np.array(canonical_list),
-             index=self.classes_,
-             columns=feat_labels)
+             data=np.array(canonical_list), index=self.classes_, columns=feat_labels
+         )
          self.df_canonical_ = df_canonical

          if self.scale_std:
@@ -88,48 +91,50 @@
              for cur_class in self.classes_:
                  cur_X = X[y == cur_class]
                  # Note we're using ddof=1 because we're dealing with a sample. See more: https://stackoverflow.com/a/46083501/10743245
-                 std_list.append(np.std(cur_X, axis=0, ddof=1).ravel())
+                 std_list.append(np.std(cur_X, axis=0, ddof=1).ravel())
              df_std = pd.DataFrame(
-                 data=np.array(std_list),
-                 index=self.classes_,
-                 columns=feat_labels)
+                 data=np.array(std_list), index=self.classes_, columns=feat_labels
+             )
              self.df_std_ = df_std
-
+
          if self.calculate_kde:
              self.set_metric_fn()
              self.kde_dict_ = {}
-
-
+
              for cl in self.classes_:
                  subX = X[y == cl]
                  # Implement the following in an if-else to save computational time.
                  # kde = KernelDensity(bandwidth='scott', metric=self.metric)
                  # kde.fit(subX)
-                 kde = KernelDensity(bandwidth='scott', metric='pyfunc', metric_params={"func": self.metric_fn_})
+                 kde = KernelDensity(
+                     bandwidth="scott",
+                     metric="pyfunc",
+                     metric_params={"func": self.metric_fn_},
+                 )
                  kde.fit(subX)
                  self.kde_dict_[cl] = kde

          self.is_fitted_ = True
-
+
          return self

      def predict(self, X: np.array):
-         """
-         Predict the class labels for the provided data. The prediction is based on the distance of each data point to the training template.
-
+         """Predict the class labels for the provided data.
+
+         The prediction is based on the distance of each data point to the training template.
+
          Parameters
          ----------
          X : array-like of shape (n_samples, n_features)
              The input samples.
          """
-         check_is_fitted(self, 'is_fitted_')
+         check_is_fitted(self, "is_fitted_")
          X = check_array(X)
-
+
          if not self.scale_std:
              dist_arr = distance.cdist(
-                 XA=X,
-                 XB=self.df_canonical_.to_numpy(),
-                 metric=self.metric)
+                 XA=X, XB=self.df_canonical_.to_numpy(), metric=self.metric
+             )

          else:
              dist_arr_list = []
@@ -142,49 +147,50 @@
                  w = wtdf.loc[cl].to_numpy() # 1/std dev
                  XB = XB * w # w is for this class only
                  XA = X * w # w is for this class only
-                 cl_dist = distance.cdist(XA=XA,
-                                          XB=XB,
-                                          metric=self.metric)
+                 cl_dist = distance.cdist(XA=XA, XB=XB, metric=self.metric)
                  dist_arr_list.append(cl_dist)
              dist_arr = np.column_stack(dist_arr_list)

          y_pred = self.classes_[dist_arr.argmin(axis=1)]
          return y_pred
-
+
      def set_metric_fn(self):
          """
-         Set the metric function. If the metric is a string, the function will look for a corresponding function in scipy.spatial.distance or distances.Distance. If the metric is a function, it will be used directly.
+         Set the metric function based on the specified metric.
+
+         If the metric is a string, the function will look for a corresponding function in scipy.spatial.distance or distances.Distance. If the metric is a function, it will be used directly.
          """
          if not callable(self.metric) or isinstance(self.metric, str):
              if hasattr(distance, self.metric):
                  self.metric_fn_ = getattr(distance, self.metric)
-             elif hasattr(self.distance_calculator, self.metric):
+             elif hasattr(self.distance_calculator, self.metric):
                  self.metric_fn_ = getattr(self.distance_calculator, self.metric)
              else:
-                 raise ValueError(f"{self.metric} metric not found. Either pass a string of the name of a metric in scipy.spatial.distance or distances.Distance, or, pass a metric function directly.")
+                 raise ValueError(
+                     f"{self.metric} metric not found. Either pass a string of the name of a metric in scipy.spatial.distance or distances.Distance, or, pass a metric function directly."
+                 )

          else:
              self.metric_fn_ = self.metric

-
-
      def predict_and_analyse(self, X: np.array):
          """
-         Predict the class labels for the provided data and perform analysis. The analysis includes calculating the distance of each data point to the training template, and optionally calculating the kernel density estimate and 1-dimensional distance.
-
+         Predict the class labels for the provided data and perform analysis.
+
+         The analysis includes calculating the distance of each data point to the training template, and optionally calculating the kernel density estimate and 1-dimensional distance.
+
          Parameters
          ----------
          X : array-like of shape (n_samples, n_features)
              The input samples.
          """
-         check_is_fitted(self, 'is_fitted_')
+         check_is_fitted(self, "is_fitted_")
          X = check_array(X)
-
+
          if not self.scale_std:
              dist_arr = distance.cdist(
-                 XA=X,
-                 XB=self.df_canonical_.to_numpy(),
-                 metric=self.metric)
+                 XA=X, XB=self.df_canonical_.to_numpy(), metric=self.metric
+             )

          else:
              dist_arr_list = []
198
204
  w = wtdf.loc[cl].to_numpy() # 1/std dev
199
205
  XB = XB * w # w is for this class only
200
206
  XA = X * w # w is for this class only
201
- cl_dist = distance.cdist(XA=XA,
202
- XB=XB,
203
- metric=self.metric)
207
+ cl_dist = distance.cdist(XA=XA, XB=XB, metric=self.metric)
204
208
  dist_arr_list.append(cl_dist)
205
209
  dist_arr = np.column_stack(dist_arr_list)
206
210
 
207
- self.canonical_dist_df_ = pd.DataFrame(data=dist_arr,
208
- index=np.arange(X.shape[0]),
209
- columns=self.classes_)
210
- self.canonical_dist_df_.columns = [f"{ind}_dist" for ind in self.canonical_dist_df_.columns]
211
-
211
+ self.canonical_dist_df_ = pd.DataFrame(
212
+ data=dist_arr, index=np.arange(X.shape[0]), columns=self.classes_
213
+ )
214
+ self.canonical_dist_df_.columns = [
215
+ f"{ind}_dist" for ind in self.canonical_dist_df_.columns
216
+ ]
217
+
212
218
  y_pred = self.classes_[dist_arr.argmin(axis=1)]
213
-
219
+
214
220
  if self.calculate_kde:
215
221
  # NEW: Rescale in terms of median likelihoods - calculate here
216
- scale_factors = np.exp([self.kde_dict_[cl].score_samples(self.df_canonical_.loc[cl].to_numpy().reshape(1, -1))[0] for cl in self.classes_])
217
-
222
+ scale_factors = np.exp(
223
+ [
224
+ self.kde_dict_[cl].score_samples(
225
+ self.df_canonical_.loc[cl].to_numpy().reshape(1, -1)
226
+ )[0]
227
+ for cl in self.classes_
228
+ ]
229
+ )
230
+
218
231
  likelihood_arr = []
219
232
  for k in self.kde_dict_.keys():
220
233
  log_pdf = self.kde_dict_[k].score_samples(X)
221
234
  likelihood_val = np.exp(log_pdf)
222
235
  likelihood_arr.append(likelihood_val)
223
236
  self.likelihood_arr_ = np.array(likelihood_arr).T
224
-
237
+
225
238
  # NEW: Rescale in terms of median likelihoods - rescale here
226
239
  self.likelihood_arr_ = self.likelihood_arr_ / scale_factors
227
240
 
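The scale_factors block added in this hunk is the substantive change to predict_and_analyse: each class's KDE likelihood is divided by the likelihood of that class's own canonical (template) point, so a tightly clustered class with a high density peak no longer automatically outscores a broad one. A standalone numeric sketch of that normalization; the data, names, and default KDE bandwidth here are illustrative, not the package's:

    import numpy as np
    from sklearn.neighbors import KernelDensity

    rng = np.random.default_rng(0)
    samples = {
        "broad": rng.normal(0.0, 2.0, size=(300, 2)),  # low peak density
        "tight": rng.normal(6.0, 0.3, size=(300, 2)),  # high peak density
    }
    kdes = {c: KernelDensity().fit(X) for c, X in samples.items()}
    templates = {c: np.median(X, axis=0) for c, X in samples.items()}

    # Scale factor = likelihood of each class template under its own KDE.
    scale = {
        c: np.exp(kde.score_samples(templates[c][None, :])[0])
        for c, kde in kdes.items()
    }

    x = templates["broad"][None, :]  # a point sitting on the broad template
    raw = {c: np.exp(kde.score_samples(x)[0]) for c, kde in kdes.items()}
    rescaled = {c: raw[c] / scale[c] for c in kdes}
    print(raw)       # raw likelihoods live on very different scales
    print(rescaled)  # "broad" is ~1 here; values are now comparable across classes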
@@ -232,51 +245,64 @@
                  sum_1d_dists = np.zeros(shape=(len(Xdf_temp)))
                  for feat in Xdf_temp.columns:
                      dists = distance.cdist(
-                         XA=np.zeros(shape=(1, 1)),
-                         XB=(self.df_canonical_.loc[cl] - Xdf_temp)[feat].to_numpy().reshape(-1, 1),
-                         metric=self.metric).ravel()
+                         XA=np.zeros(shape=(1, 1)),
+                         XB=(self.df_canonical_.loc[cl] - Xdf_temp)[feat]
+                         .to_numpy()
+                         .reshape(-1, 1),
+                         metric=self.metric,
+                     ).ravel()
                      sum_1d_dists = sum_1d_dists + dists / self.df_std_.loc[cl, feat]
                  confs = 1 / sum_1d_dists
                  conf_cl.append(confs)
              conf_cl = np.array(conf_cl)
              self.conf_cl_ = conf_cl

-
          self.analyis_ = True
-
+
          return y_pred

-
-     def calculate_confidence(self, method: str="distance_inverse"):
+     def calculate_confidence(self, method: str = "distance_inverse"):
          """
-         Calculate the confidence for each prediction. The confidence is calculated based on the distance of each data point to the training template, and optionally the kernel density estimate and 1-dimensional distance.
-
+         Calculate the confidence for each prediction.
+
+         The confidence is calculated based on the distance of each data point to the training template, and optionally the kernel density estimate and 1-dimensional distance.
+
          Parameters
          ----------
          method : str, optional
              The method to use for calculating confidence. Default is 'distance_inverse'.
          """
-         check_is_fitted(self, 'is_fitted_')
-         if not hasattr(self, 'analyis_'):
-             raise ValueError("Use predict_and_analyse() instead of predict() for confidence calculation.")
-
+         check_is_fitted(self, "is_fitted_")
+         if not hasattr(self, "analyis_"):
+             raise ValueError(
+                 "Use predict_and_analyse() instead of predict() for confidence calculation."
+             )
+
          # Calculate confidence for each prediction
          if method == "distance_inverse":
              self.confidence_df_ = 1 / self.canonical_dist_df_
-             self.confidence_df_.columns = [x.replace("_dist", "_conf") for x in self.confidence_df_.columns]
-
+             self.confidence_df_.columns = [
+                 x.replace("_dist", "_conf") for x in self.confidence_df_.columns
+             ]
+
          elif method == "1d_distance_inverse":
              if not self.calculate_1d_dist:
-                 raise ValueError("method='1d_distance_inverse' is only valid if calculate_1d_dist is set to True")
+                 raise ValueError(
+                     "method='1d_distance_inverse' is only valid if calculate_1d_dist is set to True"
+                 )
              self.confidence_df_ = pd.DataFrame(
-                 data=self.conf_cl_.T,
-                 columns=[f"{x}_conf" for x in self.classes_])
-
+                 data=self.conf_cl_.T, columns=[f"{x}_conf" for x in self.classes_]
+             )
+
          elif method == "kde_likelihood":
              if not self.calculate_kde:
-                 raise ValueError("method='kde_likelihood' is only valid if calculate_kde is set to True")
-
-             self.confidence_df_ = pd.DataFrame(data=self.likelihood_arr_,
-                                                columns=[f"{x}_conf" for x in self.kde_dict_.keys()])
-
-         return self.confidence_df_.to_numpy()
+                 raise ValueError(
+                     "method='kde_likelihood' is only valid if calculate_kde is set to True"
+                 )
+
+             self.confidence_df_ = pd.DataFrame(
+                 data=self.likelihood_arr_,
+                 columns=[f"{x}_conf" for x in self.kde_dict_.keys()],
+             )
+
+         return self.confidence_df_.to_numpy()
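Taken together, the 0.0.5 surface shown in this diff suggests the following end-to-end usage. A hedged sketch on synthetic data, assuming only the names and signatures visible above:

    import numpy as np
    from distclassipy import DistanceMetricClassifier

    rng = np.random.default_rng(42)
    X = np.vstack([rng.normal(0, 1, (50, 3)), rng.normal(4, 1, (50, 3))])
    y = np.array([0] * 50 + [1] * 50)

    clf = DistanceMetricClassifier(
        metric="cityblock",       # a scipy.spatial.distance name, a distances.Distance method, or a callable
        scale_std=True,           # weight features by 1/std within each candidate class
        canonical_stat="median",  # per-class template statistic: "median" or "mean"
        calculate_kde=True,
        calculate_1d_dist=True,
    )
    clf.fit(X, y, feat_labels=["f0", "f1", "f2"])

    # predict() gives labels only; predict_and_analyse() also fills the
    # analysis attributes that calculate_confidence() requires.
    y_pred = clf.predict_and_analyse(X)
    conf = clf.calculate_confidence(method="kde_likelihood")
    print(y_pred[:5])
    print(conf[:2])  # one confidence column per class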