distclassipy 0.1.6a0__tar.gz → 0.2.0a0__tar.gz

This diff compares the contents of two publicly released versions of the package as they appear in their public registry; it is provided for informational purposes only.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: distclassipy
- Version: 0.1.6a0
+ Version: 0.2.0a0
  Summary: A python package for a distance-based classifier which can use several different distance metrics.
  Author-email: Siddharth Chaini <sidchaini@gmail.com>
  License: GNU GENERAL PUBLIC LICENSE
@@ -694,7 +694,7 @@ Requires-Python: >=3.10
  Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: joblib>=1.3.2
- Requires-Dist: numpy<2,>=1.25.2
+ Requires-Dist: numpy>=1.25.2
  Requires-Dist: pandas>=2.0.3
  Requires-Dist: scikit-learn>=1.2.2
 
@@ -740,9 +740,9 @@ X, y = make_classification(
      random_state=0,
      shuffle=False,
  )
- clf = dcpy.DistanceMetricClassifier(metric="canberra")
+ clf = dcpy.DistanceMetricClassifier()
  clf.fit(X, y)
- print(clf.predict([[0, 0, 0, 0]]))
+ print(clf.predict([[0, 0, 0, 0]], metric="canberra"))
  ```
 
  ## Features
@@ -40,9 +40,9 @@ X, y = make_classification(
      random_state=0,
      shuffle=False,
  )
- clf = dcpy.DistanceMetricClassifier(metric="canberra")
+ clf = dcpy.DistanceMetricClassifier()
  clf.fit(X, y)
- print(clf.predict([[0, 0, 0, 0]]))
+ print(clf.predict([[0, 0, 0, 0]], metric="canberra"))
  ```
 
  ## Features
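For context, here is a minimal end-to-end sketch of the 0.2.0a0 usage that the two README hunks above converge on: the metric is now supplied per `predict` call instead of in the constructor. The `make_classification` arguments that are not visible in the diff context are assumed for illustration (chosen so a 4-feature query is valid).

```python
import distclassipy as dcpy
from sklearn.datasets import make_classification

# Assumed setup: 4 features so the 4-element query below is valid.
X, y = make_classification(
    n_samples=1000,
    n_features=4,
    n_informative=2,
    n_redundant=0,
    random_state=0,
    shuffle=False,
)

clf = dcpy.DistanceMetricClassifier()  # no metric at construction time in 0.2.0a0
clf.fit(X, y)

# The metric is chosen at prediction time.
print(clf.predict([[0, 0, 0, 0]], metric="canberra"))
```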
@@ -25,4 +25,4 @@ along with this program. If not, see <https://www.gnu.org/licenses/>.
  from .classifier import DistanceMetricClassifier  # noqa
  from .distances import Distance  # noqa
 
- __version__ = "0.1.6a0"
+ __version__ = "0.2.0a0"
@@ -19,7 +19,6 @@ You should have received a copy of the GNU General Public License
  along with this program. If not, see <https://www.gnu.org/licenses/>.
  """
 
- import warnings
  from typing import Callable
 
  import numpy as np
@@ -29,7 +28,6 @@ import pandas as pd
  import scipy
 
  from sklearn.base import BaseEstimator, ClassifierMixin
- from sklearn.neighbors import KernelDensity
  from sklearn.utils.multiclass import unique_labels
  from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
 
@@ -42,6 +40,52 @@ METRIC_SOURCES_ = {
  }
 
 
+ def initialize_metric_function(metric):
+     """Set the metric function based on the provided metric.
+
+     If the metric is a string, the function will look for a corresponding
+     function in scipy.spatial.distance or distances.Distance. If the metric
+     is a function, it will be used directly.
+     """
+     if callable(metric):
+         metric_fn_ = metric
+         metric_arg_ = metric
+
+     elif isinstance(metric, str):
+         metric_str_lowercase = metric.lower()
+         metric_found = False
+         for package_str, source in METRIC_SOURCES_.items():
+
+             # Don't use scipy for jaccard as their implementation only works with
+             # booleans - use custom jaccard instead
+             if (
+                 package_str == "scipy.spatial.distance"
+                 and metric_str_lowercase == "jaccard"
+             ):
+                 continue
+
+             if hasattr(source, metric_str_lowercase):
+                 metric_fn_ = getattr(source, metric_str_lowercase)
+                 metric_found = True
+
+                 # Use the string as an argument if it belongs to scipy as it is
+                 # optimized
+                 metric_arg_ = (
+                     metric if package_str == "scipy.spatial.distance" else metric_fn_
+                 )
+                 break
+         if not metric_found:
+             raise ValueError(
+                 f"{metric} metric not found. Please pass a string of the "
+                 "name of a metric in scipy.spatial.distance or "
+                 "distances.Distance, or pass a metric function directly. For a "
+                 "list of available metrics, see: "
+                 "https://sidchaini.github.io/DistClassiPy/distances.html or "
+                 "https://docs.scipy.org/doc/scipy/reference/spatial.distance.html"
+             )
+     return metric_fn_, metric_arg_
+
+
  class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
      """A distance-based classifier that supports different distance metrics.
 
@@ -55,8 +99,6 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
 
      Parameters
      ----------
-     metric : str or callable, default="euclidean"
-         The distance metric to use for calculating the distance between features.
      scale : bool, default=True
          Whether to scale the distance between the test object and the centroid for a
          class in the feature space. If True, the data will be scaled based on the
@@ -71,47 +113,15 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
 
          .. versionadded:: 0.1.0
 
-     calculate_kde : bool, default=False
-         Whether to calculate a kernel density estimate based confidence parameter.
-         .. deprecated:: 0.2.0
-             This parameter will be removed in a future version and only the
-             distance confidence parameter will be available.
-     calculate_1d_dist : bool, default=False
-         Whether to calculate the 1-dimensional distance based confidence parameter.
-         .. deprecated:: 0.2.0
-             This parameter will be removed in a future version and only the
-             distance confidence parameter will be available.
-         Whether to calculate the 1-dimensional distance based confidence parameter.
 
      Attributes
      ----------
-     metric : str or callable
-         The distance metric used for classification.
      scale : bool
          Indicates whether the data is scaled.
      central_stat : str
          The statistic used for calculating central tendency.
      dispersion_stat : str
          The statistic used for calculating dispersion.
-     calculate_kde : bool
-         Indicates whether a kernel density estimate is calculated.
-         .. deprecated:: 0.2.0
-             This parameter will be removed in a future version.
-     calculate_1d_dist : bool
-         Indicates whether 1-dimensional distances are calculated.
-         .. deprecated:: 0.2.0
-             This parameter will be removed in a future version.
-
-     See Also
-     --------
-     scipy.spatial.dist : Other distance metrics provided in SciPy
-     distclassipy.Distance : Distance metrics included with DistClassiPy
-
-     Notes
-     -----
-     If using distance metrics supported by SciPy, it is desirable to pass a string,
-     which allows SciPy to use an optimized C version of the code instead of the slower
-     Python version.
 
      References
      ----------
@@ -134,77 +144,14 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
 
      def __init__(
          self,
-         metric: str | Callable = "euclidean",
          scale: bool = True,
          central_stat: str = "median",
         dispersion_stat: str = "std",
-         calculate_kde: bool = True,  # deprecated in 0.2.0
-         calculate_1d_dist: bool = True,  # deprecated in 0.2.0
      ):
          """Initialize the classifier with specified parameters."""
-         self.metric = metric
          self.scale = scale
          self.central_stat = central_stat
          self.dispersion_stat = dispersion_stat
-         if calculate_kde:
-             warnings.warn(
-                 "calculate_kde is deprecated and will be removed in version 0.2.0",
-                 DeprecationWarning,
-             )
-         self.calculate_kde = calculate_kde
-
-         if calculate_1d_dist:
-             warnings.warn(
-                 "calculate_1d_dist is deprecated and will be removed in version 0.2.0",
-                 DeprecationWarning,
-             )
-         self.calculate_1d_dist = calculate_1d_dist
-
-     def initialize_metric_function(self):
-         """Set the metric function based on the provided metric.
-
-         If the metric is a string, the function will look for a corresponding
-         function in scipy.spatial.distance or distances.Distance. If the metric
-         is a function, it will be used directly.
-         """
-         if callable(self.metric):
-             self.metric_fn_ = self.metric
-             self.metric_arg_ = self.metric
-
-         elif isinstance(self.metric, str):
-             metric_str_lowercase = self.metric.lower()
-             metric_found = False
-             for package_str, source in METRIC_SOURCES_.items():
-
-                 # Don't use scipy for jaccard as their implementation only works with
-                 # booleans - use custom jaccard instead
-                 if (
-                     package_str == "scipy.spatial.distance"
-                     and metric_str_lowercase == "jaccard"
-                 ):
-                     continue
-
-                 if hasattr(source, metric_str_lowercase):
-                     self.metric_fn_ = getattr(source, metric_str_lowercase)
-                     metric_found = True
-
-                     # Use the string as an argument if it belongs to scipy as it is
-                     # optimized
-                     self.metric_arg_ = (
-                         self.metric
-                         if package_str == "scipy.spatial.distance"
-                         else self.metric_fn_
-                     )
-                     break
-             if not metric_found:
-                 raise ValueError(
-                     f"{self.metric} metric not found. Please pass a string of the "
-                     "name of a metric in scipy.spatial.distance or "
-                     "distances.Distance, or pass a metric function directly. For a "
-                     "list of available metrics, see: "
-                     "https://sidchaini.github.io/DistClassiPy/distances.html or "
-                     "https://docs.scipy.org/doc/scipy/reference/spatial.distance.html"
-                 )
 
      def fit(self, X: np.array, y: np.array, feat_labels: list[str] = None):
          """Calculate the feature space centroid for all classes.
@@ -236,8 +183,6 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
              1
          ]  # Number of features seen during fit - required for sklearn compatibility.
 
-         self.initialize_metric_function()
-
          if feat_labels is None:
              feat_labels = [f"Feature_{x}" for x in range(X.shape[1])]
 
@@ -281,30 +226,15 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
              )
              self.df_iqr_ = df_iqr
 
-         if self.calculate_kde:
-             warnings.warn(
-                 "KDE calculation is deprecated and will be removed in version 0.2.0",
-                 DeprecationWarning,
-             )
-             self.kde_dict_ = {}
-
-             for cl in self.classes_:
-                 subX = X[y == cl]
-                 # Implement the following in an if-else to save computational time.
-                 # kde = KernelDensity(bandwidth='scott', metric=self.metric)
-                 # kde.fit(subX)
-                 kde = KernelDensity(
-                     bandwidth="scott",
-                     metric="pyfunc",
-                     metric_params={"func": self.metric_fn_},
-                 )
-                 kde.fit(subX)
-                 self.kde_dict_[cl] = kde
          self.is_fitted_ = True
 
          return self
 
-     def predict(self, X: np.array):
+     def predict(
+         self,
+         X: np.array,
+         metric: str | Callable = "euclidean",
+     ):
          """Predict the class labels for the provided X.
 
          The prediction is based on the distance of each data point in the input sample
@@ -315,18 +245,33 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
          ----------
          X : array-like of shape (n_samples, n_features)
              The input samples.
+         metric : str or callable, default="euclidean"
+             The distance metric to use for calculating the distance between features.
 
          Returns
          -------
          y : ndarray of shape (n_samples,)
              The predicted classes.
+
+         See Also
+         --------
+         scipy.spatial.dist : Other distance metrics provided in SciPy
+         distclassipy.Distance : Distance metrics included with DistClassiPy
+
+         Notes
+         -----
+         If using distance metrics supported by SciPy, it is desirable to pass a string,
+         which allows SciPy to use an optimized C version of the code instead of the
+         slower Python version.
          """
          check_is_fitted(self, "is_fitted_")
          X = check_array(X)
 
+         metric_fn_, metric_arg_ = initialize_metric_function(metric)
+
          if not self.scale:
              dist_arr = scipy.spatial.distance.cdist(
-                 XA=X, XB=self.df_centroid_.to_numpy(), metric=self.metric_arg_
+                 XA=X, XB=self.df_centroid_.to_numpy(), metric=metric_arg_
              )
 
          else:
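Because the metric is now resolved inside `predict`, a single fitted classifier can be queried under several metrics without refitting. The snippet below is an illustrative sketch (the toy data and the custom metric are assumptions, not part of the package); a callable metric is forwarded to `scipy.spatial.distance.cdist`, so any `f(u, v) -> float` works.

```python
import numpy as np
import distclassipy as dcpy

# Toy data for illustration only.
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 4))
y = (X[:, 0] + X[:, 1] > 0).astype(int)

clf = dcpy.DistanceMetricClassifier()
clf.fit(X, y)

query = [[0.1, -0.2, 0.0, 0.3]]
print(clf.predict(query))                      # default "euclidean"
print(clf.predict(query, metric="cityblock"))  # SciPy name -> optimized C path


def squared_euclidean(u, v):
    """Example of a user-supplied callable metric."""
    return float(np.sum((np.asarray(u) - np.asarray(v)) ** 2))


print(clf.predict(query, metric=squared_euclidean))
```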
@@ -343,16 +288,18 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
                  w = wtdf.loc[cl].to_numpy()  # 1/std dev
                  XB = XB * w  # w is for this class only
                  XA = X * w  # w is for this class only
-                 cl_dist = scipy.spatial.distance.cdist(
-                     XA=XA, XB=XB, metric=self.metric_arg_
-                 )
+                 cl_dist = scipy.spatial.distance.cdist(XA=XA, XB=XB, metric=metric_arg_)
                  dist_arr_list.append(cl_dist)
              dist_arr = np.column_stack(dist_arr_list)
 
          y_pred = self.classes_[dist_arr.argmin(axis=1)]
          return y_pred
 
-     def predict_and_analyse(self, X: np.array):
+     def predict_and_analyse(
+         self,
+         X: np.array,
+         metric: str | Callable = "euclidean",
+     ):
          """Predict the class labels for the provided X and perform analysis.
 
          The prediction is based on the distance of each data point in the input sample
@@ -366,18 +313,35 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
          ----------
          X : array-like of shape (n_samples, n_features)
              The input samples.
+         metric : str or callable, default="euclidean"
+             The distance metric to use for calculating the distance between features.
+
 
          Returns
          -------
          y : ndarray of shape (n_samples,)
              The predicted classes.
+
+         See Also
+         --------
+         scipy.spatial.dist : Other distance metrics provided in SciPy
+         distclassipy.Distance : Distance metrics included with DistClassiPy
+
+         Notes
+         -----
+         If using distance metrics supported by SciPy, it is desirable to pass a string,
+         which allows SciPy to use an optimized C version of the code instead
+         of the slower Python version.
+
          """
          check_is_fitted(self, "is_fitted_")
          X = check_array(X)
 
+         metric_fn_, metric_arg_ = initialize_metric_function(metric)
+
          if not self.scale:
              dist_arr = scipy.spatial.distance.cdist(
-                 XA=X, XB=self.df_centroid_.to_numpy(), metric=self.metric_arg_
+                 XA=X, XB=self.df_centroid_.to_numpy(), metric=metric_arg_
              )
 
          else:
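Both prediction paths share the per-class scaling shown in the surrounding hunks: each feature is weighted by the inverse of that class's dispersion before the distance to the class centroid is computed. The following standalone sketch mirrors that logic with made-up numbers; the variable names are illustrative, not the classifier's internals.

```python
import numpy as np
import scipy.spatial.distance

# Two query points with two features (illustrative values).
X = np.array([[1.0, 10.0], [0.0, 12.0]])

# Per-class centroids and dispersions (e.g. standard deviations) from "training".
centroids = {"a": np.array([[0.0, 10.0]]), "b": np.array([[5.0, 20.0]])}
stds = {"a": np.array([1.0, 2.0]), "b": np.array([2.0, 5.0])}

dist_arr_list = []
for cl in ("a", "b"):
    w = 1.0 / stds[cl]                # weight = inverse dispersion for this class
    XA = X * w                        # scale the queries with this class's weights
    XB = centroids[cl] * w            # scale the centroid the same way
    cl_dist = scipy.spatial.distance.cdist(XA=XA, XB=XB, metric="euclidean")
    dist_arr_list.append(cl_dist)

dist_arr = np.column_stack(dist_arr_list)   # shape (n_samples, n_classes)
print(dist_arr.argmin(axis=1))              # index of the closest class per sample
```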
@@ -394,9 +358,7 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
                  w = wtdf.loc[cl].to_numpy()  # 1/std dev
                  XB = XB * w  # w is for this class only
                  XA = X * w  # w is for this class only
-                 cl_dist = scipy.spatial.distance.cdist(
-                     XA=XA, XB=XB, metric=self.metric_arg_
-                 )
+                 cl_dist = scipy.spatial.distance.cdist(XA=XA, XB=XB, metric=metric_arg_)
                  dist_arr_list.append(cl_dist)
              dist_arr = np.column_stack(dist_arr_list)
 
@@ -409,78 +371,15 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
 
          y_pred = self.classes_[dist_arr.argmin(axis=1)]
 
-         if self.calculate_kde:
-             warnings.warn(
-                 "KDE calculation in predict_and_analyse is deprecated "
-                 "and will be removed in version 0.2.0",
-                 DeprecationWarning,
-             )
-             # NEW: Rescale in terms of median likelihoods - calculate here
-             scale_factors = np.exp(
-                 [
-                     self.kde_dict_[cl].score_samples(
-                         self.df_centroid_.loc[cl].to_numpy().reshape(1, -1)
-                     )[0]
-                     for cl in self.classes_
-                 ]
-             )
-
-             likelihood_arr = []
-             for k in self.kde_dict_.keys():
-                 log_pdf = self.kde_dict_[k].score_samples(X)
-                 likelihood_val = np.exp(log_pdf)
-                 likelihood_arr.append(likelihood_val)
-             self.likelihood_arr_ = np.array(likelihood_arr).T
-
-             # NEW: Rescale in terms of median likelihoods - rescale here
-             self.likelihood_arr_ = self.likelihood_arr_ / scale_factors
-         if self.calculate_1d_dist:
-             warnings.warn(
-                 "calculate_1d_dist is deprecated and will be removed in version 0.2.0",
-                 DeprecationWarning,
-             )
-             conf_cl = []
-             Xdf_temp = pd.DataFrame(data=X, columns=self.df_centroid_.columns)
-             for cl in self.classes_:
-                 sum_1d_dists = np.zeros(shape=(len(Xdf_temp)))
-                 for feat in Xdf_temp.columns:
-                     dists = scipy.spatial.distance.cdist(
-                         XA=np.zeros(shape=(1, 1)),
-                         XB=(self.df_centroid_.loc[cl] - Xdf_temp)[feat]
-                         .to_numpy()
-                         .reshape(-1, 1),
-                         metric=self.metric_arg_,
-                     ).ravel()
-                     if self.scale and self.dispersion_stat == "std":
-                         sum_1d_dists = sum_1d_dists + dists / self.df_std_.loc[cl, feat]
-                     elif self.scale and self.dispersion_stat == "std":
-                         sum_1d_dists = sum_1d_dists + dists / self.df_iqr_.loc[cl, feat]
-                     else:
-                         sum_1d_dists = sum_1d_dists + dists
-                 confs = 1 / np.clip(sum_1d_dists, a_min=np.finfo(float).eps, a_max=None)
-                 conf_cl.append(confs)
-             conf_cl = np.array(conf_cl)
-             self.conf_cl_ = conf_cl
          self.analyis_ = True
 
          return y_pred
 
-     def calculate_confidence(self, method: str = "distance_inverse"):
+     def calculate_confidence(self):
          """Calculate the confidence for each prediction.
 
-         The confidence is calculated based on either the distance of each data point to
-         the centroids of the training data, optionally the kernel density estimate or
-         1-dimensional distance.
-
-         Parameters
-         ----------
-         method : {"distance_inverse", "1d_distance_inverse", "kde_likelihood"},
-             default="distance_inverse"
-             The method to use for calculating confidence. Default is
-             'distance_inverse'.
-             .. deprecated:: 0.2.0
-                 The methods '1d_distance_inverse' and
-                 'kde_likelihood' will be removed in version 0.2.0.
+         The confidence is calculated as the inverse of the distance of each data point
+         to the centroids of the training data.
          """
          check_is_fitted(self, "is_fitted_")
          if not hasattr(self, "analyis_"):
@@ -490,44 +389,11 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
              )
 
          # Calculate confidence for each prediction
-         if method == "distance_inverse":
-             self.confidence_df_ = 1 / np.clip(
-                 self.centroid_dist_df_, a_min=np.finfo(float).eps, a_max=None
-             )
-             self.confidence_df_.columns = [
-                 x.replace("_dist", "_conf") for x in self.confidence_df_.columns
-             ]
-
-         elif method == "1d_distance_inverse":
-             warnings.warn(
-                 "The '1d_distance_inverse' method is deprecated "
-                 "and will be removed in version 0.2.0",
-                 DeprecationWarning,
-             )
-             if not self.calculate_1d_dist:
-                 raise ValueError(
-                     "method='1d_distance_inverse' is only valid if calculate_1d_dist "
-                     "is set to True"
-                 )
-             self.confidence_df_ = pd.DataFrame(
-                 data=self.conf_cl_.T, columns=[f"{x}_conf" for x in self.classes_]
-             )
-
-         elif method == "kde_likelihood":
-             warnings.warn(
-                 "The 'kde_likelihood' method is deprecated and will be "
-                 "removed in version 0.2.0",
-                 DeprecationWarning,
-             )
-             if not self.calculate_kde:
-                 raise ValueError(
-                     "method='kde_likelihood' is only valid if calculate_kde is set "
-                     "to True"
-                 )
-
-             self.confidence_df_ = pd.DataFrame(
-                 data=self.likelihood_arr_,
-                 columns=[f"{x}_conf" for x in self.kde_dict_.keys()],
-             )
+         self.confidence_df_ = 1 / np.clip(
+             self.centroid_dist_df_, a_min=np.finfo(float).eps, a_max=None
+         )
+         self.confidence_df_.columns = [
+             x.replace("_dist", "_conf") for x in self.confidence_df_.columns
+         ]
 
          return self.confidence_df_.to_numpy()
  return self.confidence_df_.to_numpy()