distclassipy 0.1.4__py3-none-any.whl → 0.1.6a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
distclassipy/__init__.py CHANGED
@@ -1,16 +1,28 @@
1
- """
2
- A module for using distance metrics for classification.
1
+ """A module for using distance metrics for classification.
3
2
 
4
3
  Classes:
5
- DistanceMetricClassifier - A classifier that uses a specified distance metric for classification.
4
+ DistanceMetricClassifier - A classifier that uses a specified distance metric for
5
+ classification.
6
6
  Distance - A class that provides various distance metrics for use in classification.
7
+
8
+
9
+ Copyright (C) 2024 Siddharth Chaini
10
+ -----
11
+ This program is free software: you can redistribute it and/or modify
12
+ it under the terms of the GNU General Public License as published by
13
+ the Free Software Foundation, either version 3 of the License, or
14
+ (at your option) any later version.
15
+
16
+ This program is distributed in the hope that it will be useful,
17
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
18
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19
+ GNU General Public License for more details.
20
+
21
+ You should have received a copy of the GNU General Public License
22
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
7
23
  """
8
24
 
9
- from .classifier import (
10
- DistanceMetricClassifier,
11
- ) # Importing the DistanceMetricClassifier from the classifier module
12
- from .distances import (
13
- Distance,
14
- ) # Importing the Distance class from the distances module
25
+ from .classifier import DistanceMetricClassifier # noqa
26
+ from .distances import Distance # noqa
15
27
 
16
- __version__ = "0.1.4"
28
+ __version__ = "0.1.6a0"
@@ -1,41 +1,87 @@
1
- """
2
- A module which contains the DistanceMetricClassifier introduced by Chaini et al. (2024) in "Light Curve Classification with DistClassiPy: a new distance-based classifier".
1
+ """A module containing the distance metric classifier.
2
+
3
+ This module contains the DistanceMetricClassifier introduced by Chaini et al. (2024)
4
+ in "Light Curve Classification with DistClassiPy: a new distance-based classifier".
5
+
6
+ Copyright (C) 2024 Siddharth Chaini
7
+ -----
8
+ This program is free software: you can redistribute it and/or modify
9
+ it under the terms of the GNU General Public License as published by
10
+ the Free Software Foundation, either version 3 of the License, or
11
+ (at your option) any later version.
12
+
13
+ This program is distributed in the hope that it will be useful,
14
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ GNU General Public License for more details.
17
+
18
+ You should have received a copy of the GNU General Public License
19
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
3
20
  """
4
21
 
22
+ import warnings
23
+ from typing import Callable
24
+
5
25
  import numpy as np
26
+
6
27
  import pandas as pd
28
+
7
29
  import scipy
8
- from .distances import Distance
30
+
9
31
  from sklearn.base import BaseEstimator, ClassifierMixin
10
- from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
11
- from sklearn.utils.multiclass import unique_labels
12
32
  from sklearn.neighbors import KernelDensity
13
- from typing import Callable
33
+ from sklearn.utils.multiclass import unique_labels
34
+ from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
35
+
36
+ from .distances import Distance
37
+
38
+ # Hardcoded source packages to check for distance metrics.
39
+ METRIC_SOURCES_ = {
40
+ "scipy.spatial.distance": scipy.spatial.distance,
41
+ "distances.Distance": Distance(),
42
+ }
14
43
 
15
44
 
16
45
  class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
17
- """
18
- A distance-based classifier that supports the use of various distance metrics.
46
+ """A distance-based classifier that supports different distance metrics.
19
47
 
20
- The distance metric classifier determines the similarity between features in a dataset by leveraging the use of different distance metrics to. A specified distance metric is used to compute the distance between a given object and a centroid for every training class in the feature space. The classifier supports the use of different statistical measures for constructing the centroid and scaling the computed distance. Additionally, the distance metric classifier also optionally provides an estimate of the confidence of the classifier's predictions.
48
+ The distance metric classifier determines the similarity between features in a
49
+ dataset by leveraging the use of different distance metrics. A specified
50
+ distance metric is used to compute the distance between a given object and a
51
+ centroid for every training class in the feature space. The classifier supports
52
+ the use of different statistical measures for constructing the centroid and scaling
53
+ the computed distance. Additionally, the distance metric classifier also
54
+ optionally provides an estimate of the confidence of the classifier's predictions.
21
55
 
22
56
  Parameters
23
57
  ----------
24
58
  metric : str or callable, default="euclidean"
25
59
  The distance metric to use for calculating the distance between features.
26
60
  scale : bool, default=True
27
- Whether to scale the distance between the test object and the centroid for a class in the feature space. If True, the data will be scaled based on the specified dispersion statistic.
61
+ Whether to scale the distance between the test object and the centroid for a
62
+ class in the feature space. If True, the data will be scaled based on the
63
+ specified dispersion statistic.
28
64
  central_stat : {"mean", "median"}, default="median"
29
- The statistic used to calculate the central tendency of the data to construct the feature-space centroid. Supported statistics are "mean" and "median".
65
+ The statistic used to calculate the central tendency of the data to construct
66
+ the feature-space centroid. Supported statistics are "mean" and "median".
30
67
  dispersion_stat : {"std", "iqr"}, default="std"
31
- The statistic used to calculate the dispersion of the data for scaling the distance. Supported statistics are "std" for standard deviation and "iqr" for inter-quartile range.
68
+ The statistic used to calculate the dispersion of the data for scaling the
69
+ distance. Supported statistics are "std" for standard deviation and "iqr"
70
+ for inter-quartile range.
32
71
 
33
72
  .. versionadded:: 0.1.0
34
73
 
35
74
  calculate_kde : bool, default=True
36
75
  Whether to calculate a kernel density estimate based confidence parameter.
76
+ .. deprecated:: 0.1.6
77
+ This parameter will be removed in a future version and only the
78
+ distance confidence parameter will be available.
37
79
  calculate_1d_dist : bool, default=True
38
80
  Whether to calculate the 1-dimensional distance based confidence parameter.
81
+ .. deprecated:: 0.1.6
82
+ This parameter will be removed in a future version and only the
83
+ distance confidence parameter will be available.
84
+ Whether to calculate the 1-dimensional distance based confidence parameter.
39
85
 
40
86
  Attributes
41
87
  ----------
@@ -49,8 +95,12 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
49
95
  The statistic used for calculating dispersion.
50
96
  calculate_kde : bool
51
97
  Indicates whether a kernel density estimate is calculated.
98
+ .. deprecated:: 0.1.6
99
+ This parameter will be removed in a future version.
52
100
  calculate_1d_dist : bool
53
101
  Indicates whether 1-dimensional distances are calculated.
102
+ .. deprecated:: 0.1.6
103
+ This parameter will be removed in a future version.
54
104
 
55
105
  See Also
56
106
  --------
@@ -59,11 +109,14 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
59
109
 
60
110
  Notes
61
111
  -----
62
- If using distance metrics supported by SciPy, it is desirable to pass a string, which allows SciPy to use an optimized C version of the code instead of the slower Python version.
112
+ If using distance metrics supported by SciPy, it is desirable to pass a string,
113
+ which allows SciPy to use an optimized C version of the code instead of the slower
114
+ Python version.
63
115
 
64
116
  References
65
117
  ----------
66
- .. [1] "Light Curve Classification with DistClassiPy: a new distance-based classifier"
118
+ .. [1] "Light Curve Classification with DistClassiPy: a new distance-based
119
+ classifier"
67
120
 
68
121
  Examples
69
122
  --------
@@ -85,32 +138,35 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
85
138
  scale: bool = True,
86
139
  central_stat: str = "median",
87
140
  dispersion_stat: str = "std",
88
- calculate_kde: bool = True,
89
- calculate_1d_dist: bool = True,
141
+ calculate_kde: bool = True, # deprecated in 0.2.0
142
+ calculate_1d_dist: bool = True, # deprecated in 0.2.0
90
143
  ):
91
- """
92
- Initialize the classifier with specified parameters.
93
- """
144
+ """Initialize the classifier with specified parameters."""
94
145
  self.metric = metric
95
146
  self.scale = scale
96
147
  self.central_stat = central_stat
97
148
  self.dispersion_stat = dispersion_stat
149
+ if calculate_kde:
150
+ warnings.warn(
151
+ "calculate_kde is deprecated and will be removed in version 0.2.0",
152
+ DeprecationWarning,
153
+ )
98
154
  self.calculate_kde = calculate_kde
155
+
156
+ if calculate_1d_dist:
157
+ warnings.warn(
158
+ "calculate_1d_dist is deprecated and will be removed in version 0.2.0",
159
+ DeprecationWarning,
160
+ )
99
161
  self.calculate_1d_dist = calculate_1d_dist
100
162
 
101
- def set_metric_fn_(self):
102
- """
103
- Set the metric function based on the provided metric.
163
+ def initialize_metric_function(self):
164
+ """Set the metric function based on the provided metric.
104
165
 
105
- If the metric is a string, the function will look for a corresponding function in scipy.spatial.distance or distances.Distance. If the metric is a function, it will be used directly.
166
+ If the metric is a string, the function will look for a corresponding
167
+ function in scipy.spatial.distance or distances.Distance. If the metric
168
+ is a function, it will be used directly.
106
169
  """
107
-
108
- # Hardcoded source packages to check for distance metrics.
109
- metric_sources_ = {
110
- "scipy.spatial.distance": scipy.spatial.distance,
111
- "distances.Distance": Distance(),
112
- }
113
-
114
170
  if callable(self.metric):
115
171
  self.metric_fn_ = self.metric
116
172
  self.metric_arg_ = self.metric
@@ -118,9 +174,10 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
118
174
  elif isinstance(self.metric, str):
119
175
  metric_str_lowercase = self.metric.lower()
120
176
  metric_found = False
121
- for package_str, source in metric_sources_.items():
177
+ for package_str, source in METRIC_SOURCES_.items():
122
178
 
123
- # Don't use scipy for jaccard as their implementation only works with booleans - use custom jaccard instead
179
+ # Don't use scipy for jaccard as their implementation only works with
180
+ # booleans - use custom jaccard instead
124
181
  if (
125
182
  package_str == "scipy.spatial.distance"
126
183
  and metric_str_lowercase == "jaccard"
@@ -131,7 +188,8 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
131
188
  self.metric_fn_ = getattr(source, metric_str_lowercase)
132
189
  metric_found = True
133
190
 
134
- # Use the string as an argument if it belongs to scipy as it is optimized
191
+ # Use the string as an argument if it belongs to scipy as it is
192
+ # optimized
135
193
  self.metric_arg_ = (
136
194
  self.metric
137
195
  if package_str == "scipy.spatial.distance"
@@ -140,14 +198,22 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
140
198
  break
141
199
  if not metric_found:
142
200
  raise ValueError(
143
- f"{self.metric} metric not found. Please pass a string of the name of a metric in scipy.spatial.distance or distances.Distance, or pass a metric function directly. For a list of available metrics, see: https://sidchaini.github.io/DistClassiPy/distances.html or https://docs.scipy.org/doc/scipy/reference/spatial.distance.html"
201
+ f"{self.metric} metric not found. Please pass a string of the "
202
+ "name of a metric in scipy.spatial.distance or "
203
+ "distances.Distance, or pass a metric function directly. For a "
204
+ "list of available metrics, see: "
205
+ "https://sidchaini.github.io/DistClassiPy/distances.html or "
206
+ "https://docs.scipy.org/doc/scipy/reference/spatial.distance.html"
144
207
  )
145
208
 
146
209
  def fit(self, X: np.array, y: np.array, feat_labels: list[str] = None):
147
- """
148
- Calculate the feature space centroid for all classes in the training set (X,y) using the central statistic. If scaling is enabled, also calculate the appropriate dispersion statistic.
210
+ """Calculate the feature space centroid for all classes.
149
211
 
150
- This involves computing the centroid for every class in the feature space and optionally calculating the kernel density estimate and 1-dimensional distance.
212
+ This function calculates the feature space centroid in the training
213
+ set (X, y) for all classes using the central statistic. If scaling
214
+ is enabled, it also calculates the appropriate dispersion statistic.
215
+ This involves computing the centroid for every class in the feature space and
216
+ optionally calculating the kernel density estimate and 1-dimensional distance.
151
217
 
152
218
  Parameters
153
219
  ----------
@@ -156,7 +222,8 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
156
222
  y : array-like of shape (n_samples,)
157
223
  The target values (class labels).
158
224
  feat_labels : list of str, optional, default=None
159
- The feature labels. If not provided, default labels representing feature number will be used.
225
+ The feature labels. If not provided, default labels representing feature
226
+ number will be used.
160
227
 
161
228
  Returns
162
229
  -------
@@ -165,9 +232,11 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
165
232
  """
166
233
  X, y = check_X_y(X, y)
167
234
  self.classes_ = unique_labels(y)
168
- self.n_features_in_ = X.shape[1]
235
+ self.n_features_in_ = X.shape[
236
+ 1
237
+ ] # Number of features seen during fit - required for sklearn compatibility.
169
238
 
170
- self.set_metric_fn_()
239
+ self.initialize_metric_function()
171
240
 
172
241
  if feat_labels is None:
173
242
  feat_labels = [f"Feature_{x}" for x in range(X.shape[1])]
@@ -188,7 +257,8 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
188
257
  std_list = []
189
258
  for cur_class in self.classes_:
190
259
  cur_X = X[y == cur_class]
191
- # Note we're using ddof=1 because we're dealing with a sample. See more: https://stackoverflow.com/a/46083501/10743245
260
+ # Note we're using ddof=1 because we're dealing with a sample.
261
+ # See more: https://stackoverflow.com/a/46083501/10743245
192
262
  std_list.append(np.std(cur_X, axis=0, ddof=1).ravel())
193
263
  df_std = pd.DataFrame(
194
264
  data=np.array(std_list), index=self.classes_, columns=feat_labels
@@ -200,7 +270,8 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
200
270
 
201
271
  for cur_class in self.classes_:
202
272
  cur_X = X[y == cur_class]
203
- # Note we're using ddof=1 because we're dealing with a sample. See more: https://stackoverflow.com/a/46083501/10743245
273
+ # Note we're using ddof=1 because we're dealing with a sample.
274
+ # See more: https://stackoverflow.com/a/46083501/10743245
204
275
  iqr_list.append(
205
276
  np.quantile(cur_X, q=0.75, axis=0).ravel()
206
277
  - np.quantile(cur_X, q=0.25, axis=0).ravel()
@@ -211,6 +282,10 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
211
282
  self.df_iqr_ = df_iqr
212
283
 
213
284
  if self.calculate_kde:
285
+ warnings.warn(
286
+ "KDE calculation is deprecated and will be removed in version 0.2.0",
287
+ DeprecationWarning,
288
+ )
214
289
  self.kde_dict_ = {}
215
290
 
216
291
  for cl in self.classes_:
@@ -225,7 +300,6 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
225
300
  )
226
301
  kde.fit(subX)
227
302
  self.kde_dict_[cl] = kde
228
-
229
303
  self.is_fitted_ = True
230
304
 
231
305
  return self
@@ -233,7 +307,9 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
233
307
  def predict(self, X: np.array):
234
308
  """Predict the class labels for the provided X.
235
309
 
236
- The prediction is based on the distance of each data point in the input sample to the centroid for each class in the feature space. The predicted class is the one whose centroid is the closest to the input sample.
310
+ The prediction is based on the distance of each data point in the input sample
311
+ to the centroid for each class in the feature space. The predicted class is the
312
+ one whose centroid is the closest to the input sample.
237
313
 
238
314
  Parameters
239
315
  ----------
@@ -277,12 +353,14 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
277
353
  return y_pred
278
354
 
279
355
  def predict_and_analyse(self, X: np.array):
280
- """
281
- Predict the class labels for the provided X and perform analysis.
356
+ """Predict the class labels for the provided X and perform analysis.
282
357
 
283
- The prediction is based on the distance of each data point in the input sample to the centroid for each class in the feature space. The predicted class is the one whose centroid is the closest to the input sample.
358
+ The prediction is based on the distance of each data point in the input sample
359
+ to the centroid for each class in the feature space. The predicted class is the
360
+ one whose centroid is the closest to the input sample.
284
361
 
285
- The analysis involves saving all calculated distances and confidences as an attribute for inspection and analysis later.
362
+ The analysis involves saving all calculated distances and confidences as an
363
+ attribute for inspection and analysis later.
286
364
 
287
365
  Parameters
288
366
  ----------
@@ -332,6 +410,11 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
332
410
  y_pred = self.classes_[dist_arr.argmin(axis=1)]
333
411
 
334
412
  if self.calculate_kde:
413
+ warnings.warn(
414
+ "KDE calculation in predict_and_analyse is deprecated "
415
+ "and will be removed in version 0.2.0",
416
+ DeprecationWarning,
417
+ )
335
418
  # NEW: Rescale in terms of median likelihoods - calculate here
336
419
  scale_factors = np.exp(
337
420
  [
@@ -351,8 +434,11 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
351
434
 
352
435
  # NEW: Rescale in terms of median likelihoods - rescale here
353
436
  self.likelihood_arr_ = self.likelihood_arr_ / scale_factors
354
-
355
437
  if self.calculate_1d_dist:
438
+ warnings.warn(
439
+ "calculate_1d_dist is deprecated and will be removed in version 0.2.0",
440
+ DeprecationWarning,
441
+ )
356
442
  conf_cl = []
357
443
  Xdf_temp = pd.DataFrame(data=X, columns=self.df_centroid_.columns)
358
444
  for cl in self.classes_:
@@ -375,26 +461,32 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
375
461
  conf_cl.append(confs)
376
462
  conf_cl = np.array(conf_cl)
377
463
  self.conf_cl_ = conf_cl
378
-
379
464
  self.analyis_ = True
380
465
 
381
466
  return y_pred
382
467
 
383
468
  def calculate_confidence(self, method: str = "distance_inverse"):
384
- """
385
- Calculate the confidence for each prediction.
469
+ """Calculate the confidence for each prediction.
386
470
 
387
- The confidence is calculated based on either the distance of each data point to the centroids of the training data, optionally the kernel density estimate or 1-dimensional distance.
471
+ The confidence is calculated based on the distance of each data point to
472
+ the centroids of the training data or, optionally, on the kernel density
473
+ estimate or the 1-dimensional distance.
388
474
 
389
475
  Parameters
390
476
  ----------
391
- method : {"distance_inverse", "1d_distance_inverse", "kde_likelihood"}, default="distance_inverse"
392
- The method to use for calculating confidence. Default is 'distance_inverse'.
477
+ method : {"distance_inverse", "1d_distance_inverse", "kde_likelihood"},
478
+ default="distance_inverse"
479
+ The method to use for calculating confidence. Default is
480
+ 'distance_inverse'.
481
+ .. deprecated:: 0.1.6
482
+ The methods '1d_distance_inverse' and
483
+ 'kde_likelihood' will be removed in version 0.2.0.
393
484
  """
394
485
  check_is_fitted(self, "is_fitted_")
395
486
  if not hasattr(self, "analyis_"):
396
487
  raise ValueError(
397
- "Use predict_and_analyse() instead of predict() for confidence calculation."
488
+ "Use predict_and_analyse() instead of predict() for "
489
+ "confidence calculation."
398
490
  )
399
491
 
400
492
  # Calculate confidence for each prediction
@@ -407,18 +499,30 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
407
499
  ]
408
500
 
409
501
  elif method == "1d_distance_inverse":
502
+ warnings.warn(
503
+ "The '1d_distance_inverse' method is deprecated "
504
+ "and will be removed in version 0.2.0",
505
+ DeprecationWarning,
506
+ )
410
507
  if not self.calculate_1d_dist:
411
508
  raise ValueError(
412
- "method='1d_distance_inverse' is only valid if calculate_1d_dist is set to True"
509
+ "method='1d_distance_inverse' is only valid if calculate_1d_dist "
510
+ "is set to True"
413
511
  )
414
512
  self.confidence_df_ = pd.DataFrame(
415
513
  data=self.conf_cl_.T, columns=[f"{x}_conf" for x in self.classes_]
416
514
  )
417
515
 
418
516
  elif method == "kde_likelihood":
517
+ warnings.warn(
518
+ "The 'kde_likelihood' method is deprecated and will be "
519
+ "removed in version 0.2.0",
520
+ DeprecationWarning,
521
+ )
419
522
  if not self.calculate_kde:
420
523
  raise ValueError(
421
- "method='kde_likelihood' is only valid if calculate_kde is set to True"
524
+ "method='kde_likelihood' is only valid if calculate_kde is set "
525
+ "to True"
422
526
  )
423
527
 
424
528
  self.confidence_df_ = pd.DataFrame(