distclassipy 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
distclassipy/__init__.py CHANGED
@@ -1,16 +1,28 @@
1
- """
2
- A module for using distance metrics for classification.
1
+ """A module for using distance metrics for classification.
3
2
 
4
3
  Classes:
5
- DistanceMetricClassifier - A classifier that uses a specified distance metric for classification.
4
+ DistanceMetricClassifier - A classifier that uses a specified distance metric for
5
+ classification.
6
6
  Distance - A class that provides various distance metrics for use in classification.
7
+
8
+
9
+ Copyright (C) 2024 Siddharth Chaini
10
+ -----
11
+ This program is free software: you can redistribute it and/or modify
12
+ it under the terms of the GNU General Public License as published by
13
+ the Free Software Foundation, either version 3 of the License, or
14
+ (at your option) any later version.
15
+
16
+ This program is distributed in the hope that it will be useful,
17
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
18
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19
+ GNU General Public License for more details.
20
+
21
+ You should have received a copy of the GNU General Public License
22
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
7
23
  """
8
24
 
9
- from .classifier import (
10
- DistanceMetricClassifier,
11
- ) # Importing the DistanceMetricClassifier from the classifier module
12
- from .distances import (
13
- Distance,
14
- ) # Importing the Distance class from the distances module
25
+ from .classifier import DistanceMetricClassifier # noqa
26
+ from .distances import Distance # noqa
15
27
 
16
- __version__ = "0.1.3"
28
+ __version__ = "0.1.5"
@@ -1,34 +1,72 @@
1
+ """A module containing the distance metric classifier.
2
+
3
+ This module contains the DistanceMetricClassifier introduced by Chaini et al. (2024)
4
+ in "Light Curve Classification with DistClassiPy: a new distance-based classifier"
5
+
6
+ Copyright (C) 2024 Siddharth Chaini
7
+ -----
8
+ This program is free software: you can redistribute it and/or modify
9
+ it under the terms of the GNU General Public License as published by
10
+ the Free Software Foundation, either version 3 of the License, or
11
+ (at your option) any later version.
12
+
13
+ This program is distributed in the hope that it will be useful,
14
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16
+ GNU General Public License for more details.
17
+
18
+ You should have received a copy of the GNU General Public License
19
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
1
20
  """
2
- A module which contains the DistanceMetricClassifier introduced by Chaini et al. (2024) in "Light Curve Classification with DistClassiPy: a new distance-based classifier".
3
- """
21
+
22
+ from typing import Callable
4
23
 
5
24
  import numpy as np
25
+
6
26
  import pandas as pd
27
+
7
28
  import scipy
8
- from .distances import Distance
29
+
9
30
  from sklearn.base import BaseEstimator, ClassifierMixin
10
- from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
11
- from sklearn.utils.multiclass import unique_labels
12
31
  from sklearn.neighbors import KernelDensity
13
- from typing import Callable
32
+ from sklearn.utils.multiclass import unique_labels
33
+ from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
34
+
35
+ from .distances import Distance
36
+
37
+ # Hardcoded source packages to check for distance metrics.
38
+ METRIC_SOURCES_ = {
39
+ "scipy.spatial.distance": scipy.spatial.distance,
40
+ "distances.Distance": Distance(),
41
+ }
14
42
 
15
43
 
16
44
  class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
17
- """
18
- A distance-based classifier that supports the use of various distance metrics.
45
+ """A distance-based classifier that supports different distance metrics.
19
46
 
20
- The distance metric classifier determines the similarity between features in a dataset by leveraging the use of different distance metrics to. A specified distance metric is used to compute the distance between a given object and a centroid for every training class in the feature space. The classifier supports the use of different statistical measures for constructing the centroid and scaling the computed distance. Additionally, the distance metric classifier also optionally provides an estimate of the confidence of the classifier's predictions.
47
+ The distance metric classifier determines the similarity between features in a
48
+ dataset by leveraging the use of different distance metrics. A specified
49
+ distance metric is used to compute the distance between a given object and a
50
+ centroid for every training class in the feature space. The classifier supports
51
+ the use of different statistical measures for constructing the centroid and scaling
52
+ the computed distance. Additionally, the distance metric classifier also
53
+ optionally provides an estimate of the confidence of the classifier's predictions.
21
54
 
22
55
  Parameters
23
56
  ----------
24
57
  metric : str or callable, default="euclidean"
25
58
  The distance metric to use for calculating the distance between features.
26
59
  scale : bool, default=True
27
- Whether to scale the distance between the test object and the centroid for a class in the feature space. If True, the data will be scaled based on the specified dispersion statistic.
60
+ Whether to scale the distance between the test object and the centroid for a
61
+ class in the feature space. If True, the data will be scaled based on the
62
+ specified dispersion statistic.
28
63
  central_stat : {"mean", "median"}, default="median"
29
- The statistic used to calculate the central tendency of the data to construct the feature-space centroid. Supported statistics are "mean" and "median".
64
+ The statistic used to calculate the central tendency of the data to construct
65
+ the feature-space centroid. Supported statistics are "mean" and "median".
30
66
  dispersion_stat : {"std", "iqr"}, default="std"
31
- The statistic used to calculate the dispersion of the data for scaling the distance. Supported statistics are "std" for standard deviation and "iqr" for inter-quartile range.
67
+ The statistic used to calculate the dispersion of the data for scaling the
68
+ distance. Supported statistics are "std" for standard deviation and "iqr"
69
+ for inter-quartile range.
32
70
 
33
71
  .. versionadded:: 0.1.0
34
72
 
@@ -59,11 +97,14 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
59
97
 
60
98
  Notes
61
99
  -----
62
- If using distance metrics supported by SciPy, it is desirable to pass a string, which allows SciPy to use an optimized C version of the code instead of the slower Python version.
100
+ If using distance metrics supported by SciPy, it is desirable to pass a string,
101
+ which allows SciPy to use an optimized C version of the code instead of the slower
102
+ Python version.
63
103
 
64
104
  References
65
105
  ----------
66
- .. [1] "Light Curve Classification with DistClassiPy: a new distance-based classifier"
106
+ .. [1] "Light Curve Classification with DistClassiPy: a new distance-based
107
+ classifier"
67
108
 
68
109
  Examples
69
110
  --------
@@ -88,9 +129,7 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
88
129
  calculate_kde: bool = True,
89
130
  calculate_1d_dist: bool = True,
90
131
  ):
91
- """
92
- Initialize the classifier with specified parameters.
93
- """
132
+ """Initialize the classifier with specified parameters."""
94
133
  self.metric = metric
95
134
  self.scale = scale
96
135
  self.central_stat = central_stat
@@ -98,19 +137,13 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
98
137
  self.calculate_kde = calculate_kde
99
138
  self.calculate_1d_dist = calculate_1d_dist
100
139
 
101
- def set_metric_fn_(self):
102
- """
103
- Set the metric function based on the provided metric.
140
+ def initialize_metric_function(self):
141
+ """Set the metric function based on the provided metric.
104
142
 
105
- If the metric is a string, the function will look for a corresponding function in scipy.spatial.distance or distances.Distance. If the metric is a function, it will be used directly.
143
+ If the metric is a string, the function will look for a corresponding
144
+ function in scipy.spatial.distance or distances.Distance. If the metric
145
+ is a function, it will be used directly.
106
146
  """
107
-
108
- # Hardcoded source packages to check for distance metrics.
109
- metric_sources_ = {
110
- "scipy.spatial.distance": scipy.spatial.distance,
111
- "distances.Distance": Distance(),
112
- }
113
-
114
147
  if callable(self.metric):
115
148
  self.metric_fn_ = self.metric
116
149
  self.metric_arg_ = self.metric
@@ -118,9 +151,10 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
118
151
  elif isinstance(self.metric, str):
119
152
  metric_str_lowercase = self.metric.lower()
120
153
  metric_found = False
121
- for package_str, source in metric_sources_.items():
154
+ for package_str, source in METRIC_SOURCES_.items():
122
155
 
123
- # Don't use scipy for jaccard as their implementation only works with booleans - use custom jaccard instead
156
+ # Don't use scipy for jaccard as their implementation only works with
157
+ # booleans - use custom jaccard instead
124
158
  if (
125
159
  package_str == "scipy.spatial.distance"
126
160
  and metric_str_lowercase == "jaccard"
@@ -131,7 +165,8 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
131
165
  self.metric_fn_ = getattr(source, metric_str_lowercase)
132
166
  metric_found = True
133
167
 
134
- # Use the string as an argument if it belongs to scipy as it is optimized
168
+ # Use the string as an argument if it belongs to scipy as it is
169
+ # optimized
135
170
  self.metric_arg_ = (
136
171
  self.metric
137
172
  if package_str == "scipy.spatial.distance"
@@ -140,14 +175,22 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
140
175
  break
141
176
  if not metric_found:
142
177
  raise ValueError(
143
- f"{self.metric} metric not found. Please pass a string of the name of a metric in scipy.spatial.distance or distances.Distance, or pass a metric function directly. For a list of available metrics, see: https://sidchaini.github.io/DistClassiPy/distances.html or https://docs.scipy.org/doc/scipy/reference/spatial.distance.html"
178
+ f"{self.metric} metric not found. Please pass a string of the "
179
+ "name of a metric in scipy.spatial.distance or "
180
+ "distances.Distance, or pass a metric function directly. For a "
181
+ "list of available metrics, see: "
182
+ "https://sidchaini.github.io/DistClassiPy/distances.html or "
183
+ "https://docs.scipy.org/doc/scipy/reference/spatial.distance.html"
144
184
  )
145
185
 
146
186
  def fit(self, X: np.array, y: np.array, feat_labels: list[str] = None):
147
- """
148
- Calculate the feature space centroid for all classes in the training set (X,y) using the central statistic. If scaling is enabled, also calculate the appropriate dispersion statistic.
187
+ """Calculate the feature space centroid for all classes.
149
188
 
150
- This involves computing the centroid for every class in the feature space and optionally calculating the kernel density estimate and 1-dimensional distance.
189
+ This function calculates the feature space centroid in the training
190
+ set (X, y) for all classes using the central statistic. If scaling
191
+ is enabled, it also calculates the appropriate dispersion statistic.
192
+ This involves computing the centroid for every class in the feature space and
193
+ optionally calculating the kernel density estimate and 1-dimensional distance.
151
194
 
152
195
  Parameters
153
196
  ----------
@@ -156,7 +199,8 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
156
199
  y : array-like of shape (n_samples,)
157
200
  The target values (class labels).
158
201
  feat_labels : list of str, optional, default=None
159
- The feature labels. If not provided, default labels representing feature number will be used.
202
+ The feature labels. If not provided, default labels representing feature
203
+ number will be used.
160
204
 
161
205
  Returns
162
206
  -------
@@ -167,7 +211,7 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
167
211
  self.classes_ = unique_labels(y)
168
212
  self.n_features_in_ = X.shape[1]
169
213
 
170
- self.set_metric_fn_()
214
+ self.initialize_metric_function()
171
215
 
172
216
  if feat_labels is None:
173
217
  feat_labels = [f"Feature_{x}" for x in range(X.shape[1])]
@@ -188,7 +232,8 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
188
232
  std_list = []
189
233
  for cur_class in self.classes_:
190
234
  cur_X = X[y == cur_class]
191
- # Note we're using ddof=1 because we're dealing with a sample. See more: https://stackoverflow.com/a/46083501/10743245
235
+ # Note we're using ddof=1 because we're dealing with a sample.
236
+ # See more: https://stackoverflow.com/a/46083501/10743245
192
237
  std_list.append(np.std(cur_X, axis=0, ddof=1).ravel())
193
238
  df_std = pd.DataFrame(
194
239
  data=np.array(std_list), index=self.classes_, columns=feat_labels
@@ -200,7 +245,8 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
200
245
 
201
246
  for cur_class in self.classes_:
202
247
  cur_X = X[y == cur_class]
203
- # Note we're using ddof=1 because we're dealing with a sample. See more: https://stackoverflow.com/a/46083501/10743245
248
+ # The IQR is the 75th percentile minus the 25th percentile of each
249
+ # feature; unlike the standard deviation, no ddof correction applies here.
204
250
  iqr_list.append(
205
251
  np.quantile(cur_X, q=0.75, axis=0).ravel()
206
252
  - np.quantile(cur_X, q=0.25, axis=0).ravel()
@@ -233,7 +279,9 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
233
279
  def predict(self, X: np.array):
234
280
  """Predict the class labels for the provided X.
235
281
 
236
- The prediction is based on the distance of each data point in the input sample to the centroid for each class in the feature space. The predicted class is the one whose centroid is the closest to the input sample.
282
+ The prediction is based on the distance of each data point in the input sample
283
+ to the centroid for each class in the feature space. The predicted class is the
284
+ one whose centroid is the closest to the input sample.
237
285
 
238
286
  Parameters
239
287
  ----------
@@ -277,12 +325,14 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
277
325
  return y_pred
278
326
 
279
327
  def predict_and_analyse(self, X: np.array):
280
- """
281
- Predict the class labels for the provided X and perform analysis.
328
+ """Predict the class labels for the provided X and perform analysis.
282
329
 
283
- The prediction is based on the distance of each data point in the input sample to the centroid for each class in the feature space. The predicted class is the one whose centroid is the closest to the input sample.
330
+ The prediction is based on the distance of each data point in the input sample
331
+ to the centroid for each class in the feature space. The predicted class is the
332
+ one whose centroid is the closest to the input sample.
284
333
 
285
- The analysis involves saving all calculated distances and confidences as an attribute for inspection and analysis later.
334
+ The analysis involves saving all calculated distances and confidences as an
335
+ attribute for inspection and analysis later.
286
336
 
287
337
  Parameters
288
338
  ----------
@@ -381,20 +431,24 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
381
431
  return y_pred
382
432
 
383
433
  def calculate_confidence(self, method: str = "distance_inverse"):
384
- """
385
- Calculate the confidence for each prediction.
434
+ """Calculate the confidence for each prediction.
386
435
 
387
- The confidence is calculated based on either the distance of each data point to the centroids of the training data, optionally the kernel density estimate or 1-dimensional distance.
436
+ The confidence is calculated from the distance of each data point to the
437
+ centroids of the training data or, optionally, from the kernel density
438
+ estimate or the 1-dimensional distance.
388
439
 
389
440
  Parameters
390
441
  ----------
391
- method : {"distance_inverse", "1d_distance_inverse", "kde_likelihood"}, default="distance_inverse"
392
- The method to use for calculating confidence. Default is 'distance_inverse'.
442
+ method : {"distance_inverse", "1d_distance_inverse", "kde_likelihood"},
443
+ default="distance_inverse"
444
+ The method to use for calculating confidence. Default is
445
+ 'distance_inverse'.
393
446
  """
394
447
  check_is_fitted(self, "is_fitted_")
395
448
  if not hasattr(self, "analyis_"):
396
449
  raise ValueError(
397
- "Use predict_and_analyse() instead of predict() for confidence calculation."
450
+ "Use predict_and_analyse() instead of predict() for "
451
+ "confidence calculation."
398
452
  )
399
453
 
400
454
  # Calculate confidence for each prediction
@@ -409,7 +463,8 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
409
463
  elif method == "1d_distance_inverse":
410
464
  if not self.calculate_1d_dist:
411
465
  raise ValueError(
412
- "method='1d_distance_inverse' is only valid if calculate_1d_dist is set to True"
466
+ "method='1d_distance_inverse' is only valid if calculate_1d_dist "
467
+ "is set to True"
413
468
  )
414
469
  self.confidence_df_ = pd.DataFrame(
415
470
  data=self.conf_cl_.T, columns=[f"{x}_conf" for x in self.classes_]
@@ -418,7 +473,8 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
418
473
  elif method == "kde_likelihood":
419
474
  if not self.calculate_kde:
420
475
  raise ValueError(
421
- "method='kde_likelihood' is only valid if calculate_kde is set to True"
476
+ "method='kde_likelihood' is only valid if calculate_kde is set "
477
+ "to True"
422
478
  )
423
479
 
424
480
  self.confidence_df_ = pd.DataFrame(