distclassipy 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- distclassipy/__init__.py +22 -10
- distclassipy/classifier.py +108 -52
- distclassipy/distances.py +1177 -1141
- {distclassipy-0.1.3.dist-info → distclassipy-0.1.5.dist-info}/METADATA +4 -4
- distclassipy-0.1.5.dist-info/RECORD +8 -0
- distclassipy-0.1.3.dist-info/RECORD +0 -8
- {distclassipy-0.1.3.dist-info → distclassipy-0.1.5.dist-info}/LICENSE +0 -0
- {distclassipy-0.1.3.dist-info → distclassipy-0.1.5.dist-info}/WHEEL +0 -0
- {distclassipy-0.1.3.dist-info → distclassipy-0.1.5.dist-info}/top_level.txt +0 -0
distclassipy/__init__.py
CHANGED
|
@@ -1,16 +1,28 @@
|
|
|
1
|
-
"""
|
|
2
|
-
A module for using distance metrics for classification.
|
|
1
|
+
"""A module for using distance metrics for classification.
|
|
3
2
|
|
|
4
3
|
Classes:
|
|
5
|
-
DistanceMetricClassifier - A classifier that uses a specified distance metric for
|
|
4
|
+
DistanceMetricClassifier - A classifier that uses a specified distance metric for
|
|
5
|
+
classification.
|
|
6
6
|
Distance - A class that provides various distance metrics for use in classification.
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
Copyright (C) 2024 Siddharth Chaini
|
|
10
|
+
-----
|
|
11
|
+
This program is free software: you can redistribute it and/or modify
|
|
12
|
+
it under the terms of the GNU General Public License as published by
|
|
13
|
+
the Free Software Foundation, either version 3 of the License, or
|
|
14
|
+
(at your option) any later version.
|
|
15
|
+
|
|
16
|
+
This program is distributed in the hope that it will be useful,
|
|
17
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
18
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
19
|
+
GNU General Public License for more details.
|
|
20
|
+
|
|
21
|
+
You should have received a copy of the GNU General Public License
|
|
22
|
+
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
7
23
|
"""
|
|
8
24
|
|
|
9
|
-
from .classifier import
|
|
10
|
-
|
|
11
|
-
) # Importing the DistanceMetricClassifier from the classifier module
|
|
12
|
-
from .distances import (
|
|
13
|
-
Distance,
|
|
14
|
-
) # Importing the Distance class from the distances module
|
|
25
|
+
from .classifier import DistanceMetricClassifier # noqa
|
|
26
|
+
from .distances import Distance # noqa
|
|
15
27
|
|
|
16
|
-
__version__ = "0.1.
|
|
28
|
+
__version__ = "0.1.5"
|
distclassipy/classifier.py
CHANGED
|
@@ -1,34 +1,72 @@
|
|
|
1
|
+
"""A module containing the distance metric classifier.
|
|
2
|
+
|
|
3
|
+
This module contains the DistanceMetricClassifier introduced by Chaini et al. (2024)
|
|
4
|
+
in "Light Curve Classification with DistClassiPy: a new distance-based classifier"
|
|
5
|
+
|
|
6
|
+
Copyright (C) 2024 Siddharth Chaini
|
|
7
|
+
-----
|
|
8
|
+
This program is free software: you can redistribute it and/or modify
|
|
9
|
+
it under the terms of the GNU General Public License as published by
|
|
10
|
+
the Free Software Foundation, either version 3 of the License, or
|
|
11
|
+
(at your option) any later version.
|
|
12
|
+
|
|
13
|
+
This program is distributed in the hope that it will be useful,
|
|
14
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
15
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
16
|
+
GNU General Public License for more details.
|
|
17
|
+
|
|
18
|
+
You should have received a copy of the GNU General Public License
|
|
19
|
+
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
1
20
|
"""
|
|
2
|
-
|
|
3
|
-
|
|
21
|
+
|
|
22
|
+
from typing import Callable
|
|
4
23
|
|
|
5
24
|
import numpy as np
|
|
25
|
+
|
|
6
26
|
import pandas as pd
|
|
27
|
+
|
|
7
28
|
import scipy
|
|
8
|
-
|
|
29
|
+
|
|
9
30
|
from sklearn.base import BaseEstimator, ClassifierMixin
|
|
10
|
-
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
|
|
11
|
-
from sklearn.utils.multiclass import unique_labels
|
|
12
31
|
from sklearn.neighbors import KernelDensity
|
|
13
|
-
from
|
|
32
|
+
from sklearn.utils.multiclass import unique_labels
|
|
33
|
+
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
|
|
34
|
+
|
|
35
|
+
from .distances import Distance
|
|
36
|
+
|
|
37
|
+
# Hardcoded source packages to check for distance metrics.
|
|
38
|
+
METRIC_SOURCES_ = {
|
|
39
|
+
"scipy.spatial.distance": scipy.spatial.distance,
|
|
40
|
+
"distances.Distance": Distance(),
|
|
41
|
+
}
|
|
14
42
|
|
|
15
43
|
|
|
16
44
|
class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
17
|
-
"""
|
|
18
|
-
A distance-based classifier that supports the use of various distance metrics.
|
|
45
|
+
"""A distance-based classifier that supports different distance metrics.
|
|
19
46
|
|
|
20
|
-
The distance metric classifier determines the similarity between features in a
|
|
47
|
+
The distance metric classifier determines the similarity between features in a
|
|
48
|
+
dataset by leveraging the use of different distance metrics. A specified
|
|
49
|
+
distance metric is used to compute the distance between a given object and a
|
|
50
|
+
centroid for every training class in the feature space. The classifier supports
|
|
51
|
+
the use of different statistical measures for constructing the centroid and scaling
|
|
52
|
+
the computed distance. Additionally, the distance metric classifier also
|
|
53
|
+
optionally provides an estimate of the confidence of the classifier's predictions.
|
|
21
54
|
|
|
22
55
|
Parameters
|
|
23
56
|
----------
|
|
24
57
|
metric : str or callable, default="euclidean"
|
|
25
58
|
The distance metric to use for calculating the distance between features.
|
|
26
59
|
scale : bool, default=True
|
|
27
|
-
Whether to scale the distance between the test object and the centroid for a
|
|
60
|
+
Whether to scale the distance between the test object and the centroid for a
|
|
61
|
+
class in the feature space. If True, the data will be scaled based on the
|
|
62
|
+
specified dispersion statistic.
|
|
28
63
|
central_stat : {"mean", "median"}, default="median"
|
|
29
|
-
The statistic used to calculate the central tendency of the data to construct
|
|
64
|
+
The statistic used to calculate the central tendency of the data to construct
|
|
65
|
+
the feature-space centroid. Supported statistics are "mean" and "median".
|
|
30
66
|
dispersion_stat : {"std", "iqr"}, default="std"
|
|
31
|
-
The statistic used to calculate the dispersion of the data for scaling the
|
|
67
|
+
The statistic used to calculate the dispersion of the data for scaling the
|
|
68
|
+
distance. Supported statistics are "std" for standard deviation and "iqr"
|
|
69
|
+
for inter-quartile range.
|
|
32
70
|
|
|
33
71
|
.. versionadded:: 0.1.0
|
|
34
72
|
|
|
@@ -59,11 +97,14 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
59
97
|
|
|
60
98
|
Notes
|
|
61
99
|
-----
|
|
62
|
-
If using distance metrics supported by SciPy, it is desirable to pass a string,
|
|
100
|
+
If using distance metrics supported by SciPy, it is desirable to pass a string,
|
|
101
|
+
which allows SciPy to use an optimized C version of the code instead of the slower
|
|
102
|
+
Python version.
|
|
63
103
|
|
|
64
104
|
References
|
|
65
105
|
----------
|
|
66
|
-
.. [1] "Light Curve Classification with DistClassiPy: a new distance-based
|
|
106
|
+
.. [1] "Light Curve Classification with DistClassiPy: a new distance-based
|
|
107
|
+
classifier"
|
|
67
108
|
|
|
68
109
|
Examples
|
|
69
110
|
--------
|
|
@@ -88,9 +129,7 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
88
129
|
calculate_kde: bool = True,
|
|
89
130
|
calculate_1d_dist: bool = True,
|
|
90
131
|
):
|
|
91
|
-
"""
|
|
92
|
-
Initialize the classifier with specified parameters.
|
|
93
|
-
"""
|
|
132
|
+
"""Initialize the classifier with specified parameters."""
|
|
94
133
|
self.metric = metric
|
|
95
134
|
self.scale = scale
|
|
96
135
|
self.central_stat = central_stat
|
|
@@ -98,19 +137,13 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
98
137
|
self.calculate_kde = calculate_kde
|
|
99
138
|
self.calculate_1d_dist = calculate_1d_dist
|
|
100
139
|
|
|
101
|
-
def
|
|
102
|
-
"""
|
|
103
|
-
Set the metric function based on the provided metric.
|
|
140
|
+
def initialize_metric_function(self):
|
|
141
|
+
"""Set the metric function based on the provided metric.
|
|
104
142
|
|
|
105
|
-
If the metric is a string, the function will look for a corresponding
|
|
143
|
+
If the metric is a string, the function will look for a corresponding
|
|
144
|
+
function in scipy.spatial.distance or distances.Distance. If the metric
|
|
145
|
+
is a function, it will be used directly.
|
|
106
146
|
"""
|
|
107
|
-
|
|
108
|
-
# Hardcoded source packages to check for distance metrics.
|
|
109
|
-
metric_sources_ = {
|
|
110
|
-
"scipy.spatial.distance": scipy.spatial.distance,
|
|
111
|
-
"distances.Distance": Distance(),
|
|
112
|
-
}
|
|
113
|
-
|
|
114
147
|
if callable(self.metric):
|
|
115
148
|
self.metric_fn_ = self.metric
|
|
116
149
|
self.metric_arg_ = self.metric
|
|
@@ -118,9 +151,10 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
118
151
|
elif isinstance(self.metric, str):
|
|
119
152
|
metric_str_lowercase = self.metric.lower()
|
|
120
153
|
metric_found = False
|
|
121
|
-
for package_str, source in
|
|
154
|
+
for package_str, source in METRIC_SOURCES_.items():
|
|
122
155
|
|
|
123
|
-
# Don't use scipy for jaccard as their implementation only works with
|
|
156
|
+
# Don't use scipy for jaccard as their implementation only works with
|
|
157
|
+
# booleans - use custom jaccard instead
|
|
124
158
|
if (
|
|
125
159
|
package_str == "scipy.spatial.distance"
|
|
126
160
|
and metric_str_lowercase == "jaccard"
|
|
@@ -131,7 +165,8 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
131
165
|
self.metric_fn_ = getattr(source, metric_str_lowercase)
|
|
132
166
|
metric_found = True
|
|
133
167
|
|
|
134
|
-
# Use the string as an argument if it belongs to scipy as it is
|
|
168
|
+
# Use the string as an argument if it belongs to scipy as it is
|
|
169
|
+
# optimized
|
|
135
170
|
self.metric_arg_ = (
|
|
136
171
|
self.metric
|
|
137
172
|
if package_str == "scipy.spatial.distance"
|
|
@@ -140,14 +175,22 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
140
175
|
break
|
|
141
176
|
if not metric_found:
|
|
142
177
|
raise ValueError(
|
|
143
|
-
f"{self.metric} metric not found. Please pass a string of the
|
|
178
|
+
f"{self.metric} metric not found. Please pass a string of the "
|
|
179
|
+
"name of a metric in scipy.spatial.distance or "
|
|
180
|
+
"distances.Distance, or pass a metric function directly. For a "
|
|
181
|
+
"list of available metrics, see: "
|
|
182
|
+
"https://sidchaini.github.io/DistClassiPy/distances.html or "
|
|
183
|
+
"https://docs.scipy.org/doc/scipy/reference/spatial.distance.html"
|
|
144
184
|
)
|
|
145
185
|
|
|
146
186
|
def fit(self, X: np.array, y: np.array, feat_labels: list[str] = None):
|
|
147
|
-
"""
|
|
148
|
-
Calculate the feature space centroid for all classes in the training set (X,y) using the central statistic. If scaling is enabled, also calculate the appropriate dispersion statistic.
|
|
187
|
+
"""Calculate the feature space centroid for all classes.
|
|
149
188
|
|
|
150
|
-
This
|
|
189
|
+
This function calculates the feature space centroid in the training
|
|
190
|
+
set (X, y) for all classes using the central statistic. If scaling
|
|
191
|
+
is enabled, it also calculates the appropriate dispersion statistic.
|
|
192
|
+
This involves computing the centroid for every class in the feature space and
|
|
193
|
+
optionally calculating the kernel density estimate and 1-dimensional distance.
|
|
151
194
|
|
|
152
195
|
Parameters
|
|
153
196
|
----------
|
|
@@ -156,7 +199,8 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
156
199
|
y : array-like of shape (n_samples,)
|
|
157
200
|
The target values (class labels).
|
|
158
201
|
feat_labels : list of str, optional, default=None
|
|
159
|
-
The feature labels. If not provided, default labels representing feature
|
|
202
|
+
The feature labels. If not provided, default labels representing feature
|
|
203
|
+
number will be used.
|
|
160
204
|
|
|
161
205
|
Returns
|
|
162
206
|
-------
|
|
@@ -167,7 +211,7 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
167
211
|
self.classes_ = unique_labels(y)
|
|
168
212
|
self.n_features_in_ = X.shape[1]
|
|
169
213
|
|
|
170
|
-
self.
|
|
214
|
+
self.initialize_metric_function()
|
|
171
215
|
|
|
172
216
|
if feat_labels is None:
|
|
173
217
|
feat_labels = [f"Feature_{x}" for x in range(X.shape[1])]
|
|
@@ -188,7 +232,8 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
188
232
|
std_list = []
|
|
189
233
|
for cur_class in self.classes_:
|
|
190
234
|
cur_X = X[y == cur_class]
|
|
191
|
-
# Note we're using ddof=1 because we're dealing with a sample.
|
|
235
|
+
# Note we're using ddof=1 because we're dealing with a sample.
|
|
236
|
+
# See more: https://stackoverflow.com/a/46083501/10743245
|
|
192
237
|
std_list.append(np.std(cur_X, axis=0, ddof=1).ravel())
|
|
193
238
|
df_std = pd.DataFrame(
|
|
194
239
|
data=np.array(std_list), index=self.classes_, columns=feat_labels
|
|
@@ -200,7 +245,8 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
200
245
|
|
|
201
246
|
for cur_class in self.classes_:
|
|
202
247
|
cur_X = X[y == cur_class]
|
|
203
|
-
# Note we're using ddof=1 because we're dealing with a sample.
|
|
248
|
+
# Note we're using ddof=1 because we're dealing with a sample.
|
|
249
|
+
# See more: https://stackoverflow.com/a/46083501/10743245
|
|
204
250
|
iqr_list.append(
|
|
205
251
|
np.quantile(cur_X, q=0.75, axis=0).ravel()
|
|
206
252
|
- np.quantile(cur_X, q=0.25, axis=0).ravel()
|
|
@@ -233,7 +279,9 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
233
279
|
def predict(self, X: np.array):
|
|
234
280
|
"""Predict the class labels for the provided X.
|
|
235
281
|
|
|
236
|
-
The prediction is based on the distance of each data point in the input sample
|
|
282
|
+
The prediction is based on the distance of each data point in the input sample
|
|
283
|
+
to the centroid for each class in the feature space. The predicted class is the
|
|
284
|
+
one whose centroid is the closest to the input sample.
|
|
237
285
|
|
|
238
286
|
Parameters
|
|
239
287
|
----------
|
|
@@ -277,12 +325,14 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
277
325
|
return y_pred
|
|
278
326
|
|
|
279
327
|
def predict_and_analyse(self, X: np.array):
|
|
280
|
-
"""
|
|
281
|
-
Predict the class labels for the provided X and perform analysis.
|
|
328
|
+
"""Predict the class labels for the provided X and perform analysis.
|
|
282
329
|
|
|
283
|
-
The prediction is based on the distance of each data point in the input sample
|
|
330
|
+
The prediction is based on the distance of each data point in the input sample
|
|
331
|
+
to the centroid for each class in the feature space. The predicted class is the
|
|
332
|
+
one whose centroid is the closest to the input sample.
|
|
284
333
|
|
|
285
|
-
The analysis involves saving all calculated distances and confidences as an
|
|
334
|
+
The analysis involves saving all calculated distances and confidences as an
|
|
335
|
+
attribute for inspection and analysis later.
|
|
286
336
|
|
|
287
337
|
Parameters
|
|
288
338
|
----------
|
|
@@ -381,20 +431,24 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
381
431
|
return y_pred
|
|
382
432
|
|
|
383
433
|
def calculate_confidence(self, method: str = "distance_inverse"):
|
|
384
|
-
"""
|
|
385
|
-
Calculate the confidence for each prediction.
|
|
434
|
+
"""Calculate the confidence for each prediction.
|
|
386
435
|
|
|
387
|
-
The confidence is calculated based on either the distance of each data point to
|
|
436
|
+
The confidence is calculated based on either the distance of each data point to
|
|
437
|
+
the centroids of the training data or, optionally, the kernel density estimate or
|
|
438
|
+
1-dimensional distance.
|
|
388
439
|
|
|
389
440
|
Parameters
|
|
390
441
|
----------
|
|
391
|
-
method : {"distance_inverse", "1d_distance_inverse",
|
|
392
|
-
|
|
442
|
+
method : {"distance_inverse", "1d_distance_inverse", "kde_likelihood"},
|
|
443
|
+
default="distance_inverse"
|
|
444
|
+
The method to use for calculating confidence. Default is
|
|
445
|
+
'distance_inverse'.
|
|
393
446
|
"""
|
|
394
447
|
check_is_fitted(self, "is_fitted_")
|
|
395
448
|
if not hasattr(self, "analyis_"):
|
|
396
449
|
raise ValueError(
|
|
397
|
-
"Use predict_and_analyse() instead of predict() for
|
|
450
|
+
"Use predict_and_analyse() instead of predict() for "
|
|
451
|
+
"confidence calculation."
|
|
398
452
|
)
|
|
399
453
|
|
|
400
454
|
# Calculate confidence for each prediction
|
|
@@ -409,7 +463,8 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
409
463
|
elif method == "1d_distance_inverse":
|
|
410
464
|
if not self.calculate_1d_dist:
|
|
411
465
|
raise ValueError(
|
|
412
|
-
"method='1d_distance_inverse' is only valid if calculate_1d_dist
|
|
466
|
+
"method='1d_distance_inverse' is only valid if calculate_1d_dist "
|
|
467
|
+
"is set to True"
|
|
413
468
|
)
|
|
414
469
|
self.confidence_df_ = pd.DataFrame(
|
|
415
470
|
data=self.conf_cl_.T, columns=[f"{x}_conf" for x in self.classes_]
|
|
@@ -418,7 +473,8 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
418
473
|
elif method == "kde_likelihood":
|
|
419
474
|
if not self.calculate_kde:
|
|
420
475
|
raise ValueError(
|
|
421
|
-
"method='kde_likelihood' is only valid if calculate_kde is set
|
|
476
|
+
"method='kde_likelihood' is only valid if calculate_kde is set "
|
|
477
|
+
"to True"
|
|
422
478
|
)
|
|
423
479
|
|
|
424
480
|
self.confidence_df_ = pd.DataFrame(
|