distclassipy 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- distclassipy/classifier.py +141 -58
- distclassipy/distances.py +389 -226
- {distclassipy-0.1.0.dist-info → distclassipy-0.1.2.dist-info}/METADATA +64 -13
- distclassipy-0.1.2.dist-info/RECORD +9 -0
- {distclassipy-0.1.0.dist-info → distclassipy-0.1.2.dist-info}/WHEEL +1 -1
- distclassipy-0.1.0.dist-info/RECORD +0 -9
- {distclassipy-0.1.0.dist-info → distclassipy-0.1.2.dist-info}/LICENSE +0 -0
- {distclassipy-0.1.0.dist-info → distclassipy-0.1.2.dist-info}/top_level.txt +0 -0
distclassipy/classifier.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import numpy as np
|
|
2
2
|
import pandas as pd
|
|
3
|
-
|
|
3
|
+
import scipy
|
|
4
4
|
from .distances import Distance
|
|
5
5
|
from sklearn.base import BaseEstimator, ClassifierMixin
|
|
6
6
|
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
|
|
@@ -11,11 +11,68 @@ from typing import Callable
|
|
|
11
11
|
|
|
12
12
|
class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
13
13
|
"""
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
14
|
+
A distance-based classifier that supports the use of various distance metrics.
|
|
15
|
+
|
|
16
|
+
The distance metric classifier determines the similarity between features in a dataset by leveraging different distance metrics. A specified distance metric is used to compute the distance between a given object and a centroid for every training class in the feature space. The classifier supports the use of different statistical measures for constructing the centroid and scaling the computed distance. Additionally, the distance metric classifier optionally provides an estimate of the confidence of the classifier's predictions.
|
|
17
|
+
|
|
18
|
+
Parameters
|
|
19
|
+
----------
|
|
20
|
+
metric : str or callable, default="euclidean"
|
|
21
|
+
The distance metric to use for calculating the distance between features.
|
|
22
|
+
scale : bool, default=True
|
|
23
|
+
Whether to scale the distance between the test object and the centroid for a class in the feature space. If True, the data will be scaled based on the specified dispersion statistic.
|
|
24
|
+
central_stat : {"mean", "median"}, default="median"
|
|
25
|
+
The statistic used to calculate the central tendency of the data to construct the feature-space centroid. Supported statistics are "mean" and "median".
|
|
26
|
+
dispersion_stat : {"std", "iqr"}, default="std"
|
|
27
|
+
The statistic used to calculate the dispersion of the data for scaling the distance. Supported statistics are "std" for standard deviation and "iqr" for inter-quartile range.
|
|
28
|
+
|
|
29
|
+
.. versionadded:: 0.1.0
|
|
30
|
+
|
|
31
|
+
calculate_kde : bool, default=False
|
|
32
|
+
Whether to calculate a kernel density estimate based confidence parameter.
|
|
33
|
+
calculate_1d_dist : bool, default=False
|
|
34
|
+
Whether to calculate the 1-dimensional distance based confidence parameter.
|
|
35
|
+
|
|
36
|
+
Attributes
|
|
37
|
+
----------
|
|
38
|
+
metric : str or callable
|
|
39
|
+
The distance metric used for classification.
|
|
40
|
+
scale : bool
|
|
41
|
+
Indicates whether the data is scaled.
|
|
42
|
+
central_stat : str
|
|
43
|
+
The statistic used for calculating central tendency.
|
|
44
|
+
dispersion_stat : str
|
|
45
|
+
The statistic used for calculating dispersion.
|
|
46
|
+
calculate_kde : bool
|
|
47
|
+
Indicates whether a kernel density estimate is calculated.
|
|
48
|
+
calculate_1d_dist : bool
|
|
49
|
+
Indicates whether 1-dimensional distances are calculated.
|
|
50
|
+
|
|
51
|
+
See Also
|
|
52
|
+
--------
|
|
53
|
+
scipy.spatial.distance : Other distance metrics provided in SciPy
|
|
54
|
+
distclassipy.Distance : Distance metrics included with DistClassiPy
|
|
55
|
+
|
|
56
|
+
Notes
|
|
57
|
+
-----
|
|
58
|
+
If using distance metrics supported by SciPy, it is desirable to pass a string, which allows SciPy to use an optimized C version of the code instead of the slower Python version.
|
|
59
|
+
|
|
60
|
+
References
|
|
61
|
+
----------
|
|
62
|
+
.. [1] "Light Curve Classification with DistClassiPy: a new distance-based classifier"
|
|
63
|
+
|
|
64
|
+
Examples
|
|
65
|
+
--------
|
|
66
|
+
>>> import distclassipy as dcpy
|
|
67
|
+
>>> from sklearn.datasets import make_classification
|
|
68
|
+
>>> X, y = make_classification(n_samples=1000, n_features=4,
|
|
69
|
+
... n_informative=2, n_redundant=0,
|
|
70
|
+
... random_state=0, shuffle=False)
|
|
71
|
+
>>> clf = dcpy.DistanceMetricClassifier(metric="canberra")
|
|
72
|
+
>>> clf.fit(X, y)
|
|
73
|
+
DistanceMetricClassifier(...)
|
|
74
|
+
>>> print(clf.predict([[0, 0, 0, 0]]))
|
|
75
|
+
[0]
|
|
19
76
|
"""
|
|
20
77
|
|
|
21
78
|
def __init__(
|
|
@@ -29,21 +86,6 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
29
86
|
):
|
|
30
87
|
"""
|
|
31
88
|
Initialize the classifier with specified parameters.
|
|
32
|
-
|
|
33
|
-
Parameters
|
|
34
|
-
----------
|
|
35
|
-
metric : str or callable, optional
|
|
36
|
-
The distance metric to use. Default is 'euclidean'.
|
|
37
|
-
scale : bool, optional
|
|
38
|
-
If True, classifier is scaled in terms of standard deviations. Default is True.
|
|
39
|
-
central_stat : str, optional
|
|
40
|
-
The statistical measure to calculate the central tendency of the training template('median' or 'mean')
|
|
41
|
-
dispersion_stat : str, optional
|
|
42
|
-
The statistical measure to calculate the dispersion of the training template ('iqr' or 'std').
|
|
43
|
-
calculate_kde : bool, optional
|
|
44
|
-
If True, calculate the kernel density estimate. Default is True.
|
|
45
|
-
calculate_1d_dist : bool, optional
|
|
46
|
-
If True, calculate the 1-dimensional distance. Default is True.
|
|
47
89
|
"""
|
|
48
90
|
self.metric = metric
|
|
49
91
|
self.scale = scale
|
|
@@ -52,24 +94,68 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
52
94
|
self.calculate_kde = calculate_kde
|
|
53
95
|
self.calculate_1d_dist = calculate_1d_dist
|
|
54
96
|
|
|
97
|
+
# Hardcoded source packages to check for distance metrics.
|
|
98
|
+
self.metric_sources_ = {
|
|
99
|
+
"scipy.spatial.distance": scipy.spatial.distance,
|
|
100
|
+
"distances.Distance": Distance(),
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
def set_metric_fn_(self):
|
|
104
|
+
"""
|
|
105
|
+
Set the metric function based on the provided metric.
|
|
106
|
+
|
|
107
|
+
If the metric is a string, the function will look for a corresponding function in scipy.spatial.distance or distances.Distance. If the metric is a function, it will be used directly.
|
|
108
|
+
"""
|
|
109
|
+
|
|
110
|
+
if callable(self.metric):
|
|
111
|
+
self.metric_fn_ = self.metric
|
|
112
|
+
self.metric_arg_ = self.metric
|
|
113
|
+
|
|
114
|
+
elif isinstance(self.metric, str):
|
|
115
|
+
metric_str_lowercase = self.metric.lower()
|
|
116
|
+
metric_found = False
|
|
117
|
+
for package_str, source in self.metric_sources_.items():
|
|
118
|
+
if hasattr(source, metric_str_lowercase):
|
|
119
|
+
self.metric_fn_ = getattr(source, metric_str_lowercase)
|
|
120
|
+
metric_found = True
|
|
121
|
+
if package_str == "scipy.spatial.distance":
|
|
122
|
+
# Use the string as an argument if it belongs to scipy as it is optimized
|
|
123
|
+
self.metric_arg_ = self.metric
|
|
124
|
+
else:
|
|
125
|
+
self.metric_arg_ = self.metric_fn_
|
|
126
|
+
break
|
|
127
|
+
|
|
128
|
+
if not metric_found:
|
|
129
|
+
raise ValueError(
|
|
130
|
+
f"{self.metric} metric not found. Please pass a string of the name of a metric in scipy.spatial.distance or distances.Distance, or pass a metric function directly. For a list of available metrics, see: https://sidchaini.github.io/DistClassiPy/distances.html or https://docs.scipy.org/doc/scipy/reference/spatial.distance.html"
|
|
131
|
+
)
|
|
132
|
+
|
|
55
133
|
def fit(self, X: np.array, y: np.array, feat_labels: list[str] = None):
|
|
56
|
-
"""
|
|
134
|
+
"""
|
|
135
|
+
Calculate the feature space centroid for all classes in the training set (X,y) using the central statistic. If scaling is enabled, also calculate the appropriate dispersion statistic.
|
|
57
136
|
|
|
58
|
-
This involves
|
|
137
|
+
This involves computing the centroid for every class in the feature space and optionally calculating the kernel density estimate and 1-dimensional distance.
|
|
59
138
|
|
|
60
139
|
Parameters
|
|
61
140
|
----------
|
|
62
141
|
X : array-like of shape (n_samples, n_features)
|
|
63
142
|
The training input samples.
|
|
64
143
|
y : array-like of shape (n_samples,)
|
|
65
|
-
The target values.
|
|
66
|
-
feat_labels : list of str, optional
|
|
144
|
+
The target values (class labels).
|
|
145
|
+
feat_labels : list of str, optional, default=None
|
|
67
146
|
The feature labels. If not provided, default labels representing feature number will be used.
|
|
147
|
+
|
|
148
|
+
Returns
|
|
149
|
+
-------
|
|
150
|
+
self : object
|
|
151
|
+
Fitted estimator.
|
|
68
152
|
"""
|
|
69
153
|
X, y = check_X_y(X, y)
|
|
70
154
|
self.classes_ = unique_labels(y)
|
|
71
155
|
self.n_features_in_ = X.shape[1]
|
|
72
156
|
|
|
157
|
+
self.set_metric_fn_()
|
|
158
|
+
|
|
73
159
|
if feat_labels is None:
|
|
74
160
|
feat_labels = [f"Feature_{x}" for x in range(X.shape[1])]
|
|
75
161
|
|
|
@@ -112,7 +198,6 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
112
198
|
self.df_iqr_ = df_iqr
|
|
113
199
|
|
|
114
200
|
if self.calculate_kde:
|
|
115
|
-
self.set_metric_fn()
|
|
116
201
|
self.kde_dict_ = {}
|
|
117
202
|
|
|
118
203
|
for cl in self.classes_:
|
|
@@ -133,21 +218,26 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
133
218
|
return self
|
|
134
219
|
|
|
135
220
|
def predict(self, X: np.array):
|
|
136
|
-
"""Predict the class labels for the provided
|
|
221
|
+
"""Predict the class labels for the provided X.
|
|
137
222
|
|
|
138
|
-
The prediction is based on the distance of each data point to the
|
|
223
|
+
The prediction is based on the distance of each data point in the input sample to the centroid for each class in the feature space. The predicted class is the one whose centroid is the closest to the input sample.
|
|
139
224
|
|
|
140
225
|
Parameters
|
|
141
226
|
----------
|
|
142
227
|
X : array-like of shape (n_samples, n_features)
|
|
143
228
|
The input samples.
|
|
229
|
+
|
|
230
|
+
Returns
|
|
231
|
+
-------
|
|
232
|
+
y : ndarray of shape (n_samples,)
|
|
233
|
+
The predicted classes.
|
|
144
234
|
"""
|
|
145
235
|
check_is_fitted(self, "is_fitted_")
|
|
146
236
|
X = check_array(X)
|
|
147
237
|
|
|
148
238
|
if not self.scale:
|
|
149
|
-
dist_arr = distance.cdist(
|
|
150
|
-
XA=X, XB=self.df_centroid_.to_numpy(), metric=self.
|
|
239
|
+
dist_arr = scipy.spatial.distance.cdist(
|
|
240
|
+
XA=X, XB=self.df_centroid_.to_numpy(), metric=self.metric_arg_
|
|
151
241
|
)
|
|
152
242
|
|
|
153
243
|
else:
|
|
@@ -164,48 +254,39 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
164
254
|
w = wtdf.loc[cl].to_numpy() # 1/std dev
|
|
165
255
|
XB = XB * w # w is for this class only
|
|
166
256
|
XA = X * w # w is for this class only
|
|
167
|
-
cl_dist = distance.cdist(
|
|
257
|
+
cl_dist = scipy.spatial.distance.cdist(
|
|
258
|
+
XA=XA, XB=XB, metric=self.metric_arg_
|
|
259
|
+
)
|
|
168
260
|
dist_arr_list.append(cl_dist)
|
|
169
261
|
dist_arr = np.column_stack(dist_arr_list)
|
|
170
262
|
|
|
171
263
|
y_pred = self.classes_[dist_arr.argmin(axis=1)]
|
|
172
264
|
return y_pred
|
|
173
265
|
|
|
174
|
-
def set_metric_fn(self):
|
|
175
|
-
"""
|
|
176
|
-
Set the metric function based on the specified metric.
|
|
177
|
-
|
|
178
|
-
If the metric is a string, the function will look for a corresponding function in scipy.spatial.distance or distances.Distance. If the metric is a function, it will be used directly.
|
|
179
|
-
"""
|
|
180
|
-
if not callable(self.metric) or isinstance(self.metric, str):
|
|
181
|
-
if hasattr(distance, self.metric):
|
|
182
|
-
self.metric_fn_ = getattr(distance, self.metric)
|
|
183
|
-
elif hasattr(Distance(), self.metric):
|
|
184
|
-
self.metric_fn_ = getattr(Distance(), self.metric)
|
|
185
|
-
else:
|
|
186
|
-
raise ValueError(
|
|
187
|
-
f"{self.metric} metric not found. Please pass a string of the name of a metric in scipy.spatial.distance or distances.Distance, or pass a metric function directly. For a list of available metrics, see: https://sidchaini.github.io/DistClassiPy/distances.html or https://docs.scipy.org/doc/scipy/reference/spatial.distance.html"
|
|
188
|
-
)
|
|
189
|
-
else:
|
|
190
|
-
self.metric_fn_ = self.metric
|
|
191
|
-
|
|
192
266
|
def predict_and_analyse(self, X: np.array):
|
|
193
267
|
"""
|
|
194
|
-
Predict the class labels for the provided
|
|
268
|
+
Predict the class labels for the provided X and perform analysis.
|
|
269
|
+
|
|
270
|
+
The prediction is based on the distance of each data point in the input sample to the centroid for each class in the feature space. The predicted class is the one whose centroid is the closest to the input sample.
|
|
195
271
|
|
|
196
|
-
The analysis
|
|
272
|
+
The analysis involves saving all calculated distances and confidences as an attribute for inspection and analysis later.
|
|
197
273
|
|
|
198
274
|
Parameters
|
|
199
275
|
----------
|
|
200
276
|
X : array-like of shape (n_samples, n_features)
|
|
201
277
|
The input samples.
|
|
278
|
+
|
|
279
|
+
Returns
|
|
280
|
+
-------
|
|
281
|
+
y : ndarray of shape (n_samples,)
|
|
282
|
+
The predicted classes.
|
|
202
283
|
"""
|
|
203
284
|
check_is_fitted(self, "is_fitted_")
|
|
204
285
|
X = check_array(X)
|
|
205
286
|
|
|
206
287
|
if not self.scale:
|
|
207
|
-
dist_arr = distance.cdist(
|
|
208
|
-
XA=X, XB=self.df_centroid_.to_numpy(), metric=self.
|
|
288
|
+
dist_arr = scipy.spatial.distance.cdist(
|
|
289
|
+
XA=X, XB=self.df_centroid_.to_numpy(), metric=self.metric_arg_
|
|
209
290
|
)
|
|
210
291
|
|
|
211
292
|
else:
|
|
@@ -222,7 +303,9 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
222
303
|
w = wtdf.loc[cl].to_numpy() # 1/std dev
|
|
223
304
|
XB = XB * w # w is for this class only
|
|
224
305
|
XA = X * w # w is for this class only
|
|
225
|
-
cl_dist = distance.cdist(
|
|
306
|
+
cl_dist = scipy.spatial.distance.cdist(
|
|
307
|
+
XA=XA, XB=XB, metric=self.metric_arg_
|
|
308
|
+
)
|
|
226
309
|
dist_arr_list.append(cl_dist)
|
|
227
310
|
dist_arr = np.column_stack(dist_arr_list)
|
|
228
311
|
|
|
@@ -262,12 +345,12 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
262
345
|
for cl in self.classes_:
|
|
263
346
|
sum_1d_dists = np.zeros(shape=(len(Xdf_temp)))
|
|
264
347
|
for feat in Xdf_temp.columns:
|
|
265
|
-
dists = distance.cdist(
|
|
348
|
+
dists = scipy.spatial.distance.cdist(
|
|
266
349
|
XA=np.zeros(shape=(1, 1)),
|
|
267
350
|
XB=(self.df_centroid_.loc[cl] - Xdf_temp)[feat]
|
|
268
351
|
.to_numpy()
|
|
269
352
|
.reshape(-1, 1),
|
|
270
|
-
metric=self.
|
|
353
|
+
metric=self.metric_arg_,
|
|
271
354
|
).ravel()
|
|
272
355
|
if self.scale and self.dispersion_stat == "std":
|
|
273
356
|
sum_1d_dists = sum_1d_dists + dists / self.df_std_.loc[cl, feat]
|
|
@@ -290,11 +373,11 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
290
373
|
"""
|
|
291
374
|
Calculate the confidence for each prediction.
|
|
292
375
|
|
|
293
|
-
The confidence is calculated based on the distance of each data point to the training
|
|
376
|
+
The confidence is calculated from the distance of each data point to the centroids of the training data or, optionally, from the kernel density estimate or the 1-dimensional distance.
|
|
294
377
|
|
|
295
378
|
Parameters
|
|
296
379
|
----------
|
|
297
|
-
method :
|
|
380
|
+
method : {"distance_inverse", "1d_distance_inverse", "kde_likelihood"}, default="distance_inverse"
|
|
298
381
|
The method to use for calculating confidence. Default is 'distance_inverse'.
|
|
299
382
|
"""
|
|
300
383
|
check_is_fitted(self, "is_fitted_")
|