distclassipy 0.2.0a0__tar.gz → 0.2.2a1__tar.gz
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {distclassipy-0.2.0a0 → distclassipy-0.2.2a1}/PKG-INFO +14 -5
- {distclassipy-0.2.0a0 → distclassipy-0.2.2a1}/README.md +11 -3
- {distclassipy-0.2.0a0 → distclassipy-0.2.2a1}/distclassipy/__init__.py +13 -3
- {distclassipy-0.2.0a0 → distclassipy-0.2.2a1}/distclassipy/classifier.py +345 -37
- distclassipy-0.2.2a1/distclassipy/distances.py +1564 -0
- {distclassipy-0.2.0a0 → distclassipy-0.2.2a1}/distclassipy.egg-info/PKG-INFO +14 -5
- {distclassipy-0.2.0a0 → distclassipy-0.2.2a1}/tests/test_classifier.py +44 -4
- {distclassipy-0.2.0a0 → distclassipy-0.2.2a1}/tests/test_distances.py +19 -21
- distclassipy-0.2.2a1/tests/test_distances_prop.py +65 -0
- distclassipy-0.2.0a0/distclassipy/distances.py +0 -1484
- distclassipy-0.2.0a0/tests/test_distances_prop.py +0 -112
- {distclassipy-0.2.0a0 → distclassipy-0.2.2a1}/LICENSE +0 -0
- {distclassipy-0.2.0a0 → distclassipy-0.2.2a1}/distclassipy.egg-info/SOURCES.txt +0 -0
- {distclassipy-0.2.0a0 → distclassipy-0.2.2a1}/distclassipy.egg-info/dependency_links.txt +0 -0
- {distclassipy-0.2.0a0 → distclassipy-0.2.2a1}/distclassipy.egg-info/requires.txt +0 -0
- {distclassipy-0.2.0a0 → distclassipy-0.2.2a1}/distclassipy.egg-info/top_level.txt +0 -0
- {distclassipy-0.2.0a0 → distclassipy-0.2.2a1}/pyproject.toml +0 -0
- {distclassipy-0.2.0a0 → distclassipy-0.2.2a1}/setup.cfg +0 -0
- {distclassipy-0.2.0a0 → distclassipy-0.2.2a1}/setup.py +0 -0
{distclassipy-0.2.0a0 → distclassipy-0.2.2a1}/PKG-INFO
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: distclassipy
-Version: 0.2.0a0
+Version: 0.2.2a1
 Summary: A python package for a distance-based classifier which can use several different distance metrics.
 Author-email: Siddharth Chaini <sidchaini@gmail.com>
 License: GNU GENERAL PUBLIC LICENSE
@@ -697,6 +697,7 @@ Requires-Dist: joblib>=1.3.2
 Requires-Dist: numpy>=1.25.2
 Requires-Dist: pandas>=2.0.3
 Requires-Dist: scikit-learn>=1.2.2
+Dynamic: license-file
 
 <h1 align="center">
 <picture align="center">
@@ -740,17 +741,25 @@ X, y = make_classification(
     random_state=0,
     shuffle=False,
 )
+# Example usage of DistanceMetricClassifier
 clf = dcpy.DistanceMetricClassifier()
 clf.fit(X, y)
-print(clf.predict([[0, 0, 0, 0]]))
+print(clf.predict([[0, 0, 0, 0]], metric="canberra"))
+
+# Example usage of EnsembleDistanceClassifier
+ensemble_clf = dcpy.EnsembleDistanceClassifier(feat_idx=0)
+ensemble_clf.fit(X, y)
+print(ensemble_clf.predict(X))
 ```
 
 ## Features
 - **Distance Metric-Based Classification**: Utilizes a variety of distance metrics for classification.
 - **Customizable for Scientific Goals**: Allows fine-tuning based on scientific objectives by selecting appropriate distance metrics and features, enhancing both computational efficiency and model performance.
 - **Interpretable Results**: Offers improved interpretability of classification outcomes by directly using distance metrics and feature importance, making it ideal for scientific applications.
-- **Efficient and Scalable**: Demonstrates lower computational requirements compared to traditional methods like Random Forests, making it suitable for large datasets
-- **Open Source and Accessible**: Available as an open-source Python package on PyPI, encouraging broad application in astronomy and beyond
+- **Efficient and Scalable**: Demonstrates lower computational requirements compared to traditional methods like Random Forests, making it suitable for large datasets.
+- **Open Source and Accessible**: Available as an open-source Python package on PyPI, encouraging broad application in astronomy and beyond.
+- **(NEW) Ensemble Distance Classification**: Leverages an ensemble approach to use different distance metrics for each quantile, improving classification performance across diverse data distributions.
+- **(NEW) Expanded Distance Metrics**: DistClassiPy now offers 43 built-in distance metrics, an increase from the previous 18. Additionally, users can still define and use custom distance metrics as needed.
 
 ## Documentation
 
{distclassipy-0.2.0a0 → distclassipy-0.2.2a1}/README.md
@@ -40,17 +40,25 @@ X, y = make_classification(
     random_state=0,
     shuffle=False,
 )
+# Example usage of DistanceMetricClassifier
 clf = dcpy.DistanceMetricClassifier()
 clf.fit(X, y)
-print(clf.predict([[0, 0, 0, 0]]))
+print(clf.predict([[0, 0, 0, 0]], metric="canberra"))
+
+# Example usage of EnsembleDistanceClassifier
+ensemble_clf = dcpy.EnsembleDistanceClassifier(feat_idx=0)
+ensemble_clf.fit(X, y)
+print(ensemble_clf.predict(X))
 ```
 
 ## Features
 - **Distance Metric-Based Classification**: Utilizes a variety of distance metrics for classification.
 - **Customizable for Scientific Goals**: Allows fine-tuning based on scientific objectives by selecting appropriate distance metrics and features, enhancing both computational efficiency and model performance.
 - **Interpretable Results**: Offers improved interpretability of classification outcomes by directly using distance metrics and feature importance, making it ideal for scientific applications.
-- **Efficient and Scalable**: Demonstrates lower computational requirements compared to traditional methods like Random Forests, making it suitable for large datasets
-- **Open Source and Accessible**: Available as an open-source Python package on PyPI, encouraging broad application in astronomy and beyond
+- **Efficient and Scalable**: Demonstrates lower computational requirements compared to traditional methods like Random Forests, making it suitable for large datasets.
+- **Open Source and Accessible**: Available as an open-source Python package on PyPI, encouraging broad application in astronomy and beyond.
+- **(NEW) Ensemble Distance Classification**: Leverages an ensemble approach to use different distance metrics for each quantile, improving classification performance across diverse data distributions.
+- **(NEW) Expanded Distance Metrics**: DistClassiPy now offers 43 built-in distance metrics, an increase from the previous 18. Additionally, users can still define and use custom distance metrics as needed.
 
 ## Documentation
 
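The README bullets above note that user-defined distance metrics are still supported. Below is a minimal sketch of that hook, written against the 0.2.2a1 API shown in this diff; `squared_chord` is an illustrative callable defined locally, not a name from the package.

```python
import numpy as np
import distclassipy as dcpy
from sklearn.datasets import make_classification

X, y = make_classification(
    n_samples=1000, n_features=4, n_informative=2,
    n_redundant=0, random_state=0, shuffle=False,
)

# Any callable taking two 1-D vectors and returning a scalar can be
# passed wherever a metric name is accepted.
def squared_chord(u, v):
    return np.sum((np.sqrt(np.abs(u)) - np.sqrt(np.abs(v))) ** 2)

clf = dcpy.DistanceMetricClassifier()
clf.fit(X, y)
print(clf.predict([[0, 0, 0, 0]], metric=squared_chord))
```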
{distclassipy-0.2.0a0 → distclassipy-0.2.2a1}/distclassipy/__init__.py
@@ -22,7 +22,17 @@ You should have received a copy of the GNU General Public License
 along with this program. If not, see <https://www.gnu.org/licenses/>.
 """
 
-from .classifier import DistanceMetricClassifier
-
+from .classifier import (
+    DistanceMetricClassifier,
+    EnsembleDistanceClassifier,
+)
+from .distances import _ALL_METRICS
 
-__version__ = "0.2.0a0"
+__version__ = "0.2.2a1"
+
+__all__ = [
+    "DistanceMetricClassifier",
+    "EnsembleDistanceClassifier",
+    "Distance",
+    "_ALL_METRICS",
+]
{distclassipy-0.2.0a0 → distclassipy-0.2.2a1}/distclassipy/classifier.py
@@ -3,6 +3,21 @@
 This module contains the DistanceMetricClassifier introduced by Chaini et al. (2024)
 in "Light Curve Classification with DistClassiPy: a new distance-based classifier"
 
+
+.. autoclass:: distclassipy.classifier.DistanceMetricClassifier
+   :members:
+   :inherited-members:
+   :exclude-members: set_fit_request, set_predict_request
+
+.. autoclass:: distclassipy.classifier.EnsembleDistanceClassifier
+   :members:
+   :inherited-members:
+   :exclude-members: set_fit_request, set_predict_request
+
+.. doctest-skip::
+
+.. skip::
+
 Copyright (C) 2024 Siddharth Chaini
 -----
 This program is free software: you can redistribute it and/or modify
@@ -19,7 +34,7 @@ You should have received a copy of the GNU General Public License
 along with this program. If not, see <https://www.gnu.org/licenses/>.
 """
 
-from typing import Callable
+from typing import Callable, Tuple
 
 import numpy as np
 
@@ -28,15 +43,18 @@ import pandas as pd
 import scipy
 
 from sklearn.base import BaseEstimator, ClassifierMixin
+from sklearn.metrics import accuracy_score
+from sklearn.model_selection import train_test_split
 from sklearn.utils.multiclass import unique_labels
-from sklearn.utils.validation import
+from sklearn.utils.validation import check_is_fitted, check_array
 
-from .
+from . import distances
+from .distances import _ALL_METRICS
 
 # Hardcoded source packages to check for distance metrics.
 METRIC_SOURCES_ = {
     "scipy.spatial.distance": scipy.spatial.distance,
-    "distances": distances,
+    "distclassipy.distances": distances,
 }
 
@@ -44,7 +62,7 @@ def initialize_metric_function(metric):
     """Set the metric function based on the provided metric.
 
     If the metric is a string, the function will look for a corresponding
-    function in scipy.spatial.distance or distances. If the metric
+    function in scipy.spatial.distance or distclassipy.distances. If the metric
     is a function, it will be used directly.
     """
     if callable(metric):
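For readers skimming the diff, the lookup this docstring describes can be pictured as a `getattr` walk over the `METRIC_SOURCES_` modules. The following is a simplified illustration of that idea, not the package's actual implementation; `resolve_metric` is a hypothetical helper, and only the SciPy source is registered here so the snippet stays self-contained.

```python
import scipy.spatial.distance

# Candidate modules, in lookup order; the package also registers
# distclassipy.distances as a second source.
METRIC_SOURCES = {
    "scipy.spatial.distance": scipy.spatial.distance,
}

def resolve_metric(name: str):
    """Return the first callable whose name matches in any source module."""
    for module in METRIC_SOURCES.values():
        fn = getattr(module, name.lower(), None)
        if callable(fn):
            return fn
    raise ValueError(f"{name} metric not found.")

print(resolve_metric("canberra"))  # -> SciPy's canberra distance function
```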
@@ -78,7 +96,7 @@ def initialize_metric_function(metric):
        raise ValueError(
            f"{metric} metric not found. Please pass a string of the "
            "name of a metric in scipy.spatial.distance or "
-            "distances, or pass a metric function directly. For a "
+            "distclassipy.distances, or pass a metric function directly. For a "
            "list of available metrics, see: "
            "https://sidchaini.github.io/DistClassiPy/distances.html or "
            "https://docs.scipy.org/doc/scipy/reference/spatial.distance.html"
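When a name is found in neither source, the `ValueError` constructed above is what callers see. A small sketch of triggering it, assuming the 0.2.2a1 API shown in this diff:

```python
import distclassipy as dcpy

clf = dcpy.DistanceMetricClassifier()
clf.fit([[0.0, 1.0], [0.1, 0.9], [1.0, 0.0], [0.9, 0.1]], [0, 0, 1, 1])
try:
    # Metric lookup happens at predict time, so the error surfaces here.
    clf.predict([[0.5, 0.5]], metric="not_a_metric")
except ValueError as err:
    print(err)
```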
@@ -86,7 +104,7 @@ def initialize_metric_function(metric):
     return metric_fn_, metric_arg_
 
 
-class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
+class DistanceMetricClassifier(ClassifierMixin, BaseEstimator):
     """A distance-based classifier that supports different distance metrics.
 
     The distance metric classifier determines the similarity between features in a
@@ -113,16 +131,6 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
 
     .. versionadded:: 0.1.0
 
-
-    Attributes
-    ----------
-    scale : bool
-        Indicates whether the data is scaled.
-    central_stat : str
-        The statistic used for calculating central tendency.
-    dispersion_stat : str
-        The statistic used for calculating dispersion.
-
     References
     ----------
     .. [1] "Light Curve Classification with DistClassiPy: a new distance-based
@@ -135,25 +143,29 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
     >>> X, y = make_classification(n_samples=1000, n_features=4,
     ...                            n_informative=2, n_redundant=0,
     ...                            random_state=0, shuffle=False)
-    >>> clf = dcpy.DistanceMetricClassifier(
+    >>> clf = dcpy.DistanceMetricClassifier()
     >>> clf.fit(X, y)
     DistanceMetricClassifier(...)
-    >>> print(clf.predict([[0, 0, 0, 0]]))
+    >>> print(clf.predict([[0, 0, 0, 0]], metric="canberra"))
     [0]
     """
 
     def __init__(
         self,
+        metric: str | Callable = None,
         scale: bool = True,
         central_stat: str = "median",
         dispersion_stat: str = "std",
-    ):
+    ) -> None:
         """Initialize the classifier with specified parameters."""
+        self.metric = metric
         self.scale = scale
         self.central_stat = central_stat
         self.dispersion_stat = dispersion_stat
 
-    def fit(
+    def fit(
+        self, X: np.array, y: np.array, feat_labels: list[str] = None
+    ) -> "DistanceMetricClassifier":
         """Calculate the feature space centroid for all classes.
 
         This function calculates the feature space centroid in the training
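The new constructor stores a default metric, and `fit` accepts optional feature labels. A usage sketch against the API above; the label strings are made-up examples:

```python
import distclassipy as dcpy
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, n_features=4, n_informative=2,
                           n_redundant=0, random_state=0, shuffle=False)

clf = dcpy.DistanceMetricClassifier(metric="canberra")
# feat_labels names the columns for later analysis; if omitted, the
# classifier generates "Feature_<i>" labels automatically.
clf.fit(X, y, feat_labels=["amplitude", "period", "color", "skew"])
print(clf.predict([[0, 0, 0, 0]]))  # uses the constructor's "canberra"
```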
@@ -177,11 +189,8 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
         self : object
             Fitted estimator.
         """
-        X, y =
+        X, y = self._validate_data(X, y)
         self.classes_ = unique_labels(y)
-        self.n_features_in_ = X.shape[
-            1
-        ]  # Number of features seen during fit - required for sklearn compatibility.
 
         if feat_labels is None:
             feat_labels = [f"Feature_{x}" for x in range(X.shape[1])]
@@ -233,8 +242,8 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
     def predict(
         self,
         X: np.array,
-        metric: str | Callable = "euclidean",
-    ):
+        metric: str | Callable = None,
+    ) -> np.ndarray:
         """Predict the class labels for the provided X.
 
         The prediction is based on the distance of each data point in the input sample
@@ -248,6 +257,10 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
         metric : str or callable, default="euclidean"
             The distance metric to use for calculating the distance between features.
 
+            .. versionchanged:: 0.2.0
+                The metric is now specified at prediction time rather
+                than during initialization, providing greater flexibility.
+
         Returns
         -------
         y : ndarray of shape (n_samples,)
@@ -256,7 +269,7 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
         See Also
         --------
         scipy.spatial.dist : Other distance metrics provided in SciPy
-        distclassipy.
+        distclassipy.distances : Distance metrics included with DistClassiPy
 
         Notes
         -----
@@ -264,10 +277,14 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
         which allows SciPy to use an optimized C version of the code instead of the
         slower Python version.
         """
-        check_is_fitted(self)
-        X =
+        check_is_fitted(self)
+        X = self._validate_data(X, reset=False)
 
-
+        metric_to_use = metric if metric is not None else self.metric
+        if metric_to_use is None:
+            # defaults to euclidean
+            metric_to_use = "euclidean"
+        metric_fn_, metric_arg_ = initialize_metric_function(metric_to_use)
 
         if not self.scale:
             dist_arr = scipy.spatial.distance.cdist(
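The resolution order implemented above is: the `metric` argument to `predict`, then the `metric` stored at construction, then `"euclidean"`. A sketch exercising all three branches, plus the new `score` method added later in this diff, which follows the same chain; this assumes the 0.2.2a1 API as shown:

```python
import distclassipy as dcpy
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, n_features=4, n_informative=2,
                           n_redundant=0, random_state=0, shuffle=False)

clf = dcpy.DistanceMetricClassifier()  # no metric at construction
clf.fit(X, y)
clf.predict(X)                         # falls back to "euclidean"
clf.predict(X, metric="canberra")      # call-site argument wins

clf_fixed = dcpy.DistanceMetricClassifier(metric="cityblock")
clf_fixed.fit(X, y)
clf_fixed.predict(X)                   # uses the stored "cityblock"
print(clf_fixed.score(X, y, metric="canberra"))
```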
@@ -298,8 +315,8 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
     def predict_and_analyse(
         self,
         X: np.array,
-        metric: str | Callable = "euclidean",
-    ):
+        metric: str | Callable = None,
+    ) -> np.ndarray:
         """Predict the class labels for the provided X and perform analysis.
 
         The prediction is based on the distance of each data point in the input sample
@@ -325,7 +342,7 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
         See Also
         --------
         scipy.spatial.dist : Other distance metrics provided in SciPy
-        distclassipy.
+        distclassipy.distances : Distance metrics included with DistClassiPy
 
         Notes
         -----
@@ -334,10 +351,14 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
         of the slower Python version.
 
         """
-        check_is_fitted(self)
-        X =
+        check_is_fitted(self)
+        X = self._validate_data(X, reset=False)
 
-
+        metric_to_use = metric if metric is not None else self.metric
+        if metric_to_use is None:
+            # defaults to euclidean
+            metric_to_use = "euclidean"
+        metric_fn_, metric_arg_ = initialize_metric_function(metric_to_use)
 
         if not self.scale:
             dist_arr = scipy.spatial.distance.cdist(
@@ -397,3 +418,290 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
         ]
 
         return self.confidence_df_.to_numpy()
+
+    def score(self, X, y, metric: str | Callable = None) -> float:
+        """Return the mean accuracy on the given test data and labels.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Test samples.
+        y : array-like of shape (n_samples,)
+            True labels for X.
+        metric : str or callable, optional
+            The distance metric to use for calculating the distance between
+            features. If None, falls back to the metric passed at
+            initialization, or to "euclidean".
+
+        Returns
+        -------
+        score : float
+            Mean accuracy of self.predict(X) wrt. y.
+        """
+        metric_to_use = metric if metric is not None else self.metric
+        y_pred = self.predict(X, metric=metric_to_use)
+        return accuracy_score(y, y_pred)
+
+
+class EnsembleDistanceClassifier(ClassifierMixin, BaseEstimator):
+    """An ensemble classifier that uses different metrics for each quantile.
+
+    This classifier splits the data into quantiles based on a specified
+    feature and uses a different distance metric for each quantile,
+    generally leading to better performance.
+    Note, however, that this involves fitting the training set for each
+    metric to evaluate performance, making this more computationally
+    expensive.
+
+    .. versionadded:: 0.2.0
+    """
+
+    def __init__(
+        self,
+        feat_idx: int,
+        scale: bool = True,
+        central_stat: str = "median",
+        dispersion_stat: str = "std",
+        metrics_to_consider: list[str] = None,
+        random_state: int = None,
+    ) -> None:
+        """Initialize the classifier with specified parameters.
+
+        Parameters
+        ----------
+        feat_idx : int
+            The index of the feature to be used for quantile splitting.
+        scale : bool, default=True
+            Whether to scale the distance between the test object and the centroid.
+        central_stat : str, default="median"
+            The statistic used to calculate the central tendency of the data.
+        dispersion_stat : str, default="std"
+            The statistic used to calculate the dispersion of the data.
+        metrics_to_consider : list of str, optional
+            A list of distance metrics to evaluate. If None, all available
+            metrics within DistClassiPy will be considered.
+        random_state : int, RandomState instance or None, optional (default=None)
+            Controls the randomness of the estimator. Pass an int for reproducible
+            output across multiple function calls.
+
+            .. versionadded:: 0.2.1
+        """
+        self.feat_idx = feat_idx
+        self.scale = scale
+        self.central_stat = central_stat
+        self.dispersion_stat = dispersion_stat
+        self.metrics_to_consider = metrics_to_consider
+        self.random_state = random_state
+
+    def fit(
+        self, X: np.ndarray, y: np.ndarray, n_quantiles: int = 4
+    ) -> "EnsembleDistanceClassifier":
+        """Fit the ensemble classifier using the best metrics for each quantile.
+
+        Parameters
+        ----------
+        X : np.ndarray
+            The input feature matrix.
+        y : np.ndarray
+            The target labels.
+        n_quantiles : int, default=4
+            The number of quantiles to split the data into.
+
+        Returns
+        -------
+        self : object
+            Fitted estimator.
+        """
+        self.clf_ = DistanceMetricClassifier(
+            scale=self.scale,
+            central_stat=self.central_stat,
+            dispersion_stat=self.dispersion_stat,
+        )
+
+        # Find best metrics based on training set quantiles
+        self.quantile_scores_df_, self.best_metrics_per_quantile_, self.group_bins = (
+            self.evaluate_metrics(X, y, n_quantiles)
+        )
+
+        # Ensure the bins work with values outside of training data
+        self.group_bins[0] = -np.inf
+        self.group_bins[-1] = np.inf
+
+        self.group_labels = [f"Quantile {i+1}" for i in range(n_quantiles)]
+        self.clf_.fit(X, y)
+        self.is_fitted_ = True
+        return self
+
+    def predict(self, X: np.ndarray) -> np.ndarray:
+        """Predict class labels using the best metric for each quantile.
+
+        Parameters
+        ----------
+        X : np.ndarray
+            The input samples.
+
+        Returns
+        -------
+        predictions : np.ndarray
+            The predicted class labels.
+        """
+        check_is_fitted(self)
+        X = self._validate_data(X, reset=False)
+
+        # Assign each sample to its fitted quantile bin, then predict each
+        # group with the metric that scored best on that quantile.
+        quantiles = pd.cut(
+            X[:, self.feat_idx], bins=self.group_bins, labels=self.group_labels
+        )
+        grouped_data = pd.DataFrame(X).groupby(quantiles, observed=False)
+        # object dtype so that labels of any type can be stored
+        predictions = np.empty(X.shape[0], dtype=object)
+        for i, (lim, subdf) in enumerate(grouped_data):
+            best_metric = self.best_metrics_per_quantile_.loc[self.group_labels[i]]
+            preds = self.clf_.predict(subdf.to_numpy(), metric=best_metric)
+            predictions[subdf.index] = preds
+
+        return predictions
+
+    def evaluate_metrics(
+        self, X: np.ndarray, y: np.ndarray, n_quantiles: int = 4
+    ) -> Tuple[pd.DataFrame, pd.Series, np.ndarray]:
+        """Evaluate and find the best distance metrics for the specified feature.
+
+        This method uses the standalone `find_best_metrics` function to evaluate
+        different distance metrics and determine the best-performing ones for
+        each quantile.
+
+        Parameters
+        ----------
+        X : np.ndarray
+            The input feature matrix.
+        y : np.ndarray
+            The target labels.
+        n_quantiles : int, default=4
+            The number of quantiles to split the data into.
+
+        Returns
+        -------
+        quantile_scores_df : pd.DataFrame
+            A DataFrame containing the accuracy scores for each metric across
+            different quantiles.
+        best_metrics_per_quantile : pd.Series
+            A Series indicating the best-performing metric for each quantile.
+        group_bins : np.ndarray
+            The bins used for quantile splitting.
+        """
+        return find_best_metrics(
+            self.clf_,
+            X,
+            y,
+            self.feat_idx,
+            n_quantiles,
+            self.metrics_to_consider,
+            self.random_state,
+        )
+
+
+def find_best_metrics(
+    clf: "DistanceMetricClassifier",
+    X: np.ndarray,
+    y: np.ndarray,
+    feat_idx: int,
+    n_quantiles: int = 4,
+    metrics_to_consider: list[str] = None,
+    random_state: int = None,
+) -> Tuple[pd.DataFrame, pd.Series, np.ndarray]:
+    """Evaluate and find the best distance metrics for a given feature.
+
+    This function evaluates different distance metrics to determine which
+    performs best for a specific feature in the dataset. It splits the data
+    into quantiles based on the specified feature and calculates the accuracy
+    of the classifier for each metric within these quantiles.
+
+    .. versionadded:: 0.2.0
+
+    Parameters
+    ----------
+    clf : DistanceMetricClassifier
+        The classifier instance to be used for evaluation.
+    X : np.ndarray
+        The input feature matrix.
+    y : np.ndarray
+        The target labels.
+    feat_idx : int
+        The index of the feature to be used for quantile splitting.
+    n_quantiles : int, default=4
+        The number of quantiles to split the data into.
+    metrics_to_consider : list of str, optional
+        A list of distance metrics to evaluate. If None, all available
+        metrics within DistClassiPy will be considered.
+    random_state : int, RandomState instance or None, optional (default=None)
+        Controls the randomness of the estimator. Pass an int for reproducible
+        output across multiple function calls.
+
+        .. versionadded:: 0.2.1
+
+    Returns
+    -------
+    quantile_scores_df : pd.DataFrame
+        A DataFrame containing the accuracy scores for each metric across
+        different quantiles.
+    best_metrics_per_quantile : pd.Series
+        A Series indicating the best-performing metric for each quantile.
+    group_bins : np.ndarray
+        The bins used for quantile splitting.
+    """
+    X = check_array(X)
+    feature_labels = [f"Feature_{i}" for i in range(X.shape[1])]
+    feature_name = f"Feature_{feat_idx}"
+
+    if metrics_to_consider is None:
+        metrics_to_consider = _ALL_METRICS
+
+    X_df = pd.DataFrame(X, columns=feature_labels)
+    y_df = pd.DataFrame(y, columns=["Target"])
+    quantiles, group_bins = pd.qcut(X_df[feature_name], q=n_quantiles, retbins=True)
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        X_df, y_df, test_size=0.25, stratify=quantiles, random_state=random_state
+    )
+
+    clf.fit(X_train, y_train.to_numpy().ravel())
+    grouped_test_data = X_test.groupby(quantiles, observed=False)
+
+    quantile_scores = []
+    for metric in metrics_to_consider:
+        scores_for_metric = [
+            accuracy_score(
+                y_test.loc[subdf.index], clf.predict(subdf.to_numpy(), metric=metric)
+            )
+            for _, subdf in grouped_test_data
+        ]
+        quantile_scores.append(scores_for_metric)
+
+    quantile_scores = np.array(quantile_scores) * 100
+    quantile_scores_df = pd.DataFrame(
+        data=quantile_scores,
+        index=metrics_to_consider,
+        columns=[f"Quantile {i+1}" for i in range(n_quantiles)],
+    )
+
+    best_metrics_per_quantile = quantile_scores_df.idxmax()
+
+    return quantile_scores_df, best_metrics_per_quantile, group_bins
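To close, an end-to-end sketch of the ensemble path added in this hunk, assuming the 0.2.2a1 API as shown above; the three metric names are standard SciPy metrics chosen to keep the per-metric evaluation cheap:

```python
import distclassipy as dcpy
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, n_features=4, n_informative=2,
                           n_redundant=0, random_state=0, shuffle=False)

ens = dcpy.EnsembleDistanceClassifier(
    feat_idx=0,                # split on feature 0 into quantiles
    metrics_to_consider=["euclidean", "canberra", "cityblock"],
    random_state=0,
)
ens.fit(X, y)                  # n_quantiles defaults to 4

print(ens.best_metrics_per_quantile_)  # winning metric per quantile
print(ens.quantile_scores_df_)         # accuracy (%) per metric and quantile
print(ens.predict(X)[:10])             # per-quantile predictions
```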