distclassipy 0.2.0a0__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {distclassipy-0.2.0a0 → distclassipy-0.2.1}/PKG-INFO +12 -4
- {distclassipy-0.2.0a0 → distclassipy-0.2.1}/README.md +11 -3
- {distclassipy-0.2.0a0 → distclassipy-0.2.1}/distclassipy/__init__.py +13 -3
- {distclassipy-0.2.0a0 → distclassipy-0.2.1}/distclassipy/classifier.py +317 -20
- {distclassipy-0.2.0a0 → distclassipy-0.2.1}/distclassipy/distances.py +110 -34
- {distclassipy-0.2.0a0 → distclassipy-0.2.1}/distclassipy.egg-info/PKG-INFO +12 -4
- {distclassipy-0.2.0a0 → distclassipy-0.2.1}/tests/test_classifier.py +22 -3
- {distclassipy-0.2.0a0 → distclassipy-0.2.1}/tests/test_distances_prop.py +1 -46
- {distclassipy-0.2.0a0 → distclassipy-0.2.1}/LICENSE +0 -0
- {distclassipy-0.2.0a0 → distclassipy-0.2.1}/distclassipy.egg-info/SOURCES.txt +0 -0
- {distclassipy-0.2.0a0 → distclassipy-0.2.1}/distclassipy.egg-info/dependency_links.txt +0 -0
- {distclassipy-0.2.0a0 → distclassipy-0.2.1}/distclassipy.egg-info/requires.txt +0 -0
- {distclassipy-0.2.0a0 → distclassipy-0.2.1}/distclassipy.egg-info/top_level.txt +0 -0
- {distclassipy-0.2.0a0 → distclassipy-0.2.1}/pyproject.toml +0 -0
- {distclassipy-0.2.0a0 → distclassipy-0.2.1}/setup.cfg +0 -0
- {distclassipy-0.2.0a0 → distclassipy-0.2.1}/setup.py +0 -0
- {distclassipy-0.2.0a0 → distclassipy-0.2.1}/tests/test_distances.py +0 -0
{distclassipy-0.2.0a0 → distclassipy-0.2.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: distclassipy
-Version: 0.2.0a0
+Version: 0.2.1
 Summary: A python package for a distance-based classifier which can use several different distance metrics.
 Author-email: Siddharth Chaini <sidchaini@gmail.com>
 License: GNU GENERAL PUBLIC LICENSE
@@ -740,17 +740,25 @@ X, y = make_classification(
 random_state=0,
 shuffle=False,
 )
+# Example usage of DistanceMetricClassifier
 clf = dcpy.DistanceMetricClassifier()
 clf.fit(X, y)
-print(clf.predict([[0, 0, 0, 0]]))
+print(clf.predict([[0, 0, 0, 0]], metric="canberra"))
+
+# Example usage of EnsembleDistanceClassifier
+ensemble_clf = dcpy.EnsembleDistanceClassifier(feat_idx=0)
+ensemble_clf.fit(X, y)
+print(ensemble_clf.predict(X))
 ```

 ## Features
 - **Distance Metric-Based Classification**: Utilizes a variety of distance metrics for classification.
 - **Customizable for Scientific Goals**: Allows fine-tuning based on scientific objectives by selecting appropriate distance metrics and features, enhancing both computational efficiency and model performance.
 - **Interpretable Results**: Offers improved interpretability of classification outcomes by directly using distance metrics and feature importance, making it ideal for scientific applications.
-- **Efficient and Scalable**: Demonstrates lower computational requirements compared to traditional methods like Random Forests, making it suitable for large datasets
-- **Open Source and Accessible**: Available as an open-source Python package on PyPI, encouraging broad application in astronomy and beyond
+- **Efficient and Scalable**: Demonstrates lower computational requirements compared to traditional methods like Random Forests, making it suitable for large datasets.
+- **Open Source and Accessible**: Available as an open-source Python package on PyPI, encouraging broad application in astronomy and beyond.
+- **(NEW) Ensemble Distance Classification**: Leverages an ensemble approach to use different distance metrics for each quantile, improving classification performance across diverse data distributions.
+- **(NEW) Expanded Distance Metrics**: DistClassiPy now offers 43 built-in distance metrics, an increase from the previous 18. Additionally, users can still define and use custom distance metrics as needed.

 ## Documentation

{distclassipy-0.2.0a0 → distclassipy-0.2.1}/README.md

@@ -40,17 +40,25 @@ X, y = make_classification(
 random_state=0,
 shuffle=False,
 )
+# Example usage of DistanceMetricClassifier
 clf = dcpy.DistanceMetricClassifier()
 clf.fit(X, y)
-print(clf.predict([[0, 0, 0, 0]]))
+print(clf.predict([[0, 0, 0, 0]], metric="canberra"))
+
+# Example usage of EnsembleDistanceClassifier
+ensemble_clf = dcpy.EnsembleDistanceClassifier(feat_idx=0)
+ensemble_clf.fit(X, y)
+print(ensemble_clf.predict(X))
 ```

 ## Features
 - **Distance Metric-Based Classification**: Utilizes a variety of distance metrics for classification.
 - **Customizable for Scientific Goals**: Allows fine-tuning based on scientific objectives by selecting appropriate distance metrics and features, enhancing both computational efficiency and model performance.
 - **Interpretable Results**: Offers improved interpretability of classification outcomes by directly using distance metrics and feature importance, making it ideal for scientific applications.
-- **Efficient and Scalable**: Demonstrates lower computational requirements compared to traditional methods like Random Forests, making it suitable for large datasets
-- **Open Source and Accessible**: Available as an open-source Python package on PyPI, encouraging broad application in astronomy and beyond
+- **Efficient and Scalable**: Demonstrates lower computational requirements compared to traditional methods like Random Forests, making it suitable for large datasets.
+- **Open Source and Accessible**: Available as an open-source Python package on PyPI, encouraging broad application in astronomy and beyond.
+- **(NEW) Ensemble Distance Classification**: Leverages an ensemble approach to use different distance metrics for each quantile, improving classification performance across diverse data distributions.
+- **(NEW) Expanded Distance Metrics**: DistClassiPy now offers 43 built-in distance metrics, an increase from the previous 18. Additionally, users can still define and use custom distance metrics as needed.

 ## Documentation

{distclassipy-0.2.0a0 → distclassipy-0.2.1}/distclassipy/__init__.py

@@ -22,7 +22,17 @@ You should have received a copy of the GNU General Public License
 along with this program. If not, see <https://www.gnu.org/licenses/>.
 """

-from .classifier import
-
+from .classifier import (
+DistanceMetricClassifier,
+EnsembleDistanceClassifier,
+)
+from .distances import Distance, _ALL_METRICS

-__version__ = "0.2.0a0"
+__version__ = "0.2.1"
+
+__all__ = [
+"DistanceMetricClassifier",
+"EnsembleDistanceClassifier",
+"Distance",
+"_ALL_METRICS",
+]
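Taken together, the `__init__.py` changes above widen the package's public surface to the two classifiers plus the `Distance` helper and the `_ALL_METRICS` registry, and they move the metric choice to prediction time. A minimal sketch of exercising that 0.2.1 surface; the dataset and its parameters are arbitrary synthetic choices, not part of the release:

```python
# Minimal sketch of the distclassipy 0.2.1 top-level API exposed above.
# Assumes distclassipy 0.2.1 and scikit-learn are installed; the data is synthetic.
import distclassipy as dcpy
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=4, n_informative=2, random_state=0)

clf = dcpy.DistanceMetricClassifier()
clf.fit(X, y)
# Since 0.2.0 the metric is chosen at prediction time, not at construction.
print(clf.predict(X[:5], metric="canberra"))

print(dcpy.__version__)          # "0.2.1"
print(len(dcpy._ALL_METRICS))    # 43 built-in metric names
```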
{distclassipy-0.2.0a0 → distclassipy-0.2.1}/distclassipy/classifier.py

@@ -3,6 +3,21 @@
 This module contains the DistanceMetricClassifier introduced by Chaini et al. (2024)
 in "Light Curve Classification with DistClassiPy: a new distance-based classifier"

+
+.. autoclass:: distclassipy.classifier.DistanceMetricClassifier
+:members:
+:inherited-members:
+:exclude-members: set_fit_request, set_predict_request
+
+.. autoclass:: distclassipy.classifier.EnsembleDistanceClassifier
+:members:
+:inherited-members:
+:exclude-members: set_fit_request, set_predict_request
+
+.. doctest-skip::
+
+.. skip::
+
 Copyright (C) 2024 Siddharth Chaini
 -----
 This program is free software: you can redistribute it and/or modify
@@ -19,7 +34,7 @@ You should have received a copy of the GNU General Public License
 along with this program. If not, see <https://www.gnu.org/licenses/>.
 """

-from typing import Callable
+from typing import Callable, Tuple

 import numpy as np

@@ -28,10 +43,12 @@ import pandas as pd
 import scipy

 from sklearn.base import BaseEstimator, ClassifierMixin
+from sklearn.metrics import accuracy_score
+from sklearn.model_selection import train_test_split
 from sklearn.utils.multiclass import unique_labels
 from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

-from .distances import Distance
+from .distances import Distance, _ALL_METRICS

 # Hardcoded source packages to check for distance metrics.
 METRIC_SOURCES_ = {
@@ -113,16 +130,6 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):

 .. versionadded:: 0.1.0

-
-Attributes
-----------
-scale : bool
-Indicates whether the data is scaled.
-central_stat : str
-The statistic used for calculating central tendency.
-dispersion_stat : str
-The statistic used for calculating dispersion.
-
 References
 ----------
 .. [1] "Light Curve Classification with DistClassiPy: a new distance-based
@@ -135,10 +142,10 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
 >>> X, y = make_classification(n_samples=1000, n_features=4,
 ... n_informative=2, n_redundant=0,
 ... random_state=0, shuffle=False)
->>> clf = dcpy.DistanceMetricClassifier(
+>>> clf = dcpy.DistanceMetricClassifier()
 >>> clf.fit(X, y)
 DistanceMetricClassifier(...)
->>> print(clf.predict([[0, 0, 0, 0]]))
+>>> print(clf.predict([[0, 0, 0, 0]], metric="canberra"))
 [0]
 """

@@ -147,13 +154,15 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
 scale: bool = True,
 central_stat: str = "median",
 dispersion_stat: str = "std",
-):
+) -> None:
 """Initialize the classifier with specified parameters."""
 self.scale = scale
 self.central_stat = central_stat
 self.dispersion_stat = dispersion_stat

-def fit(
+def fit(
+self, X: np.array, y: np.array, feat_labels: list[str] = None
+) -> "DistanceMetricClassifier":
 """Calculate the feature space centroid for all classes.

 This function calculates the feature space centroid in the training
@@ -234,7 +243,7 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
 self,
 X: np.array,
 metric: str | Callable = "euclidean",
-):
+) -> np.ndarray:
 """Predict the class labels for the provided X.

 The prediction is based on the distance of each data point in the input sample
@@ -248,6 +257,10 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
 metric : str or callable, default="euclidean"
 The distance metric to use for calculating the distance between features.

+.. versionchanged:: 0.2.0
+The metric is now specified at prediction time rather
+than during initialization, providing greater flexibility.
+
 Returns
 -------
 y : ndarray of shape (n_samples,)
@@ -266,9 +279,7 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
 """
 check_is_fitted(self, "is_fitted_")
 X = check_array(X)
-
 metric_fn_, metric_arg_ = initialize_metric_function(metric)
-
 if not self.scale:
 dist_arr = scipy.spatial.distance.cdist(
 XA=X, XB=self.df_centroid_.to_numpy(), metric=metric_arg_
@@ -299,7 +310,7 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
 self,
 X: np.array,
 metric: str | Callable = "euclidean",
-):
+) -> np.ndarray:
 """Predict the class labels for the provided X and perform analysis.

 The prediction is based on the distance of each data point in the input sample
@@ -397,3 +408,289 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
 ]

 return self.confidence_df_.to_numpy()
+
+def score(self, X, y, metric: str | Callable = "euclidean") -> float:
+"""Return the mean accuracy on the given test data and labels.
+
+Parameters
+----------
+X : array-like of shape (n_samples, n_features)
+Test samples.
+y : array-like of shape (n_samples,)
+True labels for X.
+metric : str or callable, default="euclidean"
+The distance metric to use for calculating the distance between features.
+
+Returns
+-------
+score : float
+Mean accuracy of self.predict(X) wrt. y.
+"""
+y_pred = self.predict(X, metric=metric)
+return accuracy_score(y, y_pred)
+
+
+class EnsembleDistanceClassifier(BaseEstimator, ClassifierMixin):
+"""An ensemble classifier that uses different metrics for each quantile.
+
+This classifier splits the data into quantiles based on a specified
+feature and uses different distance metrics for each quantile to
+construct an ensemble classifier for each quantile, generally leading
+to better performance.
+Note, however, this involves fitting the training set for each metric
+to evaluate performance, making this more computationally expensive.
+
+.. versionadded:: 0.2.0
+"""
+
+def __init__(
+self,
+feat_idx: int,
+scale: bool = True,
+central_stat: str = "median",
+dispersion_stat: str = "std",
+metrics_to_consider: list[str] = None,
+random_state: int = None,
+) -> None:
+"""Initialize the classifier with specified parameters.
+
+Parameters
+----------
+feat_idx : int
+The index of the feature to be used for quantile splitting.
+scale : bool, default=True
+Whether to scale the distance between the test object and the centroid.
+central_stat : str, default="median"
+The statistic used to calculate the central tendency of the data.
+dispersion_stat : str, default="std"
+The statistic used to calculate the dispersion of the data.
+metrics_to_consider : list of str, optional
+A list of distance metrics to evaluate. If None, all available
+metrics within DistClassiPy will be considered.
+random_state : int, RandomState instance or None, optional (default=None)
+Controls the randomness of the estimator. Pass an int for reproducible
+output across multiple function calls.
+
+.. versionadded:: 0.2.1
+"""
+self.feat_idx = feat_idx
+self.scale = scale
+self.central_stat = central_stat
+self.dispersion_stat = dispersion_stat
+self.metrics_to_consider = metrics_to_consider
+self.random_state = random_state
+
+def fit(
+self, X: np.ndarray, y: np.ndarray, n_quantiles: int = 4
+) -> "EnsembleDistanceClassifier":
+"""Fit the ensemble classifier using the best metrics for each quantile.
+
+Parameters
+----------
+X : np.ndarray
+The input feature matrix.
+y : np.ndarray
+The target labels.
+n_quantiles : int, default=4
+The number of quantiles to split the data into.
+
+Returns
+-------
+self : object
+Fitted estimator.
+"""
+self.clf_ = DistanceMetricClassifier(
+scale=self.scale,
+central_stat=self.central_stat,
+dispersion_stat=self.dispersion_stat,
+)
+
+# Find best metrics based on training set quantiles
+self.quantile_scores_df_, self.best_metrics_per_quantile_, self.group_bins = (
+self.evaluate_metrics(X, y, n_quantiles)
+)
+
+# Ensure the bins work with values outside of training data
+self.group_bins[0] = -np.inf
+self.group_bins[-1] = np.inf
+
+self.group_labels = [f"Quantile {i+1}" for i in range(n_quantiles)]
+self.clf_.fit(X, y)
+self.is_fitted_ = True
+return self
+
+def predict(self, X: np.ndarray) -> np.ndarray:
+"""Predict class labels using the best metric for each quantile.
+
+Parameters
+----------
+X : np.ndarray
+The input samples.
+
+Returns
+-------
+predictions : np.ndarray
+The predicted class labels.
+"""
+check_is_fitted(self, "is_fitted_")
+X = check_array(X)
+
+# notes for pred during best:
+# option 1:
+# loop through each metric, merge quantiles for each metric
+# pred on this
+# option 2, easier, but slower:
+# loop through each quantile, and append pred
+
+quantiles = pd.cut(
+X[:, self.feat_idx], bins=self.group_bins, labels=self.group_labels
+)
+grouped_data = pd.DataFrame(X).groupby(quantiles, observed=False)
+# quantile_indices = quantiles.codes # Get integer codes for quantiles
+predictions = np.empty(X.shape[0], dtype=object) # Change dtype to object
+for i, (lim, subdf) in enumerate(grouped_data):
+best_metric = self.best_metrics_per_quantile_.loc[self.group_labels[i]]
+preds = self.clf_.predict(subdf.to_numpy(), metric=best_metric)
+predictions[subdf.index] = preds
+# # Precompute predictions for each quantile
+# quantile_predictions = {}
+# for i, label in enumerate(self.group_labels):
+# best_metric = self.best_metrics_per_quantile_.loc[label]
+# quantile_data = X[quantile_indices == i]
+# if quantile_data.size > 0:
+# quantile_predictions[i] = self.clf_.predict(
+# quantile_data, metric=best_metric
+# )
+
+# Assign predictions to the corresponding indices
+# for i, preds in quantile_predictions.items():
+# predictions[quantile_indices == i] = preds
+
+return predictions
+
+def evaluate_metrics(
+self, X: np.ndarray, y: np.ndarray, n_quantiles: int = 4
+) -> Tuple[pd.DataFrame, pd.Series, np.ndarray]:
+"""Evaluate and find the best distance metrics for the specified feature.
+
+This method uses the standalone `find_best_metrics` function to evaluate
+different distance metrics and determine the best-performing ones for
+each quantile.
+
+Parameters
+----------
+X : np.ndarray
+The input feature matrix.
+y : np.ndarray
+The target labels.
+n_quantiles : int, default=4
+The number of quantiles to split the data into.
+
+Returns
+-------
+quantile_scores_df : pd.DataFrame
+A DataFrame containing the accuracy scores for each metric across
+different quantiles.
+best_metrics_per_quantile : pd.Series
+A Series indicating the best-performing metric for each quantile.
+group_bins : np.ndarray
+The bins used for quantile splitting.
+"""
+return find_best_metrics(
+self.clf_,
+X,
+y,
+self.feat_idx,
+n_quantiles,
+self.metrics_to_consider,
+self.random_state,
+)
+
+
+def find_best_metrics(
+clf: "DistanceMetricClassifier",
+X: np.ndarray,
+y: np.ndarray,
+feat_idx: int,
+n_quantiles: int = 4,
+metrics_to_consider: list[str] = None,
+random_state: int = None,
+) -> Tuple[pd.DataFrame, pd.Series, np.ndarray]:
+"""Evaluate and find the best distance metrics for a given feature.
+
+This function evaluates different distance metrics to determine which
+performs best for a specific feature in the dataset. It splits the data
+into quantiles based on the specified feature and calculates the accuracy
+of the classifier for each metric within these quantiles.
+
+.. versionadded:: 0.2.0
+
+Parameters
+----------
+clf : DistanceMetricClassifier
+The classifier instance to be used for evaluation.
+X : np.ndarray
+The input feature matrix.
+y : np.ndarray
+The target labels.
+feat_idx : int
+The index of the feature to be used for quantile splitting.
+n_quantiles : int, default=4
+The number of quantiles to split the data into.
+metrics_to_consider : list of str, optional
+A list of distance metrics to evaluate. If None, all available
+metrics within DistClassiPy will be considered.
+random_state : int, RandomState instance or None, optional (default=None)
+Controls the randomness of the estimator. Pass an int for reproducible
+output across multiple function calls.
+
+.. versionadded:: 0.2.1
+
+Returns
+-------
+quantile_scores_df : pd.DataFrame
+A DataFrame containing the accuracy scores for each metric across
+different quantiles.
+best_metrics_per_quantile : pd.Series
+A Series indicating the best-performing metric for each quantile.
+group_bins : np.ndarray
+The bins used for quantile splitting.
+"""
+X = check_array(X)
+feature_labels = [f"Feature_{i}" for i in range(X.shape[1])]
+feature_name = f"Feature_{feat_idx}"
+
+if metrics_to_consider is None:
+metrics_to_consider = _ALL_METRICS
+
+X_df = pd.DataFrame(X, columns=feature_labels)
+y_df = pd.DataFrame(y, columns=["Target"])
+quantiles, group_bins = pd.qcut(X_df[feature_name], q=n_quantiles, retbins=True)
+
+X_train, X_test, y_train, y_test = train_test_split(
+X_df, y_df, test_size=0.25, stratify=quantiles, random_state=random_state
+)
+
+clf.fit(X_train, y_train.to_numpy().ravel())
+grouped_test_data = X_test.groupby(quantiles, observed=False)
+
+quantile_scores = []
+for metric in metrics_to_consider:
+scores_for_metric = [
+accuracy_score(
+y_test.loc[subdf.index], clf.predict(subdf.to_numpy(), metric=metric)
+)
+for _, subdf in grouped_test_data
+]
+quantile_scores.append(scores_for_metric)
+
+quantile_scores = np.array(quantile_scores) * 100
+quantile_scores_df = pd.DataFrame(
+data=quantile_scores,
+index=metrics_to_consider,
+columns=[f"Quantile {i+1}" for i in range(n_quantiles)],
+)
+
+best_metrics_per_quantile = quantile_scores_df.idxmax()
+
+return quantile_scores_df, best_metrics_per_quantile, group_bins
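The classifier.py hunk above adds a `score` method, the quantile-based `EnsembleDistanceClassifier`, and the standalone `find_best_metrics` helper. A rough usage sketch, assuming the API exactly as diffed above; the feature index, metric short-list, quantile count and synthetic data are arbitrary illustrative choices:

```python
# Sketch of the new classifier.py API: score(), EnsembleDistanceClassifier and
# find_best_metrics(). Synthetic data; feat_idx=0 and the metric short-list are
# illustrative choices, not recommendations.
import distclassipy as dcpy
from distclassipy.classifier import find_best_metrics
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, n_features=4, n_informative=2, random_state=0)

clf = dcpy.DistanceMetricClassifier()
clf.fit(X, y)
print(clf.score(X, y, metric="euclidean"))  # mean accuracy via the new score()

# The ensemble picks one best metric per quantile of the chosen feature.
ens = dcpy.EnsembleDistanceClassifier(
    feat_idx=0,
    metrics_to_consider=["euclidean", "canberra", "braycurtis"],
    random_state=0,
)
ens.fit(X, y, n_quantiles=4)
print(ens.best_metrics_per_quantile_)  # winning metric for each quantile
print(ens.predict(X)[:10])

# The same per-quantile evaluation is available as a standalone function.
scores_df, best_per_quantile, bins = find_best_metrics(
    dcpy.DistanceMetricClassifier(), X, y,
    feat_idx=0, n_quantiles=4,
    metrics_to_consider=["euclidean", "canberra", "braycurtis"],
    random_state=0,
)
print(scores_df)
```

As the docstrings above note, fitting evaluates every candidate metric on a held-out split of the training data, so restricting `metrics_to_consider` keeps the ensemble search inexpensive.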
{distclassipy-0.2.0a0 → distclassipy-0.2.1}/distclassipy/distances.py

@@ -48,6 +48,52 @@ import numpy as np

 import scipy

+_ALL_METRICS = [
+"euclidean",
+"braycurtis",
+"canberra",
+"cityblock",
+"chebyshev",
+"clark",
+"correlation",
+"cosine",
+"hellinger",
+"jaccard",
+"lorentzian",
+"marylandbridge",
+"meehl",
+"motyka",
+"soergel",
+"wave_hedges",
+"kulczynski",
+"add_chisq",
+"acc",
+"chebyshev_min",
+"czekanowski",
+"dice",
+"divergence",
+"google",
+"gower",
+"jeffreys",
+"jensenshannon_divergence",
+"jensen_difference",
+"kumarjohnson",
+"matusita",
+"minkowski",
+"penroseshape",
+"prob_chisq",
+"ruzicka",
+"sorensen",
+"squared_chisq",
+"squaredchord",
+"squared_euclidean",
+"taneja",
+"tanimoto",
+"topsoe",
+"vicis_symmetric_chisq",
+"vicis_wave_hedges",
+]
+

 class Distance:
 """A class to calculate various distance metrics between vectors.
@@ -352,7 +398,11 @@ class Distance:
 1(4), 300-307.
 """
 u, v = np.asarray(u), np.asarray(v)
-
+# Clip negative values to zero for valid sqrt
+with np.errstate(divide="ignore", invalid="ignore"):
+u = np.clip(u, a_min=0, a_max=None)
+v = np.clip(v, a_min=0, a_max=None)
+return np.sqrt(2 * np.sum((np.sqrt(u) - np.sqrt(v)) ** 2))

 def jaccard(self, u, v):
 """Calculate the Jaccard distance between two vectors.
@@ -402,7 +452,8 @@ class Distance:
 eschew the log of zero.
 """
 u, v = np.asarray(u), np.asarray(v)
-
+with np.errstate(divide="ignore", invalid="ignore"):
+return np.sum(np.log(np.abs(u - v) + 1))

 def marylandbridge(self, u, v):
 """Calculate the Maryland Bridge distance between two vectors.
@@ -633,7 +684,8 @@ class Distance:
 # 3. https://en.wikipedia.org/wiki/Bhattacharyya_distance
 # """
 # u, v = np.asarray(u), np.asarray(v)
-#
+# with np.errstate(divide="ignore", invalid="ignore"):
+# return -np.log(np.sum(np.sqrt(u * v)))

 def chebyshev_min(self, u, v):
 """Calculate the minimum value distance between two vectors.
@@ -808,9 +860,12 @@ class Distance:
 # vectors could be ignored or masked (see below).
 # u = ma.masked_where(u == 0, u)
 # v = ma.masked_where(v == 0, u)
-
-
-
+with np.errstate(divide="ignore", invalid="ignore"):
+u[u == 0] = self.epsilon
+v[v == 0] = self.epsilon
+# Clip negative values to zero for valid log
+udivv = np.clip(u / v, a_min=self.epsilon, a_max=None)
+return np.sum((u - v) * np.log(udivv))

 def jensenshannon_divergence(self, u, v):
 """Calculate the Jensen-Shannon divergence between two vectors.
@@ -844,11 +899,17 @@ class Distance:
 return np.sum(el1 - el2 * el3)
 """
 u, v = np.asarray(u), np.asarray(v)
-
-
-
-
-
+with np.errstate(divide="ignore", invalid="ignore"):
+# Clip negative values to zero for valid log
+u[u == 0] = self.epsilon
+v[v == 0] = self.epsilon
+
+term1 = np.clip(2 * u / (u + v), a_min=self.epsilon, a_max=None)
+term2 = np.clip(2 * v / (u + v), a_min=self.epsilon, a_max=None)
+
+dl = u * np.log(term1)
+dr = v * np.log(term2)
+return (np.sum(dl) + np.sum(dr)) / 2

 def jensen_difference(self, u, v):
 """Calculate the Jensen difference between two vectors.
@@ -877,11 +938,14 @@ class Distance:
 1(4), 300-307.
 """
 u, v = np.asarray(u), np.asarray(v)
-
-
-
-
-
+
+with np.errstate(divide="ignore", invalid="ignore"):
+# Clip negative values to eps for valid log
+u = np.clip(u, self.epsilon, None)
+v = np.clip(v, self.epsilon, None)
+el1 = (u * np.log(u) + v * np.log(v)) / 2
+el2 = np.clip((u + v) / 2, a_min=self.epsilon, a_max=None)
+return np.sum(el1 - el2 * np.log(el2))

 def kumarjohnson(self, u, v):
 """Calculate the Kumar-Johnson distance between two vectors.
@@ -934,7 +998,8 @@ class Distance:
 Equals square root of Squared-chord distance.
 """
 u, v = np.asarray(u), np.asarray(v)
-
+with np.errstate(divide="ignore", invalid="ignore"):
+return np.sqrt(np.sum((np.sqrt(u) - np.sqrt(v)) ** 2))

 def minkowski(self, u, v, p=2):
 """Calculate the Minkowski distance between two vectors.
@@ -981,7 +1046,8 @@ class Distance:
 u, v = np.asarray(u), np.asarray(v)
 umu = np.mean(u)
 vmu = np.mean(v)
-
+with np.errstate(divide="ignore", invalid="ignore"):
+return np.sqrt(np.sum(((u - umu) - (v - vmu)) ** 2))

 def prob_chisq(self, u, v):
 """Calculate the Probabilistic chi-square distance between two vectors.
@@ -1093,7 +1159,8 @@ class Distance:
 Equals to squared Matusita distance.
 """
 u, v = np.asarray(u), np.asarray(v)
-
+with np.errstate(divide="ignore", invalid="ignore"):
+return np.sum((np.sqrt(u) - np.sqrt(v)) ** 2)

 def squared_euclidean(self, u, v):
 """Calculate the Squared Euclidean distance between two vectors.
@@ -1145,10 +1212,14 @@ class Distance:
 1(4), 300-307.
 """
 u, v = np.asarray(u), np.asarray(v)
-
-
-
-
+with np.errstate(divide="ignore", invalid="ignore"):
+u[u == 0] = self.epsilon
+v[v == 0] = self.epsilon
+uvsum = u + v
+logarg = np.clip(
+uvsum / (2 * np.sqrt(u * v)), a_min=self.epsilon, a_max=None
+)
+return np.sum((uvsum / 2) * np.log(logarg))

 def tanimoto(self, u, v):
 """Calculate the Tanimoto distance between two vectors.
@@ -1202,11 +1273,14 @@ class Distance:
 Equals two times Jensen-Shannon divergence.
 """
 u, v = np.asarray(u), np.asarray(v)
-
-
-
-
-
+with np.errstate(divide="ignore", invalid="ignore"):
+u[u == 0] = self.epsilon
+v[v == 0] = self.epsilon
+logarg1 = np.clip(2 * u / (u + v), a_min=self.epsilon, a_max=None)
+logarg2 = np.clip(2 * v / (u + v), a_min=self.epsilon, a_max=None)
+dl = u * np.log(logarg1)
+dr = v * np.log(logarg2)
+return np.sum(dl + dr)

 def vicis_symmetric_chisq(self, u, v):
 """Calculate the Vicis Symmetric chi-square distance.
@@ -1330,9 +1404,10 @@ class Distance:
 # 1(4), 300-307.
 # """
 # u, v = np.asarray(u), np.asarray(v)
-# u
-# v
-#
+# u[u == 0] = self.epsilon
+# v[v == 0] = self.epsilon
+# with np.errstate(divide="ignore", invalid="ignore"):
+# return np.sum(u * np.log(2 * u / (u + v)))

 # def kl_divergence(self, u, v):
 # """Calculate the Kullback-Leibler divergence between two vectors.
@@ -1358,9 +1433,10 @@ class Distance:
 # 1(4):300-307.
 # """
 # u, v = np.asarray(u), np.asarray(v)
-# u
-# v
-#
+# u[u == 0] = self.epsilon
+# v[v == 0] = self.epsilon
+# with np.errstate(divide="ignore", invalid="ignore"):
+# return np.sum(u * np.log(u / v))

 # def max_symmetric_chisq(self, u, v):
 # """Calculate the maximum symmetric chi-square distance.
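The distances.py hunk above introduces the module-level `_ALL_METRICS` registry and wraps the log- and sqrt-based metrics in `np.errstate` blocks with epsilon/zero clipping. A small sketch of what that buys in practice, assuming `Distance()` can be constructed with its defaults; the vectors are arbitrary non-negative examples:

```python
# Sketch of the distances.py additions: the _ALL_METRICS registry and the
# epsilon/clipping guards that keep log- and sqrt-based metrics finite on
# zero-valued inputs. Assumes Distance() works with default arguments.
import numpy as np
from distclassipy.distances import Distance, _ALL_METRICS

print(len(_ALL_METRICS))   # 43 names, e.g. "hellinger", "jeffreys", "topsoe"

d = Distance()
u = np.array([0.0, 0.2, 0.8])   # the zero entry would otherwise feed log(0) or a zero division
v = np.array([0.1, 0.3, 0.6])

print(d.hellinger(u, v))   # negatives are clipped to 0 before the square roots
print(d.jeffreys(u, v))    # zeros are replaced by epsilon before the log
print(d.topsoe(u, v))      # log arguments are clipped to at least epsilon
```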
{distclassipy-0.2.0a0 → distclassipy-0.2.1}/distclassipy.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: distclassipy
-Version: 0.2.0a0
+Version: 0.2.1
 Summary: A python package for a distance-based classifier which can use several different distance metrics.
 Author-email: Siddharth Chaini <sidchaini@gmail.com>
 License: GNU GENERAL PUBLIC LICENSE
@@ -740,17 +740,25 @@ X, y = make_classification(
 random_state=0,
 shuffle=False,
 )
+# Example usage of DistanceMetricClassifier
 clf = dcpy.DistanceMetricClassifier()
 clf.fit(X, y)
-print(clf.predict([[0, 0, 0, 0]]))
+print(clf.predict([[0, 0, 0, 0]], metric="canberra"))
+
+# Example usage of EnsembleDistanceClassifier
+ensemble_clf = dcpy.EnsembleDistanceClassifier(feat_idx=0)
+ensemble_clf.fit(X, y)
+print(ensemble_clf.predict(X))
 ```

 ## Features
 - **Distance Metric-Based Classification**: Utilizes a variety of distance metrics for classification.
 - **Customizable for Scientific Goals**: Allows fine-tuning based on scientific objectives by selecting appropriate distance metrics and features, enhancing both computational efficiency and model performance.
 - **Interpretable Results**: Offers improved interpretability of classification outcomes by directly using distance metrics and feature importance, making it ideal for scientific applications.
-- **Efficient and Scalable**: Demonstrates lower computational requirements compared to traditional methods like Random Forests, making it suitable for large datasets
-- **Open Source and Accessible**: Available as an open-source Python package on PyPI, encouraging broad application in astronomy and beyond
+- **Efficient and Scalable**: Demonstrates lower computational requirements compared to traditional methods like Random Forests, making it suitable for large datasets.
+- **Open Source and Accessible**: Available as an open-source Python package on PyPI, encouraging broad application in astronomy and beyond.
+- **(NEW) Ensemble Distance Classification**: Leverages an ensemble approach to use different distance metrics for each quantile, improving classification performance across diverse data distributions.
+- **(NEW) Expanded Distance Metrics**: DistClassiPy now offers 43 built-in distance metrics, an increase from the previous 18. Additionally, users can still define and use custom distance metrics as needed.

 ## Documentation

{distclassipy-0.2.0a0 → distclassipy-0.2.1}/tests/test_classifier.py

@@ -1,9 +1,13 @@
-from distclassipy.classifier import
+from distclassipy.classifier import (
+DistanceMetricClassifier,
+EnsembleDistanceClassifier,
+)

 import numpy as np

 import pytest

+from sklearn.datasets import make_classification
 from sklearn.utils.estimator_checks import check_estimator


@@ -28,7 +32,7 @@ def test_fit():


 # Test making predictions with the classifier
-def
+def test_dcpy():
 X = np.array([[1, 2], [3, 4], [5, 6]]) # Sample feature set
 y = np.array([0, 1, 0]) # Sample target values
 clf = DistanceMetricClassifier()
@@ -59,7 +63,7 @@ def test_metric_scipy():


 # Test using different distance metrics - from distclassipy
-def
+def test_metric_pred():
 X = np.array([[1, 2], [3, 4], [5, 6]]) # Sample feature set
 y = np.array([0, 1, 0]) # Sample target values
 clf = DistanceMetricClassifier()
@@ -134,3 +138,18 @@ def test_confidence_calculation():
 clf.predict_and_analyse(X)
 distance_confidence = clf.calculate_confidence()
 assert distance_confidence.shape == (3, len(np.unique(y)))
+
+
+# Test basic functionality of EnsembleDistanceClassifier
+def test_ensemble_distance_classifier():
+X, y = make_classification(
+n_samples=1000,
+n_features=4,
+n_informative=2,
+shuffle=True,
+)
+clf = EnsembleDistanceClassifier(feat_idx=0)
+clf.fit(X, y)
+predictions = clf.predict(X)
+assert len(predictions) == len(y)
+assert set(predictions).issubset(set(y))
{distclassipy-0.2.0a0 → distclassipy-0.2.1}/tests/test_distances_prop.py

@@ -1,6 +1,6 @@
 import math

-from distclassipy.distances import Distance
+from distclassipy.distances import Distance, _ALL_METRICS

 from hypothesis import given, strategies as st

@@ -38,51 +38,6 @@ arrays = st.integers(min_value=1, max_value=20).flatmap(
 ).map(np.array),
 )
 )
-# List of all distance metrics
-_ALL_METRICS = [
-"euclidean",
-"braycurtis",
-"canberra",
-"cityblock",
-"chebyshev",
-"clark",
-"correlation",
-"cosine",
-"hellinger",
-"jaccard",
-"lorentzian",
-"marylandbridge",
-"meehl",
-"motyka",
-"soergel",
-"wave_hedges",
-"kulczynski",
-"add_chisq",
-"acc",
-"chebyshev_min",
-"czekanowski",
-"dice",
-"divergence",
-"google",
-"gower",
-"jeffreys",
-"jensenshannon_divergence",
-"jensen_difference",
-"kumarjohnson",
-"matusita",
-"minkowski",
-"penroseshape",
-"prob_chisq",
-"ruzicka",
-"sorensen",
-"squared_chisq",
-"squaredchord",
-"squared_euclidean",
-"taneja",
-"tanimoto",
-"topsoe",
-"vicis_symmetric_chisq",
-]


 @pytest.mark.parametrize(

Files without changes: LICENSE, distclassipy.egg-info/SOURCES.txt, distclassipy.egg-info/dependency_links.txt, distclassipy.egg-info/requires.txt, distclassipy.egg-info/top_level.txt, pyproject.toml, setup.cfg, setup.py, tests/test_distances.py.