distclassipy 0.1.2__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {distclassipy-0.1.2 → distclassipy-0.1.4}/PKG-INFO +28 -25
- {distclassipy-0.1.2 → distclassipy-0.1.4}/README.md +23 -20
- {distclassipy-0.1.2 → distclassipy-0.1.4}/distclassipy/__init__.py +2 -1
- {distclassipy-0.1.2 → distclassipy-0.1.4}/distclassipy/classifier.py +30 -17
- {distclassipy-0.1.2 → distclassipy-0.1.4}/distclassipy/distances.py +86 -85
- {distclassipy-0.1.2 → distclassipy-0.1.4}/distclassipy.egg-info/PKG-INFO +28 -25
- {distclassipy-0.1.2 → distclassipy-0.1.4}/distclassipy.egg-info/SOURCES.txt +0 -1
- distclassipy-0.1.4/distclassipy.egg-info/requires.txt +4 -0
- {distclassipy-0.1.2 → distclassipy-0.1.4}/pyproject.toml +8 -5
- distclassipy-0.1.4/setup.py +21 -0
- {distclassipy-0.1.2 → distclassipy-0.1.4}/tests/test_classifier.py +4 -7
- distclassipy-0.1.4/tests/test_distances.py +17 -0
- distclassipy-0.1.2/distclassipy/version.py +0 -16
- distclassipy-0.1.2/distclassipy.egg-info/requires.txt +0 -4
- distclassipy-0.1.2/setup.py +0 -3
- distclassipy-0.1.2/tests/test_distances.py +0 -35
- {distclassipy-0.1.2 → distclassipy-0.1.4}/LICENSE +0 -0
- {distclassipy-0.1.2 → distclassipy-0.1.4}/distclassipy.egg-info/dependency_links.txt +0 -0
- {distclassipy-0.1.2 → distclassipy-0.1.4}/distclassipy.egg-info/top_level.txt +0 -0
- {distclassipy-0.1.2 → distclassipy-0.1.4}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: distclassipy
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.1.4
|
|
4
4
|
Summary: A python package for a distance-based classifier which can use several different distance metrics.
|
|
5
5
|
Author-email: Siddharth Chaini <sidchaini@gmail.com>
|
|
6
6
|
License: GNU GENERAL PUBLIC LICENSE
|
|
@@ -679,7 +679,7 @@ License: GNU GENERAL PUBLIC LICENSE
|
|
|
679
679
|
<https://www.gnu.org/licenses/why-not-lgpl.html>.
|
|
680
680
|
|
|
681
681
|
Project-URL: Repository, https://github.com/sidchaini/DistClassiPy
|
|
682
|
-
Project-URL:
|
|
682
|
+
Project-URL: Documentation, https://sidchaini.github.io/DistClassiPy/
|
|
683
683
|
Classifier: Development Status :: 4 - Beta
|
|
684
684
|
Classifier: Intended Audience :: Developers
|
|
685
685
|
Classifier: Intended Audience :: Education
|
|
@@ -694,9 +694,9 @@ Requires-Python: >=3.10
|
|
|
694
694
|
Description-Content-Type: text/markdown
|
|
695
695
|
License-File: LICENSE
|
|
696
696
|
Requires-Dist: joblib>=1.3.2
|
|
697
|
-
Requires-Dist: numpy>=1.
|
|
698
|
-
Requires-Dist: pandas>=2.
|
|
699
|
-
Requires-Dist: scikit-learn>=1.4.0
|
|
697
|
+
Requires-Dist: numpy>=1.25.2
|
|
698
|
+
Requires-Dist: pandas>=2.0.3
|
|
699
|
+
Requires-Dist: scikit-learn>=1.2.2
|
|
700
700
|
|
|
701
701
|
<h1 align="center">
|
|
702
702
|
<picture align="center">
|
|
@@ -712,6 +712,9 @@ Requires-Dist: scikit-learn>=1.4.0
|
|
|
712
712
|
[](https://github.com/sidchaini/distclassipy/blob/main/LICENSE)
|
|
713
713
|
[](https://github.com/psf/black)
|
|
714
714
|
|
|
715
|
+
[](https://arxiv.org/abs/2403.12120)
|
|
716
|
+
[](https://ascl.net/2403.002)
|
|
717
|
+
|
|
715
718
|
<!-- [](
|
|
716
719
|
https://doi.org/10.1038/s41586-020-2649-2) -->
|
|
717
720
|
|
|
@@ -743,9 +746,11 @@ print(clf.predict([[0, 0, 0, 0]]))
|
|
|
743
746
|
```
|
|
744
747
|
|
|
745
748
|
## Features
|
|
746
|
-
-
|
|
747
|
-
-
|
|
748
|
-
-
|
|
749
|
+
- **Distance Metric-Based Classification**: Utilizes a variety of distance metrics for classification.
|
|
750
|
+
- **Customizable for Scientific Goals**: Allows fine-tuning based on scientific objectives by selecting appropriate distance metrics and features, enhancing both computational efficiency and model performance.
|
|
751
|
+
- **Interpretable Results**: Offers improved interpretability of classification outcomes by directly using distance metrics and feature importance, making it ideal for scientific applications.
|
|
752
|
+
- **Efficient and Scalable**: Demonstrates lower computational requirements compared to traditional methods like Random Forests, making it suitable for large datasets
|
|
753
|
+
- **Open Source and Accessible**: Available as an open-source Python package on PyPI, encouraging broad application in astronomy and beyond
|
|
749
754
|
|
|
750
755
|
## Documentation
|
|
751
756
|
|
|
@@ -760,30 +765,28 @@ DistClassiPy is released under the [GNU General Public License v3.0](https://www
|
|
|
760
765
|
## Citation
|
|
761
766
|
|
|
762
767
|
If you use DistClassiPy in your research or project, please consider citing the paper:
|
|
763
|
-
> Light Curve Classification with DistClassiPy: a new distance-based classifier
|
|
764
|
-
|
|
768
|
+
> Chaini, S., Mahabal, A., Kembhavi, A., & Bianco, F. B. (2024). Light Curve Classification with DistClassiPy: a new distance-based classifier. arXiv. https://doi.org/10.48550/arXiv.2403.12120
|
|
765
769
|
|
|
766
|
-
<!--
|
|
767
770
|
### Bibtex
|
|
768
771
|
|
|
769
772
|
|
|
770
773
|
```bibtex
|
|
771
|
-
@ARTICLE{
|
|
772
|
-
author = {{Chaini},
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
774
|
+
@ARTICLE{chaini2024light,
|
|
775
|
+
author = {{Chaini}, Siddharth and {Mahabal}, Ashish and {Kembhavi}, Ajit and {Bianco}, Federica B.},
|
|
776
|
+
title = "{Light Curve Classification with DistClassiPy: a new distance-based classifier}",
|
|
777
|
+
journal = {arXiv e-prints},
|
|
778
|
+
keywords = {Astrophysics - Instrumentation and Methods for Astrophysics, Astrophysics - Solar and Stellar Astrophysics, Computer Science - Machine Learning},
|
|
779
|
+
year = 2024,
|
|
780
|
+
month = mar,
|
|
781
|
+
eid = {arXiv:2403.12120},
|
|
782
|
+
pages = {arXiv:2403.12120},
|
|
783
|
+
archivePrefix = {arXiv},
|
|
784
|
+
eprint = {2403.12120},
|
|
785
|
+
primaryClass = {astro-ph.IM},
|
|
786
|
+
adsurl = {https://ui.adsabs.harvard.edu/abs/2024arXiv240312120C},
|
|
787
|
+
adsnote = {Provided by the SAO/NASA Astrophysics Data System}
|
|
784
788
|
}
|
|
785
789
|
```
|
|
786
|
-
-->
|
|
787
790
|
|
|
788
791
|
|
|
789
792
|
<!-- You can also find citation information in the [CITATION.cff](https://github.com/sidchaini/DistClassiPy/CITATION.cff) file. -->
|
|
@@ -12,6 +12,9 @@
|
|
|
12
12
|
[](https://github.com/sidchaini/distclassipy/blob/main/LICENSE)
|
|
13
13
|
[](https://github.com/psf/black)
|
|
14
14
|
|
|
15
|
+
[](https://arxiv.org/abs/2403.12120)
|
|
16
|
+
[](https://ascl.net/2403.002)
|
|
17
|
+
|
|
15
18
|
<!-- [](
|
|
16
19
|
https://doi.org/10.1038/s41586-020-2649-2) -->
|
|
17
20
|
|
|
@@ -43,9 +46,11 @@ print(clf.predict([[0, 0, 0, 0]]))
|
|
|
43
46
|
```
|
|
44
47
|
|
|
45
48
|
## Features
|
|
46
|
-
-
|
|
47
|
-
-
|
|
48
|
-
-
|
|
49
|
+
- **Distance Metric-Based Classification**: Utilizes a variety of distance metrics for classification.
|
|
50
|
+
- **Customizable for Scientific Goals**: Allows fine-tuning based on scientific objectives by selecting appropriate distance metrics and features, enhancing both computational efficiency and model performance.
|
|
51
|
+
- **Interpretable Results**: Offers improved interpretability of classification outcomes by directly using distance metrics and feature importance, making it ideal for scientific applications.
|
|
52
|
+
- **Efficient and Scalable**: Demonstrates lower computational requirements compared to traditional methods like Random Forests, making it suitable for large datasets
|
|
53
|
+
- **Open Source and Accessible**: Available as an open-source Python package on PyPI, encouraging broad application in astronomy and beyond
|
|
49
54
|
|
|
50
55
|
## Documentation
|
|
51
56
|
|
|
@@ -60,30 +65,28 @@ DistClassiPy is released under the [GNU General Public License v3.0](https://www
|
|
|
60
65
|
## Citation
|
|
61
66
|
|
|
62
67
|
If you use DistClassiPy in your research or project, please consider citing the paper:
|
|
63
|
-
> Light Curve Classification with DistClassiPy: a new distance-based classifier
|
|
64
|
-
|
|
68
|
+
> Chaini, S., Mahabal, A., Kembhavi, A., & Bianco, F. B. (2024). Light Curve Classification with DistClassiPy: a new distance-based classifier. arXiv. https://doi.org/10.48550/arXiv.2403.12120
|
|
65
69
|
|
|
66
|
-
<!--
|
|
67
70
|
### Bibtex
|
|
68
71
|
|
|
69
72
|
|
|
70
73
|
```bibtex
|
|
71
|
-
@ARTICLE{
|
|
72
|
-
author = {{Chaini},
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
74
|
+
@ARTICLE{chaini2024light,
|
|
75
|
+
author = {{Chaini}, Siddharth and {Mahabal}, Ashish and {Kembhavi}, Ajit and {Bianco}, Federica B.},
|
|
76
|
+
title = "{Light Curve Classification with DistClassiPy: a new distance-based classifier}",
|
|
77
|
+
journal = {arXiv e-prints},
|
|
78
|
+
keywords = {Astrophysics - Instrumentation and Methods for Astrophysics, Astrophysics - Solar and Stellar Astrophysics, Computer Science - Machine Learning},
|
|
79
|
+
year = 2024,
|
|
80
|
+
month = mar,
|
|
81
|
+
eid = {arXiv:2403.12120},
|
|
82
|
+
pages = {arXiv:2403.12120},
|
|
83
|
+
archivePrefix = {arXiv},
|
|
84
|
+
eprint = {2403.12120},
|
|
85
|
+
primaryClass = {astro-ph.IM},
|
|
86
|
+
adsurl = {https://ui.adsabs.harvard.edu/abs/2024arXiv240312120C},
|
|
87
|
+
adsnote = {Provided by the SAO/NASA Astrophysics Data System}
|
|
84
88
|
}
|
|
85
89
|
```
|
|
86
|
-
-->
|
|
87
90
|
|
|
88
91
|
|
|
89
92
|
<!-- You can also find citation information in the [CITATION.cff](https://github.com/sidchaini/DistClassiPy/CITATION.cff) file. -->
|
|
@@ -1,3 +1,7 @@
|
|
|
1
|
+
"""
|
|
2
|
+
A module which contains the DistanceMetricClassifier introduced by Chaini et al. (2024) in "Light Curve Classification with DistClassiPy: a new distance-based classifier".
|
|
3
|
+
"""
|
|
4
|
+
|
|
1
5
|
import numpy as np
|
|
2
6
|
import pandas as pd
|
|
3
7
|
import scipy
|
|
@@ -94,12 +98,6 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
94
98
|
self.calculate_kde = calculate_kde
|
|
95
99
|
self.calculate_1d_dist = calculate_1d_dist
|
|
96
100
|
|
|
97
|
-
# Hardcoded source packages to check for distance metrics.
|
|
98
|
-
self.metric_sources_ = {
|
|
99
|
-
"scipy.spatial.distance": scipy.spatial.distance,
|
|
100
|
-
"distances.Distance": Distance(),
|
|
101
|
-
}
|
|
102
|
-
|
|
103
101
|
def set_metric_fn_(self):
|
|
104
102
|
"""
|
|
105
103
|
Set the metric function based on the provided metric.
|
|
@@ -107,6 +105,12 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
107
105
|
If the metric is a string, the function will look for a corresponding function in scipy.spatial.distance or distances.Distance. If the metric is a function, it will be used directly.
|
|
108
106
|
"""
|
|
109
107
|
|
|
108
|
+
# Hardcoded source packages to check for distance metrics.
|
|
109
|
+
metric_sources_ = {
|
|
110
|
+
"scipy.spatial.distance": scipy.spatial.distance,
|
|
111
|
+
"distances.Distance": Distance(),
|
|
112
|
+
}
|
|
113
|
+
|
|
110
114
|
if callable(self.metric):
|
|
111
115
|
self.metric_fn_ = self.metric
|
|
112
116
|
self.metric_arg_ = self.metric
|
|
@@ -114,17 +118,26 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
114
118
|
elif isinstance(self.metric, str):
|
|
115
119
|
metric_str_lowercase = self.metric.lower()
|
|
116
120
|
metric_found = False
|
|
117
|
-
for package_str, source in self.metric_sources_.items():
|
|
121
|
+
for package_str, source in metric_sources_.items():
|
|
122
|
+
|
|
123
|
+
# Don't use scipy for jaccard as their implementation only works with booleans - use custom jaccard instead
|
|
124
|
+
if (
|
|
125
|
+
package_str == "scipy.spatial.distance"
|
|
126
|
+
and metric_str_lowercase == "jaccard"
|
|
127
|
+
):
|
|
128
|
+
continue
|
|
129
|
+
|
|
118
130
|
if hasattr(source, metric_str_lowercase):
|
|
119
131
|
self.metric_fn_ = getattr(source, metric_str_lowercase)
|
|
120
132
|
metric_found = True
|
|
121
|
-
if package_str == "scipy.spatial.distance":
|
|
122
|
-
# Use the string as an argument if it belongs to scipy as it is optimized
|
|
123
|
-
self.metric_arg_ = self.metric
|
|
124
|
-
else:
|
|
125
|
-
self.metric_arg_ = self.metric_fn_
|
|
126
|
-
break
|
|
127
133
|
|
|
134
|
+
# Use the string as an argument if it belongs to scipy as it is optimized
|
|
135
|
+
self.metric_arg_ = (
|
|
136
|
+
self.metric
|
|
137
|
+
if package_str == "scipy.spatial.distance"
|
|
138
|
+
else self.metric_fn_
|
|
139
|
+
)
|
|
140
|
+
break
|
|
128
141
|
if not metric_found:
|
|
129
142
|
raise ValueError(
|
|
130
143
|
f"{self.metric} metric not found. Please pass a string of the name of a metric in scipy.spatial.distance or distances.Distance, or pass a metric function directly. For a list of available metrics, see: https://sidchaini.github.io/DistClassiPy/distances.html or https://docs.scipy.org/doc/scipy/reference/spatial.distance.html"
|
|
@@ -358,9 +371,7 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
358
371
|
sum_1d_dists = sum_1d_dists + dists / self.df_iqr_.loc[cl, feat]
|
|
359
372
|
else:
|
|
360
373
|
sum_1d_dists = sum_1d_dists + dists
|
|
361
|
-
confs = 1 / sum_1d_dists
|
|
362
|
-
# Add epsilon later
|
|
363
|
-
# confs = 1 / (sum_1d_dists + np.finfo(float).eps)
|
|
374
|
+
confs = 1 / np.clip(sum_1d_dists, a_min=np.finfo(float).eps, a_max=None)
|
|
364
375
|
conf_cl.append(confs)
|
|
365
376
|
conf_cl = np.array(conf_cl)
|
|
366
377
|
self.conf_cl_ = conf_cl
|
|
@@ -388,7 +399,9 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
|
|
|
388
399
|
|
|
389
400
|
# Calculate confidence for each prediction
|
|
390
401
|
if method == "distance_inverse":
|
|
391
|
-
self.confidence_df_ = 1 / self.centroid_dist_df_
|
|
402
|
+
self.confidence_df_ = 1 / np.clip(
|
|
403
|
+
self.centroid_dist_df_, a_min=np.finfo(float).eps, a_max=None
|
|
404
|
+
)
|
|
392
405
|
self.confidence_df_.columns = [
|
|
393
406
|
x.replace("_dist", "_conf") for x in self.confidence_df_.columns
|
|
394
407
|
]
|
|
@@ -24,6 +24,7 @@ Notes
|
|
|
24
24
|
|
|
25
25
|
In addition, the following code was added to all functions for array conversion:
|
|
26
26
|
u,v = np.asarray(u), np.asarray(v)
|
|
27
|
+
-----
|
|
27
28
|
"""
|
|
28
29
|
|
|
29
30
|
import numpy as np
|
|
@@ -89,33 +90,33 @@ class Distance:
|
|
|
89
90
|
with np.errstate(divide="ignore", invalid="ignore"):
|
|
90
91
|
return np.sum(np.where(uvmult != 0, ((u - v) ** 2 * (u + v)) / uvmult, 0))
|
|
91
92
|
|
|
92
|
-
def bhattacharyya(self, u, v):
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
93
|
+
# def bhattacharyya(self, u, v):
|
|
94
|
+
# """
|
|
95
|
+
# Calculate the Bhattacharyya distance between two vectors.
|
|
96
|
+
|
|
97
|
+
# Returns a distance value between 0 and 1.
|
|
98
|
+
|
|
99
|
+
# Parameters
|
|
100
|
+
# ----------
|
|
101
|
+
# - u, v: Input vectors between which the distance is to be calculated.
|
|
102
|
+
|
|
103
|
+
# Returns
|
|
104
|
+
# -------
|
|
105
|
+
# - The Bhattacharyya distance between the two vectors.
|
|
106
|
+
|
|
107
|
+
# References
|
|
108
|
+
# ----------
|
|
109
|
+
# 1. Bhattacharyya A (1947) On a measure of divergence between two
|
|
110
|
+
# statistical populations defined by probability distributions,
|
|
111
|
+
# Bull. Calcutta Math. Soc., 35, 99–109.
|
|
112
|
+
# 2. Sung-Hyuk C. (2007) Comprehensive Survey on Distance/Similarity
|
|
113
|
+
# Measures between Probability Density Functions. International
|
|
114
|
+
# Journal of Mathematical Models and Methods in Applied Sciences.
|
|
115
|
+
# 1(4), 300-307.
|
|
116
|
+
# 3. https://en.wikipedia.org/wiki/Bhattacharyya_distance
|
|
117
|
+
# """
|
|
118
|
+
# u, v = np.asarray(u), np.asarray(v)
|
|
119
|
+
# return -np.log(np.sum(np.sqrt(u * v)))
|
|
119
120
|
|
|
120
121
|
def braycurtis(self, u, v):
|
|
121
122
|
"""
|
|
@@ -397,26 +398,26 @@ class Distance:
|
|
|
397
398
|
u, v = np.asarray(u), np.asarray(v)
|
|
398
399
|
return np.linalg.norm(u - v)
|
|
399
400
|
|
|
400
|
-
def fidelity(self, u, v):
|
|
401
|
-
|
|
402
|
-
|
|
401
|
+
# def fidelity(self, u, v):
|
|
402
|
+
# """
|
|
403
|
+
# Calculate the fidelity distance between two vectors.
|
|
403
404
|
|
|
404
|
-
|
|
405
|
+
# The fidelity distance measures the similarity between two probability distributions.
|
|
405
406
|
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
407
|
+
# Parameters
|
|
408
|
+
# ----------
|
|
409
|
+
# - u, v: Input vectors between which the distance is to be calculated.
|
|
409
410
|
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
411
|
+
# Returns
|
|
412
|
+
# -------
|
|
413
|
+
# - The fidelity distance between the two vectors.
|
|
413
414
|
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
415
|
+
# Notes
|
|
416
|
+
# -----
|
|
417
|
+
# Added by SC.
|
|
418
|
+
# """
|
|
419
|
+
# u, v = np.asarray(u), np.asarray(v)
|
|
420
|
+
# return 1 - (np.sum(np.sqrt(u * v)))
|
|
420
421
|
|
|
421
422
|
def google(self, u, v):
|
|
422
423
|
"""
|
|
@@ -515,26 +516,26 @@ class Distance:
|
|
|
515
516
|
u, v = np.asarray(u), np.asarray(v)
|
|
516
517
|
return np.sqrt(2 * np.sum((np.sqrt(u) - np.sqrt(v)) ** 2))
|
|
517
518
|
|
|
518
|
-
def inner(self, u, v):
|
|
519
|
-
|
|
520
|
-
|
|
519
|
+
# def inner(self, u, v):
|
|
520
|
+
# """
|
|
521
|
+
# Calculate the inner product distance between two vectors.
|
|
521
522
|
|
|
522
|
-
|
|
523
|
+
# The inner product distance is a measure of similarity between two vectors, based on their inner product.
|
|
523
524
|
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
525
|
+
# Parameters
|
|
526
|
+
# ----------
|
|
527
|
+
# - u, v: Input vectors between which the distance is to be calculated.
|
|
527
528
|
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
529
|
+
# Returns
|
|
530
|
+
# -------
|
|
531
|
+
# - The inner product distance between the two vectors.
|
|
531
532
|
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
533
|
+
# Notes
|
|
534
|
+
# -----
|
|
535
|
+
# Added by SC.
|
|
536
|
+
# """
|
|
537
|
+
# u, v = np.asarray(u), np.asarray(v)
|
|
538
|
+
# return 1 - np.dot(u, v)
|
|
538
539
|
|
|
539
540
|
def jaccard(self, u, v):
|
|
540
541
|
"""
|
|
@@ -1032,32 +1033,32 @@ class Distance:
|
|
|
1032
1033
|
with np.errstate(divide="ignore", invalid="ignore"):
|
|
1033
1034
|
return np.sum(np.where(u != 0, (u - v) ** 2 / u, 0))
|
|
1034
1035
|
|
|
1035
|
-
def nonintersection(self, u, v):
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
|
|
1040
|
-
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
|
|
1058
|
-
|
|
1059
|
-
|
|
1060
|
-
|
|
1036
|
+
# def nonintersection(self, u, v):
|
|
1037
|
+
# """
|
|
1038
|
+
# Calculate the Nonintersection distance between two vectors.
|
|
1039
|
+
|
|
1040
|
+
# Parameters
|
|
1041
|
+
# ----------
|
|
1042
|
+
# - u, v: Input vectors between which the distance is to be calculated.
|
|
1043
|
+
|
|
1044
|
+
# Returns
|
|
1045
|
+
# -------
|
|
1046
|
+
# - The Nonintersection distance between the two vectors.
|
|
1047
|
+
|
|
1048
|
+
# References
|
|
1049
|
+
# ----------
|
|
1050
|
+
# 1. Sung-Hyuk C. (2007) Comprehensive Survey on Distance/Similarity
|
|
1051
|
+
# Measures between Probability Density Functions. International
|
|
1052
|
+
# Journal of Mathematical Models and Methods in Applied Sciences.
|
|
1053
|
+
# 1(4), 300-307.
|
|
1054
|
+
|
|
1055
|
+
# Notes
|
|
1056
|
+
# -----
|
|
1057
|
+
# When used for comparing two probability density functions (pdfs),
|
|
1058
|
+
# Nonintersection distance equals half of Cityblock distance.
|
|
1059
|
+
# """
|
|
1060
|
+
# u, v = np.asarray(u), np.asarray(v)
|
|
1061
|
+
# return 1 - np.sum(np.minimum(u, v))
|
|
1061
1062
|
|
|
1062
1063
|
def pearson_chisq(self, u, v):
|
|
1063
1064
|
"""
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: distclassipy
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.1.4
|
|
4
4
|
Summary: A python package for a distance-based classifier which can use several different distance metrics.
|
|
5
5
|
Author-email: Siddharth Chaini <sidchaini@gmail.com>
|
|
6
6
|
License: GNU GENERAL PUBLIC LICENSE
|
|
@@ -679,7 +679,7 @@ License: GNU GENERAL PUBLIC LICENSE
|
|
|
679
679
|
<https://www.gnu.org/licenses/why-not-lgpl.html>.
|
|
680
680
|
|
|
681
681
|
Project-URL: Repository, https://github.com/sidchaini/DistClassiPy
|
|
682
|
-
Project-URL:
|
|
682
|
+
Project-URL: Documentation, https://sidchaini.github.io/DistClassiPy/
|
|
683
683
|
Classifier: Development Status :: 4 - Beta
|
|
684
684
|
Classifier: Intended Audience :: Developers
|
|
685
685
|
Classifier: Intended Audience :: Education
|
|
@@ -694,9 +694,9 @@ Requires-Python: >=3.10
|
|
|
694
694
|
Description-Content-Type: text/markdown
|
|
695
695
|
License-File: LICENSE
|
|
696
696
|
Requires-Dist: joblib>=1.3.2
|
|
697
|
-
Requires-Dist: numpy>=1.
|
|
698
|
-
Requires-Dist: pandas>=2.
|
|
699
|
-
Requires-Dist: scikit-learn>=1.4.0
|
|
697
|
+
Requires-Dist: numpy>=1.25.2
|
|
698
|
+
Requires-Dist: pandas>=2.0.3
|
|
699
|
+
Requires-Dist: scikit-learn>=1.2.2
|
|
700
700
|
|
|
701
701
|
<h1 align="center">
|
|
702
702
|
<picture align="center">
|
|
@@ -712,6 +712,9 @@ Requires-Dist: scikit-learn>=1.4.0
|
|
|
712
712
|
[](https://github.com/sidchaini/distclassipy/blob/main/LICENSE)
|
|
713
713
|
[](https://github.com/psf/black)
|
|
714
714
|
|
|
715
|
+
[](https://arxiv.org/abs/2403.12120)
|
|
716
|
+
[](https://ascl.net/2403.002)
|
|
717
|
+
|
|
715
718
|
<!-- [](
|
|
716
719
|
https://doi.org/10.1038/s41586-020-2649-2) -->
|
|
717
720
|
|
|
@@ -743,9 +746,11 @@ print(clf.predict([[0, 0, 0, 0]]))
|
|
|
743
746
|
```
|
|
744
747
|
|
|
745
748
|
## Features
|
|
746
|
-
-
|
|
747
|
-
-
|
|
748
|
-
-
|
|
749
|
+
- **Distance Metric-Based Classification**: Utilizes a variety of distance metrics for classification.
|
|
750
|
+
- **Customizable for Scientific Goals**: Allows fine-tuning based on scientific objectives by selecting appropriate distance metrics and features, enhancing both computational efficiency and model performance.
|
|
751
|
+
- **Interpretable Results**: Offers improved interpretability of classification outcomes by directly using distance metrics and feature importance, making it ideal for scientific applications.
|
|
752
|
+
- **Efficient and Scalable**: Demonstrates lower computational requirements compared to traditional methods like Random Forests, making it suitable for large datasets
|
|
753
|
+
- **Open Source and Accessible**: Available as an open-source Python package on PyPI, encouraging broad application in astronomy and beyond
|
|
749
754
|
|
|
750
755
|
## Documentation
|
|
751
756
|
|
|
@@ -760,30 +765,28 @@ DistClassiPy is released under the [GNU General Public License v3.0](https://www
|
|
|
760
765
|
## Citation
|
|
761
766
|
|
|
762
767
|
If you use DistClassiPy in your research or project, please consider citing the paper:
|
|
763
|
-
> Light Curve Classification with DistClassiPy: a new distance-based classifier
|
|
764
|
-
|
|
768
|
+
> Chaini, S., Mahabal, A., Kembhavi, A., & Bianco, F. B. (2024). Light Curve Classification with DistClassiPy: a new distance-based classifier. arXiv. https://doi.org/10.48550/arXiv.2403.12120
|
|
765
769
|
|
|
766
|
-
<!--
|
|
767
770
|
### Bibtex
|
|
768
771
|
|
|
769
772
|
|
|
770
773
|
```bibtex
|
|
771
|
-
@ARTICLE{
|
|
772
|
-
author = {{Chaini},
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
774
|
+
@ARTICLE{chaini2024light,
|
|
775
|
+
author = {{Chaini}, Siddharth and {Mahabal}, Ashish and {Kembhavi}, Ajit and {Bianco}, Federica B.},
|
|
776
|
+
title = "{Light Curve Classification with DistClassiPy: a new distance-based classifier}",
|
|
777
|
+
journal = {arXiv e-prints},
|
|
778
|
+
keywords = {Astrophysics - Instrumentation and Methods for Astrophysics, Astrophysics - Solar and Stellar Astrophysics, Computer Science - Machine Learning},
|
|
779
|
+
year = 2024,
|
|
780
|
+
month = mar,
|
|
781
|
+
eid = {arXiv:2403.12120},
|
|
782
|
+
pages = {arXiv:2403.12120},
|
|
783
|
+
archivePrefix = {arXiv},
|
|
784
|
+
eprint = {2403.12120},
|
|
785
|
+
primaryClass = {astro-ph.IM},
|
|
786
|
+
adsurl = {https://ui.adsabs.harvard.edu/abs/2024arXiv240312120C},
|
|
787
|
+
adsnote = {Provided by the SAO/NASA Astrophysics Data System}
|
|
784
788
|
}
|
|
785
789
|
```
|
|
786
|
-
-->
|
|
787
790
|
|
|
788
791
|
|
|
789
792
|
<!-- You can also find citation information in the [CITATION.cff](https://github.com/sidchaini/DistClassiPy/CITATION.cff) file. -->
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "distclassipy"
|
|
7
|
-
|
|
7
|
+
dynamic = ["version"]
|
|
8
8
|
description = "A python package for a distance-based classifier which can use several different distance metrics."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.10"
|
|
@@ -24,11 +24,14 @@ classifiers = [
|
|
|
24
24
|
]
|
|
25
25
|
dependencies = [
|
|
26
26
|
"joblib>=1.3.2",
|
|
27
|
-
"numpy>=1.
|
|
28
|
-
"pandas>=2.
|
|
29
|
-
"scikit-learn>=1.
|
|
27
|
+
"numpy>=1.25.2",
|
|
28
|
+
"pandas>=2.0.3",
|
|
29
|
+
"scikit-learn>=1.2.2"
|
|
30
30
|
]
|
|
31
31
|
|
|
32
|
+
[tool.setuptools.dynamic]
|
|
33
|
+
version = {attr = "distclassipy.__version__"}
|
|
34
|
+
|
|
32
35
|
[project.urls]
|
|
33
36
|
Repository = "https://github.com/sidchaini/DistClassiPy"
|
|
34
|
-
|
|
37
|
+
Documentation = "https://sidchaini.github.io/DistClassiPy/"
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from setuptools import setup
|
|
2
|
+
import codecs
|
|
3
|
+
import os.path
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def read(rel_path):
|
|
7
|
+
here = os.path.abspath(os.path.dirname(__file__))
|
|
8
|
+
with codecs.open(os.path.join(here, rel_path), "r") as fp:
|
|
9
|
+
return fp.read()
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def get_version(rel_path):
|
|
13
|
+
for line in read(rel_path).splitlines():
|
|
14
|
+
if line.startswith("__version__"):
|
|
15
|
+
delim = '"' if '"' in line else "'"
|
|
16
|
+
return line.split(delim)[1]
|
|
17
|
+
else:
|
|
18
|
+
raise RuntimeError("Unable to find version string.")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
setup(version=get_version("distclassipy/__init__.py"))
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import pytest
|
|
2
2
|
import numpy as np
|
|
3
3
|
from distclassipy.classifier import DistanceMetricClassifier
|
|
4
|
+
from sklearn.utils.estimator_checks import check_estimator
|
|
4
5
|
|
|
5
6
|
|
|
6
7
|
# Test initialization of the classifier with specific parameters
|
|
@@ -10,12 +11,8 @@ def test_init():
|
|
|
10
11
|
assert clf.scale is True
|
|
11
12
|
|
|
12
13
|
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
# def test_estimator_compatibility():
|
|
16
|
-
# from sklearn.utils.estimator_checks import check_estimator
|
|
17
|
-
|
|
18
|
-
# check_estimator(DistanceMetricClassifier())
|
|
14
|
+
def test_sklearn_compatibility():
|
|
15
|
+
check_estimator(DistanceMetricClassifier())
|
|
19
16
|
|
|
20
17
|
|
|
21
18
|
# Test fitting the classifier to a dataset
|
|
@@ -105,7 +102,7 @@ def test_metric_invalid():
|
|
|
105
102
|
def test_central_stat_median():
|
|
106
103
|
X = np.array([[1, 2], [3, 4], [5, 6]]) # Sample feature set
|
|
107
104
|
y = np.array([0, 1, 0]) # Sample target values
|
|
108
|
-
clf = DistanceMetricClassifier(central_stat="median")
|
|
105
|
+
clf = DistanceMetricClassifier(central_stat="median", dispersion_stat="iqr")
|
|
109
106
|
clf.fit(X, y)
|
|
110
107
|
assert clf.central_stat == "median"
|
|
111
108
|
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
import numpy as np
|
|
3
|
+
from distclassipy.distances import Distance
|
|
4
|
+
|
|
5
|
+
# Initialize the Distance class to use its methods for testing
|
|
6
|
+
distance = Distance()
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def test_all_distances():
|
|
10
|
+
# Define two sample vectors
|
|
11
|
+
u = np.array([1, 2, 3])
|
|
12
|
+
v = np.array([1, 2, 3])
|
|
13
|
+
for func_name in dir(distance):
|
|
14
|
+
if callable(getattr(distance, func_name)) and not func_name.startswith("__"):
|
|
15
|
+
func = getattr(distance, func_name)
|
|
16
|
+
d = func(u, v)
|
|
17
|
+
assert d >= 0
|
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import re
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
def get_version_from_pyproject():
|
|
6
|
-
pyproject_path = os.path.join(os.path.dirname(__file__), "..", "pyproject.toml")
|
|
7
|
-
with open(pyproject_path, "r") as f:
|
|
8
|
-
pyproject_content = f.read()
|
|
9
|
-
version_match = re.search(r'^version\s*=\s*"(.*?)"', pyproject_content, re.M)
|
|
10
|
-
if version_match:
|
|
11
|
-
return version_match.group(1)
|
|
12
|
-
else:
|
|
13
|
-
raise RuntimeError("Version not found in pyproject.toml")
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
__version__ = get_version_from_pyproject()
|
distclassipy-0.1.2/setup.py
DELETED
|
@@ -1,35 +0,0 @@
|
|
|
1
|
-
import pytest
|
|
2
|
-
import numpy as np
|
|
3
|
-
from distclassipy.distances import Distance
|
|
4
|
-
|
|
5
|
-
# Initialize the Distance class to use its methods for testing
|
|
6
|
-
distance = Distance()
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
# Test for the accuracy distance calculation
|
|
10
|
-
def test_acc_distance():
|
|
11
|
-
# Define two sample vectors
|
|
12
|
-
u = np.array([1, 2, 3])
|
|
13
|
-
v = np.array([2, 4, 6])
|
|
14
|
-
# Calculate the expected result manually
|
|
15
|
-
expected = np.mean([np.sum(np.abs(u - v)), np.max(np.abs(u - v))])
|
|
16
|
-
# Assert that the calculated distance matches the expected result
|
|
17
|
-
assert distance.acc(u, v) == expected
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
# Test for the Vicis Wave Hedges distance calculation
|
|
21
|
-
def test_vicis_wave_hedges():
|
|
22
|
-
# Define two sample vectors
|
|
23
|
-
u = np.array([1, 2, 3])
|
|
24
|
-
v = np.array([2, 4, 6])
|
|
25
|
-
# Calculate the minimum of u and v element-wise
|
|
26
|
-
uvmin = np.minimum(u, v)
|
|
27
|
-
# Calculate the absolute difference between u and v
|
|
28
|
-
u_v = np.abs(u - v)
|
|
29
|
-
# Calculate the expected result manually
|
|
30
|
-
expected = np.sum(np.where(uvmin != 0, u_v / uvmin, 0))
|
|
31
|
-
# Assert that the calculated distance matches the expected result
|
|
32
|
-
assert distance.vicis_wave_hedges(u, v) == expected
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
# HAVE TO ADD MORE TESTS
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|