distclassipy 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
distclassipy/__init__.py CHANGED
@@ -12,4 +12,5 @@ from .classifier import (
12
12
  from .distances import (
13
13
  Distance,
14
14
  ) # Importing the Distance class from the distances module
15
- from .version import __version__
15
+
16
+ __version__ = "0.1.3"
@@ -1,3 +1,7 @@
1
+ """
2
+ A module which contains the DistanceMetricClassifier introduced by Chaini et al. (2024) in "Light Curve Classification with DistClassiPy: a new distance-based classifier".
3
+ """
4
+
1
5
  import numpy as np
2
6
  import pandas as pd
3
7
  import scipy
@@ -94,12 +98,6 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
94
98
  self.calculate_kde = calculate_kde
95
99
  self.calculate_1d_dist = calculate_1d_dist
96
100
 
97
- # Hardcoded source packages to check for distance metrics.
98
- self.metric_sources_ = {
99
- "scipy.spatial.distance": scipy.spatial.distance,
100
- "distances.Distance": Distance(),
101
- }
102
-
103
101
  def set_metric_fn_(self):
104
102
  """
105
103
  Set the metric function based on the provided metric.
@@ -107,6 +105,12 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
107
105
  If the metric is a string, the function will look for a corresponding function in scipy.spatial.distance or distances.Distance. If the metric is a function, it will be used directly.
108
106
  """
109
107
 
108
+ # Hardcoded source packages to check for distance metrics.
109
+ metric_sources_ = {
110
+ "scipy.spatial.distance": scipy.spatial.distance,
111
+ "distances.Distance": Distance(),
112
+ }
113
+
110
114
  if callable(self.metric):
111
115
  self.metric_fn_ = self.metric
112
116
  self.metric_arg_ = self.metric
@@ -114,17 +118,26 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
114
118
  elif isinstance(self.metric, str):
115
119
  metric_str_lowercase = self.metric.lower()
116
120
  metric_found = False
117
- for package_str, source in self.metric_sources_.items():
121
+ for package_str, source in metric_sources_.items():
122
+
123
+ # Don't use scipy for jaccard as their implementation only works with booleans - use custom jaccard instead
124
+ if (
125
+ package_str == "scipy.spatial.distance"
126
+ and metric_str_lowercase == "jaccard"
127
+ ):
128
+ continue
129
+
118
130
  if hasattr(source, metric_str_lowercase):
119
131
  self.metric_fn_ = getattr(source, metric_str_lowercase)
120
132
  metric_found = True
121
- if package_str == "scipy.spatial.distance":
122
- # Use the string as an argument if it belongs to scipy as it is optimized
123
- self.metric_arg_ = self.metric
124
- else:
125
- self.metric_arg_ = self.metric_fn_
126
- break
127
133
 
134
+ # Use the string as an argument if it belongs to scipy as it is optimized
135
+ self.metric_arg_ = (
136
+ self.metric
137
+ if package_str == "scipy.spatial.distance"
138
+ else self.metric_fn_
139
+ )
140
+ break
128
141
  if not metric_found:
129
142
  raise ValueError(
130
143
  f"{self.metric} metric not found. Please pass a string of the name of a metric in scipy.spatial.distance or distances.Distance, or pass a metric function directly. For a list of available metrics, see: https://sidchaini.github.io/DistClassiPy/distances.html or https://docs.scipy.org/doc/scipy/reference/spatial.distance.html"
@@ -358,9 +371,7 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
358
371
  sum_1d_dists = sum_1d_dists + dists / self.df_iqr_.loc[cl, feat]
359
372
  else:
360
373
  sum_1d_dists = sum_1d_dists + dists
361
- confs = 1 / sum_1d_dists
362
- # Add epsilon later
363
- # confs = 1 / (sum_1d_dists + np.finfo(float).eps)
374
+ confs = 1 / np.clip(sum_1d_dists, a_min=np.finfo(float).eps, a_max=None)
364
375
  conf_cl.append(confs)
365
376
  conf_cl = np.array(conf_cl)
366
377
  self.conf_cl_ = conf_cl
@@ -388,7 +399,9 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
388
399
 
389
400
  # Calculate confidence for each prediction
390
401
  if method == "distance_inverse":
391
- self.confidence_df_ = 1 / self.centroid_dist_df_
402
+ self.confidence_df_ = 1 / np.clip(
403
+ self.centroid_dist_df_, a_min=np.finfo(float).eps, a_max=None
404
+ )
392
405
  self.confidence_df_.columns = [
393
406
  x.replace("_dist", "_conf") for x in self.confidence_df_.columns
394
407
  ]
distclassipy/distances.py CHANGED
@@ -24,6 +24,7 @@ Notes
24
24
 
25
25
  In addition, the following code was added to all functions for array conversion:
26
26
  u,v = np.asarray(u), np.asarray(v)
27
+ -----
27
28
  """
28
29
 
29
30
  import numpy as np
@@ -89,33 +90,33 @@ class Distance:
89
90
  with np.errstate(divide="ignore", invalid="ignore"):
90
91
  return np.sum(np.where(uvmult != 0, ((u - v) ** 2 * (u + v)) / uvmult, 0))
91
92
 
92
- def bhattacharyya(self, u, v):
93
- """
94
- Calculate the Bhattacharyya distance between two vectors.
95
-
96
- Returns a distance value between 0 and 1.
97
-
98
- Parameters
99
- ----------
100
- - u, v: Input vectors between which the distance is to be calculated.
101
-
102
- Returns
103
- -------
104
- - The Bhattacharyya distance between the two vectors.
105
-
106
- References
107
- ----------
108
- 1. Bhattacharyya A (1947) On a measure of divergence between two
109
- statistical populations defined by probability distributions,
110
- Bull. Calcutta Math. Soc., 35, 99–109.
111
- 2. Sung-Hyuk C. (2007) Comprehensive Survey on Distance/Similarity
112
- Measures between Probability Density Functions. International
113
- Journal of Mathematical Models and Methods in Applied Sciences.
114
- 1(4), 300-307.
115
- 3. https://en.wikipedia.org/wiki/Bhattacharyya_distance
116
- """
117
- u, v = np.asarray(u), np.asarray(v)
118
- return -np.log(np.sum(np.sqrt(u * v)))
93
+ # def bhattacharyya(self, u, v):
94
+ # """
95
+ # Calculate the Bhattacharyya distance between two vectors.
96
+
97
+ # Returns a non-negative distance value (0 for identical probability distributions, growing without bound as their overlap vanishes).
98
+
99
+ # Parameters
100
+ # ----------
101
+ # - u, v: Input vectors between which the distance is to be calculated.
102
+
103
+ # Returns
104
+ # -------
105
+ # - The Bhattacharyya distance between the two vectors.
106
+
107
+ # References
108
+ # ----------
109
+ # 1. Bhattacharyya A (1947) On a measure of divergence between two
110
+ # statistical populations defined by probability distributions,
111
+ # Bull. Calcutta Math. Soc., 35, 99–109.
112
+ # 2. Sung-Hyuk C. (2007) Comprehensive Survey on Distance/Similarity
113
+ # Measures between Probability Density Functions. International
114
+ # Journal of Mathematical Models and Methods in Applied Sciences.
115
+ # 1(4), 300-307.
116
+ # 3. https://en.wikipedia.org/wiki/Bhattacharyya_distance
117
+ # """
118
+ # u, v = np.asarray(u), np.asarray(v)
119
+ # return -np.log(np.sum(np.sqrt(u * v)))
119
120
 
120
121
  def braycurtis(self, u, v):
121
122
  """
@@ -397,26 +398,26 @@ class Distance:
397
398
  u, v = np.asarray(u), np.asarray(v)
398
399
  return np.linalg.norm(u - v)
399
400
 
400
- def fidelity(self, u, v):
401
- """
402
- Calculate the fidelity distance between two vectors.
401
+ # def fidelity(self, u, v):
402
+ # """
403
+ # Calculate the fidelity distance between two vectors.
403
404
 
404
- The fidelity distance measures the similarity between two probability distributions.
405
+ # The fidelity distance measures the similarity between two probability distributions.
405
406
 
406
- Parameters
407
- ----------
408
- - u, v: Input vectors between which the distance is to be calculated.
407
+ # Parameters
408
+ # ----------
409
+ # - u, v: Input vectors between which the distance is to be calculated.
409
410
 
410
- Returns
411
- -------
412
- - The fidelity distance between the two vectors.
411
+ # Returns
412
+ # -------
413
+ # - The fidelity distance between the two vectors.
413
414
 
414
- Notes
415
- -----
416
- Added by SC.
417
- """
418
- u, v = np.asarray(u), np.asarray(v)
419
- return 1 - (np.sum(np.sqrt(u * v)))
415
+ # Notes
416
+ # -----
417
+ # Added by SC.
418
+ # """
419
+ # u, v = np.asarray(u), np.asarray(v)
420
+ # return 1 - (np.sum(np.sqrt(u * v)))
420
421
 
421
422
  def google(self, u, v):
422
423
  """
@@ -515,26 +516,26 @@ class Distance:
515
516
  u, v = np.asarray(u), np.asarray(v)
516
517
  return np.sqrt(2 * np.sum((np.sqrt(u) - np.sqrt(v)) ** 2))
517
518
 
518
- def inner(self, u, v):
519
- """
520
- Calculate the inner product distance between two vectors.
519
+ # def inner(self, u, v):
520
+ # """
521
+ # Calculate the inner product distance between two vectors.
521
522
 
522
- The inner product distance is a measure of similarity between two vectors, based on their inner product.
523
+ # The inner product distance is a measure of similarity between two vectors, based on their inner product.
523
524
 
524
- Parameters
525
- ----------
526
- - u, v: Input vectors between which the distance is to be calculated.
525
+ # Parameters
526
+ # ----------
527
+ # - u, v: Input vectors between which the distance is to be calculated.
527
528
 
528
- Returns
529
- -------
530
- - The inner product distance between the two vectors.
529
+ # Returns
530
+ # -------
531
+ # - The inner product distance between the two vectors.
531
532
 
532
- Notes
533
- -----
534
- Added by SC.
535
- """
536
- u, v = np.asarray(u), np.asarray(v)
537
- return 1 - np.dot(u, v)
533
+ # Notes
534
+ # -----
535
+ # Added by SC.
536
+ # """
537
+ # u, v = np.asarray(u), np.asarray(v)
538
+ # return 1 - np.dot(u, v)
538
539
 
539
540
  def jaccard(self, u, v):
540
541
  """
@@ -1032,32 +1033,32 @@ class Distance:
1032
1033
  with np.errstate(divide="ignore", invalid="ignore"):
1033
1034
  return np.sum(np.where(u != 0, (u - v) ** 2 / u, 0))
1034
1035
 
1035
- def nonintersection(self, u, v):
1036
- """
1037
- Calculate the Nonintersection distance between two vectors.
1038
-
1039
- Parameters
1040
- ----------
1041
- - u, v: Input vectors between which the distance is to be calculated.
1042
-
1043
- Returns
1044
- -------
1045
- - The Nonintersection distance between the two vectors.
1046
-
1047
- References
1048
- ----------
1049
- 1. Sung-Hyuk C. (2007) Comprehensive Survey on Distance/Similarity
1050
- Measures between Probability Density Functions. International
1051
- Journal of Mathematical Models and Methods in Applied Sciences.
1052
- 1(4), 300-307.
1053
-
1054
- Notes
1055
- -----
1056
- When used for comparing two probability density functions (pdfs),
1057
- Nonintersection distance equals half of Cityblock distance.
1058
- """
1059
- u, v = np.asarray(u), np.asarray(v)
1060
- return 1 - np.sum(np.minimum(u, v))
1036
+ # def nonintersection(self, u, v):
1037
+ # """
1038
+ # Calculate the Nonintersection distance between two vectors.
1039
+
1040
+ # Parameters
1041
+ # ----------
1042
+ # - u, v: Input vectors between which the distance is to be calculated.
1043
+
1044
+ # Returns
1045
+ # -------
1046
+ # - The Nonintersection distance between the two vectors.
1047
+
1048
+ # References
1049
+ # ----------
1050
+ # 1. Sung-Hyuk C. (2007) Comprehensive Survey on Distance/Similarity
1051
+ # Measures between Probability Density Functions. International
1052
+ # Journal of Mathematical Models and Methods in Applied Sciences.
1053
+ # 1(4), 300-307.
1054
+
1055
+ # Notes
1056
+ # -----
1057
+ # When used for comparing two probability density functions (pdfs),
1058
+ # Nonintersection distance equals half of Cityblock distance.
1059
+ # """
1060
+ # u, v = np.asarray(u), np.asarray(v)
1061
+ # return 1 - np.sum(np.minimum(u, v))
1061
1062
 
1062
1063
  def pearson_chisq(self, u, v):
1063
1064
  """
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: distclassipy
3
- Version: 0.1.2
3
+ Version: 0.1.3
4
4
  Summary: A python package for a distance-based classifier which can use several different distance metrics.
5
5
  Author-email: Siddharth Chaini <sidchaini@gmail.com>
6
6
  License: GNU GENERAL PUBLIC LICENSE
@@ -679,7 +679,7 @@ License: GNU GENERAL PUBLIC LICENSE
679
679
  <https://www.gnu.org/licenses/why-not-lgpl.html>.
680
680
 
681
681
  Project-URL: Repository, https://github.com/sidchaini/DistClassiPy
682
- Project-URL: Documenation, https://sidchaini.github.io/DistClassiPy/
682
+ Project-URL: Documentation, https://sidchaini.github.io/DistClassiPy/
683
683
  Classifier: Development Status :: 4 - Beta
684
684
  Classifier: Intended Audience :: Developers
685
685
  Classifier: Intended Audience :: Education
@@ -712,6 +712,9 @@ Requires-Dist: scikit-learn >=1.4.0
712
712
  [![License - GPL-3](https://img.shields.io/pypi/l/distclassipy.svg)](https://github.com/sidchaini/distclassipy/blob/main/LICENSE)
713
713
  [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
714
714
 
715
+ [![arXiv](https://img.shields.io/badge/arXiv-astro--ph%2F2403.12120-red)](https://arxiv.org/abs/2403.12120)
716
+ [![ascl:2403.002](https://img.shields.io/badge/ascl-2403.002-blue.svg?colorB=262255)](https://ascl.net/2403.002)
717
+
715
718
  <!-- [![Paper](https://img.shields.io/badge/DOI-10.1038%2Fs41586--020--2649--2-blue)](
716
719
  https://doi.org/10.1038/s41586-020-2649-2) -->
717
720
 
@@ -743,9 +746,11 @@ print(clf.predict([[0, 0, 0, 0]]))
743
746
  ```
744
747
 
745
748
  ## Features
746
- - Multiple distance metrics support
747
- - Easy integration with existing data processing pipelines
748
- - Efficient and scalable for large datasets
749
+ - **Distance Metric-Based Classification**: Utilizes a variety of distance metrics for classification.
750
+ - **Customizable for Scientific Goals**: Allows fine-tuning based on scientific objectives by selecting appropriate distance metrics and features, enhancing both computational efficiency and model performance.
751
+ - **Interpretable Results**: Offers improved interpretability of classification outcomes by directly using distance metrics and feature importance, making it ideal for scientific applications.
752
+ - **Efficient and Scalable**: Demonstrates lower computational requirements compared to traditional methods like Random Forests, making it suitable for large datasets.
753
+ - **Open Source and Accessible**: Available as an open-source Python package on PyPI, encouraging broad application in astronomy and beyond.
749
754
 
750
755
  ## Documentation
751
756
 
@@ -760,30 +765,28 @@ DistClassiPy is released under the [GNU General Public License v3.0](https://www
760
765
  ## Citation
761
766
 
762
767
  If you use DistClassiPy in your research or project, please consider citing the paper:
763
- > Light Curve Classification with DistClassiPy: a new distance-based classifier (submitted to A&C)
764
-
768
+ > Chaini, S., Mahabal, A., Kembhavi, A., & Bianco, F. B. (2024). Light Curve Classification with DistClassiPy: a new distance-based classifier. arXiv. https://doi.org/10.48550/arXiv.2403.12120
765
769
 
766
- <!--
767
770
  ### Bibtex
768
771
 
769
772
 
770
773
  ```bibtex
771
- @ARTICLE{Chaini2024,
772
- author = {{Chaini}, S. and {Mahabal}, A. and {Kembhavi}, A. and {Bianco}, F.~B.},
773
- title = "{Light Curve Classification with DistClassiPy: a new distance-based classifier}",
774
- journal = {Submitted to A&C},
775
- % keywords = {},
776
- year = 2024,
777
- % month = ,
778
- % volume = {},
779
- % eid = {},
780
- % pages = {},
781
- % doi = {},
782
- % adsurl = {},
783
- % adsnote = {}
774
+ @ARTICLE{chaini2024light,
775
+ author = {{Chaini}, Siddharth and {Mahabal}, Ashish and {Kembhavi}, Ajit and {Bianco}, Federica B.},
776
+ title = "{Light Curve Classification with DistClassiPy: a new distance-based classifier}",
777
+ journal = {arXiv e-prints},
778
+ keywords = {Astrophysics - Instrumentation and Methods for Astrophysics, Astrophysics - Solar and Stellar Astrophysics, Computer Science - Machine Learning},
779
+ year = 2024,
780
+ month = mar,
781
+ eid = {arXiv:2403.12120},
782
+ pages = {arXiv:2403.12120},
783
+ archivePrefix = {arXiv},
784
+ eprint = {2403.12120},
785
+ primaryClass = {astro-ph.IM},
786
+ adsurl = {https://ui.adsabs.harvard.edu/abs/2024arXiv240312120C},
787
+ adsnote = {Provided by the SAO/NASA Astrophysics Data System}
784
788
  }
785
789
  ```
786
- -->
787
790
 
788
791
 
789
792
  <!-- You can also find citation information in the [CITATION.cff](https://github.com/sidchaini/DistClassiPy/CITATION.cff) file. -->
@@ -0,0 +1,8 @@
1
+ distclassipy/__init__.py,sha256=5JXhOQjWW6Zx9-NhnDysSjFl3D4tVjawMBuQgPKAx8U,515
2
+ distclassipy/classifier.py,sha256=U7Adweo6dLXYAEk88ITguT5YX-PaWQ0F8lHJAiKeAdw,18143
3
+ distclassipy/distances.py,sha256=fTAS8bqnEYc_xPDf3bO8dPRQiZjuqsH2eubW3ghwVWQ,49563
4
+ distclassipy-0.1.3.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
5
+ distclassipy-0.1.3.dist-info/METADATA,sha256=DmxYfskQMKMurSZnTD8JGqWTqTpJ53QpqUtvo6xUhnU,46339
6
+ distclassipy-0.1.3.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
7
+ distclassipy-0.1.3.dist-info/top_level.txt,sha256=jiwqhSkq7CMCjV_Zar2dSDBO63o5C_Dp2tpGiVV6COE,13
8
+ distclassipy-0.1.3.dist-info/RECORD,,
distclassipy/version.py DELETED
@@ -1,16 +0,0 @@
1
- import os
2
- import re
3
-
4
-
5
- def get_version_from_pyproject():
6
- pyproject_path = os.path.join(os.path.dirname(__file__), "..", "pyproject.toml")
7
- with open(pyproject_path, "r") as f:
8
- pyproject_content = f.read()
9
- version_match = re.search(r'^version\s*=\s*"(.*?)"', pyproject_content, re.M)
10
- if version_match:
11
- return version_match.group(1)
12
- else:
13
- raise RuntimeError("Version not found in pyproject.toml")
14
-
15
-
16
- __version__ = get_version_from_pyproject()
@@ -1,9 +0,0 @@
1
- distclassipy/__init__.py,sha256=ktlw7OVuC2C6xPV3ZkTpaIo_fuRsXsXDtmrk2ZVgleg,525
2
- distclassipy/classifier.py,sha256=rvXMgMOL30Vqpbzazl08eYBk0dk5uPVmX3l-u6385zo,17633
3
- distclassipy/distances.py,sha256=21fsoX6MfeGr7XeV7oqyVZVSr_4aCUYkgr2qriGb3AI,49403
4
- distclassipy/version.py,sha256=piUqd7Jqjr5Y8FPF5YjkMnmrraDX-H2yHnUGP0QibDo,482
5
- distclassipy-0.1.2.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
6
- distclassipy-0.1.2.dist-info/METADATA,sha256=foOlSgUE8ZZTxBJR6hCZ9l7rZjNdrUkRJTeUiOkVu0s,45016
7
- distclassipy-0.1.2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
8
- distclassipy-0.1.2.dist-info/top_level.txt,sha256=jiwqhSkq7CMCjV_Zar2dSDBO63o5C_Dp2tpGiVV6COE,13
9
- distclassipy-0.1.2.dist-info/RECORD,,