distclassipy 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.
- distclassipy/__init__.py +1 -1
- distclassipy/classifier.py +127 -112
- {distclassipy-0.2.0.dist-info → distclassipy-0.2.1.dist-info}/METADATA +1 -1
- distclassipy-0.2.1.dist-info/RECORD +8 -0
- distclassipy-0.2.0.dist-info/RECORD +0 -8
- {distclassipy-0.2.0.dist-info → distclassipy-0.2.1.dist-info}/LICENSE +0 -0
- {distclassipy-0.2.0.dist-info → distclassipy-0.2.1.dist-info}/WHEEL +0 -0
- {distclassipy-0.2.0.dist-info → distclassipy-0.2.1.dist-info}/top_level.txt +0 -0
distclassipy/__init__.py
CHANGED
distclassipy/classifier.py
CHANGED
@@ -6,6 +6,12 @@ in "Light Curve Classification with DistClassiPy: a new distance-based classifie
 
 .. autoclass:: distclassipy.classifier.DistanceMetricClassifier
     :members:
+    :inherited-members:
+    :exclude-members: set_fit_request, set_predict_request
+
+.. autoclass:: distclassipy.classifier.EnsembleDistanceClassifier
+    :members:
+    :inherited-members:
     :exclude-members: set_fit_request, set_predict_request
 
 .. doctest-skip::
@@ -124,16 +130,6 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
 
     .. versionadded:: 0.1.0
 
-
-    Attributes
-    ----------
-    scale : bool
-        Indicates whether the data is scaled.
-    central_stat : str
-        The statistic used for calculating central tendency.
-    dispersion_stat : str
-        The statistic used for calculating dispersion.
-
     References
     ----------
     .. [1] "Light Curve Classification with DistClassiPy: a new distance-based
@@ -434,89 +430,6 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
         return accuracy_score(y, y_pred)
 
 
-def find_best_metrics(
-    clf: "DistanceMetricClassifier",
-    X: np.ndarray,
-    y: np.ndarray,
-    feat_idx: int,
-    n_quantiles: int = 4,
-    metrics_to_consider: list[str] = None,
-) -> Tuple[pd.DataFrame, pd.Series, np.ndarray]:
-    """Evaluate and find the best distance metrics for a given feature.
-
-    This function evaluates different distance metrics to determine which
-    performs best for a specific feature in the dataset. It splits the data
-    into quantiles based on the specified feature and calculates the accuracy
-    of the classifier for each metric within these quantiles.
-
-    .. versionadded:: 0.2.0
-
-    Parameters
-    ----------
-    clf : DistanceMetricClassifier
-        The classifier instance to be used for evaluation.
-    X : np.ndarray
-        The input feature matrix.
-    y : np.ndarray
-        The target labels.
-    feat_idx : int
-        The index of the feature to be used for quantile splitting.
-    n_quantiles : int, default=4
-        The number of quantiles to split the data into.
-    metrics_to_consider : list of str, optional
-        A list of distance metrics to evaluate. If None, all available
-        metrics within DistClassiPy will be considered.
-
-    Returns
-    -------
-    quantile_scores_df : pd.DataFrame
-        A DataFrame containing the accuracy scores for each metric across
-        different quantiles.
-    best_metrics_per_quantile : pd.Series
-        A Series indicating the best-performing metric for each quantile.
-    group_bins : np.ndarray
-        The bins used for quantile splitting.
-    """
-    X = check_array(X)
-    feature_labels = [f"Feature_{i}" for i in range(X.shape[1])]
-    feature_name = f"Feature_{feat_idx}"
-
-    if metrics_to_consider is None:
-        metrics_to_consider = _ALL_METRICS
-
-    X_df = pd.DataFrame(X, columns=feature_labels)
-    y_df = pd.DataFrame(y, columns=["Target"])
-    quantiles, group_bins = pd.qcut(X_df[feature_name], q=n_quantiles, retbins=True)
-
-    X_train, X_test, y_train, y_test = train_test_split(
-        X_df, y_df, test_size=0.25, stratify=quantiles
-    )
-
-    clf.fit(X_train, y_train.to_numpy().ravel())
-    grouped_test_data = X_test.groupby(quantiles, observed=False)
-
-    quantile_scores = []
-    for metric in metrics_to_consider:
-        scores_for_metric = [
-            accuracy_score(
-                y_test.loc[subdf.index], clf.predict(subdf.to_numpy(), metric=metric)
-            )
-            for _, subdf in grouped_test_data
-        ]
-        quantile_scores.append(scores_for_metric)
-
-    quantile_scores = np.array(quantile_scores) * 100
-    quantile_scores_df = pd.DataFrame(
-        data=quantile_scores,
-        index=metrics_to_consider,
-        columns=[f"Quantile {i+1}" for i in range(n_quantiles)],
-    )
-
-    best_metrics_per_quantile = quantile_scores_df.idxmax()
-
-    return quantile_scores_df, best_metrics_per_quantile, group_bins
-
-
 class EnsembleDistanceClassifier(BaseEstimator, ClassifierMixin):
     """An ensemble classifier that uses different metrics for each quantile.
 
@@ -537,6 +450,7 @@ class EnsembleDistanceClassifier(BaseEstimator, ClassifierMixin):
         central_stat: str = "median",
         dispersion_stat: str = "std",
         metrics_to_consider: list[str] = None,
+        random_state: int = None,
     ) -> None:
         """Initialize the classifier with specified parameters.
 
@@ -553,12 +467,18 @@
         metrics_to_consider : list of str, optional
             A list of distance metrics to evaluate. If None, all available
             metrics within DistClassiPy will be considered.
+        random_state : int, RandomState instance or None, optional (default=None)
+            Controls the randomness of the estimator. Pass an int for reproducible
+            output across multiple function calls.
+
+            .. versionadded:: 0.2.1
         """
         self.feat_idx = feat_idx
         self.scale = scale
         self.central_stat = central_stat
         self.dispersion_stat = dispersion_stat
         self.metrics_to_consider = metrics_to_consider
+        self.random_state = random_state
 
     def fit(
         self, X: np.ndarray, y: np.ndarray, n_quantiles: int = 4
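For context, a minimal usage sketch (not taken from the package's documentation): it assumes EnsembleDistanceClassifier is importable from distclassipy.classifier as the autoclass path above suggests, that its remaining constructor defaults are usable as-is, and it uses scikit-learn only to generate toy data. The feat_idx and random_state parameters, the fit(..., n_quantiles=...) signature, and the best_metrics_per_quantile_ attribute are the names visible in this diff.

# Illustrative sketch only; everything not shown in the diff above is an assumption.
import numpy as np
from sklearn.datasets import make_classification
from distclassipy.classifier import EnsembleDistanceClassifier

X, y = make_classification(n_samples=400, n_features=4, random_state=42)

# random_state (new in 0.2.1) is stored on the estimator and forwarded to the
# internal train/test split, so repeated fits pick the same per-quantile metrics.
clf = EnsembleDistanceClassifier(feat_idx=0, random_state=42)
clf.fit(X, y, n_quantiles=4)
print(clf.best_metrics_per_quantile_)  # best metric chosen for each quantile
print(clf.predict(X)[:10])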
@@ -625,26 +545,26 @@
         quantiles = pd.cut(
             X[:, self.feat_idx], bins=self.group_bins, labels=self.group_labels
         )
-
-        quantile_indices = quantiles.codes  # Get integer codes for quantiles
-        predictions = np.empty(X.shape[0], dtype=
-
-
-
-
-        # Precompute predictions for each quantile
-        quantile_predictions = {}
-        for i, label in enumerate(self.group_labels):
-            best_metric = self.best_metrics_per_quantile_.loc[label]
-            quantile_data = X[quantile_indices == i]
-            if quantile_data.size > 0:
-                quantile_predictions[i] = self.clf_.predict(
-                    quantile_data, metric=best_metric
-                )
+        grouped_data = pd.DataFrame(X).groupby(quantiles, observed=False)
+        # quantile_indices = quantiles.codes  # Get integer codes for quantiles
+        predictions = np.empty(X.shape[0], dtype=object)  # Change dtype to object
+        for i, (lim, subdf) in enumerate(grouped_data):
+            best_metric = self.best_metrics_per_quantile_.loc[self.group_labels[i]]
+            preds = self.clf_.predict(subdf.to_numpy(), metric=best_metric)
+            predictions[subdf.index] = preds
+        # # Precompute predictions for each quantile
+        # quantile_predictions = {}
+        # for i, label in enumerate(self.group_labels):
+        #     best_metric = self.best_metrics_per_quantile_.loc[label]
+        #     quantile_data = X[quantile_indices == i]
+        #     if quantile_data.size > 0:
+        #         quantile_predictions[i] = self.clf_.predict(
+        #             quantile_data, metric=best_metric
+        #         )
 
         # Assign predictions to the corresponding indices
-        for i, preds in quantile_predictions.items():
-            predictions[quantile_indices == i] = preds
+        # for i, preds in quantile_predictions.items():
+        #     predictions[quantile_indices == i] = preds
 
         return predictions
 
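The replacement predict() body above drops the integer-code bookkeeping in favour of a pandas groupby over the fitted bins. The following self-contained sketch illustrates that pattern with stand-in bins, labels, and a dummy per-group rule; none of these stand-in names belong to distclassipy.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
X = rng.normal(size=(10, 3))
bins = [-np.inf, -0.5, 0.5, np.inf]           # stand-in for the fitted group_bins
labels = ["low", "mid", "high"]               # stand-in for group_labels
rule = {"low": "A", "mid": "B", "high": "C"}  # stand-in for the best metric per quantile

# Bin every row by the value of feature 0, mirroring the pd.cut call in the diff.
quantiles = pd.cut(X[:, 0], bins=bins, labels=labels)

# dtype=object so the array can hold arbitrary class labels.
predictions = np.empty(X.shape[0], dtype=object)

# The default RangeIndex of pd.DataFrame(X) doubles as the row position,
# which is what lets predictions[subdf.index] = ... slot results back in place.
for label, subdf in pd.DataFrame(X).groupby(quantiles, observed=False):
    # In the real classifier this would be clf_.predict(subdf.to_numpy(), metric=best_metric).
    predictions[subdf.index] = rule[label]

print(predictions)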
@@ -677,5 +597,100 @@
             The bins used for quantile splitting.
         """
         return find_best_metrics(
-            self.clf_,
+            self.clf_,
+            X,
+            y,
+            self.feat_idx,
+            n_quantiles,
+            self.metrics_to_consider,
+            self.random_state,
         )
+
+
+def find_best_metrics(
+    clf: "DistanceMetricClassifier",
+    X: np.ndarray,
+    y: np.ndarray,
+    feat_idx: int,
+    n_quantiles: int = 4,
+    metrics_to_consider: list[str] = None,
+    random_state: int = None,
+) -> Tuple[pd.DataFrame, pd.Series, np.ndarray]:
+    """Evaluate and find the best distance metrics for a given feature.
+
+    This function evaluates different distance metrics to determine which
+    performs best for a specific feature in the dataset. It splits the data
+    into quantiles based on the specified feature and calculates the accuracy
+    of the classifier for each metric within these quantiles.
+
+    .. versionadded:: 0.2.0
+
+    Parameters
+    ----------
+    clf : DistanceMetricClassifier
+        The classifier instance to be used for evaluation.
+    X : np.ndarray
+        The input feature matrix.
+    y : np.ndarray
+        The target labels.
+    feat_idx : int
+        The index of the feature to be used for quantile splitting.
+    n_quantiles : int, default=4
+        The number of quantiles to split the data into.
+    metrics_to_consider : list of str, optional
+        A list of distance metrics to evaluate. If None, all available
+        metrics within DistClassiPy will be considered.
+    random_state : int, RandomState instance or None, optional (default=None)
+        Controls the randomness of the estimator. Pass an int for reproducible
+        output across multiple function calls.
+
+        .. versionadded:: 0.2.1
+
+    Returns
+    -------
+    quantile_scores_df : pd.DataFrame
+        A DataFrame containing the accuracy scores for each metric across
+        different quantiles.
+    best_metrics_per_quantile : pd.Series
+        A Series indicating the best-performing metric for each quantile.
+    group_bins : np.ndarray
+        The bins used for quantile splitting.
+    """
+    X = check_array(X)
+    feature_labels = [f"Feature_{i}" for i in range(X.shape[1])]
+    feature_name = f"Feature_{feat_idx}"
+
+    if metrics_to_consider is None:
+        metrics_to_consider = _ALL_METRICS
+
+    X_df = pd.DataFrame(X, columns=feature_labels)
+    y_df = pd.DataFrame(y, columns=["Target"])
+    quantiles, group_bins = pd.qcut(X_df[feature_name], q=n_quantiles, retbins=True)
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        X_df, y_df, test_size=0.25, stratify=quantiles, random_state=random_state
+    )
+
+    clf.fit(X_train, y_train.to_numpy().ravel())
+    grouped_test_data = X_test.groupby(quantiles, observed=False)
+
+    quantile_scores = []
+    for metric in metrics_to_consider:
+        scores_for_metric = [
+            accuracy_score(
+                y_test.loc[subdf.index], clf.predict(subdf.to_numpy(), metric=metric)
+            )
+            for _, subdf in grouped_test_data
+        ]
+        quantile_scores.append(scores_for_metric)
+
+    quantile_scores = np.array(quantile_scores) * 100
+    quantile_scores_df = pd.DataFrame(
+        data=quantile_scores,
+        index=metrics_to_consider,
+        columns=[f"Quantile {i+1}" for i in range(n_quantiles)],
+    )
+
+    best_metrics_per_quantile = quantile_scores_df.idxmax()
+
+    return quantile_scores_df, best_metrics_per_quantile, group_bins
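Since find_best_metrics is now a module-level function that accepts random_state, it can also be called directly. Below is a hedged sketch, assuming the function and DistanceMetricClassifier are importable from distclassipy.classifier, that the classifier's default construction is valid, and that "euclidean" and "cityblock" are accepted metric names; the toy data comes from scikit-learn.

import numpy as np
from sklearn.datasets import make_classification
from distclassipy.classifier import DistanceMetricClassifier, find_best_metrics

X, y = make_classification(n_samples=400, n_features=4, random_state=0)
clf = DistanceMetricClassifier()  # assumption: default construction is valid

scores_df, best_per_quantile, bins = find_best_metrics(
    clf,
    X,
    y,
    feat_idx=0,                                      # split on the first feature
    n_quantiles=4,
    metrics_to_consider=["euclidean", "cityblock"],  # assumed metric names
    random_state=0,                                  # new in 0.2.1
)
print(scores_df)          # accuracy (%) per metric and quantile
print(best_per_quantile)  # best metric for each quantile

Passing the same random_state reproduces the internal train_test_split, so the per-quantile scores and the chosen metrics are stable across runs.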
{distclassipy-0.2.0.dist-info → distclassipy-0.2.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: distclassipy
-Version: 0.2.0
+Version: 0.2.1
 Summary: A python package for a distance-based classifier which can use several different distance metrics.
 Author-email: Siddharth Chaini <sidchaini@gmail.com>
 License: GNU GENERAL PUBLIC LICENSE
distclassipy-0.2.1.dist-info/RECORD
ADDED
@@ -0,0 +1,8 @@
+distclassipy/__init__.py,sha256=jOQm_ReOhCZKV2yzZ7uXG_Txw7fSFvCvxD0l-qWvJgM,1230
+distclassipy/classifier.py,sha256=G_Ah_mvCnMMs5JSSB_PNthNjFGqvunmFREey-mI4kwg,26062
+distclassipy/distances.py,sha256=FXREhY-HcSZbrCrmP5MBJaqbxqyf3gnzgPYS9pVslwA,54358
+distclassipy-0.2.1.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+distclassipy-0.2.1.dist-info/METADATA,sha256=8HexdfPbkIIRsLHJNRfW-R7HaxFJB4FnE6oHqnzvlN0,47174
+distclassipy-0.2.1.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
+distclassipy-0.2.1.dist-info/top_level.txt,sha256=jiwqhSkq7CMCjV_Zar2dSDBO63o5C_Dp2tpGiVV6COE,13
+distclassipy-0.2.1.dist-info/RECORD,,
distclassipy-0.2.0.dist-info/RECORD
DELETED
@@ -1,8 +0,0 @@
-distclassipy/__init__.py,sha256=26ocDZ8viE2UEOTOzEvbf78QwHzzOOeKPjYJvVwv-wE,1230
-distclassipy/classifier.py,sha256=ymzqN-bzgUVbMaiKhWHTJPy377QRyhw_OY1zWwV6eKA,25381
-distclassipy/distances.py,sha256=FXREhY-HcSZbrCrmP5MBJaqbxqyf3gnzgPYS9pVslwA,54358
-distclassipy-0.2.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
-distclassipy-0.2.0.dist-info/METADATA,sha256=97wSaNp9KgLZwayhc5SJ1tURtS16La7Bl6fyBB56Ysk,47174
-distclassipy-0.2.0.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
-distclassipy-0.2.0.dist-info/top_level.txt,sha256=jiwqhSkq7CMCjV_Zar2dSDBO63o5C_Dp2tpGiVV6COE,13
-distclassipy-0.2.0.dist-info/RECORD,,
{distclassipy-0.2.0.dist-info → distclassipy-0.2.1.dist-info}/LICENSE
File without changes
{distclassipy-0.2.0.dist-info → distclassipy-0.2.1.dist-info}/WHEEL
File without changes
{distclassipy-0.2.0.dist-info → distclassipy-0.2.1.dist-info}/top_level.txt
File without changes