distclassipy 0.2.0__tar.gz → 0.2.1__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: distclassipy
3
- Version: 0.2.0
3
+ Version: 0.2.1
4
4
  Summary: A python package for a distance-based classifier which can use several different distance metrics.
5
5
  Author-email: Siddharth Chaini <sidchaini@gmail.com>
6
6
  License: GNU GENERAL PUBLIC LICENSE
@@ -28,7 +28,7 @@ from .classifier import (
28
28
  )
29
29
  from .distances import Distance, _ALL_METRICS
30
30
 
31
- __version__ = "0.2.0"
31
+ __version__ = "0.2.1"
32
32
 
33
33
  __all__ = [
34
34
  "DistanceMetricClassifier",
@@ -6,6 +6,12 @@ in "Light Curve Classification with DistClassiPy: a new distance-based classifie
6
6
 
7
7
  .. autoclass:: distclassipy.classifier.DistanceMetricClassifier
8
8
  :members:
9
+ :inherited-members:
10
+ :exclude-members: set_fit_request, set_predict_request
11
+
12
+ .. autoclass:: distclassipy.classifier.EnsembleDistanceClassifier
13
+ :members:
14
+ :inherited-members:
9
15
  :exclude-members: set_fit_request, set_predict_request
10
16
 
11
17
  .. doctest-skip::
@@ -124,16 +130,6 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
124
130
 
125
131
  .. versionadded:: 0.1.0
126
132
 
127
-
128
- Attributes
129
- ----------
130
- scale : bool
131
- Indicates whether the data is scaled.
132
- central_stat : str
133
- The statistic used for calculating central tendency.
134
- dispersion_stat : str
135
- The statistic used for calculating dispersion.
136
-
137
133
  References
138
134
  ----------
139
135
  .. [1] "Light Curve Classification with DistClassiPy: a new distance-based
@@ -434,89 +430,6 @@ class DistanceMetricClassifier(BaseEstimator, ClassifierMixin):
434
430
  return accuracy_score(y, y_pred)
435
431
 
436
432
 
437
- def find_best_metrics(
438
- clf: "DistanceMetricClassifier",
439
- X: np.ndarray,
440
- y: np.ndarray,
441
- feat_idx: int,
442
- n_quantiles: int = 4,
443
- metrics_to_consider: list[str] = None,
444
- ) -> Tuple[pd.DataFrame, pd.Series, np.ndarray]:
445
- """Evaluate and find the best distance metrics for a given feature.
446
-
447
- This function evaluates different distance metrics to determine which
448
- performs best for a specific feature in the dataset. It splits the data
449
- into quantiles based on the specified feature and calculates the accuracy
450
- of the classifier for each metric within these quantiles.
451
-
452
- .. versionadded:: 0.2.0
453
-
454
- Parameters
455
- ----------
456
- clf : DistanceMetricClassifier
457
- The classifier instance to be used for evaluation.
458
- X : np.ndarray
459
- The input feature matrix.
460
- y : np.ndarray
461
- The target labels.
462
- feat_idx : int
463
- The index of the feature to be used for quantile splitting.
464
- n_quantiles : int, default=4
465
- The number of quantiles to split the data into.
466
- metrics_to_consider : list of str, optional
467
- A list of distance metrics to evaluate. If None, all available
468
- metrics within DistClassiPy will be considered.
469
-
470
- Returns
471
- -------
472
- quantile_scores_df : pd.DataFrame
473
- A DataFrame containing the accuracy scores for each metric across
474
- different quantiles.
475
- best_metrics_per_quantile : pd.Series
476
- A Series indicating the best-performing metric for each quantile.
477
- group_bins : np.ndarray
478
- The bins used for quantile splitting.
479
- """
480
- X = check_array(X)
481
- feature_labels = [f"Feature_{i}" for i in range(X.shape[1])]
482
- feature_name = f"Feature_{feat_idx}"
483
-
484
- if metrics_to_consider is None:
485
- metrics_to_consider = _ALL_METRICS
486
-
487
- X_df = pd.DataFrame(X, columns=feature_labels)
488
- y_df = pd.DataFrame(y, columns=["Target"])
489
- quantiles, group_bins = pd.qcut(X_df[feature_name], q=n_quantiles, retbins=True)
490
-
491
- X_train, X_test, y_train, y_test = train_test_split(
492
- X_df, y_df, test_size=0.25, stratify=quantiles
493
- )
494
-
495
- clf.fit(X_train, y_train.to_numpy().ravel())
496
- grouped_test_data = X_test.groupby(quantiles, observed=False)
497
-
498
- quantile_scores = []
499
- for metric in metrics_to_consider:
500
- scores_for_metric = [
501
- accuracy_score(
502
- y_test.loc[subdf.index], clf.predict(subdf.to_numpy(), metric=metric)
503
- )
504
- for _, subdf in grouped_test_data
505
- ]
506
- quantile_scores.append(scores_for_metric)
507
-
508
- quantile_scores = np.array(quantile_scores) * 100
509
- quantile_scores_df = pd.DataFrame(
510
- data=quantile_scores,
511
- index=metrics_to_consider,
512
- columns=[f"Quantile {i+1}" for i in range(n_quantiles)],
513
- )
514
-
515
- best_metrics_per_quantile = quantile_scores_df.idxmax()
516
-
517
- return quantile_scores_df, best_metrics_per_quantile, group_bins
518
-
519
-
520
433
  class EnsembleDistanceClassifier(BaseEstimator, ClassifierMixin):
521
434
  """An ensemble classifier that uses different metrics for each quantile.
522
435
 
@@ -537,6 +450,7 @@ class EnsembleDistanceClassifier(BaseEstimator, ClassifierMixin):
537
450
  central_stat: str = "median",
538
451
  dispersion_stat: str = "std",
539
452
  metrics_to_consider: list[str] = None,
453
+ random_state: int = None,
540
454
  ) -> None:
541
455
  """Initialize the classifier with specified parameters.
542
456
 
@@ -553,12 +467,18 @@ class EnsembleDistanceClassifier(BaseEstimator, ClassifierMixin):
553
467
  metrics_to_consider : list of str, optional
554
468
  A list of distance metrics to evaluate. If None, all available
555
469
  metrics within DistClassiPy will be considered.
470
+ random_state : int, RandomState instance or None, optional (default=None)
471
+ Controls the randomness of the estimator. Pass an int for reproducible
472
+ output across multiple function calls.
473
+
474
+ .. versionadded:: 0.2.1
556
475
  """
557
476
  self.feat_idx = feat_idx
558
477
  self.scale = scale
559
478
  self.central_stat = central_stat
560
479
  self.dispersion_stat = dispersion_stat
561
480
  self.metrics_to_consider = metrics_to_consider
481
+ self.random_state = random_state
562
482
 
563
483
  def fit(
564
484
  self, X: np.ndarray, y: np.ndarray, n_quantiles: int = 4
@@ -625,26 +545,26 @@ class EnsembleDistanceClassifier(BaseEstimator, ClassifierMixin):
625
545
  quantiles = pd.cut(
626
546
  X[:, self.feat_idx], bins=self.group_bins, labels=self.group_labels
627
547
  )
628
- # grouped_data = pd.DataFrame(X).groupby(quantiles, observed=False)
629
- quantile_indices = quantiles.codes # Get integer codes for quantiles
630
- predictions = np.empty(X.shape[0], dtype=int)
631
- # for i, (lim, subdf) in enumerate(grouped_data):
632
- # best_metric = self.best_metrics_per_quantile_.loc[self.group_labels[i]]
633
- # preds = self.clf_.predict(subdf.to_numpy(), metric=best_metric)
634
- # predictions[subdf.index] = preds
635
- # Precompute predictions for each quantile
636
- quantile_predictions = {}
637
- for i, label in enumerate(self.group_labels):
638
- best_metric = self.best_metrics_per_quantile_.loc[label]
639
- quantile_data = X[quantile_indices == i]
640
- if quantile_data.size > 0:
641
- quantile_predictions[i] = self.clf_.predict(
642
- quantile_data, metric=best_metric
643
- )
548
+ grouped_data = pd.DataFrame(X).groupby(quantiles, observed=False)
549
+ # quantile_indices = quantiles.codes # Get integer codes for quantiles
550
+ predictions = np.empty(X.shape[0], dtype=object) # Change dtype to object
551
+ for i, (lim, subdf) in enumerate(grouped_data):
552
+ best_metric = self.best_metrics_per_quantile_.loc[self.group_labels[i]]
553
+ preds = self.clf_.predict(subdf.to_numpy(), metric=best_metric)
554
+ predictions[subdf.index] = preds
555
+ # # Precompute predictions for each quantile
556
+ # quantile_predictions = {}
557
+ # for i, label in enumerate(self.group_labels):
558
+ # best_metric = self.best_metrics_per_quantile_.loc[label]
559
+ # quantile_data = X[quantile_indices == i]
560
+ # if quantile_data.size > 0:
561
+ # quantile_predictions[i] = self.clf_.predict(
562
+ # quantile_data, metric=best_metric
563
+ # )
644
564
 
645
565
  # Assign predictions to the corresponding indices
646
- for i, preds in quantile_predictions.items():
647
- predictions[quantile_indices == i] = preds
566
+ # for i, preds in quantile_predictions.items():
567
+ # predictions[quantile_indices == i] = preds
648
568
 
649
569
  return predictions
650
570
 
@@ -677,5 +597,100 @@ class EnsembleDistanceClassifier(BaseEstimator, ClassifierMixin):
677
597
  The bins used for quantile splitting.
678
598
  """
679
599
  return find_best_metrics(
680
- self.clf_, X, y, self.feat_idx, n_quantiles, self.metrics_to_consider
600
+ self.clf_,
601
+ X,
602
+ y,
603
+ self.feat_idx,
604
+ n_quantiles,
605
+ self.metrics_to_consider,
606
+ self.random_state,
681
607
  )
608
+
609
+
610
+ def find_best_metrics(
611
+ clf: "DistanceMetricClassifier",
612
+ X: np.ndarray,
613
+ y: np.ndarray,
614
+ feat_idx: int,
615
+ n_quantiles: int = 4,
616
+ metrics_to_consider: list[str] = None,
617
+ random_state: int = None,
618
+ ) -> Tuple[pd.DataFrame, pd.Series, np.ndarray]:
619
+ """Evaluate and find the best distance metrics for a given feature.
620
+
621
+ This function evaluates different distance metrics to determine which
622
+ performs best for a specific feature in the dataset. It splits the data
623
+ into quantiles based on the specified feature and calculates the accuracy
624
+ of the classifier for each metric within these quantiles.
625
+
626
+ .. versionadded:: 0.2.0
627
+
628
+ Parameters
629
+ ----------
630
+ clf : DistanceMetricClassifier
631
+ The classifier instance to be used for evaluation.
632
+ X : np.ndarray
633
+ The input feature matrix.
634
+ y : np.ndarray
635
+ The target labels.
636
+ feat_idx : int
637
+ The index of the feature to be used for quantile splitting.
638
+ n_quantiles : int, default=4
639
+ The number of quantiles to split the data into.
640
+ metrics_to_consider : list of str, optional
641
+ A list of distance metrics to evaluate. If None, all available
642
+ metrics within DistClassiPy will be considered.
643
+ random_state : int, RandomState instance or None, optional (default=None)
644
+ Controls the randomness of the estimator. Pass an int for reproducible
645
+ output across multiple function calls.
646
+
647
+ .. versionadded:: 0.2.1
648
+
649
+ Returns
650
+ -------
651
+ quantile_scores_df : pd.DataFrame
652
+ A DataFrame containing the accuracy scores for each metric across
653
+ different quantiles.
654
+ best_metrics_per_quantile : pd.Series
655
+ A Series indicating the best-performing metric for each quantile.
656
+ group_bins : np.ndarray
657
+ The bins used for quantile splitting.
658
+ """
659
+ X = check_array(X)
660
+ feature_labels = [f"Feature_{i}" for i in range(X.shape[1])]
661
+ feature_name = f"Feature_{feat_idx}"
662
+
663
+ if metrics_to_consider is None:
664
+ metrics_to_consider = _ALL_METRICS
665
+
666
+ X_df = pd.DataFrame(X, columns=feature_labels)
667
+ y_df = pd.DataFrame(y, columns=["Target"])
668
+ quantiles, group_bins = pd.qcut(X_df[feature_name], q=n_quantiles, retbins=True)
669
+
670
+ X_train, X_test, y_train, y_test = train_test_split(
671
+ X_df, y_df, test_size=0.25, stratify=quantiles, random_state=random_state
672
+ )
673
+
674
+ clf.fit(X_train, y_train.to_numpy().ravel())
675
+ grouped_test_data = X_test.groupby(quantiles, observed=False)
676
+
677
+ quantile_scores = []
678
+ for metric in metrics_to_consider:
679
+ scores_for_metric = [
680
+ accuracy_score(
681
+ y_test.loc[subdf.index], clf.predict(subdf.to_numpy(), metric=metric)
682
+ )
683
+ for _, subdf in grouped_test_data
684
+ ]
685
+ quantile_scores.append(scores_for_metric)
686
+
687
+ quantile_scores = np.array(quantile_scores) * 100
688
+ quantile_scores_df = pd.DataFrame(
689
+ data=quantile_scores,
690
+ index=metrics_to_consider,
691
+ columns=[f"Quantile {i+1}" for i in range(n_quantiles)],
692
+ )
693
+
694
+ best_metrics_per_quantile = quantile_scores_df.idxmax()
695
+
696
+ return quantile_scores_df, best_metrics_per_quantile, group_bins
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: distclassipy
3
- Version: 0.2.0
3
+ Version: 0.2.1
4
4
  Summary: A python package for a distance-based classifier which can use several different distance metrics.
5
5
  Author-email: Siddharth Chaini <sidchaini@gmail.com>
6
6
  License: GNU GENERAL PUBLIC LICENSE
File without changes
File without changes
File without changes
File without changes