moose_fs-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. LICENSE +21 -0
  2. README.md +190 -0
  3. moose_fs-0.1.0.dist-info/METADATA +232 -0
  4. moose_fs-0.1.0.dist-info/RECORD +40 -0
  5. moose_fs-0.1.0.dist-info/WHEEL +4 -0
  6. moose_fs-0.1.0.dist-info/entry_points.txt +2 -0
  7. moose_fs-0.1.0.dist-info/licenses/LICENSE +21 -0
  8. moosefs/__init__.py +6 -0
  9. moosefs/core/__init__.py +6 -0
  10. moosefs/core/data_processor.py +319 -0
  11. moosefs/core/feature.py +44 -0
  12. moosefs/core/novovicova.py +60 -0
  13. moosefs/core/pareto.py +90 -0
  14. moosefs/feature_selection_pipeline.py +548 -0
  15. moosefs/feature_selectors/__init__.py +26 -0
  16. moosefs/feature_selectors/base_selector.py +38 -0
  17. moosefs/feature_selectors/default_variance.py +21 -0
  18. moosefs/feature_selectors/elastic_net_selector.py +73 -0
  19. moosefs/feature_selectors/f_statistic_selector.py +42 -0
  20. moosefs/feature_selectors/lasso_selector.py +46 -0
  21. moosefs/feature_selectors/mrmr_selector.py +57 -0
  22. moosefs/feature_selectors/mutual_info_selector.py +45 -0
  23. moosefs/feature_selectors/random_forest_selector.py +48 -0
  24. moosefs/feature_selectors/svm_selector.py +50 -0
  25. moosefs/feature_selectors/variance_selectors.py +16 -0
  26. moosefs/feature_selectors/xgboost_selector.py +44 -0
  27. moosefs/merging_strategies/__init__.py +17 -0
  28. moosefs/merging_strategies/arithmetic_mean_merger.py +46 -0
  29. moosefs/merging_strategies/base_merger.py +64 -0
  30. moosefs/merging_strategies/borda_merger.py +46 -0
  31. moosefs/merging_strategies/consensus_merger.py +80 -0
  32. moosefs/merging_strategies/l2_norm_merger.py +42 -0
  33. moosefs/merging_strategies/union_of_intersections_merger.py +89 -0
  34. moosefs/metrics/__init__.py +23 -0
  35. moosefs/metrics/performance_metrics.py +239 -0
  36. moosefs/metrics/stability_metrics.py +49 -0
  37. moosefs/utils.py +161 -0
  38. scripts/config.yml +92 -0
  39. scripts/main.py +163 -0
  40. scripts/utils.py +186 -0
moosefs/feature_selectors/elastic_net_selector.py
@@ -0,0 +1,73 @@
+ from __future__ import annotations
+
+ from typing import Any
+ import warnings
+
+ import numpy as np
+ import pandas as pd
+ from sklearn.exceptions import ConvergenceWarning
+ from sklearn.linear_model import ElasticNet, LogisticRegression
+ from sklearn.pipeline import make_pipeline
+ from sklearn.preprocessing import StandardScaler
+
+ from .base_selector import FeatureSelector
+
+
+ class ElasticNetSelector(FeatureSelector):
+     """Elastic-net based selector.
+
+     - regression -> sklearn.linear_model.ElasticNet (L1+L2 on y ∈ ℝ)
+     - classification -> sklearn.linear_model.LogisticRegression with penalty='elasticnet' (solver='saga')
+
+     Scores are |coef| (mean over classes if multiclass).
+     """
+
+     name = "ElasticNet"
+
+     def __init__(self, task: str, num_features_to_select: int, **kwargs: Any) -> None:
+         super().__init__(task, num_features_to_select)
+         self.kwargs = kwargs
+
+     def compute_scores(self, X: Any, y: Any) -> np.ndarray:
+         # Ensure tabular objects for column-safe slicing later in the pipeline
+         if isinstance(X, np.ndarray):
+             X = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
+         if isinstance(y, (pd.DataFrame, np.ndarray)) and getattr(y, "ndim", 1) == 2:
+             y = np.ravel(y)
+
+         if self.task == "regression":
+             params = {
+                 "alpha": self.kwargs.pop("alpha", 1.0),
+                 "l1_ratio": self.kwargs.pop("l1_ratio", 0.5),
+                 "max_iter": self.kwargs.pop("max_iter", 100_000),
+                 **self.kwargs,
+             }
+             model = make_pipeline(StandardScaler(with_mean=True, with_std=True), ElasticNet(**params))
+             # Fit once, silencing only ConvergenceWarning
+             with warnings.catch_warnings():
+                 warnings.filterwarnings("ignore", category=ConvergenceWarning)
+                 model.fit(X, y)
+             coef = model[-1].coef_
+
+         elif self.task == "classification":
+             # LogisticRegression uses C instead of alpha; keep both if the user passes them.
+             l1_ratio = self.kwargs.pop("l1_ratio", 0.5)
+             C = self.kwargs.pop("C", 1.0)
+             params = {
+                 "penalty": "elasticnet",
+                 "solver": "saga",
+                 "l1_ratio": l1_ratio,
+                 "C": C,
+                 "max_iter": self.kwargs.pop("max_iter", 100_000),
+                 **self.kwargs,
+             }
+             model = LogisticRegression(**params)
+             model.fit(X, y)
+             coef = model.coef_  # shape (n_classes, n_features) or (1, n_features)
+             if coef.ndim > 1:
+                 coef = np.mean(np.abs(coef), axis=0)
+         else:
+             raise ValueError("Task must be 'classification' or 'regression'.")
+
+         scores = np.abs(coef) if isinstance(coef, np.ndarray) else np.abs(np.asarray(coef))
+         return scores
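For orientation, here is a minimal sketch of exercising this selector's `compute_scores` on toy regression data. It assumes only what the hunk above shows (the constructor and `compute_scores`); the import path is inferred from the package layout in the file list, and any top-k ranking lives in the `FeatureSelector` base class, not here.

```python
import numpy as np
from moosefs.feature_selectors import ElasticNetSelector  # export path assumed from the package layout

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))
# y depends only on features 0 and 2, so their |coef| scores should dominate
y = 3.0 * X[:, 0] - 2.0 * X[:, 2] + rng.normal(scale=0.1, size=200)

selector = ElasticNetSelector(task="regression", num_features_to_select=2, alpha=0.01)
scores = selector.compute_scores(X, y)
print(scores)  # one non-negative score per feature; largest at indices 0 and 2
```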
moosefs/feature_selectors/f_statistic_selector.py
@@ -0,0 +1,42 @@
+ from typing import Any
+
+ import numpy as np
+ from sklearn.feature_selection import f_classif, f_regression
+
+ from .base_selector import FeatureSelector
+
+
+ class FStatisticSelector(FeatureSelector):
+     """Feature selector using F-statistic scores."""
+
+     name = "FStatistic"
+
+     def __init__(self, task: str, num_features_to_select: int, **kwargs: Any) -> None:
+         """
+         Args:
+             task: ML task ('classification' or 'regression').
+             num_features_to_select: Number of features to select.
+             **kwargs: Additional arguments for the scoring function.
+         """
+         super().__init__(task, num_features_to_select)
+         self.kwargs = kwargs
+
+     def compute_scores(self, X: Any, y: Any) -> np.ndarray:
+         """
+         Computes F-statistic scores.
+
+         Args:
+             X: Training samples.
+             y: Target values.
+
+         Returns:
+             F-statistic scores for each feature.
+
+         Raises:
+             ValueError: If task is not 'classification' or 'regression'.
+         """
+         score_func = {"classification": f_classif, "regression": f_regression}.get(self.task)
+         if score_func is None:
+             raise ValueError("Task must be 'classification' or 'regression'.")
+         scores, _ = score_func(X, y, **self.kwargs)
+         return scores
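Both `f_classif` and `f_regression` return an `(F, p_values)` pair; the selector keeps only the F scores. A short usage sketch (the import path is assumed from the package layout):

```python
import numpy as np
from sklearn.datasets import make_classification
from moosefs.feature_selectors import FStatisticSelector  # export path assumed

X, y = make_classification(n_samples=300, n_features=6, n_informative=2, random_state=0)
scores = FStatisticSelector(task="classification", num_features_to_select=2).compute_scores(X, y)
print(np.argsort(scores)[::-1][:2])  # informative features carry the largest ANOVA F-values
```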
moosefs/feature_selectors/lasso_selector.py
@@ -0,0 +1,46 @@
+ from typing import Any
+
+ import numpy as np
+ import pandas as pd
+ from sklearn.linear_model import Lasso
+
+ from .base_selector import FeatureSelector
+
+
+ class LassoSelector(FeatureSelector):
+     """Feature selector using Lasso regression."""
+
+     name = "Lasso"
+
+     def __init__(self, task: str, num_features_to_select: int, **kwargs: Any) -> None:
+         """
+         Args:
+             task: ML task ('classification' or 'regression').
+             num_features_to_select: Number of features to select.
+             **kwargs: Additional arguments for Lasso.
+         """
+         super().__init__(task, num_features_to_select)
+         self.kwargs = kwargs
+
+     def compute_scores(self, X: Any, y: Any) -> np.ndarray:
+         """
+         Computes feature scores using Lasso regression.
+
+         Args:
+             X: Training samples.
+             y: Target values.
+
+         Returns:
+             Feature scores based on absolute Lasso coefficients.
+         """
+         if isinstance(X, np.ndarray):
+             X = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
+
+         if isinstance(y, np.ndarray) and y.ndim == 2:
+             y = y.ravel()
+
+         # Default alpha is 0.05 (sparser than sklearn's 1.0); forward any remaining kwargs
+         model = Lasso(alpha=self.kwargs.pop("alpha", 0.05), **self.kwargs)
+         model.fit(X, y)
+         scores = np.abs(model.coef_)
+         return scores
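Because the selector scores features by |coef| from an L1-penalized fit, the `alpha` passed through the kwargs directly controls how many features receive a non-zero score. A small sketch (import path assumed):

```python
import numpy as np
from moosefs.feature_selectors import LassoSelector  # export path assumed

rng = np.random.default_rng(1)
X = rng.normal(size=(150, 8))
y = X[:, 0] + 0.5 * X[:, 3] + rng.normal(scale=0.1, size=150)

for alpha in (0.01, 0.05, 0.5):
    scores = LassoSelector("regression", 2, alpha=alpha).compute_scores(X, y)
    print(alpha, int((scores > 0).sum()))  # larger alpha -> fewer non-zero scores
```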
moosefs/feature_selectors/mrmr_selector.py
@@ -0,0 +1,57 @@
+ from typing import Any
+
+ from mrmr import mrmr_classif, mrmr_regression
+ import numpy as np
+ import pandas as pd
+
+ from .base_selector import FeatureSelector
+
+
+ class MRMRSelector(FeatureSelector):
+     """Feature selector using Minimum Redundancy Maximum Relevance (MRMR)."""
+
+     name = "MRMR"
+
+     def __init__(self, task: str, num_features_to_select: int, **kwargs: Any) -> None:
+         """
+         Args:
+             task: ML task ('classification' or 'regression').
+             num_features_to_select: Number of features to select.
+             **kwargs: Additional arguments for mRMR functions.
+         """
+         super().__init__(task, num_features_to_select)
+         self.kwargs = kwargs
+
+     def compute_scores(self, X: Any, y: Any) -> np.ndarray:
+         """
+         Computes feature scores using the MRMR algorithm.
+
+         Args:
+             X: Training samples.
+             y: Target values.
+
+         Returns:
+             MRMR scores for each feature.
+         """
+         if isinstance(X, np.ndarray):
+             X = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
+         elif not isinstance(X, pd.DataFrame):
+             raise TypeError("X must be a pandas DataFrame or a NumPy array.")
+
+         if isinstance(y, np.ndarray):
+             y = pd.Series(y)
+
+         score_func = {
+             "classification": mrmr_classif,
+             "regression": mrmr_regression,
+         }.get(self.task)
+         if score_func is None:
+             raise ValueError("Task must be 'classification' or 'regression'.")
+
+         _, relevance, redundancy = score_func(X, y, K=self.num_features_to_select, return_scores=True, **self.kwargs)
+
+         # Compute MRMR scores (relevance / mean redundancy), handling division by zero
+         mrmr_scores = relevance / redundancy.mean(axis=1).replace(0, np.nan)
+         mrmr_scores = mrmr_scores.fillna(0)
+         scores = np.array([mrmr_scores.get(feature, 0) for feature in X.columns])
+         return scores
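With `return_scores=True`, the mrmr functions return the selected feature list plus a per-feature relevance Series and a pairwise redundancy DataFrame; the selector collapses these into relevance divided by mean redundancy. A self-contained sketch of that aggregation step, with invented values that only mimic the shapes involved:

```python
import numpy as np
import pandas as pd

# Hypothetical outputs shaped like mrmr's relevance/redundancy returns
relevance = pd.Series({"f0": 2.0, "f1": 0.4, "f2": 1.2})
redundancy = pd.DataFrame(
    {"f0": [1.0, 0.2, 0.5], "f1": [0.2, 1.0, 0.0], "f2": [0.5, 0.0, 1.0]},
    index=["f0", "f1", "f2"],
)

mean_red = redundancy.mean(axis=1).replace(0, np.nan)  # same division-by-zero guard as above
scores = (relevance / mean_red).fillna(0)
print(scores)  # higher = relevant and not redundant with the other features
```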
moosefs/feature_selectors/mutual_info_selector.py
@@ -0,0 +1,45 @@
+ from typing import Any
+
+ import numpy as np
+ from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
+
+ from .base_selector import FeatureSelector
+
+
+ class MutualInfoSelector(FeatureSelector):
+     """Feature selector using mutual information scores."""
+
+     name = "MutualInfo"
+
+     def __init__(self, task: str, num_features_to_select: int, **kwargs: Any) -> None:
+         """
+         Args:
+             task: ML task ('classification' or 'regression').
+             num_features_to_select: Number of features to select.
+             **kwargs: Additional arguments for the mutual information function.
+         """
+         super().__init__(task, num_features_to_select)
+         self.kwargs = kwargs
+
+     def compute_scores(self, X: Any, y: Any) -> np.ndarray:
+         """
+         Computes mutual information scores.
+
+         Args:
+             X: Training samples.
+             y: Target values.
+
+         Returns:
+             Mutual information scores for each feature.
+
+         Raises:
+             ValueError: If task is not 'classification' or 'regression'.
+         """
+         mutual_info_func = {
+             "classification": mutual_info_classif,
+             "regression": mutual_info_regression,
+         }.get(self.task)
+         if mutual_info_func is None:
+             raise ValueError("Task must be 'classification' or 'regression'.")
+         scores = mutual_info_func(X, y, **self.kwargs)
+         return scores
moosefs/feature_selectors/random_forest_selector.py
@@ -0,0 +1,48 @@
+ from typing import Any
+
+ import numpy as np
+ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+
+ from .base_selector import FeatureSelector
+
+
+ class RandomForestSelector(FeatureSelector):
+     """Feature selector using RandomForest feature importance."""
+
+     name = "RandomForest"
+
+     def __init__(self, task: str, num_features_to_select: int, **kwargs: Any) -> None:
+         """
+         Args:
+             task: ML task ('classification' or 'regression').
+             num_features_to_select: Number of features to select.
+             **kwargs: Additional arguments for the RandomForest model.
+         """
+         super().__init__(task, num_features_to_select)
+         self.kwargs = kwargs
+
+     def compute_scores(self, X: Any, y: Any) -> np.ndarray:
+         """
+         Computes feature importances using a RandomForest model.
+
+         Args:
+             X: Training samples.
+             y: Target values.
+
+         Returns:
+             Feature importances from the trained RandomForest model.
+
+         Raises:
+             ValueError: If task is not 'classification' or 'regression'.
+         """
+         model_cls = {
+             "classification": RandomForestClassifier,
+             "regression": RandomForestRegressor,
+         }.get(self.task)
+         if model_cls is None:
+             raise ValueError("Task must be 'classification' or 'regression'.")
+
+         model = model_cls(**self.kwargs)
+         model.fit(X, y)
+         scores = model.feature_importances_
+         return scores
moosefs/feature_selectors/svm_selector.py
@@ -0,0 +1,50 @@
+ from typing import Any
+
+ import numpy as np
+ from sklearn.svm import SVC, SVR
+
+ from .base_selector import FeatureSelector
+
+
+ class SVMSelector(FeatureSelector):
+     """Feature selector using SVM coefficients."""
+
+     name = "SVM"
+
+     def __init__(self, task: str, num_features_to_select: int, **kwargs: Any) -> None:
+         """
+         Args:
+             task: ML task ('classification' or 'regression').
+             num_features_to_select: Number of features to select.
+             **kwargs: Additional arguments for the SVM model.
+         """
+         super().__init__(task, num_features_to_select)
+         self.kwargs = kwargs
+
+     def compute_scores(self, X: Any, y: Any) -> np.ndarray:
+         """
+         Computes feature importances using a linear SVM model.
+
+         Args:
+             X: Training samples.
+             y: Target values.
+
+         Returns:
+             Feature importances derived from SVM model coefficients.
+
+         Raises:
+             ValueError: If task is not 'classification' or 'regression'.
+         """
+         model_cls = {"classification": SVC, "regression": SVR}.get(self.task)
+         if model_cls is None:
+             raise ValueError("Task must be 'classification' or 'regression'.")
+
+         # Only remove `random_state` for SVR, which does not accept it
+         filtered_kwargs = (
+             {k: v for k, v in self.kwargs.items() if k != "random_state"} if self.task == "regression" else self.kwargs
+         )
+
+         model = model_cls(kernel="linear", **filtered_kwargs)
+         model.fit(X, y)
+         scores = np.abs(model.coef_[0])  # for multiclass SVC, coef_[0] covers only the first class pair
+         return scores
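A usage sketch on a binary problem (import path assumed). For multiclass SVC, `coef_` holds one row per class pair, so `coef_[0]` above reflects only the first pairwise boundary; binary targets are the straightforward case:

```python
from sklearn.datasets import make_classification
from moosefs.feature_selectors import SVMSelector  # export path assumed

X, y = make_classification(n_samples=200, n_features=5, n_informative=2,
                           n_classes=2, random_state=0)
selector = SVMSelector(task="classification", num_features_to_select=2, C=1.0)
scores = selector.compute_scores(X, y)  # |w| from the linear decision boundary
print(scores)
```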
moosefs/feature_selectors/variance_selectors.py
@@ -0,0 +1,16 @@
+ import numpy as np
+ import pandas as pd
+
+ from .base_selector import FeatureSelector
+
+
+ class VarianceSelector(FeatureSelector):
+     name = "Variance"
+
+     def __init__(self, task: str, num_features_to_select: int, **kwargs):
+         super().__init__(task, num_features_to_select)
+
+     def compute_scores(self, X, y):
+         if isinstance(X, np.ndarray):
+             X = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(X.shape[1])])
+         return X.var(ddof=0).values  # base class will keep the highest variances
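The `ddof=0` matters here: pandas defaults to the sample variance (`ddof=1`), while `ddof=0` gives the population variance that sklearn's `VarianceThreshold` also reports. A quick check:

```python
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

X = pd.DataFrame({"a": [0.0, 0.0, 0.0, 1.0], "b": [1.0, 2.0, 3.0, 4.0]})
print(X.var(ddof=0).values)                   # population variance, as used above
print(VarianceThreshold().fit(X).variances_)  # identical values
print(X.var().values)                         # pandas default (ddof=1) differs
```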
moosefs/feature_selectors/xgboost_selector.py
@@ -0,0 +1,44 @@
+ from typing import Any
+
+ import numpy as np
+ from xgboost import XGBClassifier, XGBRegressor
+
+ from .base_selector import FeatureSelector
+
+
+ class XGBoostSelector(FeatureSelector):
+     """Feature selector using XGBoost feature importance."""
+
+     name = "XGBoost"
+
+     def __init__(self, task: str, num_features_to_select: int, **kwargs: Any) -> None:
+         """
+         Args:
+             task: ML task ('classification' or 'regression').
+             num_features_to_select: Number of features to select.
+             **kwargs: Additional arguments for the XGBoost model.
+         """
+         super().__init__(task, num_features_to_select)
+         self.kwargs = kwargs
+
+     def compute_scores(self, X: Any, y: Any) -> np.ndarray:
+         """
+         Computes feature importances using an XGBoost model.
+
+         Args:
+             X: Training samples.
+             y: Target values.
+
+         Returns:
+             Feature importances from the trained XGBoost model.
+
+         Raises:
+             ValueError: If task is not 'classification' or 'regression'.
+         """
+         model_cls = {"classification": XGBClassifier, "regression": XGBRegressor}.get(self.task)
+         if model_cls is None:
+             raise ValueError("Task must be 'classification' or 'regression'.")
+         model = model_cls(**self.kwargs)
+         model.fit(X, y)
+         scores = model.feature_importances_
+         return scores
moosefs/merging_strategies/__init__.py
@@ -0,0 +1,17 @@
+ # merging_strategies/__init__.py
+
+ from .arithmetic_mean_merger import ArithmeticMeanMerger
+ from .base_merger import MergingStrategy
+ from .borda_merger import BordaMerger
+ from .consensus_merger import ConsensusMerger
+ from .l2_norm_merger import L2NormMerger
+ from .union_of_intersections_merger import UnionOfIntersectionsMerger
+
+ __all__ = [
+     "MergingStrategy",
+     "BordaMerger",
+     "UnionOfIntersectionsMerger",
+     "ArithmeticMeanMerger",
+     "L2NormMerger",
+     "ConsensusMerger",
+ ]
moosefs/merging_strategies/arithmetic_mean_merger.py
@@ -0,0 +1,46 @@
+ import numpy as np
+
+ from .base_merger import MergingStrategy
+
+
+ class ArithmeticMeanMerger(MergingStrategy):
+     """Rank-based merging using the arithmetic mean of scores."""
+
+     name = "ArithmeticMean"
+
+     def __init__(self, **kwargs) -> None:
+         # Keep taxonomy consistent with the existing mergers
+         super().__init__("rank-based")
+         self.kwargs = kwargs
+
+     def merge(
+         self,
+         subsets: list,
+         num_features_to_select: int,
+         **kwargs,
+     ) -> list:
+         """Return the top-k feature names after arithmetic-mean aggregation.
+
+         Args:
+             subsets: Feature lists (one list per selector).
+             num_features_to_select: Number of names to return.
+
+         Returns:
+             Feature names sorted by mean score.
+         """
+         self._validate_input(subsets)
+
+         # Shortcut if only one selector supplied
+         if len(subsets) == 1:
+             return [f.name for f in subsets[0]][:num_features_to_select]
+
+         feature_names = [f.name for f in subsets[0]]
+         # shape: (n_features, n_selectors)
+         scores = np.array([[f.score for f in s] for s in subsets]).T
+
+         # Arithmetic mean across selectors
+         scores_merged = scores.mean(axis=1)
+
+         # Higher mean score ⇒ higher rank (argsort on negated scores, unlike Borda's ascending sort)
+         sorted_names = [feature_names[i] for i in np.argsort(-scores_merged, kind="stable")]
+         return sorted_names[:num_features_to_select]
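A worked sketch of the mean aggregation. It assumes a `Feature(name, score)` constructor matching the `.name`/`.score` attributes used above (the actual signature lives in `moosefs/core/feature.py`, outside this hunk). Note that the merge reads feature names from the first subset, so every selector must list features in the same order:

```python
from moosefs.core.feature import Feature  # constructor signature assumed
from moosefs.merging_strategies import ArithmeticMeanMerger

# Two selectors scored the same three features (same order in both lists)
s1 = [Feature("a", 0.9), Feature("b", 0.4), Feature("c", 0.1)]
s2 = [Feature("a", 0.2), Feature("b", 0.8), Feature("c", 0.3)]

merger = ArithmeticMeanMerger()
print(merger.merge([s1, s2], num_features_to_select=2))
# mean scores: a=0.55, b=0.60, c=0.20 -> ['b', 'a']
```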
moosefs/merging_strategies/base_merger.py
@@ -0,0 +1,64 @@
+ from ..core.feature import Feature
+
+
+ class MergingStrategy:
+     """Abstract base for merging strategies.
+
+     Strategies are either "set-based" or "rank-based", depending on how they
+     merge the per-selector outputs.
+     """
+
+     def __init__(self, strategy_type: str) -> None:
+         """Initialize the strategy.
+
+         Args:
+             strategy_type: Either "set-based" or "rank-based".
+         """
+         self.strategy_type = strategy_type
+
+     def merge(self, data: list, num_features_to_select: int, **kwargs) -> list:
+         """Merge input data according to the strategy.
+
+         Subclasses must implement this method.
+
+         Args:
+             data: List of Feature lists (one list per selector) or a single list.
+             num_features_to_select: Number of top features to return.
+             **kwargs: Strategy-specific options.
+
+         Returns:
+             A list of merged features (or names, depending on the strategy).
+
+         Raises:
+             NotImplementedError: If not implemented in a subclass.
+         """
+         raise NotImplementedError("Subclasses must implement this method")
+
+     def is_set_based(self) -> bool:
+         """Return True if the strategy is set-based."""
+         return self.strategy_type == "set-based"
+
+     def is_rank_based(self) -> bool:
+         """Return True if the strategy is rank-based."""
+         return self.strategy_type == "rank-based"
+
+     def _validate_input(self, subsets: list) -> None:
+         """Validate that ``subsets`` contains Feature objects.
+
+         Args:
+             subsets: A list of Feature objects or a list of Feature lists.
+
+         Raises:
+             ValueError: If empty or containing invalid types.
+         """
+         if not subsets:
+             raise ValueError("Subsets cannot be empty.")
+
+         if isinstance(subsets[0], list):  # List-of-lists case
+             if not all(isinstance(sub, list) and sub for sub in subsets):
+                 raise ValueError("Subsets cannot contain empty lists.")
+             if not all(isinstance(feature, Feature) for sub in subsets for feature in sub):
+                 raise ValueError("Subsets must contain Feature objects.")
+         else:  # Single-list case
+             if not all(isinstance(feature, Feature) for feature in subsets):
+                 raise ValueError("Subsets must contain Feature objects.")
moosefs/merging_strategies/borda_merger.py
@@ -0,0 +1,46 @@
+ import numpy as np
+ from ranky import borda
+
+ from .base_merger import MergingStrategy
+
+
+ class BordaMerger(MergingStrategy):
+     """Rank-based merging using the Borda count method."""
+
+     name = "Borda"
+
+     def __init__(self, **kwargs) -> None:
+         """Initialize a rank-based merger.
+
+         Args:
+             **kwargs: Forwarded to the Borda routine (if applicable).
+         """
+         super().__init__("rank-based")
+         self.kwargs = kwargs
+
+     def merge(self, subsets: list, num_features_to_select: int, **kwargs) -> list:
+         """Merge by Borda count and return the top-k names.
+
+         Args:
+             subsets: Feature lists (one list per selector).
+             num_features_to_select: Number of names to return.
+
+         Returns:
+             Feature names sorted by merged Borda scores.
+         """
+         self._validate_input(subsets)
+
+         if len(subsets) == 1:
+             return [feature.name for feature in subsets[0]][:num_features_to_select]
+
+         # Extract feature names (from the first subset) and per-selector scores
+         feature_names = [feature.name for feature in subsets[0]]
+         scores = np.array([[feature.score for feature in subset] for subset in subsets]).T
+
+         # Apply the Borda count method
+         scores_merged = borda(m=scores, **self.kwargs)
+
+         # Sort by Borda score (lower score = higher rank)
+         sorted_names = [feature_names[i] for i in np.argsort(scores_merged, kind="stable")]
+
+         return list(sorted_names[:num_features_to_select])
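For intuition about what `ranky.borda` is doing here, a self-contained numpy/scipy sketch of the usual Borda convention (each selector contributes a ranking, rankings are averaged, and the best mean rank wins); the exact scoring and tie-breaking inside `ranky` may differ:

```python
import numpy as np
from scipy.stats import rankdata

# scores: rows = features, columns = selectors (same layout as `scores` above)
scores = np.array([
    [0.9, 0.2],  # feature a
    [0.4, 0.8],  # feature b
    [0.1, 0.3],  # feature c
])

# Rank within each selector (rank 1 = highest score), then average across selectors
ranks = np.column_stack([rankdata(-scores[:, j]) for j in range(scores.shape[1])])
mean_rank = ranks.mean(axis=1)
names = ["a", "b", "c"]
print([names[i] for i in np.argsort(mean_rank, kind="stable")])  # ['b', 'a', 'c']
```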