moose-fs 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- LICENSE +21 -0
- README.md +190 -0
- moose_fs-0.1.0.dist-info/METADATA +232 -0
- moose_fs-0.1.0.dist-info/RECORD +40 -0
- moose_fs-0.1.0.dist-info/WHEEL +4 -0
- moose_fs-0.1.0.dist-info/entry_points.txt +2 -0
- moose_fs-0.1.0.dist-info/licenses/LICENSE +21 -0
- moosefs/__init__.py +6 -0
- moosefs/core/__init__.py +6 -0
- moosefs/core/data_processor.py +319 -0
- moosefs/core/feature.py +44 -0
- moosefs/core/novovicova.py +60 -0
- moosefs/core/pareto.py +90 -0
- moosefs/feature_selection_pipeline.py +548 -0
- moosefs/feature_selectors/__init__.py +26 -0
- moosefs/feature_selectors/base_selector.py +38 -0
- moosefs/feature_selectors/default_variance.py +21 -0
- moosefs/feature_selectors/elastic_net_selector.py +75 -0
- moosefs/feature_selectors/f_statistic_selector.py +42 -0
- moosefs/feature_selectors/lasso_selector.py +46 -0
- moosefs/feature_selectors/mrmr_selector.py +57 -0
- moosefs/feature_selectors/mutual_info_selector.py +45 -0
- moosefs/feature_selectors/random_forest_selector.py +48 -0
- moosefs/feature_selectors/svm_selector.py +50 -0
- moosefs/feature_selectors/variance_selectors.py +16 -0
- moosefs/feature_selectors/xgboost_selector.py +44 -0
- moosefs/merging_strategies/__init__.py +17 -0
- moosefs/merging_strategies/arithmetic_mean_merger.py +46 -0
- moosefs/merging_strategies/base_merger.py +64 -0
- moosefs/merging_strategies/borda_merger.py +46 -0
- moosefs/merging_strategies/consensus_merger.py +80 -0
- moosefs/merging_strategies/l2_norm_merger.py +42 -0
- moosefs/merging_strategies/union_of_intersections_merger.py +89 -0
- moosefs/metrics/__init__.py +23 -0
- moosefs/metrics/performance_metrics.py +239 -0
- moosefs/metrics/stability_metrics.py +49 -0
- moosefs/utils.py +161 -0
- scripts/config.yml +92 -0
- scripts/main.py +163 -0
- scripts/utils.py +186 -0
moosefs/merging_strategies/consensus_merger.py
ADDED

@@ -0,0 +1,80 @@
+from collections import Counter, defaultdict
+from itertools import chain
+from typing import Optional
+
+import numpy as np
+
+from .base_merger import MergingStrategy
+
+
+class ConsensusMerger(MergingStrategy):
+    """Set-based consensus merger with optional fill.
+
+    Keeps features selected by at least ``k`` selectors. If ``fill=True``,
+    trims/pads to ``num_features_to_select`` using summed, per-selector
+    min–max–normalized scores as a tie-breaker.
+    """
+
+    def __init__(self, k: int = 2, *, fill: bool = False) -> None:
+        super().__init__("set-based")
+        self.k = k
+        self.fill = fill
+        self.name = f"Consensus_ge_{k}"
+
+    # -----------------------------------------------------------------
+    def merge(
+        self,
+        subsets: list,
+        num_features_to_select: Optional[int] = None,
+        **kwargs,
+    ) -> set:
+        """Merge by consensus threshold.
+
+        Args:
+            subsets: Feature lists (one list per selector).
+            num_features_to_select: Required when ``fill=True``.
+            **kwargs: Unused.
+
+        Returns:
+            Set of selected feature names.
+        """
+        self._validate_input(subsets)
+
+        if self.fill and num_features_to_select is None:
+            raise ValueError("`num_features_to_select` required when fill=True")
+
+        # ── collect names & scores (ragged-safe) ─────────────────────
+        names_mat = [[f.name for f in s] for s in subsets]
+
+        # Consensus counts across all selectors
+        counts = Counter(chain.from_iterable(names_mat))
+
+        # Summed, per-selector min–max–normalised scores per feature name
+        sum_scores = defaultdict(float)
+        for subset in subsets:
+            if not subset:
+                continue
+            scores = np.array([f.score for f in subset], dtype=np.float32)
+            min_v = float(scores.min())
+            rng = float(scores.max() - min_v) or 1.0
+            norm = (scores - min_v) / rng
+            for name, s in zip([f.name for f in subset], norm):
+                sum_scores[name] += float(s)
+
+        selected = {f for f, c in counts.items() if c >= self.k}
+
+        if not self.fill:
+            return selected
+
+        # ── trim / pad to desired size ───────────────────────────────
+        core_sorted = sorted(selected, key=lambda n: sum_scores[n], reverse=True)
+        if len(core_sorted) >= num_features_to_select:
+            return set(core_sorted[:num_features_to_select])
+
+        extras = sorted(
+            (n for n in counts if n not in selected),
+            key=lambda n: sum_scores[n],
+            reverse=True,
+        )
+        need = num_features_to_select - len(core_sorted)
+        return set(core_sorted + extras[:need])
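A minimal usage sketch of the thresholding above. The namedtuple is a hypothetical stand-in for the package's `Feature` class (defined in `moosefs/core/feature.py`); the mergers only read its `.name` and `.score` attributes, and the example assumes `_validate_input` in the installed package accepts any objects exposing those two fields.

```python
from collections import namedtuple

from moosefs.merging_strategies.consensus_merger import ConsensusMerger

# Hypothetical stand-in: the mergers only use .name and .score.
Feature = namedtuple("Feature", ["name", "score"])

subsets = [
    [Feature("age", 0.9), Feature("bmi", 0.4)],
    [Feature("age", 0.7), Feature("glucose", 0.6)],
    [Feature("bmi", 0.8), Feature("glucose", 0.5)],
]

# Every name appears in at least 2 subsets, so all three come back.
print(ConsensusMerger(k=2).merge(subsets))  # expected {'age', 'bmi', 'glucose'}

# No feature clears k=3, so fill pads from the highest summed
# per-selector min-max-normalized scores: expected {'age', 'bmi'}.
print(ConsensusMerger(k=3, fill=True).merge(subsets, num_features_to_select=2))
```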

moosefs/merging_strategies/l2_norm_merger.py
ADDED

@@ -0,0 +1,42 @@
+import numpy as np
+
+from .base_merger import MergingStrategy
+
+
+class L2NormMerger(MergingStrategy):
+    """Rank-based merging using the L2 (Euclidean) norm of scores."""
+
+    name = "L2Norm"
+
+    def __init__(self, **kwargs) -> None:
+        super().__init__("rank-based")
+        self.kwargs = kwargs
+
+    def merge(
+        self,
+        subsets: list,
+        num_features_to_select: int,
+        **kwargs,
+    ) -> list:
+        """Return the top-k feature names after L2-norm aggregation.
+
+        Args:
+            subsets: Feature lists (one list per selector).
+            num_features_to_select: Number of names to return.
+
+        Returns:
+            Feature names sorted by aggregated L2 score.
+        """
+        self._validate_input(subsets)
+
+        if len(subsets) == 1:
+            return [f.name for f in subsets[0]][:num_features_to_select]
+
+        feature_names = [f.name for f in subsets[0]]
+        scores = np.array([[f.score for f in s] for s in subsets]).T
+
+        # Euclidean (L2) norm across selectors
+        scores_merged = np.linalg.norm(scores, ord=2, axis=1)
+
+        sorted_names = [feature_names[i] for i in np.argsort(-scores_merged, kind="stable")]
+        return sorted_names[:num_features_to_select]
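Note that feature names are read only from `subsets[0]`, so this merger implicitly assumes every selector scores the same features in the same order; it suits score-based selectors that rank the full feature set rather than subset-returning ones. A self-contained sketch of the aggregation step, with hypothetical scores:

```python
from collections import namedtuple

import numpy as np

Feature = namedtuple("Feature", ["name", "score"])  # hypothetical stand-in

# Two selectors scoring the same three features (same order is required,
# since names are taken from the first subset only).
subsets = [
    [Feature("age", 0.9), Feature("bmi", 0.4), Feature("glucose", 0.1)],
    [Feature("age", 0.2), Feature("bmi", 0.8), Feature("glucose", 0.3)],
]

scores = np.array([[f.score for f in s] for s in subsets]).T  # shape (3, 2)
merged = np.linalg.norm(scores, ord=2, axis=1)
# age -> sqrt(0.9^2 + 0.2^2) ~= 0.922, bmi -> ~0.894, glucose -> ~0.316
print(merged)
```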

moosefs/merging_strategies/union_of_intersections_merger.py
ADDED

@@ -0,0 +1,89 @@
+from collections import defaultdict
+from itertools import combinations
+from typing import Optional
+
+import numpy as np
+
+from .base_merger import MergingStrategy
+
+
+class UnionOfIntersectionsMerger(MergingStrategy):
+    """Union of intersections across selector subsets."""
+
+    name = "UnionOfIntersections"
+
+    def __init__(self) -> None:
+        super().__init__("set-based")
+
+    def merge(
+        self,
+        subsets: list,
+        num_features_to_select: Optional[int] = None,
+        fill: bool = False,
+        **kwargs,
+    ) -> set:
+        """Merge by union of pairwise intersections.
+
+        Args:
+            subsets: Feature lists (one list per selector).
+            num_features_to_select: Required when ``fill=True``.
+            fill: If True, trim/pad output to requested size.
+            **kwargs: Unused.
+
+        Returns:
+            Set of selected feature names.
+
+        Raises:
+            ValueError: If inputs are invalid or size is missing when ``fill=True``.
+        """
+        self._validate_input(subsets)
+
+        if fill and num_features_to_select is None:
+            raise ValueError("`num_features_to_select` must be provided when `fill=True`.")
+
+        if len(subsets) == 1:
+            if fill:
+                # Sort the Feature objects (not their names) by score, then keep the top names.
+                top = sorted(subsets[0], key=lambda f: f.score, reverse=True)
+                return {f.name for f in top[:num_features_to_select]}
+            return {f.name for f in subsets[0]}
+
+        # Extract feature names and scores
+        feature_names = [[f.name for f in subset] for subset in subsets]
+        feature_scores = np.array([[f.score for f in subset] for subset in subsets], dtype=np.float32).T
+
+        # Normalize scores within each subset (vectorized min-max scaling).
+        # After the transpose, each column holds one subset's scores, so reduce along axis 0.
+        min_vals, max_vals = (
+            feature_scores.min(axis=0, keepdims=True),
+            feature_scores.max(axis=0, keepdims=True),
+        )
+        score_range = np.where(max_vals - min_vals == 0, 1, max_vals - min_vals)  # Prevent division by zero
+        feature_scores = (feature_scores - min_vals) / score_range
+
+        # Compute core as the union of pairwise intersections
+        core = set().union(
+            *[set(feature_names[i]) & set(feature_names[j]) for i, j in combinations(range(len(feature_names)), 2)]
+        )
+
+        if not fill:
+            return core  # Return raw core without enforcing `num_features_to_select`
+
+        # Compute global feature scores (sum of normalized values)
+        feature_score_map = defaultdict(float)
+        for subset, scores in zip(feature_names, feature_scores.T):
+            for name, score in zip(subset, scores):
+                feature_score_map[name] += score
+
+        # Prune or fill to get exactly `num_features_to_select`
+        core_list = sorted(core, key=lambda x: feature_score_map[x], reverse=True)
+        core_size = len(core_list)
+
+        if core_size >= num_features_to_select:
+            return set(core_list[:num_features_to_select])
+
+        # Fill with highest-ranked extra features
+        extras = sorted(feature_score_map.keys(), key=lambda x: feature_score_map[x], reverse=True)
+        extras = [f for f in extras if f not in core][: num_features_to_select - core_size]
+
+        return set(core_list + extras)
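To see what the "core" is, here is a self-contained rerun of just the pairwise-intersection step with hypothetical name lists (no moosefs import needed):

```python
from itertools import combinations

# Name lists from three hypothetical selectors.
names = [["a", "b", "c"], ["b", "c", "d"], ["c", "d", "e"]]

# Pairwise intersections: {b, c}, {c}, {c, d}; their union is the core.
core = set().union(
    *[set(names[i]) & set(names[j]) for i, j in combinations(range(len(names)), 2)]
)
print(core)  # {'b', 'c', 'd'}: every feature shared by at least one pair
```

Unlike the consensus merger, the score matrix here is built with a single `np.array` call, so all subsets must have equal length; the selectors upstream each return `num_features_to_select` features, which satisfies that.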

moosefs/metrics/__init__.py
ADDED

@@ -0,0 +1,23 @@
+from .performance_metrics import (
+    Accuracy,
+    BaseMetric,
+    F1Score,
+    LogLoss,
+    MeanAbsoluteError,
+    MeanSquaredError,
+    PrecisionScore,
+    R2Score,
+    RecallScore,
+)
+
+__all__ = [
+    "BaseMetric",
+    "R2Score",
+    "MeanAbsoluteError",
+    "MeanSquaredError",
+    "LogLoss",
+    "F1Score",
+    "Accuracy",
+    "PrecisionScore",
+    "RecallScore",
+]

moosefs/metrics/performance_metrics.py
ADDED

@@ -0,0 +1,239 @@
+from typing import Any, Optional
+
+from joblib import hash as joblib_hash
+import numpy as np
+from sklearn.ensemble import (
+    GradientBoostingClassifier,
+    GradientBoostingRegressor,
+    RandomForestClassifier,
+    RandomForestRegressor,
+)
+from sklearn.linear_model import LinearRegression, LogisticRegression
+from sklearn.metrics import (
+    accuracy_score,
+    f1_score,
+    log_loss,
+    mean_absolute_error,
+    mean_squared_error,
+    precision_score,
+    r2_score,
+    recall_score,
+)
+
+
+class BaseMetric:
+    """Base class for computing evaluation metrics.
+
+    Trains a small battery of models and aggregates per-model metric values.
+    """
+
+    def __init__(self, name: str, task: str) -> None:
+        """Initialize the metric with a task type.
+
+        Args:
+            name: Human-readable metric name.
+            task: Either "classification" or "regression".
+        """
+        if task not in {"classification", "regression"}:
+            raise ValueError("Task must be 'classification' or 'regression'.")
+
+        self.name = name
+        self.task = task
+        self.models = self._initialize_models()
+
+    def model_signature(self) -> str:
+        """Return a stable signature describing the internal model set."""
+        signature_payload = {
+            name: (
+                f"{model.__class__.__module__}.{model.__class__.__qualname__}",
+                model.get_params(deep=True),
+            )
+            for name, model in self.models.items()
+        }
+        return f"{self.task}:{joblib_hash(signature_payload)}"
+
+    def _initialize_models(self) -> dict:
+        """Initialize task-specific models.
+
+        Returns:
+            Mapping from model label to estimator instance.
+        """
+        # Keep inner models single-threaded to avoid nested parallelism.
+        return {
+            "classification": {
+                "Random Forest": RandomForestClassifier(n_jobs=1),
+                "Logistic Regression": LogisticRegression(max_iter=1000),
+                "Gradient Boosting": GradientBoostingClassifier(),
+            },
+            "regression": {
+                "Random Forest": RandomForestRegressor(n_jobs=1),
+                "Linear Regression": LinearRegression(),
+                "Gradient Boosting": GradientBoostingRegressor(),
+            },
+        }[self.task]
+
+    def train_and_predict(
+        self,
+        X_train: Any,
+        y_train: Any,
+        X_test: Any,
+        y_test: Any,
+    ) -> dict:
+        """Train all models and generate predictions.
+
+        Args:
+            X_train: Training features.
+            y_train: Training targets.
+            X_test: Test features.
+            y_test: Test targets.
+
+        Returns:
+            Dict keyed by model name with predictions and optional probabilities.
+        """
+        results = {}
+
+        for model_name, model in self.models.items():
+            model.fit(X_train, y_train)
+            predictions = model.predict(X_test)
+            probabilities = model.predict_proba(X_test) if self.task == "classification" else None
+            results[model_name] = {
+                "predictions": predictions,
+                "probabilities": probabilities,
+            }
+
+        return results
+
+    def compute(
+        self,
+        X_train: Any,
+        y_train: Any,
+        X_test: Any,
+        y_test: Any,
+    ) -> float:
+        """Compute the metric (implemented by subclasses)."""
+        raise NotImplementedError("This method must be overridden in subclasses.")
+
+
+class RegressionMetric(BaseMetric):
+    """Base class for regression metrics."""
+
+    def __init__(self, name: str) -> None:
+        super().__init__(name, task="regression")
+
+    def compute(
+        self,
+        X_train: Any,
+        y_train: Any,
+        X_test: Any,
+        y_test: Any,
+    ) -> float:
+        """Average the metric over the internal model set."""
+        results = self.train_and_predict(X_train, y_train, X_test, y_test)
+        return self.aggregate_from_results(y_test, results)
+
+    def aggregate_from_results(self, y_test: np.ndarray, results: dict) -> float:
+        """Aggregate metric value from cached prediction results."""
+        return float(np.mean([self._metric_func(y_test, res["predictions"]) for res in results.values()]))
+
+    def _metric_func(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
+        """Metric function to be overridden by subclasses."""
+        raise NotImplementedError("This method must be overridden in subclasses.")
+
+
+class R2Score(RegressionMetric):
+    def __init__(self) -> None:
+        super().__init__("R2 Score")
+
+    def _metric_func(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
+        return r2_score(y_true, y_pred)
+
+
+class MeanAbsoluteError(RegressionMetric):
+    def __init__(self) -> None:
+        super().__init__("Mean Absolute Error")
+
+    def _metric_func(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
+        return -mean_absolute_error(y_true, y_pred)  # Return negative MAE
+
+
+class MeanSquaredError(RegressionMetric):
+    def __init__(self) -> None:
+        super().__init__("Mean Squared Error")
+
+    def _metric_func(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
+        return -mean_squared_error(y_true, y_pred)  # Return negative MSE
+
+
+class ClassificationMetric(BaseMetric):
+    """Base class for classification metrics."""
+
+    def __init__(self, name: str) -> None:
+        super().__init__(name, task="classification")
+
+    def compute(
+        self,
+        X_train: Any,
+        y_train: Any,
+        X_test: Any,
+        y_test: Any,
+    ) -> float:
+        """Average the metric over the internal model set."""
+        results = self.train_and_predict(X_train, y_train, X_test, y_test)
+        return self.aggregate_from_results(y_test, results)
+
+    def aggregate_from_results(self, y_test: np.ndarray, results: dict) -> float:
+        """Aggregate metric value from cached prediction results."""
+        return float(
+            np.mean(
+                [self._metric_func(y_test, res["predictions"], res.get("probabilities")) for res in results.values()]
+            )
+        )
+
+    def _metric_func(
+        self,
+        y_true: np.ndarray,
+        y_pred: np.ndarray,
+        y_proba: Optional[np.ndarray] = None,
+    ) -> float:
+        """Metric function to be overridden by subclasses."""
+        raise NotImplementedError("This method must be overridden in subclasses.")
+
+
+class LogLoss(ClassificationMetric):
+    def __init__(self) -> None:
+        super().__init__("Log Loss")
+
+    def _metric_func(self, y_true: np.ndarray, y_pred: np.ndarray, y_proba: np.ndarray) -> float:
+        return -log_loss(y_true, y_proba)
+
+
+class F1Score(ClassificationMetric):
+    def __init__(self) -> None:
+        super().__init__("F1 Score")
+
+    def _metric_func(self, y_true: np.ndarray, y_pred: np.ndarray, y_proba: None = None) -> float:
+        return f1_score(y_true, y_pred, average="macro")
+
+
+class Accuracy(ClassificationMetric):
+    def __init__(self) -> None:
+        super().__init__("Accuracy")
+
+    def _metric_func(self, y_true: np.ndarray, y_pred: np.ndarray, y_proba: None = None) -> float:
+        return accuracy_score(y_true, y_pred)
+
+
+class PrecisionScore(ClassificationMetric):
+    def __init__(self) -> None:
+        super().__init__("Precision Score")
+
+    def _metric_func(self, y_true: np.ndarray, y_pred: np.ndarray, y_proba: None = None) -> float:
+        return precision_score(y_true, y_pred, average="macro", zero_division=0)
+
+
+class RecallScore(ClassificationMetric):
+    def __init__(self) -> None:
+        super().__init__("Recall Score")
+
+    def _metric_func(self, y_true: np.ndarray, y_pred: np.ndarray, y_proba: None = None) -> float:
+        return recall_score(y_true, y_pred, average="macro")
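A usage sketch, assuming the wheel is installed (`pip install moose-fs`): each metric trains its three-model battery internally and returns a single averaged float. Note the sign convention above: MAE, MSE, and log loss are negated so that "higher is better" holds uniformly across all metrics.

```python
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from moosefs.metrics import Accuracy

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)

metric = Accuracy()
# Fits RandomForest, LogisticRegression, and GradientBoosting internally,
# then averages their per-model test accuracies into one float.
print(metric.compute(X_tr, y_tr, X_te, y_te))
```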
moosefs/metrics/stability_metrics.py
ADDED

@@ -0,0 +1,49 @@
+from itertools import combinations
+
+from moosefs.core.novovicova import StabilityNovovicova
+
+
+def compute_stability_metrics(features_list: list) -> float:
+    """Compute stability SH(S) across selections.
+
+    Args:
+        features_list: Selected feature names per selector.
+
+    Returns:
+        Stability in [0, 1].
+    """
+    return StabilityNovovicova(features_list).compute_stability()
+
+
+def _jaccard(a: set, b: set) -> float:
+    """Return Jaccard similarity, handling empty sets as 1.0 if both empty."""
+    return len(a & b) / len(a | b) if a | b else 1.0
+
+
+def diversity_agreement(selectors: list, merged: list, alpha: float = 0.5) -> float:
+    """Blend diversity and agreement into a single score.
+
+    Args:
+        selectors: List of selected feature lists (one per selector).
+        merged: Merged/core feature names for the group.
+        alpha: Weight on agreement (0 → pure diversity, 1 → pure agreement).
+
+    Returns:
+        Score in [0, 1] (higher is better).
+    """
+    k = len(selectors)
+    if k < 2:
+        return 0.0  # cannot measure diversity with a single selector
+
+    sets = [set(s) for s in selectors]
+    core = set(merged)
+
+    # 1) diversity (average Jaccard *dissimilarity* across selector pairs)
+    pair_dis = [1.0 - _jaccard(sets[i], sets[j]) for i, j in combinations(range(k), 2)]
+    diversity = sum(pair_dis) / len(pair_dis)
+
+    # 2) agreement (mean similarity of each selector to the core)
+    agree = sum(_jaccard(s, core) for s in sets) / k
+
+    # 3) linear blend
+    return (1.0 - alpha) * diversity + alpha * agree
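A small worked example of `diversity_agreement`; the numbers follow directly from the definitions above:

```python
from moosefs.metrics.stability_metrics import diversity_agreement

selectors = [["a", "b"], ["b", "c"]]
merged = ["b"]

# Jaccard({a,b}, {b,c}) = 1/3, so diversity = 1 - 1/3 = 2/3.
# Jaccard of each selector to the core {b} is 1/2, so agreement = 1/2.
# With alpha=0.5 the blend is 0.5 * 2/3 + 0.5 * 1/2 = 7/12 ~= 0.583.
print(diversity_agreement(selectors, merged, alpha=0.5))
```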
moosefs/utils.py
ADDED

@@ -0,0 +1,161 @@
+import inspect
+from typing import Any
+
+# Mapping of class identifiers to their import paths and expected initialization parameters.
+# Template: "identifier": ("module.path.ClassName", ["param1", "param2", ...])
+class_path_mapping: dict = {
+    # metrics
+    "mse": (
+        "moosefs.metrics.performance_metrics.MeanSquaredError",
+        [],
+    ),
+    "mae": (
+        "moosefs.metrics.performance_metrics.MeanAbsoluteError",
+        [],
+    ),
+    "r2_score": (
+        "moosefs.metrics.performance_metrics.R2Score",
+        [],
+    ),
+    "logloss": (
+        "moosefs.metrics.performance_metrics.LogLoss",
+        [],
+    ),
+    "f1_score": (
+        "moosefs.metrics.performance_metrics.F1Score",
+        [],
+    ),
+    "accuracy": (
+        "moosefs.metrics.performance_metrics.Accuracy",
+        [],
+    ),
+    "precision_score": (
+        "moosefs.metrics.performance_metrics.PrecisionScore",
+        [],
+    ),
+    "recall_score": (
+        "moosefs.metrics.performance_metrics.RecallScore",
+        [],
+    ),
+    "f_statistic_selector": (
+        "moosefs.feature_selectors.f_statistic_selector.FStatisticSelector",
+        ["task", "num_features_to_select"],
+    ),
+    "random_forest_selector": (
+        "moosefs.feature_selectors.random_forest_selector.RandomForestSelector",
+        ["task", "num_features_to_select", "random_state"],
+    ),
+    "mutual_info_selector": (
+        "moosefs.feature_selectors.mutual_info_selector.MutualInfoSelector",
+        ["task", "num_features_to_select", "random_state"],
+    ),
+    "svm_selector": (
+        "moosefs.feature_selectors.svm_selector.SVMSelector",
+        ["task", "num_features_to_select"],
+    ),
+    "xgboost_selector": (
+        "moosefs.feature_selectors.xgboost_selector.XGBoostSelector",
+        ["task", "num_features_to_select", "random_state"],
+    ),
+    "mrmr_selector": (
+        "moosefs.feature_selectors.mrmr_selector.MRMRSelector",
+        ["task", "num_features_to_select"],
+    ),
+    "lasso_selector": (
+        "moosefs.feature_selectors.lasso_selector.LassoSelector",
+        ["task", "num_features_to_select", "random_state"],
+    ),
+    "elastic_net_selector": (
+        "moosefs.feature_selectors.elastic_net_selector.ElasticNetSelector",
+        ["task", "num_features_to_select", "random_state"],
+    ),
+    "variance_selector": (
+        "moosefs.feature_selectors.variance_selectors.VarianceSelector",
+        ["task", "num_features_to_select"],
+    ),
+    # mergers
+    "borda_merger": (
+        "moosefs.merging_strategies.borda_merger.BordaMerger",
+        [],
+    ),
+    "union_of_intersections_merger": (
+        "moosefs.merging_strategies.union_of_intersections_merger.UnionOfIntersectionsMerger",
+        [],
+    ),
+    "l2_norm_merger": (
+        "moosefs.merging_strategies.l2_norm_merger.L2NormMerger",
+        [],
+    ),
+    "arithmetic_mean_merger": (
+        "moosefs.merging_strategies.arithmetic_mean_merger.ArithmeticMeanMerger",
+        [],
+    ),
+    "consensus_merger": (
+        "moosefs.merging_strategies.consensus_merger.ConsensusMerger",
+        ["k", "fill"],
+    ),
+}
+
+
+def dynamic_import(class_path: str) -> type:
+    """Import a class from a fully qualified path.
+
+    Args:
+        class_path: Dotted path, e.g. "moosefs.module.ClassName".
+
+    Returns:
+        The referenced class object.
+    """
+    components = class_path.split(".")
+    module_path = ".".join(components[:-1])
+    class_name = components[-1]
+    module = __import__(module_path, fromlist=[class_name])
+    return getattr(module, class_name)
+
+
+def get_class_info(identifier: str) -> tuple:
+    """Resolve an identifier to a class and its expected params.
+
+    Args:
+        identifier: Lookup key in ``class_path_mapping``.
+
+    Returns:
+        (class, params) where params are attribute names to fetch.
+
+    Raises:
+        ValueError: If the identifier is unknown.
+    """
+    if identifier not in class_path_mapping:
+        raise ValueError(f"Unknown class identifier: {identifier}")
+    class_path, params = class_path_mapping[identifier]
+    cls = dynamic_import(class_path)
+    return cls, params
+
+
+def extract_params(cls: type, instance: Any, params: list) -> dict:
+    """Collect constructor parameters from an owning instance.
+
+    Args:
+        cls: Class to instantiate.
+        instance: Object carrying attributes matching ``params``.
+        params: Parameter names to extract.
+
+    Returns:
+        Mapping of parameter names to values for ``cls``.
+    """
+    sig = inspect.signature(cls.__init__)
+
+    extracted_params: dict = {
+        param: getattr(instance, param) for param in params if param in sig.parameters and hasattr(instance, param)
+    }
+
+    # If **kwargs exists in the class signature, include additional parameters.
+    if any(p.kind == inspect.Parameter.VAR_KEYWORD for p in sig.parameters.values()):
+        additional_params = {
+            param: getattr(instance, param)
+            for param in params
+            if param not in sig.parameters and hasattr(instance, param)
+        }
+        extracted_params.update(additional_params)
+
+    return extracted_params
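A sketch of how this registry is meant to be used. It assumes (unverified here) that `LassoSelector`'s constructor accepts the parameters listed in the mapping; `extract_params` only copies attributes that both exist on the source object and match the constructor signature, so any object exposing the right names works as the parameter source.

```python
from types import SimpleNamespace

from moosefs.utils import extract_params, get_class_info

# Resolve the identifier to the class plus its expected parameter names.
cls, params = get_class_info("lasso_selector")
# params == ["task", "num_features_to_select", "random_state"]

# Hypothetical owner object carrying the mapped attributes.
owner = SimpleNamespace(task="regression", num_features_to_select=5, random_state=0)
kwargs = extract_params(cls, owner, params)
selector = cls(**kwargs)
```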