moose_fs-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. LICENSE +21 -0
  2. README.md +190 -0
  3. moose_fs-0.1.0.dist-info/METADATA +232 -0
  4. moose_fs-0.1.0.dist-info/RECORD +40 -0
  5. moose_fs-0.1.0.dist-info/WHEEL +4 -0
  6. moose_fs-0.1.0.dist-info/entry_points.txt +2 -0
  7. moose_fs-0.1.0.dist-info/licenses/LICENSE +21 -0
  8. moosefs/__init__.py +6 -0
  9. moosefs/core/__init__.py +6 -0
  10. moosefs/core/data_processor.py +319 -0
  11. moosefs/core/feature.py +44 -0
  12. moosefs/core/novovicova.py +60 -0
  13. moosefs/core/pareto.py +90 -0
  14. moosefs/feature_selection_pipeline.py +548 -0
  15. moosefs/feature_selectors/__init__.py +26 -0
  16. moosefs/feature_selectors/base_selector.py +38 -0
  17. moosefs/feature_selectors/default_variance.py +21 -0
  18. moosefs/feature_selectors/elastic_net_selector.py +75 -0
  19. moosefs/feature_selectors/f_statistic_selector.py +42 -0
  20. moosefs/feature_selectors/lasso_selector.py +46 -0
  21. moosefs/feature_selectors/mrmr_selector.py +57 -0
  22. moosefs/feature_selectors/mutual_info_selector.py +45 -0
  23. moosefs/feature_selectors/random_forest_selector.py +48 -0
  24. moosefs/feature_selectors/svm_selector.py +50 -0
  25. moosefs/feature_selectors/variance_selectors.py +16 -0
  26. moosefs/feature_selectors/xgboost_selector.py +44 -0
  27. moosefs/merging_strategies/__init__.py +17 -0
  28. moosefs/merging_strategies/arithmetic_mean_merger.py +46 -0
  29. moosefs/merging_strategies/base_merger.py +64 -0
  30. moosefs/merging_strategies/borda_merger.py +46 -0
  31. moosefs/merging_strategies/consensus_merger.py +80 -0
  32. moosefs/merging_strategies/l2_norm_merger.py +42 -0
  33. moosefs/merging_strategies/union_of_intersections_merger.py +89 -0
  34. moosefs/metrics/__init__.py +23 -0
  35. moosefs/metrics/performance_metrics.py +239 -0
  36. moosefs/metrics/stability_metrics.py +49 -0
  37. moosefs/utils.py +161 -0
  38. scripts/config.yml +92 -0
  39. scripts/main.py +163 -0
  40. scripts/utils.py +186 -0
moosefs/core/data_processor.py ADDED
@@ -0,0 +1,319 @@
+ from typing import Any, Optional
+
+ import pandas as pd
+ from sklearn.impute import KNNImputer
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
+
+
+ class DataProcessor:
+     def __init__(
+         self,
+         categorical_columns: Optional[list] = None,
+         columns_to_drop: Optional[list] = None,
+         drop_missing_values: bool = False,
+         merge_key: Optional[str] = None,
+         normalize: bool = True,
+         target_column: str = "target",
+     ) -> None:
+         """
+         Initialize the DataProcessor with specific parameters for preprocessing.
+
+         Args:
+             categorical_columns: List of column names to treat as categorical.
+             columns_to_drop: List of column names to drop from the dataset.
+             drop_missing_values: Flag to determine if missing values should be dropped.
+             merge_key: Column name to use as a key when merging data with metadata.
+             normalize: Flag to determine if numerical features should be normalized.
+             target_column: Name of the target column in the dataset.
+         """
+         self.categorical_columns: Optional[list] = categorical_columns
+         self.columns_to_drop: Optional[list] = columns_to_drop
+         self.drop_missing_values: bool = drop_missing_values
+         self.merge_key: Optional[str] = merge_key
+         self.normalize: bool = normalize
+         self.target_column: str = target_column
+         self.label_encoders: dict = {}
+
+     def preprocess_data(
+         self,
+         data: Any,
+         index_col: Optional[str] = None,
+         metadata: Optional[Any] = None,
+     ) -> pd.DataFrame:
+         """
+         Load and preprocess data from a CSV file or DataFrame, with optional metadata merging.
+
+         Args:
+             data: Path to the CSV file or a pandas DataFrame.
+             index_col: Column to set as index. Defaults to None.
+             metadata: Path to the CSV file or DataFrame containing metadata. Defaults to None.
+
+         Returns:
+             The preprocessed data as a pandas DataFrame.
+         """
+         data_df = self._load_data(data, index_col)
+
+         if metadata is not None:
+             meta_df = self._load_data(metadata, index_col)
+             data_df = self._merge_data_and_metadata(data_df, meta_df)
+
+         for condition, method in [
+             (self.columns_to_drop, self._drop_columns),
+             (self.drop_missing_values, self._drop_missing_values),
+             (self.categorical_columns, self._encode_categorical_variables),
+             (self.normalize, self._scale_numerical_features),
+             (self.target_column, self._rename_target_column),
+         ]:
+             if condition:
+                 data_df = method(data_df)
+         return data_df
+
+     def _load_data(self, data: Any, index_col: Optional[str] = None) -> pd.DataFrame:
+         """
+         Helper method to load data and set the index if specified.
+
+         Args:
+             data: Path to the CSV file or a pandas DataFrame.
+             index_col: Column to set as index. Defaults to None.
+
+         Returns:
+             The loaded pandas DataFrame with index set if specified.
+         """
+         if isinstance(data, str):
+             df = pd.read_csv(data)
+         elif isinstance(data, pd.DataFrame):
+             df = data.copy()
+         else:
+             raise ValueError("Input data must be a file path (str) or a pandas DataFrame")
+
+         if index_col is not None:
+             df.set_index(index_col, inplace=True)
+         return df
+
+     def _merge_data_and_metadata(self, data_df: pd.DataFrame, meta_df: pd.DataFrame) -> pd.DataFrame:
+         """
+         Merge the main data frame with metadata.
+
+         Args:
+             data_df: The main data DataFrame.
+             meta_df: The metadata DataFrame.
+
+         Returns:
+             The merged DataFrame.
+         """
+         if not self.merge_key:
+             raise ValueError("merge_key must be provided for merging data and metadata")
+         return pd.merge(data_df, meta_df, on=self.merge_key)
+
+     def _rename_target_column(self, data_df: pd.DataFrame) -> pd.DataFrame:
+         """
+         Rename the target column in the data frame to 'target'.
+
+         Args:
+             data_df: The data DataFrame to be modified.
+
+         Returns:
+             The DataFrame with the renamed target column.
+         """
+         data_df.rename(columns={self.target_column: "target"}, inplace=True)
+         self.target_column = "target"
+         return data_df
+
+     def _drop_columns(self, data_df: pd.DataFrame) -> pd.DataFrame:
+         """
+         Drop specified columns from the data frame.
+
+         Args:
+             data_df: The data DataFrame to be modified.
+
+         Returns:
+             The DataFrame with specified columns dropped.
+         """
+         if self.columns_to_drop:
+             data_df.drop(columns=self.columns_to_drop, inplace=True, errors="ignore")
+         return data_df
+
+     def _drop_missing_values(self, data_df: pd.DataFrame) -> pd.DataFrame:
+         """
+         Drop missing values by dropping rows with NaNs.
+
+         Args:
+             data_df: The data DataFrame with missing values.
+
+         Returns:
+             The DataFrame with missing values dropped.
+         """
+         return data_df.dropna()
+
+     def _encode_categorical_variables(self, data_df: pd.DataFrame) -> pd.DataFrame:
+         """
+         Encode categorical variables using label encoding and store the mappings.
+
+         Args:
+             data_df: The data DataFrame with categorical columns.
+
+         Returns:
+             The DataFrame with categorical variables encoded.
+         """
+         if not self.categorical_columns:
+             return data_df
+
+         for col in self.categorical_columns:
+             if col in data_df.columns:
+                 label_encoder = LabelEncoder()
+                 data_df[col] = label_encoder.fit_transform(data_df[col])
+                 self.label_encoders[col] = label_encoder
+         return data_df
+
+     def get_label_mapping(self, column_name: str) -> dict:
+         """
+         Retrieve the label encoding mapping for a specific column.
+
+         Args:
+             column_name: The column for which to get the label encoding mapping.
+
+         Returns:
+             A dictionary mapping original labels to encoded values.
+         """
+         if column_name in self.label_encoders:
+             label_encoder = self.label_encoders[column_name]
+             return dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))
+         else:
+             raise ValueError(f"No label encoder found for column: {column_name}")
+
+     def _scale_numerical_features(self, data_df: pd.DataFrame) -> pd.DataFrame:
+         """
+         Scale numerical features using standard scaling.
+
+         Args:
+             data_df: The data DataFrame with numerical columns.
+
+         Returns:
+             The DataFrame with numerical features scaled.
+         """
+         categorical_cols = self.categorical_columns if self.categorical_columns else []
+         numerical_cols = [col for col in data_df.columns if col not in categorical_cols]
+         scaler = StandardScaler()
+         data_df[numerical_cols] = scaler.fit_transform(data_df[numerical_cols])
+         return data_df
+
+     def _filtered_time_dataset(self, data_df: pd.DataFrame, min_num_timepoints: int, clone_column: str) -> pd.DataFrame:
+         """
+         Filter dataset to retain only clones with at least min_num_timepoints.
+
+         Args:
+             data_df: DataFrame containing the dataset.
+             min_num_timepoints: Minimum number of time points required per clone.
+             clone_column: Column name for the clone identifier.
+
+         Returns:
+             DataFrame with clones filtered based on time points.
+         """
+         filtered_df = data_df.groupby(clone_column).filter(lambda x: len(x) >= min_num_timepoints)
+         return filtered_df.sort_values(clone_column)
+
+     def _fill_nan(
+         self,
+         df: pd.DataFrame,
+         method: str = "mean",
+         **knn_kwargs: Any,  # forwarded only if method == "knn"
+     ) -> pd.DataFrame:
+         """
+         Fill NaN values in *df* according to *method*.
+
+         Parameters
+         ----------
+         df : pd.DataFrame
+             The data whose missing values should be filled.
+         method : {"mean", "knn"}, default "mean"
+             Imputation strategy:
+             - "mean" : column-wise mean for numeric, mode for categoricals.
+             - "knn"  : KNNImputer for numeric, mode for categoricals.
+         **knn_kwargs : Any
+             Extra keyword arguments passed straight to
+             ``sklearn.impute.KNNImputer`` when *method* == "knn".
+             Example: ``n_neighbors=5, weights="distance"``.
+
+         Returns
+         -------
+         pd.DataFrame
+             A copy of *df* with NaNs imputed.
+         """
+         df = df.copy()  # avoid mutating the caller’s frame
+
+         numeric_cols = df.select_dtypes(include="number").columns
+         categorical_cols = df.select_dtypes(include="category").columns
+
+         if method == "mean":
+             # numeric
+             df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
+         elif method == "knn":
+             # numeric via sklearn KNN
+             if numeric_cols.empty:
+                 raise ValueError("KNN imputation requires at least one numeric column.")
+             imputer = KNNImputer(**knn_kwargs)
+             df[numeric_cols] = pd.DataFrame(
+                 imputer.fit_transform(df[numeric_cols]),
+                 columns=numeric_cols,
+                 index=df.index,
+             )
+         else:
+             raise ValueError(f"Unknown method: {method!r}")
+
+         # categoricals: always use mode (most frequent)
+         for col in categorical_cols:
+             if df[col].isna().any():
+                 df[col] = df[col].fillna(df[col].mode(dropna=True)[0])
+
+         return df
+
+     def flatten_time(
+         self,
+         data_df: pd.DataFrame,
+         clone_column: str,
+         time_column: str,
+         time_dependent_columns: list,
+         min_num_timepoints: Optional[int] = None,
+         fill_nan_method: str = "mean",
+         **kwargs: Any,
+     ) -> pd.DataFrame:
+         """
+         Flatten dataset based on time-dependent columns, optionally filtering by minimum time points and filling NaNs.
+
+         Args:
+             data_df: DataFrame containing the dataset.
+             clone_column: Column name for the clone identifier.
+             time_column: Column name for the time variable.
+             time_dependent_columns: List of columns that vary with time.
+             min_num_timepoints: Optional minimum number of time points per clone for filtering.
+             fill_nan_method: Method to fill NaN values. Defaults to "mean".
+
+         Returns:
+             DataFrame where time-dependent columns are pivoted and flattened by clone, with NaN values filled.
+         """
+         if min_num_timepoints is not None:
+             data_df = self._filtered_time_dataset(data_df, min_num_timepoints, clone_column)
+
+         flattened_data = []
+         # Reverse mapping for TIMEPOINT
+         mapping = {v: k for k, v in self.get_label_mapping("TIMEPOINT").items()}
+         data_df["TIMEPOINT"] = data_df["TIMEPOINT"].map(mapping)
+
+         for clone, clone_df in data_df.groupby(clone_column):
+             melted_df = clone_df.melt(
+                 id_vars=[clone_column, time_column],
+                 value_vars=time_dependent_columns,
+                 var_name="VARIABLE",
+                 value_name="VALUE",
+             )
+             melted_df["time_var"] = melted_df[time_column].astype(str) + "_" + melted_df["VARIABLE"]
+             pivoted_df = melted_df.pivot(index=clone_column, columns="time_var", values="VALUE")
+             flattened_data.append(pivoted_df)
+
+         flattened_df = pd.concat(flattened_data)
+         target_df = data_df[[clone_column, self.target_column]].drop_duplicates()
+         flattened_df = flattened_df.reset_index()
+         flattened_df = pd.merge(flattened_df, target_df, on=clone_column).set_index(clone_column).sort_index()
+         flattened_df = flattened_df.dropna(subset=[self.target_column])
+         flattened_df = self._fill_nan(flattened_df, fill_nan_method, **kwargs)
+         return flattened_df
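
A minimal usage sketch of DataProcessor on a toy in-memory frame (column names and values are illustrative, and the import path is inferred from the file list above). Note that _scale_numerical_features standardizes every non-categorical column, so the target column is scaled too unless it is listed in categorical_columns.

from moosefs.core.data_processor import DataProcessor  # path inferred from the file list above
import pandas as pd

toy = pd.DataFrame(
    {
        "sample_id": ["s1", "s2", "s3", "s4"],  # illustrative identifier column
        "gene_a": [0.1, 0.4, None, 0.9],
        "batch": ["b1", "b2", "b1", "b2"],
        "label": [0, 1, 0, 1],
    }
)

processor = DataProcessor(
    categorical_columns=["batch"],
    columns_to_drop=["sample_id"],
    drop_missing_values=True,   # the NaN row is dropped before encoding/scaling
    normalize=True,
    target_column="label",      # renamed to "target" by _rename_target_column
)

processed = processor.preprocess_data(toy)
print(processed.head())
print(processor.get_label_mapping("batch"))  # e.g. {'b1': 0, 'b2': 1}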
moosefs/core/feature.py ADDED
@@ -0,0 +1,44 @@
+ from typing import Optional
+
+
+ class Feature:
+     """Container for a single feature.
+
+     Stores the feature name, an optional score, and whether it is selected.
+
+     Args:
+         name: Feature identifier (e.g., column name).
+         score: Optional importance/score for ranking.
+         selected: Whether the feature is selected.
+     """
+
+     __slots__ = ("name", "score", "selected")
+
+     def __init__(self, name: str, score: Optional[float] = None, selected: bool = False) -> None:
+         self.name: str = name
+         self.score: Optional[float] = score
+         self.selected: bool = selected
+
+     def set_score(self, score: float) -> None:
+         """Set the feature score.
+
+         Args:
+             score: Importance/score value.
+         """
+         self.score = score
+
+     def set_selected(self, selected: bool) -> None:
+         """Set the selected flag.
+
+         Args:
+             selected: True if selected; otherwise False.
+         """
+         self.selected = selected
+
+     def __str__(self) -> str:
+         """Return a readable string representation."""
+         return f"Feature(name={self.name}, score={self.score}, selected={self.selected})"
+
+     def __repr__(self) -> str:
+         """Return an unambiguous representation for debugging."""
+         return f"Feature('{self.name}', {self.score}, {self.selected})"
moosefs/core/novovicova.py ADDED
@@ -0,0 +1,60 @@
+ import numpy as np
+
+
+ class StabilityNovovicova:
+     """
+     Computes the stability of feature selection algorithms based on Novovicová et al. (2009).
+
+     References:
+         Novovicová, J., Somol, P., & Pudil, P. (2009). "A New Measure of Feature Selection
+         Algorithms' Stability." IEEE International Conference on Data Mining Workshops.
+     """
+
+     def __init__(self, selected_features: list):
+         """
+         Args:
+             selected_features: A list of sets or lists, where each represents selected features in a dataset.
+         """
+         self._validate_inputs(selected_features)
+         self.selected_features: list = [set(sel) for sel in selected_features]  # Convert all to sets
+         self.N: int = sum(len(sel) for sel in self.selected_features)  # Total feature occurrences
+         self.n: int = len(self.selected_features)  # Number of datasets
+
+     @staticmethod
+     def _validate_inputs(selected_features: list) -> None:
+         """Validates the input format, ensuring consistency and non-emptiness."""
+         if not selected_features:
+             raise ValueError("Feature selections cannot be empty.")
+         if not isinstance(selected_features, list):
+             raise TypeError("Feature selections must be a list of sets or lists.")
+         if not all(isinstance(sel, (set, list)) for sel in selected_features):
+             raise TypeError("Each feature selection must be a set or a list.")
+
+         if any(len(sel) == 0 for sel in selected_features):
+             raise ValueError("Feature selections cannot contain empty sets or lists.")
+
+         # Ensure feature types are consistent
+         first_item = next(iter(selected_features[0]))
+         element_type = type(first_item)
+         if any(any(type(item) is not element_type for item in sel) for sel in selected_features):
+             raise ValueError("All features must be of the same type across selections.")
+
+     def compute_stability(self) -> float:
+         """
+         Computes the stability measure SH(S), ranging from 0 (no stability) to 1 (full stability).
+
+         Returns:
+             Stability score.
+         """
+         if self.N == 0 or self.n == 1:
+             return 0.0  # Stability is not meaningful for a single subset or empty selection.
+
+         # Count occurrences of each unique feature
+         feature_counts: dict = {}
+         for sublist in self.selected_features:
+             for feature in sublist:
+                 feature_counts[feature] = feature_counts.get(feature, 0) + 1
+
+         # Compute stability measure
+         SH_S: float = sum(count * np.log2(count) for count in feature_counts.values())
+         return SH_S / (self.N * np.log2(self.n))
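
A short worked example of the measure above (import path inferred from the file list; feature names are made up). Identical selections across runs score 1.0, fully disjoint selections score 0.0, and the mixed case below lands in between.

from moosefs.core.novovicova import StabilityNovovicova  # path inferred from the file list above

runs = [{"a", "b"}, {"a", "c"}, {"a", "b"}]  # three selection runs
print(round(StabilityNovovicova(runs).compute_stability(), 3))
# N = 6 feature occurrences, n = 3 runs, counts = {a: 3, b: 2, c: 1}
# SH(S) = (3*log2(3) + 2*log2(2) + 1*log2(1)) / (6 * log2(3)) ≈ 0.710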
moosefs/core/pareto.py ADDED
@@ -0,0 +1,90 @@
+ import numpy as np
+
+
+ class ParetoAnalysis:
+     """Rank groups by dominance and break ties using utopia distance.
+
+     For each group, computes a scalar dominance score: dominated − is_dominated.
+     If the top score ties, scales tied vectors to [0, 1] (within the tie) and
+     picks the one closest to the utopia point (1, ..., 1).
+     """
+
+     def __init__(self, data: list, group_names: list) -> None:
+         """Initialize the analysis state.
+
+         Args:
+             data: Metric vectors per group.
+             group_names: Display names for groups.
+
+         Raises:
+             ValueError: If ``data`` is empty.
+         """
+         if not data:
+             raise ValueError("Data cannot be empty.")
+         self.data = data
+         self.group_names = group_names
+         self.num_groups, self.num_metrics = len(data), len(data[0])
+
+         # Each row will hold:
+         #   0  group name
+         #   1  dominate_count
+         #   2  is_dominated_count
+         #   3  scalar = 1 − 2
+         #   4  metrics vector  ← NEW column used only for tie-break
+         self.results: list = [
+             [g, 0, 0, 0, vec]  # vec = data[i]
+             for g, vec in zip(group_names, data)
+         ]
+
+     def _dominate_count(self, i: int) -> int:
+         g = self.data[i]
+         return sum(
+             all(g[m] >= o[m] for m in range(self.num_metrics)) and any(g[m] > o[m] for m in range(self.num_metrics))
+             for j, o in enumerate(self.data)
+             if j != i
+         )
+
+     def _is_dominated_count(self, i: int) -> int:
+         g = self.data[i]
+         return sum(
+             all(g[m] <= o[m] for m in range(self.num_metrics)) and any(g[m] < o[m] for m in range(self.num_metrics))
+             for j, o in enumerate(self.data)
+             if j != i
+         )
+
+     def get_results(self) -> list:
+         """Compute dominance and return ranked rows.
+
+         Returns:
+             Rows [name, dominate_count, is_dominated_count, scalar] sorted by rank.
+         """
+         # 1) scalar dominance
+         for i in range(self.num_groups):
+             dom = self._dominate_count(i)
+             sub = self._is_dominated_count(i)
+             self.results[i][1:4] = [dom, sub, dom - sub]
+
+         # 2) initial sort: scalar desc then lexicographic name
+         self.results.sort(key=lambda r: (-r[3], tuple(r[0])))
+
+         # 3) tie-break on utopia distance
+         top_scalar = self.results[0][3]
+         tied_rows = [r for r in self.results if r[3] == top_scalar]
+
+         if len(tied_rows) > 1:
+             tied_data = np.vstack([r[4] for r in tied_rows], dtype=float)
+
+             mins, maxs = tied_data.min(0), tied_data.max(0)
+             span = np.where(maxs - mins == 0, 1, maxs - mins)
+             scaled = (tied_data - mins) / span  # 0-1 per metric
+             dists = np.linalg.norm(1.0 - scaled, axis=1)  # to utopia (1,…,1)
+
+             best_local_idx = int(dists.argmin())  # index inside tied_rows
+             best_row = tied_rows[best_local_idx]
+
+             # place best_row at position 0, keep relative order of the rest
+             self.results.remove(best_row)
+             self.results.insert(0, best_row)
+
+         # strip the metrics vector column before returning (keep original layout)
+         return [row[:4] for row in self.results]
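
A minimal sketch of the ranking above, assuming higher metric values are better (that is what the >=/> dominance checks encode); group names and numbers are made up, and the import path is inferred from the file list above.

from moosefs.core.pareto import ParetoAnalysis  # path inferred from the file list above

data = [
    [0.90, 0.60],  # selector_a
    [0.80, 0.80],  # selector_b
    [0.60, 0.90],  # selector_c
    [0.50, 0.50],  # selector_d, dominated by all others
]
names = ["selector_a", "selector_b", "selector_c", "selector_d"]

for name, dominates, dominated, scalar in ParetoAnalysis(data, names).get_results():
    print(name, dominates, dominated, scalar)
# selector_a/b/c each dominate only selector_d, so they tie at scalar = 1; the tie is
# broken by distance to the utopia point after min-max scaling within the tied rows,
# which favours the balanced selector_b. selector_d ranks last with scalar = -3.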