moose_fs-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. LICENSE +21 -0
  2. README.md +190 -0
  3. moose_fs-0.1.0.dist-info/METADATA +232 -0
  4. moose_fs-0.1.0.dist-info/RECORD +40 -0
  5. moose_fs-0.1.0.dist-info/WHEEL +4 -0
  6. moose_fs-0.1.0.dist-info/entry_points.txt +2 -0
  7. moose_fs-0.1.0.dist-info/licenses/LICENSE +21 -0
  8. moosefs/__init__.py +6 -0
  9. moosefs/core/__init__.py +6 -0
  10. moosefs/core/data_processor.py +319 -0
  11. moosefs/core/feature.py +44 -0
  12. moosefs/core/novovicova.py +60 -0
  13. moosefs/core/pareto.py +90 -0
  14. moosefs/feature_selection_pipeline.py +548 -0
  15. moosefs/feature_selectors/__init__.py +26 -0
  16. moosefs/feature_selectors/base_selector.py +38 -0
  17. moosefs/feature_selectors/default_variance.py +21 -0
  18. moosefs/feature_selectors/elastic_net_selector.py +75 -0
  19. moosefs/feature_selectors/f_statistic_selector.py +42 -0
  20. moosefs/feature_selectors/lasso_selector.py +46 -0
  21. moosefs/feature_selectors/mrmr_selector.py +57 -0
  22. moosefs/feature_selectors/mutual_info_selector.py +45 -0
  23. moosefs/feature_selectors/random_forest_selector.py +48 -0
  24. moosefs/feature_selectors/svm_selector.py +50 -0
  25. moosefs/feature_selectors/variance_selectors.py +16 -0
  26. moosefs/feature_selectors/xgboost_selector.py +44 -0
  27. moosefs/merging_strategies/__init__.py +17 -0
  28. moosefs/merging_strategies/arithmetic_mean_merger.py +46 -0
  29. moosefs/merging_strategies/base_merger.py +64 -0
  30. moosefs/merging_strategies/borda_merger.py +46 -0
  31. moosefs/merging_strategies/consensus_merger.py +80 -0
  32. moosefs/merging_strategies/l2_norm_merger.py +42 -0
  33. moosefs/merging_strategies/union_of_intersections_merger.py +89 -0
  34. moosefs/metrics/__init__.py +23 -0
  35. moosefs/metrics/performance_metrics.py +239 -0
  36. moosefs/metrics/stability_metrics.py +49 -0
  37. moosefs/utils.py +161 -0
  38. scripts/config.yml +92 -0
  39. scripts/main.py +163 -0
  40. scripts/utils.py +186 -0
moosefs/core/data_processor.py ADDED
@@ -0,0 +1,319 @@
+ from typing import Any, Optional
+
+ import pandas as pd
+ from sklearn.impute import KNNImputer
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
+
+
+ class DataProcessor:
+     def __init__(
+         self,
+         categorical_columns: Optional[list] = None,
+         columns_to_drop: Optional[list] = None,
+         drop_missing_values: bool = False,
+         merge_key: Optional[str] = None,
+         normalize: bool = True,
+         target_column: str = "target",
+     ) -> None:
+         """
+         Initialize the DataProcessor with specific parameters for preprocessing.
+
+         Args:
+             categorical_columns: List of column names to treat as categorical.
+             columns_to_drop: List of column names to drop from the dataset.
+             drop_missing_values: Flag to determine if missing values should be dropped.
+             merge_key: Column name to use as a key when merging data with metadata.
+             normalize: Flag to determine if numerical features should be normalized.
+             target_column: Name of the target column in the dataset.
+         """
+         self.categorical_columns: Optional[list] = categorical_columns
+         self.columns_to_drop: Optional[list] = columns_to_drop
+         self.drop_missing_values: bool = drop_missing_values
+         self.merge_key: Optional[str] = merge_key
+         self.normalize: bool = normalize
+         self.target_column: str = target_column
+         self.label_encoders: dict = {}
+
+     def preprocess_data(
+         self,
+         data: Any,
+         index_col: Optional[str] = None,
+         metadata: Optional[Any] = None,
+     ) -> pd.DataFrame:
+         """
+         Load and preprocess data from a CSV file or DataFrame, with optional metadata merging.
+
+         Args:
+             data: Path to the CSV file or a pandas DataFrame.
+             index_col: Column to set as index. Defaults to None.
+             metadata: Path to the CSV file or DataFrame containing metadata. Defaults to None.
+
+         Returns:
+             The preprocessed data as a pandas DataFrame.
+         """
+         data_df = self._load_data(data, index_col)
+
+         if metadata is not None:
+             meta_df = self._load_data(metadata, index_col)
+             data_df = self._merge_data_and_metadata(data_df, meta_df)
+
+         for condition, method in [
+             (self.columns_to_drop, self._drop_columns),
+             (self.drop_missing_values, self._drop_missing_values),
+             (self.categorical_columns, self._encode_categorical_variables),
+             (self.normalize, self._scale_numerical_features),
+             (self.target_column, self._rename_target_column),
+         ]:
+             if condition:
+                 data_df = method(data_df)
+         return data_df
+
+     def _load_data(self, data: Any, index_col: Optional[str] = None) -> pd.DataFrame:
+         """
+         Helper method to load data and set the index if specified.
+
+         Args:
+             data: Path to the CSV file or a pandas DataFrame.
+             index_col: Column to set as index. Defaults to None.
+
+         Returns:
+             The loaded pandas DataFrame with index set if specified.
+         """
+         if isinstance(data, str):
+             df = pd.read_csv(data)
+         elif isinstance(data, pd.DataFrame):
+             df = data.copy()
+         else:
+             raise ValueError("Input data must be a file path (str) or a pandas DataFrame")
+
+         if index_col is not None:
+             df.set_index(index_col, inplace=True)
+         return df
+
+     def _merge_data_and_metadata(self, data_df: pd.DataFrame, meta_df: pd.DataFrame) -> pd.DataFrame:
+         """
+         Merge the main data frame with metadata.
+
+         Args:
+             data_df: The main data DataFrame.
+             meta_df: The metadata DataFrame.
+
+         Returns:
+             The merged DataFrame.
+         """
+         if not self.merge_key:
+             raise ValueError("merge_key must be provided for merging data and metadata")
+         return pd.merge(data_df, meta_df, on=self.merge_key)
+
+     def _rename_target_column(self, data_df: pd.DataFrame) -> pd.DataFrame:
+         """
+         Rename the target column in the data frame to 'target'.
+
+         Args:
+             data_df: The data DataFrame to be modified.
+
+         Returns:
+             The DataFrame with the renamed target column.
+         """
+         data_df.rename(columns={self.target_column: "target"}, inplace=True)
+         self.target_column = "target"
+         return data_df
+
+     def _drop_columns(self, data_df: pd.DataFrame) -> pd.DataFrame:
+         """
+         Drop specified columns from the data frame.
+
+         Args:
+             data_df: The data DataFrame to be modified.
+
+         Returns:
+             The DataFrame with specified columns dropped.
+         """
+         if self.columns_to_drop:
+             data_df.drop(columns=self.columns_to_drop, inplace=True, errors="ignore")
+         return data_df
+
+     def _drop_missing_values(self, data_df: pd.DataFrame) -> pd.DataFrame:
+         """
+         Drop missing values by dropping rows with NaNs.
+
+         Args:
+             data_df: The data DataFrame with missing values.
+
+         Returns:
+             The DataFrame with missing values dropped.
+         """
+         return data_df.dropna()
+
+     def _encode_categorical_variables(self, data_df: pd.DataFrame) -> pd.DataFrame:
+         """
+         Encode categorical variables using label encoding and store the mappings.
+
+         Args:
+             data_df: The data DataFrame with categorical columns.
+
+         Returns:
+             The DataFrame with categorical variables encoded.
+         """
+         if not self.categorical_columns:
+             return data_df
+
+         for col in self.categorical_columns:
+             if col in data_df.columns:
+                 label_encoder = LabelEncoder()
+                 data_df[col] = label_encoder.fit_transform(data_df[col])
+                 self.label_encoders[col] = label_encoder
+         return data_df
+
+     def get_label_mapping(self, column_name: str) -> dict:
+         """
+         Retrieve the label encoding mapping for a specific column.
+
+         Args:
+             column_name: The column for which to get the label encoding mapping.
+
+         Returns:
+             A dictionary mapping original labels to encoded values.
+         """
+         if column_name in self.label_encoders:
+             label_encoder = self.label_encoders[column_name]
+             return dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))
+         else:
+             raise ValueError(f"No label encoder found for column: {column_name}")
+
+     def _scale_numerical_features(self, data_df: pd.DataFrame) -> pd.DataFrame:
+         """
+         Scale numerical features using standard scaling.
+
+         Args:
+             data_df: The data DataFrame with numerical columns.
+
+         Returns:
+             The DataFrame with numerical features scaled.
+         """
+         categorical_cols = self.categorical_columns if self.categorical_columns else []
+         numerical_cols = [col for col in data_df.columns if col not in categorical_cols]
+         scaler = StandardScaler()
+         data_df[numerical_cols] = scaler.fit_transform(data_df[numerical_cols])
+         return data_df
+
+     def _filtered_time_dataset(self, data_df: pd.DataFrame, min_num_timepoints: int, clone_column: str) -> pd.DataFrame:
+         """
+         Filter dataset to retain only clones with at least min_num_timepoints.
+
+         Args:
+             data_df: DataFrame containing the dataset.
+             min_num_timepoints: Minimum number of time points required per clone.
+             clone_column: Column name for the clone identifier.
+
+         Returns:
+             DataFrame with clones filtered based on time points.
+         """
+         filtered_df = data_df.groupby(clone_column).filter(lambda x: len(x) >= min_num_timepoints)
+         return filtered_df.sort_values(clone_column)
+
+     def _fill_nan(
+         self,
+         df: pd.DataFrame,
+         method: str = "mean",
+         **knn_kwargs: Any,  # forwarded only if method == "knn"
+     ) -> pd.DataFrame:
+         """
+         Fill NaN values in *df* according to *method*.
+
+         Parameters
+         ----------
+         df : pd.DataFrame
+             The data whose missing values should be filled.
+         method : {"mean", "knn"}, default "mean"
+             Imputation strategy:
+             - "mean" : column-wise mean for numeric, mode for categoricals.
+             - "knn"  : KNNImputer for numeric, mode for categoricals.
+         **knn_kwargs : Any
+             Extra keyword arguments passed straight to
+             ``sklearn.impute.KNNImputer`` when *method* == "knn".
+             Example: ``n_neighbors=5, weights="distance"``.
+
+         Returns
+         -------
+         pd.DataFrame
+             A copy of *df* with NaNs imputed.
+         """
+         df = df.copy()  # avoid mutating the caller’s frame
+
+         numeric_cols = df.select_dtypes(include="number").columns
+         categorical_cols = df.select_dtypes(include="category").columns
+
+         if method == "mean":
+             # numeric
+             df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
+         elif method == "knn":
+             # numeric via sklearn KNN
+             if numeric_cols.empty:
+                 raise ValueError("KNN imputation requires at least one numeric column.")
+             imputer = KNNImputer(**knn_kwargs)
+             df[numeric_cols] = pd.DataFrame(
+                 imputer.fit_transform(df[numeric_cols]),
+                 columns=numeric_cols,
+                 index=df.index,
+             )
+         else:
+             raise ValueError(f"Unknown method: {method!r}")
+
+         # categoricals: always use mode (most frequent)
+         for col in categorical_cols:
+             if df[col].isna().any():
+                 df[col] = df[col].fillna(df[col].mode(dropna=True)[0])
+
+         return df
+
+     def flatten_time(
+         self,
+         data_df: pd.DataFrame,
+         clone_column: str,
+         time_column: str,
+         time_dependent_columns: list,
+         min_num_timepoints: Optional[int] = None,
+         fill_nan_method: str = "mean",
+         **kwargs: Any,
+     ) -> pd.DataFrame:
+         """
+         Flatten dataset based on time-dependent columns, optionally filtering by minimum time points and filling NaNs.
+
+         Args:
+             data_df: DataFrame containing the dataset.
+             clone_column: Column name for the clone identifier.
+             time_column: Column name for the time variable.
+             time_dependent_columns: List of columns that vary with time.
+             min_num_timepoints: Optional minimum number of time points per clone for filtering.
+             fill_nan_method: Method to fill NaN values. Defaults to "mean".
+
+         Returns:
+             DataFrame where time-dependent columns are pivoted and flattened by clone, with NaN values filled.
+         """
+         if min_num_timepoints is not None:
+             data_df = self._filtered_time_dataset(data_df, min_num_timepoints, clone_column)
+
+         flattened_data = []
+         # Reverse mapping for TIMEPOINT
+         mapping = {v: k for k, v in self.get_label_mapping("TIMEPOINT").items()}
+         data_df["TIMEPOINT"] = data_df["TIMEPOINT"].map(mapping)
+
+         for clone, clone_df in data_df.groupby(clone_column):
+             melted_df = clone_df.melt(
+                 id_vars=[clone_column, time_column],
+                 value_vars=time_dependent_columns,
+                 var_name="VARIABLE",
+                 value_name="VALUE",
+             )
+             melted_df["time_var"] = melted_df[time_column].astype(str) + "_" + melted_df["VARIABLE"]
+             pivoted_df = melted_df.pivot(index=clone_column, columns="time_var", values="VALUE")
+             flattened_data.append(pivoted_df)
+
+         flattened_df = pd.concat(flattened_data)
+         target_df = data_df[[clone_column, self.target_column]].drop_duplicates()
+         flattened_df = flattened_df.reset_index()
+         flattened_df = pd.merge(flattened_df, target_df, on=clone_column).set_index(clone_column).sort_index()
+         flattened_df = flattened_df.dropna(subset=[self.target_column])
+         flattened_df = self._fill_nan(flattened_df, fill_nan_method, **kwargs)
+         return flattened_df
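
A minimal usage sketch of DataProcessor on a toy in-memory frame (column names and values are illustrative, and the import path is inferred from the file list above). Note that _scale_numerical_features standardizes every non-categorical column, so the target column is scaled too unless it is listed in categorical_columns.

from moosefs.core.data_processor import DataProcessor  # path inferred from the file list above
import pandas as pd

toy = pd.DataFrame(
    {
        "sample_id": ["s1", "s2", "s3", "s4"],  # illustrative identifier column
        "gene_a": [0.1, 0.4, None, 0.9],
        "batch": ["b1", "b2", "b1", "b2"],
        "label": [0, 1, 0, 1],
    }
)

processor = DataProcessor(
    categorical_columns=["batch"],
    columns_to_drop=["sample_id"],
    drop_missing_values=True,   # the NaN row is dropped before encoding/scaling
    normalize=True,
    target_column="label",      # renamed to "target" by _rename_target_column
)

processed = processor.preprocess_data(toy)
print(processed.head())
print(processor.get_label_mapping("batch"))  # e.g. {'b1': 0, 'b2': 1}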
moosefs/core/feature.py ADDED
@@ -0,0 +1,44 @@
+ from typing import Optional
+
+
+ class Feature:
+     """Container for a single feature.
+
+     Stores the feature name, an optional score, and whether it is selected.
+
+     Args:
+         name: Feature identifier (e.g., column name).
+         score: Optional importance/score for ranking.
+         selected: Whether the feature is selected.
+     """
+
+     __slots__ = ("name", "score", "selected")
+
+     def __init__(self, name: str, score: Optional[float] = None, selected: bool = False) -> None:
+         self.name: str = name
+         self.score: Optional[float] = score
+         self.selected: bool = selected
+
+     def set_score(self, score: float) -> None:
+         """Set the feature score.
+
+         Args:
+             score: Importance/score value.
+         """
+         self.score = score
+
+     def set_selected(self, selected: bool) -> None:
+         """Set the selected flag.
+
+         Args:
+             selected: True if selected; otherwise False.
+         """
+         self.selected = selected
+
+     def __str__(self) -> str:
+         """Return a readable string representation."""
+         return f"Feature(name={self.name}, score={self.score}, selected={self.selected})"
+
+     def __repr__(self) -> str:
+         """Return an unambiguous representation for debugging."""
+         return f"Feature('{self.name}', {self.score}, {self.selected})"
moosefs/core/novovicova.py ADDED
@@ -0,0 +1,60 @@
+ import numpy as np
+
+
+ class StabilityNovovicova:
+     """
+     Computes the stability of feature selection algorithms based on Novovicová et al. (2009).
+
+     References:
+         Novovicová, J., Somol, P., & Pudil, P. (2009). "A New Measure of Feature Selection
+         Algorithms' Stability." IEEE International Conference on Data Mining Workshops.
+     """
+
+     def __init__(self, selected_features: list):
+         """
+         Args:
+             selected_features: A list of sets or lists, where each represents selected features in a dataset.
+         """
+         self._validate_inputs(selected_features)
+         self.selected_features: list = [set(sel) for sel in selected_features]  # Convert all to sets
+         self.N: int = sum(len(sel) for sel in self.selected_features)  # Total feature occurrences
+         self.n: int = len(self.selected_features)  # Number of datasets
+
+     @staticmethod
+     def _validate_inputs(selected_features: list) -> None:
+         """Validates the input format, ensuring consistency and non-emptiness."""
+         if not selected_features:
+             raise ValueError("Feature selections cannot be empty.")
+         if not isinstance(selected_features, list):
+             raise TypeError("Feature selections must be a list of sets or lists.")
+         if not all(isinstance(sel, (set, list)) for sel in selected_features):
+             raise TypeError("Each feature selection must be a set or a list.")
+
+         if any(len(sel) == 0 for sel in selected_features):
+             raise ValueError("Feature selections cannot contain empty sets or lists.")
+
+         # Ensure feature types are consistent
+         first_item = next(iter(selected_features[0]))
+         element_type = type(first_item)
+         if any(any(type(item) is not element_type for item in sel) for sel in selected_features):
+             raise ValueError("All features must be of the same type across selections.")
+
+     def compute_stability(self) -> float:
+         """
+         Computes the stability measure SH(S), ranging from 0 (no stability) to 1 (full stability).
+
+         Returns:
+             Stability score.
+         """
+         if self.N == 0 or self.n == 1:
+             return 0.0  # Stability is not meaningful for a single subset or empty selection.
+
+         # Count occurrences of each unique feature
+         feature_counts: dict = {}
+         for sublist in self.selected_features:
+             for feature in sublist:
+                 feature_counts[feature] = feature_counts.get(feature, 0) + 1
+
+         # Compute stability measure
+         SH_S: float = sum(count * np.log2(count) for count in feature_counts.values())
+         return SH_S / (self.N * np.log2(self.n))
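
A short worked example of the measure above (import path inferred from the file list; feature names are made up). Identical selections across runs score 1.0, fully disjoint selections score 0.0, and the mixed case below lands in between.

from moosefs.core.novovicova import StabilityNovovicova  # path inferred from the file list above

runs = [{"a", "b"}, {"a", "c"}, {"a", "b"}]  # three selection runs
print(round(StabilityNovovicova(runs).compute_stability(), 3))
# N = 6 feature occurrences, n = 3 runs, counts = {a: 3, b: 2, c: 1}
# SH(S) = (3*log2(3) + 2*log2(2) + 1*log2(1)) / (6 * log2(3)) ≈ 0.710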
moosefs/core/pareto.py ADDED
@@ -0,0 +1,90 @@
+ import numpy as np
+
+
+ class ParetoAnalysis:
+     """Rank groups by dominance and break ties using utopia distance.
+
+     For each group, computes a scalar dominance score: dominated − is_dominated.
+     If the top score ties, scales tied vectors to [0, 1] (within the tie) and
+     picks the one closest to the utopia point (1, ..., 1).
+     """
+
+     def __init__(self, data: list, group_names: list) -> None:
+         """Initialize the analysis state.
+
+         Args:
+             data: Metric vectors per group.
+             group_names: Display names for groups.
+
+         Raises:
+             ValueError: If ``data`` is empty.
+         """
+         if not data:
+             raise ValueError("Data cannot be empty.")
+         self.data = data
+         self.group_names = group_names
+         self.num_groups, self.num_metrics = len(data), len(data[0])
+
+         # Each row will hold:
+         #   0  group name
+         #   1  dominate_count
+         #   2  is_dominated_count
+         #   3  scalar = 1 − 2
+         #   4  metrics vector  ← NEW column used only for tie-break
+         self.results: list = [
+             [g, 0, 0, 0, vec]  # vec = data[i]
+             for g, vec in zip(group_names, data)
+         ]
+
+     def _dominate_count(self, i: int) -> int:
+         g = self.data[i]
+         return sum(
+             all(g[m] >= o[m] for m in range(self.num_metrics)) and any(g[m] > o[m] for m in range(self.num_metrics))
+             for j, o in enumerate(self.data)
+             if j != i
+         )
+
+     def _is_dominated_count(self, i: int) -> int:
+         g = self.data[i]
+         return sum(
+             all(g[m] <= o[m] for m in range(self.num_metrics)) and any(g[m] < o[m] for m in range(self.num_metrics))
+             for j, o in enumerate(self.data)
+             if j != i
+         )
+
+     def get_results(self) -> list:
+         """Compute dominance and return ranked rows.
+
+         Returns:
+             Rows [name, dominate_count, is_dominated_count, scalar] sorted by rank.
+         """
+         # 1) scalar dominance
+         for i in range(self.num_groups):
+             dom = self._dominate_count(i)
+             sub = self._is_dominated_count(i)
+             self.results[i][1:4] = [dom, sub, dom - sub]
+
+         # 2) initial sort: scalar desc then lexicographic name
+         self.results.sort(key=lambda r: (-r[3], tuple(r[0])))
+
+         # 3) tie-break on utopia distance
+         top_scalar = self.results[0][3]
+         tied_rows = [r for r in self.results if r[3] == top_scalar]
+
+         if len(tied_rows) > 1:
+             tied_data = np.vstack([r[4] for r in tied_rows], dtype=float)
+
+             mins, maxs = tied_data.min(0), tied_data.max(0)
+             span = np.where(maxs - mins == 0, 1, maxs - mins)
+             scaled = (tied_data - mins) / span  # 0-1 per metric
+             dists = np.linalg.norm(1.0 - scaled, axis=1)  # to utopia (1,…,1)
+
+             best_local_idx = int(dists.argmin())  # index inside tied_rows
+             best_row = tied_rows[best_local_idx]
+
+             # place best_row at position 0, keep relative order of the rest
+             self.results.remove(best_row)
+             self.results.insert(0, best_row)
+
+         # strip the metrics vector column before returning (keep original layout)
+         return [row[:4] for row in self.results]
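
A minimal sketch of the ranking above, assuming higher metric values are better (that is what the >=/> dominance checks encode); group names and numbers are made up, and the import path is inferred from the file list above.

from moosefs.core.pareto import ParetoAnalysis  # path inferred from the file list above

data = [
    [0.90, 0.60],  # selector_a
    [0.80, 0.80],  # selector_b
    [0.60, 0.90],  # selector_c
    [0.50, 0.50],  # selector_d, dominated by all others
]
names = ["selector_a", "selector_b", "selector_c", "selector_d"]

for name, dominates, dominated, scalar in ParetoAnalysis(data, names).get_results():
    print(name, dominates, dominated, scalar)
# selector_a/b/c each dominate only selector_d, so they tie at scalar = 1; the tie is
# broken by distance to the utopia point after min-max scaling within the tied rows,
# which favours the balanced selector_b. selector_d ranks last with scalar = -3.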