moose-fs 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. LICENSE +21 -0
  2. README.md +190 -0
  3. moose_fs-0.1.0.dist-info/METADATA +232 -0
  4. moose_fs-0.1.0.dist-info/RECORD +40 -0
  5. moose_fs-0.1.0.dist-info/WHEEL +4 -0
  6. moose_fs-0.1.0.dist-info/entry_points.txt +2 -0
  7. moose_fs-0.1.0.dist-info/licenses/LICENSE +21 -0
  8. moosefs/__init__.py +6 -0
  9. moosefs/core/__init__.py +6 -0
  10. moosefs/core/data_processor.py +319 -0
  11. moosefs/core/feature.py +44 -0
  12. moosefs/core/novovicova.py +60 -0
  13. moosefs/core/pareto.py +90 -0
  14. moosefs/feature_selection_pipeline.py +548 -0
  15. moosefs/feature_selectors/__init__.py +26 -0
  16. moosefs/feature_selectors/base_selector.py +38 -0
  17. moosefs/feature_selectors/default_variance.py +21 -0
  18. moosefs/feature_selectors/elastic_net_selector.py +75 -0
  19. moosefs/feature_selectors/f_statistic_selector.py +42 -0
  20. moosefs/feature_selectors/lasso_selector.py +46 -0
  21. moosefs/feature_selectors/mrmr_selector.py +57 -0
  22. moosefs/feature_selectors/mutual_info_selector.py +45 -0
  23. moosefs/feature_selectors/random_forest_selector.py +48 -0
  24. moosefs/feature_selectors/svm_selector.py +50 -0
  25. moosefs/feature_selectors/variance_selectors.py +16 -0
  26. moosefs/feature_selectors/xgboost_selector.py +44 -0
  27. moosefs/merging_strategies/__init__.py +17 -0
  28. moosefs/merging_strategies/arithmetic_mean_merger.py +46 -0
  29. moosefs/merging_strategies/base_merger.py +64 -0
  30. moosefs/merging_strategies/borda_merger.py +46 -0
  31. moosefs/merging_strategies/consensus_merger.py +80 -0
  32. moosefs/merging_strategies/l2_norm_merger.py +42 -0
  33. moosefs/merging_strategies/union_of_intersections_merger.py +89 -0
  34. moosefs/metrics/__init__.py +23 -0
  35. moosefs/metrics/performance_metrics.py +239 -0
  36. moosefs/metrics/stability_metrics.py +49 -0
  37. moosefs/utils.py +161 -0
  38. scripts/config.yml +92 -0
  39. scripts/main.py +163 -0
  40. scripts/utils.py +186 -0
scripts/config.yml ADDED
@@ -0,0 +1,92 @@
1
+ experience:
2
+ data_path:
3
+ description: The directory path to the folder containing the dataset.
4
+ value: path/to/dataset/DATASETNAME.csv
5
+ experiment_name:
6
+ description: The name of the file to save the experimental results.
7
+ value: TESTNAME
8
+ metadata_path:
9
+ value: path/to/dataset/METADATANAME.csv
10
+ result_path:
11
+ description: The directory path where the results will be saved.
12
+ value: /path/to/results
13
+ pipeline:
14
+ fs_methods:
15
+ description: List of feature selection methods to use. You can choose two or more
16
+ methods.
17
+ valid_values:
18
+ - f_statistic_selector
19
+ - mutual_info_selector
20
+ - random_forest_selector
21
+ - svm_selector
22
+ - xgboost_selector
23
+ - lasso_selector
24
+ - mrmr_selector
25
+ value:
26
+ - f_statistic_selector
27
+ - mutual_info_selector
28
+ - random_forest_selector
29
+ - svm_selector
30
+ - xgboost_selector
31
+ merging_strategy:
32
+ description: The strategy for merging feature selection results. Choose only one
33
+ merging strategy.
34
+ valid_values:
35
+ - union_of_intersections_merger
36
+ - borda_merger
37
+ value: union_of_intersections_merger
38
+ num_repeats:
39
+ description: The number of repeats (train/test folds) for the pipeline; it also sets the test size to 1/num_repeats of the dataset.
40
+ max_value: 10
41
+ min_value: 2
42
+ value: 5
43
+ random_state:
44
+ description: Seed used to make the feature selection methods deterministic
45
+ valid_values:
46
+ - int
47
+ - None
48
+ value: 2024
49
+ metrics:
50
+ description: List of metrics to use for evaluation, need a list of exactly 3 metrics
51
+ valid_values:
52
+ - accuracy
53
+ - f1_score
54
+ - logloss
55
+ - precision
56
+ - recall
57
+ - mae
58
+ - mse
59
+ - r2_score
60
+ value:
61
+ - f1_score
62
+ - accuracy
63
+ - logloss
64
+ task:
65
+ description: ML task, either regression or classification
66
+ valid_values:
67
+ - regression
68
+ - classification
69
+ value: classification
70
+ num_features_to_select:
71
+ description: Number of features to select
72
+ valid_values:
73
+ - int
74
+ value: 100
75
+ n_jobs:
76
+ description: to define number of cores for sklearn module
77
+ valid_values:
78
+ - positive int or -1
79
+ - None
80
+ value: 1
81
+ preprocessing:
82
+ categorical_columns:
83
+ - NAME_OF_CATEGORICAL_COLUMN_1
84
+ - NAME_OF_CATEGORICAL_COLUMN_2
85
+ - NAME_OF_CATEGORICAL_COLUMN_TARGET
86
+ columns_to_drop:
87
+ - NAME_OF_COLUMN_TO_DROP_1
88
+ - NAME_OF_COLUMN_TARGET
89
+ drop_missing_values: true
90
+ merge_key: NAME_OF_COLUMN_TO_MERGE
91
+ normalize: true
92
+ target_column: NAME_OF_COLUMN_TARGET
scripts/main.py ADDED
@@ -0,0 +1,163 @@
1
+ import argparse
2
+ import csv
3
+ import os
4
+ from pathlib import Path
5
+ import shutil
6
+ import sys
7
+ from typing import Any, Dict
8
+
9
+ import pandas as pd
10
+ import yaml
11
+
12
# Ensure project root is on sys.path when running as `python scripts/main.py`
# (makes `moosefs` and `scripts` importable without installing the wheel).
_THIS_DIR = Path(__file__).parent
_REPO_ROOT = _THIS_DIR.parent
# Insert at the front so the repo checkout shadows any installed copy.
if str(_REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(_REPO_ROOT))
17
+
18
+ from moosefs import FeatureSelectionPipeline
19
+ from moosefs.core import DataProcessor
20
+ from scripts.utils import (
21
+ read_config,
22
+ validate_experience_config,
23
+ validate_pipeline_config,
24
+ validate_preprocessing_config,
25
+ )
26
+
27
+
28
def load_config(config_path: str) -> Dict[str, Any]:
    """Load configuration parameters from a YAML file.

    Args:
        config_path: Path to the YAML configuration file.

    Returns:
        A dictionary containing configuration parameters.
    """
    # safe_load avoids executing arbitrary YAML tags from the config file.
    with open(config_path, "r") as stream:
        return yaml.safe_load(stream)
41
+
42
+
43
+ def main() -> None:
44
+ """
45
+ Main function to run the feature selection pipeline.
46
+ """
47
+ parser = argparse.ArgumentParser(description="Run the main application script.")
48
+ parser.add_argument(
49
+ "--config",
50
+ nargs="?",
51
+ default=os.path.join(Path(__file__).parent, "config.yml"),
52
+ help="Path to the configuration file. Defaults to the packaged config.yml.",
53
+ )
54
+ parser.add_argument(
55
+ "--data",
56
+ nargs="?",
57
+ default=None,
58
+ help="Path to the processed data file. If not provided, a raw dataset should be defined in the config file.",
59
+ )
60
+ args = parser.parse_args()
61
+ config_path = args.config
62
+ dataset_arg = args.data
63
+
64
+ # Load the configuration
65
+ config = load_config(config_path)
66
+
67
+ # Validate configuration sections
68
+ config_experience, config_preprocessing, config_pipeline = read_config(config)
69
+ validate_pipeline_config(config_pipeline)
70
+ if config_preprocessing:
71
+ validate_preprocessing_config(config_preprocessing)
72
+ if config_experience:
73
+ validate_experience_config(config_experience)
74
+
75
+ # Determine data path if not provided as argument
76
+ if dataset_arg is None:
77
+ data_path = config_experience.get("data_path", {}).get("value", None)
78
+ if data_path is None:
79
+ raise ValueError(
80
+ "No data path provided. Please specify a processed data file or a raw data path in the config."
81
+ )
82
+ else:
83
+ data_path = dataset_arg
84
+
85
+ metadata_path = config_experience.get("metadata_path", {}).get("value", None)
86
+ result_path = config_experience.get("result_path", {}).get("value", "default_results")
87
+ experiment_name = config_experience.get("experiment_name", {}).get("value", "default_experiment")
88
+ experiment_folder = os.path.join(result_path, experiment_name)
89
+ os.makedirs(experiment_folder, exist_ok=True)
90
+
91
+ # Copy the configuration file into the experiment folder for reproducibility.
92
+ shutil.copy(config_path, experiment_folder)
93
+
94
+ # Extract preprocessing parameters
95
+ categorical_columns = config_preprocessing.get("categorical_columns", None)
96
+ columns_to_drop = config_preprocessing.get("columns_to_drop", None)
97
+ drop_missing_values = config_preprocessing.get("drop_missing_values", False)
98
+ merge_key = config_preprocessing.get("merge_key", None)
99
+ normalize = config_preprocessing.get("normalize", True)
100
+ target_column = config_preprocessing.get("target_column", "target")
101
+
102
+ # Process the dataset if a raw data path is provided; otherwise, load processed data.
103
+ if dataset_arg is None:
104
+ data_processor = DataProcessor(
105
+ categorical_columns=categorical_columns,
106
+ columns_to_drop=columns_to_drop,
107
+ drop_missing_values=drop_missing_values,
108
+ merge_key=merge_key,
109
+ normalize=normalize,
110
+ target_column=target_column,
111
+ )
112
+ dataset = data_processor.preprocess_data(data=data_path, metadata=metadata_path)
113
+ else:
114
+ dataset = pd.read_csv(dataset_arg)
115
+
116
+ # Extract pipeline parameters
117
+ fs_methods = config_pipeline["fs_methods"]["value"]
118
+ merging_strategy = config_pipeline["merging_strategy"]["value"]
119
+ num_repeats = config_pipeline["num_repeats"]["value"]
120
+ num_features_to_select = config_pipeline["num_features_to_select"]["value"]
121
+ metrics = config_pipeline["metrics"]["value"]
122
+ task = config_pipeline["task"]["value"]
123
+ random_state = config_pipeline.get("random_state", {}).get("value", None)
124
+ n_jobs = config_pipeline.get("n_jobs", {}).get("value", None)
125
+
126
+ # Run the feature selection pipeline
127
+ pipeline = FeatureSelectionPipeline(
128
+ data=dataset,
129
+ fs_methods=fs_methods,
130
+ merging_strategy=merging_strategy,
131
+ num_repeats=num_repeats,
132
+ num_features_to_select=num_features_to_select,
133
+ metrics=metrics,
134
+ task=task,
135
+ random_state=random_state,
136
+ n_jobs=n_jobs,
137
+ )
138
+ best_features, best_repeat, best_group_name = pipeline.run(verbose=False)
139
+
140
+ # Save results to a text file
141
+ results_txt_path = os.path.join(experiment_folder, "results.txt")
142
+ with open(results_txt_path, "w") as file:
143
+ file.write(f"The best features are {best_features}\n")
144
+ file.write(f"Best repeat value: {best_repeat}\n")
145
+ file.write(f"Best group name: {best_group_name}\n")
146
+
147
+ # Save results to a CSV file
148
+ results = {
149
+ "best_features": best_features,
150
+ "best_repeat": best_repeat,
151
+ "best_group_name": best_group_name,
152
+ }
153
+ csv_file_path = os.path.join(experiment_folder, "results.csv")
154
+ with open(csv_file_path, mode="w", newline="") as file:
155
+ writer = csv.DictWriter(file, fieldnames=results.keys())
156
+ writer.writeheader()
157
+ writer.writerow(results)
158
+
159
+ print(f"Results written to {experiment_folder}")
160
+
161
+
162
+ if __name__ == "__main__":
163
+ main()
scripts/utils.py ADDED
@@ -0,0 +1,186 @@
1
+ from typing import Any, Dict, List, Tuple
2
+
3
+
4
def read_config(config: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any], Dict[str, Any]]:
    """Split the top-level configuration into its three sections.

    Args:
        config: The configuration dictionary.

    Returns:
        A tuple containing the 'experience', 'preprocessing', and 'pipeline'
        sections; optional sections missing from the file come back as empty
        dicts.

    Raises:
        ValueError: If the 'pipeline' section is missing.
    """
    sections = {
        name: config.get(name, {})
        for name in ("experience", "preprocessing", "pipeline")
    }

    # Only 'pipeline' is mandatory; the other two sections are optional.
    if not sections["pipeline"]:
        raise ValueError("The configuration file must contain a 'pipeline' section.")

    return sections["experience"], sections["preprocessing"], sections["pipeline"]
25
+
26
+
27
def validate_pipeline_config(config: Dict[str, Any]) -> None:
    """
    Validate the pipeline configuration.

    Args:
        config: The pipeline configuration dictionary.

    Raises:
        ValueError: If any required pipeline parameter is invalid.
    """
    # Validate fs_methods.
    # Keep this set in sync with config.yml's valid_values and the selectors
    # shipped in moosefs.feature_selectors (lasso_selector and mrmr_selector
    # were previously missing here even though the package provides them).
    fs_methods: List[Any] = config.get("fs_methods", {}).get("value", [])
    valid_fs_methods = {
        "f_statistic_selector",
        "mutual_info_selector",
        "random_forest_selector",
        "svm_selector",
        "xgboost_selector",
        "lasso_selector",
        "mrmr_selector",
        "rfe_rf_selector",
    }
    if not isinstance(fs_methods, list) or len(fs_methods) < 2:
        raise ValueError("`fs_methods` should be a list containing two or more feature selection methods.")
    if any(method not in valid_fs_methods for method in fs_methods):
        raise ValueError(
            f"Invalid feature selection method(s) in `fs_methods`: {fs_methods}. Valid options are: {valid_fs_methods}."
        )

    # Validate merging_strategy (exactly one strategy must be chosen).
    merging_strategy: str = config.get("merging_strategy", {}).get("value", "")
    valid_merging_strategies = {
        "union_of_intersections_merger",
        "borda_merger",
    }
    if merging_strategy not in valid_merging_strategies:
        raise ValueError(
            f"Invalid `merging_strategy`: {merging_strategy}. Choose one from: {valid_merging_strategies}."
        )

    # Validate num_repeats (bounds match config.yml's min_value/max_value).
    num_repeats = config.get("num_repeats", {}).get("value", None)
    if not isinstance(num_repeats, int) or not (2 <= num_repeats <= 10):
        raise ValueError(f"`num_repeats` must be an integer between 2 and 10, inclusive. Got: {num_repeats}.")

    # Validate random_state (None means non-deterministic runs).
    random_state = config.get("random_state", {}).get("value", None)
    if not (isinstance(random_state, int) or random_state is None):
        raise ValueError("`random_state` must be an integer or None.")

    # Validate metrics. Both the short names documented in config.yml
    # (precision/recall) and the *_score spellings are accepted; an empty
    # list is rejected instead of passing vacuously.
    metrics: List[Any] = config.get("metrics", {}).get("value", [])
    valid_metrics = {
        "accuracy",
        "logloss",
        "f1_score",
        "precision",
        "precision_score",
        "recall",
        "recall_score",
        "mse",
        "mae",
        "r2_score",
    }
    if not isinstance(metrics, list) or not metrics or not all(metric in valid_metrics for metric in metrics):
        raise ValueError(
            f"`metrics` should be a non-empty list containing metric names from the following options: {valid_metrics}."
        )

    # Validate task.
    task: str = config.get("task", {}).get("value", "")
    valid_tasks = {"regression", "classification"}
    if task not in valid_tasks:
        raise ValueError(f"Invalid `task`: {task}. Choose either 'regression' or 'classification'.")

    # Validate num_features_to_select (None lets the pipeline decide).
    num_features_to_select = config.get("num_features_to_select", {}).get("value", None)
    if not (isinstance(num_features_to_select, int) or num_features_to_select is None):
        raise ValueError("`num_features_to_select` must be an integer or None.")

    # Validate n_jobs (-1 means "use all cores" in sklearn conventions).
    n_jobs = config.get("n_jobs", {}).get("value", None)
    if not (n_jobs is None or (isinstance(n_jobs, int) and (n_jobs > 0 or n_jobs == -1))):
        raise ValueError("`n_jobs` must be a positive integer, -1, or None.")

    print("Configuration is valid.")
109
+
110
+
111
def validate_preprocessing_config(config: Dict[str, Any]) -> None:
    """
    Validate the preprocessing configuration.

    Args:
        config: The preprocessing configuration dictionary.

    Raises:
        ValueError: If any preprocessing parameter is invalid.
    """

    def _is_str_list(value: Any) -> bool:
        # Column lists must be lists whose every element is a string name.
        return isinstance(value, list) and all(isinstance(item, str) for item in value)

    if not _is_str_list(config.get("categorical_columns", [])):
        raise ValueError("`categorical_columns` should be a list of strings representing column names.")

    if not _is_str_list(config.get("columns_to_drop", [])):
        raise ValueError("`columns_to_drop` should be a list of strings representing column names.")

    if not isinstance(config.get("drop_missing_values", None), bool):
        raise ValueError("`drop_missing_values` must be a boolean (True or False).")

    if not isinstance(config.get("merge_key", ""), str):
        raise ValueError("`merge_key` should be a string representing a column name.")

    if not isinstance(config.get("normalize", None), bool):
        raise ValueError("`normalize` must be a boolean (True or False).")

    if not isinstance(config.get("target_column", ""), str):
        raise ValueError("`target_column` should be a string representing a column name.")

    print("Preprocessing configuration is valid.")
152
+
153
+
154
def validate_experience_config(config: Dict[str, Any]) -> None:
    """
    Validate the experience configuration.

    Args:
        config: The experience configuration dictionary.

    Raises:
        ValueError: If any experience parameter is invalid.
    """

    def _value(key: str, default: Any) -> Any:
        # Each experience entry is a mapping holding its setting under 'value'.
        return config.get(key, {}).get("value", default)

    data_path = _value("data_path", "")
    if not (isinstance(data_path, str) and data_path):
        raise ValueError("`data_path` must be a non-empty string representing the path to the dataset.")

    experiment_name = _value("experiment_name", "")
    if not (isinstance(experiment_name, str) and experiment_name):
        raise ValueError(
            "`experiment_name` must be a non-empty string representing the name for the experiment results."
        )

    # metadata_path is optional: None means no metadata file is merged in.
    metadata_path = _value("metadata_path", "")
    if metadata_path is not None and not isinstance(metadata_path, str):
        raise ValueError("`metadata_path` must be a string representing the path to the metadata file or None.")

    result_path = _value("result_path", "")
    if not (isinstance(result_path, str) and result_path):
        raise ValueError("`result_path` must be a non-empty string representing the directory path to save results.")

    print("Experience configuration is valid.")