moose-fs 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. LICENSE +21 -0
  2. README.md +190 -0
  3. moose_fs-0.1.0.dist-info/METADATA +232 -0
  4. moose_fs-0.1.0.dist-info/RECORD +40 -0
  5. moose_fs-0.1.0.dist-info/WHEEL +4 -0
  6. moose_fs-0.1.0.dist-info/entry_points.txt +2 -0
  7. moose_fs-0.1.0.dist-info/licenses/LICENSE +21 -0
  8. moosefs/__init__.py +6 -0
  9. moosefs/core/__init__.py +6 -0
  10. moosefs/core/data_processor.py +319 -0
  11. moosefs/core/feature.py +44 -0
  12. moosefs/core/novovicova.py +60 -0
  13. moosefs/core/pareto.py +90 -0
  14. moosefs/feature_selection_pipeline.py +548 -0
  15. moosefs/feature_selectors/__init__.py +26 -0
  16. moosefs/feature_selectors/base_selector.py +38 -0
  17. moosefs/feature_selectors/default_variance.py +21 -0
  18. moosefs/feature_selectors/elastic_net_selector.py +75 -0
  19. moosefs/feature_selectors/f_statistic_selector.py +42 -0
  20. moosefs/feature_selectors/lasso_selector.py +46 -0
  21. moosefs/feature_selectors/mrmr_selector.py +57 -0
  22. moosefs/feature_selectors/mutual_info_selector.py +45 -0
  23. moosefs/feature_selectors/random_forest_selector.py +48 -0
  24. moosefs/feature_selectors/svm_selector.py +50 -0
  25. moosefs/feature_selectors/variance_selectors.py +16 -0
  26. moosefs/feature_selectors/xgboost_selector.py +44 -0
  27. moosefs/merging_strategies/__init__.py +17 -0
  28. moosefs/merging_strategies/arithmetic_mean_merger.py +46 -0
  29. moosefs/merging_strategies/base_merger.py +64 -0
  30. moosefs/merging_strategies/borda_merger.py +46 -0
  31. moosefs/merging_strategies/consensus_merger.py +80 -0
  32. moosefs/merging_strategies/l2_norm_merger.py +42 -0
  33. moosefs/merging_strategies/union_of_intersections_merger.py +89 -0
  34. moosefs/metrics/__init__.py +23 -0
  35. moosefs/metrics/performance_metrics.py +239 -0
  36. moosefs/metrics/stability_metrics.py +49 -0
  37. moosefs/utils.py +161 -0
  38. scripts/config.yml +92 -0
  39. scripts/main.py +163 -0
  40. scripts/utils.py +186 -0
scripts/config.yml ADDED
@@ -0,0 +1,92 @@
1
+ experience:
2
+ data_path:
3
+ description: The directory path to the folder containing the dataset.
4
+ value: path/to/dataset/DATASETNAME.csv
5
+ experiment_name:
6
+ description: The name of the file to save the experimental results.
7
+ value: TESTNAME
8
+ metadata_path:
9
+ value: path/to/dataset/METADATANAME.csv
10
+ result_path:
11
+ description: The directory path where the results will be saved.
12
+ value: /path/to/results
13
+ pipeline:
14
+ fs_methods:
15
+ description: List of feature selection methods to use. You can choose two or more
16
+ methods.
17
+ valid_values:
18
+ - f_statistic_selector
19
+ - mutual_info_selector
20
+ - random_forest_selector
21
+ - svm_selector
22
+ - xgboost_selector
23
+ - lasso_selector
24
+ - mrmr_selector
25
+ value:
26
+ - f_statistic_selector
27
+ - mutual_info_selector
28
+ - random_forest_selector
29
+ - svm_selector
30
+ - xgboost_selector
31
+ merging_strategy:
32
+ description: The strategy for merging feature selection results. Choose only one
33
+ merging strategy.
34
+ valid_values:
35
+ - union_of_intersections_merger
36
+ - borda_merger
37
+ value: union_of_intersections_merger
38
+ num_repeats:
39
+ description: The number of repeats (train/test folds) for the pipeline; it also sets the test size to 1/num_repeats of the dataset.
40
+ max_value: 10
41
+ min_value: 2
42
+ value: 5
43
+ random_state:
44
+ description: Seed used to make the feature selection methods deterministic
45
+ valid_values:
46
+ - int
47
+ - None
48
+ value: 2024
49
+ metrics:
50
+ description: List of metrics to use for evaluation, need a list of exactly 3 metrics
51
+ valid_values:
52
+ - accuracy
53
+ - f1_score
54
+ - logloss
55
+ - precision
56
+ - recall
57
+ - mae
58
+ - mse
59
+ - r2_score
60
+ value:
61
+ - f1_score
62
+ - accuracy
63
+ - logloss
64
+ task:
65
+ description: ML task, either regression or classification
66
+ valid_values:
67
+ - regression
68
+ - classification
69
+ value: classification
70
+ num_features_to_select:
71
+ description: Number of features to select
72
+ valid_values:
73
+ - int
74
+ value: 100
75
+ n_jobs:
76
+ description: to define number of cores for sklearn module
77
+ valid_values:
78
+ - positive int or -1
79
+ - None
80
+ value: 1
81
+ preprocessing:
82
+ categorical_columns:
83
+ - NAME_OF_CATEGORICAL_COLUMN_1
84
+ - NAME_OF_CATEGORICAL_COLUMN_2
85
+ - NAME_OF_CATEGORICAL_COLUMN_TARGET
86
+ columns_to_drop:
87
+ - NAME_OF_COLUMN_TO_DROP_1
88
+ - NAME_OF_COLUMN_TARGET
89
+ drop_missing_values: true
90
+ merge_key: NAME_OF_COLUMN_TO_MERGE
91
+ normalize: true
92
+ target_column: NAME_OF_COLUMN_TARGET
scripts/main.py ADDED
@@ -0,0 +1,163 @@
1
+ import argparse
2
+ import csv
3
+ import os
4
+ from pathlib import Path
5
+ import shutil
6
+ import sys
7
+ from typing import Any, Dict
8
+
9
+ import pandas as pd
10
+ import yaml
11
+
12
# Ensure project root is on sys.path when running as `python scripts/main.py`
# (makes `moosefs` and `scripts` importable without installing the wheel).
_THIS_DIR = Path(__file__).parent
_REPO_ROOT = _THIS_DIR.parent
# Insert at the front so the repo checkout shadows any installed copy.
if str(_REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(_REPO_ROOT))
17
+
18
+ from moosefs import FeatureSelectionPipeline
19
+ from moosefs.core import DataProcessor
20
+ from scripts.utils import (
21
+ read_config,
22
+ validate_experience_config,
23
+ validate_pipeline_config,
24
+ validate_preprocessing_config,
25
+ )
26
+
27
+
28
def load_config(config_path: str) -> Dict[str, Any]:
    """Load configuration parameters from a YAML file.

    Args:
        config_path: Path to the YAML configuration file.

    Returns:
        A dictionary containing configuration parameters.
    """
    # safe_load avoids executing arbitrary YAML tags from the config file.
    with open(config_path, "r") as stream:
        return yaml.safe_load(stream)
41
+
42
+
43
+ def main() -> None:
44
+ """
45
+ Main function to run the feature selection pipeline.
46
+ """
47
+ parser = argparse.ArgumentParser(description="Run the main application script.")
48
+ parser.add_argument(
49
+ "--config",
50
+ nargs="?",
51
+ default=os.path.join(Path(__file__).parent, "config.yml"),
52
+ help="Path to the configuration file. Defaults to the packaged config.yml.",
53
+ )
54
+ parser.add_argument(
55
+ "--data",
56
+ nargs="?",
57
+ default=None,
58
+ help="Path to the processed data file. If not provided, a raw dataset should be defined in the config file.",
59
+ )
60
+ args = parser.parse_args()
61
+ config_path = args.config
62
+ dataset_arg = args.data
63
+
64
+ # Load the configuration
65
+ config = load_config(config_path)
66
+
67
+ # Validate configuration sections
68
+ config_experience, config_preprocessing, config_pipeline = read_config(config)
69
+ validate_pipeline_config(config_pipeline)
70
+ if config_preprocessing:
71
+ validate_preprocessing_config(config_preprocessing)
72
+ if config_experience:
73
+ validate_experience_config(config_experience)
74
+
75
+ # Determine data path if not provided as argument
76
+ if dataset_arg is None:
77
+ data_path = config_experience.get("data_path", {}).get("value", None)
78
+ if data_path is None:
79
+ raise ValueError(
80
+ "No data path provided. Please specify a processed data file or a raw data path in the config."
81
+ )
82
+ else:
83
+ data_path = dataset_arg
84
+
85
+ metadata_path = config_experience.get("metadata_path", {}).get("value", None)
86
+ result_path = config_experience.get("result_path", {}).get("value", "default_results")
87
+ experiment_name = config_experience.get("experiment_name", {}).get("value", "default_experiment")
88
+ experiment_folder = os.path.join(result_path, experiment_name)
89
+ os.makedirs(experiment_folder, exist_ok=True)
90
+
91
+ # Copy the configuration file into the experiment folder for reproducibility.
92
+ shutil.copy(config_path, experiment_folder)
93
+
94
+ # Extract preprocessing parameters
95
+ categorical_columns = config_preprocessing.get("categorical_columns", None)
96
+ columns_to_drop = config_preprocessing.get("columns_to_drop", None)
97
+ drop_missing_values = config_preprocessing.get("drop_missing_values", False)
98
+ merge_key = config_preprocessing.get("merge_key", None)
99
+ normalize = config_preprocessing.get("normalize", True)
100
+ target_column = config_preprocessing.get("target_column", "target")
101
+
102
+ # Process the dataset if a raw data path is provided; otherwise, load processed data.
103
+ if dataset_arg is None:
104
+ data_processor = DataProcessor(
105
+ categorical_columns=categorical_columns,
106
+ columns_to_drop=columns_to_drop,
107
+ drop_missing_values=drop_missing_values,
108
+ merge_key=merge_key,
109
+ normalize=normalize,
110
+ target_column=target_column,
111
+ )
112
+ dataset = data_processor.preprocess_data(data=data_path, metadata=metadata_path)
113
+ else:
114
+ dataset = pd.read_csv(dataset_arg)
115
+
116
+ # Extract pipeline parameters
117
+ fs_methods = config_pipeline["fs_methods"]["value"]
118
+ merging_strategy = config_pipeline["merging_strategy"]["value"]
119
+ num_repeats = config_pipeline["num_repeats"]["value"]
120
+ num_features_to_select = config_pipeline["num_features_to_select"]["value"]
121
+ metrics = config_pipeline["metrics"]["value"]
122
+ task = config_pipeline["task"]["value"]
123
+ random_state = config_pipeline.get("random_state", {}).get("value", None)
124
+ n_jobs = config_pipeline.get("n_jobs", {}).get("value", None)
125
+
126
+ # Run the feature selection pipeline
127
+ pipeline = FeatureSelectionPipeline(
128
+ data=dataset,
129
+ fs_methods=fs_methods,
130
+ merging_strategy=merging_strategy,
131
+ num_repeats=num_repeats,
132
+ num_features_to_select=num_features_to_select,
133
+ metrics=metrics,
134
+ task=task,
135
+ random_state=random_state,
136
+ n_jobs=n_jobs,
137
+ )
138
+ best_features, best_repeat, best_group_name = pipeline.run(verbose=False)
139
+
140
+ # Save results to a text file
141
+ results_txt_path = os.path.join(experiment_folder, "results.txt")
142
+ with open(results_txt_path, "w") as file:
143
+ file.write(f"The best features are {best_features}\n")
144
+ file.write(f"Best repeat value: {best_repeat}\n")
145
+ file.write(f"Best group name: {best_group_name}\n")
146
+
147
+ # Save results to a CSV file
148
+ results = {
149
+ "best_features": best_features,
150
+ "best_repeat": best_repeat,
151
+ "best_group_name": best_group_name,
152
+ }
153
+ csv_file_path = os.path.join(experiment_folder, "results.csv")
154
+ with open(csv_file_path, mode="w", newline="") as file:
155
+ writer = csv.DictWriter(file, fieldnames=results.keys())
156
+ writer.writeheader()
157
+ writer.writerow(results)
158
+
159
+ print(f"Results written to {experiment_folder}")
160
+
161
+
162
+ if __name__ == "__main__":
163
+ main()
scripts/utils.py ADDED
@@ -0,0 +1,186 @@
1
+ from typing import Any, Dict, List, Tuple
2
+
3
+
4
def read_config(config: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any], Dict[str, Any]]:
    """Split the top-level configuration into its three sections.

    Args:
        config: The configuration dictionary.

    Returns:
        A tuple containing the 'experience', 'preprocessing', and 'pipeline'
        sections; optional sections missing from the file come back as empty
        dicts.

    Raises:
        ValueError: If the 'pipeline' section is missing.
    """
    sections = {
        name: config.get(name, {})
        for name in ("experience", "preprocessing", "pipeline")
    }

    # Only 'pipeline' is mandatory; the other two sections are optional.
    if not sections["pipeline"]:
        raise ValueError("The configuration file must contain a 'pipeline' section.")

    return sections["experience"], sections["preprocessing"], sections["pipeline"]
25
+
26
+
27
def validate_pipeline_config(config: Dict[str, Any]) -> None:
    """
    Validate the pipeline configuration.

    Args:
        config: The pipeline configuration dictionary.

    Raises:
        ValueError: If any required pipeline parameter is invalid.
    """
    # Validate fs_methods.
    # Keep this set in sync with config.yml's valid_values and the selectors
    # shipped in moosefs.feature_selectors (lasso_selector and mrmr_selector
    # were previously missing here even though the package provides them).
    fs_methods: List[Any] = config.get("fs_methods", {}).get("value", [])
    valid_fs_methods = {
        "f_statistic_selector",
        "mutual_info_selector",
        "random_forest_selector",
        "svm_selector",
        "xgboost_selector",
        "lasso_selector",
        "mrmr_selector",
        "rfe_rf_selector",
    }
    if not isinstance(fs_methods, list) or len(fs_methods) < 2:
        raise ValueError("`fs_methods` should be a list containing two or more feature selection methods.")
    if any(method not in valid_fs_methods for method in fs_methods):
        raise ValueError(
            f"Invalid feature selection method(s) in `fs_methods`: {fs_methods}. Valid options are: {valid_fs_methods}."
        )

    # Validate merging_strategy (exactly one strategy must be chosen).
    merging_strategy: str = config.get("merging_strategy", {}).get("value", "")
    valid_merging_strategies = {
        "union_of_intersections_merger",
        "borda_merger",
    }
    if merging_strategy not in valid_merging_strategies:
        raise ValueError(
            f"Invalid `merging_strategy`: {merging_strategy}. Choose one from: {valid_merging_strategies}."
        )

    # Validate num_repeats (bounds match config.yml's min_value/max_value).
    num_repeats = config.get("num_repeats", {}).get("value", None)
    if not isinstance(num_repeats, int) or not (2 <= num_repeats <= 10):
        raise ValueError(f"`num_repeats` must be an integer between 2 and 10, inclusive. Got: {num_repeats}.")

    # Validate random_state (None means non-deterministic runs).
    random_state = config.get("random_state", {}).get("value", None)
    if not (isinstance(random_state, int) or random_state is None):
        raise ValueError("`random_state` must be an integer or None.")

    # Validate metrics. Both the short names documented in config.yml
    # (precision/recall) and the *_score spellings are accepted; an empty
    # list is rejected instead of passing vacuously.
    metrics: List[Any] = config.get("metrics", {}).get("value", [])
    valid_metrics = {
        "accuracy",
        "logloss",
        "f1_score",
        "precision",
        "precision_score",
        "recall",
        "recall_score",
        "mse",
        "mae",
        "r2_score",
    }
    if not isinstance(metrics, list) or not metrics or not all(metric in valid_metrics for metric in metrics):
        raise ValueError(
            f"`metrics` should be a non-empty list containing metric names from the following options: {valid_metrics}."
        )

    # Validate task.
    task: str = config.get("task", {}).get("value", "")
    valid_tasks = {"regression", "classification"}
    if task not in valid_tasks:
        raise ValueError(f"Invalid `task`: {task}. Choose either 'regression' or 'classification'.")

    # Validate num_features_to_select (None lets the pipeline decide).
    num_features_to_select = config.get("num_features_to_select", {}).get("value", None)
    if not (isinstance(num_features_to_select, int) or num_features_to_select is None):
        raise ValueError("`num_features_to_select` must be an integer or None.")

    # Validate n_jobs (-1 means "use all cores" in sklearn conventions).
    n_jobs = config.get("n_jobs", {}).get("value", None)
    if not (n_jobs is None or (isinstance(n_jobs, int) and (n_jobs > 0 or n_jobs == -1))):
        raise ValueError("`n_jobs` must be a positive integer, -1, or None.")

    print("Configuration is valid.")
109
+
110
+
111
def validate_preprocessing_config(config: Dict[str, Any]) -> None:
    """
    Validate the preprocessing configuration.

    Args:
        config: The preprocessing configuration dictionary.

    Raises:
        ValueError: If any preprocessing parameter is invalid.
    """

    def _is_str_list(value: Any) -> bool:
        # Column lists must be lists whose every element is a string name.
        return isinstance(value, list) and all(isinstance(item, str) for item in value)

    if not _is_str_list(config.get("categorical_columns", [])):
        raise ValueError("`categorical_columns` should be a list of strings representing column names.")

    if not _is_str_list(config.get("columns_to_drop", [])):
        raise ValueError("`columns_to_drop` should be a list of strings representing column names.")

    if not isinstance(config.get("drop_missing_values", None), bool):
        raise ValueError("`drop_missing_values` must be a boolean (True or False).")

    if not isinstance(config.get("merge_key", ""), str):
        raise ValueError("`merge_key` should be a string representing a column name.")

    if not isinstance(config.get("normalize", None), bool):
        raise ValueError("`normalize` must be a boolean (True or False).")

    if not isinstance(config.get("target_column", ""), str):
        raise ValueError("`target_column` should be a string representing a column name.")

    print("Preprocessing configuration is valid.")
152
+
153
+
154
def validate_experience_config(config: Dict[str, Any]) -> None:
    """
    Validate the experience configuration.

    Args:
        config: The experience configuration dictionary.

    Raises:
        ValueError: If any experience parameter is invalid.
    """

    def _value(key: str, default: Any) -> Any:
        # Each experience entry is a mapping holding its setting under 'value'.
        return config.get(key, {}).get("value", default)

    data_path = _value("data_path", "")
    if not (isinstance(data_path, str) and data_path):
        raise ValueError("`data_path` must be a non-empty string representing the path to the dataset.")

    experiment_name = _value("experiment_name", "")
    if not (isinstance(experiment_name, str) and experiment_name):
        raise ValueError(
            "`experiment_name` must be a non-empty string representing the name for the experiment results."
        )

    # metadata_path is optional: None means no metadata file is merged in.
    metadata_path = _value("metadata_path", "")
    if metadata_path is not None and not isinstance(metadata_path, str):
        raise ValueError("`metadata_path` must be a string representing the path to the metadata file or None.")

    result_path = _value("result_path", "")
    if not (isinstance(result_path, str) and result_path):
        raise ValueError("`result_path` must be a non-empty string representing the directory path to save results.")

    print("Experience configuration is valid.")