PyPI - pythonflex - Versions diffs - 0.3.4__py3-none-any.whl → 0.4__py3-none-any.whl - Mend

pythonflex 0.3.4 (py3-none-any.whl) → 0.4 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20)

pythonflex/__init__.py +28 -4
pythonflex/analysis.py +287 -579
pythonflex/examples/basic_usage.py +38 -30
pythonflex/examples/manuscript.py +37 -43
pythonflex/examples/runtime/runtime_benchmark.py +218 -0
pythonflex/examples/runtime/runtime_benchmark_10_runs_memmap.py +534 -0
pythonflex/examples/runtime/runtime_benchmark_corum_njobs.py +245 -0
pythonflex/examples/runtime/runtime_benchmark_gobp_njobs_chunks.py +319 -0
pythonflex/examples/runtime/runtime_benchmark_gobp_optimization.py +417 -0
pythonflex/examples/runtime/runtime_benchmark_repeated.py +347 -0
pythonflex/old_functions.py +422 -0
pythonflex/plotting.py +655 -242
pythonflex/preprocessing.py +54 -216
pythonflex/utils.py +36 -9
{pythonflex-0.3.4.dist-info → pythonflex-0.4.dist-info}/METADATA +8 -6
pythonflex-0.4.dist-info/RECORD +32 -0
{pythonflex-0.3.4.dist-info → pythonflex-0.4.dist-info}/WHEEL +1 -1
pythonflex-0.4.dist-info/licenses/LICENSE +7 -0
pythonflex-0.3.4.dist-info/RECORD +0 -24
{pythonflex-0.3.4.dist-info → pythonflex-0.4.dist-info}/entry_points.txt +0 -0

pythonflex/analysis.py CHANGED

@@ -11,7 +11,6 @@ from pathlib import Path
 from art import tprint
 from bitarray import bitarray
 from joblib import Parallel, delayed, dump, load
-import matplotlib.pyplot as plt
 from numba import njit, prange
 import numpy as np
 import pandas as pd
@@ -20,8 +19,8 @@ from tqdm import tqdm
 # Local/application-specific imports
 from .logging_config import log
-from .preprocessing import filter_matrix_by_genes
-from .utils import dsave, dload, _sanitize
+from .preprocessing import filter_matrix_by_genes, filter_duplicate_terms
+from .utils import dsave, dload, _sanitize, normalize_analysis_genes
 import matplotlib as mpl
@@ -36,6 +35,8 @@ def deep_update(source, overrides):
 def initialize(config={}):
+    user_overrides = config if isinstance(config, dict) else {}
     default_config = {
         "min_genes_in_complex": 3,
         "min_genes_per_complex_analysis": 2,
@@ -43,8 +44,10 @@ def initialize(config={}):
         "gold_standard": "CORUM",
         "color_map": "RdYlBu",
         "jaccard": True,
-        "jaccard_threshold": 1.0,
-        "use_common_genes": True,
+        # Which genes are used for analysis (drives used_genes intersection)
+        # - 'shared'           : use genes common to all datasets (common_genes)
+        # - 'dataset_specific' : use genes present in each dataset individually
+        "analysis_genes": "shared",
         "plotting": {
             "save_plot": True,
             "show_plot": True,
@@ -56,6 +59,11 @@ def initialize(config={}):
             "drop_na": False,
         },
         "corr_function": "numpy",
+        "per_complex": {
+            "n_jobs": 4,
+            "chunk_size": 200,
+            "max_nbytes": "100M",
+        },
         "logging": {  # Added: Default logging config
             "visible_levels": ["DONE"]  # if needed #, "PROGRESS", "STARTED", "INFO"
         }
@@ -66,6 +74,26 @@ def initialize(config={}):
         config = deep_update(default_config, config)
     else:
         config = default_config
+    # Backward compatibility: if user provided legacy key but not the new one,
+    # map it to analysis_genes. (We must look at the original overrides, because
+    # defaults always include analysis_genes.)
+    analysis_genes_provided = (
+        isinstance(user_overrides, dict)
+        and "analysis_genes" in user_overrides
+        and user_overrides.get("analysis_genes") is not None
+        and str(user_overrides.get("analysis_genes")).strip() != ""
+    )
+    if (
+        isinstance(user_overrides, dict)
+        and "use_common_genes" in user_overrides
+        and not analysis_genes_provided
+    ):
+        config["analysis_genes"] = (
+            "shared" if bool(user_overrides.get("use_common_genes")) else "dataset_specific"
+        )
+    config["analysis_genes"] = normalize_analysis_genes(config.get("analysis_genes"))
     # Extract visible_levels from the merged config and set logging visibility immediately (before any logs)
     visible_levels = config.get("logging", {}).get("visible_levels", ["DONE"])
@@ -111,12 +139,15 @@ def update_matploblib_config(config=None, font_family="Arial", layout="single"):
     if config is None:
         config = {}
     # Fallback if chosen font missing
+    requested_font_family = font_family
     try:
         from matplotlib.font_manager import findfont, FontProperties
         findfont(FontProperties(family=font_family))
     except Exception:
         font_family = "Helvetica"  # Nature prefers Helvetica if Arial unavailable
-        print(f"Warning: '{font_family}' not found, falling back to 'Helvetica'.")
+        log.warning(
+            f"Font '{requested_font_family}' not found; falling back to '{font_family}'."
+        )
     # Figure size presets (Nature: single ≈ 89 mm, double ≈ 183 mm at 25.4 mm/inch)
     if isinstance(layout, tuple):
@@ -191,50 +222,114 @@ def update_matploblib_config(config=None, font_family="Arial", layout="single"):
         "svg.fonttype": "none",
     })
-def pra(dataset_name, matrix, is_corr=False):
-    log.info(f"******************** {dataset_name} ********************")
-    log.started(f"** Global Precision-Recall Analysis - {dataset_name} **")
+def _sort_ascending_for_dataset(dataset_name):
+    sorting = dload("input", "sorting")
+    if not isinstance(sorting, dict):
+        return False
+    sort_order = str(sorting.get(dataset_name, "high")).strip().lower()
+    return sort_order == "low"
+def prepare_terms_for_dataset(dataset_name, matrix):
+    """Prepare dataset-specific gold-standard terms and filtered matrix.
+        This computes:
+            - terms['used_genes'] as the intersection of terms['all_genes'] with either
+                shared genes (config['analysis_genes']=='shared') or the dataset genes
+                (config['analysis_genes']=='dataset_specific').
+      - genes_present_in_terms_<dataset_name>
+    Side effects:
+      - stores dataset-specific terms and genes list under:
+        dsave(..., 'common', f'terms_{dataset_name}')
+        dsave(..., 'common', f'genes_present_in_terms_{dataset_name}')
+    Returns:
+      (terms, genes_present, matrix_filtered)
+    """
     config = dload("config")
-    use_common_genes = config.get("use_common_genes", True)
+    if config is None:
+        raise RuntimeError(
+            "prepare_terms_for_dataset(): config not found. Run initialize() first."
+        )
     terms_data = dload("common", "terms")
     if terms_data is None or not isinstance(terms_data, pd.DataFrame):
-        raise ValueError("Expected 'terms' to be a DataFrame, but got None or invalid type.")
+        raise ValueError(
+            "prepare_terms_for_dataset(): expected 'terms' to be a DataFrame, but got None or invalid type. "
+            "Make sure to run load_gold_standard() first."
+        )
     terms = terms_data.copy()
-    sorting = dload("input", "sorting")
-    sort_order = sorting.get(dataset_name, "high")
-    if not is_corr:
-        matrix = perform_corr(matrix, config.get("corr_function"))
-    # Apply per-dataset gene filtering based on use_common_genes setting
-    if use_common_genes:
-        # Use common genes approach (current behavior)
-        common_genes = dload("common", "common_genes")
-        if not common_genes:
-            raise ValueError("Common genes not found.")
-        common_genes_set = set(common_genes)
-        terms["used_genes"] = terms["all_genes"].apply(lambda x: list(set(x) & common_genes_set))
-        log.info(f"Using common genes approach: {len(common_genes)} genes")
-    else:
-        # Use per-dataset approach (new behavior)
-        dataset_genes_set = set(matrix.index)
-        terms["used_genes"] = terms["all_genes"].apply(lambda x: list(set(x) & dataset_genes_set))
-        log.info(f"Using per-dataset approach for {dataset_name}: {len(dataset_genes_set)} genes in dataset")
+    analysis_genes = normalize_analysis_genes(config.get("analysis_genes"))
+    if analysis_genes == "shared":
+        common_genes = dload("common", "common_genes")
+        if common_genes is None:
+            raise ValueError(
+                "prepare_terms_for_dataset(): common genes not found. "
+                "Run get_common_genes() or set analysis_genes='dataset_specific'."
+            )
+        common_genes_list = list(common_genes)
+        if len(common_genes_list) == 0:
+            raise ValueError(
+                "prepare_terms_for_dataset(): common genes is empty. "
+                "Run get_common_genes() or set analysis_genes='dataset_specific'."
+            )
+        gene_universe = set(common_genes_list)
+        log.info(f"Using shared genes approach: {len(gene_universe)} genes")
+    else:
+        gene_universe = set(matrix.index)
+        log.info(
+            f"Using dataset-specific approach for {dataset_name}: {len(gene_universe)} genes in dataset"
+        )
+    terms["used_genes"] = terms["all_genes"].apply(
+        lambda genes: list(set(genes) & gene_universe)
+    )
-    # Filter terms by minimum genes after dataset-specific filtering
+    min_genes_raw = config.get("min_genes_in_complex", 3)
+    min_genes = int(min_genes_raw) if min_genes_raw is not None else 3
     terms["n_used_genes"] = terms["used_genes"].apply(len)
-    terms = terms[terms["n_used_genes"] >= config['min_genes_in_complex']]
-    # Get genes present in terms for this specific dataset
-    genes_present = list(set([gene for genes_list in terms["used_genes"] for gene in genes_list]))
+    terms = terms[terms["n_used_genes"] >= min_genes]
+    if bool(config.get("jaccard", False)):
+        before = len(terms)
+        terms = filter_duplicate_terms(terms)
+        log.done(
+            f"After Jaccard duplicate used_genes filtering for {dataset_name}: "
+            f"{len(terms)} terms ({before - len(terms)} removed)"
+        )
+    genes_present = list(
+        set(gene for genes_list in terms["used_genes"] for gene in genes_list)
+    )
     log.info(f"Genes present in terms for {dataset_name}: {len(genes_present)}")
-    matrix = filter_matrix_by_genes(matrix, genes_present)
+    matrix_filtered = filter_matrix_by_genes(matrix, genes_present)
+    dsave(terms, "common", f"terms_{dataset_name}")
+    dsave(genes_present, "common", f"genes_present_in_terms_{dataset_name}")
+    return terms, genes_present, matrix_filtered
+def pra(dataset_name, matrix, is_corr=False):
+    log.info(f"******************** {dataset_name} ********************")
+    log.started(f"** Global Precision-Recall Analysis - {dataset_name} **")
+    config = dload("config")
+    ascending = _sort_ascending_for_dataset(dataset_name)
+    if not is_corr:
+        matrix = perform_corr(matrix, config.get("corr_function"))
+    terms, _genes_present, matrix = prepare_terms_for_dataset(dataset_name, matrix)
     log.info(f"Matrix shape: {matrix.shape}")
     df = binary(matrix)
     log.info(f"Pair-wise shape: {df.shape}")
-    df = quick_sort(df, ascending=(sort_order == "low"))
+    df = quick_sort(df, ascending=ascending)
     log.started("Building gene-to-pair indices")
     gold_pair_to_complex = _build_gold_pair_to_complex(terms)
@@ -252,23 +347,22 @@ def pra(dataset_name, matrix, is_corr=False):
     if df["prediction"].sum() == 0:
         log.info("No true positives found in dataset.")
         pr_auc = np.nan
+        df["tp"] = 0
+        df["precision"] = np.nan
+        df["recall"] = np.nan
     else:
         tp = df["prediction"].cumsum()
         df["tp"] = tp
         precision = tp / (np.arange(len(df)) + 1)
         recall = tp / tp.iloc[-1]
-        pr_auc = metrics.auc(recall, precision)
         df["precision"] = precision
         df["recall"] = recall
+        pr_auc = metrics.auc(recall, precision) if len(recall) >= 2 else np.nan
     log.info(f"PR-AUC: {pr_auc:.4f}, Number of true positives: {df['prediction'].sum()}")
     dsave(df, "pra", dataset_name)
     dsave(pr_auc, "pr_auc", dataset_name)
-    dsave( _corrected_auc(df) , "corrected_pr_auc", dataset_name)
-    # Save dataset-specific terms for per-complex analysis
-    dsave(terms, "common", f"terms_{dataset_name}")
-    dsave(genes_present, "common", f"genes_present_in_terms_{dataset_name}")
+    dsave(_corrected_auc(df), "corrected_pr_auc", dataset_name)
     log.done(f"Global PRA completed for {dataset_name}")
     return df
@@ -278,7 +372,12 @@ def pra(dataset_name, matrix, is_corr=False):
 # --------------------------------------------------------------------------
 def _corrected_auc(df: pd.DataFrame) -> float:
-    return np.trapz(df["precision"], df["recall"]) - df["precision"].iloc[-1]
+    if df.empty or "precision" not in df.columns or "recall" not in df.columns:
+        return np.nan
+    valid = df[["precision", "recall"]].replace([np.inf, -np.inf], np.nan).dropna()
+    if len(valid) < 2:
+        return np.nan
+    return np.trapz(valid["precision"], valid["recall"]) - valid["precision"].iloc[-1]
 def _build_gene_to_pair_indices(pairwise_df):
     indices = pairwise_df.index.values
@@ -395,28 +494,52 @@ def _process_chunk(chunk_terms, min_genes, memmap_path, gene_to_pair_indices):
         # Return error info for debugging
         return {'error': str(e), 'chunk_size': len(chunk_terms)}
-def pra_percomplex(dataset_name, matrix, is_corr=False, chunk_size=200):
+def pra_percomplex(dataset_name, matrix, is_corr=False, chunk_size=None, n_jobs=None):
     log.started(f"*** Per-complex PRA started - {dataset_name} ***")
     config = dload("config")
-    # Use dataset-specific terms and genes from pra function
-    terms = dload("common", f"terms_{dataset_name}")
-    genes_present = dload("common", f"genes_present_in_terms_{dataset_name}")
-    if terms is None:
-        log.warning(f"No dataset-specific terms found for {dataset_name}, using global terms")
-        terms = dload("common", "terms")
-        genes_present = dload("common", "genes_present_in_terms")
-    sorting = dload("input", "sorting")
-    sort_order = sorting.get(dataset_name, "high")
+    ascending = _sort_ascending_for_dataset(dataset_name)
+    per_complex_config = config.get("per_complex", {})
+    if not isinstance(per_complex_config, dict):
+        per_complex_config = {}
+    chunk_size_value = (
+        chunk_size if chunk_size is not None else per_complex_config.get("chunk_size", 200)
+    )
+    n_jobs_value = n_jobs if n_jobs is not None else per_complex_config.get("n_jobs", 4)
+    max_nbytes = per_complex_config.get("max_nbytes", "100M")
+    try:
+        effective_chunk_size = int(chunk_size_value)
+        effective_n_jobs = int(n_jobs_value)
+    except (TypeError, ValueError) as exc:
+        raise ValueError(
+            "per-complex chunk_size and n_jobs must be integer-compatible values."
+        ) from exc
+    if effective_chunk_size <= 0:
+        raise ValueError("per-complex chunk_size must be greater than 0.")
+    if effective_n_jobs <= 0:
+        raise ValueError("per-complex n_jobs must be greater than 0.")
     if not is_corr:
         matrix = perform_corr(matrix, config.get("corr_function"))
-    matrix = filter_matrix_by_genes(matrix, genes_present)
+    # Prefer terms prepared by pra(); if absent, prepare them here so direct
+    # pra_percomplex() calls use the same dataset-specific gene universe.
+    terms = dload("common", f"terms_{dataset_name}")
+    genes_present = dload("common", f"genes_present_in_terms_{dataset_name}")
+    if not isinstance(terms, pd.DataFrame) or genes_present is None:
+        log.warning(
+            f"No dataset-specific terms found for {dataset_name}; preparing them now."
+        )
+        terms, genes_present, matrix = prepare_terms_for_dataset(dataset_name, matrix)
+    else:
+        matrix = filter_matrix_by_genes(matrix, genes_present)
     log.info(f"Matrix shape: {matrix.shape}")
     df = binary(matrix)
     log.info(f"Pair-wise shape: {df.shape}")
-    df = quick_sort(df, ascending=(sort_order == "low"))
+    df = quick_sort(df, ascending=ascending)
     pairwise_df = df.copy()
     pairwise_df['gene1'] = pairwise_df['gene1'].astype("category")
     pairwise_df['gene2'] = pairwise_df['gene2'].astype("category")
@@ -434,26 +557,42 @@ def pra_percomplex(dataset_name, matrix, is_corr=False, chunk_size=200):
     pairwise_df = _precompute_complex_ids(pairwise_df, gold_pair_to_complex)
     log.done("Precomputing complex IDs")  #
+    chunks = [
+        terms.iloc[i:i + effective_chunk_size]
+        for i in range(0, len(terms), effective_chunk_size)
+    ]
+    min_genes = config["min_genes_per_complex_analysis"]
+    if not chunks:
+        terms["auc_score"] = pd.Series(dtype=float)
+        terms["corrected_auc_score"] = pd.Series(dtype=float)
+        dsave(terms, "pra_percomplex", dataset_name)
+        log.done("Per-complex PRA completed with no eligible terms.")
+        return terms
     log.info('Dumping pairwise_df to memmap')
     memmap_path = _dump_pairwise_memmap(pairwise_df, dataset_name)
     log.done('Dumping pairwise_df to memmap')
-    # choose smaller chunks now that pickling cost is gone
-    chunks = [terms.iloc[i:i+chunk_size] for i in range(0, len(terms), chunk_size)]
-    min_genes = config["min_genes_per_complex_analysis"]
     # Initialize results variable
     results = None
     try:
         # Compatible parallel execution for older joblib versions
         log.started("Processing chunks in parallel")
+        actual_n_jobs = min(effective_n_jobs, len(chunks))
+        log.info(
+            "Per-complex parallel config: "
+            f"n_jobs={actual_n_jobs}, requested_n_jobs={effective_n_jobs}, "
+            f"chunk_size={effective_chunk_size}, chunks={len(chunks)}, "
+            f"max_nbytes={max_nbytes}"
+        )
         # Use a more conservative approach with older joblib
         results = Parallel(
-            n_jobs=min(4, len(chunks)),  # Limit to 4 workers or number of chunks
+            n_jobs=actual_n_jobs,
             temp_folder=os.path.dirname(memmap_path),
-            max_nbytes='100M',  # Set memory limit
+            max_nbytes=max_nbytes,
             verbose=1  # Show progress
         )(delayed(_process_chunk)(chunk, min_genes, memmap_path, gene_to_pair_indices)
           for chunk in tqdm(chunks, desc="Per-complex PRA"))
@@ -484,11 +623,13 @@ def pra_percomplex(dataset_name, matrix, is_corr=False, chunk_size=200):
     # Merge results with enhanced error handling
     auc_scores = {}
     corrected_auc_scores = {}
+    errors = []
     if results:
         for i, res in enumerate(results):
             if isinstance(res, dict):
                 if 'error' in res:
                     log.error(f"Error in chunk {i}: {res['error']}")
+                    errors.append(f"chunk {i}: {res['error']}")
                 elif 'auc' in res and 'corrected_auc' in res:
                     # New format with both AUC types
                     auc_scores.update(res['auc'])
@@ -498,8 +639,15 @@ def pra_percomplex(dataset_name, matrix, is_corr=False, chunk_size=200):
                     auc_scores.update(res)
             elif isinstance(res, tuple) and len(res) >= 2 and res[0] is None:
                 log.error(f"Chunk {i} error: {res[1]}")
+                errors.append(f"chunk {i}: {res[1]}")
             else:
                 log.warning(f"Unexpected result type from chunk {i}: {type(res)} - {res}")
+                errors.append(f"chunk {i}: unexpected result type {type(res)}")
+    if errors:
+        preview = "; ".join(errors[:3])
+        extra = f" ({len(errors) - 3} more)" if len(errors) > 3 else ""
+        raise RuntimeError(f"Per-complex PRA failed in worker chunks: {preview}{extra}")
     # Add the computed AUC scores to the terms DataFrame.
     terms["auc_score"] = pd.Series(auc_scores)
@@ -511,10 +659,20 @@ def pra_percomplex(dataset_name, matrix, is_corr=False, chunk_size=200):
 def complex_contributions(name):
     log.info(f"Computing complex contributions (Greedy) for dataset: {name}")
     pra = dload("pra", name)
-    terms = dload("common", "terms")
+    terms = dload("common", f"terms_{name}")
+    if not isinstance(terms, pd.DataFrame):
+        # Fallback for backward compatibility
+        terms = dload("common", "terms")
+    if not isinstance(pra, pd.DataFrame) or pra.empty:
+        raise RuntimeError(f"complex_contributions(): PRA data for dataset '{name}' not found.")
+    if not isinstance(terms, pd.DataFrame) or terms.empty:
+        raise RuntimeError(f"complex_contributions(): terms for dataset '{name}' not found.")
-    # Ensure pra is sorted by score descending (matches R's order by predicted descending)
-    pra = pra.sort_values(by='score', ascending=False).reset_index(drop=True)
+    # Respect the dataset's score direction: high scores by default, low scores if configured.
+    pra = pra.sort_values(
+        by='score',
+        ascending=_sort_ascending_for_dataset(name),
+    ).reset_index(drop=True)
     # Compute cumulative TP and precision (matches R's TP.count = cumsum(true), Precision = TP / (1:n))
     pra['cumTP'] = pra['prediction'].cumsum()
@@ -809,6 +967,9 @@ def binary(corr):
     stack = corr.stack().rename_axis(index=['gene1', 'gene2']).\
             reset_index().rename(columns={0: 'score'})
+    if stack.empty:
+        log.done("Pair-wise conversion.")
+        return stack
     if has_mirror_of_first_pair(stack):
         log.info("Mirror pairs detected. Dropping them to ensure unique gene pairs.")
         stack = drop_mirror_pairs(stack)
@@ -859,7 +1020,7 @@ def save_results_to_csv(categories = ["complex_contributions", "pr_auc", "pra_pe
             continue
         if category == "mpr_complexes_auc" and isinstance(data, dict):
-            # Dict[dataset_name -> Dict[filter_key -> auc]]
+            # Dict[dataset_name -> Dict[variant_key -> auc]]
             try:
                 df = pd.DataFrame.from_dict(data, orient="index")
                 df.index.name = "Dataset"
@@ -902,21 +1063,10 @@ def save_results_to_csv(categories = ["complex_contributions", "pr_auc", "pra_pe
     log.done("Results saved to CSV files in the output folder.")
-################### mPR
-################### mPR ###################
-################### mPR ###################
-################### mPR ###################
 # -----------------------------------------------------------------------------
 # mPR preparation (module-level precision–recall, Fig. 1E / 1F)
 # -----------------------------------------------------------------------------
-import numpy as np
-import pandas as pd
 def _mpr_get_mtRibo_ETCI_ids(terms_like):
     """
@@ -971,34 +1121,16 @@ def _mpr_get_small_high_auprc_ids(
 # Helpers implementing the FLEX stepwise module-level PR logic
 # -------------------------------------------------------------------------
-"""
-CORRECT FIX for _mpr_build_pairs in analysis.py
-The issue: The current code marks filtered TPs as true=0, which makes them
-count as False Positives and dramatically lowers precision.
-The R code (getSubsetOfCoAnnRemoveIDs with replace=FALSE) REMOVES the
-filtered positive pairs entirely from the dataset.
-This is the key difference:
-- Current Python: Keeps all rows, filtered TPs become FPs → precision tanks
-- R code: Removes filtered TP rows → they don't affect precision at all
-"""
-import numpy as np
-import pandas as pd
-def _mpr_build_pairs(pra, removed_ids=None):
+def _mpr_build_pairs(pra, removed_ids=None, ascending=False):
     """
     Build a Pairs.in.data-like table for mPR / stepwise contributions.
-    FIXED: Removes rows that contain filtered complex IDs (matching R behavior)
-    instead of marking them as true=0.
+    Rows containing filtered positive complex IDs are removed from the ranking,
+    matching the FLEX stepwise module-level precision-recall behavior.
     Input:
       pra : DataFrame with at least columns
-            - 'score'       : ranking score (higher = better)
+            - 'score'       : ranking score
             - 'complex_id'  : complex annotations
       removed_ids : set[int] of complexes to remove
@@ -1057,11 +1189,9 @@ def _mpr_build_pairs(pra, removed_ids=None):
     out["complex_ids"] = df[cid_col].apply(normalize_ids)
     out["true"] = out["complex_ids"].apply(lambda ids: 1 if len(ids) > 0 else 0)
-    # KEY FIX: Remove rows that are TPs AND contain a removed complex ID
-    # This matches the R behavior of getSubsetOfCoAnnRemoveIDs with replace=FALSE
+    # Remove rows that are TPs and contain a removed complex ID.
     if removed_ids:
         should_remove_mask = df[cid_col].apply(should_remove)
-        # Only remove if it's a TP (true=1)
         remove_mask = should_remove_mask & (out["true"] == 1)
         out = out[~remove_mask].copy()
@@ -1071,41 +1201,11 @@ def _mpr_build_pairs(pra, removed_ids=None):
             lambda ids: [cid for cid in ids if cid not in removed_ids]
         )
-    # Sort by predicted descending
-    out = out.sort_values("predicted", ascending=False).reset_index(drop=True)
+    # Sort by the dataset's configured score direction.
+    out = out.sort_values("predicted", ascending=ascending).reset_index(drop=True)
     return out
-# ============================================================================
-# HOW TO APPLY THIS FIX
-# ============================================================================
-#
-# In analysis.py, replace the _mpr_build_pairs function (around line 962-1025)
-# with the _mpr_build_pairs_fixed function above.
-#
-# The key changes are:
-#
-# 1. REMOVE rows instead of marking true=0:
-#
-#    OLD CODE:
-#        if removed_ids:
-#            ids = [cid for cid in ids if cid not in removed_ids]
-#        return ids
-#        ...
-#        out["true"] = out["complex_ids"].apply(lambda ids: 1 if len(ids) > 0 else 0)
-#
-#    NEW CODE:
-#        # First compute true normally
-#        out["true"] = out["complex_ids"].apply(lambda ids: 1 if len(ids) > 0 else 0)
-#
-#        # Then REMOVE rows that are TPs and contain removed IDs
-#        if removed_ids:
-#            should_remove_mask = df[cid_col].apply(should_remove)
-#            remove_mask = should_remove_mask & (out["true"] == 1)
-#            out = out[~remove_mask].copy()
-#
-# ============================================================================
 def _mpr_precision_cutoffs_from_pairs(pairs, step=0.025):
     """
     Choose precision cutoffs similar to FLEX:
@@ -1137,7 +1237,7 @@ def _mpr_precision_cutoffs_from_pairs(pairs, step=0.025):
     return np.array(cuts, dtype=float)
-def _mpr_stepwise_contributions(pairs, precision_cutoffs):
+def _mpr_stepwise_contributions(pairs, precision_cutoffs, ascending=False):
     """
     Greedy, stepwise TP allocation per complex at each precision cutoff.
@@ -1152,7 +1252,7 @@ def _mpr_stepwise_contributions(pairs, precision_cutoffs):
       contrib_df : DataFrame [complex_id x cutoff] with TP counts
     """
     pairs = pairs.copy()
-    pairs = pairs.sort_values("predicted", ascending=False).reset_index(drop=True)
+    pairs = pairs.sort_values("predicted", ascending=ascending).reset_index(drop=True)
     true = pairs["true"].to_numpy(dtype=int)
     n = len(true)
@@ -1272,16 +1372,29 @@ def _mpr_module_coverage(contrib_df, terms, tp_th=1, percent_th=0.1):
         row = terms.loc[cid_int]
         n_genes = None
-        # FIXED: Handle all_genes as list (how it's stored in preprocessing)
-        if "all_genes" in row.index:
+        # Prefer used_genes (genes actually in the dataset) for a fair coverage
+        # fraction. This matters for GOBP/PATHWAY where all_genes >> used_genes.
+        if "used_genes" in row.index:
+            genes = row["used_genes"]
+            if isinstance(genes, (list, np.ndarray)) and len(genes) > 0:
+                n_genes = len(genes)
+        if n_genes is None and "n_used_genes" in row.index:
+            try:
+                v = int(row["n_used_genes"])
+                if v > 0:
+                    n_genes = v
+            except (ValueError, TypeError):
+                pass
+        # Fallback: all_genes (how it's stored in preprocessing)
+        if n_genes is None and "all_genes" in row.index:
             genes = row["all_genes"]
-            if isinstance(genes, list):
+            if isinstance(genes, (list, np.ndarray)):
                 n_genes = len(genes)
             elif isinstance(genes, str):
-                # Fallback if stored as string
                 n_genes = len([g for g in genes.split(";") if g])
         # Fallback to Genes column (original string from CORUM)
         if n_genes is None and "Genes" in row.index:
             genes_str = row["Genes"]
@@ -1338,7 +1451,7 @@ def _mpr_complexes_auc(
     We compute a normalized AUC by integrating precision over the *normalized*
     coverage axis:
-        AUC = \int y \, d(x/max_complexes)
+        AUC = integral y d(x/max_complexes)
     This yields a score in [0, 1] (or NaN if insufficient data).
     """
@@ -1348,7 +1461,7 @@ def _mpr_complexes_auc(
     if cov.size == 0 or prec.size == 0:
         return 0.0
-    # Match plot_mpr_complexes_multi(): only count cov>0 (log-x cannot show 0)
+    # Match plot_mpr_complex_coverage_curve(): only count cov>0 (log-x cannot show 0)
     mask = (
         np.isfinite(cov)
         & np.isfinite(prec)
@@ -1410,26 +1523,31 @@ def mpr_prepare(
     """
     pra = dload("pra", name)
     pra_percomplex = dload("pra_percomplex", name)
-    terms = dload("common", "terms")
+    terms = dload("common", f"terms_{name}")
+    if not isinstance(terms, pd.DataFrame):
+        # Fallback for backward compatibility
+        terms = dload("common", "terms")
-    if pra is None:
+    if pra is None or not isinstance(pra, pd.DataFrame) or pra.empty:
         raise RuntimeError(
             f"mpr_prepare(): PRA data for dataset '{name}' not found "
             "(dload('pra', name))."
         )
-    if pra_percomplex is None:
+    if pra_percomplex is None or not isinstance(pra_percomplex, pd.DataFrame) or pra_percomplex.empty:
         raise RuntimeError(
             f"mpr_prepare(): per-complex PRA data for dataset '{name}' not found "
             "(dload('pra_percomplex', name))."
         )
-    if terms is None:
+    if terms is None or not isinstance(terms, pd.DataFrame) or terms.empty:
         raise RuntimeError(
             "mpr_prepare(): CORUM 'terms' table not found (dload('common', 'terms'))."
         )
-    # sort by score descending (ranking)
+    ascending = _sort_ascending_for_dataset(name)
+    # Sort by the dataset's configured score direction.
     if "score" in pra.columns:
-        pra = pra.sort_values("score", ascending=False).reset_index(drop=True)
+        pra = pra.sort_values("score", ascending=ascending).reset_index(drop=True)
     else:
         pra = pra.reset_index(drop=True)
@@ -1455,7 +1573,7 @@ def mpr_prepare(
     for label, removed in filter_sets.items():
         # 1) Build pairs table after removing complexes in `removed`
-        pairs = _mpr_build_pairs(pra, removed_ids=removed)
+        pairs = _mpr_build_pairs(pra, removed_ids=removed, ascending=ascending)
         true = pairs["true"].to_numpy(dtype=int)
         n = len(true)
@@ -1482,13 +1600,24 @@ def mpr_prepare(
         if precision_cutoffs is None:
             precision_cutoffs = _mpr_precision_cutoffs_from_pairs(pairs)
-        contrib_df = _mpr_stepwise_contributions(pairs, precision_cutoffs)
+        contrib_df = _mpr_stepwise_contributions(
+            pairs,
+            precision_cutoffs,
+            ascending=ascending,
+        )
         cov = _mpr_module_coverage(
             contrib_df,
             terms,
             tp_th=tp_th,
             percent_th=percent_th,
         )
+        # precision_cutoffs are sorted ascending (low → high).
+        # Coverage must be non-increasing in that direction: a more permissive
+        # threshold (lower precision) should never yield fewer covered terms.
+        # The independent greedy allocation per cutoff can violate this, so
+        # enforce monotonicity by propagating the max from right to left.
+        if cov.size > 0:
+            cov = np.maximum.accumulate(cov[::-1])[::-1]
         coverage_curves[label] = cov
         complexes_auc[label] = _mpr_complexes_auc(
             cov,
@@ -1516,424 +1645,3 @@ def mpr_prepare(
     # Convenience: store AUCs as their own category for easy export / plotting.
     dsave(complexes_auc, "mpr_complexes_auc", name)
-### OLD FUNCTIONS
-# new but withoutparallel
-# def pra_percomplex(dataset_name, matrix, is_corr=False):
-#     log.started(f"*** Per-complex PRA started - {dataset_name} ***")
-#     config = dload("config")
-#     terms = dload("tmp", "terms")
-#     genes_present = dload("tmp", "genes_present_in_terms")
-#     sorting = dload("input", "sorting")
-#     sort_order = sorting.get(dataset_name, "high")
-#     if not is_corr:
-#         matrix = perform_corr(matrix, config.get("corr_function"))
-#     matrix = filter_matrix_by_genes(matrix, genes_present)
-#     log.info(f"Matrix shape: {matrix.shape}")
-#     df = binary(matrix)
-#     log.info(f"Pair-wise shape: {df.shape}")
-#     df = quick_sort(df, ascending=(sort_order == "low"))
-#     pairwise_df = df.copy()
-#     pairwise_df['gene1'] = pairwise_df['gene1'].astype("category")
-#     pairwise_df['gene2'] = pairwise_df['gene2'].astype("category")
-#     # Precompute a mapping from each gene to the row indices in the pairwise DataFrame where it appears.
-#     gene_to_pair_indices = {}
-#     for i, (gene_a, gene_b) in enumerate(zip(pairwise_df["gene1"], pairwise_df["gene2"])):
-#         gene_to_pair_indices.setdefault(gene_a, []).append(i)
-#         gene_to_pair_indices.setdefault(gene_b, []).append(i)
-#     log.done
-#     # Build gold_pair_to_complex using sets for efficiency
-#     gold_pair_to_complex = defaultdict(set)
-#     for idx, row in terms.iterrows():
-#         genes = row.used_genes
-#         if len(genes) < 2:
-#             continue
-#         for i, g1 in enumerate(genes):
-#             for g2 in genes[i + 1:]:
-#                 pair = tuple(sorted((g1, g2)))
-#                 gold_pair_to_complex[pair].add(idx)
-#     # Precompute complex_ids as semicolon-separated strings in pairwise_df
-#     pairs = [tuple(sorted((g1, g2))) for g1, g2 in zip(pairwise_df["gene1"], pairwise_df["gene2"])]
-#     pairwise_df['complex_ids'] = [';'.join(map(str, sorted(gold_pair_to_complex.get(pair, set())))) for pair in pairs]
-#     # Initialize AUC scores
-#     auc_scores = {}
-#     # Loop over each gene complex
-#     for idx, row in tqdm(terms.iterrows()):
-#         gene_set = set(row.used_genes)
-#         if config["min_genes_per_complex_analysis"] > len(gene_set):
-#             continue
-#         # Collect all row indices in the pairwise data where either gene belongs to the complex.
-#         candidate_indices = bitarray(len(pairwise_df))
-#         for gene in gene_set:
-#             if gene in gene_to_pair_indices:
-#                 candidate_indices[gene_to_pair_indices[gene]] = True
-#         if not candidate_indices.any():
-#             continue
-#         # Select only the relevant pairwise comparisons.
-#         selected_rows = np.unpackbits(candidate_indices).view(bool)[:len(pairwise_df)]
-#         sub_df = pairwise_df.iloc[selected_rows]
-#         # Get current complex ID (assuming idx is the ID; adjust if row['ID'] is different)
-#         complex_id = str(idx)  # Or str(row['ID']) if available
-#         # Create true_label: 1 if complex_id in complex_ids (vectorized with str.contains)
-#         #true_label = sub_df['complex_ids'].str.contains(complex_id, regex=False).astype(int)
-#         # Inside the loop, for each complex:
-#         # Inside the loop:
-#         complex_id = str(idx)
-#         # Use (?:^|;) and (?:;|$) to avoid capturing groups
-#         pattern = r'(?:^|;)' + re.escape(complex_id) + r'(?:;|$)'
-#         true_label = sub_df['complex_ids'].str.contains(pattern, regex=True).astype(int)
-#         # Filter to keep verified negatives (complex_ids == "") or positives for this complex (true_label == 1)
-#         complex_mask = (sub_df['complex_ids'] == "") | (true_label == 1)
-#         # Use the masked true labels for AUPRC (avoids SettingWithCopyWarning)
-#         predictions = true_label[complex_mask]
-#         if predictions.sum() == 0:
-#             continue
-#         # Compute cumulative true positives and derive precision and recall.
-#         true_positive_cumsum = predictions.cumsum()
-#         precision = true_positive_cumsum / (np.arange(len(predictions)) + 1)
-#         recall = true_positive_cumsum / true_positive_cumsum.iloc[-1]
-#         if len(recall) < 2 or recall.iloc[-1] == 0:
-#             continue
-#         auc_scores[idx] = metrics.auc(recall, precision)
-#     # Add the computed AUC scores to the terms DataFrame.
-#     terms["auc_score"] = pd.Series(auc_scores)
-#     terms.drop(columns=["hash"], inplace=True)
-#     dsave(terms, "pra_percomplex", dataset_name)
-#     log.done(f"Per-complex PRA completed.")
-#     return terms
-# it works quick but only maps 1 complex to each pair
-# def pra_percomplex_old_type_filtering(dataset_name, matrix, is_corr=False):
-#     log.started(f"*** Per-complex PRA started - {dataset_name} ***")
-#     config = dload("config")
-#     terms = dload("tmp", "terms")
-#     genes_present = dload("tmp", "genes_present_in_terms")
-#     sorting = dload("input", "sorting")
-#     sort_order = sorting.get(dataset_name, "high")
-#     if not is_corr:
-#         matrix = perform_corr(matrix, config.get("corr_function"))
-#     matrix = filter_matrix_by_genes(matrix, genes_present)
-#     log.info(f"Matrix shape: {matrix.shape}")
-#     df = binary(matrix)
-#     log.info(f"Pair-wise shape: {df.shape}")
-#     df = quick_sort(df, ascending=(sort_order == "low"))
-#     pairwise_df = df.copy()
-#     pairwise_df['gene1'] = pairwise_df['gene1'].astype("category")
-#     pairwise_df['gene2'] = pairwise_df['gene2'].astype("category")
-#     # Precompute a mapping from each gene to the row indices in the pairwise DataFrame where it appears.
-#     gene_to_pair_indices = {}
-#     for i, (gene_a, gene_b) in enumerate(zip(pairwise_df["gene1"], pairwise_df["gene2"])):
-#         gene_to_pair_indices.setdefault(gene_a, []).append(i)
-#         gene_to_pair_indices.setdefault(gene_b, []).append(i)
-#     # Initialize AUC scores (one for each complex) with NaNs.
-#     #auc_scores = np.full(len(terms), np.nan)
-#     auc_scores = {}
-#     # Loop over each gene complex
-#     for idx, row in tqdm(terms.iterrows()):
-#         gene_set = set(row.used_genes)
-#         if config["min_genes_per_complex_analysis"] > len(gene_set):
-#             continue
-#         # Collect all row indices in the pairwise data where either gene belongs to the complex.
-#         candidate_indices = bitarray(len(pairwise_df))
-#         for gene in gene_set:
-#             if gene in gene_to_pair_indices:
-#                 candidate_indices[gene_to_pair_indices[gene]] = True
-#         if not candidate_indices.any():
-#             continue
-#         # Select only the relevant pairwise comparisons.
-#         selected_rows = np.unpackbits(candidate_indices).view(bool)[:len(pairwise_df)]
-#         sub_df = pairwise_df.iloc[selected_rows]
-#         # A prediction is 1 if both genes in the pair are in the complex; otherwise 0.
-#         predictions = (sub_df["gene1"].isin(gene_set) & sub_df["gene2"].isin(gene_set)).astype(int)
-#         if predictions.sum() == 0:
-#             continue
-#         # Compute cumulative true positives and derive precision and recall.
-#         true_positive_cumsum = predictions.cumsum()
-#         precision = true_positive_cumsum / (np.arange(len(predictions)) + 1)
-#         recall = true_positive_cumsum / true_positive_cumsum.iloc[-1]
-#         if len(recall) < 2 or recall.iloc[-1] == 0:
-#             continue
-#         auc_scores[idx] = metrics.auc(recall, precision)
-#     # Add the computed AUC scores to the terms DataFrame.
-#     terms["auc_score"] = pd.Series(auc_scores)
-#     terms.drop(columns=["hash"], inplace=True)
-#     dsave(terms, "pra_percomplex", dataset_name)
-#     log.done(f"Per-complex PRA completed.")
-#     return terms
-# OLD
-# def pra_percomplex(dataset_name, matrix, is_corr=False):
-#     log.started(f"*** Per-complex PRA started for {dataset_name} ***")
-#     config = dload("config")
-#     terms = dload("tmp", "terms")
-#     genes_present = dload("tmp", "genes_present_in_terms")
-#     sorting = dload("input", "sorting")
-#     sort_order = sorting.get(dataset_name, "high")
-#     if not is_corr:
-#         matrix = perform_corr(matrix, "numpy")
-#     matrix = filter_matrix_by_genes(matrix, genes_present)
-#     log.info(f"Matrix shape: {matrix.shape}")
-#     df = binary(matrix)
-#     log.info(f"Pair-wise shape: {df.shape}")
-#     df = quick_sort(df, ascending=(sort_order == "low"))
-#     # Precompute gene → row indices
-#     gene_to_rows = {}
-#     for i, (g1, g2) in enumerate(zip(df["gene1"], df["gene2"])):
-#         gene_to_rows.setdefault(g1, []).append(i)
-#         gene_to_rows.setdefault(g2, []).append(i)
-#     aucs = np.full(len(terms), np.nan)
-#     N = len(df)
-#     for idx, row in tqdm(terms.iterrows()):
-#         genes = set(row.used_genes)
-#         if len(genes) < config["min_complex_size_for_percomplex"]:  # Skip small complexes
-#             continue
-#         # Get all row indices where either gene is in the complex
-#         candidate_idxs = set()
-#         for g in genes:
-#             candidate_idxs.update(gene_to_rows.get(g, []))
-#         candidate_idxs = sorted(candidate_idxs)
-#         if not candidate_idxs:
-#             continue
-#         # Use only relevant rows for prediction
-#         sub = df.loc[candidate_idxs]
-#         preds = (sub["gene1"].isin(genes) & sub["gene2"].isin(genes)).astype(int)
-#         if preds.sum() == 0:
-#             continue
-#         tp = preds.cumsum()
-#         prec = tp / (np.arange(len(preds)) + 1)
-#         recall = tp / tp.iloc[-1]
-#         if len(recall) < 2 or recall.iloc[-1] == 0:
-#             continue
-#         aucs[idx] = metrics.auc(recall, prec)
-#     terms["auc_score"] = aucs
-#     terms.drop(columns=["list", "set", "hash"], inplace=True)
-#     dsave(terms, "pra_percomplex", dataset_name)
-#     log.done(f"Per-complex PRA completed.")
-#     return terms
-# without greedy
-# def complex_contributions(name):
-#     log.info(f"Computing complex contributions for dataset: {name}")
-#     pra = dload("pra", name)
-#     terms = dload("tmp", "terms")
-#     d = pra.query('prediction == 1').drop(columns=['gene1', 'gene2'])
-#     results = {}
-#     thresholds = [round(i, 2) for i in np.arange(1, 0.0001, -0.025)]
-#     for cid in terms.ID.to_list():
-#         arr = []
-#         for threshold in thresholds:
-#             r = d[d.complex_id == cid].query('precision >= @threshold')
-#             arr.append(r.shape[0])
-#         results[cid] = arr
-#     r = pd.DataFrame(results, index=thresholds).T
-#     t = terms[['ID', 'Name']].set_index('ID')
-#     r['Name'] = r.index.map(t.Name)
-#     r = r[list(reversed(list(r.columns)))]
-#     r = r.reset_index(drop=True)
-#     dsave(r, "complex_contributions", name)
-#     log.info(f"Complex contributions computation completed for dataset: {name}")
-#     return r
-# # new
-# def complex_contributions(name):
-#     log.info(f"Computing complex contributions using R-style greedy logic for dataset: {name}")
-#     pra = dload("pra", name)
-#     terms = dload("common", "terms")
-#     # Ensure pra is sorted by score descending
-#     pra = pra.sort_values(by='score', ascending=False).reset_index(drop=True)
-#     # Compute cumulative TP and precision if not present
-#     pra['cumTP'] = pra['prediction'].cumsum()
-#     pra['rank'] = pra.index + 1
-#     pra['precision'] = pra['cumTP'] / pra['rank']
-#     # R-style precision thresholds
-#     prec_min = pra['precision'].min()
-#     prec_max = pra['precision'].max()
-#     precision_cutoffs = [round(prec_min, 3)]
-#     cutoffs_range = np.arange(0.1, prec_max + 0.001, 0.025)
-#     precision_cutoffs += [round(t, 3) for t in cutoffs_range if t > prec_min]
-#     thresholds = sorted(set(precision_cutoffs))  # Ensure unique and sorted
-#     results = {}
-#     for t in thresholds:
-#         if pra['precision'].max() < t:
-#             continue
-#         cand = pra[pra['precision'] >= t]
-#         if cand.empty:
-#             continue
-#         k = cand.index.max()  # rightmost index where precision >= t
-#         tp_target = pra.loc[k, 'cumTP']
-#         # Find the smallest m where cumTP[m] >= tp_target
-#         ind = pra[pra['cumTP'] >= tp_target].index.min()
-#         if pd.isna(ind):
-#             continue
-#         # Select top (ind+1) rows
-#         tmp = pra.iloc[0:ind + 1].copy()
-#         # Filter for predicted positives (true == 1)
-#         tmp = tmp[tmp['prediction'] == 1]
-#         tmp = tmp[tmp["complex_id"].notnull()]
-#         tmp["ID"] = tmp["complex_id"].apply(lambda ids: ";".join(str(int(i)) for i in ids if pd.notnull(i)))
-#         # Now greedy logic
-#         final_contrib = {}
-#         while not tmp.empty:
-#             all_ids = tmp["ID"].str.split(";").explode()
-#             contrib = all_ids.value_counts()
-#             if contrib.empty:
-#                 break
-#             top_id = contrib.idxmax()
-#             final_contrib[top_id] = contrib[top_id]
-#             tmp = tmp[~tmp["ID"].str.contains(rf"\b{top_id}\b", regex=True)]
-#         for cid, count in final_contrib.items():
-#             if cid not in results:
-#                 results[cid] = [0] * len(thresholds)
-#             results[cid][thresholds.index(t)] = count
-#     # Add back gold standard complexes with 0 contribution
-#     gold_ids = set(terms.index.astype(str))
-#     all_ids = set(results.keys())
-#     missing_ids = gold_ids - all_ids
-#     for cid in missing_ids:
-#         results[cid] = [0] * len(thresholds)
-#     # Build result DataFrame
-#     r = pd.DataFrame(results, index=thresholds).T
-#     r['Name'] = r.index.astype(int).map(terms['Name'])
-#     r = r[['Name'] + [c for c in r.columns if c != 'Name']]  # Name as first col
-#     r = r[(r.drop(columns="Name").sum(axis=1) > 0)]
-#     # Move ID to first column, keep Name second, then precision columns in order
-#     dsave(r, "complex_contributions", name)
-#     log.info(f"Greedy R-style complex contribution completed for dataset: {name}")
-#     return r
-# def pra(dataset_name, matrix, is_corr=False):
-#     log.info(f"******************** {dataset_name} ********************")
-#     log.started(f"** Global Precision-Recall Analysis - {dataset_name} **")
-#     config = dload("config")
-#     terms_data = dload("tmp", "terms")
-#     if terms_data is None or not isinstance(terms_data, pd.DataFrame):
-#         raise ValueError("Expected 'terms' to be a DataFrame, but got None or invalid type.")
-#     terms = terms_data
-#     genes_present = dload("tmp", "genes_present_in_terms")
-#     sorting = dload("input", "sorting")
-#     sort_order = sorting.get(dataset_name, "high")
-#     if not is_corr:
-#         matrix = perform_corr(matrix, config.get("corr_function"))
-#     matrix = filter_matrix_by_genes(matrix, genes_present)
-#     log.info(f"Matrix shape: {matrix.shape}")
-#     df = binary(matrix)
-#     log.info(f"Pair-wise shape: {df.shape}")
-#     df = quick_sort(df, ascending=(sort_order == "low"))
-#     gold_pair_to_complex = defaultdict(list)
-#     for idx, row in terms.iterrows():
-#         genes = row.used_genes
-#         if len(genes) < 2:
-#             continue
-#         for i, g1 in enumerate(genes):
-#             for g2 in genes[i + 1:]:
-#                 pair = tuple(sorted((g1, g2)))
-#                 gold_pair_to_complex[pair].append(idx)
-#     # Label predictions and complex IDs
-#     complex_ids = []
-#     predictions = []
-#     for g1, g2 in zip(df["gene1"], df["gene2"]):
-#         pair = tuple(sorted((g1, g2)))
-#         ids = gold_pair_to_complex.get(pair, [])
-#         if ids:
-#             predictions.append(1)
-#             complex_ids.append(ids)
-#         else:
-#             predictions.append(0)
-#             complex_ids.append([])
-#     df["prediction"] = predictions
-#     df["complex_id"] = complex_ids
-#     if df["prediction"].sum() == 0:
-#         log.info("No true positives found in dataset.")
-#         pr_auc = np.nan
-#     else:
-#         tp = df["prediction"].cumsum()
-#         df["tp"] = tp
-#         precision = tp / (np.arange(len(df)) + 1)
-#         recall = tp / tp.iloc[-1]
-#         pr_auc = metrics.auc(recall, precision)
-#         df["precision"] = precision
-#         df["recall"] = recall
-#     log.info(f"PR-AUC: {pr_auc:.4f}, Number of true positives: {df['prediction'].sum()}")
-#     dsave(df, "pra", dataset_name)
-#     dsave(pr_auc, "pr_auc", dataset_name)
-#     log.done(f"Global PRA completed for {dataset_name}")
-#     return df, pr_auc
-# def compute_pra(df):
-#     log.info("Calculating precision-recall and AUC score.")
-#     if df.empty:
-#         log.warning("Empty DataFrame encountered in compute_pra. Returning empty DataFrame.")
-#         return df
-#     df["tp"] = df["prediction"].cumsum()
-#     df.reset_index(drop=True, inplace=True)
-#     df["precision"] = df["tp"] / (df.index + 1)
-#     df["recall"] = df["tp"] / df["tp"].iloc[-1]
-#     log.info("DONE: Calculating precision-recall AUC score.")
-#     return df
-# def pra(dataset_name, matrix, is_corr=False):
-#     log.info(f"PRA computation started for {dataset_name}.")
-#     genes_present_in_terms = dload("tmp", "genes_present_in_terms")
-#     #terms_hash_table = dload("tmp", "terms_hash_table")
-#     sorting_prefs = dload("input", "sorting")
-#     sort_order = sorting_prefs.get(dataset_name, "high")
-#     if not is_corr: matrix = perform_corr(matrix, "numpy")
-#     matrix = filter_matrix_by_genes(matrix, genes_present_in_terms)
-#     stack = binary(matrix)
-#     log.info("Checking gene pairs against the gold standard.")
-#     gene_pairs = list(zip(stack["gene1"], stack["gene2"]))
-#     hashed_pairs = [hash(pair) for pair in gene_pairs]
-#     stack["complex_id"] = [terms_hash_table.get(h, 0) for h in hashed_pairs]
-#     stack["prediction"] = [1 if h in terms_hash_table else 0 for h in hashed_pairs]
-#     annotated = stack.copy()
-#     if sort_order == "low":
-#         ann_sorted = quick_sort(annotated, ascending=True)
-#     else:
-#         ann_sorted = quick_sort(annotated)
-#     pra = compute_pra(ann_sorted)
-#     pr_auc = metrics.auc(pra.recall, pra.precision)
-#     dsave(pra, "pra", dataset_name)
-#     dsave(pr_auc, "pr_auc", dataset_name)
-#     log.info(f"PRA computation completed for {dataset_name} (Sorting: {sort_order}).")
-#     return pra, pr_auc

pythonflex 0.3.4__py3-none-any.whl → 0.4__py3-none-any.whl

pythonflex 0.3.4-py3-none-any.whl → 0.4-py3-none-any.whl