risk-network 0.0.8b26__py3-none-any.whl → 0.0.9__py3-none-any.whl
This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- risk/__init__.py +2 -2
- risk/annotations/__init__.py +2 -2
- risk/annotations/annotations.py +195 -118
- risk/annotations/io.py +47 -31
- risk/log/__init__.py +4 -2
- risk/log/{config.py → console.py} +5 -3
- risk/log/{params.py → parameters.py} +17 -42
- risk/neighborhoods/__init__.py +3 -5
- risk/neighborhoods/api.py +442 -0
- risk/neighborhoods/community.py +324 -101
- risk/neighborhoods/domains.py +125 -52
- risk/neighborhoods/neighborhoods.py +177 -165
- risk/network/__init__.py +1 -3
- risk/network/geometry.py +71 -89
- risk/network/graph/__init__.py +6 -0
- risk/network/graph/api.py +200 -0
- risk/network/{graph.py → graph/graph.py} +90 -40
- risk/network/graph/summary.py +254 -0
- risk/network/io.py +103 -114
- risk/network/plotter/__init__.py +6 -0
- risk/network/plotter/api.py +54 -0
- risk/network/{plot → plotter}/canvas.py +12 -9
- risk/network/{plot → plotter}/contour.py +27 -24
- risk/network/{plot → plotter}/labels.py +73 -78
- risk/network/{plot → plotter}/network.py +45 -39
- risk/network/{plot → plotter}/plotter.py +23 -17
- risk/network/{plot/utils/color.py → plotter/utils/colors.py} +114 -122
- risk/network/{plot → plotter}/utils/layout.py +10 -7
- risk/risk.py +11 -500
- risk/stats/__init__.py +10 -4
- risk/stats/permutation/__init__.py +1 -1
- risk/stats/permutation/permutation.py +44 -38
- risk/stats/permutation/test_functions.py +26 -18
- risk/stats/{stats.py → significance.py} +17 -15
- risk/stats/stat_tests.py +267 -0
- {risk_network-0.0.8b26.dist-info → risk_network-0.0.9.dist-info}/METADATA +31 -46
- risk_network-0.0.9.dist-info/RECORD +40 -0
- {risk_network-0.0.8b26.dist-info → risk_network-0.0.9.dist-info}/WHEEL +1 -1
- risk/constants.py +0 -31
- risk/network/plot/__init__.py +0 -6
- risk/stats/hypergeom.py +0 -54
- risk/stats/poisson.py +0 -44
- risk_network-0.0.8b26.dist-info/RECORD +0 -37
- {risk_network-0.0.8b26.dist-info → risk_network-0.0.9.dist-info}/LICENSE +0 -0
- {risk_network-0.0.8b26.dist-info → risk_network-0.0.9.dist-info}/top_level.txt +0 -0
risk/neighborhoods/domains.py
CHANGED
@@ -3,75 +3,97 @@ risk/neighborhoods/domains
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 """

-from contextlib import suppress
 from itertools import product
-from
-from typing import Tuple
+from typing import Tuple, Union

 import numpy as np
 import pandas as pd
+from numpy.linalg import LinAlgError
 from scipy.cluster.hierarchy import linkage, fcluster
 from sklearn.metrics import silhouette_score
+from tqdm import tqdm

 from risk.annotations import get_weighted_description
-from risk.constants import GROUP_LINKAGE_METHODS, GROUP_DISTANCE_METRICS
 from risk.log import logger


+# Define constants for clustering
+# fmt: off
+LINKAGE_METHODS = {"single", "complete", "average", "weighted", "centroid", "median", "ward"}
+LINKAGE_METRICS = {
+    "braycurtis", "canberra", "chebyshev", "cityblock", "correlation", "cosine", "dice", "euclidean",
+    "hamming", "jaccard", "jensenshannon", "kulczynski1", "mahalanobis", "matching", "minkowski",
+    "rogerstanimoto", "russellrao", "seuclidean", "sokalmichener", "sokalsneath", "sqeuclidean", "yule",
+}
+# fmt: on
+
+
 def define_domains(
     top_annotations: pd.DataFrame,
-
+    significant_neighborhoods_significance: np.ndarray,
     linkage_criterion: str,
     linkage_method: str,
     linkage_metric: str,
+    linkage_threshold: Union[float, str],
 ) -> pd.DataFrame:
-    """Define domains and assign nodes to these domains based on their
+    """Define domains and assign nodes to these domains based on their significance scores and clustering,
     handling errors by assigning unique domains when clustering fails.

     Args:
         top_annotations (pd.DataFrame): DataFrame of top annotations data for the network nodes.
-
+        significant_neighborhoods_significance (np.ndarray): The binary significance matrix below alpha.
         linkage_criterion (str): The clustering criterion for defining groups.
-        linkage_method (str): The linkage method for clustering.
-        linkage_metric (str): The linkage metric for clustering.
+        linkage_method (str): The linkage method for clustering. Choose "auto" to optimize.
+        linkage_metric (str): The linkage metric for clustering. Choose "auto" to optimize.
+        linkage_threshold (float, str): The threshold for clustering. Choose "auto" to optimize.

     Returns:
         pd.DataFrame: DataFrame with the primary domain for each node.
     """
     try:
+        if linkage_criterion == "off":
+            raise ValueError("Clustering is turned off.")
+
         # Transpose the matrix to cluster annotations
-        m =
+        m = significant_neighborhoods_significance[:, top_annotations["significant_annotations"]].T
+        # Safeguard the matrix by replacing NaN, Inf, and -Inf values
+        m = _safeguard_matrix(m)
+        # Optimize silhouette score across different linkage methods and distance metrics
         best_linkage, best_metric, best_threshold = _optimize_silhouette_across_linkage_and_metrics(
-            m, linkage_criterion, linkage_method, linkage_metric
+            m, linkage_criterion, linkage_method, linkage_metric, linkage_threshold
         )
         # Perform hierarchical clustering
         Z = linkage(m, method=best_linkage, metric=best_metric)
         logger.warning(
-            f"Linkage criterion: '{linkage_criterion}'\nLinkage method: '{best_linkage}'\nLinkage metric: '{best_metric}'"
+            f"Linkage criterion: '{linkage_criterion}'\nLinkage method: '{best_linkage}'\nLinkage metric: '{best_metric}'\nLinkage threshold: {round(best_threshold, 3)}"
         )
-        logger.debug(f"Optimal linkage threshold: {round(best_threshold, 3)}")
         # Calculate the optimal threshold for clustering
         max_d_optimal = np.max(Z[:, 2]) * best_threshold
         # Assign domains to the annotations matrix
         domains = fcluster(Z, max_d_optimal, criterion=linkage_criterion)
         top_annotations["domain"] = 0
         top_annotations.loc[top_annotations["significant_annotations"], "domain"] = domains
-    except ValueError:
+    except (ValueError, LinAlgError):
         # If a ValueError is encountered, handle it by assigning unique domains
         n_rows = len(top_annotations)
-
-
-
+        if linkage_criterion == "off":
+            logger.warning(
+                f"Clustering is turned off. Skipping clustering and assigning {n_rows} unique domains."
+            )
+        else:
+            logger.error(
+                f"Error encountered. Skipping clustering and assigning {n_rows} unique domains."
+            )
         top_annotations["domain"] = range(1, n_rows + 1)  # Assign unique domains

     # Create DataFrames to store domain information
-
-        data=
+    node_to_significance = pd.DataFrame(
+        data=significant_neighborhoods_significance,
         columns=[top_annotations.index.values, top_annotations["domain"]],
     )
-    node_to_domain =
+    node_to_domain = node_to_significance.T.groupby(level="domain").sum().T

-    # Find the maximum
+    # Find the maximum significance score for each node
     t_max = node_to_domain.loc[:, 1:].max(axis=1)
     t_idxmax = node_to_domain.loc[:, 1:].idxmax(axis=1)
     t_idxmax[t_max == 0] = 0
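The clustering step above cuts the dendrogram at a fraction of its tallest merge (max_d_optimal = np.max(Z[:, 2]) * best_threshold) and then labels annotations with fcluster. A minimal, self-contained sketch of that step; the toy matrix and the fixed 0.5 threshold are illustrative, not taken from the package:

import numpy as np
from scipy.cluster.hierarchy import fcluster, linkage

# Toy stand-in for the transposed significance matrix `m`
rng = np.random.default_rng(0)
m = rng.random((8, 5))

Z = linkage(m, method="average", metric="euclidean")
max_d_optimal = np.max(Z[:, 2]) * 0.5  # cut at half the tallest merge distance
domains = fcluster(Z, max_d_optimal, criterion="distance")
print(domains)  # one cluster id per row of m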
@@ -86,13 +108,13 @@ def define_domains(
     return node_to_domain


-def trim_domains_and_top_annotations(
+def trim_domains(
    domains: pd.DataFrame,
     top_annotations: pd.DataFrame,
     min_cluster_size: int = 5,
     max_cluster_size: int = 1000,
 ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
-    """Trim domains
+    """Trim domains that do not meet size criteria and find outliers.

     Args:
         domains (pd.DataFrame): DataFrame of domain data for the network nodes.
@@ -101,8 +123,7 @@ def trim_domains_and_top_annotations(
         max_cluster_size (int, optional): Maximum size of a cluster to be retained. Defaults to 1000.

     Returns:
-        Tuple[pd.DataFrame, pd.DataFrame
-        - Trimmed annotations (pd.DataFrame)
+        Tuple[pd.DataFrame, pd.DataFrame]:
         - Trimmed domains (pd.DataFrame)
         - A DataFrame with domain labels (pd.DataFrame)
     """
@@ -116,30 +137,30 @@ def trim_domains_and_top_annotations(
     invalid_domain_id = 888888
     invalid_domain_ids = {0, invalid_domain_id}
     # Mark domains to be removed
-    top_annotations["domain"].replace(to_remove, invalid_domain_id
+    top_annotations["domain"] = top_annotations["domain"].replace(to_remove, invalid_domain_id)
     domains.loc[domains["primary_domain"].isin(to_remove), ["primary_domain"]] = invalid_domain_id

-    # Normalize "num
+    # Normalize "num significant neighborhoods" by percentile for each domain and scale to 0-10
     top_annotations["normalized_value"] = top_annotations.groupby("domain")[
-        "
+        "significant_neighborhood_significance_sums"
     ].transform(lambda x: (x.rank(pct=True) * 10).apply(np.ceil).astype(int))
-    # Modify the lambda function to pass both full_terms and
+    # Modify the lambda function to pass both full_terms and significant_significance_score
     top_annotations["combined_terms"] = top_annotations.apply(
         lambda row: " ".join([str(row["full_terms"])] * row["normalized_value"]), axis=1
     )

-    # Perform the groupby operation while retaining the other columns and adding the weighting with
+    # Perform the groupby operation while retaining the other columns and adding the weighting with significance scores
     domain_labels = (
         top_annotations.groupby("domain")
         .agg(
             full_terms=("full_terms", lambda x: list(x)),
-
+            significance_scores=("significant_significance_score", lambda x: list(x)),
         )
         .reset_index()
     )
     domain_labels["combined_terms"] = domain_labels.apply(
         lambda row: get_weighted_description(
-            pd.Series(row["full_terms"]), pd.Series(row["
+            pd.Series(row["full_terms"]), pd.Series(row["significance_scores"])
         ),
         axis=1,
     )
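The normalized_value transform above maps each domain's significance sums onto a 1-10 integer scale via percentile rank, and combined_terms then repeats each annotation's terms normalized_value times so stronger annotations carry more weight in the description. A small illustration of the scaling, using hypothetical scores:

import numpy as np
import pandas as pd

scores = pd.Series([0.2, 1.5, 3.0, 7.8])  # hypothetical significance sums within one domain
normalized = (scores.rank(pct=True) * 10).apply(np.ceil).astype(int)
print(normalized.tolist())  # [3, 5, 8, 10]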
@@ -150,45 +171,72 @@ def trim_domains_and_top_annotations(
             "domain": "id",
             "combined_terms": "normalized_description",
             "full_terms": "full_descriptions",
-            "
+            "significance_scores": "significance_scores",
         }
     ).set_index("id")

     # Remove invalid domains
-    valid_annotations = top_annotations[~top_annotations["domain"].isin(invalid_domain_ids)].drop(
-        columns=["normalized_value"]
-    )
     valid_domains = domains[~domains["primary_domain"].isin(invalid_domain_ids)]
     valid_trimmed_domains_matrix = trimmed_domains_matrix[
         ~trimmed_domains_matrix.index.isin(invalid_domain_ids)
     ]
-    return
+    return valid_domains, valid_trimmed_domains_matrix
+
+
+def _safeguard_matrix(matrix: np.ndarray) -> np.ndarray:
+    """Safeguard the matrix by replacing NaN, Inf, and -Inf values.
+
+    Args:
+        matrix (np.ndarray): Data matrix.
+
+    Returns:
+        np.ndarray: Safeguarded data matrix.
+    """
+    # Replace NaN with column mean
+    nan_replacement = np.nanmean(matrix, axis=0)
+    matrix = np.where(np.isnan(matrix), nan_replacement, matrix)
+    # Replace Inf/-Inf with maximum/minimum finite values
+    finite_max = np.nanmax(matrix[np.isfinite(matrix)])
+    finite_min = np.nanmin(matrix[np.isfinite(matrix)])
+    matrix = np.where(np.isposinf(matrix), finite_max, matrix)
+    matrix = np.where(np.isneginf(matrix), finite_min, matrix)
+    # Ensure rows have non-zero variance (optional step)
+    row_variance = np.var(matrix, axis=1)
+    matrix = matrix[row_variance > 0]
+    return matrix


 def _optimize_silhouette_across_linkage_and_metrics(
-    m: np.ndarray,
+    m: np.ndarray,
+    linkage_criterion: str,
+    linkage_method: str,
+    linkage_metric: str,
+    linkage_threshold: Union[str, float],
 ) -> Tuple[str, str, float]:
     """Optimize silhouette score across different linkage methods and distance metrics.

     Args:
         m (np.ndarray): Data matrix.
         linkage_criterion (str): Clustering criterion.
-        linkage_method (str): Linkage method for clustering.
-        linkage_metric (str): Linkage metric for clustering.
+        linkage_method (str): Linkage method for clustering. Choose "auto" to optimize.
+        linkage_metric (str): Linkage metric for clustering. Choose "auto" to optimize.
+        linkage_threshold (Union[str, float]): Threshold for clustering. Choose "auto" to optimize.

     Returns:
-        Tuple[str, str, float]:
+        Tuple[str, str, float]:
         - Best linkage method (str)
         - Best linkage metric (str)
         - Best threshold (float)
     """
+    # Initialize best overall values
     best_overall_method = linkage_method
     best_overall_metric = linkage_metric
+    best_overall_threshold = linkage_threshold
     best_overall_score = -np.inf
-    best_overall_threshold = 1

-
-
+    # Set linkage methods and metrics to all combinations if "auto" is selected
+    linkage_methods = LINKAGE_METHODS if linkage_method == "auto" else [linkage_method]
+    linkage_metrics = LINKAGE_METRICS if linkage_metric == "auto" else [linkage_metric]
     total_combinations = len(linkage_methods) * len(linkage_metrics)

     # Evaluating optimal linkage method and metric
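The new _safeguard_matrix helper makes the matrix safe for linkage by imputing NaN entries with column means, clamping Inf/-Inf to the finite extremes, and dropping zero-variance rows. A quick demonstration of the same NumPy pattern on a toy matrix:

import numpy as np

matrix = np.array([[1.0, np.nan], [np.inf, 4.0], [2.0, 2.0]])
matrix = np.where(np.isnan(matrix), np.nanmean(matrix, axis=0), matrix)  # NaN -> column mean
finite_max = np.nanmax(matrix[np.isfinite(matrix)])
finite_min = np.nanmin(matrix[np.isfinite(matrix)])
matrix = np.where(np.isposinf(matrix), finite_max, matrix)  # +Inf -> largest finite value
matrix = np.where(np.isneginf(matrix), finite_min, matrix)  # -Inf -> smallest finite value
print(matrix)  # [[1. 3.] [4. 4.] [2. 2.]]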
@@ -198,14 +246,39 @@ def _optimize_silhouette_across_linkage_and_metrics(
         total=total_combinations,
         bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
     ):
-        with
+        # Some linkage methods and metrics may not work with certain data
+        try:
             Z = linkage(m, method=method, metric=metric)
-
-
-
-
-
-
+        except (ValueError, LinAlgError):
+            # If linkage fails, set a default threshold (a float) and a very poor score
+            current_threshold = 0.0
+            score = -float("inf")
+        else:
+            # Only optimize silhouette score if the threshold is "auto"
+            if linkage_threshold == "auto":
+                threshold, score = _find_best_silhouette_score(Z, m, metric, linkage_criterion)
+                current_threshold = threshold
+            else:
+                # Use the provided threshold without optimization
+                score = silhouette_score(
+                    m,
+                    fcluster(Z, linkage_threshold * np.max(Z[:, 2]), criterion=linkage_criterion),
+                    metric=metric,
+                )
+                current_threshold = linkage_threshold
+
+        if score > best_overall_score:
+            best_overall_score = score
+            best_overall_threshold = float(current_threshold)  # Ensure it's a float
+            best_overall_method = method
+            best_overall_metric = metric
+
+    # Ensure that we always return a valid tuple:
+    if best_overall_score == -np.inf:
+        # No valid linkage was found; return default values.
+        best_overall_threshold = float(linkage_threshold) if linkage_threshold != "auto" else 0.0
+        best_overall_method = linkage_method
+        best_overall_metric = linkage_metric

     return best_overall_method, best_overall_metric, best_overall_threshold

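When linkage_threshold is a fixed float, the loop above scores it directly: it cuts the tree at that fraction of the tallest merge and evaluates the resulting labels with sklearn's silhouette_score. A self-contained sketch of that scoring path; the two synthetic blobs and the 0.5 threshold are illustrative:

import numpy as np
from scipy.cluster.hierarchy import fcluster, linkage
from sklearn.metrics import silhouette_score

rng = np.random.default_rng(0)
m = np.vstack([rng.normal(0.0, 0.1, (5, 3)), rng.normal(5.0, 0.1, (5, 3))])  # two clear blobs

Z = linkage(m, method="average", metric="euclidean")
labels = fcluster(Z, 0.5 * np.max(Z[:, 2]), criterion="distance")
score = silhouette_score(m, labels, metric="euclidean")  # valid for 2 <= n_clusters <= n_samples - 1
print(round(score, 3))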
@@ -231,7 +304,7 @@ def _find_best_silhouette_score(
         resolution (float, optional): Desired resolution for the best threshold. Defaults to 0.001.

     Returns:
-        Tuple[float, float]:
+        Tuple[float, float]:
         - Best threshold (float): The threshold that yields the best silhouette score.
         - Best silhouette score (float): The highest silhouette score achieved.
     """
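Only the docstring of _find_best_silhouette_score changed in this hunk; its body is not shown. Going by the signature and the resolution parameter, a plausible sketch of such a threshold search is below; the function name and the linear sweep are assumptions, not the package's actual implementation:

import numpy as np
from scipy.cluster.hierarchy import fcluster
from sklearn.metrics import silhouette_score

def find_best_threshold_sketch(Z, m, metric, criterion="distance", resolution=0.001):
    """Hypothetical sketch: sweep thresholds at the given resolution and keep the best score."""
    best_threshold, best_score = 0.0, -np.inf
    for threshold in np.arange(resolution, 1.0 + resolution, resolution):
        labels = fcluster(Z, threshold * np.max(Z[:, 2]), criterion=criterion)
        n_clusters = len(np.unique(labels))
        if not 1 < n_clusters < len(m):  # silhouette_score is undefined otherwise
            continue
        score = silhouette_score(m, labels, metric=metric)
        if score > best_score:
            best_threshold, best_score = float(threshold), score
    return best_threshold, best_score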