risk-network 0.0.6b10__py3-none-any.whl → 0.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- risk/__init__.py +1 -1
- risk/annotations/annotations.py +61 -42
- risk/annotations/io.py +14 -14
- risk/log/__init__.py +1 -1
- risk/log/config.py +139 -0
- risk/log/params.py +4 -4
- risk/neighborhoods/community.py +25 -36
- risk/neighborhoods/domains.py +29 -27
- risk/neighborhoods/neighborhoods.py +171 -72
- risk/network/graph.py +92 -41
- risk/network/io.py +22 -26
- risk/network/plot.py +132 -19
- risk/risk.py +81 -78
- risk/stats/__init__.py +2 -2
- risk/stats/hypergeom.py +30 -107
- risk/stats/permutation/permutation.py +23 -17
- risk/stats/permutation/test_functions.py +2 -2
- risk/stats/poisson.py +44 -0
- {risk_network-0.0.6b10.dist-info → risk_network-0.0.7.dist-info}/METADATA +1 -1
- risk_network-0.0.7.dist-info/RECORD +30 -0
- risk/log/console.py +0 -16
- risk/stats/fisher_exact.py +0 -132
- risk_network-0.0.6b10.dist-info/RECORD +0 -30
- {risk_network-0.0.6b10.dist-info → risk_network-0.0.7.dist-info}/LICENSE +0 -0
- {risk_network-0.0.6b10.dist-info → risk_network-0.0.7.dist-info}/WHEEL +0 -0
- {risk_network-0.0.6b10.dist-info → risk_network-0.0.7.dist-info}/top_level.txt +0 -0
risk/neighborhoods/domains.py
CHANGED
@@ -4,6 +4,7 @@ risk/neighborhoods/domains
 """
 
 from contextlib import suppress
+from itertools import product
 from tqdm import tqdm
 from typing import Tuple
 
@@ -14,6 +15,7 @@ from sklearn.metrics import silhouette_score
 
 from risk.annotations import get_description
 from risk.constants import GROUP_LINKAGE_METHODS, GROUP_DISTANCE_METRICS
+from risk.log import logger
 
 
 def define_domains(
@@ -23,7 +25,8 @@ def define_domains(
     linkage_method: str,
     linkage_metric: str,
 ) -> pd.DataFrame:
-    """Define domains and assign nodes to these domains based on their enrichment scores and clustering
+    """Define domains and assign nodes to these domains based on their enrichment scores and clustering,
+    handling errors by assigning unique domains when clustering fails.
 
     Args:
         top_annotations (pd.DataFrame): DataFrame of top annotations data for the network nodes.
@@ -35,31 +38,31 @@ def define_domains(
     Returns:
         pd.DataFrame: DataFrame with the primary domain for each node.
     """
-
-
-        print("Single annotation detected. Skipping clustering.")
-        top_annotations["domain"] = 1  # Assign a default domain or handle appropriately
-    else:
-        # Perform hierarchical clustering on the binary enrichment matrix
+    try:
+        # Transpose the matrix to cluster annotations
         m = significant_neighborhoods_enrichment[:, top_annotations["top attributes"]].T
         best_linkage, best_metric, best_threshold = _optimize_silhouette_across_linkage_and_metrics(
             m, linkage_criterion, linkage_method, linkage_metric
         )
-
-
-
-            raise ValueError("No significant annotations found.") from e
-
-        print(
+        # Perform hierarchical clustering
+        Z = linkage(m, method=best_linkage, metric=best_metric)
+        logger.warning(
             f"Linkage criterion: '{linkage_criterion}'\nLinkage method: '{best_linkage}'\nLinkage metric: '{best_metric}'"
         )
-
-
+        logger.debug(f"Optimal linkage threshold: {round(best_threshold, 3)}")
+        # Calculate the optimal threshold for clustering
         max_d_optimal = np.max(Z[:, 2]) * best_threshold
-        domains = fcluster(Z, max_d_optimal, criterion=linkage_criterion)
         # Assign domains to the annotations matrix
+        domains = fcluster(Z, max_d_optimal, criterion=linkage_criterion)
         top_annotations["domain"] = 0
         top_annotations.loc[top_annotations["top attributes"], "domain"] = domains
+    except ValueError:
+        # If a ValueError is encountered, handle it by assigning unique domains
+        n_rows = len(top_annotations)
+        logger.error(
+            f"Error encountered. Skipping clustering and assigning {n_rows} unique domains."
+        )
+        top_annotations["domain"] = range(1, n_rows + 1)  # Assign unique domains
 
     # Create DataFrames to store domain information
     node_to_enrichment = pd.DataFrame(
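The new code cuts the dendrogram at a fraction of the largest merge distance. A minimal, self-contained sketch of that thresholding pattern; the toy matrix, the "average"/"euclidean" settings, and the 0.5 fraction are illustrative stand-ins, not the package's tuned values:

```python
# Hedged sketch of the fcluster thresholding used in define_domains.
import numpy as np
from scipy.cluster.hierarchy import fcluster, linkage

m = np.random.rand(10, 4)  # stand-in for the transposed enrichment matrix
Z = linkage(m, method="average", metric="euclidean")
# Cut the dendrogram at a fraction of the largest merge distance
max_d_optimal = np.max(Z[:, 2]) * 0.5
domains = fcluster(Z, max_d_optimal, criterion="distance")
print(domains)  # one cluster label per row of m
```

Here "distance" is one valid fcluster criterion; the package passes the caller-supplied `linkage_criterion` instead.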
@@ -166,21 +169,20 @@ def _optimize_silhouette_across_linkage_and_metrics(
     total_combinations = len(linkage_methods) * len(linkage_metrics)
 
     # Evaluating optimal linkage method and metric
-    for method in tqdm(
-        linkage_methods,
+    for method, metric in tqdm(
+        product(linkage_methods, linkage_metrics),
         desc="Evaluating optimal linkage method and metric",
         total=total_combinations,
         bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
     ):
-
-
-
-
-
-
-
-
-                    best_overall_metric = metric
+        with suppress(Exception):
+            Z = linkage(m, method=method, metric=metric)
+            threshold, score = _find_best_silhouette_score(Z, m, metric, linkage_criterion)
+            if score > best_overall_score:
+                best_overall_score = score
+                best_overall_threshold = threshold
+                best_overall_method = method
+                best_overall_metric = metric
 
     return best_overall_method, best_overall_metric, best_overall_threshold
 
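The rewritten loop flattens two nested loops into one pass over `itertools.product`, and `contextlib.suppress(Exception)` skips (method, metric) pairs that SciPy rejects. A runnable sketch of the same pattern, with a placeholder score standing in for the package's `_find_best_silhouette_score`:

```python
# Sketch of the product + suppress grid search over linkage settings.
from contextlib import suppress
from itertools import product

import numpy as np
from scipy.cluster.hierarchy import linkage

m = np.random.rand(12, 5)
best_method, best_metric, best_score = None, None, -np.inf
for method, metric in product(["average", "ward"], ["euclidean", "cosine"]):
    with suppress(Exception):  # e.g. 'ward' rejects non-Euclidean metrics
        Z = linkage(m, method=method, metric=metric)
        score = -np.mean(Z[:, 2])  # placeholder, not the real silhouette criterion
        if score > best_score:
            best_method, best_metric, best_score = method, metric, score
print(best_method, best_metric)
```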
risk/neighborhoods/neighborhoods.py
CHANGED
@@ -3,21 +3,24 @@ risk/neighborhoods/neighborhoods
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 """
 
+import random
 import warnings
 from typing import Any, Dict, List, Tuple
 
 import networkx as nx
 import numpy as np
 from sklearn.exceptions import DataConversionWarning
+from sklearn.metrics.pairwise import cosine_similarity
 
 from risk.neighborhoods.community import (
-    calculate_dijkstra_neighborhoods,
+    calculate_greedy_modularity_neighborhoods,
     calculate_label_propagation_neighborhoods,
     calculate_louvain_neighborhoods,
     calculate_markov_clustering_neighborhoods,
     calculate_spinglass_neighborhoods,
     calculate_walktrap_neighborhoods,
 )
+from risk.log import logger
 
 # Suppress DataConversionWarning
 warnings.filterwarnings(action="ignore", category=DataConversionWarning)
@@ -25,7 +28,7 @@ warnings.filterwarnings(action="ignore", category=DataConversionWarning)
 
 def get_network_neighborhoods(
     network: nx.Graph,
-    distance_metric: str = "
+    distance_metric: str = "louvain",
     edge_length_threshold: float = 1.0,
     louvain_resolution: float = 1.0,
     random_seed: int = 888,
@@ -34,8 +37,8 @@ def get_network_neighborhoods(
 
     Args:
         network (nx.Graph): The network graph.
-        distance_metric (str): The distance metric to use ('
-            '
+        distance_metric (str): The distance metric to use ('greedy_modularity', 'louvain', 'label_propagation',
+            'markov_clustering', 'walktrap', 'spinglass').
         edge_length_threshold (float): The edge length threshold for the neighborhoods.
         louvain_resolution (float, optional): Resolution parameter for the Louvain method. Defaults to 1.0.
         random_seed (int, optional): Random seed for methods requiring random initialization. Defaults to 888.
@@ -43,12 +46,19 @@ def get_network_neighborhoods(
     Returns:
         np.ndarray: Neighborhood matrix calculated based on the selected distance metric.
     """
-
+    # Set random seed for reproducibility in all methods besides Louvain, which requires a separate seed
+    random.seed(random_seed)
+    np.random.seed(random_seed)
+
+    # Create a subgraph based on the edge length percentile threshold
+    network = _create_percentile_limited_subgraph(
+        network, edge_length_percentile=edge_length_threshold
+    )
 
-    if distance_metric == "dijkstra":
-        return calculate_dijkstra_neighborhoods(network)
     if distance_metric == "louvain":
         return calculate_louvain_neighborhoods(network, louvain_resolution, random_seed=random_seed)
+    if distance_metric == "greedy_modularity":
+        return calculate_greedy_modularity_neighborhoods(network)
     if distance_metric == "label_propagation":
         return calculate_label_propagation_neighborhoods(network)
     if distance_metric == "markov_clustering":
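The new preamble seeds both Python's and NumPy's global RNGs before dispatching to a community-detection method, so stochastic metrics (for example Markov clustering or spinglass) give repeatable neighborhoods. A small sketch of the idea; the helper name and seed value are illustrative, not from the package:

```python
# Sketch: seeding the global RNGs once makes downstream stochastic
# community detection reproducible across runs.
import random

import numpy as np

def seed_everything(seed: int = 888) -> None:  # hypothetical helper
    random.seed(seed)     # Python's built-in RNG
    np.random.seed(seed)  # NumPy's legacy global RNG

seed_everything()
print(random.random(), np.random.rand())  # identical on every run
```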
@@ -59,41 +69,51 @@ def get_network_neighborhoods(
         return calculate_spinglass_neighborhoods(network)
 
     raise ValueError(
-        "Incorrect distance metric specified. Please choose from '
+        "Incorrect distance metric specified. Please choose from 'greedy_modularity', 'louvain',"
         "'label_propagation', 'markov_clustering', 'walktrap', 'spinglass'."
     )
 
 
 def _create_percentile_limited_subgraph(G: nx.Graph, edge_length_percentile: float) -> nx.Graph:
-    """
-
+    """Create a subgraph containing all nodes and edges where the edge length is below the
+    specified percentile of all edge lengths in the input graph.
 
     Args:
-        G (nx.Graph): The input graph.
-        edge_length_percentile (float): The percentile
+        G (nx.Graph): The input graph with 'length' attributes on edges.
+        edge_length_percentile (float): The percentile (between 0 and 1) to filter edges by length.
 
     Returns:
-        nx.Graph: A subgraph with all nodes and edges
+        nx.Graph: A subgraph with all nodes and edges where the edge length is below the
+            calculated threshold length.
     """
-    # Extract edge lengths
+    # Extract edge lengths and handle missing lengths
     edge_lengths = [d["length"] for _, _, d in G.edges(data=True) if "length" in d]
+    if not edge_lengths:
+        raise ValueError(
+            "No edge lengths found in the graph. Ensure edges have 'length' attributes."
+        )
+
     # Calculate the specific edge length for the given percentile
     percentile_length = np.percentile(edge_lengths, edge_length_percentile * 100)
-    # Create
+    # Create the subgraph by directly filtering edges during iteration
     subgraph = nx.Graph()
-    subgraph.add_nodes_from(G.nodes(data=True))
-    # Add edges
+    subgraph.add_nodes_from(G.nodes(data=True))  # Retain all nodes from the original graph
+    # Add edges below the specified percentile length in a single pass
     for u, v, d in G.edges(data=True):
         if d.get("length", 1) <= percentile_length:
             subgraph.add_edge(u, v, **d)
 
+    # Return the subgraph; optionally check if it's too sparse
+    if subgraph.number_of_edges() == 0:
+        raise Warning("The resulting subgraph has no edges. Consider adjusting the percentile.")
+
     return subgraph
 
 
 def process_neighborhoods(
     network: nx.Graph,
     neighborhoods: Dict[str, Any],
-    impute_depth: int =
+    impute_depth: int = 0,
     prune_threshold: float = 0.0,
 ) -> Dict[str, Any]:
     """Process neighborhoods based on the imputation and pruning settings.
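`_create_percentile_limited_subgraph` keeps every node but only the shortest edges, with the cutoff taken as a percentile of the edges' 'length' attributes. A runnable toy example of the same filter; the graph and the 0.5 threshold are invented:

```python
# Toy illustration of the percentile-based edge filter above.
import networkx as nx
import numpy as np

G = nx.Graph()
G.add_edge("a", "b", length=1.0)
G.add_edge("a", "c", length=2.0)
G.add_edge("b", "c", length=5.0)

lengths = [d["length"] for _, _, d in G.edges(data=True) if "length" in d]
cutoff = np.percentile(lengths, 0.5 * 100)  # keep roughly the shortest half

H = nx.Graph()
H.add_nodes_from(G.nodes(data=True))  # all nodes survive the filter
for u, v, d in G.edges(data=True):
    if d.get("length", 1) <= cutoff:
        H.add_edge(u, v, **d)
print(sorted(H.edges()))  # [('a', 'b'), ('a', 'c')]
```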
@@ -101,7 +121,7 @@ def process_neighborhoods(
     Args:
         network (nx.Graph): The network data structure used for imputing and pruning neighbors.
         neighborhoods (dict): Dictionary containing 'enrichment_matrix', 'binary_enrichment_matrix', and 'significant_enrichment_matrix'.
-        impute_depth (int, optional): Depth for imputing neighbors. Defaults to
+        impute_depth (int, optional): Depth for imputing neighbors. Defaults to 0.
         prune_threshold (float, optional): Distance threshold for pruning neighbors. Defaults to 0.0.
 
     Returns:
@@ -110,7 +130,7 @@ def process_neighborhoods(
     enrichment_matrix = neighborhoods["enrichment_matrix"]
     binary_enrichment_matrix = neighborhoods["binary_enrichment_matrix"]
     significant_enrichment_matrix = neighborhoods["significant_enrichment_matrix"]
-
+    logger.debug(f"Imputation depth: {impute_depth}")
     if impute_depth:
         (
             enrichment_matrix,
@@ -123,7 +143,7 @@ def process_neighborhoods(
             max_depth=impute_depth,
         )
 
-
+    logger.debug(f"Pruning threshold: {prune_threshold}")
     if prune_threshold:
         (
             enrichment_matrix,
@@ -167,55 +187,134 @@ def _impute_neighbors(
         - np.ndarray: The imputed alpha threshold matrix.
         - np.ndarray: The significant enrichment matrix with non-significant entries set to zero.
     """
-    # Calculate
-
-
-
-
-
-
-
-
-                f"Failed to find neighbors for node '{node}': Ensure that the node exists in the network and that the binary enrichment matrix is correctly indexed."
-            ) from e
-
-        # Calculate the shortest distance to a neighbor
-        if neighbors:
-            shortest_distance = min([_get_euclidean_distance(node, n, network) for n in neighbors])
-            shortest_distances.append(shortest_distance)
+    # Calculate the distance threshold value based on the shortest distances
+    enrichment_matrix, binary_enrichment_matrix = _impute_neighbors_with_similarity(
+        network, enrichment_matrix, binary_enrichment_matrix, max_depth=max_depth
+    )
+    # Create a matrix where non-significant entries are set to zero
+    significant_enrichment_matrix = np.where(binary_enrichment_matrix == 1, enrichment_matrix, 0)
+
+    return enrichment_matrix, binary_enrichment_matrix, significant_enrichment_matrix
+
 
+def _impute_neighbors_with_similarity(
+    network: nx.Graph,
+    enrichment_matrix: np.ndarray,
+    binary_enrichment_matrix: np.ndarray,
+    max_depth: int = 3,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Impute non-enriched nodes based on the closest enriched neighbors' profiles and their similarity.
+
+    Args:
+        network (nx.Graph): The network graph with nodes having IDs matching the matrix indices.
+        enrichment_matrix (np.ndarray): The enrichment matrix with rows to be imputed.
+        binary_enrichment_matrix (np.ndarray): The alpha threshold matrix to be imputed similarly.
+        max_depth (int): Maximum depth of nodes to traverse for imputing values.
+
+    Returns:
+        Tuple[np.ndarray, np.ndarray]: A tuple containing:
+            - The imputed enrichment matrix.
+            - The imputed alpha threshold matrix.
+    """
     depth = 1
     rows_to_impute = np.where(binary_enrichment_matrix.sum(axis=1) == 0)[0]
     while len(rows_to_impute) and depth <= max_depth:
-
-        for row_index in
-
-
-
-                for n in neighbors
-                if n != row_index
-                and binary_enrichment_matrix[n].sum() != 0
-                and enrichment_matrix[n].sum() != 0
-            ]
-            if valid_neighbors:
-                closest_neighbor = min(
-                    valid_neighbors, key=lambda n: _get_euclidean_distance(row_index, n, network)
+        # Iterate over all enriched nodes
+        for row_index in range(binary_enrichment_matrix.shape[0]):
+            if binary_enrichment_matrix[row_index].sum() != 0:
+                enrichment_matrix, binary_enrichment_matrix = _process_node_imputation(
+                    row_index, network, enrichment_matrix, binary_enrichment_matrix, depth
                 )
-                # Impute the row with the closest valid neighbor's data
-                enrichment_matrix[row_index] = enrichment_matrix[closest_neighbor]
-                binary_enrichment_matrix[row_index] = binary_enrichment_matrix[
-                    closest_neighbor
-                ] / np.sqrt(depth + 1)
-            else:
-                next_rows_to_impute.append(row_index)
 
-
+        # Update rows to impute for the next iteration
+        rows_to_impute = np.where(binary_enrichment_matrix.sum(axis=1) == 0)[0]
         depth += 1
 
-
-    significant_enrichment_matrix = np.where(binary_enrichment_matrix == 1, enrichment_matrix, 0)
+    return enrichment_matrix, binary_enrichment_matrix
 
-
+
+def _process_node_imputation(
+    row_index: int,
+    network: nx.Graph,
+    enrichment_matrix: np.ndarray,
+    binary_enrichment_matrix: np.ndarray,
+    depth: int,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Process the imputation for a single node based on its enriched neighbors.
+
+    Args:
+        row_index (int): The index of the enriched node being processed.
+        network (nx.Graph): The network graph with nodes having IDs matching the matrix indices.
+        enrichment_matrix (np.ndarray): The enrichment matrix with rows to be imputed.
+        binary_enrichment_matrix (np.ndarray): The alpha threshold matrix to be imputed similarly.
+        depth (int): Current depth for traversal.
+
+    Returns:
+        Tuple[np.ndarray, np.ndarray]: The modified enrichment matrix and binary threshold matrix.
+    """
+    # Check neighbors at the current depth
+    neighbors = nx.single_source_shortest_path_length(network, row_index, cutoff=depth)
+    # Filter annotated neighbors (already enriched)
+    annotated_neighbors = [
+        n
+        for n in neighbors
+        if n != row_index
+        and binary_enrichment_matrix[n].sum() != 0
+        and enrichment_matrix[n].sum() != 0
+    ]
+    # Filter non-enriched neighbors
+    valid_neighbors = [
+        n
+        for n in neighbors
+        if n != row_index
+        and binary_enrichment_matrix[n].sum() == 0
+        and enrichment_matrix[n].sum() == 0
+    ]
+    # If there are valid non-enriched neighbors
+    if valid_neighbors and annotated_neighbors:
+        # Calculate distances to annotated neighbors
+        distances_to_annotated = [
+            _get_euclidean_distance(row_index, n, network) for n in annotated_neighbors
+        ]
+        # Calculate the IQR to identify outliers
+        q1, q3 = np.percentile(distances_to_annotated, [25, 75])
+        iqr = q3 - q1
+        lower_bound = q1 - 1.5 * iqr
+        upper_bound = q3 + 1.5 * iqr
+        # Filter valid non-enriched neighbors that fall within the IQR bounds
+        valid_neighbors_within_iqr = [
+            n
+            for n in valid_neighbors
+            if lower_bound <= _get_euclidean_distance(row_index, n, network) <= upper_bound
+        ]
+        # If there are any valid neighbors within the IQR
+        if valid_neighbors_within_iqr:
+            # If more than one valid neighbor is within the IQR, compute pairwise cosine similarities
+            if len(valid_neighbors_within_iqr) > 1:
+                # Find the most similar neighbor based on pairwise cosine similarities
+                def sum_pairwise_cosine_similarities(neighbor):
+                    return sum(
+                        cosine_similarity(
+                            enrichment_matrix[neighbor].reshape(1, -1),
+                            enrichment_matrix[other_neighbor].reshape(1, -1),
+                        )[0][0]
+                        for other_neighbor in valid_neighbors_within_iqr
+                        if other_neighbor != neighbor
+                    )
+
+                most_similar_neighbor = max(
+                    valid_neighbors_within_iqr, key=sum_pairwise_cosine_similarities
+                )
+            else:
+                most_similar_neighbor = valid_neighbors_within_iqr[0]
+
+            # Impute the most similar non-enriched neighbor with the enriched node's data, scaled by depth
+            enrichment_matrix[most_similar_neighbor] = enrichment_matrix[row_index] / np.sqrt(
+                depth + 1
+            )
+            binary_enrichment_matrix[most_similar_neighbor] = binary_enrichment_matrix[row_index]
+
+    return enrichment_matrix, binary_enrichment_matrix
 
 
 def _prune_neighbors(
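The new `_process_node_imputation` combines two selection criteria: candidates must lie within 1.5*IQR of the distances to annotated neighbors, and among those, the row with the highest summed pairwise cosine similarity wins. A compact sketch of both steps with invented numbers and profiles:

```python
# Minimal sketch of the IQR filter plus cosine-similarity pick used above.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

distances = np.array([1.0, 1.2, 1.1, 0.9, 4.0])  # distances to annotated neighbors
q1, q3 = np.percentile(distances, [25, 75])
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
print(distances[(distances >= lower) & (distances <= upper)])  # 4.0 is dropped as an outlier

profiles = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0]])  # made-up candidate enrichment rows

def summed_similarity(i):
    # Sum of cosine similarities between candidate i and every other candidate
    return sum(
        cosine_similarity(profiles[i].reshape(1, -1), profiles[j].reshape(1, -1))[0][0]
        for j in range(len(profiles))
        if j != i
    )

most_similar = max(range(len(profiles)), key=summed_similarity)
print(most_similar)  # index of the most "central" candidate profile
```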
@@ -240,27 +339,27 @@ def _prune_neighbors(
     """
     # Identify indices with non-zero rows in the binary enrichment matrix
    non_zero_indices = np.where(binary_enrichment_matrix.sum(axis=1) != 0)[0]
-
+    median_distances = []
     for node in non_zero_indices:
         neighbors = [n for n in network.neighbors(node) if binary_enrichment_matrix[n].sum() != 0]
         if neighbors:
-
+            median_distance = np.median(
                 [_get_euclidean_distance(node, n, network) for n in neighbors]
             )
-
+            median_distances.append(median_distance)
 
     # Calculate the distance threshold value based on rank
-    distance_threshold_value = _calculate_threshold(
+    distance_threshold_value = _calculate_threshold(median_distances, 1 - distance_threshold)
     # Prune nodes that are outliers based on the distance threshold
     for row_index in non_zero_indices:
         neighbors = [
             n for n in network.neighbors(row_index) if binary_enrichment_matrix[n].sum() != 0
         ]
         if neighbors:
-
+            median_distance = np.median(
                 [_get_euclidean_distance(row_index, n, network) for n in neighbors]
             )
-            if
+            if median_distance >= distance_threshold_value:
                 enrichment_matrix[row_index] = 0
                 binary_enrichment_matrix[row_index] = 0
 
@@ -305,18 +404,18 @@ def _get_node_position(network: nx.Graph, node: Any) -> np.ndarray:
     )
 
 
-def _calculate_threshold(
-    """Calculate the distance threshold based on the given
+def _calculate_threshold(median_distances: List, distance_threshold: float) -> float:
+    """Calculate the distance threshold based on the given median distances and a percentile threshold.
 
     Args:
-
+        median_distances (list): An array of median distances.
         distance_threshold (float): A percentile threshold (0 to 1) used to determine the distance cutoff.
 
     Returns:
         float: The calculated distance threshold value.
     """
-    # Sort the
-    sorted_distances = np.sort(
+    # Sort the median distances
+    sorted_distances = np.sort(median_distances)
     # Compute the rank percentiles for the sorted distances
     rank_percentiles = np.linspace(0, 1, len(sorted_distances))
     # Interpolating the ranks to 1000 evenly spaced percentiles
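The hunk ends mid-function, but the visible lines sort the median distances and map them to rank percentiles before, per the final comment, interpolating onto 1000 evenly spaced percentiles. A sketch of how such a rank-percentile cutoff can be computed with `np.interp`; everything past the visible comment is an assumption, not the package's confirmed implementation:

```python
# Hedged sketch of a rank-percentile distance threshold (continuation assumed).
import numpy as np

median_distances = [0.5, 0.8, 1.1, 2.0, 3.5]  # invented example values
distance_threshold = 0.75  # keep distances below the 75th rank percentile

sorted_distances = np.sort(median_distances)
rank_percentiles = np.linspace(0, 1, len(sorted_distances))
# Interpolate the ranks onto 1000 evenly spaced percentiles, then read off
# the distance value at the requested percentile.
smoothed_percentiles = np.linspace(0, 1, 1000)
smoothed_distances = np.interp(smoothed_percentiles, rank_percentiles, sorted_distances)
threshold_value = np.interp(distance_threshold, smoothed_percentiles, smoothed_distances)
print(threshold_value)
```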