risk-network 0.0.7b4__tar.gz → 0.0.7b6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/PKG-INFO +1 -1
  2. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/__init__.py +1 -1
  3. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/annotations/annotations.py +40 -25
  4. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/neighborhoods/community.py +25 -36
  5. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/neighborhoods/neighborhoods.py +35 -17
  6. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/network/plot.py +5 -1
  7. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/risk.py +49 -55
  8. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/stats/__init__.py +2 -2
  9. risk_network-0.0.7b6/risk/stats/hypergeom.py +56 -0
  10. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/stats/permutation/permutation.py +7 -3
  11. risk_network-0.0.7b6/risk/stats/poisson.py +47 -0
  12. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk_network.egg-info/PKG-INFO +1 -1
  13. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk_network.egg-info/SOURCES.txt +1 -1
  14. risk_network-0.0.7b4/risk/stats/fisher_exact.py +0 -132
  15. risk_network-0.0.7b4/risk/stats/hypergeom.py +0 -131
  16. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/LICENSE +0 -0
  17. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/MANIFEST.in +0 -0
  18. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/README.md +0 -0
  19. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/pyproject.toml +0 -0
  20. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/annotations/__init__.py +0 -0
  21. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/annotations/io.py +0 -0
  22. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/constants.py +0 -0
  23. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/log/__init__.py +0 -0
  24. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/log/console.py +0 -0
  25. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/log/params.py +0 -0
  26. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/neighborhoods/__init__.py +0 -0
  27. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/neighborhoods/domains.py +0 -0
  28. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/network/__init__.py +0 -0
  29. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/network/geometry.py +0 -0
  30. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/network/graph.py +0 -0
  31. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/network/io.py +0 -0
  32. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/stats/permutation/__init__.py +0 -0
  33. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/stats/permutation/test_functions.py +0 -0
  34. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/stats/stats.py +0 -0
  35. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk_network.egg-info/dependency_links.txt +0 -0
  36. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk_network.egg-info/requires.txt +0 -0
  37. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk_network.egg-info/top_level.txt +0 -0
  38. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/setup.cfg +0 -0
  39. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/setup.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: risk-network
- Version: 0.0.7b4
+ Version: 0.0.7b6
  Summary: A Python package for biological network analysis
  Author: Ira Horecka
  Author-email: Ira Horecka <ira89@icloud.com>
@@ -7,4 +7,4 @@ RISK: RISK Infers Spatial Kinships

  from risk.risk import RISK

- __version__ = "0.0.7-beta.4"
+ __version__ = "0.0.7-beta.6"
@@ -39,7 +39,7 @@ def load_annotations(network: nx.Graph, annotations_input: Dict[str, Any]) -> Di
  annotations_input (dict): A dictionary with annotations.

  Returns:
- dict: A dictionary containing ordered nodes, ordered annotations, and the annotations matrix.
+ dict: A dictionary containing ordered nodes, ordered annotations, and the binary annotations matrix.
  """
  # Flatten the dictionary to a list of tuples for easier DataFrame creation
  flattened_annotations = [
@@ -66,7 +66,8 @@ def load_annotations(network: nx.Graph, annotations_input: Dict[str, Any]) -> Di
  # Extract ordered nodes and annotations
  ordered_nodes = tuple(annotations_pivot.index)
  ordered_annotations = tuple(annotations_pivot.columns)
- annotations_pivot_numpy = annotations_pivot.fillna(0).to_numpy()
+ # Convert the annotations_pivot matrix to a numpy array and ensure it's binary
+ annotations_pivot_numpy = (annotations_pivot.fillna(0).to_numpy() > 0).astype(int)

  return {
  "ordered_nodes": ordered_nodes,
@@ -163,8 +164,8 @@ def define_top_annotations(


  def get_description(words_column: pd.Series) -> str:
- """Process input Series to identify and return the top N frequent, significant words,
- filtering based on stopwords and similarity (Jaccard index).
+ """Process input Series to identify and return the top frequent, significant words,
+ filtering based on stopwords and gracefully handling numerical strings.

  Args:
  words_column (pd.Series): A pandas Series containing strings to process.
@@ -172,19 +173,30 @@ def get_description(words_column: pd.Series) -> str:
  Returns:
  str: A coherent description formed from the most frequent and significant words.
  """
- # Define stopwords
- stop_words = set(stopwords.words("english"))
- # Tokenize the concatenated string and filter out stopwords and non-alphabetic words
+ # Concatenate all rows into a single string and tokenize into words
+ all_words = words_column.str.cat(sep=" ")
+ tokens = word_tokenize(all_words)
+
+ # Check if all tokens are numeric strings or contain a mixture of strings and numbers
+ numeric_tokens = [token for token in tokens if token.replace(".", "", 1).isdigit()]
+ non_numeric_tokens = [token for token in tokens if not token.replace(".", "", 1).isdigit()]
+ # If there's only one unique numeric value, return it directly as a string
+ unique_numeric_values = set(numeric_tokens)
+ if len(unique_numeric_values) == 1:
+ return f"{list(unique_numeric_values)[0]}"
+
+ # Allow the inclusion of both alphabetic and numeric tokens if mixture is detected
  words = [
  (
  word.lower() if word.istitle() else word
  ) # Lowercase all words except proper nouns (e.g., RNA, mRNA)
- for word in word_tokenize(words_column.str.cat(sep=" "))
- if word.isalpha() and word.lower() not in stop_words
+ for word in tokens
+ if word.isalpha()
+ or word.replace(".", "", 1).isdigit() # Keep alphabetic words and numeric strings
  ]
- # Simplify the word list to remove similar words based on the Jaccard index and generate coherent description
- simplified_words = _simplify_word_list(words, threshold=0.90)
- description = _generate_coherent_description(simplified_words)
+ # Generate a coherent description from the processed words
+ description = _generate_coherent_description(words)
+
  return description

@@ -242,25 +254,28 @@ def _calculate_jaccard_index(set1: Set[Any], set2: Set[Any]) -> float:


  def _generate_coherent_description(words: List[str]) -> str:
- """Generate a coherent description from a list of words.
+ """Generate a coherent description from a list of words or numerical string values.
+ If there is only one unique entry, return it directly.

  Args:
- words (list of str): A list of words from which to generate the description.
+ words (list): A list of words or numerical string values.

  Returns:
  str: A coherent description formed by arranging the words in a logical sequence.
  """
- # Count the frequency of each word
+ # If there are no words or the input is invalid, raise an error
+ if not words or not isinstance(words, list) or not all(isinstance(word, str) for word in words):
+ raise ValueError("Input must be a list of strings.")
+
+ # If there's only one unique word, return it directly (even if it's a number-like string)
+ unique_words = set(words)
+ if len(unique_words) == 1:
+ return list(unique_words)[0]
+
+ # Count the frequency of each word and sort them by frequency
  word_counts = Counter(words)
- # Get the most common words
  most_common_words = [word for word, _ in word_counts.most_common()]
- # Filter out common stopwords
- stop_words = set(stopwords.words("english"))
- filtered_words = [word for word in most_common_words if word.lower() not in stop_words]
- # Generate permutations of the filtered words to find a logical order
- perm = permutations(filtered_words)
- # Assume the first permutation as the logical sequence (since they're all equally likely without additional context)
- logical_sequence = next(perm)
- # Join the words to form a coherent description
- description = " ".join(logical_sequence)
+ # Join the most common words to form a coherent description based on frequency
+ description = " ".join(most_common_words)
+
  return description
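The two changes above replace the stopword/permutation logic with simple frequency ordering and add handling for numeric strings. A small, self-contained sketch of the numeric-token check (token values are invented for illustration):

tokens = ["ribosome", "60S", "2.5", "2.5"]
# Same test used in get_description: treat "2.5"-style strings as numeric.
numeric_tokens = [t for t in tokens if t.replace(".", "", 1).isdigit()]
non_numeric_tokens = [t for t in tokens if not t.replace(".", "", 1).isdigit()]
print(set(numeric_tokens))   # {'2.5'} -> a single unique numeric value is returned as-is
print(non_numeric_tokens)    # ['ribosome', '60S'] -> kept alongside numeric strings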
@@ -7,32 +7,29 @@ import community as community_louvain
  import networkx as nx
  import numpy as np
  import markov_clustering as mc
- from networkx.algorithms.community import asyn_lpa_communities
+ from networkx.algorithms.community import asyn_lpa_communities, greedy_modularity_communities


- def calculate_dijkstra_neighborhoods(network: nx.Graph) -> np.ndarray:
- """Calculate neighborhoods using Dijkstra's shortest path distances.
+ def calculate_greedy_modularity_neighborhoods(network: nx.Graph) -> np.ndarray:
+ """Calculate neighborhoods using the Greedy Modularity method.

  Args:
- network (nx.Graph): The network graph.
+ network (nx.Graph): The network graph to analyze for community structure.

  Returns:
- np.ndarray: Neighborhood matrix based on Dijkstra's distances.
+ np.ndarray: A binary neighborhood matrix where nodes in the same community have 1, and others have 0.
  """
- # Compute Dijkstra's distance for all pairs of nodes in the network
- all_dijkstra_paths = dict(nx.all_pairs_dijkstra_path_length(network, weight="length"))
+ # Detect communities using the Greedy Modularity method
+ communities = greedy_modularity_communities(network)
+ # Create a mapping from node to community
+ community_dict = {node: idx for idx, community in enumerate(communities) for node in community}
+ # Create a binary neighborhood matrix
  neighborhoods = np.zeros((network.number_of_nodes(), network.number_of_nodes()), dtype=int)
-
- # Populate the neighborhoods matrix based on Dijkstra's distances
- for source, targets in all_dijkstra_paths.items():
- max_length = max(targets.values()) if targets else 1 # Handle cases with no targets
- for target, length in targets.items():
- if np.isnan(length):
- neighborhoods[source, target] = max_length # Use max distance for NaN
- elif length == 0:
- neighborhoods[source, target] = 1 # Assign 1 for zero-length paths (self-loops)
- else:
- neighborhoods[source, target] = 1 / length # Inverse of the distance
+ node_index = {node: i for i, node in enumerate(network.nodes())}
+ for node_i, community_i in community_dict.items():
+ for node_j, community_j in community_dict.items():
+ if community_i == community_j:
+ neighborhoods[node_index[node_i], node_index[node_j]] = 1

  return neighborhoods
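As a quick check of the new greedy-modularity path, the same node-to-community mapping can be exercised on a toy graph (the graph below is invented; the exact community split may vary with the networkx version):

import networkx as nx
import numpy as np
from networkx.algorithms.community import greedy_modularity_communities

# Two triangles joined by a single bridge edge.
G = nx.Graph([(0, 1), (1, 2), (0, 2), (3, 4), (4, 5), (3, 5), (2, 3)])
communities = greedy_modularity_communities(G)
community_dict = {node: idx for idx, community in enumerate(communities) for node in community}
node_index = {node: i for i, node in enumerate(G.nodes())}
neighborhoods = np.zeros((G.number_of_nodes(), G.number_of_nodes()), dtype=int)
for u, cu in community_dict.items():
    for v, cv in community_dict.items():
        if cu == cv:
            neighborhoods[node_index[u], node_index[v]] = 1
print(neighborhoods.sum())  # typically 18: two communities of three nodes each (3*3 + 3*3)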
 
@@ -44,21 +41,19 @@ def calculate_label_propagation_neighborhoods(network: nx.Graph) -> np.ndarray:
  network (nx.Graph): The network graph.

  Returns:
- np.ndarray: Neighborhood matrix based on Label Propagation.
+ np.ndarray: Binary neighborhood matrix on Label Propagation.
  """
  # Apply Label Propagation
  communities = nx.algorithms.community.label_propagation.label_propagation_communities(network)
-
  # Create a mapping from node to community
  community_dict = {}
  for community_id, community in enumerate(communities):
  for node in community:
  community_dict[node] = community_id

- # Create a neighborhood matrix
+ # Create a binary neighborhood matrix
  num_nodes = network.number_of_nodes()
  neighborhoods = np.zeros((num_nodes, num_nodes), dtype=int)
-
  # Assign neighborhoods based on community labels
  for node_i, community_i in community_dict.items():
  for node_j, community_j in community_dict.items():
@@ -79,14 +74,14 @@ def calculate_louvain_neighborhoods(
  random_seed (int, optional): Random seed for reproducibility. Defaults to 888.

  Returns:
- np.ndarray: Neighborhood matrix based on the Louvain method.
+ np.ndarray: Binary neighborhood matrix on the Louvain method.
  """
  # Apply Louvain method to partition the network
  partition = community_louvain.best_partition(
  network, resolution=resolution, random_state=random_seed
  )
+ # Create a binary neighborhood matrix
  neighborhoods = np.zeros((network.number_of_nodes(), network.number_of_nodes()), dtype=int)
-
  # Assign neighborhoods based on community partitions
  for node_i, community_i in partition.items():
  for node_j, community_j in partition.items():
@@ -103,7 +98,7 @@ def calculate_markov_clustering_neighborhoods(network: nx.Graph) -> np.ndarray:
  network (nx.Graph): The network graph.

  Returns:
- np.ndarray: Neighborhood matrix based on Markov Clustering.
+ np.ndarray: Binary neighborhood matrix on Markov Clustering.
  """
  # Convert the graph to an adjacency matrix
  adjacency_matrix = nx.to_numpy_array(network)
@@ -111,17 +106,15 @@ def calculate_markov_clustering_neighborhoods(network: nx.Graph) -> np.ndarray:
  result = mc.run_mcl(adjacency_matrix) # Run MCL with default parameters
  # Get clusters
  clusters = mc.get_clusters(result)
-
  # Create a community label for each node
  community_dict = {}
  for community_id, community in enumerate(clusters):
  for node in community:
  community_dict[node] = community_id

- # Create a neighborhood matrix
+ # Create a binary neighborhood matrix
  num_nodes = network.number_of_nodes()
  neighborhoods = np.zeros((num_nodes, num_nodes), dtype=int)
-
  # Assign neighborhoods based on community labels
  for node_i, community_i in community_dict.items():
  for node_j, community_j in community_dict.items():
@@ -138,21 +131,19 @@ def calculate_spinglass_neighborhoods(network: nx.Graph) -> np.ndarray:
  network (nx.Graph): The network graph.

  Returns:
- np.ndarray: Neighborhood matrix based on Spin Glass communities.
+ np.ndarray: Binary neighborhood matrix on Spin Glass communities.
  """
  # Use the asynchronous label propagation algorithm as a proxy for Spin Glass
  communities = asyn_lpa_communities(network)
-
  # Create a community label for each node
  community_dict = {}
  for community_id, community in enumerate(communities):
  for node in community:
  community_dict[node] = community_id

- # Create a neighborhood matrix
+ # Create a binary neighborhood matrix
  num_nodes = network.number_of_nodes()
  neighborhoods = np.zeros((num_nodes, num_nodes), dtype=int)
-
  # Assign neighborhoods based on community labels
  for node_i, community_i in community_dict.items():
  for node_j, community_j in community_dict.items():
@@ -169,21 +160,19 @@ def calculate_walktrap_neighborhoods(network: nx.Graph) -> np.ndarray:
  network (nx.Graph): The network graph.

  Returns:
- np.ndarray: Neighborhood matrix based on Walktrap communities.
+ np.ndarray: Binary neighborhood matrix on Walktrap communities.
  """
  # Use the asynchronous label propagation algorithm as a proxy for Walktrap
  communities = asyn_lpa_communities(network)
-
  # Create a community label for each node
  community_dict = {}
  for community_id, community in enumerate(communities):
  for node in community:
  community_dict[node] = community_id

- # Create a neighborhood matrix
+ # Create a binary neighborhood matrix
  num_nodes = network.number_of_nodes()
  neighborhoods = np.zeros((num_nodes, num_nodes), dtype=int)
-
  # Assign neighborhoods based on community labels
  for node_i, community_i in community_dict.items():
  for node_j, community_j in community_dict.items():
@@ -3,6 +3,7 @@ risk/neighborhoods/neighborhoods
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  """

+ import random
  import warnings
  from typing import Any, Dict, List, Tuple

@@ -12,7 +13,7 @@ from sklearn.exceptions import DataConversionWarning
  from sklearn.metrics.pairwise import cosine_similarity

  from risk.neighborhoods.community import (
- calculate_dijkstra_neighborhoods,
+ calculate_greedy_modularity_neighborhoods,
  calculate_label_propagation_neighborhoods,
  calculate_louvain_neighborhoods,
  calculate_markov_clustering_neighborhoods,
@@ -26,7 +27,7 @@ warnings.filterwarnings(action="ignore", category=DataConversionWarning)

  def get_network_neighborhoods(
  network: nx.Graph,
- distance_metric: str = "dijkstra",
+ distance_metric: str = "louvain",
  edge_length_threshold: float = 1.0,
  louvain_resolution: float = 1.0,
  random_seed: int = 888,
@@ -35,8 +36,8 @@ def get_network_neighborhoods(

  Args:
  network (nx.Graph): The network graph.
- distance_metric (str): The distance metric to use ('euclidean', 'dijkstra', 'louvain', 'affinity_propagation',
- 'label_propagation', 'markov_clustering', 'walktrap', 'spinglass').
+ distance_metric (str): The distance metric to use ('greedy_modularity', 'louvain', 'label_propagation',
+ 'markov_clustering', 'walktrap', 'spinglass').
  edge_length_threshold (float): The edge length threshold for the neighborhoods.
  louvain_resolution (float, optional): Resolution parameter for the Louvain method. Defaults to 1.0.
  random_seed (int, optional): Random seed for methods requiring random initialization. Defaults to 888.
@@ -44,10 +45,17 @@ def get_network_neighborhoods(
  Returns:
  np.ndarray: Neighborhood matrix calculated based on the selected distance metric.
  """
- network = _create_percentile_limited_subgraph(network, edge_length_threshold)
+ # Set random seed for reproducibility in all methods besides Louvain, which requires a separate seed
+ random.seed(random_seed)
+ np.random.seed(random_seed)

- if distance_metric == "dijkstra":
- return calculate_dijkstra_neighborhoods(network)
+ # Create a subgraph based on the edge length percentile threshold
+ network = _create_percentile_limited_subgraph(
+ network, edge_length_percentile=edge_length_threshold
+ )
+
+ if distance_metric == "greedy_modularity":
+ return calculate_greedy_modularity_neighborhoods(network)
  if distance_metric == "louvain":
  return calculate_louvain_neighborhoods(network, louvain_resolution, random_seed=random_seed)
  if distance_metric == "label_propagation":
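The seeding added above is the usual way to make the non-Louvain methods repeatable; a minimal illustration (the seed value simply mirrors the package default):

import random
import numpy as np

random_seed = 888
random.seed(random_seed)     # Python's global RNG, used by the label-propagation-based methods
np.random.seed(random_seed)  # NumPy's global RNG, for any NumPy-based randomness
print(random.random(), np.random.rand())  # same values on every run with the same seed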
@@ -60,34 +68,44 @@ def get_network_neighborhoods(
  return calculate_spinglass_neighborhoods(network)

  raise ValueError(
- "Incorrect distance metric specified. Please choose from 'dijkstra', 'louvain',"
+ "Incorrect distance metric specified. Please choose from 'greedy_modularity', 'louvain',"
  "'label_propagation', 'markov_clustering', 'walktrap', 'spinglass'."
  )


  def _create_percentile_limited_subgraph(G: nx.Graph, edge_length_percentile: float) -> nx.Graph:
- """Calculate the edge length corresponding to the given percentile of edge lengths in the graph
- and create a subgraph with all nodes and edges below this length.
+ """Create a subgraph containing all nodes and edges where the edge length is below the
+ specified percentile of all edge lengths in the input graph.

  Args:
- G (nx.Graph): The input graph.
- edge_length_percentile (float): The percentile to calculate (between 0 and 1).
+ G (nx.Graph): The input graph with 'length' attributes on edges.
+ edge_length_percentile (float): The percentile (between 0 and 1) to filter edges by length.

  Returns:
- nx.Graph: A subgraph with all nodes and edges below the edge length corresponding to the given percentile.
+ nx.Graph: A subgraph with all nodes and edges where the edge length is below the
+ calculated threshold length.
  """
- # Extract edge lengths from the graph
+ # Extract edge lengths and handle missing lengths
  edge_lengths = [d["length"] for _, _, d in G.edges(data=True) if "length" in d]
+ if not edge_lengths:
+ raise ValueError(
+ "No edge lengths found in the graph. Ensure edges have 'length' attributes."
+ )
+
  # Calculate the specific edge length for the given percentile
  percentile_length = np.percentile(edge_lengths, edge_length_percentile * 100)
- # Create a new graph with all nodes from the original graph
+ # Create the subgraph by directly filtering edges during iteration
  subgraph = nx.Graph()
- subgraph.add_nodes_from(G.nodes(data=True))
- # Add edges to the subgraph if they are below the specified percentile length
+ subgraph.add_nodes_from(G.nodes(data=True)) # Retain all nodes from the original graph
+ # Add edges below the specified percentile length in a single pass
  for u, v, d in G.edges(data=True):
  if d.get("length", 1) <= percentile_length:
  subgraph.add_edge(u, v, **d)

+ # Return the subgraph; optionally check if it's too sparse
+ if subgraph.number_of_edges() == 0:
+ raise Warning("The resulting subgraph has no edges. Consider adjusting the percentile.")
+
  return subgraph
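To see how the percentile filter behaves, here is a small self-contained sketch mirroring the logic above (graph and edge lengths are made up):

import networkx as nx
import numpy as np

G = nx.Graph()
G.add_edge("a", "b", length=1.0)
G.add_edge("b", "c", length=2.0)
G.add_edge("c", "d", length=10.0)

edge_length_percentile = 0.5  # keep edges at or below the median length
edge_lengths = [d["length"] for _, _, d in G.edges(data=True) if "length" in d]
percentile_length = np.percentile(edge_lengths, edge_length_percentile * 100)

subgraph = nx.Graph()
subgraph.add_nodes_from(G.nodes(data=True))  # all nodes retained
for u, v, d in G.edges(data=True):
    if d.get("length", 1) <= percentile_length:
        subgraph.add_edge(u, v, **d)

print(percentile_length, sorted(subgraph.edges()))  # 2.0 [('a', 'b'), ('b', 'c')]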
 
@@ -9,6 +9,7 @@ import matplotlib.colors as mcolors
  import matplotlib.pyplot as plt
  import networkx as nx
  import numpy as np
+ import pandas as pd
  from scipy.ndimage import label
  from scipy.stats import gaussian_kde

@@ -601,7 +602,7 @@ class NetworkPlotter:
  min_words (int, optional): Minimum number of words required to display a label. Defaults to 1.
  max_word_length (int, optional): Maximum number of characters in a word to display. Defaults to 20.
  min_word_length (int, optional): Minimum number of characters in a word to display. Defaults to 1.
- words_to_omit (List, optional): List of words to omit from the labels. Defaults to None.
+ words_to_omit (list, optional): List of words to omit from the labels. Defaults to None.
  overlay_ids (bool, optional): Whether to overlay domain IDs in the center of the centroids. Defaults to False.
  ids_to_keep (list, tuple, np.ndarray, or None, optional): IDs of domains that must be labeled. To discover domain IDs,
  you can set `overlay_ids=True`. Defaults to None.
@@ -710,6 +711,9 @@ class NetworkPlotter:
  # Process remaining domains to fill in additional labels, if there are slots left
  if remaining_labels and remaining_labels > 0:
  for idx, (domain, centroid) in enumerate(domain_centroids.items()):
+ # Check if the domain is NaN and continue if true
+ if pd.isna(domain) or (isinstance(domain, float) and np.isnan(domain)):
+ continue # Skip NaN domains
  if ids_to_keep and domain in ids_to_keep:
  continue # Skip domains already handled by ids_to_keep

@@ -20,9 +20,9 @@ from risk.neighborhoods import (
  from risk.network import NetworkIO, NetworkGraph, NetworkPlotter
  from risk.stats import (
  calculate_significance_matrices,
- compute_fisher_exact_test,
  compute_hypergeom_test,
  compute_permutation_test,
+ compute_poisson_test,
  )


@@ -45,48 +45,39 @@ class RISK(NetworkIO, AnnotationsIO):
  """Access the logged parameters."""
  return params

- def load_neighborhoods_by_permutation(
+ def load_neighborhoods_by_hypergeom(
  self,
  network: nx.Graph,
  annotations: Dict[str, Any],
- distance_metric: str = "dijkstra",
+ distance_metric: str = "louvain",
  louvain_resolution: float = 0.1,
  edge_length_threshold: float = 0.5,
- score_metric: str = "sum",
  null_distribution: str = "network",
- num_permutations: int = 1000,
  random_seed: int = 888,
- max_workers: int = 1,
  ) -> Dict[str, Any]:
- """Load significant neighborhoods for the network using the permutation test.
+ """Load significant neighborhoods for the network using the hypergeometric test.

  Args:
  network (nx.Graph): The network graph.
  annotations (dict): The annotations associated with the network.
- distance_metric (str, optional): Distance metric for neighborhood analysis. Defaults to "dijkstra".
+ distance_metric (str, optional): Distance metric for neighborhood analysis. Defaults to "louvain".
  louvain_resolution (float, optional): Resolution parameter for Louvain clustering. Defaults to 0.1.
  edge_length_threshold (float, optional): Edge length threshold for neighborhood analysis. Defaults to 0.5.
- score_metric (str, optional): Scoring metric for neighborhood significance. Defaults to "sum".
- null_distribution (str, optional): Distribution used for permutation tests. Defaults to "network".
- num_permutations (int, optional): Number of permutations for significance testing. Defaults to 1000.
+ null_distribution (str, optional): Type of null distribution ('network' or 'annotations'). Defaults to "network".
  random_seed (int, optional): Seed for random number generation. Defaults to 888.
- max_workers (int, optional): Maximum number of workers for parallel computation. Defaults to 1.

  Returns:
  dict: Computed significance of neighborhoods.
  """
- print_header("Running permutation test")
+ print_header("Running hypergeometric test")
  # Log neighborhood analysis parameters
  params.log_neighborhoods(
  distance_metric=distance_metric,
  louvain_resolution=louvain_resolution,
  edge_length_threshold=edge_length_threshold,
- statistical_test_function="permutation",
- score_metric=score_metric,
+ statistical_test_function="hypergeom",
  null_distribution=null_distribution,
- num_permutations=num_permutations,
  random_seed=random_seed,
- max_workers=max_workers,
  )

  # Load neighborhoods based on the network and distance metric
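A usage sketch of the renamed hypergeometric loader with its new defaults; only the call is shown, since building the network and annotations inputs goes through the package's NetworkIO/AnnotationsIO helpers, which are unchanged in this diff:

def run_hypergeom_analysis(risk, network, annotations):
    """Sketch only: mirrors the new load_neighborhoods_by_hypergeom signature."""
    return risk.load_neighborhoods_by_hypergeom(
        network,
        annotations,
        distance_metric="louvain",    # new default; "greedy_modularity" is also accepted
        edge_length_threshold=0.5,
        null_distribution="network",  # or "annotations"
        random_seed=888,
    )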
@@ -97,59 +88,49 @@ class RISK(NetworkIO, AnnotationsIO):
  edge_length_threshold=edge_length_threshold,
  random_seed=random_seed,
  )
-
- # Log and display permutation test settings
- print(f"Neighborhood scoring metric: '{score_metric}'")
- print(f"Null distribution: '{null_distribution}'")
- print(f"Number of permutations: {num_permutations}")
- print(f"Maximum workers: {max_workers}")
- # Run permutation test to compute neighborhood significance
- neighborhood_significance = compute_permutation_test(
+ # Run hypergeometric test to compute neighborhood significance
+ neighborhood_significance = compute_hypergeom_test(
  neighborhoods=neighborhoods,
  annotations=annotations["matrix"],
- score_metric=score_metric,
  null_distribution=null_distribution,
- num_permutations=num_permutations,
- random_seed=random_seed,
- max_workers=max_workers,
  )

  # Return the computed neighborhood significance
  return neighborhood_significance

- def load_neighborhoods_by_fisher_exact(
+ def load_neighborhoods_by_poisson(
  self,
  network: nx.Graph,
  annotations: Dict[str, Any],
- distance_metric: str = "dijkstra",
+ distance_metric: str = "louvain",
  louvain_resolution: float = 0.1,
  edge_length_threshold: float = 0.5,
+ null_distribution: str = "network",
  random_seed: int = 888,
- max_workers: int = 1,
  ) -> Dict[str, Any]:
- """Load significant neighborhoods for the network using the Fisher's exact test.
+ """Load significant neighborhoods for the network using the Poisson test.

  Args:
  network (nx.Graph): The network graph.
  annotations (dict): The annotations associated with the network.
- distance_metric (str, optional): Distance metric for neighborhood analysis. Defaults to "dijkstra".
+ distance_metric (str, optional): Distance metric for neighborhood analysis. Defaults to "louvain".
  louvain_resolution (float, optional): Resolution parameter for Louvain clustering. Defaults to 0.1.
  edge_length_threshold (float, optional): Edge length threshold for neighborhood analysis. Defaults to 0.5.
+ null_distribution (str, optional): Type of null distribution ('network' or 'annotations'). Defaults to "network".
  random_seed (int, optional): Seed for random number generation. Defaults to 888.
- max_workers (int, optional): Maximum number of workers for parallel computation. Defaults to 1.

  Returns:
  dict: Computed significance of neighborhoods.
  """
- print_header("Running Fisher's exact test")
+ print_header("Running Poisson test")
  # Log neighborhood analysis parameters
  params.log_neighborhoods(
  distance_metric=distance_metric,
  louvain_resolution=louvain_resolution,
  edge_length_threshold=edge_length_threshold,
- statistical_test_function="fisher_exact",
+ statistical_test_function="poisson",
+ null_distribution=null_distribution,
  random_seed=random_seed,
- max_workers=max_workers,
  )

  # Load neighborhoods based on the network and distance metric
@@ -160,50 +141,56 @@ class RISK(NetworkIO, AnnotationsIO):
  edge_length_threshold=edge_length_threshold,
  random_seed=random_seed,
  )
-
- # Log and display Fisher's exact test settings
- print(f"Maximum workers: {max_workers}")
- # Run Fisher's exact test to compute neighborhood significance
- neighborhood_significance = compute_fisher_exact_test(
+ # Run Poisson test to compute neighborhood significance
+ neighborhood_significance = compute_poisson_test(
  neighborhoods=neighborhoods,
  annotations=annotations["matrix"],
- max_workers=max_workers,
+ null_distribution=null_distribution,
  )

  # Return the computed neighborhood significance
  return neighborhood_significance

- def load_neighborhoods_by_hypergeom(
+ def load_neighborhoods_by_permutation(
  self,
  network: nx.Graph,
  annotations: Dict[str, Any],
- distance_metric: str = "dijkstra",
+ distance_metric: str = "louvain",
  louvain_resolution: float = 0.1,
  edge_length_threshold: float = 0.5,
+ score_metric: str = "sum",
+ null_distribution: str = "network",
+ num_permutations: int = 1000,
  random_seed: int = 888,
  max_workers: int = 1,
  ) -> Dict[str, Any]:
- """Load significant neighborhoods for the network using the hypergeometric test.
+ """Load significant neighborhoods for the network using the permutation test.

  Args:
  network (nx.Graph): The network graph.
  annotations (dict): The annotations associated with the network.
- distance_metric (str, optional): Distance metric for neighborhood analysis. Defaults to "dijkstra".
+ distance_metric (str, optional): Distance metric for neighborhood analysis. Defaults to "louvain".
  louvain_resolution (float, optional): Resolution parameter for Louvain clustering. Defaults to 0.1.
  edge_length_threshold (float, optional): Edge length threshold for neighborhood analysis. Defaults to 0.5.
+ score_metric (str, optional): Scoring metric for neighborhood significance. Defaults to "sum".
+ null_distribution (str, optional): Type of null distribution ('network' or 'annotations'). Defaults to "network".
+ num_permutations (int, optional): Number of permutations for significance testing. Defaults to 1000.
  random_seed (int, optional): Seed for random number generation. Defaults to 888.
  max_workers (int, optional): Maximum number of workers for parallel computation. Defaults to 1.

  Returns:
  dict: Computed significance of neighborhoods.
  """
- print_header("Running hypergeometric test")
+ print_header("Running permutation test")
  # Log neighborhood analysis parameters
  params.log_neighborhoods(
  distance_metric=distance_metric,
  louvain_resolution=louvain_resolution,
  edge_length_threshold=edge_length_threshold,
- statistical_test_function="hypergeom",
+ statistical_test_function="permutation",
+ score_metric=score_metric,
+ null_distribution=null_distribution,
+ num_permutations=num_permutations,
  random_seed=random_seed,
  max_workers=max_workers,
  )
@@ -217,12 +204,19 @@ class RISK(NetworkIO, AnnotationsIO):
  random_seed=random_seed,
  )

- # Log and display hypergeometric test settings
+ # Log and display permutation test settings
+ print(f"Neighborhood scoring metric: '{score_metric}'")
+ print(f"Null distribution: '{null_distribution}'")
+ print(f"Number of permutations: {num_permutations}")
  print(f"Maximum workers: {max_workers}")
- # Run hypergeometric test to compute neighborhood significance
- neighborhood_significance = compute_hypergeom_test(
+ # Run permutation test to compute neighborhood significance
+ neighborhood_significance = compute_permutation_test(
  neighborhoods=neighborhoods,
  annotations=annotations["matrix"],
+ score_metric=score_metric,
+ null_distribution=null_distribution,
+ num_permutations=num_permutations,
+ random_seed=random_seed,
  max_workers=max_workers,
  )

@@ -380,7 +374,7 @@ class RISK(NetworkIO, AnnotationsIO):
  def _load_neighborhoods(
  self,
  network: nx.Graph,
- distance_metric: str = "dijkstra",
+ distance_metric: str = "louvain",
  louvain_resolution: float = 0.1,
  edge_length_threshold: float = 0.5,
  random_seed: int = 888,
@@ -390,7 +384,7 @@ class RISK(NetworkIO, AnnotationsIO):
  Args:
  network (nx.Graph): The network graph.
  annotations (pd.DataFrame): The matrix of annotations associated with the network.
- distance_metric (str, optional): Distance metric for neighborhood analysis. Defaults to "dijkstra".
+ distance_metric (str, optional): Distance metric for neighborhood analysis. Defaults to "louvain".
  louvain_resolution (float, optional): Resolution parameter for Louvain clustering. Defaults to 0.1.
  edge_length_threshold (float, optional): Edge length threshold for neighborhood analysis. Defaults to 0.5.
  random_seed (int, optional): Seed for random number generation. Defaults to 888.
@@ -3,7 +3,7 @@ risk/stats
  ~~~~~~~~~~
  """

- from .stats import calculate_significance_matrices
- from .fisher_exact import compute_fisher_exact_test
  from .hypergeom import compute_hypergeom_test
  from .permutation import compute_permutation_test
+ from .poisson import compute_poisson_test
+ from .stats import calculate_significance_matrices
@@ -0,0 +1,56 @@
+ """
+ risk/stats/hypergeom
+ ~~~~~~~~~~~~~~~~~~~~
+ """
+
+ from typing import Any, Dict
+
+ import numpy as np
+ from scipy.stats import hypergeom
+
+
+ def compute_hypergeom_test(
+ neighborhoods: np.ndarray, annotations: np.ndarray, null_distribution: str = "network"
+ ) -> Dict[str, Any]:
+ """Compute hypergeometric test for enrichment and depletion in neighborhoods with selectable null distribution.
+
+ Args:
+ neighborhoods (np.ndarray): Binary matrix representing neighborhoods.
+ annotations (np.ndarray): Binary matrix representing annotations.
+ null_distribution (str, optional): Type of null distribution ('network' or 'annotations'). Defaults to "network".
+
+ Returns:
+ dict: Dictionary containing depletion and enrichment p-values.
+ """
+ # Ensure both matrices are binary (presence/absence)
+ neighborhoods = (neighborhoods > 0).astype(int)
+ annotations = (annotations > 0).astype(int)
+ total_node_count = neighborhoods.shape[0]
+
+ if null_distribution == "network":
+ # Case 1: Use all nodes as the background
+ background_population = total_node_count
+ neighborhood_sums = np.sum(neighborhoods, axis=0, keepdims=True).T
+ annotation_sums = np.sum(annotations, axis=0, keepdims=True)
+ elif null_distribution == "annotations":
+ # Case 2: Only consider nodes with at least one annotation
+ annotated_nodes = np.sum(annotations, axis=1) > 0
+ background_population = np.sum(annotated_nodes)
+ neighborhood_sums = np.sum(neighborhoods[annotated_nodes], axis=0, keepdims=True).T
+ annotation_sums = np.sum(annotations[annotated_nodes], axis=0, keepdims=True)
+ else:
+ raise ValueError(
+ "Invalid null_distribution value. Choose either 'network' or 'annotations'."
+ )
+
+ # Matrix multiplication for annotated nodes in each neighborhood
+ annotated_in_neighborhood = neighborhoods.T @ annotations
+ # Calculate depletion and enrichment p-values using the hypergeometric distribution
+ depletion_pvals = hypergeom.cdf(
+ annotated_in_neighborhood, background_population, annotation_sums, neighborhood_sums
+ )
+ enrichment_pvals = hypergeom.sf(
+ annotated_in_neighborhood - 1, background_population, annotation_sums, neighborhood_sums
+ )
+
+ return {"depletion_pvals": depletion_pvals, "enrichment_pvals": enrichment_pvals}
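A tiny, illustrative call to the new vectorized test (assumes risk-network 0.0.7b6 is installed; the matrices below are invented):

import numpy as np
from risk.stats import compute_hypergeom_test

# 4 nodes; columns of neighborhoods are neighborhoods, columns of annotations are terms.
neighborhoods = np.array([[1, 0],
                          [1, 0],
                          [0, 1],
                          [0, 1]])
annotations = np.array([[1, 0],
                        [1, 0],
                        [0, 1],
                        [0, 0]])
result = compute_hypergeom_test(neighborhoods, annotations, null_distribution="network")
print(result["enrichment_pvals"].shape)  # (2, 2): one p-value per neighborhood/term pair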
@@ -28,7 +28,7 @@ def compute_permutation_test(
  neighborhoods (np.ndarray): Binary matrix representing neighborhoods.
  annotations (np.ndarray): Binary matrix representing annotations.
  score_metric (str, optional): Metric to use for scoring ('sum', 'mean', etc.). Defaults to "sum".
- null_distribution (str, optional): Type of null distribution ('network' or other). Defaults to "network".
+ null_distribution (str, optional): Type of null distribution ('network' or 'annotations'). Defaults to "network".
  num_permutations (int, optional): Number of permutations to run. Defaults to 1000.
  random_seed (int, optional): Seed for random number generation. Defaults to 888.
  max_workers (int, optional): Number of workers for multiprocessing. Defaults to 1.
@@ -78,7 +78,7 @@ def _run_permutation_test(
  neighborhoods (np.ndarray): The neighborhood matrix.
  annotations (np.ndarray): The annotation matrix.
  neighborhood_score_func (Callable): Function to calculate neighborhood scores.
- null_distribution (str, optional): Type of null distribution. Defaults to "network".
+ null_distribution (str, optional): Type of null distribution ('network' or 'annotations'). Defaults to "network".
  num_permutations (int, optional): Number of permutations. Defaults to 1000.
  random_seed (int, optional): Seed for random number generation. Defaults to 888.
  max_workers (int, optional): Number of workers for multiprocessing. Defaults to 4.
@@ -91,8 +91,12 @@ def _run_permutation_test(
  # Determine the indices to use based on the null distribution type
  if null_distribution == "network":
  idxs = range(annotations.shape[0])
- else:
+ elif null_distribution == "annotations":
  idxs = np.nonzero(np.sum(~np.isnan(annotations), axis=1))[0]
+ else:
+ raise ValueError(
+ "Invalid null_distribution value. Choose either 'network' or 'annotations'."
+ )

  # Replace NaNs with zeros in the annotations matrix
  annotations[np.isnan(annotations)] = 0
@@ -0,0 +1,47 @@
+ """
+ risk/stats/poisson
+ ~~~~~~~~~~~~~~~~~~
+ """
+
+ from typing import Dict, Any
+
+ import numpy as np
+ from scipy.stats import poisson
+
+
+ def compute_poisson_test(
+ neighborhoods: np.ndarray, annotations: np.ndarray, null_distribution: str = "network"
+ ) -> Dict[str, Any]:
+ """Compute Poisson test for enrichment and depletion in neighborhoods with selectable null distribution.
+
+ Args:
+ neighborhoods (np.ndarray): Binary matrix representing neighborhoods.
+ annotations (np.ndarray): Binary matrix representing annotations.
+ null_distribution (str, optional): Type of null distribution ('network' or 'annotations'). Defaults to "network".
+
+ Returns:
+ dict: Dictionary containing depletion and enrichment p-values.
+ """
+ # Ensure both matrices are binary (presence/absence)
+ neighborhoods = (neighborhoods > 0).astype(int)
+ annotations = (annotations > 0).astype(int)
+ # Matrix multiplication to get the number of annotated nodes in each neighborhood
+ annotated_in_neighborhood = neighborhoods @ annotations
+
+ # Compute lambda_expected based on the chosen null distribution
+ if null_distribution == "network":
+ # Use the mean across neighborhoods (axis=1)
+ lambda_expected = np.mean(annotated_in_neighborhood, axis=1, keepdims=True)
+ elif null_distribution == "annotations":
+ # Use the mean across annotations (axis=0)
+ lambda_expected = np.mean(annotated_in_neighborhood, axis=0, keepdims=True)
+ else:
+ raise ValueError(
+ "Invalid null_distribution value. Choose either 'network' or 'annotations'."
+ )
+
+ # Compute p-values for enrichment and depletion using Poisson distribution
+ enrichment_pvals = 1 - poisson.cdf(annotated_in_neighborhood - 1, lambda_expected)
+ depletion_pvals = poisson.cdf(annotated_in_neighborhood, lambda_expected)
+
+ return {"enrichment_pvals": enrichment_pvals, "depletion_pvals": depletion_pvals}
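The Poisson test can be exercised the same way (toy matrices again; in the package both inputs are the binary matrices produced upstream):

import numpy as np
from risk.stats import compute_poisson_test

neighborhoods = np.array([[1, 1, 0],
                          [1, 1, 0],
                          [0, 0, 1]])
annotations = np.array([[1, 0],
                        [1, 0],
                        [0, 1]])
result = compute_poisson_test(neighborhoods, annotations, null_distribution="network")
print(result["enrichment_pvals"].shape)  # (3, 2): rows follow the neighborhood matrix, columns the annotation terms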
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: risk-network
- Version: 0.0.7b4
+ Version: 0.0.7b6
  Summary: A Python package for biological network analysis
  Author: Ira Horecka
  Author-email: Ira Horecka <ira89@icloud.com>
@@ -22,8 +22,8 @@ risk/network/graph.py
  risk/network/io.py
  risk/network/plot.py
  risk/stats/__init__.py
- risk/stats/fisher_exact.py
  risk/stats/hypergeom.py
+ risk/stats/poisson.py
  risk/stats/stats.py
  risk/stats/permutation/__init__.py
  risk/stats/permutation/permutation.py
@@ -1,132 +0,0 @@
- """
- risk/stats/fisher_exact
- ~~~~~~~~~~~~~~~~~~~~~~~
- """
-
- from multiprocessing import get_context, Manager
- from tqdm import tqdm
- from typing import Any, Dict
-
- import numpy as np
- from scipy.stats import fisher_exact
-
-
- def compute_fisher_exact_test(
- neighborhoods: np.ndarray,
- annotations: np.ndarray,
- max_workers: int = 4,
- ) -> Dict[str, Any]:
- """Compute Fisher's exact test for enrichment and depletion in neighborhoods.
-
- Args:
- neighborhoods (np.ndarray): Binary matrix representing neighborhoods.
- annotations (np.ndarray): Binary matrix representing annotations.
- max_workers (int, optional): Number of workers for multiprocessing. Defaults to 4.
-
- Returns:
- dict: Dictionary containing depletion and enrichment p-values.
- """
- # Ensure that the matrices are binary (boolean) and free of NaN values
- neighborhoods = neighborhoods.astype(bool) # Convert to boolean
- annotations = annotations.astype(bool) # Convert to boolean
-
- # Initialize the process of calculating p-values using multiprocessing
- ctx = get_context("spawn")
- manager = Manager()
- progress_counter = manager.Value("i", 0)
- total_tasks = neighborhoods.shape[1] * annotations.shape[1]
-
- # Calculate the workload per worker
- chunk_size = total_tasks // max_workers
- remainder = total_tasks % max_workers
-
- # Execute the Fisher's exact test using multiprocessing
- with ctx.Pool(max_workers) as pool:
- with tqdm(total=total_tasks, desc="Total progress", position=0) as progress:
- params_list = []
- start_idx = 0
- for i in range(max_workers):
- end_idx = start_idx + chunk_size + (1 if i < remainder else 0)
- params_list.append(
- (neighborhoods, annotations, start_idx, end_idx, progress_counter)
- )
- start_idx = end_idx
-
- # Start the Fisher's exact test process in parallel
- results = pool.starmap_async(_fisher_exact_process_subset, params_list, chunksize=1)
-
- # Update progress bar based on progress_counter
- while not results.ready():
- progress.update(progress_counter.value - progress.n)
- results.wait(0.05) # Wait for 50ms
- # Ensure progress bar reaches 100%
- progress.update(total_tasks - progress.n)
-
- # Accumulate results from each worker
- depletion_pvals, enrichment_pvals = [], []
- for dp, ep in results.get():
- depletion_pvals.extend(dp)
- enrichment_pvals.extend(ep)
-
- # Reshape the results back into arrays with the appropriate dimensions
- depletion_pvals = np.array(depletion_pvals).reshape(
- neighborhoods.shape[1], annotations.shape[1]
- )
- enrichment_pvals = np.array(enrichment_pvals).reshape(
- neighborhoods.shape[1], annotations.shape[1]
- )
-
- return {
- "depletion_pvals": depletion_pvals,
- "enrichment_pvals": enrichment_pvals,
- }
-
-
- def _fisher_exact_process_subset(
- neighborhoods: np.ndarray,
- annotations: np.ndarray,
- start_idx: int,
- end_idx: int,
- progress_counter,
- ) -> tuple:
- """Process a subset of neighborhoods using Fisher's exact test.
-
- Args:
- neighborhoods (np.ndarray): The full neighborhood matrix.
- annotations (np.ndarray): The annotation matrix.
- start_idx (int): Starting index of the neighborhood-annotation pairs to process.
- end_idx (int): Ending index of the neighborhood-annotation pairs to process.
- progress_counter: Shared counter for tracking progress.
-
- Returns:
- tuple: Local p-values for depletion and enrichment.
- """
- # Initialize lists to store p-values for depletion and enrichment
- depletion_pvals = []
- enrichment_pvals = []
- # Process the subset of tasks assigned to this worker
- for idx in range(start_idx, end_idx):
- i = idx // annotations.shape[1] # Neighborhood index
- j = idx % annotations.shape[1] # Annotation index
-
- neighborhood = neighborhoods[:, i]
- annotation = annotations[:, j]
-
- # Calculate the contingency table values
- TP = np.sum(neighborhood & annotation)
- FP = np.sum(neighborhood & ~annotation)
- FN = np.sum(~neighborhood & annotation)
- TN = np.sum(~neighborhood & ~annotation)
- table = np.array([[TP, FP], [FN, TN]])
-
- # Perform Fisher's exact test for depletion (alternative='less')
- _, p_value_depletion = fisher_exact(table, alternative="less")
- depletion_pvals.append(p_value_depletion)
- # Perform Fisher's exact test for enrichment (alternative='greater')
- _, p_value_enrichment = fisher_exact(table, alternative="greater")
- enrichment_pvals.append(p_value_enrichment)
-
- # Update the shared progress counter
- progress_counter.value += 1
-
- return depletion_pvals, enrichment_pvals
@@ -1,131 +0,0 @@
- """
- risk/stats/hypergeom
- ~~~~~~~~~~~~~~~~~~~~
- """
-
- from multiprocessing import get_context, Manager
- from tqdm import tqdm
- from typing import Any, Dict
-
- import numpy as np
- from scipy.stats import hypergeom
-
-
- def compute_hypergeom_test(
- neighborhoods: np.ndarray,
- annotations: np.ndarray,
- max_workers: int = 4,
- ) -> Dict[str, Any]:
- """Compute hypergeometric test for enrichment and depletion in neighborhoods.
-
- Args:
- neighborhoods (np.ndarray): Binary matrix representing neighborhoods.
- annotations (np.ndarray): Binary matrix representing annotations.
- max_workers (int, optional): Number of workers for multiprocessing. Defaults to 4.
-
- Returns:
- dict: Dictionary containing depletion and enrichment p-values.
- """
- # Ensure that the matrices are binary (boolean) and free of NaN values
- neighborhoods = neighborhoods.astype(bool) # Convert to boolean
- annotations = annotations.astype(bool) # Convert to boolean
-
- # Initialize the process of calculating p-values using multiprocessing
- ctx = get_context("spawn")
- manager = Manager()
- progress_counter = manager.Value("i", 0)
- total_tasks = neighborhoods.shape[1] * annotations.shape[1]
-
- # Calculate the workload per worker
- chunk_size = total_tasks // max_workers
- remainder = total_tasks % max_workers
-
- # Execute the hypergeometric test using multiprocessing
- with ctx.Pool(max_workers) as pool:
- with tqdm(total=total_tasks, desc="Total progress", position=0) as progress:
- params_list = []
- start_idx = 0
- for i in range(max_workers):
- end_idx = start_idx + chunk_size + (1 if i < remainder else 0)
- params_list.append(
- (neighborhoods, annotations, start_idx, end_idx, progress_counter)
- )
- start_idx = end_idx
-
- # Start the hypergeometric test process in parallel
- results = pool.starmap_async(_hypergeom_process_subset, params_list, chunksize=1)
-
- # Update progress bar based on progress_counter
- while not results.ready():
- progress.update(progress_counter.value - progress.n)
- results.wait(0.05) # Wait for 50ms
- # Ensure progress bar reaches 100%
- progress.update(total_tasks - progress.n)
-
- # Accumulate results from each worker
- depletion_pvals, enrichment_pvals = [], []
- for dp, ep in results.get():
- depletion_pvals.extend(dp)
- enrichment_pvals.extend(ep)
-
- # Reshape the results back into arrays with the appropriate dimensions
- depletion_pvals = np.array(depletion_pvals).reshape(
- neighborhoods.shape[1], annotations.shape[1]
- )
- enrichment_pvals = np.array(enrichment_pvals).reshape(
- neighborhoods.shape[1], annotations.shape[1]
- )
-
- return {
- "depletion_pvals": depletion_pvals,
- "enrichment_pvals": enrichment_pvals,
- }
-
-
- def _hypergeom_process_subset(
- neighborhoods: np.ndarray,
- annotations: np.ndarray,
- start_idx: int,
- end_idx: int,
- progress_counter,
- ) -> tuple:
- """Process a subset of neighborhoods using the hypergeometric test.
-
- Args:
- neighborhoods (np.ndarray): The full neighborhood matrix.
- annotations (np.ndarray): The annotation matrix.
- start_idx (int): Starting index of the neighborhood-annotation pairs to process.
- end_idx (int): Ending index of the neighborhood-annotation pairs to process.
- progress_counter: Shared counter for tracking progress.
-
- Returns:
- tuple: Local p-values for depletion and enrichment.
- """
- # Initialize lists to store p-values for depletion and enrichment
- depletion_pvals = []
- enrichment_pvals = []
- # Process the subset of tasks assigned to this worker
- for idx in range(start_idx, end_idx):
- i = idx // annotations.shape[1] # Neighborhood index
- j = idx % annotations.shape[1] # Annotation index
-
- neighborhood = neighborhoods[:, i]
- annotation = annotations[:, j]
-
- # Calculate the required values for the hypergeometric test
- M = annotations.shape[0] # Total number of items (population size)
- n = np.sum(annotation) # Total number of successes in population
- N = np.sum(neighborhood) # Total number of draws (sample size)
- k = np.sum(neighborhood & annotation) # Number of successes in sample
-
- # Perform hypergeometric test for depletion
- p_value_depletion = hypergeom.cdf(k, M, n, N)
- depletion_pvals.append(p_value_depletion)
- # Perform hypergeometric test for enrichment
- p_value_enrichment = hypergeom.sf(k - 1, M, n, N)
- enrichment_pvals.append(p_value_enrichment)
-
- # Update the shared progress counter
- progress_counter.value += 1
-
- return depletion_pvals, enrichment_pvals