risk-network 0.0.10__py3-none-any.whl → 0.0.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. risk/__init__.py +1 -1
  2. risk/annotation/__init__.py +10 -0
  3. risk/{annotations/annotations.py → annotation/annotation.py} +62 -102
  4. risk/{annotations → annotation}/io.py +93 -92
  5. risk/annotation/nltk_setup.py +86 -0
  6. risk/log/__init__.py +1 -1
  7. risk/log/parameters.py +26 -27
  8. risk/neighborhoods/__init__.py +0 -1
  9. risk/neighborhoods/api.py +38 -38
  10. risk/neighborhoods/community.py +33 -4
  11. risk/neighborhoods/domains.py +26 -28
  12. risk/neighborhoods/neighborhoods.py +8 -2
  13. risk/neighborhoods/stats/__init__.py +13 -0
  14. risk/neighborhoods/stats/permutation/__init__.py +6 -0
  15. risk/{stats → neighborhoods/stats}/permutation/permutation.py +24 -21
  16. risk/{stats → neighborhoods/stats}/permutation/test_functions.py +5 -4
  17. risk/{stats/stat_tests.py → neighborhoods/stats/tests.py} +62 -54
  18. risk/network/__init__.py +0 -2
  19. risk/network/graph/__init__.py +0 -2
  20. risk/network/graph/api.py +19 -19
  21. risk/network/graph/graph.py +73 -68
  22. risk/{stats/significance.py → network/graph/stats.py} +2 -2
  23. risk/network/graph/summary.py +12 -13
  24. risk/network/io.py +163 -20
  25. risk/network/plotter/__init__.py +0 -2
  26. risk/network/plotter/api.py +1 -1
  27. risk/network/plotter/canvas.py +36 -36
  28. risk/network/plotter/contour.py +14 -15
  29. risk/network/plotter/labels.py +303 -294
  30. risk/network/plotter/network.py +6 -6
  31. risk/network/plotter/plotter.py +8 -10
  32. risk/network/plotter/utils/colors.py +15 -8
  33. risk/network/plotter/utils/layout.py +3 -3
  34. risk/risk.py +6 -7
  35. risk_network-0.0.12.dist-info/METADATA +122 -0
  36. risk_network-0.0.12.dist-info/RECORD +40 -0
  37. {risk_network-0.0.10.dist-info → risk_network-0.0.12.dist-info}/WHEEL +1 -1
  38. risk/annotations/__init__.py +0 -7
  39. risk/network/geometry.py +0 -150
  40. risk/stats/__init__.py +0 -15
  41. risk/stats/permutation/__init__.py +0 -6
  42. risk_network-0.0.10.dist-info/METADATA +0 -798
  43. risk_network-0.0.10.dist-info/RECORD +0 -40
  44. {risk_network-0.0.10.dist-info → risk_network-0.0.12.dist-info/licenses}/LICENSE +0 -0
  45. {risk_network-0.0.10.dist-info → risk_network-0.0.12.dist-info}/top_level.txt +0 -0
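The dominant change in this release is a set of module renames: risk/annotations becomes risk/annotation, and risk/stats moves under risk/neighborhoods/stats. A rough migration sketch for downstream imports follows (hypothetical usage; it assumes the removed risk.stats.permutation package exposed compute_permutation_test the same way its replacement does):

    # 0.0.10 import paths
    from risk.annotations import get_weighted_description
    from risk.stats.permutation import compute_permutation_test

    # 0.0.12 import paths
    from risk.annotation import get_weighted_description
    from risk.neighborhoods.stats.permutation import compute_permutation_test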
@@ -9,19 +9,18 @@ from typing import Tuple, Union
 import numpy as np
 import pandas as pd
 from numpy.linalg import LinAlgError
-from scipy.cluster.hierarchy import linkage, fcluster
+from scipy.cluster.hierarchy import fcluster, linkage
 from sklearn.metrics import silhouette_score
 from tqdm import tqdm
 
-from risk.annotations import get_weighted_description
+from risk.annotation import get_weighted_description
 from risk.log import logger
 
-
 # Define constants for clustering
 # fmt: off
 LINKAGE_METHODS = {"single", "complete", "average", "weighted", "centroid", "median", "ward"}
 LINKAGE_METRICS = {
-    "braycurtis","canberra", "chebyshev", "cityblock", "correlation", "cosine", "dice", "euclidean",
+    "braycurtis", "canberra", "chebyshev", "cityblock", "correlation", "cosine", "dice", "euclidean",
     "hamming", "jaccard", "jensenshannon", "kulczynski1", "mahalanobis", "matching", "minkowski",
     "rogerstanimoto", "russellrao", "seuclidean", "sokalmichener", "sokalsneath", "sqeuclidean", "yule",
 }
@@ -29,7 +28,7 @@ LINKAGE_METRICS = {
 
 
 def define_domains(
-    top_annotations: pd.DataFrame,
+    top_annotation: pd.DataFrame,
     significant_neighborhoods_significance: np.ndarray,
     linkage_criterion: str,
     linkage_method: str,
@@ -40,7 +39,7 @@ def define_domains(
     handling errors by assigning unique domains when clustering fails.
 
     Args:
-        top_annotations (pd.DataFrame): DataFrame of top annotations data for the network nodes.
+        top_annotation (pd.DataFrame): DataFrame of top annotations data for the network nodes.
         significant_neighborhoods_significance (np.ndarray): The binary significance matrix below alpha.
         linkage_criterion (str): The clustering criterion for defining groups. Choose "off" to disable clustering.
         linkage_method (str): The linkage method for clustering. Choose "auto" to optimize.
@@ -49,13 +48,16 @@
 
     Returns:
         pd.DataFrame: DataFrame with the primary domain for each node.
+
+    Raises:
+        ValueError: If the clustering criterion is set to "off" or if an error occurs during clustering.
     """
     try:
         if linkage_criterion == "off":
             raise ValueError("Clustering is turned off.")
 
         # Transpose the matrix to cluster annotations
-        m = significant_neighborhoods_significance[:, top_annotations["significant_annotations"]].T
+        m = significant_neighborhoods_significance[:, top_annotation["significant_annotation"]].T
         # Safeguard the matrix by replacing NaN, Inf, and -Inf values
         m = _safeguard_matrix(m)
         # Optimize silhouette score across different linkage methods and distance metrics
@@ -69,27 +71,23 @@
         )
         # Calculate the optimal threshold for clustering
         max_d_optimal = np.max(Z[:, 2]) * best_threshold
-        # Assign domains to the annotations matrix
+        # Assign domains to the annotation matrix
         domains = fcluster(Z, max_d_optimal, criterion=linkage_criterion)
-        top_annotations["domain"] = 0
-        top_annotations.loc[top_annotations["significant_annotations"], "domain"] = domains
+        top_annotation["domain"] = 0
+        top_annotation.loc[top_annotation["significant_annotation"], "domain"] = domains
     except (ValueError, LinAlgError):
         # If a ValueError is encountered, handle it by assigning unique domains
-        n_rows = len(top_annotations)
+        n_rows = len(top_annotation)
         if linkage_criterion == "off":
-            logger.warning(
-                f"Clustering is turned off. Skipping clustering and assigning {n_rows} unique domains."
-            )
+            logger.warning("Clustering is turned off. Skipping clustering.")
         else:
-            logger.error(
-                f"Error encountered. Skipping clustering and assigning {n_rows} unique domains."
-            )
-        top_annotations["domain"] = range(1, n_rows + 1)  # Assign unique domains
+            logger.error("Error encountered. Skipping clustering.")
+        top_annotation["domain"] = range(1, n_rows + 1)  # Assign unique domains
 
     # Create DataFrames to store domain information
     node_to_significance = pd.DataFrame(
         data=significant_neighborhoods_significance,
-        columns=[top_annotations.index.values, top_annotations["domain"]],
+        columns=[top_annotation.index.values, top_annotation["domain"]],
     )
     node_to_domain = node_to_significance.T.groupby(level="domain").sum().T
 
@@ -110,15 +108,15 @@
 
 def trim_domains(
     domains: pd.DataFrame,
-    top_annotations: pd.DataFrame,
+    top_annotation: pd.DataFrame,
     min_cluster_size: int = 5,
     max_cluster_size: int = 1000,
-) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+) -> Tuple[pd.DataFrame, pd.DataFrame]:
     """Trim domains that do not meet size criteria and find outliers.
 
     Args:
         domains (pd.DataFrame): DataFrame of domain data for the network nodes.
-        top_annotations (pd.DataFrame): DataFrame of top annotations data for the network nodes.
+        top_annotation (pd.DataFrame): DataFrame of top annotations data for the network nodes.
         min_cluster_size (int, optional): Minimum size of a cluster to be retained. Defaults to 5.
         max_cluster_size (int, optional): Maximum size of a cluster to be retained. Defaults to 1000.
 
@@ -137,21 +135,21 @@ def trim_domains(
     invalid_domain_id = 888888
     invalid_domain_ids = {0, invalid_domain_id}
     # Mark domains to be removed
-    top_annotations["domain"] = top_annotations["domain"].replace(to_remove, invalid_domain_id)
+    top_annotation["domain"] = top_annotation["domain"].replace(to_remove, invalid_domain_id)
     domains.loc[domains["primary_domain"].isin(to_remove), ["primary_domain"]] = invalid_domain_id
 
     # Normalize "num significant neighborhoods" by percentile for each domain and scale to 0-10
-    top_annotations["normalized_value"] = top_annotations.groupby("domain")[
+    top_annotation["normalized_value"] = top_annotation.groupby("domain")[
         "significant_neighborhood_significance_sums"
     ].transform(lambda x: (x.rank(pct=True) * 10).apply(np.ceil).astype(int))
     # Modify the lambda function to pass both full_terms and significant_significance_score
-    top_annotations["combined_terms"] = top_annotations.apply(
+    top_annotation["combined_terms"] = top_annotation.apply(
         lambda row: " ".join([str(row["full_terms"])] * row["normalized_value"]), axis=1
     )
 
     # Perform the groupby operation while retaining the other columns and adding the weighting with significance scores
     domain_labels = (
-        top_annotations.groupby("domain")
+        top_annotation.groupby("domain")
         .agg(
             full_terms=("full_terms", lambda x: list(x)),
             significance_scores=("significant_significance_score", lambda x: list(x)),
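Note on the trim_domains hunks above: the return annotation narrows from a three-element tuple of DataFrames to a two-element tuple, and the top_annotations parameter is renamed to top_annotation, so keyword-based callers and unpacking both change. A hedged caller sketch (variable names are placeholders, not taken from this diff):

    # 0.0.10: three DataFrames were unpacked
    # domains, trimmed_domains, extra_df = trim_domains(domains, top_annotations=top_annotations)

    # 0.0.12: two DataFrames are returned
    domains, trimmed_domains = trim_domains(domains, top_annotation=top_annotation)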
@@ -231,7 +229,7 @@ _optimize_silhouette_across_linkage_and_metrics(
     # Initialize best overall values
     best_overall_method = linkage_method
     best_overall_metric = linkage_metric
-    best_overall_threshold = linkage_threshold
+    best_overall_threshold = 0.0
     best_overall_score = -np.inf
 
     # Set linkage methods and metrics to all combinations if "auto" is selected
@@ -242,7 +240,7 @@
     # Evaluating optimal linkage method and metric
     for method, metric in tqdm(
         product(linkage_methods, linkage_metrics),
-        desc="Evaluating optimal linkage method and metric",
+        desc="Evaluating linkage methods and metrics",
         total=total_combinations,
         bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
     ):
@@ -13,6 +13,7 @@ from scipy.sparse import csr_matrix
 from sklearn.exceptions import DataConversionWarning
 from sklearn.metrics.pairwise import cosine_similarity
 
+from risk.log import logger
 from risk.neighborhoods.community import (
     calculate_greedy_modularity_neighborhoods,
     calculate_label_propagation_neighborhoods,
@@ -22,7 +23,6 @@ from risk.neighborhoods.community import (
     calculate_spinglass_neighborhoods,
     calculate_walktrap_neighborhoods,
 )
-from risk.log import logger
 
 # Suppress DataConversionWarning
 warnings.filterwarnings(action="ignore", category=DataConversionWarning)
@@ -48,6 +48,9 @@ def get_network_neighborhoods(
 
     Returns:
         csr_matrix: The combined neighborhood matrix.
+
+    Raises:
+        ValueError: If the number of distance metrics does not match the number of edge length thresholds.
     """
     # Set random seed for reproducibility
     random.seed(random_seed)
@@ -490,6 +493,9 @@ def _calculate_threshold(median_distances: List, distance_threshold: float) -> f
 
     Returns:
         float: The calculated distance threshold value.
+
+    Raises:
+        ValueError: If no significant annotation is found in the median distances.
     """
     # Sort the median distances
     sorted_distances = np.sort(median_distances)
@@ -500,7 +506,7 @@
     try:
         smoothed_distances = np.interp(interpolated_percentiles, rank_percentiles, sorted_distances)
     except ValueError as e:
-        raise ValueError("No significant annotations found.") from e
+        raise ValueError("No significant annotation found.") from e
 
     # Determine the index corresponding to the distance threshold
     threshold_index = int(np.ceil(distance_threshold * len(smoothed_distances))) - 1
@@ -0,0 +1,13 @@
+"""
+risk/neighborhoods/stats
+~~~~~~~~~~~~~~~~~~~~~~~~
+"""
+
+from risk.neighborhoods.stats.permutation import compute_permutation_test
+from risk.neighborhoods.stats.tests import (
+    compute_binom_test,
+    compute_chi2_test,
+    compute_hypergeom_test,
+    compute_poisson_test,
+    compute_zscore_test,
+)
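This new risk/neighborhoods/stats package gathers the statistical tests that previously lived in the removed risk/stats package into a single namespace. A minimal import sketch; the usage line is left commented out because the full test-function signatures are not shown in this diff:

    from risk.neighborhoods.stats import (
        compute_hypergeom_test,
        compute_permutation_test,
    )

    # Hypothetical usage, assuming sparse csr_matrix inputs as documented for
    # compute_permutation_test later in this diff:
    # results = compute_permutation_test(neighborhoods, annotation=annotation)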
@@ -0,0 +1,6 @@
+"""
+risk/neighborhoods/stats/permutation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+"""
+
+from risk.neighborhoods.stats.permutation.permutation import compute_permutation_test
@@ -1,9 +1,9 @@
 """
-risk/stats/permutation/permutation
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+risk/neighborhoods/stats/permutation/permutation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 """
 
-from multiprocessing import get_context, Manager
+from multiprocessing import Manager, get_context
 from multiprocessing.managers import ValueProxy
 from typing import Any, Callable, Dict, List, Tuple, Union
 
@@ -12,12 +12,12 @@ from scipy.sparse import csr_matrix
 from threadpoolctl import threadpool_limits
 from tqdm import tqdm
 
-from risk.stats.permutation.test_functions import DISPATCH_TEST_FUNCTIONS
+from risk.neighborhoods.stats.permutation.test_functions import DISPATCH_TEST_FUNCTIONS
 
 
 def compute_permutation_test(
     neighborhoods: csr_matrix,
-    annotations: csr_matrix,
+    annotation: csr_matrix,
     score_metric: str = "sum",
     null_distribution: str = "network",
     num_permutations: int = 1000,
@@ -28,9 +28,9 @@
 
     Args:
         neighborhoods (csr_matrix): Sparse binary matrix representing neighborhoods.
-        annotations (csr_matrix): Sparse binary matrix representing annotations.
+        annotation (csr_matrix): Sparse binary matrix representing annotation.
         score_metric (str, optional): Metric to use for scoring ('sum' or 'stdev'). Defaults to "sum".
-        null_distribution (str, optional): Type of null distribution ('network' or 'annotations'). Defaults to "network".
+        null_distribution (str, optional): Type of null distribution ('network' or 'annotation'). Defaults to "network".
         num_permutations (int, optional): Number of permutations to run. Defaults to 1000.
         random_seed (int, optional): Seed for random number generation. Defaults to 888.
         max_workers (int, optional): Number of workers for multiprocessing. Defaults to 1.
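Callers that pass keywords to compute_permutation_test need two updates: the annotations parameter is now annotation, and the 'annotations' null-distribution option is now 'annotation'. A hedged call sketch using only the parameters documented above (neighborhoods and annotation are placeholder csr_matrix variables):

    # 0.0.10
    # result = compute_permutation_test(
    #     neighborhoods, annotations=annotations, null_distribution="annotations"
    # )

    # 0.0.12
    result = compute_permutation_test(
        neighborhoods,
        annotation=annotation,
        score_metric="sum",
        null_distribution="annotation",
        num_permutations=1000,
        random_seed=888,
        max_workers=1,
    )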
@@ -41,14 +41,14 @@
     # Ensure that the matrices are in the correct format and free of NaN values
     # NOTE: Keep the data type as float32 to avoid locking issues with dot product operations
     neighborhoods = neighborhoods.astype(np.float32)
-    annotations = annotations.astype(np.float32)
+    annotation = annotation.astype(np.float32)
     # Retrieve the appropriate neighborhood score function based on the metric
     neighborhood_score_func = DISPATCH_TEST_FUNCTIONS[score_metric]
 
     # Run the permutation test to calculate depletion and enrichment counts
     counts_depletion, counts_enrichment = _run_permutation_test(
         neighborhoods=neighborhoods,
-        annotations=annotations,
+        annotation=annotation,
         neighborhood_score_func=neighborhood_score_func,
         null_distribution=null_distribution,
         num_permutations=num_permutations,
@@ -68,7 +68,7 @@
 
 def _run_permutation_test(
     neighborhoods: csr_matrix,
-    annotations: csr_matrix,
+    annotation: csr_matrix,
     neighborhood_score_func: Callable,
     null_distribution: str = "network",
     num_permutations: int = 1000,
@@ -79,31 +79,34 @@
 
     Args:
         neighborhoods (csr_matrix): Sparse binary matrix representing neighborhoods.
-        annotations (csr_matrix): Sparse binary matrix representing annotations.
+        annotation (csr_matrix): Sparse binary matrix representing annotation.
         neighborhood_score_func (Callable): Function to calculate neighborhood scores.
-        null_distribution (str, optional): Type of null distribution ('network' or 'annotations'). Defaults to "network".
+        null_distribution (str, optional): Type of null distribution ('network' or 'annotation'). Defaults to "network".
         num_permutations (int, optional): Number of permutations. Defaults to 1000.
         random_seed (int, optional): Seed for random number generation. Defaults to 888.
         max_workers (int, optional): Number of workers for multiprocessing. Defaults to 4.
 
     Returns:
         tuple: Depletion and enrichment counts.
+
+    Raises:
+        ValueError: If an invalid null_distribution value is provided.
     """
     # Initialize the RNG for reproducibility
     rng = np.random.default_rng(seed=random_seed)
     # Determine the indices to use based on the null distribution type
     if null_distribution == "network":
-        idxs = range(annotations.shape[0])
-    elif null_distribution == "annotations":
-        idxs = np.nonzero(annotations.getnnz(axis=1) > 0)[0]
+        idxs = range(annotation.shape[0])
+    elif null_distribution == "annotation":
+        idxs = np.nonzero(annotation.getnnz(axis=1) > 0)[0]
     else:
         raise ValueError(
-            "Invalid null_distribution value. Choose either 'network' or 'annotations'."
+            "Invalid null_distribution value. Choose either 'network' or 'annotation'."
        )
 
-    # Replace NaNs with zeros in the sparse annotations matrix
-    annotations.data[np.isnan(annotations.data)] = 0
-    annotation_matrix_obsv = annotations[idxs]
+    # Replace NaNs with zeros in the sparse annotation matrix
+    annotation.data[np.isnan(annotation.data)] = 0
+    annotation_matrix_obsv = annotation[idxs]
     neighborhoods_matrix_obsv = neighborhoods.T[idxs].T
     # Calculate observed neighborhood scores
     with np.errstate(invalid="ignore", divide="ignore"):
@@ -139,7 +142,7 @@
         params_list = [
             (
                 permutation_batches[i],  # Pass the batch of precomputed permutations
-                annotations,
+                annotation,
                 neighborhoods_matrix_obsv,
                 observed_neighborhood_scores,
                 neighborhood_score_func,
@@ -182,7 +185,7 @@ def _permutation_process_batch(
 
     Args:
         permutations (Union[List, Tuple, np.ndarray]): Permutation batch to process.
-        annotation_matrix (csr_matrix): Sparse binary matrix representing annotations.
+        annotation_matrix (csr_matrix): Sparse binary matrix representing annotation.
         neighborhoods_matrix_obsv (csr_matrix): Sparse binary matrix representing observed neighborhoods.
         observed_neighborhood_scores (np.ndarray): Observed neighborhood scores.
         neighborhood_score_func (Callable): Function to calculate neighborhood scores.
@@ -1,6 +1,6 @@
 """
-risk/stats/permutation/test_functions
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+risk/neighborhoods/stats/permutation/test_functions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 """
 
 import numpy as np
@@ -8,6 +8,7 @@ from scipy.sparse import csr_matrix
 
 # NOTE: Cython optimizations provided minimal performance benefits.
 # The final version with Cython is archived in the `cython_permutation` branch.
+
 # DISPATCH_TEST_FUNCTIONS can be found at the end of the file.
 
 
@@ -23,7 +24,7 @@ def compute_neighborhood_score_by_sum(
     Returns:
         np.ndarray: Dense array of summed attribute values for each neighborhood.
     """
-    # Calculate the neighborhood score as the dot product of neighborhoods and annotations
+    # Calculate the neighborhood score as the dot product of neighborhoods and annotation
     neighborhood_score = neighborhoods_matrix @ annotation_matrix  # Sparse matrix multiplication
     # Convert the result to a dense array for downstream calculations
     neighborhood_score_dense = neighborhood_score.toarray()
@@ -42,7 +43,7 @@
     Returns:
         np.ndarray: Standard deviation of the neighborhood scores.
     """
-    # Calculate the neighborhood score as the dot product of neighborhoods and annotations
+    # Calculate the neighborhood score as the dot product of neighborhoods and annotation
     neighborhood_score = neighborhoods_matrix @ annotation_matrix  # Sparse matrix multiplication
     # Calculate the number of elements in each neighborhood (sum of rows)
     N = neighborhoods_matrix.sum(axis=1).A.flatten()  # Convert to 1D array