PyPI - risk-network - Versions diffs - 0.0.4b2__py3-none-any.whl → 0.0.5b0__py3-none-any.whl - Mend

risk-network 0.0.4b2__py3-none-any.whl → 0.0.5b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

risk/__init__.py +2 -5
risk/annotations/annotations.py +1 -1
risk/neighborhoods/neighborhoods.py +5 -1
risk/network/geometry.py +2 -2
risk/network/io.py +45 -30
risk/network/plot.py +55 -10
risk/risk.py +171 -19
risk/stats/__init__.py +4 -1
risk/stats/fisher_exact.py +132 -0
risk/stats/hypergeom.py +131 -0
risk/stats/permutation/__init__.py +6 -0
risk/stats/permutation/permutation.py +212 -0
risk/stats/{permutation.py → permutation/test_functions.py} +12 -39
risk/stats/stats.py +1 -212
{risk_network-0.0.4b2.dist-info → risk_network-0.0.5b0.dist-info}/METADATA +2 -2
risk_network-0.0.5b0.dist-info/RECORD +30 -0
{risk_network-0.0.4b2.dist-info → risk_network-0.0.5b0.dist-info}/WHEEL +1 -1
risk_network-0.0.4b2.dist-info/RECORD +0 -26
{risk_network-0.0.4b2.dist-info → risk_network-0.0.5b0.dist-info}/LICENSE +0 -0
{risk_network-0.0.4b2.dist-info → risk_network-0.0.5b0.dist-info}/top_level.txt +0 -0

risk/stats/fisher_exact.py ADDED Viewed

@@ -0,0 +1,132 @@
+"""
+risk/stats/fisher_exact
+~~~~~~~~~~~~~~~~~~~~~~~
+"""
+from multiprocessing import get_context, Manager
+from tqdm import tqdm
+from typing import Any, Dict
+import numpy as np
+from scipy.stats import fisher_exact
+def compute_fisher_exact_test(
+    neighborhoods: np.ndarray,
+    annotations: np.ndarray,
+    max_workers: int = 4,
+) -> Dict[str, Any]:
+    """Compute Fisher's exact test for enrichment and depletion in neighborhoods.
+    Every (neighborhood column, annotation column) pair is tested. The pairs are
+    flattened into one index range, split into contiguous chunks, and each chunk is
+    evaluated by a worker in a "spawn"-context process pool.
+    Args:
+        neighborhoods (np.ndarray): Binary matrix representing neighborhoods.
+        annotations (np.ndarray): Binary matrix representing annotations.
+        max_workers (int, optional): Number of workers for multiprocessing. Values below 1
+            are treated as 1, and no more workers than tasks are started. Defaults to 4.
+    Returns:
+        dict: Dictionary with keys "depletion_pvals" and "enrichment_pvals", each an array
+            of shape (neighborhoods.shape[1], annotations.shape[1]).
+    """
+    # NaN casts to True under astype(bool); zero NaNs out first so that missing values
+    # are not silently counted as memberships/annotations
+    neighborhoods = np.nan_to_num(neighborhoods, nan=0.0).astype(bool)
+    annotations = np.nan_to_num(annotations, nan=0.0).astype(bool)
+    num_neighborhoods, num_annotations = neighborhoods.shape[1], annotations.shape[1]
+    total_tasks = num_neighborhoods * num_annotations
+    # Start at least one worker (avoids dividing by zero below) and never more workers
+    # than there are tasks (avoids spawning processes that would receive an empty range)
+    num_workers = max(1, min(max_workers, total_tasks))
+    chunk_size, remainder = divmod(total_tasks, num_workers)
+    ctx = get_context("spawn")
+    # Entering the manager as a context manager shuts its server process down on exit;
+    # a bare Manager() would leave that process running after this function returns
+    with Manager() as manager, ctx.Pool(num_workers) as pool:
+        progress_counter = manager.Value("i", 0)
+        # Build one contiguous [start_idx, end_idx) range per worker; the first
+        # `remainder` workers take one extra task so every task is covered exactly once
+        params_list = []
+        start_idx = 0
+        for i in range(num_workers):
+            end_idx = start_idx + chunk_size + (1 if i < remainder else 0)
+            params_list.append(
+                (neighborhoods, annotations, start_idx, end_idx, progress_counter)
+            )
+            start_idx = end_idx
+        with tqdm(total=total_tasks, desc="Total progress", position=0) as progress:
+            # Start the Fisher's exact test process in parallel
+            results = pool.starmap_async(_fisher_exact_process_subset, params_list, chunksize=1)
+            # Poll the shared counter to drive the progress bar while workers run
+            while not results.ready():
+                progress.update(progress_counter.value - progress.n)
+                results.wait(0.05)  # Wait for 50ms
+            # Ensure progress bar reaches 100%
+            progress.update(total_tasks - progress.n)
+            # starmap_async preserves the order of params_list, so concatenating the
+            # per-worker lists restores the flattened pair order
+            depletion_pvals, enrichment_pvals = [], []
+            for dp, ep in results.get():
+                depletion_pvals.extend(dp)
+                enrichment_pvals.extend(ep)
+    # Reshape the flat results back into (neighborhood, annotation) matrices
+    depletion_pvals = np.array(depletion_pvals).reshape(num_neighborhoods, num_annotations)
+    enrichment_pvals = np.array(enrichment_pvals).reshape(num_neighborhoods, num_annotations)
+    return {
+        "depletion_pvals": depletion_pvals,
+        "enrichment_pvals": enrichment_pvals,
+    }
+def _fisher_exact_process_subset(
+    neighborhoods: np.ndarray,
+    annotations: np.ndarray,
+    start_idx: int,
+    end_idx: int,
+    progress_counter,
+) -> tuple:
+    """Run Fisher's exact test on a contiguous range of neighborhood-annotation pairs.
+    A flat pair index maps to (neighborhood column, annotation column) by dividing by
+    the number of annotation columns.
+    Args:
+        neighborhoods (np.ndarray): The full neighborhood matrix.
+        annotations (np.ndarray): The annotation matrix.
+        start_idx (int): First flat pair index to process (inclusive).
+        end_idx (int): Last flat pair index to process (exclusive).
+        progress_counter: Shared counter; its `value` is incremented once per pair.
+    Returns:
+        tuple: Two lists (depletion p-values, enrichment p-values), one entry per pair.
+    """
+    num_annotations = annotations.shape[1]
+    depletion_pvals, enrichment_pvals = [], []
+    for pair_idx in range(start_idx, end_idx):
+        # Recover the column indices from the flat pair index
+        nb_col, ann_col = divmod(pair_idx, num_annotations)
+        in_neighborhood = neighborhoods[:, nb_col]
+        has_annotation = annotations[:, ann_col]
+        # 2x2 contingency table: rows = inside/outside neighborhood,
+        # columns = annotated/not annotated
+        inside_annotated = np.sum(in_neighborhood & has_annotation)
+        inside_plain = np.sum(in_neighborhood & ~has_annotation)
+        outside_annotated = np.sum(~in_neighborhood & has_annotation)
+        outside_plain = np.sum(~in_neighborhood & ~has_annotation)
+        contingency = np.array(
+            [[inside_annotated, inside_plain], [outside_annotated, outside_plain]]
+        )
+        # One-sided test for depletion (alternative='less')
+        _, depletion_p = fisher_exact(contingency, alternative="less")
+        depletion_pvals.append(depletion_p)
+        # One-sided test for enrichment (alternative='greater')
+        _, enrichment_p = fisher_exact(contingency, alternative="greater")
+        enrichment_pvals.append(enrichment_p)
+        # Report one finished pair to the parent's progress bar
+        progress_counter.value += 1
+    return depletion_pvals, enrichment_pvals

risk/stats/hypergeom.py ADDED Viewed

@@ -0,0 +1,131 @@
+"""
+risk/stats/hypergeom
+~~~~~~~~~~~~~~~~~~~~
+"""
+from multiprocessing import get_context, Manager
+from tqdm import tqdm
+from typing import Any, Dict
+import numpy as np
+from scipy.stats import hypergeom
+def compute_hypergeom_test(
+    neighborhoods: np.ndarray,
+    annotations: np.ndarray,
+    max_workers: int = 4,
+) -> Dict[str, Any]:
+    """Compute hypergeometric test for enrichment and depletion in neighborhoods.
+    Every (neighborhood column, annotation column) pair is tested. The pairs are
+    flattened into one index range, split into contiguous chunks, and each chunk is
+    evaluated by a worker in a "spawn"-context process pool.
+    Args:
+        neighborhoods (np.ndarray): Binary matrix representing neighborhoods.
+        annotations (np.ndarray): Binary matrix representing annotations.
+        max_workers (int, optional): Number of workers for multiprocessing. Values below 1
+            are treated as 1, and no more workers than tasks are started. Defaults to 4.
+    Returns:
+        dict: Dictionary with keys "depletion_pvals" and "enrichment_pvals", each an array
+            of shape (neighborhoods.shape[1], annotations.shape[1]).
+    """
+    # NaN casts to True under astype(bool); zero NaNs out first so that missing values
+    # are not silently counted as memberships/annotations
+    neighborhoods = np.nan_to_num(neighborhoods, nan=0.0).astype(bool)
+    annotations = np.nan_to_num(annotations, nan=0.0).astype(bool)
+    num_neighborhoods, num_annotations = neighborhoods.shape[1], annotations.shape[1]
+    total_tasks = num_neighborhoods * num_annotations
+    # Start at least one worker (avoids dividing by zero below) and never more workers
+    # than there are tasks (avoids spawning processes that would receive an empty range)
+    num_workers = max(1, min(max_workers, total_tasks))
+    chunk_size, remainder = divmod(total_tasks, num_workers)
+    ctx = get_context("spawn")
+    # Entering the manager as a context manager shuts its server process down on exit;
+    # a bare Manager() would leave that process running after this function returns
+    with Manager() as manager, ctx.Pool(num_workers) as pool:
+        progress_counter = manager.Value("i", 0)
+        # Build one contiguous [start_idx, end_idx) range per worker; the first
+        # `remainder` workers take one extra task so every task is covered exactly once
+        params_list = []
+        start_idx = 0
+        for i in range(num_workers):
+            end_idx = start_idx + chunk_size + (1 if i < remainder else 0)
+            params_list.append(
+                (neighborhoods, annotations, start_idx, end_idx, progress_counter)
+            )
+            start_idx = end_idx
+        with tqdm(total=total_tasks, desc="Total progress", position=0) as progress:
+            # Start the hypergeometric test process in parallel
+            results = pool.starmap_async(_hypergeom_process_subset, params_list, chunksize=1)
+            # Poll the shared counter to drive the progress bar while workers run
+            while not results.ready():
+                progress.update(progress_counter.value - progress.n)
+                results.wait(0.05)  # Wait for 50ms
+            # Ensure progress bar reaches 100%
+            progress.update(total_tasks - progress.n)
+            # starmap_async preserves the order of params_list, so concatenating the
+            # per-worker lists restores the flattened pair order
+            depletion_pvals, enrichment_pvals = [], []
+            for dp, ep in results.get():
+                depletion_pvals.extend(dp)
+                enrichment_pvals.extend(ep)
+    # Reshape the flat results back into (neighborhood, annotation) matrices
+    depletion_pvals = np.array(depletion_pvals).reshape(num_neighborhoods, num_annotations)
+    enrichment_pvals = np.array(enrichment_pvals).reshape(num_neighborhoods, num_annotations)
+    return {
+        "depletion_pvals": depletion_pvals,
+        "enrichment_pvals": enrichment_pvals,
+    }
+def _hypergeom_process_subset(
+    neighborhoods: np.ndarray,
+    annotations: np.ndarray,
+    start_idx: int,
+    end_idx: int,
+    progress_counter,
+) -> tuple:
+    """Run the hypergeometric test on a contiguous range of neighborhood-annotation pairs.
+    A flat pair index maps to (neighborhood column, annotation column) by dividing by
+    the number of annotation columns.
+    Args:
+        neighborhoods (np.ndarray): The full neighborhood matrix.
+        annotations (np.ndarray): The annotation matrix.
+        start_idx (int): First flat pair index to process (inclusive).
+        end_idx (int): Last flat pair index to process (exclusive).
+        progress_counter: Shared counter; its `value` is incremented once per pair.
+    Returns:
+        tuple: Two lists (depletion p-values, enrichment p-values), one entry per pair.
+    """
+    # The population size is the row count and does not change between pairs
+    population_size = annotations.shape[0]
+    num_annotations = annotations.shape[1]
+    depletion_pvals, enrichment_pvals = [], []
+    for pair_idx in range(start_idx, end_idx):
+        # Recover the column indices from the flat pair index
+        nb_col, ann_col = divmod(pair_idx, num_annotations)
+        members = neighborhoods[:, nb_col]
+        annotated = annotations[:, ann_col]
+        num_annotated = np.sum(annotated)  # Successes in the population
+        num_members = np.sum(members)  # Number of draws (sample size)
+        overlap = np.sum(members & annotated)  # Successes in the sample
+        # Depletion: P(X <= overlap)
+        depletion_pvals.append(
+            hypergeom.cdf(overlap, population_size, num_annotated, num_members)
+        )
+        # Enrichment: P(X >= overlap), expressed as the survival function at overlap - 1
+        enrichment_pvals.append(
+            hypergeom.sf(overlap - 1, population_size, num_annotated, num_members)
+        )
+        # Report one finished pair to the parent's progress bar
+        progress_counter.value += 1
+    return depletion_pvals, enrichment_pvals

risk/stats/permutation/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+"""
+risk/stats/permutation
+~~~~~~~~~~~~~~~~~~~~~~
+"""
+from .permutation import compute_permutation_test

risk/stats/permutation/permutation.py ADDED Viewed

@@ -0,0 +1,212 @@
+"""
+risk/stats/permutation/permutation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+"""
+from multiprocessing import get_context, Manager
+from tqdm import tqdm
+from typing import Any, Callable, Dict
+import numpy as np
+from threadpoolctl import threadpool_limits
+from risk.stats.permutation.test_functions import DISPATCH_TEST_FUNCTIONS
+def compute_permutation_test(
+    neighborhoods: np.ndarray,
+    annotations: np.ndarray,
+    score_metric: str = "sum",
+    null_distribution: str = "network",
+    num_permutations: int = 1000,
+    random_seed: int = 888,
+    max_workers: int = 1,
+) -> Dict[str, Any]:
+    """Compute permutation test for enrichment and depletion in neighborhoods.
+    Args:
+        neighborhoods (np.ndarray): Binary matrix representing neighborhoods.
+        annotations (np.ndarray): Binary matrix representing annotations.
+        score_metric (str, optional): Key into DISPATCH_TEST_FUNCTIONS selecting the
+            neighborhood scoring function. Defaults to "sum".
+        null_distribution (str, optional): Type of null distribution ('network' or other). Defaults to "network".
+        num_permutations (int, optional): Number of permutations to run. Defaults to 1000.
+        random_seed (int, optional): Seed for random number generation. Defaults to 888.
+        max_workers (int, optional): Number of workers for multiprocessing. Defaults to 1.
+    Returns:
+        dict: Dictionary containing depletion and enrichment p-values.
+    """
+    # Work on float32 copies of both matrices
+    neighborhoods_f32 = neighborhoods.astype(np.float32)
+    annotations_f32 = annotations.astype(np.float32)
+    # Resolve the scoring function for the requested metric
+    score_func = DISPATCH_TEST_FUNCTIONS[score_metric]
+    # Count, per neighborhood/annotation cell, how many permuted scores were at most
+    # (depletion) or at least (enrichment) the observed score
+    depletion_counts, enrichment_counts = _run_permutation_test(
+        neighborhoods=neighborhoods_f32,
+        annotations=annotations_f32,
+        neighborhood_score_func=score_func,
+        null_distribution=null_distribution,
+        num_permutations=num_permutations,
+        random_seed=random_seed,
+        max_workers=max_workers,
+    )
+    # Convert counts to p-values; counts are floored at 1 so the smallest reported
+    # p-value is 1/num_permutations rather than zero
+    pvals = {}
+    pvals["depletion_pvals"] = np.maximum(depletion_counts, 1) / num_permutations
+    pvals["enrichment_pvals"] = np.maximum(enrichment_counts, 1) / num_permutations
+    return pvals
+def _run_permutation_test(
+    neighborhoods: np.ndarray,
+    annotations: np.ndarray,
+    neighborhood_score_func: Callable,
+    null_distribution: str = "network",
+    num_permutations: int = 1000,
+    random_seed: int = 888,
+    max_workers: int = 4,
+) -> tuple:
+    """Run a permutation test to calculate enrichment and depletion counts.
+    The permutations are divided among worker processes. Each worker receives its own
+    random generator: the first worker uses the generator seeded directly with
+    `random_seed` (so a single-worker run draws the same permutations as before), and
+    every additional worker uses a generator built from a child of
+    `SeedSequence(random_seed)`. Handing the same generator to all workers would make
+    each of them replay an identical permutation sequence.
+    Args:
+        neighborhoods (np.ndarray): The neighborhood matrix.
+        annotations (np.ndarray): The annotation matrix. Not modified by this function.
+        neighborhood_score_func (Callable): Function to calculate neighborhood scores.
+        null_distribution (str, optional): Type of null distribution. Defaults to "network".
+        num_permutations (int, optional): Number of permutations. Defaults to 1000.
+        random_seed (int, optional): Seed for random number generation. Defaults to 888.
+        max_workers (int, optional): Number of workers for multiprocessing. Values below 1
+            are treated as 1, and no more workers than permutations are started.
+            Defaults to 4.
+    Returns:
+        tuple: Depletion and enrichment counts.
+    """
+    # Start at least one worker (avoids dividing by zero below) and never more workers
+    # than permutations (avoids spawning processes with nothing to do)
+    num_workers = max(1, min(max_workers, num_permutations))
+    # Worker 0 keeps the generator seeded with random_seed; the remaining workers get
+    # generators from spawned child seeds so their permutation streams differ
+    rng = np.random.default_rng(seed=random_seed)
+    child_seeds = np.random.SeedSequence(random_seed).spawn(num_workers - 1)
+    worker_rngs = [rng] + [np.random.default_rng(child_seed) for child_seed in child_seeds]
+    # Determine the indices to use based on the null distribution type
+    if null_distribution == "network":
+        idxs = range(annotations.shape[0])
+    else:
+        # Keep only rows that have at least one non-NaN annotation value
+        idxs = np.nonzero(np.sum(~np.isnan(annotations), axis=1))[0]
+    # Replace NaNs with zeros on a copy so the caller's array is left untouched
+    annotations = annotations.copy()
+    annotations[np.isnan(annotations)] = 0
+    annotation_matrix_obsv = annotations[idxs]
+    neighborhoods_matrix_obsv = neighborhoods.T[idxs].T
+    # Calculate observed neighborhood scores
+    with np.errstate(invalid="ignore", divide="ignore"):
+        observed_neighborhood_scores = neighborhood_score_func(
+            neighborhoods_matrix_obsv, annotation_matrix_obsv
+        )
+    # Initialize count matrices for depletion and enrichment
+    counts_depletion = np.zeros(observed_neighborhood_scores.shape)
+    counts_enrichment = np.zeros(observed_neighborhood_scores.shape)
+    # Determine the number of permutations to run in each worker process; the first
+    # `remainder` workers run one extra permutation
+    subset_size, remainder = divmod(num_permutations, num_workers)
+    total_progress = num_permutations
+    # Use the spawn context for creating a new multiprocessing pool
+    ctx = get_context("spawn")
+    # Entering the manager as a context manager shuts its server process down on exit;
+    # a bare Manager() would leave that process running after this function returns
+    with Manager() as manager, ctx.Pool(num_workers) as pool:
+        progress_counter = manager.Value("i", 0)
+        with tqdm(total=total_progress, desc="Total progress", position=0) as progress:
+            # Prepare parameters for multiprocessing
+            params_list = [
+                (
+                    annotations,
+                    np.array(idxs),
+                    neighborhoods_matrix_obsv,
+                    observed_neighborhood_scores,
+                    neighborhood_score_func,
+                    subset_size + (1 if i < remainder else 0),
+                    progress_counter,
+                    worker_rngs[i],  # Independent generator for this worker
+                )
+                for i in range(num_workers)
+            ]
+            # Start the permutation process in parallel
+            results = pool.starmap_async(_permutation_process_subset, params_list, chunksize=1)
+            # Update progress bar based on progress_counter
+            # NOTE: Waiting for results to be ready while updating progress bar gives a big improvement
+            # in performance, especially for large number of permutations and workers
+            while not results.ready():
+                progress.update(progress_counter.value - progress.n)
+                results.wait(0.05)  # Wait for 50ms
+            # Ensure progress bar reaches 100%
+            progress.update(total_progress - progress.n)
+            # Accumulate results from each worker
+            for local_counts_depletion, local_counts_enrichment in results.get():
+                counts_depletion = np.add(counts_depletion, local_counts_depletion)
+                counts_enrichment = np.add(counts_enrichment, local_counts_enrichment)
+    return counts_depletion, counts_enrichment
+def _permutation_process_subset(
+    annotation_matrix: np.ndarray,
+    idxs: np.ndarray,
+    neighborhoods_matrix_obsv: np.ndarray,
+    observed_neighborhood_scores: np.ndarray,
+    neighborhood_score_func: Callable,
+    subset_size: int,
+    progress_counter,
+    rng: np.random.Generator,
+) -> tuple:
+    """Run this worker's share of permutations and tally how they compare to the observed scores.
+    Args:
+        annotation_matrix (np.ndarray): The annotation matrix.
+        idxs (np.ndarray): Row indices of the annotation matrix to shuffle.
+        neighborhoods_matrix_obsv (np.ndarray): Observed neighborhoods matrix.
+        observed_neighborhood_scores (np.ndarray): Observed neighborhood scores.
+        neighborhood_score_func (Callable): Function to calculate neighborhood scores.
+        subset_size (int): Number of permutations to run in this subset.
+        progress_counter: Shared counter; its `value` is incremented once per permutation.
+        rng (np.random.Generator): Random number generator object.
+    Returns:
+        tuple: Local counts of depletion and enrichment.
+    """
+    # Per-cell tallies of permuted scores <= observed (depletion) and >= observed (enrichment)
+    depletion_hits = np.zeros(observed_neighborhood_scores.shape)
+    enrichment_hits = np.zeros(observed_neighborhood_scores.shape)
+    # NOTE: Cap BLAS at a single thread inside this worker so that several worker
+    # processes running side by side do not oversubscribe the CPU.
+    with threadpool_limits(limits=1, user_api="blas"):
+        for _ in range(subset_size):
+            # Shuffle the row order of the annotation matrix
+            shuffled_rows = rng.permutation(idxs)
+            permuted_annotations = annotation_matrix[shuffled_rows]
+            # Score the neighborhoods against the shuffled annotations
+            with np.errstate(invalid="ignore", divide="ignore"):
+                permuted_scores = neighborhood_score_func(
+                    neighborhoods_matrix_obsv, permuted_annotations
+                )
+            # Tally this permutation against the observed scores
+            at_most_observed = permuted_scores <= observed_neighborhood_scores
+            at_least_observed = permuted_scores >= observed_neighborhood_scores
+            depletion_hits = depletion_hits + at_most_observed
+            enrichment_hits = enrichment_hits + at_least_observed
+            # Report one finished permutation to the parent's progress bar
+            progress_counter.value += 1
+    return depletion_hits, enrichment_hits

risk/stats/{permutation.py → permutation/test_functions.py} RENAMED Viewed

@@ -1,12 +1,13 @@
 """
-risk/stats/permutation
-~~~~~~~~~~~~~~~~~~~~~~
+risk/stats/permutation/test_function
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 """
 import numpy as np
 # Note: Cython optimizations provided minimal performance benefits.
 # The final version with Cython is archived in the `cython_permutation` branch.
+# DISPATCH_TEST_FUNCTIONS can be found at the end of the file.
 def compute_neighborhood_score_by_sum(
@@ -22,8 +23,8 @@ def compute_neighborhood_score_by_sum(
         np.ndarray: Sum of attribute values for each neighborhood.
     """
     # Calculate the neighborhood score as the dot product of neighborhoods and annotations
-    neighborhood_score = np.dot(neighborhoods_matrix, annotation_matrix)
-    return neighborhood_score
+    neighborhood_sum = np.dot(neighborhoods_matrix, annotation_matrix)
+    return neighborhood_sum
 def compute_neighborhood_score_by_stdev(
@@ -49,40 +50,12 @@ def compute_neighborhood_score_by_stdev(
     # Calculate variance as EXX - M^2
     variance = EXX - M**2
     # Compute the standard deviation as the square root of the variance
-    stdev = np.sqrt(variance)
-    return stdev
-def compute_neighborhood_score_by_z_score(
-    neighborhoods_matrix: np.ndarray, annotation_matrix: np.ndarray
-) -> np.ndarray:
-    """Compute Z-scores for neighborhood scores.
-    Args:
-        neighborhoods_matrix (np.ndarray): Binary matrix representing neighborhoods.
-        annotation_matrix (np.ndarray): Matrix representing annotation values.
+    neighborhood_stdev = np.sqrt(variance)
+    return neighborhood_stdev
-    Returns:
-        np.ndarray: Z-scores for each neighborhood.
-    """
-    # Calculate the neighborhood score as the dot product of neighborhoods and annotations
-    neighborhood_score = np.dot(neighborhoods_matrix, annotation_matrix)
-    # Calculate the number of elements in each neighborhood
-    N = np.dot(
-        neighborhoods_matrix, np.ones(annotation_matrix.shape[1], dtype=annotation_matrix.dtype)
-    )
-    # Compute the mean of the neighborhood scores
-    M = neighborhood_score / N
-    # Compute the mean of squares (EXX)
-    EXX = np.dot(neighborhoods_matrix, annotation_matrix**2) / N
-    # Calculate the standard deviation for each neighborhood
-    variance = EXX - M**2
-    std = np.sqrt(variance)
-    # Calculate Z-scores, handling cases where std is 0 or N is less than 3
-    with np.errstate(divide="ignore", invalid="ignore"):
-        z_scores = M / std
-        z_scores[(std == 0) | (N < 3)] = (
-            np.nan
-        )  # Handle division by zero and apply minimum threshold
-    return z_scores
+# Dictionary to dispatch statistical test functions based on the score metric.
+# Keys are the score-metric names; values are the scoring functions defined above
+# in this module.
+DISPATCH_TEST_FUNCTIONS = {
+    "sum": compute_neighborhood_score_by_sum,
+    "stdev": compute_neighborhood_score_by_stdev,
+}

risk-network 0.0.4b2__py3-none-any.whl → 0.0.5b0__py3-none-any.whl

risk-network 0.0.4b2__py3-none-any.whl → 0.0.5b0__py3-none-any.whl