risk-network 0.0.10__py3-none-any.whl → 0.0.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. risk/__init__.py +1 -1
  2. risk/annotation/__init__.py +10 -0
  3. risk/{annotations/annotations.py → annotation/annotation.py} +62 -102
  4. risk/{annotations → annotation}/io.py +93 -92
  5. risk/annotation/nltk_setup.py +86 -0
  6. risk/log/__init__.py +1 -1
  7. risk/log/parameters.py +26 -27
  8. risk/neighborhoods/__init__.py +0 -1
  9. risk/neighborhoods/api.py +38 -38
  10. risk/neighborhoods/community.py +33 -4
  11. risk/neighborhoods/domains.py +26 -28
  12. risk/neighborhoods/neighborhoods.py +8 -2
  13. risk/neighborhoods/stats/__init__.py +13 -0
  14. risk/neighborhoods/stats/permutation/__init__.py +6 -0
  15. risk/{stats → neighborhoods/stats}/permutation/permutation.py +24 -21
  16. risk/{stats → neighborhoods/stats}/permutation/test_functions.py +5 -4
  17. risk/{stats/stat_tests.py → neighborhoods/stats/tests.py} +62 -54
  18. risk/network/__init__.py +0 -2
  19. risk/network/graph/__init__.py +0 -2
  20. risk/network/graph/api.py +19 -19
  21. risk/network/graph/graph.py +73 -68
  22. risk/{stats/significance.py → network/graph/stats.py} +2 -2
  23. risk/network/graph/summary.py +12 -13
  24. risk/network/io.py +163 -20
  25. risk/network/plotter/__init__.py +0 -2
  26. risk/network/plotter/api.py +1 -1
  27. risk/network/plotter/canvas.py +36 -36
  28. risk/network/plotter/contour.py +14 -15
  29. risk/network/plotter/labels.py +303 -294
  30. risk/network/plotter/network.py +6 -6
  31. risk/network/plotter/plotter.py +8 -10
  32. risk/network/plotter/utils/colors.py +15 -8
  33. risk/network/plotter/utils/layout.py +3 -3
  34. risk/risk.py +6 -7
  35. risk_network-0.0.12.dist-info/METADATA +122 -0
  36. risk_network-0.0.12.dist-info/RECORD +40 -0
  37. {risk_network-0.0.10.dist-info → risk_network-0.0.12.dist-info}/WHEEL +1 -1
  38. risk/annotations/__init__.py +0 -7
  39. risk/network/geometry.py +0 -150
  40. risk/stats/__init__.py +0 -15
  41. risk/stats/permutation/__init__.py +0 -6
  42. risk_network-0.0.10.dist-info/METADATA +0 -798
  43. risk_network-0.0.10.dist-info/RECORD +0 -40
  44. {risk_network-0.0.10.dist-info → risk_network-0.0.12.dist-info/licenses}/LICENSE +0 -0
  45. {risk_network-0.0.10.dist-info → risk_network-0.0.12.dist-info}/top_level.txt +0 -0
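The dominant change in this release is a set of module renames: risk/annotations becomes risk/annotation, and risk/stats moves under risk/neighborhoods/stats. A rough migration sketch for downstream imports follows (hypothetical usage; it assumes the removed risk.stats.permutation package exposed compute_permutation_test the same way its replacement does):

    # 0.0.10 import paths
    from risk.annotations import get_weighted_description
    from risk.stats.permutation import compute_permutation_test

    # 0.0.12 import paths
    from risk.annotation import get_weighted_description
    from risk.neighborhoods.stats.permutation import compute_permutation_test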
@@ -9,19 +9,18 @@ from typing import Tuple, Union
 import numpy as np
 import pandas as pd
 from numpy.linalg import LinAlgError
-from scipy.cluster.hierarchy import linkage, fcluster
+from scipy.cluster.hierarchy import fcluster, linkage
 from sklearn.metrics import silhouette_score
 from tqdm import tqdm
 
-from risk.annotations import get_weighted_description
+from risk.annotation import get_weighted_description
 from risk.log import logger
 
-
 # Define constants for clustering
 # fmt: off
 LINKAGE_METHODS = {"single", "complete", "average", "weighted", "centroid", "median", "ward"}
 LINKAGE_METRICS = {
-    "braycurtis","canberra", "chebyshev", "cityblock", "correlation", "cosine", "dice", "euclidean",
+    "braycurtis", "canberra", "chebyshev", "cityblock", "correlation", "cosine", "dice", "euclidean",
     "hamming", "jaccard", "jensenshannon", "kulczynski1", "mahalanobis", "matching", "minkowski",
     "rogerstanimoto", "russellrao", "seuclidean", "sokalmichener", "sokalsneath", "sqeuclidean", "yule",
 }
@@ -29,7 +28,7 @@ LINKAGE_METRICS = {
 
 
 def define_domains(
-    top_annotations: pd.DataFrame,
+    top_annotation: pd.DataFrame,
     significant_neighborhoods_significance: np.ndarray,
     linkage_criterion: str,
     linkage_method: str,
@@ -40,7 +39,7 @@ def define_domains(
     handling errors by assigning unique domains when clustering fails.
 
     Args:
-        top_annotations (pd.DataFrame): DataFrame of top annotations data for the network nodes.
+        top_annotation (pd.DataFrame): DataFrame of top annotations data for the network nodes.
         significant_neighborhoods_significance (np.ndarray): The binary significance matrix below alpha.
         linkage_criterion (str): The clustering criterion for defining groups. Choose "off" to disable clustering.
         linkage_method (str): The linkage method for clustering. Choose "auto" to optimize.
@@ -49,13 +48,16 @@
 
     Returns:
         pd.DataFrame: DataFrame with the primary domain for each node.
+
+    Raises:
+        ValueError: If the clustering criterion is set to "off" or if an error occurs during clustering.
     """
     try:
         if linkage_criterion == "off":
             raise ValueError("Clustering is turned off.")
 
         # Transpose the matrix to cluster annotations
-        m = significant_neighborhoods_significance[:, top_annotations["significant_annotations"]].T
+        m = significant_neighborhoods_significance[:, top_annotation["significant_annotation"]].T
         # Safeguard the matrix by replacing NaN, Inf, and -Inf values
         m = _safeguard_matrix(m)
         # Optimize silhouette score across different linkage methods and distance metrics
@@ -69,27 +71,23 @@
         )
         # Calculate the optimal threshold for clustering
         max_d_optimal = np.max(Z[:, 2]) * best_threshold
-        # Assign domains to the annotations matrix
+        # Assign domains to the annotation matrix
         domains = fcluster(Z, max_d_optimal, criterion=linkage_criterion)
-        top_annotations["domain"] = 0
-        top_annotations.loc[top_annotations["significant_annotations"], "domain"] = domains
+        top_annotation["domain"] = 0
+        top_annotation.loc[top_annotation["significant_annotation"], "domain"] = domains
     except (ValueError, LinAlgError):
         # If a ValueError is encountered, handle it by assigning unique domains
-        n_rows = len(top_annotations)
+        n_rows = len(top_annotation)
         if linkage_criterion == "off":
-            logger.warning(
-                f"Clustering is turned off. Skipping clustering and assigning {n_rows} unique domains."
-            )
+            logger.warning("Clustering is turned off. Skipping clustering.")
         else:
-            logger.error(
-                f"Error encountered. Skipping clustering and assigning {n_rows} unique domains."
-            )
-        top_annotations["domain"] = range(1, n_rows + 1)  # Assign unique domains
+            logger.error("Error encountered. Skipping clustering.")
+        top_annotation["domain"] = range(1, n_rows + 1)  # Assign unique domains
 
     # Create DataFrames to store domain information
     node_to_significance = pd.DataFrame(
         data=significant_neighborhoods_significance,
-        columns=[top_annotations.index.values, top_annotations["domain"]],
+        columns=[top_annotation.index.values, top_annotation["domain"]],
     )
     node_to_domain = node_to_significance.T.groupby(level="domain").sum().T
 
@@ -110,15 +108,15 @@
 
 def trim_domains(
     domains: pd.DataFrame,
-    top_annotations: pd.DataFrame,
+    top_annotation: pd.DataFrame,
     min_cluster_size: int = 5,
     max_cluster_size: int = 1000,
-) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+) -> Tuple[pd.DataFrame, pd.DataFrame]:
     """Trim domains that do not meet size criteria and find outliers.
 
     Args:
         domains (pd.DataFrame): DataFrame of domain data for the network nodes.
-        top_annotations (pd.DataFrame): DataFrame of top annotations data for the network nodes.
+        top_annotation (pd.DataFrame): DataFrame of top annotations data for the network nodes.
         min_cluster_size (int, optional): Minimum size of a cluster to be retained. Defaults to 5.
         max_cluster_size (int, optional): Maximum size of a cluster to be retained. Defaults to 1000.
 
@@ -137,21 +135,21 @@ def trim_domains(
     invalid_domain_id = 888888
     invalid_domain_ids = {0, invalid_domain_id}
     # Mark domains to be removed
-    top_annotations["domain"] = top_annotations["domain"].replace(to_remove, invalid_domain_id)
+    top_annotation["domain"] = top_annotation["domain"].replace(to_remove, invalid_domain_id)
     domains.loc[domains["primary_domain"].isin(to_remove), ["primary_domain"]] = invalid_domain_id
 
     # Normalize "num significant neighborhoods" by percentile for each domain and scale to 0-10
-    top_annotations["normalized_value"] = top_annotations.groupby("domain")[
+    top_annotation["normalized_value"] = top_annotation.groupby("domain")[
         "significant_neighborhood_significance_sums"
     ].transform(lambda x: (x.rank(pct=True) * 10).apply(np.ceil).astype(int))
     # Modify the lambda function to pass both full_terms and significant_significance_score
-    top_annotations["combined_terms"] = top_annotations.apply(
+    top_annotation["combined_terms"] = top_annotation.apply(
         lambda row: " ".join([str(row["full_terms"])] * row["normalized_value"]), axis=1
     )
 
     # Perform the groupby operation while retaining the other columns and adding the weighting with significance scores
     domain_labels = (
-        top_annotations.groupby("domain")
+        top_annotation.groupby("domain")
         .agg(
             full_terms=("full_terms", lambda x: list(x)),
             significance_scores=("significant_significance_score", lambda x: list(x)),
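Note on the trim_domains hunks above: the return annotation narrows from a three-element tuple of DataFrames to a two-element tuple, and the top_annotations parameter is renamed to top_annotation, so keyword-based callers and unpacking both change. A hedged caller sketch (variable names are placeholders, not taken from this diff):

    # 0.0.10: three DataFrames were unpacked
    # domains, trimmed_domains, extra_df = trim_domains(domains, top_annotations=top_annotations)

    # 0.0.12: two DataFrames are returned
    domains, trimmed_domains = trim_domains(domains, top_annotation=top_annotation)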
@@ -231,7 +229,7 @@ _optimize_silhouette_across_linkage_and_metrics(
     # Initialize best overall values
     best_overall_method = linkage_method
     best_overall_metric = linkage_metric
-    best_overall_threshold = linkage_threshold
+    best_overall_threshold = 0.0
     best_overall_score = -np.inf
 
     # Set linkage methods and metrics to all combinations if "auto" is selected
@@ -242,7 +240,7 @@
     # Evaluating optimal linkage method and metric
     for method, metric in tqdm(
         product(linkage_methods, linkage_metrics),
-        desc="Evaluating optimal linkage method and metric",
+        desc="Evaluating linkage methods and metrics",
         total=total_combinations,
         bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
     ):
@@ -13,6 +13,7 @@ from scipy.sparse import csr_matrix
 from sklearn.exceptions import DataConversionWarning
 from sklearn.metrics.pairwise import cosine_similarity
 
+from risk.log import logger
 from risk.neighborhoods.community import (
     calculate_greedy_modularity_neighborhoods,
     calculate_label_propagation_neighborhoods,
@@ -22,7 +23,6 @@ from risk.neighborhoods.community import (
     calculate_spinglass_neighborhoods,
     calculate_walktrap_neighborhoods,
 )
-from risk.log import logger
 
 # Suppress DataConversionWarning
 warnings.filterwarnings(action="ignore", category=DataConversionWarning)
@@ -48,6 +48,9 @@ def get_network_neighborhoods(
 
     Returns:
         csr_matrix: The combined neighborhood matrix.
+
+    Raises:
+        ValueError: If the number of distance metrics does not match the number of edge length thresholds.
     """
     # Set random seed for reproducibility
     random.seed(random_seed)
@@ -490,6 +493,9 @@ def _calculate_threshold(median_distances: List, distance_threshold: float) -> f
 
     Returns:
         float: The calculated distance threshold value.
+
+    Raises:
+        ValueError: If no significant annotation is found in the median distances.
     """
     # Sort the median distances
     sorted_distances = np.sort(median_distances)
@@ -500,7 +506,7 @@
     try:
         smoothed_distances = np.interp(interpolated_percentiles, rank_percentiles, sorted_distances)
     except ValueError as e:
-        raise ValueError("No significant annotations found.") from e
+        raise ValueError("No significant annotation found.") from e
 
     # Determine the index corresponding to the distance threshold
     threshold_index = int(np.ceil(distance_threshold * len(smoothed_distances))) - 1
@@ -0,0 +1,13 @@
+"""
+risk/neighborhoods/stats
+~~~~~~~~~~~~~~~~~~~~~~~~
+"""
+
+from risk.neighborhoods.stats.permutation import compute_permutation_test
+from risk.neighborhoods.stats.tests import (
+    compute_binom_test,
+    compute_chi2_test,
+    compute_hypergeom_test,
+    compute_poisson_test,
+    compute_zscore_test,
+)
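This new risk/neighborhoods/stats package gathers the statistical tests that previously lived in the removed risk/stats package into a single namespace. A minimal import sketch; the usage line is left commented out because the full test-function signatures are not shown in this diff:

    from risk.neighborhoods.stats import (
        compute_hypergeom_test,
        compute_permutation_test,
    )

    # Hypothetical usage, assuming sparse csr_matrix inputs as documented for
    # compute_permutation_test later in this diff:
    # results = compute_permutation_test(neighborhoods, annotation=annotation)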
@@ -0,0 +1,6 @@
+"""
+risk/neighborhoods/stats/permutation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+"""
+
+from risk.neighborhoods.stats.permutation.permutation import compute_permutation_test
@@ -1,9 +1,9 @@
 """
-risk/stats/permutation/permutation
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+risk/neighborhoods/stats/permutation/permutation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 """
 
-from multiprocessing import get_context, Manager
+from multiprocessing import Manager, get_context
 from multiprocessing.managers import ValueProxy
 from typing import Any, Callable, Dict, List, Tuple, Union
 
@@ -12,12 +12,12 @@ from scipy.sparse import csr_matrix
 from threadpoolctl import threadpool_limits
 from tqdm import tqdm
 
-from risk.stats.permutation.test_functions import DISPATCH_TEST_FUNCTIONS
+from risk.neighborhoods.stats.permutation.test_functions import DISPATCH_TEST_FUNCTIONS
 
 
 def compute_permutation_test(
     neighborhoods: csr_matrix,
-    annotations: csr_matrix,
+    annotation: csr_matrix,
     score_metric: str = "sum",
     null_distribution: str = "network",
     num_permutations: int = 1000,
@@ -28,9 +28,9 @@
 
     Args:
         neighborhoods (csr_matrix): Sparse binary matrix representing neighborhoods.
-        annotations (csr_matrix): Sparse binary matrix representing annotations.
+        annotation (csr_matrix): Sparse binary matrix representing annotation.
         score_metric (str, optional): Metric to use for scoring ('sum' or 'stdev'). Defaults to "sum".
-        null_distribution (str, optional): Type of null distribution ('network' or 'annotations'). Defaults to "network".
+        null_distribution (str, optional): Type of null distribution ('network' or 'annotation'). Defaults to "network".
         num_permutations (int, optional): Number of permutations to run. Defaults to 1000.
         random_seed (int, optional): Seed for random number generation. Defaults to 888.
         max_workers (int, optional): Number of workers for multiprocessing. Defaults to 1.
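Callers that pass keywords to compute_permutation_test need two updates: the annotations parameter is now annotation, and the 'annotations' null-distribution option is now 'annotation'. A hedged call sketch using only the parameters documented above (neighborhoods and annotation are placeholder csr_matrix variables):

    # 0.0.10
    # result = compute_permutation_test(
    #     neighborhoods, annotations=annotations, null_distribution="annotations"
    # )

    # 0.0.12
    result = compute_permutation_test(
        neighborhoods,
        annotation=annotation,
        score_metric="sum",
        null_distribution="annotation",
        num_permutations=1000,
        random_seed=888,
        max_workers=1,
    )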
@@ -41,14 +41,14 @@
     # Ensure that the matrices are in the correct format and free of NaN values
     # NOTE: Keep the data type as float32 to avoid locking issues with dot product operations
     neighborhoods = neighborhoods.astype(np.float32)
-    annotations = annotations.astype(np.float32)
+    annotation = annotation.astype(np.float32)
     # Retrieve the appropriate neighborhood score function based on the metric
     neighborhood_score_func = DISPATCH_TEST_FUNCTIONS[score_metric]
 
     # Run the permutation test to calculate depletion and enrichment counts
     counts_depletion, counts_enrichment = _run_permutation_test(
         neighborhoods=neighborhoods,
-        annotations=annotations,
+        annotation=annotation,
         neighborhood_score_func=neighborhood_score_func,
         null_distribution=null_distribution,
         num_permutations=num_permutations,
@@ -68,7 +68,7 @@
 
 def _run_permutation_test(
     neighborhoods: csr_matrix,
-    annotations: csr_matrix,
+    annotation: csr_matrix,
     neighborhood_score_func: Callable,
     null_distribution: str = "network",
     num_permutations: int = 1000,
@@ -79,31 +79,34 @@
 
     Args:
         neighborhoods (csr_matrix): Sparse binary matrix representing neighborhoods.
-        annotations (csr_matrix): Sparse binary matrix representing annotations.
+        annotation (csr_matrix): Sparse binary matrix representing annotation.
         neighborhood_score_func (Callable): Function to calculate neighborhood scores.
-        null_distribution (str, optional): Type of null distribution ('network' or 'annotations'). Defaults to "network".
+        null_distribution (str, optional): Type of null distribution ('network' or 'annotation'). Defaults to "network".
         num_permutations (int, optional): Number of permutations. Defaults to 1000.
         random_seed (int, optional): Seed for random number generation. Defaults to 888.
         max_workers (int, optional): Number of workers for multiprocessing. Defaults to 4.
 
     Returns:
         tuple: Depletion and enrichment counts.
+
+    Raises:
+        ValueError: If an invalid null_distribution value is provided.
     """
     # Initialize the RNG for reproducibility
     rng = np.random.default_rng(seed=random_seed)
     # Determine the indices to use based on the null distribution type
     if null_distribution == "network":
-        idxs = range(annotations.shape[0])
-    elif null_distribution == "annotations":
-        idxs = np.nonzero(annotations.getnnz(axis=1) > 0)[0]
+        idxs = range(annotation.shape[0])
+    elif null_distribution == "annotation":
+        idxs = np.nonzero(annotation.getnnz(axis=1) > 0)[0]
     else:
         raise ValueError(
-            "Invalid null_distribution value. Choose either 'network' or 'annotations'."
+            "Invalid null_distribution value. Choose either 'network' or 'annotation'."
        )
 
-    # Replace NaNs with zeros in the sparse annotations matrix
-    annotations.data[np.isnan(annotations.data)] = 0
-    annotation_matrix_obsv = annotations[idxs]
+    # Replace NaNs with zeros in the sparse annotation matrix
+    annotation.data[np.isnan(annotation.data)] = 0
+    annotation_matrix_obsv = annotation[idxs]
     neighborhoods_matrix_obsv = neighborhoods.T[idxs].T
     # Calculate observed neighborhood scores
     with np.errstate(invalid="ignore", divide="ignore"):
@@ -139,7 +142,7 @@
         params_list = [
             (
                 permutation_batches[i],  # Pass the batch of precomputed permutations
-                annotations,
+                annotation,
                 neighborhoods_matrix_obsv,
                 observed_neighborhood_scores,
                 neighborhood_score_func,
@@ -182,7 +185,7 @@ def _permutation_process_batch(
 
     Args:
         permutations (Union[List, Tuple, np.ndarray]): Permutation batch to process.
-        annotation_matrix (csr_matrix): Sparse binary matrix representing annotations.
+        annotation_matrix (csr_matrix): Sparse binary matrix representing annotation.
         neighborhoods_matrix_obsv (csr_matrix): Sparse binary matrix representing observed neighborhoods.
         observed_neighborhood_scores (np.ndarray): Observed neighborhood scores.
         neighborhood_score_func (Callable): Function to calculate neighborhood scores.
@@ -1,6 +1,6 @@
 """
-risk/stats/permutation/test_functions
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+risk/neighborhoods/stats/permutation/test_functions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 """
 
 import numpy as np
@@ -8,6 +8,7 @@ from scipy.sparse import csr_matrix
 
 # NOTE: Cython optimizations provided minimal performance benefits.
 # The final version with Cython is archived in the `cython_permutation` branch.
+
 # DISPATCH_TEST_FUNCTIONS can be found at the end of the file.
 
 
@@ -23,7 +24,7 @@ def compute_neighborhood_score_by_sum(
     Returns:
         np.ndarray: Dense array of summed attribute values for each neighborhood.
     """
-    # Calculate the neighborhood score as the dot product of neighborhoods and annotations
+    # Calculate the neighborhood score as the dot product of neighborhoods and annotation
     neighborhood_score = neighborhoods_matrix @ annotation_matrix  # Sparse matrix multiplication
     # Convert the result to a dense array for downstream calculations
     neighborhood_score_dense = neighborhood_score.toarray()
@@ -42,7 +43,7 @@
     Returns:
         np.ndarray: Standard deviation of the neighborhood scores.
     """
-    # Calculate the neighborhood score as the dot product of neighborhoods and annotations
+    # Calculate the neighborhood score as the dot product of neighborhoods and annotation
     neighborhood_score = neighborhoods_matrix @ annotation_matrix  # Sparse matrix multiplication
     # Calculate the number of elements in each neighborhood (sum of rows)
     N = neighborhoods_matrix.sum(axis=1).A.flatten()  # Convert to 1D array