risk-network 0.0.8b27__py3-none-any.whl → 0.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. risk/__init__.py +2 -2
  2. risk/annotations/__init__.py +2 -2
  3. risk/annotations/annotations.py +195 -118
  4. risk/annotations/io.py +47 -31
  5. risk/log/__init__.py +4 -2
  6. risk/log/console.py +3 -1
  7. risk/log/{params.py → parameters.py} +17 -42
  8. risk/neighborhoods/__init__.py +3 -5
  9. risk/neighborhoods/api.py +442 -0
  10. risk/neighborhoods/community.py +324 -101
  11. risk/neighborhoods/domains.py +125 -52
  12. risk/neighborhoods/neighborhoods.py +177 -165
  13. risk/network/__init__.py +1 -3
  14. risk/network/geometry.py +71 -89
  15. risk/network/graph/__init__.py +6 -0
  16. risk/network/graph/api.py +200 -0
  17. risk/network/{graph.py → graph/graph.py} +90 -40
  18. risk/network/graph/summary.py +254 -0
  19. risk/network/io.py +103 -114
  20. risk/network/plotter/__init__.py +6 -0
  21. risk/network/plotter/api.py +54 -0
  22. risk/network/{plot → plotter}/canvas.py +9 -8
  23. risk/network/{plot → plotter}/contour.py +27 -24
  24. risk/network/{plot → plotter}/labels.py +73 -78
  25. risk/network/{plot → plotter}/network.py +45 -39
  26. risk/network/{plot → plotter}/plotter.py +23 -17
  27. risk/network/{plot/utils/color.py → plotter/utils/colors.py} +114 -122
  28. risk/network/{plot → plotter}/utils/layout.py +10 -7
  29. risk/risk.py +11 -500
  30. risk/stats/__init__.py +10 -4
  31. risk/stats/permutation/__init__.py +1 -1
  32. risk/stats/permutation/permutation.py +44 -38
  33. risk/stats/permutation/test_functions.py +26 -18
  34. risk/stats/{stats.py → significance.py} +17 -15
  35. risk/stats/stat_tests.py +267 -0
  36. {risk_network-0.0.8b27.dist-info → risk_network-0.0.9.dist-info}/METADATA +31 -46
  37. risk_network-0.0.9.dist-info/RECORD +40 -0
  38. {risk_network-0.0.8b27.dist-info → risk_network-0.0.9.dist-info}/WHEEL +1 -1
  39. risk/constants.py +0 -31
  40. risk/network/plot/__init__.py +0 -6
  41. risk/stats/hypergeom.py +0 -54
  42. risk/stats/poisson.py +0 -44
  43. risk_network-0.0.8b27.dist-info/RECORD +0 -37
  44. {risk_network-0.0.8b27.dist-info → risk_network-0.0.9.dist-info}/LICENSE +0 -0
  45. {risk_network-0.0.8b27.dist-info → risk_network-0.0.9.dist-info}/top_level.txt +0 -0
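Beyond the `risk/network/plot` → `risk/network/plotter` move, the list shows `params.py` → `parameters.py` and `stats.py` → `significance.py` renames, so downstream imports from the old paths break. A hypothetical migration sketch; the `NetworkPlotter` name is illustrative, not verified against the package's public API:

# Hypothetical import updates for the renamed modules (names assumed):
# 0.0.8b27:
#   from risk.network.plot.plotter import NetworkPlotter
# 0.0.9:
#   from risk.network.plotter.plotter import NetworkPlotter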
risk/neighborhoods/domains.py
@@ -3,75 +3,97 @@ risk/neighborhoods/domains
  ~~~~~~~~~~~~~~~~~~~~~~~~~~
  """

- from contextlib import suppress
  from itertools import product
- from tqdm import tqdm
- from typing import Tuple
+ from typing import Tuple, Union

  import numpy as np
  import pandas as pd
+ from numpy.linalg import LinAlgError
  from scipy.cluster.hierarchy import linkage, fcluster
  from sklearn.metrics import silhouette_score
+ from tqdm import tqdm

  from risk.annotations import get_weighted_description
- from risk.constants import GROUP_LINKAGE_METHODS, GROUP_DISTANCE_METRICS
  from risk.log import logger


+ # Define constants for clustering
+ # fmt: off
+ LINKAGE_METHODS = {"single", "complete", "average", "weighted", "centroid", "median", "ward"}
+ LINKAGE_METRICS = {
+     "braycurtis","canberra", "chebyshev", "cityblock", "correlation", "cosine", "dice", "euclidean",
+     "hamming", "jaccard", "jensenshannon", "kulczynski1", "mahalanobis", "matching", "minkowski",
+     "rogerstanimoto", "russellrao", "seuclidean", "sokalmichener", "sokalsneath", "sqeuclidean", "yule",
+ }
+ # fmt: on
+
+
  def define_domains(
      top_annotations: pd.DataFrame,
-     significant_neighborhoods_enrichment: np.ndarray,
+     significant_neighborhoods_significance: np.ndarray,
      linkage_criterion: str,
      linkage_method: str,
      linkage_metric: str,
+     linkage_threshold: Union[float, str],
  ) -> pd.DataFrame:
-     """Define domains and assign nodes to these domains based on their enrichment scores and clustering,
+     """Define domains and assign nodes to these domains based on their significance scores and clustering,
      handling errors by assigning unique domains when clustering fails.

      Args:
          top_annotations (pd.DataFrame): DataFrame of top annotations data for the network nodes.
-         significant_neighborhoods_enrichment (np.ndarray): The binary enrichment matrix below alpha.
+         significant_neighborhoods_significance (np.ndarray): The binary significance matrix below alpha.
          linkage_criterion (str): The clustering criterion for defining groups.
-         linkage_method (str): The linkage method for clustering.
-         linkage_metric (str): The linkage metric for clustering.
+         linkage_method (str): The linkage method for clustering. Choose "auto" to optimize.
+         linkage_metric (str): The linkage metric for clustering. Choose "auto" to optimize.
+         linkage_threshold (float, str): The threshold for clustering. Choose "auto" to optimize.

      Returns:
          pd.DataFrame: DataFrame with the primary domain for each node.
      """
      try:
+         if linkage_criterion == "off":
+             raise ValueError("Clustering is turned off.")
+
          # Transpose the matrix to cluster annotations
-         m = significant_neighborhoods_enrichment[:, top_annotations["significant_annotations"]].T
+         m = significant_neighborhoods_significance[:, top_annotations["significant_annotations"]].T
+         # Safeguard the matrix by replacing NaN, Inf, and -Inf values
+         m = _safeguard_matrix(m)
+         # Optimize silhouette score across different linkage methods and distance metrics
          best_linkage, best_metric, best_threshold = _optimize_silhouette_across_linkage_and_metrics(
-             m, linkage_criterion, linkage_method, linkage_metric
+             m, linkage_criterion, linkage_method, linkage_metric, linkage_threshold
          )
          # Perform hierarchical clustering
          Z = linkage(m, method=best_linkage, metric=best_metric)
          logger.warning(
-             f"Linkage criterion: '{linkage_criterion}'\nLinkage method: '{best_linkage}'\nLinkage metric: '{best_metric}'"
+             f"Linkage criterion: '{linkage_criterion}'\nLinkage method: '{best_linkage}'\nLinkage metric: '{best_metric}'\nLinkage threshold: {round(best_threshold, 3)}"
          )
-         logger.debug(f"Optimal linkage threshold: {round(best_threshold, 3)}")
          # Calculate the optimal threshold for clustering
          max_d_optimal = np.max(Z[:, 2]) * best_threshold
          # Assign domains to the annotations matrix
          domains = fcluster(Z, max_d_optimal, criterion=linkage_criterion)
          top_annotations["domain"] = 0
          top_annotations.loc[top_annotations["significant_annotations"], "domain"] = domains
-     except ValueError:
+     except (ValueError, LinAlgError):
          # If a ValueError is encountered, handle it by assigning unique domains
          n_rows = len(top_annotations)
-         logger.error(
-             f"Error encountered. Skipping clustering and assigning {n_rows} unique domains."
-         )
+         if linkage_criterion == "off":
+             logger.warning(
+                 f"Clustering is turned off. Skipping clustering and assigning {n_rows} unique domains."
+             )
+         else:
+             logger.error(
+                 f"Error encountered. Skipping clustering and assigning {n_rows} unique domains."
+             )
          top_annotations["domain"] = range(1, n_rows + 1)  # Assign unique domains

      # Create DataFrames to store domain information
-     node_to_enrichment = pd.DataFrame(
-         data=significant_neighborhoods_enrichment,
+     node_to_significance = pd.DataFrame(
+         data=significant_neighborhoods_significance,
          columns=[top_annotations.index.values, top_annotations["domain"]],
      )
-     node_to_domain = node_to_enrichment.groupby(level="domain", axis=1).sum()
+     node_to_domain = node_to_significance.T.groupby(level="domain").sum().T

-     # Find the maximum enrichment score for each node
+     # Find the maximum significance score for each node
      t_max = node_to_domain.loc[:, 1:].max(axis=1)
      t_idxmax = node_to_domain.loc[:, 1:].idxmax(axis=1)
      t_idxmax[t_max == 0] = 0
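A note on the aggregation change at the end of the hunk above: `DataFrame.groupby(..., axis=1)` is deprecated as of pandas 2.1, and the transpose/groupby/transpose form is the drop-in equivalent. A minimal sketch of the equivalence, with toy shapes and labels invented for illustration:

import numpy as np
import pandas as pd

# Toy stand-in for node_to_significance: 3 nodes x 4 annotations, with a
# "domain" level on the columns, as built in define_domains above.
cols = pd.MultiIndex.from_arrays(
    [["a1", "a2", "a3", "a4"], [1, 1, 2, 2]], names=[None, "domain"]
)
node_to_significance = pd.DataFrame(np.arange(12).reshape(3, 4), columns=cols)

# Deprecated: node_to_significance.groupby(level="domain", axis=1).sum()
# Equivalent used in 0.0.9: group the transposed rows, then transpose back.
node_to_domain = node_to_significance.T.groupby(level="domain").sum().T
print(node_to_domain)  # one summed column per domain id (1 and 2)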
@@ -86,13 +108,13 @@ def define_domains(
      return node_to_domain


- def trim_domains_and_top_annotations(
+ def trim_domains(
      domains: pd.DataFrame,
      top_annotations: pd.DataFrame,
      min_cluster_size: int = 5,
      max_cluster_size: int = 1000,
  ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
-     """Trim domains and top annotations that do not meet size criteria and find outliers.
+     """Trim domains that do not meet size criteria and find outliers.

      Args:
          domains (pd.DataFrame): DataFrame of domain data for the network nodes.
@@ -101,8 +123,7 @@ def trim_domains_and_top_annotations(
          max_cluster_size (int, optional): Maximum size of a cluster to be retained. Defaults to 1000.

      Returns:
-         Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing:
-             - Trimmed annotations (pd.DataFrame)
+         Tuple[pd.DataFrame, pd.DataFrame]:
              - Trimmed domains (pd.DataFrame)
              - A DataFrame with domain labels (pd.DataFrame)
      """
@@ -116,30 +137,30 @@
      invalid_domain_id = 888888
      invalid_domain_ids = {0, invalid_domain_id}
      # Mark domains to be removed
-     top_annotations["domain"].replace(to_remove, invalid_domain_id, inplace=True)
+     top_annotations["domain"] = top_annotations["domain"].replace(to_remove, invalid_domain_id)
      domains.loc[domains["primary_domain"].isin(to_remove), ["primary_domain"]] = invalid_domain_id

-     # Normalize "num enriched neighborhoods" by percentile for each domain and scale to 0-10
+     # Normalize "num significant neighborhoods" by percentile for each domain and scale to 0-10
      top_annotations["normalized_value"] = top_annotations.groupby("domain")[
-         "significant_neighborhood_enrichment_sums"
+         "significant_neighborhood_significance_sums"
      ].transform(lambda x: (x.rank(pct=True) * 10).apply(np.ceil).astype(int))
-     # Modify the lambda function to pass both full_terms and significant_enrichment_score
+     # Modify the lambda function to pass both full_terms and significant_significance_score
      top_annotations["combined_terms"] = top_annotations.apply(
          lambda row: " ".join([str(row["full_terms"])] * row["normalized_value"]), axis=1
      )

-     # Perform the groupby operation while retaining the other columns and adding the weighting with enrichment scores
+     # Perform the groupby operation while retaining the other columns and adding the weighting with significance scores
      domain_labels = (
          top_annotations.groupby("domain")
          .agg(
              full_terms=("full_terms", lambda x: list(x)),
-             enrichment_scores=("significant_enrichment_score", lambda x: list(x)),
+             significance_scores=("significant_significance_score", lambda x: list(x)),
          )
          .reset_index()
      )
      domain_labels["combined_terms"] = domain_labels.apply(
          lambda row: get_weighted_description(
-             pd.Series(row["full_terms"]), pd.Series(row["enrichment_scores"])
+             pd.Series(row["full_terms"]), pd.Series(row["significance_scores"])
          ),
          axis=1,
      )
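The first change in this hunk replaces chained in-place mutation with assignment: `df["col"].replace(..., inplace=True)` acts on an intermediate Series, draws FutureWarnings on recent pandas, and may not propagate under copy-on-write. A minimal sketch with toy data:

import pandas as pd

top_annotations = pd.DataFrame({"domain": [1, 2, 3, 2]})
to_remove = [2]
invalid_domain_id = 888888

# Chained inplace (old form) may never reach the parent frame under
# copy-on-write; assigning the result back always does.
top_annotations["domain"] = top_annotations["domain"].replace(to_remove, invalid_domain_id)
print(top_annotations)  # domain values of 2 are now 888888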
@@ -150,45 +171,72 @@
              "domain": "id",
              "combined_terms": "normalized_description",
              "full_terms": "full_descriptions",
-             "enrichment_scores": "enrichment_scores",
+             "significance_scores": "significance_scores",
          }
      ).set_index("id")

      # Remove invalid domains
-     valid_annotations = top_annotations[~top_annotations["domain"].isin(invalid_domain_ids)].drop(
-         columns=["normalized_value"]
-     )
      valid_domains = domains[~domains["primary_domain"].isin(invalid_domain_ids)]
      valid_trimmed_domains_matrix = trimmed_domains_matrix[
          ~trimmed_domains_matrix.index.isin(invalid_domain_ids)
      ]
-     return valid_annotations, valid_domains, valid_trimmed_domains_matrix
+     return valid_domains, valid_trimmed_domains_matrix
+
+
+ def _safeguard_matrix(matrix: np.ndarray) -> np.ndarray:
+     """Safeguard the matrix by replacing NaN, Inf, and -Inf values.
+
+     Args:
+         matrix (np.ndarray): Data matrix.
+
+     Returns:
+         np.ndarray: Safeguarded data matrix.
+     """
+     # Replace NaN with column mean
+     nan_replacement = np.nanmean(matrix, axis=0)
+     matrix = np.where(np.isnan(matrix), nan_replacement, matrix)
+     # Replace Inf/-Inf with maximum/minimum finite values
+     finite_max = np.nanmax(matrix[np.isfinite(matrix)])
+     finite_min = np.nanmin(matrix[np.isfinite(matrix)])
+     matrix = np.where(np.isposinf(matrix), finite_max, matrix)
+     matrix = np.where(np.isneginf(matrix), finite_min, matrix)
+     # Ensure rows have non-zero variance (optional step)
+     row_variance = np.var(matrix, axis=1)
+     matrix = matrix[row_variance > 0]
+     return matrix


  def _optimize_silhouette_across_linkage_and_metrics(
-     m: np.ndarray, linkage_criterion: str, linkage_method: str, linkage_metric: str
+     m: np.ndarray,
+     linkage_criterion: str,
+     linkage_method: str,
+     linkage_metric: str,
+     linkage_threshold: Union[str, float],
  ) -> Tuple[str, str, float]:
      """Optimize silhouette score across different linkage methods and distance metrics.

      Args:
          m (np.ndarray): Data matrix.
          linkage_criterion (str): Clustering criterion.
-         linkage_method (str): Linkage method for clustering.
-         linkage_metric (str): Linkage metric for clustering.
+         linkage_method (str): Linkage method for clustering. Choose "auto" to optimize.
+         linkage_metric (str): Linkage metric for clustering. Choose "auto" to optimize.
+         linkage_threshold (Union[str, float]): Threshold for clustering. Choose "auto" to optimize.

      Returns:
-         Tuple[str, str, float]: A tuple containing:
+         Tuple[str, str, float]:
              - Best linkage method (str)
              - Best linkage metric (str)
              - Best threshold (float)
      """
+     # Initialize best overall values
      best_overall_method = linkage_method
      best_overall_metric = linkage_metric
+     best_overall_threshold = linkage_threshold
      best_overall_score = -np.inf
-     best_overall_threshold = 1

-     linkage_methods = GROUP_LINKAGE_METHODS if linkage_method == "auto" else [linkage_method]
-     linkage_metrics = GROUP_DISTANCE_METRICS if linkage_metric == "auto" else [linkage_metric]
+     # Set linkage methods and metrics to all combinations if "auto" is selected
+     linkage_methods = LINKAGE_METHODS if linkage_method == "auto" else [linkage_method]
+     linkage_metrics = LINKAGE_METRICS if linkage_metric == "auto" else [linkage_metric]
      total_combinations = len(linkage_methods) * len(linkage_metrics)

      # Evaluating optimal linkage method and metric
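The new `_safeguard_matrix` helper above makes the clustering input finite (and drops zero-variance rows) before `linkage` sees it. A toy walk-through of the same steps, with invented data:

import numpy as np

m = np.array([
    [1.0, np.nan, 3.0],      # NaN -> column nan-mean
    [np.inf, 2.0, -np.inf],  # +/-Inf -> finite max/min
    [5.0, 5.0, 5.0],         # zero-variance row -> dropped
])

m = np.where(np.isnan(m), np.nanmean(m, axis=0), m)
finite = m[np.isfinite(m)]
m = np.where(np.isposinf(m), finite.max(), m)
m = np.where(np.isneginf(m), finite.min(), m)
m = m[np.var(m, axis=1) > 0]
print(m)  # 2 x 3 finite matrix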
@@ -198,14 +246,39 @@ def _optimize_silhouette_across_linkage_and_metrics(
          total=total_combinations,
          bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
      ):
-         with suppress(Exception):
+         # Some linkage methods and metrics may not work with certain data
+         try:
              Z = linkage(m, method=method, metric=metric)
-             threshold, score = _find_best_silhouette_score(Z, m, metric, linkage_criterion)
-             if score > best_overall_score:
-                 best_overall_score = score
-                 best_overall_threshold = threshold
-                 best_overall_method = method
-                 best_overall_metric = metric
+         except (ValueError, LinAlgError):
+             # If linkage fails, set a default threshold (a float) and a very poor score
+             current_threshold = 0.0
+             score = -float("inf")
+         else:
+             # Only optimize silhouette score if the threshold is "auto"
+             if linkage_threshold == "auto":
+                 threshold, score = _find_best_silhouette_score(Z, m, metric, linkage_criterion)
+                 current_threshold = threshold
+             else:
+                 # Use the provided threshold without optimization
+                 score = silhouette_score(
+                     m,
+                     fcluster(Z, linkage_threshold * np.max(Z[:, 2]), criterion=linkage_criterion),
+                     metric=metric,
+                 )
+                 current_threshold = linkage_threshold
+
+         if score > best_overall_score:
+             best_overall_score = score
+             best_overall_threshold = float(current_threshold)  # Ensure it's a float
+             best_overall_method = method
+             best_overall_metric = metric
+
+     # Ensure that we always return a valid tuple:
+     if best_overall_score == -np.inf:
+         # No valid linkage was found; return default values.
+         best_overall_threshold = float(linkage_threshold) if linkage_threshold != "auto" else 0.0
+         best_overall_method = linkage_method
+         best_overall_metric = linkage_metric

      return best_overall_method, best_overall_metric, best_overall_threshold

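When a fixed `linkage_threshold` is supplied, the new code scores it directly instead of sweeping. A standalone sketch of that branch; the random data and the method/metric/threshold choices are assumptions for illustration:

import numpy as np
from scipy.cluster.hierarchy import fcluster, linkage
from sklearn.metrics import silhouette_score

rng = np.random.default_rng(0)
m = rng.normal(size=(20, 5))  # stand-in for the annotation matrix

Z = linkage(m, method="average", metric="euclidean")
labels = fcluster(Z, 0.5 * np.max(Z[:, 2]), criterion="distance")
# silhouette_score raises ValueError with a single cluster, so guard first
if 1 < len(set(labels)) < len(m):
    print(silhouette_score(m, labels, metric="euclidean"))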
@@ -231,7 +304,7 @@ def _find_best_silhouette_score(
          resolution (float, optional): Desired resolution for the best threshold. Defaults to 0.001.

      Returns:
-         Tuple[float, float]: A tuple containing:
+         Tuple[float, float]:
              - Best threshold (float): The threshold that yields the best silhouette score.
              - Best silhouette score (float): The highest silhouette score achieved.
      """