risk-network 0.0.11__py3-none-any.whl → 0.0.12b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. risk/__init__.py +1 -1
  2. risk/risk.py +5 -5
  3. {risk_network-0.0.11.dist-info → risk_network-0.0.12b0.dist-info}/METADATA +10 -12
  4. risk_network-0.0.12b0.dist-info/RECORD +7 -0
  5. {risk_network-0.0.11.dist-info → risk_network-0.0.12b0.dist-info}/WHEEL +1 -1
  6. risk/annotations/__init__.py +0 -7
  7. risk/annotations/annotations.py +0 -354
  8. risk/annotations/io.py +0 -240
  9. risk/annotations/nltk_setup.py +0 -85
  10. risk/log/__init__.py +0 -11
  11. risk/log/console.py +0 -141
  12. risk/log/parameters.py +0 -172
  13. risk/neighborhoods/__init__.py +0 -8
  14. risk/neighborhoods/api.py +0 -442
  15. risk/neighborhoods/community.py +0 -412
  16. risk/neighborhoods/domains.py +0 -358
  17. risk/neighborhoods/neighborhoods.py +0 -508
  18. risk/network/__init__.py +0 -6
  19. risk/network/geometry.py +0 -150
  20. risk/network/graph/__init__.py +0 -6
  21. risk/network/graph/api.py +0 -200
  22. risk/network/graph/graph.py +0 -269
  23. risk/network/graph/summary.py +0 -254
  24. risk/network/io.py +0 -550
  25. risk/network/plotter/__init__.py +0 -6
  26. risk/network/plotter/api.py +0 -54
  27. risk/network/plotter/canvas.py +0 -291
  28. risk/network/plotter/contour.py +0 -330
  29. risk/network/plotter/labels.py +0 -924
  30. risk/network/plotter/network.py +0 -294
  31. risk/network/plotter/plotter.py +0 -143
  32. risk/network/plotter/utils/colors.py +0 -416
  33. risk/network/plotter/utils/layout.py +0 -94
  34. risk/stats/__init__.py +0 -15
  35. risk/stats/permutation/__init__.py +0 -6
  36. risk/stats/permutation/permutation.py +0 -237
  37. risk/stats/permutation/test_functions.py +0 -70
  38. risk/stats/significance.py +0 -166
  39. risk/stats/stat_tests.py +0 -267
  40. risk_network-0.0.11.dist-info/RECORD +0 -41
  41. {risk_network-0.0.11.dist-info → risk_network-0.0.12b0.dist-info/licenses}/LICENSE +0 -0
  42. {risk_network-0.0.11.dist-info → risk_network-0.0.12b0.dist-info}/top_level.txt +0 -0
risk/neighborhoods/domains.py DELETED
@@ -1,358 +0,0 @@
-"""
-risk/neighborhoods/domains
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-"""
-
-from itertools import product
-from typing import Tuple, Union
-
-import numpy as np
-import pandas as pd
-from numpy.linalg import LinAlgError
-from scipy.cluster.hierarchy import linkage, fcluster
-from sklearn.metrics import silhouette_score
-from tqdm import tqdm
-
-from risk.annotations import get_weighted_description
-from risk.log import logger
-
-
-# Define constants for clustering
-# fmt: off
-LINKAGE_METHODS = {"single", "complete", "average", "weighted", "centroid", "median", "ward"}
-LINKAGE_METRICS = {
-    "braycurtis", "canberra", "chebyshev", "cityblock", "correlation", "cosine", "dice", "euclidean",
-    "hamming", "jaccard", "jensenshannon", "kulczynski1", "mahalanobis", "matching", "minkowski",
-    "rogerstanimoto", "russellrao", "seuclidean", "sokalmichener", "sokalsneath", "sqeuclidean", "yule",
-}
-# fmt: on
-
-
-def define_domains(
-    top_annotations: pd.DataFrame,
-    significant_neighborhoods_significance: np.ndarray,
-    linkage_criterion: str,
-    linkage_method: str,
-    linkage_metric: str,
-    linkage_threshold: Union[float, str],
-) -> pd.DataFrame:
-    """Define domains and assign nodes to these domains based on their significance scores and clustering,
-    handling errors by assigning unique domains when clustering fails.
-
-    Args:
-        top_annotations (pd.DataFrame): DataFrame of top annotations data for the network nodes.
-        significant_neighborhoods_significance (np.ndarray): The binary significance matrix below alpha.
-        linkage_criterion (str): The clustering criterion for defining groups. Choose "off" to disable clustering.
-        linkage_method (str): The linkage method for clustering. Choose "auto" to optimize.
-        linkage_metric (str): The linkage metric for clustering. Choose "auto" to optimize.
-        linkage_threshold (Union[float, str]): The threshold for clustering. Choose "auto" to optimize.
-
-    Returns:
-        pd.DataFrame: DataFrame with the primary domain for each node.
-    """
-    try:
-        if linkage_criterion == "off":
-            raise ValueError("Clustering is turned off.")
-
-        # Transpose the matrix to cluster annotations
-        m = significant_neighborhoods_significance[:, top_annotations["significant_annotations"]].T
-        # Safeguard the matrix by replacing NaN, Inf, and -Inf values
-        m = _safeguard_matrix(m)
-        # Optimize silhouette score across different linkage methods and distance metrics
-        best_linkage, best_metric, best_threshold = _optimize_silhouette_across_linkage_and_metrics(
-            m, linkage_criterion, linkage_method, linkage_metric, linkage_threshold
-        )
-        # Perform hierarchical clustering
-        Z = linkage(m, method=best_linkage, metric=best_metric)
-        logger.warning(
-            f"Linkage criterion: '{linkage_criterion}'\nLinkage method: '{best_linkage}'\nLinkage metric: '{best_metric}'\nLinkage threshold: {round(best_threshold, 3)}"
-        )
-        # Calculate the optimal threshold for clustering
-        max_d_optimal = np.max(Z[:, 2]) * best_threshold
-        # Assign domains to the annotations matrix
-        domains = fcluster(Z, max_d_optimal, criterion=linkage_criterion)
-        top_annotations["domain"] = 0
-        top_annotations.loc[top_annotations["significant_annotations"], "domain"] = domains
-    except (ValueError, LinAlgError):
-        # If clustering fails or is turned off, assign each annotation a unique domain
-        n_rows = len(top_annotations)
-        if linkage_criterion == "off":
-            logger.warning(
-                f"Clustering is turned off. Skipping clustering and assigning {n_rows} unique domains."
-            )
-        else:
-            logger.error(
-                f"Error encountered. Skipping clustering and assigning {n_rows} unique domains."
-            )
-        top_annotations["domain"] = range(1, n_rows + 1)  # Assign unique domains
-
-    # Create DataFrames to store domain information
-    node_to_significance = pd.DataFrame(
-        data=significant_neighborhoods_significance,
-        columns=[top_annotations.index.values, top_annotations["domain"]],
-    )
-    node_to_domain = node_to_significance.T.groupby(level="domain").sum().T
-
-    # Find the maximum significance score for each node
-    t_max = node_to_domain.loc[:, 1:].max(axis=1)
-    t_idxmax = node_to_domain.loc[:, 1:].idxmax(axis=1)
-    t_idxmax[t_max == 0] = 0
-
-    # Assign all domains where the score is greater than 0
-    node_to_domain["all_domains"] = node_to_domain.loc[:, 1:].apply(
-        lambda row: list(row[row > 0].index), axis=1
-    )
-    # Assign primary domain
-    node_to_domain["primary_domain"] = t_idxmax
-
-    return node_to_domain
-
-
-def trim_domains(
-    domains: pd.DataFrame,
-    top_annotations: pd.DataFrame,
-    min_cluster_size: int = 5,
-    max_cluster_size: int = 1000,
-) -> Tuple[pd.DataFrame, pd.DataFrame]:
-    """Trim domains that do not meet size criteria and find outliers.
-
-    Args:
-        domains (pd.DataFrame): DataFrame of domain data for the network nodes.
-        top_annotations (pd.DataFrame): DataFrame of top annotations data for the network nodes.
-        min_cluster_size (int, optional): Minimum size of a cluster to be retained. Defaults to 5.
-        max_cluster_size (int, optional): Maximum size of a cluster to be retained. Defaults to 1000.
-
-    Returns:
-        Tuple[pd.DataFrame, pd.DataFrame]:
-            - Trimmed domains (pd.DataFrame)
-            - A DataFrame with domain labels (pd.DataFrame)
-    """
-    # Identify domains to remove based on size criteria
-    domain_counts = domains["primary_domain"].value_counts()
-    to_remove = set(
-        domain_counts[(domain_counts < min_cluster_size) | (domain_counts > max_cluster_size)].index
-    )
-
-    # Add invalid domain IDs
-    invalid_domain_id = 888888
-    invalid_domain_ids = {0, invalid_domain_id}
-    # Mark domains to be removed
-    top_annotations["domain"] = top_annotations["domain"].replace(to_remove, invalid_domain_id)
-    domains.loc[domains["primary_domain"].isin(to_remove), ["primary_domain"]] = invalid_domain_id
-
-    # Normalize "num significant neighborhoods" by percentile for each domain and scale to 0-10
-    top_annotations["normalized_value"] = top_annotations.groupby("domain")[
-        "significant_neighborhood_significance_sums"
-    ].transform(lambda x: (x.rank(pct=True) * 10).apply(np.ceil).astype(int))
-    # Repeat each annotation's full terms in proportion to its normalized significance
-    top_annotations["combined_terms"] = top_annotations.apply(
-        lambda row: " ".join([str(row["full_terms"])] * row["normalized_value"]), axis=1
-    )
-
-    # Group by domain, retaining the other columns and weighting terms by significance scores
-    domain_labels = (
-        top_annotations.groupby("domain")
-        .agg(
-            full_terms=("full_terms", lambda x: list(x)),
-            significance_scores=("significant_significance_score", lambda x: list(x)),
-        )
-        .reset_index()
-    )
-    domain_labels["combined_terms"] = domain_labels.apply(
-        lambda row: get_weighted_description(
-            pd.Series(row["full_terms"]), pd.Series(row["significance_scores"])
-        ),
-        axis=1,
-    )
-
-    # Rename the columns as necessary
-    trimmed_domains_matrix = domain_labels.rename(
-        columns={
-            "domain": "id",
-            "combined_terms": "normalized_description",
-            "full_terms": "full_descriptions",
-            "significance_scores": "significance_scores",
-        }
-    ).set_index("id")
-
-    # Remove invalid domains
-    valid_domains = domains[~domains["primary_domain"].isin(invalid_domain_ids)]
-    valid_trimmed_domains_matrix = trimmed_domains_matrix[
-        ~trimmed_domains_matrix.index.isin(invalid_domain_ids)
-    ]
-    return valid_domains, valid_trimmed_domains_matrix
-
-
-def _safeguard_matrix(matrix: np.ndarray) -> np.ndarray:
-    """Safeguard the matrix by replacing NaN, Inf, and -Inf values.
-
-    Args:
-        matrix (np.ndarray): Data matrix.
-
-    Returns:
-        np.ndarray: Safeguarded data matrix.
-    """
-    # Replace NaN with column mean
-    nan_replacement = np.nanmean(matrix, axis=0)
-    matrix = np.where(np.isnan(matrix), nan_replacement, matrix)
-    # Replace Inf/-Inf with maximum/minimum finite values
-    finite_max = np.nanmax(matrix[np.isfinite(matrix)])
-    finite_min = np.nanmin(matrix[np.isfinite(matrix)])
-    matrix = np.where(np.isposinf(matrix), finite_max, matrix)
-    matrix = np.where(np.isneginf(matrix), finite_min, matrix)
-    # Ensure rows have non-zero variance (optional step)
-    row_variance = np.var(matrix, axis=1)
-    matrix = matrix[row_variance > 0]
-    return matrix
-
-
-def _optimize_silhouette_across_linkage_and_metrics(
-    m: np.ndarray,
-    linkage_criterion: str,
-    linkage_method: str,
-    linkage_metric: str,
-    linkage_threshold: Union[str, float],
-) -> Tuple[str, str, float]:
-    """Optimize silhouette score across different linkage methods and distance metrics.
-
-    Args:
-        m (np.ndarray): Data matrix.
-        linkage_criterion (str): Clustering criterion.
-        linkage_method (str): Linkage method for clustering. Choose "auto" to optimize.
-        linkage_metric (str): Linkage metric for clustering. Choose "auto" to optimize.
-        linkage_threshold (Union[str, float]): Threshold for clustering. Choose "auto" to optimize.
-
-    Returns:
-        Tuple[str, str, float]:
-            - Best linkage method (str)
-            - Best linkage metric (str)
-            - Best threshold (float)
-    """
-    # Initialize best overall values
-    best_overall_method = linkage_method
-    best_overall_metric = linkage_metric
-    best_overall_threshold = linkage_threshold
-    best_overall_score = -np.inf
-
-    # Set linkage methods and metrics to all combinations if "auto" is selected
-    linkage_methods = LINKAGE_METHODS if linkage_method == "auto" else [linkage_method]
-    linkage_metrics = LINKAGE_METRICS if linkage_metric == "auto" else [linkage_metric]
-    total_combinations = len(linkage_methods) * len(linkage_metrics)
-
-    # Evaluate each combination of linkage method and metric
-    for method, metric in tqdm(
-        product(linkage_methods, linkage_metrics),
-        desc="Evaluating optimal linkage method and metric",
-        total=total_combinations,
-        bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
-    ):
-        # Some linkage methods and metrics may not work with certain data
-        try:
-            Z = linkage(m, method=method, metric=metric)
-            if linkage_threshold == "auto":
-                try:
-                    threshold, score = _find_best_silhouette_score(Z, m, metric, linkage_criterion)
-                except (ValueError, LinAlgError):
-                    continue  # Skip to the next combination
-                current_threshold = threshold
-            else:
-                score = silhouette_score(
-                    m,
-                    fcluster(Z, linkage_threshold * np.max(Z[:, 2]), criterion=linkage_criterion),
-                    metric=metric,
-                )
-                current_threshold = linkage_threshold
-        except (ValueError, LinAlgError):
-            continue  # Skip to the next combination
-
-        if score > best_overall_score:
-            best_overall_score = score
-            best_overall_threshold = float(current_threshold)  # Ensure it's a float
-            best_overall_method = method
-            best_overall_metric = metric
-
-    # Ensure that we always return a valid tuple:
-    if best_overall_score == -np.inf:
-        # No valid linkage was found; return default values.
-        best_overall_threshold = float(linkage_threshold) if linkage_threshold != "auto" else 0.0
-        best_overall_method = linkage_method
-        best_overall_metric = linkage_metric
-
-    return best_overall_method, best_overall_metric, best_overall_threshold
-
-
-def _find_best_silhouette_score(
-    Z: np.ndarray,
-    m: np.ndarray,
-    linkage_metric: str,
-    linkage_criterion: str,
-    lower_bound: float = 0.001,
-    upper_bound: float = 1.0,
-) -> Tuple[float, float]:
-    """Find the best silhouette score using binary search.
-
-    Args:
-        Z (np.ndarray): Linkage matrix.
-        m (np.ndarray): Data matrix.
-        linkage_metric (str): Linkage metric for silhouette score calculation.
-        linkage_criterion (str): Clustering criterion.
-        lower_bound (float, optional): Lower bound for search. Defaults to 0.001.
-        upper_bound (float, optional): Upper bound for search. Defaults to 1.0.
-
-    Returns:
-        Tuple[float, float]:
-            - Best threshold (float): The threshold that yields the best silhouette score.
-            - Best silhouette score (float): The highest silhouette score achieved.
-    """
-    best_score = -np.inf
-    best_threshold = None
-    minimum_linkage_threshold = 1e-6
-
-    # Test lower bound
-    max_d_lower = np.max(Z[:, 2]) * lower_bound
-    clusters_lower = fcluster(Z, max_d_lower, criterion=linkage_criterion)
-    try:
-        score_lower = silhouette_score(m, clusters_lower, metric=linkage_metric)
-    except ValueError:
-        score_lower = -np.inf
-
-    # Test upper bound
-    max_d_upper = np.max(Z[:, 2]) * upper_bound
-    clusters_upper = fcluster(Z, max_d_upper, criterion=linkage_criterion)
-    try:
-        score_upper = silhouette_score(m, clusters_upper, metric=linkage_metric)
-    except ValueError:
-        score_upper = -np.inf
-
-    # Determine initial bounds for binary search
-    if score_lower > score_upper:
-        best_score = score_lower
-        best_threshold = lower_bound
-        upper_bound = (lower_bound + upper_bound) / 2
-    else:
-        best_score = score_upper
-        best_threshold = upper_bound
-        lower_bound = (lower_bound + upper_bound) / 2
-
-    # Binary search loop
-    while upper_bound - lower_bound > minimum_linkage_threshold:
-        mid_threshold = (upper_bound + lower_bound) / 2
-        max_d_mid = np.max(Z[:, 2]) * mid_threshold
-        clusters_mid = fcluster(Z, max_d_mid, criterion=linkage_criterion)
-        try:
-            score_mid = silhouette_score(m, clusters_mid, metric=linkage_metric)
-        except ValueError:
-            score_mid = -np.inf
-
-        # Update best score and threshold if mid-point is better
-        if score_mid > best_score:
-            best_score = score_mid
-            best_threshold = mid_threshold
-
-        # Adjust bounds based on the scores
-        if score_lower > score_upper:
-            upper_bound = mid_threshold
-        else:
-            lower_bound = mid_threshold
-
-    return best_threshold, float(best_score)
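
The hunk above removes the module wholesale, so for orientation here is a minimal, hypothetical sketch of how its two public functions were typically wired together under risk-network==0.0.11. Everything in the toy setup (the matrix, the column values, the parameter choices) is an assumption for illustration; only the import path, the function signatures, and the required column names come from the deleted source.

```python
# Minimal usage sketch for the removed module, against risk-network==0.0.11.
# The toy inputs below are hypothetical; only the import path, function
# signatures, and required column names come from the deleted source above.
import numpy as np
import pandas as pd

from risk.neighborhoods.domains import define_domains, trim_domains

rng = np.random.default_rng(0)
n_nodes, n_annotations = 50, 8

# Hypothetical binary node-by-annotation significance matrix.
significance = rng.integers(0, 2, size=(n_nodes, n_annotations)).astype(float)

# Hypothetical annotation table with the columns the two functions read.
top_annotations = pd.DataFrame(
    {
        "significant_annotations": [True] * n_annotations,
        "significant_neighborhood_significance_sums": significance.sum(axis=0),
        "full_terms": [f"term_{i}" for i in range(n_annotations)],
        "significant_significance_score": rng.random(n_annotations),
    }
)

# linkage_threshold="auto" triggers the binary search in
# _find_best_silhouette_score; "auto" for method/metric would additionally
# grid-search all LINKAGE_METHODS x LINKAGE_METRICS combinations.
domains = define_domains(
    top_annotations,
    significance,
    linkage_criterion="distance",
    linkage_method="average",
    linkage_metric="euclidean",
    linkage_threshold="auto",
)

# Drop undersized/oversized domains and build weighted domain labels.
valid_domains, domain_labels = trim_domains(domains, top_annotations, min_cluster_size=2)
print(valid_domains["primary_domain"].value_counts())
print(domain_labels["normalized_description"])
```

Since 0.0.12b0 ships without this module, code that depends on risk.neighborhoods.domains needs to pin risk-network==0.0.11 until an equivalent API appears in the 0.0.12 line.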