risk-network 0.0.11__py3-none-any.whl → 0.0.12b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- risk/__init__.py +1 -1
- risk/risk.py +5 -5
- {risk_network-0.0.11.dist-info → risk_network-0.0.12b0.dist-info}/METADATA +10 -12
- risk_network-0.0.12b0.dist-info/RECORD +7 -0
- {risk_network-0.0.11.dist-info → risk_network-0.0.12b0.dist-info}/WHEEL +1 -1
- risk/annotations/__init__.py +0 -7
- risk/annotations/annotations.py +0 -354
- risk/annotations/io.py +0 -240
- risk/annotations/nltk_setup.py +0 -85
- risk/log/__init__.py +0 -11
- risk/log/console.py +0 -141
- risk/log/parameters.py +0 -172
- risk/neighborhoods/__init__.py +0 -8
- risk/neighborhoods/api.py +0 -442
- risk/neighborhoods/community.py +0 -412
- risk/neighborhoods/domains.py +0 -358
- risk/neighborhoods/neighborhoods.py +0 -508
- risk/network/__init__.py +0 -6
- risk/network/geometry.py +0 -150
- risk/network/graph/__init__.py +0 -6
- risk/network/graph/api.py +0 -200
- risk/network/graph/graph.py +0 -269
- risk/network/graph/summary.py +0 -254
- risk/network/io.py +0 -550
- risk/network/plotter/__init__.py +0 -6
- risk/network/plotter/api.py +0 -54
- risk/network/plotter/canvas.py +0 -291
- risk/network/plotter/contour.py +0 -330
- risk/network/plotter/labels.py +0 -924
- risk/network/plotter/network.py +0 -294
- risk/network/plotter/plotter.py +0 -143
- risk/network/plotter/utils/colors.py +0 -416
- risk/network/plotter/utils/layout.py +0 -94
- risk/stats/__init__.py +0 -15
- risk/stats/permutation/__init__.py +0 -6
- risk/stats/permutation/permutation.py +0 -237
- risk/stats/permutation/test_functions.py +0 -70
- risk/stats/significance.py +0 -166
- risk/stats/stat_tests.py +0 -267
- risk_network-0.0.11.dist-info/RECORD +0 -41
- {risk_network-0.0.11.dist-info → risk_network-0.0.12b0.dist-info/licenses}/LICENSE +0 -0
- {risk_network-0.0.11.dist-info → risk_network-0.0.12b0.dist-info}/top_level.txt +0 -0
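Per the summary above, only risk/__init__.py and risk/risk.py carry over into the 0.0.12b0 wheel (the new RECORD lists just 7 entries, the rest being dist-info metadata), so imports that reach into the removed subpackages, such as risk.neighborhoods.domains below, resolve only against the older release. A one-line requirements.txt pin that keeps the 0.0.11 layout, shown purely as an illustration:

risk-network==0.0.11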
risk/neighborhoods/domains.py
DELETED
@@ -1,358 +0,0 @@
-"""
-risk/neighborhoods/domains
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-"""
-
-from itertools import product
-from typing import Tuple, Union
-
-import numpy as np
-import pandas as pd
-from numpy.linalg import LinAlgError
-from scipy.cluster.hierarchy import linkage, fcluster
-from sklearn.metrics import silhouette_score
-from tqdm import tqdm
-
-from risk.annotations import get_weighted_description
-from risk.log import logger
-
-
-# Define constants for clustering
-# fmt: off
-LINKAGE_METHODS = {"single", "complete", "average", "weighted", "centroid", "median", "ward"}
-LINKAGE_METRICS = {
-    "braycurtis", "canberra", "chebyshev", "cityblock", "correlation", "cosine", "dice", "euclidean",
-    "hamming", "jaccard", "jensenshannon", "kulczynski1", "mahalanobis", "matching", "minkowski",
-    "rogerstanimoto", "russellrao", "seuclidean", "sokalmichener", "sokalsneath", "sqeuclidean", "yule",
-}
-# fmt: on
-
-
-def define_domains(
-    top_annotations: pd.DataFrame,
-    significant_neighborhoods_significance: np.ndarray,
-    linkage_criterion: str,
-    linkage_method: str,
-    linkage_metric: str,
-    linkage_threshold: Union[float, str],
-) -> pd.DataFrame:
-    """Define domains and assign nodes to these domains based on their significance scores and clustering,
-    handling errors by assigning unique domains when clustering fails.
-
-    Args:
-        top_annotations (pd.DataFrame): DataFrame of top annotations data for the network nodes.
-        significant_neighborhoods_significance (np.ndarray): The binary significance matrix below alpha.
-        linkage_criterion (str): The clustering criterion for defining groups. Choose "off" to disable clustering.
-        linkage_method (str): The linkage method for clustering. Choose "auto" to optimize.
-        linkage_metric (str): The linkage metric for clustering. Choose "auto" to optimize.
-        linkage_threshold (float, str): The threshold for clustering. Choose "auto" to optimize.
-
-    Returns:
-        pd.DataFrame: DataFrame with the primary domain for each node.
-    """
-    try:
-        if linkage_criterion == "off":
-            raise ValueError("Clustering is turned off.")
-
-        # Transpose the matrix to cluster annotations
-        m = significant_neighborhoods_significance[:, top_annotations["significant_annotations"]].T
-        # Safeguard the matrix by replacing NaN, Inf, and -Inf values
-        m = _safeguard_matrix(m)
-        # Optimize silhouette score across different linkage methods and distance metrics
-        best_linkage, best_metric, best_threshold = _optimize_silhouette_across_linkage_and_metrics(
-            m, linkage_criterion, linkage_method, linkage_metric, linkage_threshold
-        )
-        # Perform hierarchical clustering
-        Z = linkage(m, method=best_linkage, metric=best_metric)
-        logger.warning(
-            f"Linkage criterion: '{linkage_criterion}'\nLinkage method: '{best_linkage}'\nLinkage metric: '{best_metric}'\nLinkage threshold: {round(best_threshold, 3)}"
-        )
-        # Calculate the optimal threshold for clustering
-        max_d_optimal = np.max(Z[:, 2]) * best_threshold
-        # Assign domains to the annotations matrix
-        domains = fcluster(Z, max_d_optimal, criterion=linkage_criterion)
-        top_annotations["domain"] = 0
-        top_annotations.loc[top_annotations["significant_annotations"], "domain"] = domains
-    except (ValueError, LinAlgError):
-        # If clustering fails (or is turned off), assign unique domains instead
-        n_rows = len(top_annotations)
-        if linkage_criterion == "off":
-            logger.warning(
-                f"Clustering is turned off. Skipping clustering and assigning {n_rows} unique domains."
-            )
-        else:
-            logger.error(
-                f"Error encountered. Skipping clustering and assigning {n_rows} unique domains."
-            )
-        top_annotations["domain"] = range(1, n_rows + 1)  # Assign unique domains
-
-    # Create DataFrames to store domain information
-    node_to_significance = pd.DataFrame(
-        data=significant_neighborhoods_significance,
-        columns=[top_annotations.index.values, top_annotations["domain"]],
-    )
-    node_to_domain = node_to_significance.T.groupby(level="domain").sum().T
-
-    # Find the maximum significance score for each node
-    t_max = node_to_domain.loc[:, 1:].max(axis=1)
-    t_idxmax = node_to_domain.loc[:, 1:].idxmax(axis=1)
-    t_idxmax[t_max == 0] = 0
-
-    # Assign all domains where the score is greater than 0
-    node_to_domain["all_domains"] = node_to_domain.loc[:, 1:].apply(
-        lambda row: list(row[row > 0].index), axis=1
-    )
-    # Assign primary domain
-    node_to_domain["primary_domain"] = t_idxmax
-
-    return node_to_domain
-
-
-def trim_domains(
-    domains: pd.DataFrame,
-    top_annotations: pd.DataFrame,
-    min_cluster_size: int = 5,
-    max_cluster_size: int = 1000,
-) -> Tuple[pd.DataFrame, pd.DataFrame]:
-    """Trim domains that do not meet size criteria and find outliers.
-
-    Args:
-        domains (pd.DataFrame): DataFrame of domain data for the network nodes.
-        top_annotations (pd.DataFrame): DataFrame of top annotations data for the network nodes.
-        min_cluster_size (int, optional): Minimum size of a cluster to be retained. Defaults to 5.
-        max_cluster_size (int, optional): Maximum size of a cluster to be retained. Defaults to 1000.
-
-    Returns:
-        Tuple[pd.DataFrame, pd.DataFrame]:
-            - Trimmed domains (pd.DataFrame)
-            - A DataFrame with domain labels (pd.DataFrame)
-    """
-    # Identify domains to remove based on size criteria
-    domain_counts = domains["primary_domain"].value_counts()
-    to_remove = set(
-        domain_counts[(domain_counts < min_cluster_size) | (domain_counts > max_cluster_size)].index
-    )
-
-    # Add invalid domain IDs
-    invalid_domain_id = 888888
-    invalid_domain_ids = {0, invalid_domain_id}
-    # Mark domains to be removed
-    top_annotations["domain"] = top_annotations["domain"].replace(to_remove, invalid_domain_id)
-    domains.loc[domains["primary_domain"].isin(to_remove), ["primary_domain"]] = invalid_domain_id
-
-    # Normalize "num significant neighborhoods" by percentile for each domain and scale to 0-10
-    top_annotations["normalized_value"] = top_annotations.groupby("domain")[
-        "significant_neighborhood_significance_sums"
-    ].transform(lambda x: (x.rank(pct=True) * 10).apply(np.ceil).astype(int))
-    # Repeat each annotation's full_terms proportionally to its normalized value for weighting
-    top_annotations["combined_terms"] = top_annotations.apply(
-        lambda row: " ".join([str(row["full_terms"])] * row["normalized_value"]), axis=1
-    )
-
-    # Group by domain, retaining the other columns and weighting by significance scores
-    domain_labels = (
-        top_annotations.groupby("domain")
-        .agg(
-            full_terms=("full_terms", lambda x: list(x)),
-            significance_scores=("significant_significance_score", lambda x: list(x)),
-        )
-        .reset_index()
-    )
-    domain_labels["combined_terms"] = domain_labels.apply(
-        lambda row: get_weighted_description(
-            pd.Series(row["full_terms"]), pd.Series(row["significance_scores"])
-        ),
-        axis=1,
-    )
-
-    # Rename the columns as necessary
-    trimmed_domains_matrix = domain_labels.rename(
-        columns={
-            "domain": "id",
-            "combined_terms": "normalized_description",
-            "full_terms": "full_descriptions",
-            "significance_scores": "significance_scores",
-        }
-    ).set_index("id")
-
-    # Remove invalid domains
-    valid_domains = domains[~domains["primary_domain"].isin(invalid_domain_ids)]
-    valid_trimmed_domains_matrix = trimmed_domains_matrix[
-        ~trimmed_domains_matrix.index.isin(invalid_domain_ids)
-    ]
-    return valid_domains, valid_trimmed_domains_matrix
-
-
-def _safeguard_matrix(matrix: np.ndarray) -> np.ndarray:
-    """Safeguard the matrix by replacing NaN, Inf, and -Inf values.
-
-    Args:
-        matrix (np.ndarray): Data matrix.
-
-    Returns:
-        np.ndarray: Safeguarded data matrix.
-    """
-    # Replace NaN with column mean
-    nan_replacement = np.nanmean(matrix, axis=0)
-    matrix = np.where(np.isnan(matrix), nan_replacement, matrix)
-    # Replace Inf/-Inf with maximum/minimum finite values
-    finite_max = np.nanmax(matrix[np.isfinite(matrix)])
-    finite_min = np.nanmin(matrix[np.isfinite(matrix)])
-    matrix = np.where(np.isposinf(matrix), finite_max, matrix)
-    matrix = np.where(np.isneginf(matrix), finite_min, matrix)
-    # Ensure rows have non-zero variance (optional step)
-    row_variance = np.var(matrix, axis=1)
-    matrix = matrix[row_variance > 0]
-    return matrix
-
-
-def _optimize_silhouette_across_linkage_and_metrics(
-    m: np.ndarray,
-    linkage_criterion: str,
-    linkage_method: str,
-    linkage_metric: str,
-    linkage_threshold: Union[str, float],
-) -> Tuple[str, str, float]:
-    """Optimize silhouette score across different linkage methods and distance metrics.
-
-    Args:
-        m (np.ndarray): Data matrix.
-        linkage_criterion (str): Clustering criterion.
-        linkage_method (str): Linkage method for clustering. Choose "auto" to optimize.
-        linkage_metric (str): Linkage metric for clustering. Choose "auto" to optimize.
-        linkage_threshold (Union[str, float]): Threshold for clustering. Choose "auto" to optimize.
-
-    Returns:
-        Tuple[str, str, float]:
-            - Best linkage method (str)
-            - Best linkage metric (str)
-            - Best threshold (float)
-    """
-    # Initialize best overall values
-    best_overall_method = linkage_method
-    best_overall_metric = linkage_metric
-    best_overall_threshold = linkage_threshold
-    best_overall_score = -np.inf
-
-    # Set linkage methods and metrics to all combinations if "auto" is selected
-    linkage_methods = LINKAGE_METHODS if linkage_method == "auto" else [linkage_method]
-    linkage_metrics = LINKAGE_METRICS if linkage_metric == "auto" else [linkage_metric]
-    total_combinations = len(linkage_methods) * len(linkage_metrics)
-
-    # Evaluate every combination of linkage method and metric
-    for method, metric in tqdm(
-        product(linkage_methods, linkage_metrics),
-        desc="Evaluating optimal linkage method and metric",
-        total=total_combinations,
-        bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
-    ):
-        # Some linkage methods and metrics may not work with certain data
-        try:
-            Z = linkage(m, method=method, metric=metric)
-            if linkage_threshold == "auto":
-                try:
-                    threshold, score = _find_best_silhouette_score(Z, m, metric, linkage_criterion)
-                except (ValueError, LinAlgError):
-                    continue  # Skip to the next combination
-                current_threshold = threshold
-            else:
-                score = silhouette_score(
-                    m,
-                    fcluster(Z, linkage_threshold * np.max(Z[:, 2]), criterion=linkage_criterion),
-                    metric=metric,
-                )
-                current_threshold = linkage_threshold
-        except (ValueError, LinAlgError):
-            continue  # Skip to the next combination
-
-        if score > best_overall_score:
-            best_overall_score = score
-            best_overall_threshold = float(current_threshold)  # Ensure it's a float
-            best_overall_method = method
-            best_overall_metric = metric
-
-    # Ensure that we always return a valid tuple:
-    if best_overall_score == -np.inf:
-        # No valid linkage was found; return default values.
-        best_overall_threshold = float(linkage_threshold) if linkage_threshold != "auto" else 0.0
-        best_overall_method = linkage_method
-        best_overall_metric = linkage_metric
-
-    return best_overall_method, best_overall_metric, best_overall_threshold
-
-
-def _find_best_silhouette_score(
-    Z: np.ndarray,
-    m: np.ndarray,
-    linkage_metric: str,
-    linkage_criterion: str,
-    lower_bound: float = 0.001,
-    upper_bound: float = 1.0,
-) -> Tuple[float, float]:
-    """Find the best silhouette score using binary search.
-
-    Args:
-        Z (np.ndarray): Linkage matrix.
-        m (np.ndarray): Data matrix.
-        linkage_metric (str): Linkage metric for silhouette score calculation.
-        linkage_criterion (str): Clustering criterion.
-        lower_bound (float, optional): Lower bound for search. Defaults to 0.001.
-        upper_bound (float, optional): Upper bound for search. Defaults to 1.0.
-
-    Returns:
-        Tuple[float, float]:
-            - Best threshold (float): The threshold that yields the best silhouette score.
-            - Best silhouette score (float): The highest silhouette score achieved.
-    """
-    best_score = -np.inf
-    best_threshold = None
-    minimum_linkage_threshold = 1e-6
-
-    # Test lower bound
-    max_d_lower = np.max(Z[:, 2]) * lower_bound
-    clusters_lower = fcluster(Z, max_d_lower, criterion=linkage_criterion)
-    try:
-        score_lower = silhouette_score(m, clusters_lower, metric=linkage_metric)
-    except ValueError:
-        score_lower = -np.inf
-
-    # Test upper bound
-    max_d_upper = np.max(Z[:, 2]) * upper_bound
-    clusters_upper = fcluster(Z, max_d_upper, criterion=linkage_criterion)
-    try:
-        score_upper = silhouette_score(m, clusters_upper, metric=linkage_metric)
-    except ValueError:
-        score_upper = -np.inf
-
-    # Determine initial bounds for binary search
-    if score_lower > score_upper:
-        best_score = score_lower
-        best_threshold = lower_bound
-        upper_bound = (lower_bound + upper_bound) / 2
-    else:
-        best_score = score_upper
-        best_threshold = upper_bound
-        lower_bound = (lower_bound + upper_bound) / 2
-
-    # Binary search loop
-    while upper_bound - lower_bound > minimum_linkage_threshold:
-        mid_threshold = (upper_bound + lower_bound) / 2
-        max_d_mid = np.max(Z[:, 2]) * mid_threshold
-        clusters_mid = fcluster(Z, max_d_mid, criterion=linkage_criterion)
-        try:
-            score_mid = silhouette_score(m, clusters_mid, metric=linkage_metric)
-        except ValueError:
-            score_mid = -np.inf
-
-        # Update best score and threshold if the midpoint is better
-        if score_mid > best_score:
-            best_score = score_mid
-            best_threshold = mid_threshold
-
-        # Adjust bounds based on which endpoint scored higher initially
-        if score_lower > score_upper:
-            upper_bound = mid_threshold
-        else:
-            lower_bound = mid_threshold
-
-    return best_threshold, float(best_score)
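For reference, the removal above takes away the module's two public entry points, define_domains and trim_domains. The sketch below shows how they composed as of 0.0.11, inferred only from the signatures and docstrings in the deleted hunk; the input sizes, column values, and the "distance" criterion are illustrative assumptions, not values shipped with the package.

import numpy as np
import pandas as pd

from risk.neighborhoods.domains import define_domains, trim_domains  # import path as of 0.0.11

# Hypothetical inputs: a neighborhoods-by-annotations binary significance matrix
# and matching per-annotation metadata (column names taken from the code above).
rng = np.random.default_rng(0)
n_neighborhoods, n_annotations = 50, 8
significance = (rng.random((n_neighborhoods, n_annotations)) < 0.25).astype(float)
top_annotations = pd.DataFrame(
    {
        "significant_annotations": [True] * 6 + [False] * 2,
        "significant_neighborhood_significance_sums": significance.sum(axis=0),
        "significant_significance_score": rng.random(n_annotations),
        "full_terms": [f"term {i}" for i in range(n_annotations)],
    }
)

# "auto" asks the module to search LINKAGE_METHODS x LINKAGE_METRICS and to
# binary-search the fcluster threshold for the best silhouette score; clustering
# failures fall back to one unique domain per annotation.
domains = define_domains(
    top_annotations,
    significance,
    linkage_criterion="distance",  # assumed; any fcluster criterion, or "off" to disable
    linkage_method="auto",
    linkage_metric="auto",
    linkage_threshold="auto",
)
valid_domains, domain_labels = trim_domains(domains, top_annotations, min_cluster_size=1)

With realistic data the defaults (min_cluster_size=5, max_cluster_size=1000) would move undersized and oversized domains into the 888888 invalid bucket; min_cluster_size=1 is used here only so the toy clusters survive trimming.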