risk-network 0.0.8b26__py3-none-any.whl → 0.0.9__py3-none-any.whl
This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- risk/__init__.py +2 -2
- risk/annotations/__init__.py +2 -2
- risk/annotations/annotations.py +195 -118
- risk/annotations/io.py +47 -31
- risk/log/__init__.py +4 -2
- risk/log/{config.py → console.py} +5 -3
- risk/log/{params.py → parameters.py} +17 -42
- risk/neighborhoods/__init__.py +3 -5
- risk/neighborhoods/api.py +442 -0
- risk/neighborhoods/community.py +324 -101
- risk/neighborhoods/domains.py +125 -52
- risk/neighborhoods/neighborhoods.py +177 -165
- risk/network/__init__.py +1 -3
- risk/network/geometry.py +71 -89
- risk/network/graph/__init__.py +6 -0
- risk/network/graph/api.py +200 -0
- risk/network/{graph.py → graph/graph.py} +90 -40
- risk/network/graph/summary.py +254 -0
- risk/network/io.py +103 -114
- risk/network/plotter/__init__.py +6 -0
- risk/network/plotter/api.py +54 -0
- risk/network/{plot → plotter}/canvas.py +12 -9
- risk/network/{plot → plotter}/contour.py +27 -24
- risk/network/{plot → plotter}/labels.py +73 -78
- risk/network/{plot → plotter}/network.py +45 -39
- risk/network/{plot → plotter}/plotter.py +23 -17
- risk/network/{plot/utils/color.py → plotter/utils/colors.py} +114 -122
- risk/network/{plot → plotter}/utils/layout.py +10 -7
- risk/risk.py +11 -500
- risk/stats/__init__.py +10 -4
- risk/stats/permutation/__init__.py +1 -1
- risk/stats/permutation/permutation.py +44 -38
- risk/stats/permutation/test_functions.py +26 -18
- risk/stats/{stats.py → significance.py} +17 -15
- risk/stats/stat_tests.py +267 -0
- {risk_network-0.0.8b26.dist-info → risk_network-0.0.9.dist-info}/METADATA +31 -46
- risk_network-0.0.9.dist-info/RECORD +40 -0
- {risk_network-0.0.8b26.dist-info → risk_network-0.0.9.dist-info}/WHEEL +1 -1
- risk/constants.py +0 -31
- risk/network/plot/__init__.py +0 -6
- risk/stats/hypergeom.py +0 -54
- risk/stats/poisson.py +0 -44
- risk_network-0.0.8b26.dist-info/RECORD +0 -37
- {risk_network-0.0.8b26.dist-info → risk_network-0.0.9.dist-info}/LICENSE +0 -0
- {risk_network-0.0.8b26.dist-info → risk_network-0.0.9.dist-info}/top_level.txt +0 -0
risk/neighborhoods/domains.py
CHANGED
@@ -3,75 +3,97 @@ risk/neighborhoods/domains
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 """

-from contextlib import suppress
 from itertools import product
-from
-from typing import Tuple
+from typing import Tuple, Union

 import numpy as np
 import pandas as pd
+from numpy.linalg import LinAlgError
 from scipy.cluster.hierarchy import linkage, fcluster
 from sklearn.metrics import silhouette_score
+from tqdm import tqdm

 from risk.annotations import get_weighted_description
-from risk.constants import GROUP_LINKAGE_METHODS, GROUP_DISTANCE_METRICS
 from risk.log import logger


+# Define constants for clustering
+# fmt: off
+LINKAGE_METHODS = {"single", "complete", "average", "weighted", "centroid", "median", "ward"}
+LINKAGE_METRICS = {
+    "braycurtis", "canberra", "chebyshev", "cityblock", "correlation", "cosine", "dice", "euclidean",
+    "hamming", "jaccard", "jensenshannon", "kulczynski1", "mahalanobis", "matching", "minkowski",
+    "rogerstanimoto", "russellrao", "seuclidean", "sokalmichener", "sokalsneath", "sqeuclidean", "yule",
+}
+# fmt: on
+
+
 def define_domains(
     top_annotations: pd.DataFrame,
-
+    significant_neighborhoods_significance: np.ndarray,
     linkage_criterion: str,
     linkage_method: str,
     linkage_metric: str,
+    linkage_threshold: Union[float, str],
 ) -> pd.DataFrame:
-    """Define domains and assign nodes to these domains based on their
+    """Define domains and assign nodes to these domains based on their significance scores and clustering,
     handling errors by assigning unique domains when clustering fails.

     Args:
         top_annotations (pd.DataFrame): DataFrame of top annotations data for the network nodes.
-
+        significant_neighborhoods_significance (np.ndarray): The binary significance matrix below alpha.
         linkage_criterion (str): The clustering criterion for defining groups.
-        linkage_method (str): The linkage method for clustering.
-        linkage_metric (str): The linkage metric for clustering.
+        linkage_method (str): The linkage method for clustering. Choose "auto" to optimize.
+        linkage_metric (str): The linkage metric for clustering. Choose "auto" to optimize.
+        linkage_threshold (float, str): The threshold for clustering. Choose "auto" to optimize.

     Returns:
         pd.DataFrame: DataFrame with the primary domain for each node.
     """
     try:
+        if linkage_criterion == "off":
+            raise ValueError("Clustering is turned off.")
+
         # Transpose the matrix to cluster annotations
-        m =
+        m = significant_neighborhoods_significance[:, top_annotations["significant_annotations"]].T
+        # Safeguard the matrix by replacing NaN, Inf, and -Inf values
+        m = _safeguard_matrix(m)
+        # Optimize silhouette score across different linkage methods and distance metrics
         best_linkage, best_metric, best_threshold = _optimize_silhouette_across_linkage_and_metrics(
-            m, linkage_criterion, linkage_method, linkage_metric
+            m, linkage_criterion, linkage_method, linkage_metric, linkage_threshold
         )
         # Perform hierarchical clustering
         Z = linkage(m, method=best_linkage, metric=best_metric)
         logger.warning(
-            f"Linkage criterion: '{linkage_criterion}'\nLinkage method: '{best_linkage}'\nLinkage metric: '{best_metric}'"
+            f"Linkage criterion: '{linkage_criterion}'\nLinkage method: '{best_linkage}'\nLinkage metric: '{best_metric}'\nLinkage threshold: {round(best_threshold, 3)}"
         )
-        logger.debug(f"Optimal linkage threshold: {round(best_threshold, 3)}")
         # Calculate the optimal threshold for clustering
         max_d_optimal = np.max(Z[:, 2]) * best_threshold
         # Assign domains to the annotations matrix
         domains = fcluster(Z, max_d_optimal, criterion=linkage_criterion)
         top_annotations["domain"] = 0
         top_annotations.loc[top_annotations["significant_annotations"], "domain"] = domains
-    except ValueError:
+    except (ValueError, LinAlgError):
         # If a ValueError is encountered, handle it by assigning unique domains
         n_rows = len(top_annotations)
-
-
-
+        if linkage_criterion == "off":
+            logger.warning(
+                f"Clustering is turned off. Skipping clustering and assigning {n_rows} unique domains."
+            )
+        else:
+            logger.error(
+                f"Error encountered. Skipping clustering and assigning {n_rows} unique domains."
+            )
         top_annotations["domain"] = range(1, n_rows + 1)  # Assign unique domains

     # Create DataFrames to store domain information
-
-        data=
+    node_to_significance = pd.DataFrame(
+        data=significant_neighborhoods_significance,
         columns=[top_annotations.index.values, top_annotations["domain"]],
     )
-    node_to_domain =
+    node_to_domain = node_to_significance.T.groupby(level="domain").sum().T

-    # Find the maximum
+    # Find the maximum significance score for each node
     t_max = node_to_domain.loc[:, 1:].max(axis=1)
     t_idxmax = node_to_domain.loc[:, 1:].idxmax(axis=1)
     t_idxmax[t_max == 0] = 0
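The clustering step above cuts the dendrogram at a fraction of its tallest merge (max_d_optimal = np.max(Z[:, 2]) * best_threshold) and then labels annotations with fcluster. A minimal, self-contained sketch of that step; the toy matrix and the fixed 0.5 threshold are illustrative, not taken from the package:

import numpy as np
from scipy.cluster.hierarchy import fcluster, linkage

# Toy stand-in for the transposed significance matrix `m`
rng = np.random.default_rng(0)
m = rng.random((8, 5))

Z = linkage(m, method="average", metric="euclidean")
max_d_optimal = np.max(Z[:, 2]) * 0.5  # cut at half the tallest merge distance
domains = fcluster(Z, max_d_optimal, criterion="distance")
print(domains)  # one cluster id per row of m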
@@ -86,13 +108,13 @@ def define_domains(
     return node_to_domain


-def trim_domains_and_top_annotations(
+def trim_domains(
    domains: pd.DataFrame,
     top_annotations: pd.DataFrame,
     min_cluster_size: int = 5,
     max_cluster_size: int = 1000,
 ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
-    """Trim domains
+    """Trim domains that do not meet size criteria and find outliers.

     Args:
         domains (pd.DataFrame): DataFrame of domain data for the network nodes.
@@ -101,8 +123,7 @@ def trim_domains_and_top_annotations(
         max_cluster_size (int, optional): Maximum size of a cluster to be retained. Defaults to 1000.

     Returns:
-        Tuple[pd.DataFrame, pd.DataFrame
-        - Trimmed annotations (pd.DataFrame)
+        Tuple[pd.DataFrame, pd.DataFrame]:
         - Trimmed domains (pd.DataFrame)
         - A DataFrame with domain labels (pd.DataFrame)
     """
@@ -116,30 +137,30 @@ def trim_domains_and_top_annotations(
     invalid_domain_id = 888888
     invalid_domain_ids = {0, invalid_domain_id}
     # Mark domains to be removed
-    top_annotations["domain"].replace(to_remove, invalid_domain_id
+    top_annotations["domain"] = top_annotations["domain"].replace(to_remove, invalid_domain_id)
     domains.loc[domains["primary_domain"].isin(to_remove), ["primary_domain"]] = invalid_domain_id

-    # Normalize "num
+    # Normalize "num significant neighborhoods" by percentile for each domain and scale to 0-10
     top_annotations["normalized_value"] = top_annotations.groupby("domain")[
-        "
+        "significant_neighborhood_significance_sums"
     ].transform(lambda x: (x.rank(pct=True) * 10).apply(np.ceil).astype(int))
-    # Modify the lambda function to pass both full_terms and
+    # Modify the lambda function to pass both full_terms and significant_significance_score
     top_annotations["combined_terms"] = top_annotations.apply(
         lambda row: " ".join([str(row["full_terms"])] * row["normalized_value"]), axis=1
     )

-    # Perform the groupby operation while retaining the other columns and adding the weighting with
+    # Perform the groupby operation while retaining the other columns and adding the weighting with significance scores
     domain_labels = (
         top_annotations.groupby("domain")
         .agg(
             full_terms=("full_terms", lambda x: list(x)),
-
+            significance_scores=("significant_significance_score", lambda x: list(x)),
         )
         .reset_index()
     )
     domain_labels["combined_terms"] = domain_labels.apply(
         lambda row: get_weighted_description(
-            pd.Series(row["full_terms"]), pd.Series(row["
+            pd.Series(row["full_terms"]), pd.Series(row["significance_scores"])
         ),
         axis=1,
     )
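The normalized_value transform above maps each domain's significance sums onto a 1-10 integer scale via percentile rank, and combined_terms then repeats each annotation's terms normalized_value times so stronger annotations carry more weight in the description. A small illustration of the scaling, using hypothetical scores:

import numpy as np
import pandas as pd

scores = pd.Series([0.2, 1.5, 3.0, 7.8])  # hypothetical significance sums within one domain
normalized = (scores.rank(pct=True) * 10).apply(np.ceil).astype(int)
print(normalized.tolist())  # [3, 5, 8, 10]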
@@ -150,45 +171,72 @@ def trim_domains_and_top_annotations(
             "domain": "id",
             "combined_terms": "normalized_description",
             "full_terms": "full_descriptions",
-            "
+            "significance_scores": "significance_scores",
         }
     ).set_index("id")

     # Remove invalid domains
-    valid_annotations = top_annotations[~top_annotations["domain"].isin(invalid_domain_ids)].drop(
-        columns=["normalized_value"]
-    )
     valid_domains = domains[~domains["primary_domain"].isin(invalid_domain_ids)]
     valid_trimmed_domains_matrix = trimmed_domains_matrix[
         ~trimmed_domains_matrix.index.isin(invalid_domain_ids)
     ]
-    return
+    return valid_domains, valid_trimmed_domains_matrix
+
+
+def _safeguard_matrix(matrix: np.ndarray) -> np.ndarray:
+    """Safeguard the matrix by replacing NaN, Inf, and -Inf values.
+
+    Args:
+        matrix (np.ndarray): Data matrix.
+
+    Returns:
+        np.ndarray: Safeguarded data matrix.
+    """
+    # Replace NaN with column mean
+    nan_replacement = np.nanmean(matrix, axis=0)
+    matrix = np.where(np.isnan(matrix), nan_replacement, matrix)
+    # Replace Inf/-Inf with maximum/minimum finite values
+    finite_max = np.nanmax(matrix[np.isfinite(matrix)])
+    finite_min = np.nanmin(matrix[np.isfinite(matrix)])
+    matrix = np.where(np.isposinf(matrix), finite_max, matrix)
+    matrix = np.where(np.isneginf(matrix), finite_min, matrix)
+    # Ensure rows have non-zero variance (optional step)
+    row_variance = np.var(matrix, axis=1)
+    matrix = matrix[row_variance > 0]
+    return matrix


 def _optimize_silhouette_across_linkage_and_metrics(
-    m: np.ndarray,
+    m: np.ndarray,
+    linkage_criterion: str,
+    linkage_method: str,
+    linkage_metric: str,
+    linkage_threshold: Union[str, float],
 ) -> Tuple[str, str, float]:
     """Optimize silhouette score across different linkage methods and distance metrics.

     Args:
         m (np.ndarray): Data matrix.
         linkage_criterion (str): Clustering criterion.
-        linkage_method (str): Linkage method for clustering.
-        linkage_metric (str): Linkage metric for clustering.
+        linkage_method (str): Linkage method for clustering. Choose "auto" to optimize.
+        linkage_metric (str): Linkage metric for clustering. Choose "auto" to optimize.
+        linkage_threshold (Union[str, float]): Threshold for clustering. Choose "auto" to optimize.

     Returns:
-        Tuple[str, str, float]:
+        Tuple[str, str, float]:
         - Best linkage method (str)
         - Best linkage metric (str)
         - Best threshold (float)
     """
+    # Initialize best overall values
     best_overall_method = linkage_method
     best_overall_metric = linkage_metric
+    best_overall_threshold = linkage_threshold
     best_overall_score = -np.inf
-    best_overall_threshold = 1

-
-
+    # Set linkage methods and metrics to all combinations if "auto" is selected
+    linkage_methods = LINKAGE_METHODS if linkage_method == "auto" else [linkage_method]
+    linkage_metrics = LINKAGE_METRICS if linkage_metric == "auto" else [linkage_metric]
     total_combinations = len(linkage_methods) * len(linkage_metrics)

     # Evaluating optimal linkage method and metric
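The new _safeguard_matrix helper makes the matrix safe for linkage by imputing NaN entries with column means, clamping Inf/-Inf to the finite extremes, and dropping zero-variance rows. A quick demonstration of the same NumPy pattern on a toy matrix:

import numpy as np

matrix = np.array([[1.0, np.nan], [np.inf, 4.0], [2.0, 2.0]])
matrix = np.where(np.isnan(matrix), np.nanmean(matrix, axis=0), matrix)  # NaN -> column mean
finite_max = np.nanmax(matrix[np.isfinite(matrix)])
finite_min = np.nanmin(matrix[np.isfinite(matrix)])
matrix = np.where(np.isposinf(matrix), finite_max, matrix)  # +Inf -> largest finite value
matrix = np.where(np.isneginf(matrix), finite_min, matrix)  # -Inf -> smallest finite value
print(matrix)  # [[1. 3.] [4. 4.] [2. 2.]]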
@@ -198,14 +246,39 @@ def _optimize_silhouette_across_linkage_and_metrics(
         total=total_combinations,
         bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
     ):
-        with
+        # Some linkage methods and metrics may not work with certain data
+        try:
             Z = linkage(m, method=method, metric=metric)
-
-
-
-
-
-
+        except (ValueError, LinAlgError):
+            # If linkage fails, set a default threshold (a float) and a very poor score
+            current_threshold = 0.0
+            score = -float("inf")
+        else:
+            # Only optimize silhouette score if the threshold is "auto"
+            if linkage_threshold == "auto":
+                threshold, score = _find_best_silhouette_score(Z, m, metric, linkage_criterion)
+                current_threshold = threshold
+            else:
+                # Use the provided threshold without optimization
+                score = silhouette_score(
+                    m,
+                    fcluster(Z, linkage_threshold * np.max(Z[:, 2]), criterion=linkage_criterion),
+                    metric=metric,
+                )
+                current_threshold = linkage_threshold
+
+        if score > best_overall_score:
+            best_overall_score = score
+            best_overall_threshold = float(current_threshold)  # Ensure it's a float
+            best_overall_method = method
+            best_overall_metric = metric
+
+    # Ensure that we always return a valid tuple:
+    if best_overall_score == -np.inf:
+        # No valid linkage was found; return default values.
+        best_overall_threshold = float(linkage_threshold) if linkage_threshold != "auto" else 0.0
+        best_overall_method = linkage_method
+        best_overall_metric = linkage_metric

     return best_overall_method, best_overall_metric, best_overall_threshold

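When linkage_threshold is a fixed float, the loop above scores it directly: it cuts the tree at that fraction of the tallest merge and evaluates the resulting labels with sklearn's silhouette_score. A self-contained sketch of that scoring path; the two synthetic blobs and the 0.5 threshold are illustrative:

import numpy as np
from scipy.cluster.hierarchy import fcluster, linkage
from sklearn.metrics import silhouette_score

rng = np.random.default_rng(0)
m = np.vstack([rng.normal(0.0, 0.1, (5, 3)), rng.normal(5.0, 0.1, (5, 3))])  # two clear blobs

Z = linkage(m, method="average", metric="euclidean")
labels = fcluster(Z, 0.5 * np.max(Z[:, 2]), criterion="distance")
score = silhouette_score(m, labels, metric="euclidean")  # valid for 2 <= n_clusters <= n_samples - 1
print(round(score, 3))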
@@ -231,7 +304,7 @@ def _find_best_silhouette_score(
         resolution (float, optional): Desired resolution for the best threshold. Defaults to 0.001.

     Returns:
-        Tuple[float, float]:
+        Tuple[float, float]:
         - Best threshold (float): The threshold that yields the best silhouette score.
         - Best silhouette score (float): The highest silhouette score achieved.
     """
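Only the docstring of _find_best_silhouette_score changed in this hunk; its body is not shown. Going by the signature and the resolution parameter, a plausible sketch of such a threshold search is below; the function name and the linear sweep are assumptions, not the package's actual implementation:

import numpy as np
from scipy.cluster.hierarchy import fcluster
from sklearn.metrics import silhouette_score

def find_best_threshold_sketch(Z, m, metric, criterion="distance", resolution=0.001):
    """Hypothetical sketch: sweep thresholds at the given resolution and keep the best score."""
    best_threshold, best_score = 0.0, -np.inf
    for threshold in np.arange(resolution, 1.0 + resolution, resolution):
        labels = fcluster(Z, threshold * np.max(Z[:, 2]), criterion=criterion)
        n_clusters = len(np.unique(labels))
        if not 1 < n_clusters < len(m):  # silhouette_score is undefined otherwise
            continue
        score = silhouette_score(m, labels, metric=metric)
        if score > best_score:
            best_threshold, best_score = float(threshold), score
    return best_threshold, best_score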