risk-network 0.0.8b18__py3-none-any.whl → 0.0.9b26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- risk/__init__.py +2 -2
- risk/annotations/__init__.py +2 -2
- risk/annotations/annotations.py +133 -72
- risk/annotations/io.py +50 -34
- risk/log/__init__.py +4 -2
- risk/log/{config.py → console.py} +5 -3
- risk/log/{params.py → parameters.py} +21 -46
- risk/neighborhoods/__init__.py +3 -5
- risk/neighborhoods/api.py +446 -0
- risk/neighborhoods/community.py +281 -96
- risk/neighborhoods/domains.py +92 -38
- risk/neighborhoods/neighborhoods.py +210 -149
- risk/network/__init__.py +1 -3
- risk/network/geometry.py +69 -58
- risk/network/graph/__init__.py +6 -0
- risk/network/graph/api.py +194 -0
- risk/network/graph/network.py +269 -0
- risk/network/graph/summary.py +254 -0
- risk/network/io.py +58 -48
- risk/network/plotter/__init__.py +6 -0
- risk/network/plotter/api.py +54 -0
- risk/network/{plot → plotter}/canvas.py +80 -26
- risk/network/{plot → plotter}/contour.py +43 -34
- risk/network/{plot → plotter}/labels.py +123 -113
- risk/network/plotter/network.py +424 -0
- risk/network/plotter/utils/colors.py +416 -0
- risk/network/plotter/utils/layout.py +94 -0
- risk/risk.py +11 -469
- risk/stats/__init__.py +8 -4
- risk/stats/binom.py +51 -0
- risk/stats/chi2.py +69 -0
- risk/stats/hypergeom.py +28 -18
- risk/stats/permutation/__init__.py +1 -1
- risk/stats/permutation/permutation.py +45 -39
- risk/stats/permutation/test_functions.py +25 -17
- risk/stats/poisson.py +17 -11
- risk/stats/stats.py +20 -16
- risk/stats/zscore.py +68 -0
- {risk_network-0.0.8b18.dist-info → risk_network-0.0.9b26.dist-info}/METADATA +9 -5
- risk_network-0.0.9b26.dist-info/RECORD +44 -0
- {risk_network-0.0.8b18.dist-info → risk_network-0.0.9b26.dist-info}/WHEEL +1 -1
- risk/network/graph.py +0 -159
- risk/network/plot/__init__.py +0 -6
- risk/network/plot/network.py +0 -282
- risk/network/plot/plotter.py +0 -137
- risk/network/plot/utils/color.py +0 -353
- risk/network/plot/utils/layout.py +0 -53
- risk_network-0.0.8b18.dist-info/RECORD +0 -37
- {risk_network-0.0.8b18.dist-info → risk_network-0.0.9b26.dist-info}/LICENSE +0 -0
- {risk_network-0.0.8b18.dist-info → risk_network-0.0.9b26.dist-info}/top_level.txt +0 -0
risk/neighborhoods/domains.py
CHANGED
@@ -5,32 +5,32 @@ risk/neighborhoods/domains
|
|
5
5
|
|
6
6
|
from contextlib import suppress
|
7
7
|
from itertools import product
|
8
|
-
from tqdm import tqdm
|
9
8
|
from typing import Tuple
|
10
9
|
|
11
10
|
import numpy as np
|
12
11
|
import pandas as pd
|
13
12
|
from scipy.cluster.hierarchy import linkage, fcluster
|
14
13
|
from sklearn.metrics import silhouette_score
|
14
|
+
from tqdm import tqdm
|
15
15
|
|
16
|
-
from risk.annotations import
|
16
|
+
from risk.annotations import get_weighted_description
|
17
17
|
from risk.constants import GROUP_LINKAGE_METHODS, GROUP_DISTANCE_METRICS
|
18
18
|
from risk.log import logger
|
19
19
|
|
20
20
|
|
21
21
|
def define_domains(
|
22
22
|
top_annotations: pd.DataFrame,
|
23
|
-
|
23
|
+
significant_neighborhoods_significance: np.ndarray,
|
24
24
|
linkage_criterion: str,
|
25
25
|
linkage_method: str,
|
26
26
|
linkage_metric: str,
|
27
27
|
) -> pd.DataFrame:
|
28
|
-
"""Define domains and assign nodes to these domains based on their
|
28
|
+
"""Define domains and assign nodes to these domains based on their significance scores and clustering,
|
29
29
|
handling errors by assigning unique domains when clustering fails.
|
30
30
|
|
31
31
|
Args:
|
32
32
|
top_annotations (pd.DataFrame): DataFrame of top annotations data for the network nodes.
|
33
|
-
|
33
|
+
significant_neighborhoods_significance (np.ndarray): The binary significance matrix below alpha.
|
34
34
|
linkage_criterion (str): The clustering criterion for defining groups.
|
35
35
|
linkage_method (str): The linkage method for clustering.
|
36
36
|
linkage_metric (str): The linkage metric for clustering.
|
@@ -39,8 +39,14 @@ def define_domains(
|
|
39
39
|
pd.DataFrame: DataFrame with the primary domain for each node.
|
40
40
|
"""
|
41
41
|
try:
|
42
|
+
if linkage_criterion == "off":
|
43
|
+
raise ValueError("Clustering is turned off.")
|
44
|
+
|
42
45
|
# Transpose the matrix to cluster annotations
|
43
|
-
m =
|
46
|
+
m = significant_neighborhoods_significance[:, top_annotations["significant_annotations"]].T
|
47
|
+
# Safeguard the matrix by replacing NaN, Inf, and -Inf values
|
48
|
+
m = _safeguard_matrix(m)
|
49
|
+
# Optimize silhouette score across different linkage methods and distance metrics
|
44
50
|
best_linkage, best_metric, best_threshold = _optimize_silhouette_across_linkage_and_metrics(
|
45
51
|
m, linkage_criterion, linkage_method, linkage_metric
|
46
52
|
)
|
@@ -55,40 +61,49 @@ def define_domains(
|
|
55
61
|
# Assign domains to the annotations matrix
|
56
62
|
domains = fcluster(Z, max_d_optimal, criterion=linkage_criterion)
|
57
63
|
top_annotations["domain"] = 0
|
58
|
-
top_annotations.loc[top_annotations["
|
64
|
+
top_annotations.loc[top_annotations["significant_annotations"], "domain"] = domains
|
59
65
|
except ValueError:
|
60
66
|
# If a ValueError is encountered, handle it by assigning unique domains
|
61
67
|
n_rows = len(top_annotations)
|
62
|
-
|
63
|
-
|
64
|
-
|
68
|
+
if linkage_criterion == "off":
|
69
|
+
logger.warning(
|
70
|
+
f"Clustering is turned off. Skipping clustering and assigning {n_rows} unique domains."
|
71
|
+
)
|
72
|
+
else:
|
73
|
+
logger.error(
|
74
|
+
f"Error encountered. Skipping clustering and assigning {n_rows} unique domains."
|
75
|
+
)
|
65
76
|
top_annotations["domain"] = range(1, n_rows + 1) # Assign unique domains
|
66
77
|
|
67
78
|
# Create DataFrames to store domain information
|
68
|
-
|
69
|
-
data=
|
79
|
+
node_to_significance = pd.DataFrame(
|
80
|
+
data=significant_neighborhoods_significance,
|
70
81
|
columns=[top_annotations.index.values, top_annotations["domain"]],
|
71
82
|
)
|
72
|
-
node_to_domain =
|
83
|
+
node_to_domain = node_to_significance.T.groupby(level="domain").sum().T
|
73
84
|
|
74
|
-
# Find the maximum
|
85
|
+
# Find the maximum significance score for each node
|
75
86
|
t_max = node_to_domain.loc[:, 1:].max(axis=1)
|
76
87
|
t_idxmax = node_to_domain.loc[:, 1:].idxmax(axis=1)
|
77
88
|
t_idxmax[t_max == 0] = 0
|
78
89
|
|
90
|
+
# Assign all domains where the score is greater than 0
|
91
|
+
node_to_domain["all_domains"] = node_to_domain.loc[:, 1:].apply(
|
92
|
+
lambda row: list(row[row > 0].index), axis=1
|
93
|
+
)
|
79
94
|
# Assign primary domain
|
80
|
-
node_to_domain["primary domain"] = t_idxmax
|
95
|
+
node_to_domain["primary_domain"] = t_idxmax
|
81
96
|
|
82
97
|
return node_to_domain
|
83
98
|
|
84
99
|
|
85
|
-
def trim_domains_and_top_annotations(
|
100
|
+
def trim_domains(
|
86
101
|
domains: pd.DataFrame,
|
87
102
|
top_annotations: pd.DataFrame,
|
88
103
|
min_cluster_size: int = 5,
|
89
104
|
max_cluster_size: int = 1000,
|
90
105
|
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
|
91
|
-
"""Trim domains
|
106
|
+
"""Trim domains that do not meet size criteria and find outliers.
|
92
107
|
|
93
108
|
Args:
|
94
109
|
domains (pd.DataFrame): DataFrame of domain data for the network nodes.
|
@@ -97,13 +112,12 @@ def trim_domains_and_top_annotations(
|
|
97
112
|
max_cluster_size (int, optional): Maximum size of a cluster to be retained. Defaults to 1000.
|
98
113
|
|
99
114
|
Returns:
|
100
|
-
|
101
|
-
- Trimmed annotations (pd.DataFrame)
|
115
|
+
Tuple[pd.DataFrame, pd.DataFrame]:
|
102
116
|
- Trimmed domains (pd.DataFrame)
|
103
117
|
- A DataFrame with domain labels (pd.DataFrame)
|
104
118
|
"""
|
105
119
|
# Identify domains to remove based on size criteria
|
106
|
-
domain_counts = domains["primary domain"].value_counts()
|
120
|
+
domain_counts = domains["primary_domain"].value_counts()
|
107
121
|
to_remove = set(
|
108
122
|
domain_counts[(domain_counts < min_cluster_size) | (domain_counts > max_cluster_size)].index
|
109
123
|
)
|
@@ -112,34 +126,73 @@ def trim_domains_and_top_annotations(
|
|
112
126
|
invalid_domain_id = 888888
|
113
127
|
invalid_domain_ids = {0, invalid_domain_id}
|
114
128
|
# Mark domains to be removed
|
115
|
-
top_annotations["domain"].replace(to_remove, invalid_domain_id
|
116
|
-
domains.loc[domains["primary domain"].isin(to_remove), ["primary domain"]] = invalid_domain_id
|
129
|
+
top_annotations["domain"] = top_annotations["domain"].replace(to_remove, invalid_domain_id)
|
130
|
+
domains.loc[domains["primary_domain"].isin(to_remove), ["primary_domain"]] = invalid_domain_id
|
117
131
|
|
118
|
-
# Normalize "num
|
132
|
+
# Normalize "num significant neighborhoods" by percentile for each domain and scale to 0-10
|
119
133
|
top_annotations["normalized_value"] = top_annotations.groupby("domain")[
|
120
|
-
"
|
134
|
+
"significant_neighborhood_significance_sums"
|
121
135
|
].transform(lambda x: (x.rank(pct=True) * 10).apply(np.ceil).astype(int))
|
122
|
-
#
|
123
|
-
top_annotations["
|
124
|
-
lambda row: " ".join([str(row["
|
136
|
+
# Modify the lambda function to pass both full_terms and significant_significance_score
|
137
|
+
top_annotations["combined_terms"] = top_annotations.apply(
|
138
|
+
lambda row: " ".join([str(row["full_terms"])] * row["normalized_value"]), axis=1
|
125
139
|
)
|
126
140
|
|
127
|
-
#
|
128
|
-
domain_labels =
|
141
|
+
# Perform the groupby operation while retaining the other columns and adding the weighting with significance scores
|
142
|
+
domain_labels = (
|
143
|
+
top_annotations.groupby("domain")
|
144
|
+
.agg(
|
145
|
+
full_terms=("full_terms", lambda x: list(x)),
|
146
|
+
significance_scores=("significant_significance_score", lambda x: list(x)),
|
147
|
+
)
|
148
|
+
.reset_index()
|
149
|
+
)
|
150
|
+
domain_labels["combined_terms"] = domain_labels.apply(
|
151
|
+
lambda row: get_weighted_description(
|
152
|
+
pd.Series(row["full_terms"]), pd.Series(row["significance_scores"])
|
153
|
+
),
|
154
|
+
axis=1,
|
155
|
+
)
|
156
|
+
|
157
|
+
# Rename the columns as necessary
|
129
158
|
trimmed_domains_matrix = domain_labels.rename(
|
130
|
-
columns={
|
159
|
+
columns={
|
160
|
+
"domain": "id",
|
161
|
+
"combined_terms": "normalized_description",
|
162
|
+
"full_terms": "full_descriptions",
|
163
|
+
"significance_scores": "significance_scores",
|
164
|
+
}
|
131
165
|
).set_index("id")
|
132
166
|
|
133
167
|
# Remove invalid domains
|
134
|
-
|
135
|
-
columns=["normalized_value"]
|
136
|
-
)
|
137
|
-
valid_domains = domains[~domains["primary domain"].isin(invalid_domain_ids)]
|
168
|
+
valid_domains = domains[~domains["primary_domain"].isin(invalid_domain_ids)]
|
138
169
|
valid_trimmed_domains_matrix = trimmed_domains_matrix[
|
139
170
|
~trimmed_domains_matrix.index.isin(invalid_domain_ids)
|
140
171
|
]
|
172
|
+
return valid_domains, valid_trimmed_domains_matrix
|
173
|
+
|
174
|
+
|
175
|
+
def _safeguard_matrix(matrix: np.ndarray) -> np.ndarray:
|
176
|
+
"""Safeguard the matrix by replacing NaN, Inf, and -Inf values.
|
141
177
|
|
142
|
-
|
178
|
+
Args:
|
179
|
+
matrix (np.ndarray): Data matrix.
|
180
|
+
|
181
|
+
Returns:
|
182
|
+
np.ndarray: Safeguarded data matrix.
|
183
|
+
"""
|
184
|
+
# Replace NaN with column mean
|
185
|
+
nan_replacement = np.nanmean(matrix, axis=0)
|
186
|
+
matrix = np.where(np.isnan(matrix), nan_replacement, matrix)
|
187
|
+
# Replace Inf/-Inf with maximum/minimum finite values
|
188
|
+
finite_max = np.nanmax(matrix[np.isfinite(matrix)])
|
189
|
+
finite_min = np.nanmin(matrix[np.isfinite(matrix)])
|
190
|
+
matrix = np.where(np.isposinf(matrix), finite_max, matrix)
|
191
|
+
matrix = np.where(np.isneginf(matrix), finite_min, matrix)
|
192
|
+
# Ensure rows have non-zero variance (optional step)
|
193
|
+
row_variance = np.var(matrix, axis=1)
|
194
|
+
matrix = matrix[row_variance > 0]
|
195
|
+
return matrix
|
143
196
|
|
144
197
|
|
145
198
|
def _optimize_silhouette_across_linkage_and_metrics(
|
@@ -154,7 +207,7 @@ def _optimize_silhouette_across_linkage_and_metrics(
|
|
154
207
|
linkage_metric (str): Linkage metric for clustering.
|
155
208
|
|
156
209
|
Returns:
|
157
|
-
|
210
|
+
Tuple[str, str, float]:
|
158
211
|
- Best linkage method (str)
|
159
212
|
- Best linkage metric (str)
|
160
213
|
- Best threshold (float)
|
@@ -175,7 +228,8 @@ def _optimize_silhouette_across_linkage_and_metrics(
|
|
175
228
|
total=total_combinations,
|
176
229
|
bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
|
177
230
|
):
|
178
|
-
with
|
231
|
+
# Some linkage methods and metrics may not work with certain data
|
232
|
+
with suppress(ValueError):
|
179
233
|
Z = linkage(m, method=method, metric=metric)
|
180
234
|
threshold, score = _find_best_silhouette_score(Z, m, metric, linkage_criterion)
|
181
235
|
if score > best_overall_score:
|
@@ -208,7 +262,7 @@ def _find_best_silhouette_score(
|
|
208
262
|
resolution (float, optional): Desired resolution for the best threshold. Defaults to 0.001.
|
209
263
|
|
210
264
|
Returns:
|
211
|
-
|
265
|
+
Tuple[float, float]:
|
212
266
|
- Best threshold (float): The threshold that yields the best silhouette score.
|
213
267
|
- Best silhouette score (float): The highest silhouette score achieved.
|
214
268
|
"""
|