risk-network 0.0.8b26-py3-none-any.whl → 0.0.9b26-py3-none-any.whl
- risk/__init__.py +2 -2
- risk/annotations/__init__.py +2 -2
- risk/annotations/annotations.py +74 -47
- risk/annotations/io.py +47 -31
- risk/log/__init__.py +4 -2
- risk/log/{config.py → console.py} +5 -3
- risk/log/{params.py → parameters.py} +17 -42
- risk/neighborhoods/__init__.py +3 -5
- risk/neighborhoods/api.py +446 -0
- risk/neighborhoods/community.py +255 -77
- risk/neighborhoods/domains.py +62 -31
- risk/neighborhoods/neighborhoods.py +156 -160
- risk/network/__init__.py +1 -3
- risk/network/geometry.py +65 -57
- risk/network/graph/__init__.py +6 -0
- risk/network/graph/api.py +194 -0
- risk/network/{graph.py → graph/network.py} +87 -37
- risk/network/graph/summary.py +254 -0
- risk/network/io.py +56 -47
- risk/network/plotter/__init__.py +6 -0
- risk/network/plotter/api.py +54 -0
- risk/network/{plot → plotter}/canvas.py +7 -4
- risk/network/{plot → plotter}/contour.py +22 -19
- risk/network/{plot → plotter}/labels.py +69 -74
- risk/network/{plot → plotter}/network.py +170 -34
- risk/network/{plot/utils/color.py → plotter/utils/colors.py} +104 -112
- risk/network/{plot → plotter}/utils/layout.py +8 -5
- risk/risk.py +11 -500
- risk/stats/__init__.py +8 -4
- risk/stats/binom.py +51 -0
- risk/stats/chi2.py +69 -0
- risk/stats/hypergeom.py +27 -17
- risk/stats/permutation/__init__.py +1 -1
- risk/stats/permutation/permutation.py +44 -38
- risk/stats/permutation/test_functions.py +25 -17
- risk/stats/poisson.py +15 -9
- risk/stats/stats.py +15 -13
- risk/stats/zscore.py +68 -0
- {risk_network-0.0.8b26.dist-info → risk_network-0.0.9b26.dist-info}/METADATA +9 -5
- risk_network-0.0.9b26.dist-info/RECORD +44 -0
- {risk_network-0.0.8b26.dist-info → risk_network-0.0.9b26.dist-info}/WHEEL +1 -1
- risk/network/plot/__init__.py +0 -6
- risk/network/plot/plotter.py +0 -137
- risk_network-0.0.8b26.dist-info/RECORD +0 -37
- {risk_network-0.0.8b26.dist-info → risk_network-0.0.9b26.dist-info}/LICENSE +0 -0
- {risk_network-0.0.8b26.dist-info → risk_network-0.0.9b26.dist-info}/top_level.txt +0 -0
risk/neighborhoods/community.py
CHANGED
@@ -4,186 +4,364 @@ risk/neighborhoods/community
 """
 
 import community as community_louvain
+import igraph as ig
+import markov_clustering as mc
 import networkx as nx
 import numpy as np
-import markov_clustering as mc
-from networkx.algorithms.community import greedy_modularity_communities
+from leidenalg import find_partition, RBConfigurationVertexPartition
+from networkx.algorithms.community import greedy_modularity_communities
 
+from risk.log import logger
 
-
+
+def calculate_greedy_modularity_neighborhoods(
+    network: nx.Graph, fraction_shortest_edges: float = 1.0
+) -> np.ndarray:
     """Calculate neighborhoods using the Greedy Modularity method.
 
     Args:
-        network (nx.Graph): The network graph
+        network (nx.Graph): The network graph.
+        fraction_shortest_edges (float, optional): Shortest edge rank fraction threshold for creating
+            subgraphs before clustering.
 
     Returns:
         np.ndarray: A binary neighborhood matrix where nodes in the same community have 1, and others have 0.
     """
+    # Create a subgraph with the shortest edges based on the rank fraction
+    subnetwork = _create_percentile_limited_subgraph(
+        network, fraction_shortest_edges=fraction_shortest_edges
+    )
     # Detect communities using the Greedy Modularity method
-    communities = greedy_modularity_communities(network)
+    communities = greedy_modularity_communities(subnetwork)
+    # Get the list of nodes in the original NetworkX graph
+    nodes = list(network.nodes())
+    node_index_map = {node: idx for idx, node in enumerate(nodes)}
     # Create a binary neighborhood matrix
-
-    neighborhoods
-
-    node_index = {node: i for i, node in enumerate(network.nodes())}
+    num_nodes = len(nodes)
+    # Initialize neighborhoods with zeros and set self-self entries to 1
+    neighborhoods = np.eye(num_nodes, dtype=int)
     # Fill in the neighborhood matrix for nodes in the same community
     for community in communities:
         # Iterate through all pairs of nodes in the same community
         for node_i in community:
-            idx_i = node_index[node_i]
             for node_j in community:
-
+                idx_i = node_index_map[node_i]
+                idx_j = node_index_map[node_j]
                 # Set them as neighbors (1) in the binary matrix
                 neighborhoods[idx_i, idx_j] = 1
 
     return neighborhoods
 
 
-def calculate_label_propagation_neighborhoods(
+def calculate_label_propagation_neighborhoods(
+    network: nx.Graph, fraction_shortest_edges: float = 1.0
+) -> np.ndarray:
     """Apply Label Propagation to the network to detect communities.
 
     Args:
         network (nx.Graph): The network graph.
+        fraction_shortest_edges (float, optional): Shortest edge rank fraction threshold for creating
+            subgraphs before clustering.
 
     Returns:
-        np.ndarray:
+        np.ndarray: A binary neighborhood matrix on Label Propagation.
     """
+    # Create a subgraph with the shortest edges based on the rank fraction
+    subnetwork = _create_percentile_limited_subgraph(
+        network, fraction_shortest_edges=fraction_shortest_edges
+    )
     # Apply Label Propagation for community detection
-    communities = nx.algorithms.community.label_propagation.label_propagation_communities(
+    communities = nx.algorithms.community.label_propagation.label_propagation_communities(
+        subnetwork
+    )
+    # Get the list of nodes in the network
+    nodes = list(network.nodes())
+    node_index_map = {node: idx for idx, node in enumerate(nodes)}
     # Create a binary neighborhood matrix
-    num_nodes =
-    neighborhoods
-
-
-    # Assign neighborhoods based on community labels
+    num_nodes = len(nodes)
+    # Initialize neighborhoods with zeros and set self-self entries to 1
+    neighborhoods = np.eye(num_nodes, dtype=int)
+    # Assign neighborhoods based on community labels using the mapped indices
     for community in communities:
         for node_i in community:
-            idx_i = node_index[node_i]
             for node_j in community:
-
+                idx_i = node_index_map[node_i]
+                idx_j = node_index_map[node_j]
+                neighborhoods[idx_i, idx_j] = 1
+
+    return neighborhoods
+
+
+def calculate_leiden_neighborhoods(
+    network: nx.Graph,
+    resolution: float = 1.0,
+    fraction_shortest_edges: float = 1.0,
+    random_seed: int = 888,
+) -> np.ndarray:
+    """Calculate neighborhoods using the Leiden method.
+
+    Args:
+        network (nx.Graph): The network graph.
+        resolution (float, optional): Resolution parameter for the Leiden method. Defaults to 1.0.
+        fraction_shortest_edges (float, optional): Shortest edge rank fraction threshold for creating
+            subgraphs before clustering.
+        random_seed (int, optional): Random seed for reproducibility. Defaults to 888.
+
+    Returns:
+        np.ndarray: A binary neighborhood matrix where nodes in the same community have 1, and others have 0.
+    """
+    # Create a subgraph with the shortest edges based on the rank fraction
+    subnetwork = _create_percentile_limited_subgraph(
+        network, fraction_shortest_edges=fraction_shortest_edges
+    )
+    # Convert NetworkX graph to iGraph
+    igraph_network = ig.Graph.from_networkx(subnetwork)
+    # Apply Leiden algorithm using RBConfigurationVertexPartition, which supports resolution
+    partition = find_partition(
+        igraph_network,
+        partition_type=RBConfigurationVertexPartition,
+        resolution_parameter=resolution,
+        seed=random_seed,
+    )
+    # Get the list of nodes in the original NetworkX graph
+    nodes = list(network.nodes())
+    node_index_map = {node: idx for idx, node in enumerate(nodes)}
+    # Create a binary neighborhood matrix
+    num_nodes = len(nodes)
+    # Initialize neighborhoods with zeros and set self-self entries to 1
+    neighborhoods = np.eye(num_nodes, dtype=int)
+    # Assign neighborhoods based on community partitions using the mapped indices
+    for community in partition:
+        for node_i in community:
+            for node_j in community:
+                idx_i = node_index_map[igraph_network.vs[node_i]["_nx_name"]]
+                idx_j = node_index_map[igraph_network.vs[node_j]["_nx_name"]]
                 neighborhoods[idx_i, idx_j] = 1
 
     return neighborhoods
 
 
 def calculate_louvain_neighborhoods(
-    network: nx.Graph,
+    network: nx.Graph,
+    resolution: float = 0.1,
+    fraction_shortest_edges: float = 1.0,
+    random_seed: int = 888,
 ) -> np.ndarray:
     """Calculate neighborhoods using the Louvain method.
 
     Args:
         network (nx.Graph): The network graph.
-        resolution (float): Resolution parameter for the Louvain method.
+        resolution (float, optional): Resolution parameter for the Louvain method. Defaults to 0.1.
+        fraction_shortest_edges (float, optional): Shortest edge rank fraction threshold for creating
+            subgraphs before clustering.
         random_seed (int, optional): Random seed for reproducibility. Defaults to 888.
 
     Returns:
-        np.ndarray:
+        np.ndarray: A binary neighborhood matrix on the Louvain method.
     """
+    # Create a subgraph with the shortest edges based on the rank fraction
+    subnetwork = _create_percentile_limited_subgraph(
+        network, fraction_shortest_edges=fraction_shortest_edges
+    )
     # Apply Louvain method to partition the network
     partition = community_louvain.best_partition(
-
+        subnetwork, resolution=resolution, random_state=random_seed
     )
+    # Get the list of nodes in the network and create a mapping to indices
+    nodes = list(network.nodes())
+    node_index_map = {node: idx for idx, node in enumerate(nodes)}
     # Create a binary neighborhood matrix
-    num_nodes =
-    neighborhoods
-
-    node_index = {node: i for i, node in enumerate(network.nodes())}
+    num_nodes = len(nodes)
+    # Initialize neighborhoods with zeros and set self-self entries to 1
+    neighborhoods = np.eye(num_nodes, dtype=int)
     # Group nodes by community
     community_groups = {}
     for node, community in partition.items():
         community_groups.setdefault(community, []).append(node)
 
-    # Assign neighborhoods based on community partitions
+    # Assign neighborhoods based on community partitions using the mapped indices
     for community, nodes in community_groups.items():
         for node_i in nodes:
-            idx_i = node_index[node_i]
             for node_j in nodes:
-
+                idx_i = node_index_map[node_i]
+                idx_j = node_index_map[node_j]
                 neighborhoods[idx_i, idx_j] = 1
 
     return neighborhoods
 
 
-def calculate_markov_clustering_neighborhoods(
-
+def calculate_markov_clustering_neighborhoods(
+    network: nx.Graph, fraction_shortest_edges: float = 1.0
+) -> np.ndarray:
+    """Apply Markov Clustering (MCL) to the network and return a binary neighborhood matrix.
 
     Args:
         network (nx.Graph): The network graph.
+        fraction_shortest_edges (float, optional): Shortest edge rank fraction threshold for creating
+            subgraphs before clustering.
 
     Returns:
-        np.ndarray:
+        np.ndarray: A binary neighborhood matrix on Markov Clustering.
     """
-    #
-
-
-
-    #
+    # Create a subgraph with the shortest edges based on the rank fraction
+    subnetwork = _create_percentile_limited_subgraph(
+        network, fraction_shortest_edges=fraction_shortest_edges
+    )
+    # Step 1: Convert the subnetwork to an adjacency matrix
+    subnetwork_nodes = list(subnetwork.nodes())
+    adjacency_matrix = nx.to_numpy_array(subnetwork, nodelist=subnetwork_nodes)
+    # Step 2: Run Markov Clustering (MCL) on the subnetwork's adjacency matrix
+    result = mc.run_mcl(adjacency_matrix)
     clusters = mc.get_clusters(result)
-    #
-
-
-
-
-
+    # Step 3: Prepare the original network nodes and indices
+    nodes = list(network.nodes())
+    node_index_map = {node: idx for idx, node in enumerate(nodes)}
+    num_nodes = len(nodes)
+    # Step 4: Initialize the neighborhood matrix for the original network
+    neighborhoods = np.eye(num_nodes, dtype=int)
+    # Step 5: Fill the neighborhoods matrix using the clusters from the subnetwork
     for cluster in clusters:
         for node_i in cluster:
-            idx_i = node_index[node_i]
             for node_j in cluster:
-
-
+                # Map the indices back to the original network's node indices
+                original_node_i = subnetwork_nodes[node_i]
+                original_node_j = subnetwork_nodes[node_j]
+
+                if original_node_i in node_index_map and original_node_j in node_index_map:
+                    idx_i = node_index_map[original_node_i]
+                    idx_j = node_index_map[original_node_j]
+                    neighborhoods[idx_i, idx_j] = 1
 
     return neighborhoods
 
 
-def calculate_spinglass_neighborhoods(
-
+def calculate_spinglass_neighborhoods(
+    network: nx.Graph, fraction_shortest_edges: float = 1.0
+) -> np.ndarray:
+    """Apply Spinglass Community Detection to the network, handling disconnected components.
 
     Args:
         network (nx.Graph): The network graph.
+        fraction_shortest_edges (float, optional): Shortest edge rank fraction threshold for creating
+            subgraphs before clustering.
 
     Returns:
-        np.ndarray:
+        np.ndarray: A binary neighborhood matrix based on Spinglass communities.
     """
-    #
-
-
-
-
-
-
-
-    for
-
-
-
-
-
+    # Create a subgraph with the shortest edges based on the rank fraction
+    subnetwork = _create_percentile_limited_subgraph(
+        network, fraction_shortest_edges=fraction_shortest_edges
+    )
+    # Step 1: Find connected components in the graph
+    components = list(nx.connected_components(subnetwork))
+    # Prepare to store community results
+    nodes = list(network.nodes())
+    node_index_map = {node: idx for idx, node in enumerate(nodes)}
+    num_nodes = len(nodes)
+    # Initialize neighborhoods with zeros and set self-self entries to 1
+    neighborhoods = np.eye(num_nodes, dtype=int)
+    # Step 2: Run Spinglass on each connected component
+    for component in components:
+        # Extract the subgraph corresponding to the current component
+        subgraph = network.subgraph(component)
+        # Convert the subgraph to an iGraph object
+        igraph_subgraph = ig.Graph.from_networkx(subgraph)
+        # Ensure the subgraph is connected before running Spinglass
+        if not igraph_subgraph.is_connected():
+            logger.error("Warning: Subgraph is not connected. Skipping...")
+            continue
+
+        # Apply Spinglass community detection
+        try:
+            communities = igraph_subgraph.community_spinglass()
+        except Exception as e:
+            logger.error(f"Error running Spinglass on component: {e}")
+            continue
+
+        # Step 3: Assign neighborhoods based on community labels
+        for community in communities:
+            for node_i in community:
+                for node_j in community:
+                    idx_i = node_index_map[igraph_subgraph.vs[node_i]["_nx_name"]]
+                    idx_j = node_index_map[igraph_subgraph.vs[node_j]["_nx_name"]]
+                    neighborhoods[idx_i, idx_j] = 1
 
     return neighborhoods
 
 
-def calculate_walktrap_neighborhoods(
+def calculate_walktrap_neighborhoods(
+    network: nx.Graph, fraction_shortest_edges: float = 1.0
+) -> np.ndarray:
     """Apply Walktrap Community Detection to the network.
 
     Args:
         network (nx.Graph): The network graph.
+        fraction_shortest_edges (float, optional): Shortest edge rank fraction threshold for creating
+            subgraphs before clustering.
 
     Returns:
-        np.ndarray:
+        np.ndarray: A binary neighborhood matrix on Walktrap communities.
     """
-    #
-
+    # Create a subgraph with the shortest edges based on the rank fraction
+    subnetwork = _create_percentile_limited_subgraph(
+        network, fraction_shortest_edges=fraction_shortest_edges
+    )
+    # Convert NetworkX graph to iGraph
+    igraph_network = ig.Graph.from_networkx(subnetwork)
+    # Apply Walktrap community detection
+    communities = igraph_network.community_walktrap().as_clustering()
+    # Get the list of nodes in the original NetworkX graph
+    nodes = list(network.nodes())
+    node_index_map = {node: idx for idx, node in enumerate(nodes)}
     # Create a binary neighborhood matrix
-    num_nodes =
-    neighborhoods
-
-
-    # Assign neighborhoods based on community labels from LPA
+    num_nodes = len(nodes)
+    # Initialize neighborhoods with zeros and set self-self entries to 1
+    neighborhoods = np.eye(num_nodes, dtype=int)
+    # Assign neighborhoods based on community labels
     for community in communities:
         for node_i in community:
-            idx_i = node_index[node_i]
             for node_j in community:
-
+                idx_i = node_index_map[igraph_network.vs[node_i]["_nx_name"]]
+                idx_j = node_index_map[igraph_network.vs[node_j]["_nx_name"]]
                 neighborhoods[idx_i, idx_j] = 1
 
     return neighborhoods
+
+
+def _create_percentile_limited_subgraph(G: nx.Graph, fraction_shortest_edges: float) -> nx.Graph:
+    """Create a subgraph containing the shortest edges based on the specified rank fraction
+    of all edge lengths in the input graph.
+
+    Args:
+        G (nx.Graph): The input graph with 'length' attributes on edges.
+        fraction_shortest_edges (float): The rank fraction (between 0 and 1) to filter edges.
+
+    Returns:
+        nx.Graph: A subgraph with nodes and edges where the edges are within the shortest
+            specified rank fraction.
+    """
+    # Step 1: Extract edges with their lengths
+    edges_with_length = [(u, v, d) for u, v, d in G.edges(data=True) if "length" in d]
+    if not edges_with_length:
+        raise ValueError(
+            "No edge lengths found in the graph. Ensure edges have 'length' attributes."
+        )
+
+    # Step 2: Sort edges by length in ascending order
+    edges_with_length.sort(key=lambda x: x[2]["length"])
+    # Step 3: Calculate the cutoff index for the given rank fraction
+    cutoff_index = int(fraction_shortest_edges * len(edges_with_length))
+    if cutoff_index == 0:
+        raise ValueError("The rank fraction is too low, resulting in no edges being included.")
+
+    # Step 4: Create the subgraph by selecting only the shortest edges within the rank fraction
+    subgraph = nx.Graph()
+    subgraph.add_nodes_from(G.nodes(data=True))  # Retain all nodes from the original graph
+    subgraph.add_edges_from(edges_with_length[:cutoff_index])
+    # Step 5: Remove nodes with no edges
+    subgraph.remove_nodes_from(list(nx.isolates(subgraph)))
+    # Step 6: Check if the resulting subgraph has no edges and issue a warning
+    if subgraph.number_of_edges() == 0:
+        raise Warning("The resulting subgraph has no edges. Consider adjusting the rank fraction.")
+
+    return subgraph
risk/neighborhoods/domains.py
CHANGED
@@ -5,13 +5,13 @@ risk/neighborhoods/domains
 
 from contextlib import suppress
 from itertools import product
-from tqdm import tqdm
 from typing import Tuple
 
 import numpy as np
 import pandas as pd
 from scipy.cluster.hierarchy import linkage, fcluster
 from sklearn.metrics import silhouette_score
+from tqdm import tqdm
 
 from risk.annotations import get_weighted_description
 from risk.constants import GROUP_LINKAGE_METHODS, GROUP_DISTANCE_METRICS
@@ -20,17 +20,17 @@ from risk.log import logger
 
 def define_domains(
     top_annotations: pd.DataFrame,
-
+    significant_neighborhoods_significance: np.ndarray,
     linkage_criterion: str,
     linkage_method: str,
     linkage_metric: str,
 ) -> pd.DataFrame:
-    """Define domains and assign nodes to these domains based on their
+    """Define domains and assign nodes to these domains based on their significance scores and clustering,
     handling errors by assigning unique domains when clustering fails.
 
     Args:
         top_annotations (pd.DataFrame): DataFrame of top annotations data for the network nodes.
-
+        significant_neighborhoods_significance (np.ndarray): The binary significance matrix below alpha.
         linkage_criterion (str): The clustering criterion for defining groups.
         linkage_method (str): The linkage method for clustering.
         linkage_metric (str): The linkage metric for clustering.
@@ -39,8 +39,14 @@ def define_domains(
         pd.DataFrame: DataFrame with the primary domain for each node.
     """
     try:
+        if linkage_criterion == "off":
+            raise ValueError("Clustering is turned off.")
+
         # Transpose the matrix to cluster annotations
-        m =
+        m = significant_neighborhoods_significance[:, top_annotations["significant_annotations"]].T
+        # Safeguard the matrix by replacing NaN, Inf, and -Inf values
+        m = _safeguard_matrix(m)
+        # Optimize silhouette score across different linkage methods and distance metrics
         best_linkage, best_metric, best_threshold = _optimize_silhouette_across_linkage_and_metrics(
             m, linkage_criterion, linkage_method, linkage_metric
         )
@@ -59,19 +65,24 @@ def define_domains(
     except ValueError:
         # If a ValueError is encountered, handle it by assigning unique domains
         n_rows = len(top_annotations)
-
-
-
+        if linkage_criterion == "off":
+            logger.warning(
+                f"Clustering is turned off. Skipping clustering and assigning {n_rows} unique domains."
+            )
+        else:
+            logger.error(
+                f"Error encountered. Skipping clustering and assigning {n_rows} unique domains."
+            )
         top_annotations["domain"] = range(1, n_rows + 1)  # Assign unique domains
 
     # Create DataFrames to store domain information
-
-        data=
+    node_to_significance = pd.DataFrame(
+        data=significant_neighborhoods_significance,
         columns=[top_annotations.index.values, top_annotations["domain"]],
     )
-    node_to_domain =
+    node_to_domain = node_to_significance.T.groupby(level="domain").sum().T
 
-    # Find the maximum
+    # Find the maximum significance score for each node
     t_max = node_to_domain.loc[:, 1:].max(axis=1)
     t_idxmax = node_to_domain.loc[:, 1:].idxmax(axis=1)
     t_idxmax[t_max == 0] = 0
@@ -86,13 +97,13 @@ def define_domains(
     return node_to_domain
 
 
-def trim_domains_and_top_annotations(
+def trim_domains(
     domains: pd.DataFrame,
     top_annotations: pd.DataFrame,
     min_cluster_size: int = 5,
     max_cluster_size: int = 1000,
 ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
-    """Trim domains
+    """Trim domains that do not meet size criteria and find outliers.
 
     Args:
         domains (pd.DataFrame): DataFrame of domain data for the network nodes.
@@ -101,8 +112,7 @@ def trim_domains_and_top_annotations(
         max_cluster_size (int, optional): Maximum size of a cluster to be retained. Defaults to 1000.
 
     Returns:
-        Tuple[pd.DataFrame, pd.DataFrame
-            - Trimmed annotations (pd.DataFrame)
+        Tuple[pd.DataFrame, pd.DataFrame]:
             - Trimmed domains (pd.DataFrame)
             - A DataFrame with domain labels (pd.DataFrame)
     """
@@ -116,30 +126,30 @@ def trim_domains_and_top_annotations(
     invalid_domain_id = 888888
     invalid_domain_ids = {0, invalid_domain_id}
     # Mark domains to be removed
-    top_annotations["domain"].replace(to_remove, invalid_domain_id
+    top_annotations["domain"] = top_annotations["domain"].replace(to_remove, invalid_domain_id)
     domains.loc[domains["primary_domain"].isin(to_remove), ["primary_domain"]] = invalid_domain_id
 
-    # Normalize "num
+    # Normalize "num significant neighborhoods" by percentile for each domain and scale to 0-10
     top_annotations["normalized_value"] = top_annotations.groupby("domain")[
-        "
+        "significant_neighborhood_significance_sums"
     ].transform(lambda x: (x.rank(pct=True) * 10).apply(np.ceil).astype(int))
-    # Modify the lambda function to pass both full_terms and
+    # Modify the lambda function to pass both full_terms and significant_significance_score
    top_annotations["combined_terms"] = top_annotations.apply(
         lambda row: " ".join([str(row["full_terms"])] * row["normalized_value"]), axis=1
     )
 
-    # Perform the groupby operation while retaining the other columns and adding the weighting with
+    # Perform the groupby operation while retaining the other columns and adding the weighting with significance scores
     domain_labels = (
         top_annotations.groupby("domain")
         .agg(
             full_terms=("full_terms", lambda x: list(x)),
-
+            significance_scores=("significant_significance_score", lambda x: list(x)),
         )
         .reset_index()
     )
     domain_labels["combined_terms"] = domain_labels.apply(
         lambda row: get_weighted_description(
-            pd.Series(row["full_terms"]), pd.Series(row["
+            pd.Series(row["full_terms"]), pd.Series(row["significance_scores"])
         ),
         axis=1,
     )
@@ -150,19 +160,39 @@ def trim_domains_and_top_annotations(
             "domain": "id",
             "combined_terms": "normalized_description",
             "full_terms": "full_descriptions",
-            "
+            "significance_scores": "significance_scores",
         }
     ).set_index("id")
 
     # Remove invalid domains
-    valid_annotations = top_annotations[~top_annotations["domain"].isin(invalid_domain_ids)].drop(
-        columns=["normalized_value"]
-    )
     valid_domains = domains[~domains["primary_domain"].isin(invalid_domain_ids)]
     valid_trimmed_domains_matrix = trimmed_domains_matrix[
         ~trimmed_domains_matrix.index.isin(invalid_domain_ids)
     ]
-    return
+    return valid_domains, valid_trimmed_domains_matrix
+
+
+def _safeguard_matrix(matrix: np.ndarray) -> np.ndarray:
+    """Safeguard the matrix by replacing NaN, Inf, and -Inf values.
+
+    Args:
+        matrix (np.ndarray): Data matrix.
+
+    Returns:
+        np.ndarray: Safeguarded data matrix.
+    """
+    # Replace NaN with column mean
+    nan_replacement = np.nanmean(matrix, axis=0)
+    matrix = np.where(np.isnan(matrix), nan_replacement, matrix)
+    # Replace Inf/-Inf with maximum/minimum finite values
+    finite_max = np.nanmax(matrix[np.isfinite(matrix)])
+    finite_min = np.nanmin(matrix[np.isfinite(matrix)])
+    matrix = np.where(np.isposinf(matrix), finite_max, matrix)
+    matrix = np.where(np.isneginf(matrix), finite_min, matrix)
+    # Ensure rows have non-zero variance (optional step)
+    row_variance = np.var(matrix, axis=1)
+    matrix = matrix[row_variance > 0]
+    return matrix
 
 
 def _optimize_silhouette_across_linkage_and_metrics(
@@ -177,7 +207,7 @@ def _optimize_silhouette_across_linkage_and_metrics(
         linkage_metric (str): Linkage metric for clustering.
 
     Returns:
-        Tuple[str, str, float]:
+        Tuple[str, str, float]:
             - Best linkage method (str)
             - Best linkage metric (str)
             - Best threshold (float)
@@ -198,7 +228,8 @@ def _optimize_silhouette_across_linkage_and_metrics(
         total=total_combinations,
         bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
     ):
-        with
+        # Some linkage methods and metrics may not work with certain data
+        with suppress(ValueError):
             Z = linkage(m, method=method, metric=metric)
             threshold, score = _find_best_silhouette_score(Z, m, metric, linkage_criterion)
             if score > best_overall_score:
@@ -231,7 +262,7 @@ def _find_best_silhouette_score(
         resolution (float, optional): Desired resolution for the best threshold. Defaults to 0.001.
 
     Returns:
-        Tuple[float, float]:
+        Tuple[float, float]:
             - Best threshold (float): The threshold that yields the best silhouette score.
             - Best silhouette score (float): The highest silhouette score achieved.
     """
|