risk_network-0.0.8b26-py3-none-any.whl → risk_network-0.0.9b26-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. risk/__init__.py +2 -2
  2. risk/annotations/__init__.py +2 -2
  3. risk/annotations/annotations.py +74 -47
  4. risk/annotations/io.py +47 -31
  5. risk/log/__init__.py +4 -2
  6. risk/log/{config.py → console.py} +5 -3
  7. risk/log/{params.py → parameters.py} +17 -42
  8. risk/neighborhoods/__init__.py +3 -5
  9. risk/neighborhoods/api.py +446 -0
  10. risk/neighborhoods/community.py +255 -77
  11. risk/neighborhoods/domains.py +62 -31
  12. risk/neighborhoods/neighborhoods.py +156 -160
  13. risk/network/__init__.py +1 -3
  14. risk/network/geometry.py +65 -57
  15. risk/network/graph/__init__.py +6 -0
  16. risk/network/graph/api.py +194 -0
  17. risk/network/{graph.py → graph/network.py} +87 -37
  18. risk/network/graph/summary.py +254 -0
  19. risk/network/io.py +56 -47
  20. risk/network/plotter/__init__.py +6 -0
  21. risk/network/plotter/api.py +54 -0
  22. risk/network/{plot → plotter}/canvas.py +7 -4
  23. risk/network/{plot → plotter}/contour.py +22 -19
  24. risk/network/{plot → plotter}/labels.py +69 -74
  25. risk/network/{plot → plotter}/network.py +170 -34
  26. risk/network/{plot/utils/color.py → plotter/utils/colors.py} +104 -112
  27. risk/network/{plot → plotter}/utils/layout.py +8 -5
  28. risk/risk.py +11 -500
  29. risk/stats/__init__.py +8 -4
  30. risk/stats/binom.py +51 -0
  31. risk/stats/chi2.py +69 -0
  32. risk/stats/hypergeom.py +27 -17
  33. risk/stats/permutation/__init__.py +1 -1
  34. risk/stats/permutation/permutation.py +44 -38
  35. risk/stats/permutation/test_functions.py +25 -17
  36. risk/stats/poisson.py +15 -9
  37. risk/stats/stats.py +15 -13
  38. risk/stats/zscore.py +68 -0
  39. {risk_network-0.0.8b26.dist-info → risk_network-0.0.9b26.dist-info}/METADATA +9 -5
  40. risk_network-0.0.9b26.dist-info/RECORD +44 -0
  41. {risk_network-0.0.8b26.dist-info → risk_network-0.0.9b26.dist-info}/WHEEL +1 -1
  42. risk/network/plot/__init__.py +0 -6
  43. risk/network/plot/plotter.py +0 -137
  44. risk_network-0.0.8b26.dist-info/RECORD +0 -37
  45. {risk_network-0.0.8b26.dist-info → risk_network-0.0.9b26.dist-info}/LICENSE +0 -0
  46. {risk_network-0.0.8b26.dist-info → risk_network-0.0.9b26.dist-info}/top_level.txt +0 -0
@@ -4,186 +4,364 @@ risk/neighborhoods/community
  """

  import community as community_louvain
+ import igraph as ig
+ import markov_clustering as mc
  import networkx as nx
  import numpy as np
- import markov_clustering as mc
- from networkx.algorithms.community import asyn_lpa_communities, greedy_modularity_communities
+ from leidenalg import find_partition, RBConfigurationVertexPartition
+ from networkx.algorithms.community import greedy_modularity_communities

+ from risk.log import logger

- def calculate_greedy_modularity_neighborhoods(network: nx.Graph) -> np.ndarray:
+
+ def calculate_greedy_modularity_neighborhoods(
+     network: nx.Graph, fraction_shortest_edges: float = 1.0
+ ) -> np.ndarray:
      """Calculate neighborhoods using the Greedy Modularity method.

      Args:
-         network (nx.Graph): The network graph to analyze for community structure.
+         network (nx.Graph): The network graph.
+         fraction_shortest_edges (float, optional): Shortest edge rank fraction threshold for creating
+             subgraphs before clustering.

      Returns:
          np.ndarray: A binary neighborhood matrix where nodes in the same community have 1, and others have 0.
      """
+     # Create a subgraph with the shortest edges based on the rank fraction
+     subnetwork = _create_percentile_limited_subgraph(
+         network, fraction_shortest_edges=fraction_shortest_edges
+     )
      # Detect communities using the Greedy Modularity method
-     communities = greedy_modularity_communities(network)
+     communities = greedy_modularity_communities(subnetwork)
+     # Get the list of nodes in the original NetworkX graph
+     nodes = list(network.nodes())
+     node_index_map = {node: idx for idx, node in enumerate(nodes)}
      # Create a binary neighborhood matrix
-     n_nodes = network.number_of_nodes()
-     neighborhoods = np.zeros((n_nodes, n_nodes), dtype=int)
-     # Create a mapping from node to index in the matrix
-     node_index = {node: i for i, node in enumerate(network.nodes())}
+     num_nodes = len(nodes)
+     # Initialize neighborhoods with zeros and set self-self entries to 1
+     neighborhoods = np.eye(num_nodes, dtype=int)
      # Fill in the neighborhood matrix for nodes in the same community
      for community in communities:
          # Iterate through all pairs of nodes in the same community
          for node_i in community:
-             idx_i = node_index[node_i]
              for node_j in community:
-                 idx_j = node_index[node_j]
+                 idx_i = node_index_map[node_i]
+                 idx_j = node_index_map[node_j]
                  # Set them as neighbors (1) in the binary matrix
                  neighborhoods[idx_i, idx_j] = 1

      return neighborhoods


- def calculate_label_propagation_neighborhoods(network: nx.Graph) -> np.ndarray:
+ def calculate_label_propagation_neighborhoods(
+     network: nx.Graph, fraction_shortest_edges: float = 1.0
+ ) -> np.ndarray:
      """Apply Label Propagation to the network to detect communities.

      Args:
          network (nx.Graph): The network graph.
+         fraction_shortest_edges (float, optional): Shortest edge rank fraction threshold for creating
+             subgraphs before clustering.

      Returns:
-         np.ndarray: Binary neighborhood matrix on Label Propagation.
+         np.ndarray: A binary neighborhood matrix on Label Propagation.
      """
+     # Create a subgraph with the shortest edges based on the rank fraction
+     subnetwork = _create_percentile_limited_subgraph(
+         network, fraction_shortest_edges=fraction_shortest_edges
+     )
      # Apply Label Propagation for community detection
-     communities = nx.algorithms.community.label_propagation.label_propagation_communities(network)
+     communities = nx.algorithms.community.label_propagation.label_propagation_communities(
+         subnetwork
+     )
+     # Get the list of nodes in the network
+     nodes = list(network.nodes())
+     node_index_map = {node: idx for idx, node in enumerate(nodes)}
      # Create a binary neighborhood matrix
-     num_nodes = network.number_of_nodes()
-     neighborhoods = np.zeros((num_nodes, num_nodes), dtype=int)
-     # Create a mapping from node to index in the matrix
-     node_index = {node: i for i, node in enumerate(network.nodes())}
-     # Assign neighborhoods based on community labels
+     num_nodes = len(nodes)
+     # Initialize neighborhoods with zeros and set self-self entries to 1
+     neighborhoods = np.eye(num_nodes, dtype=int)
+     # Assign neighborhoods based on community labels using the mapped indices
      for community in communities:
          for node_i in community:
-             idx_i = node_index[node_i]
              for node_j in community:
-                 idx_j = node_index[node_j]
+                 idx_i = node_index_map[node_i]
+                 idx_j = node_index_map[node_j]
+                 neighborhoods[idx_i, idx_j] = 1
+
+     return neighborhoods
+
+
+ def calculate_leiden_neighborhoods(
+     network: nx.Graph,
+     resolution: float = 1.0,
+     fraction_shortest_edges: float = 1.0,
+     random_seed: int = 888,
+ ) -> np.ndarray:
+     """Calculate neighborhoods using the Leiden method.
+
+     Args:
+         network (nx.Graph): The network graph.
+         resolution (float, optional): Resolution parameter for the Leiden method. Defaults to 1.0.
+         fraction_shortest_edges (float, optional): Shortest edge rank fraction threshold for creating
+             subgraphs before clustering.
+         random_seed (int, optional): Random seed for reproducibility. Defaults to 888.
+
+     Returns:
+         np.ndarray: A binary neighborhood matrix where nodes in the same community have 1, and others have 0.
+     """
+     # Create a subgraph with the shortest edges based on the rank fraction
+     subnetwork = _create_percentile_limited_subgraph(
+         network, fraction_shortest_edges=fraction_shortest_edges
+     )
+     # Convert NetworkX graph to iGraph
+     igraph_network = ig.Graph.from_networkx(subnetwork)
+     # Apply Leiden algorithm using RBConfigurationVertexPartition, which supports resolution
+     partition = find_partition(
+         igraph_network,
+         partition_type=RBConfigurationVertexPartition,
+         resolution_parameter=resolution,
+         seed=random_seed,
+     )
+     # Get the list of nodes in the original NetworkX graph
+     nodes = list(network.nodes())
+     node_index_map = {node: idx for idx, node in enumerate(nodes)}
+     # Create a binary neighborhood matrix
+     num_nodes = len(nodes)
+     # Initialize neighborhoods with zeros and set self-self entries to 1
+     neighborhoods = np.eye(num_nodes, dtype=int)
+     # Assign neighborhoods based on community partitions using the mapped indices
+     for community in partition:
+         for node_i in community:
+             for node_j in community:
+                 idx_i = node_index_map[igraph_network.vs[node_i]["_nx_name"]]
+                 idx_j = node_index_map[igraph_network.vs[node_j]["_nx_name"]]
                  neighborhoods[idx_i, idx_j] = 1

      return neighborhoods


  def calculate_louvain_neighborhoods(
-     network: nx.Graph, resolution: float, random_seed: int = 888
+     network: nx.Graph,
+     resolution: float = 0.1,
+     fraction_shortest_edges: float = 1.0,
+     random_seed: int = 888,
  ) -> np.ndarray:
      """Calculate neighborhoods using the Louvain method.

      Args:
          network (nx.Graph): The network graph.
-         resolution (float): Resolution parameter for the Louvain method.
+         resolution (float, optional): Resolution parameter for the Louvain method. Defaults to 0.1.
+         fraction_shortest_edges (float, optional): Shortest edge rank fraction threshold for creating
+             subgraphs before clustering.
          random_seed (int, optional): Random seed for reproducibility. Defaults to 888.

      Returns:
-         np.ndarray: Binary neighborhood matrix on the Louvain method.
+         np.ndarray: A binary neighborhood matrix on the Louvain method.
      """
+     # Create a subgraph with the shortest edges based on the rank fraction
+     subnetwork = _create_percentile_limited_subgraph(
+         network, fraction_shortest_edges=fraction_shortest_edges
+     )
      # Apply Louvain method to partition the network
      partition = community_louvain.best_partition(
-         network, resolution=resolution, random_state=random_seed
+         subnetwork, resolution=resolution, random_state=random_seed
      )
+     # Get the list of nodes in the network and create a mapping to indices
+     nodes = list(network.nodes())
+     node_index_map = {node: idx for idx, node in enumerate(nodes)}
      # Create a binary neighborhood matrix
-     num_nodes = network.number_of_nodes()
-     neighborhoods = np.zeros((num_nodes, num_nodes), dtype=int)
-     # Create a mapping from node to index in the matrix
-     node_index = {node: i for i, node in enumerate(network.nodes())}
+     num_nodes = len(nodes)
+     # Initialize neighborhoods with zeros and set self-self entries to 1
+     neighborhoods = np.eye(num_nodes, dtype=int)
      # Group nodes by community
      community_groups = {}
      for node, community in partition.items():
          community_groups.setdefault(community, []).append(node)

-     # Assign neighborhoods based on community partitions
+     # Assign neighborhoods based on community partitions using the mapped indices
      for community, nodes in community_groups.items():
          for node_i in nodes:
-             idx_i = node_index[node_i]
              for node_j in nodes:
-                 idx_j = node_index[node_j]
+                 idx_i = node_index_map[node_i]
+                 idx_j = node_index_map[node_j]
                  neighborhoods[idx_i, idx_j] = 1

      return neighborhoods


- def calculate_markov_clustering_neighborhoods(network: nx.Graph) -> np.ndarray:
-     """Apply Markov Clustering (MCL) to the network.
+ def calculate_markov_clustering_neighborhoods(
+     network: nx.Graph, fraction_shortest_edges: float = 1.0
+ ) -> np.ndarray:
+     """Apply Markov Clustering (MCL) to the network and return a binary neighborhood matrix.

      Args:
          network (nx.Graph): The network graph.
+         fraction_shortest_edges (float, optional): Shortest edge rank fraction threshold for creating
+             subgraphs before clustering.

      Returns:
-         np.ndarray: Binary neighborhood matrix on Markov Clustering.
+         np.ndarray: A binary neighborhood matrix on Markov Clustering.
      """
-     # Convert the graph to an adjacency matrix
-     adjacency_matrix = nx.to_numpy_array(network)
-     # Run Markov Clustering (MCL)
-     result = mc.run_mcl(adjacency_matrix)  # MCL with default parameters
-     # Get clusters (communities) from MCL result
+     # Create a subgraph with the shortest edges based on the rank fraction
+     subnetwork = _create_percentile_limited_subgraph(
+         network, fraction_shortest_edges=fraction_shortest_edges
+     )
+     # Step 1: Convert the subnetwork to an adjacency matrix
+     subnetwork_nodes = list(subnetwork.nodes())
+     adjacency_matrix = nx.to_numpy_array(subnetwork, nodelist=subnetwork_nodes)
+     # Step 2: Run Markov Clustering (MCL) on the subnetwork's adjacency matrix
+     result = mc.run_mcl(adjacency_matrix)
      clusters = mc.get_clusters(result)
-     # Create a binary neighborhood matrix
-     num_nodes = network.number_of_nodes()
-     neighborhoods = np.zeros((num_nodes, num_nodes), dtype=int)
-     # Create a mapping from node to index in the matrix
-     node_index = {node: i for i, node in enumerate(network.nodes())}
-     # Assign neighborhoods based on MCL clusters
+     # Step 3: Prepare the original network nodes and indices
+     nodes = list(network.nodes())
+     node_index_map = {node: idx for idx, node in enumerate(nodes)}
+     num_nodes = len(nodes)
+     # Step 4: Initialize the neighborhood matrix for the original network
+     neighborhoods = np.eye(num_nodes, dtype=int)
+     # Step 5: Fill the neighborhoods matrix using the clusters from the subnetwork
      for cluster in clusters:
          for node_i in cluster:
-             idx_i = node_index[node_i]
              for node_j in cluster:
-                 idx_j = node_index[node_j]
-                 neighborhoods[idx_i, idx_j] = 1
+                 # Map the indices back to the original network's node indices
+                 original_node_i = subnetwork_nodes[node_i]
+                 original_node_j = subnetwork_nodes[node_j]
+
+                 if original_node_i in node_index_map and original_node_j in node_index_map:
+                     idx_i = node_index_map[original_node_i]
+                     idx_j = node_index_map[original_node_j]
+                     neighborhoods[idx_i, idx_j] = 1

      return neighborhoods


- def calculate_spinglass_neighborhoods(network: nx.Graph) -> np.ndarray:
-     """Apply Spin Glass Community Detection to the network.
+ def calculate_spinglass_neighborhoods(
+     network: nx.Graph, fraction_shortest_edges: float = 1.0
+ ) -> np.ndarray:
+     """Apply Spinglass Community Detection to the network, handling disconnected components.

      Args:
          network (nx.Graph): The network graph.
+         fraction_shortest_edges (float, optional): Shortest edge rank fraction threshold for creating
+             subgraphs before clustering.

      Returns:
-         np.ndarray: Binary neighborhood matrix on Spin Glass communities.
+         np.ndarray: A binary neighborhood matrix based on Spinglass communities.
      """
-     # Apply Asynchronous Label Propagation (LPA)
-     communities = asyn_lpa_communities(network)
-     # Create a binary neighborhood matrix
-     num_nodes = network.number_of_nodes()
-     neighborhoods = np.zeros((num_nodes, num_nodes), dtype=int)
-     # Create a mapping from node to index in the matrix
-     node_index = {node: i for i, node in enumerate(network.nodes())}
-     # Assign neighborhoods based on community labels from LPA
-     for community in communities:
-         for node_i in community:
-             idx_i = node_index[node_i]
-             for node_j in community:
-                 idx_j = node_index[node_j]
-                 neighborhoods[idx_i, idx_j] = 1
+     # Create a subgraph with the shortest edges based on the rank fraction
+     subnetwork = _create_percentile_limited_subgraph(
+         network, fraction_shortest_edges=fraction_shortest_edges
+     )
+     # Step 1: Find connected components in the graph
+     components = list(nx.connected_components(subnetwork))
+     # Prepare to store community results
+     nodes = list(network.nodes())
+     node_index_map = {node: idx for idx, node in enumerate(nodes)}
+     num_nodes = len(nodes)
+     # Initialize neighborhoods with zeros and set self-self entries to 1
+     neighborhoods = np.eye(num_nodes, dtype=int)
+     # Step 2: Run Spinglass on each connected component
+     for component in components:
+         # Extract the subgraph corresponding to the current component
+         subgraph = network.subgraph(component)
+         # Convert the subgraph to an iGraph object
+         igraph_subgraph = ig.Graph.from_networkx(subgraph)
+         # Ensure the subgraph is connected before running Spinglass
+         if not igraph_subgraph.is_connected():
+             logger.error("Warning: Subgraph is not connected. Skipping...")
+             continue
+
+         # Apply Spinglass community detection
+         try:
+             communities = igraph_subgraph.community_spinglass()
+         except Exception as e:
+             logger.error(f"Error running Spinglass on component: {e}")
+             continue
+
+         # Step 3: Assign neighborhoods based on community labels
+         for community in communities:
+             for node_i in community:
+                 for node_j in community:
+                     idx_i = node_index_map[igraph_subgraph.vs[node_i]["_nx_name"]]
+                     idx_j = node_index_map[igraph_subgraph.vs[node_j]["_nx_name"]]
+                     neighborhoods[idx_i, idx_j] = 1

      return neighborhoods


- def calculate_walktrap_neighborhoods(network: nx.Graph) -> np.ndarray:
+ def calculate_walktrap_neighborhoods(
+     network: nx.Graph, fraction_shortest_edges: float = 1.0
+ ) -> np.ndarray:
      """Apply Walktrap Community Detection to the network.

      Args:
          network (nx.Graph): The network graph.
+         fraction_shortest_edges (float, optional): Shortest edge rank fraction threshold for creating
+             subgraphs before clustering.

      Returns:
-         np.ndarray: Binary neighborhood matrix on Walktrap communities.
+         np.ndarray: A binary neighborhood matrix on Walktrap communities.
      """
-     # Apply Asynchronous Label Propagation (LPA)
-     communities = asyn_lpa_communities(network)
+     # Create a subgraph with the shortest edges based on the rank fraction
+     subnetwork = _create_percentile_limited_subgraph(
+         network, fraction_shortest_edges=fraction_shortest_edges
+     )
+     # Convert NetworkX graph to iGraph
+     igraph_network = ig.Graph.from_networkx(subnetwork)
+     # Apply Walktrap community detection
+     communities = igraph_network.community_walktrap().as_clustering()
+     # Get the list of nodes in the original NetworkX graph
+     nodes = list(network.nodes())
+     node_index_map = {node: idx for idx, node in enumerate(nodes)}
      # Create a binary neighborhood matrix
-     num_nodes = network.number_of_nodes()
-     neighborhoods = np.zeros((num_nodes, num_nodes), dtype=int)
-     # Create a mapping from node to index in the matrix
-     node_index = {node: i for i, node in enumerate(network.nodes())}
-     # Assign neighborhoods based on community labels from LPA
+     num_nodes = len(nodes)
+     # Initialize neighborhoods with zeros and set self-self entries to 1
+     neighborhoods = np.eye(num_nodes, dtype=int)
+     # Assign neighborhoods based on community labels
      for community in communities:
          for node_i in community:
-             idx_i = node_index[node_i]
              for node_j in community:
-                 idx_j = node_index[node_j]
+                 idx_i = node_index_map[igraph_network.vs[node_i]["_nx_name"]]
+                 idx_j = node_index_map[igraph_network.vs[node_j]["_nx_name"]]
                  neighborhoods[idx_i, idx_j] = 1

      return neighborhoods
+
+
+ def _create_percentile_limited_subgraph(G: nx.Graph, fraction_shortest_edges: float) -> nx.Graph:
+     """Create a subgraph containing the shortest edges based on the specified rank fraction
+     of all edge lengths in the input graph.
+
+     Args:
+         G (nx.Graph): The input graph with 'length' attributes on edges.
+         fraction_shortest_edges (float): The rank fraction (between 0 and 1) to filter edges.
+
+     Returns:
+         nx.Graph: A subgraph with nodes and edges where the edges are within the shortest
+             specified rank fraction.
+     """
+     # Step 1: Extract edges with their lengths
+     edges_with_length = [(u, v, d) for u, v, d in G.edges(data=True) if "length" in d]
+     if not edges_with_length:
+         raise ValueError(
+             "No edge lengths found in the graph. Ensure edges have 'length' attributes."
+         )
+
+     # Step 2: Sort edges by length in ascending order
+     edges_with_length.sort(key=lambda x: x[2]["length"])
+     # Step 3: Calculate the cutoff index for the given rank fraction
+     cutoff_index = int(fraction_shortest_edges * len(edges_with_length))
+     if cutoff_index == 0:
+         raise ValueError("The rank fraction is too low, resulting in no edges being included.")
+
+     # Step 4: Create the subgraph by selecting only the shortest edges within the rank fraction
+     subgraph = nx.Graph()
+     subgraph.add_nodes_from(G.nodes(data=True))  # Retain all nodes from the original graph
+     subgraph.add_edges_from(edges_with_length[:cutoff_index])
+     # Step 5: Remove nodes with no edges
+     subgraph.remove_nodes_from(list(nx.isolates(subgraph)))
+     # Step 6: Check if the resulting subgraph has no edges and issue a warning
+     if subgraph.number_of_edges() == 0:
+         raise Warning("The resulting subgraph has no edges. Consider adjusting the rank fraction.")
+
+     return subgraph
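
Every public function in this module now takes a fraction_shortest_edges argument, which routes the input graph through _create_percentile_limited_subgraph before clustering while the returned matrix stays indexed by the original graph's nodes. A minimal sketch of how a caller might exercise this (the toy graph, its "length" values, and the direct import path are illustrative assumptions, not taken from this diff):

import networkx as nx

from risk.neighborhoods.community import calculate_louvain_neighborhoods

# Toy graph whose edges carry the "length" attribute the new helper filters on
G = nx.Graph()
G.add_edge("a", "b", length=1.0)
G.add_edge("b", "c", length=2.0)
G.add_edge("c", "d", length=10.0)  # longest edge, dropped at fraction 0.5

# Keep only the shortest ~50% of edges before running Louvain
neighborhoods = calculate_louvain_neighborhoods(
    G, resolution=0.1, fraction_shortest_edges=0.5, random_seed=888
)
print(neighborhoods.shape)  # (4, 4): one row/column per node of the ORIGINAL graph

With fraction_shortest_edges=0.5 only the single shortest edge survives the cut (cutoff index int(0.5 * 3) = 1), so "c" and "d" are excluded from clustering as isolates, yet they still occupy rows of the output because the matrix is initialized as an identity matrix over network.nodes().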
@@ -5,13 +5,13 @@ risk/neighborhoods/domains
 
  from contextlib import suppress
  from itertools import product
- from tqdm import tqdm
  from typing import Tuple

  import numpy as np
  import pandas as pd
  from scipy.cluster.hierarchy import linkage, fcluster
  from sklearn.metrics import silhouette_score
+ from tqdm import tqdm

  from risk.annotations import get_weighted_description
  from risk.constants import GROUP_LINKAGE_METHODS, GROUP_DISTANCE_METRICS
@@ -20,17 +20,17 @@ from risk.log import logger
 
  def define_domains(
      top_annotations: pd.DataFrame,
-     significant_neighborhoods_enrichment: np.ndarray,
+     significant_neighborhoods_significance: np.ndarray,
      linkage_criterion: str,
      linkage_method: str,
      linkage_metric: str,
  ) -> pd.DataFrame:
-     """Define domains and assign nodes to these domains based on their enrichment scores and clustering,
+     """Define domains and assign nodes to these domains based on their significance scores and clustering,
      handling errors by assigning unique domains when clustering fails.

      Args:
          top_annotations (pd.DataFrame): DataFrame of top annotations data for the network nodes.
-         significant_neighborhoods_enrichment (np.ndarray): The binary enrichment matrix below alpha.
+         significant_neighborhoods_significance (np.ndarray): The binary significance matrix below alpha.
          linkage_criterion (str): The clustering criterion for defining groups.
          linkage_method (str): The linkage method for clustering.
          linkage_metric (str): The linkage metric for clustering.
@@ -39,8 +39,14 @@ def define_domains(
          pd.DataFrame: DataFrame with the primary domain for each node.
      """
      try:
+         if linkage_criterion == "off":
+             raise ValueError("Clustering is turned off.")
+
          # Transpose the matrix to cluster annotations
-         m = significant_neighborhoods_enrichment[:, top_annotations["significant_annotations"]].T
+         m = significant_neighborhoods_significance[:, top_annotations["significant_annotations"]].T
+         # Safeguard the matrix by replacing NaN, Inf, and -Inf values
+         m = _safeguard_matrix(m)
+         # Optimize silhouette score across different linkage methods and distance metrics
          best_linkage, best_metric, best_threshold = _optimize_silhouette_across_linkage_and_metrics(
              m, linkage_criterion, linkage_method, linkage_metric
          )
@@ -59,19 +65,24 @@ def define_domains(
      except ValueError:
          # If a ValueError is encountered, handle it by assigning unique domains
          n_rows = len(top_annotations)
-         logger.error(
-             f"Error encountered. Skipping clustering and assigning {n_rows} unique domains."
-         )
+         if linkage_criterion == "off":
+             logger.warning(
+                 f"Clustering is turned off. Skipping clustering and assigning {n_rows} unique domains."
+             )
+         else:
+             logger.error(
+                 f"Error encountered. Skipping clustering and assigning {n_rows} unique domains."
+             )
          top_annotations["domain"] = range(1, n_rows + 1)  # Assign unique domains

      # Create DataFrames to store domain information
-     node_to_enrichment = pd.DataFrame(
-         data=significant_neighborhoods_enrichment,
+     node_to_significance = pd.DataFrame(
+         data=significant_neighborhoods_significance,
          columns=[top_annotations.index.values, top_annotations["domain"]],
      )
-     node_to_domain = node_to_enrichment.groupby(level="domain", axis=1).sum()
+     node_to_domain = node_to_significance.T.groupby(level="domain").sum().T

-     # Find the maximum enrichment score for each node
+     # Find the maximum significance score for each node
      t_max = node_to_domain.loc[:, 1:].max(axis=1)
      t_idxmax = node_to_domain.loc[:, 1:].idxmax(axis=1)
      t_idxmax[t_max == 0] = 0
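
The replacement of node_to_enrichment.groupby(level="domain", axis=1).sum() with the transpose-groupby-transpose form tracks the deprecation of axis=1 in DataFrame.groupby in recent pandas releases; the two spellings are equivalent. A small self-contained check (the toy frame is an assumption for illustration):

import numpy as np
import pandas as pd

# Columns carry a two-level index whose "domain" level groups annotation columns
cols = pd.MultiIndex.from_tuples(
    [("t1", 1), ("t2", 1), ("t3", 2)], names=[None, "domain"]
)
df = pd.DataFrame(np.array([[1, 1, 0], [0, 1, 1]]), columns=cols)

# Old (deprecated): df.groupby(level="domain", axis=1).sum()
# New equivalent: transpose, group rows by the "domain" level, transpose back
summed = df.T.groupby(level="domain").sum().T
print(summed)
# domain  1  2
# 0       2  0
# 1       1  1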
@@ -86,13 +97,13 @@ def define_domains(
      return node_to_domain


- def trim_domains_and_top_annotations(
+ def trim_domains(
      domains: pd.DataFrame,
      top_annotations: pd.DataFrame,
      min_cluster_size: int = 5,
      max_cluster_size: int = 1000,
  ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
-     """Trim domains and top annotations that do not meet size criteria and find outliers.
+     """Trim domains that do not meet size criteria and find outliers.

      Args:
          domains (pd.DataFrame): DataFrame of domain data for the network nodes.
@@ -101,8 +112,7 @@ def trim_domains_and_top_annotations(
          max_cluster_size (int, optional): Maximum size of a cluster to be retained. Defaults to 1000.

      Returns:
-         Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing:
-             - Trimmed annotations (pd.DataFrame)
+         Tuple[pd.DataFrame, pd.DataFrame]:
              - Trimmed domains (pd.DataFrame)
              - A DataFrame with domain labels (pd.DataFrame)
      """
@@ -116,30 +126,30 @@ def trim_domains_and_top_annotations(
      invalid_domain_id = 888888
      invalid_domain_ids = {0, invalid_domain_id}
      # Mark domains to be removed
-     top_annotations["domain"].replace(to_remove, invalid_domain_id, inplace=True)
+     top_annotations["domain"] = top_annotations["domain"].replace(to_remove, invalid_domain_id)
      domains.loc[domains["primary_domain"].isin(to_remove), ["primary_domain"]] = invalid_domain_id

-     # Normalize "num enriched neighborhoods" by percentile for each domain and scale to 0-10
+     # Normalize "num significant neighborhoods" by percentile for each domain and scale to 0-10
      top_annotations["normalized_value"] = top_annotations.groupby("domain")[
-         "significant_neighborhood_enrichment_sums"
+         "significant_neighborhood_significance_sums"
      ].transform(lambda x: (x.rank(pct=True) * 10).apply(np.ceil).astype(int))
-     # Modify the lambda function to pass both full_terms and significant_enrichment_score
+     # Modify the lambda function to pass both full_terms and significant_significance_score
      top_annotations["combined_terms"] = top_annotations.apply(
          lambda row: " ".join([str(row["full_terms"])] * row["normalized_value"]), axis=1
      )

-     # Perform the groupby operation while retaining the other columns and adding the weighting with enrichment scores
+     # Perform the groupby operation while retaining the other columns and adding the weighting with significance scores
      domain_labels = (
          top_annotations.groupby("domain")
          .agg(
              full_terms=("full_terms", lambda x: list(x)),
-             enrichment_scores=("significant_enrichment_score", lambda x: list(x)),
+             significance_scores=("significant_significance_score", lambda x: list(x)),
          )
          .reset_index()
      )
      domain_labels["combined_terms"] = domain_labels.apply(
          lambda row: get_weighted_description(
              pd.Series(row["full_terms"]), pd.Series(row["significance_scores"])
          ),
          axis=1,
      )
@@ -150,19 +160,39 @@ def trim_domains_and_top_annotations(
              "domain": "id",
              "combined_terms": "normalized_description",
              "full_terms": "full_descriptions",
-             "enrichment_scores": "enrichment_scores",
+             "significance_scores": "significance_scores",
          }
      ).set_index("id")

      # Remove invalid domains
-     valid_annotations = top_annotations[~top_annotations["domain"].isin(invalid_domain_ids)].drop(
-         columns=["normalized_value"]
-     )
      valid_domains = domains[~domains["primary_domain"].isin(invalid_domain_ids)]
      valid_trimmed_domains_matrix = trimmed_domains_matrix[
          ~trimmed_domains_matrix.index.isin(invalid_domain_ids)
      ]
-     return valid_annotations, valid_domains, valid_trimmed_domains_matrix
+     return valid_domains, valid_trimmed_domains_matrix
+
+
+ def _safeguard_matrix(matrix: np.ndarray) -> np.ndarray:
+     """Safeguard the matrix by replacing NaN, Inf, and -Inf values.
+
+     Args:
+         matrix (np.ndarray): Data matrix.
+
+     Returns:
+         np.ndarray: Safeguarded data matrix.
+     """
+     # Replace NaN with column mean
+     nan_replacement = np.nanmean(matrix, axis=0)
+     matrix = np.where(np.isnan(matrix), nan_replacement, matrix)
+     # Replace Inf/-Inf with maximum/minimum finite values
+     finite_max = np.nanmax(matrix[np.isfinite(matrix)])
+     finite_min = np.nanmin(matrix[np.isfinite(matrix)])
+     matrix = np.where(np.isposinf(matrix), finite_max, matrix)
+     matrix = np.where(np.isneginf(matrix), finite_min, matrix)
+     # Ensure rows have non-zero variance (optional step)
+     row_variance = np.var(matrix, axis=1)
+     matrix = matrix[row_variance > 0]
+     return matrix


  def _optimize_silhouette_across_linkage_and_metrics(
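
The new _safeguard_matrix helper sanitizes the matrix before linkage: NaNs become column means, infinities are clamped to the finite extremes, and constant rows are dropped, so the output can have fewer rows than the input. A quick illustration (the toy matrix is assumed, and importing the private helper directly is for demonstration only):

import numpy as np

from risk.neighborhoods.domains import _safeguard_matrix

m = np.array(
    [
        [1.0, np.nan, 3.0],      # NaN is replaced by its column's nanmean (3.5)
        [np.inf, 2.0, -np.inf],  # +/-Inf are clamped to the finite max/min (5.0 and 1.0)
        [5.0, 5.0, 5.0],         # zero-variance row is removed by the final filter
    ]
)
print(_safeguard_matrix(m))  # shape (2, 3): [[1. 3.5 3.], [5. 2. 1.]]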
@@ -177,7 +207,7 @@ def _optimize_silhouette_across_linkage_and_metrics(
          linkage_metric (str): Linkage metric for clustering.

      Returns:
-         Tuple[str, str, float]: A tuple containing:
+         Tuple[str, str, float]:
              - Best linkage method (str)
              - Best linkage metric (str)
              - Best threshold (float)
@@ -198,7 +228,8 @@ def _optimize_silhouette_across_linkage_and_metrics(
          total=total_combinations,
          bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
      ):
-         with suppress(Exception):
+         # Some linkage methods and metrics may not work with certain data
+         with suppress(ValueError):
              Z = linkage(m, method=method, metric=metric)
              threshold, score = _find_best_silhouette_score(Z, m, metric, linkage_criterion)
              if score > best_overall_score:
@@ -231,7 +262,7 @@ def _find_best_silhouette_score(
          resolution (float, optional): Desired resolution for the best threshold. Defaults to 0.001.

      Returns:
-         Tuple[float, float]: A tuple containing:
+         Tuple[float, float]:
              - Best threshold (float): The threshold that yields the best silhouette score.
              - Best silhouette score (float): The highest silhouette score achieved.
      """