risk-network 0.0.7b11__py3-none-any.whl → 0.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -21,15 +21,20 @@ def calculate_greedy_modularity_neighborhoods(network: nx.Graph) -> np.ndarray:
     """
     # Detect communities using the Greedy Modularity method
     communities = greedy_modularity_communities(network)
-    # Create a mapping from node to community
-    community_dict = {node: idx for idx, community in enumerate(communities) for node in community}
     # Create a binary neighborhood matrix
-    neighborhoods = np.zeros((network.number_of_nodes(), network.number_of_nodes()), dtype=int)
+    n_nodes = network.number_of_nodes()
+    neighborhoods = np.zeros((n_nodes, n_nodes), dtype=int)
+    # Create a mapping from node to index in the matrix
     node_index = {node: i for i, node in enumerate(network.nodes())}
-    for node_i, community_i in community_dict.items():
-        for node_j, community_j in community_dict.items():
-            if community_i == community_j:
-                neighborhoods[node_index[node_i], node_index[node_j]] = 1
+    # Fill in the neighborhood matrix for nodes in the same community
+    for community in communities:
+        # Iterate through all pairs of nodes in the same community
+        for node_i in community:
+            idx_i = node_index[node_i]
+            for node_j in community:
+                idx_j = node_index[node_j]
+                # Set them as neighbors (1) in the binary matrix
+                neighborhoods[idx_i, idx_j] = 1
 
     return neighborhoods
 
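For reference, the per-community fill pattern introduced above can be exercised on its own. The sketch below uses a toy graph and an np.ix_ block assignment that is equivalent to the nested loops in the diff; none of it is package code, only an illustration of the technique.

import networkx as nx
import numpy as np
from networkx.algorithms.community import greedy_modularity_communities

network = nx.karate_club_graph()  # toy graph, for illustration only
communities = greedy_modularity_communities(network)

n_nodes = network.number_of_nodes()
node_index = {node: i for i, node in enumerate(network.nodes())}
neighborhoods = np.zeros((n_nodes, n_nodes), dtype=int)
for community in communities:
    # Mark every pair of nodes in the same community as neighbors; np.ix_
    # assigns the whole community block at once instead of looping over pairs
    idx = [node_index[node] for node in community]
    neighborhoods[np.ix_(idx, idx)] = 1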
@@ -43,22 +48,20 @@ def calculate_label_propagation_neighborhoods(network: nx.Graph) -> np.ndarray:
     Returns:
         np.ndarray: Binary neighborhood matrix on Label Propagation.
     """
-    # Apply Label Propagation
+    # Apply Label Propagation for community detection
     communities = nx.algorithms.community.label_propagation.label_propagation_communities(network)
-    # Create a mapping from node to community
-    community_dict = {}
-    for community_id, community in enumerate(communities):
-        for node in community:
-            community_dict[node] = community_id
-
     # Create a binary neighborhood matrix
     num_nodes = network.number_of_nodes()
     neighborhoods = np.zeros((num_nodes, num_nodes), dtype=int)
+    # Create a mapping from node to index in the matrix
+    node_index = {node: i for i, node in enumerate(network.nodes())}
     # Assign neighborhoods based on community labels
-    for node_i, community_i in community_dict.items():
-        for node_j, community_j in community_dict.items():
-            if community_i == community_j:
-                neighborhoods[node_i, node_j] = 1
+    for community in communities:
+        for node_i in community:
+            idx_i = node_index[node_i]
+            for node_j in community:
+                idx_j = node_index[node_j]
+                neighborhoods[idx_i, idx_j] = 1
 
     return neighborhoods
 
@@ -81,12 +84,22 @@ def calculate_louvain_neighborhoods(
         network, resolution=resolution, random_state=random_seed
     )
     # Create a binary neighborhood matrix
-    neighborhoods = np.zeros((network.number_of_nodes(), network.number_of_nodes()), dtype=int)
+    num_nodes = network.number_of_nodes()
+    neighborhoods = np.zeros((num_nodes, num_nodes), dtype=int)
+    # Create a mapping from node to index in the matrix
+    node_index = {node: i for i, node in enumerate(network.nodes())}
+    # Group nodes by community
+    community_groups = {}
+    for node, community in partition.items():
+        community_groups.setdefault(community, []).append(node)
+
     # Assign neighborhoods based on community partitions
-    for node_i, community_i in partition.items():
-        for node_j, community_j in partition.items():
-            if community_i == community_j:
-                neighborhoods[node_i, node_j] = 1
+    for community, nodes in community_groups.items():
+        for node_i in nodes:
+            idx_i = node_index[node_i]
+            for node_j in nodes:
+                idx_j = node_index[node_j]
+                neighborhoods[idx_i, idx_j] = 1
 
     return neighborhoods
 
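The setdefault grouping added here inverts the Louvain partition, a node-to-community mapping, into community-to-node lists before the matrix is filled. A tiny standalone illustration with a hand-written partition (not package data):

# partition maps each node to its community id, as Louvain returns it
partition = {"a": 0, "b": 0, "c": 1, "d": 1, "e": 0}

community_groups = {}
for node, community in partition.items():
    community_groups.setdefault(community, []).append(node)

print(community_groups)  # {0: ['a', 'b', 'e'], 1: ['c', 'd']}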
@@ -102,24 +115,22 @@ def calculate_markov_clustering_neighborhoods(network: nx.Graph) -> np.ndarray:
     """
     # Convert the graph to an adjacency matrix
     adjacency_matrix = nx.to_numpy_array(network)
-    # Run Markov Clustering
-    result = mc.run_mcl(adjacency_matrix)  # Run MCL with default parameters
-    # Get clusters
+    # Run Markov Clustering (MCL)
+    result = mc.run_mcl(adjacency_matrix)  # MCL with default parameters
+    # Get clusters (communities) from MCL result
     clusters = mc.get_clusters(result)
-    # Create a community label for each node
-    community_dict = {}
-    for community_id, community in enumerate(clusters):
-        for node in community:
-            community_dict[node] = community_id
-
     # Create a binary neighborhood matrix
     num_nodes = network.number_of_nodes()
     neighborhoods = np.zeros((num_nodes, num_nodes), dtype=int)
-    # Assign neighborhoods based on community labels
-    for node_i, community_i in community_dict.items():
-        for node_j, community_j in community_dict.items():
-            if community_i == community_j:
-                neighborhoods[node_i, node_j] = 1
+    # Create a mapping from node to index in the matrix
+    node_index = {node: i for i, node in enumerate(network.nodes())}
+    # Assign neighborhoods based on MCL clusters
+    for cluster in clusters:
+        for node_i in cluster:
+            idx_i = node_index[node_i]
+            for node_j in cluster:
+                idx_j = node_index[node_j]
+                neighborhoods[idx_i, idx_j] = 1
 
     return neighborhoods
 
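A small self-contained run of the MCL step (toy graph, default MCL parameters; not package code). markov_clustering's get_clusters returns tuples of row/column indices of the adjacency matrix, which follow the node order produced by nx.to_numpy_array:

import markov_clustering as mc
import networkx as nx
import numpy as np

network = nx.karate_club_graph()  # illustrative only
adjacency_matrix = nx.to_numpy_array(network)
result = mc.run_mcl(adjacency_matrix)  # MCL with default parameters
clusters = mc.get_clusters(result)     # e.g. [(0, 1, 2, ...), ...]

num_nodes = network.number_of_nodes()
neighborhoods = np.zeros((num_nodes, num_nodes), dtype=int)
for cluster in clusters:
    # Each cluster becomes a dense block of 1s in the binary matrix
    idx = list(cluster)
    neighborhoods[np.ix_(idx, idx)] = 1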
@@ -133,22 +144,20 @@ def calculate_spinglass_neighborhoods(network: nx.Graph) -> np.ndarray:
     Returns:
         np.ndarray: Binary neighborhood matrix on Spin Glass communities.
     """
-    # Use the asynchronous label propagation algorithm as a proxy for Spin Glass
+    # Apply Asynchronous Label Propagation (LPA)
     communities = asyn_lpa_communities(network)
-    # Create a community label for each node
-    community_dict = {}
-    for community_id, community in enumerate(communities):
-        for node in community:
-            community_dict[node] = community_id
-
     # Create a binary neighborhood matrix
     num_nodes = network.number_of_nodes()
     neighborhoods = np.zeros((num_nodes, num_nodes), dtype=int)
-    # Assign neighborhoods based on community labels
-    for node_i, community_i in community_dict.items():
-        for node_j, community_j in community_dict.items():
-            if community_i == community_j:
-                neighborhoods[node_i, node_j] = 1
+    # Create a mapping from node to index in the matrix
+    node_index = {node: i for i, node in enumerate(network.nodes())}
+    # Assign neighborhoods based on community labels from LPA
+    for community in communities:
+        for node_i in community:
+            idx_i = node_index[node_i]
+            for node_j in community:
+                idx_j = node_index[node_j]
+                neighborhoods[idx_i, idx_j] = 1
 
     return neighborhoods
 
@@ -162,21 +171,19 @@ def calculate_walktrap_neighborhoods(network: nx.Graph) -> np.ndarray:
     Returns:
         np.ndarray: Binary neighborhood matrix on Walktrap communities.
     """
-    # Use the asynchronous label propagation algorithm as a proxy for Walktrap
+    # Apply Asynchronous Label Propagation (LPA)
     communities = asyn_lpa_communities(network)
-    # Create a community label for each node
-    community_dict = {}
-    for community_id, community in enumerate(communities):
-        for node in community:
-            community_dict[node] = community_id
-
     # Create a binary neighborhood matrix
     num_nodes = network.number_of_nodes()
     neighborhoods = np.zeros((num_nodes, num_nodes), dtype=int)
-    # Assign neighborhoods based on community labels
-    for node_i, community_i in community_dict.items():
-        for node_j, community_j in community_dict.items():
-            if community_i == community_j:
-                neighborhoods[node_i, node_j] = 1
+    # Create a mapping from node to index in the matrix
+    node_index = {node: i for i, node in enumerate(network.nodes())}
+    # Assign neighborhoods based on community labels from LPA
+    for community in communities:
+        for node_i in community:
+            idx_i = node_index[node_i]
+            for node_j in community:
+                idx_j = node_index[node_j]
+                neighborhoods[idx_i, idx_j] = 1
 
     return neighborhoods
@@ -13,7 +13,7 @@ import pandas as pd
 from scipy.cluster.hierarchy import linkage, fcluster
 from sklearn.metrics import silhouette_score
 
-from risk.annotations import get_description
+from risk.annotations import get_weighted_description
 from risk.constants import GROUP_LINKAGE_METHODS, GROUP_DISTANCE_METRICS
 from risk.log import logger
 
@@ -40,22 +40,22 @@ def define_domains(
     """
     try:
         # Transpose the matrix to cluster annotations
-        m = significant_neighborhoods_enrichment[:, top_annotations["top attributes"]].T
+        m = significant_neighborhoods_enrichment[:, top_annotations["significant_annotations"]].T
         best_linkage, best_metric, best_threshold = _optimize_silhouette_across_linkage_and_metrics(
             m, linkage_criterion, linkage_method, linkage_metric
         )
         # Perform hierarchical clustering
         Z = linkage(m, method=best_linkage, metric=best_metric)
-        logger.info(
+        logger.warning(
             f"Linkage criterion: '{linkage_criterion}'\nLinkage method: '{best_linkage}'\nLinkage metric: '{best_metric}'"
         )
-        logger.info(f"Optimal linkage threshold: {round(best_threshold, 3)}")
+        logger.debug(f"Optimal linkage threshold: {round(best_threshold, 3)}")
         # Calculate the optimal threshold for clustering
         max_d_optimal = np.max(Z[:, 2]) * best_threshold
         # Assign domains to the annotations matrix
         domains = fcluster(Z, max_d_optimal, criterion=linkage_criterion)
         top_annotations["domain"] = 0
-        top_annotations.loc[top_annotations["top attributes"], "domain"] = domains
+        top_annotations.loc[top_annotations["significant_annotations"], "domain"] = domains
     except ValueError:
         # If a ValueError is encountered, handle it by assigning unique domains
         n_rows = len(top_annotations)
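The try block above follows the standard SciPy pattern of cutting a hierarchical clustering at a fraction of the largest merge distance. A standalone sketch with toy data and a fixed method, metric, and criterion (the package optimizes these) might look like:

import numpy as np
from scipy.cluster.hierarchy import fcluster, linkage

rng = np.random.default_rng(0)
m = rng.random((12, 4))                 # rows = annotation vectors to cluster
Z = linkage(m, method="average", metric="euclidean")
best_threshold = 0.3                    # placeholder for the optimized value
max_d_optimal = np.max(Z[:, 2]) * best_threshold  # Z[:, 2] holds merge distances
domains = fcluster(Z, max_d_optimal, criterion="distance")
print(domains)                          # one cluster label per row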
@@ -76,8 +76,12 @@ def define_domains(
     t_idxmax = node_to_domain.loc[:, 1:].idxmax(axis=1)
     t_idxmax[t_max == 0] = 0
 
+    # Assign all domains where the score is greater than 0
+    node_to_domain["all_domains"] = node_to_domain.loc[:, 1:].apply(
+        lambda row: list(row[row > 0].index), axis=1
+    )
     # Assign primary domain
-    node_to_domain["primary domain"] = t_idxmax
+    node_to_domain["primary_domain"] = t_idxmax
 
     return node_to_domain
 
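To illustrate the new "all_domains" expression, here is a toy node-to-domain score frame (made-up data, not package output): for each node, it collects every domain column other than 0 whose score is above zero.

import pandas as pd

node_to_domain = pd.DataFrame(
    {0: [0.0, 0.0, 0.0], 1: [0.0, 2.5, 1.0], 2: [3.0, 0.0, 0.5]},
    index=["node_a", "node_b", "node_c"],
)
node_to_domain["all_domains"] = node_to_domain.loc[:, 1:].apply(
    lambda row: list(row[row > 0].index), axis=1
)
print(node_to_domain["all_domains"].tolist())  # [[2], [1], [1, 2]]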
@@ -97,13 +101,13 @@ def trim_domains_and_top_annotations(
         max_cluster_size (int, optional): Maximum size of a cluster to be retained. Defaults to 1000.
 
     Returns:
-        tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing:
+        Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing:
             - Trimmed annotations (pd.DataFrame)
             - Trimmed domains (pd.DataFrame)
             - A DataFrame with domain labels (pd.DataFrame)
     """
     # Identify domains to remove based on size criteria
-    domain_counts = domains["primary domain"].value_counts()
+    domain_counts = domains["primary_domain"].value_counts()
     to_remove = set(
         domain_counts[(domain_counts < min_cluster_size) | (domain_counts > max_cluster_size)].index
     )
@@ -113,32 +117,51 @@ def trim_domains_and_top_annotations(
     invalid_domain_ids = {0, invalid_domain_id}
     # Mark domains to be removed
     top_annotations["domain"].replace(to_remove, invalid_domain_id, inplace=True)
-    domains.loc[domains["primary domain"].isin(to_remove), ["primary domain"]] = invalid_domain_id
+    domains.loc[domains["primary_domain"].isin(to_remove), ["primary_domain"]] = invalid_domain_id
 
     # Normalize "num enriched neighborhoods" by percentile for each domain and scale to 0-10
     top_annotations["normalized_value"] = top_annotations.groupby("domain")[
-        "neighborhood enrichment sums"
+        "significant_neighborhood_enrichment_sums"
     ].transform(lambda x: (x.rank(pct=True) * 10).apply(np.ceil).astype(int))
-    # Multiply 'words' column by normalized values
-    top_annotations["words"] = top_annotations.apply(
-        lambda row: " ".join([row["words"]] * row["normalized_value"]), axis=1
+    # Modify the lambda function to pass both full_terms and significant_enrichment_score
+    top_annotations["combined_terms"] = top_annotations.apply(
+        lambda row: " ".join([str(row["full_terms"])] * row["normalized_value"]), axis=1
+    )
+
+    # Perform the groupby operation while retaining the other columns and adding the weighting with enrichment scores
+    domain_labels = (
+        top_annotations.groupby("domain")
+        .agg(
+            full_terms=("full_terms", lambda x: list(x)),
+            enrichment_scores=("significant_enrichment_score", lambda x: list(x)),
+        )
+        .reset_index()
+    )
+    domain_labels["combined_terms"] = domain_labels.apply(
+        lambda row: get_weighted_description(
+            pd.Series(row["full_terms"]), pd.Series(row["enrichment_scores"])
+        ),
+        axis=1,
     )
 
-    # Generate domain labels
-    domain_labels = top_annotations.groupby("domain")["words"].apply(get_description).reset_index()
+    # Rename the columns as necessary
     trimmed_domains_matrix = domain_labels.rename(
-        columns={"domain": "id", "words": "label"}
+        columns={
+            "domain": "id",
+            "combined_terms": "normalized_description",
+            "full_terms": "full_descriptions",
+            "enrichment_scores": "enrichment_scores",
+        }
     ).set_index("id")
 
     # Remove invalid domains
     valid_annotations = top_annotations[~top_annotations["domain"].isin(invalid_domain_ids)].drop(
         columns=["normalized_value"]
     )
-    valid_domains = domains[~domains["primary domain"].isin(invalid_domain_ids)]
+    valid_domains = domains[~domains["primary_domain"].isin(invalid_domain_ids)]
     valid_trimmed_domains_matrix = trimmed_domains_matrix[
         ~trimmed_domains_matrix.index.isin(invalid_domain_ids)
     ]
-
     return valid_annotations, valid_domains, valid_trimmed_domains_matrix
 
 
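For orientation, the named aggregation introduced above collects, per domain, the terms and their enrichment scores as parallel lists that are then handed to get_weighted_description. On made-up data (not package output) the groupby step behaves like this:

import pandas as pd

top_annotations = pd.DataFrame(
    {
        "domain": [1, 1, 2],
        "full_terms": ["dna repair", "dna replication", "ribosome"],
        "significant_enrichment_score": [5.2, 3.1, 7.8],
    }
)
domain_labels = (
    top_annotations.groupby("domain")
    .agg(
        full_terms=("full_terms", lambda x: list(x)),
        enrichment_scores=("significant_enrichment_score", lambda x: list(x)),
    )
    .reset_index()
)
# domain 1 -> full_terms ["dna repair", "dna replication"], enrichment_scores [5.2, 3.1]
# domain 2 -> full_terms ["ribosome"], enrichment_scores [7.8]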
@@ -154,7 +177,7 @@ def _optimize_silhouette_across_linkage_and_metrics(
         linkage_metric (str): Linkage metric for clustering.
 
     Returns:
-        tuple[str, str, float]: A tuple containing:
+        Tuple[str, str, float]: A tuple containing:
             - Best linkage method (str)
             - Best linkage metric (str)
             - Best threshold (float)
@@ -208,7 +231,7 @@ def _find_best_silhouette_score(
         resolution (float, optional): Desired resolution for the best threshold. Defaults to 0.001.
 
     Returns:
-        tuple[float, float]: A tuple containing:
+        Tuple[float, float]: A tuple containing:
             - Best threshold (float): The threshold that yields the best silhouette score.
             - Best silhouette score (float): The highest silhouette score achieved.
     """