risk-network 0.0.7b5__py3-none-any.whl → 0.0.7b6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
risk/__init__.py CHANGED
@@ -7,4 +7,4 @@ RISK: RISK Infers Spatial Kinships
 
 from risk.risk import RISK
 
- __version__ = "0.0.7-beta.5"
+ __version__ = "0.0.7-beta.6"
risk/annotations/annotations.py CHANGED
@@ -39,7 +39,7 @@ def load_annotations(network: nx.Graph, annotations_input: Dict[str, Any]) -> Di
 annotations_input (dict): A dictionary with annotations.
 
 Returns:
- dict: A dictionary containing ordered nodes, ordered annotations, and the annotations matrix.
+ dict: A dictionary containing ordered nodes, ordered annotations, and the binary annotations matrix.
 """
 # Flatten the dictionary to a list of tuples for easier DataFrame creation
 flattened_annotations = [
@@ -66,7 +66,8 @@ def load_annotations(network: nx.Graph, annotations_input: Dict[str, Any]) -> Di
 # Extract ordered nodes and annotations
 ordered_nodes = tuple(annotations_pivot.index)
 ordered_annotations = tuple(annotations_pivot.columns)
- annotations_pivot_numpy = annotations_pivot.fillna(0).to_numpy()
+ # Convert the annotations_pivot matrix to a numpy array and ensure it's binary
+ annotations_pivot_numpy = (annotations_pivot.fillna(0).to_numpy() > 0).astype(int)
 
 return {
 "ordered_nodes": ordered_nodes,
@@ -163,8 +164,8 @@ def define_top_annotations(
 
 
 def get_description(words_column: pd.Series) -> str:
- """Process input Series to identify and return the top N frequent, significant words,
- filtering based on stopwords and similarity (Jaccard index).
+ """Process input Series to identify and return the top frequent, significant words,
+ filtering based on stopwords and gracefully handling numerical strings.
 
 Args:
 words_column (pd.Series): A pandas Series containing strings to process.
@@ -172,19 +173,30 @@ def get_description(words_column: pd.Series) -> str:
 Returns:
 str: A coherent description formed from the most frequent and significant words.
 """
- # Define stopwords
- stop_words = set(stopwords.words("english"))
- # Tokenize the concatenated string and filter out stopwords and non-alphabetic words
+ # Concatenate all rows into a single string and tokenize into words
+ all_words = words_column.str.cat(sep=" ")
+ tokens = word_tokenize(all_words)
+
+ # Check if all tokens are numeric strings or contain a mixture of strings and numbers
+ numeric_tokens = [token for token in tokens if token.replace(".", "", 1).isdigit()]
+ non_numeric_tokens = [token for token in tokens if not token.replace(".", "", 1).isdigit()]
+ # If there's only one unique numeric value, return it directly as a string
+ unique_numeric_values = set(numeric_tokens)
+ if len(unique_numeric_values) == 1:
+ return f"{list(unique_numeric_values)[0]}"
+
+ # Allow the inclusion of both alphabetic and numeric tokens if mixture is detected
 words = [
 (
 word.lower() if word.istitle() else word
 ) # Lowercase all words except proper nouns (e.g., RNA, mRNA)
- for word in word_tokenize(words_column.str.cat(sep=" "))
- if word.isalpha() and word.lower() not in stop_words
+ for word in tokens
+ if word.isalpha()
+ or word.replace(".", "", 1).isdigit() # Keep alphabetic words and numeric strings
 ]
- # Simplify the word list to remove similar words based on the Jaccard index and generate coherent description
- simplified_words = _simplify_word_list(words, threshold=0.90)
- description = _generate_coherent_description(simplified_words)
+ # Generate a coherent description from the processed words
+ description = _generate_coherent_description(words)
+
 return description
 
 
@@ -242,25 +254,28 @@ def _calculate_jaccard_index(set1: Set[Any], set2: Set[Any]) -> float:
 
 
 def _generate_coherent_description(words: List[str]) -> str:
- """Generate a coherent description from a list of words.
+ """Generate a coherent description from a list of words or numerical string values.
+ If there is only one unique entry, return it directly.
 
 Args:
- words (list of str): A list of words from which to generate the description.
+ words (list): A list of words or numerical string values.
 
 Returns:
 str: A coherent description formed by arranging the words in a logical sequence.
 """
- # Count the frequency of each word
+ # If there are no words or the input is invalid, raise an error
+ if not words or not isinstance(words, list) or not all(isinstance(word, str) for word in words):
+ raise ValueError("Input must be a list of strings.")
+
+ # If there's only one unique word, return it directly (even if it's a number-like string)
+ unique_words = set(words)
+ if len(unique_words) == 1:
+ return list(unique_words)[0]
+
+ # Count the frequency of each word and sort them by frequency
 word_counts = Counter(words)
- # Get the most common words
 most_common_words = [word for word, _ in word_counts.most_common()]
- # Filter out common stopwords
- stop_words = set(stopwords.words("english"))
- filtered_words = [word for word in most_common_words if word.lower() not in stop_words]
- # Generate permutations of the filtered words to find a logical order
- perm = permutations(filtered_words)
- # Assume the first permutation as the logical sequence (since they're all equally likely without additional context)
- logical_sequence = next(perm)
- # Join the words to form a coherent description
- description = " ".join(logical_sequence)
+ # Join the most common words to form a coherent description based on frequency
+ description = " ".join(most_common_words)
+
 return description
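For orientation, the rewritten helper no longer filters stopwords or searches permutations; it returns the single unique entry when there is one, and otherwise joins words in descending frequency order. A rough standalone sketch of that logic (not the packaged function itself):

from collections import Counter

def describe(words):
    # A single unique entry (including a numeric string) is returned directly.
    unique_words = set(words)
    if len(unique_words) == 1:
        return next(iter(unique_words))
    # Otherwise, order by descending frequency and join into one string.
    return " ".join(word for word, _ in Counter(words).most_common())

print(describe(["3.5", "3.5"]))                   # -> 3.5
print(describe(["ribosome", "RNA", "ribosome"]))  # -> ribosome RNA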
risk/neighborhoods/community.py CHANGED
@@ -7,32 +7,29 @@ import community as community_louvain
 import networkx as nx
 import numpy as np
 import markov_clustering as mc
- from networkx.algorithms.community import asyn_lpa_communities
+ from networkx.algorithms.community import asyn_lpa_communities, greedy_modularity_communities
 
 
- def calculate_dijkstra_neighborhoods(network: nx.Graph) -> np.ndarray:
- """Calculate neighborhoods using Dijkstra's shortest path distances.
+ def calculate_greedy_modularity_neighborhoods(network: nx.Graph) -> np.ndarray:
+ """Calculate neighborhoods using the Greedy Modularity method.
 
 Args:
- network (nx.Graph): The network graph.
+ network (nx.Graph): The network graph to analyze for community structure.
 
 Returns:
- np.ndarray: Neighborhood matrix based on Dijkstra's distances.
+ np.ndarray: A binary neighborhood matrix where nodes in the same community have 1, and others have 0.
 """
- # Compute Dijkstra's distance for all pairs of nodes in the network
- all_dijkstra_paths = dict(nx.all_pairs_dijkstra_path_length(network, weight="length"))
+ # Detect communities using the Greedy Modularity method
+ communities = greedy_modularity_communities(network)
+ # Create a mapping from node to community
+ community_dict = {node: idx for idx, community in enumerate(communities) for node in community}
+ # Create a binary neighborhood matrix
 neighborhoods = np.zeros((network.number_of_nodes(), network.number_of_nodes()), dtype=int)
-
- # Populate the neighborhoods matrix based on Dijkstra's distances
- for source, targets in all_dijkstra_paths.items():
- max_length = max(targets.values()) if targets else 1 # Handle cases with no targets
- for target, length in targets.items():
- if np.isnan(length):
- neighborhoods[source, target] = max_length # Use max distance for NaN
- elif length == 0:
- neighborhoods[source, target] = 1 # Assign 1 for zero-length paths (self-loops)
- else:
- neighborhoods[source, target] = 1 / length # Inverse of the distance
+ node_index = {node: i for i, node in enumerate(network.nodes())}
+ for node_i, community_i in community_dict.items():
+ for node_j, community_j in community_dict.items():
+ if community_i == community_j:
+ neighborhoods[node_index[node_i], node_index[node_j]] = 1
 
 return neighborhoods
 
@@ -44,21 +41,19 @@ def calculate_label_propagation_neighborhoods(network: nx.Graph) -> np.ndarray:
 network (nx.Graph): The network graph.
 
 Returns:
- np.ndarray: Neighborhood matrix based on Label Propagation.
+ np.ndarray: Binary neighborhood matrix on Label Propagation.
 """
 # Apply Label Propagation
 communities = nx.algorithms.community.label_propagation.label_propagation_communities(network)
-
 # Create a mapping from node to community
 community_dict = {}
 for community_id, community in enumerate(communities):
 for node in community:
 community_dict[node] = community_id
 
- # Create a neighborhood matrix
+ # Create a binary neighborhood matrix
 num_nodes = network.number_of_nodes()
 neighborhoods = np.zeros((num_nodes, num_nodes), dtype=int)
-
 # Assign neighborhoods based on community labels
 for node_i, community_i in community_dict.items():
 for node_j, community_j in community_dict.items():
@@ -79,14 +74,14 @@ def calculate_louvain_neighborhoods(
 random_seed (int, optional): Random seed for reproducibility. Defaults to 888.
 
 Returns:
- np.ndarray: Neighborhood matrix based on the Louvain method.
+ np.ndarray: Binary neighborhood matrix on the Louvain method.
 """
 # Apply Louvain method to partition the network
 partition = community_louvain.best_partition(
 network, resolution=resolution, random_state=random_seed
 )
+ # Create a binary neighborhood matrix
 neighborhoods = np.zeros((network.number_of_nodes(), network.number_of_nodes()), dtype=int)
-
 # Assign neighborhoods based on community partitions
 for node_i, community_i in partition.items():
 for node_j, community_j in partition.items():
@@ -103,7 +98,7 @@ def calculate_markov_clustering_neighborhoods(network: nx.Graph) -> np.ndarray:
 network (nx.Graph): The network graph.
 
 Returns:
- np.ndarray: Neighborhood matrix based on Markov Clustering.
+ np.ndarray: Binary neighborhood matrix on Markov Clustering.
 """
 # Convert the graph to an adjacency matrix
 adjacency_matrix = nx.to_numpy_array(network)
@@ -111,17 +106,15 @@ def calculate_markov_clustering_neighborhoods(network: nx.Graph) -> np.ndarray:
 result = mc.run_mcl(adjacency_matrix) # Run MCL with default parameters
 # Get clusters
 clusters = mc.get_clusters(result)
-
 # Create a community label for each node
 community_dict = {}
 for community_id, community in enumerate(clusters):
 for node in community:
 community_dict[node] = community_id
 
- # Create a neighborhood matrix
+ # Create a binary neighborhood matrix
 num_nodes = network.number_of_nodes()
 neighborhoods = np.zeros((num_nodes, num_nodes), dtype=int)
-
 # Assign neighborhoods based on community labels
 for node_i, community_i in community_dict.items():
 for node_j, community_j in community_dict.items():
@@ -138,21 +131,19 @@ def calculate_spinglass_neighborhoods(network: nx.Graph) -> np.ndarray:
 network (nx.Graph): The network graph.
 
 Returns:
- np.ndarray: Neighborhood matrix based on Spin Glass communities.
+ np.ndarray: Binary neighborhood matrix on Spin Glass communities.
 """
 # Use the asynchronous label propagation algorithm as a proxy for Spin Glass
 communities = asyn_lpa_communities(network)
-
 # Create a community label for each node
 community_dict = {}
 for community_id, community in enumerate(communities):
 for node in community:
 community_dict[node] = community_id
 
- # Create a neighborhood matrix
+ # Create a binary neighborhood matrix
 num_nodes = network.number_of_nodes()
 neighborhoods = np.zeros((num_nodes, num_nodes), dtype=int)
-
 # Assign neighborhoods based on community labels
 for node_i, community_i in community_dict.items():
 for node_j, community_j in community_dict.items():
@@ -169,21 +160,19 @@ def calculate_walktrap_neighborhoods(network: nx.Graph) -> np.ndarray:
 network (nx.Graph): The network graph.
 
 Returns:
- np.ndarray: Neighborhood matrix based on Walktrap communities.
+ np.ndarray: Binary neighborhood matrix on Walktrap communities.
 """
 # Use the asynchronous label propagation algorithm as a proxy for Walktrap
 communities = asyn_lpa_communities(network)
-
 # Create a community label for each node
 community_dict = {}
 for community_id, community in enumerate(communities):
 for node in community:
 community_dict[node] = community_id
 
- # Create a neighborhood matrix
+ # Create a binary neighborhood matrix
 num_nodes = network.number_of_nodes()
 neighborhoods = np.zeros((num_nodes, num_nodes), dtype=int)
-
 # Assign neighborhoods based on community labels
 for node_i, community_i in community_dict.items():
 for node_j, community_j in community_dict.items():
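Each constructor in this module builds its binary co-membership matrix with a nested loop over community_dict, which is O(n^2) in pure Python. The same matrix can be produced with a vectorized label comparison; the sketch below mirrors the new greedy-modularity variant but is not the package's own code (the karate-club graph is only a convenient test input):

import networkx as nx
import numpy as np
from networkx.algorithms.community import greedy_modularity_communities

def greedy_modularity_neighborhoods(network: nx.Graph) -> np.ndarray:
    # Detect communities, then record each node's community index.
    communities = greedy_modularity_communities(network)
    community_of = {node: idx for idx, community in enumerate(communities) for node in community}
    # Community labels in the graph's node order.
    labels = np.array([community_of[node] for node in network.nodes()])
    # Broadcasting marks node pairs that share a label, replacing the double loop.
    return (labels[:, None] == labels[None, :]).astype(int)

print(greedy_modularity_neighborhoods(nx.karate_club_graph()).shape)  # (34, 34)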
risk/neighborhoods/neighborhoods.py CHANGED
@@ -3,6 +3,7 @@ risk/neighborhoods/neighborhoods
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 """
 
+ import random
 import warnings
 from typing import Any, Dict, List, Tuple
 
@@ -12,7 +13,7 @@ from sklearn.exceptions import DataConversionWarning
 from sklearn.metrics.pairwise import cosine_similarity
 
 from risk.neighborhoods.community import (
- calculate_dijkstra_neighborhoods,
+ calculate_greedy_modularity_neighborhoods,
 calculate_label_propagation_neighborhoods,
 calculate_louvain_neighborhoods,
 calculate_markov_clustering_neighborhoods,
@@ -26,7 +27,7 @@ warnings.filterwarnings(action="ignore", category=DataConversionWarning)
 
 def get_network_neighborhoods(
 network: nx.Graph,
- distance_metric: str = "dijkstra",
+ distance_metric: str = "louvain",
 edge_length_threshold: float = 1.0,
 louvain_resolution: float = 1.0,
 random_seed: int = 888,
@@ -35,8 +36,8 @@ def get_network_neighborhoods(
 
 Args:
 network (nx.Graph): The network graph.
- distance_metric (str): The distance metric to use ('euclidean', 'dijkstra', 'louvain', 'affinity_propagation',
- 'label_propagation', 'markov_clustering', 'walktrap', 'spinglass').
+ distance_metric (str): The distance metric to use ('greedy_modularity', 'louvain', 'label_propagation',
+ 'markov_clustering', 'walktrap', 'spinglass').
 edge_length_threshold (float): The edge length threshold for the neighborhoods.
 louvain_resolution (float, optional): Resolution parameter for the Louvain method. Defaults to 1.0.
 random_seed (int, optional): Random seed for methods requiring random initialization. Defaults to 888.
@@ -44,10 +45,17 @@ def get_network_neighborhoods(
 Returns:
 np.ndarray: Neighborhood matrix calculated based on the selected distance metric.
 """
- network = _create_percentile_limited_subgraph(network, edge_length_threshold)
+ # Set random seed for reproducibility in all methods besides Louvain, which requires a separate seed
+ random.seed(random_seed)
+ np.random.seed(random_seed)
 
- if distance_metric == "dijkstra":
- return calculate_dijkstra_neighborhoods(network)
+ # Create a subgraph based on the edge length percentile threshold
+ network = _create_percentile_limited_subgraph(
+ network, edge_length_percentile=edge_length_threshold
+ )
+
+ if distance_metric == "greedy_modularity":
+ return calculate_greedy_modularity_neighborhoods(network)
 if distance_metric == "louvain":
 return calculate_louvain_neighborhoods(network, louvain_resolution, random_seed=random_seed)
 if distance_metric == "label_propagation":
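Assuming the module path shown in its docstring above, the updated entry point could be exercised as follows; the toy graph and its 'length' attributes are illustrative, and 'dijkstra' is no longer accepted in 0.0.7b6:

import networkx as nx
from risk.neighborhoods.neighborhoods import get_network_neighborhoods

# Toy graph with 'length' edge attributes, which the percentile filter now requires.
G = nx.path_graph(5)
for u, v in G.edges():
    G[u][v]["length"] = 1.0

neighborhoods = get_network_neighborhoods(
    G,
    distance_metric="greedy_modularity",  # or the new default, "louvain"
    edge_length_threshold=1.0,
    random_seed=888,
)
print(neighborhoods.shape)  # (5, 5) binary co-membership matrix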
@@ -60,34 +68,44 @@ def get_network_neighborhoods(
 return calculate_spinglass_neighborhoods(network)
 
 raise ValueError(
- "Incorrect distance metric specified. Please choose from 'dijkstra', 'louvain',"
+ "Incorrect distance metric specified. Please choose from 'greedy_modularity', 'louvain',"
 "'label_propagation', 'markov_clustering', 'walktrap', 'spinglass'."
 )
 
 
 def _create_percentile_limited_subgraph(G: nx.Graph, edge_length_percentile: float) -> nx.Graph:
- """Calculate the edge length corresponding to the given percentile of edge lengths in the graph
- and create a subgraph with all nodes and edges below this length.
+ """Create a subgraph containing all nodes and edges where the edge length is below the
+ specified percentile of all edge lengths in the input graph.
 
 Args:
- G (nx.Graph): The input graph.
- edge_length_percentile (float): The percentile to calculate (between 0 and 1).
+ G (nx.Graph): The input graph with 'length' attributes on edges.
+ edge_length_percentile (float): The percentile (between 0 and 1) to filter edges by length.
 
 Returns:
- nx.Graph: A subgraph with all nodes and edges below the edge length corresponding to the given percentile.
+ nx.Graph: A subgraph with all nodes and edges where the edge length is below the
+ calculated threshold length.
 """
- # Extract edge lengths from the graph
+ # Extract edge lengths and handle missing lengths
 edge_lengths = [d["length"] for _, _, d in G.edges(data=True) if "length" in d]
+ if not edge_lengths:
+ raise ValueError(
+ "No edge lengths found in the graph. Ensure edges have 'length' attributes."
+ )
+
 # Calculate the specific edge length for the given percentile
 percentile_length = np.percentile(edge_lengths, edge_length_percentile * 100)
- # Create a new graph with all nodes from the original graph
+ # Create the subgraph by directly filtering edges during iteration
 subgraph = nx.Graph()
- subgraph.add_nodes_from(G.nodes(data=True))
- # Add edges to the subgraph if they are below the specified percentile length
+ subgraph.add_nodes_from(G.nodes(data=True)) # Retain all nodes from the original graph
+ # Add edges below the specified percentile length in a single pass
 for u, v, d in G.edges(data=True):
 if d.get("length", 1) <= percentile_length:
 subgraph.add_edge(u, v, **d)
 
+ # Return the subgraph; optionally check if it's too sparse
+ if subgraph.number_of_edges() == 0:
+ raise Warning("The resulting subgraph has no edges. Consider adjusting the percentile.")
+
 return subgraph
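The percentile threshold works on the raw 'length' values: an edge_length_threshold of 0.5 keeps edges at or below the median length. A small self-contained illustration of the same filtering arithmetic (not the package helper itself):

import networkx as nx
import numpy as np

G = nx.Graph()
G.add_edge("a", "b", length=1.0)
G.add_edge("b", "c", length=2.0)
G.add_edge("c", "d", length=10.0)

edge_lengths = [d["length"] for _, _, d in G.edges(data=True)]
percentile_length = np.percentile(edge_lengths, 0.5 * 100)  # 50th percentile of [1, 2, 10] -> 2.0

subgraph = nx.Graph()
subgraph.add_nodes_from(G.nodes(data=True))  # keep every node
subgraph.add_edges_from(
    (u, v, d) for u, v, d in G.edges(data=True) if d.get("length", 1) <= percentile_length
)
print(sorted(subgraph.edges()))  # [('a', 'b'), ('b', 'c')] -- the long edge is dropped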
92
110
 
93
111
 
risk/network/plot.py CHANGED
@@ -9,6 +9,7 @@ import matplotlib.colors as mcolors
 import matplotlib.pyplot as plt
 import networkx as nx
 import numpy as np
+ import pandas as pd
 from scipy.ndimage import label
 from scipy.stats import gaussian_kde
 
@@ -601,7 +602,7 @@ class NetworkPlotter:
 min_words (int, optional): Minimum number of words required to display a label. Defaults to 1.
 max_word_length (int, optional): Maximum number of characters in a word to display. Defaults to 20.
 min_word_length (int, optional): Minimum number of characters in a word to display. Defaults to 1.
- words_to_omit (List, optional): List of words to omit from the labels. Defaults to None.
+ words_to_omit (list, optional): List of words to omit from the labels. Defaults to None.
 overlay_ids (bool, optional): Whether to overlay domain IDs in the center of the centroids. Defaults to False.
 ids_to_keep (list, tuple, np.ndarray, or None, optional): IDs of domains that must be labeled. To discover domain IDs,
 you can set `overlay_ids=True`. Defaults to None.
@@ -710,6 +711,9 @@ class NetworkPlotter:
 # Process remaining domains to fill in additional labels, if there are slots left
 if remaining_labels and remaining_labels > 0:
 for idx, (domain, centroid) in enumerate(domain_centroids.items()):
+ # Check if the domain is NaN and continue if true
+ if pd.isna(domain) or (isinstance(domain, float) and np.isnan(domain)):
+ continue # Skip NaN domains
 if ids_to_keep and domain in ids_to_keep:
 continue # Skip domains already handled by ids_to_keep
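The added guard leans on pd.isna, which already covers float NaN and None; the extra isinstance check mirrors the package's belt-and-braces style. A quick isolated check of how domain keys would be classified (the example values are made up):

import numpy as np
import pandas as pd

for domain in [1, "GO:0008150", float("nan"), None]:
    skip = pd.isna(domain) or (isinstance(domain, float) and np.isnan(domain))
    print(domain, "-> skipped" if skip else "-> labeled")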
risk/risk.py CHANGED
@@ -49,9 +49,10 @@ class RISK(NetworkIO, AnnotationsIO):
 self,
 network: nx.Graph,
 annotations: Dict[str, Any],
- distance_metric: str = "dijkstra",
+ distance_metric: str = "louvain",
 louvain_resolution: float = 0.1,
 edge_length_threshold: float = 0.5,
+ null_distribution: str = "network",
 random_seed: int = 888,
 ) -> Dict[str, Any]:
 """Load significant neighborhoods for the network using the hypergeometric test.
@@ -59,9 +60,10 @@ class RISK(NetworkIO, AnnotationsIO):
 Args:
 network (nx.Graph): The network graph.
 annotations (dict): The annotations associated with the network.
- distance_metric (str, optional): Distance metric for neighborhood analysis. Defaults to "dijkstra".
+ distance_metric (str, optional): Distance metric for neighborhood analysis. Defaults to "louvain".
 louvain_resolution (float, optional): Resolution parameter for Louvain clustering. Defaults to 0.1.
 edge_length_threshold (float, optional): Edge length threshold for neighborhood analysis. Defaults to 0.5.
+ null_distribution (str, optional): Type of null distribution ('network' or 'annotations'). Defaults to "network".
 random_seed (int, optional): Seed for random number generation. Defaults to 888.
 
 Returns:
@@ -74,6 +76,7 @@ class RISK(NetworkIO, AnnotationsIO):
 louvain_resolution=louvain_resolution,
 edge_length_threshold=edge_length_threshold,
 statistical_test_function="hypergeom",
+ null_distribution=null_distribution,
 random_seed=random_seed,
 )
 
@@ -89,6 +92,7 @@ class RISK(NetworkIO, AnnotationsIO):
 neighborhood_significance = compute_hypergeom_test(
 neighborhoods=neighborhoods,
 annotations=annotations["matrix"],
+ null_distribution=null_distribution,
 )
 
 # Return the computed neighborhood significance
@@ -98,9 +102,10 @@ class RISK(NetworkIO, AnnotationsIO):
 self,
 network: nx.Graph,
 annotations: Dict[str, Any],
- distance_metric: str = "dijkstra",
+ distance_metric: str = "louvain",
 louvain_resolution: float = 0.1,
 edge_length_threshold: float = 0.5,
+ null_distribution: str = "network",
 random_seed: int = 888,
 ) -> Dict[str, Any]:
 """Load significant neighborhoods for the network using the Poisson test.
@@ -108,9 +113,10 @@ class RISK(NetworkIO, AnnotationsIO):
 Args:
 network (nx.Graph): The network graph.
 annotations (dict): The annotations associated with the network.
- distance_metric (str, optional): Distance metric for neighborhood analysis. Defaults to "dijkstra".
+ distance_metric (str, optional): Distance metric for neighborhood analysis. Defaults to "louvain".
 louvain_resolution (float, optional): Resolution parameter for Louvain clustering. Defaults to 0.1.
 edge_length_threshold (float, optional): Edge length threshold for neighborhood analysis. Defaults to 0.5.
+ null_distribution (str, optional): Type of null distribution ('network' or 'annotations'). Defaults to "network".
 random_seed (int, optional): Seed for random number generation. Defaults to 888.
 
 Returns:
@@ -123,6 +129,7 @@ class RISK(NetworkIO, AnnotationsIO):
 louvain_resolution=louvain_resolution,
 edge_length_threshold=edge_length_threshold,
 statistical_test_function="poisson",
+ null_distribution=null_distribution,
 random_seed=random_seed,
 )
 
@@ -138,6 +145,7 @@ class RISK(NetworkIO, AnnotationsIO):
 neighborhood_significance = compute_poisson_test(
 neighborhoods=neighborhoods,
 annotations=annotations["matrix"],
+ null_distribution=null_distribution,
 )
 
 # Return the computed neighborhood significance
@@ -147,7 +155,7 @@ class RISK(NetworkIO, AnnotationsIO):
 self,
 network: nx.Graph,
 annotations: Dict[str, Any],
- distance_metric: str = "dijkstra",
+ distance_metric: str = "louvain",
 louvain_resolution: float = 0.1,
 edge_length_threshold: float = 0.5,
 score_metric: str = "sum",
@@ -161,11 +169,11 @@ class RISK(NetworkIO, AnnotationsIO):
 Args:
 network (nx.Graph): The network graph.
 annotations (dict): The annotations associated with the network.
- distance_metric (str, optional): Distance metric for neighborhood analysis. Defaults to "dijkstra".
+ distance_metric (str, optional): Distance metric for neighborhood analysis. Defaults to "louvain".
 louvain_resolution (float, optional): Resolution parameter for Louvain clustering. Defaults to 0.1.
 edge_length_threshold (float, optional): Edge length threshold for neighborhood analysis. Defaults to 0.5.
 score_metric (str, optional): Scoring metric for neighborhood significance. Defaults to "sum".
- null_distribution (str, optional): Distribution used for permutation tests. Defaults to "network".
+ null_distribution (str, optional): Type of null distribution ('network' or 'annotations'). Defaults to "network".
 num_permutations (int, optional): Number of permutations for significance testing. Defaults to 1000.
 random_seed (int, optional): Seed for random number generation. Defaults to 888.
 max_workers (int, optional): Maximum number of workers for parallel computation. Defaults to 1.
@@ -366,7 +374,7 @@ class RISK(NetworkIO, AnnotationsIO):
 def _load_neighborhoods(
 self,
 network: nx.Graph,
- distance_metric: str = "dijkstra",
+ distance_metric: str = "louvain",
 louvain_resolution: float = 0.1,
 edge_length_threshold: float = 0.5,
 random_seed: int = 888,
@@ -376,7 +384,7 @@ class RISK(NetworkIO, AnnotationsIO):
 Args:
 network (nx.Graph): The network graph.
 annotations (pd.DataFrame): The matrix of annotations associated with the network.
- distance_metric (str, optional): Distance metric for neighborhood analysis. Defaults to "dijkstra".
+ distance_metric (str, optional): Distance metric for neighborhood analysis. Defaults to "louvain".
 louvain_resolution (float, optional): Resolution parameter for Louvain clustering. Defaults to 0.1.
 edge_length_threshold (float, optional): Edge length threshold for neighborhood analysis. Defaults to 0.5.
 random_seed (int, optional): Seed for random number generation. Defaults to 888.
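Taken together, the risk.py changes switch the default distance_metric to "louvain" and thread a null_distribution choice from the public loaders down to the statistical tests. A hypothetical call sketch follows; the loader name load_neighborhoods_by_hypergeom and the bare RISK() construction are assumptions not confirmed by this diff, though the keyword arguments match the signature shown above:

import networkx as nx
from risk import RISK

def run_hypergeom_analysis(network: nx.Graph, annotations: dict) -> dict:
    # Hypothetical wrapper; only the keyword arguments below are confirmed by the diff.
    risk = RISK()
    return risk.load_neighborhoods_by_hypergeom(  # assumed method name
        network,
        annotations,
        distance_metric="louvain",        # new default, replacing "dijkstra"
        louvain_resolution=0.1,
        edge_length_threshold=0.5,
        null_distribution="annotations",  # new: background restricted to annotated nodes
        random_seed=888,
    )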
risk/stats/hypergeom.py CHANGED
@@ -10,46 +10,47 @@ from scipy.stats import hypergeom
 
 
 def compute_hypergeom_test(
- neighborhoods: np.ndarray,
- annotations: np.ndarray,
+ neighborhoods: np.ndarray, annotations: np.ndarray, null_distribution: str = "network"
 ) -> Dict[str, Any]:
- """Compute hypergeometric test for enrichment and depletion in neighborhoods.
+ """Compute hypergeometric test for enrichment and depletion in neighborhoods with selectable null distribution.
 
 Args:
- neighborhoods (np.ndarray): Binary matrix representing neighborhoods, where rows are nodes
- and columns are neighborhoods. Entries indicate the presence (1) or absence (0) of a node
- in a neighborhood.
- annotations (np.ndarray): Binary matrix representing annotations, where rows are nodes
- and columns are annotations. Entries indicate the presence (1) or absence (0) of a node
- being annotated.
+ neighborhoods (np.ndarray): Binary matrix representing neighborhoods.
+ annotations (np.ndarray): Binary matrix representing annotations.
+ null_distribution (str, optional): Type of null distribution ('network' or 'annotations'). Defaults to "network".
 
 Returns:
- Dict[str, Any]: A dictionary with two keys:
- - "enrichment_pvals" (np.ndarray): P-values for enrichment, indicating the probability
- of observing more annotations in a neighborhood than expected under the hypergeometric test.
- - "depletion_pvals" (np.ndarray): P-values for depletion, indicating the probability
- of observing fewer annotations in a neighborhood than expected under the hypergeometric test.
+ dict: Dictionary containing depletion and enrichment p-values.
 """
 # Ensure both matrices are binary (presence/absence)
 neighborhoods = (neighborhoods > 0).astype(int)
 annotations = (annotations > 0).astype(int)
- total_node_count = annotations.shape[0]
- # Sum of values in each neighborhood
- neighborhood_sums = np.sum(neighborhoods, axis=0)[:, np.newaxis]
- # Repeating neighborhood sums for each annotation
- neighborhood_size_matrix = np.tile(neighborhood_sums, (1, annotations.shape[1]))
- # Total number of nodes annotated to each attribute
- annotated_node_counts = np.tile(np.sum(annotations, axis=0), (neighborhoods.shape[1], 1))
- # Nodes in each neighborhood annotated to each attribute
- annotated_in_neighborhood = np.dot(neighborhoods, annotations)
- # Calculate p-values using the hypergeometric distribution
+ total_node_count = neighborhoods.shape[0]
+
+ if null_distribution == "network":
+ # Case 1: Use all nodes as the background
+ background_population = total_node_count
+ neighborhood_sums = np.sum(neighborhoods, axis=0, keepdims=True).T
+ annotation_sums = np.sum(annotations, axis=0, keepdims=True)
+ elif null_distribution == "annotations":
+ # Case 2: Only consider nodes with at least one annotation
+ annotated_nodes = np.sum(annotations, axis=1) > 0
+ background_population = np.sum(annotated_nodes)
+ neighborhood_sums = np.sum(neighborhoods[annotated_nodes], axis=0, keepdims=True).T
+ annotation_sums = np.sum(annotations[annotated_nodes], axis=0, keepdims=True)
+ else:
+ raise ValueError(
+ "Invalid null_distribution value. Choose either 'network' or 'annotations'."
+ )
+
+ # Matrix multiplication for annotated nodes in each neighborhood
+ annotated_in_neighborhood = neighborhoods.T @ annotations
+ # Calculate depletion and enrichment p-values using the hypergeometric distribution
 depletion_pvals = hypergeom.cdf(
- annotated_in_neighborhood, total_node_count, annotated_node_counts, neighborhood_size_matrix
+ annotated_in_neighborhood, background_population, annotation_sums, neighborhood_sums
 )
 enrichment_pvals = hypergeom.sf(
- annotated_in_neighborhood - 1,
- total_node_count,
- annotated_node_counts,
- neighborhood_size_matrix,
+ annotated_in_neighborhood - 1, background_population, annotation_sums, neighborhood_sums
 )
+
 return {"depletion_pvals": depletion_pvals, "enrichment_pvals": enrichment_pvals}
risk/stats/permutation/permutation.py CHANGED
@@ -28,7 +28,7 @@ def compute_permutation_test(
 neighborhoods (np.ndarray): Binary matrix representing neighborhoods.
 annotations (np.ndarray): Binary matrix representing annotations.
 score_metric (str, optional): Metric to use for scoring ('sum', 'mean', etc.). Defaults to "sum".
- null_distribution (str, optional): Type of null distribution ('network' or other). Defaults to "network".
+ null_distribution (str, optional): Type of null distribution ('network' or 'annotations'). Defaults to "network".
 num_permutations (int, optional): Number of permutations to run. Defaults to 1000.
 random_seed (int, optional): Seed for random number generation. Defaults to 888.
 max_workers (int, optional): Number of workers for multiprocessing. Defaults to 1.
@@ -78,7 +78,7 @@ def _run_permutation_test(
 neighborhoods (np.ndarray): The neighborhood matrix.
 annotations (np.ndarray): The annotation matrix.
 neighborhood_score_func (Callable): Function to calculate neighborhood scores.
- null_distribution (str, optional): Type of null distribution. Defaults to "network".
+ null_distribution (str, optional): Type of null distribution ('network' or 'annotations'). Defaults to "network".
 num_permutations (int, optional): Number of permutations. Defaults to 1000.
 random_seed (int, optional): Seed for random number generation. Defaults to 888.
 max_workers (int, optional): Number of workers for multiprocessing. Defaults to 4.
@@ -91,8 +91,12 @@ def _run_permutation_test(
 # Determine the indices to use based on the null distribution type
 if null_distribution == "network":
 idxs = range(annotations.shape[0])
- else:
+ elif null_distribution == "annotations":
 idxs = np.nonzero(np.sum(~np.isnan(annotations), axis=1))[0]
+ else:
+ raise ValueError(
+ "Invalid null_distribution value. Choose either 'network' or 'annotations'."
+ )
 
 # Replace NaNs with zeros in the annotations matrix
 annotations[np.isnan(annotations)] = 0
risk/stats/poisson.py CHANGED
@@ -9,32 +9,39 @@ import numpy as np
 from scipy.stats import poisson
 
 
- def compute_poisson_test(neighborhoods: np.ndarray, annotations: np.ndarray) -> Dict[str, Any]:
- """Compute Poisson test for enrichment and depletion in neighborhoods.
+ def compute_poisson_test(
+ neighborhoods: np.ndarray, annotations: np.ndarray, null_distribution: str = "network"
+ ) -> Dict[str, Any]:
+ """Compute Poisson test for enrichment and depletion in neighborhoods with selectable null distribution.
 
 Args:
- neighborhoods (np.ndarray): Binary matrix representing neighborhoods, where rows are nodes
- and columns are neighborhoods. Entries indicate the presence (1) or absence (0) of a node
- in a neighborhood.
- annotations (np.ndarray): Binary matrix representing annotations, where rows are nodes
- and columns are annotations. Entries indicate the presence (1) or absence (0) of a node
- being annotated.
+ neighborhoods (np.ndarray): Binary matrix representing neighborhoods.
+ annotations (np.ndarray): Binary matrix representing annotations.
+ null_distribution (str, optional): Type of null distribution ('network' or 'annotations'). Defaults to "network".
 
 Returns:
- Dict[str, Any]: A dictionary with two keys:
- - "enrichment_pvals" (np.ndarray): P-values for enrichment, indicating the probability
- of observing more annotations in a neighborhood than expected under the Poisson distribution.
- - "depletion_pvals" (np.ndarray): P-values for depletion, indicating the probability of
- observing fewer annotations in a neighborhood than expected under the Poisson distribution.
+ dict: Dictionary containing depletion and enrichment p-values.
 """
+ # Ensure both matrices are binary (presence/absence)
 neighborhoods = (neighborhoods > 0).astype(int)
 annotations = (annotations > 0).astype(int)
- annotated_in_neighborhood = np.dot(neighborhoods, annotations)
- lambda_expected = np.mean(annotated_in_neighborhood, axis=0)
- # Enrichment (observing more than expected)
+ # Matrix multiplication to get the number of annotated nodes in each neighborhood
+ annotated_in_neighborhood = neighborhoods @ annotations
+
+ # Compute lambda_expected based on the chosen null distribution
+ if null_distribution == "network":
+ # Use the mean across neighborhoods (axis=1)
+ lambda_expected = np.mean(annotated_in_neighborhood, axis=1, keepdims=True)
+ elif null_distribution == "annotations":
+ # Use the mean across annotations (axis=0)
+ lambda_expected = np.mean(annotated_in_neighborhood, axis=0, keepdims=True)
+ else:
+ raise ValueError(
+ "Invalid null_distribution value. Choose either 'network' or 'annotations'."
+ )
+
+ # Compute p-values for enrichment and depletion using Poisson distribution
 enrichment_pvals = 1 - poisson.cdf(annotated_in_neighborhood - 1, lambda_expected)
-
- # Depletion (observing fewer than expected)
 depletion_pvals = poisson.cdf(annotated_in_neighborhood, lambda_expected)
 
 return {"enrichment_pvals": enrichment_pvals, "depletion_pvals": depletion_pvals}
risk_network-0.0.7b5.dist-info/METADATA → risk_network-0.0.7b6.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: risk-network
- Version: 0.0.7b5
+ Version: 0.0.7b6
 Summary: A Python package for biological network analysis
 Author: Ira Horecka
 Author-email: Ira Horecka <ira89@icloud.com>
risk_network-0.0.7b5.dist-info/RECORD → risk_network-0.0.7b6.dist-info/RECORD CHANGED
@@ -1,30 +1,30 @@
- risk/__init__.py,sha256=dGMZvusp_heb_yF3HEnVZDfVhFlvQDEuBQKDQfIAJvk,112
+ risk/__init__.py,sha256=usWMc5kXOn1-bcSacSXIi_nGKYj4cIQyRvSzvAWGbMI,112
 risk/constants.py,sha256=XInRaH78Slnw_sWgAsBFbUHkyA0h0jL0DKGuQNbOvjM,550
- risk/risk.py,sha256=EhKdNC5ntEsBAXG7Rw1Y-ho0HzbsvoU9XYE8djD-Axs,19972
+ risk/risk.py,sha256=6666BzdMTgOaQl98ZKiJ19c6XBot26eTJ0iIlk-ZCZQ,20515
 risk/annotations/__init__.py,sha256=vUpVvMRE5if01Ic8QY6M2Ae3EFGJHdugEe9PdEkAW4Y,138
- risk/annotations/annotations.py,sha256=K7cUA6vYTKYAvj0xHqrAwNEYtmPq4H7LDYENAOVQdQ0,11014
+ risk/annotations/annotations.py,sha256=k9LGTL2uqdYvI5F3jU3UKz-O855B-DoazGPMzSn-XUc,11673
 risk/annotations/io.py,sha256=lo7NKqOVkeeBp58JBxWJHtA0xjL5Yoxqe9Ox0daKlZk,9457
 risk/log/__init__.py,sha256=xuLImfxFlKpnVhzi_gDYlr2_c9cLkrw2c_3iEsXb1as,107
 risk/log/console.py,sha256=im9DRExwf6wHlcn9fewoDcKIpo3vPcorZIaNAl-0csY,355
 risk/log/params.py,sha256=Rfdg5UcGCrG80m6V79FyORERWUqIzHFO7tGiY4zAImM,6347
 risk/neighborhoods/__init__.py,sha256=tKKEg4lsbqFukpgYlUGxU_v_9FOqK7V0uvM9T2QzoL0,206
- risk/neighborhoods/community.py,sha256=7ebo1Q5KokSQISnxZIh2SQxsKXdXm8aVkp-h_DiQ3K0,6818
+ risk/neighborhoods/community.py,sha256=stYYBXeZlGLMV-k8ckQeIqThT6v9y-S3hETobAo9590,6817
 risk/neighborhoods/domains.py,sha256=bxJUxqFTynzX0mf3E8-AA4_Rfccje1reeVVhfzb1-pE,10672
- risk/neighborhoods/neighborhoods.py,sha256=N02r2nnCfDtzVicuUt2WA77EUPHtruqjX8qJmXUP7ik,17475
+ risk/neighborhoods/neighborhoods.py,sha256=r-JeUb6dTjzMtnaMDvJy6MI3mTl-yUzILcdcjtOhFdM,18218
 risk/network/__init__.py,sha256=iEPeJdZfqp0toxtbElryB8jbz9_t_k4QQ3iDvKE8C_0,126
 risk/network/geometry.py,sha256=H1yGVVqgbfpzBzJwEheDLfvGLSA284jGQQTn612L4Vc,6759
 risk/network/graph.py,sha256=_LEoom4EEowGALuJKSXcev9RAAHu2FqIeq3u7mkifW0,16479
 risk/network/io.py,sha256=gG50kOknO-D3HkW1HsbHMkTMvjUtn3l4W4Jwd-rXNr8,21202
- risk/network/plot.py,sha256=F6KPjmBYWrThKZScHs9SuzoKQiytBvzrmGhGberHjwo,62063
+ risk/network/plot.py,sha256=3OucCoKJwx9M9H4lqAvcQdM9YiCSyIxz21jyqDbpffc,62286
 risk/stats/__init__.py,sha256=WcgoETQ-hS0LQqKRsAMIPtP15xZ-4eul6VUBuUx4Wzc,220
- risk/stats/hypergeom.py,sha256=CfGJ1fd7QKIbBVy85p6-upXwNi19TJioDuekA65PHCQ,2473
- risk/stats/poisson.py,sha256=eCBgxVdNUTJ_0aVxSU8ddSFGIXeSOY7Vx3YQBaEzN2k,1836
+ risk/stats/hypergeom.py,sha256=DcGYjmfcgt1qshNZPJt5IHGIHtxw9tWRS1r6QJ6V3dI,2378
+ risk/stats/poisson.py,sha256=CnLk65CHViR4YhAaN3ix37iyLm_YQYGo851bSnGyyxY,1950
 risk/stats/stats.py,sha256=kvShov-94W6ffgDUTb522vB9hDJQSyTsYif_UIaFfSM,7059
 risk/stats/permutation/__init__.py,sha256=neJp7FENC-zg_CGOXqv-iIvz1r5XUKI9Ruxhmq7kDOI,105
- risk/stats/permutation/permutation.py,sha256=qLWdwxEY6nmkYPxpM8HLDcd2mbqYv9Qr7CKtJvhLqIM,9220
+ risk/stats/permutation/permutation.py,sha256=bFcgTJZI8cLODvGoW4QtMeBnuUs2HibJ42OZyC74Tz0,9427
 risk/stats/permutation/test_functions.py,sha256=HuDIM-V1jkkfE1rlaIqrWWBSKZt3dQ1f-YEDjWpnLSE,2343
- risk_network-0.0.7b5.dist-info/LICENSE,sha256=jOtLnuWt7d5Hsx6XXB2QxzrSe2sWWh3NgMfFRetluQM,35147
- risk_network-0.0.7b5.dist-info/METADATA,sha256=DaAqg8en6KjGKUGgxI96K749ZwhFRY92h0RsnvoGqx4,43142
- risk_network-0.0.7b5.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
- risk_network-0.0.7b5.dist-info/top_level.txt,sha256=NX7C2PFKTvC1JhVKv14DFlFAIFnKc6Lpsu1ZfxvQwVw,5
- risk_network-0.0.7b5.dist-info/RECORD,,
+ risk_network-0.0.7b6.dist-info/LICENSE,sha256=jOtLnuWt7d5Hsx6XXB2QxzrSe2sWWh3NgMfFRetluQM,35147
+ risk_network-0.0.7b6.dist-info/METADATA,sha256=BQRgen5tB4jtWEWvm6VXkYPX3WudtzPpsxtFUtz3Ej0,43142
+ risk_network-0.0.7b6.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+ risk_network-0.0.7b6.dist-info/top_level.txt,sha256=NX7C2PFKTvC1JhVKv14DFlFAIFnKc6Lpsu1ZfxvQwVw,5
+ risk_network-0.0.7b6.dist-info/RECORD,,