risk-network 0.0.7b4__tar.gz → 0.0.7b6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/PKG-INFO +1 -1
  2. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/__init__.py +1 -1
  3. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/annotations/annotations.py +40 -25
  4. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/neighborhoods/community.py +25 -36
  5. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/neighborhoods/neighborhoods.py +35 -17
  6. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/network/plot.py +5 -1
  7. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/risk.py +49 -55
  8. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/stats/__init__.py +2 -2
  9. risk_network-0.0.7b6/risk/stats/hypergeom.py +56 -0
  10. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/stats/permutation/permutation.py +7 -3
  11. risk_network-0.0.7b6/risk/stats/poisson.py +47 -0
  12. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk_network.egg-info/PKG-INFO +1 -1
  13. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk_network.egg-info/SOURCES.txt +1 -1
  14. risk_network-0.0.7b4/risk/stats/fisher_exact.py +0 -132
  15. risk_network-0.0.7b4/risk/stats/hypergeom.py +0 -131
  16. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/LICENSE +0 -0
  17. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/MANIFEST.in +0 -0
  18. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/README.md +0 -0
  19. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/pyproject.toml +0 -0
  20. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/annotations/__init__.py +0 -0
  21. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/annotations/io.py +0 -0
  22. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/constants.py +0 -0
  23. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/log/__init__.py +0 -0
  24. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/log/console.py +0 -0
  25. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/log/params.py +0 -0
  26. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/neighborhoods/__init__.py +0 -0
  27. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/neighborhoods/domains.py +0 -0
  28. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/network/__init__.py +0 -0
  29. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/network/geometry.py +0 -0
  30. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/network/graph.py +0 -0
  31. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/network/io.py +0 -0
  32. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/stats/permutation/__init__.py +0 -0
  33. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/stats/permutation/test_functions.py +0 -0
  34. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk/stats/stats.py +0 -0
  35. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk_network.egg-info/dependency_links.txt +0 -0
  36. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk_network.egg-info/requires.txt +0 -0
  37. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/risk_network.egg-info/top_level.txt +0 -0
  38. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/setup.cfg +0 -0
  39. {risk_network-0.0.7b4 → risk_network-0.0.7b6}/setup.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: risk-network
- Version: 0.0.7b4
+ Version: 0.0.7b6
  Summary: A Python package for biological network analysis
  Author: Ira Horecka
  Author-email: Ira Horecka <ira89@icloud.com>
@@ -7,4 +7,4 @@ RISK: RISK Infers Spatial Kinships

  from risk.risk import RISK

- __version__ = "0.0.7-beta.4"
+ __version__ = "0.0.7-beta.6"
@@ -39,7 +39,7 @@ def load_annotations(network: nx.Graph, annotations_input: Dict[str, Any]) -> Di
  annotations_input (dict): A dictionary with annotations.

  Returns:
- dict: A dictionary containing ordered nodes, ordered annotations, and the annotations matrix.
+ dict: A dictionary containing ordered nodes, ordered annotations, and the binary annotations matrix.
  """
  # Flatten the dictionary to a list of tuples for easier DataFrame creation
  flattened_annotations = [
@@ -66,7 +66,8 @@ def load_annotations(network: nx.Graph, annotations_input: Dict[str, Any]) -> Di
  # Extract ordered nodes and annotations
  ordered_nodes = tuple(annotations_pivot.index)
  ordered_annotations = tuple(annotations_pivot.columns)
- annotations_pivot_numpy = annotations_pivot.fillna(0).to_numpy()
+ # Convert the annotations_pivot matrix to a numpy array and ensure it's binary
+ annotations_pivot_numpy = (annotations_pivot.fillna(0).to_numpy() > 0).astype(int)

  return {
  "ordered_nodes": ordered_nodes,
@@ -163,8 +164,8 @@ def define_top_annotations(


  def get_description(words_column: pd.Series) -> str:
- """Process input Series to identify and return the top N frequent, significant words,
- filtering based on stopwords and similarity (Jaccard index).
+ """Process input Series to identify and return the top frequent, significant words,
+ filtering based on stopwords and gracefully handling numerical strings.

  Args:
  words_column (pd.Series): A pandas Series containing strings to process.
@@ -172,19 +173,30 @@ def get_description(words_column: pd.Series) -> str:
  Returns:
  str: A coherent description formed from the most frequent and significant words.
  """
- # Define stopwords
- stop_words = set(stopwords.words("english"))
- # Tokenize the concatenated string and filter out stopwords and non-alphabetic words
+ # Concatenate all rows into a single string and tokenize into words
+ all_words = words_column.str.cat(sep=" ")
+ tokens = word_tokenize(all_words)
+
+ # Check if all tokens are numeric strings or contain a mixture of strings and numbers
+ numeric_tokens = [token for token in tokens if token.replace(".", "", 1).isdigit()]
+ non_numeric_tokens = [token for token in tokens if not token.replace(".", "", 1).isdigit()]
+ # If there's only one unique numeric value, return it directly as a string
+ unique_numeric_values = set(numeric_tokens)
+ if len(unique_numeric_values) == 1:
+ return f"{list(unique_numeric_values)[0]}"
+
+ # Allow the inclusion of both alphabetic and numeric tokens if mixture is detected
  words = [
  (
  word.lower() if word.istitle() else word
  ) # Lowercase all words except proper nouns (e.g., RNA, mRNA)
- for word in word_tokenize(words_column.str.cat(sep=" "))
- if word.isalpha() and word.lower() not in stop_words
+ for word in tokens
+ if word.isalpha()
+ or word.replace(".", "", 1).isdigit() # Keep alphabetic words and numeric strings
  ]
- # Simplify the word list to remove similar words based on the Jaccard index and generate coherent description
- simplified_words = _simplify_word_list(words, threshold=0.90)
- description = _generate_coherent_description(simplified_words)
+ # Generate a coherent description from the processed words
+ description = _generate_coherent_description(words)
+
  return description

@@ -242,25 +254,28 @@ def _calculate_jaccard_index(set1: Set[Any], set2: Set[Any]) -> float:


  def _generate_coherent_description(words: List[str]) -> str:
- """Generate a coherent description from a list of words.
+ """Generate a coherent description from a list of words or numerical string values.
+ If there is only one unique entry, return it directly.

  Args:
- words (list of str): A list of words from which to generate the description.
+ words (list): A list of words or numerical string values.

  Returns:
  str: A coherent description formed by arranging the words in a logical sequence.
  """
- # Count the frequency of each word
+ # If there are no words or the input is invalid, raise an error
+ if not words or not isinstance(words, list) or not all(isinstance(word, str) for word in words):
+ raise ValueError("Input must be a list of strings.")
+
+ # If there's only one unique word, return it directly (even if it's a number-like string)
+ unique_words = set(words)
+ if len(unique_words) == 1:
+ return list(unique_words)[0]
+
+ # Count the frequency of each word and sort them by frequency
  word_counts = Counter(words)
- # Get the most common words
  most_common_words = [word for word, _ in word_counts.most_common()]
- # Filter out common stopwords
- stop_words = set(stopwords.words("english"))
- filtered_words = [word for word in most_common_words if word.lower() not in stop_words]
- # Generate permutations of the filtered words to find a logical order
- perm = permutations(filtered_words)
- # Assume the first permutation as the logical sequence (since they're all equally likely without additional context)
- logical_sequence = next(perm)
- # Join the words to form a coherent description
- description = " ".join(logical_sequence)
+ # Join the most common words to form a coherent description based on frequency
+ description = " ".join(most_common_words)
+
  return description
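The two changes above replace the stopword/permutation logic with simple frequency ordering and add handling for numeric strings. A small, self-contained sketch of the numeric-token check (token values are invented for illustration):

tokens = ["ribosome", "60S", "2.5", "2.5"]
# Same test used in get_description: treat "2.5"-style strings as numeric.
numeric_tokens = [t for t in tokens if t.replace(".", "", 1).isdigit()]
non_numeric_tokens = [t for t in tokens if not t.replace(".", "", 1).isdigit()]
print(set(numeric_tokens))   # {'2.5'} -> a single unique numeric value is returned as-is
print(non_numeric_tokens)    # ['ribosome', '60S'] -> kept alongside numeric strings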
@@ -7,32 +7,29 @@ import community as community_louvain
  import networkx as nx
  import numpy as np
  import markov_clustering as mc
- from networkx.algorithms.community import asyn_lpa_communities
+ from networkx.algorithms.community import asyn_lpa_communities, greedy_modularity_communities


- def calculate_dijkstra_neighborhoods(network: nx.Graph) -> np.ndarray:
- """Calculate neighborhoods using Dijkstra's shortest path distances.
+ def calculate_greedy_modularity_neighborhoods(network: nx.Graph) -> np.ndarray:
+ """Calculate neighborhoods using the Greedy Modularity method.

  Args:
- network (nx.Graph): The network graph.
+ network (nx.Graph): The network graph to analyze for community structure.

  Returns:
- np.ndarray: Neighborhood matrix based on Dijkstra's distances.
+ np.ndarray: A binary neighborhood matrix where nodes in the same community have 1, and others have 0.
  """
- # Compute Dijkstra's distance for all pairs of nodes in the network
- all_dijkstra_paths = dict(nx.all_pairs_dijkstra_path_length(network, weight="length"))
+ # Detect communities using the Greedy Modularity method
+ communities = greedy_modularity_communities(network)
+ # Create a mapping from node to community
+ community_dict = {node: idx for idx, community in enumerate(communities) for node in community}
+ # Create a binary neighborhood matrix
  neighborhoods = np.zeros((network.number_of_nodes(), network.number_of_nodes()), dtype=int)
-
- # Populate the neighborhoods matrix based on Dijkstra's distances
- for source, targets in all_dijkstra_paths.items():
- max_length = max(targets.values()) if targets else 1 # Handle cases with no targets
- for target, length in targets.items():
- if np.isnan(length):
- neighborhoods[source, target] = max_length # Use max distance for NaN
- elif length == 0:
- neighborhoods[source, target] = 1 # Assign 1 for zero-length paths (self-loops)
- else:
- neighborhoods[source, target] = 1 / length # Inverse of the distance
+ node_index = {node: i for i, node in enumerate(network.nodes())}
+ for node_i, community_i in community_dict.items():
+ for node_j, community_j in community_dict.items():
+ if community_i == community_j:
+ neighborhoods[node_index[node_i], node_index[node_j]] = 1

  return neighborhoods
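As a quick check of the new greedy-modularity path, the same node-to-community mapping can be exercised on a toy graph (the graph below is invented; the exact community split may vary with the networkx version):

import networkx as nx
import numpy as np
from networkx.algorithms.community import greedy_modularity_communities

# Two triangles joined by a single bridge edge.
G = nx.Graph([(0, 1), (1, 2), (0, 2), (3, 4), (4, 5), (3, 5), (2, 3)])
communities = greedy_modularity_communities(G)
community_dict = {node: idx for idx, community in enumerate(communities) for node in community}
node_index = {node: i for i, node in enumerate(G.nodes())}
neighborhoods = np.zeros((G.number_of_nodes(), G.number_of_nodes()), dtype=int)
for u, cu in community_dict.items():
    for v, cv in community_dict.items():
        if cu == cv:
            neighborhoods[node_index[u], node_index[v]] = 1
print(neighborhoods.sum())  # typically 18: two communities of three nodes each (3*3 + 3*3)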
 
@@ -44,21 +41,19 @@ def calculate_label_propagation_neighborhoods(network: nx.Graph) -> np.ndarray:
  network (nx.Graph): The network graph.

  Returns:
- np.ndarray: Neighborhood matrix based on Label Propagation.
+ np.ndarray: Binary neighborhood matrix on Label Propagation.
  """
  # Apply Label Propagation
  communities = nx.algorithms.community.label_propagation.label_propagation_communities(network)
-
  # Create a mapping from node to community
  community_dict = {}
  for community_id, community in enumerate(communities):
  for node in community:
  community_dict[node] = community_id

- # Create a neighborhood matrix
+ # Create a binary neighborhood matrix
  num_nodes = network.number_of_nodes()
  neighborhoods = np.zeros((num_nodes, num_nodes), dtype=int)
-
  # Assign neighborhoods based on community labels
  for node_i, community_i in community_dict.items():
  for node_j, community_j in community_dict.items():
@@ -79,14 +74,14 @@ def calculate_louvain_neighborhoods(
  random_seed (int, optional): Random seed for reproducibility. Defaults to 888.

  Returns:
- np.ndarray: Neighborhood matrix based on the Louvain method.
+ np.ndarray: Binary neighborhood matrix on the Louvain method.
  """
  # Apply Louvain method to partition the network
  partition = community_louvain.best_partition(
  network, resolution=resolution, random_state=random_seed
  )
+ # Create a binary neighborhood matrix
  neighborhoods = np.zeros((network.number_of_nodes(), network.number_of_nodes()), dtype=int)
-
  # Assign neighborhoods based on community partitions
  for node_i, community_i in partition.items():
  for node_j, community_j in partition.items():
@@ -103,7 +98,7 @@ def calculate_markov_clustering_neighborhoods(network: nx.Graph) -> np.ndarray:
  network (nx.Graph): The network graph.

  Returns:
- np.ndarray: Neighborhood matrix based on Markov Clustering.
+ np.ndarray: Binary neighborhood matrix on Markov Clustering.
  """
  # Convert the graph to an adjacency matrix
  adjacency_matrix = nx.to_numpy_array(network)
@@ -111,17 +106,15 @@ def calculate_markov_clustering_neighborhoods(network: nx.Graph) -> np.ndarray:
  result = mc.run_mcl(adjacency_matrix) # Run MCL with default parameters
  # Get clusters
  clusters = mc.get_clusters(result)
-
  # Create a community label for each node
  community_dict = {}
  for community_id, community in enumerate(clusters):
  for node in community:
  community_dict[node] = community_id

- # Create a neighborhood matrix
+ # Create a binary neighborhood matrix
  num_nodes = network.number_of_nodes()
  neighborhoods = np.zeros((num_nodes, num_nodes), dtype=int)
-
  # Assign neighborhoods based on community labels
  for node_i, community_i in community_dict.items():
  for node_j, community_j in community_dict.items():
@@ -138,21 +131,19 @@ def calculate_spinglass_neighborhoods(network: nx.Graph) -> np.ndarray:
  network (nx.Graph): The network graph.

  Returns:
- np.ndarray: Neighborhood matrix based on Spin Glass communities.
+ np.ndarray: Binary neighborhood matrix on Spin Glass communities.
  """
  # Use the asynchronous label propagation algorithm as a proxy for Spin Glass
  communities = asyn_lpa_communities(network)
-
  # Create a community label for each node
  community_dict = {}
  for community_id, community in enumerate(communities):
  for node in community:
  community_dict[node] = community_id

- # Create a neighborhood matrix
+ # Create a binary neighborhood matrix
  num_nodes = network.number_of_nodes()
  neighborhoods = np.zeros((num_nodes, num_nodes), dtype=int)
-
  # Assign neighborhoods based on community labels
  for node_i, community_i in community_dict.items():
  for node_j, community_j in community_dict.items():
@@ -169,21 +160,19 @@ def calculate_walktrap_neighborhoods(network: nx.Graph) -> np.ndarray:
  network (nx.Graph): The network graph.

  Returns:
- np.ndarray: Neighborhood matrix based on Walktrap communities.
+ np.ndarray: Binary neighborhood matrix on Walktrap communities.
  """
  # Use the asynchronous label propagation algorithm as a proxy for Walktrap
  communities = asyn_lpa_communities(network)
-
  # Create a community label for each node
  community_dict = {}
  for community_id, community in enumerate(communities):
  for node in community:
  community_dict[node] = community_id

- # Create a neighborhood matrix
+ # Create a binary neighborhood matrix
  num_nodes = network.number_of_nodes()
  neighborhoods = np.zeros((num_nodes, num_nodes), dtype=int)
-
  # Assign neighborhoods based on community labels
  for node_i, community_i in community_dict.items():
  for node_j, community_j in community_dict.items():
@@ -3,6 +3,7 @@ risk/neighborhoods/neighborhoods
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  """

+ import random
  import warnings
  from typing import Any, Dict, List, Tuple

@@ -12,7 +13,7 @@ from sklearn.exceptions import DataConversionWarning
  from sklearn.metrics.pairwise import cosine_similarity

  from risk.neighborhoods.community import (
- calculate_dijkstra_neighborhoods,
+ calculate_greedy_modularity_neighborhoods,
  calculate_label_propagation_neighborhoods,
  calculate_louvain_neighborhoods,
  calculate_markov_clustering_neighborhoods,
@@ -26,7 +27,7 @@ warnings.filterwarnings(action="ignore", category=DataConversionWarning)

  def get_network_neighborhoods(
  network: nx.Graph,
- distance_metric: str = "dijkstra",
+ distance_metric: str = "louvain",
  edge_length_threshold: float = 1.0,
  louvain_resolution: float = 1.0,
  random_seed: int = 888,
@@ -35,8 +36,8 @@ def get_network_neighborhoods(

  Args:
  network (nx.Graph): The network graph.
- distance_metric (str): The distance metric to use ('euclidean', 'dijkstra', 'louvain', 'affinity_propagation',
- 'label_propagation', 'markov_clustering', 'walktrap', 'spinglass').
+ distance_metric (str): The distance metric to use ('greedy_modularity', 'louvain', 'label_propagation',
+ 'markov_clustering', 'walktrap', 'spinglass').
  edge_length_threshold (float): The edge length threshold for the neighborhoods.
  louvain_resolution (float, optional): Resolution parameter for the Louvain method. Defaults to 1.0.
  random_seed (int, optional): Random seed for methods requiring random initialization. Defaults to 888.
@@ -44,10 +45,17 @@ def get_network_neighborhoods(
  Returns:
  np.ndarray: Neighborhood matrix calculated based on the selected distance metric.
  """
- network = _create_percentile_limited_subgraph(network, edge_length_threshold)
+ # Set random seed for reproducibility in all methods besides Louvain, which requires a separate seed
+ random.seed(random_seed)
+ np.random.seed(random_seed)

- if distance_metric == "dijkstra":
- return calculate_dijkstra_neighborhoods(network)
+ # Create a subgraph based on the edge length percentile threshold
+ network = _create_percentile_limited_subgraph(
+ network, edge_length_percentile=edge_length_threshold
+ )
+
+ if distance_metric == "greedy_modularity":
+ return calculate_greedy_modularity_neighborhoods(network)
  if distance_metric == "louvain":
  return calculate_louvain_neighborhoods(network, louvain_resolution, random_seed=random_seed)
  if distance_metric == "label_propagation":
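The seeding added above is the usual way to make the non-Louvain methods repeatable; a minimal illustration (the seed value simply mirrors the package default):

import random
import numpy as np

random_seed = 888
random.seed(random_seed)     # Python's global RNG, used by the label-propagation-based methods
np.random.seed(random_seed)  # NumPy's global RNG, for any NumPy-based randomness
print(random.random(), np.random.rand())  # same values on every run with the same seed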
@@ -60,34 +68,44 @@ def get_network_neighborhoods(
  return calculate_spinglass_neighborhoods(network)

  raise ValueError(
- "Incorrect distance metric specified. Please choose from 'dijkstra', 'louvain',"
+ "Incorrect distance metric specified. Please choose from 'greedy_modularity', 'louvain',"
  "'label_propagation', 'markov_clustering', 'walktrap', 'spinglass'."
  )


  def _create_percentile_limited_subgraph(G: nx.Graph, edge_length_percentile: float) -> nx.Graph:
- """Calculate the edge length corresponding to the given percentile of edge lengths in the graph
- and create a subgraph with all nodes and edges below this length.
+ """Create a subgraph containing all nodes and edges where the edge length is below the
+ specified percentile of all edge lengths in the input graph.

  Args:
- G (nx.Graph): The input graph.
- edge_length_percentile (float): The percentile to calculate (between 0 and 1).
+ G (nx.Graph): The input graph with 'length' attributes on edges.
+ edge_length_percentile (float): The percentile (between 0 and 1) to filter edges by length.

  Returns:
- nx.Graph: A subgraph with all nodes and edges below the edge length corresponding to the given percentile.
+ nx.Graph: A subgraph with all nodes and edges where the edge length is below the
+ calculated threshold length.
  """
- # Extract edge lengths from the graph
+ # Extract edge lengths and handle missing lengths
  edge_lengths = [d["length"] for _, _, d in G.edges(data=True) if "length" in d]
+ if not edge_lengths:
+ raise ValueError(
+ "No edge lengths found in the graph. Ensure edges have 'length' attributes."
+ )
+
  # Calculate the specific edge length for the given percentile
  percentile_length = np.percentile(edge_lengths, edge_length_percentile * 100)
- # Create a new graph with all nodes from the original graph
+ # Create the subgraph by directly filtering edges during iteration
  subgraph = nx.Graph()
- subgraph.add_nodes_from(G.nodes(data=True))
- # Add edges to the subgraph if they are below the specified percentile length
+ subgraph.add_nodes_from(G.nodes(data=True)) # Retain all nodes from the original graph
+ # Add edges below the specified percentile length in a single pass
  for u, v, d in G.edges(data=True):
  if d.get("length", 1) <= percentile_length:
  subgraph.add_edge(u, v, **d)

+ # Return the subgraph; optionally check if it's too sparse
+ if subgraph.number_of_edges() == 0:
+ raise Warning("The resulting subgraph has no edges. Consider adjusting the percentile.")
+
  return subgraph
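To see how the percentile filter behaves, here is a small self-contained sketch mirroring the logic above (graph and edge lengths are made up):

import networkx as nx
import numpy as np

G = nx.Graph()
G.add_edge("a", "b", length=1.0)
G.add_edge("b", "c", length=2.0)
G.add_edge("c", "d", length=10.0)

edge_length_percentile = 0.5  # keep edges at or below the median length
edge_lengths = [d["length"] for _, _, d in G.edges(data=True) if "length" in d]
percentile_length = np.percentile(edge_lengths, edge_length_percentile * 100)

subgraph = nx.Graph()
subgraph.add_nodes_from(G.nodes(data=True))  # all nodes retained
for u, v, d in G.edges(data=True):
    if d.get("length", 1) <= percentile_length:
        subgraph.add_edge(u, v, **d)

print(percentile_length, sorted(subgraph.edges()))  # 2.0 [('a', 'b'), ('b', 'c')]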
 
@@ -9,6 +9,7 @@ import matplotlib.colors as mcolors
  import matplotlib.pyplot as plt
  import networkx as nx
  import numpy as np
+ import pandas as pd
  from scipy.ndimage import label
  from scipy.stats import gaussian_kde

@@ -601,7 +602,7 @@ class NetworkPlotter:
  min_words (int, optional): Minimum number of words required to display a label. Defaults to 1.
  max_word_length (int, optional): Maximum number of characters in a word to display. Defaults to 20.
  min_word_length (int, optional): Minimum number of characters in a word to display. Defaults to 1.
- words_to_omit (List, optional): List of words to omit from the labels. Defaults to None.
+ words_to_omit (list, optional): List of words to omit from the labels. Defaults to None.
  overlay_ids (bool, optional): Whether to overlay domain IDs in the center of the centroids. Defaults to False.
  ids_to_keep (list, tuple, np.ndarray, or None, optional): IDs of domains that must be labeled. To discover domain IDs,
  you can set `overlay_ids=True`. Defaults to None.
@@ -710,6 +711,9 @@ class NetworkPlotter:
  # Process remaining domains to fill in additional labels, if there are slots left
  if remaining_labels and remaining_labels > 0:
  for idx, (domain, centroid) in enumerate(domain_centroids.items()):
+ # Check if the domain is NaN and continue if true
+ if pd.isna(domain) or (isinstance(domain, float) and np.isnan(domain)):
+ continue # Skip NaN domains
  if ids_to_keep and domain in ids_to_keep:
  continue # Skip domains already handled by ids_to_keep

@@ -20,9 +20,9 @@ from risk.neighborhoods import (
  from risk.network import NetworkIO, NetworkGraph, NetworkPlotter
  from risk.stats import (
  calculate_significance_matrices,
- compute_fisher_exact_test,
  compute_hypergeom_test,
  compute_permutation_test,
+ compute_poisson_test,
  )


@@ -45,48 +45,39 @@ class RISK(NetworkIO, AnnotationsIO):
  """Access the logged parameters."""
  return params

- def load_neighborhoods_by_permutation(
+ def load_neighborhoods_by_hypergeom(
  self,
  network: nx.Graph,
  annotations: Dict[str, Any],
- distance_metric: str = "dijkstra",
+ distance_metric: str = "louvain",
  louvain_resolution: float = 0.1,
  edge_length_threshold: float = 0.5,
- score_metric: str = "sum",
  null_distribution: str = "network",
- num_permutations: int = 1000,
  random_seed: int = 888,
- max_workers: int = 1,
  ) -> Dict[str, Any]:
- """Load significant neighborhoods for the network using the permutation test.
+ """Load significant neighborhoods for the network using the hypergeometric test.

  Args:
  network (nx.Graph): The network graph.
  annotations (dict): The annotations associated with the network.
- distance_metric (str, optional): Distance metric for neighborhood analysis. Defaults to "dijkstra".
+ distance_metric (str, optional): Distance metric for neighborhood analysis. Defaults to "louvain".
  louvain_resolution (float, optional): Resolution parameter for Louvain clustering. Defaults to 0.1.
  edge_length_threshold (float, optional): Edge length threshold for neighborhood analysis. Defaults to 0.5.
- score_metric (str, optional): Scoring metric for neighborhood significance. Defaults to "sum".
- null_distribution (str, optional): Distribution used for permutation tests. Defaults to "network".
- num_permutations (int, optional): Number of permutations for significance testing. Defaults to 1000.
+ null_distribution (str, optional): Type of null distribution ('network' or 'annotations'). Defaults to "network".
  random_seed (int, optional): Seed for random number generation. Defaults to 888.
- max_workers (int, optional): Maximum number of workers for parallel computation. Defaults to 1.

  Returns:
  dict: Computed significance of neighborhoods.
  """
- print_header("Running permutation test")
+ print_header("Running hypergeometric test")
  # Log neighborhood analysis parameters
  params.log_neighborhoods(
  distance_metric=distance_metric,
  louvain_resolution=louvain_resolution,
  edge_length_threshold=edge_length_threshold,
- statistical_test_function="permutation",
- score_metric=score_metric,
+ statistical_test_function="hypergeom",
  null_distribution=null_distribution,
- num_permutations=num_permutations,
  random_seed=random_seed,
- max_workers=max_workers,
  )

  # Load neighborhoods based on the network and distance metric
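A usage sketch of the renamed hypergeometric loader with its new defaults; only the call is shown, since building the network and annotations inputs goes through the package's NetworkIO/AnnotationsIO helpers, which are unchanged in this diff:

def run_hypergeom_analysis(risk, network, annotations):
    """Sketch only: mirrors the new load_neighborhoods_by_hypergeom signature."""
    return risk.load_neighborhoods_by_hypergeom(
        network,
        annotations,
        distance_metric="louvain",    # new default; "greedy_modularity" is also accepted
        edge_length_threshold=0.5,
        null_distribution="network",  # or "annotations"
        random_seed=888,
    )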
@@ -97,59 +88,49 @@ class RISK(NetworkIO, AnnotationsIO):
  edge_length_threshold=edge_length_threshold,
  random_seed=random_seed,
  )
-
- # Log and display permutation test settings
- print(f"Neighborhood scoring metric: '{score_metric}'")
- print(f"Null distribution: '{null_distribution}'")
- print(f"Number of permutations: {num_permutations}")
- print(f"Maximum workers: {max_workers}")
- # Run permutation test to compute neighborhood significance
- neighborhood_significance = compute_permutation_test(
+ # Run hypergeometric test to compute neighborhood significance
+ neighborhood_significance = compute_hypergeom_test(
  neighborhoods=neighborhoods,
  annotations=annotations["matrix"],
- score_metric=score_metric,
  null_distribution=null_distribution,
- num_permutations=num_permutations,
- random_seed=random_seed,
- max_workers=max_workers,
  )

  # Return the computed neighborhood significance
  return neighborhood_significance

- def load_neighborhoods_by_fisher_exact(
+ def load_neighborhoods_by_poisson(
  self,
  network: nx.Graph,
  annotations: Dict[str, Any],
- distance_metric: str = "dijkstra",
+ distance_metric: str = "louvain",
  louvain_resolution: float = 0.1,
  edge_length_threshold: float = 0.5,
+ null_distribution: str = "network",
  random_seed: int = 888,
- max_workers: int = 1,
  ) -> Dict[str, Any]:
- """Load significant neighborhoods for the network using the Fisher's exact test.
+ """Load significant neighborhoods for the network using the Poisson test.

  Args:
  network (nx.Graph): The network graph.
  annotations (dict): The annotations associated with the network.
- distance_metric (str, optional): Distance metric for neighborhood analysis. Defaults to "dijkstra".
+ distance_metric (str, optional): Distance metric for neighborhood analysis. Defaults to "louvain".
  louvain_resolution (float, optional): Resolution parameter for Louvain clustering. Defaults to 0.1.
  edge_length_threshold (float, optional): Edge length threshold for neighborhood analysis. Defaults to 0.5.
+ null_distribution (str, optional): Type of null distribution ('network' or 'annotations'). Defaults to "network".
  random_seed (int, optional): Seed for random number generation. Defaults to 888.
- max_workers (int, optional): Maximum number of workers for parallel computation. Defaults to 1.

  Returns:
  dict: Computed significance of neighborhoods.
  """
- print_header("Running Fisher's exact test")
+ print_header("Running Poisson test")
  # Log neighborhood analysis parameters
  params.log_neighborhoods(
  distance_metric=distance_metric,
  louvain_resolution=louvain_resolution,
  edge_length_threshold=edge_length_threshold,
- statistical_test_function="fisher_exact",
+ statistical_test_function="poisson",
+ null_distribution=null_distribution,
  random_seed=random_seed,
- max_workers=max_workers,
  )

  # Load neighborhoods based on the network and distance metric
@@ -160,50 +141,56 @@ class RISK(NetworkIO, AnnotationsIO):
  edge_length_threshold=edge_length_threshold,
  random_seed=random_seed,
  )
-
- # Log and display Fisher's exact test settings
- print(f"Maximum workers: {max_workers}")
- # Run Fisher's exact test to compute neighborhood significance
- neighborhood_significance = compute_fisher_exact_test(
+ # Run Poisson test to compute neighborhood significance
+ neighborhood_significance = compute_poisson_test(
  neighborhoods=neighborhoods,
  annotations=annotations["matrix"],
- max_workers=max_workers,
+ null_distribution=null_distribution,
  )

  # Return the computed neighborhood significance
  return neighborhood_significance

- def load_neighborhoods_by_hypergeom(
+ def load_neighborhoods_by_permutation(
  self,
  network: nx.Graph,
  annotations: Dict[str, Any],
- distance_metric: str = "dijkstra",
+ distance_metric: str = "louvain",
  louvain_resolution: float = 0.1,
  edge_length_threshold: float = 0.5,
+ score_metric: str = "sum",
+ null_distribution: str = "network",
+ num_permutations: int = 1000,
  random_seed: int = 888,
  max_workers: int = 1,
  ) -> Dict[str, Any]:
- """Load significant neighborhoods for the network using the hypergeometric test.
+ """Load significant neighborhoods for the network using the permutation test.

  Args:
  network (nx.Graph): The network graph.
  annotations (dict): The annotations associated with the network.
- distance_metric (str, optional): Distance metric for neighborhood analysis. Defaults to "dijkstra".
+ distance_metric (str, optional): Distance metric for neighborhood analysis. Defaults to "louvain".
  louvain_resolution (float, optional): Resolution parameter for Louvain clustering. Defaults to 0.1.
  edge_length_threshold (float, optional): Edge length threshold for neighborhood analysis. Defaults to 0.5.
+ score_metric (str, optional): Scoring metric for neighborhood significance. Defaults to "sum".
+ null_distribution (str, optional): Type of null distribution ('network' or 'annotations'). Defaults to "network".
+ num_permutations (int, optional): Number of permutations for significance testing. Defaults to 1000.
  random_seed (int, optional): Seed for random number generation. Defaults to 888.
  max_workers (int, optional): Maximum number of workers for parallel computation. Defaults to 1.

  Returns:
  dict: Computed significance of neighborhoods.
  """
- print_header("Running hypergeometric test")
+ print_header("Running permutation test")
  # Log neighborhood analysis parameters
  params.log_neighborhoods(
  distance_metric=distance_metric,
  louvain_resolution=louvain_resolution,
  edge_length_threshold=edge_length_threshold,
- statistical_test_function="hypergeom",
+ statistical_test_function="permutation",
+ score_metric=score_metric,
+ null_distribution=null_distribution,
+ num_permutations=num_permutations,
  random_seed=random_seed,
  max_workers=max_workers,
  )
@@ -217,12 +204,19 @@ class RISK(NetworkIO, AnnotationsIO):
  random_seed=random_seed,
  )

- # Log and display hypergeometric test settings
+ # Log and display permutation test settings
+ print(f"Neighborhood scoring metric: '{score_metric}'")
+ print(f"Null distribution: '{null_distribution}'")
+ print(f"Number of permutations: {num_permutations}")
  print(f"Maximum workers: {max_workers}")
- # Run hypergeometric test to compute neighborhood significance
- neighborhood_significance = compute_hypergeom_test(
+ # Run permutation test to compute neighborhood significance
+ neighborhood_significance = compute_permutation_test(
  neighborhoods=neighborhoods,
  annotations=annotations["matrix"],
+ score_metric=score_metric,
+ null_distribution=null_distribution,
+ num_permutations=num_permutations,
+ random_seed=random_seed,
  max_workers=max_workers,
  )

@@ -380,7 +374,7 @@ class RISK(NetworkIO, AnnotationsIO):
  def _load_neighborhoods(
  self,
  network: nx.Graph,
- distance_metric: str = "dijkstra",
+ distance_metric: str = "louvain",
  louvain_resolution: float = 0.1,
  edge_length_threshold: float = 0.5,
  random_seed: int = 888,
@@ -390,7 +384,7 @@ class RISK(NetworkIO, AnnotationsIO):
  Args:
  network (nx.Graph): The network graph.
  annotations (pd.DataFrame): The matrix of annotations associated with the network.
- distance_metric (str, optional): Distance metric for neighborhood analysis. Defaults to "dijkstra".
+ distance_metric (str, optional): Distance metric for neighborhood analysis. Defaults to "louvain".
  louvain_resolution (float, optional): Resolution parameter for Louvain clustering. Defaults to 0.1.
  edge_length_threshold (float, optional): Edge length threshold for neighborhood analysis. Defaults to 0.5.
  random_seed (int, optional): Seed for random number generation. Defaults to 888.
@@ -3,7 +3,7 @@ risk/stats
  ~~~~~~~~~~
  """

- from .stats import calculate_significance_matrices
- from .fisher_exact import compute_fisher_exact_test
  from .hypergeom import compute_hypergeom_test
  from .permutation import compute_permutation_test
+ from .poisson import compute_poisson_test
+ from .stats import calculate_significance_matrices
@@ -0,0 +1,56 @@
+ """
+ risk/stats/hypergeom
+ ~~~~~~~~~~~~~~~~~~~~
+ """
+
+ from typing import Any, Dict
+
+ import numpy as np
+ from scipy.stats import hypergeom
+
+
+ def compute_hypergeom_test(
+ neighborhoods: np.ndarray, annotations: np.ndarray, null_distribution: str = "network"
+ ) -> Dict[str, Any]:
+ """Compute hypergeometric test for enrichment and depletion in neighborhoods with selectable null distribution.
+
+ Args:
+ neighborhoods (np.ndarray): Binary matrix representing neighborhoods.
+ annotations (np.ndarray): Binary matrix representing annotations.
+ null_distribution (str, optional): Type of null distribution ('network' or 'annotations'). Defaults to "network".
+
+ Returns:
+ dict: Dictionary containing depletion and enrichment p-values.
+ """
+ # Ensure both matrices are binary (presence/absence)
+ neighborhoods = (neighborhoods > 0).astype(int)
+ annotations = (annotations > 0).astype(int)
+ total_node_count = neighborhoods.shape[0]
+
+ if null_distribution == "network":
+ # Case 1: Use all nodes as the background
+ background_population = total_node_count
+ neighborhood_sums = np.sum(neighborhoods, axis=0, keepdims=True).T
+ annotation_sums = np.sum(annotations, axis=0, keepdims=True)
+ elif null_distribution == "annotations":
+ # Case 2: Only consider nodes with at least one annotation
+ annotated_nodes = np.sum(annotations, axis=1) > 0
+ background_population = np.sum(annotated_nodes)
+ neighborhood_sums = np.sum(neighborhoods[annotated_nodes], axis=0, keepdims=True).T
+ annotation_sums = np.sum(annotations[annotated_nodes], axis=0, keepdims=True)
+ else:
+ raise ValueError(
+ "Invalid null_distribution value. Choose either 'network' or 'annotations'."
+ )
+
+ # Matrix multiplication for annotated nodes in each neighborhood
+ annotated_in_neighborhood = neighborhoods.T @ annotations
+ # Calculate depletion and enrichment p-values using the hypergeometric distribution
+ depletion_pvals = hypergeom.cdf(
+ annotated_in_neighborhood, background_population, annotation_sums, neighborhood_sums
+ )
+ enrichment_pvals = hypergeom.sf(
+ annotated_in_neighborhood - 1, background_population, annotation_sums, neighborhood_sums
+ )
+
+ return {"depletion_pvals": depletion_pvals, "enrichment_pvals": enrichment_pvals}
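A tiny, illustrative call to the new vectorized test (assumes risk-network 0.0.7b6 is installed; the matrices below are invented):

import numpy as np
from risk.stats import compute_hypergeom_test

# 4 nodes; columns of neighborhoods are neighborhoods, columns of annotations are terms.
neighborhoods = np.array([[1, 0],
                          [1, 0],
                          [0, 1],
                          [0, 1]])
annotations = np.array([[1, 0],
                        [1, 0],
                        [0, 1],
                        [0, 0]])
result = compute_hypergeom_test(neighborhoods, annotations, null_distribution="network")
print(result["enrichment_pvals"].shape)  # (2, 2): one p-value per neighborhood/term pair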
@@ -28,7 +28,7 @@ def compute_permutation_test(
  neighborhoods (np.ndarray): Binary matrix representing neighborhoods.
  annotations (np.ndarray): Binary matrix representing annotations.
  score_metric (str, optional): Metric to use for scoring ('sum', 'mean', etc.). Defaults to "sum".
- null_distribution (str, optional): Type of null distribution ('network' or other). Defaults to "network".
+ null_distribution (str, optional): Type of null distribution ('network' or 'annotations'). Defaults to "network".
  num_permutations (int, optional): Number of permutations to run. Defaults to 1000.
  random_seed (int, optional): Seed for random number generation. Defaults to 888.
  max_workers (int, optional): Number of workers for multiprocessing. Defaults to 1.
@@ -78,7 +78,7 @@ def _run_permutation_test(
  neighborhoods (np.ndarray): The neighborhood matrix.
  annotations (np.ndarray): The annotation matrix.
  neighborhood_score_func (Callable): Function to calculate neighborhood scores.
- null_distribution (str, optional): Type of null distribution. Defaults to "network".
+ null_distribution (str, optional): Type of null distribution ('network' or 'annotations'). Defaults to "network".
  num_permutations (int, optional): Number of permutations. Defaults to 1000.
  random_seed (int, optional): Seed for random number generation. Defaults to 888.
  max_workers (int, optional): Number of workers for multiprocessing. Defaults to 4.
@@ -91,8 +91,12 @@ def _run_permutation_test(
  # Determine the indices to use based on the null distribution type
  if null_distribution == "network":
  idxs = range(annotations.shape[0])
- else:
+ elif null_distribution == "annotations":
  idxs = np.nonzero(np.sum(~np.isnan(annotations), axis=1))[0]
+ else:
+ raise ValueError(
+ "Invalid null_distribution value. Choose either 'network' or 'annotations'."
+ )

  # Replace NaNs with zeros in the annotations matrix
  annotations[np.isnan(annotations)] = 0
@@ -0,0 +1,47 @@
+ """
+ risk/stats/poisson
+ ~~~~~~~~~~~~~~~~~~
+ """
+
+ from typing import Dict, Any
+
+ import numpy as np
+ from scipy.stats import poisson
+
+
+ def compute_poisson_test(
+ neighborhoods: np.ndarray, annotations: np.ndarray, null_distribution: str = "network"
+ ) -> Dict[str, Any]:
+ """Compute Poisson test for enrichment and depletion in neighborhoods with selectable null distribution.
+
+ Args:
+ neighborhoods (np.ndarray): Binary matrix representing neighborhoods.
+ annotations (np.ndarray): Binary matrix representing annotations.
+ null_distribution (str, optional): Type of null distribution ('network' or 'annotations'). Defaults to "network".
+
+ Returns:
+ dict: Dictionary containing depletion and enrichment p-values.
+ """
+ # Ensure both matrices are binary (presence/absence)
+ neighborhoods = (neighborhoods > 0).astype(int)
+ annotations = (annotations > 0).astype(int)
+ # Matrix multiplication to get the number of annotated nodes in each neighborhood
+ annotated_in_neighborhood = neighborhoods @ annotations
+
+ # Compute lambda_expected based on the chosen null distribution
+ if null_distribution == "network":
+ # Use the mean across neighborhoods (axis=1)
+ lambda_expected = np.mean(annotated_in_neighborhood, axis=1, keepdims=True)
+ elif null_distribution == "annotations":
+ # Use the mean across annotations (axis=0)
+ lambda_expected = np.mean(annotated_in_neighborhood, axis=0, keepdims=True)
+ else:
+ raise ValueError(
+ "Invalid null_distribution value. Choose either 'network' or 'annotations'."
+ )
+
+ # Compute p-values for enrichment and depletion using Poisson distribution
+ enrichment_pvals = 1 - poisson.cdf(annotated_in_neighborhood - 1, lambda_expected)
+ depletion_pvals = poisson.cdf(annotated_in_neighborhood, lambda_expected)
+
+ return {"enrichment_pvals": enrichment_pvals, "depletion_pvals": depletion_pvals}
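The Poisson test can be exercised the same way (toy matrices again; in the package both inputs are the binary matrices produced upstream):

import numpy as np
from risk.stats import compute_poisson_test

neighborhoods = np.array([[1, 1, 0],
                          [1, 1, 0],
                          [0, 0, 1]])
annotations = np.array([[1, 0],
                        [1, 0],
                        [0, 1]])
result = compute_poisson_test(neighborhoods, annotations, null_distribution="network")
print(result["enrichment_pvals"].shape)  # (3, 2): rows follow the neighborhood matrix, columns the annotation terms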
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: risk-network
- Version: 0.0.7b4
+ Version: 0.0.7b6
  Summary: A Python package for biological network analysis
  Author: Ira Horecka
  Author-email: Ira Horecka <ira89@icloud.com>
@@ -22,8 +22,8 @@ risk/network/graph.py
  risk/network/io.py
  risk/network/plot.py
  risk/stats/__init__.py
- risk/stats/fisher_exact.py
  risk/stats/hypergeom.py
+ risk/stats/poisson.py
  risk/stats/stats.py
  risk/stats/permutation/__init__.py
  risk/stats/permutation/permutation.py
@@ -1,132 +0,0 @@
- """
- risk/stats/fisher_exact
- ~~~~~~~~~~~~~~~~~~~~~~~
- """
-
- from multiprocessing import get_context, Manager
- from tqdm import tqdm
- from typing import Any, Dict
-
- import numpy as np
- from scipy.stats import fisher_exact
-
-
- def compute_fisher_exact_test(
- neighborhoods: np.ndarray,
- annotations: np.ndarray,
- max_workers: int = 4,
- ) -> Dict[str, Any]:
- """Compute Fisher's exact test for enrichment and depletion in neighborhoods.
-
- Args:
- neighborhoods (np.ndarray): Binary matrix representing neighborhoods.
- annotations (np.ndarray): Binary matrix representing annotations.
- max_workers (int, optional): Number of workers for multiprocessing. Defaults to 4.
-
- Returns:
- dict: Dictionary containing depletion and enrichment p-values.
- """
- # Ensure that the matrices are binary (boolean) and free of NaN values
- neighborhoods = neighborhoods.astype(bool) # Convert to boolean
- annotations = annotations.astype(bool) # Convert to boolean
-
- # Initialize the process of calculating p-values using multiprocessing
- ctx = get_context("spawn")
- manager = Manager()
- progress_counter = manager.Value("i", 0)
- total_tasks = neighborhoods.shape[1] * annotations.shape[1]
-
- # Calculate the workload per worker
- chunk_size = total_tasks // max_workers
- remainder = total_tasks % max_workers
-
- # Execute the Fisher's exact test using multiprocessing
- with ctx.Pool(max_workers) as pool:
- with tqdm(total=total_tasks, desc="Total progress", position=0) as progress:
- params_list = []
- start_idx = 0
- for i in range(max_workers):
- end_idx = start_idx + chunk_size + (1 if i < remainder else 0)
- params_list.append(
- (neighborhoods, annotations, start_idx, end_idx, progress_counter)
- )
- start_idx = end_idx
-
- # Start the Fisher's exact test process in parallel
- results = pool.starmap_async(_fisher_exact_process_subset, params_list, chunksize=1)
-
- # Update progress bar based on progress_counter
- while not results.ready():
- progress.update(progress_counter.value - progress.n)
- results.wait(0.05) # Wait for 50ms
- # Ensure progress bar reaches 100%
- progress.update(total_tasks - progress.n)
-
- # Accumulate results from each worker
- depletion_pvals, enrichment_pvals = [], []
- for dp, ep in results.get():
- depletion_pvals.extend(dp)
- enrichment_pvals.extend(ep)
-
- # Reshape the results back into arrays with the appropriate dimensions
- depletion_pvals = np.array(depletion_pvals).reshape(
- neighborhoods.shape[1], annotations.shape[1]
- )
- enrichment_pvals = np.array(enrichment_pvals).reshape(
- neighborhoods.shape[1], annotations.shape[1]
- )
-
- return {
- "depletion_pvals": depletion_pvals,
- "enrichment_pvals": enrichment_pvals,
- }
-
-
- def _fisher_exact_process_subset(
- neighborhoods: np.ndarray,
- annotations: np.ndarray,
- start_idx: int,
- end_idx: int,
- progress_counter,
- ) -> tuple:
- """Process a subset of neighborhoods using Fisher's exact test.
-
- Args:
- neighborhoods (np.ndarray): The full neighborhood matrix.
- annotations (np.ndarray): The annotation matrix.
- start_idx (int): Starting index of the neighborhood-annotation pairs to process.
- end_idx (int): Ending index of the neighborhood-annotation pairs to process.
- progress_counter: Shared counter for tracking progress.
-
- Returns:
- tuple: Local p-values for depletion and enrichment.
- """
- # Initialize lists to store p-values for depletion and enrichment
- depletion_pvals = []
- enrichment_pvals = []
- # Process the subset of tasks assigned to this worker
- for idx in range(start_idx, end_idx):
- i = idx // annotations.shape[1] # Neighborhood index
- j = idx % annotations.shape[1] # Annotation index
-
- neighborhood = neighborhoods[:, i]
- annotation = annotations[:, j]
-
- # Calculate the contingency table values
- TP = np.sum(neighborhood & annotation)
- FP = np.sum(neighborhood & ~annotation)
- FN = np.sum(~neighborhood & annotation)
- TN = np.sum(~neighborhood & ~annotation)
- table = np.array([[TP, FP], [FN, TN]])
-
- # Perform Fisher's exact test for depletion (alternative='less')
- _, p_value_depletion = fisher_exact(table, alternative="less")
- depletion_pvals.append(p_value_depletion)
- # Perform Fisher's exact test for enrichment (alternative='greater')
- _, p_value_enrichment = fisher_exact(table, alternative="greater")
- enrichment_pvals.append(p_value_enrichment)
-
- # Update the shared progress counter
- progress_counter.value += 1
-
- return depletion_pvals, enrichment_pvals
@@ -1,131 +0,0 @@
- """
- risk/stats/hypergeom
- ~~~~~~~~~~~~~~~~~~~~
- """
-
- from multiprocessing import get_context, Manager
- from tqdm import tqdm
- from typing import Any, Dict
-
- import numpy as np
- from scipy.stats import hypergeom
-
-
- def compute_hypergeom_test(
- neighborhoods: np.ndarray,
- annotations: np.ndarray,
- max_workers: int = 4,
- ) -> Dict[str, Any]:
- """Compute hypergeometric test for enrichment and depletion in neighborhoods.
-
- Args:
- neighborhoods (np.ndarray): Binary matrix representing neighborhoods.
- annotations (np.ndarray): Binary matrix representing annotations.
- max_workers (int, optional): Number of workers for multiprocessing. Defaults to 4.
-
- Returns:
- dict: Dictionary containing depletion and enrichment p-values.
- """
- # Ensure that the matrices are binary (boolean) and free of NaN values
- neighborhoods = neighborhoods.astype(bool) # Convert to boolean
- annotations = annotations.astype(bool) # Convert to boolean
-
- # Initialize the process of calculating p-values using multiprocessing
- ctx = get_context("spawn")
- manager = Manager()
- progress_counter = manager.Value("i", 0)
- total_tasks = neighborhoods.shape[1] * annotations.shape[1]
-
- # Calculate the workload per worker
- chunk_size = total_tasks // max_workers
- remainder = total_tasks % max_workers
-
- # Execute the hypergeometric test using multiprocessing
- with ctx.Pool(max_workers) as pool:
- with tqdm(total=total_tasks, desc="Total progress", position=0) as progress:
- params_list = []
- start_idx = 0
- for i in range(max_workers):
- end_idx = start_idx + chunk_size + (1 if i < remainder else 0)
- params_list.append(
- (neighborhoods, annotations, start_idx, end_idx, progress_counter)
- )
- start_idx = end_idx
-
- # Start the hypergeometric test process in parallel
- results = pool.starmap_async(_hypergeom_process_subset, params_list, chunksize=1)
-
- # Update progress bar based on progress_counter
- while not results.ready():
- progress.update(progress_counter.value - progress.n)
- results.wait(0.05) # Wait for 50ms
- # Ensure progress bar reaches 100%
- progress.update(total_tasks - progress.n)
-
- # Accumulate results from each worker
- depletion_pvals, enrichment_pvals = [], []
- for dp, ep in results.get():
- depletion_pvals.extend(dp)
- enrichment_pvals.extend(ep)
-
- # Reshape the results back into arrays with the appropriate dimensions
- depletion_pvals = np.array(depletion_pvals).reshape(
- neighborhoods.shape[1], annotations.shape[1]
- )
- enrichment_pvals = np.array(enrichment_pvals).reshape(
- neighborhoods.shape[1], annotations.shape[1]
- )
-
- return {
- "depletion_pvals": depletion_pvals,
- "enrichment_pvals": enrichment_pvals,
- }
-
-
- def _hypergeom_process_subset(
- neighborhoods: np.ndarray,
- annotations: np.ndarray,
- start_idx: int,
- end_idx: int,
- progress_counter,
- ) -> tuple:
- """Process a subset of neighborhoods using the hypergeometric test.
-
- Args:
- neighborhoods (np.ndarray): The full neighborhood matrix.
- annotations (np.ndarray): The annotation matrix.
- start_idx (int): Starting index of the neighborhood-annotation pairs to process.
- end_idx (int): Ending index of the neighborhood-annotation pairs to process.
- progress_counter: Shared counter for tracking progress.
-
- Returns:
- tuple: Local p-values for depletion and enrichment.
- """
- # Initialize lists to store p-values for depletion and enrichment
- depletion_pvals = []
- enrichment_pvals = []
- # Process the subset of tasks assigned to this worker
- for idx in range(start_idx, end_idx):
- i = idx // annotations.shape[1] # Neighborhood index
- j = idx % annotations.shape[1] # Annotation index
-
- neighborhood = neighborhoods[:, i]
- annotation = annotations[:, j]
-
- # Calculate the required values for the hypergeometric test
- M = annotations.shape[0] # Total number of items (population size)
- n = np.sum(annotation) # Total number of successes in population
- N = np.sum(neighborhood) # Total number of draws (sample size)
- k = np.sum(neighborhood & annotation) # Number of successes in sample
-
- # Perform hypergeometric test for depletion
- p_value_depletion = hypergeom.cdf(k, M, n, N)
- depletion_pvals.append(p_value_depletion)
- # Perform hypergeometric test for enrichment
- p_value_enrichment = hypergeom.sf(k - 1, M, n, N)
- enrichment_pvals.append(p_value_enrichment)
-
- # Update the shared progress counter
- progress_counter.value += 1
-
- return depletion_pvals, enrichment_pvals