risk-network 0.0.12b0__py3-none-any.whl → 0.0.12b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. risk/__init__.py +1 -1
  2. risk/annotations/__init__.py +10 -0
  3. risk/annotations/annotations.py +354 -0
  4. risk/annotations/io.py +241 -0
  5. risk/annotations/nltk_setup.py +86 -0
  6. risk/log/__init__.py +11 -0
  7. risk/log/console.py +141 -0
  8. risk/log/parameters.py +171 -0
  9. risk/neighborhoods/__init__.py +7 -0
  10. risk/neighborhoods/api.py +442 -0
  11. risk/neighborhoods/community.py +441 -0
  12. risk/neighborhoods/domains.py +360 -0
  13. risk/neighborhoods/neighborhoods.py +514 -0
  14. risk/neighborhoods/stats/__init__.py +13 -0
  15. risk/neighborhoods/stats/permutation/__init__.py +6 -0
  16. risk/neighborhoods/stats/permutation/permutation.py +240 -0
  17. risk/neighborhoods/stats/permutation/test_functions.py +70 -0
  18. risk/neighborhoods/stats/tests.py +275 -0
  19. risk/network/__init__.py +4 -0
  20. risk/network/graph/__init__.py +4 -0
  21. risk/network/graph/api.py +200 -0
  22. risk/network/graph/graph.py +274 -0
  23. risk/network/graph/stats.py +166 -0
  24. risk/network/graph/summary.py +253 -0
  25. risk/network/io.py +693 -0
  26. risk/network/plotter/__init__.py +4 -0
  27. risk/network/plotter/api.py +54 -0
  28. risk/network/plotter/canvas.py +291 -0
  29. risk/network/plotter/contour.py +329 -0
  30. risk/network/plotter/labels.py +935 -0
  31. risk/network/plotter/network.py +294 -0
  32. risk/network/plotter/plotter.py +141 -0
  33. risk/network/plotter/utils/colors.py +419 -0
  34. risk/network/plotter/utils/layout.py +94 -0
  35. risk_network-0.0.12b2.dist-info/METADATA +122 -0
  36. risk_network-0.0.12b2.dist-info/RECORD +40 -0
  37. {risk_network-0.0.12b0.dist-info → risk_network-0.0.12b2.dist-info}/WHEEL +1 -1
  38. risk_network-0.0.12b0.dist-info/METADATA +0 -796
  39. risk_network-0.0.12b0.dist-info/RECORD +0 -7
  40. {risk_network-0.0.12b0.dist-info → risk_network-0.0.12b2.dist-info}/licenses/LICENSE +0 -0
  41. {risk_network-0.0.12b0.dist-info → risk_network-0.0.12b2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,200 @@
1
+ """
2
+ risk/network/graph/api
3
+ ~~~~~~~~~~~~~~~~~~~~~~
4
+ """
5
+
6
+ import copy
7
+ from typing import Any, Dict, Union
8
+
9
+ import networkx as nx
10
+ import pandas as pd
11
+
12
+ from risk.annotations import define_top_annotations
13
+ from risk.log import log_header, logger, params
14
+ from risk.neighborhoods import (
15
+ define_domains,
16
+ process_neighborhoods,
17
+ trim_domains,
18
+ )
19
+ from risk.network.graph.graph import Graph
20
+ from risk.network.graph.stats import calculate_significance_matrices
21
+
22
+
23
class GraphAPI:
    """Handles the loading of network graphs and associated data.

    The GraphAPI class provides methods to load and process network graphs, annotations, and neighborhoods.
    """

    def __init__(self) -> None:
        # Stateless: there is nothing to initialize. `self` must be accepted so the class can be
        # instantiated directly and so cooperative `super().__init__()` chains can pass through it.
        pass

    def load_graph(
        self,
        network: nx.Graph,
        annotations: Dict[str, Any],
        neighborhoods: Dict[str, Any],
        tail: str = "right",
        pval_cutoff: float = 0.01,
        fdr_cutoff: float = 0.9999,
        impute_depth: int = 0,
        prune_threshold: float = 0.0,
        linkage_criterion: str = "distance",
        linkage_method: str = "average",
        linkage_metric: str = "yule",
        linkage_threshold: Union[float, str] = 0.2,
        min_cluster_size: int = 5,
        max_cluster_size: int = 1000,
    ) -> Graph:
        """Load and process the network graph, defining top annotations and domains.

        Args:
            network (nx.Graph): The network graph.
            annotations (Dict[str, Any]): The annotations associated with the network.
            neighborhoods (Dict[str, Any]): Neighborhood significance data.
            tail (str, optional): Type of significance tail ("right", "left", "both"). Defaults to "right".
            pval_cutoff (float, optional): p-value cutoff for significance. Defaults to 0.01.
            fdr_cutoff (float, optional): FDR cutoff for significance. Defaults to 0.9999.
            impute_depth (int, optional): Depth for imputing neighbors. Defaults to 0.
            prune_threshold (float, optional): Distance threshold for pruning neighbors. Defaults to 0.0.
            linkage_criterion (str, optional): Clustering criterion for defining domains. Defaults to "distance".
            linkage_method (str, optional): Clustering method to use. Choose "auto" to optimize. Defaults to "average".
            linkage_metric (str, optional): Metric to use for calculating distances. Choose "auto" to optimize.
                Defaults to "yule".
            linkage_threshold (float, str, optional): Threshold for clustering. Choose "auto" to optimize.
                Defaults to 0.2.
            min_cluster_size (int, optional): Minimum size for clusters. Defaults to 5.
            max_cluster_size (int, optional): Maximum size for clusters. Defaults to 1000.

        Returns:
            Graph: A fully initialized and processed Graph object.
        """
        # Log the parameters and display headers
        log_header("Finding significant neighborhoods")
        params.log_graph(
            tail=tail,
            pval_cutoff=pval_cutoff,
            fdr_cutoff=fdr_cutoff,
            impute_depth=impute_depth,
            prune_threshold=prune_threshold,
            linkage_criterion=linkage_criterion,
            linkage_method=linkage_method,
            linkage_metric=linkage_metric,
            linkage_threshold=linkage_threshold,
            min_cluster_size=min_cluster_size,
            max_cluster_size=max_cluster_size,
        )

        # Make a copy of the network to avoid modifying the original
        # (Graph construction rewrites node coordinates in place)
        network = copy.deepcopy(network)

        # Any tail other than "right"/"left" is reported as "both" in the debug log
        tail_description = {"right": "enrichment", "left": "depletion"}.get(tail, "both")
        logger.debug(f"p-value cutoff: {pval_cutoff}")
        logger.debug(f"FDR BH cutoff: {fdr_cutoff}")
        logger.debug(f"Significance tail: '{tail}' ({tail_description})")
        # Calculate significant neighborhoods based on the provided parameters
        significant_neighborhoods = calculate_significance_matrices(
            neighborhoods["depletion_pvals"],
            neighborhoods["enrichment_pvals"],
            tail=tail,
            pval_cutoff=pval_cutoff,
            fdr_cutoff=fdr_cutoff,
        )

        log_header("Processing neighborhoods")
        # Process neighborhoods by imputing and pruning based on the given settings
        processed_neighborhoods = process_neighborhoods(
            network=network,
            neighborhoods=significant_neighborhoods,
            impute_depth=impute_depth,
            prune_threshold=prune_threshold,
        )

        log_header("Finding top annotations")
        logger.debug(f"Min cluster size: {min_cluster_size}")
        logger.debug(f"Max cluster size: {max_cluster_size}")
        # Define top annotations based on processed neighborhoods
        top_annotations = self._define_top_annotations(
            network=network,
            annotations=annotations,
            neighborhoods=processed_neighborhoods,
            min_cluster_size=min_cluster_size,
            max_cluster_size=max_cluster_size,
        )

        log_header("Optimizing distance threshold for domains")
        # Define domains in the network using the specified clustering settings
        domains = define_domains(
            top_annotations=top_annotations,
            significant_neighborhoods_significance=processed_neighborhoods[
                "significant_significance_matrix"
            ],
            linkage_criterion=linkage_criterion,
            linkage_method=linkage_method,
            linkage_metric=linkage_metric,
            linkage_threshold=linkage_threshold,
        )
        # Trim domains and top annotations based on cluster size constraints
        domains, trimmed_domains = trim_domains(
            domains=domains,
            top_annotations=top_annotations,
            min_cluster_size=min_cluster_size,
            max_cluster_size=max_cluster_size,
        )

        # Prepare node mapping and significance sums for the final Graph object;
        # node IDs are the positions of the labels in annotations["ordered_nodes"]
        ordered_nodes = annotations["ordered_nodes"]
        node_label_to_id = {label: node_id for node_id, label in enumerate(ordered_nodes)}
        node_significance_sums = processed_neighborhoods["node_significance_sums"]

        # Return the fully initialized Graph object
        # NOTE: the Graph receives the caller's original `neighborhoods`, not the processed ones
        return Graph(
            network=network,
            annotations=annotations,
            neighborhoods=neighborhoods,
            domains=domains,
            trimmed_domains=trimmed_domains,
            node_label_to_node_id_map=node_label_to_id,
            node_significance_sums=node_significance_sums,
        )

    def _define_top_annotations(
        self,
        network: nx.Graph,
        annotations: Dict[str, Any],
        neighborhoods: Dict[str, Any],
        min_cluster_size: int = 5,
        max_cluster_size: int = 1000,
    ) -> pd.DataFrame:
        """Define top annotations for the network.

        Args:
            network (nx.Graph): The network graph.
            annotations (Dict[str, Any]): Annotations data for the network.
            neighborhoods (Dict[str, Any]): Neighborhood significance data.
            min_cluster_size (int, optional): Minimum size for clusters. Defaults to 5.
            max_cluster_size (int, optional): Maximum size for clusters. Defaults to 1000.

        Returns:
            pd.DataFrame: Top annotations identified within the network, as returned by
                define_top_annotations.
        """
        # Unpack the pieces of the annotations/neighborhoods dictionaries and delegate.
        # NOTE: the "neighborhood_significance_counts" entry feeds the
        # `neighborhood_significance_sums` argument of the external function.
        return define_top_annotations(
            network=network,
            ordered_annotation_labels=annotations["ordered_annotations"],
            neighborhood_significance_sums=neighborhoods["neighborhood_significance_counts"],
            significant_significance_matrix=neighborhoods["significant_significance_matrix"],
            significant_binary_significance_matrix=neighborhoods[
                "significant_binary_significance_matrix"
            ],
            min_cluster_size=min_cluster_size,
            max_cluster_size=max_cluster_size,
        )
@@ -0,0 +1,274 @@
1
+ """
2
+ risk/network/graph/graph
3
+ ~~~~~~~~~~~~~~~~~~~~~~~~
4
+ """
5
+
6
+ from collections import defaultdict
7
+ from typing import Any, Dict, List
8
+
9
+ import networkx as nx
10
+ import numpy as np
11
+ import pandas as pd
12
+
13
+ from risk.network.graph.summary import Summary
14
+
15
+
16
class Graph:
    """A class to represent a network graph and process its nodes and edges.

    The Graph class provides functionality to handle and manipulate a network graph,
    including managing domains, annotations, and node significance data. It also includes methods
    for transforming and mapping graph coordinates, as well as generating colors based on node
    significance.
    """

    def __init__(
        self,
        network: nx.Graph,
        annotations: Dict[str, Any],
        neighborhoods: Dict[str, Any],
        domains: pd.DataFrame,
        trimmed_domains: pd.DataFrame,
        node_label_to_node_id_map: Dict[str, Any],
        node_significance_sums: np.ndarray,
    ):
        """Initialize the Graph object.

        Args:
            network (nx.Graph): The network graph.
            annotations (Dict[str, Any]): The annotations associated with the network.
            neighborhoods (Dict[str, Any]): Neighborhood significance data.
            domains (pd.DataFrame): DataFrame containing domain data for the network nodes.
            trimmed_domains (pd.DataFrame): DataFrame containing trimmed domain data for the network nodes.
            node_label_to_node_id_map (Dict[str, Any]): A dictionary mapping node labels to their corresponding IDs.
            node_significance_sums (np.ndarray): Array containing the significant sums for the nodes.
        """
        # Initialize self.network downstream of the other attributes
        # All public attributes can be accessed after initialization
        self.domain_id_to_node_ids_map = self._create_domain_id_to_node_ids_map(domains)
        self.domain_id_to_domain_terms_map = self._create_domain_id_to_domain_terms_map(
            trimmed_domains
        )
        self.domain_id_to_domain_info_map = self._create_domain_id_to_domain_info_map(
            trimmed_domains
        )
        self.node_id_to_domain_ids_and_significance_map = (
            self._create_node_id_to_domain_ids_and_significances(domains)
        )
        self.node_id_to_node_label_map = {v: k for k, v in node_label_to_node_id_map.items()}
        # Labels are paired with significance sums positionally (dict insertion order)
        self.node_label_to_significance_map = dict(
            zip(node_label_to_node_id_map.keys(), node_significance_sums)
        )
        self.node_significance_sums = node_significance_sums
        self.node_label_to_node_id_map = node_label_to_node_id_map

        # NOTE: Below this point, instance attributes (i.e., self) will be used!
        self.domain_id_to_node_labels_map = self._create_domain_id_to_node_labels_map()
        # Unfold the network's 3D coordinates to 2D and extract node coordinates
        self.network = self._unfold_sphere_to_plane(network)
        self.node_coordinates = self._extract_node_coordinates(self.network)

        # NOTE: Only after the above attributes are initialized, we can create the summary
        self.summary = Summary(annotations, neighborhoods, self)

    def pop(self, domain_id: int) -> List[str]:
        """Remove a domain ID from the graph and return the corresponding node labels.

        Args:
            domain_id (int): The domain ID to be removed from each mapping.

        Returns:
            List[str]: A list of node labels associated with the domain ID (empty if the
                domain ID is unknown).
        """
        # Get the node labels associated with the domain ID before the mappings are altered
        node_labels = self.domain_id_to_node_labels_map.get(domain_id, [])

        # Remove the specified domain_id key from each domain-keyed mapping if it exists
        for mapping in (
            self.domain_id_to_node_ids_map,
            self.domain_id_to_domain_terms_map,
            self.domain_id_to_domain_info_map,
            self.domain_id_to_node_labels_map,
        ):
            mapping.pop(domain_id, None)

        # Remove the domain_id from the per-node domain lists and significance dictionaries
        for domain_info in self.node_id_to_domain_ids_and_significance_map.values():
            if domain_id in domain_info["domains"]:
                domain_info["domains"].remove(domain_id)
                domain_info["significances"].pop(domain_id, None)

        return node_labels

    def _create_domain_id_to_node_ids_map(self, domains: pd.DataFrame) -> Dict[int, Any]:
        """Create a mapping from domains to the list of node IDs belonging to each domain.

        Args:
            domains (pd.DataFrame): DataFrame containing domain information, including the
                'primary_domain' column; its index holds the node IDs.

        Returns:
            Dict[int, Any]: A dictionary (defaultdict of lists) where keys are domain IDs and values
                are lists of node IDs belonging to each domain.
        """
        # Read the node -> primary domain pairs straight from the column; this does not depend on
        # the DataFrame index being unnamed
        node_to_domain_map = domains["primary_domain"].to_dict()
        domain_id_to_node_ids_map = defaultdict(list)
        for node_id, domain_id in node_to_domain_map.items():
            domain_id_to_node_ids_map[domain_id].append(node_id)

        return domain_id_to_node_ids_map

    def _create_domain_id_to_domain_terms_map(
        self, trimmed_domains: pd.DataFrame
    ) -> Dict[int, Any]:
        """Create a mapping from domain IDs to their corresponding terms.

        Args:
            trimmed_domains (pd.DataFrame): DataFrame indexed by domain ID with a
                'normalized_description' column.

        Returns:
            Dict[int, Any]: A dictionary mapping domain IDs to their corresponding terms.
        """
        return dict(
            zip(
                trimmed_domains.index,
                trimmed_domains["normalized_description"],
            )
        )

    def _create_domain_id_to_domain_info_map(
        self,
        trimmed_domains: pd.DataFrame,
    ) -> Dict[int, Dict[str, Any]]:
        """Create a mapping from domain IDs to their corresponding full description and significance score,
        with scores sorted in descending order.

        Args:
            trimmed_domains (pd.DataFrame): DataFrame containing domain IDs, full descriptions, and significance scores.

        Returns:
            Dict[int, Dict[str, Any]]: A dictionary mapping domain IDs (int) to a dictionary with 'full_descriptions' and
                'significance_scores', both sorted by significance score in descending order. A domain with no
                descriptions maps to two empty lists.
        """
        # Initialize an empty dictionary to store full descriptions and significance scores of domains
        domain_info_map = {}
        # Domain IDs are the index of the DataFrame (it's common for some IDs to be missing)
        for domain_id in trimmed_domains.index:
            # Sort full_descriptions and significance_scores by significance_scores in descending order
            descriptions_and_scores = sorted(
                zip(
                    trimmed_domains.at[domain_id, "full_descriptions"],
                    trimmed_domains.at[domain_id, "significance_scores"],
                ),
                key=lambda pair: pair[1],  # Sort by significance score
                reverse=True,  # Descending order
            )
            # Unzip the sorted tuples back into separate sequences; unpacking zip(*[]) would raise
            # ValueError, so an empty domain is handled explicitly
            if descriptions_and_scores:
                sorted_descriptions, sorted_scores = zip(*descriptions_and_scores)
            else:
                sorted_descriptions, sorted_scores = (), ()
            # Assign to the domain info map
            domain_info_map[int(domain_id)] = {
                "full_descriptions": list(sorted_descriptions),
                "significance_scores": list(sorted_scores),
            }

        return domain_info_map

    def _create_node_id_to_domain_ids_and_significances(
        self, domains: pd.DataFrame
    ) -> Dict[int, Dict]:
        """Creates a dictionary mapping each node ID to its corresponding domain IDs and significance values.

        Args:
            domains (pd.DataFrame): A DataFrame containing domain information for each node. Assumes the last
                two columns are 'all domains' and 'primary domain', which are excluded from processing.

        Returns:
            Dict[int, Dict]: A dictionary where the key is the node ID (index of the DataFrame), and the value is another dictionary
                with 'domains' (a list of domain IDs with significance greater than 0) and 'significances'
                (a dict of domain IDs and their corresponding significance values).
        """
        # Initialize an empty dictionary to store the result
        node_id_to_domain_ids_and_significances = {}
        # Get the list of domain columns; the last two columns are 'all domains' and 'primary domain'
        domain_columns = domains.columns[:-2]
        # Iterate over each row in the dataframe
        for idx, row in domains.iterrows():
            # Get the domains (column names) where the significance score is greater than 0
            all_domains = domain_columns[row[domain_columns] > 0].tolist()
            # Get the significance values for those domains
            significance_values = row[all_domains].to_dict()
            # Store the result in the dictionary with index as the key
            node_id_to_domain_ids_and_significances[idx] = {
                "domains": all_domains,  # The column names where significance > 0
                "significances": significance_values,  # The actual significance values for those columns
            }

        return node_id_to_domain_ids_and_significances

    def _create_domain_id_to_node_labels_map(self) -> Dict[int, List[str]]:
        """Create a map from domain IDs to node labels.

        Relies on `self.domain_id_to_node_ids_map` and `self.node_id_to_node_label_map`
        having been set already.

        Returns:
            Dict[int, List[str]]: A dictionary mapping domain IDs to the corresponding node labels.
        """
        return {
            domain_id: [self.node_id_to_node_label_map[node_id] for node_id in node_ids]
            for domain_id, node_ids in self.domain_id_to_node_ids_map.items()
        }

    def _unfold_sphere_to_plane(self, G: nx.Graph) -> nx.Graph:
        """Convert 3D coordinates to 2D by unfolding a sphere to a plane.

        Nodes without a 'z' attribute are left untouched. The graph is modified in place.

        Args:
            G (nx.Graph): A network graph with 3D coordinates. Each node should have 'x', 'y', and 'z' attributes.

        Returns:
            nx.Graph: The same network graph with updated 2D coordinates (only 'x' and 'y').
        """
        for node in G.nodes():
            attrs = G.nodes[node]
            if "z" not in attrs:
                continue

            # Extract 3D coordinates
            x, y, z = attrs["x"], attrs["y"], attrs["z"]
            # Calculate spherical coordinates theta and phi from Cartesian coordinates
            r = np.sqrt(x**2 + y**2 + z**2)
            theta = np.arctan2(y, x)
            # A node at the origin has no direction: place it on the equator instead of dividing
            # by zero. Clipping guards arccos against ratios marginally outside [-1, 1].
            cos_phi = 0.0 if r == 0 else np.clip(z / r, -1.0, 1.0)
            phi = np.arccos(cos_phi)

            # Convert spherical coordinates to 2D plane coordinates
            unfolded_x = (theta + np.pi) / (2 * np.pi)  # Shift and normalize theta to [0, 1]
            # Rotate by half a turn, wrapping around within [0, 1]
            unfolded_x = unfolded_x + 0.5 if unfolded_x < 0.5 else unfolded_x - 0.5
            unfolded_y = (np.pi - phi) / np.pi  # Reflect phi and normalize to [0, 1]
            # Update network node attributes (y is stored negated)
            attrs["x"] = unfolded_x
            attrs["y"] = -unfolded_y
            # Remove the 'z' coordinate as it's no longer needed
            del attrs["z"]

        return G

    def _extract_node_coordinates(self, G: nx.Graph) -> np.ndarray:
        """Extract 2D coordinates of nodes from the graph.

        Args:
            G (nx.Graph): The network graph with node coordinates.

        Returns:
            np.ndarray: Array of node coordinates with shape (num_nodes, 2), in graph node order.
        """
        # Extract x and y coordinates from graph nodes
        x_coords = dict(G.nodes.data("x"))
        y_coords = dict(G.nodes.data("y"))
        # Combine x and y coordinates into a single array, one row per node
        node_positions = [np.array([x_coords[node], y_coords[node]]) for node in x_coords]
        return np.vstack(node_positions)
@@ -0,0 +1,166 @@
1
+ """
2
+ risk/network/graph/stats
3
+ ~~~~~~~~~~~~~~~~~~~~~~~~
4
+ """
5
+
6
+ from typing import Any, Dict, Union
7
+
8
+ import numpy as np
9
+ from statsmodels.stats.multitest import fdrcorrection
10
+
11
+
12
def calculate_significance_matrices(
    depletion_pvals: np.ndarray,
    enrichment_pvals: np.ndarray,
    tail: str = "right",
    pval_cutoff: float = 0.05,
    fdr_cutoff: float = 0.05,
) -> Dict[str, Any]:
    """Build the significance matrices for the requested tail from raw p-values.

    Args:
        depletion_pvals (np.ndarray): Matrix of depletion p-values.
        enrichment_pvals (np.ndarray): Matrix of enrichment p-values.
        tail (str, optional): The tail type for significance selection ('left', 'right', 'both'). Defaults to 'right'.
        pval_cutoff (float, optional): Cutoff for p-value significance. Defaults to 0.05.
        fdr_cutoff (float, optional): Cutoff for FDR significance; FDR correction is only applied when
            this is below 1.0. Defaults to 0.05.

    Returns:
        Dict[str, Any]: Dictionary with the keys 'significance_matrix', 'significant_significance_matrix'
            (significance values kept only where the binary matrix is 1, else 0) and
            'significant_binary_significance_matrix'.
    """
    use_fdr = fdr_cutoff < 1.0
    if not use_fdr:
        # No FDR correction requested: threshold on raw p-values and score with them directly
        depletion_thresholds = _compute_threshold_matrix(depletion_pvals, pval_cutoff=pval_cutoff)
        depletion_scores = depletion_pvals

        enrichment_thresholds = _compute_threshold_matrix(
            enrichment_pvals, pval_cutoff=pval_cutoff
        )
        enrichment_scores = enrichment_pvals
    else:
        # Row-wise FDR correction; element 1 of fdrcorrection's result holds the corrected p-values
        depletion_qvals = np.apply_along_axis(fdrcorrection, 1, depletion_pvals)[:, 1, :]
        depletion_thresholds = _compute_threshold_matrix(
            depletion_pvals, depletion_qvals, pval_cutoff=pval_cutoff, fdr_cutoff=fdr_cutoff
        )
        # Score combines q-values and p-values
        depletion_scores = (depletion_qvals**2) * (depletion_pvals**0.5)

        enrichment_qvals = np.apply_along_axis(fdrcorrection, 1, enrichment_pvals)[:, 1, :]
        enrichment_thresholds = _compute_threshold_matrix(
            enrichment_pvals, enrichment_qvals, pval_cutoff=pval_cutoff, fdr_cutoff=fdr_cutoff
        )
        enrichment_scores = (enrichment_pvals**0.5) * (enrichment_qvals**2)

    # Negative log10 transform so that smaller p-values yield larger scores
    neg_log_depletion = -np.log10(depletion_scores)
    neg_log_enrichment = -np.log10(enrichment_scores)

    # Pick the matrices that correspond to the requested tail
    significance_matrix, binary_matrix = _select_significance_matrices(
        tail,
        neg_log_depletion,
        depletion_thresholds,
        neg_log_enrichment,
        enrichment_thresholds,
    )

    # Zero out every entry that did not pass the significance thresholds
    masked_significance = np.where(binary_matrix == 1, significance_matrix, 0)

    return {
        "significance_matrix": significance_matrix,
        "significant_significance_matrix": masked_significance,
        "significant_binary_significance_matrix": binary_matrix,
    }
83
+
84
+
85
+ def _select_significance_matrices(
86
+ tail: str,
87
+ log_depletion_matrix: np.ndarray,
88
+ depletion_alpha_threshold_matrix: np.ndarray,
89
+ log_enrichment_matrix: np.ndarray,
90
+ enrichment_alpha_threshold_matrix: np.ndarray,
91
+ ) -> tuple:
92
+ """Select significance matrices based on the specified tail type.
93
+
94
+ Args:
95
+ tail (str): The tail type for significance selection. Options are 'left', 'right', or 'both'.
96
+ log_depletion_matrix (np.ndarray): Matrix of log-transformed depletion values.
97
+ depletion_alpha_threshold_matrix (np.ndarray): Alpha threshold matrix for depletion significance.
98
+ log_enrichment_matrix (np.ndarray): Matrix of log-transformed enrichment values.
99
+ enrichment_alpha_threshold_matrix (np.ndarray): Alpha threshold matrix for enrichment significance.
100
+
101
+ Returns:
102
+ tuple: A tuple containing the selected enrichment matrix and binary significance matrix.
103
+
104
+ Raises:
105
+ ValueError: If the provided tail type is not 'left', 'right', or 'both'.
106
+ """
107
+ if tail not in {"left", "right", "both"}:
108
+ raise ValueError("Invalid value for 'tail'. Must be 'left', 'right', or 'both'.")
109
+
110
+ if tail == "left":
111
+ # Select depletion matrix and corresponding alpha threshold for left-tail analysis
112
+ significance_matrix = -log_depletion_matrix
113
+ alpha_threshold_matrix = depletion_alpha_threshold_matrix
114
+ elif tail == "right":
115
+ # Select enrichment matrix and corresponding alpha threshold for right-tail analysis
116
+ significance_matrix = log_enrichment_matrix
117
+ alpha_threshold_matrix = enrichment_alpha_threshold_matrix
118
+ elif tail == "both":
119
+ # Select the matrix with the highest absolute values while preserving the sign
120
+ significance_matrix = np.where(
121
+ np.abs(log_depletion_matrix) >= np.abs(log_enrichment_matrix),
122
+ -log_depletion_matrix,
123
+ log_enrichment_matrix,
124
+ )
125
+ # Combine alpha thresholds using a logical OR operation
126
+ alpha_threshold_matrix = np.logical_or(
127
+ depletion_alpha_threshold_matrix, enrichment_alpha_threshold_matrix
128
+ )
129
+ else:
130
+ raise ValueError("Invalid value for 'tail'. Must be 'left', 'right', or 'both'.")
131
+
132
+ # Create a binary significance matrix where valid indices meet the alpha threshold
133
+ valid_idxs = ~np.isnan(alpha_threshold_matrix)
134
+ significant_binary_significance_matrix = np.zeros(alpha_threshold_matrix.shape)
135
+ significant_binary_significance_matrix[valid_idxs] = alpha_threshold_matrix[valid_idxs]
136
+
137
+ return significance_matrix, significant_binary_significance_matrix
138
+
139
+
140
+ def _compute_threshold_matrix(
141
+ pvals: np.ndarray,
142
+ fdr_pvals: Union[np.ndarray, None] = None,
143
+ pval_cutoff: float = 0.05,
144
+ fdr_cutoff: float = 0.05,
145
+ ) -> np.ndarray:
146
+ """Compute a threshold matrix indicating significance based on p-value and FDR cutoffs.
147
+
148
+ Args:
149
+ pvals (np.ndarray): Array of p-values for statistical tests.
150
+ fdr_pvals (np.ndarray, optional): Array of FDR-corrected p-values corresponding to the p-values. Defaults to None.
151
+ pval_cutoff (float, optional): Cutoff for p-value significance. Defaults to 0.05.
152
+ fdr_cutoff (float, optional): Cutoff for FDR significance. Defaults to 0.05.
153
+
154
+ Returns:
155
+ np.ndarray: A threshold matrix where 1 indicates significance based on the provided cutoffs, 0 otherwise.
156
+ """
157
+ if fdr_pvals is not None:
158
+ # Compute the threshold matrix based on both p-value and FDR cutoffs
159
+ pval_below_cutoff = pvals <= pval_cutoff
160
+ fdr_below_cutoff = fdr_pvals <= fdr_cutoff
161
+ threshold_matrix = np.logical_and(pval_below_cutoff, fdr_below_cutoff).astype(int)
162
+ else:
163
+ # Compute the threshold matrix based only on p-value cutoff
164
+ threshold_matrix = (pvals <= pval_cutoff).astype(int)
165
+
166
+ return threshold_matrix