risk-network 0.0.6b10__py3-none-any.whl → 0.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- risk/__init__.py +1 -1
- risk/annotations/annotations.py +61 -42
- risk/annotations/io.py +14 -14
- risk/log/__init__.py +1 -1
- risk/log/config.py +139 -0
- risk/log/params.py +4 -4
- risk/neighborhoods/community.py +25 -36
- risk/neighborhoods/domains.py +29 -27
- risk/neighborhoods/neighborhoods.py +171 -72
- risk/network/graph.py +92 -41
- risk/network/io.py +22 -26
- risk/network/plot.py +132 -19
- risk/risk.py +81 -78
- risk/stats/__init__.py +2 -2
- risk/stats/hypergeom.py +30 -107
- risk/stats/permutation/permutation.py +23 -17
- risk/stats/permutation/test_functions.py +2 -2
- risk/stats/poisson.py +44 -0
- {risk_network-0.0.6b10.dist-info → risk_network-0.0.7.dist-info}/METADATA +1 -1
- risk_network-0.0.7.dist-info/RECORD +30 -0
- risk/log/console.py +0 -16
- risk/stats/fisher_exact.py +0 -132
- risk_network-0.0.6b10.dist-info/RECORD +0 -30
- {risk_network-0.0.6b10.dist-info → risk_network-0.0.7.dist-info}/LICENSE +0 -0
- {risk_network-0.0.6b10.dist-info → risk_network-0.0.7.dist-info}/WHEEL +0 -0
- {risk_network-0.0.6b10.dist-info → risk_network-0.0.7.dist-info}/top_level.txt +0 -0
risk/neighborhoods/domains.py
CHANGED
@@ -4,6 +4,7 @@ risk/neighborhoods/domains
 """
 
 from contextlib import suppress
+from itertools import product
 from tqdm import tqdm
 from typing import Tuple
 
@@ -14,6 +15,7 @@ from sklearn.metrics import silhouette_score
 
 from risk.annotations import get_description
 from risk.constants import GROUP_LINKAGE_METHODS, GROUP_DISTANCE_METRICS
+from risk.log import logger
 
 
 def define_domains(
@@ -23,7 +25,8 @@ def define_domains(
     linkage_method: str,
     linkage_metric: str,
 ) -> pd.DataFrame:
-    """Define domains and assign nodes to these domains based on their enrichment scores and clustering
+    """Define domains and assign nodes to these domains based on their enrichment scores and clustering,
+    handling errors by assigning unique domains when clustering fails.
 
     Args:
         top_annotations (pd.DataFrame): DataFrame of top annotations data for the network nodes.
@@ -35,31 +38,31 @@ def define_domains(
     Returns:
         pd.DataFrame: DataFrame with the primary domain for each node.
     """
-
-
-        print("Single annotation detected. Skipping clustering.")
-        top_annotations["domain"] = 1  # Assign a default domain or handle appropriately
-    else:
-        # Perform hierarchical clustering on the binary enrichment matrix
+    try:
+        # Transpose the matrix to cluster annotations
         m = significant_neighborhoods_enrichment[:, top_annotations["top attributes"]].T
         best_linkage, best_metric, best_threshold = _optimize_silhouette_across_linkage_and_metrics(
             m, linkage_criterion, linkage_method, linkage_metric
         )
-
-
-
-            raise ValueError("No significant annotations found.") from e
-
-        print(
+        # Perform hierarchical clustering
+        Z = linkage(m, method=best_linkage, metric=best_metric)
+        logger.warning(
             f"Linkage criterion: '{linkage_criterion}'\nLinkage method: '{best_linkage}'\nLinkage metric: '{best_metric}'"
         )
-
-
+        logger.debug(f"Optimal linkage threshold: {round(best_threshold, 3)}")
+        # Calculate the optimal threshold for clustering
         max_d_optimal = np.max(Z[:, 2]) * best_threshold
-        domains = fcluster(Z, max_d_optimal, criterion=linkage_criterion)
         # Assign domains to the annotations matrix
+        domains = fcluster(Z, max_d_optimal, criterion=linkage_criterion)
         top_annotations["domain"] = 0
         top_annotations.loc[top_annotations["top attributes"], "domain"] = domains
+    except ValueError:
+        # If a ValueError is encountered, handle it by assigning unique domains
+        n_rows = len(top_annotations)
+        logger.error(
+            f"Error encountered. Skipping clustering and assigning {n_rows} unique domains."
+        )
+        top_annotations["domain"] = range(1, n_rows + 1)  # Assign unique domains
 
     # Create DataFrames to store domain information
     node_to_enrichment = pd.DataFrame(
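The new code cuts the dendrogram at a fraction of the largest merge distance. A minimal, self-contained sketch of that thresholding pattern; the toy matrix, the "average"/"euclidean" settings, and the 0.5 fraction are illustrative stand-ins, not the package's tuned values:

```python
# Hedged sketch of the fcluster thresholding used in define_domains.
import numpy as np
from scipy.cluster.hierarchy import fcluster, linkage

m = np.random.rand(10, 4)  # stand-in for the transposed enrichment matrix
Z = linkage(m, method="average", metric="euclidean")
# Cut the dendrogram at a fraction of the largest merge distance
max_d_optimal = np.max(Z[:, 2]) * 0.5
domains = fcluster(Z, max_d_optimal, criterion="distance")
print(domains)  # one cluster label per row of m
```

Here "distance" is one valid fcluster criterion; the package passes the caller-supplied `linkage_criterion` instead.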
@@ -166,21 +169,20 @@ def _optimize_silhouette_across_linkage_and_metrics(
     total_combinations = len(linkage_methods) * len(linkage_metrics)
 
     # Evaluating optimal linkage method and metric
-    for method in tqdm(
-        linkage_methods,
+    for method, metric in tqdm(
+        product(linkage_methods, linkage_metrics),
         desc="Evaluating optimal linkage method and metric",
         total=total_combinations,
         bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
     ):
-
-
-
-
-
-
-
-
-                    best_overall_metric = metric
+        with suppress(Exception):
+            Z = linkage(m, method=method, metric=metric)
+            threshold, score = _find_best_silhouette_score(Z, m, metric, linkage_criterion)
+            if score > best_overall_score:
+                best_overall_score = score
+                best_overall_threshold = threshold
+                best_overall_method = method
+                best_overall_metric = metric
 
     return best_overall_method, best_overall_metric, best_overall_threshold
 
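The rewritten loop flattens two nested loops into one pass over `itertools.product`, and `contextlib.suppress(Exception)` skips (method, metric) pairs that SciPy rejects. A runnable sketch of the same pattern, with a placeholder score standing in for the package's `_find_best_silhouette_score`:

```python
# Sketch of the product + suppress grid search over linkage settings.
from contextlib import suppress
from itertools import product

import numpy as np
from scipy.cluster.hierarchy import linkage

m = np.random.rand(12, 5)
best_method, best_metric, best_score = None, None, -np.inf
for method, metric in product(["average", "ward"], ["euclidean", "cosine"]):
    with suppress(Exception):  # e.g. 'ward' rejects non-Euclidean metrics
        Z = linkage(m, method=method, metric=metric)
        score = -np.mean(Z[:, 2])  # placeholder, not the real silhouette criterion
        if score > best_score:
            best_method, best_metric, best_score = method, metric, score
print(best_method, best_metric)
```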
risk/neighborhoods/neighborhoods.py
CHANGED
@@ -3,21 +3,24 @@ risk/neighborhoods/neighborhoods
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 """
 
+import random
 import warnings
 from typing import Any, Dict, List, Tuple
 
 import networkx as nx
 import numpy as np
 from sklearn.exceptions import DataConversionWarning
+from sklearn.metrics.pairwise import cosine_similarity
 
 from risk.neighborhoods.community import (
-    calculate_dijkstra_neighborhoods,
+    calculate_greedy_modularity_neighborhoods,
     calculate_label_propagation_neighborhoods,
     calculate_louvain_neighborhoods,
     calculate_markov_clustering_neighborhoods,
     calculate_spinglass_neighborhoods,
     calculate_walktrap_neighborhoods,
 )
+from risk.log import logger
 
 # Suppress DataConversionWarning
 warnings.filterwarnings(action="ignore", category=DataConversionWarning)
@@ -25,7 +28,7 @@ warnings.filterwarnings(action="ignore", category=DataConversionWarning)
 
 def get_network_neighborhoods(
     network: nx.Graph,
-    distance_metric: str = "
+    distance_metric: str = "louvain",
     edge_length_threshold: float = 1.0,
     louvain_resolution: float = 1.0,
     random_seed: int = 888,
@@ -34,8 +37,8 @@ def get_network_neighborhoods(
 
     Args:
         network (nx.Graph): The network graph.
-        distance_metric (str): The distance metric to use ('
-            '
+        distance_metric (str): The distance metric to use ('greedy_modularity', 'louvain', 'label_propagation',
+            'markov_clustering', 'walktrap', 'spinglass').
         edge_length_threshold (float): The edge length threshold for the neighborhoods.
         louvain_resolution (float, optional): Resolution parameter for the Louvain method. Defaults to 1.0.
         random_seed (int, optional): Random seed for methods requiring random initialization. Defaults to 888.
@@ -43,12 +46,19 @@ def get_network_neighborhoods(
     Returns:
         np.ndarray: Neighborhood matrix calculated based on the selected distance metric.
     """
-
+    # Set random seed for reproducibility in all methods besides Louvain, which requires a separate seed
+    random.seed(random_seed)
+    np.random.seed(random_seed)
+
+    # Create a subgraph based on the edge length percentile threshold
+    network = _create_percentile_limited_subgraph(
+        network, edge_length_percentile=edge_length_threshold
+    )
 
-    if distance_metric == "dijkstra":
-        return calculate_dijkstra_neighborhoods(network)
     if distance_metric == "louvain":
         return calculate_louvain_neighborhoods(network, louvain_resolution, random_seed=random_seed)
+    if distance_metric == "greedy_modularity":
+        return calculate_greedy_modularity_neighborhoods(network)
     if distance_metric == "label_propagation":
         return calculate_label_propagation_neighborhoods(network)
     if distance_metric == "markov_clustering":
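The new preamble seeds both Python's and NumPy's global RNGs before dispatching to a community-detection method, so stochastic metrics (for example Markov clustering or spinglass) give repeatable neighborhoods. A small sketch of the idea; the helper name and seed value are illustrative, not from the package:

```python
# Sketch: seeding the global RNGs once makes downstream stochastic
# community detection reproducible across runs.
import random

import numpy as np

def seed_everything(seed: int = 888) -> None:  # hypothetical helper
    random.seed(seed)     # Python's built-in RNG
    np.random.seed(seed)  # NumPy's legacy global RNG

seed_everything()
print(random.random(), np.random.rand())  # identical on every run
```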
@@ -59,41 +69,51 @@ def get_network_neighborhoods(
         return calculate_spinglass_neighborhoods(network)
 
     raise ValueError(
-        "Incorrect distance metric specified. Please choose from '
+        "Incorrect distance metric specified. Please choose from 'greedy_modularity', 'louvain',"
         "'label_propagation', 'markov_clustering', 'walktrap', 'spinglass'."
     )
 
 
 def _create_percentile_limited_subgraph(G: nx.Graph, edge_length_percentile: float) -> nx.Graph:
-    """
-
+    """Create a subgraph containing all nodes and edges where the edge length is below the
+    specified percentile of all edge lengths in the input graph.
 
     Args:
-        G (nx.Graph): The input graph.
-        edge_length_percentile (float): The percentile
+        G (nx.Graph): The input graph with 'length' attributes on edges.
+        edge_length_percentile (float): The percentile (between 0 and 1) to filter edges by length.
 
     Returns:
-        nx.Graph: A subgraph with all nodes and edges
+        nx.Graph: A subgraph with all nodes and edges where the edge length is below the
+            calculated threshold length.
     """
-    # Extract edge lengths
+    # Extract edge lengths and handle missing lengths
     edge_lengths = [d["length"] for _, _, d in G.edges(data=True) if "length" in d]
+    if not edge_lengths:
+        raise ValueError(
+            "No edge lengths found in the graph. Ensure edges have 'length' attributes."
+        )
+
     # Calculate the specific edge length for the given percentile
     percentile_length = np.percentile(edge_lengths, edge_length_percentile * 100)
-    # Create
+    # Create the subgraph by directly filtering edges during iteration
     subgraph = nx.Graph()
-    subgraph.add_nodes_from(G.nodes(data=True))
-    # Add edges
+    subgraph.add_nodes_from(G.nodes(data=True))  # Retain all nodes from the original graph
+    # Add edges below the specified percentile length in a single pass
     for u, v, d in G.edges(data=True):
         if d.get("length", 1) <= percentile_length:
             subgraph.add_edge(u, v, **d)
 
+    # Return the subgraph; optionally check if it's too sparse
+    if subgraph.number_of_edges() == 0:
+        raise Warning("The resulting subgraph has no edges. Consider adjusting the percentile.")
+
     return subgraph
 
 
 def process_neighborhoods(
     network: nx.Graph,
     neighborhoods: Dict[str, Any],
-    impute_depth: int =
+    impute_depth: int = 0,
     prune_threshold: float = 0.0,
 ) -> Dict[str, Any]:
     """Process neighborhoods based on the imputation and pruning settings.
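`_create_percentile_limited_subgraph` keeps every node but only the shortest edges, with the cutoff taken as a percentile of the edges' 'length' attributes. A runnable toy example of the same filter; the graph and the 0.5 threshold are invented:

```python
# Toy illustration of the percentile-based edge filter above.
import networkx as nx
import numpy as np

G = nx.Graph()
G.add_edge("a", "b", length=1.0)
G.add_edge("a", "c", length=2.0)
G.add_edge("b", "c", length=5.0)

lengths = [d["length"] for _, _, d in G.edges(data=True) if "length" in d]
cutoff = np.percentile(lengths, 0.5 * 100)  # keep roughly the shortest half

H = nx.Graph()
H.add_nodes_from(G.nodes(data=True))  # all nodes survive the filter
for u, v, d in G.edges(data=True):
    if d.get("length", 1) <= cutoff:
        H.add_edge(u, v, **d)
print(sorted(H.edges()))  # [('a', 'b'), ('a', 'c')]
```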
@@ -101,7 +121,7 @@ def process_neighborhoods(
     Args:
         network (nx.Graph): The network data structure used for imputing and pruning neighbors.
         neighborhoods (dict): Dictionary containing 'enrichment_matrix', 'binary_enrichment_matrix', and 'significant_enrichment_matrix'.
-        impute_depth (int, optional): Depth for imputing neighbors. Defaults to
+        impute_depth (int, optional): Depth for imputing neighbors. Defaults to 0.
         prune_threshold (float, optional): Distance threshold for pruning neighbors. Defaults to 0.0.
 
     Returns:
@@ -110,7 +130,7 @@ def process_neighborhoods(
     enrichment_matrix = neighborhoods["enrichment_matrix"]
     binary_enrichment_matrix = neighborhoods["binary_enrichment_matrix"]
     significant_enrichment_matrix = neighborhoods["significant_enrichment_matrix"]
-
+    logger.debug(f"Imputation depth: {impute_depth}")
     if impute_depth:
         (
             enrichment_matrix,
@@ -123,7 +143,7 @@ def process_neighborhoods(
             max_depth=impute_depth,
         )
 
-
+    logger.debug(f"Pruning threshold: {prune_threshold}")
     if prune_threshold:
         (
             enrichment_matrix,
@@ -167,55 +187,134 @@ def _impute_neighbors(
         - np.ndarray: The imputed alpha threshold matrix.
         - np.ndarray: The significant enrichment matrix with non-significant entries set to zero.
     """
-    # Calculate
-
-
-
-
-
-
-
-
-                f"Failed to find neighbors for node '{node}': Ensure that the node exists in the network and that the binary enrichment matrix is correctly indexed."
-            ) from e
-
-        # Calculate the shortest distance to a neighbor
-        if neighbors:
-            shortest_distance = min([_get_euclidean_distance(node, n, network) for n in neighbors])
-            shortest_distances.append(shortest_distance)
+    # Calculate the distance threshold value based on the shortest distances
+    enrichment_matrix, binary_enrichment_matrix = _impute_neighbors_with_similarity(
+        network, enrichment_matrix, binary_enrichment_matrix, max_depth=max_depth
+    )
+    # Create a matrix where non-significant entries are set to zero
+    significant_enrichment_matrix = np.where(binary_enrichment_matrix == 1, enrichment_matrix, 0)
+
+    return enrichment_matrix, binary_enrichment_matrix, significant_enrichment_matrix
+
 
+def _impute_neighbors_with_similarity(
+    network: nx.Graph,
+    enrichment_matrix: np.ndarray,
+    binary_enrichment_matrix: np.ndarray,
+    max_depth: int = 3,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Impute non-enriched nodes based on the closest enriched neighbors' profiles and their similarity.
+
+    Args:
+        network (nx.Graph): The network graph with nodes having IDs matching the matrix indices.
+        enrichment_matrix (np.ndarray): The enrichment matrix with rows to be imputed.
+        binary_enrichment_matrix (np.ndarray): The alpha threshold matrix to be imputed similarly.
+        max_depth (int): Maximum depth of nodes to traverse for imputing values.
+
+    Returns:
+        Tuple[np.ndarray, np.ndarray]: A tuple containing:
+            - The imputed enrichment matrix.
+            - The imputed alpha threshold matrix.
+    """
     depth = 1
     rows_to_impute = np.where(binary_enrichment_matrix.sum(axis=1) == 0)[0]
     while len(rows_to_impute) and depth <= max_depth:
-
-        for row_index in
-
-
-
-                for n in neighbors
-                if n != row_index
-                and binary_enrichment_matrix[n].sum() != 0
-                and enrichment_matrix[n].sum() != 0
-            ]
-            if valid_neighbors:
-                closest_neighbor = min(
-                    valid_neighbors, key=lambda n: _get_euclidean_distance(row_index, n, network)
+        # Iterate over all enriched nodes
+        for row_index in range(binary_enrichment_matrix.shape[0]):
+            if binary_enrichment_matrix[row_index].sum() != 0:
+                enrichment_matrix, binary_enrichment_matrix = _process_node_imputation(
+                    row_index, network, enrichment_matrix, binary_enrichment_matrix, depth
                 )
-                # Impute the row with the closest valid neighbor's data
-                enrichment_matrix[row_index] = enrichment_matrix[closest_neighbor]
-                binary_enrichment_matrix[row_index] = binary_enrichment_matrix[
-                    closest_neighbor
-                ] / np.sqrt(depth + 1)
-            else:
-                next_rows_to_impute.append(row_index)
 
-
+        # Update rows to impute for the next iteration
+        rows_to_impute = np.where(binary_enrichment_matrix.sum(axis=1) == 0)[0]
         depth += 1
 
-
-    significant_enrichment_matrix = np.where(binary_enrichment_matrix == 1, enrichment_matrix, 0)
+    return enrichment_matrix, binary_enrichment_matrix
 
-
+
+def _process_node_imputation(
+    row_index: int,
+    network: nx.Graph,
+    enrichment_matrix: np.ndarray,
+    binary_enrichment_matrix: np.ndarray,
+    depth: int,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Process the imputation for a single node based on its enriched neighbors.
+
+    Args:
+        row_index (int): The index of the enriched node being processed.
+        network (nx.Graph): The network graph with nodes having IDs matching the matrix indices.
+        enrichment_matrix (np.ndarray): The enrichment matrix with rows to be imputed.
+        binary_enrichment_matrix (np.ndarray): The alpha threshold matrix to be imputed similarly.
+        depth (int): Current depth for traversal.
+
+    Returns:
+        Tuple[np.ndarray, np.ndarray]: The modified enrichment matrix and binary threshold matrix.
+    """
+    # Check neighbors at the current depth
+    neighbors = nx.single_source_shortest_path_length(network, row_index, cutoff=depth)
+    # Filter annotated neighbors (already enriched)
+    annotated_neighbors = [
+        n
+        for n in neighbors
+        if n != row_index
+        and binary_enrichment_matrix[n].sum() != 0
+        and enrichment_matrix[n].sum() != 0
+    ]
+    # Filter non-enriched neighbors
+    valid_neighbors = [
+        n
+        for n in neighbors
+        if n != row_index
+        and binary_enrichment_matrix[n].sum() == 0
+        and enrichment_matrix[n].sum() == 0
+    ]
+    # If there are valid non-enriched neighbors
+    if valid_neighbors and annotated_neighbors:
+        # Calculate distances to annotated neighbors
+        distances_to_annotated = [
+            _get_euclidean_distance(row_index, n, network) for n in annotated_neighbors
+        ]
+        # Calculate the IQR to identify outliers
+        q1, q3 = np.percentile(distances_to_annotated, [25, 75])
+        iqr = q3 - q1
+        lower_bound = q1 - 1.5 * iqr
+        upper_bound = q3 + 1.5 * iqr
+        # Filter valid non-enriched neighbors that fall within the IQR bounds
+        valid_neighbors_within_iqr = [
+            n
+            for n in valid_neighbors
+            if lower_bound <= _get_euclidean_distance(row_index, n, network) <= upper_bound
+        ]
+        # If there are any valid neighbors within the IQR
+        if valid_neighbors_within_iqr:
+            # If more than one valid neighbor is within the IQR, compute pairwise cosine similarities
+            if len(valid_neighbors_within_iqr) > 1:
+                # Find the most similar neighbor based on pairwise cosine similarities
+                def sum_pairwise_cosine_similarities(neighbor):
+                    return sum(
+                        cosine_similarity(
+                            enrichment_matrix[neighbor].reshape(1, -1),
+                            enrichment_matrix[other_neighbor].reshape(1, -1),
+                        )[0][0]
+                        for other_neighbor in valid_neighbors_within_iqr
+                        if other_neighbor != neighbor
+                    )
+
+                most_similar_neighbor = max(
+                    valid_neighbors_within_iqr, key=sum_pairwise_cosine_similarities
+                )
+            else:
+                most_similar_neighbor = valid_neighbors_within_iqr[0]
+
+            # Impute the most similar non-enriched neighbor with the enriched node's data, scaled by depth
+            enrichment_matrix[most_similar_neighbor] = enrichment_matrix[row_index] / np.sqrt(
+                depth + 1
+            )
+            binary_enrichment_matrix[most_similar_neighbor] = binary_enrichment_matrix[row_index]
+
+    return enrichment_matrix, binary_enrichment_matrix
 
 
 def _prune_neighbors(
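The new `_process_node_imputation` combines two selection criteria: candidates must lie within 1.5*IQR of the distances to annotated neighbors, and among those, the row with the highest summed pairwise cosine similarity wins. A compact sketch of both steps with invented numbers and profiles:

```python
# Minimal sketch of the IQR filter plus cosine-similarity pick used above.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

distances = np.array([1.0, 1.2, 1.1, 0.9, 4.0])  # distances to annotated neighbors
q1, q3 = np.percentile(distances, [25, 75])
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
print(distances[(distances >= lower) & (distances <= upper)])  # 4.0 is dropped as an outlier

profiles = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0]])  # made-up candidate enrichment rows

def summed_similarity(i):
    # Sum of cosine similarities between candidate i and every other candidate
    return sum(
        cosine_similarity(profiles[i].reshape(1, -1), profiles[j].reshape(1, -1))[0][0]
        for j in range(len(profiles))
        if j != i
    )

most_similar = max(range(len(profiles)), key=summed_similarity)
print(most_similar)  # index of the most "central" candidate profile
```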
@@ -240,27 +339,27 @@ def _prune_neighbors(
     """
     # Identify indices with non-zero rows in the binary enrichment matrix
    non_zero_indices = np.where(binary_enrichment_matrix.sum(axis=1) != 0)[0]
-
+    median_distances = []
     for node in non_zero_indices:
         neighbors = [n for n in network.neighbors(node) if binary_enrichment_matrix[n].sum() != 0]
         if neighbors:
-
+            median_distance = np.median(
                 [_get_euclidean_distance(node, n, network) for n in neighbors]
             )
-
+            median_distances.append(median_distance)
 
     # Calculate the distance threshold value based on rank
-    distance_threshold_value = _calculate_threshold(
+    distance_threshold_value = _calculate_threshold(median_distances, 1 - distance_threshold)
     # Prune nodes that are outliers based on the distance threshold
     for row_index in non_zero_indices:
         neighbors = [
             n for n in network.neighbors(row_index) if binary_enrichment_matrix[n].sum() != 0
         ]
         if neighbors:
-
+            median_distance = np.median(
                 [_get_euclidean_distance(row_index, n, network) for n in neighbors]
             )
-            if
+            if median_distance >= distance_threshold_value:
                 enrichment_matrix[row_index] = 0
                 binary_enrichment_matrix[row_index] = 0
 
@@ -305,18 +404,18 @@ def _get_node_position(network: nx.Graph, node: Any) -> np.ndarray:
     )
 
 
-def _calculate_threshold(
-    """Calculate the distance threshold based on the given
+def _calculate_threshold(median_distances: List, distance_threshold: float) -> float:
+    """Calculate the distance threshold based on the given median distances and a percentile threshold.
 
     Args:
-
+        median_distances (list): An array of median distances.
         distance_threshold (float): A percentile threshold (0 to 1) used to determine the distance cutoff.
 
     Returns:
         float: The calculated distance threshold value.
     """
-    # Sort the
-    sorted_distances = np.sort(
+    # Sort the median distances
+    sorted_distances = np.sort(median_distances)
     # Compute the rank percentiles for the sorted distances
     rank_percentiles = np.linspace(0, 1, len(sorted_distances))
     # Interpolating the ranks to 1000 evenly spaced percentiles
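The hunk ends mid-function, but the visible lines sort the median distances and map them to rank percentiles before, per the final comment, interpolating onto 1000 evenly spaced percentiles. A sketch of how such a rank-percentile cutoff can be computed with `np.interp`; everything past the visible comment is an assumption, not the package's confirmed implementation:

```python
# Hedged sketch of a rank-percentile distance threshold (continuation assumed).
import numpy as np

median_distances = [0.5, 0.8, 1.1, 2.0, 3.5]  # invented example values
distance_threshold = 0.75  # keep distances below the 75th rank percentile

sorted_distances = np.sort(median_distances)
rank_percentiles = np.linspace(0, 1, len(sorted_distances))
# Interpolate the ranks onto 1000 evenly spaced percentiles, then read off
# the distance value at the requested percentile.
smoothed_percentiles = np.linspace(0, 1, 1000)
smoothed_distances = np.interp(smoothed_percentiles, rank_percentiles, sorted_distances)
threshold_value = np.interp(distance_threshold, smoothed_percentiles, smoothed_distances)
print(threshold_value)
```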