risk-network 0.0.8b26__py3-none-any.whl → 0.0.9__py3-none-any.whl
This diff shows the changes between publicly available package versions as released to one of the supported registries. It is provided for informational purposes only.
- risk/__init__.py +2 -2
- risk/annotations/__init__.py +2 -2
- risk/annotations/annotations.py +195 -118
- risk/annotations/io.py +47 -31
- risk/log/__init__.py +4 -2
- risk/log/{config.py → console.py} +5 -3
- risk/log/{params.py → parameters.py} +17 -42
- risk/neighborhoods/__init__.py +3 -5
- risk/neighborhoods/api.py +442 -0
- risk/neighborhoods/community.py +324 -101
- risk/neighborhoods/domains.py +125 -52
- risk/neighborhoods/neighborhoods.py +177 -165
- risk/network/__init__.py +1 -3
- risk/network/geometry.py +71 -89
- risk/network/graph/__init__.py +6 -0
- risk/network/graph/api.py +200 -0
- risk/network/{graph.py → graph/graph.py} +90 -40
- risk/network/graph/summary.py +254 -0
- risk/network/io.py +103 -114
- risk/network/plotter/__init__.py +6 -0
- risk/network/plotter/api.py +54 -0
- risk/network/{plot → plotter}/canvas.py +12 -9
- risk/network/{plot → plotter}/contour.py +27 -24
- risk/network/{plot → plotter}/labels.py +73 -78
- risk/network/{plot → plotter}/network.py +45 -39
- risk/network/{plot → plotter}/plotter.py +23 -17
- risk/network/{plot/utils/color.py → plotter/utils/colors.py} +114 -122
- risk/network/{plot → plotter}/utils/layout.py +10 -7
- risk/risk.py +11 -500
- risk/stats/__init__.py +10 -4
- risk/stats/permutation/__init__.py +1 -1
- risk/stats/permutation/permutation.py +44 -38
- risk/stats/permutation/test_functions.py +26 -18
- risk/stats/{stats.py → significance.py} +17 -15
- risk/stats/stat_tests.py +267 -0
- {risk_network-0.0.8b26.dist-info → risk_network-0.0.9.dist-info}/METADATA +31 -46
- risk_network-0.0.9.dist-info/RECORD +40 -0
- {risk_network-0.0.8b26.dist-info → risk_network-0.0.9.dist-info}/WHEEL +1 -1
- risk/constants.py +0 -31
- risk/network/plot/__init__.py +0 -6
- risk/stats/hypergeom.py +0 -54
- risk/stats/poisson.py +0 -44
- risk_network-0.0.8b26.dist-info/RECORD +0 -37
- {risk_network-0.0.8b26.dist-info → risk_network-0.0.9.dist-info}/LICENSE +0 -0
- {risk_network-0.0.8b26.dist-info → risk_network-0.0.9.dist-info}/top_level.txt +0 -0
--- risk/neighborhoods/neighborhoods.py (0.0.8b26)
+++ risk/neighborhoods/neighborhoods.py (0.0.9)
@@ -9,12 +9,14 @@ from typing import Any, Dict, List, Tuple, Union
 
 import networkx as nx
 import numpy as np
+from scipy.sparse import csr_matrix
 from sklearn.exceptions import DataConversionWarning
 from sklearn.metrics.pairwise import cosine_similarity
 
 from risk.neighborhoods.community import (
     calculate_greedy_modularity_neighborhoods,
     calculate_label_propagation_neighborhoods,
+    calculate_leiden_neighborhoods,
     calculate_louvain_neighborhoods,
     calculate_markov_clustering_neighborhoods,
     calculate_spinglass_neighborhoods,
@@ -29,121 +31,118 @@ warnings.filterwarnings(action="ignore", category=DataConversionWarning)
 def get_network_neighborhoods(
     network: nx.Graph,
     distance_metric: Union[str, List, Tuple, np.ndarray] = "louvain",
-
-    louvain_resolution: float = 1
+    fraction_shortest_edges: Union[float, List, Tuple, np.ndarray] = 1.0,
+    louvain_resolution: float = 0.1,
+    leiden_resolution: float = 1.0,
     random_seed: int = 888,
-) ->
-    """Calculate the combined neighborhoods for each node
+) -> csr_matrix:
+    """Calculate the combined neighborhoods for each node using sparse matrices.
 
     Args:
         network (nx.Graph): The network graph.
-        distance_metric (str, List, Tuple, or np.ndarray, optional): The distance metric(s) to use.
-
-
-
-
-            Defaults to 1.0.
-        louvain_resolution (float, optional): Resolution parameter for the Louvain method. Defaults to 1.0.
-        random_seed (int, optional): Random seed for methods requiring random initialization. Defaults to 888.
+        distance_metric (str, List, Tuple, or np.ndarray, optional): The distance metric(s) to use.
+        fraction_shortest_edges (float, List, Tuple, or np.ndarray, optional): Shortest edge rank fraction thresholds.
+        louvain_resolution (float, optional): Resolution parameter for the Louvain method.
+        leiden_resolution (float, optional): Resolution parameter for the Leiden method.
+        random_seed (int, optional): Random seed for methods requiring random initialization.
 
     Returns:
-
+        csr_matrix: The combined neighborhood matrix.
     """
     # Set random seed for reproducibility
     random.seed(random_seed)
     np.random.seed(random_seed)
 
-    # Ensure distance_metric is a list
+    # Ensure distance_metric is a list for multi-algorithm handling
     if isinstance(distance_metric, (str, np.ndarray)):
         distance_metric = [distance_metric]
-    # Ensure
-    if isinstance(
-
-    #
-    if len(distance_metric) != len(
+    # Ensure fraction_shortest_edges is a list for multi-threshold handling
+    if isinstance(fraction_shortest_edges, (float, int)):
+        fraction_shortest_edges = [fraction_shortest_edges] * len(distance_metric)
+    # Validate matching lengths of distance metrics and thresholds
+    if len(distance_metric) != len(fraction_shortest_edges):
         raise ValueError(
             "The number of distance metrics must match the number of edge length thresholds."
         )
 
-    # Initialize
+    # Initialize a sparse LIL matrix for incremental updates
     num_nodes = network.number_of_nodes()
-
-
-    # Loop through each distance metric and corresponding edge
-    for metric,
-        #
-
-
-
-            neighborhoods = calculate_louvain_neighborhoods(
-                subgraph, louvain_resolution, random_seed=random_seed
+    # Initialize a sparse matrix with the same shape as the network
+    combined_neighborhoods = csr_matrix((num_nodes, num_nodes), dtype=np.uint8)
+    # Loop through each distance metric and corresponding edge rank fraction
+    for metric, percentile in zip(distance_metric, fraction_shortest_edges):
+        # Compute neighborhoods for the specified metric
+        if metric == "greedy_modularity":
+            neighborhoods = calculate_greedy_modularity_neighborhoods(
+                network, fraction_shortest_edges=percentile
             )
-        elif metric == "greedy_modularity":
-            neighborhoods = calculate_greedy_modularity_neighborhoods(subgraph)
         elif metric == "label_propagation":
-            neighborhoods = calculate_label_propagation_neighborhoods(
+            neighborhoods = calculate_label_propagation_neighborhoods(
+                network, fraction_shortest_edges=percentile
+            )
+        elif metric == "leiden":
+            neighborhoods = calculate_leiden_neighborhoods(
+                network,
+                resolution=leiden_resolution,
+                fraction_shortest_edges=percentile,
+                random_seed=random_seed,
+            )
+        elif metric == "louvain":
+            neighborhoods = calculate_louvain_neighborhoods(
+                network,
+                resolution=louvain_resolution,
+                fraction_shortest_edges=percentile,
+                random_seed=random_seed,
+            )
         elif metric == "markov_clustering":
-            neighborhoods = calculate_markov_clustering_neighborhoods(
-
-
+            neighborhoods = calculate_markov_clustering_neighborhoods(
+                network, fraction_shortest_edges=percentile
+            )
         elif metric == "spinglass":
-            neighborhoods = calculate_spinglass_neighborhoods(
+            neighborhoods = calculate_spinglass_neighborhoods(
+                network, fraction_shortest_edges=percentile
+            )
+        elif metric == "walktrap":
+            neighborhoods = calculate_walktrap_neighborhoods(
+                network, fraction_shortest_edges=percentile
+            )
         else:
             raise ValueError(
-                "
-                "'
+                "Invalid distance metric. Choose from: 'greedy_modularity', 'label_propagation',"
+                "'leiden', 'louvain', 'markov_clustering', 'spinglass', 'walktrap'."
             )
 
-        #
+        # Add the sparse neighborhood matrix
         combined_neighborhoods += neighborhoods
 
-    # Ensure
-
-    # while all other values are reset to 0. This transformation simplifies the neighborhood matrix by
-    # focusing on the most significant connection per row.
-    combined_neighborhoods = _set_max_to_one(combined_neighborhoods)
+    # Ensure maximum value in each row is set to 1
+    combined_neighborhoods = _set_max_row_value_to_one_sparse(combined_neighborhoods)
 
     return combined_neighborhoods
 
 
-def
-    """
-    specified percentile of all edge lengths in the input graph.
+def _set_max_row_value_to_one_sparse(matrix: csr_matrix) -> csr_matrix:
+    """Set the maximum value in each row of a sparse matrix to 1.
 
     Args:
-
-        edge_length_percentile (float): The percentile (between 0 and 1) to filter edges by length.
+        matrix (csr_matrix): The input sparse matrix.
 
     Returns:
-
-            calculated threshold length.
+        csr_matrix: The modified sparse matrix where only the maximum value in each row is set to 1.
     """
-    #
-
-
-
-
-    )
-
-    # Calculate the specific edge length for the given percentile
-    percentile_length = np.percentile(edge_lengths, edge_length_percentile * 100)
-    # Create the subgraph by directly filtering edges during iteration
-    subgraph = nx.Graph()
-    subgraph.add_nodes_from(G.nodes(data=True))  # Retain all nodes from the original graph
-    # Add edges below the specified percentile length in a single pass
-    for u, v, d in G.edges(data=True):
-        if d.get("length", 1) <= percentile_length:
-            subgraph.add_edge(u, v, **d)
-
-    # Return the subgraph; optionally check if it's too sparse
-    if subgraph.number_of_edges() == 0:
-        raise Warning("The resulting subgraph has no edges. Consider adjusting the percentile.")
+    # Iterate over each row and set the maximum value to 1
+    for i in range(matrix.shape[0]):
+        row_data = matrix[i].data
+        if len(row_data) > 0:
+            row_data[:] = (row_data == max(row_data)).astype(int)
 
-    return
+    return matrix
 
 
-def
-    """For each row in the input matrix, set the maximum value(s) to 1 and all other values to 0.
+def _set_max_row_value_to_one(matrix: np.ndarray) -> np.ndarray:
+    """For each row in the input matrix, set the maximum value(s) to 1 and all other values to 0. This is particularly
+    useful for neighborhood matrices that have undergone multiple neighborhood detection algorithms, where the
+    maximum value in each row represents the most significant relationship per node in the combined neighborhoods.
 
     Args:
         matrix (np.ndarray): A 2D numpy array representing the neighborhood matrix.
@@ -171,163 +170,170 @@ def process_neighborhoods(
 
     Args:
         network (nx.Graph): The network data structure used for imputing and pruning neighbors.
-        neighborhoods (Dict[str, Any]): Dictionary containing '
+        neighborhoods (Dict[str, Any]): Dictionary containing 'significance_matrix', 'significant_binary_significance_matrix', and 'significant_significance_matrix'.
         impute_depth (int, optional): Depth for imputing neighbors. Defaults to 0.
         prune_threshold (float, optional): Distance threshold for pruning neighbors. Defaults to 0.0.
 
     Returns:
-        Dict[str, Any]: Processed neighborhoods data, including the updated matrices and
+        Dict[str, Any]: Processed neighborhoods data, including the updated matrices and significance counts.
     """
-
-
-
+    significance_matrix = neighborhoods["significance_matrix"]
+    significant_binary_significance_matrix = neighborhoods["significant_binary_significance_matrix"]
+    significant_significance_matrix = neighborhoods["significant_significance_matrix"]
     logger.debug(f"Imputation depth: {impute_depth}")
     if impute_depth:
         (
-
-
-
+            significance_matrix,
+            significant_binary_significance_matrix,
+            significant_significance_matrix,
         ) = _impute_neighbors(
             network,
-
-
+            significance_matrix,
+            significant_binary_significance_matrix,
             max_depth=impute_depth,
         )
 
     logger.debug(f"Pruning threshold: {prune_threshold}")
     if prune_threshold:
         (
-
-
-
+            significance_matrix,
+            significant_binary_significance_matrix,
+            significant_significance_matrix,
         ) = _prune_neighbors(
             network,
-
-
+            significance_matrix,
+            significant_binary_significance_matrix,
            distance_threshold=prune_threshold,
        )
 
-
-
+    neighborhood_significance_counts = np.sum(significant_binary_significance_matrix, axis=0)
+    node_significance_sums = np.sum(significance_matrix, axis=1)
     return {
-        "
-        "
-        "
-        "
-        "
+        "significance_matrix": significance_matrix,
+        "significant_binary_significance_matrix": significant_binary_significance_matrix,
+        "significant_significance_matrix": significant_significance_matrix,
+        "neighborhood_significance_counts": neighborhood_significance_counts,
+        "node_significance_sums": node_significance_sums,
     }
 
 
 def _impute_neighbors(
     network: nx.Graph,
-
-
+    significance_matrix: np.ndarray,
+    significant_binary_significance_matrix: np.ndarray,
     max_depth: int = 3,
 ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
-    """Impute rows with sums of zero in the
+    """Impute rows with sums of zero in the significance matrix based on the closest non-zero neighbors in the network graph.
 
     Args:
         network (nx.Graph): The network graph with nodes having IDs matching the matrix indices.
-
-
+        significance_matrix (np.ndarray): The significance matrix with rows to be imputed.
+        significant_binary_significance_matrix (np.ndarray): The alpha threshold matrix to be imputed similarly.
         max_depth (int): Maximum depth of nodes to traverse for imputing values.
 
     Returns:
-
-        - np.ndarray: The imputed
+        Tuple[np.ndarray, np.ndarray, np.ndarray]:
+        - np.ndarray: The imputed significance matrix.
         - np.ndarray: The imputed alpha threshold matrix.
-        - np.ndarray: The significant
+        - np.ndarray: The significant significance matrix with non-significant entries set to zero.
     """
     # Calculate the distance threshold value based on the shortest distances
-
-        network,
+    significance_matrix, significant_binary_significance_matrix = _impute_neighbors_with_similarity(
+        network, significance_matrix, significant_binary_significance_matrix, max_depth=max_depth
     )
     # Create a matrix where non-significant entries are set to zero
-
-
+    significant_significance_matrix = np.where(
+        significant_binary_significance_matrix == 1, significance_matrix, 0
     )
 
-    return
+    return (
+        significance_matrix,
+        significant_binary_significance_matrix,
+        significant_significance_matrix,
+    )
 
 
 def _impute_neighbors_with_similarity(
     network: nx.Graph,
-
-
+    significance_matrix: np.ndarray,
+    significant_binary_significance_matrix: np.ndarray,
     max_depth: int = 3,
 ) -> Tuple[np.ndarray, np.ndarray]:
-    """Impute non-
+    """Impute non-significant nodes based on the closest significant neighbors' profiles and their similarity.
 
     Args:
         network (nx.Graph): The network graph with nodes having IDs matching the matrix indices.
-
-
+        significance_matrix (np.ndarray): The significance matrix with rows to be imputed.
+        significant_binary_significance_matrix (np.ndarray): The alpha threshold matrix to be imputed similarly.
         max_depth (int): Maximum depth of nodes to traverse for imputing values.
 
     Returns:
-        Tuple[np.ndarray, np.ndarray]:
-        - The imputed
+        Tuple[np.ndarray, np.ndarray]:
+        - The imputed significance matrix.
         - The imputed alpha threshold matrix.
     """
     depth = 1
-    rows_to_impute = np.where(
+    rows_to_impute = np.where(significant_binary_significance_matrix.sum(axis=1) == 0)[0]
     while len(rows_to_impute) and depth <= max_depth:
-        # Iterate over all
-        for row_index in range(
-            if
-
+        # Iterate over all significant nodes
+        for row_index in range(significant_binary_significance_matrix.shape[0]):
+            if significant_binary_significance_matrix[row_index].sum() != 0:
+                (
+                    significance_matrix,
+                    significant_binary_significance_matrix,
+                ) = _process_node_imputation(
                     row_index,
                     network,
-
-
+                    significance_matrix,
+                    significant_binary_significance_matrix,
                     depth,
                 )
 
         # Update rows to impute for the next iteration
-        rows_to_impute = np.where(
+        rows_to_impute = np.where(significant_binary_significance_matrix.sum(axis=1) == 0)[0]
        depth += 1
 
-    return
+    return significance_matrix, significant_binary_significance_matrix
 
 
 def _process_node_imputation(
     row_index: int,
     network: nx.Graph,
-
-
+    significance_matrix: np.ndarray,
+    significant_binary_significance_matrix: np.ndarray,
     depth: int,
 ) -> Tuple[np.ndarray, np.ndarray]:
-    """Process the imputation for a single node based on its
+    """Process the imputation for a single node based on its significant neighbors.
 
     Args:
-        row_index (int): The index of the
+        row_index (int): The index of the significant node being processed.
         network (nx.Graph): The network graph with nodes having IDs matching the matrix indices.
-
-
+        significance_matrix (np.ndarray): The significance matrix with rows to be imputed.
+        significant_binary_significance_matrix (np.ndarray): The alpha threshold matrix to be imputed similarly.
         depth (int): Current depth for traversal.
 
     Returns:
-        Tuple[np.ndarray, np.ndarray]: The modified
+        Tuple[np.ndarray, np.ndarray]: The modified significance matrix and binary threshold matrix.
     """
     # Check neighbors at the current depth
     neighbors = nx.single_source_shortest_path_length(network, row_index, cutoff=depth)
-    # Filter annotated neighbors (already
+    # Filter annotated neighbors (already significant)
     annotated_neighbors = [
         n
         for n in neighbors
         if n != row_index
-        and
-        and
+        and significant_binary_significance_matrix[n].sum() != 0
+        and significance_matrix[n].sum() != 0
     ]
-    # Filter non-
+    # Filter non-significant neighbors
     valid_neighbors = [
         n
         for n in neighbors
         if n != row_index
-        and
-        and
+        and significant_binary_significance_matrix[n].sum() == 0
+        and significance_matrix[n].sum() == 0
     ]
-    # If there are valid non-
+    # If there are valid non-significant neighbors
     if valid_neighbors and annotated_neighbors:
         # Calculate distances to annotated neighbors
         distances_to_annotated = [
@@ -338,7 +344,7 @@ def _process_node_imputation(
         iqr = q3 - q1
         lower_bound = q1 - 1.5 * iqr
         upper_bound = q3 + 1.5 * iqr
-        # Filter valid non-
+        # Filter valid non-significant neighbors that fall within the IQR bounds
         valid_neighbors_within_iqr = [
             n
             for n in valid_neighbors
@@ -352,8 +358,8 @@ def _process_node_imputation(
         def sum_pairwise_cosine_similarities(neighbor):
             return sum(
                 cosine_similarity(
-
-
+                    significance_matrix[neighbor].reshape(1, -1),
+                    significance_matrix[other_neighbor].reshape(1, -1),
                 )[0][0]
                 for other_neighbor in valid_neighbors_within_iqr
                 if other_neighbor != neighbor
@@ -365,43 +371,45 @@ def _process_node_imputation(
         else:
             most_similar_neighbor = valid_neighbors_within_iqr[0]
 
-        # Impute the most similar non-
-
+        # Impute the most similar non-significant neighbor with the significant node's data, scaled by depth
+        significance_matrix[most_similar_neighbor] = significance_matrix[row_index] / np.sqrt(
             depth + 1
         )
-
-
+        significant_binary_significance_matrix[most_similar_neighbor] = (
+            significant_binary_significance_matrix[row_index]
         )
 
-    return
+    return significance_matrix, significant_binary_significance_matrix
 
 
 def _prune_neighbors(
     network: nx.Graph,
-
-
+    significance_matrix: np.ndarray,
+    significant_binary_significance_matrix: np.ndarray,
     distance_threshold: float = 0.9,
 ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
     """Remove outliers based on their rank for edge lengths.
 
     Args:
         network (nx.Graph): The network graph with nodes having IDs matching the matrix indices.
-
-
+        significance_matrix (np.ndarray): The significance matrix.
+        significant_binary_significance_matrix (np.ndarray): The alpha threshold matrix.
         distance_threshold (float): Rank threshold (0 to 1) to determine outliers.
 
     Returns:
-
-        - np.ndarray: The updated
+        Tuple[np.ndarray, np.ndarray, np.ndarray]:
+        - np.ndarray: The updated significance matrix with outliers set to zero.
         - np.ndarray: The updated alpha threshold matrix with outliers set to zero.
-        - np.ndarray: The significant
+        - np.ndarray: The significant significance matrix, where non-significant entries are set to zero.
     """
-    # Identify indices with non-zero rows in the binary
-    non_zero_indices = np.where(
+    # Identify indices with non-zero rows in the binary significance matrix
+    non_zero_indices = np.where(significant_binary_significance_matrix.sum(axis=1) != 0)[0]
     median_distances = []
     for node in non_zero_indices:
         neighbors = [
-            n
+            n
+            for n in network.neighbors(node)
+            if significant_binary_significance_matrix[n].sum() != 0
         ]
         if neighbors:
             median_distance = np.median(
@@ -416,22 +424,26 @@ def _prune_neighbors(
         neighbors = [
             n
             for n in network.neighbors(row_index)
-            if
+            if significant_binary_significance_matrix[n].sum() != 0
         ]
         if neighbors:
             median_distance = np.median(
                 [_get_euclidean_distance(row_index, n, network) for n in neighbors]
             )
             if median_distance >= distance_threshold_value:
-
-
+                significance_matrix[row_index] = 0
+                significant_binary_significance_matrix[row_index] = 0
 
     # Create a matrix where non-significant entries are set to zero
-
-
+    significant_significance_matrix = np.where(
+        significant_binary_significance_matrix == 1, significance_matrix, 0
     )
 
-    return
+    return (
+        significance_matrix,
+        significant_binary_significance_matrix,
+        significant_significance_matrix,
+    )
 
 
 def _get_euclidean_distance(node1: Any, node2: Any, network: nx.Graph) -> float:
@@ -481,7 +493,7 @@ def _calculate_threshold(median_distances: List, distance_threshold: float) -> float:
     """
     # Sort the median distances
     sorted_distances = np.sort(median_distances)
-    # Compute the rank
+    # Compute the rank fractions for the sorted distances
     rank_percentiles = np.linspace(0, 1, len(sorted_distances))
     # Interpolating the ranks to 1000 evenly spaced percentiles
     interpolated_percentiles = np.linspace(0, 1, 1000)