risk-network 0.0.8b18__py3-none-any.whl → 0.0.9b26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- risk/__init__.py +2 -2
- risk/annotations/__init__.py +2 -2
- risk/annotations/annotations.py +133 -72
- risk/annotations/io.py +50 -34
- risk/log/__init__.py +4 -2
- risk/log/{config.py → console.py} +5 -3
- risk/log/{params.py → parameters.py} +21 -46
- risk/neighborhoods/__init__.py +3 -5
- risk/neighborhoods/api.py +446 -0
- risk/neighborhoods/community.py +281 -96
- risk/neighborhoods/domains.py +92 -38
- risk/neighborhoods/neighborhoods.py +210 -149
- risk/network/__init__.py +1 -3
- risk/network/geometry.py +69 -58
- risk/network/graph/__init__.py +6 -0
- risk/network/graph/api.py +194 -0
- risk/network/graph/network.py +269 -0
- risk/network/graph/summary.py +254 -0
- risk/network/io.py +58 -48
- risk/network/plotter/__init__.py +6 -0
- risk/network/plotter/api.py +54 -0
- risk/network/{plot → plotter}/canvas.py +80 -26
- risk/network/{plot → plotter}/contour.py +43 -34
- risk/network/{plot → plotter}/labels.py +123 -113
- risk/network/plotter/network.py +424 -0
- risk/network/plotter/utils/colors.py +416 -0
- risk/network/plotter/utils/layout.py +94 -0
- risk/risk.py +11 -469
- risk/stats/__init__.py +8 -4
- risk/stats/binom.py +51 -0
- risk/stats/chi2.py +69 -0
- risk/stats/hypergeom.py +28 -18
- risk/stats/permutation/__init__.py +1 -1
- risk/stats/permutation/permutation.py +45 -39
- risk/stats/permutation/test_functions.py +25 -17
- risk/stats/poisson.py +17 -11
- risk/stats/stats.py +20 -16
- risk/stats/zscore.py +68 -0
- {risk_network-0.0.8b18.dist-info → risk_network-0.0.9b26.dist-info}/METADATA +9 -5
- risk_network-0.0.9b26.dist-info/RECORD +44 -0
- {risk_network-0.0.8b18.dist-info → risk_network-0.0.9b26.dist-info}/WHEEL +1 -1
- risk/network/graph.py +0 -159
- risk/network/plot/__init__.py +0 -6
- risk/network/plot/network.py +0 -282
- risk/network/plot/plotter.py +0 -137
- risk/network/plot/utils/color.py +0 -353
- risk/network/plot/utils/layout.py +0 -53
- risk_network-0.0.8b18.dist-info/RECORD +0 -37
- {risk_network-0.0.8b18.dist-info → risk_network-0.0.9b26.dist-info}/LICENSE +0 -0
- {risk_network-0.0.8b18.dist-info → risk_network-0.0.9b26.dist-info}/top_level.txt +0 -0
risk/neighborhoods/neighborhoods.py

```diff
@@ -5,7 +5,7 @@
 
 import random
 import warnings
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, Tuple, Union
 
 import networkx as nx
 import numpy as np
```
```diff
@@ -15,6 +15,7 @@ from sklearn.metrics.pairwise import cosine_similarity
 from risk.neighborhoods.community import (
     calculate_greedy_modularity_neighborhoods,
     calculate_label_propagation_neighborhoods,
+    calculate_leiden_neighborhoods,
     calculate_louvain_neighborhoods,
     calculate_markov_clustering_neighborhoods,
     calculate_spinglass_neighborhoods,
```
```diff
@@ -28,86 +29,119 @@ warnings.filterwarnings(action="ignore", category=DataConversionWarning)
 
 def get_network_neighborhoods(
     network: nx.Graph,
-    distance_metric: str = "louvain",
-
-    louvain_resolution: float = 1
+    distance_metric: Union[str, List, Tuple, np.ndarray] = "louvain",
+    fraction_shortest_edges: Union[float, List, Tuple, np.ndarray] = 1.0,
+    louvain_resolution: float = 0.1,
+    leiden_resolution: float = 1.0,
     random_seed: int = 888,
 ) -> np.ndarray:
-    """Calculate the neighborhoods for each node
+    """Calculate the combined neighborhoods for each node based on the specified community detection algorithm(s).
 
     Args:
         network (nx.Graph): The network graph.
-        distance_metric (str): The distance metric to use
-
-
-
-        random_seed (int, optional): Random seed for methods requiring random initialization.
+        distance_metric (str, List, Tuple, or np.ndarray, optional): The distance metric(s) to use.
+        fraction_shortest_edges (float, List, Tuple, or np.ndarray, optional): Shortest edge rank fraction threshold(s) for creating subgraphs.
+        louvain_resolution (float, optional): Resolution parameter for the Louvain method.
+        leiden_resolution (float, optional): Resolution parameter for the Leiden method.
+        random_seed (int, optional): Random seed for methods requiring random initialization.
 
     Returns:
-        np.ndarray:
+        np.ndarray: Summed neighborhood matrix from all selected algorithms.
     """
-    # Set random seed for reproducibility
+    # Set random seed for reproducibility
     random.seed(random_seed)
     np.random.seed(random_seed)
 
-    #
-
-
-
+    # Ensure distance_metric is a list/tuple for multi-algorithm handling
+    if isinstance(distance_metric, (str, np.ndarray)):
+        distance_metric = [distance_metric]
+    # Ensure fraction_shortest_edges is a list/tuple for multi-threshold handling
+    if isinstance(fraction_shortest_edges, (float, int)):
+        fraction_shortest_edges = [fraction_shortest_edges] * len(distance_metric)
+    # Check that the number of distance metrics matches the number of edge length thresholds
+    if len(distance_metric) != len(fraction_shortest_edges):
+        raise ValueError(
+            "The number of distance metrics must match the number of edge length thresholds."
+        )
 
-
-
-
-        return calculate_greedy_modularity_neighborhoods(network)
-    if distance_metric == "label_propagation":
-        return calculate_label_propagation_neighborhoods(network)
-    if distance_metric == "markov_clustering":
-        return calculate_markov_clustering_neighborhoods(network)
-    if distance_metric == "walktrap":
-        return calculate_walktrap_neighborhoods(network)
-    if distance_metric == "spinglass":
-        return calculate_spinglass_neighborhoods(network)
-
-    raise ValueError(
-        "Incorrect distance metric specified. Please choose from 'greedy_modularity', 'louvain',"
-        "'label_propagation', 'markov_clustering', 'walktrap', 'spinglass'."
-    )
+    # Initialize combined neighborhood matrix
+    num_nodes = network.number_of_nodes()
+    combined_neighborhoods = np.zeros((num_nodes, num_nodes), dtype=int)
 
+    # Loop through each distance metric and corresponding edge rank fraction
+    for metric, percentile in zip(distance_metric, fraction_shortest_edges):
+        # Call the appropriate neighborhood function based on the metric
+        if metric == "greedy_modularity":
+            neighborhoods = calculate_greedy_modularity_neighborhoods(
+                network, fraction_shortest_edges=percentile
+            )
+        elif metric == "label_propagation":
+            neighborhoods = calculate_label_propagation_neighborhoods(
+                network, fraction_shortest_edges=percentile
+            )
+        elif metric == "leiden":
+            neighborhoods = calculate_leiden_neighborhoods(
+                network,
+                resolution=leiden_resolution,
+                fraction_shortest_edges=percentile,
+                random_seed=random_seed,
+            )
+        elif metric == "louvain":
+            neighborhoods = calculate_louvain_neighborhoods(
+                network,
+                resolution=louvain_resolution,
+                fraction_shortest_edges=percentile,
+                random_seed=random_seed,
+            )
+        elif metric == "markov_clustering":
+            neighborhoods = calculate_markov_clustering_neighborhoods(
+                network, fraction_shortest_edges=percentile
+            )
+        elif metric == "spinglass":
+            neighborhoods = calculate_spinglass_neighborhoods(
+                network, fraction_shortest_edges=percentile
+            )
+        elif metric == "walktrap":
+            neighborhoods = calculate_walktrap_neighborhoods(
+                network, fraction_shortest_edges=percentile
+            )
+        else:
+            raise ValueError(
+                "Incorrect distance metric specified. Please choose from 'greedy_modularity', 'label_propagation',"
+                "'leiden', 'louvain', 'markov_clustering', 'spinglass', 'walktrap'."
+            )
 
-
-
-        specified percentile of all edge lengths in the input graph.
+        # Sum the neighborhood matrices
+        combined_neighborhoods += neighborhoods
 
-
-
-
+    # Ensure that the maximum value in each row is set to 1
+    # This ensures that for each row, only the strongest relationship (the maximum value) is retained,
+    # while all other values are reset to 0. This transformation simplifies the neighborhood matrix by
+    # focusing on the most significant connection per row (or nodes).
+    combined_neighborhoods = _set_max_row_value_to_one(combined_neighborhoods)
 
-
-        nx.Graph: A subgraph with all nodes and edges where the edge length is below the
-            calculated threshold length.
-    """
-    # Extract edge lengths and handle missing lengths
-    edge_lengths = [d["length"] for _, _, d in G.edges(data=True) if "length" in d]
-    if not edge_lengths:
-        raise ValueError(
-            "No edge lengths found in the graph. Ensure edges have 'length' attributes."
-        )
+    return combined_neighborhoods
 
-    # Calculate the specific edge length for the given percentile
-    percentile_length = np.percentile(edge_lengths, edge_length_percentile * 100)
-    # Create the subgraph by directly filtering edges during iteration
-    subgraph = nx.Graph()
-    subgraph.add_nodes_from(G.nodes(data=True))  # Retain all nodes from the original graph
-    # Add edges below the specified percentile length in a single pass
-    for u, v, d in G.edges(data=True):
-        if d.get("length", 1) <= percentile_length:
-            subgraph.add_edge(u, v, **d)
 
-
-
-
+def _set_max_row_value_to_one(matrix: np.ndarray) -> np.ndarray:
+    """For each row in the input matrix, set the maximum value(s) to 1 and all other values to 0. This is particularly
+    useful for neighborhood matrices that have undergone multiple neighborhood detection algorithms, where the
+    maximum value in each row represents the most significant relationship per node in the combined neighborhoods.
+
+    Args:
+        matrix (np.ndarray): A 2D numpy array representing the neighborhood matrix.
 
-
+    Returns:
+        np.ndarray: The modified matrix where only the maximum value(s) in each row is set to 1, and others are set to 0.
+    """
+    # Find the maximum value in each row (column-wise max operation)
+    max_values = np.max(matrix, axis=1, keepdims=True)
+    # Create a boolean mask where elements are True if they are the max value in their row
+    max_mask = matrix == max_values
+    # Set all elements to 0, and then set the maximum value positions to 1
+    matrix[:] = 0  # Set everything to 0
+    matrix[max_mask] = 1  # Set only the max values to 1
+    return matrix
 
 
 def process_neighborhoods(
```
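The hunk above replaces the old single-metric dispatch with a loop that builds one neighborhood matrix per metric, sums them, and then row-binarizes the result. A minimal usage sketch, not taken from the package docs and not executed here; it assumes a graph prepared the way the package expects (e.g., edges carrying the attributes the community helpers threshold on):

```python
# Illustrative sketch only — parameter names and defaults mirror the new
# signature in the hunk above; the graph here is just a stand-in.
import networkx as nx

from risk.neighborhoods.neighborhoods import get_network_neighborhoods

network = nx.karate_club_graph()  # stand-in for a prepared RISK network

# Two metrics, one shortest-edge fraction per metric; the per-metric matrices
# are summed, then each row keeps only its strongest connection.
neighborhoods = get_network_neighborhoods(
    network,
    distance_metric=["louvain", "leiden"],
    fraction_shortest_edges=[0.5, 0.75],
    louvain_resolution=0.1,
    leiden_resolution=1.0,
    random_seed=888,
)
```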
```diff
@@ -120,157 +154,170 @@ def process_neighborhoods(
 
     Args:
         network (nx.Graph): The network data structure used for imputing and pruning neighbors.
-        neighborhoods (
+        neighborhoods (Dict[str, Any]): Dictionary containing 'significance_matrix', 'significant_binary_significance_matrix', and 'significant_significance_matrix'.
         impute_depth (int, optional): Depth for imputing neighbors. Defaults to 0.
         prune_threshold (float, optional): Distance threshold for pruning neighbors. Defaults to 0.0.
 
     Returns:
-
+        Dict[str, Any]: Processed neighborhoods data, including the updated matrices and significance counts.
     """
-
-
-
+    significance_matrix = neighborhoods["significance_matrix"]
+    significant_binary_significance_matrix = neighborhoods["significant_binary_significance_matrix"]
+    significant_significance_matrix = neighborhoods["significant_significance_matrix"]
     logger.debug(f"Imputation depth: {impute_depth}")
     if impute_depth:
         (
-
-
-
+            significance_matrix,
+            significant_binary_significance_matrix,
+            significant_significance_matrix,
         ) = _impute_neighbors(
             network,
-
-
+            significance_matrix,
+            significant_binary_significance_matrix,
             max_depth=impute_depth,
         )
 
     logger.debug(f"Pruning threshold: {prune_threshold}")
     if prune_threshold:
         (
-
-
-
+            significance_matrix,
+            significant_binary_significance_matrix,
+            significant_significance_matrix,
         ) = _prune_neighbors(
             network,
-
-
+            significance_matrix,
+            significant_binary_significance_matrix,
             distance_threshold=prune_threshold,
         )
 
-
-
+    neighborhood_significance_counts = np.sum(significant_binary_significance_matrix, axis=0)
+    node_significance_sums = np.sum(significance_matrix, axis=1)
     return {
-        "
-        "
-        "
-        "
-        "
+        "significance_matrix": significance_matrix,
+        "significant_binary_significance_matrix": significant_binary_significance_matrix,
+        "significant_significance_matrix": significant_significance_matrix,
+        "neighborhood_significance_counts": neighborhood_significance_counts,
+        "node_significance_sums": node_significance_sums,
     }
 
 
 def _impute_neighbors(
     network: nx.Graph,
-
-
+    significance_matrix: np.ndarray,
+    significant_binary_significance_matrix: np.ndarray,
     max_depth: int = 3,
 ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
-    """Impute rows with sums of zero in the
+    """Impute rows with sums of zero in the significance matrix based on the closest non-zero neighbors in the network graph.
 
     Args:
         network (nx.Graph): The network graph with nodes having IDs matching the matrix indices.
-
-
+        significance_matrix (np.ndarray): The significance matrix with rows to be imputed.
+        significant_binary_significance_matrix (np.ndarray): The alpha threshold matrix to be imputed similarly.
         max_depth (int): Maximum depth of nodes to traverse for imputing values.
 
     Returns:
-
-        - np.ndarray: The imputed
+        Tuple[np.ndarray, np.ndarray, np.ndarray]:
+            - np.ndarray: The imputed significance matrix.
             - np.ndarray: The imputed alpha threshold matrix.
-        - np.ndarray: The significant
+            - np.ndarray: The significant significance matrix with non-significant entries set to zero.
     """
     # Calculate the distance threshold value based on the shortest distances
-
-        network,
+    significance_matrix, significant_binary_significance_matrix = _impute_neighbors_with_similarity(
+        network, significance_matrix, significant_binary_significance_matrix, max_depth=max_depth
     )
     # Create a matrix where non-significant entries are set to zero
-
+    significant_significance_matrix = np.where(
+        significant_binary_significance_matrix == 1, significance_matrix, 0
+    )
 
-    return
+    return (
+        significance_matrix,
+        significant_binary_significance_matrix,
+        significant_significance_matrix,
+    )
 
 
 def _impute_neighbors_with_similarity(
     network: nx.Graph,
-
-
+    significance_matrix: np.ndarray,
+    significant_binary_significance_matrix: np.ndarray,
     max_depth: int = 3,
 ) -> Tuple[np.ndarray, np.ndarray]:
-    """Impute non-
+    """Impute non-significant nodes based on the closest significant neighbors' profiles and their similarity.
 
     Args:
         network (nx.Graph): The network graph with nodes having IDs matching the matrix indices.
-
-
+        significance_matrix (np.ndarray): The significance matrix with rows to be imputed.
+        significant_binary_significance_matrix (np.ndarray): The alpha threshold matrix to be imputed similarly.
         max_depth (int): Maximum depth of nodes to traverse for imputing values.
 
     Returns:
-        Tuple[np.ndarray, np.ndarray]:
-            - The imputed
+        Tuple[np.ndarray, np.ndarray]:
+            - The imputed significance matrix.
             - The imputed alpha threshold matrix.
     """
     depth = 1
-    rows_to_impute = np.where(
+    rows_to_impute = np.where(significant_binary_significance_matrix.sum(axis=1) == 0)[0]
     while len(rows_to_impute) and depth <= max_depth:
-        # Iterate over all
-        for row_index in range(
-            if
-
-
+        # Iterate over all significant nodes
+        for row_index in range(significant_binary_significance_matrix.shape[0]):
+            if significant_binary_significance_matrix[row_index].sum() != 0:
+                (
+                    significance_matrix,
+                    significant_binary_significance_matrix,
+                ) = _process_node_imputation(
+                    row_index,
+                    network,
+                    significance_matrix,
+                    significant_binary_significance_matrix,
+                    depth,
                 )
 
         # Update rows to impute for the next iteration
-        rows_to_impute = np.where(
+        rows_to_impute = np.where(significant_binary_significance_matrix.sum(axis=1) == 0)[0]
         depth += 1
 
-    return
+    return significance_matrix, significant_binary_significance_matrix
 
 
 def _process_node_imputation(
     row_index: int,
     network: nx.Graph,
-
-
+    significance_matrix: np.ndarray,
+    significant_binary_significance_matrix: np.ndarray,
     depth: int,
 ) -> Tuple[np.ndarray, np.ndarray]:
-    """Process the imputation for a single node based on its
+    """Process the imputation for a single node based on its significant neighbors.
 
     Args:
-        row_index (int): The index of the
+        row_index (int): The index of the significant node being processed.
         network (nx.Graph): The network graph with nodes having IDs matching the matrix indices.
-
-
+        significance_matrix (np.ndarray): The significance matrix with rows to be imputed.
+        significant_binary_significance_matrix (np.ndarray): The alpha threshold matrix to be imputed similarly.
         depth (int): Current depth for traversal.
 
     Returns:
-        Tuple[np.ndarray, np.ndarray]: The modified
+        Tuple[np.ndarray, np.ndarray]: The modified significance matrix and binary threshold matrix.
     """
     # Check neighbors at the current depth
     neighbors = nx.single_source_shortest_path_length(network, row_index, cutoff=depth)
-    # Filter annotated neighbors (already
+    # Filter annotated neighbors (already significant)
     annotated_neighbors = [
         n
         for n in neighbors
         if n != row_index
-        and
-        and
+        and significant_binary_significance_matrix[n].sum() != 0
+        and significance_matrix[n].sum() != 0
     ]
-    # Filter non-
+    # Filter non-significant neighbors
     valid_neighbors = [
         n
         for n in neighbors
         if n != row_index
-        and
-        and
+        and significant_binary_significance_matrix[n].sum() == 0
+        and significance_matrix[n].sum() == 0
    ]
-    # If there are valid non-
+    # If there are valid non-significant neighbors
     if valid_neighbors and annotated_neighbors:
         # Calculate distances to annotated neighbors
         distances_to_annotated = [
```
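The dictionary contract this hunk renames is easiest to see with toy inputs. A minimal sketch, assuming the module import path and the documented defaults (impute_depth=0, prune_threshold=0.0, so neither imputation nor pruning runs):

```python
# Illustrative sketch only — toy matrices under the keys shown in the hunk;
# with the defaults, process_neighborhoods only adds the two sum fields.
import networkx as nx
import numpy as np

from risk.neighborhoods.neighborhoods import process_neighborhoods

network = nx.path_graph(3)
scores = np.array([[0.0, 2.0], [0.0, 0.0], [1.0, 0.0]])
mask = (scores > 0).astype(int)
processed = process_neighborhoods(
    network,
    {
        "significance_matrix": scores,
        "significant_binary_significance_matrix": mask,
        "significant_significance_matrix": np.where(mask == 1, scores, 0),
    },
)
print(processed["neighborhood_significance_counts"])  # column sums of the mask
print(processed["node_significance_sums"])            # row sums of the scores
```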
```diff
@@ -281,7 +328,7 @@ def _process_node_imputation
         iqr = q3 - q1
         lower_bound = q1 - 1.5 * iqr
         upper_bound = q3 + 1.5 * iqr
-        # Filter valid non-
+        # Filter valid non-significant neighbors that fall within the IQR bounds
         valid_neighbors_within_iqr = [
             n
             for n in valid_neighbors
```
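The fence computed above is the standard Tukey rule: values outside [q1 - 1.5·IQR, q3 + 1.5·IQR] are treated as outliers. A self-contained numpy illustration with toy distances:

```python
# Illustrative only: the Tukey IQR fence used above, on toy distances.
import numpy as np

distances = np.array([0.9, 1.0, 1.1, 1.2, 5.0])  # 5.0 is a clear outlier
q1, q3 = np.percentile(distances, [25, 75])
iqr = q3 - q1
lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr
within = distances[(distances >= lower_bound) & (distances <= upper_bound)]
print(within)  # [0.9 1.0 1.1 1.2] — the outlier is excluded
```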
```diff
@@ -295,8 +342,8 @@ def _process_node_imputation
         def sum_pairwise_cosine_similarities(neighbor):
             return sum(
                 cosine_similarity(
-
-
+                    significance_matrix[neighbor].reshape(1, -1),
+                    significance_matrix[other_neighbor].reshape(1, -1),
                 )[0][0]
                 for other_neighbor in valid_neighbors_within_iqr
                 if other_neighbor != neighbor
```
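The nested helper above scores each candidate row by its summed cosine similarity to the other candidates; the surrounding code (per the next hunk) then keeps the best-scoring one. A standalone sketch of that selection rule, with a toy matrix standing in for significance_matrix:

```python
# Illustrative only: pick the candidate row with the highest summed cosine
# similarity to the other candidates — a toy stand-in for the hunk's helper.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

significance_matrix = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0]])
candidates = [0, 1, 2]

def summed_similarity(i):
    return sum(
        cosine_similarity(
            significance_matrix[i].reshape(1, -1),
            significance_matrix[j].reshape(1, -1),
        )[0][0]
        for j in candidates
        if j != i
    )

most_similar = max(candidates, key=summed_similarity)
print(most_similar)  # 1 — the row closest to both others
```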
```diff
@@ -308,40 +355,46 @@ def _process_node_imputation
         else:
             most_similar_neighbor = valid_neighbors_within_iqr[0]
 
-        # Impute the most similar non-
-
+        # Impute the most similar non-significant neighbor with the significant node's data, scaled by depth
+        significance_matrix[most_similar_neighbor] = significance_matrix[row_index] / np.sqrt(
             depth + 1
         )
-
+        significant_binary_significance_matrix[most_similar_neighbor] = (
+            significant_binary_significance_matrix[row_index]
+        )
 
-    return
+    return significance_matrix, significant_binary_significance_matrix
 
 
 def _prune_neighbors(
     network: nx.Graph,
-
-
+    significance_matrix: np.ndarray,
+    significant_binary_significance_matrix: np.ndarray,
     distance_threshold: float = 0.9,
 ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
     """Remove outliers based on their rank for edge lengths.
 
     Args:
         network (nx.Graph): The network graph with nodes having IDs matching the matrix indices.
-
-
+        significance_matrix (np.ndarray): The significance matrix.
+        significant_binary_significance_matrix (np.ndarray): The alpha threshold matrix.
         distance_threshold (float): Rank threshold (0 to 1) to determine outliers.
 
     Returns:
-
-        - np.ndarray: The updated
+        Tuple[np.ndarray, np.ndarray, np.ndarray]:
+            - np.ndarray: The updated significance matrix with outliers set to zero.
             - np.ndarray: The updated alpha threshold matrix with outliers set to zero.
-        - np.ndarray: The significant
+            - np.ndarray: The significant significance matrix, where non-significant entries are set to zero.
     """
-    # Identify indices with non-zero rows in the binary
-    non_zero_indices = np.where(
+    # Identify indices with non-zero rows in the binary significance matrix
+    non_zero_indices = np.where(significant_binary_significance_matrix.sum(axis=1) != 0)[0]
     median_distances = []
     for node in non_zero_indices:
-        neighbors = [
+        neighbors = [
+            n
+            for n in network.neighbors(node)
+            if significant_binary_significance_matrix[n].sum() != 0
+        ]
         if neighbors:
             median_distance = np.median(
                 [_get_euclidean_distance(node, n, network) for n in neighbors]
```
```diff
@@ -353,20 +406,28 @@ def _prune_neighbors
     # Prune nodes that are outliers based on the distance threshold
     for row_index in non_zero_indices:
         neighbors = [
-            n
+            n
+            for n in network.neighbors(row_index)
+            if significant_binary_significance_matrix[n].sum() != 0
         ]
         if neighbors:
             median_distance = np.median(
                 [_get_euclidean_distance(row_index, n, network) for n in neighbors]
             )
             if median_distance >= distance_threshold_value:
-
-
+                significance_matrix[row_index] = 0
+                significant_binary_significance_matrix[row_index] = 0
 
     # Create a matrix where non-significant entries are set to zero
-
+    significant_significance_matrix = np.where(
+        significant_binary_significance_matrix == 1, significance_matrix, 0
+    )
 
-    return
+    return (
+        significance_matrix,
+        significant_binary_significance_matrix,
+        significant_significance_matrix,
+    )
 
 
 def _get_euclidean_distance(node1: Any, node2: Any, network: nx.Graph) -> float:
```
```diff
@@ -408,7 +469,7 @@ def _calculate_threshold(median_distances: List, distance_threshold: float) -> float:
     """Calculate the distance threshold based on the given median distances and a percentile threshold.
 
     Args:
-        median_distances (
+        median_distances (List): An array of median distances.
         distance_threshold (float): A percentile threshold (0 to 1) used to determine the distance cutoff.
 
     Returns:
```
```diff
@@ -416,7 +477,7 @@ def _calculate_threshold(median_distances: List, distance_threshold: float) -> float:
     """
     # Sort the median distances
     sorted_distances = np.sort(median_distances)
-    # Compute the rank
+    # Compute the rank fractions for the sorted distances
     rank_percentiles = np.linspace(0, 1, len(sorted_distances))
     # Interpolating the ranks to 1000 evenly spaced percentiles
     interpolated_percentiles = np.linspace(0, 1, 1000)
```
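The last hunk shows the thresholding idea: sort the median distances, assign each a rank fraction on [0, 1], and read off the distance at the requested fraction. A compressed sketch of that idea (np.interp stands in for the module's 1000-point interpolation, which continues past the lines shown):

```python
# Illustrative only: distance cutoff at a rank fraction of the sorted medians.
import numpy as np

median_distances = [0.2, 0.5, 0.9, 1.4, 3.0]
distance_threshold = 0.75  # prune nodes above the 75% rank fraction

sorted_distances = np.sort(median_distances)
rank_percentiles = np.linspace(0, 1, len(sorted_distances))
distance_threshold_value = np.interp(distance_threshold, rank_percentiles, sorted_distances)
print(distance_threshold_value)  # 1.4 for this toy input
```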