risk_network-0.0.8b26-py3-none-any.whl → risk_network-0.0.9b26-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- risk/__init__.py +2 -2
- risk/annotations/__init__.py +2 -2
- risk/annotations/annotations.py +74 -47
- risk/annotations/io.py +47 -31
- risk/log/__init__.py +4 -2
- risk/log/{config.py → console.py} +5 -3
- risk/log/{params.py → parameters.py} +17 -42
- risk/neighborhoods/__init__.py +3 -5
- risk/neighborhoods/api.py +446 -0
- risk/neighborhoods/community.py +255 -77
- risk/neighborhoods/domains.py +62 -31
- risk/neighborhoods/neighborhoods.py +156 -160
- risk/network/__init__.py +1 -3
- risk/network/geometry.py +65 -57
- risk/network/graph/__init__.py +6 -0
- risk/network/graph/api.py +194 -0
- risk/network/{graph.py → graph/network.py} +87 -37
- risk/network/graph/summary.py +254 -0
- risk/network/io.py +56 -47
- risk/network/plotter/__init__.py +6 -0
- risk/network/plotter/api.py +54 -0
- risk/network/{plot → plotter}/canvas.py +7 -4
- risk/network/{plot → plotter}/contour.py +22 -19
- risk/network/{plot → plotter}/labels.py +69 -74
- risk/network/{plot → plotter}/network.py +170 -34
- risk/network/{plot/utils/color.py → plotter/utils/colors.py} +104 -112
- risk/network/{plot → plotter}/utils/layout.py +8 -5
- risk/risk.py +11 -500
- risk/stats/__init__.py +8 -4
- risk/stats/binom.py +51 -0
- risk/stats/chi2.py +69 -0
- risk/stats/hypergeom.py +27 -17
- risk/stats/permutation/__init__.py +1 -1
- risk/stats/permutation/permutation.py +44 -38
- risk/stats/permutation/test_functions.py +25 -17
- risk/stats/poisson.py +15 -9
- risk/stats/stats.py +15 -13
- risk/stats/zscore.py +68 -0
- {risk_network-0.0.8b26.dist-info → risk_network-0.0.9b26.dist-info}/METADATA +9 -5
- risk_network-0.0.9b26.dist-info/RECORD +44 -0
- {risk_network-0.0.8b26.dist-info → risk_network-0.0.9b26.dist-info}/WHEEL +1 -1
- risk/network/plot/__init__.py +0 -6
- risk/network/plot/plotter.py +0 -137
- risk_network-0.0.8b26.dist-info/RECORD +0 -37
- {risk_network-0.0.8b26.dist-info → risk_network-0.0.9b26.dist-info}/LICENSE +0 -0
- {risk_network-0.0.8b26.dist-info → risk_network-0.0.9b26.dist-info}/top_level.txt +0 -0
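The renames above move the plotting code from risk/network/plot to risk/network/plotter, split risk/network/graph.py into a risk/network/graph package, and rename the log modules (config.py → console.py, params.py → parameters.py), so 0.0.8-era import paths under the old names no longer resolve. A hedged sketch of the corresponding import updates, assuming the new __init__.py files re-export the public names (this diff does not show their contents):

    # 0.0.8b26 (removed in this release):
    #   from risk.network.plot import canvas
    # 0.0.9b26 (new locations implied by the rename map above):
    from risk.network.plotter import canvas  # risk/network/plotter/canvas.py
    from risk.network.graph import network   # risk/network/graph/network.py (was risk/network/graph.py)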
@@ -15,6 +15,7 @@ from sklearn.metrics.pairwise import cosine_similarity
 from risk.neighborhoods.community import (
     calculate_greedy_modularity_neighborhoods,
     calculate_label_propagation_neighborhoods,
+    calculate_leiden_neighborhoods,
     calculate_louvain_neighborhoods,
     calculate_markov_clustering_neighborhoods,
     calculate_spinglass_neighborhoods,
@@ -29,22 +30,20 @@ warnings.filterwarnings(action="ignore", category=DataConversionWarning)
 def get_network_neighborhoods(
     network: nx.Graph,
     distance_metric: Union[str, List, Tuple, np.ndarray] = "louvain",
-    …
-    louvain_resolution: float = 1…
+    fraction_shortest_edges: Union[float, List, Tuple, np.ndarray] = 1.0,
+    louvain_resolution: float = 0.1,
+    leiden_resolution: float = 1.0,
     random_seed: int = 888,
 ) -> np.ndarray:
     """Calculate the combined neighborhoods for each node based on the specified community detection algorithm(s).

     Args:
         network (nx.Graph): The network graph.
-        distance_metric (str, List, Tuple, or np.ndarray, optional): The distance metric(s) to use.
-        …
-        …
-        …
-        …
-            Defaults to 1.0.
-        louvain_resolution (float, optional): Resolution parameter for the Louvain method. Defaults to 1.0.
-        random_seed (int, optional): Random seed for methods requiring random initialization. Defaults to 888.
+        distance_metric (str, List, Tuple, or np.ndarray, optional): The distance metric(s) to use.
+        fraction_shortest_edges (float, List, Tuple, or np.ndarray, optional): Shortest edge rank fraction threshold(s) for creating subgraphs.
+        louvain_resolution (float, optional): Resolution parameter for the Louvain method.
+        leiden_resolution (float, optional): Resolution parameter for the Leiden method.
+        random_seed (int, optional): Random seed for methods requiring random initialization.

     Returns:
         np.ndarray: Summed neighborhood matrix from all selected algorithms.
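Read together, the new signature pairs each distance metric with its own shortest-edge rank fraction. A minimal usage sketch assembled only from the parameter names and defaults added above (illustrative; `network` is assumed to be an existing nx.Graph):

    neighborhoods = get_network_neighborhoods(
        network,
        distance_metric=["leiden", "louvain"],
        fraction_shortest_edges=[0.75, 0.9],
        leiden_resolution=1.0,
        louvain_resolution=0.1,
        random_seed=888,
    )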
@@ -56,11 +55,11 @@ def get_network_neighborhoods(
     # Ensure distance_metric is a list/tuple for multi-algorithm handling
     if isinstance(distance_metric, (str, np.ndarray)):
         distance_metric = [distance_metric]
-    # Ensure …
-    if isinstance(…
-    …
+    # Ensure fraction_shortest_edges is a list/tuple for multi-threshold handling
+    if isinstance(fraction_shortest_edges, (float, int)):
+        fraction_shortest_edges = [fraction_shortest_edges] * len(distance_metric)
     # Check that the number of distance metrics matches the number of edge length thresholds
-    if len(distance_metric) != len(…
+    if len(distance_metric) != len(fraction_shortest_edges):
         raise ValueError(
             "The number of distance metrics must match the number of edge length thresholds."
         )
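The added lines broadcast a scalar fraction across all metrics before the length check; a self-contained illustration of that normalization (mirroring the hunk, without importing the package):

    distance_metric = ["louvain", "walktrap"]
    fraction_shortest_edges = 0.8
    if isinstance(fraction_shortest_edges, (float, int)):
        fraction_shortest_edges = [fraction_shortest_edges] * len(distance_metric)
    assert fraction_shortest_edges == [0.8, 0.8]
    # An explicit list of the wrong length, e.g. [0.8], would raise the ValueError above.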
@@ -69,29 +68,47 @@ def get_network_neighborhoods(
     num_nodes = network.number_of_nodes()
     combined_neighborhoods = np.zeros((num_nodes, num_nodes), dtype=int)

-    # Loop through each distance metric and corresponding edge …
-    for metric, …
-        # Create a subgraph based on the specific edge length threshold for this algorithm
-        subgraph = _create_percentile_limited_subgraph(network, edge_length_percentile=threshold)
+    # Loop through each distance metric and corresponding edge rank fraction
+    for metric, percentile in zip(distance_metric, fraction_shortest_edges):
         # Call the appropriate neighborhood function based on the metric
-        if metric == "…
-            neighborhoods = …
-            …
+        if metric == "greedy_modularity":
+            neighborhoods = calculate_greedy_modularity_neighborhoods(
+                network, fraction_shortest_edges=percentile
             )
-        elif metric == "greedy_modularity":
-            neighborhoods = calculate_greedy_modularity_neighborhoods(subgraph)
         elif metric == "label_propagation":
-            neighborhoods = calculate_label_propagation_neighborhoods(…
+            neighborhoods = calculate_label_propagation_neighborhoods(
+                network, fraction_shortest_edges=percentile
+            )
+        elif metric == "leiden":
+            neighborhoods = calculate_leiden_neighborhoods(
+                network,
+                resolution=leiden_resolution,
+                fraction_shortest_edges=percentile,
+                random_seed=random_seed,
+            )
+        elif metric == "louvain":
+            neighborhoods = calculate_louvain_neighborhoods(
+                network,
+                resolution=louvain_resolution,
+                fraction_shortest_edges=percentile,
+                random_seed=random_seed,
+            )
         elif metric == "markov_clustering":
-            neighborhoods = calculate_markov_clustering_neighborhoods(…
-            …
-            …
+            neighborhoods = calculate_markov_clustering_neighborhoods(
+                network, fraction_shortest_edges=percentile
+            )
         elif metric == "spinglass":
-            neighborhoods = calculate_spinglass_neighborhoods(…
+            neighborhoods = calculate_spinglass_neighborhoods(
+                network, fraction_shortest_edges=percentile
+            )
+        elif metric == "walktrap":
+            neighborhoods = calculate_walktrap_neighborhoods(
+                network, fraction_shortest_edges=percentile
+            )
         else:
             raise ValueError(
-                "Incorrect distance metric specified. Please choose from 'greedy_modularity', '…
-                "'…
+                "Incorrect distance metric specified. Please choose from 'greedy_modularity', 'label_propagation',"
+                "'leiden', 'louvain', 'markov_clustering', 'spinglass', 'walktrap'."
             )

     # Sum the neighborhood matrices
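Each branch above produces one per-metric matrix, and the loop accumulates them into combined_neighborhoods. A toy illustration of that summation for two metrics on a three-node network (values invented for the example):

    import numpy as np

    louvain_matrix = np.array([[1, 1, 0], [1, 1, 0], [0, 0, 1]])
    leiden_matrix = np.array([[1, 0, 0], [0, 1, 1], [0, 1, 1]])
    combined = louvain_matrix + leiden_matrix
    # combined[1] == [1, 2, 1]; the row-max step in the next hunk keeps only the 2.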
@@ -100,50 +117,16 @@ def get_network_neighborhoods(
     # Ensure that the maximum value in each row is set to 1
     # This ensures that for each row, only the strongest relationship (the maximum value) is retained,
     # while all other values are reset to 0. This transformation simplifies the neighborhood matrix by
-    # focusing on the most significant connection per row.
-    combined_neighborhoods = …
+    # focusing on the most significant connection per row (or nodes).
+    combined_neighborhoods = _set_max_row_value_to_one(combined_neighborhoods)

     return combined_neighborhoods


-def …
-    """…
-    …
-    …
-    Args:
-        G (nx.Graph): The input graph with 'length' attributes on edges.
-        edge_length_percentile (float): The percentile (between 0 and 1) to filter edges by length.
-
-    Returns:
-        nx.Graph: A subgraph with all nodes and edges where the edge length is below the
-            calculated threshold length.
-    """
-    # Extract edge lengths and handle missing lengths
-    edge_lengths = [d["length"] for _, _, d in G.edges(data=True) if "length" in d]
-    if not edge_lengths:
-        raise ValueError(
-            "No edge lengths found in the graph. Ensure edges have 'length' attributes."
-        )
-
-    # Calculate the specific edge length for the given percentile
-    percentile_length = np.percentile(edge_lengths, edge_length_percentile * 100)
-    # Create the subgraph by directly filtering edges during iteration
-    subgraph = nx.Graph()
-    subgraph.add_nodes_from(G.nodes(data=True))  # Retain all nodes from the original graph
-    # Add edges below the specified percentile length in a single pass
-    for u, v, d in G.edges(data=True):
-        if d.get("length", 1) <= percentile_length:
-            subgraph.add_edge(u, v, **d)
-
-    # Return the subgraph; optionally check if it's too sparse
-    if subgraph.number_of_edges() == 0:
-        raise Warning("The resulting subgraph has no edges. Consider adjusting the percentile.")
-
-    return subgraph
-
-
-def _set_max_to_one(matrix: np.ndarray) -> np.ndarray:
-    """For each row in the input matrix, set the maximum value(s) to 1 and all other values to 0.
+def _set_max_row_value_to_one(matrix: np.ndarray) -> np.ndarray:
+    """For each row in the input matrix, set the maximum value(s) to 1 and all other values to 0. This is particularly
+    useful for neighborhood matrices that have undergone multiple neighborhood detection algorithms, where the
+    maximum value in each row represents the most significant relationship per node in the combined neighborhoods.

     Args:
         matrix (np.ndarray): A 2D numpy array representing the neighborhood matrix.
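The renamed helper's docstring promises that each row keeps its maximum value(s) as 1 and everything else becomes 0. The hunk does not show the function body, but the documented behavior corresponds to a numpy comparison along these lines (a sketch, not the package's actual implementation):

    import numpy as np

    def set_max_row_value_to_one_sketch(matrix: np.ndarray) -> np.ndarray:
        # Compare each entry to its row maximum; ties all become 1, matching
        # "maximum value(s)" in the docstring.
        return (matrix == matrix.max(axis=1, keepdims=True)).astype(int)

    # set_max_row_value_to_one_sketch(np.array([[1, 3, 2], [0, 0, 5]]))
    # -> [[0, 1, 0],
    #     [0, 0, 1]]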
@@ -171,163 +154,170 @@ def process_neighborhoods(

     Args:
         network (nx.Graph): The network data structure used for imputing and pruning neighbors.
-        neighborhoods (Dict[str, Any]): Dictionary containing '…
+        neighborhoods (Dict[str, Any]): Dictionary containing 'significance_matrix', 'significant_binary_significance_matrix', and 'significant_significance_matrix'.
         impute_depth (int, optional): Depth for imputing neighbors. Defaults to 0.
         prune_threshold (float, optional): Distance threshold for pruning neighbors. Defaults to 0.0.

     Returns:
-        Dict[str, Any]: Processed neighborhoods data, including the updated matrices and …
+        Dict[str, Any]: Processed neighborhoods data, including the updated matrices and significance counts.
     """
-    …
-    …
-    …
+    significance_matrix = neighborhoods["significance_matrix"]
+    significant_binary_significance_matrix = neighborhoods["significant_binary_significance_matrix"]
+    significant_significance_matrix = neighborhoods["significant_significance_matrix"]
     logger.debug(f"Imputation depth: {impute_depth}")
     if impute_depth:
         (
-            …
-            …
-            …
+            significance_matrix,
+            significant_binary_significance_matrix,
+            significant_significance_matrix,
         ) = _impute_neighbors(
             network,
-            …
-            …
+            significance_matrix,
+            significant_binary_significance_matrix,
             max_depth=impute_depth,
         )

     logger.debug(f"Pruning threshold: {prune_threshold}")
     if prune_threshold:
         (
-            …
-            …
-            …
+            significance_matrix,
+            significant_binary_significance_matrix,
+            significant_significance_matrix,
         ) = _prune_neighbors(
             network,
-            …
-            …
+            significance_matrix,
+            significant_binary_significance_matrix,
             distance_threshold=prune_threshold,
         )

-    …
-    …
+    neighborhood_significance_counts = np.sum(significant_binary_significance_matrix, axis=0)
+    node_significance_sums = np.sum(significance_matrix, axis=1)
     return {
-        "…
-        "…
-        "…
-        "…
-        "…
+        "significance_matrix": significance_matrix,
+        "significant_binary_significance_matrix": significant_binary_significance_matrix,
+        "significant_significance_matrix": significant_significance_matrix,
+        "neighborhood_significance_counts": neighborhood_significance_counts,
+        "node_significance_sums": node_significance_sums,
     }


 def _impute_neighbors(
     network: nx.Graph,
-    …
-    …
+    significance_matrix: np.ndarray,
+    significant_binary_significance_matrix: np.ndarray,
     max_depth: int = 3,
 ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
-    """Impute rows with sums of zero in the …
+    """Impute rows with sums of zero in the significance matrix based on the closest non-zero neighbors in the network graph.

     Args:
         network (nx.Graph): The network graph with nodes having IDs matching the matrix indices.
-        …
-        …
+        significance_matrix (np.ndarray): The significance matrix with rows to be imputed.
+        significant_binary_significance_matrix (np.ndarray): The alpha threshold matrix to be imputed similarly.
         max_depth (int): Maximum depth of nodes to traverse for imputing values.

     Returns:
-        …
-        - np.ndarray: The imputed …
+        Tuple[np.ndarray, np.ndarray, np.ndarray]:
+        - np.ndarray: The imputed significance matrix.
         - np.ndarray: The imputed alpha threshold matrix.
-        - np.ndarray: The significant …
+        - np.ndarray: The significant significance matrix with non-significant entries set to zero.
     """
     # Calculate the distance threshold value based on the shortest distances
-    …
-        network, …
+    significance_matrix, significant_binary_significance_matrix = _impute_neighbors_with_similarity(
+        network, significance_matrix, significant_binary_significance_matrix, max_depth=max_depth
     )
     # Create a matrix where non-significant entries are set to zero
-    …
-    …
+    significant_significance_matrix = np.where(
+        significant_binary_significance_matrix == 1, significance_matrix, 0
     )

-    return …
+    return (
+        significance_matrix,
+        significant_binary_significance_matrix,
+        significant_significance_matrix,
+    )


 def _impute_neighbors_with_similarity(
     network: nx.Graph,
-    …
-    …
+    significance_matrix: np.ndarray,
+    significant_binary_significance_matrix: np.ndarray,
     max_depth: int = 3,
 ) -> Tuple[np.ndarray, np.ndarray]:
-    """Impute non-…
+    """Impute non-significant nodes based on the closest significant neighbors' profiles and their similarity.

     Args:
         network (nx.Graph): The network graph with nodes having IDs matching the matrix indices.
-        …
-        …
+        significance_matrix (np.ndarray): The significance matrix with rows to be imputed.
+        significant_binary_significance_matrix (np.ndarray): The alpha threshold matrix to be imputed similarly.
         max_depth (int): Maximum depth of nodes to traverse for imputing values.

     Returns:
-        Tuple[np.ndarray, np.ndarray]:
-        - The imputed …
+        Tuple[np.ndarray, np.ndarray]:
+        - The imputed significance matrix.
         - The imputed alpha threshold matrix.
     """
     depth = 1
-    rows_to_impute = np.where(…
+    rows_to_impute = np.where(significant_binary_significance_matrix.sum(axis=1) == 0)[0]
     while len(rows_to_impute) and depth <= max_depth:
-        # Iterate over all …
-        for row_index in range(…
-            if …
-            …
+        # Iterate over all significant nodes
+        for row_index in range(significant_binary_significance_matrix.shape[0]):
+            if significant_binary_significance_matrix[row_index].sum() != 0:
+                (
+                    significance_matrix,
+                    significant_binary_significance_matrix,
+                ) = _process_node_imputation(
                     row_index,
                     network,
-                    …
-                    …
+                    significance_matrix,
+                    significant_binary_significance_matrix,
                     depth,
                 )

         # Update rows to impute for the next iteration
-        rows_to_impute = np.where(…
+        rows_to_impute = np.where(significant_binary_significance_matrix.sum(axis=1) == 0)[0]
         depth += 1

-    return …
+    return significance_matrix, significant_binary_significance_matrix


 def _process_node_imputation(
     row_index: int,
     network: nx.Graph,
-    …
-    …
+    significance_matrix: np.ndarray,
+    significant_binary_significance_matrix: np.ndarray,
     depth: int,
 ) -> Tuple[np.ndarray, np.ndarray]:
-    """Process the imputation for a single node based on its …
+    """Process the imputation for a single node based on its significant neighbors.

     Args:
-        row_index (int): The index of the …
+        row_index (int): The index of the significant node being processed.
         network (nx.Graph): The network graph with nodes having IDs matching the matrix indices.
-        …
-        …
+        significance_matrix (np.ndarray): The significance matrix with rows to be imputed.
+        significant_binary_significance_matrix (np.ndarray): The alpha threshold matrix to be imputed similarly.
         depth (int): Current depth for traversal.

     Returns:
-        Tuple[np.ndarray, np.ndarray]: The modified …
+        Tuple[np.ndarray, np.ndarray]: The modified significance matrix and binary threshold matrix.
     """
     # Check neighbors at the current depth
     neighbors = nx.single_source_shortest_path_length(network, row_index, cutoff=depth)
-    # Filter annotated neighbors (already …
+    # Filter annotated neighbors (already significant)
     annotated_neighbors = [
         n
         for n in neighbors
         if n != row_index
-        and …
-        and …
+        and significant_binary_significance_matrix[n].sum() != 0
+        and significance_matrix[n].sum() != 0
     ]
-    # Filter non-…
+    # Filter non-significant neighbors
     valid_neighbors = [
         n
         for n in neighbors
         if n != row_index
-        and …
-        and …
+        and significant_binary_significance_matrix[n].sum() == 0
+        and significance_matrix[n].sum() == 0
     ]
-    # If there are valid non-…
+    # If there are valid non-significant neighbors
     if valid_neighbors and annotated_neighbors:
         # Calculate distances to annotated neighbors
         distances_to_annotated = [
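With the renamed keys above, the processed dictionary is consumed as in this sketch (key names taken verbatim from the hunk; `network` and `neighborhoods` are assumed to be already built):

    processed = process_neighborhoods(network, neighborhoods, impute_depth=1, prune_threshold=0.9)
    significance = processed["significance_matrix"]
    # axis=0 sum: number of significant nodes per neighborhood/attribute column
    counts = processed["neighborhood_significance_counts"]
    # axis=1 sum: total significance per node row
    sums = processed["node_significance_sums"]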
@@ -338,7 +328,7 @@ def _process_node_imputation(
         iqr = q3 - q1
         lower_bound = q1 - 1.5 * iqr
         upper_bound = q3 + 1.5 * iqr
-        # Filter valid non-…
+        # Filter valid non-significant neighbors that fall within the IQR bounds
         valid_neighbors_within_iqr = [
             n
             for n in valid_neighbors
@@ -352,8 +342,8 @@ def _process_node_imputation(
             def sum_pairwise_cosine_similarities(neighbor):
                 return sum(
                     cosine_similarity(
-                        …
-                        …
+                        significance_matrix[neighbor].reshape(1, -1),
+                        significance_matrix[other_neighbor].reshape(1, -1),
                     )[0][0]
                     for other_neighbor in valid_neighbors_within_iqr
                     if other_neighbor != neighbor
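The reshape(1, -1) calls exist because sklearn's cosine_similarity expects 2D arrays of shape (n_samples, n_features); the [0][0] indexing then pulls the scalar back out of the 1×1 result. A standalone illustration:

    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity

    row_a = np.array([1.0, 0.0, 2.0]).reshape(1, -1)  # shape (1, 3)
    row_b = np.array([0.5, 0.0, 1.0]).reshape(1, -1)
    score = cosine_similarity(row_a, row_b)[0][0]  # 1.0 here: row_b is 0.5 * row_a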
@@ -365,43 +355,45 @@ def _process_node_imputation(
         else:
             most_similar_neighbor = valid_neighbors_within_iqr[0]

-        # Impute the most similar non-…
-        …
+        # Impute the most similar non-significant neighbor with the significant node's data, scaled by depth
+        significance_matrix[most_similar_neighbor] = significance_matrix[row_index] / np.sqrt(
             depth + 1
         )
-        …
-        …
+        significant_binary_significance_matrix[most_similar_neighbor] = (
+            significant_binary_significance_matrix[row_index]
         )

-    return …
+    return significance_matrix, significant_binary_significance_matrix


 def _prune_neighbors(
     network: nx.Graph,
-    …
-    …
+    significance_matrix: np.ndarray,
+    significant_binary_significance_matrix: np.ndarray,
     distance_threshold: float = 0.9,
 ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
     """Remove outliers based on their rank for edge lengths.

     Args:
         network (nx.Graph): The network graph with nodes having IDs matching the matrix indices.
-        …
-        …
+        significance_matrix (np.ndarray): The significance matrix.
+        significant_binary_significance_matrix (np.ndarray): The alpha threshold matrix.
         distance_threshold (float): Rank threshold (0 to 1) to determine outliers.

     Returns:
-        …
-        - np.ndarray: The updated …
+        Tuple[np.ndarray, np.ndarray, np.ndarray]:
+        - np.ndarray: The updated significance matrix with outliers set to zero.
         - np.ndarray: The updated alpha threshold matrix with outliers set to zero.
-        - np.ndarray: The significant …
+        - np.ndarray: The significant significance matrix, where non-significant entries are set to zero.
     """
-    # Identify indices with non-zero rows in the binary …
-    non_zero_indices = np.where(…
+    # Identify indices with non-zero rows in the binary significance matrix
+    non_zero_indices = np.where(significant_binary_significance_matrix.sum(axis=1) != 0)[0]
     median_distances = []
     for node in non_zero_indices:
         neighbors = [
-            n …
+            n
+            for n in network.neighbors(node)
+            if significant_binary_significance_matrix[n].sum() != 0
         ]
         if neighbors:
             median_distance = np.median(
@@ -416,22 +408,26 @@ def _prune_neighbors(
         neighbors = [
             n
             for n in network.neighbors(row_index)
-            if …
+            if significant_binary_significance_matrix[n].sum() != 0
         ]
         if neighbors:
             median_distance = np.median(
                 [_get_euclidean_distance(row_index, n, network) for n in neighbors]
             )
             if median_distance >= distance_threshold_value:
-                …
-                …
+                significance_matrix[row_index] = 0
+                significant_binary_significance_matrix[row_index] = 0

     # Create a matrix where non-significant entries are set to zero
-    …
-    …
+    significant_significance_matrix = np.where(
+        significant_binary_significance_matrix == 1, significance_matrix, 0
     )

-    return …
+    return (
+        significance_matrix,
+        significant_binary_significance_matrix,
+        significant_significance_matrix,
+    )


 def _get_euclidean_distance(node1: Any, node2: Any, network: nx.Graph) -> float:
@@ -481,7 +477,7 @@ def _calculate_threshold(median_distances: List, distance_threshold: float) -> f
     """
     # Sort the median distances
     sorted_distances = np.sort(median_distances)
-    # Compute the rank …
+    # Compute the rank fractions for the sorted distances
     rank_percentiles = np.linspace(0, 1, len(sorted_distances))
     # Interpolating the ranks to 1000 evenly spaced percentiles
     interpolated_percentiles = np.linspace(0, 1, 1000)