risk-network 0.0.8b27__py3-none-any.whl → 0.0.9b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- risk/__init__.py +1 -1
- risk/annotations/annotations.py +39 -38
- risk/log/enrichment.py +18 -0
- risk/neighborhoods/domains.py +15 -15
- risk/neighborhoods/neighborhoods.py +101 -89
- risk/network/graph.py +25 -25
- risk/network/plot/contour.py +1 -1
- risk/network/plot/labels.py +1 -1
- risk/network/plot/network.py +28 -28
- risk/network/plot/utils/color.py +27 -27
- risk/risk.py +20 -18
- risk/stats/stats.py +13 -13
- {risk_network-0.0.8b27.dist-info → risk_network-0.0.9b1.dist-info}/METADATA +1 -1
- {risk_network-0.0.8b27.dist-info → risk_network-0.0.9b1.dist-info}/RECORD +17 -16
- {risk_network-0.0.8b27.dist-info → risk_network-0.0.9b1.dist-info}/LICENSE +0 -0
- {risk_network-0.0.8b27.dist-info → risk_network-0.0.9b1.dist-info}/WHEEL +0 -0
- {risk_network-0.0.8b27.dist-info → risk_network-0.0.9b1.dist-info}/top_level.txt +0 -0
risk/__init__.py
CHANGED
risk/annotations/annotations.py
CHANGED
@@ -83,69 +83,69 @@ def load_annotations(network: nx.Graph, annotations_input: Dict[str, Any]) -> Di
 def define_top_annotations(
     network: nx.Graph,
     ordered_annotation_labels: List[str],
-    …
-    …
-    …
+    neighborhood_significance_sums: List[int],
+    significant_significance_matrix: np.ndarray,
+    significant_binary_significance_matrix: np.ndarray,
     min_cluster_size: int = 5,
     max_cluster_size: int = 1000,
 ) -> pd.DataFrame:
-    """Define top annotations based on neighborhood …
+    """Define top annotations based on neighborhood significance sums and binary significance matrix.

     Args:
         network (NetworkX graph): The network graph.
         ordered_annotation_labels (list of str): List of ordered annotation labels.
-        …
-        …
-        …
+        neighborhood_significance_sums (list of int): List of neighborhood significance sums.
+        significant_significance_matrix (np.ndarray): Enrichment matrix below alpha threshold.
+        significant_binary_significance_matrix (np.ndarray): Binary significance matrix below alpha threshold.
         min_cluster_size (int, optional): Minimum cluster size. Defaults to 5.
         max_cluster_size (int, optional): Maximum cluster size. Defaults to 1000.

     Returns:
         pd.DataFrame: DataFrame with top annotations and their properties.
     """
-    # Sum the columns of the significant …
-    …
-    # Create DataFrame to store annotations, their neighborhood …
-    …
+    # Sum the columns of the significant significance matrix (positive floating point values)
+    significant_significance_scores = significant_significance_matrix.sum(axis=0)
+    # Create DataFrame to store annotations, their neighborhood significance sums, and significance scores
+    annotations_significance_matrix = pd.DataFrame(
         {
             "id": range(len(ordered_annotation_labels)),
             "full_terms": ordered_annotation_labels,
-            "…
-            "…
+            "significant_neighborhood_significance_sums": neighborhood_significance_sums,
+            "significant_significance_score": significant_significance_scores,
         }
     )
-    …
+    annotations_significance_matrix["significant_annotations"] = False
     # Apply size constraints to identify potential significant annotations
-    …
+    annotations_significance_matrix.loc[
         (
-            …
+            annotations_significance_matrix["significant_neighborhood_significance_sums"]
             >= min_cluster_size
         )
         & (
-            …
+            annotations_significance_matrix["significant_neighborhood_significance_sums"]
             <= max_cluster_size
         ),
         "significant_annotations",
     ] = True
     # Initialize columns for connected components analysis
-    …
-    …
-    …
+    annotations_significance_matrix["num_connected_components"] = 0
+    annotations_significance_matrix["size_connected_components"] = None
+    annotations_significance_matrix["size_connected_components"] = annotations_significance_matrix[
         "size_connected_components"
     ].astype(object)
-    …
+    annotations_significance_matrix["num_large_connected_components"] = 0

-    for attribute in …
-    …
+    for attribute in annotations_significance_matrix.index.values[
+        annotations_significance_matrix["significant_annotations"]
     ]:
-        # Identify …
-        …
-            compress(list(network), …
+        # Identify significant neighborhoods based on the binary significance matrix
+        significant_neighborhoods = list(
+            compress(list(network), significant_binary_significance_matrix[:, attribute])
         )
-        …
-        # Analyze connected components within the …
+        significant_network = nx.subgraph(network, significant_neighborhoods)
+        # Analyze connected components within the significant subnetwork
         connected_components = sorted(
-            nx.connected_components(…
+            nx.connected_components(significant_network), key=len, reverse=True
         )
         size_connected_components = np.array([len(c) for c in connected_components])

@@ -159,23 +159,24 @@ def define_top_annotations(
         num_large_connected_components = len(filtered_size_connected_components)

         # Assign the number of connected components
-        …
+        annotations_significance_matrix.loc[attribute, "num_connected_components"] = (
             num_connected_components
         )
         # Filter out attributes with more than one connected component
-        …
-        …
+        annotations_significance_matrix.loc[
+            annotations_significance_matrix["num_connected_components"] > 1,
+            "significant_annotations",
         ] = False
         # Assign the number of large connected components
-        …
+        annotations_significance_matrix.loc[attribute, "num_large_connected_components"] = (
             num_large_connected_components
         )
         # Assign the size of connected components, ensuring it is always a list
-        …
+        annotations_significance_matrix.at[attribute, "size_connected_components"] = (
             filtered_size_connected_components.tolist()
         )

-    return …
+    return annotations_significance_matrix


 def get_weighted_description(words_column: pd.Series, scores_column: pd.Series) -> str:
@@ -184,16 +185,16 @@ def get_weighted_description(words_column: pd.Series, scores_column: pd.Series)

     Args:
         words_column (pd.Series): A pandas Series containing strings to process.
-        scores_column (pd.Series): A pandas Series containing …
+        scores_column (pd.Series): A pandas Series containing significance scores to weigh the terms.

     Returns:
-        str: A coherent description formed from the most frequent and significant words, weighed by …
+        str: A coherent description formed from the most frequent and significant words, weighed by significance scores.
     """
     # Handle case where all scores are the same
     if scores_column.max() == scores_column.min():
         normalized_scores = pd.Series([1] * len(scores_column))
     else:
-        # Normalize the …
+        # Normalize the significance scores to be between 0 and 1
        normalized_scores = (scores_column - scores_column.min()) / (
             scores_column.max() - scores_column.min()
         )
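For orientation, a minimal sketch of how the renamed define_top_annotations signature lines up after this change. The toy network and matrices below are invented for illustration; they are not taken from the package's tests:

import networkx as nx
import numpy as np

from risk.annotations.annotations import define_top_annotations

# Toy inputs: a 4-node path graph and 2 hypothetical annotation terms
network = nx.path_graph(4)
significance = np.array([[2.0, 0.0], [1.5, 0.0], [0.0, 3.0], [0.0, 1.0]])
binary = (significance > 0).astype(int)

top = define_top_annotations(
    network=network,
    ordered_annotation_labels=["term A", "term B"],
    neighborhood_significance_sums=binary.sum(axis=0).tolist(),
    significant_significance_matrix=significance,
    significant_binary_significance_matrix=binary,
    min_cluster_size=1,  # relaxed so the toy clusters pass the size filter
)
print(top[["full_terms", "significant_annotations"]])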
risk/log/enrichment.py
ADDED
@@ -0,0 +1,18 @@
+"""
+risk/log/enrichment
+~~~~~~~~~~~~~~~~~~~
+"""
+
+import csv
+import json
+import warnings
+from datetime import datetime
+from functools import wraps
+from typing import Any, Dict
+
+import numpy as np
+
+from .console import logger, log_header
+
+# Suppress all warnings - this is to resolve warnings from multiprocessing
+warnings.filterwarnings("ignore")
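One behavioral note on the new module: the module-level warnings.filterwarnings("ignore") takes effect for the whole process as soon as risk.log.enrichment is imported. A small standalone sketch of that side effect and of how a caller can opt back in locally:

import warnings

warnings.filterwarnings("ignore")  # same blanket filter as in risk/log/enrichment.py

def noisy():
    warnings.warn("suppressed", RuntimeWarning)

noisy()  # prints nothing: the filter swallows every warning process-wide

# Callers that still want warnings can re-enable them in a scoped block:
with warnings.catch_warnings():
    warnings.simplefilter("always")
    noisy()  # the RuntimeWarning is shown inside this block only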
risk/neighborhoods/domains.py
CHANGED
@@ -20,17 +20,17 @@ from risk.log import logger

 def define_domains(
     top_annotations: pd.DataFrame,
-    …
+    significant_neighborhoods_significance: np.ndarray,
     linkage_criterion: str,
     linkage_method: str,
     linkage_metric: str,
 ) -> pd.DataFrame:
-    """Define domains and assign nodes to these domains based on their …
+    """Define domains and assign nodes to these domains based on their significance scores and clustering,
     handling errors by assigning unique domains when clustering fails.

     Args:
         top_annotations (pd.DataFrame): DataFrame of top annotations data for the network nodes.
-        …
+        significant_neighborhoods_significance (np.ndarray): The binary significance matrix below alpha.
         linkage_criterion (str): The clustering criterion for defining groups.
         linkage_method (str): The linkage method for clustering.
         linkage_metric (str): The linkage metric for clustering.
@@ -40,7 +40,7 @@ def define_domains(
     """
     try:
         # Transpose the matrix to cluster annotations
-        m = …
+        m = significant_neighborhoods_significance[:, top_annotations["significant_annotations"]].T
         best_linkage, best_metric, best_threshold = _optimize_silhouette_across_linkage_and_metrics(
             m, linkage_criterion, linkage_method, linkage_metric
         )
@@ -65,13 +65,13 @@ def define_domains(
         top_annotations["domain"] = range(1, n_rows + 1)  # Assign unique domains

     # Create DataFrames to store domain information
-    …
-        data=…
+    node_to_significance = pd.DataFrame(
+        data=significant_neighborhoods_significance,
         columns=[top_annotations.index.values, top_annotations["domain"]],
     )
-    node_to_domain = …
+    node_to_domain = node_to_significance.groupby(level="domain", axis=1).sum()

-    # Find the maximum …
+    # Find the maximum significance score for each node
     t_max = node_to_domain.loc[:, 1:].max(axis=1)
     t_idxmax = node_to_domain.loc[:, 1:].idxmax(axis=1)
     t_idxmax[t_max == 0] = 0
@@ -119,27 +119,27 @@ def trim_domains_and_top_annotations(
     top_annotations["domain"].replace(to_remove, invalid_domain_id, inplace=True)
     domains.loc[domains["primary_domain"].isin(to_remove), ["primary_domain"]] = invalid_domain_id

-    # Normalize "num …
+    # Normalize "num significant neighborhoods" by percentile for each domain and scale to 0-10
     top_annotations["normalized_value"] = top_annotations.groupby("domain")[
-        "…
+        "significant_neighborhood_significance_sums"
     ].transform(lambda x: (x.rank(pct=True) * 10).apply(np.ceil).astype(int))
-    # Modify the lambda function to pass both full_terms and …
+    # Modify the lambda function to pass both full_terms and significant_significance_score
     top_annotations["combined_terms"] = top_annotations.apply(
         lambda row: " ".join([str(row["full_terms"])] * row["normalized_value"]), axis=1
     )

-    # Perform the groupby operation while retaining the other columns and adding the weighting with …
+    # Perform the groupby operation while retaining the other columns and adding the weighting with significance scores
     domain_labels = (
         top_annotations.groupby("domain")
         .agg(
             full_terms=("full_terms", lambda x: list(x)),
-            …
+            significance_scores=("significant_significance_score", lambda x: list(x)),
         )
         .reset_index()
     )
     domain_labels["combined_terms"] = domain_labels.apply(
         lambda row: get_weighted_description(
-            pd.Series(row["full_terms"]), pd.Series(row["…
+            pd.Series(row["full_terms"]), pd.Series(row["significance_scores"])
         ),
         axis=1,
     )
@@ -150,7 +150,7 @@ def trim_domains_and_top_annotations(
             "domain": "id",
             "combined_terms": "normalized_description",
             "full_terms": "full_descriptions",
-            "…
+            "significance_scores": "significance_scores",
         }
     ).set_index("id")

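The node_to_significance.groupby(level="domain", axis=1).sum() call above collapses annotation columns that share a domain label into one column per domain. A self-contained sketch of the same column-wise reduction on a toy frame (written in the transpose form, since groupby(axis=1) is deprecated in recent pandas):

import numpy as np
import pandas as pd

# Two annotations assigned to domain 1, one to domain 2 (toy data)
columns = pd.MultiIndex.from_arrays(
    [["a1", "a2", "a3"], [1, 1, 2]], names=["annotation", "domain"]
)
node_to_significance = pd.DataFrame(
    np.array([[1.0, 2.0, 0.0], [0.0, 0.5, 3.0]]), columns=columns
)

# Sum significance per domain for each node, as in define_domains
node_to_domain = node_to_significance.T.groupby(level="domain").sum().T
print(node_to_domain)  # domain 1 holds a1 + a2, domain 2 holds a3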
risk/neighborhoods/neighborhoods.py
CHANGED
@@ -171,163 +171,169 @@ def process_neighborhoods(

     Args:
         network (nx.Graph): The network data structure used for imputing and pruning neighbors.
-        neighborhoods (Dict[str, Any]): Dictionary containing '…
+        neighborhoods (Dict[str, Any]): Dictionary containing 'significance_matrix', 'significant_binary_significance_matrix', and 'significant_significance_matrix'.
         impute_depth (int, optional): Depth for imputing neighbors. Defaults to 0.
         prune_threshold (float, optional): Distance threshold for pruning neighbors. Defaults to 0.0.

     Returns:
-        Dict[str, Any]: Processed neighborhoods data, including the updated matrices and …
+        Dict[str, Any]: Processed neighborhoods data, including the updated matrices and significance counts.
     """
-    …
-    …
-    …
+    significance_matrix = neighborhoods["significance_matrix"]
+    significant_binary_significance_matrix = neighborhoods["significant_binary_significance_matrix"]
+    significant_significance_matrix = neighborhoods["significant_significance_matrix"]
     logger.debug(f"Imputation depth: {impute_depth}")
     if impute_depth:
         (
-            …
-            …
-            …
+            significance_matrix,
+            significant_binary_significance_matrix,
+            significant_significance_matrix,
         ) = _impute_neighbors(
             network,
-            …
-            …
+            significance_matrix,
+            significant_binary_significance_matrix,
             max_depth=impute_depth,
         )

     logger.debug(f"Pruning threshold: {prune_threshold}")
     if prune_threshold:
         (
-            …
-            …
-            …
+            significance_matrix,
+            significant_binary_significance_matrix,
+            significant_significance_matrix,
         ) = _prune_neighbors(
             network,
-            …
-            …
+            significance_matrix,
+            significant_binary_significance_matrix,
             distance_threshold=prune_threshold,
         )

-    …
-    …
+    neighborhood_significance_counts = np.sum(significant_binary_significance_matrix, axis=0)
+    node_significance_sums = np.sum(significance_matrix, axis=1)
     return {
-        "…
-        "…
-        "…
-        "…
-        "…
+        "significance_matrix": significance_matrix,
+        "significant_binary_significance_matrix": significant_binary_significance_matrix,
+        "significant_significance_matrix": significant_significance_matrix,
+        "neighborhood_significance_counts": neighborhood_significance_counts,
+        "node_significance_sums": node_significance_sums,
     }


 def _impute_neighbors(
     network: nx.Graph,
-    …
-    …
+    significance_matrix: np.ndarray,
+    significant_binary_significance_matrix: np.ndarray,
     max_depth: int = 3,
 ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
-    """Impute rows with sums of zero in the …
+    """Impute rows with sums of zero in the significance matrix based on the closest non-zero neighbors in the network graph.

     Args:
         network (nx.Graph): The network graph with nodes having IDs matching the matrix indices.
-        …
-        …
+        significance_matrix (np.ndarray): The significance matrix with rows to be imputed.
+        significant_binary_significance_matrix (np.ndarray): The alpha threshold matrix to be imputed similarly.
         max_depth (int): Maximum depth of nodes to traverse for imputing values.

     Returns:
         tuple: A tuple containing:
-            - np.ndarray: The imputed …
+            - np.ndarray: The imputed significance matrix.
             - np.ndarray: The imputed alpha threshold matrix.
-            - np.ndarray: The significant …
+            - np.ndarray: The significant significance matrix with non-significant entries set to zero.
     """
     # Calculate the distance threshold value based on the shortest distances
-    …
-        network, …
+    significance_matrix, significant_binary_significance_matrix = _impute_neighbors_with_similarity(
+        network, significance_matrix, significant_binary_significance_matrix, max_depth=max_depth
     )
     # Create a matrix where non-significant entries are set to zero
-    …
-    …
+    significant_significance_matrix = np.where(
+        significant_binary_significance_matrix == 1, significance_matrix, 0
     )

-    return …
+    return (
+        significance_matrix,
+        significant_binary_significance_matrix,
+        significant_significance_matrix,
+    )


 def _impute_neighbors_with_similarity(
     network: nx.Graph,
-    …
-    …
+    significance_matrix: np.ndarray,
+    significant_binary_significance_matrix: np.ndarray,
     max_depth: int = 3,
 ) -> Tuple[np.ndarray, np.ndarray]:
-    """Impute non-…
+    """Impute non-significant nodes based on the closest significant neighbors' profiles and their similarity.

     Args:
         network (nx.Graph): The network graph with nodes having IDs matching the matrix indices.
-        …
-        …
+        significance_matrix (np.ndarray): The significance matrix with rows to be imputed.
+        significant_binary_significance_matrix (np.ndarray): The alpha threshold matrix to be imputed similarly.
         max_depth (int): Maximum depth of nodes to traverse for imputing values.

     Returns:
         Tuple[np.ndarray, np.ndarray]: A tuple containing:
-            - The imputed …
+            - The imputed significance matrix.
             - The imputed alpha threshold matrix.
     """
     depth = 1
-    rows_to_impute = np.where(…
+    rows_to_impute = np.where(significant_binary_significance_matrix.sum(axis=1) == 0)[0]
     while len(rows_to_impute) and depth <= max_depth:
-        # Iterate over all …
-        for row_index in range(…
-            if …
-        …
-        …
-        …
-        …
-        …
-        …
+        # Iterate over all significant nodes
+        for row_index in range(significant_binary_significance_matrix.shape[0]):
+            if significant_binary_significance_matrix[row_index].sum() != 0:
+                significance_matrix, significant_binary_significance_matrix = (
+                    _process_node_imputation(
+                        row_index,
+                        network,
+                        significance_matrix,
+                        significant_binary_significance_matrix,
+                        depth,
+                    )
                 )

         # Update rows to impute for the next iteration
-        rows_to_impute = np.where(…
+        rows_to_impute = np.where(significant_binary_significance_matrix.sum(axis=1) == 0)[0]
         depth += 1

-    return …
+    return significance_matrix, significant_binary_significance_matrix


 def _process_node_imputation(
     row_index: int,
     network: nx.Graph,
-    …
-    …
+    significance_matrix: np.ndarray,
+    significant_binary_significance_matrix: np.ndarray,
     depth: int,
 ) -> Tuple[np.ndarray, np.ndarray]:
-    """Process the imputation for a single node based on its …
+    """Process the imputation for a single node based on its significant neighbors.

     Args:
-        row_index (int): The index of the …
+        row_index (int): The index of the significant node being processed.
         network (nx.Graph): The network graph with nodes having IDs matching the matrix indices.
-        …
-        …
+        significance_matrix (np.ndarray): The significance matrix with rows to be imputed.
+        significant_binary_significance_matrix (np.ndarray): The alpha threshold matrix to be imputed similarly.
         depth (int): Current depth for traversal.

     Returns:
-        Tuple[np.ndarray, np.ndarray]: The modified …
+        Tuple[np.ndarray, np.ndarray]: The modified significance matrix and binary threshold matrix.
     """
     # Check neighbors at the current depth
     neighbors = nx.single_source_shortest_path_length(network, row_index, cutoff=depth)
-    # Filter annotated neighbors (already …
+    # Filter annotated neighbors (already significant)
     annotated_neighbors = [
         n
         for n in neighbors
         if n != row_index
-        and …
-        and …
+        and significant_binary_significance_matrix[n].sum() != 0
+        and significance_matrix[n].sum() != 0
     ]
-    # Filter non-…
+    # Filter non-significant neighbors
     valid_neighbors = [
         n
         for n in neighbors
         if n != row_index
-        and …
-        and …
+        and significant_binary_significance_matrix[n].sum() == 0
+        and significance_matrix[n].sum() == 0
     ]
-    # If there are valid non-…
+    # If there are valid non-significant neighbors
     if valid_neighbors and annotated_neighbors:
         # Calculate distances to annotated neighbors
         distances_to_annotated = [
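The renamed dictionary returned by process_neighborhoods keeps its two derived entries as simple reductions of the matrices. A sketch with fabricated arrays (not package output) showing those invariants:

import numpy as np

# Fabricated stand-in for the dict returned by process_neighborhoods
processed = {
    "significance_matrix": np.array([[0.2, 1.1], [0.0, 0.0]]),
    "significant_binary_significance_matrix": np.array([[0, 1], [0, 0]]),
    "significant_significance_matrix": np.array([[0.0, 1.1], [0.0, 0.0]]),
    "neighborhood_significance_counts": np.array([0, 1]),
    "node_significance_sums": np.array([1.3, 0.0]),
}

# Per the diff: counts are column sums of the binary matrix,
# and node sums are row sums of the significance matrix.
assert np.array_equal(
    processed["neighborhood_significance_counts"],
    processed["significant_binary_significance_matrix"].sum(axis=0),
)
assert np.allclose(
    processed["node_significance_sums"],
    processed["significance_matrix"].sum(axis=1),
)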
@@ -338,7 +344,7 @@ def _process_node_imputation(
         iqr = q3 - q1
         lower_bound = q1 - 1.5 * iqr
         upper_bound = q3 + 1.5 * iqr
-        # Filter valid non-…
+        # Filter valid non-significant neighbors that fall within the IQR bounds
         valid_neighbors_within_iqr = [
             n
             for n in valid_neighbors
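The filter around these bounds is the textbook IQR rule: candidates whose distance to the annotated set falls outside [q1 - 1.5·IQR, q3 + 1.5·IQR] are excluded. A standalone numeric sketch:

import numpy as np

distances_to_annotated = np.array([1.0, 1.2, 1.1, 0.9, 5.0])  # toy distances
q1, q3 = np.percentile(distances_to_annotated, [25, 75])
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
within_iqr = [d for d in distances_to_annotated if lower_bound <= d <= upper_bound]
print(within_iqr)  # the 5.0 outlier is dropped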
@@ -352,8 +358,8 @@ def _process_node_imputation(
         def sum_pairwise_cosine_similarities(neighbor):
             return sum(
                 cosine_similarity(
-                    …
-                    …
+                    significance_matrix[neighbor].reshape(1, -1),
+                    significance_matrix[other_neighbor].reshape(1, -1),
                 )[0][0]
                 for other_neighbor in valid_neighbors_within_iqr
                 if other_neighbor != neighbor
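sum_pairwise_cosine_similarities ranks each candidate row by its aggregate similarity to the other candidates; the diff only renames the matrix it reads from. A minimal standalone version of that selection, using the same scikit-learn cosine_similarity call on toy rows:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

significance_matrix = np.array([[1.0, 0.0, 2.0], [0.9, 0.1, 1.8], [0.0, 5.0, 0.0]])
candidates = [0, 1, 2]  # hypothetical valid_neighbors_within_iqr

def sum_pairwise_cosine_similarities(neighbor):
    # Sum of cosine similarities between this row and every other candidate row
    return sum(
        cosine_similarity(
            significance_matrix[neighbor].reshape(1, -1),
            significance_matrix[other_neighbor].reshape(1, -1),
        )[0][0]
        for other_neighbor in candidates
        if other_neighbor != neighbor
    )

most_similar = max(candidates, key=sum_pairwise_cosine_similarities)
print(most_similar)  # 1: nearly parallel to row 0 and slightly aligned with row 2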
@@ -365,43 +371,45 @@ def _process_node_imputation(
         else:
             most_similar_neighbor = valid_neighbors_within_iqr[0]

-        # Impute the most similar non-…
-        …
+        # Impute the most similar non-significant neighbor with the significant node's data, scaled by depth
+        significance_matrix[most_similar_neighbor] = significance_matrix[row_index] / np.sqrt(
             depth + 1
         )
-        …
-        …
+        significant_binary_significance_matrix[most_similar_neighbor] = (
+            significant_binary_significance_matrix[row_index]
         )

-    return …
+    return significance_matrix, significant_binary_significance_matrix


 def _prune_neighbors(
     network: nx.Graph,
-    …
-    …
+    significance_matrix: np.ndarray,
+    significant_binary_significance_matrix: np.ndarray,
     distance_threshold: float = 0.9,
 ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
     """Remove outliers based on their rank for edge lengths.

     Args:
         network (nx.Graph): The network graph with nodes having IDs matching the matrix indices.
-        …
-        …
+        significance_matrix (np.ndarray): The significance matrix.
+        significant_binary_significance_matrix (np.ndarray): The alpha threshold matrix.
         distance_threshold (float): Rank threshold (0 to 1) to determine outliers.

     Returns:
         tuple: A tuple containing:
-            - np.ndarray: The updated …
+            - np.ndarray: The updated significance matrix with outliers set to zero.
             - np.ndarray: The updated alpha threshold matrix with outliers set to zero.
-            - np.ndarray: The significant …
+            - np.ndarray: The significant significance matrix, where non-significant entries are set to zero.
     """
-    # Identify indices with non-zero rows in the binary …
-    non_zero_indices = np.where(…
+    # Identify indices with non-zero rows in the binary significance matrix
+    non_zero_indices = np.where(significant_binary_significance_matrix.sum(axis=1) != 0)[0]
     median_distances = []
     for node in non_zero_indices:
         neighbors = [
-            n …
+            n
+            for n in network.neighbors(node)
+            if significant_binary_significance_matrix[n].sum() != 0
         ]
         if neighbors:
             median_distance = np.median(
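The imputation above copies the significant node's profile into its most similar neighbor divided by sqrt(depth + 1), so donors found at greater traversal depth contribute proportionally weaker values. A quick numeric illustration:

import numpy as np

donor_row = np.array([4.0, 9.0])  # a significant node's significance profile

for depth in (1, 2, 3):
    imputed = donor_row / np.sqrt(depth + 1)
    print(depth, imputed)
# depth 1 -> [2.83 6.36], depth 3 -> [2. 4.5]: deeper donors are damped more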
@@ -416,22 +424,26 @@ def _prune_neighbors(
         neighbors = [
             n
             for n in network.neighbors(row_index)
-            if …
+            if significant_binary_significance_matrix[n].sum() != 0
         ]
         if neighbors:
             median_distance = np.median(
                 [_get_euclidean_distance(row_index, n, network) for n in neighbors]
             )
             if median_distance >= distance_threshold_value:
-                …
-                …
+                significance_matrix[row_index] = 0
+                significant_binary_significance_matrix[row_index] = 0

     # Create a matrix where non-significant entries are set to zero
-    …
-    …
+    significant_significance_matrix = np.where(
+        significant_binary_significance_matrix == 1, significance_matrix, 0
     )

-    return …
+    return (
+        significance_matrix,
+        significant_binary_significance_matrix,
+        significant_significance_matrix,
+    )


 def _get_euclidean_distance(node1: Any, node2: Any, network: nx.Graph) -> float:
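distance_threshold here is a rank threshold over the per-node median edge distances collected above, and distance_threshold_value is the cut-off it induces; the helper that computes it is not shown in this diff. A hedged sketch of one plausible realization with np.quantile (an assumption about the idea, not the package's exact code):

import numpy as np

median_distances = [0.4, 0.5, 0.55, 0.6, 2.0]  # toy per-node median distances
distance_threshold = 0.9  # rank threshold in (0, 1), as in _prune_neighbors

# Plausible realization: the 90th percentile of the medians becomes the cut-off
distance_threshold_value = np.quantile(median_distances, distance_threshold)
pruned = [d for d in median_distances if d >= distance_threshold_value]
print(distance_threshold_value, pruned)  # 1.44 [2.0]: only the outlier is pruned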