risk-network 0.0.12b1__py3-none-any.whl → 0.0.12b3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- risk/__init__.py +1 -1
- risk/annotation/__init__.py +10 -0
- risk/{annotations/annotations.py → annotation/annotation.py} +44 -44
- risk/{annotations → annotation}/io.py +46 -46
- risk/{annotations → annotation}/nltk_setup.py +4 -4
- risk/log/parameters.py +5 -5
- risk/neighborhoods/api.py +36 -36
- risk/neighborhoods/domains.py +20 -24
- risk/neighborhoods/neighborhoods.py +4 -4
- risk/neighborhoods/stats/permutation/permutation.py +17 -17
- risk/neighborhoods/stats/permutation/test_functions.py +2 -2
- risk/neighborhoods/stats/tests.py +41 -41
- risk/network/graph/api.py +17 -17
- risk/network/graph/graph.py +17 -11
- risk/network/graph/summary.py +10 -10
- risk/network/io.py +12 -12
- risk/network/plotter/canvas.py +1 -1
- risk/network/plotter/contour.py +3 -3
- risk/network/plotter/labels.py +72 -74
- risk/network/plotter/network.py +6 -6
- risk/network/plotter/plotter.py +6 -6
- risk/network/plotter/utils/colors.py +12 -8
- risk/network/plotter/utils/layout.py +3 -3
- risk/risk.py +2 -2
- {risk_network-0.0.12b1.dist-info → risk_network-0.0.12b3.dist-info}/METADATA +1 -1
- risk_network-0.0.12b3.dist-info/RECORD +40 -0
- {risk_network-0.0.12b1.dist-info → risk_network-0.0.12b3.dist-info}/WHEEL +1 -1
- risk/annotations/__init__.py +0 -10
- risk_network-0.0.12b1.dist-info/RECORD +0 -40
- {risk_network-0.0.12b1.dist-info → risk_network-0.0.12b3.dist-info}/licenses/LICENSE +0 -0
- {risk_network-0.0.12b1.dist-info → risk_network-0.0.12b3.dist-info}/top_level.txt +0 -0
risk/neighborhoods/api.py
CHANGED
@@ -28,13 +28,13 @@ class NeighborhoodsAPI:
|
|
28
28
|
The NeighborhoodsAPI class provides methods to load neighborhood results from statistical tests.
|
29
29
|
"""
|
30
30
|
|
31
|
-
def __init__() -> None:
|
31
|
+
def __init__(self) -> None:
|
32
32
|
pass
|
33
33
|
|
34
|
-
def
|
34
|
+
def load_neighborhoods_binom(
|
35
35
|
self,
|
36
36
|
network: nx.Graph,
|
37
|
-
|
37
|
+
annotation: Dict[str, Any],
|
38
38
|
distance_metric: Union[str, List, Tuple, np.ndarray] = "louvain",
|
39
39
|
louvain_resolution: float = 0.1,
|
40
40
|
leiden_resolution: float = 1.0,
|
@@ -46,7 +46,7 @@ class NeighborhoodsAPI:
|
|
46
46
|
|
47
47
|
Args:
|
48
48
|
network (nx.Graph): The network graph.
|
49
|
-
|
49
|
+
annotation (Dict[str, Any]): The annotation associated with the network.
|
50
50
|
distance_metric (str, List, Tuple, or np.ndarray, optional): The distance metric(s) to use. Can be a string for one
|
51
51
|
metric or a list/tuple/ndarray of metrics ('greedy_modularity', 'louvain', 'leiden', 'label_propagation',
|
52
52
|
'markov_clustering', 'walktrap', 'spinglass'). Defaults to 'louvain'.
|
@@ -55,7 +55,7 @@ class NeighborhoodsAPI:
|
|
55
55
|
fraction_shortest_edges (float, List, Tuple, or np.ndarray, optional): Shortest edge rank fraction threshold(s) for creating subgraphs.
|
56
56
|
Can be a single float for one threshold or a list/tuple of floats corresponding to multiple thresholds.
|
57
57
|
Defaults to 0.5.
|
58
|
-
null_distribution (str, optional): Type of null distribution ('network' or '
|
58
|
+
null_distribution (str, optional): Type of null distribution ('network' or 'annotation'). Defaults to "network".
|
59
59
|
random_seed (int, optional): Seed for random number generation. Defaults to 888.
|
60
60
|
|
61
61
|
Returns:
|
@@ -65,7 +65,7 @@ class NeighborhoodsAPI:
|
|
65
65
|
# Compute neighborhood significance using the binomial test
|
66
66
|
return self._load_neighborhoods_by_statistical_test(
|
67
67
|
network=network,
|
68
|
-
|
68
|
+
annotation=annotation,
|
69
69
|
distance_metric=distance_metric,
|
70
70
|
louvain_resolution=louvain_resolution,
|
71
71
|
leiden_resolution=leiden_resolution,
|
@@ -76,10 +76,10 @@ class NeighborhoodsAPI:
|
|
76
76
|
statistical_test_function=compute_binom_test,
|
77
77
|
)
|
78
78
|
|
79
|
-
def
|
79
|
+
def load_neighborhoods_chi2(
|
80
80
|
self,
|
81
81
|
network: nx.Graph,
|
82
|
-
|
82
|
+
annotation: Dict[str, Any],
|
83
83
|
distance_metric: Union[str, List, Tuple, np.ndarray] = "louvain",
|
84
84
|
louvain_resolution: float = 0.1,
|
85
85
|
leiden_resolution: float = 1.0,
|
@@ -91,7 +91,7 @@ class NeighborhoodsAPI:
|
|
91
91
|
|
92
92
|
Args:
|
93
93
|
network (nx.Graph): The network graph.
|
94
|
-
|
94
|
+
annotation (Dict[str, Any]): The annotation associated with the network.
|
95
95
|
distance_metric (str, List, Tuple, or np.ndarray, optional): The distance metric(s) to use. Can be a string for one
|
96
96
|
metric or a list/tuple/ndarray of metrics ('greedy_modularity', 'louvain', 'leiden', 'label_propagation',
|
97
97
|
'markov_clustering', 'walktrap', 'spinglass'). Defaults to 'louvain'.
|
@@ -100,7 +100,7 @@ class NeighborhoodsAPI:
|
|
100
100
|
fraction_shortest_edges (float, List, Tuple, or np.ndarray, optional): Shortest edge rank fraction threshold(s) for creating subgraphs.
|
101
101
|
Can be a single float for one threshold or a list/tuple of floats corresponding to multiple thresholds.
|
102
102
|
Defaults to 0.5.
|
103
|
-
null_distribution (str, optional): Type of null distribution ('network' or '
|
103
|
+
null_distribution (str, optional): Type of null distribution ('network' or 'annotation'). Defaults to "network".
|
104
104
|
random_seed (int, optional): Seed for random number generation. Defaults to 888.
|
105
105
|
|
106
106
|
Returns:
|
@@ -110,7 +110,7 @@ class NeighborhoodsAPI:
|
|
110
110
|
# Compute neighborhood significance using the chi-squared test
|
111
111
|
return self._load_neighborhoods_by_statistical_test(
|
112
112
|
network=network,
|
113
|
-
|
113
|
+
annotation=annotation,
|
114
114
|
distance_metric=distance_metric,
|
115
115
|
louvain_resolution=louvain_resolution,
|
116
116
|
leiden_resolution=leiden_resolution,
|
@@ -121,10 +121,10 @@ class NeighborhoodsAPI:
|
|
121
121
|
statistical_test_function=compute_chi2_test,
|
122
122
|
)
|
123
123
|
|
124
|
-
def
|
124
|
+
def load_neighborhoods_hypergeom(
|
125
125
|
self,
|
126
126
|
network: nx.Graph,
|
127
|
-
|
127
|
+
annotation: Dict[str, Any],
|
128
128
|
distance_metric: Union[str, List, Tuple, np.ndarray] = "louvain",
|
129
129
|
louvain_resolution: float = 0.1,
|
130
130
|
leiden_resolution: float = 1.0,
|
@@ -136,7 +136,7 @@ class NeighborhoodsAPI:
|
|
136
136
|
|
137
137
|
Args:
|
138
138
|
network (nx.Graph): The network graph.
|
139
|
-
|
139
|
+
annotation (Dict[str, Any]): The annotation associated with the network.
|
140
140
|
distance_metric (str, List, Tuple, or np.ndarray, optional): The distance metric(s) to use. Can be a string for one
|
141
141
|
metric or a list/tuple/ndarray of metrics ('greedy_modularity', 'louvain', 'leiden', 'label_propagation',
|
142
142
|
'markov_clustering', 'walktrap', 'spinglass'). Defaults to 'louvain'.
|
@@ -145,7 +145,7 @@ class NeighborhoodsAPI:
|
|
145
145
|
fraction_shortest_edges (float, List, Tuple, or np.ndarray, optional): Shortest edge rank fraction threshold(s) for creating subgraphs.
|
146
146
|
Can be a single float for one threshold or a list/tuple of floats corresponding to multiple thresholds.
|
147
147
|
Defaults to 0.5.
|
148
|
-
null_distribution (str, optional): Type of null distribution ('network' or '
|
148
|
+
null_distribution (str, optional): Type of null distribution ('network' or 'annotation'). Defaults to "network".
|
149
149
|
random_seed (int, optional): Seed for random number generation. Defaults to 888.
|
150
150
|
|
151
151
|
Returns:
|
@@ -155,7 +155,7 @@ class NeighborhoodsAPI:
|
|
155
155
|
# Compute neighborhood significance using the hypergeometric test
|
156
156
|
return self._load_neighborhoods_by_statistical_test(
|
157
157
|
network=network,
|
158
|
-
|
158
|
+
annotation=annotation,
|
159
159
|
distance_metric=distance_metric,
|
160
160
|
louvain_resolution=louvain_resolution,
|
161
161
|
leiden_resolution=leiden_resolution,
|
@@ -166,10 +166,10 @@ class NeighborhoodsAPI:
|
|
166
166
|
statistical_test_function=compute_hypergeom_test,
|
167
167
|
)
|
168
168
|
|
169
|
-
def
|
169
|
+
def load_neighborhoods_permutation(
|
170
170
|
self,
|
171
171
|
network: nx.Graph,
|
172
|
-
|
172
|
+
annotation: Dict[str, Any],
|
173
173
|
distance_metric: Union[str, List, Tuple, np.ndarray] = "louvain",
|
174
174
|
louvain_resolution: float = 0.1,
|
175
175
|
leiden_resolution: float = 1.0,
|
@@ -184,7 +184,7 @@ class NeighborhoodsAPI:
|
|
184
184
|
|
185
185
|
Args:
|
186
186
|
network (nx.Graph): The network graph.
|
187
|
-
|
187
|
+
annotation (Dict[str, Any]): The annotation associated with the network.
|
188
188
|
distance_metric (str, List, Tuple, or np.ndarray, optional): The distance metric(s) to use. Can be a string for one
|
189
189
|
metric or a list/tuple/ndarray of metrics ('greedy_modularity', 'louvain', 'leiden', 'label_propagation',
|
190
190
|
'markov_clustering', 'walktrap', 'spinglass'). Defaults to 'louvain'.
|
@@ -194,7 +194,7 @@ class NeighborhoodsAPI:
|
|
194
194
|
Can be a single float for one threshold or a list/tuple of floats corresponding to multiple thresholds.
|
195
195
|
Defaults to 0.5.
|
196
196
|
score_metric (str, optional): Scoring metric for neighborhood significance. Defaults to "sum".
|
197
|
-
null_distribution (str, optional): Type of null distribution ('network' or '
|
197
|
+
null_distribution (str, optional): Type of null distribution ('network' or 'annotation'). Defaults to "network".
|
198
198
|
num_permutations (int, optional): Number of permutations for significance testing. Defaults to 1000.
|
199
199
|
random_seed (int, optional): Seed for random number generation. Defaults to 888.
|
200
200
|
max_workers (int, optional): Maximum number of workers for parallel computation. Defaults to 1.
|
@@ -210,7 +210,7 @@ class NeighborhoodsAPI:
|
|
210
210
|
# Compute neighborhood significance using the permutation test
|
211
211
|
return self._load_neighborhoods_by_statistical_test(
|
212
212
|
network=network,
|
213
|
-
|
213
|
+
annotation=annotation,
|
214
214
|
distance_metric=distance_metric,
|
215
215
|
louvain_resolution=louvain_resolution,
|
216
216
|
leiden_resolution=leiden_resolution,
|
@@ -224,10 +224,10 @@ class NeighborhoodsAPI:
|
|
224
224
|
max_workers=max_workers,
|
225
225
|
)
|
226
226
|
|
227
|
-
def
|
227
|
+
def load_neighborhoods_poisson(
|
228
228
|
self,
|
229
229
|
network: nx.Graph,
|
230
|
-
|
230
|
+
annotation: Dict[str, Any],
|
231
231
|
distance_metric: Union[str, List, Tuple, np.ndarray] = "louvain",
|
232
232
|
louvain_resolution: float = 0.1,
|
233
233
|
leiden_resolution: float = 1.0,
|
@@ -239,7 +239,7 @@ class NeighborhoodsAPI:
|
|
239
239
|
|
240
240
|
Args:
|
241
241
|
network (nx.Graph): The network graph.
|
242
|
-
|
242
|
+
annotation (Dict[str, Any]): The annotation associated with the network.
|
243
243
|
distance_metric (str, List, Tuple, or np.ndarray, optional): The distance metric(s) to use. Can be a string for one
|
244
244
|
metric or a list/tuple/ndarray of metrics ('greedy_modularity', 'louvain', 'leiden', 'label_propagation',
|
245
245
|
'markov_clustering', 'walktrap', 'spinglass'). Defaults to 'louvain'.
|
@@ -248,7 +248,7 @@ class NeighborhoodsAPI:
|
|
248
248
|
fraction_shortest_edges (float, List, Tuple, or np.ndarray, optional): Shortest edge rank fraction threshold(s) for creating subgraphs.
|
249
249
|
Can be a single float for one threshold or a list/tuple of floats corresponding to multiple thresholds.
|
250
250
|
Defaults to 0.5.
|
251
|
-
null_distribution (str, optional): Type of null distribution ('network' or '
|
251
|
+
null_distribution (str, optional): Type of null distribution ('network' or 'annotation'). Defaults to "network".
|
252
252
|
random_seed (int, optional): Seed for random number generation. Defaults to 888.
|
253
253
|
|
254
254
|
Returns:
|
@@ -258,7 +258,7 @@ class NeighborhoodsAPI:
|
|
258
258
|
# Compute neighborhood significance using the Poisson test
|
259
259
|
return self._load_neighborhoods_by_statistical_test(
|
260
260
|
network=network,
|
261
|
-
|
261
|
+
annotation=annotation,
|
262
262
|
distance_metric=distance_metric,
|
263
263
|
louvain_resolution=louvain_resolution,
|
264
264
|
leiden_resolution=leiden_resolution,
|
@@ -269,10 +269,10 @@ class NeighborhoodsAPI:
|
|
269
269
|
statistical_test_function=compute_poisson_test,
|
270
270
|
)
|
271
271
|
|
272
|
-
def
|
272
|
+
def load_neighborhoods_zscore(
|
273
273
|
self,
|
274
274
|
network: nx.Graph,
|
275
|
-
|
275
|
+
annotation: Dict[str, Any],
|
276
276
|
distance_metric: Union[str, List, Tuple, np.ndarray] = "louvain",
|
277
277
|
louvain_resolution: float = 0.1,
|
278
278
|
leiden_resolution: float = 1.0,
|
@@ -284,7 +284,7 @@ class NeighborhoodsAPI:
|
|
284
284
|
|
285
285
|
Args:
|
286
286
|
network (nx.Graph): The network graph.
|
287
|
-
|
287
|
+
annotation (Dict[str, Any]): The annotation associated with the network.
|
288
288
|
distance_metric (str, List, Tuple, or np.ndarray, optional): The distance metric(s) to use. Can be a string for one
|
289
289
|
metric or a list/tuple/ndarray of metrics ('greedy_modularity', 'louvain', 'leiden', 'label_propagation',
|
290
290
|
'markov_clustering', 'walktrap', 'spinglass'). Defaults to 'louvain'.
|
@@ -293,7 +293,7 @@ class NeighborhoodsAPI:
|
|
293
293
|
fraction_shortest_edges (float, List, Tuple, or np.ndarray, optional): Shortest edge rank fraction threshold(s) for creating subgraphs.
|
294
294
|
Can be a single float for one threshold or a list/tuple of floats corresponding to multiple thresholds.
|
295
295
|
Defaults to 0.5.
|
296
|
-
null_distribution (str, optional): Type of null distribution ('network' or '
|
296
|
+
null_distribution (str, optional): Type of null distribution ('network' or 'annotation'). Defaults to "network".
|
297
297
|
random_seed (int, optional): Seed for random number generation. Defaults to 888.
|
298
298
|
|
299
299
|
Returns:
|
@@ -303,7 +303,7 @@ class NeighborhoodsAPI:
|
|
303
303
|
# Compute neighborhood significance using the z-score test
|
304
304
|
return self._load_neighborhoods_by_statistical_test(
|
305
305
|
network=network,
|
306
|
-
|
306
|
+
annotation=annotation,
|
307
307
|
distance_metric=distance_metric,
|
308
308
|
louvain_resolution=louvain_resolution,
|
309
309
|
leiden_resolution=leiden_resolution,
|
@@ -317,7 +317,7 @@ class NeighborhoodsAPI:
|
|
317
317
|
def _load_neighborhoods_by_statistical_test(
|
318
318
|
self,
|
319
319
|
network: nx.Graph,
|
320
|
-
|
320
|
+
annotation: Dict[str, Any],
|
321
321
|
distance_metric: Union[str, List, Tuple, np.ndarray] = "louvain",
|
322
322
|
louvain_resolution: float = 0.1,
|
323
323
|
leiden_resolution: float = 1.0,
|
@@ -332,7 +332,7 @@ class NeighborhoodsAPI:
|
|
332
332
|
|
333
333
|
Args:
|
334
334
|
network (nx.Graph): The input network graph.
|
335
|
-
|
335
|
+
annotation (Dict[str, Any]): Annotation data associated with the network, including a "matrix" key with annotation values.
|
336
336
|
distance_metric (Union[str, List, Tuple, np.ndarray], optional): The distance metric or clustering method to define neighborhoods.
|
337
337
|
Can be a string specifying one method (e.g., 'louvain', 'leiden') or a collection of methods.
|
338
338
|
Defaults to "louvain".
|
@@ -340,13 +340,13 @@ class NeighborhoodsAPI:
|
|
340
340
|
leiden_resolution (float, optional): Resolution parameter for Leiden clustering. Defaults to 1.0.
|
341
341
|
fraction_shortest_edges (Union[float, List, Tuple, np.ndarray], optional): Fraction of shortest edges to consider for creating subgraphs.
|
342
342
|
Can be a single value or a collection of thresholds for flexibility. Defaults to 0.5.
|
343
|
-
null_distribution (str, optional): The type of null distribution to use ('network' or '
|
343
|
+
null_distribution (str, optional): The type of null distribution to use ('network' or 'annotation').
|
344
344
|
Defaults to "network".
|
345
345
|
random_seed (int, optional): Seed for random number generation to ensure reproducibility. Defaults to 888.
|
346
346
|
statistical_test_key (str, optional): Key or name of the statistical test to be applied (e.g., "hypergeom", "poisson").
|
347
347
|
Used for logging and debugging. Defaults to "hypergeom".
|
348
348
|
statistical_test_function (Any, optional): The function implementing the statistical test.
|
349
|
-
It should accept neighborhoods,
|
349
|
+
It should accept neighborhoods, annotation, null distribution, and additional kwargs.
|
350
350
|
Defaults to `compute_hypergeom_test`.
|
351
351
|
**kwargs: Additional parameters to be passed to the statistical test function.
|
352
352
|
|
@@ -381,7 +381,7 @@ class NeighborhoodsAPI:
|
|
381
381
|
# Apply statistical test function to compute neighborhood significance
|
382
382
|
neighborhood_significance = statistical_test_function(
|
383
383
|
neighborhoods=neighborhoods,
|
384
|
-
|
384
|
+
annotation=annotation["matrix"],
|
385
385
|
null_distribution=null_distribution,
|
386
386
|
**kwargs,
|
387
387
|
)
|
risk/neighborhoods/domains.py
CHANGED
@@ -13,7 +13,7 @@ from scipy.cluster.hierarchy import fcluster, linkage
|
|
13
13
|
from sklearn.metrics import silhouette_score
|
14
14
|
from tqdm import tqdm
|
15
15
|
|
16
|
-
from risk.
|
16
|
+
from risk.annotation import get_weighted_description
|
17
17
|
from risk.log import logger
|
18
18
|
|
19
19
|
# Define constants for clustering
|
@@ -28,7 +28,7 @@ LINKAGE_METRICS = {
|
|
28
28
|
|
29
29
|
|
30
30
|
def define_domains(
|
31
|
-
|
31
|
+
top_annotation: pd.DataFrame,
|
32
32
|
significant_neighborhoods_significance: np.ndarray,
|
33
33
|
linkage_criterion: str,
|
34
34
|
linkage_method: str,
|
@@ -39,7 +39,7 @@ def define_domains(
|
|
39
39
|
handling errors by assigning unique domains when clustering fails.
|
40
40
|
|
41
41
|
Args:
|
42
|
-
|
42
|
+
top_annotation (pd.DataFrame): DataFrame of top annotations data for the network nodes.
|
43
43
|
significant_neighborhoods_significance (np.ndarray): The binary significance matrix below alpha.
|
44
44
|
linkage_criterion (str): The clustering criterion for defining groups. Choose "off" to disable clustering.
|
45
45
|
linkage_method (str): The linkage method for clustering. Choose "auto" to optimize.
|
@@ -57,7 +57,7 @@ def define_domains(
|
|
57
57
|
raise ValueError("Clustering is turned off.")
|
58
58
|
|
59
59
|
# Transpose the matrix to cluster annotations
|
60
|
-
m = significant_neighborhoods_significance[:,
|
60
|
+
m = significant_neighborhoods_significance[:, top_annotation["significant_annotation"]].T
|
61
61
|
# Safeguard the matrix by replacing NaN, Inf, and -Inf values
|
62
62
|
m = _safeguard_matrix(m)
|
63
63
|
# Optimize silhouette score across different linkage methods and distance metrics
|
@@ -71,27 +71,23 @@ def define_domains(
|
|
71
71
|
)
|
72
72
|
# Calculate the optimal threshold for clustering
|
73
73
|
max_d_optimal = np.max(Z[:, 2]) * best_threshold
|
74
|
-
# Assign domains to the
|
74
|
+
# Assign domains to the annotation matrix
|
75
75
|
domains = fcluster(Z, max_d_optimal, criterion=linkage_criterion)
|
76
|
-
|
77
|
-
|
76
|
+
top_annotation["domain"] = 0
|
77
|
+
top_annotation.loc[top_annotation["significant_annotation"], "domain"] = domains
|
78
78
|
except (ValueError, LinAlgError):
|
79
79
|
# If a ValueError is encountered, handle it by assigning unique domains
|
80
|
-
n_rows = len(
|
80
|
+
n_rows = len(top_annotation)
|
81
81
|
if linkage_criterion == "off":
|
82
|
-
logger.warning(
|
83
|
-
f"Clustering is turned off. Skipping clustering and assigning {n_rows} unique domains."
|
84
|
-
)
|
82
|
+
logger.warning("Clustering is turned off. Skipping clustering.")
|
85
83
|
else:
|
86
|
-
logger.error(
|
87
|
-
|
88
|
-
)
|
89
|
-
top_annotations["domain"] = range(1, n_rows + 1) # Assign unique domains
|
84
|
+
logger.error("Error encountered. Skipping clustering.")
|
85
|
+
top_annotation["domain"] = range(1, n_rows + 1) # Assign unique domains
|
90
86
|
|
91
87
|
# Create DataFrames to store domain information
|
92
88
|
node_to_significance = pd.DataFrame(
|
93
89
|
data=significant_neighborhoods_significance,
|
94
|
-
columns=[
|
90
|
+
columns=[top_annotation.index.values, top_annotation["domain"]],
|
95
91
|
)
|
96
92
|
node_to_domain = node_to_significance.T.groupby(level="domain").sum().T
|
97
93
|
|
@@ -112,15 +108,15 @@ def define_domains(
|
|
112
108
|
|
113
109
|
def trim_domains(
|
114
110
|
domains: pd.DataFrame,
|
115
|
-
|
111
|
+
top_annotation: pd.DataFrame,
|
116
112
|
min_cluster_size: int = 5,
|
117
113
|
max_cluster_size: int = 1000,
|
118
|
-
) -> Tuple[pd.DataFrame, pd.DataFrame
|
114
|
+
) -> Tuple[pd.DataFrame, pd.DataFrame]:
|
119
115
|
"""Trim domains that do not meet size criteria and find outliers.
|
120
116
|
|
121
117
|
Args:
|
122
118
|
domains (pd.DataFrame): DataFrame of domain data for the network nodes.
|
123
|
-
|
119
|
+
top_annotation (pd.DataFrame): DataFrame of top annotations data for the network nodes.
|
124
120
|
min_cluster_size (int, optional): Minimum size of a cluster to be retained. Defaults to 5.
|
125
121
|
max_cluster_size (int, optional): Maximum size of a cluster to be retained. Defaults to 1000.
|
126
122
|
|
@@ -139,21 +135,21 @@ def trim_domains(
|
|
139
135
|
invalid_domain_id = 888888
|
140
136
|
invalid_domain_ids = {0, invalid_domain_id}
|
141
137
|
# Mark domains to be removed
|
142
|
-
|
138
|
+
top_annotation["domain"] = top_annotation["domain"].replace(to_remove, invalid_domain_id)
|
143
139
|
domains.loc[domains["primary_domain"].isin(to_remove), ["primary_domain"]] = invalid_domain_id
|
144
140
|
|
145
141
|
# Normalize "num significant neighborhoods" by percentile for each domain and scale to 0-10
|
146
|
-
|
142
|
+
top_annotation["normalized_value"] = top_annotation.groupby("domain")[
|
147
143
|
"significant_neighborhood_significance_sums"
|
148
144
|
].transform(lambda x: (x.rank(pct=True) * 10).apply(np.ceil).astype(int))
|
149
145
|
# Modify the lambda function to pass both full_terms and significant_significance_score
|
150
|
-
|
146
|
+
top_annotation["combined_terms"] = top_annotation.apply(
|
151
147
|
lambda row: " ".join([str(row["full_terms"])] * row["normalized_value"]), axis=1
|
152
148
|
)
|
153
149
|
|
154
150
|
# Perform the groupby operation while retaining the other columns and adding the weighting with significance scores
|
155
151
|
domain_labels = (
|
156
|
-
|
152
|
+
top_annotation.groupby("domain")
|
157
153
|
.agg(
|
158
154
|
full_terms=("full_terms", lambda x: list(x)),
|
159
155
|
significance_scores=("significant_significance_score", lambda x: list(x)),
|
@@ -233,7 +229,7 @@ def _optimize_silhouette_across_linkage_and_metrics(
|
|
233
229
|
# Initialize best overall values
|
234
230
|
best_overall_method = linkage_method
|
235
231
|
best_overall_metric = linkage_metric
|
236
|
-
best_overall_threshold =
|
232
|
+
best_overall_threshold = 0.0
|
237
233
|
best_overall_score = -np.inf
|
238
234
|
|
239
235
|
# Set linkage methods and metrics to all combinations if "auto" is selected
|
risk/neighborhoods/neighborhoods.py
CHANGED
@@ -449,7 +449,7 @@ def _prune_neighbors(
|
|
449
449
|
)
|
450
450
|
|
451
451
|
|
452
|
-
def _get_euclidean_distance(node1: Any, node2: Any, network: nx.Graph) ->
|
452
|
+
def _get_euclidean_distance(node1: Any, node2: Any, network: nx.Graph) -> np.floating[Any]:
|
453
453
|
"""Calculate the Euclidean distance between two nodes in the network.
|
454
454
|
|
455
455
|
Args:
|
@@ -458,7 +458,7 @@ def _get_euclidean_distance(node1: Any, node2: Any, network: nx.Graph) -> float:
|
|
458
458
|
network (nx.Graph): The network graph containing the nodes.
|
459
459
|
|
460
460
|
Returns:
|
461
|
-
|
461
|
+
np.floating[Any]: The Euclidean distance between the two nodes.
|
462
462
|
"""
|
463
463
|
pos1 = _get_node_position(network, node1)
|
464
464
|
pos2 = _get_node_position(network, node2)
|
@@ -495,7 +495,7 @@ def _calculate_threshold(median_distances: List, distance_threshold: float) -> f
|
|
495
495
|
float: The calculated distance threshold value.
|
496
496
|
|
497
497
|
Raises:
|
498
|
-
ValueError: If no significant
|
498
|
+
ValueError: If no significant annotation is found in the median distances.
|
499
499
|
"""
|
500
500
|
# Sort the median distances
|
501
501
|
sorted_distances = np.sort(median_distances)
|
@@ -506,7 +506,7 @@ def _calculate_threshold(median_distances: List, distance_threshold: float) -> f
|
|
506
506
|
try:
|
507
507
|
smoothed_distances = np.interp(interpolated_percentiles, rank_percentiles, sorted_distances)
|
508
508
|
except ValueError as e:
|
509
|
-
raise ValueError("No significant
|
509
|
+
raise ValueError("No significant annotation found.") from e
|
510
510
|
|
511
511
|
# Determine the index corresponding to the distance threshold
|
512
512
|
threshold_index = int(np.ceil(distance_threshold * len(smoothed_distances))) - 1
|
risk/neighborhoods/stats/permutation/permutation.py
CHANGED
@@ -17,7 +17,7 @@ from risk.neighborhoods.stats.permutation.test_functions import DISPATCH_TEST_FU
|
|
17
17
|
|
18
18
|
def compute_permutation_test(
|
19
19
|
neighborhoods: csr_matrix,
|
20
|
-
|
20
|
+
annotation: csr_matrix,
|
21
21
|
score_metric: str = "sum",
|
22
22
|
null_distribution: str = "network",
|
23
23
|
num_permutations: int = 1000,
|
@@ -28,9 +28,9 @@ def compute_permutation_test(
|
|
28
28
|
|
29
29
|
Args:
|
30
30
|
neighborhoods (csr_matrix): Sparse binary matrix representing neighborhoods.
|
31
|
-
|
31
|
+
annotation (csr_matrix): Sparse binary matrix representing annotation.
|
32
32
|
score_metric (str, optional): Metric to use for scoring ('sum' or 'stdev'). Defaults to "sum".
|
33
|
-
null_distribution (str, optional): Type of null distribution ('network' or '
|
33
|
+
null_distribution (str, optional): Type of null distribution ('network' or 'annotation'). Defaults to "network".
|
34
34
|
num_permutations (int, optional): Number of permutations to run. Defaults to 1000.
|
35
35
|
random_seed (int, optional): Seed for random number generation. Defaults to 888.
|
36
36
|
max_workers (int, optional): Number of workers for multiprocessing. Defaults to 1.
|
@@ -41,14 +41,14 @@ def compute_permutation_test(
|
|
41
41
|
# Ensure that the matrices are in the correct format and free of NaN values
|
42
42
|
# NOTE: Keep the data type as float32 to avoid locking issues with dot product operations
|
43
43
|
neighborhoods = neighborhoods.astype(np.float32)
|
44
|
-
|
44
|
+
annotation = annotation.astype(np.float32)
|
45
45
|
# Retrieve the appropriate neighborhood score function based on the metric
|
46
46
|
neighborhood_score_func = DISPATCH_TEST_FUNCTIONS[score_metric]
|
47
47
|
|
48
48
|
# Run the permutation test to calculate depletion and enrichment counts
|
49
49
|
counts_depletion, counts_enrichment = _run_permutation_test(
|
50
50
|
neighborhoods=neighborhoods,
|
51
|
-
|
51
|
+
annotation=annotation,
|
52
52
|
neighborhood_score_func=neighborhood_score_func,
|
53
53
|
null_distribution=null_distribution,
|
54
54
|
num_permutations=num_permutations,
|
@@ -68,7 +68,7 @@ def compute_permutation_test(
|
|
68
68
|
|
69
69
|
def _run_permutation_test(
|
70
70
|
neighborhoods: csr_matrix,
|
71
|
-
|
71
|
+
annotation: csr_matrix,
|
72
72
|
neighborhood_score_func: Callable,
|
73
73
|
null_distribution: str = "network",
|
74
74
|
num_permutations: int = 1000,
|
@@ -79,9 +79,9 @@ def _run_permutation_test(
|
|
79
79
|
|
80
80
|
Args:
|
81
81
|
neighborhoods (csr_matrix): Sparse binary matrix representing neighborhoods.
|
82
|
-
|
82
|
+
annotation (csr_matrix): Sparse binary matrix representing annotation.
|
83
83
|
neighborhood_score_func (Callable): Function to calculate neighborhood scores.
|
84
|
-
null_distribution (str, optional): Type of null distribution ('network' or '
|
84
|
+
null_distribution (str, optional): Type of null distribution ('network' or 'annotation'). Defaults to "network".
|
85
85
|
num_permutations (int, optional): Number of permutations. Defaults to 1000.
|
86
86
|
random_seed (int, optional): Seed for random number generation. Defaults to 888.
|
87
87
|
max_workers (int, optional): Number of workers for multiprocessing. Defaults to 4.
|
@@ -96,17 +96,17 @@ def _run_permutation_test(
|
|
96
96
|
rng = np.random.default_rng(seed=random_seed)
|
97
97
|
# Determine the indices to use based on the null distribution type
|
98
98
|
if null_distribution == "network":
|
99
|
-
idxs = range(
|
100
|
-
elif null_distribution == "
|
101
|
-
idxs = np.nonzero(
|
99
|
+
idxs = range(annotation.shape[0])
|
100
|
+
elif null_distribution == "annotation":
|
101
|
+
idxs = np.nonzero(annotation.getnnz(axis=1) > 0)[0]
|
102
102
|
else:
|
103
103
|
raise ValueError(
|
104
|
-
"Invalid null_distribution value. Choose either 'network' or '
|
104
|
+
"Invalid null_distribution value. Choose either 'network' or 'annotation'."
|
105
105
|
)
|
106
106
|
|
107
|
-
# Replace NaNs with zeros in the sparse
|
108
|
-
|
109
|
-
annotation_matrix_obsv =
|
107
|
+
# Replace NaNs with zeros in the sparse annotation matrix
|
108
|
+
annotation.data[np.isnan(annotation.data)] = 0
|
109
|
+
annotation_matrix_obsv = annotation[idxs]
|
110
110
|
neighborhoods_matrix_obsv = neighborhoods.T[idxs].T
|
111
111
|
# Calculate observed neighborhood scores
|
112
112
|
with np.errstate(invalid="ignore", divide="ignore"):
|
@@ -142,7 +142,7 @@ def _run_permutation_test(
|
|
142
142
|
params_list = [
|
143
143
|
(
|
144
144
|
permutation_batches[i], # Pass the batch of precomputed permutations
|
145
|
-
|
145
|
+
annotation,
|
146
146
|
neighborhoods_matrix_obsv,
|
147
147
|
observed_neighborhood_scores,
|
148
148
|
neighborhood_score_func,
|
@@ -185,7 +185,7 @@ def _permutation_process_batch(
|
|
185
185
|
|
186
186
|
Args:
|
187
187
|
permutations (Union[List, Tuple, np.ndarray]): Permutation batch to process.
|
188
|
-
annotation_matrix (csr_matrix): Sparse binary matrix representing
|
188
|
+
annotation_matrix (csr_matrix): Sparse binary matrix representing annotation.
|
189
189
|
neighborhoods_matrix_obsv (csr_matrix): Sparse binary matrix representing observed neighborhoods.
|
190
190
|
observed_neighborhood_scores (np.ndarray): Observed neighborhood scores.
|
191
191
|
neighborhood_score_func (Callable): Function to calculate neighborhood scores.
|
risk/neighborhoods/stats/permutation/test_functions.py
CHANGED
@@ -24,7 +24,7 @@ def compute_neighborhood_score_by_sum(
|
|
24
24
|
Returns:
|
25
25
|
np.ndarray: Dense array of summed attribute values for each neighborhood.
|
26
26
|
"""
|
27
|
-
# Calculate the neighborhood score as the dot product of neighborhoods and
|
27
|
+
# Calculate the neighborhood score as the dot product of neighborhoods and annotation
|
28
28
|
neighborhood_score = neighborhoods_matrix @ annotation_matrix # Sparse matrix multiplication
|
29
29
|
# Convert the result to a dense array for downstream calculations
|
30
30
|
neighborhood_score_dense = neighborhood_score.toarray()
|
@@ -43,7 +43,7 @@ def compute_neighborhood_score_by_stdev(
|
|
43
43
|
Returns:
|
44
44
|
np.ndarray: Standard deviation of the neighborhood scores.
|
45
45
|
"""
|
46
|
-
# Calculate the neighborhood score as the dot product of neighborhoods and
|
46
|
+
# Calculate the neighborhood score as the dot product of neighborhoods and annotation
|
47
47
|
neighborhood_score = neighborhoods_matrix @ annotation_matrix # Sparse matrix multiplication
|
48
48
|
# Calculate the number of elements in each neighborhood (sum of rows)
|
49
49
|
N = neighborhoods_matrix.sum(axis=1).A.flatten() # Convert to 1D array
|