risk-network 0.0.9b38__py3-none-any.whl → 0.0.9b39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- risk/__init__.py +1 -1
- risk/neighborhoods/domains.py +111 -137
- risk/network/graph/api.py +3 -6
- {risk_network-0.0.9b38.dist-info → risk_network-0.0.9b39.dist-info}/METADATA +1 -1
- {risk_network-0.0.9b38.dist-info → risk_network-0.0.9b39.dist-info}/RECORD +8 -8
- {risk_network-0.0.9b38.dist-info → risk_network-0.0.9b39.dist-info}/LICENSE +0 -0
- {risk_network-0.0.9b38.dist-info → risk_network-0.0.9b39.dist-info}/WHEEL +0 -0
- {risk_network-0.0.9b38.dist-info → risk_network-0.0.9b39.dist-info}/top_level.txt +0 -0
risk/__init__.py
CHANGED
risk/neighborhoods/domains.py
CHANGED
@@ -10,8 +10,7 @@ from typing import Tuple, Union
|
|
10
10
|
import numpy as np
|
11
11
|
import pandas as pd
|
12
12
|
from scipy.cluster.hierarchy import linkage, fcluster
|
13
|
-
from
|
14
|
-
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, silhouette_score
|
13
|
+
from sklearn.metrics import silhouette_score
|
15
14
|
from tqdm import tqdm
|
16
15
|
|
17
16
|
from risk.annotations import get_weighted_description
|
@@ -19,19 +18,13 @@ from risk.constants import GROUP_LINKAGE_METHODS, GROUP_DISTANCE_METRICS
|
|
19
18
|
from risk.log import logger
|
20
19
|
|
21
20
|
|
22
|
-
class LinkageThresholdError(Exception):
|
23
|
-
"""Exception raised for errors in the linkage threshold optimization process."""
|
24
|
-
|
25
|
-
pass
|
26
|
-
|
27
|
-
|
28
21
|
def define_domains(
|
29
22
|
top_annotations: pd.DataFrame,
|
30
23
|
significant_neighborhoods_significance: np.ndarray,
|
31
24
|
linkage_criterion: str,
|
32
25
|
linkage_method: str,
|
33
26
|
linkage_metric: str,
|
34
|
-
linkage_threshold:
|
27
|
+
linkage_threshold: float,
|
35
28
|
) -> pd.DataFrame:
|
36
29
|
"""Define domains and assign nodes to these domains based on their significance scores and clustering,
|
37
30
|
handling errors by assigning unique domains when clustering fails.
|
@@ -39,19 +32,13 @@ def define_domains(
|
|
39
32
|
Args:
|
40
33
|
top_annotations (pd.DataFrame): DataFrame of top annotations data for the network nodes.
|
41
34
|
significant_neighborhoods_significance (np.ndarray): The binary significance matrix below alpha.
|
42
|
-
linkage_criterion (str): The clustering criterion for defining groups.
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
linkage_threshold (str, float): The linkage threshold for clustering, or one of "silhouette",
|
47
|
-
"calinski_harabasz", or "davies_bouldin" to optimize the threshold.
|
35
|
+
linkage_criterion (str): The clustering criterion for defining groups.
|
36
|
+
linkage_method (str): The linkage method for clustering.
|
37
|
+
linkage_metric (str): The linkage metric for clustering.
|
38
|
+
linkage_threshold (float): The threshold for clustering.
|
48
39
|
|
49
40
|
Returns:
|
50
41
|
pd.DataFrame: DataFrame with the primary domain for each node.
|
51
|
-
|
52
|
-
Raises:
|
53
|
-
ValueError: If an improper value is passed for linkage_threshold. Acceptable values are "silhouette",
|
54
|
-
"calinski_harabasz", "davies_bouldin", or a float value.
|
55
42
|
"""
|
56
43
|
try:
|
57
44
|
if linkage_criterion == "off":
|
@@ -62,10 +49,8 @@ def define_domains(
|
|
62
49
|
# Safeguard the matrix by replacing NaN, Inf, and -Inf values
|
63
50
|
m = _safeguard_matrix(m)
|
64
51
|
# Optimize silhouette score across different linkage methods and distance metrics
|
65
|
-
best_linkage, best_metric, best_threshold = (
|
66
|
-
|
67
|
-
m, linkage_criterion, linkage_method, linkage_metric, linkage_threshold
|
68
|
-
)
|
52
|
+
best_linkage, best_metric, best_threshold = _optimize_silhouette_across_linkage_and_metrics(
|
53
|
+
m, linkage_criterion, linkage_method, linkage_metric, linkage_threshold
|
69
54
|
)
|
70
55
|
# Perform hierarchical clustering
|
71
56
|
Z = linkage(m, method=best_linkage, metric=best_metric)
|
@@ -91,9 +76,6 @@ def define_domains(
|
|
91
76
|
f"Error encountered. Skipping clustering and assigning {n_rows} unique domains."
|
92
77
|
)
|
93
78
|
top_annotations["domain"] = range(1, n_rows + 1) # Assign unique domains
|
94
|
-
except LinkageThresholdError as e:
|
95
|
-
# If a LinkageThresholdError is encountered, raise a ValueError with the original exception
|
96
|
-
raise ValueError(e) from e
|
97
79
|
|
98
80
|
# Create DataFrames to store domain information
|
99
81
|
node_to_significance = pd.DataFrame(
|
@@ -215,154 +197,146 @@ def _safeguard_matrix(matrix: np.ndarray) -> np.ndarray:
|
|
215
197
|
return matrix
|
216
198
|
|
217
199
|
|
218
|
-
def
|
200
|
+
def _optimize_silhouette_across_linkage_and_metrics(
|
219
201
|
m: np.ndarray,
|
220
202
|
linkage_criterion: str,
|
221
203
|
linkage_method: str,
|
222
204
|
linkage_metric: str,
|
223
205
|
linkage_threshold: Union[str, float],
|
224
206
|
) -> Tuple[str, str, float]:
|
225
|
-
"""Optimize
|
226
|
-
a string, optimize the threshold using the specified metric; otherwise, use the provided threshold.
|
207
|
+
"""Optimize silhouette score across different linkage methods and distance metrics.
|
227
208
|
|
228
209
|
Args:
|
229
210
|
m (np.ndarray): Data matrix.
|
230
|
-
linkage_criterion (str):
|
231
|
-
linkage_method (str): Linkage method for clustering
|
232
|
-
linkage_metric (str):
|
233
|
-
linkage_threshold (str, float):
|
234
|
-
"silhouette", "calinski_harabasz", or "davies_bouldin" to trigger optimization.
|
211
|
+
linkage_criterion (str): Clustering criterion.
|
212
|
+
linkage_method (str): Linkage method for clustering.
|
213
|
+
linkage_metric (str): Linkage metric for clustering.
|
214
|
+
linkage_threshold (Union[str, float]): Threshold for clustering. Set to "auto" to optimize.
|
235
215
|
|
236
216
|
Returns:
|
237
217
|
Tuple[str, str, float]:
|
238
|
-
-
|
239
|
-
-
|
240
|
-
-
|
241
|
-
|
242
|
-
Raises:
|
243
|
-
ValueError: If linkage_threshold is neither one of the supported keywords nor convertible to float.
|
218
|
+
- Best linkage method (str)
|
219
|
+
- Best linkage metric (str)
|
220
|
+
- Best threshold (float)
|
244
221
|
"""
|
245
|
-
#
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
if isinstance(linkage_threshold, str):
|
250
|
-
if linkage_threshold in supported_linkage_thresholds:
|
251
|
-
opt_metric = linkage_threshold
|
252
|
-
else:
|
253
|
-
try:
|
254
|
-
threshold_float = float(linkage_threshold)
|
255
|
-
except (TypeError, ValueError):
|
256
|
-
raise LinkageThresholdError(
|
257
|
-
f"linkage_threshold must be one of {', '.join(supported_linkage_thresholds)} or a float value."
|
258
|
-
)
|
259
|
-
return linkage_method, linkage_metric, threshold_float
|
260
|
-
else:
|
261
|
-
# If not a string, try to convert it to float.
|
262
|
-
try:
|
263
|
-
threshold_float = float(linkage_threshold)
|
264
|
-
except (TypeError, ValueError):
|
265
|
-
raise LinkageThresholdError(
|
266
|
-
f"linkage_threshold must be one of {', '.join(supported_linkage_thresholds)} or a float value."
|
267
|
-
)
|
268
|
-
return linkage_method, linkage_metric, threshold_float
|
269
|
-
|
270
|
-
# Otherwise, perform optimization using the specified metric (opt_metric).
|
271
|
-
best_overall_method = None
|
272
|
-
best_overall_metric = None
|
273
|
-
best_overall_threshold = None
|
222
|
+
# Initialize best overall values
|
223
|
+
best_overall_method = linkage_method
|
224
|
+
best_overall_metric = linkage_metric
|
225
|
+
best_overall_threshold = linkage_threshold
|
274
226
|
best_overall_score = -np.inf
|
275
227
|
|
276
|
-
#
|
277
|
-
|
278
|
-
|
279
|
-
total_combinations = len(
|
228
|
+
# Set linkage methods and metrics to all combinations if "auto" is selected
|
229
|
+
linkage_methods = GROUP_LINKAGE_METHODS if linkage_method == "auto" else [linkage_method]
|
230
|
+
linkage_metrics = GROUP_DISTANCE_METRICS if linkage_metric == "auto" else [linkage_metric]
|
231
|
+
total_combinations = len(linkage_methods) * len(linkage_metrics)
|
280
232
|
|
233
|
+
# Evaluating optimal linkage method and metric
|
281
234
|
for method, metric in tqdm(
|
282
|
-
product(
|
235
|
+
product(linkage_methods, linkage_metrics),
|
283
236
|
desc="Evaluating optimal linkage method and metric",
|
284
237
|
total=total_combinations,
|
285
238
|
bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
|
286
239
|
):
|
240
|
+
# Some linkage methods and metrics may not work with certain data
|
287
241
|
with suppress(ValueError):
|
288
242
|
Z = linkage(m, method=method, metric=metric)
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
243
|
+
# Only optimize silhouette score if the threshold is "auto"
|
244
|
+
if linkage_threshold == "auto":
|
245
|
+
threshold, score = _find_best_silhouette_score(Z, m, metric, linkage_criterion)
|
246
|
+
if score > best_overall_score:
|
247
|
+
best_overall_score = score
|
248
|
+
best_overall_threshold = threshold
|
249
|
+
best_overall_method = method
|
250
|
+
best_overall_metric = metric
|
251
|
+
else:
|
252
|
+
# Use the provided threshold without optimization
|
253
|
+
score = silhouette_score(
|
254
|
+
m,
|
255
|
+
fcluster(Z, linkage_threshold * np.max(Z[:, 2]), criterion=linkage_criterion),
|
256
|
+
metric=metric,
|
257
|
+
)
|
258
|
+
if score > best_overall_score:
|
259
|
+
best_overall_score = score
|
260
|
+
best_overall_threshold = linkage_threshold
|
261
|
+
best_overall_method = method
|
262
|
+
best_overall_metric = metric
|
263
|
+
|
300
264
|
return best_overall_method, best_overall_metric, best_overall_threshold
|
301
265
|
|
302
266
|
|
303
|
-
def
|
267
|
+
def _find_best_silhouette_score(
|
304
268
|
Z: np.ndarray,
|
305
269
|
m: np.ndarray,
|
306
270
|
linkage_metric: str,
|
307
271
|
linkage_criterion: str,
|
308
|
-
|
272
|
+
lower_bound: float = 0.001,
|
273
|
+
upper_bound: float = 1.0,
|
274
|
+
resolution: float = 0.001,
|
309
275
|
) -> Tuple[float, float]:
|
310
|
-
"""Find the
|
311
|
-
the threshold value using the specified metric (opt_metric).
|
276
|
+
"""Find the best silhouette score using binary search.
|
312
277
|
|
313
278
|
Args:
|
314
|
-
Z (np.ndarray): Linkage matrix
|
315
|
-
m (np.ndarray): Data matrix
|
316
|
-
linkage_metric (str):
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
279
|
+
Z (np.ndarray): Linkage matrix.
|
280
|
+
m (np.ndarray): Data matrix.
|
281
|
+
linkage_metric (str): Linkage metric for silhouette score calculation.
|
282
|
+
linkage_criterion (str): Clustering criterion.
|
283
|
+
lower_bound (float, optional): Lower bound for search. Defaults to 0.001.
|
284
|
+
upper_bound (float, optional): Upper bound for search. Defaults to 1.0.
|
285
|
+
resolution (float, optional): Desired resolution for the best threshold. Defaults to 0.001.
|
321
286
|
|
322
287
|
Returns:
|
323
288
|
Tuple[float, float]:
|
324
|
-
-
|
325
|
-
-
|
326
|
-
at the optimal threshold (higher for "silhouette" and "calinski_harabasz",
|
327
|
-
lower for "davies_bouldin").
|
328
|
-
|
329
|
-
Raises:
|
330
|
-
ValueError: If the `opt_metric` argument is not one of the supported metrics.
|
289
|
+
- Best threshold (float): The threshold that yields the best silhouette score.
|
290
|
+
- Best silhouette score (float): The highest silhouette score achieved.
|
331
291
|
"""
|
332
|
-
|
333
|
-
|
334
|
-
resolution = 1e-6
|
335
|
-
|
336
|
-
def compute_objective(coefficient: float) -> float:
|
337
|
-
"""Compute the objective function for optimization."""
|
338
|
-
threshold_val = coefficient * max_d
|
339
|
-
clusters = fcluster(Z, threshold_val, criterion=linkage_criterion)
|
340
|
-
unique_clusters = np.unique(clusters)
|
341
|
-
if len(unique_clusters) <= 1 or len(unique_clusters) == m.shape[0]:
|
342
|
-
return 1e6
|
343
|
-
try:
|
344
|
-
if opt_metric == "silhouette":
|
345
|
-
score = silhouette_score(m, clusters, metric=linkage_metric)
|
346
|
-
return -score # We want to maximize the score.
|
347
|
-
elif opt_metric == "calinski_harabasz":
|
348
|
-
score = calinski_harabasz_score(m, clusters)
|
349
|
-
return -score
|
350
|
-
elif opt_metric == "davies_bouldin":
|
351
|
-
score = davies_bouldin_score(m, clusters)
|
352
|
-
return score
|
353
|
-
else:
|
354
|
-
raise ValueError(f"Unknown optimization metric: {opt_metric}.")
|
355
|
-
except Exception:
|
356
|
-
return 1e6
|
292
|
+
best_score = -np.inf
|
293
|
+
best_threshold = None
|
357
294
|
|
358
|
-
#
|
359
|
-
|
360
|
-
|
361
|
-
|
295
|
+
# Test lower bound
|
296
|
+
max_d_lower = np.max(Z[:, 2]) * lower_bound
|
297
|
+
clusters_lower = fcluster(Z, max_d_lower, criterion=linkage_criterion)
|
298
|
+
try:
|
299
|
+
score_lower = silhouette_score(m, clusters_lower, metric=linkage_metric)
|
300
|
+
except ValueError:
|
301
|
+
score_lower = -np.inf
|
302
|
+
|
303
|
+
# Test upper bound
|
304
|
+
max_d_upper = np.max(Z[:, 2]) * upper_bound
|
305
|
+
clusters_upper = fcluster(Z, max_d_upper, criterion=linkage_criterion)
|
306
|
+
try:
|
307
|
+
score_upper = silhouette_score(m, clusters_upper, metric=linkage_metric)
|
308
|
+
except ValueError:
|
309
|
+
score_upper = -np.inf
|
362
310
|
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
311
|
+
# Determine initial bounds for binary search
|
312
|
+
if score_lower > score_upper:
|
313
|
+
best_score = score_lower
|
314
|
+
best_threshold = lower_bound
|
315
|
+
upper_bound = (lower_bound + upper_bound) / 2
|
316
|
+
else:
|
317
|
+
best_score = score_upper
|
318
|
+
best_threshold = upper_bound
|
319
|
+
lower_bound = (lower_bound + upper_bound) / 2
|
320
|
+
|
321
|
+
# Binary search loop
|
322
|
+
while upper_bound - lower_bound > resolution:
|
323
|
+
mid_threshold = (upper_bound + lower_bound) / 2
|
324
|
+
max_d_mid = np.max(Z[:, 2]) * mid_threshold
|
325
|
+
clusters_mid = fcluster(Z, max_d_mid, criterion=linkage_criterion)
|
326
|
+
try:
|
327
|
+
score_mid = silhouette_score(m, clusters_mid, metric=linkage_metric)
|
328
|
+
except ValueError:
|
329
|
+
score_mid = -np.inf
|
330
|
+
|
331
|
+
# Update best score and threshold if mid-point is better
|
332
|
+
if score_mid > best_score:
|
333
|
+
best_score = score_mid
|
334
|
+
best_threshold = mid_threshold
|
335
|
+
|
336
|
+
# Adjust bounds based on the scores
|
337
|
+
if score_lower > score_upper:
|
338
|
+
upper_bound = mid_threshold
|
339
|
+
else:
|
340
|
+
lower_bound = mid_threshold
|
367
341
|
|
368
|
-
return best_threshold, float(
|
342
|
+
return best_threshold, float(best_score)
|
risk/network/graph/api.py
CHANGED
@@ -58,12 +58,9 @@ class GraphAPI:
|
|
58
58
|
impute_depth (int, optional): Depth for imputing neighbors. Defaults to 0.
|
59
59
|
prune_threshold (float, optional): Distance threshold for pruning neighbors. Defaults to 0.0.
|
60
60
|
linkage_criterion (str, optional): Clustering criterion for defining domains. Defaults to "distance".
|
61
|
-
linkage_method (str, optional): Clustering method to use. Defaults to "average".
|
62
|
-
|
63
|
-
|
64
|
-
to automatically select the best linkage metric.
|
65
|
-
linkage_threshold (str, float, optional): Threshold for clustering. Choose "silhouette", "calinski_harabasz",
|
66
|
-
or "davies_bouldin" to automatically select the best threshold. Defaults to 0.2.
|
61
|
+
linkage_method (str, optional): Clustering method to use. Defaults to "average".
|
62
|
+
linkage_metric (str, optional): Metric to use for calculating distances. Defaults to "yule".
|
63
|
+
linkage_threshold (float, optional): Threshold for clustering. Defaults to 0.2.
|
67
64
|
min_cluster_size (int, optional): Minimum size for clusters. Defaults to 5.
|
68
65
|
max_cluster_size (int, optional): Maximum size for clusters. Defaults to 1000.
|
69
66
|
|
@@ -1,4 +1,4 @@
|
|
1
|
-
risk/__init__.py,sha256=
|
1
|
+
risk/__init__.py,sha256=ewYSGDLHigkwFLI9IW6qDbQk4uS6nb3RTd-k2GCD1b0,127
|
2
2
|
risk/constants.py,sha256=XInRaH78Slnw_sWgAsBFbUHkyA0h0jL0DKGuQNbOvjM,550
|
3
3
|
risk/risk.py,sha256=s827_lRknFseOP9O4zW8sP-IcCd2EzrpV_tnVY_tz5s,1104
|
4
4
|
risk/annotations/__init__.py,sha256=parsbcux1U4urpUqh9AdzbDWuLj9HlMidycMPkpSQFo,179
|
@@ -10,13 +10,13 @@ risk/log/parameters.py,sha256=VtwfMzLU1xI4yji3-Ch5vHjH-KdwTfwaEMmi7hFQTs0,5716
|
|
10
10
|
risk/neighborhoods/__init__.py,sha256=Q74HwTH7okI-vaskJPy2bYwb5sNjGASTzJ6m8V8arCU,234
|
11
11
|
risk/neighborhoods/api.py,sha256=ywngw2TQVV27gYlWDXcs8-qnmeepnvb-W9ov6J6VEPM,23341
|
12
12
|
risk/neighborhoods/community.py,sha256=5Q_-VAJC-5SY5EUsB8gIlemeDoAL85uLjyl16pItHiQ,16699
|
13
|
-
risk/neighborhoods/domains.py,sha256=
|
13
|
+
risk/neighborhoods/domains.py,sha256=Yu93mKNCuOpBGa87knAH-XIl260kf-rswPfn3aC9GNo,13937
|
14
14
|
risk/neighborhoods/neighborhoods.py,sha256=l9FhADB1C-OxM8E9QXOcA4osUDgA1vs4ud-OCGKKybc,21457
|
15
15
|
risk/network/__init__.py,sha256=oVi3FA1XXKD84014Cykq-9bpX4_s0F3aAUfNOU-07Qw,73
|
16
16
|
risk/network/geometry.py,sha256=eVtGHMgBf9fEqQZUFdHWjw-zFYYpfUONoHFSAxoRkug,6219
|
17
17
|
risk/network/io.py,sha256=RCH4nQdgYDXcNwMfpSz7qEmPO0pJ1p9fL0rNQptsQrc,21673
|
18
18
|
risk/network/graph/__init__.py,sha256=ziGJew3yhtqvrb9LUuneDu_LwW2Wa9vd4UuhoL5l1CA,91
|
19
|
-
risk/network/graph/api.py,sha256=
|
19
|
+
risk/network/graph/api.py,sha256=9yoviP7EqFU1okLJZlaLBZzFNmjOHv30B1JgDFNP1bg,8399
|
20
20
|
risk/network/graph/graph.py,sha256=qEWyZvuaGT_vvjhreBdmRPX3gst2wQFaXhFAvikPSqw,12158
|
21
21
|
risk/network/graph/summary.py,sha256=Y_0rL2C1UoQeZQIPVe5LbaCO356Mcc8HisnrXwQsRm8,10289
|
22
22
|
risk/network/plotter/__init__.py,sha256=4gWtQHGzQVNHmEBXi31Zf0tX0y2sTcE66J_yGnn7268,99
|
@@ -34,8 +34,8 @@ risk/stats/stat_tests.py,sha256=tj0ri9w89_1fsjGLuafTWpfBEwZXpSLn7Ej2aAQ5lxk,1177
|
|
34
34
|
risk/stats/permutation/__init__.py,sha256=OLmYLm2uj96hPsSaUs0vUqFYw6Thwch_aHtpL7L0ZFw,127
|
35
35
|
risk/stats/permutation/permutation.py,sha256=BWjgdBpLVcHvmwHy0bmD4aJFccxifNBSrrCBPppyKf4,10569
|
36
36
|
risk/stats/permutation/test_functions.py,sha256=KlECWTz1EZ6EPF_OAgHb0uznaIhopiVYb_AKUKuC4no,3120
|
37
|
-
risk_network-0.0.
|
38
|
-
risk_network-0.0.
|
39
|
-
risk_network-0.0.
|
40
|
-
risk_network-0.0.
|
41
|
-
risk_network-0.0.
|
37
|
+
risk_network-0.0.9b39.dist-info/LICENSE,sha256=jOtLnuWt7d5Hsx6XXB2QxzrSe2sWWh3NgMfFRetluQM,35147
|
38
|
+
risk_network-0.0.9b39.dist-info/METADATA,sha256=y3xDx1OCYpCS1OgBMUTNIK5y8HFORWHag4PLnyAXc5g,47627
|
39
|
+
risk_network-0.0.9b39.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
40
|
+
risk_network-0.0.9b39.dist-info/top_level.txt,sha256=NX7C2PFKTvC1JhVKv14DFlFAIFnKc6Lpsu1ZfxvQwVw,5
|
41
|
+
risk_network-0.0.9b39.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|