risk-network 0.0.9b37__py3-none-any.whl → 0.0.9b39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- risk/__init__.py +1 -1
- risk/neighborhoods/domains.py +107 -156
- {risk_network-0.0.9b37.dist-info → risk_network-0.0.9b39.dist-info}/METADATA +1 -1
- {risk_network-0.0.9b37.dist-info → risk_network-0.0.9b39.dist-info}/RECORD +7 -7
- {risk_network-0.0.9b37.dist-info → risk_network-0.0.9b39.dist-info}/LICENSE +0 -0
- {risk_network-0.0.9b37.dist-info → risk_network-0.0.9b39.dist-info}/WHEEL +0 -0
- {risk_network-0.0.9b37.dist-info → risk_network-0.0.9b39.dist-info}/top_level.txt +0 -0
risk/__init__.py
CHANGED
risk/neighborhoods/domains.py
CHANGED
@@ -10,7 +10,7 @@ from typing import Tuple, Union
|
|
10
10
|
import numpy as np
|
11
11
|
import pandas as pd
|
12
12
|
from scipy.cluster.hierarchy import linkage, fcluster
|
13
|
-
from sklearn.metrics import
|
13
|
+
from sklearn.metrics import silhouette_score
|
14
14
|
from tqdm import tqdm
|
15
15
|
|
16
16
|
from risk.annotations import get_weighted_description
|
@@ -18,19 +18,13 @@ from risk.constants import GROUP_LINKAGE_METHODS, GROUP_DISTANCE_METRICS
|
|
18
18
|
from risk.log import logger
|
19
19
|
|
20
20
|
|
21
|
-
class LinkageThresholdError(Exception):
|
22
|
-
"""Exception raised for errors in the linkage threshold optimization process."""
|
23
|
-
|
24
|
-
pass
|
25
|
-
|
26
|
-
|
27
21
|
def define_domains(
|
28
22
|
top_annotations: pd.DataFrame,
|
29
23
|
significant_neighborhoods_significance: np.ndarray,
|
30
24
|
linkage_criterion: str,
|
31
25
|
linkage_method: str,
|
32
26
|
linkage_metric: str,
|
33
|
-
linkage_threshold:
|
27
|
+
linkage_threshold: float,
|
34
28
|
) -> pd.DataFrame:
|
35
29
|
"""Define domains and assign nodes to these domains based on their significance scores and clustering,
|
36
30
|
handling errors by assigning unique domains when clustering fails.
|
@@ -38,19 +32,13 @@ def define_domains(
|
|
38
32
|
Args:
|
39
33
|
top_annotations (pd.DataFrame): DataFrame of top annotations data for the network nodes.
|
40
34
|
significant_neighborhoods_significance (np.ndarray): The binary significance matrix below alpha.
|
41
|
-
linkage_criterion (str): The clustering criterion for defining groups.
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
linkage_threshold (float): The linkage threshold for clustering, or one of "silhouette", "calinski_harabasz",
|
46
|
-
or "davies_bouldin" to optimize the threshold.
|
35
|
+
linkage_criterion (str): The clustering criterion for defining groups.
|
36
|
+
linkage_method (str): The linkage method for clustering.
|
37
|
+
linkage_metric (str): The linkage metric for clustering.
|
38
|
+
linkage_threshold (float): The threshold for clustering.
|
47
39
|
|
48
40
|
Returns:
|
49
41
|
pd.DataFrame: DataFrame with the primary domain for each node.
|
50
|
-
|
51
|
-
Raises:
|
52
|
-
ValueError: If an improper value is passed for linkage_threshold. Acceptable values are "silhouette",
|
53
|
-
"calinski_harabasz", "davies_bouldin", or a float value.
|
54
42
|
"""
|
55
43
|
try:
|
56
44
|
if linkage_criterion == "off":
|
@@ -61,10 +49,8 @@ def define_domains(
|
|
61
49
|
# Safeguard the matrix by replacing NaN, Inf, and -Inf values
|
62
50
|
m = _safeguard_matrix(m)
|
63
51
|
# Optimize silhouette score across different linkage methods and distance metrics
|
64
|
-
best_linkage, best_metric, best_threshold = (
|
65
|
-
|
66
|
-
m, linkage_criterion, linkage_method, linkage_metric, linkage_threshold
|
67
|
-
)
|
52
|
+
best_linkage, best_metric, best_threshold = _optimize_silhouette_across_linkage_and_metrics(
|
53
|
+
m, linkage_criterion, linkage_method, linkage_metric, linkage_threshold
|
68
54
|
)
|
69
55
|
# Perform hierarchical clustering
|
70
56
|
Z = linkage(m, method=best_linkage, metric=best_metric)
|
@@ -90,9 +76,6 @@ def define_domains(
|
|
90
76
|
f"Error encountered. Skipping clustering and assigning {n_rows} unique domains."
|
91
77
|
)
|
92
78
|
top_annotations["domain"] = range(1, n_rows + 1) # Assign unique domains
|
93
|
-
except LinkageThresholdError as e:
|
94
|
-
# If a LinkageThresholdError is encountered, raise a ValueError with the original exception
|
95
|
-
raise ValueError(e) from e
|
96
79
|
|
97
80
|
# Create DataFrames to store domain information
|
98
81
|
node_to_significance = pd.DataFrame(
|
@@ -214,178 +197,146 @@ def _safeguard_matrix(matrix: np.ndarray) -> np.ndarray:
|
|
214
197
|
return matrix
|
215
198
|
|
216
199
|
|
217
|
-
def
|
200
|
+
def _optimize_silhouette_across_linkage_and_metrics(
|
218
201
|
m: np.ndarray,
|
219
202
|
linkage_criterion: str,
|
220
203
|
linkage_method: str,
|
221
204
|
linkage_metric: str,
|
222
205
|
linkage_threshold: Union[str, float],
|
223
206
|
) -> Tuple[str, str, float]:
|
224
|
-
"""Optimize
|
225
|
-
a string, optimize the threshold using the specified metric; otherwise, use the provided threshold.
|
207
|
+
"""Optimize silhouette score across different linkage methods and distance metrics.
|
226
208
|
|
227
209
|
Args:
|
228
210
|
m (np.ndarray): Data matrix.
|
229
|
-
linkage_criterion (str):
|
230
|
-
linkage_method (str): Linkage method for clustering
|
231
|
-
linkage_metric (str):
|
232
|
-
linkage_threshold (Union[str, float]):
|
233
|
-
"silhouette", "calinski_harabasz", or "davies_bouldin" to trigger optimization.
|
211
|
+
linkage_criterion (str): Clustering criterion.
|
212
|
+
linkage_method (str): Linkage method for clustering.
|
213
|
+
linkage_metric (str): Linkage metric for clustering.
|
214
|
+
linkage_threshold (Union[str, float]): Threshold for clustering. Set to "auto" to optimize.
|
234
215
|
|
235
216
|
Returns:
|
236
217
|
Tuple[str, str, float]:
|
237
|
-
-
|
238
|
-
-
|
239
|
-
-
|
240
|
-
|
241
|
-
Raises:
|
242
|
-
ValueError: If linkage_threshold is neither one of the supported keywords nor convertible to float.
|
218
|
+
- Best linkage method (str)
|
219
|
+
- Best linkage metric (str)
|
220
|
+
- Best threshold (float)
|
243
221
|
"""
|
244
|
-
#
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
if isinstance(linkage_threshold, str):
|
249
|
-
if linkage_threshold in supported_linkage_thresholds:
|
250
|
-
opt_metric = linkage_threshold
|
251
|
-
else:
|
252
|
-
try:
|
253
|
-
threshold_float = float(linkage_threshold)
|
254
|
-
except (TypeError, ValueError):
|
255
|
-
raise LinkageThresholdError(
|
256
|
-
f"linkage_threshold must be one of {', '.join(supported_linkage_thresholds)} or a float value."
|
257
|
-
)
|
258
|
-
return linkage_method, linkage_metric, threshold_float
|
259
|
-
else:
|
260
|
-
# If not a string, try to convert it to float.
|
261
|
-
try:
|
262
|
-
threshold_float = float(linkage_threshold)
|
263
|
-
except (TypeError, ValueError):
|
264
|
-
raise LinkageThresholdError(
|
265
|
-
f"linkage_threshold must be one of {', '.join(supported_linkage_thresholds)} or a float value."
|
266
|
-
)
|
267
|
-
return linkage_method, linkage_metric, threshold_float
|
268
|
-
|
269
|
-
# Otherwise, perform optimization using the specified metric (opt_metric).
|
270
|
-
best_overall_method = None
|
271
|
-
best_overall_metric = None
|
272
|
-
best_overall_threshold = None
|
222
|
+
# Initialize best overall values
|
223
|
+
best_overall_method = linkage_method
|
224
|
+
best_overall_metric = linkage_metric
|
225
|
+
best_overall_threshold = linkage_threshold
|
273
226
|
best_overall_score = -np.inf
|
274
227
|
|
275
|
-
#
|
276
|
-
|
277
|
-
|
278
|
-
total_combinations = len(
|
228
|
+
# Set linkage methods and metrics to all combinations if "auto" is selected
|
229
|
+
linkage_methods = GROUP_LINKAGE_METHODS if linkage_method == "auto" else [linkage_method]
|
230
|
+
linkage_metrics = GROUP_DISTANCE_METRICS if linkage_metric == "auto" else [linkage_metric]
|
231
|
+
total_combinations = len(linkage_methods) * len(linkage_metrics)
|
279
232
|
|
233
|
+
# Evaluating optimal linkage method and metric
|
280
234
|
for method, metric in tqdm(
|
281
|
-
product(
|
235
|
+
product(linkage_methods, linkage_metrics),
|
282
236
|
desc="Evaluating optimal linkage method and metric",
|
283
237
|
total=total_combinations,
|
284
238
|
bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
|
285
239
|
):
|
240
|
+
# Some linkage methods and metrics may not work with certain data
|
286
241
|
with suppress(ValueError):
|
287
242
|
Z = linkage(m, method=method, metric=metric)
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
243
|
+
# Only optimize silhouette score if the threshold is "auto"
|
244
|
+
if linkage_threshold == "auto":
|
245
|
+
threshold, score = _find_best_silhouette_score(Z, m, metric, linkage_criterion)
|
246
|
+
if score > best_overall_score:
|
247
|
+
best_overall_score = score
|
248
|
+
best_overall_threshold = threshold
|
249
|
+
best_overall_method = method
|
250
|
+
best_overall_metric = metric
|
251
|
+
else:
|
252
|
+
# Use the provided threshold without optimization
|
253
|
+
score = silhouette_score(
|
254
|
+
m,
|
255
|
+
fcluster(Z, linkage_threshold * np.max(Z[:, 2]), criterion=linkage_criterion),
|
256
|
+
metric=metric,
|
257
|
+
)
|
258
|
+
if score > best_overall_score:
|
259
|
+
best_overall_score = score
|
260
|
+
best_overall_threshold = linkage_threshold
|
261
|
+
best_overall_method = method
|
262
|
+
best_overall_metric = metric
|
263
|
+
|
299
264
|
return best_overall_method, best_overall_metric, best_overall_threshold
|
300
265
|
|
301
266
|
|
302
|
-
def
|
267
|
+
def _find_best_silhouette_score(
|
303
268
|
Z: np.ndarray,
|
304
269
|
m: np.ndarray,
|
305
270
|
linkage_metric: str,
|
306
271
|
linkage_criterion: str,
|
307
|
-
|
272
|
+
lower_bound: float = 0.001,
|
273
|
+
upper_bound: float = 1.0,
|
274
|
+
resolution: float = 0.001,
|
308
275
|
) -> Tuple[float, float]:
|
309
|
-
"""Find the
|
310
|
-
The threshold is chosen to optimize clustering quality, as measured by the chosen metric.
|
276
|
+
"""Find the best silhouette score using binary search.
|
311
277
|
|
312
278
|
Args:
|
313
279
|
Z (np.ndarray): Linkage matrix.
|
314
280
|
m (np.ndarray): Data matrix.
|
315
|
-
linkage_metric (str):
|
316
|
-
linkage_criterion (str):
|
317
|
-
|
318
|
-
|
281
|
+
linkage_metric (str): Linkage metric for silhouette score calculation.
|
282
|
+
linkage_criterion (str): Clustering criterion.
|
283
|
+
lower_bound (float, optional): Lower bound for search. Defaults to 0.001.
|
284
|
+
upper_bound (float, optional): Upper bound for search. Defaults to 1.0.
|
285
|
+
resolution (float, optional): Desired resolution for the best threshold. Defaults to 0.001.
|
319
286
|
|
320
287
|
Returns:
|
321
288
|
Tuple[float, float]:
|
322
|
-
-
|
323
|
-
-
|
324
|
-
"silhouette" and "calinski_harabasz", and lower for "davies_bouldin").
|
289
|
+
- Best threshold (float): The threshold that yields the best silhouette score.
|
290
|
+
- Best silhouette score (float): The highest silhouette score achieved.
|
325
291
|
"""
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
return score # Lower is better.
|
349
|
-
else:
|
350
|
-
raise LinkageThresholdError(f"Unknown optimization metric: {opt_metric}.")
|
351
|
-
except Exception:
|
352
|
-
return 1e6
|
353
|
-
|
354
|
-
# Initialize the bounds and the best objective value
|
355
|
-
obj_lower = compute_objective(lower_bound)
|
356
|
-
obj_upper = compute_objective(upper_bound)
|
357
|
-
# Determine the initial direction of the search
|
358
|
-
if obj_lower < obj_upper:
|
359
|
-
best_obj = obj_lower
|
292
|
+
best_score = -np.inf
|
293
|
+
best_threshold = None
|
294
|
+
|
295
|
+
# Test lower bound
|
296
|
+
max_d_lower = np.max(Z[:, 2]) * lower_bound
|
297
|
+
clusters_lower = fcluster(Z, max_d_lower, criterion=linkage_criterion)
|
298
|
+
try:
|
299
|
+
score_lower = silhouette_score(m, clusters_lower, metric=linkage_metric)
|
300
|
+
except ValueError:
|
301
|
+
score_lower = -np.inf
|
302
|
+
|
303
|
+
# Test upper bound
|
304
|
+
max_d_upper = np.max(Z[:, 2]) * upper_bound
|
305
|
+
clusters_upper = fcluster(Z, max_d_upper, criterion=linkage_criterion)
|
306
|
+
try:
|
307
|
+
score_upper = silhouette_score(m, clusters_upper, metric=linkage_metric)
|
308
|
+
except ValueError:
|
309
|
+
score_upper = -np.inf
|
310
|
+
|
311
|
+
# Determine initial bounds for binary search
|
312
|
+
if score_lower > score_upper:
|
313
|
+
best_score = score_lower
|
360
314
|
best_threshold = lower_bound
|
361
|
-
|
362
|
-
upper = (lower_bound + upper_bound) / 2
|
315
|
+
upper_bound = (lower_bound + upper_bound) / 2
|
363
316
|
else:
|
364
|
-
|
317
|
+
best_score = score_upper
|
365
318
|
best_threshold = upper_bound
|
366
|
-
|
367
|
-
upper = upper_bound
|
368
|
-
|
369
|
-
# Perform binary search to find the optimal linkage threshold coefficient
|
370
|
-
while (upper - lower) > resolution:
|
371
|
-
mid = (upper + lower) / 2
|
372
|
-
obj_mid = compute_objective(mid)
|
373
|
-
if obj_mid < best_obj:
|
374
|
-
best_obj = obj_mid
|
375
|
-
best_threshold = mid
|
376
|
-
|
377
|
-
# Update the bounds based on the objective value
|
378
|
-
obj_left = compute_objective((lower + mid) / 2)
|
379
|
-
obj_right = compute_objective((mid + upper) / 2)
|
380
|
-
if obj_left < obj_right:
|
381
|
-
upper = mid
|
382
|
-
else:
|
383
|
-
lower = mid
|
319
|
+
lower_bound = (lower_bound + upper_bound) / 2
|
384
320
|
|
385
|
-
#
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
321
|
+
# Binary search loop
|
322
|
+
while upper_bound - lower_bound > resolution:
|
323
|
+
mid_threshold = (upper_bound + lower_bound) / 2
|
324
|
+
max_d_mid = np.max(Z[:, 2]) * mid_threshold
|
325
|
+
clusters_mid = fcluster(Z, max_d_mid, criterion=linkage_criterion)
|
326
|
+
try:
|
327
|
+
score_mid = silhouette_score(m, clusters_mid, metric=linkage_metric)
|
328
|
+
except ValueError:
|
329
|
+
score_mid = -np.inf
|
330
|
+
|
331
|
+
# Update best score and threshold if mid-point is better
|
332
|
+
if score_mid > best_score:
|
333
|
+
best_score = score_mid
|
334
|
+
best_threshold = mid_threshold
|
335
|
+
|
336
|
+
# Adjust bounds based on the scores
|
337
|
+
if score_lower > score_upper:
|
338
|
+
upper_bound = mid_threshold
|
339
|
+
else:
|
340
|
+
lower_bound = mid_threshold
|
390
341
|
|
391
|
-
return best_threshold, float(
|
342
|
+
return best_threshold, float(best_score)
|
@@ -1,4 +1,4 @@
|
|
1
|
-
risk/__init__.py,sha256=
|
1
|
+
risk/__init__.py,sha256=ewYSGDLHigkwFLI9IW6qDbQk4uS6nb3RTd-k2GCD1b0,127
|
2
2
|
risk/constants.py,sha256=XInRaH78Slnw_sWgAsBFbUHkyA0h0jL0DKGuQNbOvjM,550
|
3
3
|
risk/risk.py,sha256=s827_lRknFseOP9O4zW8sP-IcCd2EzrpV_tnVY_tz5s,1104
|
4
4
|
risk/annotations/__init__.py,sha256=parsbcux1U4urpUqh9AdzbDWuLj9HlMidycMPkpSQFo,179
|
@@ -10,7 +10,7 @@ risk/log/parameters.py,sha256=VtwfMzLU1xI4yji3-Ch5vHjH-KdwTfwaEMmi7hFQTs0,5716
|
|
10
10
|
risk/neighborhoods/__init__.py,sha256=Q74HwTH7okI-vaskJPy2bYwb5sNjGASTzJ6m8V8arCU,234
|
11
11
|
risk/neighborhoods/api.py,sha256=ywngw2TQVV27gYlWDXcs8-qnmeepnvb-W9ov6J6VEPM,23341
|
12
12
|
risk/neighborhoods/community.py,sha256=5Q_-VAJC-5SY5EUsB8gIlemeDoAL85uLjyl16pItHiQ,16699
|
13
|
-
risk/neighborhoods/domains.py,sha256=
|
13
|
+
risk/neighborhoods/domains.py,sha256=Yu93mKNCuOpBGa87knAH-XIl260kf-rswPfn3aC9GNo,13937
|
14
14
|
risk/neighborhoods/neighborhoods.py,sha256=l9FhADB1C-OxM8E9QXOcA4osUDgA1vs4ud-OCGKKybc,21457
|
15
15
|
risk/network/__init__.py,sha256=oVi3FA1XXKD84014Cykq-9bpX4_s0F3aAUfNOU-07Qw,73
|
16
16
|
risk/network/geometry.py,sha256=eVtGHMgBf9fEqQZUFdHWjw-zFYYpfUONoHFSAxoRkug,6219
|
@@ -34,8 +34,8 @@ risk/stats/stat_tests.py,sha256=tj0ri9w89_1fsjGLuafTWpfBEwZXpSLn7Ej2aAQ5lxk,1177
|
|
34
34
|
risk/stats/permutation/__init__.py,sha256=OLmYLm2uj96hPsSaUs0vUqFYw6Thwch_aHtpL7L0ZFw,127
|
35
35
|
risk/stats/permutation/permutation.py,sha256=BWjgdBpLVcHvmwHy0bmD4aJFccxifNBSrrCBPppyKf4,10569
|
36
36
|
risk/stats/permutation/test_functions.py,sha256=KlECWTz1EZ6EPF_OAgHb0uznaIhopiVYb_AKUKuC4no,3120
|
37
|
-
risk_network-0.0.
|
38
|
-
risk_network-0.0.
|
39
|
-
risk_network-0.0.
|
40
|
-
risk_network-0.0.
|
41
|
-
risk_network-0.0.
|
37
|
+
risk_network-0.0.9b39.dist-info/LICENSE,sha256=jOtLnuWt7d5Hsx6XXB2QxzrSe2sWWh3NgMfFRetluQM,35147
|
38
|
+
risk_network-0.0.9b39.dist-info/METADATA,sha256=y3xDx1OCYpCS1OgBMUTNIK5y8HFORWHag4PLnyAXc5g,47627
|
39
|
+
risk_network-0.0.9b39.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
40
|
+
risk_network-0.0.9b39.dist-info/top_level.txt,sha256=NX7C2PFKTvC1JhVKv14DFlFAIFnKc6Lpsu1ZfxvQwVw,5
|
41
|
+
risk_network-0.0.9b39.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|