risk-network 0.0.9b35__py3-none-any.whl → 0.0.9b37__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
risk/__init__.py CHANGED
@@ -7,4 +7,4 @@ RISK: Regional Inference of Significant Kinships
7
7
 
8
8
  from risk.risk import RISK
9
9
 
10
- __version__ = "0.0.9-beta.35"
10
+ __version__ = "0.0.9-beta.37"
@@ -5,12 +5,12 @@ risk/neighborhoods/domains
5
5
 
6
6
  from contextlib import suppress
7
7
  from itertools import product
8
- from typing import Tuple
8
+ from typing import Tuple, Union
9
9
 
10
10
  import numpy as np
11
11
  import pandas as pd
12
12
  from scipy.cluster.hierarchy import linkage, fcluster
13
- from sklearn.metrics import silhouette_score
13
+ from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, silhouette_score
14
14
  from tqdm import tqdm
15
15
 
16
16
  from risk.annotations import get_weighted_description
@@ -18,12 +18,19 @@ from risk.constants import GROUP_LINKAGE_METHODS, GROUP_DISTANCE_METRICS
18
18
  from risk.log import logger
19
19
 
20
20
 
21
+ class LinkageThresholdError(Exception):
22
+ """Exception raised for errors in the linkage threshold optimization process."""
23
+
24
+ pass
25
+
26
+
21
27
  def define_domains(
22
28
  top_annotations: pd.DataFrame,
23
29
  significant_neighborhoods_significance: np.ndarray,
24
30
  linkage_criterion: str,
25
31
  linkage_method: str,
26
32
  linkage_metric: str,
33
+ linkage_threshold: Union[str, float],
27
34
  ) -> pd.DataFrame:
28
35
  """Define domains and assign nodes to these domains based on their significance scores and clustering,
29
36
  handling errors by assigning unique domains when clustering fails.
@@ -31,12 +38,19 @@ def define_domains(
31
38
  Args:
32
39
  top_annotations (pd.DataFrame): DataFrame of top annotations data for the network nodes.
33
40
  significant_neighborhoods_significance (np.ndarray): The binary significance matrix below alpha.
34
- linkage_criterion (str): The clustering criterion for defining groups.
35
- linkage_method (str): The linkage method for clustering.
36
- linkage_metric (str): The linkage metric for clustering.
41
+ linkage_criterion (str): The clustering criterion for defining groups. Use "distance" for distance-based
42
+ clustering or "maxclust" for a fixed number of clusters. Use "off" to skip clustering.
43
+ linkage_method (str): The linkage method for clustering. Use "auto" to try multiple methods.
44
+ linkage_metric (str): The linkage metric for clustering. Use "auto" to try multiple metrics.
45
+ linkage_threshold (Union[str, float]): The linkage threshold for clustering, or one of "silhouette", "calinski_harabasz",
46
+ or "davies_bouldin" to optimize the threshold.
37
47
 
38
48
  Returns:
39
49
  pd.DataFrame: DataFrame with the primary domain for each node.
50
+
51
+ Raises:
52
+ ValueError: If an improper value is passed for linkage_threshold. Acceptable values are "silhouette",
53
+ "calinski_harabasz", "davies_bouldin", or a float value.
40
54
  """
41
55
  try:
42
56
  if linkage_criterion == "off":
@@ -47,8 +61,10 @@ def define_domains(
47
61
  # Safeguard the matrix by replacing NaN, Inf, and -Inf values
48
62
  m = _safeguard_matrix(m)
49
63
  # Select the linkage method, distance metric, and threshold (optimized or user-supplied)
50
- best_linkage, best_metric, best_threshold = _optimize_silhouette_across_linkage_and_metrics(
51
- m, linkage_criterion, linkage_method, linkage_metric
64
+ best_linkage, best_metric, best_threshold = (
65
+ _optimize_linkage_threshold_across_methods_and_metrics(
66
+ m, linkage_criterion, linkage_method, linkage_metric, linkage_threshold
67
+ )
52
68
  )
53
69
  # Perform hierarchical clustering
54
70
  Z = linkage(m, method=best_linkage, metric=best_metric)
@@ -74,6 +90,9 @@ def define_domains(
74
90
  f"Error encountered. Skipping clustering and assigning {n_rows} unique domains."
75
91
  )
76
92
  top_annotations["domain"] = range(1, n_rows + 1) # Assign unique domains
93
+ except LinkageThresholdError as e:
94
+ # If a LinkageThresholdError is encountered, raise a ValueError with the original exception
95
+ raise ValueError(e) from e
77
96
 
78
97
  # Create DataFrames to store domain information
79
98
  node_to_significance = pd.DataFrame(
@@ -195,125 +214,178 @@ def _safeguard_matrix(matrix: np.ndarray) -> np.ndarray:
195
214
  return matrix
196
215
 
197
216
 
198
- def _optimize_silhouette_across_linkage_and_metrics(
199
- m: np.ndarray, linkage_criterion: str, linkage_method: str, linkage_metric: str
217
+ def _optimize_linkage_threshold_across_methods_and_metrics(
218
+ m: np.ndarray,
219
+ linkage_criterion: str,
220
+ linkage_method: str,
221
+ linkage_metric: str,
222
+ linkage_threshold: Union[str, float],
200
223
  ) -> Tuple[str, str, float]:
201
- """Optimize silhouette score across different linkage methods and distance metrics.
224
+ """Optimize the linkage method, metric, and threshold for hierarchical clustering. If the threshold is
225
+ a string, optimize the threshold using the specified metric; otherwise, use the provided threshold.
202
226
 
203
227
  Args:
204
228
  m (np.ndarray): Data matrix.
205
- linkage_criterion (str): Clustering criterion.
206
- linkage_method (str): Linkage method for clustering.
207
- linkage_metric (str): Linkage metric for clustering.
229
+ linkage_criterion (str): Criterion for fcluster (typically "distance").
230
+ linkage_method (str): Linkage method for clustering, or "auto" to try multiple methods.
231
+ linkage_metric (str): Distance metric for clustering, or "auto" to try multiple metrics.
232
+ linkage_threshold (Union[str, float]): Either a numeric threshold or one of
233
+ "silhouette", "calinski_harabasz", or "davies_bouldin" to trigger optimization.
208
234
 
209
235
  Returns:
210
236
  Tuple[str, str, float]:
211
- - Best linkage method (str)
212
- - Best linkage metric (str)
213
- - Best threshold (float)
237
+ - The chosen linkage method.
238
+ - The chosen linkage metric.
239
+ - The optimized threshold (a float).
240
+
241
+ Raises:
242
+ LinkageThresholdError: If linkage_threshold is neither one of the supported keywords nor convertible to float.
214
243
  """
215
- best_overall_method = linkage_method
216
- best_overall_metric = linkage_metric
244
+ # Supported linkage threshold metrics
245
+ supported_linkage_thresholds = {"silhouette", "calinski_harabasz", "davies_bouldin"}
246
+
247
+ # If linkage_threshold is a string:
248
+ if isinstance(linkage_threshold, str):
249
+ if linkage_threshold in supported_linkage_thresholds:
250
+ opt_metric = linkage_threshold
251
+ else:
252
+ try:
253
+ threshold_float = float(linkage_threshold)
254
+ except (TypeError, ValueError):
255
+ raise LinkageThresholdError(
256
+ f"linkage_threshold must be one of {', '.join(supported_linkage_thresholds)} or a float value."
257
+ )
258
+ return linkage_method, linkage_metric, threshold_float
259
+ else:
260
+ # If not a string, try to convert it to float.
261
+ try:
262
+ threshold_float = float(linkage_threshold)
263
+ except (TypeError, ValueError):
264
+ raise LinkageThresholdError(
265
+ f"linkage_threshold must be one of {', '.join(supported_linkage_thresholds)} or a float value."
266
+ )
267
+ return linkage_method, linkage_metric, threshold_float
268
+
269
+ # Otherwise, perform optimization using the specified metric (opt_metric).
270
+ best_overall_method = None
271
+ best_overall_metric = None
272
+ best_overall_threshold = None
217
273
  best_overall_score = -np.inf
218
- best_overall_threshold = 1
219
274
 
220
- linkage_methods = GROUP_LINKAGE_METHODS if linkage_method == "auto" else [linkage_method]
221
- linkage_metrics = GROUP_DISTANCE_METRICS if linkage_metric == "auto" else [linkage_metric]
222
- total_combinations = len(linkage_methods) * len(linkage_metrics)
275
+ # Use the provided lists if "auto" is specified.
276
+ methods = GROUP_LINKAGE_METHODS if linkage_method == "auto" else [linkage_method]
277
+ metrics = GROUP_DISTANCE_METRICS if linkage_metric == "auto" else [linkage_metric]
278
+ total_combinations = len(methods) * len(metrics)
223
279
 
224
- # Evaluating optimal linkage method and metric
225
280
  for method, metric in tqdm(
226
- product(linkage_methods, linkage_metrics),
281
+ product(methods, metrics),
227
282
  desc="Evaluating optimal linkage method and metric",
228
283
  total=total_combinations,
229
284
  bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
230
285
  ):
231
- # Some linkage methods and metrics may not work with certain data
232
286
  with suppress(ValueError):
233
287
  Z = linkage(m, method=method, metric=metric)
234
- threshold, score = _find_best_silhouette_score(Z, m, metric, linkage_criterion)
288
+ threshold, score = _find_optimal_linkage_threshold(
289
+ Z, m, metric, linkage_criterion, opt_metric=opt_metric
290
+ )
235
291
  if score > best_overall_score:
236
292
  best_overall_score = score
237
293
  best_overall_threshold = threshold
238
294
  best_overall_method = method
239
295
  best_overall_metric = metric
240
296
 
297
+ if best_overall_method is None or best_overall_metric is None or best_overall_threshold is None:
298
+ raise ValueError("Optimization failed to determine an optimal threshold.")
241
299
  return best_overall_method, best_overall_metric, best_overall_threshold
242
300
 
243
301
 
244
- def _find_best_silhouette_score(
302
+ def _find_optimal_linkage_threshold(
245
303
  Z: np.ndarray,
246
304
  m: np.ndarray,
247
305
  linkage_metric: str,
248
306
  linkage_criterion: str,
249
- lower_bound: float = 0.001,
250
- upper_bound: float = 1.0,
251
- resolution: float = 0.001,
307
+ opt_metric: str = "silhouette",
252
308
  ) -> Tuple[float, float]:
253
- """Find the best silhouette score using binary search.
309
+ """Find the optimal linkage threshold coefficient (multiplier for np.max(Z[:, 2])) via binary search.
310
+ The threshold is chosen to optimize clustering quality, as measured by the chosen metric.
254
311
 
255
312
  Args:
256
313
  Z (np.ndarray): Linkage matrix.
257
314
  m (np.ndarray): Data matrix.
258
- linkage_metric (str): Linkage metric for silhouette score calculation.
259
- linkage_criterion (str): Clustering criterion.
260
- lower_bound (float, optional): Lower bound for search. Defaults to 0.001.
261
- upper_bound (float, optional): Upper bound for search. Defaults to 1.0.
262
- resolution (float, optional): Desired resolution for the best threshold. Defaults to 0.001.
315
+ linkage_metric (str): The distance metric passed to silhouette_score; the other optimization metrics do not use it.
316
+ linkage_criterion (str): The criterion to pass to fcluster (typically "distance").
317
+ opt_metric (str): The metric to optimize; one of "silhouette", "calinski_harabasz", or "davies_bouldin".
318
+ For "silhouette" and "calinski_harabasz", higher is better; for "davies_bouldin", lower is better.
263
319
 
264
320
  Returns:
265
321
  Tuple[float, float]:
266
- - Best threshold (float): The threshold that yields the best silhouette score.
267
- - Best silhouette score (float): The highest silhouette score achieved.
322
+ - best_threshold: The optimal linkage coefficient for np.max(Z[:, 2]).
323
+ - best_metric_value: The clustering quality metric achieved (with higher being better for
324
+ "silhouette" and "calinski_harabasz", and lower for "davies_bouldin").
268
325
  """
269
- best_score = -np.inf
270
- best_threshold = None
271
-
272
- # Test lower bound
273
- max_d_lower = np.max(Z[:, 2]) * lower_bound
274
- clusters_lower = fcluster(Z, max_d_lower, criterion=linkage_criterion)
275
- try:
276
- score_lower = silhouette_score(m, clusters_lower, metric=linkage_metric)
277
- except ValueError:
278
- score_lower = -np.inf
279
-
280
- # Test upper bound
281
- max_d_upper = np.max(Z[:, 2]) * upper_bound
282
- clusters_upper = fcluster(Z, max_d_upper, criterion=linkage_criterion)
283
- try:
284
- score_upper = silhouette_score(m, clusters_upper, metric=linkage_metric)
285
- except ValueError:
286
- score_upper = -np.inf
287
-
288
- # Determine initial bounds for binary search
289
- if score_lower > score_upper:
290
- best_score = score_lower
326
+ max_d = np.max(Z[:, 2])
327
+ lower_bound = 0.0
328
+ upper_bound = 1.0
329
+ resolution = 1e-6
330
+
331
+ def compute_objective(coefficient: float) -> float:
332
+ """Compute the objective function for a given linkage threshold coefficient."""
333
+ threshold_val = coefficient * max_d
334
+ clusters = fcluster(Z, threshold_val, criterion=linkage_criterion)
335
+ unique_clusters = np.unique(clusters)
336
+ # Return a heavy penalty if the clustering is trivial (a single cluster, or one cluster per row of m).
337
+ if len(unique_clusters) <= 1 or len(unique_clusters) == m.shape[0]:
338
+ return 1e6
339
+ try:
340
+ if opt_metric == "silhouette":
341
+ score = silhouette_score(m, clusters, metric=linkage_metric)
342
+ return -score # We want to maximize the silhouette score.
343
+ elif opt_metric == "calinski_harabasz":
344
+ score = calinski_harabasz_score(m, clusters)
345
+ return -score # Higher is better.
346
+ elif opt_metric == "davies_bouldin":
347
+ score = davies_bouldin_score(m, clusters)
348
+ return score # Lower is better.
349
+ else:
350
+ raise LinkageThresholdError(f"Unknown optimization metric: {opt_metric}.")
351
+ except Exception:
352
+ return 1e6
353
+
354
+ # Initialize the bounds and the best objective value
355
+ obj_lower = compute_objective(lower_bound)
356
+ obj_upper = compute_objective(upper_bound)
357
+ # Determine the initial direction of the search
358
+ if obj_lower < obj_upper:
359
+ best_obj = obj_lower
291
360
  best_threshold = lower_bound
292
- upper_bound = (lower_bound + upper_bound) / 2
361
+ lower = lower_bound
362
+ upper = (lower_bound + upper_bound) / 2
293
363
  else:
294
- best_score = score_upper
364
+ best_obj = obj_upper
295
365
  best_threshold = upper_bound
296
- lower_bound = (lower_bound + upper_bound) / 2
297
-
298
- # Binary search loop
299
- while upper_bound - lower_bound > resolution:
300
- mid_threshold = (upper_bound + lower_bound) / 2
301
- max_d_mid = np.max(Z[:, 2]) * mid_threshold
302
- clusters_mid = fcluster(Z, max_d_mid, criterion=linkage_criterion)
303
- try:
304
- score_mid = silhouette_score(m, clusters_mid, metric=linkage_metric)
305
- except ValueError:
306
- score_mid = -np.inf
307
-
308
- # Update best score and threshold if mid-point is better
309
- if score_mid > best_score:
310
- best_score = score_mid
311
- best_threshold = mid_threshold
312
-
313
- # Adjust bounds based on the scores
314
- if score_lower > score_upper:
315
- upper_bound = mid_threshold
366
+ lower = (lower_bound + upper_bound) / 2
367
+ upper = upper_bound
368
+
369
+ # Perform binary search to find the optimal linkage threshold coefficient
370
+ while (upper - lower) > resolution:
371
+ mid = (upper + lower) / 2
372
+ obj_mid = compute_objective(mid)
373
+ if obj_mid < best_obj:
374
+ best_obj = obj_mid
375
+ best_threshold = mid
376
+
377
+ # Update the bounds based on the objective value
378
+ obj_left = compute_objective((lower + mid) / 2)
379
+ obj_right = compute_objective((mid + upper) / 2)
380
+ if obj_left < obj_right:
381
+ upper = mid
316
382
  else:
317
- lower_bound = mid_threshold
383
+ lower = mid
384
+
385
+ # The objective is the negated score for silhouette and calinski_harabasz, so negate it again to recover the score
386
+ if opt_metric in ["silhouette", "calinski_harabasz"]:
387
+ best_metric_value = -best_obj
388
+ else:
389
+ best_metric_value = best_obj
318
390
 
319
- return best_threshold, float(best_score)
391
+ return best_threshold, float(best_metric_value)
risk/network/graph/api.py CHANGED
@@ -42,6 +42,7 @@ class GraphAPI:
42
42
  linkage_criterion: str = "distance",
43
43
  linkage_method: str = "average",
44
44
  linkage_metric: str = "yule",
45
+ linkage_threshold: float = 0.2,
45
46
  min_cluster_size: int = 5,
46
47
  max_cluster_size: int = 1000,
47
48
  ) -> Graph:
@@ -59,6 +60,7 @@ class GraphAPI:
59
60
  linkage_criterion (str, optional): Clustering criterion for defining domains. Defaults to "distance".
60
61
  linkage_method (str, optional): Clustering method to use. Defaults to "average".
61
62
  linkage_metric (str, optional): Metric to use for calculating distances. Defaults to "yule".
63
+ linkage_threshold (float, optional): Threshold for clustering. Defaults to 0.2.
62
64
  min_cluster_size (int, optional): Minimum size for clusters. Defaults to 5.
63
65
  max_cluster_size (int, optional): Maximum size for clusters. Defaults to 1000.
64
66
 
@@ -76,6 +78,7 @@ class GraphAPI:
76
78
  linkage_criterion=linkage_criterion,
77
79
  linkage_method=linkage_method,
78
80
  linkage_metric=linkage_metric,
81
+ linkage_threshold=linkage_threshold,
79
82
  min_cluster_size=min_cluster_size,
80
83
  max_cluster_size=max_cluster_size,
81
84
  )
@@ -130,6 +133,7 @@ class GraphAPI:
130
133
  linkage_criterion=linkage_criterion,
131
134
  linkage_method=linkage_method,
132
135
  linkage_metric=linkage_metric,
136
+ linkage_threshold=linkage_threshold,
133
137
  )
134
138
  # Trim domains and top annotations based on cluster size constraints
135
139
  domains, trimmed_domains = trim_domains(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: risk-network
3
- Version: 0.0.9b35
3
+ Version: 0.0.9b37
4
4
  Summary: A Python package for biological network analysis
5
5
  Author: Ira Horecka
6
6
  Author-email: Ira Horecka <ira89@icloud.com>
@@ -1,4 +1,4 @@
1
- risk/__init__.py,sha256=r0xnE00ojEe_XG3ZBNV_1n3-vl0XWYuOA9-na0o5v2w,127
1
+ risk/__init__.py,sha256=T82k1TIYSqPy3KdA5kWb-5ZtqtksCWdHZ4e5vTYhTSY,127
2
2
  risk/constants.py,sha256=XInRaH78Slnw_sWgAsBFbUHkyA0h0jL0DKGuQNbOvjM,550
3
3
  risk/risk.py,sha256=s827_lRknFseOP9O4zW8sP-IcCd2EzrpV_tnVY_tz5s,1104
4
4
  risk/annotations/__init__.py,sha256=parsbcux1U4urpUqh9AdzbDWuLj9HlMidycMPkpSQFo,179
@@ -10,13 +10,13 @@ risk/log/parameters.py,sha256=VtwfMzLU1xI4yji3-Ch5vHjH-KdwTfwaEMmi7hFQTs0,5716
10
10
  risk/neighborhoods/__init__.py,sha256=Q74HwTH7okI-vaskJPy2bYwb5sNjGASTzJ6m8V8arCU,234
11
11
  risk/neighborhoods/api.py,sha256=ywngw2TQVV27gYlWDXcs8-qnmeepnvb-W9ov6J6VEPM,23341
12
12
  risk/neighborhoods/community.py,sha256=5Q_-VAJC-5SY5EUsB8gIlemeDoAL85uLjyl16pItHiQ,16699
13
- risk/neighborhoods/domains.py,sha256=jMJ4-Qzwgmo6Hya8h0E2_IcMaLpbuH_FWlmSjJl2ikc,12832
13
+ risk/neighborhoods/domains.py,sha256=lMEKwZaOcFIVFqCPPgER1GQQR_ANIWaNk5m2zRdUH08,16774
14
14
  risk/neighborhoods/neighborhoods.py,sha256=l9FhADB1C-OxM8E9QXOcA4osUDgA1vs4ud-OCGKKybc,21457
15
15
  risk/network/__init__.py,sha256=oVi3FA1XXKD84014Cykq-9bpX4_s0F3aAUfNOU-07Qw,73
16
16
  risk/network/geometry.py,sha256=eVtGHMgBf9fEqQZUFdHWjw-zFYYpfUONoHFSAxoRkug,6219
17
17
  risk/network/io.py,sha256=RCH4nQdgYDXcNwMfpSz7qEmPO0pJ1p9fL0rNQptsQrc,21673
18
18
  risk/network/graph/__init__.py,sha256=ziGJew3yhtqvrb9LUuneDu_LwW2Wa9vd4UuhoL5l1CA,91
19
- risk/network/graph/api.py,sha256=t5Mh5_lD2uTLioEJFfCRe7ncc5iLNYzxd6r05wSiv7s,8169
19
+ risk/network/graph/api.py,sha256=9yoviP7EqFU1okLJZlaLBZzFNmjOHv30B1JgDFNP1bg,8399
20
20
  risk/network/graph/graph.py,sha256=qEWyZvuaGT_vvjhreBdmRPX3gst2wQFaXhFAvikPSqw,12158
21
21
  risk/network/graph/summary.py,sha256=Y_0rL2C1UoQeZQIPVe5LbaCO356Mcc8HisnrXwQsRm8,10289
22
22
  risk/network/plotter/__init__.py,sha256=4gWtQHGzQVNHmEBXi31Zf0tX0y2sTcE66J_yGnn7268,99
@@ -34,8 +34,8 @@ risk/stats/stat_tests.py,sha256=tj0ri9w89_1fsjGLuafTWpfBEwZXpSLn7Ej2aAQ5lxk,1177
34
34
  risk/stats/permutation/__init__.py,sha256=OLmYLm2uj96hPsSaUs0vUqFYw6Thwch_aHtpL7L0ZFw,127
35
35
  risk/stats/permutation/permutation.py,sha256=BWjgdBpLVcHvmwHy0bmD4aJFccxifNBSrrCBPppyKf4,10569
36
36
  risk/stats/permutation/test_functions.py,sha256=KlECWTz1EZ6EPF_OAgHb0uznaIhopiVYb_AKUKuC4no,3120
37
- risk_network-0.0.9b35.dist-info/LICENSE,sha256=jOtLnuWt7d5Hsx6XXB2QxzrSe2sWWh3NgMfFRetluQM,35147
38
- risk_network-0.0.9b35.dist-info/METADATA,sha256=0rVbcV_Dh2DepCIKj-AbFry3t8J1yPNLyY_Kluq141A,47627
39
- risk_network-0.0.9b35.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
40
- risk_network-0.0.9b35.dist-info/top_level.txt,sha256=NX7C2PFKTvC1JhVKv14DFlFAIFnKc6Lpsu1ZfxvQwVw,5
41
- risk_network-0.0.9b35.dist-info/RECORD,,
37
+ risk_network-0.0.9b37.dist-info/LICENSE,sha256=jOtLnuWt7d5Hsx6XXB2QxzrSe2sWWh3NgMfFRetluQM,35147
38
+ risk_network-0.0.9b37.dist-info/METADATA,sha256=5tQlDkFQSqNgXhv-_TZSfraFKAseXh7LCHciBQIR4CQ,47627
39
+ risk_network-0.0.9b37.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
40
+ risk_network-0.0.9b37.dist-info/top_level.txt,sha256=NX7C2PFKTvC1JhVKv14DFlFAIFnKc6Lpsu1ZfxvQwVw,5
41
+ risk_network-0.0.9b37.dist-info/RECORD,,