risk-network 0.0.9b38__py3-none-any.whl → 0.0.9b39__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
risk/__init__.py CHANGED
@@ -7,4 +7,4 @@ RISK: Regional Inference of Significant Kinships
7
7
 
8
8
  from risk.risk import RISK
9
9
 
10
- __version__ = "0.0.9-beta.38"
10
+ __version__ = "0.0.9-beta.39"
risk/neighborhoods/domains.py CHANGED
@@ -10,8 +10,7 @@ from typing import Tuple, Union
10
10
  import numpy as np
11
11
  import pandas as pd
12
12
  from scipy.cluster.hierarchy import linkage, fcluster
13
- from scipy.optimize import minimize_scalar
14
- from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, silhouette_score
13
+ from sklearn.metrics import silhouette_score
15
14
  from tqdm import tqdm
16
15
 
17
16
  from risk.annotations import get_weighted_description
@@ -19,19 +18,13 @@ from risk.constants import GROUP_LINKAGE_METHODS, GROUP_DISTANCE_METRICS
19
18
  from risk.log import logger
20
19
 
21
20
 
22
- class LinkageThresholdError(Exception):
23
- """Exception raised for errors in the linkage threshold optimization process."""
24
-
25
- pass
26
-
27
-
28
21
  def define_domains(
29
22
  top_annotations: pd.DataFrame,
30
23
  significant_neighborhoods_significance: np.ndarray,
31
24
  linkage_criterion: str,
32
25
  linkage_method: str,
33
26
  linkage_metric: str,
34
- linkage_threshold: Union[str, float],
27
+ linkage_threshold: float,
35
28
  ) -> pd.DataFrame:
36
29
  """Define domains and assign nodes to these domains based on their significance scores and clustering,
37
30
  handling errors by assigning unique domains when clustering fails.
@@ -39,19 +32,13 @@ def define_domains(
39
32
  Args:
40
33
  top_annotations (pd.DataFrame): DataFrame of top annotations data for the network nodes.
41
34
  significant_neighborhoods_significance (np.ndarray): The binary significance matrix below alpha.
42
- linkage_criterion (str): The clustering criterion for defining groups. Use "distance" for distance-based
43
- clustering or "maxclust" for a fixed number of clusters. Use "off" to skip clustering.
44
- linkage_method (str): The linkage method for clustering. Use "auto" to try multiple methods.
45
- linkage_metric (str): The linkage metric for clustering. Use "auto" to try multiple metrics.
46
- linkage_threshold (str, float): The linkage threshold for clustering, or one of "silhouette",
47
- "calinski_harabasz", or "davies_bouldin" to optimize the threshold.
35
+ linkage_criterion (str): The clustering criterion for defining groups.
36
+ linkage_method (str): The linkage method for clustering.
37
+ linkage_metric (str): The linkage metric for clustering.
38
+ linkage_threshold (float): The threshold for clustering.
48
39
 
49
40
  Returns:
50
41
  pd.DataFrame: DataFrame with the primary domain for each node.
51
-
52
- Raises:
53
- ValueError: If an improper value is passed for linkage_threshold. Acceptable values are "silhouette",
54
- "calinski_harabasz", "davies_bouldin", or a float value.
55
42
  """
56
43
  try:
57
44
  if linkage_criterion == "off":
@@ -62,10 +49,8 @@ def define_domains(
62
49
  # Safeguard the matrix by replacing NaN, Inf, and -Inf values
63
50
  m = _safeguard_matrix(m)
64
51
  # Optimize silhouette score across different linkage methods and distance metrics
65
- best_linkage, best_metric, best_threshold = (
66
- _optimize_linkage_threshold_across_methods_and_metrics(
67
- m, linkage_criterion, linkage_method, linkage_metric, linkage_threshold
68
- )
52
+ best_linkage, best_metric, best_threshold = _optimize_silhouette_across_linkage_and_metrics(
53
+ m, linkage_criterion, linkage_method, linkage_metric, linkage_threshold
69
54
  )
70
55
  # Perform hierarchical clustering
71
56
  Z = linkage(m, method=best_linkage, metric=best_metric)
@@ -91,9 +76,6 @@ def define_domains(
91
76
  f"Error encountered. Skipping clustering and assigning {n_rows} unique domains."
92
77
  )
93
78
  top_annotations["domain"] = range(1, n_rows + 1) # Assign unique domains
94
- except LinkageThresholdError as e:
95
- # If a LinkageThresholdError is encountered, raise a ValueError with the original exception
96
- raise ValueError(e) from e
97
79
 
98
80
  # Create DataFrames to store domain information
99
81
  node_to_significance = pd.DataFrame(
@@ -215,154 +197,146 @@ def _safeguard_matrix(matrix: np.ndarray) -> np.ndarray:
215
197
  return matrix
216
198
 
217
199
 
218
- def _optimize_linkage_threshold_across_methods_and_metrics(
200
+ def _optimize_silhouette_across_linkage_and_metrics(
219
201
  m: np.ndarray,
220
202
  linkage_criterion: str,
221
203
  linkage_method: str,
222
204
  linkage_metric: str,
223
205
  linkage_threshold: Union[str, float],
224
206
  ) -> Tuple[str, str, float]:
225
- """Optimize the linkage method, metric, and threshold for hierarchical clustering. If the threshold is
226
- a string, optimize the threshold using the specified metric; otherwise, use the provided threshold.
207
+ """Optimize silhouette score across different linkage methods and distance metrics.
227
208
 
228
209
  Args:
229
210
  m (np.ndarray): Data matrix.
230
- linkage_criterion (str): Criterion for fcluster (typically "distance").
231
- linkage_method (str): Linkage method for clustering, or "auto" to try multiple methods.
232
- linkage_metric (str): Distance metric for clustering, or "auto" to try multiple metrics.
233
- linkage_threshold (str, float): Either a numeric threshold or one of the following keywords:
234
- "silhouette", "calinski_harabasz", or "davies_bouldin" to trigger optimization.
211
+ linkage_criterion (str): Clustering criterion.
212
+ linkage_method (str): Linkage method for clustering.
213
+ linkage_metric (str): Linkage metric for clustering.
214
+ linkage_threshold (Union[str, float]): Threshold for clustering. Set to "auto" to optimize.
235
215
 
236
216
  Returns:
237
217
  Tuple[str, str, float]:
238
- - The chosen linkage method.
239
- - The chosen linkage metric.
240
- - The optimized threshold (a float).
241
-
242
- Raises:
243
- ValueError: If linkage_threshold is neither one of the supported keywords nor convertible to float.
218
+ - Best linkage method (str)
219
+ - Best linkage metric (str)
220
+ - Best threshold (float)
244
221
  """
245
- # Supported linkage threshold metrics
246
- supported_linkage_thresholds = {"silhouette", "calinski_harabasz", "davies_bouldin"}
247
-
248
- # If linkage_threshold is a string:
249
- if isinstance(linkage_threshold, str):
250
- if linkage_threshold in supported_linkage_thresholds:
251
- opt_metric = linkage_threshold
252
- else:
253
- try:
254
- threshold_float = float(linkage_threshold)
255
- except (TypeError, ValueError):
256
- raise LinkageThresholdError(
257
- f"linkage_threshold must be one of {', '.join(supported_linkage_thresholds)} or a float value."
258
- )
259
- return linkage_method, linkage_metric, threshold_float
260
- else:
261
- # If not a string, try to convert it to float.
262
- try:
263
- threshold_float = float(linkage_threshold)
264
- except (TypeError, ValueError):
265
- raise LinkageThresholdError(
266
- f"linkage_threshold must be one of {', '.join(supported_linkage_thresholds)} or a float value."
267
- )
268
- return linkage_method, linkage_metric, threshold_float
269
-
270
- # Otherwise, perform optimization using the specified metric (opt_metric).
271
- best_overall_method = None
272
- best_overall_metric = None
273
- best_overall_threshold = None
222
+ # Initialize best overall values
223
+ best_overall_method = linkage_method
224
+ best_overall_metric = linkage_metric
225
+ best_overall_threshold = linkage_threshold
274
226
  best_overall_score = -np.inf
275
227
 
276
- # Use the provided lists if "auto" is specified.
277
- methods = GROUP_LINKAGE_METHODS if linkage_method == "auto" else [linkage_method]
278
- metrics = GROUP_DISTANCE_METRICS if linkage_metric == "auto" else [linkage_metric]
279
- total_combinations = len(methods) * len(metrics)
228
+ # Set linkage methods and metrics to all combinations if "auto" is selected
229
+ linkage_methods = GROUP_LINKAGE_METHODS if linkage_method == "auto" else [linkage_method]
230
+ linkage_metrics = GROUP_DISTANCE_METRICS if linkage_metric == "auto" else [linkage_metric]
231
+ total_combinations = len(linkage_methods) * len(linkage_metrics)
280
232
 
233
+ # Evaluating optimal linkage method and metric
281
234
  for method, metric in tqdm(
282
- product(methods, metrics),
235
+ product(linkage_methods, linkage_metrics),
283
236
  desc="Evaluating optimal linkage method and metric",
284
237
  total=total_combinations,
285
238
  bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
286
239
  ):
240
+ # Some linkage methods and metrics may not work with certain data
287
241
  with suppress(ValueError):
288
242
  Z = linkage(m, method=method, metric=metric)
289
- threshold, score = _find_optimal_linkage_threshold(
290
- Z, m, metric, linkage_criterion, opt_metric=opt_metric
291
- )
292
- if score > best_overall_score:
293
- best_overall_score = score
294
- best_overall_threshold = threshold
295
- best_overall_method = method
296
- best_overall_metric = metric
297
-
298
- if best_overall_method is None or best_overall_metric is None or best_overall_threshold is None:
299
- raise ValueError("Optimization failed to determine an optimal threshold.")
243
+ # Only optimize silhouette score if the threshold is "auto"
244
+ if linkage_threshold == "auto":
245
+ threshold, score = _find_best_silhouette_score(Z, m, metric, linkage_criterion)
246
+ if score > best_overall_score:
247
+ best_overall_score = score
248
+ best_overall_threshold = threshold
249
+ best_overall_method = method
250
+ best_overall_metric = metric
251
+ else:
252
+ # Use the provided threshold without optimization
253
+ score = silhouette_score(
254
+ m,
255
+ fcluster(Z, linkage_threshold * np.max(Z[:, 2]), criterion=linkage_criterion),
256
+ metric=metric,
257
+ )
258
+ if score > best_overall_score:
259
+ best_overall_score = score
260
+ best_overall_threshold = linkage_threshold
261
+ best_overall_method = method
262
+ best_overall_metric = metric
263
+
300
264
  return best_overall_method, best_overall_metric, best_overall_threshold
301
265
 
302
266
 
303
- def _find_optimal_linkage_threshold(
267
+ def _find_best_silhouette_score(
304
268
  Z: np.ndarray,
305
269
  m: np.ndarray,
306
270
  linkage_metric: str,
307
271
  linkage_criterion: str,
308
- opt_metric: str = "silhouette",
272
+ lower_bound: float = 0.001,
273
+ upper_bound: float = 1.0,
274
+ resolution: float = 0.001,
309
275
  ) -> Tuple[float, float]:
310
- """Find the optimal linkage threshold coefficient for hierarchical clustering. The function optimizes
311
- the threshold value using the specified metric (opt_metric).
276
+ """Find the best silhouette score using binary search.
312
277
 
313
278
  Args:
314
- Z (np.ndarray): Linkage matrix generated by a hierarchical clustering algorithm.
315
- m (np.ndarray): Data matrix used for clustering.
316
- linkage_metric (str): Metric used to calculate distances between data points
317
- (e.g., "euclidean" or "cosine").
318
- linkage_criterion (str): Criterion to pass to `fcluster`, typically "distance".
319
- opt_metric (str, optional): Metric to optimize clustering quality. Options are:
320
- "silhouette", "calinski_harabasz", or "davies_bouldin". Defaults to "silhouette".
279
+ Z (np.ndarray): Linkage matrix.
280
+ m (np.ndarray): Data matrix.
281
+ linkage_metric (str): Linkage metric for silhouette score calculation.
282
+ linkage_criterion (str): Clustering criterion.
283
+ lower_bound (float, optional): Lower bound for search. Defaults to 0.001.
284
+ upper_bound (float, optional): Upper bound for search. Defaults to 1.0.
285
+ resolution (float, optional): Desired resolution for the best threshold. Defaults to 0.001.
321
286
 
322
287
  Returns:
323
288
  Tuple[float, float]:
324
- - best_threshold (float): The optimal linkage threshold coefficient.
325
- - best_metric_value (float): The value of the clustering quality metric achieved
326
- at the optimal threshold (higher for "silhouette" and "calinski_harabasz",
327
- lower for "davies_bouldin").
328
-
329
- Raises:
330
- ValueError: If the `opt_metric` argument is not one of the supported metrics.
289
+ - Best threshold (float): The threshold that yields the best silhouette score.
290
+ - Best silhouette score (float): The highest silhouette score achieved.
331
291
  """
332
- # Get the maximum distance in the linkage matrix
333
- max_d = np.max(Z[:, 2])
334
- resolution = 1e-6
335
-
336
- def compute_objective(coefficient: float) -> float:
337
- """Compute the objective function for optimization."""
338
- threshold_val = coefficient * max_d
339
- clusters = fcluster(Z, threshold_val, criterion=linkage_criterion)
340
- unique_clusters = np.unique(clusters)
341
- if len(unique_clusters) <= 1 or len(unique_clusters) == m.shape[0]:
342
- return 1e6
343
- try:
344
- if opt_metric == "silhouette":
345
- score = silhouette_score(m, clusters, metric=linkage_metric)
346
- return -score # We want to maximize the score.
347
- elif opt_metric == "calinski_harabasz":
348
- score = calinski_harabasz_score(m, clusters)
349
- return -score
350
- elif opt_metric == "davies_bouldin":
351
- score = davies_bouldin_score(m, clusters)
352
- return score
353
- else:
354
- raise ValueError(f"Unknown optimization metric: {opt_metric}.")
355
- except Exception:
356
- return 1e6
292
+ best_score = -np.inf
293
+ best_threshold = None
357
294
 
358
- # Optimize the threshold using the specified metric
359
- res = minimize_scalar(
360
- compute_objective, bounds=(0.0, 1.0), method="bounded", options={"xatol": resolution}
361
- )
295
+ # Test lower bound
296
+ max_d_lower = np.max(Z[:, 2]) * lower_bound
297
+ clusters_lower = fcluster(Z, max_d_lower, criterion=linkage_criterion)
298
+ try:
299
+ score_lower = silhouette_score(m, clusters_lower, metric=linkage_metric)
300
+ except ValueError:
301
+ score_lower = -np.inf
302
+
303
+ # Test upper bound
304
+ max_d_upper = np.max(Z[:, 2]) * upper_bound
305
+ clusters_upper = fcluster(Z, max_d_upper, criterion=linkage_criterion)
306
+ try:
307
+ score_upper = silhouette_score(m, clusters_upper, metric=linkage_metric)
308
+ except ValueError:
309
+ score_upper = -np.inf
362
310
 
363
- best_threshold = res.x
364
- best_obj = res.fun
365
- # For silhouette and calinski_harabasz, the objective was negative.
366
- best_metric_value = -best_obj if opt_metric in ["silhouette", "calinski_harabasz"] else best_obj
311
+ # Determine initial bounds for binary search
312
+ if score_lower > score_upper:
313
+ best_score = score_lower
314
+ best_threshold = lower_bound
315
+ upper_bound = (lower_bound + upper_bound) / 2
316
+ else:
317
+ best_score = score_upper
318
+ best_threshold = upper_bound
319
+ lower_bound = (lower_bound + upper_bound) / 2
320
+
321
+ # Binary search loop
322
+ while upper_bound - lower_bound > resolution:
323
+ mid_threshold = (upper_bound + lower_bound) / 2
324
+ max_d_mid = np.max(Z[:, 2]) * mid_threshold
325
+ clusters_mid = fcluster(Z, max_d_mid, criterion=linkage_criterion)
326
+ try:
327
+ score_mid = silhouette_score(m, clusters_mid, metric=linkage_metric)
328
+ except ValueError:
329
+ score_mid = -np.inf
330
+
331
+ # Update best score and threshold if mid-point is better
332
+ if score_mid > best_score:
333
+ best_score = score_mid
334
+ best_threshold = mid_threshold
335
+
336
+ # Adjust bounds based on the scores
337
+ if score_lower > score_upper:
338
+ upper_bound = mid_threshold
339
+ else:
340
+ lower_bound = mid_threshold
367
341
 
368
- return best_threshold, float(best_metric_value)
342
+ return best_threshold, float(best_score)
risk/network/graph/api.py CHANGED
@@ -58,12 +58,9 @@ class GraphAPI:
58
58
  impute_depth (int, optional): Depth for imputing neighbors. Defaults to 0.
59
59
  prune_threshold (float, optional): Distance threshold for pruning neighbors. Defaults to 0.0.
60
60
  linkage_criterion (str, optional): Clustering criterion for defining domains. Defaults to "distance".
61
- linkage_method (str, optional): Clustering method to use. Defaults to "average". Choose "auto"
62
- to automatically select the best linkage method.
63
- linkage_metric (str, optional): Metric to use for calculating distances. Defaults to "yule". Choose "auto"
64
- to automatically select the best linkage metric.
65
- linkage_threshold (str, float, optional): Threshold for clustering. Choose "silhouette", "calinski_harabasz",
66
- or "davies_bouldin" to automatically select the best threshold. Defaults to 0.2.
61
+ linkage_method (str, optional): Clustering method to use. Defaults to "average".
62
+ linkage_metric (str, optional): Metric to use for calculating distances. Defaults to "yule".
63
+ linkage_threshold (float, optional): Threshold for clustering. Defaults to 0.2.
67
64
  min_cluster_size (int, optional): Minimum size for clusters. Defaults to 5.
68
65
  max_cluster_size (int, optional): Maximum size for clusters. Defaults to 1000.
69
66
 
risk_network-0.0.9b38.dist-info/METADATA → risk_network-0.0.9b39.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: risk-network
3
- Version: 0.0.9b38
3
+ Version: 0.0.9b39
4
4
  Summary: A Python package for biological network analysis
5
5
  Author: Ira Horecka
6
6
  Author-email: Ira Horecka <ira89@icloud.com>
risk_network-0.0.9b38.dist-info/RECORD → risk_network-0.0.9b39.dist-info/RECORD CHANGED
@@ -1,4 +1,4 @@
1
- risk/__init__.py,sha256=S22dtKjPn7IvWeLakeN4IajRLsqHuSbBhjvf6z5kHoo,127
1
+ risk/__init__.py,sha256=ewYSGDLHigkwFLI9IW6qDbQk4uS6nb3RTd-k2GCD1b0,127
2
2
  risk/constants.py,sha256=XInRaH78Slnw_sWgAsBFbUHkyA0h0jL0DKGuQNbOvjM,550
3
3
  risk/risk.py,sha256=s827_lRknFseOP9O4zW8sP-IcCd2EzrpV_tnVY_tz5s,1104
4
4
  risk/annotations/__init__.py,sha256=parsbcux1U4urpUqh9AdzbDWuLj9HlMidycMPkpSQFo,179
@@ -10,13 +10,13 @@ risk/log/parameters.py,sha256=VtwfMzLU1xI4yji3-Ch5vHjH-KdwTfwaEMmi7hFQTs0,5716
10
10
  risk/neighborhoods/__init__.py,sha256=Q74HwTH7okI-vaskJPy2bYwb5sNjGASTzJ6m8V8arCU,234
11
11
  risk/neighborhoods/api.py,sha256=ywngw2TQVV27gYlWDXcs8-qnmeepnvb-W9ov6J6VEPM,23341
12
12
  risk/neighborhoods/community.py,sha256=5Q_-VAJC-5SY5EUsB8gIlemeDoAL85uLjyl16pItHiQ,16699
13
- risk/neighborhoods/domains.py,sha256=yaSHymGfRJVuXHIa7BwoKzvIRSg5oLhNoOMg0tsVqV8,15961
13
+ risk/neighborhoods/domains.py,sha256=Yu93mKNCuOpBGa87knAH-XIl260kf-rswPfn3aC9GNo,13937
14
14
  risk/neighborhoods/neighborhoods.py,sha256=l9FhADB1C-OxM8E9QXOcA4osUDgA1vs4ud-OCGKKybc,21457
15
15
  risk/network/__init__.py,sha256=oVi3FA1XXKD84014Cykq-9bpX4_s0F3aAUfNOU-07Qw,73
16
16
  risk/network/geometry.py,sha256=eVtGHMgBf9fEqQZUFdHWjw-zFYYpfUONoHFSAxoRkug,6219
17
17
  risk/network/io.py,sha256=RCH4nQdgYDXcNwMfpSz7qEmPO0pJ1p9fL0rNQptsQrc,21673
18
18
  risk/network/graph/__init__.py,sha256=ziGJew3yhtqvrb9LUuneDu_LwW2Wa9vd4UuhoL5l1CA,91
19
- risk/network/graph/api.py,sha256=fOyd-5rRnqmtquproP90scehewd0UtOVZS65hCuwasI,8684
19
+ risk/network/graph/api.py,sha256=9yoviP7EqFU1okLJZlaLBZzFNmjOHv30B1JgDFNP1bg,8399
20
20
  risk/network/graph/graph.py,sha256=qEWyZvuaGT_vvjhreBdmRPX3gst2wQFaXhFAvikPSqw,12158
21
21
  risk/network/graph/summary.py,sha256=Y_0rL2C1UoQeZQIPVe5LbaCO356Mcc8HisnrXwQsRm8,10289
22
22
  risk/network/plotter/__init__.py,sha256=4gWtQHGzQVNHmEBXi31Zf0tX0y2sTcE66J_yGnn7268,99
@@ -34,8 +34,8 @@ risk/stats/stat_tests.py,sha256=tj0ri9w89_1fsjGLuafTWpfBEwZXpSLn7Ej2aAQ5lxk,1177
34
34
  risk/stats/permutation/__init__.py,sha256=OLmYLm2uj96hPsSaUs0vUqFYw6Thwch_aHtpL7L0ZFw,127
35
35
  risk/stats/permutation/permutation.py,sha256=BWjgdBpLVcHvmwHy0bmD4aJFccxifNBSrrCBPppyKf4,10569
36
36
  risk/stats/permutation/test_functions.py,sha256=KlECWTz1EZ6EPF_OAgHb0uznaIhopiVYb_AKUKuC4no,3120
37
- risk_network-0.0.9b38.dist-info/LICENSE,sha256=jOtLnuWt7d5Hsx6XXB2QxzrSe2sWWh3NgMfFRetluQM,35147
38
- risk_network-0.0.9b38.dist-info/METADATA,sha256=mAB2KQoRWeOH13radrcMeW5dalpkPkl4YtjfUpQhJXI,47627
39
- risk_network-0.0.9b38.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
40
- risk_network-0.0.9b38.dist-info/top_level.txt,sha256=NX7C2PFKTvC1JhVKv14DFlFAIFnKc6Lpsu1ZfxvQwVw,5
41
- risk_network-0.0.9b38.dist-info/RECORD,,
37
+ risk_network-0.0.9b39.dist-info/LICENSE,sha256=jOtLnuWt7d5Hsx6XXB2QxzrSe2sWWh3NgMfFRetluQM,35147
38
+ risk_network-0.0.9b39.dist-info/METADATA,sha256=y3xDx1OCYpCS1OgBMUTNIK5y8HFORWHag4PLnyAXc5g,47627
39
+ risk_network-0.0.9b39.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
40
+ risk_network-0.0.9b39.dist-info/top_level.txt,sha256=NX7C2PFKTvC1JhVKv14DFlFAIFnKc6Lpsu1ZfxvQwVw,5
41
+ risk_network-0.0.9b39.dist-info/RECORD,,