risk-network 0.0.9b37__tar.gz → 0.0.9b39__tar.gz

This diff compares the contents of two publicly available package versions as they were released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (48)
  1. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/PKG-INFO +1 -1
  2. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/__init__.py +1 -1
  3. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/neighborhoods/domains.py +107 -156
  4. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk_network.egg-info/PKG-INFO +1 -1
  5. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/LICENSE +0 -0
  6. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/MANIFEST.in +0 -0
  7. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/README.md +0 -0
  8. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/pyproject.toml +0 -0
  9. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/annotations/__init__.py +0 -0
  10. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/annotations/annotations.py +0 -0
  11. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/annotations/io.py +0 -0
  12. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/constants.py +0 -0
  13. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/log/__init__.py +0 -0
  14. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/log/console.py +0 -0
  15. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/log/parameters.py +0 -0
  16. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/neighborhoods/__init__.py +0 -0
  17. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/neighborhoods/api.py +0 -0
  18. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/neighborhoods/community.py +0 -0
  19. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/neighborhoods/neighborhoods.py +0 -0
  20. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/network/__init__.py +0 -0
  21. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/network/geometry.py +0 -0
  22. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/network/graph/__init__.py +0 -0
  23. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/network/graph/api.py +0 -0
  24. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/network/graph/graph.py +0 -0
  25. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/network/graph/summary.py +0 -0
  26. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/network/io.py +0 -0
  27. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/network/plotter/__init__.py +0 -0
  28. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/network/plotter/api.py +0 -0
  29. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/network/plotter/canvas.py +0 -0
  30. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/network/plotter/contour.py +0 -0
  31. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/network/plotter/labels.py +0 -0
  32. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/network/plotter/network.py +0 -0
  33. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/network/plotter/plotter.py +0 -0
  34. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/network/plotter/utils/colors.py +0 -0
  35. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/network/plotter/utils/layout.py +0 -0
  36. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/risk.py +0 -0
  37. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/stats/__init__.py +0 -0
  38. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/stats/permutation/__init__.py +0 -0
  39. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/stats/permutation/permutation.py +0 -0
  40. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/stats/permutation/test_functions.py +0 -0
  41. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/stats/significance.py +0 -0
  42. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk/stats/stat_tests.py +0 -0
  43. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk_network.egg-info/SOURCES.txt +0 -0
  44. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk_network.egg-info/dependency_links.txt +0 -0
  45. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk_network.egg-info/requires.txt +0 -0
  46. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/risk_network.egg-info/top_level.txt +0 -0
  47. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/setup.cfg +0 -0
  48. {risk_network-0.0.9b37 → risk_network-0.0.9b39}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: risk-network
3
- Version: 0.0.9b37
3
+ Version: 0.0.9b39
4
4
  Summary: A Python package for biological network analysis
5
5
  Author: Ira Horecka
6
6
  Author-email: Ira Horecka <ira89@icloud.com>
@@ -7,4 +7,4 @@ RISK: Regional Inference of Significant Kinships
7
7
 
8
8
  from risk.risk import RISK
9
9
 
10
- __version__ = "0.0.9-beta.37"
10
+ __version__ = "0.0.9-beta.39"
@@ -10,7 +10,7 @@ from typing import Tuple, Union
10
10
  import numpy as np
11
11
  import pandas as pd
12
12
  from scipy.cluster.hierarchy import linkage, fcluster
13
- from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, silhouette_score
13
+ from sklearn.metrics import silhouette_score
14
14
  from tqdm import tqdm
15
15
 
16
16
  from risk.annotations import get_weighted_description
@@ -18,19 +18,13 @@ from risk.constants import GROUP_LINKAGE_METHODS, GROUP_DISTANCE_METRICS
18
18
  from risk.log import logger
19
19
 
20
20
 
21
- class LinkageThresholdError(Exception):
22
- """Exception raised for errors in the linkage threshold optimization process."""
23
-
24
- pass
25
-
26
-
27
21
  def define_domains(
28
22
  top_annotations: pd.DataFrame,
29
23
  significant_neighborhoods_significance: np.ndarray,
30
24
  linkage_criterion: str,
31
25
  linkage_method: str,
32
26
  linkage_metric: str,
33
- linkage_threshold: Union[str, float],
27
+ linkage_threshold: float,
34
28
  ) -> pd.DataFrame:
35
29
  """Define domains and assign nodes to these domains based on their significance scores and clustering,
36
30
  handling errors by assigning unique domains when clustering fails.
@@ -38,19 +32,13 @@ def define_domains(
38
32
  Args:
39
33
  top_annotations (pd.DataFrame): DataFrame of top annotations data for the network nodes.
40
34
  significant_neighborhoods_significance (np.ndarray): The binary significance matrix below alpha.
41
- linkage_criterion (str): The clustering criterion for defining groups. Use "distance" for distance-based
42
- clustering or "maxclust" for a fixed number of clusters. Use "off" to skip clustering.
43
- linkage_method (str): The linkage method for clustering. Use "auto" to try multiple methods.
44
- linkage_metric (str): The linkage metric for clustering. Use "auto" to try multiple metrics.
45
- linkage_threshold (float): The linkage threshold for clustering, or one of "silhouette", "calinski_harabasz",
46
- or "davies_bouldin" to optimize the threshold.
35
+ linkage_criterion (str): The clustering criterion for defining groups.
36
+ linkage_method (str): The linkage method for clustering.
37
+ linkage_metric (str): The linkage metric for clustering.
38
+ linkage_threshold (float): The threshold for clustering.
47
39
 
48
40
  Returns:
49
41
  pd.DataFrame: DataFrame with the primary domain for each node.
50
-
51
- Raises:
52
- ValueError: If an improper value is passed for linkage_threshold. Acceptable values are "silhouette",
53
- "calinski_harabasz", "davies_bouldin", or a float value.
54
42
  """
55
43
  try:
56
44
  if linkage_criterion == "off":
@@ -61,10 +49,8 @@ def define_domains(
61
49
  # Safeguard the matrix by replacing NaN, Inf, and -Inf values
62
50
  m = _safeguard_matrix(m)
63
51
  # Optimize silhouette score across different linkage methods and distance metrics
64
- best_linkage, best_metric, best_threshold = (
65
- _optimize_linkage_threshold_across_methods_and_metrics(
66
- m, linkage_criterion, linkage_method, linkage_metric, linkage_threshold
67
- )
52
+ best_linkage, best_metric, best_threshold = _optimize_silhouette_across_linkage_and_metrics(
53
+ m, linkage_criterion, linkage_method, linkage_metric, linkage_threshold
68
54
  )
69
55
  # Perform hierarchical clustering
70
56
  Z = linkage(m, method=best_linkage, metric=best_metric)
@@ -90,9 +76,6 @@ def define_domains(
90
76
  f"Error encountered. Skipping clustering and assigning {n_rows} unique domains."
91
77
  )
92
78
  top_annotations["domain"] = range(1, n_rows + 1) # Assign unique domains
93
- except LinkageThresholdError as e:
94
- # If a LinkageThresholdError is encountered, raise a ValueError with the original exception
95
- raise ValueError(e) from e
96
79
 
97
80
  # Create DataFrames to store domain information
98
81
  node_to_significance = pd.DataFrame(
@@ -214,178 +197,146 @@ def _safeguard_matrix(matrix: np.ndarray) -> np.ndarray:
214
197
  return matrix
215
198
 
216
199
 
217
- def _optimize_linkage_threshold_across_methods_and_metrics(
200
+ def _optimize_silhouette_across_linkage_and_metrics(
218
201
  m: np.ndarray,
219
202
  linkage_criterion: str,
220
203
  linkage_method: str,
221
204
  linkage_metric: str,
222
205
  linkage_threshold: Union[str, float],
223
206
  ) -> Tuple[str, str, float]:
224
- """Optimize the linkage method, metric, and threshold for hierarchical clustering. If the threshold is
225
- a string, optimize the threshold using the specified metric; otherwise, use the provided threshold.
207
+ """Optimize silhouette score across different linkage methods and distance metrics.
226
208
 
227
209
  Args:
228
210
  m (np.ndarray): Data matrix.
229
- linkage_criterion (str): Criterion for fcluster (typically "distance").
230
- linkage_method (str): Linkage method for clustering, or "auto" to try multiple methods.
231
- linkage_metric (str): Distance metric for clustering, or "auto" to try multiple metrics.
232
- linkage_threshold (Union[str, float]): Either a numeric threshold or one of
233
- "silhouette", "calinski_harabasz", or "davies_bouldin" to trigger optimization.
211
+ linkage_criterion (str): Clustering criterion.
212
+ linkage_method (str): Linkage method for clustering.
213
+ linkage_metric (str): Linkage metric for clustering.
214
+ linkage_threshold (Union[str, float]): Threshold for clustering. Set to "auto" to optimize.
234
215
 
235
216
  Returns:
236
217
  Tuple[str, str, float]:
237
- - The chosen linkage method.
238
- - The chosen linkage metric.
239
- - The optimized threshold (a float).
240
-
241
- Raises:
242
- ValueError: If linkage_threshold is neither one of the supported keywords nor convertible to float.
218
+ - Best linkage method (str)
219
+ - Best linkage metric (str)
220
+ - Best threshold (float)
243
221
  """
244
- # Supported linkage threshold metrics
245
- supported_linkage_thresholds = {"silhouette", "calinski_harabasz", "davies_bouldin"}
246
-
247
- # If linkage_threshold is a string:
248
- if isinstance(linkage_threshold, str):
249
- if linkage_threshold in supported_linkage_thresholds:
250
- opt_metric = linkage_threshold
251
- else:
252
- try:
253
- threshold_float = float(linkage_threshold)
254
- except (TypeError, ValueError):
255
- raise LinkageThresholdError(
256
- f"linkage_threshold must be one of {', '.join(supported_linkage_thresholds)} or a float value."
257
- )
258
- return linkage_method, linkage_metric, threshold_float
259
- else:
260
- # If not a string, try to convert it to float.
261
- try:
262
- threshold_float = float(linkage_threshold)
263
- except (TypeError, ValueError):
264
- raise LinkageThresholdError(
265
- f"linkage_threshold must be one of {', '.join(supported_linkage_thresholds)} or a float value."
266
- )
267
- return linkage_method, linkage_metric, threshold_float
268
-
269
- # Otherwise, perform optimization using the specified metric (opt_metric).
270
- best_overall_method = None
271
- best_overall_metric = None
272
- best_overall_threshold = None
222
+ # Initialize best overall values
223
+ best_overall_method = linkage_method
224
+ best_overall_metric = linkage_metric
225
+ best_overall_threshold = linkage_threshold
273
226
  best_overall_score = -np.inf
274
227
 
275
- # Use the provided lists if "auto" is specified.
276
- methods = GROUP_LINKAGE_METHODS if linkage_method == "auto" else [linkage_method]
277
- metrics = GROUP_DISTANCE_METRICS if linkage_metric == "auto" else [linkage_metric]
278
- total_combinations = len(methods) * len(metrics)
228
+ # Set linkage methods and metrics to all combinations if "auto" is selected
229
+ linkage_methods = GROUP_LINKAGE_METHODS if linkage_method == "auto" else [linkage_method]
230
+ linkage_metrics = GROUP_DISTANCE_METRICS if linkage_metric == "auto" else [linkage_metric]
231
+ total_combinations = len(linkage_methods) * len(linkage_metrics)
279
232
 
233
+ # Evaluating optimal linkage method and metric
280
234
  for method, metric in tqdm(
281
- product(methods, metrics),
235
+ product(linkage_methods, linkage_metrics),
282
236
  desc="Evaluating optimal linkage method and metric",
283
237
  total=total_combinations,
284
238
  bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
285
239
  ):
240
+ # Some linkage methods and metrics may not work with certain data
286
241
  with suppress(ValueError):
287
242
  Z = linkage(m, method=method, metric=metric)
288
- threshold, score = _find_optimal_linkage_threshold(
289
- Z, m, metric, linkage_criterion, opt_metric=opt_metric
290
- )
291
- if score > best_overall_score:
292
- best_overall_score = score
293
- best_overall_threshold = threshold
294
- best_overall_method = method
295
- best_overall_metric = metric
296
-
297
- if best_overall_method is None or best_overall_metric is None or best_overall_threshold is None:
298
- raise ValueError("Optimization failed to determine an optimal threshold.")
243
+ # Only optimize silhouette score if the threshold is "auto"
244
+ if linkage_threshold == "auto":
245
+ threshold, score = _find_best_silhouette_score(Z, m, metric, linkage_criterion)
246
+ if score > best_overall_score:
247
+ best_overall_score = score
248
+ best_overall_threshold = threshold
249
+ best_overall_method = method
250
+ best_overall_metric = metric
251
+ else:
252
+ # Use the provided threshold without optimization
253
+ score = silhouette_score(
254
+ m,
255
+ fcluster(Z, linkage_threshold * np.max(Z[:, 2]), criterion=linkage_criterion),
256
+ metric=metric,
257
+ )
258
+ if score > best_overall_score:
259
+ best_overall_score = score
260
+ best_overall_threshold = linkage_threshold
261
+ best_overall_method = method
262
+ best_overall_metric = metric
263
+
299
264
  return best_overall_method, best_overall_metric, best_overall_threshold
300
265
 
301
266
 
302
- def _find_optimal_linkage_threshold(
267
+ def _find_best_silhouette_score(
303
268
  Z: np.ndarray,
304
269
  m: np.ndarray,
305
270
  linkage_metric: str,
306
271
  linkage_criterion: str,
307
- opt_metric: str = "silhouette",
272
+ lower_bound: float = 0.001,
273
+ upper_bound: float = 1.0,
274
+ resolution: float = 0.001,
308
275
  ) -> Tuple[float, float]:
309
- """Find the optimal linkage threshold coefficient (multiplier for np.max(Z[:, 2])) via binary search.
310
- The threshold is chosen to optimize clustering quality, as measured by the chosen metric.
276
+ """Find the best silhouette score using binary search.
311
277
 
312
278
  Args:
313
279
  Z (np.ndarray): Linkage matrix.
314
280
  m (np.ndarray): Data matrix.
315
- linkage_metric (str): The metric used for clustering quality calculation.
316
- linkage_criterion (str): The criterion to pass to fcluster (typically "distance").
317
- opt_metric (str): The metric to optimize; one of "silhouette", "calinski_harabasz", or "davies_bouldin".
318
- For "silhouette" and "calinski_harabasz", higher is better; for "davies_bouldin", lower is better.
281
+ linkage_metric (str): Linkage metric for silhouette score calculation.
282
+ linkage_criterion (str): Clustering criterion.
283
+ lower_bound (float, optional): Lower bound for search. Defaults to 0.001.
284
+ upper_bound (float, optional): Upper bound for search. Defaults to 1.0.
285
+ resolution (float, optional): Desired resolution for the best threshold. Defaults to 0.001.
319
286
 
320
287
  Returns:
321
288
  Tuple[float, float]:
322
- - best_threshold: The optimal linkage coefficient for np.max(Z[:, 2]).
323
- - best_metric_value: The clustering quality metric achieved (with higher being better for
324
- "silhouette" and "calinski_harabasz", and lower for "davies_bouldin").
289
+ - Best threshold (float): The threshold that yields the best silhouette score.
290
+ - Best silhouette score (float): The highest silhouette score achieved.
325
291
  """
326
- max_d = np.max(Z[:, 2])
327
- lower_bound = 0.0
328
- upper_bound = 1.0
329
- resolution = 1e-6
330
-
331
- def compute_objective(coefficient: float) -> float:
332
- """Compute the objective function for a given linkage threshold coefficient."""
333
- threshold_val = coefficient * max_d
334
- clusters = fcluster(Z, threshold_val, criterion=linkage_criterion)
335
- unique_clusters = np.unique(clusters)
336
- # Return a heavy penalty if the clustering is trivial.
337
- if len(unique_clusters) <= 1 or len(unique_clusters) == m.shape[0]:
338
- return 1e6
339
- try:
340
- if opt_metric == "silhouette":
341
- score = silhouette_score(m, clusters, metric=linkage_metric)
342
- return -score # We want to maximize the silhouette score.
343
- elif opt_metric == "calinski_harabasz":
344
- score = calinski_harabasz_score(m, clusters)
345
- return -score # Higher is better.
346
- elif opt_metric == "davies_bouldin":
347
- score = davies_bouldin_score(m, clusters)
348
- return score # Lower is better.
349
- else:
350
- raise LinkageThresholdError(f"Unknown optimization metric: {opt_metric}.")
351
- except Exception:
352
- return 1e6
353
-
354
- # Initialize the bounds and the best objective value
355
- obj_lower = compute_objective(lower_bound)
356
- obj_upper = compute_objective(upper_bound)
357
- # Determine the initial direction of the search
358
- if obj_lower < obj_upper:
359
- best_obj = obj_lower
292
+ best_score = -np.inf
293
+ best_threshold = None
294
+
295
+ # Test lower bound
296
+ max_d_lower = np.max(Z[:, 2]) * lower_bound
297
+ clusters_lower = fcluster(Z, max_d_lower, criterion=linkage_criterion)
298
+ try:
299
+ score_lower = silhouette_score(m, clusters_lower, metric=linkage_metric)
300
+ except ValueError:
301
+ score_lower = -np.inf
302
+
303
+ # Test upper bound
304
+ max_d_upper = np.max(Z[:, 2]) * upper_bound
305
+ clusters_upper = fcluster(Z, max_d_upper, criterion=linkage_criterion)
306
+ try:
307
+ score_upper = silhouette_score(m, clusters_upper, metric=linkage_metric)
308
+ except ValueError:
309
+ score_upper = -np.inf
310
+
311
+ # Determine initial bounds for binary search
312
+ if score_lower > score_upper:
313
+ best_score = score_lower
360
314
  best_threshold = lower_bound
361
- lower = lower_bound
362
- upper = (lower_bound + upper_bound) / 2
315
+ upper_bound = (lower_bound + upper_bound) / 2
363
316
  else:
364
- best_obj = obj_upper
317
+ best_score = score_upper
365
318
  best_threshold = upper_bound
366
- lower = (lower_bound + upper_bound) / 2
367
- upper = upper_bound
368
-
369
- # Perform binary search to find the optimal linkage threshold coefficient
370
- while (upper - lower) > resolution:
371
- mid = (upper + lower) / 2
372
- obj_mid = compute_objective(mid)
373
- if obj_mid < best_obj:
374
- best_obj = obj_mid
375
- best_threshold = mid
376
-
377
- # Update the bounds based on the objective value
378
- obj_left = compute_objective((lower + mid) / 2)
379
- obj_right = compute_objective((mid + upper) / 2)
380
- if obj_left < obj_right:
381
- upper = mid
382
- else:
383
- lower = mid
319
+ lower_bound = (lower_bound + upper_bound) / 2
384
320
 
385
- # If the optimization metric is silhouette or calinski_harabasz, return the negative value
386
- if opt_metric in ["silhouette", "calinski_harabasz"]:
387
- best_metric_value = -best_obj
388
- else:
389
- best_metric_value = best_obj
321
+ # Binary search loop
322
+ while upper_bound - lower_bound > resolution:
323
+ mid_threshold = (upper_bound + lower_bound) / 2
324
+ max_d_mid = np.max(Z[:, 2]) * mid_threshold
325
+ clusters_mid = fcluster(Z, max_d_mid, criterion=linkage_criterion)
326
+ try:
327
+ score_mid = silhouette_score(m, clusters_mid, metric=linkage_metric)
328
+ except ValueError:
329
+ score_mid = -np.inf
330
+
331
+ # Update best score and threshold if mid-point is better
332
+ if score_mid > best_score:
333
+ best_score = score_mid
334
+ best_threshold = mid_threshold
335
+
336
+ # Adjust bounds based on the scores
337
+ if score_lower > score_upper:
338
+ upper_bound = mid_threshold
339
+ else:
340
+ lower_bound = mid_threshold
390
341
 
391
- return best_threshold, float(best_metric_value)
342
+ return best_threshold, float(best_score)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: risk-network
3
- Version: 0.0.9b37
3
+ Version: 0.0.9b39
4
4
  Summary: A Python package for biological network analysis
5
5
  Author: Ira Horecka
6
6
  Author-email: Ira Horecka <ira89@icloud.com>
File without changes