risk-network 0.0.9b38__py3-none-any.whl → 0.0.9b40__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
risk/__init__.py CHANGED
@@ -7,4 +7,4 @@ RISK: Regional Inference of Significant Kinships
 
  from risk.risk import RISK
 
- __version__ = "0.0.9-beta.38"
+ __version__ = "0.0.9-beta.40"
risk/annotations/annotations.py CHANGED
@@ -12,8 +12,9 @@ import networkx as nx
  import nltk
  import numpy as np
  import pandas as pd
- from nltk.tokenize import word_tokenize
  from nltk.corpus import stopwords
+ from nltk.stem import WordNetLemmatizer
+ from nltk.tokenize import word_tokenize
 
  from risk.log import logger
  from scipy.sparse import coo_matrix
@@ -31,11 +32,17 @@ def _setup_nltk():
      except LookupError:
          nltk.download("stopwords")
 
+     try:
+         nltk.data.find("corpora/wordnet")
+     except LookupError:
+         nltk.download("wordnet")
+
 
  # Ensure you have the necessary NLTK data
  _setup_nltk()
- # Initialize English stopwords
- stop_words = set(stopwords.words("english"))
+ # Use NLTK's stopwords
+ STOP_WORDS = set(stopwords.words("english"))
+ LEMMATIZER = WordNetLemmatizer()
 
 
  def load_annotations(
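The new `corpora/wordnet` lookup backs the module-level `WordNetLemmatizer`, which merges inflected forms of a token before counting. A minimal illustrative sketch of what the lemmatizer contributes (not code from the package):

```python
# Illustrative only: the lemmatizer collapses inflected forms onto one lemma,
# so "domains" and "domain" accumulate a single weighted count downstream.
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download("wordnet")  # the same resource _setup_nltk() fetches on demand
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("domains"))   # "domain"
print(lemmatizer.lemmatize("pathways"))  # "pathway"
```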
@@ -208,104 +215,121 @@ def define_top_annotations(
 
  def get_weighted_description(words_column: pd.Series, scores_column: pd.Series) -> str:
      """Generate a weighted description from words and their corresponding scores,
-     with support for stopwords filtering and improved weighting logic.
+     using improved weighting logic with normalization, lemmatization, and aggregation.
 
      Args:
-         words_column (pd.Series): A pandas Series containing strings to process.
+         words_column (pd.Series): A pandas Series containing strings (phrases) to process.
          scores_column (pd.Series): A pandas Series containing significance scores to weigh the terms.
 
      Returns:
-         str: A coherent description formed from the most frequent and significant words, weighed by significance scores.
+         str: A coherent description formed from the most frequent and significant words.
      """
-     # Handle case where all scores are the same
+     # Normalize significance scores to [0,1]. If all scores are identical, use 1.
      if scores_column.max() == scores_column.min():
-         normalized_scores = pd.Series([1] * len(scores_column))
+         normalized_scores = pd.Series([1] * len(scores_column), index=scores_column.index)
      else:
-         # Normalize the significance scores to be between 0 and 1
          normalized_scores = (scores_column - scores_column.min()) / (
              scores_column.max() - scores_column.min()
          )
 
-     # Combine words and normalized scores to create weighted words
+     # Accumulate weighted counts for each token (after cleaning and lemmatization)
+     weighted_counts = {}
+     for phrase, score in zip(words_column, normalized_scores):
+         # Tokenize the phrase
+         tokens = word_tokenize(str(phrase))
+         # Determine the weight (scale factor; here multiplying normalized score by 10)
+         weight = max(1, int((0 if pd.isna(score) else score) * 10))
+         for token in tokens:
+             # Clean token: lowercase and remove extraneous punctuation (but preserve intra-word hyphens)
+             token_clean = re.sub(r"[^\w\-]", "", token.lower()).strip()
+             if not token_clean:
+                 continue
+             # Skip tokens that are pure numbers
+             if token_clean.isdigit():
+                 continue
+             # Skip stopwords
+             if token_clean in STOP_WORDS:
+                 continue
+             # Lemmatize the token to merge similar forms
+             token_norm = LEMMATIZER.lemmatize(token_clean)
+             weighted_counts[token_norm] = weighted_counts.get(token_norm, 0) + weight
+
+     # Reconstruct a weighted token list by repeating each token by its aggregated count.
      weighted_words = []
-     for word, score in zip(words_column, normalized_scores):
-         word = str(word)
-         if word not in stop_words:  # Skip stopwords
-             weight = max(1, int((0 if pd.isna(score) else score) * 10))
-             weighted_words.extend([word] * weight)
-
-     # Tokenize the weighted words, but preserve number-word patterns like '4-alpha'
-     tokens = word_tokenize(" ".join(weighted_words))
-     # Ensure we treat "4-alpha" or other "number-word" patterns as single tokens
+     for token, count in weighted_counts.items():
+         weighted_words.extend([token] * count)
+
+     # Combine tokens that match number-word patterns (e.g. "4-alpha") and remove pure numeric tokens.
      combined_tokens = []
-     for token in tokens:
-         # Match patterns like '4-alpha' or '5-hydroxy' and keep them together
+     for token in weighted_words:
          if re.match(r"^\d+-\w+", token):
              combined_tokens.append(token)
-         elif token.replace(".", "", 1).isdigit():  # Handle pure numeric tokens
-             # Ignore pure numbers as descriptions unless necessary
+         elif token.replace(".", "", 1).isdigit():
              continue
          else:
              combined_tokens.append(token)
 
-     # Prevent descriptions like just '4' from being selected
+     # If the only token is numeric, return a default value.
      if len(combined_tokens) == 1 and combined_tokens[0].isdigit():
-         return "N/A"  # Return "N/A" for cases where it's just a number
+         return "N/A"
 
-     # Simplify the word list and generate the description
+     # Simplify the token list to remove near-duplicates based on the Jaccard index.
      simplified_words = _simplify_word_list(combined_tokens)
+     # Generate a coherent description from the simplified words.
      description = _generate_coherent_description(simplified_words)
 
      return description
 
 
  def _simplify_word_list(words: List[str], threshold: float = 0.80) -> List[str]:
-     """Filter out words that are too similar based on the Jaccard index, keeping the word with the higher count.
+     """Filter out words that are too similar based on the Jaccard index,
+     keeping the word with the higher aggregated count.
 
      Args:
-         words (list of str): The list of words to be filtered.
+         words (List[str]): The list of tokens to be filtered.
          threshold (float, optional): The similarity threshold for the Jaccard index. Defaults to 0.80.
 
      Returns:
-         list of str: A list of filtered words, where similar words are reduced to the most frequent one.
+         List[str]: A list of filtered words, where similar words are reduced to the most frequent one.
      """
-     # Count the occurrences of each word
+     # Count the occurrences (which reflect the weighted importance)
      word_counts = Counter(words)
      filtered_words = []
      used_words = set()
-     # Iterate through the words to find similar words
-     for word in word_counts:
+
+     # Iterate through words sorted by descending weighted frequency
+     for word in sorted(word_counts, key=lambda w: word_counts[w], reverse=True):
          if word in used_words:
              continue
 
          word_set = set(word)
-         # Find similar words based on the Jaccard index
+         # Find similar words (including the current word) based on the Jaccard index
          similar_words = [
              other_word
              for other_word in word_counts
              if _calculate_jaccard_index(word_set, set(other_word)) >= threshold
          ]
-         # Sort by frequency and choose the most frequent word
+         # Choose the word with the highest weighted count among the similar group
          similar_words.sort(key=lambda w: word_counts[w], reverse=True)
          best_word = similar_words[0]
          filtered_words.append(best_word)
          used_words.update(similar_words)
 
+     # Preserve the original order (by frequency) from the filtered set
      final_words = [word for word in words if word in filtered_words]
 
      return final_words
 
 
  def _calculate_jaccard_index(set1: Set[Any], set2: Set[Any]) -> float:
-     """Calculate the Jaccard Index of two sets.
+     """Calculate the Jaccard index between two sets.
 
      Args:
-         set1 (set): The first set for comparison.
-         set2 (set): The second set for comparison.
+         set1 (Set[Any]): The first set.
+         set2 (Set[Any]): The second set.
 
      Returns:
-         float: The Jaccard Index, which is the ratio of the intersection to the union of the two sets.
-             Returns 0 if the union of the sets is empty.
+         float: The Jaccard index (intersection over union). Returns 0 if the union is empty.
      """
      intersection = len(set1.intersection(set2))
      union = len(set1.union(set2))
@@ -313,28 +337,28 @@ def _calculate_jaccard_index(set1: Set[Any], set2: Set[Any]) -> float:
 
 
  def _generate_coherent_description(words: List[str]) -> str:
-     """Generate a coherent description from a list of words or numerical string values.
+     """Generate a coherent description from a list of words.
+
      If there is only one unique entry, return it directly.
+     Otherwise, order the words by frequency and join them into a single string.
 
      Args:
-         words (List): A list of words or numerical string values.
+         words (List[str]): A list of tokens.
 
      Returns:
-         str: A coherent description formed by arranging the words in a logical sequence.
+         str: A coherent, space-separated description.
      """
-     # If there are no words, return a keyword indicating no data is available
      if not words:
          return "N/A"
 
-     # If there's only one unique word, return it directly
+     # If there is only one unique word, return it directly
      unique_words = set(words)
      if len(unique_words) == 1:
          return list(unique_words)[0]
 
-     # Count the frequency of each word and sort them by frequency
+     # Count weighted occurrences and sort in descending order.
      word_counts = Counter(words)
      most_common_words = [word for word, _ in word_counts.most_common()]
-     # Join the most common words to form a coherent description based on frequency
      description = " ".join(most_common_words)
 
      return description
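Net effect of the annotations changes: weighting moves from whole phrases to cleaned, lemmatized tokens whose counts are scaled by the normalized significance scores. A hedged usage sketch on toy data; only the import path is taken from this diff (domains.py imports `get_weighted_description` from `risk.annotations`), the Series values are made up and no particular output is implied:

```python
# Toy example of the reworked weighted-description pipeline (illustrative data).
import pandas as pd
from risk.annotations import get_weighted_description

words = pd.Series(["ribosome biogenesis", "ribosomal subunit biogenesis", "4-alpha helix"])
scores = pd.Series([12.0, 8.0, 1.5])

# Scores are min-max normalized, phrases tokenized, stopwords and pure numbers
# dropped, tokens lemmatized, and the frequency-ordered tokens joined.
print(get_weighted_description(words, scores))
```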
risk/neighborhoods/domains.py CHANGED
@@ -10,19 +10,22 @@ from typing import Tuple, Union
  import numpy as np
  import pandas as pd
  from scipy.cluster.hierarchy import linkage, fcluster
- from scipy.optimize import minimize_scalar
- from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, silhouette_score
+ from sklearn.metrics import silhouette_score
  from tqdm import tqdm
 
  from risk.annotations import get_weighted_description
- from risk.constants import GROUP_LINKAGE_METHODS, GROUP_DISTANCE_METRICS
  from risk.log import logger
 
 
- class LinkageThresholdError(Exception):
-     """Exception raised for errors in the linkage threshold optimization process."""
-
-     pass
+ # Define constants for clustering
+ # fmt: off
+ LINKAGE_METHODS = {"single", "complete", "average", "weighted", "centroid", "median", "ward"}
+ LINKAGE_METRICS = {
+     "braycurtis","canberra", "chebyshev", "cityblock", "correlation", "cosine", "dice", "euclidean",
+     "hamming", "jaccard", "jensenshannon", "kulczynski1", "mahalanobis", "matching", "minkowski",
+     "rogerstanimoto", "russellrao", "seuclidean", "sokalmichener", "sokalsneath", "sqeuclidean", "yule",
+ }
+ # fmt: on
 
 
  def define_domains(
@@ -31,7 +34,7 @@ def define_domains(
      linkage_criterion: str,
      linkage_method: str,
      linkage_metric: str,
-     linkage_threshold: Union[str, float],
+     linkage_threshold: Union[float, str],
  ) -> pd.DataFrame:
      """Define domains and assign nodes to these domains based on their significance scores and clustering,
      handling errors by assigning unique domains when clustering fails.
@@ -39,19 +42,13 @@ def define_domains(
      Args:
          top_annotations (pd.DataFrame): DataFrame of top annotations data for the network nodes.
          significant_neighborhoods_significance (np.ndarray): The binary significance matrix below alpha.
-         linkage_criterion (str): The clustering criterion for defining groups. Use "distance" for distance-based
-             clustering or "maxclust" for a fixed number of clusters. Use "off" to skip clustering.
-         linkage_method (str): The linkage method for clustering. Use "auto" to try multiple methods.
-         linkage_metric (str): The linkage metric for clustering. Use "auto" to try multiple metrics.
-         linkage_threshold (str, float): The linkage threshold for clustering, or one of "silhouette",
-             "calinski_harabasz", or "davies_bouldin" to optimize the threshold.
+         linkage_criterion (str): The clustering criterion for defining groups.
+         linkage_method (str): The linkage method for clustering. Choose "auto" to optimize.
+         linkage_metric (str): The linkage metric for clustering. Choose "auto" to optimize.
+         linkage_threshold (float, str): The threshold for clustering. Choose "auto" to optimize.
 
      Returns:
          pd.DataFrame: DataFrame with the primary domain for each node.
-
-     Raises:
-         ValueError: If an improper value is passed for linkage_threshold. Acceptable values are "silhouette",
-             "calinski_harabasz", "davies_bouldin", or a float value.
      """
      try:
          if linkage_criterion == "off":
@@ -62,17 +59,14 @@ def define_domains(
          # Safeguard the matrix by replacing NaN, Inf, and -Inf values
          m = _safeguard_matrix(m)
          # Optimize silhouette score across different linkage methods and distance metrics
-         best_linkage, best_metric, best_threshold = (
-             _optimize_linkage_threshold_across_methods_and_metrics(
-                 m, linkage_criterion, linkage_method, linkage_metric, linkage_threshold
-             )
+         best_linkage, best_metric, best_threshold = _optimize_silhouette_across_linkage_and_metrics(
+             m, linkage_criterion, linkage_method, linkage_metric, linkage_threshold
          )
          # Perform hierarchical clustering
          Z = linkage(m, method=best_linkage, metric=best_metric)
          logger.warning(
-             f"Linkage criterion: '{linkage_criterion}'\nLinkage method: '{best_linkage}'\nLinkage metric: '{best_metric}'"
+             f"Linkage criterion: '{linkage_criterion}'\nLinkage method: '{best_linkage}'\nLinkage metric: '{best_metric}'\nLinkage threshold: {round(best_threshold, 3)}"
          )
-         logger.debug(f"Optimal linkage threshold: {round(best_threshold, 3)}")
          # Calculate the optimal threshold for clustering
          max_d_optimal = np.max(Z[:, 2]) * best_threshold
          # Assign domains to the annotations matrix
@@ -91,9 +85,6 @@ def define_domains(
              f"Error encountered. Skipping clustering and assigning {n_rows} unique domains."
          )
          top_annotations["domain"] = range(1, n_rows + 1)  # Assign unique domains
-     except LinkageThresholdError as e:
-         # If a LinkageThresholdError is encountered, raise a ValueError with the original exception
-         raise ValueError(e) from e
 
      # Create DataFrames to store domain information
      node_to_significance = pd.DataFrame(
@@ -215,154 +206,146 @@ def _safeguard_matrix(matrix: np.ndarray) -> np.ndarray:
      return matrix
 
 
- def _optimize_linkage_threshold_across_methods_and_metrics(
+ def _optimize_silhouette_across_linkage_and_metrics(
      m: np.ndarray,
      linkage_criterion: str,
      linkage_method: str,
      linkage_metric: str,
      linkage_threshold: Union[str, float],
  ) -> Tuple[str, str, float]:
-     """Optimize the linkage method, metric, and threshold for hierarchical clustering. If the threshold is
-     a string, optimize the threshold using the specified metric; otherwise, use the provided threshold.
+     """Optimize silhouette score across different linkage methods and distance metrics.
 
      Args:
          m (np.ndarray): Data matrix.
-         linkage_criterion (str): Criterion for fcluster (typically "distance").
-         linkage_method (str): Linkage method for clustering, or "auto" to try multiple methods.
-         linkage_metric (str): Distance metric for clustering, or "auto" to try multiple metrics.
-         linkage_threshold (str, float): Either a numeric threshold or one of the following keywords:
-             "silhouette", "calinski_harabasz", or "davies_bouldin" to trigger optimization.
+         linkage_criterion (str): Clustering criterion.
+         linkage_method (str): Linkage method for clustering. Choose "auto" to optimize.
+         linkage_metric (str): Linkage metric for clustering. Choose "auto" to optimize.
+         linkage_threshold (Union[str, float]): Threshold for clustering. Choose "auto" to optimize.
 
      Returns:
          Tuple[str, str, float]:
-             - The chosen linkage method.
-             - The chosen linkage metric.
-             - The optimized threshold (a float).
-
-     Raises:
-         ValueError: If linkage_threshold is neither one of the supported keywords nor convertible to float.
+             - Best linkage method (str)
+             - Best linkage metric (str)
+             - Best threshold (float)
      """
-     # Supported linkage threshold metrics
-     supported_linkage_thresholds = {"silhouette", "calinski_harabasz", "davies_bouldin"}
-
-     # If linkage_threshold is a string:
-     if isinstance(linkage_threshold, str):
-         if linkage_threshold in supported_linkage_thresholds:
-             opt_metric = linkage_threshold
-         else:
-             try:
-                 threshold_float = float(linkage_threshold)
-             except (TypeError, ValueError):
-                 raise LinkageThresholdError(
-                     f"linkage_threshold must be one of {', '.join(supported_linkage_thresholds)} or a float value."
-                 )
-             return linkage_method, linkage_metric, threshold_float
-     else:
-         # If not a string, try to convert it to float.
-         try:
-             threshold_float = float(linkage_threshold)
-         except (TypeError, ValueError):
-             raise LinkageThresholdError(
-                 f"linkage_threshold must be one of {', '.join(supported_linkage_thresholds)} or a float value."
-             )
-         return linkage_method, linkage_metric, threshold_float
-
-     # Otherwise, perform optimization using the specified metric (opt_metric).
-     best_overall_method = None
-     best_overall_metric = None
-     best_overall_threshold = None
+     # Initialize best overall values
+     best_overall_method = linkage_method
+     best_overall_metric = linkage_metric
+     best_overall_threshold = linkage_threshold
      best_overall_score = -np.inf
 
-     # Use the provided lists if "auto" is specified.
-     methods = GROUP_LINKAGE_METHODS if linkage_method == "auto" else [linkage_method]
-     metrics = GROUP_DISTANCE_METRICS if linkage_metric == "auto" else [linkage_metric]
-     total_combinations = len(methods) * len(metrics)
+     # Set linkage methods and metrics to all combinations if "auto" is selected
+     linkage_methods = LINKAGE_METHODS if linkage_method == "auto" else [linkage_method]
+     linkage_metrics = LINKAGE_METRICS if linkage_metric == "auto" else [linkage_metric]
+     total_combinations = len(linkage_methods) * len(linkage_metrics)
 
+     # Evaluating optimal linkage method and metric
      for method, metric in tqdm(
-         product(methods, metrics),
+         product(linkage_methods, linkage_metrics),
          desc="Evaluating optimal linkage method and metric",
          total=total_combinations,
          bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
      ):
+         # Some linkage methods and metrics may not work with certain data
          with suppress(ValueError):
              Z = linkage(m, method=method, metric=metric)
-             threshold, score = _find_optimal_linkage_threshold(
-                 Z, m, metric, linkage_criterion, opt_metric=opt_metric
-             )
-             if score > best_overall_score:
-                 best_overall_score = score
-                 best_overall_threshold = threshold
-                 best_overall_method = method
-                 best_overall_metric = metric
-
-     if best_overall_method is None or best_overall_metric is None or best_overall_threshold is None:
-         raise ValueError("Optimization failed to determine an optimal threshold.")
+             # Only optimize silhouette score if the threshold is "auto"
+             if linkage_threshold == "auto":
+                 threshold, score = _find_best_silhouette_score(Z, m, metric, linkage_criterion)
+                 if score > best_overall_score:
+                     best_overall_score = score
+                     best_overall_threshold = threshold
+                     best_overall_method = method
+                     best_overall_metric = metric
+             else:
+                 # Use the provided threshold without optimization
+                 score = silhouette_score(
+                     m,
+                     fcluster(Z, linkage_threshold * np.max(Z[:, 2]), criterion=linkage_criterion),
+                     metric=metric,
+                 )
+                 if score > best_overall_score:
+                     best_overall_score = score
+                     best_overall_threshold = linkage_threshold
+                     best_overall_method = method
+                     best_overall_metric = metric
+
      return best_overall_method, best_overall_metric, best_overall_threshold
 
 
- def _find_optimal_linkage_threshold(
+ def _find_best_silhouette_score(
      Z: np.ndarray,
      m: np.ndarray,
      linkage_metric: str,
      linkage_criterion: str,
-     opt_metric: str = "silhouette",
+     lower_bound: float = 0.001,
+     upper_bound: float = 1.0,
+     resolution: float = 0.001,
  ) -> Tuple[float, float]:
-     """Find the optimal linkage threshold coefficient for hierarchical clustering. The function optimizes
-     the threshold value using the specified metric (opt_metric).
+     """Find the best silhouette score using binary search.
 
      Args:
-         Z (np.ndarray): Linkage matrix generated by a hierarchical clustering algorithm.
-         m (np.ndarray): Data matrix used for clustering.
-         linkage_metric (str): Metric used to calculate distances between data points
-             (e.g., "euclidean" or "cosine").
-         linkage_criterion (str): Criterion to pass to `fcluster`, typically "distance".
-         opt_metric (str, optional): Metric to optimize clustering quality. Options are:
-             "silhouette", "calinski_harabasz", or "davies_bouldin". Defaults to "silhouette".
+         Z (np.ndarray): Linkage matrix.
+         m (np.ndarray): Data matrix.
+         linkage_metric (str): Linkage metric for silhouette score calculation.
+         linkage_criterion (str): Clustering criterion.
+         lower_bound (float, optional): Lower bound for search. Defaults to 0.001.
+         upper_bound (float, optional): Upper bound for search. Defaults to 1.0.
+         resolution (float, optional): Desired resolution for the best threshold. Defaults to 0.001.
 
      Returns:
          Tuple[float, float]:
-             - best_threshold (float): The optimal linkage threshold coefficient.
-             - best_metric_value (float): The value of the clustering quality metric achieved
-                 at the optimal threshold (higher for "silhouette" and "calinski_harabasz",
-                 lower for "davies_bouldin").
-
-     Raises:
-         ValueError: If the `opt_metric` argument is not one of the supported metrics.
+             - Best threshold (float): The threshold that yields the best silhouette score.
+             - Best silhouette score (float): The highest silhouette score achieved.
      """
-     # Get the maximum distance in the linkage matrix
-     max_d = np.max(Z[:, 2])
-     resolution = 1e-6
-
-     def compute_objective(coefficient: float) -> float:
-         """Compute the objective function for optimization."""
-         threshold_val = coefficient * max_d
-         clusters = fcluster(Z, threshold_val, criterion=linkage_criterion)
-         unique_clusters = np.unique(clusters)
-         if len(unique_clusters) <= 1 or len(unique_clusters) == m.shape[0]:
-             return 1e6
-         try:
-             if opt_metric == "silhouette":
-                 score = silhouette_score(m, clusters, metric=linkage_metric)
-                 return -score  # We want to maximize the score.
-             elif opt_metric == "calinski_harabasz":
-                 score = calinski_harabasz_score(m, clusters)
-                 return -score
-             elif opt_metric == "davies_bouldin":
-                 score = davies_bouldin_score(m, clusters)
-                 return score
-             else:
-                 raise ValueError(f"Unknown optimization metric: {opt_metric}.")
-         except Exception:
-             return 1e6
+     best_score = -np.inf
+     best_threshold = None
 
-     # Optimize the threshold using the specified metric
-     res = minimize_scalar(
-         compute_objective, bounds=(0.0, 1.0), method="bounded", options={"xatol": resolution}
-     )
+     # Test lower bound
+     max_d_lower = np.max(Z[:, 2]) * lower_bound
+     clusters_lower = fcluster(Z, max_d_lower, criterion=linkage_criterion)
+     try:
+         score_lower = silhouette_score(m, clusters_lower, metric=linkage_metric)
+     except ValueError:
+         score_lower = -np.inf
+
+     # Test upper bound
+     max_d_upper = np.max(Z[:, 2]) * upper_bound
+     clusters_upper = fcluster(Z, max_d_upper, criterion=linkage_criterion)
+     try:
+         score_upper = silhouette_score(m, clusters_upper, metric=linkage_metric)
+     except ValueError:
+         score_upper = -np.inf
 
-     best_threshold = res.x
-     best_obj = res.fun
-     # For silhouette and calinski_harabasz, the objective was negative.
-     best_metric_value = -best_obj if opt_metric in ["silhouette", "calinski_harabasz"] else best_obj
+     # Determine initial bounds for binary search
+     if score_lower > score_upper:
+         best_score = score_lower
+         best_threshold = lower_bound
+         upper_bound = (lower_bound + upper_bound) / 2
+     else:
+         best_score = score_upper
+         best_threshold = upper_bound
+         lower_bound = (lower_bound + upper_bound) / 2
+
+     # Binary search loop
+     while upper_bound - lower_bound > resolution:
+         mid_threshold = (upper_bound + lower_bound) / 2
+         max_d_mid = np.max(Z[:, 2]) * mid_threshold
+         clusters_mid = fcluster(Z, max_d_mid, criterion=linkage_criterion)
+         try:
+             score_mid = silhouette_score(m, clusters_mid, metric=linkage_metric)
+         except ValueError:
+             score_mid = -np.inf
+
+         # Update best score and threshold if mid-point is better
+         if score_mid > best_score:
+             best_score = score_mid
+             best_threshold = mid_threshold
+
+         # Adjust bounds based on the scores
+         if score_lower > score_upper:
+             upper_bound = mid_threshold
+         else:
+             lower_bound = mid_threshold
 
-     return best_threshold, float(best_metric_value)
+     return best_threshold, float(best_score)
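Both the "auto" search and a user-supplied float are interpreted the same way downstream: the threshold is a fraction of the largest merge distance in the linkage matrix, `np.max(Z[:, 2])`, which becomes the `fcluster` cut height (`max_d_optimal = np.max(Z[:, 2]) * best_threshold` in `define_domains`). A standalone sketch on synthetic data, assuming nothing beyond SciPy and scikit-learn:

```python
# Synthetic illustration (not package code) of how a fractional linkage_threshold
# maps to an fcluster cut height and how silhouette_score rates the result.
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.metrics import silhouette_score

rng = np.random.default_rng(0)
m = np.vstack([rng.normal(0.0, 0.3, (20, 5)), rng.normal(3.0, 0.3, (20, 5))])
Z = linkage(m, method="average", metric="euclidean")

for fraction in (0.2, 0.5, 0.8):                  # fractions of the max merge distance
    labels = fcluster(Z, fraction * np.max(Z[:, 2]), criterion="distance")
    n_clusters = len(set(labels))
    if 1 < n_clusters < len(m):                   # silhouette needs 2..n-1 clusters
        print(fraction, n_clusters, round(silhouette_score(m, labels, metric="euclidean"), 3))
```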
risk/network/graph/api.py CHANGED
@@ -4,7 +4,7 @@ risk/network/graph/api
  """
 
  import copy
- from typing import Any, Dict
+ from typing import Any, Dict, Union
 
  import networkx as nx
  import pandas as pd
@@ -42,7 +42,7 @@ class GraphAPI:
          linkage_criterion: str = "distance",
          linkage_method: str = "average",
          linkage_metric: str = "yule",
-         linkage_threshold: float = 0.2,
+         linkage_threshold: Union[float, str] = 0.2,
          min_cluster_size: int = 5,
          max_cluster_size: int = 1000,
      ) -> Graph:
@@ -58,12 +58,11 @@ class GraphAPI:
              impute_depth (int, optional): Depth for imputing neighbors. Defaults to 0.
              prune_threshold (float, optional): Distance threshold for pruning neighbors. Defaults to 0.0.
              linkage_criterion (str, optional): Clustering criterion for defining domains. Defaults to "distance".
-             linkage_method (str, optional): Clustering method to use. Defaults to "average". Choose "auto"
-                 to automatically select the best linkage method.
-             linkage_metric (str, optional): Metric to use for calculating distances. Defaults to "yule". Choose "auto"
-                 to automatically select the best linkage metric.
-             linkage_threshold (str, float, optional): Threshold for clustering. Choose "silhouette", "calinski_harabasz",
-                 or "davies_bouldin" to automatically select the best threshold. Defaults to 0.2.
+             linkage_method (str, optional): Clustering method to use. Choose "auto" to optimize. Defaults to "average".
+             linkage_metric (str, optional): Metric to use for calculating distances. Choose "auto" to optimize.
+                 Defaults to "yule".
+             linkage_threshold (float, str, optional): Threshold for clustering. Choose "auto" to optimize.
+                 Defaults to 0.2.
              min_cluster_size (int, optional): Minimum size for clusters. Defaults to 5.
              max_cluster_size (int, optional): Maximum size for clusters. Defaults to 1000.
 
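With the widened annotation, a caller can defer all three clustering knobs to the optimizer. A hedged sketch of the call; the receiver `risk`, the method name `load_graph`, and the earlier pipeline objects (`network`, `annotations`, `neighborhoods`) are assumptions about the surrounding workflow and are not shown in this diff:

```python
# Assumed usage sketch: only the keyword parameters and their "auto" handling
# come from this diff; object and method names outside it are assumptions.
graph = risk.load_graph(
    network,
    annotations,
    neighborhoods,
    linkage_criterion="distance",
    linkage_method="auto",       # searched over LINKAGE_METHODS
    linkage_metric="auto",       # searched over LINKAGE_METRICS
    linkage_threshold="auto",    # silhouette-optimized fraction of the max merge distance
    min_cluster_size=5,
    max_cluster_size=1000,
)
```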
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: risk-network
- Version: 0.0.9b38
+ Version: 0.0.9b40
  Summary: A Python package for biological network analysis
  Author: Ira Horecka
  Author-email: Ira Horecka <ira89@icloud.com>
@@ -1,8 +1,7 @@
- risk/__init__.py,sha256=S22dtKjPn7IvWeLakeN4IajRLsqHuSbBhjvf6z5kHoo,127
- risk/constants.py,sha256=XInRaH78Slnw_sWgAsBFbUHkyA0h0jL0DKGuQNbOvjM,550
+ risk/__init__.py,sha256=2Ucmxw9wGNzUhqe_QGlEi2pnGhkdOrl9wa8w-MUIfm8,127
  risk/risk.py,sha256=s827_lRknFseOP9O4zW8sP-IcCd2EzrpV_tnVY_tz5s,1104
  risk/annotations/__init__.py,sha256=parsbcux1U4urpUqh9AdzbDWuLj9HlMidycMPkpSQFo,179
- risk/annotations/annotations.py,sha256=g8ca9H49dZIqHv6Od3Dem4BIo_euy8alL3PDauT6ZJI,14088
+ risk/annotations/annotations.py,sha256=Sq24YBtNPMxXOvWoxqPwOJ4bsFAMIBYpVWjEvsQPtNo,14912
  risk/annotations/io.py,sha256=z1AJySsU-KL_IYuHa7j3nvuczmOHgK3WfaQ4TRunvrA,10499
  risk/log/__init__.py,sha256=7LxDysQu7doi0LAvlY2YbjN6iJH0fNknqy8lSLgeljo,217
  risk/log/console.py,sha256=PgjyEvyhYLUSHXPUKEqOmxsDsfrjPICIgqo_cAHq0N8,4575
@@ -10,13 +9,13 @@ risk/log/parameters.py,sha256=VtwfMzLU1xI4yji3-Ch5vHjH-KdwTfwaEMmi7hFQTs0,5716
  risk/neighborhoods/__init__.py,sha256=Q74HwTH7okI-vaskJPy2bYwb5sNjGASTzJ6m8V8arCU,234
  risk/neighborhoods/api.py,sha256=ywngw2TQVV27gYlWDXcs8-qnmeepnvb-W9ov6J6VEPM,23341
  risk/neighborhoods/community.py,sha256=5Q_-VAJC-5SY5EUsB8gIlemeDoAL85uLjyl16pItHiQ,16699
- risk/neighborhoods/domains.py,sha256=yaSHymGfRJVuXHIa7BwoKzvIRSg5oLhNoOMg0tsVqV8,15961
+ risk/neighborhoods/domains.py,sha256=4K1tbiia3_TQKUrGdfmKVdYlRD2EEzPnMCKRv6IGxu4,14448
  risk/neighborhoods/neighborhoods.py,sha256=l9FhADB1C-OxM8E9QXOcA4osUDgA1vs4ud-OCGKKybc,21457
  risk/network/__init__.py,sha256=oVi3FA1XXKD84014Cykq-9bpX4_s0F3aAUfNOU-07Qw,73
  risk/network/geometry.py,sha256=eVtGHMgBf9fEqQZUFdHWjw-zFYYpfUONoHFSAxoRkug,6219
  risk/network/io.py,sha256=RCH4nQdgYDXcNwMfpSz7qEmPO0pJ1p9fL0rNQptsQrc,21673
  risk/network/graph/__init__.py,sha256=ziGJew3yhtqvrb9LUuneDu_LwW2Wa9vd4UuhoL5l1CA,91
- risk/network/graph/api.py,sha256=fOyd-5rRnqmtquproP90scehewd0UtOVZS65hCuwasI,8684
+ risk/network/graph/api.py,sha256=xS_rNDvZPdwIar2E9x9BKMeR0DcYuwcHiUpc_EcJ4-o,8536
  risk/network/graph/graph.py,sha256=qEWyZvuaGT_vvjhreBdmRPX3gst2wQFaXhFAvikPSqw,12158
  risk/network/graph/summary.py,sha256=Y_0rL2C1UoQeZQIPVe5LbaCO356Mcc8HisnrXwQsRm8,10289
  risk/network/plotter/__init__.py,sha256=4gWtQHGzQVNHmEBXi31Zf0tX0y2sTcE66J_yGnn7268,99
@@ -34,8 +33,8 @@ risk/stats/stat_tests.py,sha256=tj0ri9w89_1fsjGLuafTWpfBEwZXpSLn7Ej2aAQ5lxk,1177
  risk/stats/permutation/__init__.py,sha256=OLmYLm2uj96hPsSaUs0vUqFYw6Thwch_aHtpL7L0ZFw,127
  risk/stats/permutation/permutation.py,sha256=BWjgdBpLVcHvmwHy0bmD4aJFccxifNBSrrCBPppyKf4,10569
  risk/stats/permutation/test_functions.py,sha256=KlECWTz1EZ6EPF_OAgHb0uznaIhopiVYb_AKUKuC4no,3120
- risk_network-0.0.9b38.dist-info/LICENSE,sha256=jOtLnuWt7d5Hsx6XXB2QxzrSe2sWWh3NgMfFRetluQM,35147
- risk_network-0.0.9b38.dist-info/METADATA,sha256=mAB2KQoRWeOH13radrcMeW5dalpkPkl4YtjfUpQhJXI,47627
- risk_network-0.0.9b38.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
- risk_network-0.0.9b38.dist-info/top_level.txt,sha256=NX7C2PFKTvC1JhVKv14DFlFAIFnKc6Lpsu1ZfxvQwVw,5
- risk_network-0.0.9b38.dist-info/RECORD,,
+ risk_network-0.0.9b40.dist-info/LICENSE,sha256=jOtLnuWt7d5Hsx6XXB2QxzrSe2sWWh3NgMfFRetluQM,35147
+ risk_network-0.0.9b40.dist-info/METADATA,sha256=0gk-H9_4YiOCT5iykSjB89qALDejboNUa2mZy_XtLNc,47627
+ risk_network-0.0.9b40.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ risk_network-0.0.9b40.dist-info/top_level.txt,sha256=NX7C2PFKTvC1JhVKv14DFlFAIFnKc6Lpsu1ZfxvQwVw,5
+ risk_network-0.0.9b40.dist-info/RECORD,,
risk/constants.py DELETED
@@ -1,31 +0,0 @@
- """
- risk/constants
- ~~~~~~~~~~~~~~
- """
-
- GROUP_LINKAGE_METHODS = ["single", "complete", "average", "weighted", "centroid", "median", "ward"]
-
- GROUP_DISTANCE_METRICS = [
-     "braycurtis",
-     "canberra",
-     "chebyshev",
-     "cityblock",
-     "correlation",
-     "cosine",
-     "dice",
-     "euclidean",
-     "hamming",
-     "jaccard",
-     "jensenshannon",
-     "kulczynski1",
-     "mahalanobis",
-     "matching",
-     "minkowski",
-     "rogerstanimoto",
-     "russellrao",
-     "seuclidean",
-     "sokalmichener",
-     "sokalsneath",
-     "sqeuclidean",
-     "yule",
- ]