risk-network 0.0.9b38__py3-none-any.whl → 0.0.9b40__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- risk/__init__.py +1 -1
- risk/annotations/annotations.py +70 -46
- risk/neighborhoods/domains.py +121 -138
- risk/network/graph/api.py +7 -8
- {risk_network-0.0.9b38.dist-info → risk_network-0.0.9b40.dist-info}/METADATA +1 -1
- {risk_network-0.0.9b38.dist-info → risk_network-0.0.9b40.dist-info}/RECORD +9 -10
- risk/constants.py +0 -31
- {risk_network-0.0.9b38.dist-info → risk_network-0.0.9b40.dist-info}/LICENSE +0 -0
- {risk_network-0.0.9b38.dist-info → risk_network-0.0.9b40.dist-info}/WHEEL +0 -0
- {risk_network-0.0.9b38.dist-info → risk_network-0.0.9b40.dist-info}/top_level.txt +0 -0
risk/__init__.py
CHANGED
risk/annotations/annotations.py
CHANGED
@@ -12,8 +12,9 @@ import networkx as nx
 import nltk
 import numpy as np
 import pandas as pd
-from nltk.tokenize import word_tokenize
 from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+from nltk.tokenize import word_tokenize

 from risk.log import logger
 from scipy.sparse import coo_matrix
@@ -31,11 +32,17 @@ def _setup_nltk():
     except LookupError:
         nltk.download("stopwords")

+    try:
+        nltk.data.find("corpora/wordnet")
+    except LookupError:
+        nltk.download("wordnet")
+

 # Ensure you have the necessary NLTK data
 _setup_nltk()
-#
-
+# Use NLTK's stopwords
+STOP_WORDS = set(stopwords.words("english"))
+LEMMATIZER = WordNetLemmatizer()


 def load_annotations(
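The updated _setup_nltk() extends the existing stopwords guard to the WordNet corpus required by WordNetLemmatizer. A minimal, self-contained sketch of the same look-up-then-download idiom (the helper name below is illustrative, not part of the package):

import nltk

def ensure_nltk_resource(find_path: str, download_name: str) -> None:
    # Look the resource up locally first; download only when it is missing.
    try:
        nltk.data.find(find_path)
    except LookupError:
        nltk.download(download_name)

# The same two resources the updated _setup_nltk() checks for.
ensure_nltk_resource("corpora/stopwords", "stopwords")
ensure_nltk_resource("corpora/wordnet", "wordnet")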
@@ -208,104 +215,121 @@ def define_top_annotations(


 def get_weighted_description(words_column: pd.Series, scores_column: pd.Series) -> str:
     """Generate a weighted description from words and their corresponding scores,
-
+    using improved weighting logic with normalization, lemmatization, and aggregation.

     Args:
-        words_column (pd.Series): A pandas Series containing strings to process.
+        words_column (pd.Series): A pandas Series containing strings (phrases) to process.
         scores_column (pd.Series): A pandas Series containing significance scores to weigh the terms.

     Returns:
-        str: A coherent description formed from the most frequent and significant words
+        str: A coherent description formed from the most frequent and significant words.
     """
-    #
+    # Normalize significance scores to [0,1]. If all scores are identical, use 1.
     if scores_column.max() == scores_column.min():
-        normalized_scores = pd.Series([1] * len(scores_column))
+        normalized_scores = pd.Series([1] * len(scores_column), index=scores_column.index)
     else:
-        # Normalize the significance scores to be between 0 and 1
         normalized_scores = (scores_column - scores_column.min()) / (
             scores_column.max() - scores_column.min()
         )

-    #
+    # Accumulate weighted counts for each token (after cleaning and lemmatization)
+    weighted_counts = {}
+    for phrase, score in zip(words_column, normalized_scores):
+        # Tokenize the phrase
+        tokens = word_tokenize(str(phrase))
+        # Determine the weight (scale factor; here multiplying normalized score by 10)
+        weight = max(1, int((0 if pd.isna(score) else score) * 10))
+        for token in tokens:
+            # Clean token: lowercase and remove extraneous punctuation (but preserve intra-word hyphens)
+            token_clean = re.sub(r"[^\w\-]", "", token.lower()).strip()
+            if not token_clean:
+                continue
+            # Skip tokens that are pure numbers
+            if token_clean.isdigit():
+                continue
+            # Skip stopwords
+            if token_clean in STOP_WORDS:
+                continue
+            # Lemmatize the token to merge similar forms
+            token_norm = LEMMATIZER.lemmatize(token_clean)
+            weighted_counts[token_norm] = weighted_counts.get(token_norm, 0) + weight
+
+    # Reconstruct a weighted token list by repeating each token by its aggregated count.
     weighted_words = []
-    for
-
-
-
-        weighted_words.extend([word] * weight)
-
-    # Tokenize the weighted words, but preserve number-word patterns like '4-alpha'
-    tokens = word_tokenize(" ".join(weighted_words))
-    # Ensure we treat "4-alpha" or other "number-word" patterns as single tokens
+    for token, count in weighted_counts.items():
+        weighted_words.extend([token] * count)
+
+    # Combine tokens that match number-word patterns (e.g. "4-alpha") and remove pure numeric tokens.
     combined_tokens = []
-    for token in
-        # Match patterns like '4-alpha' or '5-hydroxy' and keep them together
+    for token in weighted_words:
         if re.match(r"^\d+-\w+", token):
             combined_tokens.append(token)
-        elif token.replace(".", "", 1).isdigit():
-            # Ignore pure numbers as descriptions unless necessary
+        elif token.replace(".", "", 1).isdigit():
             continue
         else:
             combined_tokens.append(token)

-    #
+    # If the only token is numeric, return a default value.
     if len(combined_tokens) == 1 and combined_tokens[0].isdigit():
-        return "N/A"
+        return "N/A"

-    # Simplify the
+    # Simplify the token list to remove near-duplicates based on the Jaccard index.
     simplified_words = _simplify_word_list(combined_tokens)
+    # Generate a coherent description from the simplified words.
     description = _generate_coherent_description(simplified_words)

     return description


 def _simplify_word_list(words: List[str], threshold: float = 0.80) -> List[str]:
-    """Filter out words that are too similar based on the Jaccard index,
+    """Filter out words that are too similar based on the Jaccard index,
+    keeping the word with the higher aggregated count.

     Args:
-        words (
+        words (List[str]): The list of tokens to be filtered.
         threshold (float, optional): The similarity threshold for the Jaccard index. Defaults to 0.80.

     Returns:
-
+        List[str]: A list of filtered words, where similar words are reduced to the most frequent one.
     """
-    # Count the occurrences
+    # Count the occurrences (which reflect the weighted importance)
     word_counts = Counter(words)
     filtered_words = []
     used_words = set()
-
-
+
+    # Iterate through words sorted by descending weighted frequency
+    for word in sorted(word_counts, key=lambda w: word_counts[w], reverse=True):
         if word in used_words:
             continue

         word_set = set(word)
-        # Find similar words based on the Jaccard index
+        # Find similar words (including the current word) based on the Jaccard index
         similar_words = [
             other_word
             for other_word in word_counts
             if _calculate_jaccard_index(word_set, set(other_word)) >= threshold
         ]
-        #
+        # Choose the word with the highest weighted count among the similar group
         similar_words.sort(key=lambda w: word_counts[w], reverse=True)
         best_word = similar_words[0]
         filtered_words.append(best_word)
         used_words.update(similar_words)

+    # Preserve the original order (by frequency) from the filtered set
     final_words = [word for word in words if word in filtered_words]

     return final_words


 def _calculate_jaccard_index(set1: Set[Any], set2: Set[Any]) -> float:
-    """Calculate the Jaccard
+    """Calculate the Jaccard index between two sets.

     Args:
-        set1 (
-        set2 (
+        set1 (Set[Any]): The first set.
+        set2 (Set[Any]): The second set.

     Returns:
-        float: The Jaccard
-            Returns 0 if the union of the sets is empty.
+        float: The Jaccard index (intersection over union). Returns 0 if the union is empty.
     """
     intersection = len(set1.intersection(set2))
     union = len(set1.union(set2))
@@ -313,28 +337,28 @@ def _calculate_jaccard_index(set1: Set[Any], set2: Set[Any]) -> float:


 def _generate_coherent_description(words: List[str]) -> str:
-    """Generate a coherent description from a list of words
+    """Generate a coherent description from a list of words.
+
     If there is only one unique entry, return it directly.
+    Otherwise, order the words by frequency and join them into a single string.

     Args:
-        words (List): A list of
+        words (List[str]): A list of tokens.

     Returns:
-        str: A coherent description
+        str: A coherent, space-separated description.
     """
-    # If there are no words, return a keyword indicating no data is available
     if not words:
         return "N/A"

-    # If there
+    # If there is only one unique word, return it directly
     unique_words = set(words)
     if len(unique_words) == 1:
         return list(unique_words)[0]

-    # Count
+    # Count weighted occurrences and sort in descending order.
     word_counts = Counter(words)
     most_common_words = [word for word, _ in word_counts.most_common()]
-    # Join the most common words to form a coherent description based on frequency
     description = " ".join(most_common_words)

     return description
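Taken together, the rewritten get_weighted_description() now normalizes significance scores to [0, 1], converts each score into an integer repeat weight, cleans and lemmatizes tokens while skipping stopwords and pure numbers, and finally collapses near-duplicate tokens by Jaccard similarity. A minimal sketch of the weighting step alone, using a plain whitespace tokenizer and no lemmatization so it runs without NLTK data (the package itself uses word_tokenize, NLTK stopwords, and WordNetLemmatizer); the helper name is illustrative:

import re
import pandas as pd

def weighted_token_counts(words_column: pd.Series, scores_column: pd.Series) -> dict:
    # Normalize scores to [0, 1]; identical scores all get weight 1.
    if scores_column.max() == scores_column.min():
        normalized = pd.Series([1] * len(scores_column), index=scores_column.index)
    else:
        normalized = (scores_column - scores_column.min()) / (
            scores_column.max() - scores_column.min()
        )

    counts = {}
    for phrase, score in zip(words_column, normalized):
        # Scale factor of 10 mirrors the weighting used in the diff above.
        weight = max(1, int((0 if pd.isna(score) else score) * 10))
        for token in str(phrase).split():  # stand-in for nltk.word_tokenize
            token = re.sub(r"[^\w\-]", "", token.lower()).strip()
            if not token or token.isdigit():
                continue
            counts[token] = counts.get(token, 0) + weight
    return counts

# Example: tokens from the higher-scoring phrase dominate the weighted counts.
phrases = pd.Series(["lipid transport", "lipid binding protein"])
scores = pd.Series([0.2, 0.9])
print(weighted_token_counts(phrases, scores))  # {'lipid': 11, 'transport': 1, 'binding': 10, 'protein': 10}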
risk/neighborhoods/domains.py
CHANGED
@@ -10,19 +10,22 @@ from typing import Tuple, Union
 import numpy as np
 import pandas as pd
 from scipy.cluster.hierarchy import linkage, fcluster
-from
-from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, silhouette_score
+from sklearn.metrics import silhouette_score
 from tqdm import tqdm

 from risk.annotations import get_weighted_description
-from risk.constants import GROUP_LINKAGE_METHODS, GROUP_DISTANCE_METRICS
 from risk.log import logger


-
-
-
-
+# Define constants for clustering
+# fmt: off
+LINKAGE_METHODS = {"single", "complete", "average", "weighted", "centroid", "median", "ward"}
+LINKAGE_METRICS = {
+    "braycurtis","canberra", "chebyshev", "cityblock", "correlation", "cosine", "dice", "euclidean",
+    "hamming", "jaccard", "jensenshannon", "kulczynski1", "mahalanobis", "matching", "minkowski",
+    "rogerstanimoto", "russellrao", "seuclidean", "sokalmichener", "sokalsneath", "sqeuclidean", "yule",
+}
+# fmt: on


 def define_domains(
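The linkage method and metric constants that previously lived in risk/constants.py now sit next to the code that uses them, and selecting "auto" expands to every combination of the two sets. A small sketch of that expansion (the metric set is abbreviated here; the package defines the full set above, and the helper name is illustrative):

from itertools import product
from typing import List, Tuple

LINKAGE_METHODS = {"single", "complete", "average", "weighted", "centroid", "median", "ward"}
LINKAGE_METRICS = {"euclidean", "cityblock", "cosine", "jaccard", "yule"}  # abbreviated subset

def candidate_combinations(linkage_method: str, linkage_metric: str) -> List[Tuple[str, str]]:
    # "auto" expands to every known option; any other value is evaluated as-is.
    methods = LINKAGE_METHODS if linkage_method == "auto" else [linkage_method]
    metrics = LINKAGE_METRICS if linkage_metric == "auto" else [linkage_metric]
    return list(product(methods, metrics))

print(len(candidate_combinations("auto", "auto")))  # 7 methods x 5 metrics = 35 with this subset
print(candidate_combinations("average", "yule"))    # [('average', 'yule')]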
@@ -31,7 +34,7 @@ def define_domains(
     linkage_criterion: str,
     linkage_method: str,
     linkage_metric: str,
-    linkage_threshold: Union[
+    linkage_threshold: Union[float, str],
 ) -> pd.DataFrame:
     """Define domains and assign nodes to these domains based on their significance scores and clustering,
     handling errors by assigning unique domains when clustering fails.
@@ -39,19 +42,13 @@ def define_domains(
     Args:
         top_annotations (pd.DataFrame): DataFrame of top annotations data for the network nodes.
         significant_neighborhoods_significance (np.ndarray): The binary significance matrix below alpha.
-        linkage_criterion (str): The clustering criterion for defining groups.
-
-
-
-        linkage_threshold (str, float): The linkage threshold for clustering, or one of "silhouette",
-            "calinski_harabasz", or "davies_bouldin" to optimize the threshold.
+        linkage_criterion (str): The clustering criterion for defining groups.
+        linkage_method (str): The linkage method for clustering. Choose "auto" to optimize.
+        linkage_metric (str): The linkage metric for clustering. Choose "auto" to optimize.
+        linkage_threshold (float, str): The threshold for clustering. Choose "auto" to optimize.

     Returns:
         pd.DataFrame: DataFrame with the primary domain for each node.
-
-    Raises:
-        ValueError: If an improper value is passed for linkage_threshold. Acceptable values are "silhouette",
-            "calinski_harabasz", "davies_bouldin", or a float value.
     """
     try:
         if linkage_criterion == "off":
@@ -62,17 +59,14 @@ def define_domains(
             # Safeguard the matrix by replacing NaN, Inf, and -Inf values
             m = _safeguard_matrix(m)
             # Optimize silhouette score across different linkage methods and distance metrics
-            best_linkage, best_metric, best_threshold = (
-
-                m, linkage_criterion, linkage_method, linkage_metric, linkage_threshold
-            )
+            best_linkage, best_metric, best_threshold = _optimize_silhouette_across_linkage_and_metrics(
+                m, linkage_criterion, linkage_method, linkage_metric, linkage_threshold
             )
             # Perform hierarchical clustering
             Z = linkage(m, method=best_linkage, metric=best_metric)
             logger.warning(
-                f"Linkage criterion: '{linkage_criterion}'\nLinkage method: '{best_linkage}'\nLinkage metric: '{best_metric}'"
+                f"Linkage criterion: '{linkage_criterion}'\nLinkage method: '{best_linkage}'\nLinkage metric: '{best_metric}'\nLinkage threshold: {round(best_threshold, 3)}"
             )
-            logger.debug(f"Optimal linkage threshold: {round(best_threshold, 3)}")
             # Calculate the optimal threshold for clustering
             max_d_optimal = np.max(Z[:, 2]) * best_threshold
             # Assign domains to the annotations matrix
@@ -91,9 +85,6 @@ def define_domains(
             f"Error encountered. Skipping clustering and assigning {n_rows} unique domains."
         )
         top_annotations["domain"] = range(1, n_rows + 1)  # Assign unique domains
-    except LinkageThresholdError as e:
-        # If a LinkageThresholdError is encountered, raise a ValueError with the original exception
-        raise ValueError(e) from e

     # Create DataFrames to store domain information
     node_to_significance = pd.DataFrame(
@@ -215,154 +206,146 @@ def _safeguard_matrix(matrix: np.ndarray) -> np.ndarray:
     return matrix


-def
+def _optimize_silhouette_across_linkage_and_metrics(
     m: np.ndarray,
     linkage_criterion: str,
     linkage_method: str,
     linkage_metric: str,
     linkage_threshold: Union[str, float],
 ) -> Tuple[str, str, float]:
-    """Optimize
-        a string, optimize the threshold using the specified metric; otherwise, use the provided threshold.
+    """Optimize silhouette score across different linkage methods and distance metrics.

     Args:
         m (np.ndarray): Data matrix.
-        linkage_criterion (str):
-        linkage_method (str): Linkage method for clustering
-        linkage_metric (str):
-        linkage_threshold (str, float):
-            "silhouette", "calinski_harabasz", or "davies_bouldin" to trigger optimization.
+        linkage_criterion (str): Clustering criterion.
+        linkage_method (str): Linkage method for clustering. Choose "auto" to optimize.
+        linkage_metric (str): Linkage metric for clustering. Choose "auto" to optimize.
+        linkage_threshold (Union[str, float]): Threshold for clustering. Choose "auto" to optimize.

     Returns:
         Tuple[str, str, float]:
-            -
-            -
-            -
-
-    Raises:
-        ValueError: If linkage_threshold is neither one of the supported keywords nor convertible to float.
+            - Best linkage method (str)
+            - Best linkage metric (str)
+            - Best threshold (float)
     """
-    #
-
-
-
-    if isinstance(linkage_threshold, str):
-        if linkage_threshold in supported_linkage_thresholds:
-            opt_metric = linkage_threshold
-        else:
-            try:
-                threshold_float = float(linkage_threshold)
-            except (TypeError, ValueError):
-                raise LinkageThresholdError(
-                    f"linkage_threshold must be one of {', '.join(supported_linkage_thresholds)} or a float value."
-                )
-            return linkage_method, linkage_metric, threshold_float
-    else:
-        # If not a string, try to convert it to float.
-        try:
-            threshold_float = float(linkage_threshold)
-        except (TypeError, ValueError):
-            raise LinkageThresholdError(
-                f"linkage_threshold must be one of {', '.join(supported_linkage_thresholds)} or a float value."
-            )
-        return linkage_method, linkage_metric, threshold_float
-
-    # Otherwise, perform optimization using the specified metric (opt_metric).
-    best_overall_method = None
-    best_overall_metric = None
-    best_overall_threshold = None
+    # Initialize best overall values
+    best_overall_method = linkage_method
+    best_overall_metric = linkage_metric
+    best_overall_threshold = linkage_threshold
     best_overall_score = -np.inf

-    #
-
-
-    total_combinations = len(
+    # Set linkage methods and metrics to all combinations if "auto" is selected
+    linkage_methods = LINKAGE_METHODS if linkage_method == "auto" else [linkage_method]
+    linkage_metrics = LINKAGE_METRICS if linkage_metric == "auto" else [linkage_metric]
+    total_combinations = len(linkage_methods) * len(linkage_metrics)

+    # Evaluating optimal linkage method and metric
     for method, metric in tqdm(
-        product(
+        product(linkage_methods, linkage_metrics),
         desc="Evaluating optimal linkage method and metric",
         total=total_combinations,
         bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
     ):
+        # Some linkage methods and metrics may not work with certain data
         with suppress(ValueError):
             Z = linkage(m, method=method, metric=metric)
-
-
-
-
-
-
-
-
-
-
-
+            # Only optimize silhouette score if the threshold is "auto"
+            if linkage_threshold == "auto":
+                threshold, score = _find_best_silhouette_score(Z, m, metric, linkage_criterion)
+                if score > best_overall_score:
+                    best_overall_score = score
+                    best_overall_threshold = threshold
+                    best_overall_method = method
+                    best_overall_metric = metric
+            else:
+                # Use the provided threshold without optimization
+                score = silhouette_score(
+                    m,
+                    fcluster(Z, linkage_threshold * np.max(Z[:, 2]), criterion=linkage_criterion),
+                    metric=metric,
+                )
+                if score > best_overall_score:
+                    best_overall_score = score
+                    best_overall_threshold = linkage_threshold
+                    best_overall_method = method
+                    best_overall_metric = metric
+
     return best_overall_method, best_overall_metric, best_overall_threshold


-def
+def _find_best_silhouette_score(
     Z: np.ndarray,
     m: np.ndarray,
     linkage_metric: str,
     linkage_criterion: str,
-
+    lower_bound: float = 0.001,
+    upper_bound: float = 1.0,
+    resolution: float = 0.001,
 ) -> Tuple[float, float]:
-    """Find the
-        the threshold value using the specified metric (opt_metric).
+    """Find the best silhouette score using binary search.

     Args:
-        Z (np.ndarray): Linkage matrix
-        m (np.ndarray): Data matrix
-        linkage_metric (str):
-
-
-
-
+        Z (np.ndarray): Linkage matrix.
+        m (np.ndarray): Data matrix.
+        linkage_metric (str): Linkage metric for silhouette score calculation.
+        linkage_criterion (str): Clustering criterion.
+        lower_bound (float, optional): Lower bound for search. Defaults to 0.001.
+        upper_bound (float, optional): Upper bound for search. Defaults to 1.0.
+        resolution (float, optional): Desired resolution for the best threshold. Defaults to 0.001.

     Returns:
         Tuple[float, float]:
-            -
-            -
-                at the optimal threshold (higher for "silhouette" and "calinski_harabasz",
-                lower for "davies_bouldin").
-
-    Raises:
-        ValueError: If the `opt_metric` argument is not one of the supported metrics.
+            - Best threshold (float): The threshold that yields the best silhouette score.
+            - Best silhouette score (float): The highest silhouette score achieved.
     """
-
-
-    resolution = 1e-6
-
-    def compute_objective(coefficient: float) -> float:
-        """Compute the objective function for optimization."""
-        threshold_val = coefficient * max_d
-        clusters = fcluster(Z, threshold_val, criterion=linkage_criterion)
-        unique_clusters = np.unique(clusters)
-        if len(unique_clusters) <= 1 or len(unique_clusters) == m.shape[0]:
-            return 1e6
-        try:
-            if opt_metric == "silhouette":
-                score = silhouette_score(m, clusters, metric=linkage_metric)
-                return -score  # We want to maximize the score.
-            elif opt_metric == "calinski_harabasz":
-                score = calinski_harabasz_score(m, clusters)
-                return -score
-            elif opt_metric == "davies_bouldin":
-                score = davies_bouldin_score(m, clusters)
-                return score
-            else:
-                raise ValueError(f"Unknown optimization metric: {opt_metric}.")
-        except Exception:
-            return 1e6
+    best_score = -np.inf
+    best_threshold = None

-    #
-
-
-
+    # Test lower bound
+    max_d_lower = np.max(Z[:, 2]) * lower_bound
+    clusters_lower = fcluster(Z, max_d_lower, criterion=linkage_criterion)
+    try:
+        score_lower = silhouette_score(m, clusters_lower, metric=linkage_metric)
+    except ValueError:
+        score_lower = -np.inf
+
+    # Test upper bound
+    max_d_upper = np.max(Z[:, 2]) * upper_bound
+    clusters_upper = fcluster(Z, max_d_upper, criterion=linkage_criterion)
+    try:
+        score_upper = silhouette_score(m, clusters_upper, metric=linkage_metric)
+    except ValueError:
+        score_upper = -np.inf

-
-
-
-
+    # Determine initial bounds for binary search
+    if score_lower > score_upper:
+        best_score = score_lower
+        best_threshold = lower_bound
+        upper_bound = (lower_bound + upper_bound) / 2
+    else:
+        best_score = score_upper
+        best_threshold = upper_bound
+        lower_bound = (lower_bound + upper_bound) / 2
+
+    # Binary search loop
+    while upper_bound - lower_bound > resolution:
+        mid_threshold = (upper_bound + lower_bound) / 2
+        max_d_mid = np.max(Z[:, 2]) * mid_threshold
+        clusters_mid = fcluster(Z, max_d_mid, criterion=linkage_criterion)
+        try:
+            score_mid = silhouette_score(m, clusters_mid, metric=linkage_metric)
+        except ValueError:
+            score_mid = -np.inf
+
+        # Update best score and threshold if mid-point is better
+        if score_mid > best_score:
+            best_score = score_mid
+            best_threshold = mid_threshold
+
+        # Adjust bounds based on the scores
+        if score_lower > score_upper:
+            upper_bound = mid_threshold
+        else:
+            lower_bound = mid_threshold

-    return best_threshold, float(
+    return best_threshold, float(best_score)
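The new _find_best_silhouette_score() searches for the fraction of the maximum merge distance in the linkage matrix Z that maximizes the silhouette score of the resulting flat clustering (silhouette compares each point's mean intra-cluster distance with its distance to the nearest other cluster). A self-contained sketch of the same idea on synthetic data, using a simple grid scan over candidate fractions instead of the package's binary search:

import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.metrics import silhouette_score

rng = np.random.default_rng(0)
# Two loose blobs so that a sensible cut of the dendrogram exists.
m = np.vstack([rng.normal(0, 1, (20, 5)), rng.normal(4, 1, (20, 5))])

Z = linkage(m, method="average", metric="euclidean")
max_d = np.max(Z[:, 2])

best_threshold, best_score = None, -np.inf
for fraction in np.linspace(0.05, 1.0, 20):
    clusters = fcluster(Z, fraction * max_d, criterion="distance")
    if len(np.unique(clusters)) < 2:  # silhouette needs at least two clusters
        continue
    try:
        score = silhouette_score(m, clusters, metric="euclidean")
    except ValueError:
        continue
    if score > best_score:
        best_threshold, best_score = fraction, score

print(f"best fraction of max merge distance: {best_threshold:.2f}, silhouette: {best_score:.3f}")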
risk/network/graph/api.py
CHANGED
@@ -4,7 +4,7 @@ risk/network/graph/api
 """

 import copy
-from typing import Any, Dict
+from typing import Any, Dict, Union

 import networkx as nx
 import pandas as pd
@@ -42,7 +42,7 @@ class GraphAPI:
         linkage_criterion: str = "distance",
         linkage_method: str = "average",
         linkage_metric: str = "yule",
-        linkage_threshold: float = 0.2,
+        linkage_threshold: Union[float, str] = 0.2,
         min_cluster_size: int = 5,
         max_cluster_size: int = 1000,
     ) -> Graph:
@@ -58,12 +58,11 @@ class GraphAPI:
             impute_depth (int, optional): Depth for imputing neighbors. Defaults to 0.
             prune_threshold (float, optional): Distance threshold for pruning neighbors. Defaults to 0.0.
             linkage_criterion (str, optional): Clustering criterion for defining domains. Defaults to "distance".
-            linkage_method (str, optional): Clustering method to use. Defaults to "average".
-
-
-
-
-                or "davies_bouldin" to automatically select the best threshold. Defaults to 0.2.
+            linkage_method (str, optional): Clustering method to use. Choose "auto" to optimize. Defaults to "average".
+            linkage_metric (str, optional): Metric to use for calculating distances. Choose "auto" to optimize.
+                Defaults to "yule".
+            linkage_threshold (float, str, optional): Threshold for clustering. Choose "auto" to optimize.
+                Defaults to 0.2.
             min_cluster_size (int, optional): Minimum size for clusters. Defaults to 5.
             max_cluster_size (int, optional): Maximum size for clusters. Defaults to 1000.

{risk_network-0.0.9b38.dist-info → risk_network-0.0.9b40.dist-info}/RECORD
CHANGED
@@ -1,8 +1,7 @@
-risk/__init__.py,sha256=
-risk/constants.py,sha256=XInRaH78Slnw_sWgAsBFbUHkyA0h0jL0DKGuQNbOvjM,550
+risk/__init__.py,sha256=2Ucmxw9wGNzUhqe_QGlEi2pnGhkdOrl9wa8w-MUIfm8,127
 risk/risk.py,sha256=s827_lRknFseOP9O4zW8sP-IcCd2EzrpV_tnVY_tz5s,1104
 risk/annotations/__init__.py,sha256=parsbcux1U4urpUqh9AdzbDWuLj9HlMidycMPkpSQFo,179
-risk/annotations/annotations.py,sha256=
+risk/annotations/annotations.py,sha256=Sq24YBtNPMxXOvWoxqPwOJ4bsFAMIBYpVWjEvsQPtNo,14912
 risk/annotations/io.py,sha256=z1AJySsU-KL_IYuHa7j3nvuczmOHgK3WfaQ4TRunvrA,10499
 risk/log/__init__.py,sha256=7LxDysQu7doi0LAvlY2YbjN6iJH0fNknqy8lSLgeljo,217
 risk/log/console.py,sha256=PgjyEvyhYLUSHXPUKEqOmxsDsfrjPICIgqo_cAHq0N8,4575
@@ -10,13 +9,13 @@ risk/log/parameters.py,sha256=VtwfMzLU1xI4yji3-Ch5vHjH-KdwTfwaEMmi7hFQTs0,5716
 risk/neighborhoods/__init__.py,sha256=Q74HwTH7okI-vaskJPy2bYwb5sNjGASTzJ6m8V8arCU,234
 risk/neighborhoods/api.py,sha256=ywngw2TQVV27gYlWDXcs8-qnmeepnvb-W9ov6J6VEPM,23341
 risk/neighborhoods/community.py,sha256=5Q_-VAJC-5SY5EUsB8gIlemeDoAL85uLjyl16pItHiQ,16699
-risk/neighborhoods/domains.py,sha256=
+risk/neighborhoods/domains.py,sha256=4K1tbiia3_TQKUrGdfmKVdYlRD2EEzPnMCKRv6IGxu4,14448
 risk/neighborhoods/neighborhoods.py,sha256=l9FhADB1C-OxM8E9QXOcA4osUDgA1vs4ud-OCGKKybc,21457
 risk/network/__init__.py,sha256=oVi3FA1XXKD84014Cykq-9bpX4_s0F3aAUfNOU-07Qw,73
 risk/network/geometry.py,sha256=eVtGHMgBf9fEqQZUFdHWjw-zFYYpfUONoHFSAxoRkug,6219
 risk/network/io.py,sha256=RCH4nQdgYDXcNwMfpSz7qEmPO0pJ1p9fL0rNQptsQrc,21673
 risk/network/graph/__init__.py,sha256=ziGJew3yhtqvrb9LUuneDu_LwW2Wa9vd4UuhoL5l1CA,91
-risk/network/graph/api.py,sha256=
+risk/network/graph/api.py,sha256=xS_rNDvZPdwIar2E9x9BKMeR0DcYuwcHiUpc_EcJ4-o,8536
 risk/network/graph/graph.py,sha256=qEWyZvuaGT_vvjhreBdmRPX3gst2wQFaXhFAvikPSqw,12158
 risk/network/graph/summary.py,sha256=Y_0rL2C1UoQeZQIPVe5LbaCO356Mcc8HisnrXwQsRm8,10289
 risk/network/plotter/__init__.py,sha256=4gWtQHGzQVNHmEBXi31Zf0tX0y2sTcE66J_yGnn7268,99
@@ -34,8 +33,8 @@ risk/stats/stat_tests.py,sha256=tj0ri9w89_1fsjGLuafTWpfBEwZXpSLn7Ej2aAQ5lxk,1177
 risk/stats/permutation/__init__.py,sha256=OLmYLm2uj96hPsSaUs0vUqFYw6Thwch_aHtpL7L0ZFw,127
 risk/stats/permutation/permutation.py,sha256=BWjgdBpLVcHvmwHy0bmD4aJFccxifNBSrrCBPppyKf4,10569
 risk/stats/permutation/test_functions.py,sha256=KlECWTz1EZ6EPF_OAgHb0uznaIhopiVYb_AKUKuC4no,3120
-risk_network-0.0.
-risk_network-0.0.
-risk_network-0.0.
-risk_network-0.0.
-risk_network-0.0.
+risk_network-0.0.9b40.dist-info/LICENSE,sha256=jOtLnuWt7d5Hsx6XXB2QxzrSe2sWWh3NgMfFRetluQM,35147
+risk_network-0.0.9b40.dist-info/METADATA,sha256=0gk-H9_4YiOCT5iykSjB89qALDejboNUa2mZy_XtLNc,47627
+risk_network-0.0.9b40.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+risk_network-0.0.9b40.dist-info/top_level.txt,sha256=NX7C2PFKTvC1JhVKv14DFlFAIFnKc6Lpsu1ZfxvQwVw,5
+risk_network-0.0.9b40.dist-info/RECORD,,
risk/constants.py
DELETED
@@ -1,31 +0,0 @@
-"""
-risk/constants
-~~~~~~~~~~~~~~
-"""
-
-GROUP_LINKAGE_METHODS = ["single", "complete", "average", "weighted", "centroid", "median", "ward"]
-
-GROUP_DISTANCE_METRICS = [
-    "braycurtis",
-    "canberra",
-    "chebyshev",
-    "cityblock",
-    "correlation",
-    "cosine",
-    "dice",
-    "euclidean",
-    "hamming",
-    "jaccard",
-    "jensenshannon",
-    "kulczynski1",
-    "mahalanobis",
-    "matching",
-    "minkowski",
-    "rogerstanimoto",
-    "russellrao",
-    "seuclidean",
-    "sokalmichener",
-    "sokalsneath",
-    "sqeuclidean",
-    "yule",
-]
File without changes
File without changes
File without changes