risk-network 0.0.9b38__py3-none-any.whl → 0.0.9b40__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- risk/__init__.py +1 -1
- risk/annotations/annotations.py +70 -46
- risk/neighborhoods/domains.py +121 -138
- risk/network/graph/api.py +7 -8
- {risk_network-0.0.9b38.dist-info → risk_network-0.0.9b40.dist-info}/METADATA +1 -1
- {risk_network-0.0.9b38.dist-info → risk_network-0.0.9b40.dist-info}/RECORD +9 -10
- risk/constants.py +0 -31
- {risk_network-0.0.9b38.dist-info → risk_network-0.0.9b40.dist-info}/LICENSE +0 -0
- {risk_network-0.0.9b38.dist-info → risk_network-0.0.9b40.dist-info}/WHEEL +0 -0
- {risk_network-0.0.9b38.dist-info → risk_network-0.0.9b40.dist-info}/top_level.txt +0 -0
risk/__init__.py
CHANGED
risk/annotations/annotations.py
CHANGED
@@ -12,8 +12,9 @@ import networkx as nx
 import nltk
 import numpy as np
 import pandas as pd
-from nltk.tokenize import word_tokenize
 from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+from nltk.tokenize import word_tokenize

 from risk.log import logger
 from scipy.sparse import coo_matrix
@@ -31,11 +32,17 @@ def _setup_nltk():
     except LookupError:
         nltk.download("stopwords")

+    try:
+        nltk.data.find("corpora/wordnet")
+    except LookupError:
+        nltk.download("wordnet")
+

 # Ensure you have the necessary NLTK data
 _setup_nltk()
-#
-
+# Use NLTK's stopwords
+STOP_WORDS = set(stopwords.words("english"))
+LEMMATIZER = WordNetLemmatizer()


 def load_annotations(
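The updated _setup_nltk() extends the existing stopwords guard to the WordNet corpus required by WordNetLemmatizer. A minimal, self-contained sketch of the same look-up-then-download idiom (the helper name below is illustrative, not part of the package):

import nltk

def ensure_nltk_resource(find_path: str, download_name: str) -> None:
    # Look the resource up locally first; download only when it is missing.
    try:
        nltk.data.find(find_path)
    except LookupError:
        nltk.download(download_name)

# The same two resources the updated _setup_nltk() checks for.
ensure_nltk_resource("corpora/stopwords", "stopwords")
ensure_nltk_resource("corpora/wordnet", "wordnet")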
@@ -208,104 +215,121 @@ def define_top_annotations(


 def get_weighted_description(words_column: pd.Series, scores_column: pd.Series) -> str:
     """Generate a weighted description from words and their corresponding scores,
-
+    using improved weighting logic with normalization, lemmatization, and aggregation.

     Args:
-        words_column (pd.Series): A pandas Series containing strings to process.
+        words_column (pd.Series): A pandas Series containing strings (phrases) to process.
         scores_column (pd.Series): A pandas Series containing significance scores to weigh the terms.

     Returns:
-        str: A coherent description formed from the most frequent and significant words
+        str: A coherent description formed from the most frequent and significant words.
     """
-    #
+    # Normalize significance scores to [0,1]. If all scores are identical, use 1.
     if scores_column.max() == scores_column.min():
-        normalized_scores = pd.Series([1] * len(scores_column))
+        normalized_scores = pd.Series([1] * len(scores_column), index=scores_column.index)
     else:
-        # Normalize the significance scores to be between 0 and 1
         normalized_scores = (scores_column - scores_column.min()) / (
             scores_column.max() - scores_column.min()
         )

-    #
+    # Accumulate weighted counts for each token (after cleaning and lemmatization)
+    weighted_counts = {}
+    for phrase, score in zip(words_column, normalized_scores):
+        # Tokenize the phrase
+        tokens = word_tokenize(str(phrase))
+        # Determine the weight (scale factor; here multiplying normalized score by 10)
+        weight = max(1, int((0 if pd.isna(score) else score) * 10))
+        for token in tokens:
+            # Clean token: lowercase and remove extraneous punctuation (but preserve intra-word hyphens)
+            token_clean = re.sub(r"[^\w\-]", "", token.lower()).strip()
+            if not token_clean:
+                continue
+            # Skip tokens that are pure numbers
+            if token_clean.isdigit():
+                continue
+            # Skip stopwords
+            if token_clean in STOP_WORDS:
+                continue
+            # Lemmatize the token to merge similar forms
+            token_norm = LEMMATIZER.lemmatize(token_clean)
+            weighted_counts[token_norm] = weighted_counts.get(token_norm, 0) + weight
+
+    # Reconstruct a weighted token list by repeating each token by its aggregated count.
     weighted_words = []
-    for
-
-
-
-        weighted_words.extend([word] * weight)
-
-    # Tokenize the weighted words, but preserve number-word patterns like '4-alpha'
-    tokens = word_tokenize(" ".join(weighted_words))
-    # Ensure we treat "4-alpha" or other "number-word" patterns as single tokens
+    for token, count in weighted_counts.items():
+        weighted_words.extend([token] * count)
+
+    # Combine tokens that match number-word patterns (e.g. "4-alpha") and remove pure numeric tokens.
     combined_tokens = []
-    for token in
-        # Match patterns like '4-alpha' or '5-hydroxy' and keep them together
+    for token in weighted_words:
         if re.match(r"^\d+-\w+", token):
             combined_tokens.append(token)
-        elif token.replace(".", "", 1).isdigit():
-            # Ignore pure numbers as descriptions unless necessary
+        elif token.replace(".", "", 1).isdigit():
             continue
         else:
             combined_tokens.append(token)

-    #
+    # If the only token is numeric, return a default value.
     if len(combined_tokens) == 1 and combined_tokens[0].isdigit():
-        return "N/A"
+        return "N/A"

-    # Simplify the
+    # Simplify the token list to remove near-duplicates based on the Jaccard index.
     simplified_words = _simplify_word_list(combined_tokens)
+    # Generate a coherent description from the simplified words.
     description = _generate_coherent_description(simplified_words)

     return description


 def _simplify_word_list(words: List[str], threshold: float = 0.80) -> List[str]:
-    """Filter out words that are too similar based on the Jaccard index,
+    """Filter out words that are too similar based on the Jaccard index,
+    keeping the word with the higher aggregated count.

     Args:
-        words (
+        words (List[str]): The list of tokens to be filtered.
         threshold (float, optional): The similarity threshold for the Jaccard index. Defaults to 0.80.

     Returns:
-
+        List[str]: A list of filtered words, where similar words are reduced to the most frequent one.
     """
-    # Count the occurrences
+    # Count the occurrences (which reflect the weighted importance)
     word_counts = Counter(words)
     filtered_words = []
     used_words = set()
-
-
+
+    # Iterate through words sorted by descending weighted frequency
+    for word in sorted(word_counts, key=lambda w: word_counts[w], reverse=True):
         if word in used_words:
             continue

         word_set = set(word)
-        # Find similar words based on the Jaccard index
+        # Find similar words (including the current word) based on the Jaccard index
         similar_words = [
             other_word
             for other_word in word_counts
             if _calculate_jaccard_index(word_set, set(other_word)) >= threshold
         ]
-        #
+        # Choose the word with the highest weighted count among the similar group
         similar_words.sort(key=lambda w: word_counts[w], reverse=True)
         best_word = similar_words[0]
         filtered_words.append(best_word)
         used_words.update(similar_words)

+    # Preserve the original order (by frequency) from the filtered set
     final_words = [word for word in words if word in filtered_words]

     return final_words


 def _calculate_jaccard_index(set1: Set[Any], set2: Set[Any]) -> float:
-    """Calculate the Jaccard
+    """Calculate the Jaccard index between two sets.

     Args:
-        set1 (
-        set2 (
+        set1 (Set[Any]): The first set.
+        set2 (Set[Any]): The second set.

     Returns:
-        float: The Jaccard
-            Returns 0 if the union of the sets is empty.
+        float: The Jaccard index (intersection over union). Returns 0 if the union is empty.
     """
     intersection = len(set1.intersection(set2))
     union = len(set1.union(set2))
@@ -313,28 +337,28 @@ def _calculate_jaccard_index(set1: Set[Any], set2: Set[Any]) -> float:


 def _generate_coherent_description(words: List[str]) -> str:
-    """Generate a coherent description from a list of words
+    """Generate a coherent description from a list of words.
+
     If there is only one unique entry, return it directly.
+    Otherwise, order the words by frequency and join them into a single string.

     Args:
-        words (List): A list of
+        words (List[str]): A list of tokens.

     Returns:
-        str: A coherent description
+        str: A coherent, space-separated description.
     """
-    # If there are no words, return a keyword indicating no data is available
     if not words:
         return "N/A"

-    # If there
+    # If there is only one unique word, return it directly
     unique_words = set(words)
     if len(unique_words) == 1:
         return list(unique_words)[0]

-    # Count
+    # Count weighted occurrences and sort in descending order.
     word_counts = Counter(words)
     most_common_words = [word for word, _ in word_counts.most_common()]
-    # Join the most common words to form a coherent description based on frequency
     description = " ".join(most_common_words)

     return description
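Taken together, the rewritten get_weighted_description() now normalizes significance scores to [0, 1], converts each score into an integer repeat weight, cleans and lemmatizes tokens while skipping stopwords and pure numbers, and finally collapses near-duplicate tokens by Jaccard similarity. A minimal sketch of the weighting step alone, using a plain whitespace tokenizer and no lemmatization so it runs without NLTK data (the package itself uses word_tokenize, NLTK stopwords, and WordNetLemmatizer); the helper name is illustrative:

import re
import pandas as pd

def weighted_token_counts(words_column: pd.Series, scores_column: pd.Series) -> dict:
    # Normalize scores to [0, 1]; identical scores all get weight 1.
    if scores_column.max() == scores_column.min():
        normalized = pd.Series([1] * len(scores_column), index=scores_column.index)
    else:
        normalized = (scores_column - scores_column.min()) / (
            scores_column.max() - scores_column.min()
        )

    counts = {}
    for phrase, score in zip(words_column, normalized):
        # Scale factor of 10 mirrors the weighting used in the diff above.
        weight = max(1, int((0 if pd.isna(score) else score) * 10))
        for token in str(phrase).split():  # stand-in for nltk.word_tokenize
            token = re.sub(r"[^\w\-]", "", token.lower()).strip()
            if not token or token.isdigit():
                continue
            counts[token] = counts.get(token, 0) + weight
    return counts

# Example: tokens from the higher-scoring phrase dominate the weighted counts.
phrases = pd.Series(["lipid transport", "lipid binding protein"])
scores = pd.Series([0.2, 0.9])
print(weighted_token_counts(phrases, scores))  # {'lipid': 11, 'transport': 1, 'binding': 10, 'protein': 10}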
risk/neighborhoods/domains.py
CHANGED
@@ -10,19 +10,22 @@ from typing import Tuple, Union
 import numpy as np
 import pandas as pd
 from scipy.cluster.hierarchy import linkage, fcluster
-from
-from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, silhouette_score
+from sklearn.metrics import silhouette_score
 from tqdm import tqdm

 from risk.annotations import get_weighted_description
-from risk.constants import GROUP_LINKAGE_METHODS, GROUP_DISTANCE_METRICS
 from risk.log import logger


-
-
-
-
+# Define constants for clustering
+# fmt: off
+LINKAGE_METHODS = {"single", "complete", "average", "weighted", "centroid", "median", "ward"}
+LINKAGE_METRICS = {
+    "braycurtis","canberra", "chebyshev", "cityblock", "correlation", "cosine", "dice", "euclidean",
+    "hamming", "jaccard", "jensenshannon", "kulczynski1", "mahalanobis", "matching", "minkowski",
+    "rogerstanimoto", "russellrao", "seuclidean", "sokalmichener", "sokalsneath", "sqeuclidean", "yule",
+}
+# fmt: on


 def define_domains(
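The linkage method and metric constants that previously lived in risk/constants.py now sit next to the code that uses them, and selecting "auto" expands to every combination of the two sets. A small sketch of that expansion (the metric set is abbreviated here; the package defines the full set above, and the helper name is illustrative):

from itertools import product
from typing import List, Tuple

LINKAGE_METHODS = {"single", "complete", "average", "weighted", "centroid", "median", "ward"}
LINKAGE_METRICS = {"euclidean", "cityblock", "cosine", "jaccard", "yule"}  # abbreviated subset

def candidate_combinations(linkage_method: str, linkage_metric: str) -> List[Tuple[str, str]]:
    # "auto" expands to every known option; any other value is evaluated as-is.
    methods = LINKAGE_METHODS if linkage_method == "auto" else [linkage_method]
    metrics = LINKAGE_METRICS if linkage_metric == "auto" else [linkage_metric]
    return list(product(methods, metrics))

print(len(candidate_combinations("auto", "auto")))  # 7 methods x 5 metrics = 35 with this subset
print(candidate_combinations("average", "yule"))    # [('average', 'yule')]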
@@ -31,7 +34,7 @@ def define_domains(
     linkage_criterion: str,
     linkage_method: str,
     linkage_metric: str,
-    linkage_threshold: Union[
+    linkage_threshold: Union[float, str],
 ) -> pd.DataFrame:
     """Define domains and assign nodes to these domains based on their significance scores and clustering,
     handling errors by assigning unique domains when clustering fails.
@@ -39,19 +42,13 @@ def define_domains(
     Args:
         top_annotations (pd.DataFrame): DataFrame of top annotations data for the network nodes.
         significant_neighborhoods_significance (np.ndarray): The binary significance matrix below alpha.
-        linkage_criterion (str): The clustering criterion for defining groups.
-
-
-
-        linkage_threshold (str, float): The linkage threshold for clustering, or one of "silhouette",
-            "calinski_harabasz", or "davies_bouldin" to optimize the threshold.
+        linkage_criterion (str): The clustering criterion for defining groups.
+        linkage_method (str): The linkage method for clustering. Choose "auto" to optimize.
+        linkage_metric (str): The linkage metric for clustering. Choose "auto" to optimize.
+        linkage_threshold (float, str): The threshold for clustering. Choose "auto" to optimize.

     Returns:
         pd.DataFrame: DataFrame with the primary domain for each node.
-
-    Raises:
-        ValueError: If an improper value is passed for linkage_threshold. Acceptable values are "silhouette",
-            "calinski_harabasz", "davies_bouldin", or a float value.
     """
     try:
         if linkage_criterion == "off":
@@ -62,17 +59,14 @@ def define_domains(
             # Safeguard the matrix by replacing NaN, Inf, and -Inf values
             m = _safeguard_matrix(m)
             # Optimize silhouette score across different linkage methods and distance metrics
-            best_linkage, best_metric, best_threshold = (
-
-                m, linkage_criterion, linkage_method, linkage_metric, linkage_threshold
-            )
+            best_linkage, best_metric, best_threshold = _optimize_silhouette_across_linkage_and_metrics(
+                m, linkage_criterion, linkage_method, linkage_metric, linkage_threshold
             )
             # Perform hierarchical clustering
             Z = linkage(m, method=best_linkage, metric=best_metric)
             logger.warning(
-                f"Linkage criterion: '{linkage_criterion}'\nLinkage method: '{best_linkage}'\nLinkage metric: '{best_metric}'"
+                f"Linkage criterion: '{linkage_criterion}'\nLinkage method: '{best_linkage}'\nLinkage metric: '{best_metric}'\nLinkage threshold: {round(best_threshold, 3)}"
             )
-            logger.debug(f"Optimal linkage threshold: {round(best_threshold, 3)}")
             # Calculate the optimal threshold for clustering
             max_d_optimal = np.max(Z[:, 2]) * best_threshold
             # Assign domains to the annotations matrix
@@ -91,9 +85,6 @@ def define_domains(
             f"Error encountered. Skipping clustering and assigning {n_rows} unique domains."
         )
         top_annotations["domain"] = range(1, n_rows + 1)  # Assign unique domains
-    except LinkageThresholdError as e:
-        # If a LinkageThresholdError is encountered, raise a ValueError with the original exception
-        raise ValueError(e) from e

     # Create DataFrames to store domain information
     node_to_significance = pd.DataFrame(
@@ -215,154 +206,146 @@ def _safeguard_matrix(matrix: np.ndarray) -> np.ndarray:
     return matrix


-def
+def _optimize_silhouette_across_linkage_and_metrics(
     m: np.ndarray,
     linkage_criterion: str,
     linkage_method: str,
     linkage_metric: str,
     linkage_threshold: Union[str, float],
 ) -> Tuple[str, str, float]:
-    """Optimize
-        a string, optimize the threshold using the specified metric; otherwise, use the provided threshold.
+    """Optimize silhouette score across different linkage methods and distance metrics.

     Args:
         m (np.ndarray): Data matrix.
-        linkage_criterion (str):
-        linkage_method (str): Linkage method for clustering
-        linkage_metric (str):
-        linkage_threshold (str, float):
-            "silhouette", "calinski_harabasz", or "davies_bouldin" to trigger optimization.
+        linkage_criterion (str): Clustering criterion.
+        linkage_method (str): Linkage method for clustering. Choose "auto" to optimize.
+        linkage_metric (str): Linkage metric for clustering. Choose "auto" to optimize.
+        linkage_threshold (Union[str, float]): Threshold for clustering. Choose "auto" to optimize.

     Returns:
         Tuple[str, str, float]:
-            -
-            -
-            -
-
-    Raises:
-        ValueError: If linkage_threshold is neither one of the supported keywords nor convertible to float.
+            - Best linkage method (str)
+            - Best linkage metric (str)
+            - Best threshold (float)
     """
-    #
-
-
-
-    if isinstance(linkage_threshold, str):
-        if linkage_threshold in supported_linkage_thresholds:
-            opt_metric = linkage_threshold
-        else:
-            try:
-                threshold_float = float(linkage_threshold)
-            except (TypeError, ValueError):
-                raise LinkageThresholdError(
-                    f"linkage_threshold must be one of {', '.join(supported_linkage_thresholds)} or a float value."
-                )
-            return linkage_method, linkage_metric, threshold_float
-    else:
-        # If not a string, try to convert it to float.
-        try:
-            threshold_float = float(linkage_threshold)
-        except (TypeError, ValueError):
-            raise LinkageThresholdError(
-                f"linkage_threshold must be one of {', '.join(supported_linkage_thresholds)} or a float value."
-            )
-        return linkage_method, linkage_metric, threshold_float
-
-    # Otherwise, perform optimization using the specified metric (opt_metric).
-    best_overall_method = None
-    best_overall_metric = None
-    best_overall_threshold = None
+    # Initialize best overall values
+    best_overall_method = linkage_method
+    best_overall_metric = linkage_metric
+    best_overall_threshold = linkage_threshold
     best_overall_score = -np.inf

-    #
-
-
-    total_combinations = len(
+    # Set linkage methods and metrics to all combinations if "auto" is selected
+    linkage_methods = LINKAGE_METHODS if linkage_method == "auto" else [linkage_method]
+    linkage_metrics = LINKAGE_METRICS if linkage_metric == "auto" else [linkage_metric]
+    total_combinations = len(linkage_methods) * len(linkage_metrics)

+    # Evaluating optimal linkage method and metric
     for method, metric in tqdm(
-        product(
+        product(linkage_methods, linkage_metrics),
         desc="Evaluating optimal linkage method and metric",
         total=total_combinations,
         bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
     ):
+        # Some linkage methods and metrics may not work with certain data
         with suppress(ValueError):
             Z = linkage(m, method=method, metric=metric)
-
-
-
-
-
-
-
-
-
-
-
+            # Only optimize silhouette score if the threshold is "auto"
+            if linkage_threshold == "auto":
+                threshold, score = _find_best_silhouette_score(Z, m, metric, linkage_criterion)
+                if score > best_overall_score:
+                    best_overall_score = score
+                    best_overall_threshold = threshold
+                    best_overall_method = method
+                    best_overall_metric = metric
+            else:
+                # Use the provided threshold without optimization
+                score = silhouette_score(
+                    m,
+                    fcluster(Z, linkage_threshold * np.max(Z[:, 2]), criterion=linkage_criterion),
+                    metric=metric,
+                )
+                if score > best_overall_score:
+                    best_overall_score = score
+                    best_overall_threshold = linkage_threshold
+                    best_overall_method = method
+                    best_overall_metric = metric
+
     return best_overall_method, best_overall_metric, best_overall_threshold


-def
+def _find_best_silhouette_score(
     Z: np.ndarray,
     m: np.ndarray,
     linkage_metric: str,
     linkage_criterion: str,
-
+    lower_bound: float = 0.001,
+    upper_bound: float = 1.0,
+    resolution: float = 0.001,
 ) -> Tuple[float, float]:
-    """Find the
-        the threshold value using the specified metric (opt_metric).
+    """Find the best silhouette score using binary search.

     Args:
-        Z (np.ndarray): Linkage matrix
-        m (np.ndarray): Data matrix
-        linkage_metric (str):
-
-
-
-
+        Z (np.ndarray): Linkage matrix.
+        m (np.ndarray): Data matrix.
+        linkage_metric (str): Linkage metric for silhouette score calculation.
+        linkage_criterion (str): Clustering criterion.
+        lower_bound (float, optional): Lower bound for search. Defaults to 0.001.
+        upper_bound (float, optional): Upper bound for search. Defaults to 1.0.
+        resolution (float, optional): Desired resolution for the best threshold. Defaults to 0.001.

     Returns:
         Tuple[float, float]:
-            -
-            -
-                at the optimal threshold (higher for "silhouette" and "calinski_harabasz",
-                lower for "davies_bouldin").
-
-    Raises:
-        ValueError: If the `opt_metric` argument is not one of the supported metrics.
+            - Best threshold (float): The threshold that yields the best silhouette score.
+            - Best silhouette score (float): The highest silhouette score achieved.
     """
-
-
-    resolution = 1e-6
-
-    def compute_objective(coefficient: float) -> float:
-        """Compute the objective function for optimization."""
-        threshold_val = coefficient * max_d
-        clusters = fcluster(Z, threshold_val, criterion=linkage_criterion)
-        unique_clusters = np.unique(clusters)
-        if len(unique_clusters) <= 1 or len(unique_clusters) == m.shape[0]:
-            return 1e6
-        try:
-            if opt_metric == "silhouette":
-                score = silhouette_score(m, clusters, metric=linkage_metric)
-                return -score  # We want to maximize the score.
-            elif opt_metric == "calinski_harabasz":
-                score = calinski_harabasz_score(m, clusters)
-                return -score
-            elif opt_metric == "davies_bouldin":
-                score = davies_bouldin_score(m, clusters)
-                return score
-            else:
-                raise ValueError(f"Unknown optimization metric: {opt_metric}.")
-        except Exception:
-            return 1e6
+    best_score = -np.inf
+    best_threshold = None

-    #
-
-
-
+    # Test lower bound
+    max_d_lower = np.max(Z[:, 2]) * lower_bound
+    clusters_lower = fcluster(Z, max_d_lower, criterion=linkage_criterion)
+    try:
+        score_lower = silhouette_score(m, clusters_lower, metric=linkage_metric)
+    except ValueError:
+        score_lower = -np.inf
+
+    # Test upper bound
+    max_d_upper = np.max(Z[:, 2]) * upper_bound
+    clusters_upper = fcluster(Z, max_d_upper, criterion=linkage_criterion)
+    try:
+        score_upper = silhouette_score(m, clusters_upper, metric=linkage_metric)
+    except ValueError:
+        score_upper = -np.inf

-
-
-
-
+    # Determine initial bounds for binary search
+    if score_lower > score_upper:
+        best_score = score_lower
+        best_threshold = lower_bound
+        upper_bound = (lower_bound + upper_bound) / 2
+    else:
+        best_score = score_upper
+        best_threshold = upper_bound
+        lower_bound = (lower_bound + upper_bound) / 2
+
+    # Binary search loop
+    while upper_bound - lower_bound > resolution:
+        mid_threshold = (upper_bound + lower_bound) / 2
+        max_d_mid = np.max(Z[:, 2]) * mid_threshold
+        clusters_mid = fcluster(Z, max_d_mid, criterion=linkage_criterion)
+        try:
+            score_mid = silhouette_score(m, clusters_mid, metric=linkage_metric)
+        except ValueError:
+            score_mid = -np.inf
+
+        # Update best score and threshold if mid-point is better
+        if score_mid > best_score:
+            best_score = score_mid
+            best_threshold = mid_threshold
+
+        # Adjust bounds based on the scores
+        if score_lower > score_upper:
+            upper_bound = mid_threshold
+        else:
+            lower_bound = mid_threshold

-    return best_threshold, float(
+    return best_threshold, float(best_score)
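The new _find_best_silhouette_score() searches for the fraction of the maximum merge distance in the linkage matrix Z that maximizes the silhouette score of the resulting flat clustering (silhouette compares each point's mean intra-cluster distance with its distance to the nearest other cluster). A self-contained sketch of the same idea on synthetic data, using a simple grid scan over candidate fractions instead of the package's binary search:

import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.metrics import silhouette_score

rng = np.random.default_rng(0)
# Two loose blobs so that a sensible cut of the dendrogram exists.
m = np.vstack([rng.normal(0, 1, (20, 5)), rng.normal(4, 1, (20, 5))])

Z = linkage(m, method="average", metric="euclidean")
max_d = np.max(Z[:, 2])

best_threshold, best_score = None, -np.inf
for fraction in np.linspace(0.05, 1.0, 20):
    clusters = fcluster(Z, fraction * max_d, criterion="distance")
    if len(np.unique(clusters)) < 2:  # silhouette needs at least two clusters
        continue
    try:
        score = silhouette_score(m, clusters, metric="euclidean")
    except ValueError:
        continue
    if score > best_score:
        best_threshold, best_score = fraction, score

print(f"best fraction of max merge distance: {best_threshold:.2f}, silhouette: {best_score:.3f}")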
risk/network/graph/api.py
CHANGED
@@ -4,7 +4,7 @@ risk/network/graph/api
 """

 import copy
-from typing import Any, Dict
+from typing import Any, Dict, Union

 import networkx as nx
 import pandas as pd
@@ -42,7 +42,7 @@ class GraphAPI:
         linkage_criterion: str = "distance",
         linkage_method: str = "average",
         linkage_metric: str = "yule",
-        linkage_threshold: float = 0.2,
+        linkage_threshold: Union[float, str] = 0.2,
         min_cluster_size: int = 5,
         max_cluster_size: int = 1000,
     ) -> Graph:
@@ -58,12 +58,11 @@ class GraphAPI:
             impute_depth (int, optional): Depth for imputing neighbors. Defaults to 0.
             prune_threshold (float, optional): Distance threshold for pruning neighbors. Defaults to 0.0.
             linkage_criterion (str, optional): Clustering criterion for defining domains. Defaults to "distance".
-            linkage_method (str, optional): Clustering method to use. Defaults to "average".
-
-
-
-
-                or "davies_bouldin" to automatically select the best threshold. Defaults to 0.2.
+            linkage_method (str, optional): Clustering method to use. Choose "auto" to optimize. Defaults to "average".
+            linkage_metric (str, optional): Metric to use for calculating distances. Choose "auto" to optimize.
+                Defaults to "yule".
+            linkage_threshold (float, str, optional): Threshold for clustering. Choose "auto" to optimize.
+                Defaults to 0.2.
             min_cluster_size (int, optional): Minimum size for clusters. Defaults to 5.
             max_cluster_size (int, optional): Maximum size for clusters. Defaults to 1000.

{risk_network-0.0.9b38.dist-info → risk_network-0.0.9b40.dist-info}/RECORD
CHANGED
@@ -1,8 +1,7 @@
-risk/__init__.py,sha256=
-risk/constants.py,sha256=XInRaH78Slnw_sWgAsBFbUHkyA0h0jL0DKGuQNbOvjM,550
+risk/__init__.py,sha256=2Ucmxw9wGNzUhqe_QGlEi2pnGhkdOrl9wa8w-MUIfm8,127
 risk/risk.py,sha256=s827_lRknFseOP9O4zW8sP-IcCd2EzrpV_tnVY_tz5s,1104
 risk/annotations/__init__.py,sha256=parsbcux1U4urpUqh9AdzbDWuLj9HlMidycMPkpSQFo,179
-risk/annotations/annotations.py,sha256=
+risk/annotations/annotations.py,sha256=Sq24YBtNPMxXOvWoxqPwOJ4bsFAMIBYpVWjEvsQPtNo,14912
 risk/annotations/io.py,sha256=z1AJySsU-KL_IYuHa7j3nvuczmOHgK3WfaQ4TRunvrA,10499
 risk/log/__init__.py,sha256=7LxDysQu7doi0LAvlY2YbjN6iJH0fNknqy8lSLgeljo,217
 risk/log/console.py,sha256=PgjyEvyhYLUSHXPUKEqOmxsDsfrjPICIgqo_cAHq0N8,4575
@@ -10,13 +9,13 @@ risk/log/parameters.py,sha256=VtwfMzLU1xI4yji3-Ch5vHjH-KdwTfwaEMmi7hFQTs0,5716
 risk/neighborhoods/__init__.py,sha256=Q74HwTH7okI-vaskJPy2bYwb5sNjGASTzJ6m8V8arCU,234
 risk/neighborhoods/api.py,sha256=ywngw2TQVV27gYlWDXcs8-qnmeepnvb-W9ov6J6VEPM,23341
 risk/neighborhoods/community.py,sha256=5Q_-VAJC-5SY5EUsB8gIlemeDoAL85uLjyl16pItHiQ,16699
-risk/neighborhoods/domains.py,sha256=
+risk/neighborhoods/domains.py,sha256=4K1tbiia3_TQKUrGdfmKVdYlRD2EEzPnMCKRv6IGxu4,14448
 risk/neighborhoods/neighborhoods.py,sha256=l9FhADB1C-OxM8E9QXOcA4osUDgA1vs4ud-OCGKKybc,21457
 risk/network/__init__.py,sha256=oVi3FA1XXKD84014Cykq-9bpX4_s0F3aAUfNOU-07Qw,73
 risk/network/geometry.py,sha256=eVtGHMgBf9fEqQZUFdHWjw-zFYYpfUONoHFSAxoRkug,6219
 risk/network/io.py,sha256=RCH4nQdgYDXcNwMfpSz7qEmPO0pJ1p9fL0rNQptsQrc,21673
 risk/network/graph/__init__.py,sha256=ziGJew3yhtqvrb9LUuneDu_LwW2Wa9vd4UuhoL5l1CA,91
-risk/network/graph/api.py,sha256=
+risk/network/graph/api.py,sha256=xS_rNDvZPdwIar2E9x9BKMeR0DcYuwcHiUpc_EcJ4-o,8536
 risk/network/graph/graph.py,sha256=qEWyZvuaGT_vvjhreBdmRPX3gst2wQFaXhFAvikPSqw,12158
 risk/network/graph/summary.py,sha256=Y_0rL2C1UoQeZQIPVe5LbaCO356Mcc8HisnrXwQsRm8,10289
 risk/network/plotter/__init__.py,sha256=4gWtQHGzQVNHmEBXi31Zf0tX0y2sTcE66J_yGnn7268,99
@@ -34,8 +33,8 @@ risk/stats/stat_tests.py,sha256=tj0ri9w89_1fsjGLuafTWpfBEwZXpSLn7Ej2aAQ5lxk,1177
 risk/stats/permutation/__init__.py,sha256=OLmYLm2uj96hPsSaUs0vUqFYw6Thwch_aHtpL7L0ZFw,127
 risk/stats/permutation/permutation.py,sha256=BWjgdBpLVcHvmwHy0bmD4aJFccxifNBSrrCBPppyKf4,10569
 risk/stats/permutation/test_functions.py,sha256=KlECWTz1EZ6EPF_OAgHb0uznaIhopiVYb_AKUKuC4no,3120
-risk_network-0.0.
-risk_network-0.0.
-risk_network-0.0.
-risk_network-0.0.
-risk_network-0.0.
+risk_network-0.0.9b40.dist-info/LICENSE,sha256=jOtLnuWt7d5Hsx6XXB2QxzrSe2sWWh3NgMfFRetluQM,35147
+risk_network-0.0.9b40.dist-info/METADATA,sha256=0gk-H9_4YiOCT5iykSjB89qALDejboNUa2mZy_XtLNc,47627
+risk_network-0.0.9b40.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+risk_network-0.0.9b40.dist-info/top_level.txt,sha256=NX7C2PFKTvC1JhVKv14DFlFAIFnKc6Lpsu1ZfxvQwVw,5
+risk_network-0.0.9b40.dist-info/RECORD,,
risk/constants.py
DELETED
@@ -1,31 +0,0 @@
-"""
-risk/constants
-~~~~~~~~~~~~~~
-"""
-
-GROUP_LINKAGE_METHODS = ["single", "complete", "average", "weighted", "centroid", "median", "ward"]
-
-GROUP_DISTANCE_METRICS = [
-    "braycurtis",
-    "canberra",
-    "chebyshev",
-    "cityblock",
-    "correlation",
-    "cosine",
-    "dice",
-    "euclidean",
-    "hamming",
-    "jaccard",
-    "jensenshannon",
-    "kulczynski1",
-    "mahalanobis",
-    "matching",
-    "minkowski",
-    "rogerstanimoto",
-    "russellrao",
-    "seuclidean",
-    "sokalmichener",
-    "sokalsneath",
-    "sqeuclidean",
-    "yule",
-]
File without changes
File without changes
File without changes