risk-network 0.0.8b20.tar.gz → 0.0.8b22.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/PKG-INFO +1 -1
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/__init__.py +1 -1
- risk_network-0.0.8b22/risk/annotations/__init__.py +7 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/annotations/annotations.py +65 -40
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/neighborhoods/domains.py +35 -16
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/neighborhoods/neighborhoods.py +51 -37
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/network/graph.py +33 -10
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/network/io.py +1 -1
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/network/plot/canvas.py +16 -17
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/network/plot/contour.py +11 -11
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/network/plot/labels.py +6 -7
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/network/plot/network.py +15 -15
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/network/plot/utils/color.py +4 -4
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/risk.py +33 -19
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/stats/stats.py +8 -6
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk_network.egg-info/PKG-INFO +1 -1
- risk_network-0.0.8b20/risk/annotations/__init__.py +0 -7
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/LICENSE +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/MANIFEST.in +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/README.md +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/pyproject.toml +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/annotations/io.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/constants.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/log/__init__.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/log/config.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/log/params.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/neighborhoods/__init__.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/neighborhoods/community.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/network/__init__.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/network/geometry.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/network/plot/__init__.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/network/plot/plotter.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/network/plot/utils/layout.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/stats/__init__.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/stats/hypergeom.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/stats/permutation/__init__.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/stats/permutation/permutation.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/stats/permutation/test_functions.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/stats/poisson.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk_network.egg-info/SOURCES.txt +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk_network.egg-info/dependency_links.txt +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk_network.egg-info/requires.txt +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk_network.egg-info/top_level.txt +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/setup.cfg +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/setup.py +0 -0
{risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/annotations/annotations.py

@@ -30,6 +30,8 @@ def _setup_nltk():
 
 # Ensure you have the necessary NLTK data
 _setup_nltk()
+# Initialize English stopwords
+stop_words = set(stopwords.words("english"))
 
 
 def load_annotations(network: nx.Graph, annotations_input: Dict[str, Any]) -> Dict[str, Any]:
@@ -47,11 +49,11 @@ def load_annotations(network: nx.Graph, annotations_input: Dict[str, Any]) -> Di
         (node, annotation) for annotation, nodes in annotations_input.items() for node in nodes
     ]
     # Create a DataFrame from the flattened list
-    annotations = pd.DataFrame(flattened_annotations, columns=["
-    annotations["
+    annotations = pd.DataFrame(flattened_annotations, columns=["node", "annotations"])
+    annotations["is_member"] = 1
     # Pivot to create a binary matrix with nodes as rows and annotations as columns
     annotations_pivot = annotations.pivot_table(
-        index="
+        index="node", columns="annotations", values="is_member", fill_value=0, dropna=False
     )
     # Reindex the annotations matrix based on the node labels from the network
     node_label_order = list(nx.get_node_attributes(network, "label").values())
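For context, the reworked `load_annotations` turns the flattened (node, annotation) pairs into a binary node-by-annotation membership matrix via `pivot_table`. A minimal standalone sketch of that step, using invented node and annotation names rather than the package's real inputs:

```python
import pandas as pd

# Hypothetical (node, annotation) pairs; in the package these come from annotations_input.
flattened_annotations = [
    ("YFL039C", "actin cytoskeleton"),
    ("YFL039C", "cell polarity"),
    ("YDR129C", "cell polarity"),
]
annotations = pd.DataFrame(flattened_annotations, columns=["node", "annotations"])
annotations["is_member"] = 1

# Pivot into a binary matrix: one row per node, one column per annotation term,
# 1 where the node carries the term and 0 (fill_value) where it does not.
annotations_pivot = annotations.pivot_table(
    index="node", columns="annotations", values="is_member", fill_value=0, dropna=False
)
print(annotations_pivot)  # 2 x 2 matrix of 0/1 membership values
```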
@@ -81,7 +83,8 @@ def define_top_annotations(
     network: nx.Graph,
     ordered_annotation_labels: List[str],
     neighborhood_enrichment_sums: List[int],
-
+    significant_enrichment_matrix: np.ndarray,
+    significant_binary_enrichment_matrix: np.ndarray,
     min_cluster_size: int = 5,
     max_cluster_size: int = 1000,
 ) -> pd.DataFrame:
@@ -91,42 +94,52 @@ def define_top_annotations(
         network (NetworkX graph): The network graph.
         ordered_annotation_labels (list of str): List of ordered annotation labels.
         neighborhood_enrichment_sums (list of int): List of neighborhood enrichment sums.
-
+        significant_enrichment_matrix (np.ndarray): Enrichment matrix below alpha threshold.
+        significant_binary_enrichment_matrix (np.ndarray): Binary enrichment matrix below alpha threshold.
         min_cluster_size (int, optional): Minimum cluster size. Defaults to 5.
         max_cluster_size (int, optional): Maximum cluster size. Defaults to 1000.
 
     Returns:
         pd.DataFrame: DataFrame with top annotations and their properties.
     """
-    #
+    # Sum the columns of the significant enrichment matrix (positive floating point values)
+    significant_enrichment_scores = significant_enrichment_matrix.sum(axis=0)
+    # Create DataFrame to store annotations, their neighborhood enrichment sums, and enrichment scores
     annotations_enrichment_matrix = pd.DataFrame(
         {
             "id": range(len(ordered_annotation_labels)),
-            "
-            "
+            "full_terms": ordered_annotation_labels,
+            "significant_neighborhood_enrichment_sums": neighborhood_enrichment_sums,
+            "significant_enrichment_score": significant_enrichment_scores,
         }
     )
-    annotations_enrichment_matrix["
-    # Apply size constraints to identify potential
+    annotations_enrichment_matrix["significant_annotations"] = False
+    # Apply size constraints to identify potential significant annotations
     annotations_enrichment_matrix.loc[
-        (
-
-
+        (
+            annotations_enrichment_matrix["significant_neighborhood_enrichment_sums"]
+            >= min_cluster_size
+        )
+        & (
+            annotations_enrichment_matrix["significant_neighborhood_enrichment_sums"]
+            <= max_cluster_size
+        ),
+        "significant_annotations",
     ] = True
     # Initialize columns for connected components analysis
-    annotations_enrichment_matrix["
-    annotations_enrichment_matrix["
-    annotations_enrichment_matrix["
-    "
+    annotations_enrichment_matrix["num_connected_components"] = 0
+    annotations_enrichment_matrix["size_connected_components"] = None
+    annotations_enrichment_matrix["size_connected_components"] = annotations_enrichment_matrix[
+        "size_connected_components"
     ].astype(object)
-    annotations_enrichment_matrix["
+    annotations_enrichment_matrix["num_large_connected_components"] = 0
 
     for attribute in annotations_enrichment_matrix.index.values[
-        annotations_enrichment_matrix["
+        annotations_enrichment_matrix["significant_annotations"]
     ]:
         # Identify enriched neighborhoods based on the binary enrichment matrix
         enriched_neighborhoods = list(
-            compress(list(network),
+            compress(list(network), significant_binary_enrichment_matrix[:, attribute])
         )
         enriched_network = nx.subgraph(network, enriched_neighborhoods)
         # Analyze connected components within the enriched subnetwork
@@ -145,55 +158,67 @@ def define_top_annotations(
         num_large_connected_components = len(filtered_size_connected_components)
 
         # Assign the number of connected components
-        annotations_enrichment_matrix.loc[attribute, "
+        annotations_enrichment_matrix.loc[attribute, "num_connected_components"] = (
             num_connected_components
         )
         # Filter out attributes with more than one connected component
         annotations_enrichment_matrix.loc[
-            annotations_enrichment_matrix["
+            annotations_enrichment_matrix["num_connected_components"] > 1, "significant_annotations"
         ] = False
         # Assign the number of large connected components
-        annotations_enrichment_matrix.loc[attribute, "
+        annotations_enrichment_matrix.loc[attribute, "num_large_connected_components"] = (
            num_large_connected_components
         )
         # Assign the size of connected components, ensuring it is always a list
-        annotations_enrichment_matrix.at[attribute, "
+        annotations_enrichment_matrix.at[attribute, "size_connected_components"] = (
            filtered_size_connected_components.tolist()
         )
 
     return annotations_enrichment_matrix
 
 
-def 
-    """
-
+def get_weighted_description(words_column: pd.Series, scores_column: pd.Series) -> str:
+    """Generate a weighted description from words and their corresponding scores,
+    with support for stopwords filtering and improved weighting logic.
 
     Args:
         words_column (pd.Series): A pandas Series containing strings to process.
+        scores_column (pd.Series): A pandas Series containing enrichment scores to weigh the terms.
 
     Returns:
-        str: A coherent description formed from the most frequent and significant words.
+        str: A coherent description formed from the most frequent and significant words, weighed by enrichment scores.
     """
-    #
-
-
+    # Handle case where all scores are the same
+    if scores_column.max() == scores_column.min():
+        normalized_scores = pd.Series([1] * len(scores_column))
+    else:
+        # Normalize the enrichment scores to be between 0 and 1
+        normalized_scores = (scores_column - scores_column.min()) / (
+            scores_column.max() - scores_column.min()
+        )
 
+    # Combine words and normalized scores to create weighted words
+    weighted_words = []
+    for word, score in zip(words_column, normalized_scores):
+        word = str(word)
+        if word not in stop_words:  # Skip stopwords
+            weight = max(1, int((0 if pd.isna(score) else score) * 10))
+            weighted_words.extend([word] * weight)
+
+    # Tokenize the weighted words
+    tokens = word_tokenize(" ".join(weighted_words))
     # Separate numeric tokens
     numeric_tokens = [token for token in tokens if token.replace(".", "", 1).isdigit()]
-    # If there's only one unique numeric value, return it directly as a string
     unique_numeric_values = set(numeric_tokens)
     if len(unique_numeric_values) == 1:
         return f"{list(unique_numeric_values)[0]}"
 
-    #
-    words = [
-
-
-        if word.isalpha()
-        or word.replace(".", "", 1).isdigit()  # Keep alphabetic words and numeric strings
-    ]
+    # Filter alphabetic and numeric tokens
+    words = [word for word in tokens if word.isalpha() or word.replace(".", "", 1).isdigit()]
+    # Apply word similarity filtering to remove redundant terms
+    simplified_words = _simplify_word_list(words)
     # Generate a coherent description from the processed words
-    description = _generate_coherent_description(
+    description = _generate_coherent_description(simplified_words)
 
     return description
 
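The new `get_weighted_description` replaces the older unweighted helper: each term is repeated in proportion to its normalized enrichment score before tokenization, so strongly enriched terms dominate the generated label. Below is a simplified sketch of only the normalization and weighting steps, with toy terms and scores; the actual function additionally skips NLTK stopwords, tokenizes the weighted words, and passes them through the existing `_simplify_word_list` and `_generate_coherent_description` helpers:

```python
import pandas as pd

# Toy inputs standing in for one domain's terms and enrichment scores.
words_column = pd.Series(["kinase", "kinase activity", "transport"])
scores_column = pd.Series([12.0, 9.0, 3.0])

# Scale scores to [0, 1]; fall back to a constant weight when all scores are equal.
if scores_column.max() == scores_column.min():
    normalized_scores = pd.Series([1] * len(scores_column))
else:
    normalized_scores = (scores_column - scores_column.min()) / (
        scores_column.max() - scores_column.min()
    )

# Repeat each term at least once and up to 10 times depending on its score.
weighted_words = []
for word, score in zip(words_column, normalized_scores):
    weight = max(1, int((0 if pd.isna(score) else score) * 10))
    weighted_words.extend([str(word)] * weight)

print(weighted_words.count("kinase"), weighted_words.count("transport"))  # 10 1
```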
{risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/neighborhoods/domains.py

@@ -13,7 +13,7 @@ import pandas as pd
 from scipy.cluster.hierarchy import linkage, fcluster
 from sklearn.metrics import silhouette_score
 
-from risk.annotations import
+from risk.annotations import get_weighted_description
 from risk.constants import GROUP_LINKAGE_METHODS, GROUP_DISTANCE_METRICS
 from risk.log import logger
 
@@ -40,7 +40,7 @@ def define_domains(
     """
     try:
         # Transpose the matrix to cluster annotations
-        m = significant_neighborhoods_enrichment[:, top_annotations["
+        m = significant_neighborhoods_enrichment[:, top_annotations["significant_annotations"]].T
         best_linkage, best_metric, best_threshold = _optimize_silhouette_across_linkage_and_metrics(
             m, linkage_criterion, linkage_method, linkage_metric
         )
@@ -55,7 +55,7 @@ def define_domains(
         # Assign domains to the annotations matrix
         domains = fcluster(Z, max_d_optimal, criterion=linkage_criterion)
         top_annotations["domain"] = 0
-        top_annotations.loc[top_annotations["
+        top_annotations.loc[top_annotations["significant_annotations"], "domain"] = domains
     except ValueError:
         # If a ValueError is encountered, handle it by assigning unique domains
         n_rows = len(top_annotations)
@@ -77,11 +77,11 @@ def define_domains(
     t_idxmax[t_max == 0] = 0
 
     # Assign all domains where the score is greater than 0
-    node_to_domain["
+    node_to_domain["all_domains"] = node_to_domain.loc[:, 1:].apply(
         lambda row: list(row[row > 0].index), axis=1
     )
     # Assign primary domain
-    node_to_domain["
+    node_to_domain["primary_domain"] = t_idxmax
 
     return node_to_domain
 
@@ -107,7 +107,7 @@ def trim_domains_and_top_annotations(
         - A DataFrame with domain labels (pd.DataFrame)
     """
     # Identify domains to remove based on size criteria
-    domain_counts = domains["
+    domain_counts = domains["primary_domain"].value_counts()
     to_remove = set(
         domain_counts[(domain_counts < min_cluster_size) | (domain_counts > max_cluster_size)].index
     )
@@ -117,32 +117,51 @@ def trim_domains_and_top_annotations(
     invalid_domain_ids = {0, invalid_domain_id}
     # Mark domains to be removed
     top_annotations["domain"].replace(to_remove, invalid_domain_id, inplace=True)
-    domains.loc[domains["
+    domains.loc[domains["primary_domain"].isin(to_remove), ["primary_domain"]] = invalid_domain_id
 
     # Normalize "num enriched neighborhoods" by percentile for each domain and scale to 0-10
     top_annotations["normalized_value"] = top_annotations.groupby("domain")[
-        "
+        "significant_neighborhood_enrichment_sums"
     ].transform(lambda x: (x.rank(pct=True) * 10).apply(np.ceil).astype(int))
-    #
-    top_annotations["
-        lambda row: " ".join([str(row["
+    # Modify the lambda function to pass both full_terms and significant_enrichment_score
+    top_annotations["combined_terms"] = top_annotations.apply(
+        lambda row: " ".join([str(row["full_terms"])] * row["normalized_value"]), axis=1
     )
 
-    #
-    domain_labels =
+    # Perform the groupby operation while retaining the other columns and adding the weighting with enrichment scores
+    domain_labels = (
+        top_annotations.groupby("domain")
+        .agg(
+            full_terms=("full_terms", lambda x: list(x)),
+            enrichment_scores=("significant_enrichment_score", lambda x: list(x)),
+        )
+        .reset_index()
+    )
+    domain_labels["combined_terms"] = domain_labels.apply(
+        lambda row: get_weighted_description(
+            pd.Series(row["full_terms"]), pd.Series(row["enrichment_scores"])
+        ),
+        axis=1,
+    )
+
+    # Rename the columns as necessary
     trimmed_domains_matrix = domain_labels.rename(
-        columns={
+        columns={
+            "domain": "id",
+            "combined_terms": "normalized_description",
+            "full_terms": "full_descriptions",
+            "enrichment_scores": "enrichment_scores",
+        }
     ).set_index("id")
 
     # Remove invalid domains
     valid_annotations = top_annotations[~top_annotations["domain"].isin(invalid_domain_ids)].drop(
         columns=["normalized_value"]
     )
-    valid_domains = domains[~domains["
+    valid_domains = domains[~domains["primary_domain"].isin(invalid_domain_ids)]
     valid_trimmed_domains_matrix = trimmed_domains_matrix[
         ~trimmed_domains_matrix.index.isin(invalid_domain_ids)
     ]
-
     return valid_annotations, valid_domains, valid_trimmed_domains_matrix
 
 
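In `trim_domains_and_top_annotations`, domain labels are now built by collecting each domain's terms and enrichment scores into lists and handing both to `get_weighted_description`. A small sketch of that aggregation pattern on a toy `top_annotations` frame (column names follow the diff; the toy values are invented):

```python
import pandas as pd
from risk.annotations import get_weighted_description

# Toy stand-in for top_annotations after domain assignment.
top_annotations = pd.DataFrame(
    {
        "domain": [1, 1, 2],
        "full_terms": ["kinase", "kinase activity", "transport"],
        "significant_enrichment_score": [12.0, 9.0, 3.0],
    }
)

# Group per domain, keeping terms and scores as lists, then label each domain
# with a score-weighted description.
domain_labels = (
    top_annotations.groupby("domain")
    .agg(
        full_terms=("full_terms", lambda x: list(x)),
        enrichment_scores=("significant_enrichment_score", lambda x: list(x)),
    )
    .reset_index()
)
domain_labels["combined_terms"] = domain_labels.apply(
    lambda row: get_weighted_description(
        pd.Series(row["full_terms"]), pd.Series(row["enrichment_scores"])
    ),
    axis=1,
)
```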
{risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/neighborhoods/neighborhoods.py

@@ -171,7 +171,7 @@ def process_neighborhoods(
 
     Args:
         network (nx.Graph): The network data structure used for imputing and pruning neighbors.
-        neighborhoods (Dict[str, Any]): Dictionary containing 'enrichment_matrix', '
+        neighborhoods (Dict[str, Any]): Dictionary containing 'enrichment_matrix', 'significant_binary_enrichment_matrix', and 'significant_enrichment_matrix'.
         impute_depth (int, optional): Depth for imputing neighbors. Defaults to 0.
         prune_threshold (float, optional): Distance threshold for pruning neighbors. Defaults to 0.0.
 
@@ -179,18 +179,18 @@ def process_neighborhoods(
         Dict[str, Any]: Processed neighborhoods data, including the updated matrices and enrichment counts.
     """
     enrichment_matrix = neighborhoods["enrichment_matrix"]
-
+    significant_binary_enrichment_matrix = neighborhoods["significant_binary_enrichment_matrix"]
     significant_enrichment_matrix = neighborhoods["significant_enrichment_matrix"]
     logger.debug(f"Imputation depth: {impute_depth}")
     if impute_depth:
         (
             enrichment_matrix,
-
+            significant_binary_enrichment_matrix,
             significant_enrichment_matrix,
         ) = _impute_neighbors(
             network,
             enrichment_matrix,
-
+            significant_binary_enrichment_matrix,
             max_depth=impute_depth,
         )
 
@@ -198,20 +198,20 @@ def process_neighborhoods(
     if prune_threshold:
         (
             enrichment_matrix,
-
+            significant_binary_enrichment_matrix,
             significant_enrichment_matrix,
         ) = _prune_neighbors(
             network,
             enrichment_matrix,
-
+            significant_binary_enrichment_matrix,
             distance_threshold=prune_threshold,
         )
 
-    neighborhood_enrichment_counts = np.sum(
+    neighborhood_enrichment_counts = np.sum(significant_binary_enrichment_matrix, axis=0)
     node_enrichment_sums = np.sum(enrichment_matrix, axis=1)
     return {
         "enrichment_matrix": enrichment_matrix,
-        "
+        "significant_binary_enrichment_matrix": significant_binary_enrichment_matrix,
         "significant_enrichment_matrix": significant_enrichment_matrix,
         "neighborhood_enrichment_counts": neighborhood_enrichment_counts,
         "node_enrichment_sums": node_enrichment_sums,
@@ -221,7 +221,7 @@ def process_neighborhoods(
 def _impute_neighbors(
     network: nx.Graph,
     enrichment_matrix: np.ndarray,
-
+    significant_binary_enrichment_matrix: np.ndarray,
     max_depth: int = 3,
 ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
     """Impute rows with sums of zero in the enrichment matrix based on the closest non-zero neighbors in the network graph.
@@ -229,7 +229,7 @@ def _impute_neighbors(
     Args:
         network (nx.Graph): The network graph with nodes having IDs matching the matrix indices.
         enrichment_matrix (np.ndarray): The enrichment matrix with rows to be imputed.
-
+        significant_binary_enrichment_matrix (np.ndarray): The alpha threshold matrix to be imputed similarly.
         max_depth (int): Maximum depth of nodes to traverse for imputing values.
 
     Returns:
@@ -239,19 +239,21 @@ def _impute_neighbors(
         - np.ndarray: The significant enrichment matrix with non-significant entries set to zero.
     """
     # Calculate the distance threshold value based on the shortest distances
-    enrichment_matrix,
-        network, enrichment_matrix,
+    enrichment_matrix, significant_binary_enrichment_matrix = _impute_neighbors_with_similarity(
+        network, enrichment_matrix, significant_binary_enrichment_matrix, max_depth=max_depth
     )
     # Create a matrix where non-significant entries are set to zero
-    significant_enrichment_matrix = np.where(
+    significant_enrichment_matrix = np.where(
+        significant_binary_enrichment_matrix == 1, enrichment_matrix, 0
+    )
 
-    return enrichment_matrix,
+    return enrichment_matrix, significant_binary_enrichment_matrix, significant_enrichment_matrix
 
 
 def _impute_neighbors_with_similarity(
     network: nx.Graph,
     enrichment_matrix: np.ndarray,
-
+    significant_binary_enrichment_matrix: np.ndarray,
     max_depth: int = 3,
 ) -> Tuple[np.ndarray, np.ndarray]:
     """Impute non-enriched nodes based on the closest enriched neighbors' profiles and their similarity.
@@ -259,7 +261,7 @@ def _impute_neighbors_with_similarity(
     Args:
         network (nx.Graph): The network graph with nodes having IDs matching the matrix indices.
         enrichment_matrix (np.ndarray): The enrichment matrix with rows to be imputed.
-
+        significant_binary_enrichment_matrix (np.ndarray): The alpha threshold matrix to be imputed similarly.
         max_depth (int): Maximum depth of nodes to traverse for imputing values.
 
     Returns:
@@ -268,27 +270,31 @@ def _impute_neighbors_with_similarity(
         - The imputed alpha threshold matrix.
     """
     depth = 1
-    rows_to_impute = np.where(
+    rows_to_impute = np.where(significant_binary_enrichment_matrix.sum(axis=1) == 0)[0]
     while len(rows_to_impute) and depth <= max_depth:
         # Iterate over all enriched nodes
-        for row_index in range(
-            if
-                enrichment_matrix,
-                    row_index,
+        for row_index in range(significant_binary_enrichment_matrix.shape[0]):
+            if significant_binary_enrichment_matrix[row_index].sum() != 0:
+                enrichment_matrix, significant_binary_enrichment_matrix = _process_node_imputation(
+                    row_index,
+                    network,
+                    enrichment_matrix,
+                    significant_binary_enrichment_matrix,
+                    depth,
                 )
 
         # Update rows to impute for the next iteration
-        rows_to_impute = np.where(
+        rows_to_impute = np.where(significant_binary_enrichment_matrix.sum(axis=1) == 0)[0]
         depth += 1
 
-    return enrichment_matrix,
+    return enrichment_matrix, significant_binary_enrichment_matrix
 
 
 def _process_node_imputation(
     row_index: int,
     network: nx.Graph,
     enrichment_matrix: np.ndarray,
-
+    significant_binary_enrichment_matrix: np.ndarray,
     depth: int,
 ) -> Tuple[np.ndarray, np.ndarray]:
     """Process the imputation for a single node based on its enriched neighbors.
@@ -297,7 +303,7 @@ def _process_node_imputation(
         row_index (int): The index of the enriched node being processed.
         network (nx.Graph): The network graph with nodes having IDs matching the matrix indices.
         enrichment_matrix (np.ndarray): The enrichment matrix with rows to be imputed.
-
+        significant_binary_enrichment_matrix (np.ndarray): The alpha threshold matrix to be imputed similarly.
        depth (int): Current depth for traversal.
 
     Returns:
@@ -310,7 +316,7 @@ def _process_node_imputation(
         n
         for n in neighbors
         if n != row_index
-        and
+        and significant_binary_enrichment_matrix[n].sum() != 0
         and enrichment_matrix[n].sum() != 0
     ]
     # Filter non-enriched neighbors
@@ -318,7 +324,7 @@ def _process_node_imputation(
         n
         for n in neighbors
        if n != row_index
-        and
+        and significant_binary_enrichment_matrix[n].sum() == 0
         and enrichment_matrix[n].sum() == 0
     ]
     # If there are valid non-enriched neighbors
@@ -363,15 +369,17 @@ def _process_node_imputation(
         enrichment_matrix[most_similar_neighbor] = enrichment_matrix[row_index] / np.sqrt(
             depth + 1
         )
-
+        significant_binary_enrichment_matrix[most_similar_neighbor] = (
+            significant_binary_enrichment_matrix[row_index]
+        )
 
-    return enrichment_matrix,
+    return enrichment_matrix, significant_binary_enrichment_matrix
 
 
 def _prune_neighbors(
     network: nx.Graph,
     enrichment_matrix: np.ndarray,
-
+    significant_binary_enrichment_matrix: np.ndarray,
     distance_threshold: float = 0.9,
 ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
     """Remove outliers based on their rank for edge lengths.
@@ -379,7 +387,7 @@ def _prune_neighbors(
     Args:
         network (nx.Graph): The network graph with nodes having IDs matching the matrix indices.
         enrichment_matrix (np.ndarray): The enrichment matrix.
-
+        significant_binary_enrichment_matrix (np.ndarray): The alpha threshold matrix.
         distance_threshold (float): Rank threshold (0 to 1) to determine outliers.
 
     Returns:
@@ -389,10 +397,12 @@ def _prune_neighbors(
         - np.ndarray: The significant enrichment matrix, where non-significant entries are set to zero.
     """
     # Identify indices with non-zero rows in the binary enrichment matrix
-    non_zero_indices = np.where(
+    non_zero_indices = np.where(significant_binary_enrichment_matrix.sum(axis=1) != 0)[0]
     median_distances = []
     for node in non_zero_indices:
-        neighbors = [
+        neighbors = [
+            n for n in network.neighbors(node) if significant_binary_enrichment_matrix[n].sum() != 0
+        ]
         if neighbors:
             median_distance = np.median(
                 [_get_euclidean_distance(node, n, network) for n in neighbors]
@@ -404,7 +414,9 @@ def _prune_neighbors(
     # Prune nodes that are outliers based on the distance threshold
     for row_index in non_zero_indices:
         neighbors = [
-            n
+            n
+            for n in network.neighbors(row_index)
+            if significant_binary_enrichment_matrix[n].sum() != 0
        ]
         if neighbors:
             median_distance = np.median(
@@ -412,12 +424,14 @@ def _prune_neighbors(
             )
         if median_distance >= distance_threshold_value:
             enrichment_matrix[row_index] = 0
-
+            significant_binary_enrichment_matrix[row_index] = 0
 
     # Create a matrix where non-significant entries are set to zero
-    significant_enrichment_matrix = np.where(
+    significant_enrichment_matrix = np.where(
+        significant_binary_enrichment_matrix == 1, enrichment_matrix, 0
+    )
 
-    return enrichment_matrix,
+    return enrichment_matrix, significant_binary_enrichment_matrix, significant_enrichment_matrix
 
 
 def _get_euclidean_distance(node1: Any, node2: Any, network: nx.Graph) -> float:
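Throughout the imputation and pruning helpers above, the significant enrichment matrix is rebuilt with the same masking pattern whenever the binary matrix changes. A tiny sketch of that pattern on invented 3 x 2 matrices:

```python
import numpy as np

# Toy matrices: 3 neighborhoods x 2 annotations.
enrichment_matrix = np.array([[2.5, 0.4], [0.0, 1.8], [0.7, 0.0]])
significant_binary_enrichment_matrix = np.array([[1, 0], [0, 1], [0, 0]])

# Keep only entries that passed the alpha threshold; zero out everything else.
significant_enrichment_matrix = np.where(
    significant_binary_enrichment_matrix == 1, enrichment_matrix, 0
)
# -> [[2.5, 0.0], [0.0, 1.8], [0.0, 0.0]]
```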
{risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/network/graph.py

@@ -45,6 +45,10 @@ class NetworkGraph:
         self.domain_id_to_domain_terms_map = self._create_domain_id_to_domain_terms_map(
             trimmed_domains
         )
+        self.domain_id_to_domain_info_map = self._create_domain_id_to_domain_info_map(
+            trimmed_domains
+        )
+        self.trimmed_domains = trimmed_domains
         self.node_enrichment_sums = node_enrichment_sums
         self.node_id_to_domain_ids_and_enrichments_map = (
             self._create_node_id_to_domain_ids_and_enrichments(domains)
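The new `domain_id_to_domain_info_map` attribute (built by `_create_domain_id_to_domain_info_map`, added further down in graph.py) keys each domain ID to that domain's full descriptions and enrichment scores. A rough sketch of the resulting structure, with invented domain IDs and terms:

```python
# Hypothetical contents; the real map is derived from the trimmed_domains DataFrame.
domain_id_to_domain_info_map = {
    1: {
        "full_descriptions": ["kinase", "kinase activity"],
        "enrichment_scores": [12.0, 9.0],
    },
    2: {
        "full_descriptions": ["transport"],
        "enrichment_scores": [3.0],
    },
}
```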
@@ -60,7 +64,8 @@ class NetworkGraph:
         self.network = _unfold_sphere_to_plane(network)
         self.node_coordinates = _extract_node_coordinates(self.network)
 
-
+    @staticmethod
+    def _create_domain_id_to_node_ids_map(domains: pd.DataFrame) -> Dict[int, Any]:
         """Create a mapping from domains to the list of node IDs belonging to each domain.
 
         Args:
@@ -69,17 +74,16 @@ class NetworkGraph:
         Returns:
             Dict[int, Any]: A dictionary where keys are domain IDs and values are lists of node IDs belonging to each domain.
         """
-        cleaned_domains_matrix = domains.reset_index()[["index", "
-        node_to_domains_map = cleaned_domains_matrix.set_index("index")["
+        cleaned_domains_matrix = domains.reset_index()[["index", "primary_domain"]]
+        node_to_domains_map = cleaned_domains_matrix.set_index("index")["primary_domain"].to_dict()
         domain_id_to_node_ids_map = defaultdict(list)
         for k, v in node_to_domains_map.items():
             domain_id_to_node_ids_map[v].append(k)
 
         return domain_id_to_node_ids_map
 
-
-
-    ) -> Dict[int, Any]:
+    @staticmethod
+    def _create_domain_id_to_domain_terms_map(trimmed_domains: pd.DataFrame) -> Dict[int, Any]:
         """Create a mapping from domain IDs to their corresponding terms.
 
         Args:
@@ -91,13 +95,32 @@ class NetworkGraph:
         return dict(
             zip(
                 trimmed_domains.index,
-                trimmed_domains["
+                trimmed_domains["normalized_description"],
             )
         )
 
-
-
-
+    @staticmethod
+    def _create_domain_id_to_domain_info_map(
+        trimmed_domains: pd.DataFrame,
+    ) -> Dict[int, Dict[str, Any]]:
+        """Create a mapping from domain IDs to their corresponding full description and enrichment score.
+
+        Args:
+            trimmed_domains (pd.DataFrame): DataFrame containing domain IDs, full descriptions, and enrichment scores.
+
+        Returns:
+            Dict[int, Dict[str, Any]]: A dictionary mapping domain IDs (int) to a dictionary with 'full_descriptions' and 'enrichment_scores'.
+        """
+        return {
+            int(id_): {
+                "full_descriptions": trimmed_domains.at[id_, "full_descriptions"],
+                "enrichment_scores": trimmed_domains.at[id_, "enrichment_scores"],
+            }
+            for id_ in trimmed_domains.index
+        }
+
+    @staticmethod
+    def _create_node_id_to_domain_ids_and_enrichments(domains: pd.DataFrame) -> Dict[int, Dict]:
         """Creates a dictionary mapping each node ID to its corresponding domain IDs and enrichment values.
 
         Args: