risk-network 0.0.3b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
risk/__init__.py ADDED
@@ -0,0 +1,13 @@
+ """
+ risk
+ ~~~~
+
+ risk
+ ~~~~
+
+ RISK: RISK Infers Spatial Kinship
+ """
+
+ from risk.risk import RISK
+
+ __version__ = "0.0.3-beta.1"
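
For orientation, a minimal sketch of the surface this top-level module exposes once the wheel is installed (the RISK constructor lives in risk/risk.py, which is not part of this section, so its arguments are not shown):

    import risk
    from risk import RISK  # re-exported from risk.risk

    print(risk.__version__)  # "0.0.3-beta.1"
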
risk/annotations/__init__.py ADDED
@@ -0,0 +1,7 @@
+ """
+ risk/annotations
+ ~~~~~~~~~~~~~~~~
+ """
+
+ from .annotations import define_top_annotations, get_description
+ from .io import AnnotationsIO
risk/annotations/annotations.py ADDED
@@ -0,0 +1,259 @@
+ """
+ risk/annotations/annotations
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ """
+
+ from collections import Counter
+ from itertools import compress, permutations
+ from typing import Any, Dict, List, Set
+
+ import networkx as nx
+ import nltk
+ import numpy as np
+ import pandas as pd
+ from nltk.tokenize import word_tokenize
+ from nltk.corpus import stopwords
+
+
+ def _setup_nltk():
+     """Ensure necessary NLTK data is downloaded."""
+     try:
+         nltk.data.find("tokenizers/punkt")
+     except LookupError:
+         nltk.download("punkt")
+
+     try:
+         nltk.data.find("corpora/stopwords")
+     except LookupError:
+         nltk.download("stopwords")
+
+
+ # Ensure the necessary NLTK data is available at import time
+ _setup_nltk()
+
+
+ def load_annotations(network: nx.Graph, annotations_input: Dict[str, Any]) -> Dict[str, Any]:
+     """Convert annotations input to a DataFrame and reindex based on the network's node labels.
+
+     Args:
+         annotations_input (dict): A dictionary mapping each annotation term to the list of nodes it covers.
+
+     Returns:
+         dict: A dictionary containing ordered nodes, ordered annotations, and the annotations matrix.
+     """
+     # Flatten the dictionary to a list of tuples for easier DataFrame creation
+     flattened_annotations = [
+         (node, annotation) for annotation, nodes in annotations_input.items() for node in nodes
+     ]
+     # Create a DataFrame from the flattened list
+     annotations = pd.DataFrame(flattened_annotations, columns=["Node", "Annotations"])
+     annotations["Is Member"] = 1
+     # Pivot to create a binary matrix with nodes as rows and annotations as columns
+     annotations_pivot = annotations.pivot_table(
+         index="Node", columns="Annotations", values="Is Member", fill_value=0, dropna=False
+     )
+     # Reindex the annotations matrix based on the node labels from the network
+     node_label_order = list(nx.get_node_attributes(network, "label").values())
+     annotations_pivot = annotations_pivot.reindex(index=node_label_order)
+     # Raise an error if no valid annotations are found for the nodes in the network
+     if annotations_pivot.notnull().sum().sum() == 0:
+         raise ValueError(
+             "No annotations found in the annotations file for the nodes in the network."
+         )
+
+     # Remove columns with all zeros to improve performance
+     annotations_pivot = annotations_pivot.loc[:, annotations_pivot.sum(axis=0) != 0]
+     # Extract ordered nodes and annotations
+     ordered_nodes = tuple(annotations_pivot.index)
+     ordered_annotations = tuple(annotations_pivot.columns)
+     annotations_pivot_numpy = annotations_pivot.fillna(0).to_numpy()
+
+     return {
+         "ordered_nodes": ordered_nodes,
+         "ordered_annotations": ordered_annotations,
+         "matrix": annotations_pivot_numpy,
+     }
+
+
+ def define_top_annotations(
+     network: nx.Graph,
+     ordered_annotation_labels: List[str],
+     neighborhood_enrichment_sums: List[int],
+     binary_enrichment_matrix: np.ndarray,
+     min_cluster_size: int = 5,
+     max_cluster_size: int = 1000,
+ ) -> pd.DataFrame:
+     """Define top annotations based on neighborhood enrichment sums and binary enrichment matrix.
+
+     Args:
+         network (NetworkX graph): The network graph.
+         ordered_annotation_labels (list of str): List of ordered annotation labels.
+         neighborhood_enrichment_sums (list of int): List of neighborhood enrichment sums.
+         binary_enrichment_matrix (np.ndarray): Binary enrichment matrix below alpha threshold.
+         min_cluster_size (int, optional): Minimum cluster size. Defaults to 5.
+         max_cluster_size (int, optional): Maximum cluster size. Defaults to 1000.
+
+     Returns:
+         pd.DataFrame: DataFrame with top annotations and their properties.
+     """
+     # Create DataFrame to store annotations and their neighborhood enrichment sums
+     annotations_enrichment_matrix = pd.DataFrame(
+         {
+             "id": range(len(ordered_annotation_labels)),
+             "words": ordered_annotation_labels,
+             "neighborhood enrichment sums": neighborhood_enrichment_sums,
+         }
+     )
+     annotations_enrichment_matrix["top attributes"] = False
+     # Apply size constraints to identify potential top attributes
+     annotations_enrichment_matrix.loc[
+         (annotations_enrichment_matrix["neighborhood enrichment sums"] >= min_cluster_size)
+         & (annotations_enrichment_matrix["neighborhood enrichment sums"] <= max_cluster_size),
+         "top attributes",
+     ] = True
+     # Initialize columns for connected components analysis
+     annotations_enrichment_matrix["num connected components"] = 0
+     annotations_enrichment_matrix["size connected components"] = None
+     annotations_enrichment_matrix["size connected components"] = annotations_enrichment_matrix[
+         "size connected components"
+     ].astype(object)
+     annotations_enrichment_matrix["num large connected components"] = 0
+
+     for attribute in annotations_enrichment_matrix.index.values[
+         annotations_enrichment_matrix["top attributes"]
+     ]:
+         # Identify enriched neighborhoods based on the binary enrichment matrix
+         enriched_neighborhoods = list(
+             compress(list(network), binary_enrichment_matrix[:, attribute])
+         )
+         enriched_network = nx.subgraph(network, enriched_neighborhoods)
+         # Analyze connected components within the enriched subnetwork
+         connected_components = sorted(
+             nx.connected_components(enriched_network), key=len, reverse=True
+         )
+         size_connected_components = np.array([len(c) for c in connected_components])
+         num_connected_components = len(connected_components)
+         num_large_connected_components = np.sum(
+             np.logical_and(
+                 size_connected_components >= min_cluster_size,
+                 size_connected_components <= max_cluster_size,
+             )
+         )
+         annotations_enrichment_matrix.loc[attribute, "num connected components"] = (
+             num_connected_components
+         )
+         annotations_enrichment_matrix.at[attribute, "size connected components"] = (
+             size_connected_components
+         )
+         annotations_enrichment_matrix.loc[attribute, "num large connected components"] = (
+             num_large_connected_components
+         )
+
+     # Filter out attributes with more than one connected component
+     annotations_enrichment_matrix.loc[
+         annotations_enrichment_matrix["num connected components"] > 1, "top attributes"
+     ] = False
+
+     return annotations_enrichment_matrix
+
+
+ def get_description(words_column: pd.Series) -> str:
+     """Process the input Series to identify and return the most frequent, significant words,
+     filtering based on stopwords and similarity (Jaccard index).
+
+     Args:
+         words_column (pd.Series): A pandas Series containing strings to process.
+
+     Returns:
+         str: A coherent description formed from the most frequent and significant words.
+     """
+     # Define stopwords
+     stop_words = set(stopwords.words("english"))
+     # Tokenize the concatenated string and filter out stopwords and non-alphabetic words
+     words = [
+         word.lower()
+         for word in word_tokenize(words_column.str.cat(sep=" "))
+         if word.isalpha() and word.lower() not in stop_words
+     ]
+     # Simplify the word list to remove similar words based on the Jaccard index and generate coherent description
+     simplified_words = _simplify_word_list(words, threshold=0.90)
+     description = _generate_coherent_description(simplified_words)
+     return description
+
+
+ def _simplify_word_list(words: List[str], threshold: float = 0.80) -> List[str]:
+     """Filter out words that are too similar based on the Jaccard index, keeping the word with the higher count.
+
+     Args:
+         words (list of str): The list of words to be filtered.
+         threshold (float, optional): The similarity threshold for the Jaccard index. Defaults to 0.80.
+
+     Returns:
+         list of str: A list of filtered words, where similar words are reduced to the most frequent one.
+     """
+     # Count the occurrences of each word
+     word_counts = Counter(words)
+     filtered_words = []
+     used_words = set()
+
+     for word in word_counts:
+         if word in used_words:
+             continue
+
+         word_set = set(word)  # words are compared as sets of characters
+         # Find similar words based on the Jaccard index
+         similar_words = [
+             other_word
+             for other_word in word_counts
+             if _jaccard_index(word_set, set(other_word)) >= threshold
+         ]
+         # Sort by frequency and choose the most frequent word
+         similar_words.sort(key=lambda w: word_counts[w], reverse=True)
+         best_word = similar_words[0]
+         filtered_words.append(best_word)
+         used_words.update(similar_words)
+
+     final_words = [word for word in words if word in filtered_words]
+
+     return final_words
+
+
+ def _jaccard_index(set1: Set[Any], set2: Set[Any]) -> float:
+     """Calculate the Jaccard Index of two sets.
+
+     Args:
+         set1 (set): The first set for comparison.
+         set2 (set): The second set for comparison.
+
+     Returns:
+         float: The Jaccard Index, which is the ratio of the intersection to the union of the two sets.
+             Returns 0 if the union of the sets is empty.
+     """
+     intersection = len(set1.intersection(set2))
+     union = len(set1.union(set2))
+     return intersection / union if union else 0
+
+
+ def _generate_coherent_description(words: List[str]) -> str:
+     """Generate a coherent description from a list of words.
+
+     Args:
+         words (list of str): A list of words from which to generate the description.
+
+     Returns:
+         str: A coherent description formed by arranging the words in a logical sequence.
+     """
+     # Count the frequency of each word
+     word_counts = Counter(words)
+     # Get the most common words
+     most_common_words = [word for word, _ in word_counts.most_common()]
+     # Filter out common stopwords
+     stop_words = set(stopwords.words("english"))
+     filtered_words = [word for word in most_common_words if word.lower() not in stop_words]
+     # Generate permutations of the filtered words to find a logical order
+     perm = permutations(filtered_words)
+     # Assume the first permutation as the logical sequence (since they're all equally likely without additional context)
+     logical_sequence = next(perm)
+     # Join the words to form a coherent description
+     description = " ".join(logical_sequence)
+     return description
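
As an illustrative sketch of how these pieces fit together (the graph, node names, and annotation terms below are invented for this example; importing the module may also trigger a one-time NLTK download via _setup_nltk):

    import networkx as nx
    import pandas as pd

    from risk.annotations.annotations import load_annotations, get_description

    # A toy network whose nodes carry the "label" attribute that load_annotations reindexes on
    G = nx.path_graph(3)
    nx.set_node_attributes(G, {0: "geneA", 1: "geneB", 2: "geneC"}, name="label")

    # Annotation term -> member node labels (the same shape the JSON loader below produces)
    annotations_input = {
        "vesicle transport": ["geneA", "geneB"],
        "ion transport": ["geneC"],
    }

    result = load_annotations(G, annotations_input)
    print(result["ordered_nodes"])   # ('geneA', 'geneB', 'geneC')
    print(result["matrix"].shape)    # (3, 2) -- nodes x annotation terms

    # get_description condenses a Series of term strings into a short word-based label
    print(get_description(pd.Series(["vesicle transport", "ion transport"])))
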
risk/annotations/io.py ADDED
@@ -0,0 +1,183 @@
+ """
+ risk/annotations/io
+ ~~~~~~~~~~~~~~~~~~~
+
+ This file contains the code for loading annotations from JSON, CSV, TSV, and Excel files.
+ """
+
+ import json
+ from typing import Any, Dict
+
+ import networkx as nx
+ import pandas as pd
+
+ from risk.annotations.annotations import load_annotations
+ from risk.log import params, print_header
+
+
+ class AnnotationsIO:
+     """Handles the loading of annotations from various file formats.
+
+     The AnnotationsIO class provides methods to load annotations from different file types (JSON, CSV,
+     TSV, and Excel) and associate them with the nodes of a NetworkX graph.
+     """
+
+     def __init__(self):
+         pass
+
+     def load_json_annotations(self, filepath: str, network: nx.Graph) -> Dict[str, Any]:
+         """Load annotations from a JSON file and convert them to a DataFrame.
+
+         Args:
+             filepath (str): Path to the JSON annotations file.
+             network (NetworkX graph): The network to which the annotations are related.
+
+         Returns:
+             dict: A dictionary containing ordered nodes, ordered annotations, and the annotations matrix.
+         """
+         filetype = "JSON"
+         params.log_annotations(filepath=filepath, filetype=filetype)
+         _log_loading(filetype, filepath=filepath)
+         # Open and read the JSON file
+         with open(filepath, "r") as file:
+             annotations_input = json.load(file)
+
+         # Process the JSON data and return it in the context of the network
+         return load_annotations(network, annotations_input)
+
+     def load_excel_annotation(
+         self,
+         filepath: str,
+         network: nx.Graph,
+         label_colname: str = "label",
+         nodes_colname: str = "nodes",
+         sheet_name: str = "Sheet1",
+         nodes_delimiter: str = ";",
+     ) -> Dict[str, Any]:
+         """Load annotations from an Excel file and associate them with the network.
+
+         Args:
+             filepath (str): Path to the Excel annotations file.
+             network (nx.Graph): The NetworkX graph to which the annotations are related.
+             label_colname (str): Name of the column containing the labels (e.g., GO terms).
+             nodes_colname (str): Name of the column containing the nodes associated with each label.
+             sheet_name (str, optional): The name of the Excel sheet to load (default is 'Sheet1').
+             nodes_delimiter (str, optional): Delimiter used to separate multiple nodes within the nodes column (default is ';').
+
+         Returns:
+             Dict[str, Any]: A dictionary where each label is paired with its respective list of nodes,
+                 linked to the provided network.
+         """
+         filetype = "Excel"
+         params.log_annotations(filepath=filepath, filetype=filetype)
+         _log_loading(filetype, filepath=filepath)
+         # Load the specified sheet from the Excel file
+         df = pd.read_excel(filepath, sheet_name=sheet_name)
+         # Split the nodes column by the specified nodes_delimiter
+         df[nodes_colname] = df[nodes_colname].apply(lambda x: x.split(nodes_delimiter))
+         # Convert the DataFrame to a dictionary pairing labels with their corresponding nodes
+         label_node_dict = df.set_index(label_colname)[nodes_colname].to_dict()
+         return load_annotations(network, label_node_dict)
+
+     def load_csv_annotation(
+         self,
+         filepath: str,
+         network: nx.Graph,
+         label_colname: str = "label",
+         nodes_colname: str = "nodes",
+         nodes_delimiter: str = ";",
+     ) -> Dict[str, Any]:
+         """Load annotations from a CSV file and associate them with the network.
+
+         Args:
+             filepath (str): Path to the CSV annotations file.
+             network (nx.Graph): The NetworkX graph to which the annotations are related.
+             label_colname (str): Name of the column containing the labels (e.g., GO terms).
+             nodes_colname (str): Name of the column containing the nodes associated with each label.
+             nodes_delimiter (str, optional): Delimiter used to separate multiple nodes within the nodes column (default is ';').
+
+         Returns:
+             Dict[str, Any]: A dictionary where each label is paired with its respective list of nodes,
+                 linked to the provided network.
+         """
+         filetype = "CSV"
+         params.log_annotations(filepath=filepath, filetype=filetype)
+         _log_loading(filetype, filepath=filepath)
+         # Load the CSV file into a dictionary
+         annotations_input = _load_matrix_file(
+             filepath, label_colname, nodes_colname, delimiter=",", nodes_delimiter=nodes_delimiter
+         )
+         # Process and return the annotations in the context of the network
+         return load_annotations(network, annotations_input)
+
+     def load_tsv_annotation(
+         self,
+         filepath: str,
+         network: nx.Graph,
+         label_colname: str = "label",
+         nodes_colname: str = "nodes",
+         nodes_delimiter: str = ";",
+     ) -> Dict[str, Any]:
+         """Load annotations from a TSV file and associate them with the network.
+
+         Args:
+             filepath (str): Path to the TSV annotations file.
+             network (nx.Graph): The NetworkX graph to which the annotations are related.
+             label_colname (str): Name of the column containing the labels (e.g., GO terms).
+             nodes_colname (str): Name of the column containing the nodes associated with each label.
+             nodes_delimiter (str, optional): Delimiter used to separate multiple nodes within the nodes column (default is ';').
+
+         Returns:
+             Dict[str, Any]: A dictionary where each label is paired with its respective list of nodes,
+                 linked to the provided network.
+         """
+         filetype = "TSV"
+         params.log_annotations(filepath=filepath, filetype=filetype)
+         _log_loading(filetype, filepath=filepath)
+         # Load the TSV file into a dictionary
+         annotations_input = _load_matrix_file(
+             filepath, label_colname, nodes_colname, delimiter="\t", nodes_delimiter=nodes_delimiter
+         )
+         # Process and return the annotations in the context of the network
+         return load_annotations(network, annotations_input)
+
+
+ def _load_matrix_file(
+     filepath: str,
+     label_colname: str,
+     nodes_colname: str,
+     delimiter: str = ",",
+     nodes_delimiter: str = ";",
+ ) -> Dict[str, Any]:
+     """Load annotations from a CSV or TSV file and convert them to a dictionary.
+
+     Args:
+         filepath (str): Path to the annotation file.
+         label_colname (str): Name of the column containing the labels (e.g., GO terms).
+         nodes_colname (str): Name of the column containing the nodes associated with each label.
+         delimiter (str, optional): Delimiter used to separate columns in the file (default is ',').
+         nodes_delimiter (str, optional): Delimiter used to separate multiple nodes within the nodes column (default is ';').
+
+     Returns:
+         Dict[str, Any]: A dictionary where each label is paired with its respective list of nodes.
+     """
+     # Load the CSV or TSV file into a DataFrame
+     df = pd.read_csv(filepath, delimiter=delimiter)
+     # Split the nodes column by the nodes_delimiter to handle multiple nodes per label
+     df[nodes_colname] = df[nodes_colname].apply(lambda x: x.split(nodes_delimiter))
+     # Create a dictionary pairing labels with their corresponding list of nodes
+     label_node_dict = df.set_index(label_colname)[nodes_colname].to_dict()
+     return label_node_dict
+
+
+ def _log_loading(filetype: str, filepath: str = "") -> None:
+     """Log information about the annotations file being loaded.
+
+     Args:
+         filetype (str): The type of the file being loaded (e.g., 'JSON', 'CSV').
+         filepath (str, optional): The path to the file being loaded.
+     """
+     print_header("Loading annotations")
+     print(f"Filetype: {filetype}")
+     if filepath:
+         print(f"Filepath: {filepath}")
risk/constants.py ADDED
@@ -0,0 +1,31 @@
+ """
+ risk/constants
+ ~~~~~~~~~~~~~~
+ """
+
+ GROUP_DISTANCE_METRICS = [
+     "braycurtis",
+     "canberra",
+     "chebyshev",
+     "cityblock",
+     "correlation",
+     "cosine",
+     "dice",
+     "euclidean",
+     "hamming",
+     "jaccard",
+     "jensenshannon",
+     "kulczynski1",
+     "mahalanobis",
+     "matching",
+     "minkowski",
+     "rogerstanimoto",
+     "russellrao",
+     "seuclidean",
+     "sokalmichener",
+     "sokalsneath",
+     "sqeuclidean",
+     "yule",
+ ]
+
+ GROUP_LINKAGE_METHODS = ["single", "complete", "average", "weighted", "centroid", "median", "ward"]
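
These names match the distance metrics accepted by scipy.spatial.distance and the linkage methods accepted by scipy.cluster.hierarchy.linkage. How the package consumes them is not shown in this diff, but a plausible (assumed) use is validating user input before clustering:

    from risk.constants import GROUP_DISTANCE_METRICS, GROUP_LINKAGE_METHODS

    # Hypothetical helper, not part of the package
    def validate_clustering_options(metric: str, method: str) -> None:
        """Raise early if a metric or linkage method is not one of the supported names."""
        if metric not in GROUP_DISTANCE_METRICS:
            raise ValueError(f"Unsupported distance metric: {metric!r}")
        if method not in GROUP_LINKAGE_METHODS:
            raise ValueError(f"Unsupported linkage method: {method!r}")

    validate_clustering_options("euclidean", "ward")  # passes silently
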
risk/log/__init__.py ADDED
@@ -0,0 +1,9 @@
+ """
+ risk/log
+ ~~~~~~~~
+ """
+
+ from .console import print_header
+ from .params import Params
+
+ params = Params()
risk/log/console.py ADDED
@@ -0,0 +1,16 @@
+ """
+ risk/log/console
+ ~~~~~~~~~~~~~~~~
+ """
+
+
+ def print_header(input_string: str) -> None:
+     """Print the input string as a header with a line of dashes above and below it.
+
+     Args:
+         input_string (str): The string to be printed as a header.
+     """
+     border = "-" * len(input_string)
+     print(border)
+     print(input_string)
+     print(border)
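
For reference, what the helper prints when called as it is in risk/annotations/io.py:

    from risk.log import print_header

    print_header("Loading annotations")
    # -------------------
    # Loading annotations
    # -------------------
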