risk-network 0.0.12b0__py3-none-any.whl → 0.0.12b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- risk/__init__.py +1 -1
- risk/annotations/__init__.py +10 -0
- risk/annotations/annotations.py +354 -0
- risk/annotations/io.py +241 -0
- risk/annotations/nltk_setup.py +86 -0
- risk/log/__init__.py +11 -0
- risk/log/console.py +141 -0
- risk/log/parameters.py +171 -0
- risk/neighborhoods/__init__.py +7 -0
- risk/neighborhoods/api.py +442 -0
- risk/neighborhoods/community.py +441 -0
- risk/neighborhoods/domains.py +360 -0
- risk/neighborhoods/neighborhoods.py +514 -0
- risk/neighborhoods/stats/__init__.py +13 -0
- risk/neighborhoods/stats/permutation/__init__.py +6 -0
- risk/neighborhoods/stats/permutation/permutation.py +240 -0
- risk/neighborhoods/stats/permutation/test_functions.py +70 -0
- risk/neighborhoods/stats/tests.py +275 -0
- risk/network/__init__.py +4 -0
- risk/network/graph/__init__.py +4 -0
- risk/network/graph/api.py +200 -0
- risk/network/graph/graph.py +268 -0
- risk/network/graph/stats.py +166 -0
- risk/network/graph/summary.py +253 -0
- risk/network/io.py +693 -0
- risk/network/plotter/__init__.py +4 -0
- risk/network/plotter/api.py +54 -0
- risk/network/plotter/canvas.py +291 -0
- risk/network/plotter/contour.py +329 -0
- risk/network/plotter/labels.py +935 -0
- risk/network/plotter/network.py +294 -0
- risk/network/plotter/plotter.py +141 -0
- risk/network/plotter/utils/colors.py +419 -0
- risk/network/plotter/utils/layout.py +94 -0
- risk_network-0.0.12b1.dist-info/METADATA +122 -0
- risk_network-0.0.12b1.dist-info/RECORD +40 -0
- {risk_network-0.0.12b0.dist-info → risk_network-0.0.12b1.dist-info}/WHEEL +1 -1
- risk_network-0.0.12b0.dist-info/METADATA +0 -796
- risk_network-0.0.12b0.dist-info/RECORD +0 -7
- {risk_network-0.0.12b0.dist-info → risk_network-0.0.12b1.dist-info}/licenses/LICENSE +0 -0
- {risk_network-0.0.12b0.dist-info → risk_network-0.0.12b1.dist-info}/top_level.txt +0 -0
risk/annotations/annotations.py
ADDED
@@ -0,0 +1,354 @@
"""
risk/annotations/annotations
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
"""

import re
from collections import Counter
from itertools import compress
from typing import Any, Dict, List, Set

import networkx as nx
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from scipy.sparse import coo_matrix

from risk.annotations.nltk_setup import setup_nltk_resources
from risk.log import logger


def initialize_nltk():
    """Initialize all required NLTK components."""
    setup_nltk_resources()

    # After resources are available, initialize the components
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer

    global STOP_WORDS, LEMMATIZER
    STOP_WORDS = set(stopwords.words("english"))
    LEMMATIZER = WordNetLemmatizer()


# Initialize NLTK components
initialize_nltk()


def load_annotations(
    network: nx.Graph, annotations_input: Dict[str, Any], min_nodes_per_term: int = 2
) -> Dict[str, Any]:
    """Convert annotations input to a sparse matrix and reindex based on the network's node labels.

    Args:
        network (nx.Graph): The network graph.
        annotations_input (Dict[str, Any]): A dictionary with annotations.
        min_nodes_per_term (int, optional): The minimum number of network nodes required for each annotation
            term to be included. Defaults to 2.

    Returns:
        Dict[str, Any]: A dictionary containing ordered nodes, ordered annotations, and the sparse binary annotations
            matrix.

    Raises:
        ValueError: If no annotations are found for the nodes in the network.
        ValueError: If no annotations have at least min_nodes_per_term nodes in the network.
    """
    # Step 1: Map nodes and annotations to indices
    node_label_order = [attr["label"] for _, attr in network.nodes(data=True) if "label" in attr]
    node_to_idx = {node: i for i, node in enumerate(node_label_order)}
    annotation_to_idx = {annotation: i for i, annotation in enumerate(annotations_input)}
    # Step 2: Construct a sparse binary matrix directly
    row = []
    col = []
    data = []
    for annotation, nodes in annotations_input.items():
        for node in nodes:
            if node in node_to_idx and annotation in annotation_to_idx:
                row.append(node_to_idx[node])
                col.append(annotation_to_idx[annotation])
                data.append(1)

    # Create a sparse binary matrix
    num_nodes = len(node_to_idx)
    num_annotations = len(annotation_to_idx)
    annotations_pivot = coo_matrix((data, (row, col)), shape=(num_nodes, num_annotations)).tocsr()
    # Step 3: Filter out annotations with fewer than min_nodes_per_term occurrences
    valid_annotations = annotations_pivot.sum(axis=0).A1 >= min_nodes_per_term
    annotations_pivot = annotations_pivot[:, valid_annotations]
    # Step 4: Raise errors for empty matrices
    if annotations_pivot.nnz == 0:
        raise ValueError("No terms found in the annotation file for the nodes in the network.")

    num_remaining_annotations = annotations_pivot.shape[1]
    if num_remaining_annotations == 0:
        raise ValueError(
            f"No annotation terms found with at least {min_nodes_per_term} nodes in the network."
        )

    # Step 5: Extract ordered nodes and annotations
    ordered_nodes = tuple(node_label_order)
    ordered_annotations = tuple(
        annotation for annotation, is_valid in zip(annotation_to_idx, valid_annotations) if is_valid
    )

    # Log the filtering details
    logger.info(f"Minimum number of nodes per annotation term: {min_nodes_per_term}")
    logger.info(f"Number of input annotation terms: {num_annotations}")
    logger.info(f"Number of remaining annotation terms: {num_remaining_annotations}")

    return {
        "ordered_nodes": ordered_nodes,
        "ordered_annotations": ordered_annotations,
        "matrix": annotations_pivot,
    }


def define_top_annotations(
    network: nx.Graph,
    ordered_annotation_labels: List[str],
    neighborhood_significance_sums: List[int],
    significant_significance_matrix: np.ndarray,
    significant_binary_significance_matrix: np.ndarray,
    min_cluster_size: int = 5,
    max_cluster_size: int = 1000,
) -> pd.DataFrame:
    """Define top annotations based on neighborhood significance sums and binary significance matrix.

    Args:
        network (NetworkX graph): The network graph.
        ordered_annotation_labels (list of str): List of ordered annotation labels.
        neighborhood_significance_sums (list of int): List of neighborhood significance sums.
        significant_significance_matrix (np.ndarray): Enrichment matrix below alpha threshold.
        significant_binary_significance_matrix (np.ndarray): Binary significance matrix below alpha threshold.
        min_cluster_size (int, optional): Minimum cluster size. Defaults to 5.
        max_cluster_size (int, optional): Maximum cluster size. Defaults to 1000.

    Returns:
        pd.DataFrame: DataFrame with top annotations and their properties.
    """
    # Sum the columns of the significant significance matrix (positive floating point values)
    significant_significance_scores = significant_significance_matrix.sum(axis=0)
    # Create DataFrame to store annotations, their neighborhood significance sums, and significance scores
    annotations_significance_matrix = pd.DataFrame(
        {
            "id": range(len(ordered_annotation_labels)),
            "full_terms": ordered_annotation_labels,
            "significant_neighborhood_significance_sums": neighborhood_significance_sums,
            "significant_significance_score": significant_significance_scores,
        }
    )
    annotations_significance_matrix["significant_annotations"] = False
    # Apply size constraints to identify potential significant annotations
    annotations_significance_matrix.loc[
        (
            annotations_significance_matrix["significant_neighborhood_significance_sums"]
            >= min_cluster_size
        )
        & (
            annotations_significance_matrix["significant_neighborhood_significance_sums"]
            <= max_cluster_size
        ),
        "significant_annotations",
    ] = True
    # Initialize columns for connected components analysis
    annotations_significance_matrix["num_connected_components"] = 0
    annotations_significance_matrix["size_connected_components"] = None
    annotations_significance_matrix["size_connected_components"] = annotations_significance_matrix[
        "size_connected_components"
    ].astype(object)
    annotations_significance_matrix["num_large_connected_components"] = 0

    for attribute in annotations_significance_matrix.index.values[
        annotations_significance_matrix["significant_annotations"]
    ]:
        # Identify significant neighborhoods based on the binary significance matrix
        significant_neighborhoods = list(
            compress(list(network), significant_binary_significance_matrix[:, attribute])
        )
        significant_network = nx.subgraph(network, significant_neighborhoods)
        # Analyze connected components within the significant subnetwork
        connected_components = sorted(
            nx.connected_components(significant_network), key=len, reverse=True
        )
        size_connected_components = np.array([len(c) for c in connected_components])

        # Filter the size of connected components by min_cluster_size and max_cluster_size
        filtered_size_connected_components = size_connected_components[
            (size_connected_components >= min_cluster_size)
            & (size_connected_components <= max_cluster_size)
        ]
        # Calculate the number of connected components and large connected components
        num_connected_components = len(connected_components)
        num_large_connected_components = len(filtered_size_connected_components)

        # Assign the number of connected components
        annotations_significance_matrix.loc[attribute, "num_connected_components"] = (
            num_connected_components
        )
        # Filter out attributes with more than one connected component
        annotations_significance_matrix.loc[
            annotations_significance_matrix["num_connected_components"] > 1,
            "significant_annotations",
        ] = False
        # Assign the number of large connected components
        annotations_significance_matrix.loc[attribute, "num_large_connected_components"] = (
            num_large_connected_components
        )
        # Assign the size of connected components, ensuring it is always a list
        annotations_significance_matrix.at[attribute, "size_connected_components"] = (
            filtered_size_connected_components.tolist()
        )

    return annotations_significance_matrix


def get_weighted_description(words_column: pd.Series, scores_column: pd.Series) -> str:
    """Generate a weighted description from words and their corresponding scores,
    using improved weighting logic with normalization, lemmatization, and aggregation.

    Args:
        words_column (pd.Series): A pandas Series containing strings (phrases) to process.
        scores_column (pd.Series): A pandas Series containing significance scores to weigh the terms.

    Returns:
        str: A coherent description formed from the most frequent and significant words.
    """
    # Normalize significance scores to [0,1]. If all scores are identical, use 1.
    if scores_column.max() == scores_column.min():
        normalized_scores = pd.Series([1] * len(scores_column), index=scores_column.index)
    else:
        normalized_scores = (scores_column - scores_column.min()) / (
            scores_column.max() - scores_column.min()
        )

    # Accumulate weighted counts for each token (after cleaning and lemmatization)
    weighted_counts = {}
    for phrase, score in zip(words_column, normalized_scores):
        # Tokenize the phrase
        tokens = word_tokenize(str(phrase))
        # Determine the weight (scale factor; here multiplying normalized score by 10)
        weight = max(1, int((0 if pd.isna(score) else score) * 10))
        for token in tokens:
            # Clean token: lowercase and remove extraneous punctuation (but preserve intra-word hyphens)
            token_clean = re.sub(r"[^\w\-]", "", token).strip()
            if not token_clean:
                continue
            # Skip tokens that are pure numbers
            if token_clean.isdigit():
                continue
            # Skip stopwords
            if token_clean in STOP_WORDS:
                continue
            # Lemmatize the token to merge similar forms
            token_norm = LEMMATIZER.lemmatize(token_clean)
            weighted_counts[token_norm] = weighted_counts.get(token_norm, 0) + weight

    # Reconstruct a weighted token list by repeating each token by its aggregated count.
    weighted_words = []
    for token, count in weighted_counts.items():
        weighted_words.extend([token] * count)

    # Combine tokens that match number-word patterns (e.g. "4-alpha") and remove pure numeric tokens.
    combined_tokens = []
    for token in weighted_words:
        if re.match(r"^\d+-\w+", token):
            combined_tokens.append(token)
        elif token.replace(".", "", 1).isdigit():
            continue
        else:
            combined_tokens.append(token)

    # If the only token is numeric, return a default value.
    if len(combined_tokens) == 1 and combined_tokens[0].isdigit():
        return "N/A"

    # Simplify the token list to remove near-duplicates based on the Jaccard index.
    simplified_words = _simplify_word_list(combined_tokens)
    # Generate a coherent description from the simplified words.
    description = _generate_coherent_description(simplified_words)

    return description


def _simplify_word_list(words: List[str], threshold: float = 0.80) -> List[str]:
    """Filter out words that are too similar based on the Jaccard index,
    keeping the word with the higher aggregated count.

    Args:
        words (List[str]): The list of tokens to be filtered.
        threshold (float, optional): The similarity threshold for the Jaccard index. Defaults to 0.80.

    Returns:
        List[str]: A list of filtered words, where similar words are reduced to the most frequent one.
    """
    # Count the occurrences (which reflect the weighted importance)
    word_counts = Counter(words)
    filtered_words = []
    used_words = set()

    # Iterate through words sorted by descending weighted frequency
    for word in sorted(word_counts, key=lambda w: word_counts[w], reverse=True):
        if word in used_words:
            continue

        word_set = set(word)
        # Find similar words (including the current word) based on the Jaccard index
        similar_words = [
            other_word
            for other_word in word_counts
            if _calculate_jaccard_index(word_set, set(other_word)) >= threshold
        ]
        # Choose the word with the highest weighted count among the similar group
        similar_words.sort(key=lambda w: word_counts[w], reverse=True)
        best_word = similar_words[0]
        filtered_words.append(best_word)
        used_words.update(similar_words)

    # Preserve the original order (by frequency) from the filtered set
    final_words = [word for word in words if word in filtered_words]

    return final_words


def _calculate_jaccard_index(set1: Set[Any], set2: Set[Any]) -> float:
    """Calculate the Jaccard index between two sets.

    Args:
        set1 (Set[Any]): The first set.
        set2 (Set[Any]): The second set.

    Returns:
        float: The Jaccard index (intersection over union). Returns 0 if the union is empty.
    """
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union if union else 0


def _generate_coherent_description(words: List[str]) -> str:
    """Generate a coherent description from a list of words.

    If there is only one unique entry, return it directly.
    Otherwise, order the words by frequency and join them into a single string.

    Args:
        words (List[str]): A list of tokens.

    Returns:
        str: A coherent, space-separated description.
    """
    if not words:
        return "N/A"

    # If there is only one unique word, return it directly
    unique_words = set(words)
    if len(unique_words) == 1:
        return list(unique_words)[0]

    # Count weighted occurrences and sort in descending order.
    word_counts = Counter(words)
    most_common_words = [word for word, _ in word_counts.most_common()]
    description = " ".join(most_common_words)

    return description
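For orientation, here is a minimal usage sketch of the two public entry points above, load_annotations and get_weighted_description. The toy graph, node labels, and annotation terms are illustrative only and are not part of the package; running it also requires the NLTK resources that initialize_nltk() fetches at import time.

# Illustrative sketch only; the graph and annotation terms below are hypothetical.
import networkx as nx
import pandas as pd

from risk.annotations.annotations import get_weighted_description, load_annotations

# Build a toy graph whose nodes carry the "label" attribute that load_annotations reads.
network = nx.Graph()
for name in ("geneA", "geneB", "geneC"):
    network.add_node(name, label=name)

# Terms matched by fewer than min_nodes_per_term network nodes are filtered out.
annotations_input = {
    "DNA repair": ["geneA", "geneB"],
    "lipid metabolism": ["geneC"],  # only one node, so dropped at the default threshold of 2
}
result = load_annotations(network, annotations_input, min_nodes_per_term=2)
print(result["ordered_annotations"])  # ('DNA repair',)
print(result["matrix"].shape)  # (3, 1): nodes x surviving terms, sparse CSR

# Collapse a set of term phrases into a short weighted description.
words = pd.Series(["DNA repair", "DNA damage response"])
scores = pd.Series([0.9, 0.4])
print(get_weighted_description(words, scores))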
risk/annotations/io.py
ADDED
@@ -0,0 +1,241 @@
"""
risk/annotations/io
~~~~~~~~~~~~~~~~~~~
"""

import json
from typing import Any, Dict

import networkx as nx
import pandas as pd

from risk.annotations.annotations import load_annotations
from risk.log import log_header, logger, params


class AnnotationsIO:
    """Handles the loading and exporting of annotations in various file formats.

    The AnnotationsIO class provides methods to load annotations from different file types (JSON, CSV, Excel, etc.)
    and to export parameter data to various formats like JSON, CSV, and text files.
    """

    def load_json_annotation(
        self, network: nx.Graph, filepath: str, min_nodes_per_term: int = 2
    ) -> Dict[str, Any]:
        """Load annotations from a JSON file and convert them to a DataFrame.

        Args:
            network (NetworkX graph): The network to which the annotations are related.
            filepath (str): Path to the JSON annotations file.
            min_nodes_per_term (int, optional): The minimum number of network nodes required for each annotation
                term to be included. Defaults to 2.

        Returns:
            Dict[str, Any]: A dictionary containing ordered nodes, ordered annotations, and the annotations matrix.
        """
        filetype = "JSON"
        # Log the loading of the JSON file
        params.log_annotations(
            filetype=filetype, filepath=filepath, min_nodes_per_term=min_nodes_per_term
        )
        self._log_loading(filetype, filepath=filepath)

        # Load the JSON file into a dictionary
        with open(filepath, "r", encoding="utf-8") as file:
            annotations_input = json.load(file)

        return load_annotations(network, annotations_input, min_nodes_per_term)

    def load_excel_annotation(
        self,
        network: nx.Graph,
        filepath: str,
        label_colname: str = "label",
        nodes_colname: str = "nodes",
        sheet_name: str = "Sheet1",
        nodes_delimiter: str = ";",
        min_nodes_per_term: int = 2,
    ) -> Dict[str, Any]:
        """Load annotations from an Excel file and associate them with the network.

        Args:
            network (nx.Graph): The NetworkX graph to which the annotations are related.
            filepath (str): Path to the Excel annotations file.
            label_colname (str): Name of the column containing the labels (e.g., GO terms).
            nodes_colname (str): Name of the column containing the nodes associated with each label.
            sheet_name (str, optional): The name of the Excel sheet to load (default is 'Sheet1').
            nodes_delimiter (str, optional): Delimiter used to separate multiple nodes within the nodes column (default is ';').
            min_nodes_per_term (int, optional): The minimum number of network nodes required for each annotation
                term to be included. Defaults to 2.

        Returns:
            Dict[str, Any]: A dictionary where each label is paired with its respective list of nodes,
                linked to the provided network.
        """
        filetype = "Excel"
        # Log the loading of the Excel file
        params.log_annotations(
            filetype=filetype, filepath=filepath, min_nodes_per_term=min_nodes_per_term
        )
        self._log_loading(filetype, filepath=filepath)

        # Load the specified sheet from the Excel file
        annotation = pd.read_excel(filepath, sheet_name=sheet_name)
        # Split the nodes column by the specified nodes_delimiter
        annotation[nodes_colname] = annotation[nodes_colname].apply(
            lambda x: x.split(nodes_delimiter)
        )
        # Convert the DataFrame to a dictionary pairing labels with their corresponding nodes
        annotations_input = annotation.set_index(label_colname)[nodes_colname].to_dict()

        return load_annotations(network, annotations_input, min_nodes_per_term)

    def load_csv_annotation(
        self,
        network: nx.Graph,
        filepath: str,
        label_colname: str = "label",
        nodes_colname: str = "nodes",
        nodes_delimiter: str = ";",
        min_nodes_per_term: int = 2,
    ) -> Dict[str, Any]:
        """Load annotations from a CSV file and associate them with the network.

        Args:
            network (nx.Graph): The NetworkX graph to which the annotations are related.
            filepath (str): Path to the CSV annotations file.
            label_colname (str): Name of the column containing the labels (e.g., GO terms).
            nodes_colname (str): Name of the column containing the nodes associated with each label.
            nodes_delimiter (str, optional): Delimiter used to separate multiple nodes within the nodes column (default is ';').
            min_nodes_per_term (int, optional): The minimum number of network nodes required for each annotation
                term to be included. Defaults to 2.

        Returns:
            Dict[str, Any]: A dictionary where each label is paired with its respective list of nodes,
                linked to the provided network.
        """
        filetype = "CSV"
        # Log the loading of the CSV file
        params.log_annotations(
            filetype=filetype, filepath=filepath, min_nodes_per_term=min_nodes_per_term
        )
        self._log_loading(filetype, filepath=filepath)

        # Load the CSV file into a dictionary
        annotations_input = self._load_matrix_file(
            filepath, label_colname, nodes_colname, delimiter=",", nodes_delimiter=nodes_delimiter
        )

        return load_annotations(network, annotations_input, min_nodes_per_term)

    def load_tsv_annotation(
        self,
        network: nx.Graph,
        filepath: str,
        label_colname: str = "label",
        nodes_colname: str = "nodes",
        nodes_delimiter: str = ";",
        min_nodes_per_term: int = 2,
    ) -> Dict[str, Any]:
        """Load annotations from a TSV file and associate them with the network.

        Args:
            network (nx.Graph): The NetworkX graph to which the annotations are related.
            filepath (str): Path to the TSV annotations file.
            label_colname (str): Name of the column containing the labels (e.g., GO terms).
            nodes_colname (str): Name of the column containing the nodes associated with each label.
            nodes_delimiter (str, optional): Delimiter used to separate multiple nodes within the nodes column (default is ';').
            min_nodes_per_term (int, optional): The minimum number of network nodes required for each annotation
                term to be included. Defaults to 2.

        Returns:
            Dict[str, Any]: A dictionary where each label is paired with its respective list of nodes,
                linked to the provided network.
        """
        filetype = "TSV"
        # Log the loading of the TSV file
        params.log_annotations(
            filetype=filetype, filepath=filepath, min_nodes_per_term=min_nodes_per_term
        )
        self._log_loading(filetype, filepath=filepath)

        # Load the TSV file into a dictionary
        annotations_input = self._load_matrix_file(
            filepath, label_colname, nodes_colname, delimiter="\t", nodes_delimiter=nodes_delimiter
        )

        return load_annotations(network, annotations_input, min_nodes_per_term)

    def load_dict_annotation(
        self, network: nx.Graph, content: Dict[str, Any], min_nodes_per_term: int = 2
    ) -> Dict[str, Any]:
        """Load annotations from a provided dictionary and convert them to a dictionary annotation.

        Args:
            network (NetworkX graph): The network to which the annotations are related.
            content (Dict[str, Any]): The annotations dictionary to load.
            min_nodes_per_term (int, optional): The minimum number of network nodes required for each annotation
                term to be included. Defaults to 2.

        Returns:
            Dict[str, Any]: A dictionary containing ordered nodes, ordered annotations, and the annotations matrix.

        Raises:
            TypeError: If the content is not a dictionary.
        """
        # Ensure the input content is a dictionary
        if not isinstance(content, dict):
            raise TypeError(
                f"Expected 'content' to be a dictionary, but got {type(content).__name__} instead."
            )

        filetype = "Dictionary"
        # Log the loading of the annotations from the dictionary
        params.log_annotations(filepath="In-memory dictionary", filetype=filetype)
        self._log_loading(filetype, "In-memory dictionary")

        # Load the annotations as a dictionary from the provided dictionary
        return load_annotations(network, content, min_nodes_per_term)

    def _load_matrix_file(
        self,
        filepath: str,
        label_colname: str,
        nodes_colname: str,
        delimiter: str = ",",
        nodes_delimiter: str = ";",
    ) -> Dict[str, Any]:
        """Load annotations from a CSV or TSV file and convert them to a dictionary.

        Args:
            filepath (str): Path to the annotation file.
            label_colname (str): Name of the column containing the labels (e.g., GO terms).
            nodes_colname (str): Name of the column containing the nodes associated with each label.
            delimiter (str, optional): Delimiter used to separate columns in the file (default is ',').
            nodes_delimiter (str, optional): Delimiter used to separate multiple nodes within the nodes column (default is ';').

        Returns:
            Dict[str, Any]: A dictionary where each label is paired with its respective list of nodes.
        """
        # Load the CSV or TSV file into a DataFrame
        annotation = pd.read_csv(filepath, delimiter=delimiter)
        # Split the nodes column by the nodes_delimiter to handle multiple nodes per label
        annotation[nodes_colname] = annotation[nodes_colname].apply(
            lambda x: x.split(nodes_delimiter)
        )
        # Create a dictionary pairing labels with their corresponding list of nodes
        label_node_dict = annotation.set_index(label_colname)[nodes_colname].to_dict()
        return label_node_dict

    def _log_loading(self, filetype: str, filepath: str = "") -> None:
        """Log information about the network file being loaded.

        Args:
            filetype (str): The type of the file being loaded (e.g., 'Cytoscape').
            filepath (str, optional): The path to the file being loaded.
        """
        log_header("Loading annotations")
        logger.debug(f"Filetype: {filetype}")
        if filepath:
            logger.debug(f"Filepath: {filepath}")