risk-network 0.0.8b20.tar.gz → 0.0.8b22.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/PKG-INFO +1 -1
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/__init__.py +1 -1
- risk_network-0.0.8b22/risk/annotations/__init__.py +7 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/annotations/annotations.py +65 -40
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/neighborhoods/domains.py +35 -16
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/neighborhoods/neighborhoods.py +51 -37
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/network/graph.py +33 -10
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/network/io.py +1 -1
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/network/plot/canvas.py +16 -17
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/network/plot/contour.py +11 -11
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/network/plot/labels.py +6 -7
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/network/plot/network.py +15 -15
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/network/plot/utils/color.py +4 -4
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/risk.py +33 -19
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/stats/stats.py +8 -6
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk_network.egg-info/PKG-INFO +1 -1
- risk_network-0.0.8b20/risk/annotations/__init__.py +0 -7
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/LICENSE +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/MANIFEST.in +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/README.md +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/pyproject.toml +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/annotations/io.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/constants.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/log/__init__.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/log/config.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/log/params.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/neighborhoods/__init__.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/neighborhoods/community.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/network/__init__.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/network/geometry.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/network/plot/__init__.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/network/plot/plotter.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/network/plot/utils/layout.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/stats/__init__.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/stats/hypergeom.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/stats/permutation/__init__.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/stats/permutation/permutation.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/stats/permutation/test_functions.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/stats/poisson.py +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk_network.egg-info/SOURCES.txt +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk_network.egg-info/dependency_links.txt +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk_network.egg-info/requires.txt +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk_network.egg-info/top_level.txt +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/setup.cfg +0 -0
- {risk_network-0.0.8b20 → risk_network-0.0.8b22}/setup.py +0 -0
{risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/annotations/annotations.py

@@ -30,6 +30,8 @@ def _setup_nltk():
 
 # Ensure you have the necessary NLTK data
 _setup_nltk()
+# Initialize English stopwords
+stop_words = set(stopwords.words("english"))
 
 
 def load_annotations(network: nx.Graph, annotations_input: Dict[str, Any]) -> Dict[str, Any]:
@@ -47,11 +49,11 @@ def load_annotations(network: nx.Graph, annotations_input: Dict[str, Any]) -> Di
         (node, annotation) for annotation, nodes in annotations_input.items() for node in nodes
     ]
     # Create a DataFrame from the flattened list
-    annotations = pd.DataFrame(flattened_annotations, columns=["
-    annotations["
+    annotations = pd.DataFrame(flattened_annotations, columns=["node", "annotations"])
+    annotations["is_member"] = 1
     # Pivot to create a binary matrix with nodes as rows and annotations as columns
     annotations_pivot = annotations.pivot_table(
-        index="
+        index="node", columns="annotations", values="is_member", fill_value=0, dropna=False
     )
     # Reindex the annotations matrix based on the node labels from the network
     node_label_order = list(nx.get_node_attributes(network, "label").values())
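For context, the reworked `load_annotations` turns the flattened (node, annotation) pairs into a binary node-by-annotation membership matrix via `pivot_table`. A minimal standalone sketch of that step, using invented node and annotation names rather than the package's real inputs:

```python
import pandas as pd

# Hypothetical (node, annotation) pairs; in the package these come from annotations_input.
flattened_annotations = [
    ("YFL039C", "actin cytoskeleton"),
    ("YFL039C", "cell polarity"),
    ("YDR129C", "cell polarity"),
]
annotations = pd.DataFrame(flattened_annotations, columns=["node", "annotations"])
annotations["is_member"] = 1

# Pivot into a binary matrix: one row per node, one column per annotation term,
# 1 where the node carries the term and 0 (fill_value) where it does not.
annotations_pivot = annotations.pivot_table(
    index="node", columns="annotations", values="is_member", fill_value=0, dropna=False
)
print(annotations_pivot)  # 2 x 2 matrix of 0/1 membership values
```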
@@ -81,7 +83,8 @@ def define_top_annotations(
     network: nx.Graph,
     ordered_annotation_labels: List[str],
     neighborhood_enrichment_sums: List[int],
-
+    significant_enrichment_matrix: np.ndarray,
+    significant_binary_enrichment_matrix: np.ndarray,
     min_cluster_size: int = 5,
     max_cluster_size: int = 1000,
 ) -> pd.DataFrame:
@@ -91,42 +94,52 @@ def define_top_annotations(
         network (NetworkX graph): The network graph.
         ordered_annotation_labels (list of str): List of ordered annotation labels.
         neighborhood_enrichment_sums (list of int): List of neighborhood enrichment sums.
-
+        significant_enrichment_matrix (np.ndarray): Enrichment matrix below alpha threshold.
+        significant_binary_enrichment_matrix (np.ndarray): Binary enrichment matrix below alpha threshold.
         min_cluster_size (int, optional): Minimum cluster size. Defaults to 5.
         max_cluster_size (int, optional): Maximum cluster size. Defaults to 1000.
 
     Returns:
         pd.DataFrame: DataFrame with top annotations and their properties.
     """
-    #
+    # Sum the columns of the significant enrichment matrix (positive floating point values)
+    significant_enrichment_scores = significant_enrichment_matrix.sum(axis=0)
+    # Create DataFrame to store annotations, their neighborhood enrichment sums, and enrichment scores
     annotations_enrichment_matrix = pd.DataFrame(
         {
             "id": range(len(ordered_annotation_labels)),
-            "
-            "
+            "full_terms": ordered_annotation_labels,
+            "significant_neighborhood_enrichment_sums": neighborhood_enrichment_sums,
+            "significant_enrichment_score": significant_enrichment_scores,
         }
     )
-    annotations_enrichment_matrix["
-    # Apply size constraints to identify potential
+    annotations_enrichment_matrix["significant_annotations"] = False
+    # Apply size constraints to identify potential significant annotations
     annotations_enrichment_matrix.loc[
-        (
-
-
+        (
+            annotations_enrichment_matrix["significant_neighborhood_enrichment_sums"]
+            >= min_cluster_size
+        )
+        & (
+            annotations_enrichment_matrix["significant_neighborhood_enrichment_sums"]
+            <= max_cluster_size
+        ),
+        "significant_annotations",
     ] = True
     # Initialize columns for connected components analysis
-    annotations_enrichment_matrix["
-    annotations_enrichment_matrix["
-    annotations_enrichment_matrix["
-    "
+    annotations_enrichment_matrix["num_connected_components"] = 0
+    annotations_enrichment_matrix["size_connected_components"] = None
+    annotations_enrichment_matrix["size_connected_components"] = annotations_enrichment_matrix[
+        "size_connected_components"
     ].astype(object)
-    annotations_enrichment_matrix["
+    annotations_enrichment_matrix["num_large_connected_components"] = 0
 
     for attribute in annotations_enrichment_matrix.index.values[
-        annotations_enrichment_matrix["
+        annotations_enrichment_matrix["significant_annotations"]
     ]:
         # Identify enriched neighborhoods based on the binary enrichment matrix
         enriched_neighborhoods = list(
-            compress(list(network),
+            compress(list(network), significant_binary_enrichment_matrix[:, attribute])
         )
         enriched_network = nx.subgraph(network, enriched_neighborhoods)
         # Analyze connected components within the enriched subnetwork
@@ -145,55 +158,67 @@ def define_top_annotations(
         num_large_connected_components = len(filtered_size_connected_components)
 
         # Assign the number of connected components
-        annotations_enrichment_matrix.loc[attribute, "
+        annotations_enrichment_matrix.loc[attribute, "num_connected_components"] = (
             num_connected_components
         )
         # Filter out attributes with more than one connected component
         annotations_enrichment_matrix.loc[
-            annotations_enrichment_matrix["
+            annotations_enrichment_matrix["num_connected_components"] > 1, "significant_annotations"
         ] = False
         # Assign the number of large connected components
-        annotations_enrichment_matrix.loc[attribute, "
+        annotations_enrichment_matrix.loc[attribute, "num_large_connected_components"] = (
            num_large_connected_components
         )
         # Assign the size of connected components, ensuring it is always a list
-        annotations_enrichment_matrix.at[attribute, "
+        annotations_enrichment_matrix.at[attribute, "size_connected_components"] = (
            filtered_size_connected_components.tolist()
         )
 
     return annotations_enrichment_matrix
 
 
-def 
-    """
-
+def get_weighted_description(words_column: pd.Series, scores_column: pd.Series) -> str:
+    """Generate a weighted description from words and their corresponding scores,
+    with support for stopwords filtering and improved weighting logic.
 
     Args:
         words_column (pd.Series): A pandas Series containing strings to process.
+        scores_column (pd.Series): A pandas Series containing enrichment scores to weigh the terms.
 
     Returns:
-        str: A coherent description formed from the most frequent and significant words.
+        str: A coherent description formed from the most frequent and significant words, weighed by enrichment scores.
     """
-    #
-
-
+    # Handle case where all scores are the same
+    if scores_column.max() == scores_column.min():
+        normalized_scores = pd.Series([1] * len(scores_column))
+    else:
+        # Normalize the enrichment scores to be between 0 and 1
+        normalized_scores = (scores_column - scores_column.min()) / (
+            scores_column.max() - scores_column.min()
+        )
 
+    # Combine words and normalized scores to create weighted words
+    weighted_words = []
+    for word, score in zip(words_column, normalized_scores):
+        word = str(word)
+        if word not in stop_words:  # Skip stopwords
+            weight = max(1, int((0 if pd.isna(score) else score) * 10))
+            weighted_words.extend([word] * weight)
+
+    # Tokenize the weighted words
+    tokens = word_tokenize(" ".join(weighted_words))
     # Separate numeric tokens
     numeric_tokens = [token for token in tokens if token.replace(".", "", 1).isdigit()]
-    # If there's only one unique numeric value, return it directly as a string
     unique_numeric_values = set(numeric_tokens)
     if len(unique_numeric_values) == 1:
         return f"{list(unique_numeric_values)[0]}"
 
-    #
-    words = [
-
-
-        if word.isalpha()
-        or word.replace(".", "", 1).isdigit()  # Keep alphabetic words and numeric strings
-    ]
+    # Filter alphabetic and numeric tokens
+    words = [word for word in tokens if word.isalpha() or word.replace(".", "", 1).isdigit()]
+    # Apply word similarity filtering to remove redundant terms
+    simplified_words = _simplify_word_list(words)
     # Generate a coherent description from the processed words
-    description = _generate_coherent_description(
+    description = _generate_coherent_description(simplified_words)
 
     return description
 
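The new `get_weighted_description` replaces the older unweighted helper: each term is repeated in proportion to its normalized enrichment score before tokenization, so strongly enriched terms dominate the generated label. Below is a simplified sketch of only the normalization and weighting steps, with toy terms and scores; the actual function additionally skips NLTK stopwords, tokenizes the weighted words, and passes them through the existing `_simplify_word_list` and `_generate_coherent_description` helpers:

```python
import pandas as pd

# Toy inputs standing in for one domain's terms and enrichment scores.
words_column = pd.Series(["kinase", "kinase activity", "transport"])
scores_column = pd.Series([12.0, 9.0, 3.0])

# Scale scores to [0, 1]; fall back to a constant weight when all scores are equal.
if scores_column.max() == scores_column.min():
    normalized_scores = pd.Series([1] * len(scores_column))
else:
    normalized_scores = (scores_column - scores_column.min()) / (
        scores_column.max() - scores_column.min()
    )

# Repeat each term at least once and up to 10 times depending on its score.
weighted_words = []
for word, score in zip(words_column, normalized_scores):
    weight = max(1, int((0 if pd.isna(score) else score) * 10))
    weighted_words.extend([str(word)] * weight)

print(weighted_words.count("kinase"), weighted_words.count("transport"))  # 10 1
```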
{risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/neighborhoods/domains.py

@@ -13,7 +13,7 @@ import pandas as pd
 from scipy.cluster.hierarchy import linkage, fcluster
 from sklearn.metrics import silhouette_score
 
-from risk.annotations import
+from risk.annotations import get_weighted_description
 from risk.constants import GROUP_LINKAGE_METHODS, GROUP_DISTANCE_METRICS
 from risk.log import logger
 
@@ -40,7 +40,7 @@ def define_domains(
     """
     try:
         # Transpose the matrix to cluster annotations
-        m = significant_neighborhoods_enrichment[:, top_annotations["
+        m = significant_neighborhoods_enrichment[:, top_annotations["significant_annotations"]].T
         best_linkage, best_metric, best_threshold = _optimize_silhouette_across_linkage_and_metrics(
             m, linkage_criterion, linkage_method, linkage_metric
         )
@@ -55,7 +55,7 @@ def define_domains(
         # Assign domains to the annotations matrix
         domains = fcluster(Z, max_d_optimal, criterion=linkage_criterion)
         top_annotations["domain"] = 0
-        top_annotations.loc[top_annotations["
+        top_annotations.loc[top_annotations["significant_annotations"], "domain"] = domains
     except ValueError:
         # If a ValueError is encountered, handle it by assigning unique domains
         n_rows = len(top_annotations)
@@ -77,11 +77,11 @@ def define_domains(
     t_idxmax[t_max == 0] = 0
 
     # Assign all domains where the score is greater than 0
-    node_to_domain["
+    node_to_domain["all_domains"] = node_to_domain.loc[:, 1:].apply(
         lambda row: list(row[row > 0].index), axis=1
     )
     # Assign primary domain
-    node_to_domain["
+    node_to_domain["primary_domain"] = t_idxmax
 
     return node_to_domain
 
@@ -107,7 +107,7 @@ def trim_domains_and_top_annotations(
         - A DataFrame with domain labels (pd.DataFrame)
     """
     # Identify domains to remove based on size criteria
-    domain_counts = domains["
+    domain_counts = domains["primary_domain"].value_counts()
     to_remove = set(
         domain_counts[(domain_counts < min_cluster_size) | (domain_counts > max_cluster_size)].index
     )
@@ -117,32 +117,51 @@ def trim_domains_and_top_annotations(
     invalid_domain_ids = {0, invalid_domain_id}
     # Mark domains to be removed
     top_annotations["domain"].replace(to_remove, invalid_domain_id, inplace=True)
-    domains.loc[domains["
+    domains.loc[domains["primary_domain"].isin(to_remove), ["primary_domain"]] = invalid_domain_id
 
     # Normalize "num enriched neighborhoods" by percentile for each domain and scale to 0-10
     top_annotations["normalized_value"] = top_annotations.groupby("domain")[
-        "
+        "significant_neighborhood_enrichment_sums"
     ].transform(lambda x: (x.rank(pct=True) * 10).apply(np.ceil).astype(int))
-    #
-    top_annotations["
-        lambda row: " ".join([str(row["
+    # Modify the lambda function to pass both full_terms and significant_enrichment_score
+    top_annotations["combined_terms"] = top_annotations.apply(
+        lambda row: " ".join([str(row["full_terms"])] * row["normalized_value"]), axis=1
     )
 
-    #
-    domain_labels =
+    # Perform the groupby operation while retaining the other columns and adding the weighting with enrichment scores
+    domain_labels = (
+        top_annotations.groupby("domain")
+        .agg(
+            full_terms=("full_terms", lambda x: list(x)),
+            enrichment_scores=("significant_enrichment_score", lambda x: list(x)),
+        )
+        .reset_index()
+    )
+    domain_labels["combined_terms"] = domain_labels.apply(
+        lambda row: get_weighted_description(
+            pd.Series(row["full_terms"]), pd.Series(row["enrichment_scores"])
+        ),
+        axis=1,
+    )
+
+    # Rename the columns as necessary
     trimmed_domains_matrix = domain_labels.rename(
-        columns={
+        columns={
+            "domain": "id",
+            "combined_terms": "normalized_description",
+            "full_terms": "full_descriptions",
+            "enrichment_scores": "enrichment_scores",
+        }
     ).set_index("id")
 
     # Remove invalid domains
     valid_annotations = top_annotations[~top_annotations["domain"].isin(invalid_domain_ids)].drop(
         columns=["normalized_value"]
     )
-    valid_domains = domains[~domains["
+    valid_domains = domains[~domains["primary_domain"].isin(invalid_domain_ids)]
     valid_trimmed_domains_matrix = trimmed_domains_matrix[
         ~trimmed_domains_matrix.index.isin(invalid_domain_ids)
     ]
-
     return valid_annotations, valid_domains, valid_trimmed_domains_matrix
 
 
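In `trim_domains_and_top_annotations`, domain labels are now built by collecting each domain's terms and enrichment scores into lists and handing both to `get_weighted_description`. A small sketch of that aggregation pattern on a toy `top_annotations` frame (column names follow the diff; the toy values are invented):

```python
import pandas as pd
from risk.annotations import get_weighted_description

# Toy stand-in for top_annotations after domain assignment.
top_annotations = pd.DataFrame(
    {
        "domain": [1, 1, 2],
        "full_terms": ["kinase", "kinase activity", "transport"],
        "significant_enrichment_score": [12.0, 9.0, 3.0],
    }
)

# Group per domain, keeping terms and scores as lists, then label each domain
# with a score-weighted description.
domain_labels = (
    top_annotations.groupby("domain")
    .agg(
        full_terms=("full_terms", lambda x: list(x)),
        enrichment_scores=("significant_enrichment_score", lambda x: list(x)),
    )
    .reset_index()
)
domain_labels["combined_terms"] = domain_labels.apply(
    lambda row: get_weighted_description(
        pd.Series(row["full_terms"]), pd.Series(row["enrichment_scores"])
    ),
    axis=1,
)
```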
{risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/neighborhoods/neighborhoods.py

@@ -171,7 +171,7 @@ def process_neighborhoods(
 
     Args:
         network (nx.Graph): The network data structure used for imputing and pruning neighbors.
-        neighborhoods (Dict[str, Any]): Dictionary containing 'enrichment_matrix', '
+        neighborhoods (Dict[str, Any]): Dictionary containing 'enrichment_matrix', 'significant_binary_enrichment_matrix', and 'significant_enrichment_matrix'.
         impute_depth (int, optional): Depth for imputing neighbors. Defaults to 0.
         prune_threshold (float, optional): Distance threshold for pruning neighbors. Defaults to 0.0.
 
@@ -179,18 +179,18 @@ def process_neighborhoods(
         Dict[str, Any]: Processed neighborhoods data, including the updated matrices and enrichment counts.
     """
     enrichment_matrix = neighborhoods["enrichment_matrix"]
-
+    significant_binary_enrichment_matrix = neighborhoods["significant_binary_enrichment_matrix"]
     significant_enrichment_matrix = neighborhoods["significant_enrichment_matrix"]
     logger.debug(f"Imputation depth: {impute_depth}")
     if impute_depth:
         (
             enrichment_matrix,
-
+            significant_binary_enrichment_matrix,
             significant_enrichment_matrix,
         ) = _impute_neighbors(
             network,
             enrichment_matrix,
-
+            significant_binary_enrichment_matrix,
             max_depth=impute_depth,
         )
 
@@ -198,20 +198,20 @@ def process_neighborhoods(
     if prune_threshold:
         (
             enrichment_matrix,
-
+            significant_binary_enrichment_matrix,
             significant_enrichment_matrix,
         ) = _prune_neighbors(
             network,
             enrichment_matrix,
-
+            significant_binary_enrichment_matrix,
             distance_threshold=prune_threshold,
         )
 
-    neighborhood_enrichment_counts = np.sum(
+    neighborhood_enrichment_counts = np.sum(significant_binary_enrichment_matrix, axis=0)
     node_enrichment_sums = np.sum(enrichment_matrix, axis=1)
     return {
         "enrichment_matrix": enrichment_matrix,
-        "
+        "significant_binary_enrichment_matrix": significant_binary_enrichment_matrix,
         "significant_enrichment_matrix": significant_enrichment_matrix,
         "neighborhood_enrichment_counts": neighborhood_enrichment_counts,
         "node_enrichment_sums": node_enrichment_sums,
@@ -221,7 +221,7 @@ def process_neighborhoods(
 def _impute_neighbors(
     network: nx.Graph,
     enrichment_matrix: np.ndarray,
-
+    significant_binary_enrichment_matrix: np.ndarray,
     max_depth: int = 3,
 ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
     """Impute rows with sums of zero in the enrichment matrix based on the closest non-zero neighbors in the network graph.
@@ -229,7 +229,7 @@ def _impute_neighbors(
     Args:
         network (nx.Graph): The network graph with nodes having IDs matching the matrix indices.
         enrichment_matrix (np.ndarray): The enrichment matrix with rows to be imputed.
-
+        significant_binary_enrichment_matrix (np.ndarray): The alpha threshold matrix to be imputed similarly.
         max_depth (int): Maximum depth of nodes to traverse for imputing values.
 
     Returns:
@@ -239,19 +239,21 @@ def _impute_neighbors(
         - np.ndarray: The significant enrichment matrix with non-significant entries set to zero.
     """
     # Calculate the distance threshold value based on the shortest distances
-    enrichment_matrix,
-        network, enrichment_matrix,
+    enrichment_matrix, significant_binary_enrichment_matrix = _impute_neighbors_with_similarity(
+        network, enrichment_matrix, significant_binary_enrichment_matrix, max_depth=max_depth
     )
     # Create a matrix where non-significant entries are set to zero
-    significant_enrichment_matrix = np.where(
+    significant_enrichment_matrix = np.where(
+        significant_binary_enrichment_matrix == 1, enrichment_matrix, 0
+    )
 
-    return enrichment_matrix,
+    return enrichment_matrix, significant_binary_enrichment_matrix, significant_enrichment_matrix
 
 
 def _impute_neighbors_with_similarity(
     network: nx.Graph,
     enrichment_matrix: np.ndarray,
-
+    significant_binary_enrichment_matrix: np.ndarray,
     max_depth: int = 3,
 ) -> Tuple[np.ndarray, np.ndarray]:
     """Impute non-enriched nodes based on the closest enriched neighbors' profiles and their similarity.
@@ -259,7 +261,7 @@ def _impute_neighbors_with_similarity(
     Args:
         network (nx.Graph): The network graph with nodes having IDs matching the matrix indices.
         enrichment_matrix (np.ndarray): The enrichment matrix with rows to be imputed.
-
+        significant_binary_enrichment_matrix (np.ndarray): The alpha threshold matrix to be imputed similarly.
         max_depth (int): Maximum depth of nodes to traverse for imputing values.
 
     Returns:
@@ -268,27 +270,31 @@ def _impute_neighbors_with_similarity(
         - The imputed alpha threshold matrix.
     """
     depth = 1
-    rows_to_impute = np.where(
+    rows_to_impute = np.where(significant_binary_enrichment_matrix.sum(axis=1) == 0)[0]
     while len(rows_to_impute) and depth <= max_depth:
         # Iterate over all enriched nodes
-        for row_index in range(
-            if
-                enrichment_matrix,
-                    row_index,
+        for row_index in range(significant_binary_enrichment_matrix.shape[0]):
+            if significant_binary_enrichment_matrix[row_index].sum() != 0:
+                enrichment_matrix, significant_binary_enrichment_matrix = _process_node_imputation(
+                    row_index,
+                    network,
+                    enrichment_matrix,
+                    significant_binary_enrichment_matrix,
+                    depth,
                 )
 
         # Update rows to impute for the next iteration
-        rows_to_impute = np.where(
+        rows_to_impute = np.where(significant_binary_enrichment_matrix.sum(axis=1) == 0)[0]
         depth += 1
 
-    return enrichment_matrix,
+    return enrichment_matrix, significant_binary_enrichment_matrix
 
 
 def _process_node_imputation(
     row_index: int,
     network: nx.Graph,
     enrichment_matrix: np.ndarray,
-
+    significant_binary_enrichment_matrix: np.ndarray,
     depth: int,
 ) -> Tuple[np.ndarray, np.ndarray]:
     """Process the imputation for a single node based on its enriched neighbors.
@@ -297,7 +303,7 @@ def _process_node_imputation(
         row_index (int): The index of the enriched node being processed.
         network (nx.Graph): The network graph with nodes having IDs matching the matrix indices.
         enrichment_matrix (np.ndarray): The enrichment matrix with rows to be imputed.
-
+        significant_binary_enrichment_matrix (np.ndarray): The alpha threshold matrix to be imputed similarly.
        depth (int): Current depth for traversal.
 
     Returns:
@@ -310,7 +316,7 @@ def _process_node_imputation(
         n
         for n in neighbors
         if n != row_index
-        and
+        and significant_binary_enrichment_matrix[n].sum() != 0
         and enrichment_matrix[n].sum() != 0
     ]
     # Filter non-enriched neighbors
@@ -318,7 +324,7 @@ def _process_node_imputation(
         n
         for n in neighbors
        if n != row_index
-        and
+        and significant_binary_enrichment_matrix[n].sum() == 0
         and enrichment_matrix[n].sum() == 0
     ]
     # If there are valid non-enriched neighbors
@@ -363,15 +369,17 @@ def _process_node_imputation(
         enrichment_matrix[most_similar_neighbor] = enrichment_matrix[row_index] / np.sqrt(
             depth + 1
         )
-
+        significant_binary_enrichment_matrix[most_similar_neighbor] = (
+            significant_binary_enrichment_matrix[row_index]
+        )
 
-    return enrichment_matrix,
+    return enrichment_matrix, significant_binary_enrichment_matrix
 
 
 def _prune_neighbors(
     network: nx.Graph,
     enrichment_matrix: np.ndarray,
-
+    significant_binary_enrichment_matrix: np.ndarray,
     distance_threshold: float = 0.9,
 ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
     """Remove outliers based on their rank for edge lengths.
@@ -379,7 +387,7 @@ def _prune_neighbors(
     Args:
         network (nx.Graph): The network graph with nodes having IDs matching the matrix indices.
         enrichment_matrix (np.ndarray): The enrichment matrix.
-
+        significant_binary_enrichment_matrix (np.ndarray): The alpha threshold matrix.
         distance_threshold (float): Rank threshold (0 to 1) to determine outliers.
 
     Returns:
@@ -389,10 +397,12 @@ def _prune_neighbors(
         - np.ndarray: The significant enrichment matrix, where non-significant entries are set to zero.
     """
     # Identify indices with non-zero rows in the binary enrichment matrix
-    non_zero_indices = np.where(
+    non_zero_indices = np.where(significant_binary_enrichment_matrix.sum(axis=1) != 0)[0]
     median_distances = []
     for node in non_zero_indices:
-        neighbors = [
+        neighbors = [
+            n for n in network.neighbors(node) if significant_binary_enrichment_matrix[n].sum() != 0
+        ]
         if neighbors:
             median_distance = np.median(
                 [_get_euclidean_distance(node, n, network) for n in neighbors]
@@ -404,7 +414,9 @@ def _prune_neighbors(
     # Prune nodes that are outliers based on the distance threshold
     for row_index in non_zero_indices:
         neighbors = [
-            n
+            n
+            for n in network.neighbors(row_index)
+            if significant_binary_enrichment_matrix[n].sum() != 0
        ]
         if neighbors:
             median_distance = np.median(
@@ -412,12 +424,14 @@ def _prune_neighbors(
             )
         if median_distance >= distance_threshold_value:
             enrichment_matrix[row_index] = 0
-
+            significant_binary_enrichment_matrix[row_index] = 0
 
     # Create a matrix where non-significant entries are set to zero
-    significant_enrichment_matrix = np.where(
+    significant_enrichment_matrix = np.where(
+        significant_binary_enrichment_matrix == 1, enrichment_matrix, 0
+    )
 
-    return enrichment_matrix,
+    return enrichment_matrix, significant_binary_enrichment_matrix, significant_enrichment_matrix
 
 
 def _get_euclidean_distance(node1: Any, node2: Any, network: nx.Graph) -> float:
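Throughout the imputation and pruning helpers above, the significant enrichment matrix is rebuilt with the same masking pattern whenever the binary matrix changes. A tiny sketch of that pattern on invented 3 x 2 matrices:

```python
import numpy as np

# Toy matrices: 3 neighborhoods x 2 annotations.
enrichment_matrix = np.array([[2.5, 0.4], [0.0, 1.8], [0.7, 0.0]])
significant_binary_enrichment_matrix = np.array([[1, 0], [0, 1], [0, 0]])

# Keep only entries that passed the alpha threshold; zero out everything else.
significant_enrichment_matrix = np.where(
    significant_binary_enrichment_matrix == 1, enrichment_matrix, 0
)
# -> [[2.5, 0.0], [0.0, 1.8], [0.0, 0.0]]
```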
{risk_network-0.0.8b20 → risk_network-0.0.8b22}/risk/network/graph.py

@@ -45,6 +45,10 @@ class NetworkGraph:
         self.domain_id_to_domain_terms_map = self._create_domain_id_to_domain_terms_map(
             trimmed_domains
         )
+        self.domain_id_to_domain_info_map = self._create_domain_id_to_domain_info_map(
+            trimmed_domains
+        )
+        self.trimmed_domains = trimmed_domains
         self.node_enrichment_sums = node_enrichment_sums
         self.node_id_to_domain_ids_and_enrichments_map = (
             self._create_node_id_to_domain_ids_and_enrichments(domains)
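The new `domain_id_to_domain_info_map` attribute (built by `_create_domain_id_to_domain_info_map`, added further down in graph.py) keys each domain ID to that domain's full descriptions and enrichment scores. A rough sketch of the resulting structure, with invented domain IDs and terms:

```python
# Hypothetical contents; the real map is derived from the trimmed_domains DataFrame.
domain_id_to_domain_info_map = {
    1: {
        "full_descriptions": ["kinase", "kinase activity"],
        "enrichment_scores": [12.0, 9.0],
    },
    2: {
        "full_descriptions": ["transport"],
        "enrichment_scores": [3.0],
    },
}
```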
@@ -60,7 +64,8 @@ class NetworkGraph:
         self.network = _unfold_sphere_to_plane(network)
         self.node_coordinates = _extract_node_coordinates(self.network)
 
-
+    @staticmethod
+    def _create_domain_id_to_node_ids_map(domains: pd.DataFrame) -> Dict[int, Any]:
         """Create a mapping from domains to the list of node IDs belonging to each domain.
 
         Args:
@@ -69,17 +74,16 @@ class NetworkGraph:
         Returns:
             Dict[int, Any]: A dictionary where keys are domain IDs and values are lists of node IDs belonging to each domain.
         """
-        cleaned_domains_matrix = domains.reset_index()[["index", "
-        node_to_domains_map = cleaned_domains_matrix.set_index("index")["
+        cleaned_domains_matrix = domains.reset_index()[["index", "primary_domain"]]
+        node_to_domains_map = cleaned_domains_matrix.set_index("index")["primary_domain"].to_dict()
         domain_id_to_node_ids_map = defaultdict(list)
         for k, v in node_to_domains_map.items():
             domain_id_to_node_ids_map[v].append(k)
 
         return domain_id_to_node_ids_map
 
-
-
-    ) -> Dict[int, Any]:
+    @staticmethod
+    def _create_domain_id_to_domain_terms_map(trimmed_domains: pd.DataFrame) -> Dict[int, Any]:
         """Create a mapping from domain IDs to their corresponding terms.
 
         Args:
@@ -91,13 +95,32 @@ class NetworkGraph:
         return dict(
             zip(
                 trimmed_domains.index,
-                trimmed_domains["
+                trimmed_domains["normalized_description"],
             )
         )
 
-
-
-
+    @staticmethod
+    def _create_domain_id_to_domain_info_map(
+        trimmed_domains: pd.DataFrame,
+    ) -> Dict[int, Dict[str, Any]]:
+        """Create a mapping from domain IDs to their corresponding full description and enrichment score.
+
+        Args:
+            trimmed_domains (pd.DataFrame): DataFrame containing domain IDs, full descriptions, and enrichment scores.
+
+        Returns:
+            Dict[int, Dict[str, Any]]: A dictionary mapping domain IDs (int) to a dictionary with 'full_descriptions' and 'enrichment_scores'.
+        """
+        return {
+            int(id_): {
+                "full_descriptions": trimmed_domains.at[id_, "full_descriptions"],
+                "enrichment_scores": trimmed_domains.at[id_, "enrichment_scores"],
+            }
+            for id_ in trimmed_domains.index
+        }
+
+    @staticmethod
+    def _create_node_id_to_domain_ids_and_enrichments(domains: pd.DataFrame) -> Dict[int, Dict]:
         """Creates a dictionary mapping each node ID to its corresponding domain IDs and enrichment values.
 
         Args: