napistu 0.2.5.dev7__py3-none-any.whl → 0.3.1.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/__init__.py +1 -3
- napistu/__main__.py +126 -96
- napistu/constants.py +35 -41
- napistu/context/__init__.py +10 -0
- napistu/context/discretize.py +462 -0
- napistu/context/filtering.py +387 -0
- napistu/gcs/__init__.py +1 -1
- napistu/identifiers.py +74 -15
- napistu/indices.py +68 -0
- napistu/ingestion/__init__.py +1 -1
- napistu/ingestion/bigg.py +47 -62
- napistu/ingestion/constants.py +18 -133
- napistu/ingestion/gtex.py +113 -0
- napistu/ingestion/hpa.py +147 -0
- napistu/ingestion/sbml.py +0 -97
- napistu/ingestion/string.py +2 -2
- napistu/matching/__init__.py +10 -0
- napistu/matching/constants.py +18 -0
- napistu/matching/interactions.py +518 -0
- napistu/matching/mount.py +529 -0
- napistu/matching/species.py +510 -0
- napistu/mcp/__init__.py +7 -4
- napistu/mcp/__main__.py +128 -72
- napistu/mcp/client.py +16 -25
- napistu/mcp/codebase.py +201 -145
- napistu/mcp/component_base.py +170 -0
- napistu/mcp/config.py +223 -0
- napistu/mcp/constants.py +45 -2
- napistu/mcp/documentation.py +253 -136
- napistu/mcp/documentation_utils.py +13 -48
- napistu/mcp/execution.py +372 -305
- napistu/mcp/health.py +47 -65
- napistu/mcp/profiles.py +10 -6
- napistu/mcp/server.py +161 -80
- napistu/mcp/tutorials.py +139 -87
- napistu/modify/__init__.py +1 -1
- napistu/modify/gaps.py +1 -1
- napistu/network/__init__.py +1 -1
- napistu/network/constants.py +101 -34
- napistu/network/data_handling.py +388 -0
- napistu/network/ig_utils.py +351 -0
- napistu/network/napistu_graph_core.py +354 -0
- napistu/network/neighborhoods.py +40 -40
- napistu/network/net_create.py +373 -309
- napistu/network/net_propagation.py +47 -19
- napistu/network/{net_utils.py → ng_utils.py} +124 -272
- napistu/network/paths.py +67 -51
- napistu/network/precompute.py +11 -11
- napistu/ontologies/__init__.py +10 -0
- napistu/ontologies/constants.py +129 -0
- napistu/ontologies/dogma.py +243 -0
- napistu/ontologies/genodexito.py +649 -0
- napistu/ontologies/mygene.py +369 -0
- napistu/ontologies/renaming.py +198 -0
- napistu/rpy2/__init__.py +229 -86
- napistu/rpy2/callr.py +47 -77
- napistu/rpy2/constants.py +24 -23
- napistu/rpy2/rids.py +61 -648
- napistu/sbml_dfs_core.py +587 -222
- napistu/scverse/__init__.py +15 -0
- napistu/scverse/constants.py +28 -0
- napistu/scverse/loading.py +727 -0
- napistu/utils.py +118 -10
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/METADATA +8 -3
- napistu-0.3.1.dev1.dist-info/RECORD +133 -0
- tests/conftest.py +22 -0
- tests/test_context_discretize.py +56 -0
- tests/test_context_filtering.py +267 -0
- tests/test_identifiers.py +100 -0
- tests/test_indices.py +65 -0
- tests/{test_edgelist.py → test_ingestion_napistu_edgelist.py} +2 -2
- tests/test_matching_interactions.py +108 -0
- tests/test_matching_mount.py +305 -0
- tests/test_matching_species.py +394 -0
- tests/test_mcp_config.py +193 -0
- tests/test_mcp_documentation_utils.py +12 -3
- tests/test_mcp_server.py +156 -19
- tests/test_network_data_handling.py +397 -0
- tests/test_network_ig_utils.py +23 -0
- tests/test_network_neighborhoods.py +19 -0
- tests/test_network_net_create.py +459 -0
- tests/test_network_ng_utils.py +30 -0
- tests/test_network_paths.py +56 -0
- tests/{test_precomputed_distances.py → test_network_precompute.py} +8 -6
- tests/test_ontologies_genodexito.py +58 -0
- tests/test_ontologies_mygene.py +39 -0
- tests/test_ontologies_renaming.py +110 -0
- tests/test_rpy2_callr.py +79 -0
- tests/test_rpy2_init.py +151 -0
- tests/test_sbml.py +0 -31
- tests/test_sbml_dfs_core.py +134 -10
- tests/test_scverse_loading.py +778 -0
- tests/test_set_coverage.py +2 -2
- tests/test_utils.py +121 -1
- napistu/mechanism_matching.py +0 -1353
- napistu/rpy2/netcontextr.py +0 -467
- napistu-0.2.5.dev7.dist-info/RECORD +0 -98
- tests/test_igraph.py +0 -367
- tests/test_mechanism_matching.py +0 -784
- tests/test_net_utils.py +0 -149
- tests/test_netcontextr.py +0 -105
- tests/test_rpy2.py +0 -61
- /napistu/ingestion/{cpr_edgelist.py → napistu_edgelist.py} +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/WHEEL +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/entry_points.txt +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/licenses/LICENSE +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/top_level.txt +0 -0
- /tests/{test_obo.py → test_ingestion_obo.py} +0 -0
napistu/rpy2/rids.py
CHANGED
@@ -3,697 +3,110 @@ from __future__ import annotations
|
|
3
3
|
import logging
|
4
4
|
|
5
5
|
import pandas as pd
|
6
|
-
from napistu import
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
from napistu.rpy2 import
|
12
|
-
from napistu.rpy2 import report_r_exceptions
|
13
|
-
from napistu.rpy2 import warn_if_no_rpy2
|
6
|
+
from napistu.rpy2 import (
|
7
|
+
require_rpy2,
|
8
|
+
report_r_exceptions,
|
9
|
+
)
|
10
|
+
|
11
|
+
from napistu.rpy2.callr import bioconductor_org_r_function, r_dataframe_to_pandas
|
14
12
|
|
15
|
-
from napistu.constants import SBML_DFS
|
16
|
-
from napistu.constants import BQB
|
17
|
-
from napistu.constants import IDENTIFIERS
|
18
13
|
from napistu.constants import ONTOLOGIES
|
19
|
-
from napistu.constants import ONTOLOGY_ALIASES
|
20
14
|
from napistu.rpy2.constants import BIOC_VALID_EXPANDED_SPECIES_ONTOLOGIES
|
21
|
-
from napistu.rpy2.constants import
|
22
|
-
from napistu.rpy2.constants import BIOC_PROTEIN_ONTOLOGIES
|
23
|
-
from napistu.rpy2.constants import BIOC_NAME_ONTOLOGIES
|
24
|
-
from napistu.rpy2.constants import BIOC_GENE_ONTOLOGIES # noqa
|
15
|
+
from napistu.rpy2.constants import BIOC_ONTOLOGY_MAPPING
|
25
16
|
from napistu.rpy2.constants import BIOC_NOMENCLATURE
|
26
17
|
|
27
18
|
logger = logging.getLogger(__name__)
|
28
19
|
|
29
20
|
|
30
|
-
@
|
31
|
-
@report_r_exceptions
|
32
|
-
def expand_identifiers(
|
33
|
-
sbml_dfs: sbml_dfs_core.SBML_dfs,
|
34
|
-
id_type: str,
|
35
|
-
species: str,
|
36
|
-
expanded_ontologies: list[str],
|
37
|
-
r_paths: str | None = None,
|
38
|
-
) -> pd.Series:
|
39
|
-
"""
|
40
|
-
Expand Identifiers
|
41
|
-
|
42
|
-
Update a table's identifiers to include additional related ontologies
|
43
|
-
|
44
|
-
Ontologies are pulled from the bioconductor "org" packages. This is effective, but inelegant.
|
45
|
-
|
46
|
-
Parameters
|
47
|
-
----------
|
48
|
-
sbml_dfs : SBML_dfs
|
49
|
-
A relational pathway model built around reactions interconverting compartmentalized species.
|
50
|
-
id_type: str
|
51
|
-
Identifiers to expand: species, compartments, or reactions
|
52
|
-
species: str
|
53
|
-
Species name
|
54
|
-
expanded_ontologies: list
|
55
|
-
Ontologies to add or complete
|
56
|
-
r_paths: str
|
57
|
-
Path to an R packages directory
|
58
|
-
|
59
|
-
Returns
|
60
|
-
-------
|
61
|
-
a pd.Series with identifiers as the index and updated Identifiers objects as values
|
62
|
-
"""
|
63
|
-
|
64
|
-
if not isinstance(sbml_dfs, sbml_dfs_core.SBML_dfs):
|
65
|
-
raise TypeError("sbml_dfs is not an sbml_dfs_core.SBML_dfs object")
|
66
|
-
|
67
|
-
# pull out all identifiers as a pd.DataFrame
|
68
|
-
all_entity_identifiers = sbml_dfs.get_identifiers(id_type)
|
69
|
-
if not isinstance(all_entity_identifiers, pd.DataFrame):
|
70
|
-
raise TypeError("all_entity_identifiers must be a pandas DataFrame")
|
71
|
-
|
72
|
-
if id_type == "species":
|
73
|
-
all_entity_identifiers = _check_species_identifiers_entrez_gene_ontology(
|
74
|
-
all_entity_identifiers
|
75
|
-
)
|
76
|
-
|
77
|
-
valid_expanded_ontologies = BIOC_VALID_EXPANDED_SPECIES_ONTOLOGIES
|
78
|
-
elif id_type in ["reactions", "compartments"]:
|
79
|
-
raise NotImplementedError(
|
80
|
-
f"No converters implemented to expand {id_type} annotations"
|
81
|
-
)
|
82
|
-
else:
|
83
|
-
raise ValueError(f"{id_type} is an invalid id_type")
|
84
|
-
|
85
|
-
invalid_expanded_ontologies = set(expanded_ontologies).difference(
|
86
|
-
valid_expanded_ontologies
|
87
|
-
)
|
88
|
-
|
89
|
-
if len(invalid_expanded_ontologies) != 0:
|
90
|
-
raise NotImplementedError(
|
91
|
-
f"No converters implemented to expand {id_type} annotations to {', '.join(invalid_expanded_ontologies)}"
|
92
|
-
)
|
93
|
-
|
94
|
-
# find entries in valid_expanded_ontologies which are already present
|
95
|
-
# these are the entries that will be used to expand to other ontologies
|
96
|
-
# or fill in ontologies with incomplete annotations
|
97
|
-
starting_ontologies = valid_expanded_ontologies.intersection(
|
98
|
-
set(all_entity_identifiers["ontology"])
|
99
|
-
)
|
100
|
-
|
101
|
-
if len(starting_ontologies) == 0:
|
102
|
-
raise ValueError(f"No ontologies with {id_type} converters are present")
|
103
|
-
|
104
|
-
required_conversion_ontologies = set(starting_ontologies).union(
|
105
|
-
set(expanded_ontologies)
|
106
|
-
)
|
107
|
-
|
108
|
-
# pull down entrez ids + mapping to other ontologies
|
109
|
-
mapping_ontologies = required_conversion_ontologies.intersection(
|
110
|
-
BIOC_VALID_EXPANDED_SPECIES_ONTOLOGIES
|
111
|
-
)
|
112
|
-
|
113
|
-
mappings_dict = create_bioconductor_mapping_tables(
|
114
|
-
mappings=mapping_ontologies, species=species, r_paths=r_paths
|
115
|
-
)
|
116
|
-
|
117
|
-
# start with entrez IDs (since all other ontologies are mapped to them in the
|
118
|
-
# bioconductor "org" packages)
|
119
|
-
|
120
|
-
# get these values by just looking up the mappings between entrez genes and genomic loci
|
121
|
-
running_ids = merge_bioconductor_mappings(mappings_dict, mapping_ontologies)
|
122
|
-
|
123
|
-
# map from existing ontologies to expanded ontologies
|
124
|
-
ontology_mappings = list()
|
125
|
-
# starting w/
|
126
|
-
for start in starting_ontologies:
|
127
|
-
# ending w/
|
128
|
-
for end in expanded_ontologies:
|
129
|
-
if start == end:
|
130
|
-
continue
|
131
|
-
lookup = (
|
132
|
-
running_ids[[start, end]]
|
133
|
-
.rename(columns={start: IDENTIFIERS.IDENTIFIER, end: "new_identifier"})
|
134
|
-
.assign(ontology=start)
|
135
|
-
.assign(new_ontology=end)
|
136
|
-
)
|
137
|
-
ontology_mappings.append(lookup)
|
138
|
-
|
139
|
-
ontology_mappings_df = pd.concat(ontology_mappings).dropna()
|
140
|
-
|
141
|
-
# old identifiers joined with new identifiers
|
142
|
-
|
143
|
-
# first, define the names of keys and ids
|
144
|
-
table_pk_var = sbml_dfs.schema[id_type]["pk"]
|
145
|
-
table_id_var = sbml_dfs.schema[id_type]["id"]
|
146
|
-
|
147
|
-
# retain bqb terms to define how an identifier is related to sid
|
148
|
-
# this relation will be preserved for the new ids
|
149
|
-
|
150
|
-
merged_identifiers = all_entity_identifiers[
|
151
|
-
[
|
152
|
-
table_pk_var,
|
153
|
-
IDENTIFIERS.ONTOLOGY,
|
154
|
-
IDENTIFIERS.IDENTIFIER,
|
155
|
-
IDENTIFIERS.BQB,
|
156
|
-
]
|
157
|
-
].merge(ontology_mappings_df)
|
158
|
-
|
159
|
-
# new, possibly redundant identifiers
|
160
|
-
new_identifiers = merged_identifiers[
|
161
|
-
[table_pk_var, "new_ontology", "new_identifier", IDENTIFIERS.BQB]
|
162
|
-
].rename(
|
163
|
-
columns={
|
164
|
-
"new_ontology": IDENTIFIERS.ONTOLOGY,
|
165
|
-
"new_identifier": IDENTIFIERS.IDENTIFIER,
|
166
|
-
}
|
167
|
-
)
|
168
|
-
|
169
|
-
expanded_identifiers_df = (
|
170
|
-
pd.concat(
|
171
|
-
[
|
172
|
-
all_entity_identifiers[
|
173
|
-
[
|
174
|
-
table_pk_var,
|
175
|
-
IDENTIFIERS.ONTOLOGY,
|
176
|
-
IDENTIFIERS.IDENTIFIER,
|
177
|
-
IDENTIFIERS.URL,
|
178
|
-
IDENTIFIERS.BQB,
|
179
|
-
]
|
180
|
-
],
|
181
|
-
new_identifiers,
|
182
|
-
# ignore new identifier if it already exists
|
183
|
-
]
|
184
|
-
)
|
185
|
-
# remove duplicated identifiers
|
186
|
-
.groupby([table_pk_var, IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER])
|
187
|
-
.first()
|
188
|
-
.reset_index()
|
189
|
-
.set_index(table_pk_var)
|
190
|
-
)
|
191
|
-
|
192
|
-
# create a dictionary of new Identifiers objects
|
193
|
-
expanded_identifiers_dict = {
|
194
|
-
i: _expand_identifiers_new_entries(i, expanded_identifiers_df)
|
195
|
-
for i in expanded_identifiers_df.index.unique()
|
196
|
-
}
|
197
|
-
|
198
|
-
output = pd.Series(expanded_identifiers_dict).rename(table_id_var)
|
199
|
-
output.index.name = table_pk_var
|
200
|
-
|
201
|
-
return output
|
202
|
-
|
203
|
-
|
204
|
-
@warn_if_no_rpy2
|
21
|
+
@require_rpy2
|
205
22
|
@report_r_exceptions
|
206
23
|
def create_bioconductor_mapping_tables(
|
207
24
|
mappings: set[str], species: str, r_paths: str | None = None
|
208
25
|
) -> dict[str, pd.DataFrame]:
|
209
|
-
"""
|
210
|
-
Create Bioconductor Mapping Tables
|
26
|
+
"""Create Bioconductor Mapping Tables.
|
211
27
|
|
212
28
|
Creating a dictionary of mappings between entrez and other ontologies.
|
213
29
|
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
30
|
+
Parameters
|
31
|
+
----------
|
32
|
+
mappings : set[str]
|
33
|
+
A set of ontologies to work with. The valid ontologies are:
|
34
|
+
"ensembl_gene", "ensembl_transcript", and "uniprot".
|
35
|
+
species : str
|
36
|
+
The organismal species that we are working with (e.g., Homo sapiens).
|
37
|
+
r_paths : str | None, optional
|
38
|
+
Optional path to a library of R packages.
|
222
39
|
|
223
|
-
Returns
|
224
|
-
|
225
|
-
|
40
|
+
Returns
|
41
|
+
-------
|
42
|
+
dict[str, pd.DataFrame]
|
43
|
+
A table of entrez ids, and tables mapping from each ontology in "mappings" to entrez.
|
226
44
|
|
45
|
+
Raises
|
46
|
+
------
|
47
|
+
ValueError
|
48
|
+
If any of the requested mappings are not supported
|
227
49
|
"""
|
228
|
-
|
229
|
-
if not isinstance(mappings, set):
|
230
|
-
raise TypeError(f"mappings must be a set, but got {type(mappings).__name__}")
|
231
|
-
if not isinstance(species, str):
|
232
|
-
raise TypeError(f"species must be a str, but got {type(species).__name__}")
|
233
|
-
|
234
50
|
logger.info(
|
235
|
-
|
51
|
+
"Creating mapping tables from entrez genes to/from %s", ", ".join(mappings)
|
236
52
|
)
|
237
53
|
|
238
54
|
invalid_mappings = set(mappings).difference(BIOC_VALID_EXPANDED_SPECIES_ONTOLOGIES)
|
239
|
-
|
240
55
|
if len(invalid_mappings) > 0:
|
241
56
|
raise ValueError(
|
242
57
|
f"{len(invalid_mappings)} mappings could not be created: {', '.join(invalid_mappings)}.\n"
|
243
58
|
f"The valid mappings are {', '.join(BIOC_VALID_EXPANDED_SPECIES_ONTOLOGIES)}"
|
244
59
|
)
|
245
60
|
|
246
|
-
mappings_dict =
|
61
|
+
mappings_dict = {}
|
247
62
|
|
248
|
-
#
|
249
|
-
|
250
|
-
|
251
|
-
callr.bioconductor_org_r_function(
|
252
|
-
BIOC_NOMENCLATURE.CHR_TBL, species, r_paths=None
|
253
|
-
)
|
254
|
-
)
|
255
|
-
.drop(BIOC_NOMENCLATURE.CHROMOSOME, axis=1)
|
256
|
-
.rename(
|
257
|
-
columns={BIOC_NOMENCLATURE.NCBI_ENTREZ_GENE: ONTOLOGIES.NCBI_ENTREZ_GENE}
|
258
|
-
)
|
259
|
-
.set_index(ONTOLOGIES.NCBI_ENTREZ_GENE)
|
260
|
-
)
|
261
|
-
|
262
|
-
if ONTOLOGIES.ENSEMBL_GENE in mappings:
|
263
|
-
# "entrez <> ensembl genes"
|
264
|
-
mappings_dict[ONTOLOGIES.ENSEMBL_GENE] = (
|
265
|
-
callr.r_dataframe_to_pandas(
|
266
|
-
callr.bioconductor_org_r_function(
|
267
|
-
BIOC_NOMENCLATURE.ENSG_TBL, species, r_paths=r_paths
|
268
|
-
)
|
269
|
-
)
|
270
|
-
.rename(
|
271
|
-
columns={
|
272
|
-
BIOC_NOMENCLATURE.NCBI_ENTREZ_GENE: ONTOLOGIES.NCBI_ENTREZ_GENE,
|
273
|
-
BIOC_NOMENCLATURE.ENSEMBL_GENE: ONTOLOGIES.ENSEMBL_GENE,
|
274
|
-
}
|
275
|
-
)
|
276
|
-
.set_index(ONTOLOGIES.NCBI_ENTREZ_GENE)
|
277
|
-
)
|
278
|
-
|
279
|
-
if ONTOLOGIES.ENSEMBL_TRANSCRIPT in mappings:
|
280
|
-
# "entrez <> ensembl transcripts"
|
281
|
-
mappings_dict[ONTOLOGIES.ENSEMBL_TRANSCRIPT] = (
|
282
|
-
callr.r_dataframe_to_pandas(
|
283
|
-
callr.bioconductor_org_r_function(
|
284
|
-
BIOC_NOMENCLATURE.ENST_TBL, species, r_paths=r_paths
|
285
|
-
)
|
286
|
-
)
|
287
|
-
.rename(
|
288
|
-
columns={
|
289
|
-
BIOC_NOMENCLATURE.NCBI_ENTREZ_GENE: ONTOLOGIES.NCBI_ENTREZ_GENE,
|
290
|
-
BIOC_NOMENCLATURE.ENSEMBL_TRANSCRIPT: ONTOLOGIES.ENSEMBL_TRANSCRIPT,
|
291
|
-
}
|
292
|
-
)
|
293
|
-
.set_index(ONTOLOGIES.NCBI_ENTREZ_GENE)
|
294
|
-
)
|
295
|
-
|
296
|
-
if ONTOLOGIES.ENSEMBL_PROTEIN in mappings:
|
297
|
-
# "entrez <> ensembl proteins"
|
298
|
-
mappings_dict[ONTOLOGIES.ENSEMBL_PROTEIN] = (
|
299
|
-
callr.r_dataframe_to_pandas(
|
300
|
-
callr.bioconductor_org_r_function(
|
301
|
-
BIOC_NOMENCLATURE.ENSP_TBL, species, r_paths=r_paths
|
302
|
-
)
|
303
|
-
)
|
304
|
-
.rename(
|
305
|
-
columns={
|
306
|
-
BIOC_NOMENCLATURE.NCBI_ENTREZ_GENE: ONTOLOGIES.NCBI_ENTREZ_GENE,
|
307
|
-
BIOC_NOMENCLATURE.ENSEMBL_PROTEIN: ONTOLOGIES.ENSEMBL_PROTEIN,
|
308
|
-
}
|
309
|
-
)
|
310
|
-
.set_index(ONTOLOGIES.NCBI_ENTREZ_GENE)
|
311
|
-
)
|
312
|
-
|
313
|
-
if ONTOLOGIES.UNIPROT in mappings:
|
314
|
-
# "entrez <> uniprot"
|
315
|
-
mappings_dict[ONTOLOGIES.UNIPROT] = (
|
316
|
-
callr.r_dataframe_to_pandas(
|
317
|
-
callr.bioconductor_org_r_function(
|
318
|
-
BIOC_NOMENCLATURE.UNIPROT_TBL, species, r_paths=r_paths
|
319
|
-
)
|
320
|
-
)
|
321
|
-
.rename(
|
322
|
-
columns={
|
323
|
-
BIOC_NOMENCLATURE.NCBI_ENTREZ_GENE: ONTOLOGIES.NCBI_ENTREZ_GENE,
|
324
|
-
BIOC_NOMENCLATURE.UNIPROT: ONTOLOGIES.UNIPROT,
|
325
|
-
}
|
326
|
-
)
|
327
|
-
.set_index(ONTOLOGIES.NCBI_ENTREZ_GENE)
|
328
|
-
)
|
329
|
-
|
330
|
-
if ONTOLOGIES.GENE_NAME in mappings:
|
331
|
-
# "entrez <> gene name"
|
332
|
-
mappings_dict[ONTOLOGIES.GENE_NAME] = (
|
333
|
-
callr.r_dataframe_to_pandas(
|
334
|
-
callr.bioconductor_org_r_function(
|
335
|
-
BIOC_NOMENCLATURE.NAME_TBL, species, r_paths=r_paths
|
336
|
-
)
|
337
|
-
)
|
338
|
-
.rename(
|
339
|
-
columns={
|
340
|
-
BIOC_NOMENCLATURE.NCBI_ENTREZ_GENE: ONTOLOGIES.NCBI_ENTREZ_GENE,
|
341
|
-
BIOC_NOMENCLATURE.GENE_NAME: ONTOLOGIES.GENE_NAME,
|
342
|
-
}
|
343
|
-
)
|
344
|
-
.set_index(ONTOLOGIES.NCBI_ENTREZ_GENE)
|
345
|
-
)
|
346
|
-
|
347
|
-
if ONTOLOGIES.SYMBOL in mappings:
|
348
|
-
# "entrez <> gene symbol"
|
349
|
-
mappings_dict[ONTOLOGIES.SYMBOL] = (
|
350
|
-
callr.r_dataframe_to_pandas(
|
351
|
-
callr.bioconductor_org_r_function(
|
352
|
-
BIOC_NOMENCLATURE.SYMBOL_TBL, species, r_paths=r_paths
|
353
|
-
)
|
354
|
-
)
|
355
|
-
.rename(
|
356
|
-
columns={
|
357
|
-
BIOC_NOMENCLATURE.NCBI_ENTREZ_GENE: ONTOLOGIES.NCBI_ENTREZ_GENE,
|
358
|
-
BIOC_NOMENCLATURE.SYMBOL: ONTOLOGIES.SYMBOL,
|
359
|
-
}
|
360
|
-
)
|
361
|
-
.set_index(ONTOLOGIES.NCBI_ENTREZ_GENE)
|
362
|
-
)
|
63
|
+
# Create mapping tables for each requested ontology
|
64
|
+
for ontology in mappings:
|
65
|
+
mappings_dict[ontology] = _create_single_mapping(ontology, species, r_paths)
|
363
66
|
|
364
67
|
return mappings_dict
|
365
68
|
|
366
69
|
|
367
|
-
def
|
368
|
-
|
70
|
+
def _create_single_mapping(
|
71
|
+
ontology: str, species: str, r_paths: str | None = None
|
369
72
|
) -> pd.DataFrame:
|
370
|
-
"""
|
371
|
-
|
372
|
-
running_ids = mappings_dict[ONTOLOGIES.NCBI_ENTREZ_GENE]
|
373
|
-
|
374
|
-
for mapping in mapping_ontologies:
|
375
|
-
logger.debug(f"adding entries for {mapping} to running_ids")
|
376
|
-
mapping_df = mappings_dict[mapping]
|
377
|
-
|
378
|
-
running_ids = running_ids.join(mapping_df)
|
73
|
+
"""Create a single mapping table for a given ontology.
|
379
74
|
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
"""
|
389
|
-
Stack Bioconductor Mappings
|
390
|
-
|
391
|
-
Convert a dict of mappings between entrez identifiers and other identifiers to a single table.
|
392
|
-
|
393
|
-
Args:
|
394
|
-
mappings_dict (dict):
|
395
|
-
A dictionary containing mappings between entrez and other ontologies.
|
396
|
-
mapping_ontologies (set):
|
397
|
-
A set of mappings to combine.
|
398
|
-
|
399
|
-
Returns:
|
400
|
-
mappings_df (pd.DataFrame):
|
401
|
-
A table containing entrez_gene_id, ontology, and identifier.
|
402
|
-
"""
|
403
|
-
|
404
|
-
mappings_list = list()
|
405
|
-
for ont in mapping_ontologies:
|
406
|
-
one_mapping_df = (
|
407
|
-
mappings_dict[ont].assign(ontology=ont).rename({ont: "identifier"}, axis=1)
|
408
|
-
)
|
409
|
-
|
410
|
-
mappings_list.append(one_mapping_df)
|
411
|
-
|
412
|
-
return pd.concat(mappings_list)
|
413
|
-
|
414
|
-
|
415
|
-
def _check_species_identifiers_entrez_gene_ontology(
|
416
|
-
entity_identifiers_df: pd.DataFrame,
|
417
|
-
) -> pd.DataFrame:
|
418
|
-
"""
|
419
|
-
Check whether species ontologies contain ncbigene or ncbi_gene
|
420
|
-
If so, replaced them to ncbi_entrez_gene.
|
421
|
-
Return: entity_identifiers_df with proper gene ontology types.
|
422
|
-
"""
|
423
|
-
|
424
|
-
intersect_gene_onto = set(entity_identifiers_df["ontology"]).intersection(
|
425
|
-
ONTOLOGY_ALIASES.NCBI_ENTREZ_GENE
|
426
|
-
)
|
427
|
-
|
428
|
-
# if entity_identifiers_df contains members of ENTREZ_ONTOLOGY_ALIASES,
|
429
|
-
# replace to ncbi_entrez_gene
|
430
|
-
if intersect_gene_onto:
|
431
|
-
logger.info(
|
432
|
-
f" Replace unmatching ontology {', '.join(intersect_gene_onto)} to {ONTOLOGIES.NCBI_ENTREZ_GENE}."
|
433
|
-
)
|
434
|
-
|
435
|
-
filtered_onto_df = entity_identifiers_df[
|
436
|
-
entity_identifiers_df["ontology"].isin(list(intersect_gene_onto))
|
437
|
-
]
|
438
|
-
|
439
|
-
entity_identifiers_df.loc[filtered_onto_df.index, "ontology"] = (
|
440
|
-
ONTOLOGIES.NCBI_ENTREZ_GENE
|
441
|
-
)
|
442
|
-
|
443
|
-
return entity_identifiers_df
|
444
|
-
|
445
|
-
|
446
|
-
def update_expanded_identifiers(
|
447
|
-
model: sbml_dfs_core.SBML_dfs, id_type: str, expanded_ids: pd.Series
|
448
|
-
) -> sbml_dfs_core.SBML_dfs:
|
449
|
-
"""Update the expanded identifiers for a model.
|
450
|
-
|
451
|
-
Args:
|
452
|
-
model (sbml_dfs_core.SBML_dfs): _description_
|
453
|
-
id_type (str): _description_
|
454
|
-
expanded_ids (str): _description_
|
455
|
-
"""
|
456
|
-
ids = getattr(model, id_type)
|
457
|
-
|
458
|
-
# make sure expanded_ids and original model.species have same number of s_ids
|
459
|
-
# if a s_id only in model.species, adding it to expanded_ids.
|
460
|
-
if ids.shape[0] != expanded_ids.shape[0]:
|
461
|
-
matched_expanded_ids = expanded_ids.combine_first(ids[SBML_DFS.S_IDENTIFIERS])
|
462
|
-
logger.debug(
|
463
|
-
f"{ids.shape[0] - expanded_ids.shape[0]} "
|
464
|
-
"ids are not included in expanded ids"
|
465
|
-
)
|
466
|
-
else:
|
467
|
-
matched_expanded_ids = expanded_ids
|
468
|
-
|
469
|
-
updated_ids = ids.drop(SBML_DFS.S_IDENTIFIERS, axis=1).join(
|
470
|
-
pd.DataFrame(matched_expanded_ids)
|
471
|
-
)
|
472
|
-
|
473
|
-
setattr(model, id_type, updated_ids)
|
474
|
-
|
475
|
-
return model
|
476
|
-
|
477
|
-
|
478
|
-
def create_dogmatic_sbml_dfs(
|
479
|
-
species: str, r_paths: str | None = None
|
480
|
-
) -> sbml_dfs_core.SBML_dfs:
|
481
|
-
"""
|
482
|
-
Create Dogmatic SMBL_DFs
|
483
|
-
|
484
|
-
Create an SBML_dfs model which is pretty much just proteins and no
|
485
|
-
reactions, as well as annotations linking proteins to genes, and
|
486
|
-
creating nice labels for genes/proteins.
|
487
|
-
|
488
|
-
Args:
|
489
|
-
species (str):
|
490
|
-
An organismal species (e.g., Homo sapiens)
|
491
|
-
r_paths (str or None)
|
492
|
-
Optional, p]ath to an R packages directory
|
75
|
+
Parameters
|
76
|
+
----------
|
77
|
+
ontology : str
|
78
|
+
The ontology to map (e.g. ENSEMBL_GENE, UNIPROT)
|
79
|
+
species : str
|
80
|
+
The organismal species to map
|
81
|
+
r_paths : str | None, optional
|
82
|
+
Optional path to R packages directory
|
493
83
|
|
494
|
-
Returns
|
495
|
-
|
496
|
-
|
497
|
-
|
84
|
+
Returns
|
85
|
+
-------
|
86
|
+
pd.DataFrame
|
87
|
+
DataFrame containing the mapping between entrez and the target ontology
|
498
88
|
"""
|
499
89
|
|
500
|
-
|
90
|
+
if ontology not in BIOC_ONTOLOGY_MAPPING:
|
91
|
+
raise ValueError(f"Unsupported ontology: {ontology}")
|
501
92
|
|
502
|
-
|
93
|
+
table_name, column_name = BIOC_ONTOLOGY_MAPPING[ontology]
|
503
94
|
|
504
|
-
|
505
|
-
|
506
|
-
dogmatic_mappings["s_name_series"]
|
95
|
+
df = r_dataframe_to_pandas(
|
96
|
+
bioconductor_org_r_function(table_name, species, r_paths=r_paths)
|
507
97
|
)
|
508
98
|
|
509
|
-
#
|
510
|
-
|
511
|
-
|
99
|
+
# Drop chromosome column if this is the chromosome table
|
100
|
+
# this was only introduced so we had a table with 1 row per unique entrez id
|
101
|
+
if table_name == BIOC_NOMENCLATURE.CHR_TBL:
|
102
|
+
df = df.drop(BIOC_NOMENCLATURE.CHROMOSOME, axis=1)
|
512
103
|
|
513
|
-
#
|
514
|
-
|
104
|
+
# Rename columns and set index
|
105
|
+
df = df.rename(
|
515
106
|
columns={
|
516
|
-
|
517
|
-
|
107
|
+
BIOC_NOMENCLATURE.NCBI_ENTREZ_GENE: ONTOLOGIES.NCBI_ENTREZ_GENE,
|
108
|
+
column_name: ontology,
|
518
109
|
}
|
519
|
-
)
|
520
|
-
interaction_edgelist["downstream_name"] = interaction_edgelist["upstream_name"]
|
521
|
-
interaction_edgelist["upstream_compartment"] = "cellular_component"
|
522
|
-
interaction_edgelist["downstream_compartment"] = "cellular_component"
|
523
|
-
interaction_edgelist["r_name"] = interaction_edgelist["upstream_name"]
|
524
|
-
interaction_edgelist["sbo_term"] = constants.MINI_SBO_FROM_NAME["reactant"]
|
525
|
-
interaction_edgelist["r_isreversible"] = False
|
526
|
-
|
527
|
-
dogmatic_sbml_dfs = sbml_dfs_core.sbml_dfs_from_edgelist(
|
528
|
-
interaction_edgelist=interaction_edgelist,
|
529
|
-
species_df=species_df,
|
530
|
-
compartments_df=compartments_df,
|
531
|
-
interaction_source=interaction_source,
|
532
|
-
upstream_stoichiometry=-1,
|
533
|
-
downstream_stoichiometry=1,
|
534
|
-
downstream_sbo_name="product",
|
535
|
-
)
|
536
|
-
|
537
|
-
# remove all reactions except 1 (so it still passes sbml_dfs.validate())
|
538
|
-
# this self reaction will be removed when creating the graph
|
539
|
-
dogmatic_sbml_dfs.remove_reactions(dogmatic_sbml_dfs.reactions.index.tolist()[1::])
|
540
|
-
|
541
|
-
return dogmatic_sbml_dfs
|
542
|
-
|
543
|
-
|
544
|
-
def connect_dogmatic_mappings(species: str, r_paths: str | None = None) -> dict:
|
545
|
-
"""
|
546
|
-
Connect Dogmatic Mappings
|
547
|
-
|
548
|
-
Merge all ontologies into greedy clusters based on shared associations to entrez ids
|
549
|
-
|
550
|
-
Args:
|
551
|
-
species (str):
|
552
|
-
An organismal species (e.g., Homo sapiens)
|
553
|
-
r_paths (str or None)
|
554
|
-
Optional, p]ath to an R packages directory
|
555
|
-
|
556
|
-
Returns:
|
557
|
-
dict with:
|
558
|
-
- s_name_series: a series where the index is distinct molecular species and the values are names.
|
559
|
-
- cluster_consensus_identifiers_df: a pd.DataFrame where the index is distinct molecular species
|
560
|
-
and values are identifiers objects.
|
561
|
-
"""
|
562
|
-
|
563
|
-
mappings_dict = create_bioconductor_mapping_tables(
|
564
|
-
mappings=BIOC_DOGMATIC_MAPPING_ONTOLOGIES,
|
565
|
-
species=species,
|
566
|
-
r_paths=r_paths,
|
567
|
-
)
|
568
|
-
|
569
|
-
protein_mappings = stack_bioconductor_mappings(
|
570
|
-
mappings_dict, set(BIOC_PROTEIN_ONTOLOGIES)
|
571
|
-
)
|
572
|
-
|
573
|
-
# apply greedy graph-based clustering to connect proteins with a common mapping to entrez
|
574
|
-
edgelist_df = utils.format_identifiers_as_edgelist(
|
575
|
-
protein_mappings, [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
|
576
|
-
)
|
577
|
-
connected_indices = utils.find_weakly_connected_subgraphs(
|
578
|
-
edgelist_df[["ind", "id"]]
|
579
|
-
)
|
580
|
-
|
581
|
-
# add clusters to proteins. Each cluster will be a distinct molecular species
|
582
|
-
protein_mappings_w_clusters = protein_mappings.reset_index().merge(
|
583
|
-
connected_indices
|
584
|
-
)
|
585
|
-
|
586
|
-
# combine entrez + cluster so we can pass cluster to non-protein attributes
|
587
|
-
entrez_clusters = protein_mappings_w_clusters[
|
588
|
-
[ONTOLOGIES.NCBI_ENTREZ_GENE, "cluster"]
|
589
|
-
].drop_duplicates()
|
590
|
-
other_ontologies = BIOC_DOGMATIC_MAPPING_ONTOLOGIES.difference(
|
591
|
-
set(BIOC_PROTEIN_ONTOLOGIES)
|
592
|
-
)
|
593
|
-
other_mappings = stack_bioconductor_mappings(mappings_dict, other_ontologies)
|
594
|
-
other_mappings_w_clusters = entrez_clusters.merge(
|
595
|
-
other_mappings, left_on=ONTOLOGIES.NCBI_ENTREZ_GENE, right_index=True
|
596
|
-
)
|
597
|
-
|
598
|
-
possible_names = pd.concat(
|
599
|
-
[
|
600
|
-
protein_mappings_w_clusters.query(
|
601
|
-
"ontology in @BIOC_NAME_ONTOLOGIES.keys()"
|
602
|
-
),
|
603
|
-
other_mappings_w_clusters.query("ontology in @BIOC_NAME_ONTOLOGIES.keys()"),
|
604
|
-
]
|
605
|
-
)[["cluster", IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]]
|
606
|
-
|
607
|
-
possible_names.loc[:, "ontology_preference"] = possible_names[
|
608
|
-
IDENTIFIERS.ONTOLOGY
|
609
|
-
].map(BIOC_NAME_ONTOLOGIES)
|
610
|
-
|
611
|
-
# remove possible names which are present in multiple clusters.
|
612
|
-
# all clusters will need unique names to use sbml_dfs_from_edgelist()
|
613
|
-
id_counts = (
|
614
|
-
possible_names[["cluster", IDENTIFIERS.IDENTIFIER]]
|
615
|
-
.drop_duplicates()
|
616
|
-
.value_counts(IDENTIFIERS.IDENTIFIER)
|
617
|
-
)
|
618
|
-
possible_names = possible_names[
|
619
|
-
~possible_names[IDENTIFIERS.IDENTIFIER].isin(
|
620
|
-
id_counts[id_counts > 1].index.tolist()
|
621
|
-
)
|
622
|
-
]
|
623
|
-
|
624
|
-
s_name_series = (
|
625
|
-
utils._add_nameness_score(possible_names, IDENTIFIERS.IDENTIFIER)
|
626
|
-
.sort_values(["ontology_preference", "nameness_score"])
|
627
|
-
.groupby("cluster")
|
628
|
-
.first()
|
629
|
-
.rename(columns={IDENTIFIERS.IDENTIFIER: SBML_DFS.S_NAME})[SBML_DFS.S_NAME]
|
630
|
-
)
|
631
|
-
|
632
|
-
protein_ids = protein_mappings_w_clusters.assign(bqb=BQB.IS)[
|
633
|
-
["cluster", IDENTIFIERS.IDENTIFIER, IDENTIFIERS.ONTOLOGY, IDENTIFIERS.BQB]
|
634
|
-
]
|
635
|
-
gene_ids = other_mappings_w_clusters.query(
|
636
|
-
"ontology in @BIOC_GENE_ONTOLOGIES"
|
637
|
-
).assign(bqb=BQB.IS_ENCODED_BY)[
|
638
|
-
["cluster", IDENTIFIERS.IDENTIFIER, IDENTIFIERS.ONTOLOGY, IDENTIFIERS.BQB]
|
639
|
-
]
|
640
|
-
entrez_ids = entrez_clusters.assign(
|
641
|
-
ontology=ONTOLOGIES.NCBI_ENTREZ_GENE, bqb=BQB.IS_ENCODED_BY
|
642
|
-
).rename(columns={ONTOLOGIES.NCBI_ENTREZ_GENE: IDENTIFIERS.IDENTIFIER})[
|
643
|
-
["cluster", IDENTIFIERS.IDENTIFIER, IDENTIFIERS.ONTOLOGY, IDENTIFIERS.BQB]
|
644
|
-
]
|
645
|
-
|
646
|
-
# combine all ids to setup a single cluster-level Identifiers
|
647
|
-
all_ids = pd.concat([protein_ids, gene_ids, entrez_ids])
|
648
|
-
all_ids.loc[:, IDENTIFIERS.URL] = [
|
649
|
-
identifiers.create_uri_url(x, y)
|
650
|
-
for x, y in zip(all_ids[IDENTIFIERS.ONTOLOGY], all_ids[IDENTIFIERS.IDENTIFIER])
|
651
|
-
]
|
652
|
-
|
653
|
-
# create one Identifiers object for each new species
|
654
|
-
cluster_consensus_identifiers = {
|
655
|
-
k: identifiers.Identifiers(
|
656
|
-
list(
|
657
|
-
v[
|
658
|
-
[
|
659
|
-
IDENTIFIERS.ONTOLOGY,
|
660
|
-
IDENTIFIERS.IDENTIFIER,
|
661
|
-
IDENTIFIERS.URL,
|
662
|
-
IDENTIFIERS.BQB,
|
663
|
-
]
|
664
|
-
]
|
665
|
-
.reset_index(drop=True)
|
666
|
-
.T.to_dict()
|
667
|
-
.values()
|
668
|
-
)
|
669
|
-
)
|
670
|
-
for k, v in all_ids.groupby("cluster")
|
671
|
-
}
|
672
|
-
|
673
|
-
cluster_consensus_identifiers_df = pd.DataFrame(
|
674
|
-
cluster_consensus_identifiers, index=[SBML_DFS.S_IDENTIFIERS]
|
675
|
-
).T
|
676
|
-
cluster_consensus_identifiers_df.index.name = "cluster"
|
677
|
-
|
678
|
-
out_dict = {
|
679
|
-
"s_name_series": s_name_series,
|
680
|
-
"cluster_consensus_identifiers_df": cluster_consensus_identifiers_df,
|
681
|
-
}
|
682
|
-
|
683
|
-
return out_dict
|
684
|
-
|
685
|
-
|
686
|
-
@warn_if_no_rpy2
|
687
|
-
def _expand_identifiers_new_entries(
|
688
|
-
sysid: str, expanded_identifiers_df: pd.DataFrame
|
689
|
-
) -> identifiers.Identifiers:
|
690
|
-
"""Expand Identifiers to include Bioconductor annotations"""
|
691
|
-
entry = expanded_identifiers_df.loc[sysid]
|
692
|
-
|
693
|
-
if type(entry) is pd.Series:
|
694
|
-
sysis_id_list = [entry.to_dict()]
|
695
|
-
else:
|
696
|
-
# multiple annotations
|
697
|
-
sysis_id_list = list(entry.reset_index(drop=True).T.to_dict().values())
|
110
|
+
).set_index(ONTOLOGIES.NCBI_ENTREZ_GENE)
|
698
111
|
|
699
|
-
return
|
112
|
+
return df
|