napistu 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/__init__.py +12 -0
- napistu/__main__.py +867 -0
- napistu/consensus.py +1557 -0
- napistu/constants.py +500 -0
- napistu/gcs/__init__.py +10 -0
- napistu/gcs/constants.py +69 -0
- napistu/gcs/downloads.py +180 -0
- napistu/identifiers.py +805 -0
- napistu/indices.py +227 -0
- napistu/ingestion/__init__.py +10 -0
- napistu/ingestion/bigg.py +146 -0
- napistu/ingestion/constants.py +296 -0
- napistu/ingestion/cpr_edgelist.py +106 -0
- napistu/ingestion/identifiers_etl.py +148 -0
- napistu/ingestion/obo.py +268 -0
- napistu/ingestion/psi_mi.py +276 -0
- napistu/ingestion/reactome.py +218 -0
- napistu/ingestion/sbml.py +621 -0
- napistu/ingestion/string.py +356 -0
- napistu/ingestion/trrust.py +285 -0
- napistu/ingestion/yeast.py +147 -0
- napistu/mechanism_matching.py +597 -0
- napistu/modify/__init__.py +10 -0
- napistu/modify/constants.py +86 -0
- napistu/modify/curation.py +628 -0
- napistu/modify/gaps.py +635 -0
- napistu/modify/pathwayannot.py +1381 -0
- napistu/modify/uncompartmentalize.py +264 -0
- napistu/network/__init__.py +10 -0
- napistu/network/constants.py +117 -0
- napistu/network/neighborhoods.py +1594 -0
- napistu/network/net_create.py +1647 -0
- napistu/network/net_utils.py +652 -0
- napistu/network/paths.py +500 -0
- napistu/network/precompute.py +221 -0
- napistu/rpy2/__init__.py +127 -0
- napistu/rpy2/callr.py +168 -0
- napistu/rpy2/constants.py +101 -0
- napistu/rpy2/netcontextr.py +464 -0
- napistu/rpy2/rids.py +697 -0
- napistu/sbml_dfs_core.py +2216 -0
- napistu/sbml_dfs_utils.py +304 -0
- napistu/source.py +394 -0
- napistu/utils.py +943 -0
- napistu-0.1.0.dist-info/METADATA +56 -0
- napistu-0.1.0.dist-info/RECORD +77 -0
- napistu-0.1.0.dist-info/WHEEL +5 -0
- napistu-0.1.0.dist-info/entry_points.txt +2 -0
- napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
- napistu-0.1.0.dist-info/top_level.txt +2 -0
- tests/__init__.py +0 -0
- tests/conftest.py +83 -0
- tests/test_consensus.py +255 -0
- tests/test_constants.py +20 -0
- tests/test_curation.py +134 -0
- tests/test_data/__init__.py +0 -0
- tests/test_edgelist.py +20 -0
- tests/test_gcs.py +23 -0
- tests/test_identifiers.py +151 -0
- tests/test_igraph.py +353 -0
- tests/test_indices.py +88 -0
- tests/test_mechanism_matching.py +126 -0
- tests/test_net_utils.py +66 -0
- tests/test_netcontextr.py +105 -0
- tests/test_obo.py +34 -0
- tests/test_pathwayannot.py +95 -0
- tests/test_precomputed_distances.py +222 -0
- tests/test_rpy2.py +61 -0
- tests/test_sbml.py +46 -0
- tests/test_sbml_dfs_create.py +307 -0
- tests/test_sbml_dfs_utils.py +22 -0
- tests/test_sbo.py +11 -0
- tests/test_set_coverage.py +50 -0
- tests/test_source.py +67 -0
- tests/test_uncompartmentalize.py +40 -0
- tests/test_utils.py +487 -0
- tests/utils.py +30 -0
napistu/rpy2/rids.py
ADDED
@@ -0,0 +1,697 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import logging
|
4
|
+
|
5
|
+
import pandas as pd
|
6
|
+
from napistu import consensus
|
7
|
+
from napistu import constants
|
8
|
+
from napistu import identifiers
|
9
|
+
from napistu import sbml_dfs_core
|
10
|
+
from napistu import source
|
11
|
+
from napistu import utils
|
12
|
+
from napistu.rpy2 import callr
|
13
|
+
from napistu.rpy2 import report_r_exceptions
|
14
|
+
from napistu.rpy2 import warn_if_no_rpy2
|
15
|
+
|
16
|
+
from napistu.constants import SBML_DFS
|
17
|
+
from napistu.constants import BQB
|
18
|
+
from napistu.constants import IDENTIFIERS
|
19
|
+
from napistu.constants import ONTOLOGIES
|
20
|
+
from napistu.constants import ONTOLOGY_ALIASES
|
21
|
+
from napistu.rpy2.constants import BIOC_VALID_EXPANDED_SPECIES_ONTOLOGIES
|
22
|
+
from napistu.rpy2.constants import BIOC_DOGMATIC_MAPPING_ONTOLOGIES
|
23
|
+
from napistu.rpy2.constants import BIOC_PROTEIN_ONTOLOGIES
|
24
|
+
from napistu.rpy2.constants import BIOC_NAME_ONTOLOGIES
|
25
|
+
from napistu.rpy2.constants import BIOC_GENE_ONTOLOGIES # noqa
|
26
|
+
from napistu.rpy2.constants import BIOC_NOMENCLATURE
|
27
|
+
|
28
|
+
logger = logging.getLogger(__name__)
|
29
|
+
|
30
|
+
|
31
|
+
@warn_if_no_rpy2
@report_r_exceptions
def expand_identifiers(
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    id_type: str,
    species: str,
    expanded_ontologies: list[str],
    r_paths: str | None = None,
) -> pd.Series:
    """
    Expand Identifiers

    Update a table's identifiers to include additional related ontologies

    Ontologies are pulled from the bioconductor "org" packages. This is effective, but inelegant.

    Parameters
    ----------
    sbml_dfs : SBML_dfs
        A relational pathway model built around reactions interconverting compartmentalized species.
    id_type: str
        Identifiers to expand: species, compartments, or reactions
        (only "species" is currently implemented).
    species: str
        Species name
    expanded_ontologies: list
        Ontologies to add or complete
    r_paths: str
        Path to an R packages directory

    Returns
    -------
    a pd.Series with identifiers as the index and updated Identifiers objects as values
    """

    if not isinstance(sbml_dfs, sbml_dfs_core.SBML_dfs):
        raise TypeError("sbml_dfs is not an sbml_dfs_core.SBML_dfs object")

    # pull out all identifiers as a pd.DataFrame
    all_entity_identifiers = sbml_dfs.get_identifiers(id_type)
    assert isinstance(all_entity_identifiers, pd.DataFrame)

    if id_type == "species":
        # normalize entrez-gene ontology aliases (ncbigene, ncbi_gene, ...)
        # so downstream set logic treats them as one ontology
        all_entity_identifiers = _check_species_identifiers_entrez_gene_ontology(
            all_entity_identifiers
        )

        valid_expanded_ontologies = BIOC_VALID_EXPANDED_SPECIES_ONTOLOGIES
    elif id_type in ["reactions", "compartments"]:
        raise NotImplementedError(
            f"No converters implemented to expand {id_type} annotations"
        )
    else:
        raise ValueError(f"{id_type} is an invalid id_type")

    invalid_expanded_ontologies = set(expanded_ontologies).difference(
        valid_expanded_ontologies
    )

    if len(invalid_expanded_ontologies) != 0:
        raise NotImplementedError(
            f"No converters implemented to expand {id_type} annotations to {', '.join(invalid_expanded_ontologies)}"
        )

    # find entries in valid_expanded_ontologies which are already present
    # these are the entries that will be used to expand to other ontologies
    # or fill in ontologies with incomplete annotations
    starting_ontologies = valid_expanded_ontologies.intersection(
        set(all_entity_identifiers["ontology"])
    )

    if len(starting_ontologies) == 0:
        raise ValueError(f"No ontologies with {id_type} converters are present")

    required_conversion_ontologies = set(starting_ontologies).union(
        set(expanded_ontologies)
    )

    # pull down entrez ids + mapping to other ontologies
    mapping_ontologies = required_conversion_ontologies.intersection(
        BIOC_VALID_EXPANDED_SPECIES_ONTOLOGIES
    )

    mappings_dict = create_bioconductor_mapping_tables(
        mappings=mapping_ontologies, species=species, r_paths=r_paths
    )

    # start with entrez IDs (since all other ontologies are mapped to them in the
    # bioconductor "org" packages)

    # get these values by just looking up the mappings between entrez genes and genomic loci
    running_ids = merge_bioconductor_mappings(mappings_dict, mapping_ontologies)

    # map from existing ontologies to expanded ontologies
    ontology_mappings = list()
    # starting w/
    for start in starting_ontologies:
        # ending w/
        for end in expanded_ontologies:
            if start == end:
                continue
            # one (start ontology id -> end ontology id) lookup table;
            # NaNs appear where an entrez gene lacks an id in either ontology
            lookup = (
                running_ids[[start, end]]
                .rename(columns={start: IDENTIFIERS.IDENTIFIER, end: "new_identifier"})
                .assign(ontology=start)
                .assign(new_ontology=end)
            )
            ontology_mappings.append(lookup)

    ontology_mappings_df = pd.concat(ontology_mappings).dropna()

    # old identifiers joined with new identifiers

    # first, define the names of keys and ids
    table_pk_var = sbml_dfs.schema[id_type]["pk"]
    table_id_var = sbml_dfs.schema[id_type]["id"]

    # retain bqb terms to define how an identifier is related to sid
    # this relation will be preserved for the new ids

    # merge on (ontology, identifier) — the shared columns with ontology_mappings_df
    merged_identifiers = all_entity_identifiers[
        [
            table_pk_var,
            IDENTIFIERS.ONTOLOGY,
            IDENTIFIERS.IDENTIFIER,
            IDENTIFIERS.BQB,
        ]
    ].merge(ontology_mappings_df)

    # new, possibly redundant identifiers
    new_identifiers = merged_identifiers[
        [table_pk_var, "new_ontology", "new_identifier", IDENTIFIERS.BQB]
    ].rename(
        columns={
            "new_ontology": IDENTIFIERS.ONTOLOGY,
            "new_identifier": IDENTIFIERS.IDENTIFIER,
        }
    )

    # union of pre-existing and newly derived identifiers; originals are listed
    # first so groupby().first() below prefers them (keeping their URL) over
    # the new rows, which carry no URL
    expanded_identifiers_df = (
        pd.concat(
            [
                all_entity_identifiers[
                    [
                        table_pk_var,
                        IDENTIFIERS.ONTOLOGY,
                        IDENTIFIERS.IDENTIFIER,
                        IDENTIFIERS.URL,
                        IDENTIFIERS.BQB,
                    ]
                ],
                new_identifiers,
                # ignore new identifier if it already exists
            ]
        )
        # remove duplicated identifiers
        .groupby([table_pk_var, IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER])
        .first()
        .reset_index()
        .set_index(table_pk_var)
    )

    # create a dictionary of new Identifiers objects
    expanded_identifiers_dict = {
        i: _expand_identifiers_new_entries(i, expanded_identifiers_df)
        for i in expanded_identifiers_df.index.unique()
    }

    output = pd.Series(expanded_identifiers_dict).rename(table_id_var)
    output.index.name = table_pk_var

    return output
202
|
+
|
203
|
+
|
204
|
+
@warn_if_no_rpy2
@report_r_exceptions
def create_bioconductor_mapping_tables(
    mappings: set[str], species: str, r_paths: str | None = None
) -> dict[str, pd.DataFrame]:
    """
    Create Bioconductor Mapping Tables

    Creating a dictionary of mappings between entrez and other ontologies.

    Args:
        mappings (set):
            A set of ontologies to work with. The valid ontologies are:
            "ensembl_gene", "ensembl_transcript", and "uniprot".
        species (str):
            The organismal species that we are working with (e.g., Homo sapiens).
        r_paths (str, optional):
            Optional path to a library of R packages.

    Returns:
        mappings_dict (dict):
            A table of entrez ids, and tables mapping from each ontology in "mappings" to entrez.

    Raises:
        ValueError: if any requested mapping is not a supported ontology.
    """

    assert isinstance(mappings, set)
    assert isinstance(species, str)

    logger.info(
        f"Creating mapping tables from entrez genes to/from {', '.join(mappings)}"
    )

    invalid_mappings = set(mappings).difference(BIOC_VALID_EXPANDED_SPECIES_ONTOLOGIES)

    if len(invalid_mappings) > 0:
        raise ValueError(
            f"{len(invalid_mappings)} mappings could not be created: {', '.join(invalid_mappings)}.\n"
            f"The valid mappings are {', '.join(BIOC_VALID_EXPANDED_SPECIES_ONTOLOGIES)}"
        )

    def _org_table(table_name: str, column_map: dict[str, str]) -> pd.DataFrame:
        # fetch one bioconductor "org" package table, harmonize its column
        # names, and index it by entrez gene so tables can be joined later
        return (
            callr.r_dataframe_to_pandas(
                callr.bioconductor_org_r_function(table_name, species, r_paths=r_paths)
            )
            .rename(columns=column_map)
            .set_index(ONTOLOGIES.NCBI_ENTREZ_GENE)
        )

    mappings_dict = dict()

    # all mappings are with respect to entrez. so we will always want to obtain entrez ids.
    # get these by looking up the entrez gene -> chromosome table and dropping the
    # chromosome column.
    # BUG FIX: this call previously hardcoded r_paths=None, silently ignoring a
    # user-supplied R library path for the base entrez table only.
    mappings_dict[ONTOLOGIES.NCBI_ENTREZ_GENE] = _org_table(
        BIOC_NOMENCLATURE.CHR_TBL,
        {BIOC_NOMENCLATURE.NCBI_ENTREZ_GENE: ONTOLOGIES.NCBI_ENTREZ_GENE},
    ).drop(BIOC_NOMENCLATURE.CHROMOSOME, axis=1)

    # each supported ontology maps to (org-package table, bioconductor column name);
    # every table is fetched and renamed the same way, so drive it from a table
    # instead of six copy-pasted if-blocks
    ontology_tables: dict[str, tuple[str, str]] = {
        ONTOLOGIES.ENSEMBL_GENE: (
            BIOC_NOMENCLATURE.ENSG_TBL,
            BIOC_NOMENCLATURE.ENSEMBL_GENE,
        ),
        ONTOLOGIES.ENSEMBL_TRANSCRIPT: (
            BIOC_NOMENCLATURE.ENST_TBL,
            BIOC_NOMENCLATURE.ENSEMBL_TRANSCRIPT,
        ),
        ONTOLOGIES.ENSEMBL_PROTEIN: (
            BIOC_NOMENCLATURE.ENSP_TBL,
            BIOC_NOMENCLATURE.ENSEMBL_PROTEIN,
        ),
        ONTOLOGIES.UNIPROT: (
            BIOC_NOMENCLATURE.UNIPROT_TBL,
            BIOC_NOMENCLATURE.UNIPROT,
        ),
        ONTOLOGIES.GENE_NAME: (
            BIOC_NOMENCLATURE.NAME_TBL,
            BIOC_NOMENCLATURE.GENE_NAME,
        ),
        ONTOLOGIES.SYMBOL: (
            BIOC_NOMENCLATURE.SYMBOL_TBL,
            BIOC_NOMENCLATURE.SYMBOL,
        ),
    }

    for ontology, (table_name, bioc_column) in ontology_tables.items():
        if ontology in mappings:
            # "entrez <> <ontology>"
            mappings_dict[ontology] = _org_table(
                table_name,
                {
                    BIOC_NOMENCLATURE.NCBI_ENTREZ_GENE: ONTOLOGIES.NCBI_ENTREZ_GENE,
                    bioc_column: ontology,
                },
            )

    return mappings_dict
|
363
|
+
|
364
|
+
|
365
|
+
def merge_bioconductor_mappings(
    mappings_dict: dict, mapping_ontologies: set[str]
) -> pd.DataFrame:
    """Combine multiple ontology mapping tables into one wide table.

    Starts from the entrez gene table and joins each requested ontology's
    table on the shared entrez index, then turns the index back into a
    regular column.
    """

    combined = mappings_dict[ONTOLOGIES.NCBI_ENTREZ_GENE]

    for ontology in mapping_ontologies:
        logger.debug(f"adding entries for {ontology} to running_ids")
        combined = combined.join(mappings_dict[ontology])

    return combined.reset_index()
|
381
|
+
|
382
|
+
|
383
|
+
def stack_bioconductor_mappings(
    mappings_dict: dict[str, pd.DataFrame], mapping_ontologies: set[str]
) -> pd.DataFrame:
    """
    Stack Bioconductor Mappings

    Convert a dict of mappings between entrez identifiers and other identifiers
    into a single long-format table.

    Args:
        mappings_dict (dict):
            A dictionary containing mappings between entrez and other ontologies.
        mapping_ontologies (set):
            A set of mappings to combine.

    Returns:
        mappings_df (pd.DataFrame):
            A table containing entrez_gene_id, ontology, and identifier.
    """

    # for each ontology: rename its id column to "identifier" and tag every
    # row with the ontology name, then stack all tables vertically
    stacked_frames = [
        mappings_dict[ontology]
        .assign(ontology=ontology)
        .rename(columns={ontology: "identifier"})
        for ontology in mapping_ontologies
    ]

    return pd.concat(stacked_frames)
|
411
|
+
|
412
|
+
|
413
|
+
def _check_species_identifiers_entrez_gene_ontology(
    entity_identifiers_df: pd.DataFrame,
) -> pd.DataFrame:
    """
    Check whether species ontologies contain ncbigene or ncbi_gene aliases.

    If so, replace them with the canonical ncbi_entrez_gene ontology name so
    downstream set operations treat all entrez aliases as one ontology.

    Args:
        entity_identifiers_df (pd.DataFrame):
            Identifier table with an "ontology" column.

    Returns:
        entity_identifiers_df (pd.DataFrame): table with proper gene ontology types.
        The input DataFrame is left unmodified; callers must use the return value.
    """

    intersect_gene_onto = set(entity_identifiers_df["ontology"]).intersection(
        ONTOLOGY_ALIASES.NCBI_ENTREZ_GENE
    )

    # if entity_identifiers_df contains members of ENTREZ_ONTOLOGY_ALIASES,
    # replace to ncbi_entrez_gene
    if intersect_gene_onto:
        logger.info(
            f" Replace unmatching ontology {', '.join(intersect_gene_onto)} to {ONTOLOGIES.NCBI_ENTREZ_GENE}."
        )

        # BUG FIX: previously the caller's DataFrame was mutated in place via
        # .loc assignment; work on a copy so this function has no side effects
        entity_identifiers_df = entity_identifiers_df.copy()

        alias_rows = entity_identifiers_df["ontology"].isin(list(intersect_gene_onto))
        entity_identifiers_df.loc[alias_rows, "ontology"] = (
            ONTOLOGIES.NCBI_ENTREZ_GENE
        )

    return entity_identifiers_df
|
442
|
+
|
443
|
+
|
444
|
+
def update_expanded_identifiers(
    model: sbml_dfs_core.SBML_dfs, id_type: str, expanded_ids: pd.Series
) -> sbml_dfs_core.SBML_dfs:
    """Update the expanded identifiers for a model.

    Overwrites the identifier column of the model's `id_type` table with the
    expanded Identifiers objects produced by expand_identifiers(). The model
    is modified in place (via setattr) and also returned.

    Args:
        model (sbml_dfs_core.SBML_dfs): pathway model whose identifiers are updated.
        id_type (str): name of the model attribute/table to update (e.g., "species").
            NOTE(review): the body hardcodes SBML_DFS.S_IDENTIFIERS, so this
            appears to only work for id_type="species" — confirm before reuse.
        expanded_ids (pd.Series): expanded Identifiers objects indexed by the
            table's primary key, as returned by expand_identifiers().
    """
    ids = getattr(model, id_type)

    # make sure expanded_ids and original model.species have same number of s_ids
    # if a s_id only in model.species, adding it to expanded_ids.
    if ids.shape[0] != expanded_ids.shape[0]:
        # fall back to the model's original Identifiers for entities that
        # expand_identifiers() did not return
        matched_expanded_ids = expanded_ids.combine_first(ids[SBML_DFS.S_IDENTIFIERS])
        logger.debug(
            f"{ids.shape[0] - expanded_ids.shape[0]} "
            "ids are not included in expanded ids"
        )
    else:
        matched_expanded_ids = expanded_ids

    # replace the identifiers column with the expanded Identifiers objects
    updated_ids = ids.drop(SBML_DFS.S_IDENTIFIERS, axis=1).join(
        pd.DataFrame(matched_expanded_ids)
    )

    setattr(model, id_type, updated_ids)

    return model
|
474
|
+
|
475
|
+
|
476
|
+
def create_dogmatic_sbml_dfs(
    species: str, r_paths: str | None = None
) -> sbml_dfs_core.SBML_dfs:
    """
    Create Dogmatic SBML_dfs

    Create an SBML_dfs model which is pretty much just proteins and no
    reactions, as well as annotations linking proteins to genes, and
    creating nice labels for genes/proteins.

    Args:
        species (str):
            An organismal species (e.g., Homo sapiens)
        r_paths (str or None):
            Optional, path to an R packages directory

    Returns:
        dogmatic_sbml_dfs (sbml.SBML_dfs)
            A pathway model which (pretty much) just contains proteins and
            diverse identifiers
    """

    # BUG FIX: r_paths was previously not forwarded, so a custom R library
    # path was silently ignored when building the dogmatic mappings
    dogmatic_mappings = connect_dogmatic_mappings(species, r_paths=r_paths)

    logger.info("Creating inputs for sbml_dfs_from_edgelist()")

    # format entries for sbml_dfs_from_edgelist()
    species_df = dogmatic_mappings["cluster_consensus_identifiers_df"].join(
        dogmatic_mappings["s_name_series"]
    )

    # stub required but invariant variables
    compartments_df = sbml_dfs_core._stub_compartments()
    interaction_source = source.Source(init=True)

    # interactions table. This is required to create the sbml_dfs but we'll drop the info later
    interaction_edgelist = species_df.rename(
        columns={
            "s_name": "upstream_name",
            SBML_DFS.S_IDENTIFIERS: SBML_DFS.R_IDENTIFIERS,
        }
    )
    # self-interactions: every species "reacts" with itself in one stub compartment
    interaction_edgelist["downstream_name"] = interaction_edgelist["upstream_name"]
    interaction_edgelist["upstream_compartment"] = "cellular_component"
    interaction_edgelist["downstream_compartment"] = "cellular_component"
    interaction_edgelist["r_name"] = interaction_edgelist["upstream_name"]
    interaction_edgelist["sbo_term"] = constants.MINI_SBO_FROM_NAME["reactant"]
    interaction_edgelist["r_isreversible"] = False

    dogmatic_sbml_dfs = sbml_dfs_core.sbml_dfs_from_edgelist(
        interaction_edgelist=interaction_edgelist,
        species_df=species_df,
        compartments_df=compartments_df,
        interaction_source=interaction_source,
        upstream_stoichiometry=-1,
        downstream_stoichiometry=1,
        downstream_sbo_name="product",
    )

    # remove all reactions except 1 (so it still passes sbml_dfs.validate())
    # this self reaction will be removed when creating the graph
    dogmatic_sbml_dfs.remove_reactions(dogmatic_sbml_dfs.reactions.index.tolist()[1::])

    return dogmatic_sbml_dfs
|
540
|
+
|
541
|
+
|
542
|
+
def connect_dogmatic_mappings(species: str, r_paths: str | None = None) -> dict:
    """
    Connect Dogmatic Mappings

    Merge all ontologies into greedy clusters based on shared associations to entrez ids

    Args:
        species (str):
            An organismal species (e.g., Homo sapiens)
        r_paths (str or None):
            Optional, path to an R packages directory

    Returns:
        dict with:
        - s_name_series: a series where the index is distinct molecular species and the values are names.
        - cluster_consensus_identifiers_df: a pd.DataFrame where the index is distinct molecular species
          and values are identifiers objects.
    """

    mappings_dict = create_bioconductor_mapping_tables(
        mappings=BIOC_DOGMATIC_MAPPING_ONTOLOGIES,
        species=species,
        r_paths=r_paths,
    )

    # long-format table of protein-level annotations keyed by entrez gene
    protein_mappings = stack_bioconductor_mappings(
        mappings_dict, set(BIOC_PROTEIN_ONTOLOGIES)
    )

    # apply greedy graph-based clustering to connect proteins with a common mapping to entrez
    edgelist_df = utils.format_identifiers_as_edgelist(
        protein_mappings, [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
    )
    connected_indices = utils.find_weakly_connected_subgraphs(
        edgelist_df[["ind", "id"]]
    )

    # add clusters to proteins. Each cluster will be a distinct molecular species
    protein_mappings_w_clusters = protein_mappings.reset_index().merge(
        connected_indices
    )

    # combine entrez + cluster so we can pass cluster to non-protein attributes
    entrez_clusters = protein_mappings_w_clusters[
        [ONTOLOGIES.NCBI_ENTREZ_GENE, "cluster"]
    ].drop_duplicates()
    other_ontologies = BIOC_DOGMATIC_MAPPING_ONTOLOGIES.difference(
        set(BIOC_PROTEIN_ONTOLOGIES)
    )
    other_mappings = stack_bioconductor_mappings(mappings_dict, other_ontologies)
    # attach cluster labels to non-protein (gene-level) annotations via entrez
    other_mappings_w_clusters = entrez_clusters.merge(
        other_mappings, left_on=ONTOLOGIES.NCBI_ENTREZ_GENE, right_index=True
    )

    # candidate display names: rows whose ontology is in BIOC_NAME_ONTOLOGIES
    possible_names = pd.concat(
        [
            protein_mappings_w_clusters.query(
                "ontology in @BIOC_NAME_ONTOLOGIES.keys()"
            ),
            other_mappings_w_clusters.query("ontology in @BIOC_NAME_ONTOLOGIES.keys()"),
        ]
    )[["cluster", IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]]

    # lower preference value = preferred naming ontology (used in sort below)
    possible_names.loc[:, "ontology_preference"] = possible_names[
        IDENTIFIERS.ONTOLOGY
    ].map(BIOC_NAME_ONTOLOGIES)

    # remove possible names which are present in multiple clusters.
    # all clusters will need unique names to use sbml_dfs_from_edgelist()
    id_counts = (
        possible_names[["cluster", IDENTIFIERS.IDENTIFIER]]
        .drop_duplicates()
        .value_counts(IDENTIFIERS.IDENTIFIER)
    )
    possible_names = possible_names[
        ~possible_names[IDENTIFIERS.IDENTIFIER].isin(
            id_counts[id_counts > 1].index.tolist()
        )
    ]

    # pick one name per cluster: best naming ontology first, then nameness score
    s_name_series = (
        consensus._add_nameness_score(possible_names, IDENTIFIERS.IDENTIFIER)
        .sort_values(["ontology_preference", "nameness_score"])
        .groupby("cluster")
        .first()
        .rename(columns={IDENTIFIERS.IDENTIFIER: SBML_DFS.S_NAME})[SBML_DFS.S_NAME]
    )

    # protein annotations describe the species itself (BQB.IS) ...
    protein_ids = protein_mappings_w_clusters.assign(bqb=BQB.IS)[
        ["cluster", IDENTIFIERS.IDENTIFIER, IDENTIFIERS.ONTOLOGY, IDENTIFIERS.BQB]
    ]
    # ... while gene/entrez annotations encode it (BQB.IS_ENCODED_BY)
    gene_ids = other_mappings_w_clusters.query(
        "ontology in @BIOC_GENE_ONTOLOGIES"
    ).assign(bqb=BQB.IS_ENCODED_BY)[
        ["cluster", IDENTIFIERS.IDENTIFIER, IDENTIFIERS.ONTOLOGY, IDENTIFIERS.BQB]
    ]
    entrez_ids = entrez_clusters.assign(
        ontology=ONTOLOGIES.NCBI_ENTREZ_GENE, bqb=BQB.IS_ENCODED_BY
    ).rename(columns={ONTOLOGIES.NCBI_ENTREZ_GENE: IDENTIFIERS.IDENTIFIER})[
        ["cluster", IDENTIFIERS.IDENTIFIER, IDENTIFIERS.ONTOLOGY, IDENTIFIERS.BQB]
    ]

    # combine all ids to setup a single cluster-level Identifiers
    all_ids = pd.concat([protein_ids, gene_ids, entrez_ids])
    all_ids.loc[:, IDENTIFIERS.URL] = [
        identifiers.create_uri_url(x, y)
        for x, y in zip(all_ids[IDENTIFIERS.ONTOLOGY], all_ids[IDENTIFIERS.IDENTIFIER])
    ]

    # create one Identifiers object for each new species
    cluster_consensus_identifiers = {
        k: identifiers.Identifiers(
            list(
                v[
                    [
                        IDENTIFIERS.ONTOLOGY,
                        IDENTIFIERS.IDENTIFIER,
                        IDENTIFIERS.URL,
                        IDENTIFIERS.BQB,
                    ]
                ]
                .reset_index(drop=True)
                .T.to_dict()
                .values()
            )
        )
        for k, v in all_ids.groupby("cluster")
    }

    cluster_consensus_identifiers_df = pd.DataFrame(
        cluster_consensus_identifiers, index=[SBML_DFS.S_IDENTIFIERS]
    ).T
    cluster_consensus_identifiers_df.index.name = "cluster"

    out_dict = {
        "s_name_series": s_name_series,
        "cluster_consensus_identifiers_df": cluster_consensus_identifiers_df,
    }

    return out_dict
|
682
|
+
|
683
|
+
|
684
|
+
@warn_if_no_rpy2
def _expand_identifiers_new_entries(
    sysid: str, expanded_identifiers_df: pd.DataFrame
) -> identifiers.Identifiers:
    """Expand Identifiers to include Bioconductor annotations.

    Args:
        sysid (str): primary key of a single entity in expanded_identifiers_df's index.
        expanded_identifiers_df (pd.DataFrame): identifier annotations indexed by
            the entity primary key, one row per (ontology, identifier) pair.

    Returns:
        identifiers.Identifiers: a single Identifiers object holding all of the
        entity's annotations.
    """
    entry = expanded_identifiers_df.loc[sysid]

    # .loc yields a Series when the entity has exactly one annotation and a
    # DataFrame when it has several; normalize both to a list of row dicts.
    # (isinstance replaces the non-idiomatic `type(entry) is pd.Series` check.)
    if isinstance(entry, pd.Series):
        sysis_id_list = [entry.to_dict()]
    else:
        # multiple annotations
        sysis_id_list = list(entry.reset_index(drop=True).T.to_dict().values())

    return identifiers.Identifiers(sysis_id_list)
|