napistu 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/__init__.py +12 -0
- napistu/__main__.py +867 -0
- napistu/consensus.py +1557 -0
- napistu/constants.py +500 -0
- napistu/gcs/__init__.py +10 -0
- napistu/gcs/constants.py +69 -0
- napistu/gcs/downloads.py +180 -0
- napistu/identifiers.py +805 -0
- napistu/indices.py +227 -0
- napistu/ingestion/__init__.py +10 -0
- napistu/ingestion/bigg.py +146 -0
- napistu/ingestion/constants.py +296 -0
- napistu/ingestion/cpr_edgelist.py +106 -0
- napistu/ingestion/identifiers_etl.py +148 -0
- napistu/ingestion/obo.py +268 -0
- napistu/ingestion/psi_mi.py +276 -0
- napistu/ingestion/reactome.py +218 -0
- napistu/ingestion/sbml.py +621 -0
- napistu/ingestion/string.py +356 -0
- napistu/ingestion/trrust.py +285 -0
- napistu/ingestion/yeast.py +147 -0
- napistu/mechanism_matching.py +597 -0
- napistu/modify/__init__.py +10 -0
- napistu/modify/constants.py +86 -0
- napistu/modify/curation.py +628 -0
- napistu/modify/gaps.py +635 -0
- napistu/modify/pathwayannot.py +1381 -0
- napistu/modify/uncompartmentalize.py +264 -0
- napistu/network/__init__.py +10 -0
- napistu/network/constants.py +117 -0
- napistu/network/neighborhoods.py +1594 -0
- napistu/network/net_create.py +1647 -0
- napistu/network/net_utils.py +652 -0
- napistu/network/paths.py +500 -0
- napistu/network/precompute.py +221 -0
- napistu/rpy2/__init__.py +127 -0
- napistu/rpy2/callr.py +168 -0
- napistu/rpy2/constants.py +101 -0
- napistu/rpy2/netcontextr.py +464 -0
- napistu/rpy2/rids.py +697 -0
- napistu/sbml_dfs_core.py +2216 -0
- napistu/sbml_dfs_utils.py +304 -0
- napistu/source.py +394 -0
- napistu/utils.py +943 -0
- napistu-0.1.0.dist-info/METADATA +56 -0
- napistu-0.1.0.dist-info/RECORD +77 -0
- napistu-0.1.0.dist-info/WHEEL +5 -0
- napistu-0.1.0.dist-info/entry_points.txt +2 -0
- napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
- napistu-0.1.0.dist-info/top_level.txt +2 -0
- tests/__init__.py +0 -0
- tests/conftest.py +83 -0
- tests/test_consensus.py +255 -0
- tests/test_constants.py +20 -0
- tests/test_curation.py +134 -0
- tests/test_data/__init__.py +0 -0
- tests/test_edgelist.py +20 -0
- tests/test_gcs.py +23 -0
- tests/test_identifiers.py +151 -0
- tests/test_igraph.py +353 -0
- tests/test_indices.py +88 -0
- tests/test_mechanism_matching.py +126 -0
- tests/test_net_utils.py +66 -0
- tests/test_netcontextr.py +105 -0
- tests/test_obo.py +34 -0
- tests/test_pathwayannot.py +95 -0
- tests/test_precomputed_distances.py +222 -0
- tests/test_rpy2.py +61 -0
- tests/test_sbml.py +46 -0
- tests/test_sbml_dfs_create.py +307 -0
- tests/test_sbml_dfs_utils.py +22 -0
- tests/test_sbo.py +11 -0
- tests/test_set_coverage.py +50 -0
- tests/test_source.py +67 -0
- tests/test_uncompartmentalize.py +40 -0
- tests/test_utils.py +487 -0
- tests/utils.py +30 -0
@@ -0,0 +1,628 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import os
|
4
|
+
from typing import Any
|
5
|
+
|
6
|
+
from fs import open_fs
|
7
|
+
import numpy as np
|
8
|
+
import pandas as pd
|
9
|
+
|
10
|
+
from napistu import identifiers
|
11
|
+
from napistu import sbml_dfs_core
|
12
|
+
from napistu import sbml_dfs_utils
|
13
|
+
from napistu import source
|
14
|
+
|
15
|
+
from napistu.constants import BQB
|
16
|
+
from napistu.constants import SBML_DFS
|
17
|
+
from napistu.constants import SBML_DFS_SCHEMA
|
18
|
+
from napistu.constants import IDENTIFIERS
|
19
|
+
from napistu.constants import MINI_SBO_FROM_NAME
|
20
|
+
from napistu.constants import SBOTERM_NAMES
|
21
|
+
|
22
|
+
from napistu.modify.constants import VALID_ANNOTATION_TYPES
|
23
|
+
|
24
|
+
|
25
|
+
def curate_sbml_dfs(
    curation_dir: str, sbml_dfs: sbml_dfs_core.SBML_dfs, verbose: bool = True
) -> sbml_dfs_core.SBML_dfs:
    """
    Curate SBML_dfs

    Update a pathway model using manual annotations.

    The current workflow is to:
    - annotate pathways in https://docs.google.com/spreadsheets/d/1waVXSVMOthL5QAT0PITgLMDdXGHIS50LZ2P1_F_c-6s/edit#gid=101210748
    - parse annotations into flat files using parse_manual_annotation.Rmd
    - call this function to format flat files and update a current SBML_dfs pathway model

    Params
    ------
    curation_dir: str
        Directory containing annotations generated using parse_manual_annotation.Rmd
    sbml_dfs: sbml_dfs_core.SBML_dfs
        A pathway model
    verbose: bool
        Extra reporting

    Returns
    -------
    sbml_dfs: sbml_dfs_core.SBML_dfs
        A curated pathway model

    Raises
    ------
    FileNotFoundError
        If curation_dir cannot be opened as a filesystem.
    TypeError
        If sbml_dfs or verbose have the wrong type.
    """

    # confirm the curation directory is reachable before doing any work
    try:
        open_fs(curation_dir)
    except Exception as e:
        raise FileNotFoundError(f"{curation_dir} does not exist") from e

    if not isinstance(sbml_dfs, sbml_dfs_core.SBML_dfs):
        raise TypeError(
            f"sbml_dfs was a {type(sbml_dfs)} and must be an sbml_dfs_core.SBML_dfs"
        )
    if not isinstance(verbose, bool):
        raise TypeError(f"verbose was a {type(verbose)} and must be a bool")

    curation_dict = read_pathway_curations(curation_dir)

    # drop entities flagged for removal before adding anything new
    if "remove" in curation_dict:
        invalid_entities_dict = _find_invalid_entities(
            sbml_dfs, curation_dict["remove"]
        )
        if verbose:
            removal_summary = ", ".join(
                f"{len(members)} {pk_type}s"
                for pk_type, members in invalid_entities_dict.items()
            )
            print("removing " + removal_summary)
        sbml_dfs = _remove_entities(sbml_dfs, invalid_entities_dict)

    # format curations as SBML_dfs-shaped tables and append them to the model
    new_entities = format_curations(curation_dict, sbml_dfs)
    if verbose:
        addition_summary = ", ".join(
            f"{table.shape[0]} {entity_type}"
            for entity_type, table in new_entities.items()
        )
        print("adding " + addition_summary)
    for entity_type, table in new_entities.items():
        combined = pd.concat([getattr(sbml_dfs, entity_type), table])
        setattr(sbml_dfs, entity_type, combined)
    # make sure the updated model is still internally consistent
    sbml_dfs.validate()

    return sbml_dfs
|
99
|
+
|
100
|
+
|
101
|
+
def read_pathway_curations(curation_dir: str) -> dict[str, pd.DataFrame]:
    """
    Read Pathway Curations

    Load curations that were prepared by parse_manual_annotations.Rmd

    Params
    ------
    curation_dir: str
        Directory containing annotations generated using parse_manual_annotation.Rmd

    Returns
    -------
    curations: dict
        Dictionary containing different types of annotations
    """
    with open_fs(curation_dir) as curation_fs:
        curation_files = curation_fs.listdir(".")

        # only files named <annotation_type>.tsv are recognized
        recognized_files = set(curation_files).intersection(
            {f"{annotation_type}.tsv" for annotation_type in VALID_ANNOTATION_TYPES}
        )

        curations = {}
        for filename in recognized_files:
            annotation_type = os.path.splitext(filename)[0]
            with curation_fs.open(filename) as fh:
                curations[annotation_type] = pd.read_csv(fh, sep="\t")

    return curations
|
131
|
+
|
132
|
+
|
133
|
+
def format_curations(
    curation_dict: dict[str, pd.DataFrame], sbml_dfs: sbml_dfs_core.SBML_dfs
) -> dict[str, pd.DataFrame]:
    """
    Format Curations

    Format manual curations into a set of tables that can be appended to an
    sbml_dfs's tables.

    Params
    ------
    curation_dict:
        Curations imported using read_pathway_curations
    sbml_dfs:
        A pathway model

    Returns
    -------
    new_entities: dict
        Curations formatted as sbml_dfs_core.SBML_dfs tables

    Raises
    ------
    NotImplementedError
        If the curations try to add new compartments.
    """

    # "foci" and "remove" sheets are not new entities
    new_entity_types = set(curation_dict.keys()).difference({"foci", "remove"})

    if SBML_DFS.COMPARTMENTS in new_entity_types:
        raise NotImplementedError("logic for adding compartments does not exist")

    new_entities = dict()  # type: dict[str, pd.DataFrame]

    # reorganize reaction species' annotations as a dict to allow for
    # annotations added explicitly in the curations sheet
    # and implicitly due to newly added reactions
    reaction_species_dict = dict()  # type: dict[str, pd.DataFrame | None]
    reaction_species_dict["explicit"] = _format_explicit_reaction_species(curation_dict)
    reaction_species_dict["implicit"] = None

    # create reaction species based on reaction stoichiometry
    if SBML_DFS.REACTIONS in new_entity_types:
        reaction_species_dict["implicit"] = _format_implicit_reaction_species(
            curation_dict
        )
        new_entity_types.add(SBML_DFS.REACTION_SPECIES)
        curation_dict[SBML_DFS.REACTION_SPECIES] = pd.concat(reaction_species_dict.values())  # type: ignore

        # a reaction is reversible iff its stoichiometry string uses the "<->"
        # separator (irreversible reactions use "->").
        # Bug fix: the previous implementation evaluated `.iloc[0]` for every
        # row and compared the list returned by str.split to the int 2, so
        # r_isreversible was always False for every reaction.
        curation_dict[SBML_DFS.REACTIONS][SBML_DFS.R_ISREVERSIBLE] = [
            len(stoi.split("<->")) == 2
            for stoi in curation_dict[SBML_DFS.REACTIONS][SBML_DFS.STOICHIOMETRY]
        ]

    # add in the order of compartments, species > reactions > compartmentalized_species > reaction_species
    for entity_type in SBML_DFS_SCHEMA.SCHEMA.keys():
        if entity_type not in new_entity_types:
            continue

        new_entities[entity_type] = format_curated_entities(
            entity_type, curation_dict[entity_type], new_entities, sbml_dfs  # type: ignore
        )

    return new_entities
|
203
|
+
|
204
|
+
|
205
|
+
def _find_invalid_entities(
    sbml_dfs: sbml_dfs_core.SBML_dfs, invalid_entities: pd.DataFrame
) -> dict[str, set]:
    """
    Find Invalid Entities

    Resolve removal annotations (entity names or attribute values) to the
    primary keys of the entities to be removed, then propagate removal to
    every entity that references a removed entity through a foreign key.

    Params
    ------
    sbml_dfs: sbml_dfs_core.SBML_dfs
        A pathway model
    invalid_entities: pd.DataFrame
        A table containing entities to be removed ("remove"),
        the table where the entity resides ("table") and variable used
        to find the entity ("variable")

    Returns
    -------
    invalid_entities_dict: dict
        A dictionary containing the primary keys of invalid entities

    Raises
    ------
    ValueError
        If an annotation references a table, variable, or entity that does
        not exist in the model.
    """

    # every referenced table must exist in the model's schema
    referenced_tables = invalid_entities["table"].unique().tolist()
    unknown_tables = [t for t in referenced_tables if t not in sbml_dfs.schema.keys()]
    if len(unknown_tables) > 0:
        raise ValueError(
            f"{', '.join(unknown_tables)} are not valid table names; "
            f"valid tables are {', '.join(sbml_dfs.schema.keys())}"
        )

    invalid_entities_dict = dict()  # type: dict[str, set]
    for tab in referenced_tables:
        tab_schema = sbml_dfs.schema[tab]
        searchable_vars = tab_schema["vars"] + [tab_schema["pk"]]

        # annotations targeting the table currently being evaluated
        remove_df = invalid_entities[invalid_entities["table"] == tab]
        assert isinstance(remove_df, pd.DataFrame)

        unknown_vars = (
            remove_df["variable"][~remove_df["variable"].isin(searchable_vars)]
            .unique()
            .tolist()
        )
        if len(unknown_vars) > 0:
            raise ValueError(
                f"{', '.join(unknown_vars)} are not valid variables"
                f" in the {tab} table; valid variables are {', '.join(searchable_vars)}"
            )

        # resolve each removal annotation to one or more primary keys
        tab_df = getattr(sbml_dfs, tab)
        invalid_entities_dict[tab_schema["pk"]] = set()
        flagged = invalid_entities_dict[tab_schema["pk"]]
        for _, remove_series in remove_df.iterrows():
            if remove_series["variable"] == tab_schema["pk"]:
                # the annotation names a primary key directly; verify it exists
                if remove_series["remove"] not in tab_df.index:
                    raise ValueError(
                        f"{remove_series['remove']} was not found in the index of {tab}"
                    )
                flagged.add(remove_series["remove"])
            else:
                # look up rows whose attribute value matches the annotation
                matching_entity = tab_df[
                    tab_df[remove_series["variable"]] == remove_series["remove"]
                ]
                if matching_entity.shape[0] == 0:
                    raise ValueError(
                        f"{remove_series['remove']} was not found in the {remove_series['variable']} column of {tab}"
                    )
                flagged.update(matching_entity.index.tolist())

    # iterate primary key -> foreign key relationships until no additional
    # entities are flagged for removal (fixpoint)
    candidate = invalid_entities_dict.copy()
    while True:
        candidate = _expand_entities_by_fks(sbml_dfs, candidate)
        if candidate == invalid_entities_dict:
            break
        invalid_entities_dict = candidate
        candidate = invalid_entities_dict.copy()

    return invalid_entities_dict
|
306
|
+
|
307
|
+
|
308
|
+
def _expand_entities_by_fks(sbml_dfs: sbml_dfs_core.SBML_dfs, pk_dict: dict) -> dict:
|
309
|
+
"""
|
310
|
+
Expand Entities By Foreign Keys
|
311
|
+
|
312
|
+
Starting with a dictionary of foreign keys, add all primary keys that are defined by these foreign keys
|
313
|
+
|
314
|
+
Params
|
315
|
+
------
|
316
|
+
sbml_dfs: sbml_dfs_core.SBML_dfs
|
317
|
+
A pathway model
|
318
|
+
pk_dict: dict
|
319
|
+
Dictionary where keys are types of primary keys in sbml_dfs
|
320
|
+
|
321
|
+
Returns
|
322
|
+
-------
|
323
|
+
pk_dict: dict
|
324
|
+
Input where additional primary keys may have been added
|
325
|
+
|
326
|
+
"""
|
327
|
+
|
328
|
+
for tab in sbml_dfs.schema.keys():
|
329
|
+
tab_df = getattr(sbml_dfs, tab)
|
330
|
+
tab_schema = sbml_dfs.schema[tab]
|
331
|
+
pk = tab_schema["pk"]
|
332
|
+
|
333
|
+
if "fk" in tab_schema.keys():
|
334
|
+
# check for foreign keys which are defined by primary keys
|
335
|
+
# add these to the pk_dict
|
336
|
+
for fk in tab_schema["fk"]:
|
337
|
+
if fk in pk_dict.keys():
|
338
|
+
fks = tab_df[tab_df[fk].isin(pk_dict[fk])]
|
339
|
+
if pk not in pk_dict.keys():
|
340
|
+
pk_dict[pk] = set()
|
341
|
+
for x in fks.index.tolist():
|
342
|
+
pk_dict[pk].add(x)
|
343
|
+
|
344
|
+
return pk_dict
|
345
|
+
|
346
|
+
|
347
|
+
def _remove_entities(
|
348
|
+
sbml_dfs: sbml_dfs_core.SBML_dfs, pk_dict: dict
|
349
|
+
) -> sbml_dfs_core.SBML_dfs:
|
350
|
+
"""
|
351
|
+
Remove Entities
|
352
|
+
|
353
|
+
Remove entities whose primary keys are in pk_dict
|
354
|
+
|
355
|
+
Params
|
356
|
+
------
|
357
|
+
sbml_dfs: sbml_dfs_core.SBML_dfs
|
358
|
+
A pathway model
|
359
|
+
pk_dict: dict
|
360
|
+
Dictionary where keys are types of primary keys in sbml_dfs
|
361
|
+
|
362
|
+
Returns
|
363
|
+
-------
|
364
|
+
sbml_dfs: sbml_dfs_core.SBML_dfs
|
365
|
+
Input with some entities removed
|
366
|
+
|
367
|
+
"""
|
368
|
+
|
369
|
+
for tab in sbml_dfs.schema.keys():
|
370
|
+
tab_df = getattr(sbml_dfs, tab)
|
371
|
+
tab_schema = sbml_dfs.schema[tab]
|
372
|
+
|
373
|
+
if tab_schema["pk"] in pk_dict.keys():
|
374
|
+
updated_table = tab_df[~tab_df.index.isin(pk_dict[tab_schema["pk"]])]
|
375
|
+
setattr(sbml_dfs, tab, updated_table)
|
376
|
+
|
377
|
+
return sbml_dfs
|
378
|
+
|
379
|
+
|
380
|
+
def format_curated_entities(
    entity_type: str,
    new_curated_entities: pd.DataFrame,
    new_entities: dict[str, pd.DataFrame],
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    curation_id: str = "Calico curations",
) -> pd.DataFrame:
    """
    Format Curated Entities

    Convert entities from the curation format to the structure of SBML_dfs tables.

    Note: this function mutates ``new_curated_entities`` in place (label,
    source, pk, and id columns are added) before returning a reindexed view.

    Params
    ------
    entity_type: str
        The type of entity to update (e.g., reactions, species, ...)
    new_curated_entities: pd.DataFrame
        One curation table generated using read_pathway_curations
        (must be a pd.DataFrame, as enforced by the isinstance check below)
    new_entities: dict
        Curations formatted as sbml_dfs_core.SBML_dfs tables; used so foreign
        keys can reference entities added earlier in the same curation pass
    sbml_dfs: sbml_dfs_core.SBML_dfs
        A pathway model
    curation_id: str
        Name to use as a pathway id in source.Source objects

    Returns
    -------
    new_entity_df: pd.DataFrame
        Input for entity_type formatted as an SBML_dfs table

    Raises
    ------
    TypeError
        If any argument has the wrong type.
    ValueError
        If a foreign-key merge fails for one or more curated entities.
    """

    # defensive validation of all inputs
    if not isinstance(entity_type, str):
        raise TypeError(f"entity_type was a {type(entity_type)} and must be a str")
    if not isinstance(new_curated_entities, pd.DataFrame):
        raise TypeError(
            f"new_curated_entities was a {type(new_curated_entities)} and must be a pd.DataFrame"
        )
    if not isinstance(new_entities, dict):
        raise TypeError(f"new_entities was a {type(new_entities)} and must be a dict")
    if not isinstance(sbml_dfs, sbml_dfs_core.SBML_dfs):
        raise TypeError(
            f"sbml_dfs was a {type(sbml_dfs)} and must be an sbml_dfs_core.SBML_dfs"
        )
    if not isinstance(curation_id, str):
        raise TypeError(f"curation_id was a {type(curation_id)} and must be a str")

    # per-table schema: pk, vars, and optional label/source/fk/id fields
    type_schema = sbml_dfs.schema[entity_type]

    # name the entity
    if "label" in type_schema.keys():
        # curation sheets name entities in a column matching the entity type
        new_curated_entities[type_schema["label"]] = new_curated_entities[entity_type]
    else:
        # add a temporary label to improve error messages
        new_curated_entities["label"] = [
            ", ".join(new_curated_entities.select_dtypes(include=["object"]).iloc[i])
            for i in range(0, new_curated_entities.shape[0])
        ]

    if "source" in type_schema.keys():
        new_curated_entities["curator"] = new_curated_entities["curator"].fillna(
            "unknown"
        )
        # convert curator entries to Sources
        new_curated_entities[type_schema["source"]] = [
            source.Source(
                pd.DataFrame(
                    {"model": x, "name": "custom - " + x, "pathway_id": curation_id},
                    index=[0],
                )
            )
            for x in new_curated_entities["curator"]
        ]

    # add the primary key: continue numbering after the largest existing id
    max_pk = max(
        sbml_dfs_utils.id_formatter_inv(getattr(sbml_dfs, entity_type).index.tolist())
    )
    # NOTE(review): identity comparison with np.nan assumes id_formatter_inv
    # returns the np.nan singleton when ids cannot be parsed — confirm
    if max_pk is np.nan:
        max_pk = int(-1)

    new_curated_entities[type_schema["pk"]] = sbml_dfs_utils.id_formatter(
        range(
            max_pk + 1,
            max_pk + new_curated_entities.shape[0] + 1,
        ),
        type_schema["pk"],
    )

    # add foreign keys if they exist

    if "fk" in type_schema.keys():
        # find primary keys corresponding to foreign keys, including both existing and newly added entities
        for fk in type_schema["fk"]:
            # find the table that the fk belongs to
            fk_of = [x for x, y in sbml_dfs.schema.items() if y["pk"] == fk][0]

            # pull up referenced entities table, including newly added entities
            if fk_of in new_entities.keys():
                ref_entities = pd.concat(
                    [new_entities[fk_of], getattr(sbml_dfs, fk_of)]
                )
            else:
                ref_entities = getattr(sbml_dfs, fk_of)
            key_ref_schema = sbml_dfs.schema[fk_of]
            # add primary key by joining on label
            new_curated_entities = new_curated_entities.merge(
                ref_entities[key_ref_schema["label"]].reset_index(), how="left"
            )

            # check that all fks were found; a NaN pk means the label lookup failed
            failed_join_df = new_curated_entities[
                new_curated_entities[key_ref_schema["pk"]].isna()
            ]
            if failed_join_df.shape[0] != 0:
                if "label" in type_schema.keys():
                    fail_str = "\n".join(failed_join_df[type_schema["label"]])
                else:
                    fail_str = "\n".join(failed_join_df["label"])
                raise ValueError(
                    f"{failed_join_df.shape[0]} merges of {fk_of} "
                    f"failed when updating the {entity_type} table:\n{fail_str}"
                )

    # add id where applicable
    if "id" in type_schema.keys():
        ids = list()
        for i in range(0, new_curated_entities.shape[0]):
            new_entity_series = new_curated_entities.iloc[i]

            # entities with a "uri" annotation get a real identifier;
            # otherwise a custom placeholder identifier is created
            is_identified = not new_entity_series.isna()["uri"]
            if is_identified:
                id = [
                    identifiers.format_uri(
                        new_entity_series["uri"], biological_qualifier_type=BQB.IS
                    )
                ]
            else:
                id = [
                    {
                        IDENTIFIERS.ONTOLOGY: "custom_species",
                        IDENTIFIERS.IDENTIFIER: new_entity_series[type_schema["pk"]],
                        IDENTIFIERS.BQB: BQB.IS,
                    }
                ]
            # stub the id using the entity pk
            ids.append(identifiers.Identifiers(id))

        new_curated_entities[type_schema["id"]] = ids

    # index by pk and keep only the schema-defined columns
    return new_curated_entities.set_index(type_schema["pk"])[type_schema["vars"]]
|
531
|
+
|
532
|
+
|
533
|
+
def _format_implicit_reaction_species(
    curation_dict: dict[str, pd.DataFrame],
) -> pd.DataFrame:
    """Construct reaction species which are defined in reactions' stoichiometry."""

    curated_reactions = curation_dict[SBML_DFS.REACTIONS][
        [SBML_DFS.REACTIONS, SBML_DFS.STOICHIOMETRY]
    ]

    per_reaction_tables = []
    for reaction_name, stoi_string in zip(
        curated_reactions[SBML_DFS.REACTIONS],
        curated_reactions[SBML_DFS.STOICHIOMETRY],
    ):
        # a reversible reaction uses "<->"; an irreversible one uses "->";
        # either way exactly one separator must split the string in two
        for separator in ("<->", "->"):
            halves = stoi_string.split(separator)
            if len(halves) == 2:
                break
        else:
            raise ValueError(
                f"{stoi_string} is not a valid reaction stoichiometry; "
                "there must be one and only one '->' to separate the substrates and products"
            )

        # members within one side of the reaction are separated by "++"
        substrates = [member.strip() for member in halves[0].strip().split("++")]
        products = [member.strip() for member in halves[1].strip().split("++")]

        substrate_rows = pd.DataFrame(
            [
                {
                    SBML_DFS.SC_NAME: member,
                    SBML_DFS.STOICHIOMETRY: -1,
                    SBML_DFS.SBO_TERM: MINI_SBO_FROM_NAME[SBOTERM_NAMES.REACTANT],
                }
                for member in substrates
            ]
        )
        product_rows = pd.DataFrame(
            [
                {
                    SBML_DFS.SC_NAME: member,
                    SBML_DFS.STOICHIOMETRY: 1,
                    SBML_DFS.SBO_TERM: MINI_SBO_FROM_NAME[SBOTERM_NAMES.PRODUCT],
                }
                for member in products
            ]
        )

        per_reaction_tables.append(
            pd.concat([substrate_rows, product_rows]).assign(r_name=reaction_name)
        )

    return pd.concat(per_reaction_tables)
|
590
|
+
|
591
|
+
|
592
|
+
def _format_explicit_reaction_species(
    curation_dict: dict[str, pd.DataFrame],
) -> pd.DataFrame | None:
    """Format reaction species which are directly defined among curated species."""

    if SBML_DFS.REACTION_SPECIES not in curation_dict:
        print("No explicitly curated reaction species")
        return None

    # lookup table mapping sbo_term_name -> sbo_term
    sbo_term_lookup = pd.DataFrame(MINI_SBO_FROM_NAME, index=[SBML_DFS.SBO_TERM]).T

    annotated_species = curation_dict[SBML_DFS.REACTION_SPECIES].rename(
        {SBML_DFS.REACTION_SPECIES: SBML_DFS.SC_NAME}, axis=1
    )
    annotated_species = annotated_species.merge(
        sbo_term_lookup, left_on="sbo_term_name", right_index=True, how="left"
    )

    # any row that failed the left merge carries an unrecognized sbo_term_name
    unmatched = annotated_species[annotated_species[SBML_DFS.SBO_TERM].isna()]
    if unmatched.shape[0] != 0:
        invalid_terms = unmatched["sbo_term_name"].unique().tolist()
        raise ValueError(
            f'{", ".join(invalid_terms)} are invalid entries for "sbo_term_name", '
            f'valid entries are {", ".join(sbo_term_lookup.index.tolist())}'
        )

    # there currently isn't a good way to encode evidence and curator annotations
    # as source objects for reaction_species since they lack a source object;
    # to date they have had the same source as their reaction
    return annotated_species.drop(["sbo_term_name", "evidence", "curator"], axis=1)
|