napistu 0.3.5__py3-none-any.whl → 0.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/__main__.py +18 -18
- napistu/consensus.py +3 -2
- napistu/constants.py +1 -1
- napistu/context/filtering.py +2 -1
- napistu/identifiers.py +3 -6
- napistu/ingestion/bigg.py +6 -6
- napistu/ingestion/string.py +2 -1
- napistu/ingestion/yeast.py +2 -1
- napistu/matching/interactions.py +4 -4
- napistu/modify/uncompartmentalize.py +1 -1
- napistu/network/net_create.py +1 -1
- napistu/network/paths.py +1 -1
- napistu/ontologies/dogma.py +2 -1
- napistu/sbml_dfs_core.py +1282 -2169
- napistu/sbml_dfs_utils.py +1082 -143
- {napistu-0.3.5.dist-info → napistu-0.3.6.dist-info}/METADATA +2 -2
- {napistu-0.3.5.dist-info → napistu-0.3.6.dist-info}/RECORD +28 -28
- tests/conftest.py +43 -0
- tests/test_consensus.py +88 -0
- tests/test_context_filtering.py +2 -2
- tests/test_ontologies_genodexito.py +3 -0
- tests/test_ontologies_mygene.py +3 -0
- tests/test_sbml_dfs_core.py +102 -203
- tests/test_sbml_dfs_utils.py +194 -36
- {napistu-0.3.5.dist-info → napistu-0.3.6.dist-info}/WHEEL +0 -0
- {napistu-0.3.5.dist-info → napistu-0.3.6.dist-info}/entry_points.txt +0 -0
- {napistu-0.3.5.dist-info → napistu-0.3.6.dist-info}/licenses/LICENSE +0 -0
- {napistu-0.3.5.dist-info → napistu-0.3.6.dist-info}/top_level.txt +0 -0
napistu/sbml_dfs_utils.py
CHANGED
@@ -11,80 +11,356 @@ from fs import open_fs
|
|
11
11
|
import numpy as np
|
12
12
|
import pandas as pd
|
13
13
|
from napistu import utils
|
14
|
+
from napistu import identifiers
|
14
15
|
from napistu import indices
|
15
16
|
|
16
|
-
from napistu import
|
17
|
+
from napistu.constants import BQB
|
17
18
|
from napistu.constants import SBML_DFS
|
19
|
+
from napistu.constants import SBML_DFS_SCHEMA
|
18
20
|
from napistu.constants import IDENTIFIERS
|
19
21
|
from napistu.constants import BQB_DEFINING_ATTRS
|
20
22
|
from napistu.constants import BQB_DEFINING_ATTRS_LOOSE
|
23
|
+
from napistu.constants import REQUIRED_REACTION_FROMEDGELIST_COLUMNS
|
24
|
+
from napistu.constants import INTERACTION_EDGELIST_EXPECTED_VARS
|
25
|
+
from napistu.constants import SBO_ROLES_DEFS
|
26
|
+
from napistu.constants import MINI_SBO_FROM_NAME
|
27
|
+
from napistu.constants import MINI_SBO_TO_NAME
|
28
|
+
from napistu.constants import SBO_NAME_TO_ROLE
|
29
|
+
from napistu.constants import ONTOLOGIES
|
30
|
+
from napistu.ingestion.constants import VALID_COMPARTMENTS
|
31
|
+
from napistu.ingestion.constants import COMPARTMENTS_GO_TERMS
|
32
|
+
from napistu.ingestion.constants import GENERIC_COMPARTMENT
|
21
33
|
|
22
34
|
logger = logging.getLogger(__name__)
|
23
35
|
|
24
36
|
|
25
|
-
|
37
|
+
# =============================================================================
|
38
|
+
# PUBLIC FUNCTIONS (ALPHABETICAL ORDER)
|
39
|
+
# =============================================================================
|
40
|
+
|
41
|
+
|
42
|
+
def adapt_pw_index(
|
43
|
+
source: str | indices.PWIndex,
|
44
|
+
species: str | Iterable[str] | None,
|
45
|
+
outdir: str | None = None,
|
46
|
+
) -> indices.PWIndex:
|
47
|
+
"""Adapts a pw_index
|
48
|
+
|
49
|
+
Helpful to filter for species before reconstructing.
|
50
|
+
|
51
|
+
Args:
|
52
|
+
source (str | PWIndex): uri for pw_index.csv file or PWIndex object
|
53
|
+
species (str):
|
54
|
+
outdir (str | None, optional): Optional directory to write pw_index to.
|
55
|
+
Defaults to None.
|
56
|
+
|
57
|
+
Returns:
|
58
|
+
indices.PWIndex: Filtered pw index
|
26
59
|
"""
|
27
|
-
|
60
|
+
if isinstance(source, str):
|
61
|
+
pw_index = indices.PWIndex(source)
|
62
|
+
elif isinstance(source, indices.PWIndex):
|
63
|
+
pw_index = copy.deepcopy(source)
|
64
|
+
else:
|
65
|
+
raise ValueError("'source' needs to be str or PWIndex.")
|
66
|
+
pw_index.filter(species=species)
|
28
67
|
|
29
|
-
|
30
|
-
|
68
|
+
if outdir is not None:
|
69
|
+
with open_fs(outdir, create=True) as fs:
|
70
|
+
with fs.open("pw_index.tsv", "w") as f:
|
71
|
+
pw_index.index.to_csv(f, sep="\t")
|
72
|
+
return pw_index
|
73
|
+
|
74
|
+
|
75
|
+
def add_sbo_role(reaction_species: pd.DataFrame) -> pd.DataFrame:
|
76
|
+
"""
|
77
|
+
Add an sbo_role column to the reaction_species table.
|
78
|
+
|
79
|
+
The sbo_role column is a string column that contains the SBO role of the reaction species.
|
80
|
+
The values in the sbo_role column are taken from the sbo_term column.
|
81
|
+
|
82
|
+
The sbo_role column is added to the reaction_species table by mapping the sbo_term column to the SBO_NAME_TO_ROLE dictionary.
|
83
|
+
"""
|
84
|
+
|
85
|
+
validate_sbml_dfs_table(reaction_species, SBML_DFS.REACTION_SPECIES)
|
86
|
+
|
87
|
+
reaction_species = (
|
88
|
+
reaction_species.assign(sbo_role=reaction_species[SBML_DFS.SBO_TERM])
|
89
|
+
.replace({SBO_ROLES_DEFS.SBO_ROLE: MINI_SBO_TO_NAME})
|
90
|
+
.replace({SBO_ROLES_DEFS.SBO_ROLE: SBO_NAME_TO_ROLE})
|
91
|
+
)
|
92
|
+
|
93
|
+
undefined_roles = set(reaction_species[SBO_ROLES_DEFS.SBO_ROLE].unique()) - set(
|
94
|
+
SBO_NAME_TO_ROLE.values()
|
95
|
+
)
|
96
|
+
if len(undefined_roles) > 0:
|
97
|
+
logger.warning(
|
98
|
+
f"The following SBO roles are not defined: {undefined_roles}. They will be treated as {SBO_ROLES_DEFS.OPTIONAL} when determining reaction operability."
|
99
|
+
)
|
100
|
+
mask = reaction_species[SBO_ROLES_DEFS.SBO_ROLE].isin(undefined_roles)
|
101
|
+
reaction_species.loc[mask, SBO_ROLES_DEFS.SBO_ROLE] = SBO_ROLES_DEFS.OPTIONAL
|
102
|
+
|
103
|
+
return reaction_species
|
104
|
+
|
105
|
+
|
106
|
+
def check_entity_data_index_matching(sbml_dfs, table):
|
107
|
+
"""
|
108
|
+
Update the input smbl_dfs's entity_data (dict) index
|
109
|
+
with match_entitydata_index_to_entity,
|
110
|
+
so that index for dataframe(s) in entity_data (dict) matches the sbml_dfs'
|
111
|
+
corresponding entity, and then passes sbml_dfs.validate()
|
112
|
+
Args
|
113
|
+
sbml_dfs (cpr.SBML_dfs): a cpr.SBML_dfs
|
114
|
+
table (str): table whose data is being consolidates (currently species or reactions)
|
115
|
+
Returns
|
116
|
+
sbml_dfs (cpr.SBML_dfs):
|
117
|
+
sbml_dfs whose entity_data is checked to have the same index
|
118
|
+
as the corresponding entity.
|
119
|
+
"""
|
120
|
+
|
121
|
+
table_data = table + "_data"
|
122
|
+
|
123
|
+
entity_data_dict = getattr(sbml_dfs, table_data)
|
124
|
+
entity_schema = sbml_dfs.schema[table]
|
125
|
+
sbml_dfs_entity = getattr(sbml_dfs, table)
|
126
|
+
|
127
|
+
if entity_data_dict != {}:
|
128
|
+
entity_data_types = set.union(set(entity_data_dict.keys()))
|
129
|
+
|
130
|
+
entity_data_dict_checked = {
|
131
|
+
x: match_entitydata_index_to_entity(
|
132
|
+
entity_data_dict, x, sbml_dfs_entity, entity_schema, table
|
133
|
+
)
|
134
|
+
for x in entity_data_types
|
135
|
+
}
|
136
|
+
|
137
|
+
if table == SBML_DFS.REACTIONS:
|
138
|
+
sbml_dfs.reactions_data = entity_data_dict_checked
|
139
|
+
elif table == SBML_DFS.SPECIES:
|
140
|
+
sbml_dfs.species_data = entity_data_dict_checked
|
141
|
+
|
142
|
+
return sbml_dfs
|
143
|
+
|
144
|
+
|
145
|
+
def construct_formula_string(
|
146
|
+
reaction_species_df: pd.DataFrame,
|
147
|
+
reactions_df: pd.DataFrame,
|
148
|
+
name_var: str,
|
149
|
+
) -> str:
|
150
|
+
"""
|
151
|
+
Construct Formula String
|
152
|
+
|
153
|
+
Convert a table of reaction species into a formula string
|
31
154
|
|
32
155
|
Parameters:
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
156
|
+
----------
|
157
|
+
reaction_species_df: pd.DataFrame
|
158
|
+
Table containing a reactions' species
|
159
|
+
reactions_df: pd.DataFrame
|
160
|
+
smbl.reactions
|
161
|
+
name_var: str
|
162
|
+
Name used to label species
|
37
163
|
|
38
164
|
Returns:
|
39
|
-
|
40
|
-
|
165
|
+
----------
|
166
|
+
formula_str: str
|
167
|
+
String representation of a reactions substrates, products and
|
168
|
+
modifiers
|
41
169
|
|
42
170
|
"""
|
43
171
|
|
44
|
-
|
45
|
-
|
172
|
+
reaction_species_df["label"] = [
|
173
|
+
_add_stoi_to_species_name(x, y)
|
174
|
+
for x, y in zip(
|
175
|
+
reaction_species_df[SBML_DFS.STOICHIOMETRY], reaction_species_df[name_var]
|
176
|
+
)
|
177
|
+
]
|
178
|
+
|
179
|
+
rxn_reversible = bool(
|
180
|
+
reactions_df.loc[
|
181
|
+
reaction_species_df[SBML_DFS.R_ID].iloc[0], SBML_DFS.R_ISREVERSIBLE
|
182
|
+
]
|
183
|
+
) # convert from a np.bool_ to bool if needed
|
184
|
+
if not isinstance(rxn_reversible, bool):
|
185
|
+
raise TypeError(
|
186
|
+
f"rxn_reversible must be a bool, but got {type(rxn_reversible).__name__}"
|
187
|
+
)
|
46
188
|
|
47
|
-
|
48
|
-
|
189
|
+
if rxn_reversible:
|
190
|
+
arrow_type = " <-> "
|
191
|
+
else:
|
192
|
+
arrow_type = " -> "
|
193
|
+
|
194
|
+
substrates = " + ".join(
|
195
|
+
reaction_species_df["label"][
|
196
|
+
reaction_species_df[SBML_DFS.STOICHIOMETRY] < 0
|
197
|
+
].tolist()
|
198
|
+
)
|
199
|
+
products = " + ".join(
|
200
|
+
reaction_species_df["label"][
|
201
|
+
reaction_species_df[SBML_DFS.STOICHIOMETRY] > 0
|
202
|
+
].tolist()
|
203
|
+
)
|
204
|
+
modifiers = " + ".join(
|
205
|
+
reaction_species_df["label"][
|
206
|
+
reaction_species_df[SBML_DFS.STOICHIOMETRY] == 0
|
207
|
+
].tolist()
|
208
|
+
)
|
209
|
+
if modifiers != "":
|
210
|
+
modifiers = f" ---- modifiers: {modifiers}]"
|
211
|
+
|
212
|
+
return f"{substrates}{arrow_type}{products}{modifiers}"
|
213
|
+
|
214
|
+
|
215
|
+
def find_underspecified_reactions(
|
216
|
+
reaction_species_w_roles: pd.DataFrame,
|
217
|
+
) -> pd.DataFrame:
|
218
|
+
|
219
|
+
# check that both sbo_role and "new" are present
|
220
|
+
if SBO_ROLES_DEFS.SBO_ROLE not in reaction_species_w_roles.columns:
|
49
221
|
raise ValueError(
|
50
|
-
|
51
|
-
|
222
|
+
"The sbo_role column is not present in the reaction_species_w_roles table. Please call sbml_dfs_utils.add_sbo_role() first."
|
223
|
+
)
|
224
|
+
if "new" not in reaction_species_w_roles.columns:
|
225
|
+
raise ValueError(
|
226
|
+
"The new column is not present in the reaction_species_w_roles table. This should indicate what cspecies would be preserved in the reaction should it be preserved."
|
227
|
+
)
|
228
|
+
# check that new is a boolean column
|
229
|
+
if reaction_species_w_roles["new"].dtype != bool:
|
230
|
+
raise ValueError(
|
231
|
+
"The new column is not a boolean column. Please ensure that the new column is a boolean column. This should indicate what cspecies would be preserved in the reaction should it be preserved."
|
52
232
|
)
|
53
233
|
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
df = df.explode(id_var)
|
60
|
-
# Unnest the dict into a dataframe
|
61
|
-
df = pd.DataFrame(df[id_var].values.tolist(), index=df.index)
|
62
|
-
# Add the entry number as an index
|
63
|
-
df["entry"] = df.groupby(df.index).cumcount()
|
64
|
-
df.set_index("entry", append=True, inplace=True)
|
65
|
-
return df
|
234
|
+
reactions_with_lost_defining_members = set(
|
235
|
+
reaction_species_w_roles.query("~new")
|
236
|
+
.query("sbo_role == 'DEFINING'")[SBML_DFS.R_ID]
|
237
|
+
.tolist()
|
238
|
+
)
|
66
239
|
|
240
|
+
N_reactions_with_lost_defining_members = len(reactions_with_lost_defining_members)
|
241
|
+
if N_reactions_with_lost_defining_members > 0:
|
242
|
+
logger.info(
|
243
|
+
f"Removing {N_reactions_with_lost_defining_members} reactions which have lost at least one defining species"
|
244
|
+
)
|
67
245
|
|
68
|
-
|
69
|
-
|
70
|
-
|
246
|
+
# find the cases where all "new" values for a given (r_id, sbo_term) are False
|
247
|
+
reactions_with_lost_requirements = set(
|
248
|
+
reaction_species_w_roles
|
249
|
+
# drop already filtered reactions
|
250
|
+
.query("r_id not in @reactions_with_lost_defining_members")
|
251
|
+
.query("sbo_role == 'REQUIRED'")
|
252
|
+
# which entries which have some required attribute have all False values for that attribute
|
253
|
+
.groupby([SBML_DFS.R_ID, SBML_DFS.SBO_TERM])
|
254
|
+
.agg({"new": "any"})
|
255
|
+
.query("new == False")
|
256
|
+
.index.get_level_values(SBML_DFS.R_ID)
|
257
|
+
)
|
71
258
|
|
259
|
+
N_reactions_with_lost_requirements = len(reactions_with_lost_requirements)
|
260
|
+
if N_reactions_with_lost_requirements > 0:
|
261
|
+
logger.info(
|
262
|
+
f"Removing {N_reactions_with_lost_requirements} reactions which have lost all required members"
|
263
|
+
)
|
72
264
|
|
73
|
-
|
265
|
+
underspecified_reactions = reactions_with_lost_defining_members.union(
|
266
|
+
reactions_with_lost_requirements
|
267
|
+
)
|
268
|
+
|
269
|
+
return underspecified_reactions
|
270
|
+
|
271
|
+
|
272
|
+
def filter_to_characteristic_species_ids(
|
273
|
+
species_ids: pd.DataFrame,
|
274
|
+
max_complex_size: int = 4,
|
275
|
+
max_promiscuity: int = 20,
|
276
|
+
defining_biological_qualifiers: list[str] = BQB_DEFINING_ATTRS,
|
277
|
+
) -> pd.DataFrame:
|
74
278
|
"""
|
75
|
-
|
279
|
+
Filter to Characteristic Species IDs
|
280
|
+
|
281
|
+
Remove identifiers corresponding to one component within a large protein
|
282
|
+
complexes and non-characteristic annotations such as pubmed references and
|
283
|
+
homologues.
|
284
|
+
|
285
|
+
Parameters
|
286
|
+
----------
|
287
|
+
species_ids: pd.DataFrame
|
288
|
+
A table of identifiers produced by sdbml_dfs.get_identifiers("species")
|
289
|
+
max_complex_size: int
|
290
|
+
The largest size of a complex, where BQB_HAS_PART terms will be retained.
|
291
|
+
In most cases, complexes are handled with specific formation and
|
292
|
+
dissolutation reactions,but these identifiers will be pulled in when
|
293
|
+
searching by identifiers or searching the identifiers associated with a
|
294
|
+
species against an external resource such as Open Targets.
|
295
|
+
max_promiscuity: int
|
296
|
+
Maximum number of species where a single molecule can act as a
|
297
|
+
BQB_HAS_PART component associated with a single identifier (and common ontology).
|
298
|
+
defining_biological_qualifiers (list[str]):
|
299
|
+
BQB codes which define distinct entities. Narrowly this would be BQB_IS, while more
|
300
|
+
permissive settings would include homologs, different forms of the same gene.
|
301
|
+
|
302
|
+
Returns:
|
303
|
+
--------
|
304
|
+
species_id: pd.DataFrame
|
305
|
+
Input species filtered to characteristic identifiers
|
76
306
|
|
77
|
-
Convert from internal IDs back to integer IDs
|
78
307
|
"""
|
79
308
|
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
else:
|
85
|
-
id_val.append(np.nan) # type: ignore
|
309
|
+
if not isinstance(species_ids, pd.DataFrame):
|
310
|
+
raise TypeError(
|
311
|
+
f"species_ids was a {type(species_ids)} but must be a pd.DataFrame"
|
312
|
+
)
|
86
313
|
|
87
|
-
|
314
|
+
if not isinstance(max_complex_size, int):
|
315
|
+
raise TypeError(
|
316
|
+
f"max_complex_size was a {type(max_complex_size)} but must be an int"
|
317
|
+
)
|
318
|
+
|
319
|
+
if not isinstance(max_promiscuity, int):
|
320
|
+
raise TypeError(
|
321
|
+
f"max_promiscuity was a {type(max_promiscuity)} but must be an int"
|
322
|
+
)
|
323
|
+
|
324
|
+
if not isinstance(defining_biological_qualifiers, list):
|
325
|
+
raise TypeError(
|
326
|
+
f"defining_biological_qualifiers was a {type(defining_biological_qualifiers)} but must be a list"
|
327
|
+
)
|
328
|
+
|
329
|
+
# primary annotations of a species
|
330
|
+
bqb_is_species = species_ids.query("bqb in @defining_biological_qualifiers")
|
331
|
+
|
332
|
+
# add components within modestly sized protein complexes
|
333
|
+
# look at HAS_PART IDs
|
334
|
+
bqb_has_parts_species = species_ids[species_ids[IDENTIFIERS.BQB] == BQB.HAS_PART]
|
335
|
+
|
336
|
+
# number of species in a complex
|
337
|
+
n_species_components = bqb_has_parts_species.value_counts(
|
338
|
+
[IDENTIFIERS.ONTOLOGY, SBML_DFS.S_ID]
|
339
|
+
)
|
340
|
+
big_complex_sids = set(
|
341
|
+
n_species_components[
|
342
|
+
n_species_components > max_complex_size
|
343
|
+
].index.get_level_values(SBML_DFS.S_ID)
|
344
|
+
)
|
345
|
+
|
346
|
+
filtered_bqb_has_parts = _filter_promiscuous_components(
|
347
|
+
bqb_has_parts_species, max_promiscuity
|
348
|
+
)
|
349
|
+
|
350
|
+
# drop species parts if there are many components
|
351
|
+
filtered_bqb_has_parts = filtered_bqb_has_parts[
|
352
|
+
~filtered_bqb_has_parts[SBML_DFS.S_ID].isin(big_complex_sids)
|
353
|
+
]
|
354
|
+
|
355
|
+
# combine primary identifiers and rare components
|
356
|
+
characteristic_species_ids = pd.concat(
|
357
|
+
[
|
358
|
+
bqb_is_species,
|
359
|
+
filtered_bqb_has_parts,
|
360
|
+
]
|
361
|
+
)
|
362
|
+
|
363
|
+
return characteristic_species_ids
|
88
364
|
|
89
365
|
|
90
366
|
def get_current_max_id(sbml_dfs_table: pd.DataFrame) -> int:
|
@@ -118,57 +394,26 @@ def get_current_max_id(sbml_dfs_table: pd.DataFrame) -> int:
|
|
118
394
|
return current_max_id
|
119
395
|
|
120
396
|
|
121
|
-
def
|
122
|
-
|
123
|
-
|
124
|
-
outdir: str | None = None,
|
125
|
-
) -> indices.PWIndex:
|
126
|
-
"""Adapts a pw_index
|
127
|
-
|
128
|
-
Helpful to filter for species before reconstructing.
|
397
|
+
def id_formatter(id_values: Iterable[Any], id_type: str, id_len: int = 8) -> list[str]:
|
398
|
+
id_prefix = utils.extract_regex_match("^([a-zA-Z]+)_id$", id_type).upper()
|
399
|
+
return [id_prefix + format(x, f"0{id_len}d") for x in id_values]
|
129
400
|
|
130
|
-
Args:
|
131
|
-
source (str | PWIndex): uri for pw_index.csv file or PWIndex object
|
132
|
-
species (str):
|
133
|
-
outdir (str | None, optional): Optional directory to write pw_index to.
|
134
|
-
Defaults to None.
|
135
401
|
|
136
|
-
|
137
|
-
indices.PWIndex: Filtered pw index
|
402
|
+
def id_formatter_inv(ids: list[str]) -> list[int]:
|
138
403
|
"""
|
139
|
-
|
140
|
-
pw_index = indices.PWIndex(source)
|
141
|
-
elif isinstance(source, indices.PWIndex):
|
142
|
-
pw_index = copy.deepcopy(source)
|
143
|
-
else:
|
144
|
-
raise ValueError("'source' needs to be str or PWIndex.")
|
145
|
-
pw_index.filter(species=species)
|
146
|
-
|
147
|
-
if outdir is not None:
|
148
|
-
with open_fs(outdir, create=True) as fs:
|
149
|
-
with fs.open("pw_index.tsv", "w") as f:
|
150
|
-
pw_index.index.to_csv(f, sep="\t")
|
151
|
-
return pw_index
|
404
|
+
ID Formatter Inverter
|
152
405
|
|
406
|
+
Convert from internal IDs back to integer IDs
|
407
|
+
"""
|
153
408
|
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
"
|
158
|
-
|
159
|
-
|
160
|
-
# preserve differences between genes, transcripts, and proteins
|
161
|
-
defining_biological_qualifiers = BQB_DEFINING_ATTRS
|
162
|
-
else:
|
163
|
-
logger.info(
|
164
|
-
"Running in non-dogmatic mode - genes, transcripts, and proteins will "
|
165
|
-
"be merged if possible."
|
166
|
-
)
|
167
|
-
# merge genes, transcripts, and proteins (if they are defined with
|
168
|
-
# bqb terms which specify their relationships).
|
169
|
-
defining_biological_qualifiers = BQB_DEFINING_ATTRS_LOOSE
|
409
|
+
id_val = list()
|
410
|
+
for an_id in ids:
|
411
|
+
if re.match("^[A-Z]+[0-9]+$", an_id):
|
412
|
+
id_val.append(int(re.sub("^[A-Z]+", "", an_id)))
|
413
|
+
else:
|
414
|
+
id_val.append(np.nan) # type: ignore
|
170
415
|
|
171
|
-
return
|
416
|
+
return id_val
|
172
417
|
|
173
418
|
|
174
419
|
def match_entitydata_index_to_entity(
|
@@ -200,7 +445,7 @@ def match_entitydata_index_to_entity(
|
|
200
445
|
if len(entity_data_df.index.difference(consensus_entity_df.index)) == 0:
|
201
446
|
logger.info(f"{data_table} ids are included in {table} ids")
|
202
447
|
else:
|
203
|
-
logger.
|
448
|
+
logger.warning(
|
204
449
|
f"{data_table} have ids are not matched to {table} ids,"
|
205
450
|
f"please check mismatched ids first"
|
206
451
|
)
|
@@ -229,79 +474,169 @@ def match_entitydata_index_to_entity(
|
|
229
474
|
return entity_data_df
|
230
475
|
|
231
476
|
|
232
|
-
def
|
233
|
-
"""
|
234
|
-
Update the input smbl_dfs's entity_data (dict) index
|
235
|
-
with match_entitydata_index_to_entity,
|
236
|
-
so that index for dataframe(s) in entity_data (dict) matches the sbml_dfs'
|
237
|
-
corresponding entity, and then passes sbml_dfs.validate()
|
238
|
-
Args
|
239
|
-
sbml_dfs (cpr.SBML_dfs): a cpr.SBML_dfs
|
240
|
-
table (str): table whose data is being consolidates (currently species or reactions)
|
241
|
-
Returns
|
242
|
-
sbml_dfs (cpr.SBML_dfs):
|
243
|
-
sbml_dfs whose entity_data is checked to have the same index
|
244
|
-
as the corresponding entity.
|
245
|
-
"""
|
477
|
+
def species_type_types(x):
|
478
|
+
"""Assign a high-level molecule type to a molecular species"""
|
246
479
|
|
247
|
-
|
480
|
+
if isinstance(x, identifiers.Identifiers):
|
481
|
+
if x.filter(["chebi"]):
|
482
|
+
return "metabolite"
|
483
|
+
elif x.filter(["molodex"]):
|
484
|
+
return "drug"
|
485
|
+
else:
|
486
|
+
return "protein"
|
487
|
+
else:
|
488
|
+
return "unknown"
|
248
489
|
|
249
|
-
entity_data_dict = getattr(sbml_dfs, table_data)
|
250
|
-
entity_schema = sbml_dfs.schema[table]
|
251
|
-
sbml_dfs_entity = getattr(sbml_dfs, table)
|
252
490
|
|
253
|
-
|
254
|
-
|
491
|
+
def stub_compartments(
|
492
|
+
stubbed_compartment: str = GENERIC_COMPARTMENT,
|
493
|
+
) -> pd.DataFrame:
|
494
|
+
"""Stub Compartments
|
255
495
|
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
496
|
+
Create a compartments table with only a single compartment
|
497
|
+
|
498
|
+
Args:
|
499
|
+
stubbed_compartment (str): the name of a compartment which should match the
|
500
|
+
keys in ingestion.constants.VALID_COMPARTMENTS and ingestion.constants.COMPARTMENTS_GO_TERMS
|
501
|
+
|
502
|
+
Returns:
|
503
|
+
compartments_df (pd.DataFrame): compartments dataframe
|
504
|
+
"""
|
505
|
+
|
506
|
+
if stubbed_compartment not in VALID_COMPARTMENTS:
|
507
|
+
raise ValueError(
|
508
|
+
f"{stubbed_compartment} is not defined in ingestion.constants.VALID_COMPARTMENTS"
|
509
|
+
)
|
510
|
+
|
511
|
+
if stubbed_compartment not in COMPARTMENTS_GO_TERMS.keys():
|
512
|
+
raise ValueError(
|
513
|
+
f"{stubbed_compartment} is not defined in ingestion.constants.COMPARTMENTS_GO_TERMS"
|
514
|
+
)
|
515
|
+
|
516
|
+
stubbed_compartment_id = COMPARTMENTS_GO_TERMS[stubbed_compartment]
|
517
|
+
|
518
|
+
formatted_uri = identifiers.format_uri(
|
519
|
+
uri=identifiers.create_uri_url(
|
520
|
+
ontology=ONTOLOGIES.GO,
|
521
|
+
identifier=stubbed_compartment_id,
|
522
|
+
),
|
523
|
+
biological_qualifier_type=BQB.IS,
|
524
|
+
)
|
525
|
+
|
526
|
+
compartments_df = pd.DataFrame(
|
527
|
+
{
|
528
|
+
SBML_DFS.C_NAME: [stubbed_compartment],
|
529
|
+
SBML_DFS.C_IDENTIFIERS: [identifiers.Identifiers([formatted_uri])],
|
261
530
|
}
|
531
|
+
)
|
532
|
+
compartments_df.index = id_formatter([0], SBML_DFS.C_ID) # type: ignore
|
533
|
+
compartments_df.index.name = SBML_DFS.C_ID
|
262
534
|
|
263
|
-
|
264
|
-
sbml_dfs.reactions_data = entity_data_dict_checked
|
265
|
-
elif table == SBML_DFS.SPECIES:
|
266
|
-
sbml_dfs.species_data = entity_data_dict_checked
|
535
|
+
return compartments_df
|
267
536
|
|
268
|
-
return sbml_dfs
|
269
537
|
|
538
|
+
def unnest_identifiers(id_table: pd.DataFrame, id_var: str) -> pd.DataFrame:
|
539
|
+
"""
|
540
|
+
Unnest Identifiers
|
541
|
+
|
542
|
+
Take a pd.DataFrame containing an array of Identifiers and
|
543
|
+
return one-row per identifier.
|
544
|
+
|
545
|
+
Parameters:
|
546
|
+
id_table: pd.DataFrame
|
547
|
+
a table containing an array of Identifiers
|
548
|
+
id_var: str
|
549
|
+
variable containing Identifiers
|
550
|
+
|
551
|
+
Returns:
|
552
|
+
pd.Dataframe containing the index of id_table but expanded
|
553
|
+
to include one row per identifier
|
270
554
|
|
271
|
-
def get_characteristic_species_ids(
|
272
|
-
sbml_dfs: sbml_dfs_core.SBML_dfs, dogmatic: bool = True
|
273
|
-
) -> pd.DataFrame:
|
274
555
|
"""
|
275
|
-
Get Characteristic Species IDs
|
276
556
|
|
277
|
-
|
557
|
+
# validate inputs
|
558
|
+
utils.match_pd_vars(id_table, {id_var}).assert_present()
|
559
|
+
|
560
|
+
N_invalid_ids = sum(id_table[id_var].isna())
|
561
|
+
if N_invalid_ids != 0:
|
562
|
+
raise ValueError(
|
563
|
+
f'{N_invalid_ids} entries in "id_table" were missing',
|
564
|
+
"entries with no identifiers should still include an Identifiers object",
|
565
|
+
)
|
566
|
+
|
567
|
+
# Get the identifier as a list of dicts
|
568
|
+
df = id_table[id_var].apply(lambda x: x.ids if len(x.ids) > 0 else 0).to_frame()
|
569
|
+
# Filter out zero length lists
|
570
|
+
df = df.query(f"{id_var} != 0")
|
571
|
+
# Unnest the list of dicts into one dict per row
|
572
|
+
df = df.explode(id_var)
|
573
|
+
# Unnest the dict into a dataframe
|
574
|
+
df = pd.DataFrame(df[id_var].values.tolist(), index=df.index)
|
575
|
+
# Add the entry number as an index
|
576
|
+
df["entry"] = df.groupby(df.index).cumcount()
|
577
|
+
df.set_index("entry", append=True, inplace=True)
|
578
|
+
return df
|
579
|
+
|
580
|
+
|
581
|
+
def validate_sbml_dfs_table(table_data: pd.DataFrame, table_name: str) -> None:
|
582
|
+
"""
|
583
|
+
Validate a standalone table against the SBML_dfs schema.
|
584
|
+
|
585
|
+
This function validates a table against the schema defined in SBML_DFS_SCHEMA,
|
586
|
+
without requiring an SBML_dfs object. Useful for validating tables before
|
587
|
+
creating an SBML_dfs object.
|
278
588
|
|
279
589
|
Parameters
|
280
590
|
----------
|
281
|
-
|
282
|
-
The
|
283
|
-
|
284
|
-
|
591
|
+
table_data : pd.DataFrame
|
592
|
+
The table to validate
|
593
|
+
table_name : str
|
594
|
+
Name of the table in the SBML_dfs schema
|
595
|
+
|
596
|
+
Raises
|
597
|
+
------
|
598
|
+
ValueError
|
599
|
+
If table_name is not in schema or validation fails
|
600
|
+
"""
|
601
|
+
if table_name not in SBML_DFS_SCHEMA.SCHEMA:
|
602
|
+
raise ValueError(
|
603
|
+
f"{table_name} is not a valid table name in SBML_DFS_SCHEMA. "
|
604
|
+
f"Valid tables are: {', '.join(SBML_DFS_SCHEMA.SCHEMA.keys())}"
|
605
|
+
)
|
285
606
|
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
607
|
+
table_schema = SBML_DFS_SCHEMA.SCHEMA[table_name]
|
608
|
+
_perform_sbml_dfs_table_validation(table_data, table_schema, table_name)
|
609
|
+
|
610
|
+
|
611
|
+
# =============================================================================
|
612
|
+
# PRIVATE FUNCTIONS (ALPHABETICAL ORDER)
|
613
|
+
# =============================================================================
|
614
|
+
|
615
|
+
|
616
|
+
def _add_stoi_to_species_name(stoi: float | int, name: str) -> str:
|
290
617
|
"""
|
618
|
+
Add Stoi To Species Name
|
291
619
|
|
292
|
-
#
|
293
|
-
defining_biological_qualifiers = _dogmatic_to_defining_bqbs(dogmatic)
|
620
|
+
Add # of molecules to a species name
|
294
621
|
|
295
|
-
|
296
|
-
|
622
|
+
Parameters:
|
623
|
+
----------
|
624
|
+
stoi: float or int
|
625
|
+
Number of molecules
|
626
|
+
name: str
|
627
|
+
Name of species
|
297
628
|
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
629
|
+
Returns:
|
630
|
+
----------
|
631
|
+
name: str
|
632
|
+
Name containing number of species
|
633
|
+
|
634
|
+
"""
|
303
635
|
|
304
|
-
|
636
|
+
if stoi in [-1, 0, 1]:
|
637
|
+
return name
|
638
|
+
else:
|
639
|
+
return str(abs(stoi)) + " " + name
|
305
640
|
|
306
641
|
|
307
642
|
def _dogmatic_to_defining_bqbs(dogmatic: bool = False) -> str:
|
@@ -325,8 +660,458 @@ def _dogmatic_to_defining_bqbs(dogmatic: bool = False) -> str:
|
|
325
660
|
return defining_biological_qualifiers
|
326
661
|
|
327
662
|
|
328
|
-
def
|
329
|
-
|
663
|
+
def _edgelist_create_compartmentalized_species(
|
664
|
+
interaction_edgelist, species_df, compartments_df, interaction_source
|
665
|
+
):
|
666
|
+
"""
|
667
|
+
Create compartmentalized species from interactions.
|
668
|
+
|
669
|
+
Parameters
|
670
|
+
----------
|
671
|
+
interaction_edgelist : pd.DataFrame
|
672
|
+
Interaction data containing species-compartment combinations
|
673
|
+
species_df : pd.DataFrame
|
674
|
+
Processed species data with IDs
|
675
|
+
compartments_df : pd.DataFrame
|
676
|
+
Processed compartments data with IDs
|
677
|
+
interaction_source : source.Source
|
678
|
+
Source object to assign to compartmentalized species
|
679
|
+
|
680
|
+
Returns
|
681
|
+
-------
|
682
|
+
pd.DataFrame
|
683
|
+
Compartmentalized species with formatted names and IDs
|
684
|
+
"""
|
685
|
+
# Get all distinct upstream and downstream compartmentalized species
|
686
|
+
comp_species = pd.concat(
|
687
|
+
[
|
688
|
+
interaction_edgelist[["upstream_name", "upstream_compartment"]].rename(
|
689
|
+
{
|
690
|
+
"upstream_name": SBML_DFS.S_NAME,
|
691
|
+
"upstream_compartment": SBML_DFS.C_NAME,
|
692
|
+
},
|
693
|
+
axis=1,
|
694
|
+
),
|
695
|
+
interaction_edgelist[["downstream_name", "downstream_compartment"]].rename(
|
696
|
+
{
|
697
|
+
"downstream_name": SBML_DFS.S_NAME,
|
698
|
+
"downstream_compartment": SBML_DFS.C_NAME,
|
699
|
+
},
|
700
|
+
axis=1,
|
701
|
+
),
|
702
|
+
]
|
703
|
+
).drop_duplicates()
|
704
|
+
|
705
|
+
# Add species and compartment IDs
|
706
|
+
comp_species_w_ids = comp_species.merge(
|
707
|
+
species_df[SBML_DFS.S_NAME].reset_index(), how="left", on=SBML_DFS.S_NAME
|
708
|
+
).merge(
|
709
|
+
compartments_df[SBML_DFS.C_NAME].reset_index(), how="left", on=SBML_DFS.C_NAME
|
710
|
+
)
|
711
|
+
|
712
|
+
# Validate merge was successful
|
713
|
+
_sbml_dfs_from_edgelist_check_cspecies_merge(comp_species_w_ids, comp_species)
|
714
|
+
|
715
|
+
# Format compartmentalized species with names, source, and IDs
|
716
|
+
comp_species_w_ids[SBML_DFS.SC_NAME] = [
|
717
|
+
f"{s} [{c}]"
|
718
|
+
for s, c in zip(
|
719
|
+
comp_species_w_ids[SBML_DFS.S_NAME], comp_species_w_ids[SBML_DFS.C_NAME]
|
720
|
+
)
|
721
|
+
]
|
722
|
+
comp_species_w_ids[SBML_DFS.SC_SOURCE] = interaction_source
|
723
|
+
comp_species_w_ids[SBML_DFS.SC_ID] = id_formatter(
|
724
|
+
range(comp_species_w_ids.shape[0]), SBML_DFS.SC_ID
|
725
|
+
)
|
726
|
+
|
727
|
+
return comp_species_w_ids.set_index(SBML_DFS.SC_ID)[
|
728
|
+
[SBML_DFS.SC_NAME, SBML_DFS.S_ID, SBML_DFS.C_ID, SBML_DFS.SC_SOURCE]
|
729
|
+
]
|
730
|
+
|
731
|
+
|
732
|
+
def _edgelist_create_reactions_and_species(
    interaction_edgelist,
    comp_species,
    species_df,
    compartments_df,
    interaction_source,
    upstream_stoichiometry,
    downstream_stoichiometry,
    downstream_sbo_name,
    extra_reactions_columns,
):
    """
    Create reactions and reaction species from interactions.

    Each edgelist row becomes one reaction; the upstream and downstream
    members of the row become its two reaction species entries.

    Parameters
    ----------
    interaction_edgelist : pd.DataFrame
        Original interaction data
    comp_species : pd.DataFrame
        Compartmentalized species with IDs
    species_df : pd.DataFrame
        Processed species data with IDs
    compartments_df : pd.DataFrame
        Processed compartments data with IDs
    interaction_source : source.Source
        Source object for reactions
    upstream_stoichiometry : int
        Stoichiometry for upstream species
    downstream_stoichiometry : int
        Stoichiometry for downstream species
    downstream_sbo_name : str
        SBO term name for downstream species; translated via MINI_SBO_FROM_NAME
    extra_reactions_columns : list
        Names of extra columns to preserve

    Returns
    -------
    tuple
        (reactions_df, reaction_species_df, reactions_data) where
        reactions_df is indexed by r_id, reaction_species_df by rsc_id, and
        reactions_data holds the extra columns (also indexed by r_id)

    Raises
    ------
    ValueError
        If merging compartmentalized species changes the edgelist row count
        (indicating non-unique name/compartment combinations)
    """
    # Add compartmentalized species IDs to interactions.
    # comp_species carries s_id/c_id; the two merges attach the human-readable
    # s_name and c_name so the edgelist can be joined by name below.
    comp_species_w_names = (
        comp_species.reset_index()
        .merge(species_df[SBML_DFS.S_NAME].reset_index())
        .merge(compartments_df[SBML_DFS.C_NAME].reset_index())
    )

    # Join sc_ids onto the edgelist twice: once for the upstream member and
    # once for the downstream member. Each merge matches on the shared
    # (name, compartment) columns produced by the renames.
    interaction_w_cspecies = interaction_edgelist.merge(
        comp_species_w_names[[SBML_DFS.SC_ID, SBML_DFS.S_NAME, SBML_DFS.C_NAME]].rename(
            {
                SBML_DFS.SC_ID: "sc_id_up",
                SBML_DFS.S_NAME: "upstream_name",
                SBML_DFS.C_NAME: "upstream_compartment",
            },
            axis=1,
        ),
        how="left",
    ).merge(
        comp_species_w_names[[SBML_DFS.SC_ID, SBML_DFS.S_NAME, SBML_DFS.C_NAME]].rename(
            {
                SBML_DFS.SC_ID: "sc_id_down",
                SBML_DFS.S_NAME: "downstream_name",
                SBML_DFS.C_NAME: "downstream_compartment",
            },
            axis=1,
        ),
        how="left",
    )[
        REQUIRED_REACTION_FROMEDGELIST_COLUMNS + extra_reactions_columns
    ]

    # Validate merge didn't create duplicates (a 1-to-many match would
    # silently fan out rows and corrupt the downstream id assignment)
    if interaction_edgelist.shape[0] != interaction_w_cspecies.shape[0]:
        raise ValueError(
            f"Merging compartmentalized species resulted in row count change "
            f"from {interaction_edgelist.shape[0]} to {interaction_w_cspecies.shape[0]}"
        )

    # Create reaction IDs FIRST - before using them
    # (reaction_species rows below reference r_id)
    interaction_w_cspecies[SBML_DFS.R_ID] = id_formatter(
        range(interaction_w_cspecies.shape[0]), SBML_DFS.R_ID
    )

    # Create reactions DataFrame
    interactions_copy = interaction_w_cspecies.copy()
    interactions_copy[SBML_DFS.R_SOURCE] = interaction_source

    reactions_columns = [
        SBML_DFS.R_NAME,
        SBML_DFS.R_IDENTIFIERS,
        SBML_DFS.R_SOURCE,
        SBML_DFS.R_ISREVERSIBLE,
    ]

    reactions_df = interactions_copy.set_index(SBML_DFS.R_ID)[
        reactions_columns + extra_reactions_columns
    ]

    # Separate extra data (kept apart from the schema-conformant table)
    reactions_data = reactions_df[extra_reactions_columns]
    reactions_df = reactions_df[reactions_columns]

    # Create reaction species relationships - NOW r_id exists
    reaction_species_df = pd.concat(
        [
            # Upstream species (modifiers/stimulators/inhibitors);
            # sbo_term comes straight from the edgelist row
            interaction_w_cspecies[["sc_id_up", "sbo_term", SBML_DFS.R_ID]]
            .assign(stoichiometry=upstream_stoichiometry)
            .rename({"sc_id_up": "sc_id"}, axis=1),
            # Downstream species (products); sbo_term is fixed by
            # downstream_sbo_name for every row
            interaction_w_cspecies[["sc_id_down", SBML_DFS.R_ID]]
            .assign(
                stoichiometry=downstream_stoichiometry,
                sbo_term=MINI_SBO_FROM_NAME[downstream_sbo_name],
            )
            .rename({"sc_id_down": "sc_id"}, axis=1),
        ]
    )

    reaction_species_df["rsc_id"] = id_formatter(
        range(reaction_species_df.shape[0]), "rsc_id"
    )

    reaction_species_df = reaction_species_df.set_index("rsc_id")

    return reactions_df, reaction_species_df, reactions_data
|
858
|
+
|
859
|
+
|
860
|
+
def _edgelist_identify_extra_columns(
    interaction_edgelist, species_df, keep_reactions_data, keep_species_data
):
    """
    Determine which non-required input columns should be carried along.

    Parameters
    ----------
    interaction_edgelist : pd.DataFrame
        Interaction data whose surplus columns may become reactions data
    species_df : pd.DataFrame
        Species data whose surplus columns may become species data
    keep_reactions_data : bool or str
        If not False, preserve extra reaction columns
    keep_species_data : bool or str
        If not False, preserve extra species columns

    Returns
    -------
    dict
        {'reactions': [...], 'species': [...]} listing the extra column names
    """
    reactions_extras: list = []
    species_extras: list = []

    # anything beyond the expected edgelist variables is "extra"
    if keep_reactions_data is not False:
        reactions_extras = [
            col
            for col in interaction_edgelist.columns
            if col not in INTERACTION_EDGELIST_EXPECTED_VARS
        ]

    # anything beyond the core species variables is "extra"
    if keep_species_data is not False:
        core_species_vars = {SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS}
        species_extras = [
            col for col in species_df.columns if col not in core_species_vars
        ]

    return {"reactions": reactions_extras, "species": species_extras}
|
900
|
+
|
901
|
+
|
902
|
+
def _edgelist_process_compartments(compartments_df, interaction_source):
    """
    Format compartments with a source annotation and generated c_ids.

    Parameters
    ----------
    compartments_df : pd.DataFrame
        Raw compartments data
    interaction_source : source.Source
        Source object assigned to every compartment

    Returns
    -------
    pd.DataFrame
        Compartments table with c_name, c_Identifiers and c_Source columns,
        indexed by the newly generated c_id
    """
    formatted = compartments_df.copy()
    formatted[SBML_DFS.C_SOURCE] = interaction_source
    # sequential ids in input order
    formatted[SBML_DFS.C_ID] = id_formatter(range(len(formatted)), SBML_DFS.C_ID)

    kept_columns = [SBML_DFS.C_NAME, SBML_DFS.C_IDENTIFIERS, SBML_DFS.C_SOURCE]
    return formatted.set_index(SBML_DFS.C_ID)[kept_columns]
|
926
|
+
|
927
|
+
|
928
|
+
def _edgelist_process_species(species_df, interaction_source, extra_species_columns):
    """
    Format species with a source annotation and generated s_ids, splitting
    off any extra columns.

    Parameters
    ----------
    species_df : pd.DataFrame
        Raw species data
    interaction_source : source.Source
        Source object assigned to every species
    extra_species_columns : list
        Extra column names to return as a separate table

    Returns
    -------
    tuple of pd.DataFrame
        (processed species with the core columns, extra species data);
        both are indexed by the newly generated s_id
    """
    annotated = species_df.copy()
    annotated[SBML_DFS.S_SOURCE] = interaction_source
    # sequential ids in input order
    annotated[SBML_DFS.S_ID] = id_formatter(range(len(annotated)), SBML_DFS.S_ID)

    core_columns = [SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS, SBML_DFS.S_SOURCE]
    indexed = annotated.set_index(SBML_DFS.S_ID)[core_columns + extra_species_columns]

    # main table keeps only the schema columns; extras travel separately
    return indexed[core_columns], indexed[extra_species_columns]
|
960
|
+
|
961
|
+
|
962
|
+
def _edgelist_validate_inputs(
    interaction_edgelist: pd.DataFrame,
    species_df: pd.DataFrame,
    compartments_df: pd.DataFrame,
) -> None:
    """
    Validate that the edgelist inputs carry their required columns.

    Parameters
    ----------
    interaction_edgelist : pd.DataFrame
        Interaction data to validate
    species_df : pd.DataFrame
        Species data to validate
    compartments_df : pd.DataFrame
        Compartments data to validate

    Raises
    ------
    ValueError
        If any table is missing one of its required variables
    """
    # each table is checked against its own required-variable set;
    # order (compartments, species, interactions) determines which
    # error surfaces first when several tables are malformed
    table_requirements = [
        (
            compartments_df,
            "compartments_df",
            {SBML_DFS.C_NAME, SBML_DFS.C_IDENTIFIERS},
        ),
        (
            species_df,
            "species_df",
            {SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS},
        ),
        (
            interaction_edgelist,
            "interaction_edgelist",
            INTERACTION_EDGELIST_EXPECTED_VARS,
        ),
    ]

    for table, table_label, required_vars in table_requirements:
        missing = required_vars.difference(set(table.columns.tolist()))
        if missing:
            raise ValueError(
                f"{', '.join(missing)} are required variables in "
                f'"{table_label}" but were not present in the input file.'
            )

    return None
|
1016
|
+
|
1017
|
+
|
1018
|
+
def _filter_promiscuous_components(
    bqb_has_parts_species: pd.DataFrame, max_promiscuity: int
) -> pd.DataFrame:
    """Drop BQB_HAS_PART component identifiers used by more than
    max_promiscuity species; returns the input unchanged if none qualify."""

    # how many complexes each (ontology, identifier) pair participates in
    membership_counts = bqb_has_parts_species.value_counts(
        [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
    )
    promiscuous_index = membership_counts[membership_counts > max_promiscuity].index

    # nothing promiscuous -> nothing to filter
    if len(promiscuous_index) == 0:
        return bqb_has_parts_species

    promiscuity_flags = pd.Series(
        data=[True] * len(promiscuous_index),
        index=promiscuous_index,
        name="is_shared_component",
        dtype=bool,
    )

    # left-join the flags; unmatched rows get NA which is coerced to False
    flagged = bqb_has_parts_species.merge(
        promiscuity_flags,
        left_on=[IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER],
        right_index=True,
        how="left",
    )
    flagged["is_shared_component"] = (
        flagged["is_shared_component"].astype("boolean").fillna(False)
    )

    # keep only rows whose component identifier is not over-shared
    return flagged.loc[~flagged["is_shared_component"]].drop(
        columns="is_shared_component"
    )
|
1055
|
+
|
1056
|
+
|
1057
|
+
def _find_underspecified_reactions(
    reaction_species_w_roles: pd.DataFrame,
) -> set:
    """
    Find reactions that would become underspecified after filtering.

    A reaction is underspecified if it has lost at least one DEFINING member,
    or if every member of one of its REQUIRED sbo_term groups would be removed.

    Parameters
    ----------
    reaction_species_w_roles : pd.DataFrame
        Reaction species annotated with an `sbo_role` column (added by
        add_sbo_role()) and a boolean `new` column marking the cspecies that
        would be preserved in each reaction should the reaction be kept.

    Returns
    -------
    set
        r_ids of the underspecified reactions.
        (Fixes the previous annotation of pd.DataFrame: the function has
        always returned a set of reaction ids.)

    Raises
    ------
    ValueError
        If the `sbo_role` or `new` column is missing, or `new` is not boolean.
    """

    # check that both sbo_role and "new" are present
    if SBO_ROLES_DEFS.SBO_ROLE not in reaction_species_w_roles.columns:
        raise ValueError(
            "The sbo_role column is not present in the reaction_species_w_roles table. Please call add_sbo_role() first."
        )
    if "new" not in reaction_species_w_roles.columns:
        raise ValueError(
            "The new column is not present in the reaction_species_w_roles table. This should indicate what cspecies would be preserved in the reaction should it be preserved."
        )
    # check that new is a boolean column
    if reaction_species_w_roles["new"].dtype != bool:
        raise ValueError(
            "The new column is not a boolean column. Please ensure that the new column is a boolean column. This should indicate what cspecies would be preserved in the reaction should it be preserved."
        )

    # any reaction losing even one DEFINING species is dropped outright
    reactions_with_lost_defining_members = set(
        reaction_species_w_roles.query("~new")
        .query("sbo_role == 'DEFINING'")[SBML_DFS.R_ID]
        .tolist()
    )

    N_reactions_with_lost_defining_members = len(reactions_with_lost_defining_members)
    if N_reactions_with_lost_defining_members > 0:
        logger.info(
            f"Removing {N_reactions_with_lost_defining_members} reactions which have lost at least one defining species"
        )

    # find the cases where all "new" values for a given (r_id, sbo_term) are False
    reactions_with_lost_requirements = set(
        reaction_species_w_roles
        # drop already filtered reactions
        .query("r_id not in @reactions_with_lost_defining_members")
        .query("sbo_role == 'REQUIRED'")
        # which entries which have some required attribute have all False values for that attribute
        .groupby([SBML_DFS.R_ID, SBML_DFS.SBO_TERM])
        .agg({"new": "any"})
        .query("new == False")
        .index.get_level_values(SBML_DFS.R_ID)
    )

    N_reactions_with_lost_requirements = len(reactions_with_lost_requirements)
    if N_reactions_with_lost_requirements > 0:
        logger.info(
            f"Removing {N_reactions_with_lost_requirements} reactions which have lost all required members"
        )

    underspecified_reactions = reactions_with_lost_defining_members.union(
        reactions_with_lost_requirements
    )

    return underspecified_reactions
|
1112
|
+
|
1113
|
+
|
1114
|
+
def _id_dict_to_df(ids):
|
330
1115
|
if len(ids) == 0:
|
331
1116
|
return pd.DataFrame(
|
332
1117
|
{
|
@@ -338,3 +1123,157 @@ def _stub_ids(ids):
|
|
338
1123
|
)
|
339
1124
|
else:
|
340
1125
|
return pd.DataFrame(ids)
|
1126
|
+
|
1127
|
+
|
1128
|
+
def _perform_sbml_dfs_table_validation(
|
1129
|
+
table_data: pd.DataFrame,
|
1130
|
+
table_schema: dict,
|
1131
|
+
table_name: str,
|
1132
|
+
) -> None:
|
1133
|
+
"""
|
1134
|
+
Core validation logic for SBML_dfs tables.
|
1135
|
+
|
1136
|
+
This function performs the actual validation checks for any table against its schema,
|
1137
|
+
regardless of whether it's part of an SBML_dfs object or standalone.
|
1138
|
+
|
1139
|
+
Parameters
|
1140
|
+
----------
|
1141
|
+
table_data : pd.DataFrame
|
1142
|
+
The table data to validate
|
1143
|
+
table_schema : dict
|
1144
|
+
Schema definition for the table
|
1145
|
+
table_name : str
|
1146
|
+
Name of the table (for error messages)
|
1147
|
+
|
1148
|
+
Raises
|
1149
|
+
------
|
1150
|
+
ValueError
|
1151
|
+
If the table does not conform to its schema:
|
1152
|
+
- Not a DataFrame
|
1153
|
+
- Wrong index name
|
1154
|
+
- Duplicate primary keys
|
1155
|
+
- Missing required variables
|
1156
|
+
- Empty table
|
1157
|
+
"""
|
1158
|
+
if not isinstance(table_data, pd.DataFrame):
|
1159
|
+
raise ValueError(
|
1160
|
+
f"{table_name} must be a pd.DataFrame, but was a {type(table_data)}"
|
1161
|
+
)
|
1162
|
+
|
1163
|
+
# check index
|
1164
|
+
expected_index_name = table_schema["pk"]
|
1165
|
+
if table_data.index.name != expected_index_name:
|
1166
|
+
raise ValueError(
|
1167
|
+
f"the index name for {table_name} was not the pk: {expected_index_name}"
|
1168
|
+
)
|
1169
|
+
|
1170
|
+
# check that all entries in the index are unique
|
1171
|
+
if len(set(table_data.index.tolist())) != table_data.shape[0]:
|
1172
|
+
duplicated_pks = table_data.index.value_counts()
|
1173
|
+
duplicated_pks = duplicated_pks[duplicated_pks > 1]
|
1174
|
+
|
1175
|
+
example_duplicates = duplicated_pks.index[0 : min(duplicated_pks.shape[0], 5)]
|
1176
|
+
raise ValueError(
|
1177
|
+
f"{duplicated_pks.shape[0]} primary keys were duplicated "
|
1178
|
+
f"including {', '.join(example_duplicates)}"
|
1179
|
+
)
|
1180
|
+
|
1181
|
+
# check variables
|
1182
|
+
expected_vars = set(table_schema["vars"])
|
1183
|
+
table_vars = set(list(table_data.columns))
|
1184
|
+
|
1185
|
+
extra_vars = table_vars.difference(expected_vars)
|
1186
|
+
if len(extra_vars) != 0:
|
1187
|
+
logger.debug(
|
1188
|
+
f"{len(extra_vars)} extra variables were found for {table_name}: "
|
1189
|
+
f"{', '.join(extra_vars)}"
|
1190
|
+
)
|
1191
|
+
|
1192
|
+
missing_vars = expected_vars.difference(table_vars)
|
1193
|
+
if len(missing_vars) != 0:
|
1194
|
+
raise ValueError(
|
1195
|
+
f"Missing {len(missing_vars)} required variables for {table_name}: "
|
1196
|
+
f"{', '.join(missing_vars)}"
|
1197
|
+
)
|
1198
|
+
|
1199
|
+
# check for empty table
|
1200
|
+
if table_data.shape[0] == 0:
|
1201
|
+
raise ValueError(f"{table_name} contained no entries")
|
1202
|
+
|
1203
|
+
|
1204
|
+
def _sbml_dfs_from_edgelist_check_cspecies_merge(
    merged_species: pd.DataFrame, original_species: pd.DataFrame
) -> None:
    """Check for a mismatch between the provided species data and the species
    implied by the edgelist, after merging by name."""

    n_before = original_species.shape[0]
    n_after = merged_species.shape[0]

    # a row-count change means names matched more than one row (1-to-many join)
    if n_after != n_before:
        raise ValueError(
            "Merging compartmentalized species to species_df"
            " and compartments_df by names resulted in an "
            f"increase in the tables from {n_before}"
            f" to {n_after} indicating that names were"
            " not unique"
        )

    # compartments named in the edgelist but absent from compartments_df
    unmatched_compartments = merged_species.loc[
        merged_species[SBML_DFS.C_ID].isna(), SBML_DFS.C_NAME
    ].unique()
    if len(unmatched_compartments) >= 1:
        raise ValueError(
            f"{len(unmatched_compartments)} compartments were present in"
            ' "interaction_edgelist" but not "compartments_df":'
            f" {', '.join(unmatched_compartments)}"
        )

    # species named in the edgelist but absent from species_df
    unmatched_species = merged_species.loc[
        merged_species[SBML_DFS.S_ID].isna(), SBML_DFS.S_NAME
    ].unique()
    if len(unmatched_species) >= 1:
        raise ValueError(
            f"{len(unmatched_species)} species were present in "
            '"interaction_edgelist" but not "species_df":'
            f" {', '.join(unmatched_species)}"
        )

    return None
|
1241
|
+
|
1242
|
+
|
1243
|
+
def _validate_matching_data(data_table: pd.DataFrame, ref_table: pd.DataFrame):
|
1244
|
+
"""Validates a table against a reference
|
1245
|
+
|
1246
|
+
This check if the table has the same index, no duplicates in the index
|
1247
|
+
and that all values in the index are in the reference table.
|
1248
|
+
|
1249
|
+
Args:
|
1250
|
+
data_table (pd.DataFrame): a table with data that should
|
1251
|
+
match the reference
|
1252
|
+
ref_table (pd.DataFrame): a reference table
|
1253
|
+
|
1254
|
+
Raises:
|
1255
|
+
ValueError: not same index name
|
1256
|
+
ValueError: index contains duplicates
|
1257
|
+
ValueError: index not subset of index of reactions table
|
1258
|
+
"""
|
1259
|
+
ref_index_name = ref_table.index.name
|
1260
|
+
if data_table.index.name != ref_index_name:
|
1261
|
+
raise ValueError(
|
1262
|
+
"the index name for reaction data table was not"
|
1263
|
+
f" {ref_index_name}: {data_table.index.name}"
|
1264
|
+
)
|
1265
|
+
ids = data_table.index
|
1266
|
+
if any(ids.duplicated()):
|
1267
|
+
raise ValueError(
|
1268
|
+
"the index for reaction data table " "contained duplicate values"
|
1269
|
+
)
|
1270
|
+
if not all(ids.isin(ref_table.index)):
|
1271
|
+
raise ValueError(
|
1272
|
+
"the index for reaction data table contained values"
|
1273
|
+
" not found in the reactions table"
|
1274
|
+
)
|
1275
|
+
if not isinstance(data_table, pd.DataFrame):
|
1276
|
+
raise TypeError(
|
1277
|
+
f"The data table was type {type(data_table).__name__}"
|
1278
|
+
" but must be a pd.DataFrame"
|
1279
|
+
)
|