napistu 0.1.0__py3-none-any.whl

Files changed (77)
  1. napistu/__init__.py +12 -0
  2. napistu/__main__.py +867 -0
  3. napistu/consensus.py +1557 -0
  4. napistu/constants.py +500 -0
  5. napistu/gcs/__init__.py +10 -0
  6. napistu/gcs/constants.py +69 -0
  7. napistu/gcs/downloads.py +180 -0
  8. napistu/identifiers.py +805 -0
  9. napistu/indices.py +227 -0
  10. napistu/ingestion/__init__.py +10 -0
  11. napistu/ingestion/bigg.py +146 -0
  12. napistu/ingestion/constants.py +296 -0
  13. napistu/ingestion/cpr_edgelist.py +106 -0
  14. napistu/ingestion/identifiers_etl.py +148 -0
  15. napistu/ingestion/obo.py +268 -0
  16. napistu/ingestion/psi_mi.py +276 -0
  17. napistu/ingestion/reactome.py +218 -0
  18. napistu/ingestion/sbml.py +621 -0
  19. napistu/ingestion/string.py +356 -0
  20. napistu/ingestion/trrust.py +285 -0
  21. napistu/ingestion/yeast.py +147 -0
  22. napistu/mechanism_matching.py +597 -0
  23. napistu/modify/__init__.py +10 -0
  24. napistu/modify/constants.py +86 -0
  25. napistu/modify/curation.py +628 -0
  26. napistu/modify/gaps.py +635 -0
  27. napistu/modify/pathwayannot.py +1381 -0
  28. napistu/modify/uncompartmentalize.py +264 -0
  29. napistu/network/__init__.py +10 -0
  30. napistu/network/constants.py +117 -0
  31. napistu/network/neighborhoods.py +1594 -0
  32. napistu/network/net_create.py +1647 -0
  33. napistu/network/net_utils.py +652 -0
  34. napistu/network/paths.py +500 -0
  35. napistu/network/precompute.py +221 -0
  36. napistu/rpy2/__init__.py +127 -0
  37. napistu/rpy2/callr.py +168 -0
  38. napistu/rpy2/constants.py +101 -0
  39. napistu/rpy2/netcontextr.py +464 -0
  40. napistu/rpy2/rids.py +697 -0
  41. napistu/sbml_dfs_core.py +2216 -0
  42. napistu/sbml_dfs_utils.py +304 -0
  43. napistu/source.py +394 -0
  44. napistu/utils.py +943 -0
  45. napistu-0.1.0.dist-info/METADATA +56 -0
  46. napistu-0.1.0.dist-info/RECORD +77 -0
  47. napistu-0.1.0.dist-info/WHEEL +5 -0
  48. napistu-0.1.0.dist-info/entry_points.txt +2 -0
  49. napistu-0.1.0.dist-info/licenses/LICENSE +21 -0
  50. napistu-0.1.0.dist-info/top_level.txt +2 -0
  51. tests/__init__.py +0 -0
  52. tests/conftest.py +83 -0
  53. tests/test_consensus.py +255 -0
  54. tests/test_constants.py +20 -0
  55. tests/test_curation.py +134 -0
  56. tests/test_data/__init__.py +0 -0
  57. tests/test_edgelist.py +20 -0
  58. tests/test_gcs.py +23 -0
  59. tests/test_identifiers.py +151 -0
  60. tests/test_igraph.py +353 -0
  61. tests/test_indices.py +88 -0
  62. tests/test_mechanism_matching.py +126 -0
  63. tests/test_net_utils.py +66 -0
  64. tests/test_netcontextr.py +105 -0
  65. tests/test_obo.py +34 -0
  66. tests/test_pathwayannot.py +95 -0
  67. tests/test_precomputed_distances.py +222 -0
  68. tests/test_rpy2.py +61 -0
  69. tests/test_sbml.py +46 -0
  70. tests/test_sbml_dfs_create.py +307 -0
  71. tests/test_sbml_dfs_utils.py +22 -0
  72. tests/test_sbo.py +11 -0
  73. tests/test_set_coverage.py +50 -0
  74. tests/test_source.py +67 -0
  75. tests/test_uncompartmentalize.py +40 -0
  76. tests/test_utils.py +487 -0
  77. tests/utils.py +30 -0
@@ -0,0 +1,2216 @@
+ from __future__ import annotations
+
+ import logging
+ import re
+ from typing import Any
+ from typing import Iterable
+ from typing import Mapping
+ from typing import MutableMapping
+ from typing import TYPE_CHECKING
+
+ import numpy as np
+ import pandas as pd
+ from fs import open_fs
+
+ from napistu import identifiers
+ from napistu import sbml_dfs_utils
+ from napistu import source
+ from napistu import utils
+ from napistu.constants import SBML_DFS
+ from napistu.constants import SBML_DFS_SCHEMA
+ from napistu.constants import IDENTIFIERS
+ from napistu.constants import REQUIRED_REACTION_FROMEDGELIST_COLUMNS
+ from napistu.constants import CPR_STANDARD_OUTPUTS
+ from napistu.constants import INTERACTION_EDGELIST_EXPECTED_VARS
+ from napistu.constants import BQB_PRIORITIES
+ from napistu.constants import ONTOLOGY_PRIORITIES
+ from napistu.constants import BQB
+ from napistu.constants import BQB_DEFINING_ATTRS
+ from napistu.constants import COMPARTMENTS
+ from napistu.constants import COMPARTMENT_ALIASES
+ from napistu.constants import COMPARTMENTS_GO_TERMS
+ from napistu.constants import MINI_SBO_FROM_NAME
+ from napistu.constants import MINI_SBO_TO_NAME
+ from napistu.constants import ONTOLOGIES
+ from napistu.constants import SBO_NAME_TO_ROLE
+ from napistu.constants import SBOTERM_NAMES
+ from napistu.constants import CHARACTERISTIC_COMPLEX_ONTOLOGIES
+ from napistu.ingestion import sbml
+
+ logger = logging.getLogger(__name__)
+
+
+ class SBML_dfs:
+     """
+     Systems Biology Markup Language Model Data Frames.
+
+     Attributes
+     ----------
+     compartments: pd.DataFrame
+         sub-cellular compartments in the model
+     species: pd.DataFrame
+         molecular species in the model
+     species_data: dict[str, pd.DataFrame]
+         additional data for species; DataFrames with index = species_id
+     reactions: pd.DataFrame
+         reactions in the model
+     reactions_data: dict[str, pd.DataFrame]
+         additional data for reactions; DataFrames with index = reaction_id
+     reaction_species: pd.DataFrame
+         one entry per species participating in a reaction
+     schema: dict
+         dictionary representing the structure of the other attributes and the meaning of their variables
+
+     Methods
+     -------
+     get_table(entity_type, required_attributes)
+         Get a table from the SBML_dfs object and optionally validate that it contains a set of required attributes
+     search_by_ids(ids, entity_type, identifiers_df, ontologies)
+         Pull out identifiers and entities matching a set of query ids which optionally match a set of ontologies
+     search_by_name(name, entity_type, partial_match)
+         Pull out a set of entities by name or partial string match [default]
+     get_cspecies_features()
+         Return additional attributes of compartmentalized species
+     get_species_features()
+         Return additional attributes of species
+     get_identifiers(id_type)
+         Return a DataFrame containing identifiers from the id_type table
+     get_uri_urls(entity_type, entity_ids = None)
+         Return a Series containing reference urls for each entity
+     validate()
+         Validate that the sbml_dfs follows the schema and identify clear pathologies
+     validate_and_resolve()
+         Validate the sbml_dfs and attempt to automatically resolve common issues
+     """
+
+     compartments: pd.DataFrame
+     species: pd.DataFrame
+     species_data: dict[str, pd.DataFrame]
+     reactions: pd.DataFrame
+     reactions_data: dict[str, pd.DataFrame]
+     reaction_species: pd.DataFrame
+     schema: dict
+     _required_entities: set[str]
+     _optional_entities: set[str]
+
+     def __init__(
+         self,
+         sbml_model: (
+             sbml.SBML | MutableMapping[str, pd.DataFrame | dict[str, pd.DataFrame]]
+         ),
+         validate: bool = True,
+         resolve: bool = True,
+     ) -> None:
+         """
+         Creates a pathway model from an SBML model or a dict of schema-conforming tables.
+
+         Parameters
+         ----------
+         sbml_model : sbml.SBML or a dict containing tables following the sbml_dfs schema
+             An SBML model produced by sbml.SBML().
+         validate : bool
+             if True then call self.validate() to identify formatting issues
+         resolve : bool
+             if True then try to automatically resolve common problems
+
+         Returns
+         -------
+         None.
+         """
+
+         self.schema = SBML_DFS_SCHEMA.SCHEMA
+         self._required_entities = SBML_DFS_SCHEMA.REQUIRED_ENTITIES
+         self._optional_entities = SBML_DFS_SCHEMA.OPTIONAL_ENTITIES
+
+         # Initialize the dynamic attributes for type checking
+         if TYPE_CHECKING:
+             self.compartments = pd.DataFrame()
+             self.species = pd.DataFrame()
+             self.compartmentalized_species = pd.DataFrame()
+             self.reactions = pd.DataFrame()
+             self.reaction_species = pd.DataFrame()
+
+         # create a model from dictionary entries
+         if isinstance(sbml_model, dict):
+             for ent in SBML_DFS_SCHEMA.REQUIRED_ENTITIES:
+                 setattr(self, ent, sbml_model[ent])
+             for ent in SBML_DFS_SCHEMA.OPTIONAL_ENTITIES:
+                 if ent in sbml_model:
+                     setattr(self, ent, sbml_model[ent])
+         else:
+             self = sbml.sbml_df_from_sbml(self, sbml_model)
+
+         for ent in SBML_DFS_SCHEMA.OPTIONAL_ENTITIES:
+             # Initialize optional entities if not set
+             if not hasattr(self, ent):
+                 setattr(self, ent, {})
+
+         if validate:
+             if resolve:
+                 self.validate_and_resolve()
+             else:
+                 self.validate()
+         else:
+             if resolve:
+                 logger.warning(
+                     '"validate" = False so "resolve" will be ignored (even though it was True)'
+                 )
+
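+     # Illustrative usage (a sketch, not part of the package source): the two
+     # supported inputs. This assumes sbml.SBML accepts a path to an .sbml file,
+     # per the docstring above; the dict route needs tables that already follow
+     # SBML_DFS_SCHEMA.
+     #
+     #     from napistu.ingestion import sbml
+     #     from napistu.sbml_dfs_core import SBML_dfs
+     #
+     #     sbml_dfs = SBML_dfs(sbml.SBML("model.sbml"))  # validates and resolves
+     #     raw_dfs = SBML_dfs(sbml.SBML("model.sbml"), validate=False)
+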
+     def get_table(
+         self, entity_type: str, required_attributes: None | set[str] = None
+     ) -> pd.DataFrame:
+         """
+         Get Table
+
+         Get a table from the SBML_dfs object and optionally validate that it contains a set of required attributes.
+         """
+
+         schema = self.schema
+
+         if entity_type not in schema.keys():
+             raise ValueError(
+                 f"{entity_type} does not match a table in the SBML_dfs object. The tables "
+                 f"which are present are {', '.join(schema.keys())}"
+             )
+
+         if required_attributes is not None:
+             assert isinstance(required_attributes, set)
+
+             # determine whether required_attributes are appropriate
+             VALID_REQUIRED_ATTRIBUTES = {"id", "source", "label"}
+             invalid_required_attributes = required_attributes.difference(
+                 VALID_REQUIRED_ATTRIBUTES
+             )
+
+             if len(invalid_required_attributes) > 0:
+                 raise ValueError(
+                     f"The following required attributes are not valid: {', '.join(invalid_required_attributes)}. "
+                     f"Required attributes must be a subset of {', '.join(VALID_REQUIRED_ATTRIBUTES)}"
+                 )
+
+             # determine whether required_attributes are satisfied
+             invalid_attrs = [
+                 s for s in required_attributes if s not in schema[entity_type].keys()
+             ]
+             if len(invalid_attrs) > 0:
+                 raise ValueError(
+                     f"The following required attributes are not present for the {entity_type} table: "
+                     f"{', '.join(invalid_attrs)}."
+                 )
+
+         return getattr(self, entity_type)
+
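+     # Example (illustrative sketch; assumes a populated model): fetch the species
+     # table and require that it carries an "id" attribute per the schema.
+     #
+     #     species_df = sbml_dfs.get_table(SBML_DFS.SPECIES, required_attributes={"id"})
+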
+     def search_by_ids(
+         self,
+         ids: list[str],
+         entity_type: str,
+         identifiers_df: pd.DataFrame,
+         ontologies: None | set[str] = None,
+     ) -> tuple[pd.DataFrame, pd.DataFrame]:
+         # validate inputs
+         entity_table = self.get_table(entity_type, required_attributes={"id"})
+         entity_pk = self.schema[entity_type]["pk"]
+
+         utils.match_pd_vars(
+             identifiers_df,
+             req_vars={
+                 entity_pk,
+                 IDENTIFIERS.ONTOLOGY,
+                 IDENTIFIERS.IDENTIFIER,
+                 IDENTIFIERS.URL,
+                 IDENTIFIERS.BQB,
+             },
+             allow_series=False,
+         ).assert_present()
+
+         if ontologies is not None:
+             assert isinstance(ontologies, set)
+             ALL_VALID_ONTOLOGIES = identifiers_df["ontology"].unique()
+             invalid_ontologies = ontologies.difference(ALL_VALID_ONTOLOGIES)
+             if len(invalid_ontologies) > 0:
+                 raise ValueError(
+                     f"The following ontologies are not valid: {', '.join(invalid_ontologies)}.\n"
+                     f"Valid ontologies are {', '.join(ALL_VALID_ONTOLOGIES)}"
+                 )
+
+             # filter to just the identifiers matching the ontologies of interest
+             identifiers_df = identifiers_df.query("ontology in @ontologies")
+
+         matching_identifiers = identifiers_df.loc[
+             identifiers_df["identifier"].isin(ids)
+         ]
+         entity_subset = entity_table.loc[matching_identifiers[entity_pk].tolist()]
+
+         return entity_subset, matching_identifiers
+
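+     # Example (illustrative sketch): look up entities by external identifiers.
+     # The id and ontology below are placeholders; identifiers_df comes from
+     # get_identifiers().
+     #
+     #     species_identifiers = sbml_dfs.get_identifiers(SBML_DFS.SPECIES)
+     #     entities, matches = sbml_dfs.search_by_ids(
+     #         ["P12345"], SBML_DFS.SPECIES, species_identifiers, ontologies={"uniprot"}
+     #     )
+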
+     def search_by_name(
+         self, name: str, entity_type: str, partial_match: bool = True
+     ) -> pd.DataFrame:
+         entity_table = self.get_table(entity_type, required_attributes={"label"})
+         label_attr = self.schema[entity_type]["label"]
+
+         if partial_match:
+             matches = entity_table.loc[
+                 entity_table[label_attr].str.contains(name, case=False)
+             ]
+         else:
+             # compare lowercased labels against the lowercased query so exact
+             # matching is case-insensitive as well
+             matches = entity_table.loc[
+                 entity_table[label_attr].str.lower() == name.lower()
+             ]
+         return matches
+
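+     # Example (illustrative sketch): substring search is case-insensitive by
+     # default, while partial_match=False compares lowercased labels exactly.
+     #
+     #     hits = sbml_dfs.search_by_name("glucose", SBML_DFS.SPECIES)
+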
+     def get_species_features(self) -> pd.DataFrame:
+         species = self.species
+         augmented_species = species.assign(
+             **{"species_type": lambda d: d["s_Identifiers"].apply(species_type_types)}
+         )
+
+         return augmented_species
+
+     def get_cspecies_features(self) -> pd.DataFrame:
+         cspecies_n_connections = (
+             self.reaction_species["sc_id"].value_counts().rename("sc_degree")
+         )
+
+         cspecies_n_children = (
+             self.reaction_species.loc[
+                 self.reaction_species[SBML_DFS.STOICHIOMETRY] <= 0, "sc_id"
+             ]
+             .value_counts()
+             .rename("sc_children")
+         )
+
+         cspecies_n_parents = (
+             self.reaction_species.loc[
+                 self.reaction_species[SBML_DFS.STOICHIOMETRY] > 0, "sc_id"
+             ]
+             .value_counts()
+             .rename("sc_parents")
+         )
+
+         species_features = self.get_species_features()["species_type"]
+
+         return (
+             self.compartmentalized_species.join(cspecies_n_connections)
+             .join(cspecies_n_children)
+             .join(cspecies_n_parents)
+             .fillna(0)
+             .astype(
+                 {"sc_degree": "int32", "sc_children": "int32", "sc_parents": "int32"}
+             )
+             .merge(species_features, left_on="s_id", right_index=True)
+             .drop(columns=["sc_name", "s_id", "c_id"])
+         )
+
+     def get_identifiers(self, id_type) -> pd.DataFrame:
+         selected_table = self.get_table(id_type, {"id"})
+         schema = self.schema
+
+         identifiers_dict = dict()
+         for sysid in selected_table.index:
+             id_entry = selected_table[schema[id_type]["id"]][sysid]
+
+             if isinstance(id_entry, identifiers.Identifiers):
+                 identifiers_dict[sysid] = pd.DataFrame(id_entry.ids)
+             elif np.isnan(id_entry):
+                 continue
+             else:
+                 raise ValueError(
+                     f"id_entry was a {type(id_entry)} and must either be"
+                     " an identifiers.Identifiers object or NaN"
+                 )
+         identifiers_tbl = pd.concat(identifiers_dict)
+
+         identifiers_tbl.index.names = [schema[id_type]["pk"], "entry"]
+         identifiers_tbl = identifiers_tbl.reset_index()
+
+         named_identifiers = identifiers_tbl.merge(
+             selected_table.drop(schema[id_type]["id"], axis=1),
+             left_on=schema[id_type]["pk"],
+             right_index=True,
+         )
+
+         return named_identifiers
+
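+     # Example (illustrative sketch): unnest species identifiers into a long table
+     # with one row per (species, identifier) pair.
+     #
+     #     species_identifiers = sbml_dfs.get_identifiers(SBML_DFS.SPECIES)
+     #     species_identifiers[["s_id", "ontology", "identifier"]].head()
+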
+     def get_uri_urls(
+         self,
+         entity_type: str,
+         entity_ids: Iterable[str] | None = None,
+         required_ontology: str | None = None,
+     ) -> pd.Series:
+         schema = self.schema
+
+         # valid entities and their identifier variables
+         valid_entity_types = [
+             SBML_DFS.COMPARTMENTS,
+             SBML_DFS.SPECIES,
+             SBML_DFS.REACTIONS,
+         ]
+
+         if entity_type not in valid_entity_types:
+             raise ValueError(
+                 f"{entity_type} is an invalid entity_type; valid types "
+                 f"are {', '.join(valid_entity_types)}"
+             )
+
+         entity_table = getattr(self, entity_type)
+
+         if entity_ids is not None:
+             # ensure that entity_ids are unique and then convert back to list
+             # to support pandas indexing
+             entity_ids = list(set(entity_ids))
+
+             # filter to a subset of identifiers if one is provided
+             entity_table = entity_table.loc[entity_ids]
+
+         # create a dataframe of all identifiers for the selected entities
+         # (.iloc avoids relying on deprecated integer-label fallback indexing)
+         all_ids = pd.concat(
+             [
+                 sbml_dfs_utils._stub_ids(
+                     entity_table[schema[entity_type]["id"]].iloc[i].ids
+                 ).assign(id=entity_table.index[i])
+                 for i in range(0, entity_table.shape[0])
+             ]
+         ).rename(columns={"id": schema[entity_type]["pk"]})
+
+         # set priorities for ontologies and bqb terms
+         if required_ontology is None:
+             all_ids = all_ids.merge(BQB_PRIORITIES, how="left").merge(
+                 ONTOLOGY_PRIORITIES, how="left"
+             )
+         else:
+             ontology_priorities = pd.DataFrame(
+                 [{IDENTIFIERS.ONTOLOGY: required_ontology, "ontology_rank": 1}]
+             )
+             # if only a single ontology is sought then just return matching entries
+             all_ids = all_ids.merge(BQB_PRIORITIES, how="left").merge(
+                 ontology_priorities, how="inner"
+             )
+
+         uri_urls = (
+             all_ids.sort_values(["bqb_rank", "ontology_rank", IDENTIFIERS.URL])
+             .groupby(schema[entity_type]["pk"])
+             .first()[IDENTIFIERS.URL]
+         )
+         return uri_urls
+
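+     # Example (illustrative sketch): pull one reference URL per species, preferring
+     # high-priority BQB terms and ontologies; required_ontology narrows the result
+     # to a single source ("reactome" below is a placeholder ontology name).
+     #
+     #     urls = sbml_dfs.get_uri_urls(SBML_DFS.SPECIES)
+     #     reactome_urls = sbml_dfs.get_uri_urls(
+     #         SBML_DFS.SPECIES, required_ontology="reactome"
+     #     )
+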
+     def get_network_summary(self) -> Mapping[str, Any]:
+         """Return diagnostic statistics about the network
+
+         Returns:
+             Mapping[str, Any]: A dictionary of diagnostic statistics with entries:
+                 n_species_types [int]: Number of species types
+                 dict_n_species_per_type [dict[str, int]]: Number of
+                     species per species type
+                 n_species [int]: Number of species
+                 n_cspecies [int]: Number of compartmentalized species
+                 n_reaction_species [int]: Number of reaction species
+                 n_reactions [int]: Number of reactions
+                 n_compartments [int]: Number of compartments
+                 dict_n_species_per_compartment [dict[str, int]]:
+                     Number of species per compartment
+                 stats_species_per_reactions [dict[str, float]]:
+                     Statistics on the number of reactants per reaction
+                 top10_species_per_reactions [list[dict[str, Any]]]:
+                     Top 10 reactions with the highest number of reactants
+                 stats_degree [dict[str, float]]: Statistics on the degree
+                     of a species (number of reactions it is involved in)
+                 top10_degree [list[dict[str, Any]]]:
+                     Top 10 species with the highest degree
+                 stats_identifiers_per_species [dict[str, float]]:
+                     Statistics on the number of identifiers per species
+                 top10_identifiers_per_species [list[dict[str, Any]]]:
+                     Top 10 species with the highest number of identifiers
+         """
+         stats: MutableMapping[str, Any] = {}
+         species_features = self.get_species_features()
+         stats["n_species_types"] = species_features["species_type"].nunique()
+         stats["dict_n_species_per_type"] = (
+             species_features.groupby(by="species_type").size().to_dict()
+         )
+         stats["n_species"] = self.species.shape[0]
+         stats["n_cspecies"] = self.compartmentalized_species.shape[0]
+         stats["n_reaction_species"] = self.reaction_species.shape[0]
+         stats["n_reactions"] = self.reactions.shape[0]
+         stats["n_compartments"] = self.compartments.shape[0]
+         stats["dict_n_species_per_compartment"] = (
+             self.compartmentalized_species.groupby(SBML_DFS.C_ID)
+             .size()
+             .rename("n_species")  # type:ignore
+             .to_frame()
+             .join(self.compartments[[SBML_DFS.C_NAME]])
+             .reset_index(drop=False)
+             .to_dict(orient="records")
+         )
+         per_reaction_stats = self.reaction_species.groupby(SBML_DFS.R_ID).size()
+         stats["stats_species_per_reactions"] = per_reaction_stats.describe().to_dict()
+         stats["top10_species_per_reactions"] = (
+             per_reaction_stats.sort_values(ascending=False)  # type:ignore
+             .head(10)
+             .rename("n_species")
+             .to_frame()
+             .join(self.reactions[[SBML_DFS.R_NAME]])
+             .reset_index(drop=False)
+             .to_dict(orient="records")
+         )
+
+         cspecies_features = self.get_cspecies_features()
+         stats["stats_degree"] = cspecies_features["sc_degree"].describe().to_dict()
+         stats["top10_degree"] = (
+             cspecies_features.sort_values("sc_degree", ascending=False)
+             .head(10)[["sc_degree", "sc_children", "sc_parents", "species_type"]]
+             .merge(
+                 self.compartmentalized_species[[SBML_DFS.S_ID, SBML_DFS.C_ID]],
+                 on=SBML_DFS.SC_ID,
+             )
+             .merge(self.compartments[[SBML_DFS.C_NAME]], on=SBML_DFS.C_ID)
+             .merge(self.species[[SBML_DFS.S_NAME]], on=SBML_DFS.S_ID)
+             .reset_index(drop=False)
+             .to_dict(orient="records")
+         )
+         s_identifiers = sbml_dfs_utils.unnest_identifiers(
+             self.species, SBML_DFS.S_IDENTIFIERS
+         )
+         identifiers_stats = s_identifiers.groupby("s_id").size()
+         stats["stats_identifiers_per_species"] = identifiers_stats.describe().to_dict()
+         stats["top10_identifiers_per_species"] = (
+             identifiers_stats.sort_values(ascending=False)
+             .head(10)
+             .rename("n_identifiers")
+             .to_frame()
+             .join(species_features[[SBML_DFS.S_NAME, "species_type"]])
+             .reset_index(drop=False)
+             .to_dict(orient="records")
+         )
+
+         return stats
+
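+     # Example (illustrative sketch): the summary comes back as a plain dict, so it
+     # can be logged or serialized directly.
+     #
+     #     summary = sbml_dfs.get_network_summary()
+     #     print(summary["n_species"], summary["n_reactions"])
+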
+     def add_species_data(self, label: str, data: pd.DataFrame):
+         """Adds additional species_data with validation
+
+         Args:
+             label (str): the label for the new data
+             data (pd.DataFrame): the data
+
+         Raises:
+             ValueError: if the data is not valid, i.e., does not match `species`
+         """
+         self._validate_species_data(data)
+         if label in self.species_data:
+             raise ValueError(
+                 f"{label} already exists in species_data. Drop it first."
+             )
+         self.species_data[label] = data
+
+     def add_reactions_data(self, label: str, data: pd.DataFrame):
+         """Adds additional reactions_data with validation
+
+         Args:
+             label (str): the label for the new data
+             data (pd.DataFrame): the data
+
+         Raises:
+             ValueError: if the data is not valid, i.e., does not match `reactions`
+         """
+         self._validate_reactions_data(data)
+         if label in self.reactions_data:
+             raise ValueError(
+                 f"{label} already exists in reactions_data. Drop it first."
+             )
+         self.reactions_data[label] = data
+
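+     # Example (illustrative sketch): attach per-reaction values keyed by r_id; the
+     # frame's index must match the reactions table or validation raises.
+     #
+     #     import pandas as pd
+     #     scores = pd.DataFrame({"score": 1.0}, index=sbml_dfs.reactions.index)
+     #     sbml_dfs.add_reactions_data("my_scores", scores)
+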
+     def remove_compartmentalized_species(self, sc_ids: Iterable[str]):
+         """
+         Starting with a set of compartmentalized species, determine which reactions
+         should be removed based on their removal. Then remove these reactions, the
+         compartmentalized species, and the underlying species.
+         """
+
+         # find reactions which should be totally removed since they are losing critical species
+         removed_reactions = find_underspecified_reactions(self, sc_ids)
+         self.remove_reactions(removed_reactions)
+
+         self._remove_compartmentalized_species(sc_ids)
+
+         # remove species (and their associated species data) if all their cspecies have been lost
+         self._remove_unused_species()
+
+     def remove_reactions(self, r_ids: Iterable[str], remove_species: bool = False):
+         """Removes reactions from the model
+
+         Args:
+             r_ids (Iterable[str]): the reactions to remove
+             remove_species (bool, optional): whether to remove species that are no longer
+                 part of any reactions. Defaults to False.
+         """
+         # remove corresponding reaction_species
+         self.reaction_species = self.reaction_species.query("r_id not in @r_ids")
+         # remove reactions
+         self.reactions = self.reactions.drop(index=list(r_ids))
+         # remove reactions_data
+         if hasattr(self, "reactions_data"):
+             for k, data in self.reactions_data.items():
+                 self.reactions_data[k] = data.drop(index=list(r_ids))
+         # remove species if requested
+         if remove_species:
+             self._remove_unused_cspecies()
+             self._remove_unused_species()
+
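+     # Example (illustrative sketch): drop two reactions (placeholder ids) and prune
+     # any species left without reactions.
+     #
+     #     sbml_dfs.remove_reactions(["R00001", "R00002"], remove_species=True)
+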
+     def validate(self):
+         """Validates the object for obvious errors"""
+
+         if not hasattr(self, "schema"):
+             raise ValueError("No schema found")
+
+         required_tables = self._required_entities
+         schema_tables = set(self.schema.keys())
+
+         extra_tables = schema_tables.difference(required_tables)
+         if len(extra_tables) != 0:
+             logger.debug(
+                 f"{len(extra_tables)} unexpected tables found: "
+                 f"{', '.join(extra_tables)}"
+             )
+
+         missing_tables = required_tables.difference(schema_tables)
+         if len(missing_tables) != 0:
+             raise ValueError(
+                 f"Missing {len(missing_tables)} required tables: "
+                 f"{', '.join(missing_tables)}"
+             )
+
+         # check individual tables
+         for table in required_tables:
+             table_schema = self.schema[table]
+             table_data = getattr(self, table)
+
+             if not isinstance(table_data, pd.DataFrame):
+                 raise ValueError(
+                     f"{table} must be a pd.DataFrame, but was a {type(table_data)}"
+                 )
+
+             # check index
+             expected_index_name = table_schema["pk"]
+             if table_data.index.name != expected_index_name:
+                 raise ValueError(
+                     f"the index name for {table} was not the pk: "
+                     f"{expected_index_name}"
+                 )
+
+             # check that all entries in the index are unique
+             if len(set(table_data.index.tolist())) != table_data.shape[0]:
+                 duplicated_pks = table_data.index.value_counts()
+                 duplicated_pks = duplicated_pks[duplicated_pks > 1]
+
+                 example_duplicates = duplicated_pks.index[
+                     0 : min(duplicated_pks.shape[0], 5)
+                 ]
+                 raise ValueError(
+                     f"{duplicated_pks.shape[0]} primary keys were "
+                     f"duplicated including {', '.join(example_duplicates)}"
+                 )
+
+             # check variables
+             expected_vars = set(table_schema["vars"])
+             table_vars = set(list(table_data.columns))
+
+             extra_vars = table_vars.difference(expected_vars)
+             if len(extra_vars) != 0:
+                 logger.debug(
+                     f"{len(extra_vars)} extra variables were found"
+                     f" for {table}: {', '.join(extra_vars)}"
+                 )
+
+             missing_vars = expected_vars.difference(table_vars)
+             if len(missing_vars) != 0:
+                 raise ValueError(
+                     f"Missing {len(missing_vars)} required variables"
+                     f" for {table}: {', '.join(missing_vars)}"
+                 )
+
+             # check that the table is not empty
+             if table_data.shape[0] == 0:
+                 raise ValueError(f"{table} contained no entries")
+
+         # check whether pks and fks agree
+         pk_df = pd.DataFrame(
+             [{"pk_table": k, "key": v["pk"]} for k, v in self.schema.items()]
+         )
+
+         fk_df = (
+             pd.DataFrame(
+                 [
+                     {"fk_table": k, "fk": v["fk"]}
+                     for k, v in self.schema.items()
+                     if "fk" in v.keys()
+                 ]
+             )
+             .set_index("fk_table")["fk"]
+             .apply(pd.Series)
+             .reset_index()
+             .melt(id_vars="fk_table")
+             .drop(["variable"], axis=1)
+             .rename(columns={"value": "key"})
+         )
+
+         pk_fk_correspondences = pk_df.merge(fk_df)
+
+         for i in range(0, pk_fk_correspondences.shape[0]):
+             pk_table_keys = set(
+                 getattr(self, pk_fk_correspondences["pk_table"][i]).index.tolist()
+             )
+             if None in pk_table_keys:
+                 raise ValueError(
+                     f"{pk_fk_correspondences['pk_table'][i]} had "
+                     "missing values in its index"
+                 )
+
+             fk_table_keys = set(
+                 getattr(self, pk_fk_correspondences["fk_table"][i]).loc[
+                     :, pk_fk_correspondences["key"][i]
+                 ]
+             )
+             if None in fk_table_keys:
+                 raise ValueError(
+                     f"{pk_fk_correspondences['fk_table'][i]} included "
+                     f"missing {pk_fk_correspondences['key'][i]} values"
+                 )
+
+             # all foreign keys need to match a primary key
+             extra_fks = fk_table_keys.difference(pk_table_keys)
+             if len(extra_fks) != 0:
+                 raise ValueError(
+                     f"{len(extra_fks)} distinct "
+                     f"{pk_fk_correspondences['key'][i]} values were"
+                     f" found in {pk_fk_correspondences['fk_table'][i]} "
+                     f"but missing from {pk_fk_correspondences['pk_table'][i]}."
+                     " All foreign keys must have a matching primary key.\n\n"
+                     f"Extra keys are: {', '.join(extra_fks)}"
+                 )
+
+         # check optional data tables:
+         for k, v in self.species_data.items():
+             try:
+                 self._validate_species_data(v)
+             except ValueError as e:
+                 raise ValueError(f"species data {k} was invalid.") from e
+
+         for k, v in self.reactions_data.items():
+             try:
+                 self._validate_reactions_data(v)
+             except ValueError as e:
+                 raise ValueError(f"reactions data {k} was invalid.") from e
+
+         # validate reaction_species sbo_terms and stoichiometry
+         self._validate_reaction_species()
+
+     def validate_and_resolve(self):
+         """Call validate and try to iteratively resolve common validation errors"""
+
+         current_exception = None
+         validated = False
+
+         while not validated:
+             try:
+                 self.validate()
+                 validated = True
+             except Exception as e:
+                 e_str = str(e)
+                 if e_str == current_exception:
+                     logger.warning(
+                         "Automated resolution of an Exception was attempted but failed"
+                     )
+                     raise e
+
+                 # track the error so a repeated failure aborts rather than looping forever
+                 current_exception = e_str
+                 # try to resolve
+                 self._attempt_resolve(e)
+
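+     # Example (illustrative sketch): validate_and_resolve() loops validate() and
+     # the resolvers below until the model passes or the same error repeats.
+     #
+     #     try:
+     #         sbml_dfs.validate_and_resolve()
+     #     except ValueError as e:
+     #         logger.error(f"unresolvable model issue: {e}")
+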
+     def _remove_unused_cspecies(self):
+         """Removes compartmentalized species that are no
+         longer part of any reactions"""
+         sc_ids = self._get_unused_cspecies()
+         self._remove_compartmentalized_species(sc_ids)
+
+     def _get_unused_cspecies(self) -> set[str]:
+         """Returns a set of compartmentalized species
+         that are not part of any reactions"""
+         sc_ids = set(self.compartmentalized_species.index) - set(
+             self.reaction_species[SBML_DFS.SC_ID]
+         )
+         return sc_ids  # type: ignore
+
+     def _remove_unused_species(self):
+         """Removes species that are no longer part of any
+         compartmentalized species"""
+         s_ids = self._get_unused_species()
+         self._remove_species(s_ids)
+
+     def _get_unused_species(self) -> set[str]:
+         """Returns a set of species that are not part of any
+         compartmentalized species"""
+         s_ids = set(self.species.index) - set(
+             self.compartmentalized_species[SBML_DFS.S_ID]
+         )
+         return s_ids  # type: ignore
+
+     def _remove_compartmentalized_species(self, sc_ids: Iterable[str]):
+         """Removes compartmentalized species from the model
+
+         This should not be used directly by the user, as it can lead to
+         invalid reactions when removing species without logic to decide
+         if the reaction needs to be removed as well.
+
+         Args:
+             sc_ids (Iterable[str]): the compartmentalized species to remove
+         """
+         # Remove compartmentalized species
+         self.compartmentalized_species = self.compartmentalized_species.drop(
+             index=list(sc_ids)
+         )
+         # remove corresponding reaction_species
+         self.reaction_species = self.reaction_species.query("sc_id not in @sc_ids")
+
+     def _remove_species(self, s_ids: Iterable[str]):
+         """Removes species from the model
+
+         This should not be used directly by the user, as it can lead to
+         invalid reactions when removing species without logic to decide
+         if the reaction needs to be removed as well.
+
+         This removes the species and corresponding compartmentalized species and
+         reaction_species.
+
+         Args:
+             s_ids (Iterable[str]): the species to remove
+         """
+         sc_ids = self.compartmentalized_species.query("s_id in @s_ids").index.tolist()
+         self._remove_compartmentalized_species(sc_ids)
+         # Remove species
+         self.species = self.species.drop(index=list(s_ids))
+         # remove data
+         for k, data in self.species_data.items():
+             self.species_data[k] = data.drop(index=list(s_ids))
+
+     def _validate_species_data(self, species_data_table: pd.DataFrame):
+         """Validates a species data attribute
+
+         Args:
+             species_data_table (pd.DataFrame): a species data table
+
+         Raises:
+             ValueError: s_id not index name
+             ValueError: s_id index contains duplicates
+             ValueError: s_id not in species table
+         """
+         _validate_matching_data(species_data_table, self.species)
+
+     def _validate_reactions_data(self, reactions_data_table: pd.DataFrame):
+         """Validates a reactions data attribute
+
+         Args:
+             reactions_data_table (pd.DataFrame): a reactions data table
+
+         Raises:
+             ValueError: r_id not index name
+             ValueError: r_id index contains duplicates
+             ValueError: r_id not in reactions table
+         """
+         _validate_matching_data(reactions_data_table, self.reactions)
+
+     def _validate_reaction_species(self):
+         assert all(self.reaction_species[SBML_DFS.STOICHIOMETRY].notnull())
+
+         # test for null SBO terms
+         n_null_sbo_terms = sum(self.reaction_species[SBML_DFS.SBO_TERM].isnull())
+         if n_null_sbo_terms != 0:
+             raise ValueError(
+                 f"{n_null_sbo_terms} sbo_terms were None; all terms should be defined"
+             )
+
+         # find invalid SBO terms
+         sbo_counts = self.reaction_species.value_counts(SBML_DFS.SBO_TERM)
+         invalid_sbo_term_counts = sbo_counts[
+             ~sbo_counts.index.isin(MINI_SBO_TO_NAME.keys())
+         ]
+
+         if invalid_sbo_term_counts.shape[0] != 0:
+             invalid_sbo_counts_str = ", ".join(
+                 [f"{k} (N={v})" for k, v in invalid_sbo_term_counts.to_dict().items()]
+             )
+             raise ValueError(
+                 f"{invalid_sbo_term_counts.shape[0]} sbo_terms were not "
+                 f"defined {invalid_sbo_counts_str}"
+             )
+
+     def _attempt_resolve(self, e):
+         str_e = str(e)
+         if str_e == "compartmentalized_species included missing c_id values":
+             logger.warning(str_e)
+             logger.warning(
+                 "Attempting to resolve with infer_uncompartmentalized_species_location()"
+             )
+             self = infer_uncompartmentalized_species_location(self)
+         elif re.search("sbo_terms were not defined", str_e):
+             logger.warning(str_e)
+             logger.warning("Attempting to resolve with infer_sbo_terms()")
+             self = infer_sbo_terms(self)
+         else:
+             logger.warning(
+                 "An error occurred which could not be automatically resolved"
+             )
+             raise e
+
+
+ def species_status(s_id: str, sbml_dfs: SBML_dfs) -> pd.DataFrame:
+     """
+     Species Status
+
+     Return all of the reactions a species participates in.
+
+     Parameters:
+     s_id: str
+         A species ID
+     sbml_dfs: SBML_dfs
+
+     Returns:
+         pd.DataFrame, one row per reaction
+     """
+
+     matching_species = sbml_dfs.species.loc[s_id]
+
+     if not isinstance(matching_species, pd.Series):
+         raise ValueError(f"{s_id} did not match a single species")
+
+     # find all reactions the species participates in
+     matching_compartmentalized_species = sbml_dfs.compartmentalized_species[
+         sbml_dfs.compartmentalized_species.s_id.isin([s_id])
+     ]
+
+     rxns_participating = sbml_dfs.reaction_species[
+         sbml_dfs.reaction_species.sc_id.isin(matching_compartmentalized_species.index)
+     ]
+
+     # find all participants in these reactions
+     full_rxns_participating = sbml_dfs.reaction_species[
+         sbml_dfs.reaction_species.r_id.isin(rxns_participating[SBML_DFS.R_ID])
+     ].merge(
+         sbml_dfs.compartmentalized_species, left_on=SBML_DFS.SC_ID, right_index=True
+     )
+
+     reaction_descriptions = pd.concat(
+         [
+             reaction_summary(x, sbml_dfs)
+             for x in set(full_rxns_participating[SBML_DFS.R_ID].tolist())
+         ]
+     )
+
+     status = (
+         full_rxns_participating.loc[
+             full_rxns_participating[SBML_DFS.SC_ID].isin(
+                 matching_compartmentalized_species.index.values.tolist()
+             ),
+             [SBML_DFS.SC_NAME, SBML_DFS.STOICHIOMETRY, SBML_DFS.R_ID],
+         ]
+         .merge(reaction_descriptions, left_on=SBML_DFS.R_ID, right_index=True)
+         .reset_index(drop=True)
+         .drop(SBML_DFS.R_ID, axis=1)
+     )
+
+     return status
+
+
+ def reaction_summary(r_id: str, sbml_dfs: SBML_dfs) -> pd.DataFrame:
+     """
+     Reaction Summary
+
+     Return a reaction's name and a human-readable formula.
+
+     Parameters:
+     r_id: str
+         A reaction ID
+     sbml_dfs: SBML_dfs
+
+     Returns:
+         one row pd.DataFrame
+     """
+
+     logger.warning(
+         "reaction_summary is deprecated and will be removed in a future version of rcpr; "
+         "please use reaction_summaries() instead"
+     )
+
+     matching_reaction = sbml_dfs.reactions.loc[r_id]
+
+     if not isinstance(matching_reaction, pd.Series):
+         raise ValueError(f"{r_id} did not match a single reaction")
+
+     matching_reaction_species = sbml_dfs.reaction_species[
+         sbml_dfs.reaction_species.r_id.isin([r_id])
+     ].merge(
+         sbml_dfs.compartmentalized_species, left_on=SBML_DFS.SC_ID, right_index=True
+     )
+
+     # collapse all reaction species to a formula string
+     if len(matching_reaction_species[SBML_DFS.C_ID].unique()) == 1:
+         augmented_matching_reaction_species = matching_reaction_species.merge(
+             sbml_dfs.compartments, left_on=SBML_DFS.C_ID, right_index=True
+         ).merge(sbml_dfs.species, left_on=SBML_DFS.S_ID, right_index=True)
+         str_formula = (
+             construct_formula_string(
+                 augmented_matching_reaction_species, sbml_dfs.reactions, SBML_DFS.S_NAME
+             )
+             + " ["
+             + augmented_matching_reaction_species[SBML_DFS.C_NAME].iloc[0]
+             + "]"
+         )
+     else:
+         str_formula = construct_formula_string(
+             matching_reaction_species, sbml_dfs.reactions, SBML_DFS.SC_NAME
+         )
+
+     output = pd.DataFrame(
+         {
+             SBML_DFS.R_NAME: matching_reaction[SBML_DFS.R_NAME],
+             "r_formula_str": str_formula,
+         },
+         index=[r_id],
+     )
+
+     output.index.name = SBML_DFS.R_ID
+
+     return output
+
+
+ def reaction_summaries(sbml_dfs: SBML_dfs, r_ids=None) -> pd.Series:
+     """
+     Reaction Summaries
+
+     Return human-readable formulas for reactions.
+
+     Parameters:
+     ----------
+     sbml_dfs: sbml.SBML_dfs
+         A relational mechanistic model
+     r_ids: [str], str or None
+         Reaction IDs or None for all reactions
+
+     Returns:
+     ----------
+     formula_strs: pd.Series
+     """
+
+     if isinstance(r_ids, str):
+         r_ids = [r_ids]
+
+     if r_ids is None:
+         matching_reactions = sbml_dfs.reactions
+     else:
+         matching_reactions = sbml_dfs.reactions.loc[r_ids]
+
+     matching_reaction_species = sbml_dfs.reaction_species[
+         sbml_dfs.reaction_species.r_id.isin(matching_reactions.index)
+     ].merge(
+         sbml_dfs.compartmentalized_species, left_on=SBML_DFS.SC_ID, right_index=True
+     )
+
+     # split into within-compartment and cross-compartment reactions
+     r_id_compartment_counts = matching_reaction_species.groupby(SBML_DFS.R_ID)[
+         SBML_DFS.C_ID
+     ].nunique()
+
+     # identify reactions which work across compartments
+     r_id_cross_compartment = r_id_compartment_counts[r_id_compartment_counts > 1]
+     # these species must be labelled with the sc_name to specify where a species exists
+     if r_id_cross_compartment.shape[0] > 0:
+         rxn_eqtn_cross_compartment = (
+             matching_reaction_species[
+                 matching_reaction_species[SBML_DFS.R_ID].isin(
+                     r_id_cross_compartment.index
+                 )
+             ]
+             .sort_values([SBML_DFS.SC_NAME])
+             .groupby(SBML_DFS.R_ID)
+             .apply(
+                 lambda x: construct_formula_string(
+                     x, sbml_dfs.reactions, SBML_DFS.SC_NAME
+                 )
+             )
+             .rename("r_formula_str")
+         )
+     else:
+         rxn_eqtn_cross_compartment = None
+
+     # identify reactions which occur within a single compartment; for these the reaction
+     # can be labelled with the compartment and individual species can receive a more readable s_name
+     r_id_within_compartment = r_id_compartment_counts[r_id_compartment_counts == 1]
+     if r_id_within_compartment.shape[0] > 0:
+         # add s_name
+         augmented_matching_reaction_species = (
+             matching_reaction_species[
+                 matching_reaction_species[SBML_DFS.R_ID].isin(
+                     r_id_within_compartment.index
+                 )
+             ]
+             .merge(sbml_dfs.compartments, left_on=SBML_DFS.C_ID, right_index=True)
+             .merge(sbml_dfs.species, left_on=SBML_DFS.S_ID, right_index=True)
+             .sort_values([SBML_DFS.S_NAME])
+         )
+         # create formulas based on s_names of components
+         rxn_eqtn_within_compartment = augmented_matching_reaction_species.groupby(
+             [SBML_DFS.R_ID, SBML_DFS.C_NAME]
+         ).apply(
+             lambda x: construct_formula_string(x, sbml_dfs.reactions, SBML_DFS.S_NAME)
+         )
+         # add compartment for each reaction
+         rxn_eqtn_within_compartment = pd.Series(
+             [
+                 y + ": " + x
+                 for x, y in zip(
+                     rxn_eqtn_within_compartment,
+                     rxn_eqtn_within_compartment.index.get_level_values(SBML_DFS.C_NAME),
+                 )
+             ],
+             index=rxn_eqtn_within_compartment.index.get_level_values(SBML_DFS.R_ID),
+         ).rename("r_formula_str")
+     else:
+         rxn_eqtn_within_compartment = None
+
+     formula_strs = pd.concat([rxn_eqtn_cross_compartment, rxn_eqtn_within_compartment])
+
+     return formula_strs
+
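+ # Example (illustrative sketch): formula strings for a reaction (placeholder id).
+ #
+ #     formulas = reaction_summaries(sbml_dfs, r_ids=["R00001"])
+ #     print(formulas.iloc[0])  # e.g. "cytosol: A + B -> C"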
+
+ def construct_formula_string(
+     reaction_species_df: pd.DataFrame,
+     reactions_df: pd.DataFrame,
+     name_var: str,
+ ) -> str:
+     """
+     Construct Formula String
+
+     Convert a table of reaction species into a formula string
+
+     Parameters:
+     ----------
+     reaction_species_df: pd.DataFrame
+         Table containing a reaction's species
+     reactions_df: pd.DataFrame
+         sbml.reactions
+     name_var: str
+         Name used to label species
+
+     Returns:
+     ----------
+     formula_str: str
+         String representation of a reaction's substrates, products and
+         modifiers
+     """
+
+     reaction_species_df["label"] = [
+         add_stoi_to_species_name(x, y)
+         for x, y in zip(
+             reaction_species_df[SBML_DFS.STOICHIOMETRY], reaction_species_df[name_var]
+         )
+     ]
+
+     rxn_reversible = bool(
+         reactions_df.loc[
+             reaction_species_df[SBML_DFS.R_ID].iloc[0], SBML_DFS.R_ISREVERSIBLE
+         ]
+     )  # convert from a np.bool_ to bool if needed
+     assert isinstance(rxn_reversible, bool)
+
+     if rxn_reversible:
+         arrow_type = " <-> "
+     else:
+         arrow_type = " -> "
+
+     substrates = " + ".join(
+         reaction_species_df["label"][
+             reaction_species_df[SBML_DFS.STOICHIOMETRY] < 0
+         ].tolist()
+     )
+     products = " + ".join(
+         reaction_species_df["label"][
+             reaction_species_df[SBML_DFS.STOICHIOMETRY] > 0
+         ].tolist()
+     )
+     modifiers = " + ".join(
+         reaction_species_df["label"][
+             reaction_species_df[SBML_DFS.STOICHIOMETRY] == 0
+         ].tolist()
+     )
+     if modifiers != "":
+         modifiers = f" ---- modifiers: {modifiers}"
+
+     return f"{substrates}{arrow_type}{products}{modifiers}"
+
+
+ def add_stoi_to_species_name(stoi: float | int, name: str) -> str:
+     """
+     Add Stoi To Species Name
+
+     Add the number of molecules to a species name
+
+     Parameters:
+     ----------
+     stoi: float or int
+         Number of molecules
+     name: str
+         Name of species
+
+     Returns:
+     ----------
+     name: str
+         Name prefixed with the number of molecules
+     """
+
+     if stoi in [-1, 0, 1]:
+         return name
+     else:
+         return str(abs(stoi)) + " " + name
+
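+ # Example (illustrative sketch): stoichiometries of -1/0/1 leave names unchanged;
+ # larger magnitudes are prefixed with the molecule count.
+ #
+ #     add_stoi_to_species_name(-2, "ATP")  # "2 ATP"
+ #     add_stoi_to_species_name(1, "ATP")   # "ATP"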
+
+ def filter_to_characteristic_species_ids(
+     species_ids: pd.DataFrame,
+     max_complex_size: int = 4,
+     max_promiscuity: int = 20,
+     defining_biological_qualifiers: list[str] = BQB_DEFINING_ATTRS,
+ ) -> pd.DataFrame:
+     """
+     Filter to Characteristic Species IDs
+
+     Remove identifiers corresponding to one component within large protein
+     complexes and non-characteristic annotations such as pubmed references and
+     homologues.
+
+     Parameters
+     ----------
+     species_ids: pd.DataFrame
+         A table of identifiers produced by sbml_dfs.get_identifiers("species")
+     max_complex_size: int
+         The largest size of a complex where BQB_HAS_PART terms will be retained.
+         In most cases, complexes are handled with specific formation and
+         dissolution reactions, but these identifiers will be pulled in when
+         searching by identifiers or searching the identifiers associated with a
+         species against an external resource such as Open Targets.
+     max_promiscuity: int
+         Maximum number of species where a single molecule can act as a
+         BQB_HAS_PART component associated with a single identifier (and common ontology).
+     defining_biological_qualifiers (list[str]):
+         BQB codes which define distinct entities. Narrowly this would be BQB_IS, while more
+         permissive settings would include homologs and different forms of the same gene.
+
+     Returns:
+     --------
+     species_ids: pd.DataFrame
+         Input species filtered to characteristic identifiers
+     """
+
+     if not isinstance(species_ids, pd.DataFrame):
+         raise TypeError(
+             f"species_ids was a {type(species_ids)} but must be a pd.DataFrame"
+         )
+
+     if not isinstance(max_complex_size, int):
+         raise TypeError(
+             f"max_complex_size was a {type(max_complex_size)} but must be an int"
+         )
+
+     if not isinstance(max_promiscuity, int):
+         raise TypeError(
+             f"max_promiscuity was a {type(max_promiscuity)} but must be an int"
+         )
+
+     if not isinstance(defining_biological_qualifiers, list):
+         raise TypeError(
+             f"defining_biological_qualifiers was a {type(defining_biological_qualifiers)} but must be a list"
+         )
+
+     # primary annotations of a species
+     bqb_is_species = species_ids.query("bqb in @defining_biological_qualifiers")
+
+     # add components within modestly sized protein complexes
+     # look at HAS_PART IDs
+     bqb_has_parts_species = species_ids[species_ids[IDENTIFIERS.BQB] == BQB.HAS_PART]
+     # filter to genes
+     bqb_has_parts_species = bqb_has_parts_species[
+         bqb_has_parts_species[IDENTIFIERS.ONTOLOGY].isin(
+             CHARACTERISTIC_COMPLEX_ONTOLOGIES
+         )
+     ]
+
+     # number of species in a complex
+     n_species_components = bqb_has_parts_species.value_counts(
+         [IDENTIFIERS.ONTOLOGY, SBML_DFS.S_ID]
+     )
+     big_complex_sids = set(
+         n_species_components[
+             n_species_components > max_complex_size
+         ].index.get_level_values(SBML_DFS.S_ID)
+     )
+
+     # number of complexes a species is part of
+     n_complexes_involvedin = bqb_has_parts_species.value_counts(
+         [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
+     )
+     promiscuous_component_identifiers_index = n_complexes_involvedin[
+         n_complexes_involvedin > max_promiscuity
+     ].index
+     promiscuous_component_identifiers = pd.Series(
+         data=[True] * len(promiscuous_component_identifiers_index),
+         index=promiscuous_component_identifiers_index,
+         name="is_shared_component",
+     )
+
+     if len(promiscuous_component_identifiers) == 0:
+         # no complexes to filter
+         return species_ids
+
+     filtered_bqb_has_parts = bqb_has_parts_species.merge(
+         promiscuous_component_identifiers,
+         left_on=[IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER],
+         right_index=True,
+         how="left",
+     )
+
+     filtered_bqb_has_parts["is_shared_component"] = filtered_bqb_has_parts[
+         "is_shared_component"
+     ].fillna(False)
+     # drop identifiers shared as components across many species
+     filtered_bqb_has_parts = filtered_bqb_has_parts[
+         ~filtered_bqb_has_parts["is_shared_component"]
+     ].drop(["is_shared_component"], axis=1)
+     # drop species parts if there are many components
+     filtered_bqb_has_parts = filtered_bqb_has_parts[
+         ~filtered_bqb_has_parts[SBML_DFS.S_ID].isin(big_complex_sids)
+     ]
+
+     # combine primary identifiers and rare components
+     characteristic_species_ids = pd.concat(
+         [
+             bqb_is_species,
+             filtered_bqb_has_parts,
+         ]
+     )
+
+     return characteristic_species_ids
+
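+ # Example (illustrative sketch): restrict species identifiers to characteristic
+ # annotations before exporting or matching against external resources.
+ #
+ #     species_ids = sbml_dfs.get_identifiers(SBML_DFS.SPECIES)
+ #     characteristic_ids = filter_to_characteristic_species_ids(
+ #         species_ids, max_complex_size=4, max_promiscuity=20
+ #     )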
+
+ def infer_uncompartmentalized_species_location(sbml_dfs: SBML_dfs) -> SBML_dfs:
+     """
+     Infer Uncompartmentalized Species Location
+
+     If the compartment of a subset of compartmentalized species
+     was not specified, infer an appropriate compartment from
+     other members of reactions they participate in
+
+     Parameters:
+     ----------
+     sbml_dfs: sbml.SBML_dfs
+         A relational pathway model
+
+     Returns:
+     ----------
+     sbml_dfs: sbml.SBML_dfs
+         A relational pathway model (with filled-in species compartments)
+     """
+
+     # find a default compartment to fall back on if all compartmental information is missing
+     default_compartment = (
+         sbml_dfs.compartmentalized_species.value_counts(SBML_DFS.C_ID)
+         .rename("N")
+         .reset_index()
+         .sort_values("N", ascending=False)[SBML_DFS.C_ID]
+         .iloc[0]
+     )
+     if not isinstance(default_compartment, str):
+         raise ValueError(
+             "No default compartment could be found - compartment "
+             "information may not be present"
+         )
+
+     # infer the compartments of species missing compartments
+     missing_compartment_scids = sbml_dfs.compartmentalized_species[
+         sbml_dfs.compartmentalized_species[SBML_DFS.C_ID].isnull()
+     ].index.tolist()
+     if len(missing_compartment_scids) == 0:
+         logger.info(
+             "All compartmentalized species have compartments, "
+             "returning input sbml_dfs"
+         )
+         return sbml_dfs
+
+     participating_reactions = (
+         sbml_dfs.reaction_species[
+             sbml_dfs.reaction_species[SBML_DFS.SC_ID].isin(missing_compartment_scids)
+         ][SBML_DFS.R_ID]
+         .unique()
+         .tolist()
+     )
+     reaction_participants = sbml_dfs.reaction_species[
+         sbml_dfs.reaction_species[SBML_DFS.R_ID].isin(participating_reactions)
+     ].reset_index(drop=True)[[SBML_DFS.SC_ID, SBML_DFS.R_ID]]
+     reaction_participants = reaction_participants.merge(
+         sbml_dfs.compartmentalized_species[SBML_DFS.C_ID],
+         left_on=SBML_DFS.SC_ID,
+         right_index=True,
+     )
+
+     # define where a reaction is most likely to occur based on the
+     # compartmentalization of its participants
+     primary_reaction_compartment = (
+         reaction_participants.value_counts([SBML_DFS.R_ID, SBML_DFS.C_ID])
+         .rename("N")
+         .reset_index()
+         .sort_values("N", ascending=False)
+         .groupby(SBML_DFS.R_ID)
+         .first()[SBML_DFS.C_ID]
+         .reset_index()
+     )
+
+     inferred_compartmentalization = (
+         sbml_dfs.reaction_species[
+             sbml_dfs.reaction_species[SBML_DFS.SC_ID].isin(missing_compartment_scids)
+         ]
+         .merge(primary_reaction_compartment)
+         .value_counts([SBML_DFS.SC_ID, SBML_DFS.C_ID])
+         .rename("N")
+         .reset_index()
+         .sort_values("N", ascending=False)
+         .groupby(SBML_DFS.SC_ID)
+         .first()
+         .reset_index()[[SBML_DFS.SC_ID, SBML_DFS.C_ID]]
+     )
+     logger.info(
+         f"{inferred_compartmentalization.shape[0]} species' compartmentalization inferred"
+     )
+
+     # fall back to the default compartment for species whose compartmentalization
+     # could not be inferred from other reaction participants
+     species_with_unknown_compartmentalization = set(
+         missing_compartment_scids
+     ).difference(set(inferred_compartmentalization[SBML_DFS.SC_ID].tolist()))
+     if len(species_with_unknown_compartmentalization) != 0:
+         logger.warning(
+             f"{len(species_with_unknown_compartmentalization)} "
+             "species' compartmentalization could not be inferred"
+             " from other reaction participants. Their compartmentalization "
+             f"will be set to the default of {default_compartment}"
+         )
+
+         inferred_compartmentalization = pd.concat(
+             [
+                 inferred_compartmentalization,
+                 pd.DataFrame(
+                     {SBML_DFS.SC_ID: list(species_with_unknown_compartmentalization)}
+                 ).assign(c_id=default_compartment),
+             ]
+         )
+
+     if len(missing_compartment_scids) != inferred_compartmentalization.shape[0]:
+         raise ValueError(
+             f"{inferred_compartmentalization.shape[0]} were inferred but {len(missing_compartment_scids)} are required"
+         )
+
+     updated_compartmentalized_species = pd.concat(
+         [
+             sbml_dfs.compartmentalized_species[
+                 ~sbml_dfs.compartmentalized_species[SBML_DFS.C_ID].isnull()
+             ],
+             sbml_dfs.compartmentalized_species[
+                 sbml_dfs.compartmentalized_species[SBML_DFS.C_ID].isnull()
+             ]
+             .drop(SBML_DFS.C_ID, axis=1)
+             .merge(
+                 inferred_compartmentalization, left_index=True, right_on=SBML_DFS.SC_ID
+             )
+             .set_index(SBML_DFS.SC_ID),
+         ]
+     )
+
+     if (
+         updated_compartmentalized_species.shape[0]
+         != sbml_dfs.compartmentalized_species.shape[0]
+     ):
+         raise ValueError(
+             f"Trying to overwrite {sbml_dfs.compartmentalized_species.shape[0]}"
+             " compartmentalized species with "
+             f"{updated_compartmentalized_species.shape[0]}"
+         )
+
+     if any(updated_compartmentalized_species[SBML_DFS.C_ID].isnull()):
+         raise ValueError("Some species compartments are still missing")
+
+     sbml_dfs.compartmentalized_species = updated_compartmentalized_species
+
+     return sbml_dfs
+
+
+ def infer_sbo_terms(sbml_dfs: SBML_dfs) -> SBML_dfs:
+     """
+     Infer SBO Terms
+
+     Define SBO terms based on stoichiometry for reaction_species with missing terms
+
+     Parameters:
+     ----------
+     sbml_dfs: sbml.SBML_dfs
+         A relational pathway model
+
+     Returns:
+     ----------
+     sbml_dfs: sbml.SBML_dfs
+         A relational pathway model (with missing/invalid reaction species sbo_terms resolved)
+     """
+
+     valid_sbo_terms = sbml_dfs.reaction_species[
+         sbml_dfs.reaction_species[SBML_DFS.SBO_TERM].isin(MINI_SBO_TO_NAME.keys())
+     ]
+
+     # copy so assignments below do not modify a view of reaction_species
+     invalid_sbo_terms = sbml_dfs.reaction_species[
+         ~sbml_dfs.reaction_species[SBML_DFS.SBO_TERM].isin(MINI_SBO_TO_NAME.keys())
+     ].copy()
+
+     assert all(sbml_dfs.reaction_species[SBML_DFS.SBO_TERM].notnull())
+     if invalid_sbo_terms.shape[0] == 0:
+         logger.info("All sbo_terms were valid; returning input sbml_dfs")
+         return sbml_dfs
+
+     logger.info(f"Updating {invalid_sbo_terms.shape[0]} reaction_species' sbo_term")
+
+     # add missing/invalid terms based on stoichiometry
+     # (SBML_DFS.STOICHIOMETRY is the column-name constant; the original lookup via
+     # the sbml_dfs instance would raise an AttributeError)
+     invalid_sbo_terms.loc[
+         invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] < 0, SBML_DFS.SBO_TERM
+     ] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.REACTANT]
+
+     invalid_sbo_terms.loc[
+         invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] > 0, SBML_DFS.SBO_TERM
+     ] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.PRODUCT]
+
+     invalid_sbo_terms.loc[
+         invalid_sbo_terms[SBML_DFS.STOICHIOMETRY] == 0, SBML_DFS.SBO_TERM
+     ] = MINI_SBO_FROM_NAME[SBOTERM_NAMES.STIMULATOR]
+
+     updated_reaction_species = pd.concat(
+         [valid_sbo_terms, invalid_sbo_terms]
+     ).sort_index()
+
+     assert sbml_dfs.reaction_species.shape[0] == updated_reaction_species.shape[0]
+     sbml_dfs.reaction_species = updated_reaction_species
+
+     return sbml_dfs
+
+
+ def name_compartmentalized_species(sbml_dfs):
+     """
+     Name Compartmentalized Species
+
+     Rename compartmentalized species if they have the same
+     name as their species
+
+     Parameters
+     ----------
+     sbml_dfs : SBML_dfs
+         A model formed by aggregating pathways
+
+     Returns:
+     ----------
+     sbml_dfs
+     """
+
+     augmented_cspecies = sbml_dfs.compartmentalized_species.merge(
+         sbml_dfs.species[SBML_DFS.S_NAME], left_on=SBML_DFS.S_ID, right_index=True
+     ).merge(
+         sbml_dfs.compartments[SBML_DFS.C_NAME], left_on=SBML_DFS.C_ID, right_index=True
+     )
+     augmented_cspecies[SBML_DFS.SC_NAME] = [
+         f"{s} [{c}]" if sc == s else sc
+         for sc, c, s in zip(
+             augmented_cspecies[SBML_DFS.SC_NAME],
+             augmented_cspecies[SBML_DFS.C_NAME],
+             augmented_cspecies[SBML_DFS.S_NAME],
+         )
+     ]
+
+     sbml_dfs.compartmentalized_species = augmented_cspecies.loc[
+         :, sbml_dfs.schema[SBML_DFS.COMPARTMENTALIZED_SPECIES]["vars"]
+     ]
+
+     return sbml_dfs
+
+
+ def export_sbml_dfs(
+     model_prefix: str,
+     sbml_dfs: SBML_dfs,
+     outdir: str,
+     overwrite: bool = False,
+     dogmatic: bool = True,
+ ) -> None:
+     """
+     Export SBML_dfs
+
+     Export summaries of species identifiers and each table underlying
+     an SBML_dfs pathway model
+
+     Parameters
+     ----------
+     model_prefix: str
+         Label to prepend to all exported files
+     sbml_dfs: sbml.SBML_dfs
+         A pathway model
+     outdir: str
+         Path to an existing directory where results should be saved
+     overwrite: bool
+         Should the directory be overwritten if it already exists?
+     dogmatic: bool
+         If True then treat genes, transcripts, and proteins as separate species. If False
+         then treat them interchangeably.
+
+     Returns
+     -------
+     None
+     """
+
+     if not isinstance(model_prefix, str):
+         raise TypeError(f"model_prefix was a {type(model_prefix)} and must be a str")
+     if not isinstance(sbml_dfs, SBML_dfs):
+         raise TypeError(
+             f"sbml_dfs was a {type(sbml_dfs)} and must be an sbml.SBML_dfs"
+         )
+     # select valid BQB attributes based on dogmatic flag
+     defining_biological_qualifiers = sbml_dfs_utils._dogmatic_to_defining_bqbs(dogmatic)
+
+     # pre-summarize ontologies
+     species_identifiers = sbml_dfs.get_identifiers(SBML_DFS.SPECIES)
+     # drop some BQB_HAS_PART annotations
+     species_identifiers = filter_to_characteristic_species_ids(
+         species_identifiers,
+         defining_biological_qualifiers=defining_biological_qualifiers,
+     )
+
+     try:
+         utils.initialize_dir(outdir, overwrite=overwrite)
+     except FileExistsError:
+         logger.warning(
+             f"Directory {outdir} already exists and overwrite is False. "
+             "Files will be added to the existing directory."
+         )
+     with open_fs(outdir, writeable=True) as fs:
+         species_identifiers_path = (
+             model_prefix + CPR_STANDARD_OUTPUTS.SPECIES_IDENTIFIERS
+         )
+         with fs.openbin(species_identifiers_path, "w") as f:
+             species_identifiers.drop([SBML_DFS.S_SOURCE], axis=1).to_csv(
+                 f, sep="\t", index=False
+             )
+
+         # export jsons
+         species_path = model_prefix + CPR_STANDARD_OUTPUTS.SPECIES
+         reactions_path = model_prefix + CPR_STANDARD_OUTPUTS.REACTIONS
+         reaction_species_path = model_prefix + CPR_STANDARD_OUTPUTS.REACTION_SPECIES
+         compartments_path = model_prefix + CPR_STANDARD_OUTPUTS.COMPARTMENTS
+         compartmentalized_species_path = (
+             model_prefix + CPR_STANDARD_OUTPUTS.COMPARTMENTALIZED_SPECIES
+         )
+         with fs.openbin(species_path, "w") as f:
+             sbml_dfs.species[[SBML_DFS.S_NAME]].to_json(f)
+
+         with fs.openbin(reactions_path, "w") as f:
+             sbml_dfs.reactions[[SBML_DFS.R_NAME]].to_json(f)
+
+         with fs.openbin(reaction_species_path, "w") as f:
+             sbml_dfs.reaction_species.to_json(f)
+
+         with fs.openbin(compartments_path, "w") as f:
+             sbml_dfs.compartments[[SBML_DFS.C_NAME]].to_json(f)
+
+         with fs.openbin(compartmentalized_species_path, "w") as f:
+             sbml_dfs.compartmentalized_species.drop(SBML_DFS.SC_SOURCE, axis=1).to_json(
+                 f
+             )
+
+     return None
+
1640
+
1641
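
A hedged usage sketch of the exporter; the `model` object and the output directory are assumptions, and the exact file names depend on the CPR_STANDARD_OUTPUTS constants:

    # assumes `model` is an SBML_dfs built elsewhere (e.g., by sbml_dfs_from_edgelist below)
    export_sbml_dfs(
        model_prefix="toy_",           # hypothetical prefix for all output files
        sbml_dfs=model,
        outdir="/tmp/napistu_export",  # hypothetical path
        overwrite=True,
        dogmatic=False,                # treat gene/transcript/protein ids interchangeably
    )
    # writes one species-identifier TSV plus one JSON per table, all prefixed with "toy_"
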
+ def sbml_dfs_from_edgelist(
+     interaction_edgelist: pd.DataFrame,
+     species_df: pd.DataFrame,
+     compartments_df: pd.DataFrame,
+     interaction_source: source.Source,
+     upstream_stoichiometry: int = 0,
+     downstream_stoichiometry: int = 1,
+     downstream_sbo_name: str = SBOTERM_NAMES.PRODUCT,
+     keep_species_data: bool | str = False,
+     keep_reactions_data: bool | str = False,
+ ) -> SBML_dfs:
+     """
+     Create SBML_dfs from Edgelist
+
+     Combine a set of interactions into an sbml.SBML_dfs mechanistic model.
+
+     Parameters:
+         interaction_edgelist (pd.DataFrame): A table containing interactions:
+             - upstream_name (str): matching "s_name" from "species_df"
+             - downstream_name (str): matching "s_name" from "species_df"
+             - upstream_compartment (str): compartment of "upstream_name"
+               with names matching "c_name" from "compartments_df"
+             - downstream_compartment (str): compartment of "downstream_name"
+               with names matching "c_name" from "compartments_df"
+             - r_name (str): a name for the interaction
+             - sbo_term (str): sbo term defining the type of
+               molecular interaction (see MINI_SBO_FROM_NAME)
+             - r_Identifiers (identifiers.Identifiers): identifiers
+               supporting the interaction (e.g., pubmed ids)
+             - r_isreversible (bool): is this reaction reversible?
+               For example, TRRUST interactions are irreversible while
+               STRING interactions are reversible.
+         species_df (pd.DataFrame): A table defining the unique molecular
+             species participating in "interaction_edgelist":
+             - s_name (str): name of molecular species
+             - s_Identifiers (identifiers.Identifiers): identifiers
+               defining the species
+         compartments_df (pd.DataFrame): A table defining the compartments
+             where the interactions in "interaction_edgelist" occur:
+             - c_name (str): name of compartment
+             - c_Identifiers (identifiers.Identifiers):
+               identifiers defining the compartment (see
+               bigg.annotate_recon() for a mapping from names to GO categories)
+         interaction_source (source.Source): A source object
+             which will tie model entities to the interaction source
+         upstream_stoichiometry (int): stoichiometry of
+             upstream species in the reaction
+         downstream_stoichiometry (int): stoichiometry of
+             downstream species in the reaction
+         downstream_sbo_name (str): sbo term defining the
+             type of molecular interaction for the downstream reactant
+             (see MINI_SBO_FROM_NAME)
+         keep_species_data (bool | str): Should species data
+             be kept in the model? If True, all extra species columns are kept
+             and saved as "species_data" in the SBML_dfs under the label 'source'.
+             If False, no species data is kept.
+             If a string: the label under which to save the species data.
+         keep_reactions_data (bool | str): Should reaction data be kept in the
+             model? If True, all extra reaction columns are kept and saved as
+             "reactions_data" in the SBML_dfs under the label 'source'.
+             If False, no reaction data is kept.
+             If a string: the label under which to save the reaction data.
+
+     Returns:
+         sbml.SBML_dfs
+     """
+
+     # check input dfs for required variables
+     _sbml_dfs_from_edgelist_validate_inputs(
+         interaction_edgelist, species_df, compartments_df
+     )
+
+     # identify extra columns in the input data; if keep_reactions_data is not
+     # False, these will be added to the model as `reactions_data`
+     interaction_edgelist_required_vars = {
+         "upstream_name",
+         "downstream_name",
+         "upstream_compartment",
+         "downstream_compartment",
+         SBML_DFS.R_NAME,
+         SBML_DFS.SBO_TERM,
+         SBML_DFS.R_IDENTIFIERS,
+         SBML_DFS.R_ISREVERSIBLE,
+     }
+     if keep_reactions_data is not False:
+         extra_reactions_columns = [
+             c
+             for c in interaction_edgelist.columns
+             if c not in interaction_edgelist_required_vars
+         ]
+     else:
+         extra_reactions_columns = []
+     # extra species columns
+     if keep_species_data is not False:
+         extra_species_columns = [
+             c
+             for c in species_df.columns
+             if c not in {SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS}
+         ]
+     else:
+         extra_species_columns = []
+
+     # format compartments
+     compartments_df[SBML_DFS.C_SOURCE] = interaction_source
+     compartments_df[SBML_DFS.C_ID] = sbml_dfs_utils.id_formatter(
+         range(compartments_df.shape[0]), SBML_DFS.C_ID
+     )
+     compartments_df = compartments_df.set_index(SBML_DFS.C_ID)[
+         [SBML_DFS.C_NAME, SBML_DFS.C_IDENTIFIERS, SBML_DFS.C_SOURCE]
+     ]
+
+     # format species
+     species_df[SBML_DFS.S_SOURCE] = interaction_source
+     species_df[SBML_DFS.S_ID] = sbml_dfs_utils.id_formatter(
+         range(species_df.shape[0]), SBML_DFS.S_ID
+     )
+
+     required_cols = [SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS, SBML_DFS.S_SOURCE]
+     species_df = species_df.set_index(SBML_DFS.S_ID)[
+         required_cols + extra_species_columns
+     ]
+     # keep extra columns to save them as extra data
+     species_data = species_df[extra_species_columns]
+     # remove extra columns from the core species table
+     species_df = species_df[required_cols]
+
+     # create compartmentalized species
+
+     # define all distinct upstream and downstream compartmentalized species
+     comp_species = pd.concat(
+         [
+             interaction_edgelist[["upstream_name", "upstream_compartment"]].rename(
+                 {
+                     "upstream_name": SBML_DFS.S_NAME,
+                     "upstream_compartment": SBML_DFS.C_NAME,
+                 },
+                 axis=1,
+             ),
+             interaction_edgelist[["downstream_name", "downstream_compartment"]].rename(
+                 {
+                     "downstream_name": SBML_DFS.S_NAME,
+                     "downstream_compartment": SBML_DFS.C_NAME,
+                 },
+                 axis=1,
+             ),
+         ]
+     ).drop_duplicates()
+
+     # merge to add species and compartments primary keys
+     comp_species_w_ids = comp_species.merge(
+         species_df[SBML_DFS.S_NAME].reset_index(),
+         how="left",
+         left_on=SBML_DFS.S_NAME,
+         right_on=SBML_DFS.S_NAME,
+     ).merge(
+         compartments_df[SBML_DFS.C_NAME].reset_index(),
+         how="left",
+         left_on=SBML_DFS.C_NAME,
+         right_on=SBML_DFS.C_NAME,
+     )
+
+     # check whether all species and compartments exist
+     _sbml_dfs_from_edgelist_check_cspecies_merge(comp_species_w_ids, comp_species)
+
+     # name compounds
+     comp_species_w_ids[SBML_DFS.SC_NAME] = [
+         f"{s} [{c}]"
+         for s, c in zip(
+             comp_species_w_ids[SBML_DFS.S_NAME], comp_species_w_ids[SBML_DFS.C_NAME]
+         )
+     ]
+     # add source object
+     comp_species_w_ids[SBML_DFS.SC_SOURCE] = interaction_source
+     # mint the sc_id primary key and set it as the index
+     comp_species_w_ids[SBML_DFS.SC_ID] = sbml_dfs_utils.id_formatter(
+         range(comp_species_w_ids.shape[0]), SBML_DFS.SC_ID
+     )
+     comp_species_w_ids = comp_species_w_ids.set_index(SBML_DFS.SC_ID)[
+         [SBML_DFS.SC_NAME, SBML_DFS.S_ID, SBML_DFS.C_ID, SBML_DFS.SC_SOURCE]
+     ]
+
+     # create reactions
+
+     # build a from-cspecies -> to-cspecies edgelist from interaction_edgelist
+     comp_species_w_names = (
+         comp_species_w_ids.reset_index()
+         .merge(species_df[SBML_DFS.S_NAME].reset_index())
+         .merge(compartments_df[SBML_DFS.C_NAME].reset_index())
+     )
+
+     interaction_edgelist_w_cspecies = interaction_edgelist.merge(
+         comp_species_w_names[[SBML_DFS.SC_ID, SBML_DFS.S_NAME, SBML_DFS.C_NAME]].rename(
+             {
+                 SBML_DFS.SC_ID: "sc_id_up",
+                 SBML_DFS.S_NAME: "upstream_name",
+                 SBML_DFS.C_NAME: "upstream_compartment",
+             },
+             axis=1,
+         ),
+         how="left",
+     ).merge(
+         comp_species_w_names[[SBML_DFS.SC_ID, SBML_DFS.S_NAME, SBML_DFS.C_NAME]].rename(
+             {
+                 SBML_DFS.SC_ID: "sc_id_down",
+                 SBML_DFS.S_NAME: "downstream_name",
+                 SBML_DFS.C_NAME: "downstream_compartment",
+             },
+             axis=1,
+         ),
+         how="left",
+     )[
+         REQUIRED_REACTION_FROMEDGELIST_COLUMNS + extra_reactions_columns
+     ]
+
+     # guard against a 1-many join which should have been 1-1
+     if interaction_edgelist.shape[0] != interaction_edgelist_w_cspecies.shape[0]:
+         raise ValueError(
+             "Merging compartmentalized species to interaction_edgelist"
+             " resulted in an increase in the table from "
+             f"{interaction_edgelist.shape[0]} to "
+             f"{interaction_edgelist_w_cspecies.shape[0]} rows, indicating"
+             " a 1-many join which should have been 1-1"
+         )
+
+     # create one reaction per interaction
+     interaction_edgelist_w_cspecies[SBML_DFS.R_SOURCE] = interaction_source
+     interaction_edgelist_w_cspecies[SBML_DFS.R_ID] = sbml_dfs_utils.id_formatter(
+         range(interaction_edgelist_w_cspecies.shape[0]), SBML_DFS.R_ID
+     )
+
+     reactions_df_columns = [
+         SBML_DFS.R_NAME,
+         SBML_DFS.R_IDENTIFIERS,
+         SBML_DFS.R_SOURCE,
+         SBML_DFS.R_ISREVERSIBLE,
+     ]
+     reactions_df = interaction_edgelist_w_cspecies.copy().set_index(SBML_DFS.R_ID)[
+         reactions_df_columns + extra_reactions_columns
+     ]
+     # keep extra columns to save them as extra data
+     reactions_data = reactions_df[extra_reactions_columns]
+     reactions_df = reactions_df[reactions_df_columns]
+
+     # define upstream and downstream comp species as reaction species
+     reaction_species_df = pd.concat(
+         [
+             # upstream interactions are defined by sbo_term and should generally
+             # be modifiers/stimulators/inhibitors/interactors
+             interaction_edgelist_w_cspecies[["sc_id_up", "sbo_term", "r_id"]]
+             .assign(stoichiometry=upstream_stoichiometry)
+             .rename({"sc_id_up": "sc_id"}, axis=1),
+             # downstream interactions indicate some modification of the state
+             # of the species and hence are defined as products
+             interaction_edgelist_w_cspecies[["sc_id_down", "r_id"]]
+             .assign(
+                 stoichiometry=downstream_stoichiometry,
+                 sbo_term=MINI_SBO_FROM_NAME[downstream_sbo_name],
+             )
+             .rename({"sc_id_down": "sc_id"}, axis=1),
+         ]
+     )
+     reaction_species_df["rsc_id"] = sbml_dfs_utils.id_formatter(
+         range(reaction_species_df.shape[0]), "rsc_id"
+     )
+     reaction_species_df = reaction_species_df.set_index("rsc_id")
+
+     # form the sbml_dfs object
+     sbml_tbl_dict: MutableMapping[str, pd.DataFrame | dict[str, pd.DataFrame]] = {
+         "compartments": compartments_df,
+         "species": species_df,
+         "compartmentalized_species": comp_species_w_ids,
+         "reactions": reactions_df,
+         "reaction_species": reaction_species_df,
+     }
+     if len(extra_reactions_columns) > 0:
+         if isinstance(keep_reactions_data, str):
+             reactions_data_label = keep_reactions_data
+         else:
+             reactions_data_label = "source"
+         sbml_tbl_dict["reactions_data"] = {reactions_data_label: reactions_data}
+
+     if len(extra_species_columns) > 0:
+         if isinstance(keep_species_data, str):
+             species_data_label = keep_species_data
+         else:
+             species_data_label = "source"
+         sbml_tbl_dict["species_data"] = {species_data_label: species_data}
+
+     sbml_model = SBML_dfs(sbml_tbl_dict)
+     sbml_model.validate()
+
+     return sbml_model
+
+
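
For orientation, a hedged end-to-end sketch of the expected inputs. Column names come from the docstring above; the empty Identifiers and Source constructors are assumptions about those classes' APIs, as is importing the constants from napistu.constants:

    import pandas as pd
    from napistu import identifiers, source
    from napistu.constants import MINI_SBO_FROM_NAME, SBOTERM_NAMES

    # hypothetical one-edge network: TF_A stimulates GENE_B in the nucleus
    interaction_edgelist = pd.DataFrame(
        {
            "upstream_name": ["TF_A"],
            "downstream_name": ["GENE_B"],
            "upstream_compartment": ["nucleus"],
            "downstream_compartment": ["nucleus"],
            "r_name": ["TF_A -> GENE_B"],
            "sbo_term": [MINI_SBO_FROM_NAME[SBOTERM_NAMES.STIMULATOR]],
            "r_Identifiers": [identifiers.Identifiers([])],  # assumed empty constructor
            "r_isreversible": [False],
        }
    )
    species_df = pd.DataFrame(
        {
            "s_name": ["TF_A", "GENE_B"],
            "s_Identifiers": [identifiers.Identifiers([]), identifiers.Identifiers([])],
        }
    )
    compartments_df = pd.DataFrame(
        {"c_name": ["nucleus"], "c_Identifiers": [identifiers.Identifiers([])]}
    )

    model = sbml_dfs_from_edgelist(
        interaction_edgelist,
        species_df,
        compartments_df,
        interaction_source=source.Source(init=True),  # assumed empty-source constructor
    )
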
+ def find_underspecified_reactions(
+     sbml_dfs: SBML_dfs, sc_ids: Iterable[str]
+ ) -> set[str]:
+     """
+     Find Underspecified Reactions
+
+     Identify reactions which should be removed if a set of molecular species
+     are removed from the system.
+
+     Params:
+         sbml_dfs (SBML_dfs):
+             A pathway representation
+         sc_ids (list[str]):
+             A list of compartmentalized species ids (sc_ids) which will be removed.
+
+     Returns:
+         underspecified_reactions (set[str]):
+             The set of reactions which should be removed because they will not
+             occur once "sc_ids" are removed.
+     """
+
+     updated_reaction_species = sbml_dfs.reaction_species.copy()
+     updated_reaction_species["new"] = ~updated_reaction_species[SBML_DFS.SC_ID].isin(
+         sc_ids
+     )
+
+     # map each sbo_term onto its name and then onto its role (e.g., DEFINING or REQUIRED)
+     updated_reaction_species = (
+         updated_reaction_species.assign(
+             sbo_role=updated_reaction_species[SBML_DFS.SBO_TERM]
+         )
+         .replace({"sbo_role": MINI_SBO_TO_NAME})
+         .replace({"sbo_role": SBO_NAME_TO_ROLE})
+     )
+
+     reactions_with_lost_defining_members = set(
+         updated_reaction_species.query("~new")
+         .query("sbo_role == 'DEFINING'")[SBML_DFS.R_ID]
+         .tolist()
+     )
+
+     N_reactions_with_lost_defining_members = len(reactions_with_lost_defining_members)
+     if N_reactions_with_lost_defining_members > 0:
+         logger.info(
+             f"Removing {N_reactions_with_lost_defining_members} reactions"
+             " which have lost at least one defining species"
+         )
+
+     # for each reaction, what are the required sbo_terms?
+     reactions_with_requirements = (
+         updated_reaction_species.query("sbo_role == 'REQUIRED'")[
+             ["r_id", "sbo_term", "new"]
+         ]
+         .drop_duplicates()
+         .reset_index(drop=True)
+     )
+
+     # which reactions have lost all members of a required sbo_term?
+     reactions_with_lost_requirements = set(
+         reactions_with_requirements.query("~new")
+         .merge(
+             reactions_with_requirements.query("new").rename(
+                 {"new": "still_present"}, axis=1
+             ),
+             how="left",
+         )
+         .fillna(False)
+         # keep only requirement classes with no surviving members
+         .query("~still_present")[SBML_DFS.R_ID]
+         .tolist()
+     )
+
+     N_reactions_with_lost_requirements = len(reactions_with_lost_requirements)
+     if N_reactions_with_lost_requirements > 0:
+         logger.info(
+             f"Removing {N_reactions_with_lost_requirements} reactions"
+             " which have lost all required members"
+         )
+
+     underspecified_reactions = reactions_with_lost_defining_members.union(
+         reactions_with_lost_requirements
+     )
+
+     return underspecified_reactions
+
+
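
The DEFINING branch of the role logic is easy to check on a toy table. A minimal pandas sketch (the role strings mirror SBO_NAME_TO_ROLE values; the data are hypothetical):

    import pandas as pd

    # stand-in for reaction_species after the two .replace() calls above
    rs = pd.DataFrame(
        {
            "r_id": ["R1", "R1", "R2", "R2"],
            "sbo_role": ["DEFINING", "REQUIRED", "DEFINING", "DEFINING"],
            "new": [True, False, True, False],  # False -> member will be removed
        }
    )

    lost_defining = set(rs.query("~new").query("sbo_role == 'DEFINING'")["r_id"])
    print(lost_defining)  # {'R2'}: R2 loses a defining member, R1 only a required one
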
+ def _sbml_dfs_from_edgelist_validate_inputs(
+     interaction_edgelist: pd.DataFrame,
+     species_df: pd.DataFrame,
+     compartments_df: pd.DataFrame,
+ ) -> None:
+     """Check that the inputs for creating an SBML_dfs from an edgelist are appropriate."""
+
+     # check compartments
+     compartments_df_expected_vars = {SBML_DFS.C_NAME, SBML_DFS.C_IDENTIFIERS}
+     compartments_df_columns = set(compartments_df.columns.tolist())
+     missing_required_fields = compartments_df_expected_vars.difference(
+         compartments_df_columns
+     )
+     if len(missing_required_fields) > 0:
+         raise ValueError(
+             f"{', '.join(missing_required_fields)} are required variables"
+             ' in "compartments_df" but were not present in the input file.'
+         )
+
+     # check species
+     species_df_expected_vars = {SBML_DFS.S_NAME, SBML_DFS.S_IDENTIFIERS}
+     species_df_columns = set(species_df.columns.tolist())
+     missing_required_fields = species_df_expected_vars.difference(species_df_columns)
+     if len(missing_required_fields) > 0:
+         raise ValueError(
+             f"{', '.join(missing_required_fields)} are required"
+             ' variables in "species_df" but were not present '
+             "in the input file."
+         )
+
+     # check interactions
+     interaction_edgelist_columns = set(interaction_edgelist.columns.tolist())
+     missing_required_fields = INTERACTION_EDGELIST_EXPECTED_VARS.difference(
+         interaction_edgelist_columns
+     )
+     if len(missing_required_fields) > 0:
+         raise ValueError(
+             f"{', '.join(missing_required_fields)} are required "
+             'variables in "interaction_edgelist" but were not '
+             "present in the input file."
+         )
+
+     return None
+
+
+ def _sbml_dfs_from_edgelist_check_cspecies_merge(
+     merged_species: pd.DataFrame, original_species: pd.DataFrame
+ ) -> None:
+     """Check for a mismatch between the provided species data and the species implied by the edgelist."""
+
+     # check for a 1-many merge
+     if merged_species.shape[0] != original_species.shape[0]:
+         raise ValueError(
+             "Merging compartmentalized species to species_df"
+             " and compartments_df by names resulted in an "
+             f"increase in the table from {original_species.shape[0]}"
+             f" to {merged_species.shape[0]} rows, indicating that names were"
+             " not unique"
+         )
+
+     # check for missing species and compartments
+     missing_compartments = merged_species[merged_species[SBML_DFS.C_ID].isna()][
+         SBML_DFS.C_NAME
+     ].unique()
+     if len(missing_compartments) >= 1:
+         raise ValueError(
+             f"{len(missing_compartments)} compartments were present in"
+             ' "interaction_edgelist" but not "compartments_df":'
+             f" {', '.join(missing_compartments)}"
+         )
+
+     missing_species = merged_species[merged_species[SBML_DFS.S_ID].isna()][
+         SBML_DFS.S_NAME
+     ].unique()
+     if len(missing_species) >= 1:
+         raise ValueError(
+             f"{len(missing_species)} species were present in "
+             '"interaction_edgelist" but not "species_df":'
+             f" {', '.join(missing_species)}"
+         )
+
+     return None
+
+
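
The row-count guard above exists because a left merge silently inflates the table when names are duplicated. A quick illustration:

    import pandas as pd

    edges = pd.DataFrame({"s_name": ["ATP"]})
    species = pd.DataFrame({"s_name": ["ATP", "ATP"], "s_id": ["S1", "S2"]})  # duplicated name

    merged = edges.merge(species, how="left", on="s_name")
    print(len(edges), len(merged))  # 1 2 -> the check above raises in this situation
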
+ def _stub_compartments(
+     stubbed_compartment: str = "CELLULAR_COMPONENT",
+ ) -> pd.DataFrame:
+     """Stub Compartments
+
+     Create a compartments table with only a single compartment.
+
+     Args:
+         stubbed_compartment (str): the name of a compartment which should match the
+             keys in constants.COMPARTMENTS and constants.COMPARTMENTS_GO_TERMS
+
+     Returns:
+         compartments_df (pd.DataFrame): compartments dataframe
+     """
+
+     if stubbed_compartment not in COMPARTMENT_ALIASES.keys():
+         raise ValueError(
+             f"{stubbed_compartment} is not defined in constants.COMPARTMENT_ALIASES"
+         )
+
+     if stubbed_compartment not in COMPARTMENTS_GO_TERMS.keys():
+         raise ValueError(
+             f"{stubbed_compartment} is not defined in constants.COMPARTMENTS_GO_TERMS"
+         )
+
+     stubbed_compartment_name = COMPARTMENTS[stubbed_compartment]
+     stubbed_compartment_id = COMPARTMENTS_GO_TERMS[stubbed_compartment]
+
+     formatted_uri = identifiers.format_uri(
+         uri=identifiers.create_uri_url(
+             ontology=ONTOLOGIES.GO,
+             identifier=stubbed_compartment_id,
+         ),
+         biological_qualifier_type=BQB.IS,
+     )
+
+     compartments_df = pd.DataFrame(
+         {
+             SBML_DFS.C_NAME: [stubbed_compartment_name],
+             SBML_DFS.C_IDENTIFIERS: [identifiers.Identifiers([formatted_uri])],
+         }
+     )
+     compartments_df.index = sbml_dfs_utils.id_formatter([0], SBML_DFS.C_ID)  # type: ignore
+     compartments_df.index.name = SBML_DFS.C_ID
+
+     return compartments_df
+
+
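
A hedged usage sketch: with the default argument this should yield a one-row table for the generic GO cellular_component compartment, with the c_name and GO id taken from the package constants:

    compartments_df = _stub_compartments()   # default: "CELLULAR_COMPONENT"
    print(compartments_df.index.name)        # the c_id primary key
    print(compartments_df.columns.tolist())  # the c_name and c_Identifiers columns
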
+ def _validate_matching_data(data_table: pd.DataFrame, ref_table: pd.DataFrame):
+     """Validates a table against a reference
+
+     This checks that the table has the same index name as the reference,
+     no duplicates in its index, and that all values in its index are present
+     in the reference table's index.
+
+     Args:
+         data_table (pd.DataFrame): a table with data that should
+             match the reference
+         ref_table (pd.DataFrame): a reference table
+
+     Raises:
+         TypeError: data_table is not a pd.DataFrame
+         ValueError: index names do not match
+         ValueError: index contains duplicates
+         ValueError: index is not a subset of the reference table's index
+     """
+     if not isinstance(data_table, pd.DataFrame):
+         raise TypeError(
+             f"The data table was type {type(data_table).__name__}"
+             " but must be a pd.DataFrame"
+         )
+
+     ref_index_name = ref_table.index.name
+     if data_table.index.name != ref_index_name:
+         raise ValueError(
+             "the index name for the data table was not"
+             f" {ref_index_name}: {data_table.index.name}"
+         )
+     ids = data_table.index
+     if any(ids.duplicated()):
+         raise ValueError("the index for the data table contained duplicate values")
+     if not all(ids.isin(ref_table.index)):
+         raise ValueError(
+             "the index for the data table contained values"
+             " not found in the reference table"
+         )
+
+
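
A toy illustration of the checks, using the function defined above with plain pandas frames:

    import pandas as pd

    ref = pd.DataFrame(
        {"r_name": ["rxn1", "rxn2"]}, index=pd.Index(["R1", "R2"], name="r_id")
    )
    data = pd.DataFrame({"score": [0.9]}, index=pd.Index(["R1"], name="r_id"))

    _validate_matching_data(data, ref)  # passes: same index name, unique, a subset

    bad = pd.DataFrame({"score": [0.1]}, index=pd.Index(["R9"], name="r_id"))
    # _validate_matching_data(bad, ref)  # raises ValueError: values not in the reference
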
+ def species_type_types(x):
+     """Assign a high-level molecule type to a molecular species."""
+
+     if isinstance(x, identifiers.Identifiers):
+         if x.filter(["chebi"]):
+             return "metabolite"
+         elif x.filter(["molodex"]):
+             return "drug"
+         else:
+             return "protein"
+     else:
+         return "unknown"
+
+
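
The dispatch above only relies on Identifiers.filter being truthy when any named ontology is present. A self-contained sketch with a hypothetical stand-in class that mirrors the branch structure:

    class FakeIdentifiers:
        # hypothetical stand-in for identifiers.Identifiers
        def __init__(self, ontologies):
            self.ontologies = set(ontologies)

        def filter(self, wanted):
            # truthy when any requested ontology is present
            return bool(self.ontologies & set(wanted))

    def classify(x):
        # mirrors the species_type_types branch order for the stand-in
        if x.filter(["chebi"]):
            return "metabolite"
        elif x.filter(["molodex"]):
            return "drug"
        else:
            return "protein"

    print(classify(FakeIdentifiers(["chebi"])))    # metabolite
    print(classify(FakeIdentifiers(["uniprot"])))  # protein
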
+ def stub_ids(ids):
+     """Return a one-row placeholder identifiers table when no ids are available."""
+     if len(ids) == 0:
+         return pd.DataFrame(
+             {
+                 IDENTIFIERS.ONTOLOGY: [None],
+                 IDENTIFIERS.IDENTIFIER: [None],
+                 IDENTIFIERS.URL: [None],
+                 IDENTIFIERS.BQB: [None],
+             }
+         )
+     else:
+         return pd.DataFrame(ids)
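
For illustration, the two branches behave as follows; the lowercase column names are an assumption about the IDENTIFIERS constant values:

    print(stub_ids([]))
    # a single all-None row with ontology / identifier / url / bqb columns

    print(stub_ids([{"ontology": "chebi", "identifier": "15422", "url": None, "bqb": "BQB_IS"}]))
    # the provided records passed through as a DataFrame
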