napistu 0.2.5.dev6__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. napistu/__main__.py +126 -96
  2. napistu/constants.py +35 -41
  3. napistu/context/__init__.py +10 -0
  4. napistu/context/discretize.py +462 -0
  5. napistu/context/filtering.py +387 -0
  6. napistu/gcs/__init__.py +1 -1
  7. napistu/identifiers.py +74 -15
  8. napistu/indices.py +68 -0
  9. napistu/ingestion/__init__.py +1 -1
  10. napistu/ingestion/bigg.py +47 -62
  11. napistu/ingestion/constants.py +18 -133
  12. napistu/ingestion/gtex.py +113 -0
  13. napistu/ingestion/hpa.py +147 -0
  14. napistu/ingestion/sbml.py +0 -97
  15. napistu/ingestion/string.py +2 -2
  16. napistu/matching/__init__.py +10 -0
  17. napistu/matching/constants.py +18 -0
  18. napistu/matching/interactions.py +518 -0
  19. napistu/matching/mount.py +529 -0
  20. napistu/matching/species.py +510 -0
  21. napistu/mcp/__init__.py +7 -4
  22. napistu/mcp/__main__.py +128 -72
  23. napistu/mcp/client.py +16 -25
  24. napistu/mcp/codebase.py +201 -153
  25. napistu/mcp/component_base.py +170 -0
  26. napistu/mcp/config.py +223 -0
  27. napistu/mcp/constants.py +45 -2
  28. napistu/mcp/documentation.py +253 -136
  29. napistu/mcp/documentation_utils.py +13 -48
  30. napistu/mcp/execution.py +372 -305
  31. napistu/mcp/health.py +49 -67
  32. napistu/mcp/profiles.py +10 -6
  33. napistu/mcp/server.py +161 -80
  34. napistu/mcp/tutorials.py +139 -87
  35. napistu/modify/__init__.py +1 -1
  36. napistu/modify/gaps.py +1 -1
  37. napistu/network/__init__.py +1 -1
  38. napistu/network/constants.py +101 -34
  39. napistu/network/data_handling.py +388 -0
  40. napistu/network/ig_utils.py +351 -0
  41. napistu/network/napistu_graph_core.py +354 -0
  42. napistu/network/neighborhoods.py +40 -40
  43. napistu/network/net_create.py +373 -309
  44. napistu/network/net_propagation.py +47 -19
  45. napistu/network/{net_utils.py → ng_utils.py} +124 -272
  46. napistu/network/paths.py +67 -51
  47. napistu/network/precompute.py +11 -11
  48. napistu/ontologies/__init__.py +10 -0
  49. napistu/ontologies/constants.py +129 -0
  50. napistu/ontologies/dogma.py +243 -0
  51. napistu/ontologies/genodexito.py +649 -0
  52. napistu/ontologies/mygene.py +369 -0
  53. napistu/ontologies/renaming.py +198 -0
  54. napistu/rpy2/__init__.py +229 -86
  55. napistu/rpy2/callr.py +47 -77
  56. napistu/rpy2/constants.py +24 -23
  57. napistu/rpy2/rids.py +61 -648
  58. napistu/sbml_dfs_core.py +587 -222
  59. napistu/scverse/__init__.py +15 -0
  60. napistu/scverse/constants.py +28 -0
  61. napistu/scverse/loading.py +727 -0
  62. napistu/utils.py +118 -10
  63. {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/METADATA +8 -3
  64. napistu-0.3.1.dist-info/RECORD +133 -0
  65. tests/conftest.py +22 -0
  66. tests/test_context_discretize.py +56 -0
  67. tests/test_context_filtering.py +267 -0
  68. tests/test_identifiers.py +100 -0
  69. tests/test_indices.py +65 -0
  70. tests/{test_edgelist.py → test_ingestion_napistu_edgelist.py} +2 -2
  71. tests/test_matching_interactions.py +108 -0
  72. tests/test_matching_mount.py +305 -0
  73. tests/test_matching_species.py +394 -0
  74. tests/test_mcp_config.py +193 -0
  75. tests/test_mcp_documentation_utils.py +12 -3
  76. tests/test_mcp_server.py +356 -0
  77. tests/test_network_data_handling.py +397 -0
  78. tests/test_network_ig_utils.py +23 -0
  79. tests/test_network_neighborhoods.py +19 -0
  80. tests/test_network_net_create.py +459 -0
  81. tests/test_network_ng_utils.py +30 -0
  82. tests/test_network_paths.py +56 -0
  83. tests/{test_precomputed_distances.py → test_network_precompute.py} +8 -6
  84. tests/test_ontologies_genodexito.py +58 -0
  85. tests/test_ontologies_mygene.py +39 -0
  86. tests/test_ontologies_renaming.py +110 -0
  87. tests/test_rpy2_callr.py +79 -0
  88. tests/test_rpy2_init.py +151 -0
  89. tests/test_sbml.py +0 -31
  90. tests/test_sbml_dfs_core.py +134 -10
  91. tests/test_scverse_loading.py +778 -0
  92. tests/test_set_coverage.py +2 -2
  93. tests/test_utils.py +121 -1
  94. napistu/mechanism_matching.py +0 -1353
  95. napistu/rpy2/netcontextr.py +0 -467
  96. napistu-0.2.5.dev6.dist-info/RECORD +0 -97
  97. tests/test_igraph.py +0 -367
  98. tests/test_mechanism_matching.py +0 -784
  99. tests/test_net_utils.py +0 -149
  100. tests/test_netcontextr.py +0 -105
  101. tests/test_rpy2.py +0 -61
  102. /napistu/ingestion/{cpr_edgelist.py → napistu_edgelist.py} +0 -0
  103. {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/WHEEL +0 -0
  104. {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/entry_points.txt +0 -0
  105. {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/licenses/LICENSE +0 -0
  106. {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/top_level.txt +0 -0
  107. /tests/{test_obo.py → test_ingestion_obo.py} +0 -0
napistu/sbml_dfs_core.py CHANGED
@@ -31,6 +31,9 @@ from napistu.constants import MINI_SBO_TO_NAME
31
31
  from napistu.constants import ONTOLOGIES
32
32
  from napistu.constants import SBO_NAME_TO_ROLE
33
33
  from napistu.constants import SBOTERM_NAMES
34
+ from napistu.constants import SBO_ROLES_DEFS
35
+ from napistu.constants import ENTITIES_W_DATA
36
+ from napistu.constants import ENTITIES_TO_ENTITY_DATA
34
37
  from napistu.constants import CHARACTERISTIC_COMPLEX_ONTOLOGIES
35
38
  from napistu.ingestion import sbml
36
39
  from fs import open_fs
@@ -42,43 +45,47 @@ class SBML_dfs:
42
45
  """
43
46
  System Biology Markup Language Model Data Frames.
44
47
 
48
+ A class representing a SBML model as a collection of pandas DataFrames.
49
+ This class provides methods for manipulating and analyzing biological pathway models
50
+ with support for species, reactions, compartments, and their relationships.
51
+
45
52
  Attributes
46
53
  ----------
47
- compartments: pd.DataFrame
48
- sub-cellular compartments in the model
49
- species: pd.DataFrame
50
- molecular species in the model
51
- species_data: Dict[str, pd.DataFrame]: Additional data for species.
52
- DataFrames with additional data and index = species_id
53
- reactions: pd.DataFrame
54
- reactions in the model
55
- reactions_data: Dict[str, pd.DataFrame]: Additional data for reactions.
56
- DataFrames with additional data and index = reaction_id
57
- reaction_species: pd.DataFrame
58
- One entry per species participating in a reaction
59
- schema: dict
60
- dictionary reprenting the structure of the other attributes and meaning of their variables
54
+ compartments : pd.DataFrame
55
+ Sub-cellular compartments in the model, indexed by compartment ID (c_id)
56
+ species : pd.DataFrame
57
+ Molecular species in the model, indexed by species ID (s_id)
58
+ species_data : Dict[str, pd.DataFrame]
59
+ Additional data for species. Each DataFrame is indexed by species_id (s_id)
60
+ reactions : pd.DataFrame
61
+ Reactions in the model, indexed by reaction ID (r_id)
62
+ reactions_data : Dict[str, pd.DataFrame]
63
+ Additional data for reactions. Each DataFrame is indexed by reaction_id (r_id)
64
+ reaction_species : pd.DataFrame
65
+ One entry per species participating in a reaction, indexed by reaction-species ID (rsc_id)
66
+ schema : dict
67
+ Dictionary representing the structure of the other attributes and meaning of their variables
61
68
 
62
69
  Methods
63
70
  -------
64
71
  get_table(entity_type, required_attributes)
65
- Get a table from the SBML_dfs object and optionally validate that it contains a set of required attributes
72
+ Get a table from the SBML_dfs object with optional attribute validation
66
73
  search_by_ids(ids, entity_type, identifiers_df, ontologies)
67
- Pull out identifiers and entities matching a set of query ids which optionally match a set of ontologies
74
+ Find entities and identifiers matching a set of query IDs
68
75
  search_by_name(name, entity_type, partial_match)
69
- Pull out a set of entities by name or partial string match [default]
76
+ Find entities by exact or partial name match
70
77
  get_cspecies_features()
71
- Returns additional attributes of compartmentalized species
78
+ Get additional attributes of compartmentalized species
72
79
  get_species_features()
73
- Returns additional attributes of species
80
+ Get additional attributes of species
74
81
  get_identifiers(id_type)
75
- Returns a DataFrame containing identifiers from the id_type table
76
- get_uri_urls(entity_type, entity_ids = None)
77
- Returns a Series containing reference urls for each entity
82
+ Get identifiers from a specified entity type
83
+ get_uri_urls(entity_type, entity_ids)
84
+ Get reference URLs for specified entities
78
85
  validate()
79
- Validate that the sbml_dfs follows the schema and identify clear pathologies
80
- validate_and_rec()
81
- Validate the sbml_dfs and attempt to automatically resolve common issues
86
+ Validate the SBML_dfs structure and relationships
87
+ validate_and_resolve()
88
+ Validate and attempt to automatically fix common issues
82
89
  """
83
90
 
84
91
  compartments: pd.DataFrame
@@ -100,18 +107,22 @@ class SBML_dfs:
100
107
  resolve: bool = True,
101
108
  ) -> None:
102
109
  """
103
- Creates a pathway
110
+ Initialize a SBML_dfs object from a SBML model or dictionary of tables.
104
111
 
105
112
  Parameters
106
113
  ----------
107
- sbml_model : cpr.SBML or a dict containing tables following the sbml_dfs schema
108
- A SBML model produced by cpr.SBML().
109
- validate (bool): if True then call self.validate() to identify formatting issues
110
- resolve (bool): if True then try to automatically resolve common problems
111
-
112
- Returns
113
- -------
114
- None.
114
+ sbml_model : Union[sbml.SBML, MutableMapping[str, Union[pd.DataFrame, Dict[str, pd.DataFrame]]]]
115
+ Either a SBML model produced by sbml.SBML() or a dictionary containing tables
116
+ following the sbml_dfs schema
117
+ validate : bool, optional
118
+ Whether to validate the model structure and relationships, by default True
119
+ resolve : bool, optional
120
+ Whether to attempt automatic resolution of common issues, by default True
121
+
122
+ Raises
123
+ ------
124
+ ValueError
125
+ If the model structure is invalid and cannot be resolved
115
126
  """
116
127
 
117
128
  self.schema = SBML_DFS_SCHEMA.SCHEMA
@@ -156,9 +167,27 @@ class SBML_dfs:
156
167
  self, entity_type: str, required_attributes: None | set[str] = None
157
168
  ) -> pd.DataFrame:
158
169
  """
159
- Get Table
170
+ Get a table from the SBML_dfs object with optional attribute validation.
160
171
 
161
- Get a table from the SBML_dfs object and optionally validate that it contains a set of required attributes.
172
+ Parameters
173
+ ----------
174
+ entity_type : str
175
+ The type of entity table to retrieve (e.g., 'species', 'reactions')
176
+ required_attributes : Optional[Set[str]], optional
177
+ Set of attributes that must be present in the table, by default None.
178
+ Must be passed as a set, e.g. {'id'}, not a string.
179
+
180
+ Returns
181
+ -------
182
+ pd.DataFrame
183
+ The requested table
184
+
185
+ Raises
186
+ ------
187
+ ValueError
188
+ If entity_type is invalid or required attributes are missing
189
+ TypeError
190
+ If required_attributes is not a set
162
191
  """
163
192
 
164
193
  schema = self.schema
@@ -172,7 +201,8 @@ class SBML_dfs:
172
201
  if required_attributes is not None:
173
202
  if not isinstance(required_attributes, set):
174
203
  raise TypeError(
175
- f"required_attributes must be a set, but got {type(required_attributes).__name__}"
204
+ f"required_attributes must be a set (e.g. {{'id'}}), but got {type(required_attributes).__name__}. "
205
+ "Did you pass a string instead of a set?"
176
206
  )
177
207
 
178
208
  # determine whether required_attributes are appropriate
@@ -206,6 +236,33 @@ class SBML_dfs:
206
236
  identifiers_df: pd.DataFrame,
207
237
  ontologies: None | set[str] = None,
208
238
  ) -> tuple[pd.DataFrame, pd.DataFrame]:
239
+ """
240
+ Find entities and identifiers matching a set of query IDs.
241
+
242
+ Parameters
243
+ ----------
244
+ ids : List[str]
245
+ List of identifiers to search for
246
+ entity_type : str
247
+ Type of entity to search (e.g., 'species', 'reactions')
248
+ identifiers_df : pd.DataFrame
249
+ DataFrame containing identifier mappings
250
+ ontologies : Optional[Set[str]], optional
251
+ Set of ontologies to filter by, by default None
252
+
253
+ Returns
254
+ -------
255
+ Tuple[pd.DataFrame, pd.DataFrame]
256
+ - Matching entities
257
+ - Matching identifiers
258
+
259
+ Raises
260
+ ------
261
+ ValueError
262
+ If entity_type is invalid or ontologies are invalid
263
+ TypeError
264
+ If ontologies is not a set
265
+ """
209
266
  # validate inputs
210
267
  entity_table = self.get_table(entity_type, required_attributes={"id"})
211
268
  entity_pk = self.schema[entity_type]["pk"]
@@ -249,6 +306,23 @@ class SBML_dfs:
249
306
  def search_by_name(
250
307
  self, name: str, entity_type: str, partial_match: bool = True
251
308
  ) -> pd.DataFrame:
309
+ """
310
+ Find entities by exact or partial name match.
311
+
312
+ Parameters
313
+ ----------
314
+ name : str
315
+ Name to search for
316
+ entity_type : str
317
+ Type of entity to search (e.g., 'species', 'reactions')
318
+ partial_match : bool, optional
319
+ Whether to allow partial string matches, by default True
320
+
321
+ Returns
322
+ -------
323
+ pd.DataFrame
324
+ Matching entities
325
+ """
252
326
  entity_table = self.get_table(entity_type, required_attributes={"label"})
253
327
  label_attr = self.schema[entity_type]["label"]
254
328
 
@@ -261,6 +335,15 @@ class SBML_dfs:
261
335
  return matches
262
336
 
263
337
  def get_species_features(self) -> pd.DataFrame:
338
+ """
339
+ Get additional attributes of species.
340
+
341
+ Returns
342
+ -------
343
+ pd.DataFrame
344
+ Species with additional features including:
345
+ - species_type: Classification of the species (e.g., metabolite, protein)
346
+ """
264
347
  species = self.species
265
348
  augmented_species = species.assign(
266
349
  **{"species_type": lambda d: d["s_Identifiers"].apply(species_type_types)}
@@ -269,6 +352,18 @@ class SBML_dfs:
269
352
  return augmented_species
270
353
 
271
354
  def get_cspecies_features(self) -> pd.DataFrame:
355
+ """
356
+ Get additional attributes of compartmentalized species.
357
+
358
+ Returns
359
+ -------
360
+ pd.DataFrame
361
+ Compartmentalized species with additional features including:
362
+ - sc_degree: Number of reactions the species participates in
363
+ - sc_children: Number of reactions where species is consumed
364
+ - sc_parents: Number of reactions where species is produced
365
+ - species_type: Classification of the species
366
+ """
272
367
  cspecies_n_connections = (
273
368
  self.reaction_species["sc_id"].value_counts().rename("sc_degree")
274
369
  )
@@ -301,6 +396,24 @@ class SBML_dfs:
301
396
  )
302
397
 
303
398
  def get_identifiers(self, id_type) -> pd.DataFrame:
399
+ """
400
+ Get identifiers from a specified entity type.
401
+
402
+ Parameters
403
+ ----------
404
+ id_type : str
405
+ Type of entity to get identifiers for (e.g., 'species', 'reactions')
406
+
407
+ Returns
408
+ -------
409
+ pd.DataFrame
410
+ Table of identifiers for the specified entity type
411
+
412
+ Raises
413
+ ------
414
+ ValueError
415
+ If id_type is invalid or identifiers are malformed
416
+ """
304
417
  selected_table = self.get_table(id_type, {"id"})
305
418
  schema = self.schema
306
419
 
@@ -339,6 +452,28 @@ class SBML_dfs:
339
452
  entity_ids: Iterable[str] | None = None,
340
453
  required_ontology: str | None = None,
341
454
  ) -> pd.Series:
455
+ """
456
+ Get reference URLs for specified entities.
457
+
458
+ Parameters
459
+ ----------
460
+ entity_type : str
461
+ Type of entity to get URLs for (e.g., 'species', 'reactions')
462
+ entity_ids : Optional[Iterable[str]], optional
463
+ Specific entities to get URLs for, by default None (all entities)
464
+ required_ontology : Optional[str], optional
465
+ Specific ontology to get URLs from, by default None
466
+
467
+ Returns
468
+ -------
469
+ pd.Series
470
+ Series mapping entity IDs to their reference URLs
471
+
472
+ Raises
473
+ ------
474
+ ValueError
475
+ If entity_type is invalid
476
+ """
342
477
  schema = self.schema
343
478
 
344
479
  # valid entities and their identifier variables
@@ -397,32 +532,27 @@ class SBML_dfs:
397
532
  return uri_urls
398
533
 
399
534
  def get_network_summary(self) -> Mapping[str, Any]:
400
- """Return diagnostic statistics about the network
401
-
402
- Returns:
403
- Mapping[str, Any]: A dictionary of diagnostic statistics with entries:
404
- n_species_types [int]: Number of species types
405
- dict_n_species_per_type [dict[str, int]]: Number of
406
- species per species type
407
- n_species [int]: Number of species
408
- n_cspecies [int]: Number of compartmentalized species
409
- n_reaction_species [int]: Number of reaction species
410
- n_reactions [int]: Number of reactions
411
- n_compartments [int]: Number of compartments
412
- dict_n_species_per_compartment [dict[str, int]]:
413
- Number of species per compartment
414
- stats_species_per_reaction [dict[str, float]]:
415
- Statistics on the number of reactands per reaction
416
- top10_species_per_reaction [list[dict[str, Any]]]:
417
- Top 10 reactions with highest number of reactands
418
- stats_degree [dict[str, float]]: Statistics on the degree
419
- of a species (number of reactions it is involved in)
420
- top10_degree [list[dict[str, Any]]]:
421
- Top 10 species with highest degree
422
- stats_identifiers_per_species [dict[str, float]]:
423
- Statistics on the number of identifiers per species
424
- top10_identifiers_per_species [list[dict[str, Any]]]:
425
- Top 10 species with highest number of identifiers
535
+ """
536
+ Get diagnostic statistics about the network.
537
+
538
+ Returns
539
+ -------
540
+ Mapping[str, Any]
541
+ Dictionary of diagnostic statistics including:
542
+ - n_species_types: Number of species types
543
+ - dict_n_species_per_type: Number of species per type
544
+ - n_species: Number of species
545
+ - n_cspecies: Number of compartmentalized species
546
+ - n_reaction_species: Number of reaction species
547
+ - n_reactions: Number of reactions
548
+ - n_compartments: Number of compartments
549
+ - dict_n_species_per_compartment: Number of species per compartment
550
+ - stats_species_per_reaction: Statistics on reactands per reaction
551
+ - top10_species_per_reaction: Top 10 reactions by number of reactands
552
+ - stats_degree: Statistics on species connectivity
553
+ - top10_degree: Top 10 species by connectivity
554
+ - stats_identifiers_per_species: Statistics on identifiers per species
555
+ - top10_identifiers_per_species: Top 10 species by number of identifiers
426
556
  """
427
557
  stats: MutableMapping[str, Any] = {}
428
558
  species_features = self.get_species_features()
@@ -488,14 +618,20 @@ class SBML_dfs:
488
618
  return stats
489
619
 
490
620
  def add_species_data(self, label: str, data: pd.DataFrame):
491
- """Adds additional species_data with validation
492
-
493
- Args:
494
- label (str): the label for the new data
495
- data (pd.DataFrame): the data
621
+ """
622
+ Add additional species data with validation.
496
623
 
497
- Raises:
498
- ValueError: if the data is not valid, ie does not match with `species`
624
+ Parameters
625
+ ----------
626
+ label : str
627
+ Label for the new data
628
+ data : pd.DataFrame
629
+ Data to add, must be indexed by species_id
630
+
631
+ Raises
632
+ ------
633
+ ValueError
634
+ If the data is invalid or label already exists
499
635
  """
500
636
  self._validate_species_data(data)
501
637
  if label in self.species_data:
@@ -504,15 +640,27 @@ class SBML_dfs:
504
640
  )
505
641
  self.species_data[label] = data
506
642
 
507
- def add_reactions_data(self, label: str, data: pd.DataFrame):
508
- """Adds additional reaction_data with validation
643
+ def remove_species_data(self, label: str):
644
+ """
645
+ Remove species data by label.
646
+ """
647
+ self._remove_entity_data(SBML_DFS.SPECIES, label)
509
648
 
510
- Args:
511
- label (str): the label for the new data
512
- data (pd.DataFrame): the data
649
+ def add_reactions_data(self, label: str, data: pd.DataFrame):
650
+ """
651
+ Add additional reaction data with validation.
513
652
 
514
- Raises:
515
- ValueError: if the data is not valid, ie does not match with `reactions`
653
+ Parameters
654
+ ----------
655
+ label : str
656
+ Label for the new data
657
+ data : pd.DataFrame
658
+ Data to add, must be indexed by reaction_id
659
+
660
+ Raises
661
+ ------
662
+ ValueError
663
+ If the data is invalid or label already exists
516
664
  """
517
665
  self._validate_reactions_data(data)
518
666
  if label in self.reactions_data:
@@ -521,15 +669,28 @@ class SBML_dfs:
521
669
  )
522
670
  self.reactions_data[label] = data
523
671
 
672
+ def remove_reactions_data(self, label: str):
673
+ """
674
+ Remove reactions data by label.
675
+ """
676
+ self._remove_entity_data(SBML_DFS.REACTIONS, label)
677
+
524
678
  def remove_compartmentalized_species(self, sc_ids: Iterable[str]):
525
679
  """
526
- Starting with a set of compartmentalized species determine which reactions should be removed
527
- based on there removal. Then remove these reactions, compartmentalized species, and species.
680
+ Remove compartmentalized species and associated reactions.
681
+
682
+ Starting with a set of compartmentalized species, determine which reactions
683
+ should be removed based on their removal. Then remove these reactions,
684
+ compartmentalized species, and species.
528
685
 
686
+ Parameters
687
+ ----------
688
+ sc_ids : Iterable[str]
689
+ IDs of compartmentalized species to remove
529
690
  """
530
691
 
531
692
  # find reactions which should be totally removed since they are losing critical species
532
- removed_reactions = find_underspecified_reactions(self, sc_ids)
693
+ removed_reactions = _find_underspecified_reactions_by_scids(self, sc_ids)
533
694
  self.remove_reactions(removed_reactions)
534
695
 
535
696
  self._remove_compartmentalized_species(sc_ids)
@@ -538,12 +699,16 @@ class SBML_dfs:
538
699
  self._remove_unused_species()
539
700
 
540
701
  def remove_reactions(self, r_ids: Iterable[str], remove_species: bool = False):
541
- """Removes reactions from the model
702
+ """
703
+ Remove reactions from the model.
542
704
 
543
- Args:
544
- r_ids (List[str]): the reactions to remove
545
- remove_species (bool, optional): whether to remove species that are no longer
546
- part of any reactions. Defaults to False.
705
+ Parameters
706
+ ----------
707
+ r_ids : Iterable[str]
708
+ IDs of reactions to remove
709
+ remove_species : bool, optional
710
+ Whether to remove species that are no longer part of any reactions,
711
+ by default False
547
712
  """
548
713
  # remove corresponding reactions_species
549
714
  self.reaction_species = self.reaction_species.query("r_id not in @r_ids")
@@ -559,7 +724,23 @@ class SBML_dfs:
559
724
  self._remove_unused_species()
560
725
 
561
726
  def validate(self):
562
- """Validates the object for obvious errors"""
727
+ """
728
+ Validate the SBML_dfs structure and relationships.
729
+
730
+ Checks:
731
+ - Schema existence
732
+ - Required tables presence
733
+ - Individual table structure
734
+ - Primary key uniqueness
735
+ - Foreign key relationships
736
+ - Optional data table validity
737
+ - Reaction species validity
738
+
739
+ Raises
740
+ ------
741
+ ValueError
742
+ If any validation check fails
743
+ """
563
744
 
564
745
  if not hasattr(self, "schema"):
565
746
  raise ValueError("No schema found")
@@ -582,61 +763,10 @@ class SBML_dfs:
582
763
  )
583
764
 
584
765
  # check individual tables
585
-
586
766
  for table in required_tables:
587
- table_schema = self.schema[table]
588
- table_data = getattr(self, table)
589
-
590
- if not isinstance(table_data, pd.DataFrame):
591
- raise ValueError(
592
- f"{table} must be a pd.DataFrame, but was a " f"{type(table_data)}"
593
- )
594
-
595
- # check index
596
- expected_index_name = table_schema["pk"]
597
- if table_data.index.name != expected_index_name:
598
- raise ValueError(
599
- f"the index name for {table} was not the pk: "
600
- f"{expected_index_name}"
601
- )
602
-
603
- # check that all entries in the index are unique
604
- if len(set(table_data.index.tolist())) != table_data.shape[0]:
605
- duplicated_pks = table_data.index.value_counts()
606
- duplicated_pks = duplicated_pks[duplicated_pks > 1]
607
-
608
- example_duplicates = duplicated_pks.index[
609
- 0 : min(duplicated_pks.shape[0], 5)
610
- ]
611
- raise ValueError(
612
- f"{duplicated_pks.shape[0]} primary keys were "
613
- f"duplicated including {', '.join(example_duplicates)}"
614
- )
615
-
616
- # check variables
617
- expected_vars = set(table_schema["vars"])
618
- table_vars = set(list(table_data.columns))
619
-
620
- extra_vars = table_vars.difference(expected_vars)
621
- if len(extra_vars) != 0:
622
- logger.debug(
623
- f"{len(extra_vars)} extra variables were found"
624
- f" for {table}: {', '.join(extra_vars)}"
625
- )
626
-
627
- missing_vars = expected_vars.difference(table_vars)
628
- if len(missing_vars) != 0:
629
- raise ValueError(
630
- f"Missing {len(missing_vars)} required variables"
631
- f" for {table}: {', '.join(missing_vars)}"
632
- )
633
-
634
- # check
635
- if table_data.shape[0] == 0:
636
- raise ValueError(f"{table} contained no entries")
767
+ self._validate_table(table)
637
768
 
638
769
  # check whether pks and fks agree
639
-
640
770
  pk_df = pd.DataFrame(
641
771
  [{"pk_table": k, "key": v["pk"]} for k, v in self.schema.items()]
642
772
  )
@@ -681,7 +811,6 @@ class SBML_dfs:
681
811
  )
682
812
 
683
813
  # all foreign keys need to match a primary key
684
-
685
814
  extra_fks = fk_table_keys.difference(pk_table_keys)
686
815
  if len(extra_fks) != 0:
687
816
  raise ValueError(
@@ -710,7 +839,19 @@ class SBML_dfs:
710
839
  self._validate_reaction_species()
711
840
 
712
841
  def validate_and_resolve(self):
713
- """Call validate and try to iteratively resolve common validation errors"""
842
+ """
843
+ Validate and attempt to automatically fix common issues.
844
+
845
+ This method iteratively:
846
+ 1. Attempts validation
847
+ 2. If validation fails, tries to resolve the issue
848
+ 3. Repeats until validation passes or issue cannot be resolved
849
+
850
+ Raises
851
+ ------
852
+ ValueError
853
+ If validation fails and cannot be automatically resolved
854
+ """
714
855
 
715
856
  current_exception = None
716
857
  validated = False
@@ -730,6 +871,85 @@ class SBML_dfs:
730
871
  # try to resolve
731
872
  self._attempt_resolve(e)
732
873
 
874
+ def select_species_data(self, species_data_table: str) -> pd.DataFrame:
875
+ """
876
+ Select a species data table from the SBML_dfs object.
877
+
878
+ Parameters
879
+ ----------
880
+ species_data_table : str
881
+ Name of the species data table to select
882
+
883
+ Returns
884
+ -------
885
+ pd.DataFrame
886
+ The selected species data table
887
+
888
+ Raises
889
+ ------
890
+ ValueError
891
+ If species_data_table is not found
892
+ """
893
+ # Check if species_data_table exists in sbml_dfs.species_data
894
+ if species_data_table not in self.species_data:
895
+ raise ValueError(
896
+ f"species_data_table {species_data_table} not found in sbml_dfs.species_data. "
897
+ f"Available tables: {self.species_data.keys()}"
898
+ )
899
+
900
+ # Get the species data
901
+ return self.species_data[species_data_table]
902
+
903
+ def _validate_table(self, table: str) -> None:
904
+ """
905
+ Validate a table in this SBML_dfs object against its schema.
906
+
907
+ This is an internal method that validates a table that is part of this SBML_dfs
908
+ object against the schema stored in self.schema.
909
+
910
+ Parameters
911
+ ----------
912
+ table : str
913
+ Name of the table to validate
914
+
915
+ Raises
916
+ ------
917
+ ValueError
918
+ If the table does not conform to its schema
919
+ """
920
+ table_schema = self.schema[table]
921
+ table_data = getattr(self, table)
922
+ _perform_sbml_dfs_table_validation(table_data, table_schema, table)
923
+
924
+ def _remove_entity_data(self, entity_type: str, label: str) -> None:
925
+ """
926
+ Remove data from species_data or reactions_data by table name and label.
927
+
928
+ Parameters
929
+ ----------
930
+ entity_type : str
931
+ Name of the table to remove data from ('species' or 'reactions')
932
+ label : str
933
+ Label of the data to remove
934
+
935
+ Notes
936
+ -----
937
+ If the label does not exist, a warning will be logged that includes the existing labels.
938
+ """
939
+ if entity_type not in ENTITIES_W_DATA:
940
+ raise ValueError("table_name must be either 'species' or 'reactions'")
941
+
942
+ data_dict = getattr(self, ENTITIES_TO_ENTITY_DATA[entity_type])
943
+ if label not in data_dict:
944
+ existing_labels = list(data_dict.keys())
945
+ logger.warning(
946
+ f"Label '{label}' not found in {ENTITIES_TO_ENTITY_DATA[entity_type]}. "
947
+ f"Existing labels: {existing_labels}"
948
+ )
949
+ return
950
+
951
+ del data_dict[label]
952
+
733
953
  def _remove_unused_cspecies(self):
734
954
  """Removes compartmentalized species that are no
735
955
  longer part of any reactions"""
@@ -1952,88 +2172,6 @@ def sbml_dfs_from_edgelist(
1952
2172
  return sbml_model
1953
2173
 
1954
2174
 
1955
- def find_underspecified_reactions(
1956
- sbml_dfs: SBML_dfs, sc_ids: Iterable[str]
1957
- ) -> set[str]:
1958
- """
1959
- Find Underspecified reactions
1960
-
1961
- Identity reactions which should be removed if a set of molecular species are removed
1962
- from the system.
1963
-
1964
- Params:
1965
- sbml_dfs (SBML_dfs):
1966
- A pathway representation
1967
- sc_ids (list[str])
1968
- A list of compartmentalized species ids (sc_ids) which will be removed.
1969
-
1970
- Returns:
1971
- underspecified_reactions (set[str]):
1972
- A list of reactions which should be removed because they will not occur once
1973
- \"sc_ids\" are removed.
1974
-
1975
- """
1976
-
1977
- updated_reaction_species = sbml_dfs.reaction_species.copy()
1978
- updated_reaction_species["new"] = ~updated_reaction_species[SBML_DFS.SC_ID].isin(
1979
- sc_ids
1980
- )
1981
-
1982
- updated_reaction_species = (
1983
- updated_reaction_species.assign(
1984
- sbo_role=updated_reaction_species[SBML_DFS.SBO_TERM]
1985
- )
1986
- .replace({"sbo_role": MINI_SBO_TO_NAME})
1987
- .replace({"sbo_role": SBO_NAME_TO_ROLE})
1988
- )
1989
-
1990
- reactions_with_lost_defining_members = set(
1991
- updated_reaction_species.query("~new")
1992
- .query("sbo_role == 'DEFINING'")[SBML_DFS.R_ID]
1993
- .tolist()
1994
- )
1995
-
1996
- N_reactions_with_lost_defining_members = len(reactions_with_lost_defining_members)
1997
- if N_reactions_with_lost_defining_members > 0:
1998
- logger.info(
1999
- f"Removing {N_reactions_with_lost_defining_members} reactions which have lost at least one defining species"
2000
- )
2001
-
2002
- # for each reaction what are the required sbo_terms?
2003
- reactions_with_requirements = (
2004
- updated_reaction_species.query("sbo_role == 'REQUIRED'")[
2005
- ["r_id", "sbo_term", "new"]
2006
- ]
2007
- .drop_duplicates()
2008
- .reset_index(drop=True)
2009
- )
2010
-
2011
- # which required members are still present after removing some entries
2012
- reactions_with_lost_requirements = set(
2013
- reactions_with_requirements.query("~new")
2014
- .merge(
2015
- reactions_with_requirements.query("new").rename(
2016
- {"new": "still_present"}, axis=1
2017
- ),
2018
- how="left",
2019
- )
2020
- .fillna(False)[SBML_DFS.R_ID] # Fill boolean column with False
2021
- .tolist()
2022
- )
2023
-
2024
- N_reactions_with_lost_requirements = len(reactions_with_lost_requirements)
2025
- if N_reactions_with_lost_requirements > 0:
2026
- logger.info(
2027
- f"Removing {N_reactions_with_lost_requirements} reactions which have lost all required members"
2028
- )
2029
-
2030
- underspecified_reactions = reactions_with_lost_defining_members.union(
2031
- reactions_with_lost_requirements
2032
- )
2033
-
2034
- return underspecified_reactions
2035
-
2036
-
2037
2175
  def _sbml_dfs_from_edgelist_validate_inputs(
2038
2176
  interaction_edgelist: pd.DataFrame,
2039
2177
  species_df: pd.DataFrame,
@@ -2231,3 +2369,230 @@ def stub_ids(ids):
2231
2369
  )
2232
2370
  else:
2233
2371
  return pd.DataFrame(ids)
2372
+
2373
+
2374
+ def add_sbo_role(reaction_species: pd.DataFrame) -> pd.DataFrame:
2375
+ """
2376
+ Add an sbo_role column to the reaction_species table.
2377
+
2378
+ The sbo_role column is a string column that contains the SBO role of the reaction species.
2379
+ The values in the sbo_role column are taken from the sbo_term column.
2380
+
2381
+ The sbo_role column is added to the reaction_species table by mapping the sbo_term column to the SBO_NAME_TO_ROLE dictionary.
2382
+ """
2383
+
2384
+ validate_sbml_dfs_table(reaction_species, SBML_DFS.REACTION_SPECIES)
2385
+
2386
+ reaction_species = (
2387
+ reaction_species.assign(sbo_role=reaction_species[SBML_DFS.SBO_TERM])
2388
+ .replace({SBO_ROLES_DEFS.SBO_ROLE: MINI_SBO_TO_NAME})
2389
+ .replace({SBO_ROLES_DEFS.SBO_ROLE: SBO_NAME_TO_ROLE})
2390
+ )
2391
+
2392
+ undefined_roles = set(reaction_species[SBO_ROLES_DEFS.SBO_ROLE].unique()) - set(
2393
+ SBO_NAME_TO_ROLE.values()
2394
+ )
2395
+ if len(undefined_roles) > 0:
2396
+ logger.warning(
2397
+ f"The following SBO roles are not defined: {undefined_roles}. They will be treated as {SBO_ROLES_DEFS.OPTIONAL} when determining reaction operability."
2398
+ )
2399
+ mask = reaction_species[SBO_ROLES_DEFS.SBO_ROLE].isin(undefined_roles)
2400
+ reaction_species.loc[mask, SBO_ROLES_DEFS.SBO_ROLE] = SBO_ROLES_DEFS.OPTIONAL
2401
+
2402
+ return reaction_species
2403
+
2404
+
2405
+ def find_underspecified_reactions(
2406
+ reaction_species_w_roles: pd.DataFrame,
2407
+ ) -> pd.DataFrame:
2408
+
2409
+ # check that both sbo_role and "new" are present
2410
+ if SBO_ROLES_DEFS.SBO_ROLE not in reaction_species_w_roles.columns:
2411
+ raise ValueError(
2412
+ "The sbo_role column is not present in the reaction_species_w_roles table. Please call add_sbo_role() first."
2413
+ )
2414
+ if "new" not in reaction_species_w_roles.columns:
2415
+ raise ValueError(
2416
+ "The new column is not present in the reaction_species_w_roles table. This should indicate what cspecies would be preserved in the reaction should it be preserved."
2417
+ )
2418
+ # check that new is a boolean column
2419
+ if reaction_species_w_roles["new"].dtype != bool:
2420
+ raise ValueError(
2421
+ "The new column is not a boolean column. Please ensure that the new column is a boolean column. This should indicate what cspecies would be preserved in the reaction should it be preserved."
2422
+ )
2423
+
2424
+ reactions_with_lost_defining_members = set(
2425
+ reaction_species_w_roles.query("~new")
2426
+ .query("sbo_role == 'DEFINING'")[SBML_DFS.R_ID]
2427
+ .tolist()
2428
+ )
2429
+
2430
+ N_reactions_with_lost_defining_members = len(reactions_with_lost_defining_members)
2431
+ if N_reactions_with_lost_defining_members > 0:
2432
+ logger.info(
2433
+ f"Removing {N_reactions_with_lost_defining_members} reactions which have lost at least one defining species"
2434
+ )
2435
+
2436
+ # find the cases where all "new" values for a given (r_id, sbo_term) are False
2437
+ reactions_with_lost_requirements = set(
2438
+ reaction_species_w_roles
2439
+ # drop already filtered reactions
2440
+ .query("r_id not in @reactions_with_lost_defining_members")
2441
+ .query("sbo_role == 'REQUIRED'")
2442
+ # which entries that have some required attribute have all-False values for that attribute
2443
+ .groupby([SBML_DFS.R_ID, SBML_DFS.SBO_TERM])
2444
+ .agg({"new": "any"})
2445
+ .query("new == False")
2446
+ .index.get_level_values(SBML_DFS.R_ID)
2447
+ )
2448
+
2449
+ N_reactions_with_lost_requirements = len(reactions_with_lost_requirements)
2450
+ if N_reactions_with_lost_requirements > 0:
2451
+ logger.info(
2452
+ f"Removing {N_reactions_with_lost_requirements} reactions which have lost all required members"
2453
+ )
2454
+
2455
+ underspecified_reactions = reactions_with_lost_defining_members.union(
2456
+ reactions_with_lost_requirements
2457
+ )
2458
+
2459
+ return underspecified_reactions
2460
+
2461
+
2462
+ def _find_underspecified_reactions_by_scids(
2463
+ sbml_dfs: SBML_dfs, sc_ids: Iterable[str]
2464
+ ) -> set[str]:
2465
+ """
2466
+ Find Underspecified reactions
2467
+
2468
+ Identify reactions which should be removed if a set of molecular species is removed
2469
+ from the system.
2470
+
2471
+ Params:
2472
+ sbml_dfs (SBML_dfs):
2473
+ A pathway representation
2474
+ sc_ids (list[str]):
2475
+ A list of compartmentalized species ids (sc_ids) which will be removed.
2476
+
2477
+ Returns:
2478
+ underspecified_reactions (set[str]):
2479
+ The set of reactions which should be removed because they will not occur once
2480
+ \"sc_ids\" are removed.
2481
+
2482
+ """
2483
+
2484
+ updated_reaction_species = sbml_dfs.reaction_species.copy()
2485
+ updated_reaction_species["new"] = ~updated_reaction_species[SBML_DFS.SC_ID].isin(
2486
+ sc_ids
2487
+ )
2488
+
2489
+ updated_reaction_species = add_sbo_role(updated_reaction_species)
2490
+ underspecified_reactions = find_underspecified_reactions(updated_reaction_species)
2491
+
2492
+ return underspecified_reactions
2493
+
2494
+
2495
+ def validate_sbml_dfs_table(table_data: pd.DataFrame, table_name: str) -> None:
2496
+ """
2497
+ Validate a standalone table against the SBML_dfs schema.
2498
+
2499
+ This function validates a table against the schema defined in SBML_DFS_SCHEMA,
2500
+ without requiring an SBML_dfs object. Useful for validating tables before
2501
+ creating an SBML_dfs object.
2502
+
2503
+ Parameters
2504
+ ----------
2505
+ table_data : pd.DataFrame
2506
+ The table to validate
2507
+ table_name : str
2508
+ Name of the table in the SBML_dfs schema
2509
+
2510
+ Raises
2511
+ ------
2512
+ ValueError
2513
+ If table_name is not in schema or validation fails
2514
+ """
2515
+ if table_name not in SBML_DFS_SCHEMA.SCHEMA:
2516
+ raise ValueError(
2517
+ f"{table_name} is not a valid table name in SBML_DFS_SCHEMA. "
2518
+ f"Valid tables are: {', '.join(SBML_DFS_SCHEMA.SCHEMA.keys())}"
2519
+ )
2520
+
2521
+ table_schema = SBML_DFS_SCHEMA.SCHEMA[table_name]
2522
+ _perform_sbml_dfs_table_validation(table_data, table_schema, table_name)
2523
+
2524
+
2525
+ def _perform_sbml_dfs_table_validation(
2526
+ table_data: pd.DataFrame,
2527
+ table_schema: dict,
2528
+ table_name: str,
2529
+ ) -> None:
2530
+ """
2531
+ Core validation logic for SBML_dfs tables.
2532
+
2533
+ This function performs the actual validation checks for any table against its schema,
2534
+ regardless of whether it's part of an SBML_dfs object or standalone.
2535
+
2536
+ Parameters
2537
+ ----------
2538
+ table_data : pd.DataFrame
2539
+ The table data to validate
2540
+ table_schema : dict
2541
+ Schema definition for the table
2542
+ table_name : str
2543
+ Name of the table (for error messages)
2544
+
2545
+ Raises
2546
+ ------
2547
+ ValueError
2548
+ If the table does not conform to its schema:
2549
+ - Not a DataFrame
2550
+ - Wrong index name
2551
+ - Duplicate primary keys
2552
+ - Missing required variables
2553
+ - Empty table
2554
+ """
2555
+ if not isinstance(table_data, pd.DataFrame):
2556
+ raise ValueError(
2557
+ f"{table_name} must be a pd.DataFrame, but was a {type(table_data)}"
2558
+ )
2559
+
2560
+ # check index
2561
+ expected_index_name = table_schema["pk"]
2562
+ if table_data.index.name != expected_index_name:
2563
+ raise ValueError(
2564
+ f"the index name for {table_name} was not the pk: {expected_index_name}"
2565
+ )
2566
+
2567
+ # check that all entries in the index are unique
2568
+ if len(set(table_data.index.tolist())) != table_data.shape[0]:
2569
+ duplicated_pks = table_data.index.value_counts()
2570
+ duplicated_pks = duplicated_pks[duplicated_pks > 1]
2571
+
2572
+ example_duplicates = duplicated_pks.index[0 : min(duplicated_pks.shape[0], 5)]
2573
+ raise ValueError(
2574
+ f"{duplicated_pks.shape[0]} primary keys were duplicated "
2575
+ f"including {', '.join(example_duplicates)}"
2576
+ )
2577
+
2578
+ # check variables
2579
+ expected_vars = set(table_schema["vars"])
2580
+ table_vars = set(list(table_data.columns))
2581
+
2582
+ extra_vars = table_vars.difference(expected_vars)
2583
+ if len(extra_vars) != 0:
2584
+ logger.debug(
2585
+ f"{len(extra_vars)} extra variables were found for {table_name}: "
2586
+ f"{', '.join(extra_vars)}"
2587
+ )
2588
+
2589
+ missing_vars = expected_vars.difference(table_vars)
2590
+ if len(missing_vars) != 0:
2591
+ raise ValueError(
2592
+ f"Missing {len(missing_vars)} required variables for {table_name}: "
2593
+ f"{', '.join(missing_vars)}"
2594
+ )
2595
+
2596
+ # check for empty table
2597
+ if table_data.shape[0] == 0:
2598
+ raise ValueError(f"{table_name} contained no entries")