napistu 0.2.5.dev7__py3-none-any.whl → 0.3.1.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/__init__.py +1 -3
- napistu/__main__.py +126 -96
- napistu/constants.py +35 -41
- napistu/context/__init__.py +10 -0
- napistu/context/discretize.py +462 -0
- napistu/context/filtering.py +387 -0
- napistu/gcs/__init__.py +1 -1
- napistu/identifiers.py +74 -15
- napistu/indices.py +68 -0
- napistu/ingestion/__init__.py +1 -1
- napistu/ingestion/bigg.py +47 -62
- napistu/ingestion/constants.py +18 -133
- napistu/ingestion/gtex.py +113 -0
- napistu/ingestion/hpa.py +147 -0
- napistu/ingestion/sbml.py +0 -97
- napistu/ingestion/string.py +2 -2
- napistu/matching/__init__.py +10 -0
- napistu/matching/constants.py +18 -0
- napistu/matching/interactions.py +518 -0
- napistu/matching/mount.py +529 -0
- napistu/matching/species.py +510 -0
- napistu/mcp/__init__.py +7 -4
- napistu/mcp/__main__.py +128 -72
- napistu/mcp/client.py +16 -25
- napistu/mcp/codebase.py +201 -145
- napistu/mcp/component_base.py +170 -0
- napistu/mcp/config.py +223 -0
- napistu/mcp/constants.py +45 -2
- napistu/mcp/documentation.py +253 -136
- napistu/mcp/documentation_utils.py +13 -48
- napistu/mcp/execution.py +372 -305
- napistu/mcp/health.py +47 -65
- napistu/mcp/profiles.py +10 -6
- napistu/mcp/server.py +161 -80
- napistu/mcp/tutorials.py +139 -87
- napistu/modify/__init__.py +1 -1
- napistu/modify/gaps.py +1 -1
- napistu/network/__init__.py +1 -1
- napistu/network/constants.py +101 -34
- napistu/network/data_handling.py +388 -0
- napistu/network/ig_utils.py +351 -0
- napistu/network/napistu_graph_core.py +354 -0
- napistu/network/neighborhoods.py +40 -40
- napistu/network/net_create.py +373 -309
- napistu/network/net_propagation.py +47 -19
- napistu/network/{net_utils.py → ng_utils.py} +124 -272
- napistu/network/paths.py +67 -51
- napistu/network/precompute.py +11 -11
- napistu/ontologies/__init__.py +10 -0
- napistu/ontologies/constants.py +129 -0
- napistu/ontologies/dogma.py +243 -0
- napistu/ontologies/genodexito.py +649 -0
- napistu/ontologies/mygene.py +369 -0
- napistu/ontologies/renaming.py +198 -0
- napistu/rpy2/__init__.py +229 -86
- napistu/rpy2/callr.py +47 -77
- napistu/rpy2/constants.py +24 -23
- napistu/rpy2/rids.py +61 -648
- napistu/sbml_dfs_core.py +587 -222
- napistu/scverse/__init__.py +15 -0
- napistu/scverse/constants.py +28 -0
- napistu/scverse/loading.py +727 -0
- napistu/utils.py +118 -10
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/METADATA +8 -3
- napistu-0.3.1.dev1.dist-info/RECORD +133 -0
- tests/conftest.py +22 -0
- tests/test_context_discretize.py +56 -0
- tests/test_context_filtering.py +267 -0
- tests/test_identifiers.py +100 -0
- tests/test_indices.py +65 -0
- tests/{test_edgelist.py → test_ingestion_napistu_edgelist.py} +2 -2
- tests/test_matching_interactions.py +108 -0
- tests/test_matching_mount.py +305 -0
- tests/test_matching_species.py +394 -0
- tests/test_mcp_config.py +193 -0
- tests/test_mcp_documentation_utils.py +12 -3
- tests/test_mcp_server.py +156 -19
- tests/test_network_data_handling.py +397 -0
- tests/test_network_ig_utils.py +23 -0
- tests/test_network_neighborhoods.py +19 -0
- tests/test_network_net_create.py +459 -0
- tests/test_network_ng_utils.py +30 -0
- tests/test_network_paths.py +56 -0
- tests/{test_precomputed_distances.py → test_network_precompute.py} +8 -6
- tests/test_ontologies_genodexito.py +58 -0
- tests/test_ontologies_mygene.py +39 -0
- tests/test_ontologies_renaming.py +110 -0
- tests/test_rpy2_callr.py +79 -0
- tests/test_rpy2_init.py +151 -0
- tests/test_sbml.py +0 -31
- tests/test_sbml_dfs_core.py +134 -10
- tests/test_scverse_loading.py +778 -0
- tests/test_set_coverage.py +2 -2
- tests/test_utils.py +121 -1
- napistu/mechanism_matching.py +0 -1353
- napistu/rpy2/netcontextr.py +0 -467
- napistu-0.2.5.dev7.dist-info/RECORD +0 -98
- tests/test_igraph.py +0 -367
- tests/test_mechanism_matching.py +0 -784
- tests/test_net_utils.py +0 -149
- tests/test_netcontextr.py +0 -105
- tests/test_rpy2.py +0 -61
- /napistu/ingestion/{cpr_edgelist.py → napistu_edgelist.py} +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/WHEEL +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/entry_points.txt +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/licenses/LICENSE +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/top_level.txt +0 -0
- /tests/{test_obo.py → test_ingestion_obo.py} +0 -0
napistu/sbml_dfs_core.py
CHANGED
@@ -31,6 +31,9 @@ from napistu.constants import MINI_SBO_TO_NAME
|
|
31
31
|
from napistu.constants import ONTOLOGIES
|
32
32
|
from napistu.constants import SBO_NAME_TO_ROLE
|
33
33
|
from napistu.constants import SBOTERM_NAMES
|
34
|
+
from napistu.constants import SBO_ROLES_DEFS
|
35
|
+
from napistu.constants import ENTITIES_W_DATA
|
36
|
+
from napistu.constants import ENTITIES_TO_ENTITY_DATA
|
34
37
|
from napistu.constants import CHARACTERISTIC_COMPLEX_ONTOLOGIES
|
35
38
|
from napistu.ingestion import sbml
|
36
39
|
from fs import open_fs
|
@@ -42,43 +45,47 @@ class SBML_dfs:
|
|
42
45
|
"""
|
43
46
|
System Biology Markup Language Model Data Frames.
|
44
47
|
|
48
|
+
A class representing a SBML model as a collection of pandas DataFrames.
|
49
|
+
This class provides methods for manipulating and analyzing biological pathway models
|
50
|
+
with support for species, reactions, compartments, and their relationships.
|
51
|
+
|
45
52
|
Attributes
|
46
53
|
----------
|
47
|
-
compartments: pd.DataFrame
|
48
|
-
|
49
|
-
species: pd.DataFrame
|
50
|
-
|
51
|
-
species_data: Dict[str, pd.DataFrame]
|
52
|
-
|
53
|
-
reactions: pd.DataFrame
|
54
|
-
|
55
|
-
reactions_data: Dict[str, pd.DataFrame]
|
56
|
-
|
57
|
-
reaction_species: pd.DataFrame
|
58
|
-
One entry per species participating in a reaction
|
59
|
-
schema: dict
|
60
|
-
|
54
|
+
compartments : pd.DataFrame
|
55
|
+
Sub-cellular compartments in the model, indexed by compartment ID (c_id)
|
56
|
+
species : pd.DataFrame
|
57
|
+
Molecular species in the model, indexed by species ID (s_id)
|
58
|
+
species_data : Dict[str, pd.DataFrame]
|
59
|
+
Additional data for species. Each DataFrame is indexed by species_id (s_id)
|
60
|
+
reactions : pd.DataFrame
|
61
|
+
Reactions in the model, indexed by reaction ID (r_id)
|
62
|
+
reactions_data : Dict[str, pd.DataFrame]
|
63
|
+
Additional data for reactions. Each DataFrame is indexed by reaction_id (r_id)
|
64
|
+
reaction_species : pd.DataFrame
|
65
|
+
One entry per species participating in a reaction, indexed by reaction-species ID (rsc_id)
|
66
|
+
schema : dict
|
67
|
+
Dictionary representing the structure of the other attributes and meaning of their variables
|
61
68
|
|
62
69
|
Methods
|
63
70
|
-------
|
64
71
|
get_table(entity_type, required_attributes)
|
65
|
-
Get a table from the SBML_dfs object
|
72
|
+
Get a table from the SBML_dfs object with optional attribute validation
|
66
73
|
search_by_ids(ids, entity_type, identifiers_df, ontologies)
|
67
|
-
|
74
|
+
Find entities and identifiers matching a set of query IDs
|
68
75
|
search_by_name(name, entity_type, partial_match)
|
69
|
-
|
76
|
+
Find entities by exact or partial name match
|
70
77
|
get_cspecies_features()
|
71
|
-
|
78
|
+
Get additional attributes of compartmentalized species
|
72
79
|
get_species_features()
|
73
|
-
|
80
|
+
Get additional attributes of species
|
74
81
|
get_identifiers(id_type)
|
75
|
-
|
76
|
-
get_uri_urls(entity_type, entity_ids
|
77
|
-
|
82
|
+
Get identifiers from a specified entity type
|
83
|
+
get_uri_urls(entity_type, entity_ids)
|
84
|
+
Get reference URLs for specified entities
|
78
85
|
validate()
|
79
|
-
Validate
|
80
|
-
|
81
|
-
Validate
|
86
|
+
Validate the SBML_dfs structure and relationships
|
87
|
+
validate_and_resolve()
|
88
|
+
Validate and attempt to automatically fix common issues
|
82
89
|
"""
|
83
90
|
|
84
91
|
compartments: pd.DataFrame
|
@@ -100,18 +107,22 @@ class SBML_dfs:
|
|
100
107
|
resolve: bool = True,
|
101
108
|
) -> None:
|
102
109
|
"""
|
103
|
-
|
110
|
+
Initialize a SBML_dfs object from a SBML model or dictionary of tables.
|
104
111
|
|
105
112
|
Parameters
|
106
113
|
----------
|
107
|
-
sbml_model :
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
114
|
+
sbml_model : Union[sbml.SBML, MutableMapping[str, Union[pd.DataFrame, Dict[str, pd.DataFrame]]]]
|
115
|
+
Either a SBML model produced by sbml.SBML() or a dictionary containing tables
|
116
|
+
following the sbml_dfs schema
|
117
|
+
validate : bool, optional
|
118
|
+
Whether to validate the model structure and relationships, by default True
|
119
|
+
resolve : bool, optional
|
120
|
+
Whether to attempt automatic resolution of common issues, by default True
|
121
|
+
|
122
|
+
Raises
|
123
|
+
------
|
124
|
+
ValueError
|
125
|
+
If the model structure is invalid and cannot be resolved
|
115
126
|
"""
|
116
127
|
|
117
128
|
self.schema = SBML_DFS_SCHEMA.SCHEMA
|
@@ -156,9 +167,27 @@ class SBML_dfs:
|
|
156
167
|
self, entity_type: str, required_attributes: None | set[str] = None
|
157
168
|
) -> pd.DataFrame:
|
158
169
|
"""
|
159
|
-
Get
|
170
|
+
Get a table from the SBML_dfs object with optional attribute validation.
|
160
171
|
|
161
|
-
|
172
|
+
Parameters
|
173
|
+
----------
|
174
|
+
entity_type : str
|
175
|
+
The type of entity table to retrieve (e.g., 'species', 'reactions')
|
176
|
+
required_attributes : Optional[Set[str]], optional
|
177
|
+
Set of attributes that must be present in the table, by default None.
|
178
|
+
Must be passed as a set, e.g. {'id'}, not a string.
|
179
|
+
|
180
|
+
Returns
|
181
|
+
-------
|
182
|
+
pd.DataFrame
|
183
|
+
The requested table
|
184
|
+
|
185
|
+
Raises
|
186
|
+
------
|
187
|
+
ValueError
|
188
|
+
If entity_type is invalid or required attributes are missing
|
189
|
+
TypeError
|
190
|
+
If required_attributes is not a set
|
162
191
|
"""
|
163
192
|
|
164
193
|
schema = self.schema
|
@@ -172,7 +201,8 @@ class SBML_dfs:
|
|
172
201
|
if required_attributes is not None:
|
173
202
|
if not isinstance(required_attributes, set):
|
174
203
|
raise TypeError(
|
175
|
-
f"required_attributes must be a set, but got {type(required_attributes).__name__}"
|
204
|
+
f"required_attributes must be a set (e.g. {{'id'}}), but got {type(required_attributes).__name__}. "
|
205
|
+
"Did you pass a string instead of a set?"
|
176
206
|
)
|
177
207
|
|
178
208
|
# determine whether required_attributes are appropriate
|
@@ -206,6 +236,33 @@ class SBML_dfs:
|
|
206
236
|
identifiers_df: pd.DataFrame,
|
207
237
|
ontologies: None | set[str] = None,
|
208
238
|
) -> tuple[pd.DataFrame, pd.DataFrame]:
|
239
|
+
"""
|
240
|
+
Find entities and identifiers matching a set of query IDs.
|
241
|
+
|
242
|
+
Parameters
|
243
|
+
----------
|
244
|
+
ids : List[str]
|
245
|
+
List of identifiers to search for
|
246
|
+
entity_type : str
|
247
|
+
Type of entity to search (e.g., 'species', 'reactions')
|
248
|
+
identifiers_df : pd.DataFrame
|
249
|
+
DataFrame containing identifier mappings
|
250
|
+
ontologies : Optional[Set[str]], optional
|
251
|
+
Set of ontologies to filter by, by default None
|
252
|
+
|
253
|
+
Returns
|
254
|
+
-------
|
255
|
+
Tuple[pd.DataFrame, pd.DataFrame]
|
256
|
+
- Matching entities
|
257
|
+
- Matching identifiers
|
258
|
+
|
259
|
+
Raises
|
260
|
+
------
|
261
|
+
ValueError
|
262
|
+
If entity_type is invalid or ontologies are invalid
|
263
|
+
TypeError
|
264
|
+
If ontologies is not a set
|
265
|
+
"""
|
209
266
|
# validate inputs
|
210
267
|
entity_table = self.get_table(entity_type, required_attributes={"id"})
|
211
268
|
entity_pk = self.schema[entity_type]["pk"]
|
@@ -249,6 +306,23 @@ class SBML_dfs:
|
|
249
306
|
def search_by_name(
|
250
307
|
self, name: str, entity_type: str, partial_match: bool = True
|
251
308
|
) -> pd.DataFrame:
|
309
|
+
"""
|
310
|
+
Find entities by exact or partial name match.
|
311
|
+
|
312
|
+
Parameters
|
313
|
+
----------
|
314
|
+
name : str
|
315
|
+
Name to search for
|
316
|
+
entity_type : str
|
317
|
+
Type of entity to search (e.g., 'species', 'reactions')
|
318
|
+
partial_match : bool, optional
|
319
|
+
Whether to allow partial string matches, by default True
|
320
|
+
|
321
|
+
Returns
|
322
|
+
-------
|
323
|
+
pd.DataFrame
|
324
|
+
Matching entities
|
325
|
+
"""
|
252
326
|
entity_table = self.get_table(entity_type, required_attributes={"label"})
|
253
327
|
label_attr = self.schema[entity_type]["label"]
|
254
328
|
|
@@ -261,6 +335,15 @@ class SBML_dfs:
|
|
261
335
|
return matches
|
262
336
|
|
263
337
|
def get_species_features(self) -> pd.DataFrame:
|
338
|
+
"""
|
339
|
+
Get additional attributes of species.
|
340
|
+
|
341
|
+
Returns
|
342
|
+
-------
|
343
|
+
pd.DataFrame
|
344
|
+
Species with additional features including:
|
345
|
+
- species_type: Classification of the species (e.g., metabolite, protein)
|
346
|
+
"""
|
264
347
|
species = self.species
|
265
348
|
augmented_species = species.assign(
|
266
349
|
**{"species_type": lambda d: d["s_Identifiers"].apply(species_type_types)}
|
@@ -269,6 +352,18 @@ class SBML_dfs:
|
|
269
352
|
return augmented_species
|
270
353
|
|
271
354
|
def get_cspecies_features(self) -> pd.DataFrame:
|
355
|
+
"""
|
356
|
+
Get additional attributes of compartmentalized species.
|
357
|
+
|
358
|
+
Returns
|
359
|
+
-------
|
360
|
+
pd.DataFrame
|
361
|
+
Compartmentalized species with additional features including:
|
362
|
+
- sc_degree: Number of reactions the species participates in
|
363
|
+
- sc_children: Number of reactions where species is consumed
|
364
|
+
- sc_parents: Number of reactions where species is produced
|
365
|
+
- species_type: Classification of the species
|
366
|
+
"""
|
272
367
|
cspecies_n_connections = (
|
273
368
|
self.reaction_species["sc_id"].value_counts().rename("sc_degree")
|
274
369
|
)
|
@@ -301,6 +396,24 @@ class SBML_dfs:
|
|
301
396
|
)
|
302
397
|
|
303
398
|
def get_identifiers(self, id_type) -> pd.DataFrame:
|
399
|
+
"""
|
400
|
+
Get identifiers from a specified entity type.
|
401
|
+
|
402
|
+
Parameters
|
403
|
+
----------
|
404
|
+
id_type : str
|
405
|
+
Type of entity to get identifiers for (e.g., 'species', 'reactions')
|
406
|
+
|
407
|
+
Returns
|
408
|
+
-------
|
409
|
+
pd.DataFrame
|
410
|
+
Table of identifiers for the specified entity type
|
411
|
+
|
412
|
+
Raises
|
413
|
+
------
|
414
|
+
ValueError
|
415
|
+
If id_type is invalid or identifiers are malformed
|
416
|
+
"""
|
304
417
|
selected_table = self.get_table(id_type, {"id"})
|
305
418
|
schema = self.schema
|
306
419
|
|
@@ -339,6 +452,28 @@ class SBML_dfs:
|
|
339
452
|
entity_ids: Iterable[str] | None = None,
|
340
453
|
required_ontology: str | None = None,
|
341
454
|
) -> pd.Series:
|
455
|
+
"""
|
456
|
+
Get reference URLs for specified entities.
|
457
|
+
|
458
|
+
Parameters
|
459
|
+
----------
|
460
|
+
entity_type : str
|
461
|
+
Type of entity to get URLs for (e.g., 'species', 'reactions')
|
462
|
+
entity_ids : Optional[Iterable[str]], optional
|
463
|
+
Specific entities to get URLs for, by default None (all entities)
|
464
|
+
required_ontology : Optional[str], optional
|
465
|
+
Specific ontology to get URLs from, by default None
|
466
|
+
|
467
|
+
Returns
|
468
|
+
-------
|
469
|
+
pd.Series
|
470
|
+
Series mapping entity IDs to their reference URLs
|
471
|
+
|
472
|
+
Raises
|
473
|
+
------
|
474
|
+
ValueError
|
475
|
+
If entity_type is invalid
|
476
|
+
"""
|
342
477
|
schema = self.schema
|
343
478
|
|
344
479
|
# valid entities and their identifier variables
|
@@ -397,32 +532,27 @@ class SBML_dfs:
|
|
397
532
|
return uri_urls
|
398
533
|
|
399
534
|
def get_network_summary(self) -> Mapping[str, Any]:
|
400
|
-
"""
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
Top 10 species with highest degree
|
422
|
-
stats_identifiers_per_species [dict[str, float]]:
|
423
|
-
Statistics on the number of identifiers per species
|
424
|
-
top10_identifiers_per_species [list[dict[str, Any]]]:
|
425
|
-
Top 10 species with highest number of identifiers
|
535
|
+
"""
|
536
|
+
Get diagnostic statistics about the network.
|
537
|
+
|
538
|
+
Returns
|
539
|
+
-------
|
540
|
+
Mapping[str, Any]
|
541
|
+
Dictionary of diagnostic statistics including:
|
542
|
+
- n_species_types: Number of species types
|
543
|
+
- dict_n_species_per_type: Number of species per type
|
544
|
+
- n_species: Number of species
|
545
|
+
- n_cspecies: Number of compartmentalized species
|
546
|
+
- n_reaction_species: Number of reaction species
|
547
|
+
- n_reactions: Number of reactions
|
548
|
+
- n_compartments: Number of compartments
|
549
|
+
- dict_n_species_per_compartment: Number of species per compartment
|
550
|
+
- stats_species_per_reaction: Statistics on reactands per reaction
|
551
|
+
- top10_species_per_reaction: Top 10 reactions by number of reactands
|
552
|
+
- stats_degree: Statistics on species connectivity
|
553
|
+
- top10_degree: Top 10 species by connectivity
|
554
|
+
- stats_identifiers_per_species: Statistics on identifiers per species
|
555
|
+
- top10_identifiers_per_species: Top 10 species by number of identifiers
|
426
556
|
"""
|
427
557
|
stats: MutableMapping[str, Any] = {}
|
428
558
|
species_features = self.get_species_features()
|
@@ -488,14 +618,20 @@ class SBML_dfs:
|
|
488
618
|
return stats
|
489
619
|
|
490
620
|
def add_species_data(self, label: str, data: pd.DataFrame):
|
491
|
-
"""
|
492
|
-
|
493
|
-
Args:
|
494
|
-
label (str): the label for the new data
|
495
|
-
data (pd.DataFrame): the data
|
621
|
+
"""
|
622
|
+
Add additional species data with validation.
|
496
623
|
|
497
|
-
|
498
|
-
|
624
|
+
Parameters
|
625
|
+
----------
|
626
|
+
label : str
|
627
|
+
Label for the new data
|
628
|
+
data : pd.DataFrame
|
629
|
+
Data to add, must be indexed by species_id
|
630
|
+
|
631
|
+
Raises
|
632
|
+
------
|
633
|
+
ValueError
|
634
|
+
If the data is invalid or label already exists
|
499
635
|
"""
|
500
636
|
self._validate_species_data(data)
|
501
637
|
if label in self.species_data:
|
@@ -504,15 +640,27 @@ class SBML_dfs:
|
|
504
640
|
)
|
505
641
|
self.species_data[label] = data
|
506
642
|
|
507
|
-
def
|
508
|
-
"""
|
643
|
+
def remove_species_data(self, label: str):
|
644
|
+
"""
|
645
|
+
Remove species data by label.
|
646
|
+
"""
|
647
|
+
self._remove_entity_data(SBML_DFS.SPECIES, label)
|
509
648
|
|
510
|
-
|
511
|
-
|
512
|
-
|
649
|
+
def add_reactions_data(self, label: str, data: pd.DataFrame):
|
650
|
+
"""
|
651
|
+
Add additional reaction data with validation.
|
513
652
|
|
514
|
-
|
515
|
-
|
653
|
+
Parameters
|
654
|
+
----------
|
655
|
+
label : str
|
656
|
+
Label for the new data
|
657
|
+
data : pd.DataFrame
|
658
|
+
Data to add, must be indexed by reaction_id
|
659
|
+
|
660
|
+
Raises
|
661
|
+
------
|
662
|
+
ValueError
|
663
|
+
If the data is invalid or label already exists
|
516
664
|
"""
|
517
665
|
self._validate_reactions_data(data)
|
518
666
|
if label in self.reactions_data:
|
@@ -521,15 +669,28 @@ class SBML_dfs:
|
|
521
669
|
)
|
522
670
|
self.reactions_data[label] = data
|
523
671
|
|
672
|
+
def remove_reactions_data(self, label: str):
|
673
|
+
"""
|
674
|
+
Remove reactions data by label.
|
675
|
+
"""
|
676
|
+
self._remove_entity_data(SBML_DFS.REACTIONS, label)
|
677
|
+
|
524
678
|
def remove_compartmentalized_species(self, sc_ids: Iterable[str]):
|
525
679
|
"""
|
526
|
-
|
527
|
-
|
680
|
+
Remove compartmentalized species and associated reactions.
|
681
|
+
|
682
|
+
Starting with a set of compartmentalized species, determine which reactions
|
683
|
+
should be removed based on their removal. Then remove these reactions,
|
684
|
+
compartmentalized species, and species.
|
528
685
|
|
686
|
+
Parameters
|
687
|
+
----------
|
688
|
+
sc_ids : Iterable[str]
|
689
|
+
IDs of compartmentalized species to remove
|
529
690
|
"""
|
530
691
|
|
531
692
|
# find reactions which should be totally removed since they are losing critical species
|
532
|
-
removed_reactions =
|
693
|
+
removed_reactions = _find_underspecified_reactions_by_scids(self, sc_ids)
|
533
694
|
self.remove_reactions(removed_reactions)
|
534
695
|
|
535
696
|
self._remove_compartmentalized_species(sc_ids)
|
@@ -538,12 +699,16 @@ class SBML_dfs:
|
|
538
699
|
self._remove_unused_species()
|
539
700
|
|
540
701
|
def remove_reactions(self, r_ids: Iterable[str], remove_species: bool = False):
|
541
|
-
"""
|
702
|
+
"""
|
703
|
+
Remove reactions from the model.
|
542
704
|
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
705
|
+
Parameters
|
706
|
+
----------
|
707
|
+
r_ids : Iterable[str]
|
708
|
+
IDs of reactions to remove
|
709
|
+
remove_species : bool, optional
|
710
|
+
Whether to remove species that are no longer part of any reactions,
|
711
|
+
by default False
|
547
712
|
"""
|
548
713
|
# remove corresponding reactions_species
|
549
714
|
self.reaction_species = self.reaction_species.query("r_id not in @r_ids")
|
@@ -559,7 +724,23 @@ class SBML_dfs:
|
|
559
724
|
self._remove_unused_species()
|
560
725
|
|
561
726
|
def validate(self):
|
562
|
-
"""
|
727
|
+
"""
|
728
|
+
Validate the SBML_dfs structure and relationships.
|
729
|
+
|
730
|
+
Checks:
|
731
|
+
- Schema existence
|
732
|
+
- Required tables presence
|
733
|
+
- Individual table structure
|
734
|
+
- Primary key uniqueness
|
735
|
+
- Foreign key relationships
|
736
|
+
- Optional data table validity
|
737
|
+
- Reaction species validity
|
738
|
+
|
739
|
+
Raises
|
740
|
+
------
|
741
|
+
ValueError
|
742
|
+
If any validation check fails
|
743
|
+
"""
|
563
744
|
|
564
745
|
if not hasattr(self, "schema"):
|
565
746
|
raise ValueError("No schema found")
|
@@ -582,61 +763,10 @@ class SBML_dfs:
|
|
582
763
|
)
|
583
764
|
|
584
765
|
# check individual tables
|
585
|
-
|
586
766
|
for table in required_tables:
|
587
|
-
|
588
|
-
table_data = getattr(self, table)
|
589
|
-
|
590
|
-
if not isinstance(table_data, pd.DataFrame):
|
591
|
-
raise ValueError(
|
592
|
-
f"{table} must be a pd.DataFrame, but was a " f"{type(table_data)}"
|
593
|
-
)
|
594
|
-
|
595
|
-
# check index
|
596
|
-
expected_index_name = table_schema["pk"]
|
597
|
-
if table_data.index.name != expected_index_name:
|
598
|
-
raise ValueError(
|
599
|
-
f"the index name for {table} was not the pk: "
|
600
|
-
f"{expected_index_name}"
|
601
|
-
)
|
602
|
-
|
603
|
-
# check that all entries in the index are unique
|
604
|
-
if len(set(table_data.index.tolist())) != table_data.shape[0]:
|
605
|
-
duplicated_pks = table_data.index.value_counts()
|
606
|
-
duplicated_pks = duplicated_pks[duplicated_pks > 1]
|
607
|
-
|
608
|
-
example_duplicates = duplicated_pks.index[
|
609
|
-
0 : min(duplicated_pks.shape[0], 5)
|
610
|
-
]
|
611
|
-
raise ValueError(
|
612
|
-
f"{duplicated_pks.shape[0]} primary keys were "
|
613
|
-
f"duplicated including {', '.join(example_duplicates)}"
|
614
|
-
)
|
615
|
-
|
616
|
-
# check variables
|
617
|
-
expected_vars = set(table_schema["vars"])
|
618
|
-
table_vars = set(list(table_data.columns))
|
619
|
-
|
620
|
-
extra_vars = table_vars.difference(expected_vars)
|
621
|
-
if len(extra_vars) != 0:
|
622
|
-
logger.debug(
|
623
|
-
f"{len(extra_vars)} extra variables were found"
|
624
|
-
f" for {table}: {', '.join(extra_vars)}"
|
625
|
-
)
|
626
|
-
|
627
|
-
missing_vars = expected_vars.difference(table_vars)
|
628
|
-
if len(missing_vars) != 0:
|
629
|
-
raise ValueError(
|
630
|
-
f"Missing {len(missing_vars)} required variables"
|
631
|
-
f" for {table}: {', '.join(missing_vars)}"
|
632
|
-
)
|
633
|
-
|
634
|
-
# check
|
635
|
-
if table_data.shape[0] == 0:
|
636
|
-
raise ValueError(f"{table} contained no entries")
|
767
|
+
self._validate_table(table)
|
637
768
|
|
638
769
|
# check whether pks and fks agree
|
639
|
-
|
640
770
|
pk_df = pd.DataFrame(
|
641
771
|
[{"pk_table": k, "key": v["pk"]} for k, v in self.schema.items()]
|
642
772
|
)
|
@@ -681,7 +811,6 @@ class SBML_dfs:
|
|
681
811
|
)
|
682
812
|
|
683
813
|
# all foreign keys need to match a primary key
|
684
|
-
|
685
814
|
extra_fks = fk_table_keys.difference(pk_table_keys)
|
686
815
|
if len(extra_fks) != 0:
|
687
816
|
raise ValueError(
|
@@ -710,7 +839,19 @@ class SBML_dfs:
|
|
710
839
|
self._validate_reaction_species()
|
711
840
|
|
712
841
|
def validate_and_resolve(self):
|
713
|
-
"""
|
842
|
+
"""
|
843
|
+
Validate and attempt to automatically fix common issues.
|
844
|
+
|
845
|
+
This method iteratively:
|
846
|
+
1. Attempts validation
|
847
|
+
2. If validation fails, tries to resolve the issue
|
848
|
+
3. Repeats until validation passes or issue cannot be resolved
|
849
|
+
|
850
|
+
Raises
|
851
|
+
------
|
852
|
+
ValueError
|
853
|
+
If validation fails and cannot be automatically resolved
|
854
|
+
"""
|
714
855
|
|
715
856
|
current_exception = None
|
716
857
|
validated = False
|
@@ -730,6 +871,85 @@ class SBML_dfs:
|
|
730
871
|
# try to resolve
|
731
872
|
self._attempt_resolve(e)
|
732
873
|
|
874
|
+
def select_species_data(self, species_data_table: str) -> pd.DataFrame:
|
875
|
+
"""
|
876
|
+
Select a species data table from the SBML_dfs object.
|
877
|
+
|
878
|
+
Parameters
|
879
|
+
----------
|
880
|
+
species_data_table : str
|
881
|
+
Name of the species data table to select
|
882
|
+
|
883
|
+
Returns
|
884
|
+
-------
|
885
|
+
pd.DataFrame
|
886
|
+
The selected species data table
|
887
|
+
|
888
|
+
Raises
|
889
|
+
------
|
890
|
+
ValueError
|
891
|
+
If species_data_table is not found
|
892
|
+
"""
|
893
|
+
# Check if species_data_table exists in sbml_dfs.species_data
|
894
|
+
if species_data_table not in self.species_data:
|
895
|
+
raise ValueError(
|
896
|
+
f"species_data_table {species_data_table} not found in sbml_dfs.species_data. "
|
897
|
+
f"Available tables: {self.species_data.keys()}"
|
898
|
+
)
|
899
|
+
|
900
|
+
# Get the species data
|
901
|
+
return self.species_data[species_data_table]
|
902
|
+
|
903
|
+
def _validate_table(self, table: str) -> None:
|
904
|
+
"""
|
905
|
+
Validate a table in this SBML_dfs object against its schema.
|
906
|
+
|
907
|
+
This is an internal method that validates a table that is part of this SBML_dfs
|
908
|
+
object against the schema stored in self.schema.
|
909
|
+
|
910
|
+
Parameters
|
911
|
+
----------
|
912
|
+
table : str
|
913
|
+
Name of the table to validate
|
914
|
+
|
915
|
+
Raises
|
916
|
+
------
|
917
|
+
ValueError
|
918
|
+
If the table does not conform to its schema
|
919
|
+
"""
|
920
|
+
table_schema = self.schema[table]
|
921
|
+
table_data = getattr(self, table)
|
922
|
+
_perform_sbml_dfs_table_validation(table_data, table_schema, table)
|
923
|
+
|
924
|
+
def _remove_entity_data(self, entity_type: str, label: str) -> None:
|
925
|
+
"""
|
926
|
+
Remove data from species_data or reactions_data by table name and label.
|
927
|
+
|
928
|
+
Parameters
|
929
|
+
----------
|
930
|
+
entity_type : str
|
931
|
+
Name of the table to remove data from ('species' or 'reactions')
|
932
|
+
label : str
|
933
|
+
Label of the data to remove
|
934
|
+
|
935
|
+
Notes
|
936
|
+
-----
|
937
|
+
If the label does not exist, a warning will be logged that includes the existing labels.
|
938
|
+
"""
|
939
|
+
if entity_type not in ENTITIES_W_DATA:
|
940
|
+
raise ValueError("table_name must be either 'species' or 'reactions'")
|
941
|
+
|
942
|
+
data_dict = getattr(self, ENTITIES_TO_ENTITY_DATA[entity_type])
|
943
|
+
if label not in data_dict:
|
944
|
+
existing_labels = list(data_dict.keys())
|
945
|
+
logger.warning(
|
946
|
+
f"Label '{label}' not found in {ENTITIES_TO_ENTITY_DATA[entity_type]}. "
|
947
|
+
f"Existing labels: {existing_labels}"
|
948
|
+
)
|
949
|
+
return
|
950
|
+
|
951
|
+
del data_dict[label]
|
952
|
+
|
733
953
|
def _remove_unused_cspecies(self):
|
734
954
|
"""Removes compartmentalized species that are no
|
735
955
|
longer part of any reactions"""
|
@@ -1952,88 +2172,6 @@ def sbml_dfs_from_edgelist(
|
|
1952
2172
|
return sbml_model
|
1953
2173
|
|
1954
2174
|
|
1955
|
-
def find_underspecified_reactions(
|
1956
|
-
sbml_dfs: SBML_dfs, sc_ids: Iterable[str]
|
1957
|
-
) -> set[str]:
|
1958
|
-
"""
|
1959
|
-
Find Underspecified reactions
|
1960
|
-
|
1961
|
-
Identity reactions which should be removed if a set of molecular species are removed
|
1962
|
-
from the system.
|
1963
|
-
|
1964
|
-
Params:
|
1965
|
-
sbml_dfs (SBML_dfs):
|
1966
|
-
A pathway representation
|
1967
|
-
sc_ids (list[str])
|
1968
|
-
A list of compartmentalized species ids (sc_ids) which will be removed.
|
1969
|
-
|
1970
|
-
Returns:
|
1971
|
-
underspecified_reactions (set[str]):
|
1972
|
-
A list of reactions which should be removed because they will not occur once
|
1973
|
-
\"sc_ids\" are removed.
|
1974
|
-
|
1975
|
-
"""
|
1976
|
-
|
1977
|
-
updated_reaction_species = sbml_dfs.reaction_species.copy()
|
1978
|
-
updated_reaction_species["new"] = ~updated_reaction_species[SBML_DFS.SC_ID].isin(
|
1979
|
-
sc_ids
|
1980
|
-
)
|
1981
|
-
|
1982
|
-
updated_reaction_species = (
|
1983
|
-
updated_reaction_species.assign(
|
1984
|
-
sbo_role=updated_reaction_species[SBML_DFS.SBO_TERM]
|
1985
|
-
)
|
1986
|
-
.replace({"sbo_role": MINI_SBO_TO_NAME})
|
1987
|
-
.replace({"sbo_role": SBO_NAME_TO_ROLE})
|
1988
|
-
)
|
1989
|
-
|
1990
|
-
reactions_with_lost_defining_members = set(
|
1991
|
-
updated_reaction_species.query("~new")
|
1992
|
-
.query("sbo_role == 'DEFINING'")[SBML_DFS.R_ID]
|
1993
|
-
.tolist()
|
1994
|
-
)
|
1995
|
-
|
1996
|
-
N_reactions_with_lost_defining_members = len(reactions_with_lost_defining_members)
|
1997
|
-
if N_reactions_with_lost_defining_members > 0:
|
1998
|
-
logger.info(
|
1999
|
-
f"Removing {N_reactions_with_lost_defining_members} reactions which have lost at least one defining species"
|
2000
|
-
)
|
2001
|
-
|
2002
|
-
# for each reaction what are the required sbo_terms?
|
2003
|
-
reactions_with_requirements = (
|
2004
|
-
updated_reaction_species.query("sbo_role == 'REQUIRED'")[
|
2005
|
-
["r_id", "sbo_term", "new"]
|
2006
|
-
]
|
2007
|
-
.drop_duplicates()
|
2008
|
-
.reset_index(drop=True)
|
2009
|
-
)
|
2010
|
-
|
2011
|
-
# which required members are still present after removing some entries
|
2012
|
-
reactions_with_lost_requirements = set(
|
2013
|
-
reactions_with_requirements.query("~new")
|
2014
|
-
.merge(
|
2015
|
-
reactions_with_requirements.query("new").rename(
|
2016
|
-
{"new": "still_present"}, axis=1
|
2017
|
-
),
|
2018
|
-
how="left",
|
2019
|
-
)
|
2020
|
-
.fillna(False)[SBML_DFS.R_ID] # Fill boolean column with False
|
2021
|
-
.tolist()
|
2022
|
-
)
|
2023
|
-
|
2024
|
-
N_reactions_with_lost_requirements = len(reactions_with_lost_requirements)
|
2025
|
-
if N_reactions_with_lost_requirements > 0:
|
2026
|
-
logger.info(
|
2027
|
-
f"Removing {N_reactions_with_lost_requirements} reactions which have lost all required members"
|
2028
|
-
)
|
2029
|
-
|
2030
|
-
underspecified_reactions = reactions_with_lost_defining_members.union(
|
2031
|
-
reactions_with_lost_requirements
|
2032
|
-
)
|
2033
|
-
|
2034
|
-
return underspecified_reactions
|
2035
|
-
|
2036
|
-
|
2037
2175
|
def _sbml_dfs_from_edgelist_validate_inputs(
|
2038
2176
|
interaction_edgelist: pd.DataFrame,
|
2039
2177
|
species_df: pd.DataFrame,
|
@@ -2231,3 +2369,230 @@ def stub_ids(ids):
|
|
2231
2369
|
)
|
2232
2370
|
else:
|
2233
2371
|
return pd.DataFrame(ids)
|
2372
|
+
|
2373
|
+
|
2374
|
+
def add_sbo_role(reaction_species: pd.DataFrame) -> pd.DataFrame:
    """
    Annotate a reaction_species table with an sbo_role column.

    The table is first validated against the reaction_species schema. A copy
    of the sbo_term column is then translated in two passes: values are
    replaced using MINI_SBO_TO_NAME and the results are replaced using
    SBO_NAME_TO_ROLE. Any value which is not one of the roles in
    SBO_NAME_TO_ROLE after both passes is reported with a warning and
    overwritten with SBO_ROLES_DEFS.OPTIONAL.

    Parameters
    ----------
    reaction_species : pd.DataFrame
        A reaction_species table which passes validate_sbml_dfs_table().

    Returns
    -------
    pd.DataFrame
        A new table containing the original columns plus the sbo_role column.
    """

    validate_sbml_dfs_table(reaction_species, SBML_DFS.REACTION_SPECIES)

    role_column = SBO_ROLES_DEFS.SBO_ROLE

    # seed the role column with the raw SBO terms, then translate term -> name -> role
    with_roles = reaction_species.assign(sbo_role=reaction_species[SBML_DFS.SBO_TERM])
    with_roles = with_roles.replace({role_column: MINI_SBO_TO_NAME})
    with_roles = with_roles.replace({role_column: SBO_NAME_TO_ROLE})

    # anything which did not resolve to a known role falls back to OPTIONAL
    known_roles = set(SBO_NAME_TO_ROLE.values())
    undefined_roles = set(with_roles[role_column].unique()) - known_roles
    if undefined_roles:
        logger.warning(
            f"The following SBO roles are not defined: {undefined_roles}. They will be treated as {SBO_ROLES_DEFS.OPTIONAL} when determining reaction operability."
        )
        is_undefined = with_roles[role_column].isin(undefined_roles)
        with_roles.loc[is_undefined, role_column] = SBO_ROLES_DEFS.OPTIONAL

    return with_roles
|
2403
|
+
|
2404
|
+
|
2405
|
+
def find_underspecified_reactions(
    reaction_species_w_roles: pd.DataFrame,
) -> set[str]:
    """
    Find reactions which are underspecified once some members are dropped.

    A reaction is flagged when either:
    - any of its entries with sbo_role 'DEFINING' has ``new == False``, or
    - for some (r_id, sbo_term) group of entries with sbo_role 'REQUIRED',
      no entry has ``new == True``.

    Nothing is removed here; the function only reports the reaction ids
    (the log messages describe the removal callers are expected to perform).

    Parameters
    ----------
    reaction_species_w_roles : pd.DataFrame
        A reaction_species table with an sbo_role column (see add_sbo_role())
        and a boolean "new" column which is True for entries that are retained.

    Returns
    -------
    set[str]
        The ids of the underspecified reactions.

    Raises
    ------
    ValueError
        If the sbo_role or "new" column is missing, or if "new" is not a
        boolean column.
    """

    # check that both sbo_role and "new" are present
    if SBO_ROLES_DEFS.SBO_ROLE not in reaction_species_w_roles.columns:
        raise ValueError(
            "The sbo_role column is not present in the reaction_species_w_roles table. Please call add_sbo_role() first."
        )
    if "new" not in reaction_species_w_roles.columns:
        raise ValueError(
            "The new column is not present in the reaction_species_w_roles table. This should indicate what cspecies would be preserved in the reaction should it be preserved."
        )
    # check that new is a boolean column
    if reaction_species_w_roles["new"].dtype != bool:
        raise ValueError(
            "The new column is not a boolean column. Please ensure that the new column is a boolean column. This should indicate what cspecies would be preserved in the reaction should it be preserved."
        )

    # losing any single defining member is enough to flag the reaction
    reactions_with_lost_defining_members = set(
        reaction_species_w_roles.query("~new")
        .query("sbo_role == 'DEFINING'")[SBML_DFS.R_ID]
        .tolist()
    )

    N_reactions_with_lost_defining_members = len(reactions_with_lost_defining_members)
    if N_reactions_with_lost_defining_members > 0:
        logger.info(
            f"Removing {N_reactions_with_lost_defining_members} reactions which have lost at least one defining species"
        )

    # find the cases where all "new" values for a given (r_id, sbo_term) are False
    reactions_with_lost_requirements = set(
        reaction_species_w_roles
        # drop already filtered reactions
        .query("r_id not in @reactions_with_lost_defining_members")
        .query("sbo_role == 'REQUIRED'")
        # which entries which have some required attribute have all False values for that attribute
        .groupby([SBML_DFS.R_ID, SBML_DFS.SBO_TERM])
        .agg({"new": "any"})
        .query("new == False")
        .index.get_level_values(SBML_DFS.R_ID)
    )

    N_reactions_with_lost_requirements = len(reactions_with_lost_requirements)
    if N_reactions_with_lost_requirements > 0:
        logger.info(
            f"Removing {N_reactions_with_lost_requirements} reactions which have lost all required members"
        )

    underspecified_reactions = reactions_with_lost_defining_members.union(
        reactions_with_lost_requirements
    )

    return underspecified_reactions
|
2460
|
+
|
2461
|
+
|
2462
|
+
def _find_underspecified_reactions_by_scids(
    sbml_dfs: SBML_dfs, sc_ids: Iterable[str]
) -> set[str]:
    """
    Find reactions which become underspecified when a set of sc_ids is removed.

    A copy of the model's reaction_species table is flagged with a boolean
    "new" column (False for entries whose sc_id is among those being
    removed), annotated with add_sbo_role() and passed to
    find_underspecified_reactions().

    Parameters
    ----------
    sbml_dfs : SBML_dfs
        A pathway representation.
    sc_ids : Iterable[str]
        Compartmentalized species ids (sc_ids) which will be removed.

    Returns
    -------
    set[str]
        Reactions which should be removed because they will not occur once
        the sc_ids are removed.
    """

    flagged_reaction_species = sbml_dfs.reaction_species.copy()

    # "new" marks the entries which survive the removal
    is_removed = flagged_reaction_species[SBML_DFS.SC_ID].isin(sc_ids)
    flagged_reaction_species["new"] = ~is_removed

    return find_underspecified_reactions(add_sbo_role(flagged_reaction_species))
|
2493
|
+
|
2494
|
+
|
2495
|
+
def validate_sbml_dfs_table(table_data: pd.DataFrame, table_name: str) -> None:
    """
    Check a standalone table against its entry in SBML_DFS_SCHEMA.

    The schema for ``table_name`` is looked up in SBML_DFS_SCHEMA.SCHEMA and
    the table is checked against it. No SBML_dfs object is needed, so this
    can be used on tables before one is constructed.

    Parameters
    ----------
    table_data : pd.DataFrame
        The table to validate
    table_name : str
        Name of the table in the SBML_dfs schema

    Raises
    ------
    ValueError
        If table_name is not in the schema or the table fails validation
    """
    known_schemas = SBML_DFS_SCHEMA.SCHEMA

    if table_name not in known_schemas:
        valid_tables = ", ".join(known_schemas.keys())
        raise ValueError(
            f"{table_name} is not a valid table name in SBML_DFS_SCHEMA. "
            f"Valid tables are: {valid_tables}"
        )

    _perform_sbml_dfs_table_validation(
        table_data, known_schemas[table_name], table_name
    )
|
2523
|
+
|
2524
|
+
|
2525
|
+
def _perform_sbml_dfs_table_validation(
|
2526
|
+
table_data: pd.DataFrame,
|
2527
|
+
table_schema: dict,
|
2528
|
+
table_name: str,
|
2529
|
+
) -> None:
|
2530
|
+
"""
|
2531
|
+
Core validation logic for SBML_dfs tables.
|
2532
|
+
|
2533
|
+
This function performs the actual validation checks for any table against its schema,
|
2534
|
+
regardless of whether it's part of an SBML_dfs object or standalone.
|
2535
|
+
|
2536
|
+
Parameters
|
2537
|
+
----------
|
2538
|
+
table_data : pd.DataFrame
|
2539
|
+
The table data to validate
|
2540
|
+
table_schema : dict
|
2541
|
+
Schema definition for the table
|
2542
|
+
table_name : str
|
2543
|
+
Name of the table (for error messages)
|
2544
|
+
|
2545
|
+
Raises
|
2546
|
+
------
|
2547
|
+
ValueError
|
2548
|
+
If the table does not conform to its schema:
|
2549
|
+
- Not a DataFrame
|
2550
|
+
- Wrong index name
|
2551
|
+
- Duplicate primary keys
|
2552
|
+
- Missing required variables
|
2553
|
+
- Empty table
|
2554
|
+
"""
|
2555
|
+
if not isinstance(table_data, pd.DataFrame):
|
2556
|
+
raise ValueError(
|
2557
|
+
f"{table_name} must be a pd.DataFrame, but was a {type(table_data)}"
|
2558
|
+
)
|
2559
|
+
|
2560
|
+
# check index
|
2561
|
+
expected_index_name = table_schema["pk"]
|
2562
|
+
if table_data.index.name != expected_index_name:
|
2563
|
+
raise ValueError(
|
2564
|
+
f"the index name for {table_name} was not the pk: {expected_index_name}"
|
2565
|
+
)
|
2566
|
+
|
2567
|
+
# check that all entries in the index are unique
|
2568
|
+
if len(set(table_data.index.tolist())) != table_data.shape[0]:
|
2569
|
+
duplicated_pks = table_data.index.value_counts()
|
2570
|
+
duplicated_pks = duplicated_pks[duplicated_pks > 1]
|
2571
|
+
|
2572
|
+
example_duplicates = duplicated_pks.index[0 : min(duplicated_pks.shape[0], 5)]
|
2573
|
+
raise ValueError(
|
2574
|
+
f"{duplicated_pks.shape[0]} primary keys were duplicated "
|
2575
|
+
f"including {', '.join(example_duplicates)}"
|
2576
|
+
)
|
2577
|
+
|
2578
|
+
# check variables
|
2579
|
+
expected_vars = set(table_schema["vars"])
|
2580
|
+
table_vars = set(list(table_data.columns))
|
2581
|
+
|
2582
|
+
extra_vars = table_vars.difference(expected_vars)
|
2583
|
+
if len(extra_vars) != 0:
|
2584
|
+
logger.debug(
|
2585
|
+
f"{len(extra_vars)} extra variables were found for {table_name}: "
|
2586
|
+
f"{', '.join(extra_vars)}"
|
2587
|
+
)
|
2588
|
+
|
2589
|
+
missing_vars = expected_vars.difference(table_vars)
|
2590
|
+
if len(missing_vars) != 0:
|
2591
|
+
raise ValueError(
|
2592
|
+
f"Missing {len(missing_vars)} required variables for {table_name}: "
|
2593
|
+
f"{', '.join(missing_vars)}"
|
2594
|
+
)
|
2595
|
+
|
2596
|
+
# check for empty table
|
2597
|
+
if table_data.shape[0] == 0:
|
2598
|
+
raise ValueError(f"{table_name} contained no entries")
|