napistu 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff compares the publicly released contents of these two package versions as they appear in their public registry, and is provided for informational purposes only.
- napistu/__main__.py +20 -9
- napistu/consensus.py +19 -25
- napistu/constants.py +90 -64
- napistu/indices.py +3 -1
- napistu/ingestion/sbml.py +298 -295
- napistu/ingestion/string.py +14 -18
- napistu/ingestion/trrust.py +22 -27
- napistu/matching/species.py +1 -1
- napistu/ontologies/genodexito.py +5 -1
- napistu/ontologies/renaming.py +4 -0
- napistu/sbml_dfs_core.py +127 -64
- napistu/sbml_dfs_utils.py +4 -0
- napistu/utils.py +52 -41
- {napistu-0.3.6.dist-info → napistu-0.3.7.dist-info}/METADATA +1 -1
- {napistu-0.3.6.dist-info → napistu-0.3.7.dist-info}/RECORD +27 -27
- tests/conftest.py +70 -13
- tests/test_consensus.py +74 -5
- tests/test_gaps.py +26 -15
- tests/test_network_net_create.py +1 -1
- tests/test_network_precompute.py +1 -1
- tests/test_ontologies_renaming.py +28 -24
- tests/test_sbml_dfs_core.py +165 -15
- tests/test_utils.py +19 -0
- {napistu-0.3.6.dist-info → napistu-0.3.7.dist-info}/WHEEL +0 -0
- {napistu-0.3.6.dist-info → napistu-0.3.7.dist-info}/entry_points.txt +0 -0
- {napistu-0.3.6.dist-info → napistu-0.3.7.dist-info}/licenses/LICENSE +0 -0
- {napistu-0.3.6.dist-info → napistu-0.3.7.dist-info}/top_level.txt +0 -0
napistu/ingestion/string.py
CHANGED
@@ -8,15 +8,11 @@ from napistu import sbml_dfs_core
 from napistu import sbml_dfs_utils
 from napistu import source
 from napistu import utils
+from napistu.ingestion import napistu_edgelist
 from napistu.constants import BQB
 from napistu.constants import MINI_SBO_FROM_NAME
-from napistu.
-from napistu.
-from napistu.ingestion.constants import SBML_SPECIES_DICT_NAME
-from napistu.ingestion.constants import SMBL_REACTION_DICT_IDENTIFIERS
-from napistu.ingestion.constants import SMBL_REACTION_DICT_IS_REVERSIBLE
-from napistu.ingestion.constants import SMBL_REACTION_DICT_NAME
-from napistu.ingestion.constants import SMBL_REACTION_SPEC_SBO_TERM
+from napistu.constants import ONTOLOGIES
+from napistu.constants import SBML_DFS
 from napistu.ingestion.constants import STRING_DOWNSTREAM_COMPARTMENT
 from napistu.ingestion.constants import STRING_DOWNSTREAM_NAME
 from napistu.ingestion.constants import STRING_PROTEIN_ID
@@ -137,10 +133,10 @@ def convert_string_to_sbml_dfs(
 
     # define identifier mapping from aliases to use:
     alias_to_identifier = {
-        "Ensembl_gene": (
-        "Ensembl_transcript": (
-        "Ensembl_translation": (
-        "Ensembl_UniProt_AC": (
+        "Ensembl_gene": (ONTOLOGIES.ENSEMBL_GENE, BQB.IS_ENCODED_BY),
+        "Ensembl_transcript": (ONTOLOGIES.ENSEMBL_TRANSCRIPT, BQB.IS_ENCODED_BY),
+        "Ensembl_translation": (ONTOLOGIES.ENSEMBL_PROTEIN, BQB.IS),
+        "Ensembl_UniProt_AC": (ONTOLOGIES.UNIPROT, BQB.IS),
     }
 
     # filter aliases to only keep required ones
@@ -276,17 +272,17 @@ def _build_species_df(
     species_df = (
         pd.Series(
             list(set(edgelist[source_col]).union(edgelist[target_col])),
-            name=
+            name=SBML_DFS.S_NAME,
         )
         .to_frame()
-        .set_index(
+        .set_index(SBML_DFS.S_NAME, drop=False)
         .apply(
             _get_identifiers,
             alias_to_identifier=alias_to_identifier,
             dat_alias=aliases,
             axis=1,
         )
-        .rename(
+        .rename(SBML_DFS.S_IDENTIFIERS)
         .reset_index()
     )
     return species_df
@@ -312,8 +308,8 @@ def _build_interactor_edgelist(
         **{
             STRING_UPSTREAM_COMPARTMENT: compartment,
             STRING_DOWNSTREAM_COMPARTMENT: compartment,
-
-
+            SBML_DFS.SBO_TERM: sbo_interactor,
+            SBML_DFS.R_IDENTIFIERS: lambda x: identifiers.Identifiers([]),
         }
     )
     if add_reverse_interactions:
@@ -336,10 +332,10 @@ def _build_interactor_edgelist(
     )
 
     interaction_edgelist = dat
-    interaction_edgelist[
+    interaction_edgelist[SBML_DFS.R_NAME] = _build_string_reaction_name(
         dat[STRING_UPSTREAM_NAME], dat[STRING_DOWNSTREAM_NAME]
     )
-    interaction_edgelist[
+    interaction_edgelist[SBML_DFS.R_ISREVERSIBLE] = True
 
     return interaction_edgelist
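The rewritten `alias_to_identifier` mapping also encodes the biology: gene- and transcript-level aliases carry `BQB.IS_ENCODED_BY` (a protein species is encoded by its gene or transcript), while protein-level aliases carry `BQB.IS`. A minimal sketch of how such a mapping is consumed, assuming the `ONTOLOGIES`/`BQB` constants resolve to plain strings:

```python
from napistu.constants import BQB, ONTOLOGIES

# 0.3.7 maps each STRING alias source to an (ontology, qualifier) pair
alias_to_identifier = {
    "Ensembl_gene": (ONTOLOGIES.ENSEMBL_GENE, BQB.IS_ENCODED_BY),
    "Ensembl_transcript": (ONTOLOGIES.ENSEMBL_TRANSCRIPT, BQB.IS_ENCODED_BY),
    "Ensembl_translation": (ONTOLOGIES.ENSEMBL_PROTEIN, BQB.IS),
    "Ensembl_UniProt_AC": (ONTOLOGIES.UNIPROT, BQB.IS),
}

# each alias row contributes one qualified identifier to a species' Identifiers object
ontology, qualifier = alias_to_identifier["Ensembl_gene"]
```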
napistu/ingestion/trrust.py
CHANGED
@@ -8,16 +8,11 @@ from napistu import identifiers
 from napistu import sbml_dfs_core
 from napistu import source
 from napistu import utils
+from napistu.constants import BQB
+from napistu.constants import IDENTIFIERS
 from napistu.constants import MINI_SBO_FROM_NAME
 from napistu.constants import SBOTERM_NAMES
-from napistu.
-from napistu.ingestion.constants import SBML_COMPARTMENT_DICT_NAME
-from napistu.ingestion.constants import SBML_SPECIES_DICT_IDENTIFIERS
-from napistu.ingestion.constants import SBML_SPECIES_DICT_NAME
-from napistu.ingestion.constants import SMBL_REACTION_DICT_IDENTIFIERS
-from napistu.ingestion.constants import SMBL_REACTION_DICT_IS_REVERSIBLE
-from napistu.ingestion.constants import SMBL_REACTION_DICT_NAME
-from napistu.ingestion.constants import SMBL_REACTION_SPEC_SBO_TERM
+from napistu.constants import SBML_DFS
 from napistu.ingestion.constants import SPECIES_FULL_NAME_HUMAN
 from napistu.ingestion.constants import STRING_DOWNSTREAM_COMPARTMENT
 from napistu.ingestion.constants import STRING_DOWNSTREAM_NAME
@@ -81,16 +76,16 @@ def convert_trrust_to_sbml_dfs(
     species_df = (
         pd.DataFrame(
             {
-
+                SBML_DFS.S_NAME: list(
                     {*edge_summaries_df["from"], *edge_summaries_df["to"]}
                 )
             }
         )
         .merge(
-            uniprot_2_symbol.rename({TRRUST_SYMBOL:
+            uniprot_2_symbol.rename({TRRUST_SYMBOL: SBML_DFS.S_NAME}, axis=1),
             how="left",
         )
-        .set_index(
+        .set_index(SBML_DFS.S_NAME)
     )
 
     # create Identifiers objects for all species with uniprot IDs
@@ -106,14 +101,14 @@ def convert_trrust_to_sbml_dfs(
         [
             identifiers.Identifiers(
                 [
-                    identifiers.format_uri(uri=x, biological_qualifier_type=
-                    for x in species_w_ids.loc[[ind]][
+                    identifiers.format_uri(uri=x, biological_qualifier_type=BQB.IS)
+                    for x in species_w_ids.loc[[ind]][IDENTIFIERS.URL].tolist()
                 ]
             )
             for ind in species_w_ids.index.unique()
         ],
         index=species_w_ids.index.unique(),
-    ).rename(
+    ).rename(SBML_DFS.S_IDENTIFIERS)
 
     # just retain s_name and s_Identifiers
     # this just needs a source object which will be added later
@@ -124,21 +119,21 @@ def convert_trrust_to_sbml_dfs(
         .merge(
             species_w_ids_series,
             how="left",
-            left_on=
+            left_on=SBML_DFS.S_NAME,
             right_index=True,
         )
         .reset_index(drop=True)
     )
     # stub genes with missing IDs
-    species_df[
+    species_df[SBML_DFS.S_IDENTIFIERS] = species_df[SBML_DFS.S_IDENTIFIERS].fillna(  # type: ignore
        value=identifiers.Identifiers([])
    )
 
    # define distinct compartments
    compartments_df = pd.DataFrame(
        {
-
-
+            SBML_DFS.C_NAME: TRRUST_COMPARTMENT_NUCLEOPLASM,
+            SBML_DFS.C_IDENTIFIERS: identifiers.Identifiers(
                [
                    identifiers.format_uri(
                        uri=identifiers.create_uri_url(
@@ -159,7 +154,7 @@ def convert_trrust_to_sbml_dfs(
         upstream_compartment=TRRUST_COMPARTMENT_NUCLEOPLASM,
         downstream_compartment=TRRUST_COMPARTMENT_NUCLEOPLASM,
     )
-    gene_gene_identifier_edgelist[
+    gene_gene_identifier_edgelist[SBML_DFS.R_NAME] = [
         f"{x} {y} of {z}"
         for x, y, z in zip(
             gene_gene_identifier_edgelist[STRING_UPSTREAM_NAME],
@@ -171,15 +166,15 @@ def convert_trrust_to_sbml_dfs(
     # convert relationships to SBO terms
     interaction_edgelist = gene_gene_identifier_edgelist.replace(
         {"sign": MINI_SBO_FROM_NAME}
-    ).rename({"sign":
+    ).rename({"sign": SBML_DFS.SBO_TERM}, axis=1)
 
     # format pubmed identifiers of interactions
-    interaction_edgelist[
+    interaction_edgelist[SBML_DFS.R_IDENTIFIERS] = [
         _format_pubmed_for_interactions(x) for x in interaction_edgelist["reference"]
     ]
 
     # directionality: by default, set r_isreversible to False for TRRUST data
-    interaction_edgelist[
+    interaction_edgelist[SBML_DFS.R_ISREVERSIBLE] = False
 
     # reduce to essential variables
     interaction_edgelist = interaction_edgelist[
@@ -188,10 +183,10 @@ def convert_trrust_to_sbml_dfs(
             STRING_DOWNSTREAM_NAME,
             STRING_UPSTREAM_COMPARTMENT,
             STRING_DOWNSTREAM_COMPARTMENT,
-
-
-
-
+            SBML_DFS.R_NAME,
+            SBML_DFS.SBO_TERM,
+            SBML_DFS.R_IDENTIFIERS,
+            SBML_DFS.R_ISREVERSIBLE,
         ]
     ]
 
@@ -277,7 +272,7 @@ def _format_pubmed_for_interactions(pubmed_set):
         url = identifiers.create_uri_url(ontology="pubmed", identifier=p, strict=False)
         if url is not None:
             valid_url = identifiers.format_uri(
-                uri=url, biological_qualifier_type=
+                uri=url, biological_qualifier_type=BQB.IS_DESCRIBED_BY
             )
 
         ids.append(valid_url)
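`_format_pubmed_for_interactions` now spells out `BQB.IS_DESCRIBED_BY` rather than an imported alias. A minimal sketch of the reference-formatting path, using only the calls visible in this diff (the PubMed ID is a placeholder):

```python
from napistu import identifiers
from napistu.constants import BQB

# build a qualified PubMed reference for a regulatory interaction
url = identifiers.create_uri_url(ontology="pubmed", identifier="12345678", strict=False)
if url is not None:
    valid_url = identifiers.format_uri(
        uri=url, biological_qualifier_type=BQB.IS_DESCRIBED_BY
    )
```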
napistu/matching/species.py
CHANGED
@@ -33,7 +33,7 @@ def features_to_pathway_species(
         pd.Dataframe containing a "feature_identifiers_var" variable used to match entries
     species_identifiers: pd.DataFrame
         A table of molecular species identifiers produced from sbml_dfs.get_identifiers("species")
-        generally using
+        generally using sbml_dfs.export_sbml_dfs()
     ontologies: set
         A set of ontologies used to match features to pathway species
     feature_identifiers_var: str
napistu/ontologies/genodexito.py
CHANGED
@@ -356,7 +356,7 @@ class Genodexito:
             )
             logger.debug(
                 f"{ids.shape[0] - expanded_ids.shape[0]} "
-                "ids are not included in expanded ids"
+                "ids are not included in expanded ids. These will be filled with empty Identifiers"
             )
         else:
             matched_expanded_ids = expanded_ids
@@ -364,6 +364,10 @@ class Genodexito:
         updated_ids = ids.drop(SBML_DFS.S_IDENTIFIERS, axis=1).join(
             pd.DataFrame(matched_expanded_ids)
         )
+        # fill missing attributes with empty Identifiers
+        updated_ids[SBML_DFS.S_IDENTIFIERS] = updated_ids[
+            SBML_DFS.S_IDENTIFIERS
+        ].fillna(identifiers.Identifiers([]))
 
         setattr(sbml_dfs, "species", updated_ids)
 
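This guard (mirrored in `napistu/ontologies/renaming.py` below) matters because the identifier update is a join: species that matched nothing in `matched_expanded_ids` would otherwise carry `NaN` in `s_identifiers`, which the new `SBML_dfs._validate_identifiers()` check rejects. A small self-contained sketch of the failure mode and the fix (the species names are illustrative):

```python
import pandas as pd

from napistu import identifiers
from napistu.constants import SBML_DFS

# after a join, species with no expanded identifiers end up as NaN
species = pd.DataFrame({SBML_DFS.S_NAME: ["TP53", "MYC"]})
expanded = pd.DataFrame(
    {SBML_DFS.S_IDENTIFIERS: [identifiers.Identifiers([])]}, index=[0]
)
updated = species.join(expanded)  # row 1 (MYC) now has a NaN s_identifiers entry

# the 0.3.7 fix: stub unmatched species with an empty Identifiers object
updated[SBML_DFS.S_IDENTIFIERS] = updated[SBML_DFS.S_IDENTIFIERS].fillna(
    identifiers.Identifiers([])
)
```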
napistu/ontologies/renaming.py
CHANGED
@@ -72,6 +72,10 @@ def rename_species_ontologies(
     updated_species = sbml_dfs.species.drop(SBML_DFS.S_IDENTIFIERS, axis=1).join(
         pd.DataFrame(species_identifiers)
     )
+    # fill missing attributes with empty Identifiers
+    updated_species[SBML_DFS.S_IDENTIFIERS] = updated_species[
+        SBML_DFS.S_IDENTIFIERS
+    ].fillna(identifiers.Identifiers([]))
 
     setattr(sbml_dfs, "species", updated_species)
 
napistu/sbml_dfs_core.py
CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import copy
 import logging
 import re
 from typing import Any
@@ -64,6 +65,8 @@ class SBML_dfs:
         Add a new reactions data table to the model with validation.
     add_species_data(label, data)
         Add a new species data table to the model with validation.
+    copy()
+        Return a deep copy of the SBML_dfs object.
     export_sbml_dfs(model_prefix, outdir, overwrite=False, dogmatic=True)
         Export the SBML_dfs model and its tables to files in a specified directory.
     get_characteristic_species_ids(dogmatic=True)
@@ -114,7 +117,6 @@ class SBML_dfs:
     Private/Hidden Methods (alphabetical, appear after public methods)
     -----------------------------------------------------------------
     _attempt_resolve(e)
-    _check_pk_fk_correspondence()
     _find_underspecified_reactions_by_scids(sc_ids)
     _get_unused_cspecies()
     _get_unused_species()
@@ -123,9 +125,12 @@ class SBML_dfs:
     _remove_species(s_ids)
     _remove_unused_cspecies()
     _remove_unused_species()
+    _validate_identifiers()
+    _validate_pk_fk_correspondence()
     _validate_r_ids(r_ids)
     _validate_reaction_species()
     _validate_reactions_data(reactions_data_table)
+    _validate_sources()
     _validate_species_data(species_data_table)
     _validate_table(table_name)
     """
@@ -255,6 +260,17 @@ class SBML_dfs:
         )
         self.species_data[label] = data
 
+    def copy(self):
+        """
+        Return a deep copy of the SBML_dfs object.
+
+        Returns
+        -------
+        SBML_dfs
+            A deep copy of the current SBML_dfs object.
+        """
+        return copy.deepcopy(self)
+
     def export_sbml_dfs(
         self,
         model_prefix: str,
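`copy()` is a thin wrapper over `copy.deepcopy`, so mutating the copy leaves the original model untouched. A hedged usage sketch, assuming an existing `SBML_dfs` instance named `sbml_dfs` with more than ten species:

```python
# work on a throwaway copy; the original tables are unaffected
working = sbml_dfs.copy()
working.species = working.species.head(10)
assert len(working.species) != len(sbml_dfs.species)
```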
@@ -440,7 +456,7 @@
             If id_type is invalid or identifiers are malformed
         """
         selected_table = self.get_table(id_type, {"id"})
-        schema =
+        schema = SBML_DFS_SCHEMA.SCHEMA
 
         identifiers_dict = dict()
         for sysid in selected_table.index:
@@ -458,6 +474,7 @@
         if not identifiers_dict:
             # Return empty DataFrame with expected columns if nothing found
             return pd.DataFrame(columns=[schema[id_type]["pk"], "entry"])
+
         identifiers_tbl = pd.concat(identifiers_dict)
 
         identifiers_tbl.index.names = [schema[id_type]["pk"], "entry"]
@@ -1382,7 +1399,7 @@
         self._validate_table(table)
 
         # check whether pks and fks agree
-        self.
+        self._validate_pk_fk_correspondence()
 
         # check optional data tables:
         for k, v in self.species_data.items():
@@ -1400,6 +1417,10 @@
         # validate reaction_species sbo_terms and stoi
         self._validate_reaction_species()
 
+        # validate identifiers and sources
+        self._validate_identifiers()
+        self._validate_sources()
+
     def validate_and_resolve(self):
         """
         Validate and attempt to automatically fix common issues.
@@ -1455,67 +1476,6 @@
             )
             raise e
 
-    def _check_pk_fk_correspondence(self):
-        """
-        Check whether primary keys and foreign keys agree for all tables in the schema.
-        Raises ValueError if any correspondence fails.
-        """
-
-        pk_df = pd.DataFrame(
-            [{"pk_table": k, "key": v["pk"]} for k, v in self.schema.items()]
-        )
-
-        fk_df = (
-            pd.DataFrame(
-                [
-                    {"fk_table": k, "fk": v["fk"]}
-                    for k, v in self.schema.items()
-                    if "fk" in v.keys()
-                ]
-            )
-            .set_index("fk_table")["fk"]
-            .apply(pd.Series)
-            .reset_index()
-            .melt(id_vars="fk_table")
-            .drop(["variable"], axis=1)
-            .rename(columns={"value": "key"})
-        )
-
-        pk_fk_correspondences = pk_df.merge(fk_df)
-
-        for i in range(0, pk_fk_correspondences.shape[0]):
-            pk_table_keys = set(
-                getattr(self, pk_fk_correspondences["pk_table"][i]).index.tolist()
-            )
-            if None in pk_table_keys:
-                raise ValueError(
-                    f"{pk_fk_correspondences['pk_table'][i]} had "
-                    "missing values in its index"
-                )
-
-            fk_table_keys = set(
-                getattr(self, pk_fk_correspondences["fk_table"][i]).loc[
-                    :, pk_fk_correspondences["key"][i]
-                ]
-            )
-            if None in fk_table_keys:
-                raise ValueError(
-                    f"{pk_fk_correspondences['fk_table'][i]} included "
-                    f"missing {pk_fk_correspondences['key'][i]} values"
-                )
-
-            # all foreign keys need to match a primary key
-            extra_fks = fk_table_keys.difference(pk_table_keys)
-            if len(extra_fks) != 0:
-                raise ValueError(
-                    f"{len(extra_fks)} distinct "
-                    f"{pk_fk_correspondences['key'][i]} values were"
-                    f" found in {pk_fk_correspondences['fk_table'][i]} "
-                    f"but missing from {pk_fk_correspondences['pk_table'][i]}."
-                    " All foreign keys must have a matching primary key.\n\n"
-                    f"Extra key are: {', '.join(extra_fks)}"
-                )
-
     def _find_underspecified_reactions_by_scids(
         self, sc_ids: Iterable[str]
     ) -> set[str]:
@@ -1640,6 +1600,88 @@ class SBML_dfs:
         s_ids = self._get_unused_species()
         self._remove_species(s_ids)
 
+    def _validate_identifiers(self):
+        """
+        Validate identifiers in the model
+
+        Iterates through all tables and checks if the identifier columns are valid.
+
+        Raises:
+            ValueError: missing identifiers in the table
+        """
+
+        SCHEMA = SBML_DFS_SCHEMA.SCHEMA
+        for table in SBML_DFS_SCHEMA.SCHEMA.keys():
+            if "id" not in SCHEMA[table].keys():
+                continue
+            id_series = self.get_table(table)[SCHEMA[table]["id"]]
+            if id_series.isna().sum() > 0:
+                missing_ids = id_series[id_series.isna()].index
+                raise ValueError(
+                    f"{table} has {len(missing_ids)} missing ids: {missing_ids}"
+                )
+
+    def _validate_pk_fk_correspondence(self):
+        """
+        Check whether primary keys and foreign keys agree for all tables in the schema.
+        Raises ValueError if any correspondence fails.
+        """
+
+        pk_df = pd.DataFrame(
+            [{"pk_table": k, "key": v["pk"]} for k, v in self.schema.items()]
+        )
+
+        fk_df = (
+            pd.DataFrame(
+                [
+                    {"fk_table": k, "fk": v["fk"]}
+                    for k, v in self.schema.items()
+                    if "fk" in v.keys()
+                ]
+            )
+            .set_index("fk_table")["fk"]
+            .apply(pd.Series)
+            .reset_index()
+            .melt(id_vars="fk_table")
+            .drop(["variable"], axis=1)
+            .rename(columns={"value": "key"})
+        )
+
+        pk_fk_correspondences = pk_df.merge(fk_df)
+
+        for i in range(0, pk_fk_correspondences.shape[0]):
+            pk_table_keys = set(
+                getattr(self, pk_fk_correspondences["pk_table"][i]).index.tolist()
+            )
+            if None in pk_table_keys:
+                raise ValueError(
+                    f"{pk_fk_correspondences['pk_table'][i]} had "
+                    "missing values in its index"
+                )
+
+            fk_table_keys = set(
+                getattr(self, pk_fk_correspondences["fk_table"][i]).loc[
+                    :, pk_fk_correspondences["key"][i]
+                ]
+            )
+            if None in fk_table_keys:
+                raise ValueError(
+                    f"{pk_fk_correspondences['fk_table'][i]} included "
+                    f"missing {pk_fk_correspondences['key'][i]} values"
+                )
+
+            # all foreign keys need to match a primary key
+            extra_fks = fk_table_keys.difference(pk_table_keys)
+            if len(extra_fks) != 0:
+                raise ValueError(
+                    f"{len(extra_fks)} distinct "
+                    f"{pk_fk_correspondences['key'][i]} values were"
+                    f" found in {pk_fk_correspondences['fk_table'][i]} "
+                    f"but missing from {pk_fk_correspondences['pk_table'][i]}."
+                    " All foreign keys must have a matching primary key.\n\n"
+                    f"Extra key are: {', '.join(extra_fks)}"
+                )
+
     def _validate_r_ids(self, r_ids: Optional[Union[str, list[str]]]) -> list[str]:
 
         if isinstance(r_ids, str):
@@ -1694,6 +1736,27 @@ class SBML_dfs:
         """
         sbml_dfs_utils._validate_matching_data(reactions_data_table, self.reactions)
 
+    def _validate_sources(self):
+        """
+        Validate sources in the model
+
+        Iterates through all tables and checks if the source columns are valid.
+
+        Raises:
+            ValueError: missing sources in the table
+        """
+
+        SCHEMA = SBML_DFS_SCHEMA.SCHEMA
+        for table in SBML_DFS_SCHEMA.SCHEMA.keys():
+            if "source" not in SCHEMA[table].keys():
+                continue
+            source_series = self.get_table(table)[SCHEMA[table]["source"]]
+            if source_series.isna().sum() > 0:
+                missing_sources = source_series[source_series.isna()].index
+                raise ValueError(
+                    f"{table} has {len(missing_sources)} missing sources: {missing_sources}"
+                )
+
     def _validate_species_data(self, species_data_table: pd.DataFrame):
         """Validates species data attribute
 
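With `_validate_identifiers()` and `_validate_sources()` wired into `validate()`, models carrying `NaN` identifier or source entries now fail fast instead of erroring downstream. A hedged sketch of what callers see, again assuming an `SBML_dfs` instance `sbml_dfs` (the ids in the message are illustrative):

```python
try:
    sbml_dfs.validate()
except ValueError as e:
    # e.g. "species has 2 missing ids: Index(['S00001', 'S00042'], dtype='object')"
    print(e)
```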
napistu/sbml_dfs_utils.py
CHANGED
@@ -559,6 +559,10 @@ def unnest_identifiers(id_table: pd.DataFrame, id_var: str) -> pd.DataFrame:
 
     N_invalid_ids = sum(id_table[id_var].isna())
     if N_invalid_ids != 0:
+
+        print("Rows with missing identifiers:")
+        print(id_table.loc[id_table[id_var].isna(), id_var])
+
         raise ValueError(
             f'{N_invalid_ids} entries in "id_table" were missing',
             "entries with no identifiers should still include an Identifiers object",
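The added diagnostic is plain pandas: before raising, `unnest_identifiers` prints the offending rows so the failing entries can be located. The same mask can be reproduced standalone (the frame below is illustrative, with `object()` standing in for an `Identifiers` instance):

```python
import pandas as pd

id_table = pd.DataFrame(
    {"s_id": ["S1", "S2", "S3"], "s_Identifiers": [object(), None, None]}
)
id_var = "s_Identifiers"

# rows whose Identifiers entry is missing; upstream code should have stubbed
# these with an empty Identifiers object
print(id_table.loc[id_table[id_var].isna(), id_var])
```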
napistu/utils.py
CHANGED
@@ -810,50 +810,15 @@ def drop_extra_cols(
     return df_out.loc[:, ordered_cols]
 
 
-def
-    left_df: pd.DataFrame, right_df: pd.DataFrame, merge_context: str, **merge_kwargs
-) -> pd.DataFrame:
+def update_pathological_names(names: pd.Series, prefix: str) -> pd.Series:
     """
-
-
-    Parameters
-    ----------
-    left_df : pd.DataFrame
-        Left DataFrame for merge
-    right_df : pd.DataFrame
-        Right DataFrame for merge
-    merge_context : str
-        Description of the merge operation for logging
-    **merge_kwargs : dict
-        Additional keyword arguments passed to pd.merge
+    Update pathological names in a pandas Series.
 
-
-    -------
-    pd.DataFrame
-        Merged DataFrame with overwritten columns removed
+    Add a prefix to the names if they are all numeric.
     """
-
-
-
-    # Ensure we're using the correct suffixes
-    merge_kwargs["suffixes"] = ("_old", "")
-
-    # Perform merge
-    merged_df = pd.merge(left_df, right_df, **merge_kwargs)
-
-    # Check for and log any overwritten columns
-    new_cols = merged_df.columns.tolist()
-    overwritten_cols = [col for col in original_cols if col + "_old" in new_cols]
-    if overwritten_cols:
-        logger.warning(
-            f"The following columns were overwritten during {merge_context} merge and their original values "
-            f"have been suffixed with '_old': {', '.join(overwritten_cols)}"
-        )
-        # Drop the old columns
-        cols_to_drop = [col + "_old" for col in overwritten_cols]
-        merged_df = merged_df.drop(columns=cols_to_drop)
-
-    return merged_df
+    if names.apply(lambda x: x.isdigit()).all():
+        names = names.apply(lambda x: f"{prefix}{x}")
+    return names
 
 
 def format_identifiers_as_edgelist(
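`update_pathological_names` replaces `_merge_and_log_overwrites` at this position (the merge helper moves to the bottom of the module, below). Its behavior follows directly from the body shown: the prefix is applied only when every name is numeric. An illustrative check:

```python
import pandas as pd

from napistu.utils import update_pathological_names

# all-numeric names are prefixed so they read as labels rather than positions
update_pathological_names(pd.Series(["1", "2", "3"]), "reaction_")
# -> reaction_1, reaction_2, reaction_3

# if any name is non-numeric, the series is returned unchanged
update_pathological_names(pd.Series(["1", "TP53"]), "reaction_")
# -> 1, TP53
```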
@@ -1108,3 +1073,49 @@ def _add_nameness_score(df, name_var):
 
     df.loc[:, "nameness_score"] = df[name_var].apply(score_nameness)
     return df
+
+
+def _merge_and_log_overwrites(
+    left_df: pd.DataFrame, right_df: pd.DataFrame, merge_context: str, **merge_kwargs
+) -> pd.DataFrame:
+    """
+    Merge two DataFrames and log any column overwrites.
+
+    Parameters
+    ----------
+    left_df : pd.DataFrame
+        Left DataFrame for merge
+    right_df : pd.DataFrame
+        Right DataFrame for merge
+    merge_context : str
+        Description of the merge operation for logging
+    **merge_kwargs : dict
+        Additional keyword arguments passed to pd.merge
+
+    Returns
+    -------
+    pd.DataFrame
+        Merged DataFrame with overwritten columns removed
+    """
+    # Track original columns
+    original_cols = left_df.columns.tolist()
+
+    # Ensure we're using the correct suffixes
+    merge_kwargs["suffixes"] = ("_old", "")
+
+    # Perform merge
+    merged_df = pd.merge(left_df, right_df, **merge_kwargs)
+
+    # Check for and log any overwritten columns
+    new_cols = merged_df.columns.tolist()
+    overwritten_cols = [col for col in original_cols if col + "_old" in new_cols]
+    if overwritten_cols:
+        logger.warning(
+            f"The following columns were overwritten during {merge_context} merge and their original values "
+            f"have been suffixed with '_old': {', '.join(overwritten_cols)}"
+        )
+        # Drop the old columns
+        cols_to_drop = [col + "_old" for col in overwritten_cols]
+        merged_df = merged_df.drop(columns=cols_to_drop)
+
+    return merged_df
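The relocated `_merge_and_log_overwrites` forces `suffixes=("_old", "")` so right-hand columns win any collision, then logs and drops the shadowed left-hand copies. A small usage sketch (it is a private helper, so the import is for illustration only):

```python
import pandas as pd

from napistu.utils import _merge_and_log_overwrites

left = pd.DataFrame({"s_id": ["S1"], "label": ["old"]})
right = pd.DataFrame({"s_id": ["S1"], "label": ["new"]})

# "label" collides: a warning is logged and the "label_old" copy is dropped
merged = _merge_and_log_overwrites(left, right, "example", on="s_id")
print(merged)  # the surviving label column holds "new"
```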