napistu 0.3.6__py3-none-any.whl → 0.3.7__py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
@@ -8,15 +8,11 @@ from napistu import sbml_dfs_core
 from napistu import sbml_dfs_utils
 from napistu import source
 from napistu import utils
+from napistu.ingestion import napistu_edgelist
 from napistu.constants import BQB
 from napistu.constants import MINI_SBO_FROM_NAME
-from napistu.ingestion import napistu_edgelist
-from napistu.ingestion.constants import SBML_SPECIES_DICT_IDENTIFIERS
-from napistu.ingestion.constants import SBML_SPECIES_DICT_NAME
-from napistu.ingestion.constants import SMBL_REACTION_DICT_IDENTIFIERS
-from napistu.ingestion.constants import SMBL_REACTION_DICT_IS_REVERSIBLE
-from napistu.ingestion.constants import SMBL_REACTION_DICT_NAME
-from napistu.ingestion.constants import SMBL_REACTION_SPEC_SBO_TERM
+from napistu.constants import ONTOLOGIES
+from napistu.constants import SBML_DFS
 from napistu.ingestion.constants import STRING_DOWNSTREAM_COMPARTMENT
 from napistu.ingestion.constants import STRING_DOWNSTREAM_NAME
 from napistu.ingestion.constants import STRING_PROTEIN_ID
@@ -137,10 +133,10 @@ def convert_string_to_sbml_dfs(
 
     # define identifier mapping from aliases to use:
     alias_to_identifier = {
-        "Ensembl_gene": ("ensembl_gene", BQB.IS_ENCODED_BY),
-        "Ensembl_transcript": ("ensembl_transcript", BQB.IS_ENCODED_BY),
-        "Ensembl_translation": ("ensembl_protein", BQB.IS),
-        "Ensembl_UniProt_AC": ("uniprot", BQB.IS),
+        "Ensembl_gene": (ONTOLOGIES.ENSEMBL_GENE, BQB.IS_ENCODED_BY),
+        "Ensembl_transcript": (ONTOLOGIES.ENSEMBL_TRANSCRIPT, BQB.IS_ENCODED_BY),
+        "Ensembl_translation": (ONTOLOGIES.ENSEMBL_PROTEIN, BQB.IS),
+        "Ensembl_UniProt_AC": (ONTOLOGIES.UNIPROT, BQB.IS),
     }
 
     # filter aliases to only keep required ones
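This release replaces the ad hoc SBML_*/SMBL_* string constants (note the "SMBL" typo in the removed names) with shared namespaces in napistu.constants. A minimal sketch of what those namespaces might contain, inferred from the string literals visible in the removed lines; the real definitions in napistu.constants may differ:

```python
# Hypothetical sketch only: the ONTOLOGIES values are taken from the removed
# literals above; the SBML_DFS column names are inferred from comments and
# code elsewhere in this diff and may not match napistu.constants exactly.
from types import SimpleNamespace

ONTOLOGIES = SimpleNamespace(
    ENSEMBL_GENE="ensembl_gene",
    ENSEMBL_TRANSCRIPT="ensembl_transcript",
    ENSEMBL_PROTEIN="ensembl_protein",
    UNIPROT="uniprot",
)

SBML_DFS = SimpleNamespace(
    S_NAME="s_name",                  # species name column
    S_IDENTIFIERS="s_Identifiers",    # species Identifiers column
    R_NAME="r_name",                  # reaction name column
    R_IDENTIFIERS="r_Identifiers",    # reaction Identifiers column
    R_ISREVERSIBLE="r_isreversible",  # reaction reversibility flag
    SBO_TERM="sbo_term",              # SBO term column
)
```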
@@ -276,17 +272,17 @@ def _build_species_df(
     species_df = (
         pd.Series(
             list(set(edgelist[source_col]).union(edgelist[target_col])),
-            name=SBML_SPECIES_DICT_NAME,
+            name=SBML_DFS.S_NAME,
         )
         .to_frame()
-        .set_index(SBML_SPECIES_DICT_NAME, drop=False)
+        .set_index(SBML_DFS.S_NAME, drop=False)
         .apply(
             _get_identifiers,
             alias_to_identifier=alias_to_identifier,
             dat_alias=aliases,
             axis=1,
         )
-        .rename(SBML_SPECIES_DICT_IDENTIFIERS)
+        .rename(SBML_DFS.S_IDENTIFIERS)
         .reset_index()
     )
     return species_df
@@ -312,8 +308,8 @@ def _build_interactor_edgelist(
         **{
             STRING_UPSTREAM_COMPARTMENT: compartment,
             STRING_DOWNSTREAM_COMPARTMENT: compartment,
-            SMBL_REACTION_SPEC_SBO_TERM: sbo_interactor,
-            SMBL_REACTION_DICT_IDENTIFIERS: lambda x: identifiers.Identifiers([]),
+            SBML_DFS.SBO_TERM: sbo_interactor,
+            SBML_DFS.R_IDENTIFIERS: lambda x: identifiers.Identifiers([]),
         }
     )
     if add_reverse_interactions:
@@ -336,10 +332,10 @@ def _build_interactor_edgelist(
     )
 
     interaction_edgelist = dat
-    interaction_edgelist[SMBL_REACTION_DICT_NAME] = _build_string_reaction_name(
+    interaction_edgelist[SBML_DFS.R_NAME] = _build_string_reaction_name(
         dat[STRING_UPSTREAM_NAME], dat[STRING_DOWNSTREAM_NAME]
     )
-    interaction_edgelist[SMBL_REACTION_DICT_IS_REVERSIBLE] = True
+    interaction_edgelist[SBML_DFS.R_ISREVERSIBLE] = True
 
     return interaction_edgelist
 
@@ -8,16 +8,11 @@ from napistu import identifiers
 from napistu import sbml_dfs_core
 from napistu import source
 from napistu import utils
+from napistu.constants import BQB
+from napistu.constants import IDENTIFIERS
 from napistu.constants import MINI_SBO_FROM_NAME
 from napistu.constants import SBOTERM_NAMES
-from napistu.ingestion.constants import SBML_COMPARTMENT_DICT_IDENTIFIERS
-from napistu.ingestion.constants import SBML_COMPARTMENT_DICT_NAME
-from napistu.ingestion.constants import SBML_SPECIES_DICT_IDENTIFIERS
-from napistu.ingestion.constants import SBML_SPECIES_DICT_NAME
-from napistu.ingestion.constants import SMBL_REACTION_DICT_IDENTIFIERS
-from napistu.ingestion.constants import SMBL_REACTION_DICT_IS_REVERSIBLE
-from napistu.ingestion.constants import SMBL_REACTION_DICT_NAME
-from napistu.ingestion.constants import SMBL_REACTION_SPEC_SBO_TERM
+from napistu.constants import SBML_DFS
 from napistu.ingestion.constants import SPECIES_FULL_NAME_HUMAN
 from napistu.ingestion.constants import STRING_DOWNSTREAM_COMPARTMENT
 from napistu.ingestion.constants import STRING_DOWNSTREAM_NAME
@@ -81,16 +76,16 @@ def convert_trrust_to_sbml_dfs(
     species_df = (
         pd.DataFrame(
             {
-                SBML_SPECIES_DICT_NAME: list(
+                SBML_DFS.S_NAME: list(
                     {*edge_summaries_df["from"], *edge_summaries_df["to"]}
                 )
             }
         )
         .merge(
-            uniprot_2_symbol.rename({TRRUST_SYMBOL: SBML_SPECIES_DICT_NAME}, axis=1),
+            uniprot_2_symbol.rename({TRRUST_SYMBOL: SBML_DFS.S_NAME}, axis=1),
             how="left",
         )
-        .set_index(SBML_SPECIES_DICT_NAME)
+        .set_index(SBML_DFS.S_NAME)
     )
 
     # create Identifiers objects for all species with uniprot IDs
@@ -106,14 +101,14 @@ def convert_trrust_to_sbml_dfs(
         [
             identifiers.Identifiers(
                 [
-                    identifiers.format_uri(uri=x, biological_qualifier_type="BQB_IS")
-                    for x in species_w_ids.loc[[ind]]["url"].tolist()
+                    identifiers.format_uri(uri=x, biological_qualifier_type=BQB.IS)
+                    for x in species_w_ids.loc[[ind]][IDENTIFIERS.URL].tolist()
                 ]
             )
             for ind in species_w_ids.index.unique()
        ],
        index=species_w_ids.index.unique(),
-    ).rename(SBML_SPECIES_DICT_IDENTIFIERS)
+    ).rename(SBML_DFS.S_IDENTIFIERS)
 
     # just retain s_name and s_Identifiers
     # this just needs a source object which will be added later
@@ -124,21 +119,21 @@ def convert_trrust_to_sbml_dfs(
         .merge(
             species_w_ids_series,
             how="left",
-            left_on=SBML_SPECIES_DICT_NAME,
+            left_on=SBML_DFS.S_NAME,
             right_index=True,
         )
         .reset_index(drop=True)
     )
     # stub genes with missing IDs
-    species_df[SBML_SPECIES_DICT_IDENTIFIERS] = species_df[SBML_SPECIES_DICT_IDENTIFIERS].fillna( # type: ignore
+    species_df[SBML_DFS.S_IDENTIFIERS] = species_df[SBML_DFS.S_IDENTIFIERS].fillna( # type: ignore
         value=identifiers.Identifiers([])
     )
 
     # define distinct compartments
     compartments_df = pd.DataFrame(
         {
-            SBML_COMPARTMENT_DICT_NAME: TRRUST_COMPARTMENT_NUCLEOPLASM,
-            SBML_COMPARTMENT_DICT_IDENTIFIERS: identifiers.Identifiers(
+            SBML_DFS.C_NAME: TRRUST_COMPARTMENT_NUCLEOPLASM,
+            SBML_DFS.C_IDENTIFIERS: identifiers.Identifiers(
                 [
                     identifiers.format_uri(
                         uri=identifiers.create_uri_url(
@@ -159,7 +154,7 @@ def convert_trrust_to_sbml_dfs(
         upstream_compartment=TRRUST_COMPARTMENT_NUCLEOPLASM,
         downstream_compartment=TRRUST_COMPARTMENT_NUCLEOPLASM,
     )
-    gene_gene_identifier_edgelist[SMBL_REACTION_DICT_NAME] = [
+    gene_gene_identifier_edgelist[SBML_DFS.R_NAME] = [
         f"{x} {y} of {z}"
         for x, y, z in zip(
             gene_gene_identifier_edgelist[STRING_UPSTREAM_NAME],
@@ -171,15 +166,15 @@ def convert_trrust_to_sbml_dfs(
     # convert relationships to SBO terms
     interaction_edgelist = gene_gene_identifier_edgelist.replace(
         {"sign": MINI_SBO_FROM_NAME}
-    ).rename({"sign": SMBL_REACTION_SPEC_SBO_TERM}, axis=1)
+    ).rename({"sign": SBML_DFS.SBO_TERM}, axis=1)
 
     # format pubmed identifiers of interactions
-    interaction_edgelist[SMBL_REACTION_DICT_IDENTIFIERS] = [
+    interaction_edgelist[SBML_DFS.R_IDENTIFIERS] = [
         _format_pubmed_for_interactions(x) for x in interaction_edgelist["reference"]
     ]
 
     # directionality: by default, set r_isreversible to False for TRRUST data
-    interaction_edgelist[SMBL_REACTION_DICT_IS_REVERSIBLE] = False
+    interaction_edgelist[SBML_DFS.R_ISREVERSIBLE] = False
 
     # reduce to essential variables
     interaction_edgelist = interaction_edgelist[
@@ -188,10 +183,10 @@ def convert_trrust_to_sbml_dfs(
             STRING_DOWNSTREAM_NAME,
             STRING_UPSTREAM_COMPARTMENT,
             STRING_DOWNSTREAM_COMPARTMENT,
-            SMBL_REACTION_DICT_NAME,
-            SMBL_REACTION_SPEC_SBO_TERM,
-            SMBL_REACTION_DICT_IDENTIFIERS,
-            SMBL_REACTION_DICT_IS_REVERSIBLE,
+            SBML_DFS.R_NAME,
+            SBML_DFS.SBO_TERM,
+            SBML_DFS.R_IDENTIFIERS,
+            SBML_DFS.R_ISREVERSIBLE,
         ]
     ]
 
@@ -277,7 +272,7 @@ def _format_pubmed_for_interactions(pubmed_set):
         url = identifiers.create_uri_url(ontology="pubmed", identifier=p, strict=False)
         if url is not None:
             valid_url = identifiers.format_uri(
-                uri=url, biological_qualifier_type="BQB_IS_DESCRIBED_BY"
+                uri=url, biological_qualifier_type=BQB.IS_DESCRIBED_BY
             )
 
             ids.append(valid_url)
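The same cleanup applies to biological qualifiers: the raw "BQB_IS" / "BQB_IS_DESCRIBED_BY" strings passed to identifiers.format_uri() become members of the BQB namespace imported from napistu.constants. A hedged sketch of such a namespace, again inferred from the removed literals rather than from the real napistu.constants:

```python
# Hypothetical sketch: the real BQB in napistu.constants presumably covers
# the full set of BioModels qualifiers, not just the three seen in this diff.
class BQB:
    IS = "BQB_IS"
    IS_DESCRIBED_BY = "BQB_IS_DESCRIBED_BY"
    IS_ENCODED_BY = "BQB_IS_ENCODED_BY"
```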
@@ -33,7 +33,7 @@ def features_to_pathway_species(
        pd.Dataframe containing a "feature_identifiers_var" variable used to match entries
    species_identifiers: pd.DataFrame
        A table of molecular species identifiers produced from sbml_dfs.get_identifiers("species")
-        generally using sbml_dfs_core.export_sbml_dfs()
+        generally using sbml_dfs.export_sbml_dfs()
    ontologies: set
        A set of ontologies used to match features to pathway species
    feature_identifiers_var: str
@@ -356,7 +356,7 @@ class Genodexito:
            )
            logger.debug(
                f"{ids.shape[0] - expanded_ids.shape[0]} "
-                "ids are not included in expanded ids"
+                "ids are not included in expanded ids. These will be filled with empty Identifiers"
            )
        else:
            matched_expanded_ids = expanded_ids
@@ -364,6 +364,10 @@ class Genodexito:
        updated_ids = ids.drop(SBML_DFS.S_IDENTIFIERS, axis=1).join(
            pd.DataFrame(matched_expanded_ids)
        )
+        # fill missing attributes with empty Identifiers
+        updated_ids[SBML_DFS.S_IDENTIFIERS] = updated_ids[
+            SBML_DFS.S_IDENTIFIERS
+        ].fillna(identifiers.Identifiers([]))
 
        setattr(sbml_dfs, "species", updated_ids)
 
@@ -72,6 +72,10 @@ def rename_species_ontologies(
    updated_species = sbml_dfs.species.drop(SBML_DFS.S_IDENTIFIERS, axis=1).join(
        pd.DataFrame(species_identifiers)
    )
+    # fill missing attributes with empty Identifiers
+    updated_species[SBML_DFS.S_IDENTIFIERS] = updated_species[
+        SBML_DFS.S_IDENTIFIERS
+    ].fillna(identifiers.Identifiers([]))
 
    setattr(sbml_dfs, "species", updated_species)
 
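Both of these fixes use the same pandas idiom: after a left join, unmatched rows hold NaN, and Series.fillna with a single object replaces each NaN with that sentinel, so downstream code always sees an Identifiers container. A self-contained sketch of the idiom; EmptyBag and the column names are illustrative stand-ins for identifiers.Identifiers([]) and the real schema:

```python
import pandas as pd

class EmptyBag:
    """Illustrative stand-in for identifiers.Identifiers([])."""
    def __repr__(self) -> str:
        return "EmptyBag()"

species = pd.DataFrame({"s_name": ["TP53", "MYC", "NOVEL1"]})
# a lookup covering only some species leaves NaN for the rest
ids = pd.Series({"TP53": "ids_tp53", "MYC": "ids_myc"}, name="s_Identifiers")

joined = species.merge(ids, how="left", left_on="s_name", right_index=True)
# fillna with a single object fills every unmatched row with the sentinel
joined["s_Identifiers"] = joined["s_Identifiers"].fillna(EmptyBag())
print(joined)
```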
napistu/sbml_dfs_core.py CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import copy
 import logging
 import re
 from typing import Any
@@ -64,6 +65,8 @@ class SBML_dfs:
        Add a new reactions data table to the model with validation.
    add_species_data(label, data)
        Add a new species data table to the model with validation.
+    copy()
+        Return a deep copy of the SBML_dfs object.
    export_sbml_dfs(model_prefix, outdir, overwrite=False, dogmatic=True)
        Export the SBML_dfs model and its tables to files in a specified directory.
    get_characteristic_species_ids(dogmatic=True)
@@ -114,7 +117,6 @@ class SBML_dfs:
    Private/Hidden Methods (alphabetical, appear after public methods)
    -----------------------------------------------------------------
    _attempt_resolve(e)
-    _check_pk_fk_correspondence()
    _find_underspecified_reactions_by_scids(sc_ids)
    _get_unused_cspecies()
    _get_unused_species()
@@ -123,9 +125,12 @@ class SBML_dfs:
    _remove_species(s_ids)
    _remove_unused_cspecies()
    _remove_unused_species()
+    _validate_identifiers()
+    _validate_pk_fk_correspondence()
    _validate_r_ids(r_ids)
    _validate_reaction_species()
    _validate_reactions_data(reactions_data_table)
+    _validate_sources()
    _validate_species_data(species_data_table)
    _validate_table(table_name)
    """
@@ -255,6 +260,17 @@ class SBML_dfs:
            )
        self.species_data[label] = data
 
+    def copy(self):
+        """
+        Return a deep copy of the SBML_dfs object.
+
+        Returns
+        -------
+        SBML_dfs
+            A deep copy of the current SBML_dfs object.
+        """
+        return copy.deepcopy(self)
+
    def export_sbml_dfs(
        self,
        model_prefix: str,
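Since copy() simply wraps copy.deepcopy, the clone shares no mutable state with the original; modifying one model's tables leaves the other intact. A brief usage sketch, assuming sbml_dfs is an existing SBML_dfs instance:

```python
# assumes `sbml_dfs` is an existing SBML_dfs instance
backup = sbml_dfs.copy()

# the clone holds independent tables, not views of the originals
assert backup.species is not sbml_dfs.species
assert backup.reactions is not sbml_dfs.reactions
```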
@@ -440,7 +456,7 @@ class SBML_dfs:
            If id_type is invalid or identifiers are malformed
        """
        selected_table = self.get_table(id_type, {"id"})
-        schema = self.schema
+        schema = SBML_DFS_SCHEMA.SCHEMA
        identifiers_dict = dict()
        for sysid in selected_table.index:
 
@@ -458,6 +474,7 @@ class SBML_dfs:
        if not identifiers_dict:
            # Return empty DataFrame with expected columns if nothing found
            return pd.DataFrame(columns=[schema[id_type]["pk"], "entry"])
+
        identifiers_tbl = pd.concat(identifiers_dict)
 
        identifiers_tbl.index.names = [schema[id_type]["pk"], "entry"]
@@ -1382,7 +1399,7 @@ class SBML_dfs:
            self._validate_table(table)
 
        # check whether pks and fks agree
-        self._check_pk_fk_correspondence()
+        self._validate_pk_fk_correspondence()
 
        # check optional data tables:
        for k, v in self.species_data.items():
@@ -1400,6 +1417,10 @@ class SBML_dfs:
        # validate reaction_species sbo_terms and stoi
        self._validate_reaction_species()
 
+        # validate identifiers and sources
+        self._validate_identifiers()
+        self._validate_sources()
+
    def validate_and_resolve(self):
        """
        Validate and attempt to automatically fix common issues.
@@ -1455,67 +1476,6 @@ class SBML_dfs:
            )
            raise e
 
-    def _check_pk_fk_correspondence(self):
-        """
-        Check whether primary keys and foreign keys agree for all tables in the schema.
-        Raises ValueError if any correspondence fails.
-        """
-
-        pk_df = pd.DataFrame(
-            [{"pk_table": k, "key": v["pk"]} for k, v in self.schema.items()]
-        )
-
-        fk_df = (
-            pd.DataFrame(
-                [
-                    {"fk_table": k, "fk": v["fk"]}
-                    for k, v in self.schema.items()
-                    if "fk" in v.keys()
-                ]
-            )
-            .set_index("fk_table")["fk"]
-            .apply(pd.Series)
-            .reset_index()
-            .melt(id_vars="fk_table")
-            .drop(["variable"], axis=1)
-            .rename(columns={"value": "key"})
-        )
-
-        pk_fk_correspondences = pk_df.merge(fk_df)
-
-        for i in range(0, pk_fk_correspondences.shape[0]):
-            pk_table_keys = set(
-                getattr(self, pk_fk_correspondences["pk_table"][i]).index.tolist()
-            )
-            if None in pk_table_keys:
-                raise ValueError(
-                    f"{pk_fk_correspondences['pk_table'][i]} had "
-                    "missing values in its index"
-                )
-
-            fk_table_keys = set(
-                getattr(self, pk_fk_correspondences["fk_table"][i]).loc[
-                    :, pk_fk_correspondences["key"][i]
-                ]
-            )
-            if None in fk_table_keys:
-                raise ValueError(
-                    f"{pk_fk_correspondences['fk_table'][i]} included "
-                    f"missing {pk_fk_correspondences['key'][i]} values"
-                )
-
-            # all foreign keys need to match a primary key
-            extra_fks = fk_table_keys.difference(pk_table_keys)
-            if len(extra_fks) != 0:
-                raise ValueError(
-                    f"{len(extra_fks)} distinct "
-                    f"{pk_fk_correspondences['key'][i]} values were"
-                    f" found in {pk_fk_correspondences['fk_table'][i]} "
-                    f"but missing from {pk_fk_correspondences['pk_table'][i]}."
-                    " All foreign keys must have a matching primary key.\n\n"
-                    f"Extra key are: {', '.join(extra_fks)}"
-                )
-
    def _find_underspecified_reactions_by_scids(
        self, sc_ids: Iterable[str]
    ) -> set[str]:
@@ -1640,6 +1600,88 @@ class SBML_dfs:
        s_ids = self._get_unused_species()
        self._remove_species(s_ids)
 
+    def _validate_identifiers(self):
+        """
+        Validate identifiers in the model
+
+        Iterates through all tables and checks if the identifier columns are valid.
+
+        Raises:
+            ValueError: missing identifiers in the table
+        """
+
+        SCHEMA = SBML_DFS_SCHEMA.SCHEMA
+        for table in SBML_DFS_SCHEMA.SCHEMA.keys():
+            if "id" not in SCHEMA[table].keys():
+                continue
+            id_series = self.get_table(table)[SCHEMA[table]["id"]]
+            if id_series.isna().sum() > 0:
+                missing_ids = id_series[id_series.isna()].index
+                raise ValueError(
+                    f"{table} has {len(missing_ids)} missing ids: {missing_ids}"
+                )
+
+    def _validate_pk_fk_correspondence(self):
+        """
+        Check whether primary keys and foreign keys agree for all tables in the schema.
+        Raises ValueError if any correspondence fails.
+        """
+
+        pk_df = pd.DataFrame(
+            [{"pk_table": k, "key": v["pk"]} for k, v in self.schema.items()]
+        )
+
+        fk_df = (
+            pd.DataFrame(
+                [
+                    {"fk_table": k, "fk": v["fk"]}
+                    for k, v in self.schema.items()
+                    if "fk" in v.keys()
+                ]
+            )
+            .set_index("fk_table")["fk"]
+            .apply(pd.Series)
+            .reset_index()
+            .melt(id_vars="fk_table")
+            .drop(["variable"], axis=1)
+            .rename(columns={"value": "key"})
+        )
+
+        pk_fk_correspondences = pk_df.merge(fk_df)
+
+        for i in range(0, pk_fk_correspondences.shape[0]):
+            pk_table_keys = set(
+                getattr(self, pk_fk_correspondences["pk_table"][i]).index.tolist()
+            )
+            if None in pk_table_keys:
+                raise ValueError(
+                    f"{pk_fk_correspondences['pk_table'][i]} had "
+                    "missing values in its index"
+                )
+
+            fk_table_keys = set(
+                getattr(self, pk_fk_correspondences["fk_table"][i]).loc[
+                    :, pk_fk_correspondences["key"][i]
+                ]
+            )
+            if None in fk_table_keys:
+                raise ValueError(
+                    f"{pk_fk_correspondences['fk_table'][i]} included "
+                    f"missing {pk_fk_correspondences['key'][i]} values"
+                )
+
+            # all foreign keys need to match a primary key
+            extra_fks = fk_table_keys.difference(pk_table_keys)
+            if len(extra_fks) != 0:
+                raise ValueError(
+                    f"{len(extra_fks)} distinct "
+                    f"{pk_fk_correspondences['key'][i]} values were"
+                    f" found in {pk_fk_correspondences['fk_table'][i]} "
+                    f"but missing from {pk_fk_correspondences['pk_table'][i]}."
+                    " All foreign keys must have a matching primary key.\n\n"
+                    f"Extra key are: {', '.join(extra_fks)}"
+                )
+
    def _validate_r_ids(self, r_ids: Optional[Union[str, list[str]]]) -> list[str]:
 
        if isinstance(r_ids, str):
@@ -1694,6 +1736,27 @@ class SBML_dfs:
        """
        sbml_dfs_utils._validate_matching_data(reactions_data_table, self.reactions)
 
+    def _validate_sources(self):
+        """
+        Validate sources in the model
+
+        Iterates through all tables and checks if the source columns are valid.
+
+        Raises:
+            ValueError: missing sources in the table
+        """
+
+        SCHEMA = SBML_DFS_SCHEMA.SCHEMA
+        for table in SBML_DFS_SCHEMA.SCHEMA.keys():
+            if "source" not in SCHEMA[table].keys():
+                continue
+            source_series = self.get_table(table)[SCHEMA[table]["source"]]
+            if source_series.isna().sum() > 0:
+                missing_sources = source_series[source_series.isna()].index
+                raise ValueError(
+                    f"{table} has {len(missing_sources)} missing sources: {missing_sources}"
+                )
+
    def _validate_species_data(self, species_data_table: pd.DataFrame):
        """Validates species data attribute
 
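Both new validators follow the same schema-driven pattern: walk SBML_DFS_SCHEMA.SCHEMA, skip tables whose schema lacks an "id" (or "source") entry, and raise a ValueError listing the offending index entries when the corresponding column contains NaN. Roughly how a failure would surface, assuming sbml_dfs is a valid SBML_dfs instance and the species identifier column is named s_Identifiers:

```python
import numpy as np

broken = sbml_dfs.copy()  # keep the original intact
# knock out one species' Identifiers object to simulate a corrupt model
broken.species.loc[broken.species.index[0], "s_Identifiers"] = np.nan

try:
    broken.validate()  # now also runs _validate_identifiers()/_validate_sources()
except ValueError as e:
    print(e)  # e.g. "species has 1 missing ids: Index([...])"
```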
napistu/sbml_dfs_utils.py CHANGED
@@ -559,6 +559,10 @@ def unnest_identifiers(id_table: pd.DataFrame, id_var: str) -> pd.DataFrame:
 
    N_invalid_ids = sum(id_table[id_var].isna())
    if N_invalid_ids != 0:
+
+        print("Rows with missing identifiers:")
+        print(id_table.loc[id_table[id_var].isna(), id_var])
+
        raise ValueError(
            f'{N_invalid_ids} entries in "id_table" were missing',
            "entries with no identifiers should still include an Identifiers object",
napistu/utils.py CHANGED
@@ -810,50 +810,15 @@ def drop_extra_cols(
    return df_out.loc[:, ordered_cols]
 
 
-def _merge_and_log_overwrites(
-    left_df: pd.DataFrame, right_df: pd.DataFrame, merge_context: str, **merge_kwargs
-) -> pd.DataFrame:
+def update_pathological_names(names: pd.Series, prefix: str) -> pd.Series:
    """
-    Merge two DataFrames and log any column overwrites.
-
-    Parameters
-    ----------
-    left_df : pd.DataFrame
-        Left DataFrame for merge
-    right_df : pd.DataFrame
-        Right DataFrame for merge
-    merge_context : str
-        Description of the merge operation for logging
-    **merge_kwargs : dict
-        Additional keyword arguments passed to pd.merge
+    Update pathological names in a pandas Series.
 
-    Returns
-    -------
-    pd.DataFrame
-        Merged DataFrame with overwritten columns removed
+    Add a prefix to the names if they are all numeric.
    """
-    # Track original columns
-    original_cols = left_df.columns.tolist()
-
-    # Ensure we're using the correct suffixes
-    merge_kwargs["suffixes"] = ("_old", "")
-
-    # Perform merge
-    merged_df = pd.merge(left_df, right_df, **merge_kwargs)
-
-    # Check for and log any overwritten columns
-    new_cols = merged_df.columns.tolist()
-    overwritten_cols = [col for col in original_cols if col + "_old" in new_cols]
-    if overwritten_cols:
-        logger.warning(
-            f"The following columns were overwritten during {merge_context} merge and their original values "
-            f"have been suffixed with '_old': {', '.join(overwritten_cols)}"
-        )
-        # Drop the old columns
-        cols_to_drop = [col + "_old" for col in overwritten_cols]
-        merged_df = merged_df.drop(columns=cols_to_drop)
-
-    return merged_df
+    if names.apply(lambda x: x.isdigit()).all():
+        names = names.apply(lambda x: f"{prefix}{x}")
+    return names
 
 
 def format_identifiers_as_edgelist(
@@ -1108,3 +1073,49 @@ def _add_nameness_score(df, name_var):
 
    df.loc[:, "nameness_score"] = df[name_var].apply(score_nameness)
    return df
+
+
+def _merge_and_log_overwrites(
+    left_df: pd.DataFrame, right_df: pd.DataFrame, merge_context: str, **merge_kwargs
+) -> pd.DataFrame:
+    """
+    Merge two DataFrames and log any column overwrites.
+
+    Parameters
+    ----------
+    left_df : pd.DataFrame
+        Left DataFrame for merge
+    right_df : pd.DataFrame
+        Right DataFrame for merge
+    merge_context : str
+        Description of the merge operation for logging
+    **merge_kwargs : dict
+        Additional keyword arguments passed to pd.merge
+
+    Returns
+    -------
+    pd.DataFrame
+        Merged DataFrame with overwritten columns removed
+    """
+    # Track original columns
+    original_cols = left_df.columns.tolist()
+
+    # Ensure we're using the correct suffixes
+    merge_kwargs["suffixes"] = ("_old", "")
+
+    # Perform merge
+    merged_df = pd.merge(left_df, right_df, **merge_kwargs)
+
+    # Check for and log any overwritten columns
+    new_cols = merged_df.columns.tolist()
+    overwritten_cols = [col for col in original_cols if col + "_old" in new_cols]
+    if overwritten_cols:
+        logger.warning(
+            f"The following columns were overwritten during {merge_context} merge and their original values "
+            f"have been suffixed with '_old': {', '.join(overwritten_cols)}"
+        )
+        # Drop the old columns
+        cols_to_drop = [col + "_old" for col in overwritten_cols]
+        merged_df = merged_df.drop(columns=cols_to_drop)
+
+    return merged_df
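_merge_and_log_overwrites is moved to the bottom of utils.py unchanged. It resolves column collisions in favor of the right-hand table: suffixes=("_old", "") renames only the left-hand copy, which is then logged and dropped. The core pandas behavior, shown on toy frames:

```python
import pandas as pd

left = pd.DataFrame({"s_id": [1, 2], "label": ["a", "b"]})
right = pd.DataFrame({"s_id": [1, 2], "label": ["A", "B"]})

# suffixes=("_old", "") leaves the right-hand "label" unsuffixed
merged = pd.merge(left, right, on="s_id", suffixes=("_old", ""))
print(merged.columns.tolist())  # ['s_id', 'label_old', 'label']

# dropping the "_old" copy is what makes the right-hand values win
print(merged.drop(columns=["label_old"])["label"].tolist())  # ['A', 'B']
```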
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: napistu
-Version: 0.3.6
+Version: 0.3.7
 Summary: Connecting high-dimensional data to curated pathways
 Home-page: https://github.com/napistu/napistu-py
 Author: Sean Hackett