napistu 0.3.6__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (47)
  1. napistu/__main__.py +28 -13
  2. napistu/consensus.py +19 -25
  3. napistu/constants.py +102 -83
  4. napistu/indices.py +3 -1
  5. napistu/ingestion/napistu_edgelist.py +4 -4
  6. napistu/ingestion/sbml.py +298 -295
  7. napistu/ingestion/string.py +14 -18
  8. napistu/ingestion/trrust.py +22 -27
  9. napistu/matching/interactions.py +41 -39
  10. napistu/matching/species.py +1 -1
  11. napistu/modify/gaps.py +2 -1
  12. napistu/network/constants.py +61 -45
  13. napistu/network/data_handling.py +1 -1
  14. napistu/network/neighborhoods.py +3 -3
  15. napistu/network/net_create.py +440 -616
  16. napistu/network/net_create_utils.py +734 -0
  17. napistu/network/net_propagation.py +1 -1
  18. napistu/network/{napistu_graph_core.py → ng_core.py} +57 -15
  19. napistu/network/ng_utils.py +28 -21
  20. napistu/network/paths.py +4 -4
  21. napistu/network/precompute.py +35 -74
  22. napistu/ontologies/genodexito.py +5 -1
  23. napistu/ontologies/renaming.py +4 -0
  24. napistu/sbml_dfs_core.py +127 -64
  25. napistu/sbml_dfs_utils.py +50 -0
  26. napistu/utils.py +132 -46
  27. {napistu-0.3.6.dist-info → napistu-0.4.0.dist-info}/METADATA +2 -2
  28. {napistu-0.3.6.dist-info → napistu-0.4.0.dist-info}/RECORD +47 -44
  29. tests/conftest.py +171 -13
  30. tests/test_consensus.py +74 -5
  31. tests/test_gaps.py +26 -15
  32. tests/test_network_data_handling.py +5 -2
  33. tests/test_network_net_create.py +93 -202
  34. tests/test_network_net_create_utils.py +538 -0
  35. tests/test_network_ng_core.py +19 -0
  36. tests/test_network_ng_utils.py +1 -1
  37. tests/test_network_precompute.py +5 -4
  38. tests/test_ontologies_renaming.py +28 -24
  39. tests/test_rpy2_callr.py +0 -1
  40. tests/test_rpy2_init.py +0 -1
  41. tests/test_sbml_dfs_core.py +165 -15
  42. tests/test_sbml_dfs_utils.py +45 -0
  43. tests/test_utils.py +45 -2
  44. {napistu-0.3.6.dist-info → napistu-0.4.0.dist-info}/WHEEL +0 -0
  45. {napistu-0.3.6.dist-info → napistu-0.4.0.dist-info}/entry_points.txt +0 -0
  46. {napistu-0.3.6.dist-info → napistu-0.4.0.dist-info}/licenses/LICENSE +0 -0
  47. {napistu-0.3.6.dist-info → napistu-0.4.0.dist-info}/top_level.txt +0 -0
napistu/sbml_dfs_core.py CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import copy
 import logging
 import re
 from typing import Any
@@ -64,6 +65,8 @@ class SBML_dfs:
         Add a new reactions data table to the model with validation.
     add_species_data(label, data)
         Add a new species data table to the model with validation.
+    copy()
+        Return a deep copy of the SBML_dfs object.
     export_sbml_dfs(model_prefix, outdir, overwrite=False, dogmatic=True)
         Export the SBML_dfs model and its tables to files in a specified directory.
     get_characteristic_species_ids(dogmatic=True)
@@ -114,7 +117,6 @@ class SBML_dfs:
     Private/Hidden Methods (alphabetical, appear after public methods)
     -----------------------------------------------------------------
     _attempt_resolve(e)
-    _check_pk_fk_correspondence()
     _find_underspecified_reactions_by_scids(sc_ids)
     _get_unused_cspecies()
     _get_unused_species()
@@ -123,9 +125,12 @@ class SBML_dfs:
     _remove_species(s_ids)
     _remove_unused_cspecies()
     _remove_unused_species()
+    _validate_identifiers()
+    _validate_pk_fk_correspondence()
     _validate_r_ids(r_ids)
     _validate_reaction_species()
     _validate_reactions_data(reactions_data_table)
+    _validate_sources()
     _validate_species_data(species_data_table)
     _validate_table(table_name)
     """
@@ -255,6 +260,17 @@ class SBML_dfs:
         )
         self.species_data[label] = data
 
+    def copy(self):
+        """
+        Return a deep copy of the SBML_dfs object.
+
+        Returns
+        -------
+        SBML_dfs
+            A deep copy of the current SBML_dfs object.
+        """
+        return copy.deepcopy(self)
+
     def export_sbml_dfs(
         self,
         model_prefix: str,
@@ -440,7 +456,7 @@ class SBML_dfs:
             If id_type is invalid or identifiers are malformed
         """
         selected_table = self.get_table(id_type, {"id"})
-        schema = self.schema
+        schema = SBML_DFS_SCHEMA.SCHEMA
 
         identifiers_dict = dict()
         for sysid in selected_table.index:
@@ -458,6 +474,7 @@
         if not identifiers_dict:
             # Return empty DataFrame with expected columns if nothing found
             return pd.DataFrame(columns=[schema[id_type]["pk"], "entry"])
+
         identifiers_tbl = pd.concat(identifiers_dict)
 
         identifiers_tbl.index.names = [schema[id_type]["pk"], "entry"]
@@ -1382,7 +1399,7 @@
             self._validate_table(table)
 
         # check whether pks and fks agree
-        self._check_pk_fk_correspondence()
+        self._validate_pk_fk_correspondence()
 
         # check optional data tables:
         for k, v in self.species_data.items():
@@ -1400,6 +1417,10 @@
         # validate reaction_species sbo_terms and stoi
         self._validate_reaction_species()
 
+        # validate identifiers and sources
+        self._validate_identifiers()
+        self._validate_sources()
+
     def validate_and_resolve(self):
         """
         Validate and attempt to automatically fix common issues.
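For orientation, a minimal usage sketch of the copy() method added above and of validate(), which in 0.4.0 also checks identifiers, sources, and primary/foreign key correspondence. The sbml_dfs variable is assumed to be an already-constructed SBML_dfs instance; names are illustrative only.

    # assumes `sbml_dfs` is an existing SBML_dfs instance built elsewhere
    working_model = sbml_dfs.copy()  # deep copy; edits do not touch the original

    try:
        # now also runs _validate_identifiers(), _validate_sources(), and
        # _validate_pk_fk_correspondence() in addition to the existing checks
        working_model.validate()
    except ValueError as e:
        print(f"model failed validation: {e}")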
@@ -1455,67 +1476,6 @@
                 )
                 raise e
 
-    def _check_pk_fk_correspondence(self):
-        """
-        Check whether primary keys and foreign keys agree for all tables in the schema.
-        Raises ValueError if any correspondence fails.
-        """
-
-        pk_df = pd.DataFrame(
-            [{"pk_table": k, "key": v["pk"]} for k, v in self.schema.items()]
-        )
-
-        fk_df = (
-            pd.DataFrame(
-                [
-                    {"fk_table": k, "fk": v["fk"]}
-                    for k, v in self.schema.items()
-                    if "fk" in v.keys()
-                ]
-            )
-            .set_index("fk_table")["fk"]
-            .apply(pd.Series)
-            .reset_index()
-            .melt(id_vars="fk_table")
-            .drop(["variable"], axis=1)
-            .rename(columns={"value": "key"})
-        )
-
-        pk_fk_correspondences = pk_df.merge(fk_df)
-
-        for i in range(0, pk_fk_correspondences.shape[0]):
-            pk_table_keys = set(
-                getattr(self, pk_fk_correspondences["pk_table"][i]).index.tolist()
-            )
-            if None in pk_table_keys:
-                raise ValueError(
-                    f"{pk_fk_correspondences['pk_table'][i]} had "
-                    "missing values in its index"
-                )
-
-            fk_table_keys = set(
-                getattr(self, pk_fk_correspondences["fk_table"][i]).loc[
-                    :, pk_fk_correspondences["key"][i]
-                ]
-            )
-            if None in fk_table_keys:
-                raise ValueError(
-                    f"{pk_fk_correspondences['fk_table'][i]} included "
-                    f"missing {pk_fk_correspondences['key'][i]} values"
-                )
-
-            # all foreign keys need to match a primary key
-            extra_fks = fk_table_keys.difference(pk_table_keys)
-            if len(extra_fks) != 0:
-                raise ValueError(
-                    f"{len(extra_fks)} distinct "
-                    f"{pk_fk_correspondences['key'][i]} values were"
-                    f" found in {pk_fk_correspondences['fk_table'][i]} "
-                    f"but missing from {pk_fk_correspondences['pk_table'][i]}."
-                    " All foreign keys must have a matching primary key.\n\n"
-                    f"Extra key are: {', '.join(extra_fks)}"
-                )
-
     def _find_underspecified_reactions_by_scids(
         self, sc_ids: Iterable[str]
     ) -> set[str]:
@@ -1640,6 +1600,88 @@
         s_ids = self._get_unused_species()
         self._remove_species(s_ids)
 
+    def _validate_identifiers(self):
+        """
+        Validate identifiers in the model
+
+        Iterates through all tables and checks if the identifier columns are valid.
+
+        Raises:
+            ValueError: missing identifiers in the table
+        """
+
+        SCHEMA = SBML_DFS_SCHEMA.SCHEMA
+        for table in SBML_DFS_SCHEMA.SCHEMA.keys():
+            if "id" not in SCHEMA[table].keys():
+                continue
+            id_series = self.get_table(table)[SCHEMA[table]["id"]]
+            if id_series.isna().sum() > 0:
+                missing_ids = id_series[id_series.isna()].index
+                raise ValueError(
+                    f"{table} has {len(missing_ids)} missing ids: {missing_ids}"
+                )
+
+    def _validate_pk_fk_correspondence(self):
+        """
+        Check whether primary keys and foreign keys agree for all tables in the schema.
+        Raises ValueError if any correspondence fails.
+        """
+
+        pk_df = pd.DataFrame(
+            [{"pk_table": k, "key": v["pk"]} for k, v in self.schema.items()]
+        )
+
+        fk_df = (
+            pd.DataFrame(
+                [
+                    {"fk_table": k, "fk": v["fk"]}
+                    for k, v in self.schema.items()
+                    if "fk" in v.keys()
+                ]
+            )
+            .set_index("fk_table")["fk"]
+            .apply(pd.Series)
+            .reset_index()
+            .melt(id_vars="fk_table")
+            .drop(["variable"], axis=1)
+            .rename(columns={"value": "key"})
+        )
+
+        pk_fk_correspondences = pk_df.merge(fk_df)
+
+        for i in range(0, pk_fk_correspondences.shape[0]):
+            pk_table_keys = set(
+                getattr(self, pk_fk_correspondences["pk_table"][i]).index.tolist()
+            )
+            if None in pk_table_keys:
+                raise ValueError(
+                    f"{pk_fk_correspondences['pk_table'][i]} had "
+                    "missing values in its index"
+                )
+
+            fk_table_keys = set(
+                getattr(self, pk_fk_correspondences["fk_table"][i]).loc[
+                    :, pk_fk_correspondences["key"][i]
+                ]
+            )
+            if None in fk_table_keys:
+                raise ValueError(
+                    f"{pk_fk_correspondences['fk_table'][i]} included "
+                    f"missing {pk_fk_correspondences['key'][i]} values"
+                )
+
+            # all foreign keys need to match a primary key
+            extra_fks = fk_table_keys.difference(pk_table_keys)
+            if len(extra_fks) != 0:
+                raise ValueError(
+                    f"{len(extra_fks)} distinct "
+                    f"{pk_fk_correspondences['key'][i]} values were"
+                    f" found in {pk_fk_correspondences['fk_table'][i]} "
+                    f"but missing from {pk_fk_correspondences['pk_table'][i]}."
+                    " All foreign keys must have a matching primary key.\n\n"
+                    f"Extra key are: {', '.join(extra_fks)}"
+                )
+
     def _validate_r_ids(self, r_ids: Optional[Union[str, list[str]]]) -> list[str]:
 
         if isinstance(r_ids, str):
@@ -1694,6 +1736,27 @@
         """
         sbml_dfs_utils._validate_matching_data(reactions_data_table, self.reactions)
 
+    def _validate_sources(self):
+        """
+        Validate sources in the model
+
+        Iterates through all tables and checks if the source columns are valid.
+
+        Raises:
+            ValueError: missing sources in the table
+        """
+
+        SCHEMA = SBML_DFS_SCHEMA.SCHEMA
+        for table in SBML_DFS_SCHEMA.SCHEMA.keys():
+            if "source" not in SCHEMA[table].keys():
+                continue
+            source_series = self.get_table(table)[SCHEMA[table]["source"]]
+            if source_series.isna().sum() > 0:
+                missing_sources = source_series[source_series.isna()].index
+                raise ValueError(
+                    f"{table} has {len(missing_sources)} missing sources: {missing_sources}"
+                )
+
     def _validate_species_data(self, species_data_table: pd.DataFrame):
         """Validates species data attribute
 
napistu/sbml_dfs_utils.py CHANGED
@@ -27,6 +27,8 @@ from napistu.constants import MINI_SBO_FROM_NAME
 from napistu.constants import MINI_SBO_TO_NAME
 from napistu.constants import SBO_NAME_TO_ROLE
 from napistu.constants import ONTOLOGIES
+from napistu.constants import VALID_SBO_TERM_NAMES
+from napistu.constants import VALID_SBO_TERMS
 from napistu.ingestion.constants import VALID_COMPARTMENTS
 from napistu.ingestion.constants import COMPARTMENTS_GO_TERMS
 from napistu.ingestion.constants import GENERIC_COMPARTMENT
@@ -559,6 +561,10 @@ def unnest_identifiers(id_table: pd.DataFrame, id_var: str) -> pd.DataFrame:
 
     N_invalid_ids = sum(id_table[id_var].isna())
     if N_invalid_ids != 0:
+
+        print("Rows with missing identifiers:")
+        print(id_table.loc[id_table[id_var].isna(), id_var])
+
         raise ValueError(
             f'{N_invalid_ids} entries in "id_table" were missing',
             "entries with no identifiers should still include an Identifiers object",
@@ -1277,3 +1283,47 @@ def _validate_matching_data(data_table: pd.DataFrame, ref_table: pd.DataFrame):
             f"The data table was type {type(data_table).__name__}"
             " but must be a pd.DataFrame"
         )
+
+
+def _validate_sbo_values(sbo_series: pd.Series, validate: str = "names") -> None:
+    """
+    Validate SBO terms or names
+
+    Parameters
+    ----------
+    sbo_series : pd.Series
+        The SBO terms or names to validate.
+    validate : str, optional
+        Whether the values are SBO terms ("terms") or names ("names", default).
+
+    Returns
+    -------
+    None
+
+    Raises
+    ------
+    ValueError
+        If the validation type is invalid.
+    TypeError
+        If the invalid_counts is not a pandas DataFrame.
+    ValueError
+        If some reaction species have unusable SBO terms.
+    """
+
+    if validate == "terms":
+        valid_values = VALID_SBO_TERMS
+    elif validate == "names":
+        valid_values = VALID_SBO_TERM_NAMES
+    else:
+        raise ValueError(f"Invalid validation type: {validate}")
+
+    invalid_sbo_terms = sbo_series[~sbo_series.isin(valid_values)]
+
+    if invalid_sbo_terms.shape[0] != 0:
+        invalid_counts = invalid_sbo_terms.value_counts(sbo_series.name).to_frame("N")
+        if not isinstance(invalid_counts, pd.DataFrame):
+            raise TypeError("invalid_counts must be a pandas DataFrame")
+        print(invalid_counts)
+        raise ValueError("Some reaction species have unusable SBO terms")
+
+    return None
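As a usage sketch of the new _validate_sbo_values() helper above (module-private; the example values are hypothetical, and "reactant" and "catalyst" are assumed to be members of VALID_SBO_TERM_NAMES while "not_an_sbo_name" is not):

    import pandas as pd
    from napistu import sbml_dfs_utils

    sbo_names = pd.Series(
        ["reactant", "catalyst", "not_an_sbo_name"], name="sbo_term_name"
    )
    # prints a small table of the offending values, then raises
    # ValueError("Some reaction species have unusable SBO terms")
    sbml_dfs_utils._validate_sbo_values(sbo_names, validate="names")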
napistu/utils.py CHANGED
@@ -7,24 +7,24 @@ import logging
 import os
 import pickle
 import re
+import requests
 import shutil
 import urllib.request as request
 import zipfile
 from contextlib import closing
 from itertools import starmap
 from textwrap import fill
-from typing import Any
-from typing import Union
-from typing import Optional
-from typing import List
+from typing import Any, List, Optional, Union
 from urllib.parse import urlparse
-import requests
+from pathlib import Path
 from requests.adapters import HTTPAdapter
 from requests.adapters import Retry
 
 import igraph as ig
 import numpy as np
 import pandas as pd
+import pyarrow as pa
+import pyarrow.parquet as pq
 from fs import open_fs
 from fs.copy import copy_dir
 from fs.copy import copy_file
@@ -604,6 +604,81 @@ def load_json(uri: str) -> Any:
     return json.loads(txt)
 
 
+def save_parquet(
+    df: pd.DataFrame, uri: Union[str, Path], compression: str = "snappy"
+) -> None:
+    """
+    Write a DataFrame to a single Parquet file.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        The DataFrame to save
+    uri : Union[str, Path]
+        Path where to save the Parquet file. Can be a local path or a GCS URI.
+        Recommended extensions: .parquet or .pq
+    compression : str, default 'snappy'
+        Compression algorithm. Options: 'snappy', 'gzip', 'brotli', 'lz4', 'zstd'
+
+    Raises
+    ------
+    OSError
+        If the file cannot be written to (permission issues, etc.)
+    """
+
+    uri_str = str(uri)
+
+    # Warn about non-standard extensions
+    if not any(uri_str.endswith(ext) for ext in [".parquet", ".pq"]):
+        logger.warning(
+            f"File '{uri_str}' doesn't have a standard Parquet extension (.parquet or .pq)"
+        )
+
+    target_base, target_path = get_target_base_and_path(uri_str)
+
+    with open_fs(target_base, create=True) as target_fs:
+        with target_fs.openbin(target_path, "w") as f:
+            # Convert to Arrow table and write as single file
+            table = pa.Table.from_pandas(df)
+            pq.write_table(
+                table,
+                f,
+                compression=compression,
+                use_dictionary=True,  # Efficient for repeated values
+                write_statistics=True,  # Enables query optimization
+            )
+
+
+def load_parquet(uri: Union[str, Path]) -> pd.DataFrame:
+    """
+    Read a DataFrame from a Parquet file.
+
+    Parameters
+    ----------
+    uri : Union[str, Path]
+        Path to the Parquet file to load
+
+    Returns
+    -------
+    pd.DataFrame
+        The DataFrame loaded from the Parquet file
+
+    Raises
+    ------
+    FileNotFoundError
+        If the specified file does not exist
+    """
+    try:
+        target_base, target_path = get_target_base_and_path(str(uri))
+
+        with open_fs(target_base) as target_fs:
+            with target_fs.openbin(target_path, "r") as f:
+                return pd.read_parquet(f, engine="pyarrow")
+
+    except ResourceNotFound as e:
+        raise FileNotFoundError(f"File not found: {uri}") from e
+
+
 def extract_regex_search(regex: str, query: str, index_value: int = 0) -> str:
     """
     Match an identifier substring and otherwise throw an error
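For reference, a quick round-trip sketch of the new Parquet helpers defined above (path and values are illustrative; per the docstring, a GCS URI can be used in place of a local path):

    import pandas as pd
    from napistu import utils

    scores = pd.DataFrame({"s_id": ["S00000001", "S00000002"], "score": [0.1, 0.9]})
    utils.save_parquet(scores, "/tmp/scores.parquet", compression="snappy")
    round_trip = utils.load_parquet("/tmp/scores.parquet")
    assert round_trip.equals(scores)  # round trip reproduces the original frame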
@@ -810,50 +885,15 @@ def drop_extra_cols(
     return df_out.loc[:, ordered_cols]
 
 
-def _merge_and_log_overwrites(
-    left_df: pd.DataFrame, right_df: pd.DataFrame, merge_context: str, **merge_kwargs
-) -> pd.DataFrame:
+def update_pathological_names(names: pd.Series, prefix: str) -> pd.Series:
     """
-    Merge two DataFrames and log any column overwrites.
-
-    Parameters
-    ----------
-    left_df : pd.DataFrame
-        Left DataFrame for merge
-    right_df : pd.DataFrame
-        Right DataFrame for merge
-    merge_context : str
-        Description of the merge operation for logging
-    **merge_kwargs : dict
-        Additional keyword arguments passed to pd.merge
+    Update pathological names in a pandas Series.
 
-    Returns
-    -------
-    pd.DataFrame
-        Merged DataFrame with overwritten columns removed
+    Add a prefix to the names if they are all numeric.
     """
-    # Track original columns
-    original_cols = left_df.columns.tolist()
-
-    # Ensure we're using the correct suffixes
-    merge_kwargs["suffixes"] = ("_old", "")
-
-    # Perform merge
-    merged_df = pd.merge(left_df, right_df, **merge_kwargs)
-
-    # Check for and log any overwritten columns
-    new_cols = merged_df.columns.tolist()
-    overwritten_cols = [col for col in original_cols if col + "_old" in new_cols]
-    if overwritten_cols:
-        logger.warning(
-            f"The following columns were overwritten during {merge_context} merge and their original values "
-            f"have been suffixed with '_old': {', '.join(overwritten_cols)}"
-        )
-        # Drop the old columns
-        cols_to_drop = [col + "_old" for col in overwritten_cols]
-        merged_df = merged_df.drop(columns=cols_to_drop)
-
-    return merged_df
+    if names.apply(lambda x: x.isdigit()).all():
+        names = names.apply(lambda x: f"{prefix}{x}")
+    return names
 
 
 def format_identifiers_as_edgelist(
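The new update_pathological_names() helper above only rewrites a Series when every name is numeric; a small sketch (the values and prefix are illustrative):

    import pandas as pd
    from napistu import utils

    utils.update_pathological_names(pd.Series(["1", "2", "3"]), prefix="reaction_")
    # -> "reaction_1", "reaction_2", "reaction_3"
    utils.update_pathological_names(pd.Series(["glycolysis", "42"]), prefix="reaction_")
    # -> unchanged, since not all names are numeric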
@@ -1108,3 +1148,49 @@ def _add_nameness_score(df, name_var):
 
     df.loc[:, "nameness_score"] = df[name_var].apply(score_nameness)
     return df
+
+
+def _merge_and_log_overwrites(
+    left_df: pd.DataFrame, right_df: pd.DataFrame, merge_context: str, **merge_kwargs
+) -> pd.DataFrame:
+    """
+    Merge two DataFrames and log any column overwrites.
+
+    Parameters
+    ----------
+    left_df : pd.DataFrame
+        Left DataFrame for merge
+    right_df : pd.DataFrame
+        Right DataFrame for merge
+    merge_context : str
+        Description of the merge operation for logging
+    **merge_kwargs : dict
+        Additional keyword arguments passed to pd.merge
+
+    Returns
+    -------
+    pd.DataFrame
+        Merged DataFrame with overwritten columns removed
+    """
+    # Track original columns
+    original_cols = left_df.columns.tolist()
+
+    # Ensure we're using the correct suffixes
+    merge_kwargs["suffixes"] = ("_old", "")
+
+    # Perform merge
+    merged_df = pd.merge(left_df, right_df, **merge_kwargs)
+
+    # Check for and log any overwritten columns
+    new_cols = merged_df.columns.tolist()
+    overwritten_cols = [col for col in original_cols if col + "_old" in new_cols]
+    if overwritten_cols:
+        logger.warning(
+            f"The following columns were overwritten during {merge_context} merge and their original values "
+            f"have been suffixed with '_old': {', '.join(overwritten_cols)}"
+        )
+        # Drop the old columns
+        cols_to_drop = [col + "_old" for col in overwritten_cols]
+        merged_df = merged_df.drop(columns=cols_to_drop)
+
+    return merged_df
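_merge_and_log_overwrites() is only relocated within utils.py in this release, but for completeness, a sketch of its behavior (frames, column names, and values are illustrative): columns present in both inputs are suffixed with '_old', a warning is logged, and the stale copies are dropped so the right-hand values win.

    import pandas as pd
    from napistu import utils

    left = pd.DataFrame({"s_id": ["S1", "S2"], "label": ["old_a", "old_b"]})
    right = pd.DataFrame({"s_id": ["S1", "S2"], "label": ["new_a", "new_b"]})

    merged = utils._merge_and_log_overwrites(left, right, "example", on="s_id")
    # merged["label"] holds the right-hand values; a warning notes the overwrite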
{napistu-0.3.6.dist-info → napistu-0.4.0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: napistu
-Version: 0.3.6
+Version: 0.4.0
 Summary: Connecting high-dimensional data to curated pathways
 Home-page: https://github.com/napistu/napistu-py
 Author: Sean Hackett
@@ -27,6 +27,7 @@ Requires-Dist: mygene<4.0.0,>=3.0.0
 Requires-Dist: numpy<3.0.0,>=1.24.0
 Requires-Dist: pandas<3.0.0,>=1.5.0
 Requires-Dist: pydantic<3.0.0,>=2.0.0
+Requires-Dist: pyarrow<20.0.0,>=15.0.0
 Requires-Dist: python-libsbml
 Requires-Dist: requests>=2.25.0
 Requires-Dist: scipy<2.0.0,>=1.10.0
@@ -51,7 +52,6 @@ Requires-Dist: markdown>=3.4.0; extra == "mcp"
 Requires-Dist: jupyter-client>=7.0.0; extra == "mcp"
 Requires-Dist: nbformat>=5.0.0; extra == "mcp"
 Provides-Extra: rpy2
-Requires-Dist: pyarrow<19.0.0,>=15.0.0; extra == "rpy2"
 Requires-Dist: rpy2<4.0.0,>=3.5.0; extra == "rpy2"
 Requires-Dist: rpy2-arrow<1.0.0,>=0.1.0; extra == "rpy2"
 Provides-Extra: scverse