napistu 0.3.6__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
- napistu/__main__.py +28 -13
- napistu/consensus.py +19 -25
- napistu/constants.py +102 -83
- napistu/indices.py +3 -1
- napistu/ingestion/napistu_edgelist.py +4 -4
- napistu/ingestion/sbml.py +298 -295
- napistu/ingestion/string.py +14 -18
- napistu/ingestion/trrust.py +22 -27
- napistu/matching/interactions.py +41 -39
- napistu/matching/species.py +1 -1
- napistu/modify/gaps.py +2 -1
- napistu/network/constants.py +61 -45
- napistu/network/data_handling.py +1 -1
- napistu/network/neighborhoods.py +3 -3
- napistu/network/net_create.py +440 -616
- napistu/network/net_create_utils.py +734 -0
- napistu/network/net_propagation.py +1 -1
- napistu/network/{napistu_graph_core.py → ng_core.py} +57 -15
- napistu/network/ng_utils.py +28 -21
- napistu/network/paths.py +4 -4
- napistu/network/precompute.py +35 -74
- napistu/ontologies/genodexito.py +5 -1
- napistu/ontologies/renaming.py +4 -0
- napistu/sbml_dfs_core.py +127 -64
- napistu/sbml_dfs_utils.py +50 -0
- napistu/utils.py +132 -46
- {napistu-0.3.6.dist-info → napistu-0.4.0.dist-info}/METADATA +2 -2
- {napistu-0.3.6.dist-info → napistu-0.4.0.dist-info}/RECORD +47 -44
- tests/conftest.py +171 -13
- tests/test_consensus.py +74 -5
- tests/test_gaps.py +26 -15
- tests/test_network_data_handling.py +5 -2
- tests/test_network_net_create.py +93 -202
- tests/test_network_net_create_utils.py +538 -0
- tests/test_network_ng_core.py +19 -0
- tests/test_network_ng_utils.py +1 -1
- tests/test_network_precompute.py +5 -4
- tests/test_ontologies_renaming.py +28 -24
- tests/test_rpy2_callr.py +0 -1
- tests/test_rpy2_init.py +0 -1
- tests/test_sbml_dfs_core.py +165 -15
- tests/test_sbml_dfs_utils.py +45 -0
- tests/test_utils.py +45 -2
- {napistu-0.3.6.dist-info → napistu-0.4.0.dist-info}/WHEEL +0 -0
- {napistu-0.3.6.dist-info → napistu-0.4.0.dist-info}/entry_points.txt +0 -0
- {napistu-0.3.6.dist-info → napistu-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {napistu-0.3.6.dist-info → napistu-0.4.0.dist-info}/top_level.txt +0 -0
napistu/sbml_dfs_core.py
CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import copy
 import logging
 import re
 from typing import Any
@@ -64,6 +65,8 @@ class SBML_dfs:
         Add a new reactions data table to the model with validation.
     add_species_data(label, data)
         Add a new species data table to the model with validation.
+    copy()
+        Return a deep copy of the SBML_dfs object.
     export_sbml_dfs(model_prefix, outdir, overwrite=False, dogmatic=True)
         Export the SBML_dfs model and its tables to files in a specified directory.
     get_characteristic_species_ids(dogmatic=True)
@@ -114,7 +117,6 @@ class SBML_dfs:
     Private/Hidden Methods (alphabetical, appear after public methods)
     -----------------------------------------------------------------
     _attempt_resolve(e)
-    _check_pk_fk_correspondence()
     _find_underspecified_reactions_by_scids(sc_ids)
     _get_unused_cspecies()
     _get_unused_species()
@@ -123,9 +125,12 @@ class SBML_dfs:
     _remove_species(s_ids)
     _remove_unused_cspecies()
     _remove_unused_species()
+    _validate_identifiers()
+    _validate_pk_fk_correspondence()
     _validate_r_ids(r_ids)
     _validate_reaction_species()
     _validate_reactions_data(reactions_data_table)
+    _validate_sources()
     _validate_species_data(species_data_table)
     _validate_table(table_name)
     """
@@ -255,6 +260,17 @@ class SBML_dfs:
         )
         self.species_data[label] = data
 
+    def copy(self):
+        """
+        Return a deep copy of the SBML_dfs object.
+
+        Returns
+        -------
+        SBML_dfs
+            A deep copy of the current SBML_dfs object.
+        """
+        return copy.deepcopy(self)
+
     def export_sbml_dfs(
         self,
         model_prefix: str,
@@ -440,7 +456,7 @@ class SBML_dfs:
             If id_type is invalid or identifiers are malformed
         """
         selected_table = self.get_table(id_type, {"id"})
-        schema =
+        schema = SBML_DFS_SCHEMA.SCHEMA
 
         identifiers_dict = dict()
         for sysid in selected_table.index:
@@ -458,6 +474,7 @@
         if not identifiers_dict:
            # Return empty DataFrame with expected columns if nothing found
            return pd.DataFrame(columns=[schema[id_type]["pk"], "entry"])
+
        identifiers_tbl = pd.concat(identifiers_dict)
 
        identifiers_tbl.index.names = [schema[id_type]["pk"], "entry"]
@@ -1382,7 +1399,7 @@
         self._validate_table(table)
 
         # check whether pks and fks agree
-        self._check_pk_fk_correspondence()
+        self._validate_pk_fk_correspondence()
 
         # check optional data tables:
         for k, v in self.species_data.items():
@@ -1400,6 +1417,10 @@
         # validate reaction_species sbo_terms and stoi
         self._validate_reaction_species()
 
+        # validate identifiers and sources
+        self._validate_identifiers()
+        self._validate_sources()
+
     def validate_and_resolve(self):
         """
         Validate and attempt to automatically fix common issues.
@@ -1455,67 +1476,6 @@
             )
             raise e
 
-    def _check_pk_fk_correspondence(self):
-        """
-        Check whether primary keys and foreign keys agree for all tables in the schema.
-        Raises ValueError if any correspondence fails.
-        """
-
-        pk_df = pd.DataFrame(
-            [{"pk_table": k, "key": v["pk"]} for k, v in self.schema.items()]
-        )
-
-        fk_df = (
-            pd.DataFrame(
-                [
-                    {"fk_table": k, "fk": v["fk"]}
-                    for k, v in self.schema.items()
-                    if "fk" in v.keys()
-                ]
-            )
-            .set_index("fk_table")["fk"]
-            .apply(pd.Series)
-            .reset_index()
-            .melt(id_vars="fk_table")
-            .drop(["variable"], axis=1)
-            .rename(columns={"value": "key"})
-        )
-
-        pk_fk_correspondences = pk_df.merge(fk_df)
-
-        for i in range(0, pk_fk_correspondences.shape[0]):
-            pk_table_keys = set(
-                getattr(self, pk_fk_correspondences["pk_table"][i]).index.tolist()
-            )
-            if None in pk_table_keys:
-                raise ValueError(
-                    f"{pk_fk_correspondences['pk_table'][i]} had "
-                    "missing values in its index"
-                )
-
-            fk_table_keys = set(
-                getattr(self, pk_fk_correspondences["fk_table"][i]).loc[
-                    :, pk_fk_correspondences["key"][i]
-                ]
-            )
-            if None in fk_table_keys:
-                raise ValueError(
-                    f"{pk_fk_correspondences['fk_table'][i]} included "
-                    f"missing {pk_fk_correspondences['key'][i]} values"
-                )
-
-            # all foreign keys need to match a primary key
-            extra_fks = fk_table_keys.difference(pk_table_keys)
-            if len(extra_fks) != 0:
-                raise ValueError(
-                    f"{len(extra_fks)} distinct "
-                    f"{pk_fk_correspondences['key'][i]} values were"
-                    f" found in {pk_fk_correspondences['fk_table'][i]} "
-                    f"but missing from {pk_fk_correspondences['pk_table'][i]}."
-                    " All foreign keys must have a matching primary key.\n\n"
-                    f"Extra key are: {', '.join(extra_fks)}"
-                )
-
     def _find_underspecified_reactions_by_scids(
         self, sc_ids: Iterable[str]
     ) -> set[str]:
@@ -1640,6 +1600,88 @@ class SBML_dfs:
         s_ids = self._get_unused_species()
         self._remove_species(s_ids)
 
+    def _validate_identifiers(self):
+        """
+        Validate identifiers in the model
+
+        Iterates through all tables and checks if the identifier columns are valid.
+
+        Raises:
+            ValueError: missing identifiers in the table
+        """
+
+        SCHEMA = SBML_DFS_SCHEMA.SCHEMA
+        for table in SBML_DFS_SCHEMA.SCHEMA.keys():
+            if "id" not in SCHEMA[table].keys():
+                continue
+            id_series = self.get_table(table)[SCHEMA[table]["id"]]
+            if id_series.isna().sum() > 0:
+                missing_ids = id_series[id_series.isna()].index
+                raise ValueError(
+                    f"{table} has {len(missing_ids)} missing ids: {missing_ids}"
+                )
+
+    def _validate_pk_fk_correspondence(self):
+        """
+        Check whether primary keys and foreign keys agree for all tables in the schema.
+        Raises ValueError if any correspondence fails.
+        """
+
+        pk_df = pd.DataFrame(
+            [{"pk_table": k, "key": v["pk"]} for k, v in self.schema.items()]
+        )
+
+        fk_df = (
+            pd.DataFrame(
+                [
+                    {"fk_table": k, "fk": v["fk"]}
+                    for k, v in self.schema.items()
+                    if "fk" in v.keys()
+                ]
+            )
+            .set_index("fk_table")["fk"]
+            .apply(pd.Series)
+            .reset_index()
+            .melt(id_vars="fk_table")
+            .drop(["variable"], axis=1)
+            .rename(columns={"value": "key"})
+        )
+
+        pk_fk_correspondences = pk_df.merge(fk_df)
+
+        for i in range(0, pk_fk_correspondences.shape[0]):
+            pk_table_keys = set(
+                getattr(self, pk_fk_correspondences["pk_table"][i]).index.tolist()
+            )
+            if None in pk_table_keys:
+                raise ValueError(
+                    f"{pk_fk_correspondences['pk_table'][i]} had "
+                    "missing values in its index"
+                )
+
+            fk_table_keys = set(
+                getattr(self, pk_fk_correspondences["fk_table"][i]).loc[
+                    :, pk_fk_correspondences["key"][i]
+                ]
+            )
+            if None in fk_table_keys:
+                raise ValueError(
+                    f"{pk_fk_correspondences['fk_table'][i]} included "
+                    f"missing {pk_fk_correspondences['key'][i]} values"
+                )
+
+            # all foreign keys need to match a primary key
+            extra_fks = fk_table_keys.difference(pk_table_keys)
+            if len(extra_fks) != 0:
+                raise ValueError(
+                    f"{len(extra_fks)} distinct "
+                    f"{pk_fk_correspondences['key'][i]} values were"
+                    f" found in {pk_fk_correspondences['fk_table'][i]} "
+                    f"but missing from {pk_fk_correspondences['pk_table'][i]}."
+                    " All foreign keys must have a matching primary key.\n\n"
+                    f"Extra key are: {', '.join(extra_fks)}"
+                )
+
     def _validate_r_ids(self, r_ids: Optional[Union[str, list[str]]]) -> list[str]:
 
         if isinstance(r_ids, str):
@@ -1694,6 +1736,27 @@
         """
         sbml_dfs_utils._validate_matching_data(reactions_data_table, self.reactions)
 
+    def _validate_sources(self):
+        """
+        Validate sources in the model
+
+        Iterates through all tables and checks if the source columns are valid.
+
+        Raises:
+            ValueError: missing sources in the table
+        """
+
+        SCHEMA = SBML_DFS_SCHEMA.SCHEMA
+        for table in SBML_DFS_SCHEMA.SCHEMA.keys():
+            if "source" not in SCHEMA[table].keys():
+                continue
+            source_series = self.get_table(table)[SCHEMA[table]["source"]]
+            if source_series.isna().sum() > 0:
+                missing_sources = source_series[source_series.isna()].index
+                raise ValueError(
+                    f"{table} has {len(missing_sources)} missing sources: {missing_sources}"
+                )
+
     def _validate_species_data(self, species_data_table: pd.DataFrame):
         """Validates species data attribute
 
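
Illustrative usage of the new SBML_dfs.copy() method and the expanded validate() checks (an editor's sketch, not part of the diff; it assumes SBML_dfs is importable from napistu.sbml_dfs_core, as the file path suggests, and that you already have a model instance):

from napistu.sbml_dfs_core import SBML_dfs


def validate_working_copy(model: SBML_dfs) -> SBML_dfs:
    """Deep-copy a model and validate the copy, leaving the original untouched."""
    # copy() (new in 0.4.0) wraps copy.deepcopy, so edits to `working`
    # cannot leak back into `model`.
    working = model.copy()

    # validate() now also runs _validate_identifiers(), _validate_sources(),
    # and the renamed _validate_pk_fk_correspondence(); missing ids, missing
    # sources, or dangling foreign keys raise ValueError.
    working.validate()
    return working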
napistu/sbml_dfs_utils.py
CHANGED
@@ -27,6 +27,8 @@ from napistu.constants import MINI_SBO_FROM_NAME
 from napistu.constants import MINI_SBO_TO_NAME
 from napistu.constants import SBO_NAME_TO_ROLE
 from napistu.constants import ONTOLOGIES
+from napistu.constants import VALID_SBO_TERM_NAMES
+from napistu.constants import VALID_SBO_TERMS
 from napistu.ingestion.constants import VALID_COMPARTMENTS
 from napistu.ingestion.constants import COMPARTMENTS_GO_TERMS
 from napistu.ingestion.constants import GENERIC_COMPARTMENT
@@ -559,6 +561,10 @@ def unnest_identifiers(id_table: pd.DataFrame, id_var: str) -> pd.DataFrame:
 
     N_invalid_ids = sum(id_table[id_var].isna())
     if N_invalid_ids != 0:
+
+        print("Rows with missing identifiers:")
+        print(id_table.loc[id_table[id_var].isna(), id_var])
+
         raise ValueError(
             f'{N_invalid_ids} entries in "id_table" were missing',
             "entries with no identifiers should still include an Identifiers object",
@@ -1277,3 +1283,47 @@ def _validate_matching_data(data_table: pd.DataFrame, ref_table: pd.DataFrame):
             f"The data table was type {type(data_table).__name__}"
             " but must be a pd.DataFrame"
         )
+
+
+def _validate_sbo_values(sbo_series: pd.Series, validate: str = "names") -> None:
+    """
+    Validate SBO terms or names
+
+    Parameters
+    ----------
+    sbo_series : pd.Series
+        The SBO terms or names to validate.
+    validate : str, optional
+        Whether the values are SBO terms ("terms") or names ("names", default).
+
+    Returns
+    -------
+    None
+
+    Raises
+    ------
+    ValueError
+        If the validation type is invalid.
+    TypeError
+        If the invalid_counts is not a pandas DataFrame.
+    ValueError
+        If some reaction species have unusable SBO terms.
+    """
+
+    if validate == "terms":
+        valid_values = VALID_SBO_TERMS
+    elif validate == "names":
+        valid_values = VALID_SBO_TERM_NAMES
+    else:
+        raise ValueError(f"Invalid validation type: {validate}")
+
+    invalid_sbo_terms = sbo_series[~sbo_series.isin(valid_values)]
+
+    if invalid_sbo_terms.shape[0] != 0:
+        invalid_counts = invalid_sbo_terms.value_counts(sbo_series.name).to_frame("N")
+        if not isinstance(invalid_counts, pd.DataFrame):
+            raise TypeError("invalid_counts must be a pandas DataFrame")
+        print(invalid_counts)
+        raise ValueError("Some reaction species have unusable SBO terms")
+
+    return None
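
Illustrative call to the new _validate_sbo_values() helper (an editor's sketch, not part of the diff; it assumes VALID_SBO_TERM_NAMES is an importable collection of names, as the new import in this file suggests, and that the helper is reachable as napistu.sbml_dfs_utils._validate_sbo_values — note it is a private function):

import pandas as pd

from napistu import sbml_dfs_utils
from napistu.constants import VALID_SBO_TERM_NAMES

# A series mixing known SBO term names with one bogus entry.
sbo_names = pd.Series(
    list(VALID_SBO_TERM_NAMES)[:2] + ["not_a_real_sbo_name"], name="sbo_term_name"
)

try:
    # validate="names" checks against VALID_SBO_TERM_NAMES;
    # validate="terms" would check SBO identifiers against VALID_SBO_TERMS.
    sbml_dfs_utils._validate_sbo_values(sbo_names, validate="names")
except ValueError as err:
    # The helper prints a count table of the offending values before raising.
    print(err)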
napistu/utils.py
CHANGED
@@ -7,24 +7,24 @@ import logging
 import os
 import pickle
 import re
+import requests
 import shutil
 import urllib.request as request
 import zipfile
 from contextlib import closing
 from itertools import starmap
 from textwrap import fill
-from typing import Any
-from typing import Union
-from typing import Optional
-from typing import List
+from typing import Any, List, Optional, Union
 from urllib.parse import urlparse
-import
+from pathlib import Path
 from requests.adapters import HTTPAdapter
 from requests.adapters import Retry
 
 import igraph as ig
 import numpy as np
 import pandas as pd
+import pyarrow as pa
+import pyarrow.parquet as pq
 from fs import open_fs
 from fs.copy import copy_dir
 from fs.copy import copy_file
@@ -604,6 +604,81 @@ def load_json(uri: str) -> Any:
     return json.loads(txt)
 
 
+def save_parquet(
+    df: pd.DataFrame, uri: Union[str, Path], compression: str = "snappy"
+) -> None:
+    """
+    Write a DataFrame to a single Parquet file.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        The DataFrame to save
+    uri : Union[str, Path]
+        Path where to save the Parquet file. Can be a local path or a GCS URI.
+        Recommended extensions: .parquet or .pq
+    compression : str, default 'snappy'
+        Compression algorithm. Options: 'snappy', 'gzip', 'brotli', 'lz4', 'zstd'
+
+    Raises
+    ------
+    OSError
+        If the file cannot be written to (permission issues, etc.)
+    """
+
+    uri_str = str(uri)
+
+    # Warn about non-standard extensions
+    if not any(uri_str.endswith(ext) for ext in [".parquet", ".pq"]):
+        logger.warning(
+            f"File '{uri_str}' doesn't have a standard Parquet extension (.parquet or .pq)"
+        )
+
+    target_base, target_path = get_target_base_and_path(uri_str)
+
+    with open_fs(target_base, create=True) as target_fs:
+        with target_fs.openbin(target_path, "w") as f:
+            # Convert to Arrow table and write as single file
+            table = pa.Table.from_pandas(df)
+            pq.write_table(
+                table,
+                f,
+                compression=compression,
+                use_dictionary=True,  # Efficient for repeated values
+                write_statistics=True,  # Enables query optimization
+            )
+
+
+def load_parquet(uri: Union[str, Path]) -> pd.DataFrame:
+    """
+    Read a DataFrame from a Parquet file.
+
+    Parameters
+    ----------
+    uri : Union[str, Path]
+        Path to the Parquet file to load
+
+    Returns
+    -------
+    pd.DataFrame
+        The DataFrame loaded from the Parquet file
+
+    Raises
+    ------
+    FileNotFoundError
+        If the specified file does not exist
+    """
+    try:
+        target_base, target_path = get_target_base_and_path(str(uri))
+
+        with open_fs(target_base) as target_fs:
+            with target_fs.openbin(target_path, "r") as f:
+                return pd.read_parquet(f, engine="pyarrow")
+
+    except ResourceNotFound as e:
+        raise FileNotFoundError(f"File not found: {uri}") from e
+
+
 def extract_regex_search(regex: str, query: str, index_value: int = 0) -> str:
     """
     Match an identifier substring and otherwise throw an error
@@ -810,50 +885,15 @@ def drop_extra_cols(
     return df_out.loc[:, ordered_cols]
 
 
-def _merge_and_log_overwrites(
-    left_df: pd.DataFrame, right_df: pd.DataFrame, merge_context: str, **merge_kwargs
-) -> pd.DataFrame:
+def update_pathological_names(names: pd.Series, prefix: str) -> pd.Series:
     """
-    Merge two DataFrames and log any column overwrites.
-
-    Parameters
-    ----------
-    left_df : pd.DataFrame
-        Left DataFrame for merge
-    right_df : pd.DataFrame
-        Right DataFrame for merge
-    merge_context : str
-        Description of the merge operation for logging
-    **merge_kwargs : dict
-        Additional keyword arguments passed to pd.merge
+    Update pathological names in a pandas Series.
 
-    Returns
-    -------
-    pd.DataFrame
-        Merged DataFrame with overwritten columns removed
+    Add a prefix to the names if they are all numeric.
     """
-    # Track original columns
-    original_cols = left_df.columns.tolist()
-
-    # Ensure we're using the correct suffixes
-    merge_kwargs["suffixes"] = ("_old", "")
-
-    # Perform merge
-    merged_df = pd.merge(left_df, right_df, **merge_kwargs)
-
-    # Check for and log any overwritten columns
-    new_cols = merged_df.columns.tolist()
-    overwritten_cols = [col for col in original_cols if col + "_old" in new_cols]
-    if overwritten_cols:
-        logger.warning(
-            f"The following columns were overwritten during {merge_context} merge and their original values "
-            f"have been suffixed with '_old': {', '.join(overwritten_cols)}"
-        )
-        # Drop the old columns
-        cols_to_drop = [col + "_old" for col in overwritten_cols]
-        merged_df = merged_df.drop(columns=cols_to_drop)
-
-    return merged_df
+    if names.apply(lambda x: x.isdigit()).all():
+        names = names.apply(lambda x: f"{prefix}{x}")
+    return names
 
 
 def format_identifiers_as_edgelist(
@@ -1108,3 +1148,49 @@ def _add_nameness_score(df, name_var):
 
     df.loc[:, "nameness_score"] = df[name_var].apply(score_nameness)
     return df
+
+
+def _merge_and_log_overwrites(
+    left_df: pd.DataFrame, right_df: pd.DataFrame, merge_context: str, **merge_kwargs
+) -> pd.DataFrame:
+    """
+    Merge two DataFrames and log any column overwrites.
+
+    Parameters
+    ----------
+    left_df : pd.DataFrame
+        Left DataFrame for merge
+    right_df : pd.DataFrame
+        Right DataFrame for merge
+    merge_context : str
+        Description of the merge operation for logging
+    **merge_kwargs : dict
+        Additional keyword arguments passed to pd.merge
+
+    Returns
+    -------
+    pd.DataFrame
+        Merged DataFrame with overwritten columns removed
+    """
+    # Track original columns
+    original_cols = left_df.columns.tolist()
+
+    # Ensure we're using the correct suffixes
+    merge_kwargs["suffixes"] = ("_old", "")
+
+    # Perform merge
+    merged_df = pd.merge(left_df, right_df, **merge_kwargs)
+
+    # Check for and log any overwritten columns
+    new_cols = merged_df.columns.tolist()
+    overwritten_cols = [col for col in original_cols if col + "_old" in new_cols]
+    if overwritten_cols:
+        logger.warning(
+            f"The following columns were overwritten during {merge_context} merge and their original values "
+            f"have been suffixed with '_old': {', '.join(overwritten_cols)}"
+        )
+        # Drop the old columns
+        cols_to_drop = [col + "_old" for col in overwritten_cols]
+        merged_df = merged_df.drop(columns=cols_to_drop)
+
+    return merged_df
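
Illustrative round trip through the new Parquet helpers, plus the added update_pathological_names() (an editor's sketch, not part of the diff; it assumes the functions are reachable as napistu.utils.save_parquet, napistu.utils.load_parquet, and napistu.utils.update_pathological_names, and it writes to a throwaway local path):

import pandas as pd

from napistu import utils

# save_parquet() writes a single Parquet file (snappy compression by default)
# to a local path or GCS URI; load_parquet() reads it back and raises
# FileNotFoundError if the path does not exist.
df = pd.DataFrame({"s_id": ["S00000001", "S00000002"], "score": [0.9, 0.1]})
utils.save_parquet(df, "/tmp/example_scores.parquet")
restored = utils.load_parquet("/tmp/example_scores.parquet")
print(restored.equals(df))  # expected: True

# update_pathological_names() (added to utils in this diff) prefixes a Series
# of names when every entry is purely numeric, so labels stay readable.
names = pd.Series(["1001", "1002", "1003"])
print(utils.update_pathological_names(names, prefix="node_").tolist())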
{napistu-0.3.6.dist-info → napistu-0.4.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: napistu
-Version: 0.3.6
+Version: 0.4.0
 Summary: Connecting high-dimensional data to curated pathways
 Home-page: https://github.com/napistu/napistu-py
 Author: Sean Hackett
@@ -27,6 +27,7 @@ Requires-Dist: mygene<4.0.0,>=3.0.0
 Requires-Dist: numpy<3.0.0,>=1.24.0
 Requires-Dist: pandas<3.0.0,>=1.5.0
 Requires-Dist: pydantic<3.0.0,>=2.0.0
+Requires-Dist: pyarrow<20.0.0,>=15.0.0
 Requires-Dist: python-libsbml
 Requires-Dist: requests>=2.25.0
 Requires-Dist: scipy<2.0.0,>=1.10.0
@@ -51,7 +52,6 @@ Requires-Dist: markdown>=3.4.0; extra == "mcp"
 Requires-Dist: jupyter-client>=7.0.0; extra == "mcp"
 Requires-Dist: nbformat>=5.0.0; extra == "mcp"
 Provides-Extra: rpy2
-Requires-Dist: pyarrow<19.0.0,>=15.0.0; extra == "rpy2"
 Requires-Dist: rpy2<4.0.0,>=3.5.0; extra == "rpy2"
 Requires-Dist: rpy2-arrow<1.0.0,>=0.1.0; extra == "rpy2"
 Provides-Extra: scverse