napistu 0.1.0__py3-none-any.whl → 0.2.4.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/__init__.py +1 -1
- napistu/consensus.py +1010 -513
- napistu/constants.py +24 -0
- napistu/gcs/constants.py +2 -2
- napistu/gcs/downloads.py +57 -25
- napistu/gcs/utils.py +21 -0
- napistu/identifiers.py +105 -6
- napistu/ingestion/constants.py +0 -1
- napistu/ingestion/obo.py +24 -8
- napistu/ingestion/psi_mi.py +20 -5
- napistu/ingestion/reactome.py +8 -32
- napistu/mcp/__init__.py +69 -0
- napistu/mcp/__main__.py +180 -0
- napistu/mcp/codebase.py +182 -0
- napistu/mcp/codebase_utils.py +298 -0
- napistu/mcp/constants.py +72 -0
- napistu/mcp/documentation.py +166 -0
- napistu/mcp/documentation_utils.py +235 -0
- napistu/mcp/execution.py +382 -0
- napistu/mcp/profiles.py +73 -0
- napistu/mcp/server.py +86 -0
- napistu/mcp/tutorials.py +124 -0
- napistu/mcp/tutorials_utils.py +230 -0
- napistu/mcp/utils.py +47 -0
- napistu/mechanism_matching.py +782 -26
- napistu/modify/constants.py +41 -0
- napistu/modify/curation.py +4 -1
- napistu/modify/gaps.py +243 -156
- napistu/modify/pathwayannot.py +26 -8
- napistu/network/neighborhoods.py +16 -7
- napistu/network/net_create.py +209 -54
- napistu/network/net_propagation.py +118 -0
- napistu/network/net_utils.py +1 -32
- napistu/rpy2/netcontextr.py +10 -7
- napistu/rpy2/rids.py +7 -5
- napistu/sbml_dfs_core.py +46 -29
- napistu/sbml_dfs_utils.py +37 -1
- napistu/source.py +8 -2
- napistu/utils.py +67 -8
- napistu-0.2.4.dev2.dist-info/METADATA +84 -0
- napistu-0.2.4.dev2.dist-info/RECORD +95 -0
- {napistu-0.1.0.dist-info → napistu-0.2.4.dev2.dist-info}/WHEEL +1 -1
- tests/conftest.py +11 -5
- tests/test_consensus.py +4 -1
- tests/test_gaps.py +127 -0
- tests/test_gcs.py +3 -2
- tests/test_igraph.py +14 -0
- tests/test_mcp_documentation_utils.py +13 -0
- tests/test_mechanism_matching.py +658 -0
- tests/test_net_propagation.py +89 -0
- tests/test_net_utils.py +83 -0
- tests/test_sbml.py +2 -0
- tests/{test_sbml_dfs_create.py → test_sbml_dfs_core.py} +68 -4
- tests/test_utils.py +81 -0
- napistu-0.1.0.dist-info/METADATA +0 -56
- napistu-0.1.0.dist-info/RECORD +0 -77
- {napistu-0.1.0.dist-info → napistu-0.2.4.dev2.dist-info}/entry_points.txt +0 -0
- {napistu-0.1.0.dist-info → napistu-0.2.4.dev2.dist-info}/licenses/LICENSE +0 -0
- {napistu-0.1.0.dist-info → napistu-0.2.4.dev2.dist-info}/top_level.txt +0 -0
napistu/constants.py
CHANGED
@@ -7,6 +7,20 @@ import libsbml
 from types import SimpleNamespace
 import pandas as pd
 
+
+PACKAGE_DEFS = SimpleNamespace(
+    NAPISTU="napistu",
+    GITHUB_OWNER="napistu",
+    GITHUB_PROJECT_REPO="napistu",
+    GITHUB_NAPISTU_PY="napistu-py",
+    GITHUB_NAPISTU_R="napistu-r",
+    TUTORIALS_URL="https://github.com/napistu/napistu/wiki",
+    # User-facing functionality should use a user-defined directory but
+    # for convenience, we provide a default cache directory for dev-facing
+    # workflows
+    CACHE_DIR="napistu_data",
+)
+
 PROTEINATLAS_SUBCELL_LOC_URL = (
     "https://www.proteinatlas.org/download/tsv/subcellular_location.tsv.zip"
 )
@@ -332,6 +346,14 @@ CPR_EDGELIST_REQ_VARS = {
 
 CPR_PATH_REQ_VARS = {CPR_EDGELIST.SC_ID_ORIGIN, CPR_EDGELIST.SC_ID_DEST}
 
+FEATURE_ID_VAR_DEFAULT = "feature_id"
+
+RESOLVE_MATCHES_AGGREGATORS = SimpleNamespace(
+    WEIGHTED_MEAN="weighted_mean", MEAN="mean", FIRST="first", MAX="max"
+)
+
+RESOLVE_MATCHES_TMP_WEIGHT_COL = "__tmp_weight_for_aggregation__"
+
 # specifying weighting schemes schema
 
 DEFAULT_WT_TRANS = "identity"
@@ -389,6 +411,8 @@ ONTOLOGIES = SimpleNamespace(
     UNIPROT="uniprot",
 )
 
+ONTOLOGIES_LIST = list(ONTOLOGIES.__dict__.values())
+
 CHARACTERISTIC_COMPLEX_ONTOLOGIES = [
     ONTOLOGIES.ENSEMBL_GENE,
     ONTOLOGIES.NCBI_ENTREZ_GENE,
napistu/gcs/constants.py
CHANGED
@@ -31,7 +31,7 @@ GCS_ASSETS = SimpleNamespace(
                 GCS_SUBASSET_NAMES.REGULATORY_GRAPH: GCS_FILETYPES.REGULATORY_GRAPH,
                 GCS_SUBASSET_NAMES.REGULATORY_DISTANCES: GCS_FILETYPES.REGULATORY_DISTANCES,
             },
-            "public_url": "https://storage.googleapis.com/
+            "public_url": "https://storage.googleapis.com/shackett-napistu-public/test_pathway.tar.gz",
         },
         "human_consensus": {
             "file": "human_consensus.tar.gz",
@@ -40,7 +40,7 @@ GCS_ASSETS = SimpleNamespace(
                 GCS_SUBASSET_NAMES.IDENTIFIERS: GCS_FILETYPES.IDENTIFIERS,
                 GCS_SUBASSET_NAMES.REGULATORY_GRAPH: GCS_FILETYPES.REGULATORY_GRAPH,
             },
-            "public_url": "https://storage.googleapis.com/
+            "public_url": "https://storage.googleapis.com/shackett-napistu-public/human_consensus.tar.gz",
         },
         "human_consensus_w_distances": {
             "file": "human_consensus_w_distances.tar.gz",
napistu/gcs/downloads.py
CHANGED
@@ -2,26 +2,29 @@ from __future__ import annotations
 
 import logging
 import os
-import pathlib
 import re
+import shutil
 from pydantic import BaseModel
 from typing import Optional
 
 from napistu import utils
 from napistu.gcs.constants import GCS_ASSETS
+
 from napistu.gcs.constants import INIT_DATA_DIR_MSG
+from napistu.gcs.utils import _initialize_data_dir
 
 logger = logging.getLogger(__name__)
 
 
-def load_public_cpr_asset(
+def load_public_napistu_asset(
     asset: str,
     data_dir: str,
     subasset: str | None = None,
     init_msg: str = INIT_DATA_DIR_MSG,
+    overwrite: bool = False,
 ) -> str:
     """
-    Load Public
+    Load Public Napistu Asset
 
     Download the `asset` asset to `data_dir` if it doesn't
     already exist and return a path
@@ -30,6 +33,7 @@ def load_public_cpr_asset(
         subasset: the name of a subasset to load from within the asset bundle
         data_dir: the local directory where assets should be stored
         init_msg: message to display if data_dir does not exist
+        overwrite: if True, always download the asset and re-extract it, even if it already exists
 
     returns:
         asset_path: the path to a local file
@@ -42,14 +46,16 @@ def load_public_cpr_asset(
 
     # get the path for the asset (which may have been downloaded in a tar-ball)
     asset_path = os.path.join(data_dir, _get_gcs_asset_path(asset, subasset))
-    if os.path.isfile(asset_path):
+    if os.path.isfile(asset_path) and not overwrite:
         return asset_path
 
     download_path = os.path.join(
         data_dir, os.path.basename(GCS_ASSETS.ASSETS[asset]["file"])
     )
+    if overwrite:
+        _remove_asset_files_if_needed(asset, data_dir)
     if not os.path.isfile(download_path):
-
+        download_public_napistu_asset(asset, download_path)
 
     # gunzip if needed
     extn = utils.get_extn_from_url(download_path)
@@ -70,12 +76,12 @@ def load_public_cpr_asset(
     return asset_path
 
 
-def download_public_cpr_asset(asset: str, out_path: str) -> None:
+def download_public_napistu_asset(asset: str, out_path: str) -> None:
     """
-    Download Public
+    Download Public Napistu Asset
 
     Args:
-        asset (str): The name of a
+        asset (str): The name of a Napistu public asset stored in Google Cloud Storage (GCS)
         out_path (list): Local location where the file should be saved.
 
     Returns:
@@ -86,22 +92,12 @@ def download_public_cpr_asset(asset: str, out_path: str) -> None:
     selected_file = GCS_ASSETS.ASSETS[asset]["public_url"]
 
     logger.info(f"Downloading {os.path.basename(selected_file)} to {out_path}")
+    logger.info(f"Download URI: {selected_file}")
 
     utils.download_wget(selected_file, out_path)
 
-
-
-
-def _initialize_data_dir(data_dir: str, init_msg: str = INIT_DATA_DIR_MSG) -> None:
-    """Create a data directory if it doesn't exist."""
-
-    if not os.path.isdir(data_dir):
-
-        logger.warning(INIT_DATA_DIR_MSG.format(data_dir=data_dir))
-
-        # Artifact directory not found; creating {parentdir}")
-        logger.warning(f"Trying to create {data_dir}")
-        pathlib.Path(data_dir).mkdir(parents=True, exist_ok=True)
+    if not os.path.isfile(out_path):
+        raise FileNotFoundError(f"Download failed: {out_path} was not created.")
 
     return None
 
@@ -109,7 +105,7 @@ def _initialize_data_dir(data_dir: str, init_msg: str = INIT_DATA_DIR_MSG) -> None:
 def _validate_gcs_asset(asset: str) -> None:
     """Validate a GCS asset by name."""
 
-    assets =
+    assets = _NapistuAssetsValidator(assets=GCS_ASSETS.ASSETS).assets
     valid_gcs_assets = assets.keys()
     if asset not in valid_gcs_assets:
         raise ValueError(
@@ -170,11 +166,47 @@ def _get_gcs_asset_path(asset: str, subasset: Optional[str] = None) -> str:
     return out_file
 
 
-class
+class _NapistuAssetValidator(BaseModel):
     file: str
     subassets: dict[str, str] | None
     public_url: str
 
 
-class
-    assets: dict[str,
+class _NapistuAssetsValidator(BaseModel):
+    assets: dict[str, _NapistuAssetValidator]
+
+
+def _remove_asset_files_if_needed(asset: str, data_dir: str):
+    """
+    Remove asset archive and any extracted directory from data_dir.
+
+    Args:
+        asset (str): The asset key (e.g., 'test_pathway').
+        data_dir (str): The directory where assets are stored.
+    """
+    logger = logging.getLogger(__name__)
+    removed = []
+
+    # Remove the archive file (any extension)
+    archive_filename = os.path.basename(GCS_ASSETS.ASSETS[asset]["file"])
+    archive_path = os.path.join(data_dir, archive_filename)
+    if os.path.exists(archive_path):
+        os.remove(archive_path)
+        logger.info(f"Removed asset archive: {archive_path}")
+        removed.append(archive_path)
+
+    # Remove extracted directory (if any)
+    asset_dict = GCS_ASSETS.ASSETS[asset]
+    if asset_dict.get("subassets") is not None or any(
+        archive_filename.endswith(ext) for ext in [".tar.gz", ".tgz", ".zip", ".gz"]
+    ):
+        extract_dir = os.path.join(data_dir, archive_filename.split(".")[0])
+        if os.path.isdir(extract_dir):
+            shutil.rmtree(extract_dir)
+            logger.info(f"Removed extracted asset directory: {extract_dir}")
+            removed.append(extract_dir)
+
+    if not removed:
+        logger.debug("No asset files found to remove.")
+
+    return removed
napistu/gcs/utils.py
ADDED
@@ -0,0 +1,21 @@
+import logging
+import os
+import pathlib
+
+from napistu.gcs.constants import INIT_DATA_DIR_MSG
+
+logger = logging.getLogger(__name__)
+
+
+def _initialize_data_dir(data_dir: str, init_msg: str = INIT_DATA_DIR_MSG) -> None:
+    """Create a data directory if it doesn't exist."""
+
+    if not os.path.isdir(data_dir):
+
+        logger.warning(init_msg.format(data_dir=data_dir))
+
+        # Artifact directory not found; creating {parentdir}")
+        logger.warning(f"Trying to create {data_dir}")
+        pathlib.Path(data_dir).mkdir(parents=True, exist_ok=True)
+
+    return None
napistu/identifiers.py
CHANGED
@@ -9,15 +9,19 @@ from urllib.parse import urlparse
 
 import libsbml
 import pandas as pd
-from napistu import utils
 from pydantic import BaseModel
 
+from napistu import sbml_dfs_core
+from napistu import sbml_dfs_utils
+from napistu import utils
+
 from napistu.constants import IDENTIFIERS
 from napistu.constants import BIOLOGICAL_QUALIFIER_CODES
 from napistu.constants import ENSEMBL_MOLECULE_TYPES_TO_ONTOLOGY
 from napistu.constants import ENSEMBL_MOLECULE_TYPES_FROM_ONTOLOGY
 from napistu.constants import ENSEMBL_SPECIES_FROM_CODE
 from napistu.constants import ENSEMBL_SPECIES_TO_CODE
+from napistu.constants import SPECIES_IDENTIFIERS_REQUIRED_VARS
 
 logger = logging.getLogger(__name__)
 
@@ -225,24 +229,37 @@ def format_uri_url(uri: str) -> dict:
     elif netloc == "www.ensembl.org" and split_path[-1] == "geneview":
         ontology = "ensembl_gene"
         identifier, id_ontology, _ = parse_ensembl_id(result.query)  # type: ignore
-
+        if ontology != id_ontology:
+            raise ValueError(
+                f"Ontology mismatch: expected {ontology}, got {id_ontology}"
+            )
     elif netloc == "www.ensembl.org" and split_path[-1] in [
         "transview",
         "Transcript",
     ]:
         ontology = "ensembl_transcript"
         identifier, id_ontology, _ = parse_ensembl_id(result.query)  # type: ignore
-
+        if ontology != id_ontology:
+            raise ValueError(
+                f"Ontology mismatch: expected {ontology}, got {id_ontology}"
+            )
     elif netloc == "www.ensembl.org" and split_path[-1] == "ProteinSummary":
         ontology = "ensembl_protein"
         identifier, id_ontology, _ = parse_ensembl_id(result.query)  # type: ignore
-
+        if ontology != id_ontology:
+            raise ValueError(
+                f"Ontology mismatch: expected {ontology}, got {id_ontology}"
+            )
     elif netloc == "www.ensembl.org" and (
         re.search("ENS[GTP]", split_path[-1])
         or re.search("ENS[A-Z]{3}[GTP]", split_path[-1])
     ):
         # format ensembl IDs which lack gene/transview
-        identifier,
+        identifier, implied_ontology, _ = parse_ensembl_id(split_path[-1])
+        if implied_ontology != ontology:
+            raise ValueError(
+                f"Implied ontology mismatch: expected {ontology}, got {implied_ontology}"
+            )
     elif netloc == "www.mirbase.org" or netloc == "mirbase.org":
         ontology = "mirbase"
         if re.search("MI[0-9]+", split_path[-1]):
@@ -676,7 +693,10 @@ def ensembl_id_to_url_regex(identifier: str, ontology: str) -> tuple[str, str]:
     # extract the species name from the 3 letter species code in the id
     # (these letters are not present for humans)
     identifier, implied_ontology, species = parse_ensembl_id(identifier)  # type: ignore
-
+    if implied_ontology != ontology:
+        raise ValueError(
+            f"Implied ontology mismatch: expected {ontology}, got {implied_ontology}"
+        )
 
     # create an appropriate regex for validating input
     # this provides testing for other identifiers even if it is redundant with other
@@ -794,6 +814,85 @@ def _format_Identifiers_pubmed(pubmed_id: str) -> Identifiers:
     return Identifiers([id_entry])
 
 
+def _check_species_identifiers_table(
+    species_identifiers: pd.DataFrame,
+    required_vars: set = SPECIES_IDENTIFIERS_REQUIRED_VARS,
+):
+    missing_required_vars = required_vars.difference(
+        set(species_identifiers.columns.tolist())
+    )
+    if len(missing_required_vars) > 0:
+        raise ValueError(
+            f"{len(missing_required_vars)} required variables "
+            "were missing from the species_identifiers table: "
+            f"{', '.join(missing_required_vars)}"
+        )
+
+    return None
+
+
+def _prepare_species_identifiers(
+    sbml_dfs: sbml_dfs_core.SBML_dfs,
+    dogmatic: bool = False,
+    species_identifiers: Optional[pd.DataFrame] = None,
+) -> pd.DataFrame:
+    """Accepts and validates species_identifiers, or extracts a fresh table if None."""
+
+    if species_identifiers is None:
+        species_identifiers = sbml_dfs_utils.get_characteristic_species_ids(
+            sbml_dfs, dogmatic=dogmatic
+        )
+    else:
+        # check for compatibility
+        try:
+            # check species_identifiers format
+
+            _check_species_identifiers_table(species_identifiers)
+            # quick check for compatibility between sbml_dfs and species_identifiers
+            _validate_assets_sbml_ids(sbml_dfs, species_identifiers)
+        except ValueError as e:
+            logger.warning(
+                f"The provided identifiers are not compatible with your `sbml_dfs` object. Extracting a fresh species identifier table. {e}"
+            )
+            species_identifiers = sbml_dfs_utils.get_characteristic_species_ids(
+                sbml_dfs, dogmatic=dogmatic
+            )
+
+    return species_identifiers
+
+
+def _validate_assets_sbml_ids(
+    sbml_dfs: sbml_dfs_core.SBML_dfs, identifiers_df: pd.DataFrame
+) -> None:
+    """Check an sbml_dfs file and identifiers table for inconsistencies."""
+
+    joined_species_w_ids = sbml_dfs.species.merge(
+        identifiers_df[["s_id", "s_name"]].drop_duplicates(),
+        left_index=True,
+        right_on="s_id",
+    )
+
+    inconsistent_names_df = joined_species_w_ids.query("s_name_x != s_name_y").dropna()
+    inconsistent_names_list = [
+        f"{x} != {y}"
+        for x, y in zip(
+            inconsistent_names_df["s_name_x"], inconsistent_names_df["s_name_y"]
+        )
+    ]
+
+    if len(inconsistent_names_list):
+        example_inconsistent_names = inconsistent_names_list[
+            0 : min(10, len(inconsistent_names_list))
+        ]
+
+        raise ValueError(
+            f"{len(inconsistent_names_list)} species names do not match between "
+            f"sbml_dfs and identifiers_df including: {', '.join(example_inconsistent_names)}"
+        )
+
+    return None
+
+
 class _IdentifierValidator(BaseModel):
     ontology: str
     identifier: str
napistu/ingestion/constants.py
CHANGED
@@ -196,7 +196,6 @@ PSI_MI_INTACT_SPECIES_TO_BASENAME = {
 
 
 # REACTOME
-REACTOME_SBGN_URL = "https://reactome.org/download/current/homo_sapiens.sbgn.tar.gz"
 REACTOME_SMBL_URL = "https://reactome.org/download/current/all_species.3.1.sbml.tgz"
 REACTOME_PATHWAYS_URL = "https://reactome.org/download/current/ReactomePathways.txt"
 REACTOME_PATHWAY_INDEX_COLUMNS = ["file", "source", "species", "pathway_id", "name"]
napistu/ingestion/obo.py
CHANGED
@@ -34,8 +34,14 @@ def create_go_parents_df(go_basic_obo_df: pd.DataFrame) -> pd.DataFrame:
     cc_parents = go_basic_obo_df.query("namespace == 'cellular_component'")["is_a"]
 
     # this is currently at 4496 rows - this is expected to slowly increase
-
-
+    if cc_parents.shape[0] < 4496:
+        raise ValueError(
+            f"Expected at least 4496 rows in cc_parents, got {cc_parents.shape[0]}"
+        )
+    if cc_parents.shape[0] >= 5000:
+        raise ValueError(
+            f"Expected fewer than 5000 rows in cc_parents, got {cc_parents.shape[0]}"
+        )
 
     # convert from a list of strings to a list of dicts then expand so each
     # dict is its own row
@@ -48,8 +54,14 @@ def create_go_parents_df(go_basic_obo_df: pd.DataFrame) -> pd.DataFrame:
     go_parents_df["child_id"] = parent_entries.index
 
     # currently at 4688 rows - this may increase or decrease but will do so slowly
-
-
+    if go_parents_df.shape[0] <= 4600:
+        raise ValueError(
+            f"Expected more than 4600 rows in go_parents_df, got {go_parents_df.shape[0]}"
+        )
+    if go_parents_df.shape[0] >= 5000:
+        raise ValueError(
+            f"Expected fewer than 5000 rows in go_parents_df, got {go_parents_df.shape[0]}"
+        )
 
     return go_parents_df
 
@@ -187,8 +199,10 @@ def create_parent_child_graph(go_parents_df: pd.DataFrame) -> ig.Graph:
     )
 
     # is it a fully connected DAG as expected?
-
-
+    if not parent_child_graph.is_dag():
+        raise ValueError("parent_child_graph is not a DAG as expected")
+    if not parent_child_graph.is_connected("weak"):
+        raise ValueError("parent_child_graph is not weakly connected as expected")
 
     return parent_child_graph
 
@@ -243,8 +257,10 @@ def _isa_str_list_to_dict_list(isa_list: list) -> list[dict[str, Any]]:
 
     isa_dict_list = list()
     for split_val in split_vals:
-
-
+        if len(split_val) != 2:
+            raise ValueError(
+                f"Expected tuple of length 2, got {len(split_val)}: {split_val}"
+            )
         isa_dict_list.append({"parent_id": split_val[0], "parent_name": split_val[1]})
 
     return isa_dict_list
napistu/ingestion/psi_mi.py
CHANGED
@@ -44,7 +44,10 @@ def format_psi(
 
     # the root should be an entrySet if this is a PSI 3.0 file
     entry_set = et.getroot()
-
+    if entry_set.tag != PSI_MI_INTACT_XML_NAMESPACE + "entrySet":
+        raise ValueError(
+            f"Expected root tag to be {PSI_MI_INTACT_XML_NAMESPACE + 'entrySet'}, got {entry_set.tag}"
+        )
 
     entry_nodes = entry_set.findall(f"./{PSI_MI_INTACT_XML_NAMESPACE}entry")
 
@@ -97,7 +100,10 @@ def _download_intact_species(
 def _format_entry(an_entry) -> dict[str, Any]:
     """Extract a single XML entry of interactors and interactions."""
 
-
+    if an_entry.tag != PSI_MI_INTACT_XML_NAMESPACE + "entry":
+        raise ValueError(
+            f"Expected entry tag to be {PSI_MI_INTACT_XML_NAMESPACE + 'entry'}, got {an_entry.tag}"
+        )
 
     entry_dict = {
         "source": _format_entry_source(an_entry),
@@ -169,7 +175,10 @@ def _format_entry_interactor_list(an_entry) -> list[dict[str, Any]]:
 def _format_entry_interactor(interactor) -> dict[str, Any]:
     """Format a single molecular interactor in an interaction list XML node."""
 
-
+    if interactor.tag != PSI_MI_INTACT_XML_NAMESPACE + "interactor":
+        raise ValueError(
+            f"Expected interactor tag to be {PSI_MI_INTACT_XML_NAMESPACE + 'interactor'}, got {interactor.tag}"
+        )
 
     # optional full name
     interactor_name_node = interactor.find(
@@ -238,7 +247,10 @@ def _format_entry_interactions(an_entry) -> list[dict[str, Any]]:
 def _format_entry_interaction(interaction) -> dict[str, Any]:
     """Format a single interaction in an XML interaction list."""
 
-
+    if interaction.tag != PSI_MI_INTACT_XML_NAMESPACE + "interaction":
+        raise ValueError(
+            f"Expected interaction tag to be {PSI_MI_INTACT_XML_NAMESPACE + 'interaction'}, got {interaction.tag}"
+        )
 
     interaction_name = interaction.find(
         f"./{PSI_MI_INTACT_XML_NAMESPACE}names/{PSI_MI_INTACT_XML_NAMESPACE}shortLabel"
@@ -260,7 +272,10 @@ def _format_entry_interaction(interaction) -> dict[str, Any]:
 def _format_entry_interaction_participants(interaction_participant) -> dict[str, str]:
     """Format the participants in an XML interaction."""
 
-
+    if interaction_participant.tag != PSI_MI_INTACT_XML_NAMESPACE + "participant":
+        raise ValueError(
+            f"Expected participant tag to be {PSI_MI_INTACT_XML_NAMESPACE + 'participant'}, got {interaction_participant.tag}"
+        )
 
     out = {
         "interactor_id": interaction_participant.attrib["id"],
napistu/ingestion/reactome.py
CHANGED
@@ -17,42 +17,12 @@ from napistu.consensus import construct_sbml_dfs_dict
 from napistu.ingestion.constants import REACTOME_PATHWAY_INDEX_COLUMNS
 from napistu.ingestion.constants import REACTOME_PATHWAY_LIST_COLUMNS
 from napistu.ingestion.constants import REACTOME_PATHWAYS_URL
-from napistu.ingestion.constants import REACTOME_SBGN_URL
 from napistu.ingestion.constants import REACTOME_SMBL_URL
-from napistu.ingestion.constants import SPECIES_FULL_NAME_HUMAN
 from fs import open_fs
 
 logger = logging.getLogger(__name__)
 
 
-def reactome_sbgn_download(output_dir_path: str, overwrite: bool = False):
-    """
-    Reactome SBGN Download
-
-    Download all human Reactome SBGN (systems biology graphical notation) files.
-
-    Args:
-        output_dir_path (str): Paths to a directory where .sbgn files should be saved.
-        overwrite (bool): Overwrite an existing output directory.
-    """
-    utils.download_and_extract(
-        REACTOME_SBGN_URL,
-        output_dir_path=output_dir_path,
-        overwrite=overwrite,
-    )
-    # create the pathway index
-    pw_index = _build_reactome_pw_index(
-        output_dir_path,
-        file_ext="sbgn",
-        # For sbgn only homo sapiens files are available
-        species_filter=(SPECIES_FULL_NAME_HUMAN,),
-    )
-    # save as tsv
-    out_fs = open_fs(output_dir_path)
-    with out_fs.open("pw_index.tsv", "wb") as index_path:
-        pw_index.to_csv(index_path, sep="\t", index=False)
-
-
 def reactome_sbml_download(output_dir_path: str, overwrite: bool = False):
     """
     Reactome SBML Download
@@ -164,8 +134,14 @@ def _check_reactome_pw_index(pw_index: indices.PWIndex, reactome_pathway_list: l
 
     # check extension in pw_index
     extn = set([os.path.splitext(x)[1] for x in pw_index["file"]])
-
-
+    if len(extn) != 1:
+        raise ValueError(
+            f"Expected all files to have the same extension, but found extensions: {extn}"
+        )
+    if len(extn.intersection({".sbml"})) != 1:
+        raise ValueError(
+            f"Expected all files to have the .sbml extension, but found: {extn}"
+        )
     extn_string = extn.pop()
 
     local_reactome_pws = set(pw_index["pathway_id"])
napistu/mcp/__init__.py
ADDED
@@ -0,0 +1,69 @@
+"""
+MCP (Model Context Protocol) Server for Napistu.
+
+This module requires optional dependencies. Install with:
+    pip install napistu[mcp]
+"""
+
+import asyncio
+from typing import Dict, Any
+
+__all__ = ["start_server", "register_object", "is_available"]
+
+# Check if MCP dependencies are available
+try:
+    __import__("mcp")
+    is_available = True
+except ImportError:
+    is_available = False
+
+if is_available:
+    from .server import create_server
+    from .profiles import get_profile
+
+    def start_server(profile_name: str = "local", **kwargs) -> Dict[str, Any]:
+        """
+        Start an MCP server with a specific profile.
+
+        Args:
+            profile_name: Name of the profile ('local', 'remote', or 'full')
+            **kwargs: Additional configuration options
+
+        Returns:
+            Server control dictionary
+        """
+        profile = get_profile(profile_name, **kwargs)
+        server = create_server(profile)
+
+        # Start the server
+        asyncio.create_task(server.start())
+
+        # Return control interface
+        return {
+            "status": "running",
+            "server": server,
+            "profile": profile_name,
+            "stop": server.stop,
+            "register_object": (
+                register_object if profile.get_config()["enable_execution"] else None
+            ),
+        }
+
+    # Helper function for registering objects with a running server
+    def register_object(name, obj):
+        """Register an object with the execution component."""
+        from .execution import register_object as _register
+
+        return _register(name, obj)
+
+else:
+    # Stubs for when MCP is not available
+    def start_server(*args, **kwargs):
+        raise ImportError(
+            "MCP support not installed. Install with 'pip install napistu[mcp]'"
+        )
+
+    def register_object(*args, **kwargs):
+        raise ImportError(
+            "MCP support not installed. Install with 'pip install napistu[mcp]'"
+        )