napistu 0.1.0__py3-none-any.whl → 0.2.4.dev2__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
Files changed (59)
  1. napistu/__init__.py +1 -1
  2. napistu/consensus.py +1010 -513
  3. napistu/constants.py +24 -0
  4. napistu/gcs/constants.py +2 -2
  5. napistu/gcs/downloads.py +57 -25
  6. napistu/gcs/utils.py +21 -0
  7. napistu/identifiers.py +105 -6
  8. napistu/ingestion/constants.py +0 -1
  9. napistu/ingestion/obo.py +24 -8
  10. napistu/ingestion/psi_mi.py +20 -5
  11. napistu/ingestion/reactome.py +8 -32
  12. napistu/mcp/__init__.py +69 -0
  13. napistu/mcp/__main__.py +180 -0
  14. napistu/mcp/codebase.py +182 -0
  15. napistu/mcp/codebase_utils.py +298 -0
  16. napistu/mcp/constants.py +72 -0
  17. napistu/mcp/documentation.py +166 -0
  18. napistu/mcp/documentation_utils.py +235 -0
  19. napistu/mcp/execution.py +382 -0
  20. napistu/mcp/profiles.py +73 -0
  21. napistu/mcp/server.py +86 -0
  22. napistu/mcp/tutorials.py +124 -0
  23. napistu/mcp/tutorials_utils.py +230 -0
  24. napistu/mcp/utils.py +47 -0
  25. napistu/mechanism_matching.py +782 -26
  26. napistu/modify/constants.py +41 -0
  27. napistu/modify/curation.py +4 -1
  28. napistu/modify/gaps.py +243 -156
  29. napistu/modify/pathwayannot.py +26 -8
  30. napistu/network/neighborhoods.py +16 -7
  31. napistu/network/net_create.py +209 -54
  32. napistu/network/net_propagation.py +118 -0
  33. napistu/network/net_utils.py +1 -32
  34. napistu/rpy2/netcontextr.py +10 -7
  35. napistu/rpy2/rids.py +7 -5
  36. napistu/sbml_dfs_core.py +46 -29
  37. napistu/sbml_dfs_utils.py +37 -1
  38. napistu/source.py +8 -2
  39. napistu/utils.py +67 -8
  40. napistu-0.2.4.dev2.dist-info/METADATA +84 -0
  41. napistu-0.2.4.dev2.dist-info/RECORD +95 -0
  42. {napistu-0.1.0.dist-info → napistu-0.2.4.dev2.dist-info}/WHEEL +1 -1
  43. tests/conftest.py +11 -5
  44. tests/test_consensus.py +4 -1
  45. tests/test_gaps.py +127 -0
  46. tests/test_gcs.py +3 -2
  47. tests/test_igraph.py +14 -0
  48. tests/test_mcp_documentation_utils.py +13 -0
  49. tests/test_mechanism_matching.py +658 -0
  50. tests/test_net_propagation.py +89 -0
  51. tests/test_net_utils.py +83 -0
  52. tests/test_sbml.py +2 -0
  53. tests/{test_sbml_dfs_create.py → test_sbml_dfs_core.py} +68 -4
  54. tests/test_utils.py +81 -0
  55. napistu-0.1.0.dist-info/METADATA +0 -56
  56. napistu-0.1.0.dist-info/RECORD +0 -77
  57. {napistu-0.1.0.dist-info → napistu-0.2.4.dev2.dist-info}/entry_points.txt +0 -0
  58. {napistu-0.1.0.dist-info → napistu-0.2.4.dev2.dist-info}/licenses/LICENSE +0 -0
  59. {napistu-0.1.0.dist-info → napistu-0.2.4.dev2.dist-info}/top_level.txt +0 -0
napistu/constants.py CHANGED
@@ -7,6 +7,20 @@ import libsbml
  from types import SimpleNamespace
  import pandas as pd
 
+
+ PACKAGE_DEFS = SimpleNamespace(
+ NAPISTU="napistu",
+ GITHUB_OWNER="napistu",
+ GITHUB_PROJECT_REPO="napistu",
+ GITHUB_NAPISTU_PY="napistu-py",
+ GITHUB_NAPISTU_R="napistu-r",
+ TUTORIALS_URL="https://github.com/napistu/napistu/wiki",
+ # User-facing functionality should use a user-defined directory but
+ # for convenience, we provide a default cache directory for dev-facing
+ # workflows
+ CACHE_DIR="napistu_data",
+ )
+
  PROTEINATLAS_SUBCELL_LOC_URL = (
  "https://www.proteinatlas.org/download/tsv/subcellular_location.tsv.zip"
  )
@@ -332,6 +346,14 @@ CPR_EDGELIST_REQ_VARS = {
 
  CPR_PATH_REQ_VARS = {CPR_EDGELIST.SC_ID_ORIGIN, CPR_EDGELIST.SC_ID_DEST}
 
+ FEATURE_ID_VAR_DEFAULT = "feature_id"
+
+ RESOLVE_MATCHES_AGGREGATORS = SimpleNamespace(
+ WEIGHTED_MEAN="weighted_mean", MEAN="mean", FIRST="first", MAX="max"
+ )
+
+ RESOLVE_MATCHES_TMP_WEIGHT_COL = "__tmp_weight_for_aggregation__"
+
  # specifying weighting schemes schema
 
  DEFAULT_WT_TRANS = "identity"
@@ -389,6 +411,8 @@ ONTOLOGIES = SimpleNamespace(
  UNIPROT="uniprot",
  )
 
+ ONTOLOGIES_LIST = list(ONTOLOGIES.__dict__.values())
+
  CHARACTERISTIC_COMPLEX_ONTOLOGIES = [
  ONTOLOGIES.ENSEMBL_GENE,
  ONTOLOGIES.NCBI_ENTREZ_GENE,
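PACKAGE_DEFS gathers package-level names (GitHub org and repos, tutorials URL) and a default dev-facing cache directory into a single SimpleNamespace, and ONTOLOGIES_LIST exposes the ontology names as a plain list. A minimal usage sketch against these new constants (illustrative only, assuming they are imported directly from napistu.constants as added above):

    from napistu.constants import ONTOLOGIES, ONTOLOGIES_LIST, PACKAGE_DEFS

    # default cache directory for dev-facing workflows ("napistu_data")
    cache_dir = PACKAGE_DEFS.CACHE_DIR

    # ONTOLOGIES_LIST holds the values of the ONTOLOGIES namespace, so it can be
    # used for simple membership checks
    assert ONTOLOGIES.UNIPROT in ONTOLOGIES_LIST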
napistu/gcs/constants.py CHANGED
@@ -31,7 +31,7 @@ GCS_ASSETS = SimpleNamespace(
  GCS_SUBASSET_NAMES.REGULATORY_GRAPH: GCS_FILETYPES.REGULATORY_GRAPH,
  GCS_SUBASSET_NAMES.REGULATORY_DISTANCES: GCS_FILETYPES.REGULATORY_DISTANCES,
  },
- "public_url": "https://storage.googleapis.com/calico-cpr-public/test_pathway.tar.gz",
+ "public_url": "https://storage.googleapis.com/shackett-napistu-public/test_pathway.tar.gz",
  },
  "human_consensus": {
  "file": "human_consensus.tar.gz",
@@ -40,7 +40,7 @@ GCS_ASSETS = SimpleNamespace(
  GCS_SUBASSET_NAMES.IDENTIFIERS: GCS_FILETYPES.IDENTIFIERS,
  GCS_SUBASSET_NAMES.REGULATORY_GRAPH: GCS_FILETYPES.REGULATORY_GRAPH,
  },
- "public_url": "https://storage.googleapis.com/calico-cpr-public/human_consensus.tar.gz",
+ "public_url": "https://storage.googleapis.com/shackett-napistu-public/human_consensus.tar.gz",
  },
  "human_consensus_w_distances": {
  "file": "human_consensus_w_distances.tar.gz",
napistu/gcs/downloads.py CHANGED
@@ -2,26 +2,29 @@ from __future__ import annotations
 
  import logging
  import os
- import pathlib
  import re
+ import shutil
  from pydantic import BaseModel
  from typing import Optional
 
  from napistu import utils
  from napistu.gcs.constants import GCS_ASSETS
+
  from napistu.gcs.constants import INIT_DATA_DIR_MSG
+ from napistu.gcs.utils import _initialize_data_dir
 
  logger = logging.getLogger(__name__)
 
 
- def load_public_cpr_asset(
+ def load_public_napistu_asset(
  asset: str,
  data_dir: str,
  subasset: str | None = None,
  init_msg: str = INIT_DATA_DIR_MSG,
+ overwrite: bool = False,
  ) -> str:
  """
- Load Public CPR Asset
+ Load Public Napistu Asset
 
  Download the `asset` asset to `data_dir` if it doesn't
  already exist and return a path
@@ -30,6 +33,7 @@ def load_public_cpr_asset(
  subasset: the name of a subasset to load from within the asset bundle
  data_dir: the local directory where assets should be stored
  init_msg: message to display if data_dir does not exist
+ overwrite: if True, always download the asset and re-extract it, even if it already exists
 
  returns:
  asset_path: the path to a local file
@@ -42,14 +46,16 @@ def load_public_cpr_asset(
 
  # get the path for the asset (which may have been downloaded in a tar-ball)
  asset_path = os.path.join(data_dir, _get_gcs_asset_path(asset, subasset))
- if os.path.isfile(asset_path):
+ if os.path.isfile(asset_path) and not overwrite:
  return asset_path
 
  download_path = os.path.join(
  data_dir, os.path.basename(GCS_ASSETS.ASSETS[asset]["file"])
  )
+ if overwrite:
+ _remove_asset_files_if_needed(asset, data_dir)
  if not os.path.isfile(download_path):
- download_public_cpr_asset(asset, download_path)
+ download_public_napistu_asset(asset, download_path)
 
  # gunzip if needed
  extn = utils.get_extn_from_url(download_path)
@@ -70,12 +76,12 @@ def load_public_cpr_asset(
  return asset_path
 
 
- def download_public_cpr_asset(asset: str, out_path: str) -> None:
+ def download_public_napistu_asset(asset: str, out_path: str) -> None:
  """
- Download Public CPR Asset
+ Download Public Napistu Asset
 
  Args:
- asset (str): The name of a CPR public asset stored in Google Cloud Storage (GCS)
+ asset (str): The name of a Napistu public asset stored in Google Cloud Storage (GCS)
  out_path (list): Local location where the file should be saved.
 
  Returns:
@@ -86,22 +92,12 @@ def download_public_cpr_asset(asset: str, out_path: str) -> None:
  selected_file = GCS_ASSETS.ASSETS[asset]["public_url"]
 
  logger.info(f"Downloading {os.path.basename(selected_file)} to {out_path}")
+ logger.info(f"Download URI: {selected_file}")
 
  utils.download_wget(selected_file, out_path)
 
- return None
-
-
- def _initialize_data_dir(data_dir: str, init_msg: str = INIT_DATA_DIR_MSG) -> None:
- """Create a data directory if it doesn't exist."""
-
- if not os.path.isdir(data_dir):
-
- logger.warning(INIT_DATA_DIR_MSG.format(data_dir=data_dir))
-
- # Artifact directory not found; creating {parentdir}")
- logger.warning(f"Trying to create {data_dir}")
- pathlib.Path(data_dir).mkdir(parents=True, exist_ok=True)
+ if not os.path.isfile(out_path):
+ raise FileNotFoundError(f"Download failed: {out_path} was not created.")
 
  return None
 
@@ -109,7 +105,7 @@ def _initialize_data_dir(data_dir: str, init_msg: str = INIT_DATA_DIR_MSG) -> No
  def _validate_gcs_asset(asset: str) -> None:
  """Validate a GCS asset by name."""
 
- assets = _CprAssetsValidator(assets=GCS_ASSETS.ASSETS).assets
+ assets = _NapistuAssetsValidator(assets=GCS_ASSETS.ASSETS).assets
  valid_gcs_assets = assets.keys()
  if asset not in valid_gcs_assets:
  raise ValueError(
@@ -170,11 +166,47 @@ def _get_gcs_asset_path(asset: str, subasset: Optional[str] = None) -> str:
  return out_file
 
 
- class _CprAssetValidator(BaseModel):
+ class _NapistuAssetValidator(BaseModel):
  file: str
  subassets: dict[str, str] | None
  public_url: str
 
 
- class _CprAssetsValidator(BaseModel):
- assets: dict[str, _CprAssetValidator]
+ class _NapistuAssetsValidator(BaseModel):
+ assets: dict[str, _NapistuAssetValidator]
+
+
+ def _remove_asset_files_if_needed(asset: str, data_dir: str):
+ """
+ Remove asset archive and any extracted directory from data_dir.
+
+ Args:
+ asset (str): The asset key (e.g., 'test_pathway').
+ data_dir (str): The directory where assets are stored.
+ """
+ logger = logging.getLogger(__name__)
+ removed = []
+
+ # Remove the archive file (any extension)
+ archive_filename = os.path.basename(GCS_ASSETS.ASSETS[asset]["file"])
+ archive_path = os.path.join(data_dir, archive_filename)
+ if os.path.exists(archive_path):
+ os.remove(archive_path)
+ logger.info(f"Removed asset archive: {archive_path}")
+ removed.append(archive_path)
+
+ # Remove extracted directory (if any)
+ asset_dict = GCS_ASSETS.ASSETS[asset]
+ if asset_dict.get("subassets") is not None or any(
+ archive_filename.endswith(ext) for ext in [".tar.gz", ".tgz", ".zip", ".gz"]
+ ):
+ extract_dir = os.path.join(data_dir, archive_filename.split(".")[0])
+ if os.path.isdir(extract_dir):
+ shutil.rmtree(extract_dir)
+ logger.info(f"Removed extracted asset directory: {extract_dir}")
+ removed.append(extract_dir)
+
+ if not removed:
+ logger.debug("No asset files found to remove.")
+
+ return removed
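With the rename from the CPR-era names to the Napistu names and the new overwrite flag, a cached asset can now be forcibly re-downloaded and re-extracted. A hedged usage sketch (the "test_pathway" asset key comes from GCS_ASSETS in napistu/gcs/constants.py, and the data_dir value simply reuses the dev cache directory defined by PACKAGE_DEFS):

    from napistu.gcs.downloads import load_public_napistu_asset

    # overwrite=True removes the previously downloaded archive and any extracted
    # directory via _remove_asset_files_if_needed before downloading again
    asset_path = load_public_napistu_asset(
        asset="test_pathway",
        data_dir="napistu_data",
        overwrite=True,
    )
    print(asset_path)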
napistu/gcs/utils.py ADDED
@@ -0,0 +1,21 @@
+ import logging
+ import os
+ import pathlib
+
+ from napistu.gcs.constants import INIT_DATA_DIR_MSG
+
+ logger = logging.getLogger(__name__)
+
+
+ def _initialize_data_dir(data_dir: str, init_msg: str = INIT_DATA_DIR_MSG) -> None:
+ """Create a data directory if it doesn't exist."""
+
+ if not os.path.isdir(data_dir):
+
+ logger.warning(init_msg.format(data_dir=data_dir))
+
+ # Artifact directory not found; creating {parentdir}")
+ logger.warning(f"Trying to create {data_dir}")
+ pathlib.Path(data_dir).mkdir(parents=True, exist_ok=True)
+
+ return None
napistu/identifiers.py CHANGED
@@ -9,15 +9,19 @@ from urllib.parse import urlparse
 
  import libsbml
  import pandas as pd
- from napistu import utils
  from pydantic import BaseModel
 
+ from napistu import sbml_dfs_core
+ from napistu import sbml_dfs_utils
+ from napistu import utils
+
  from napistu.constants import IDENTIFIERS
  from napistu.constants import BIOLOGICAL_QUALIFIER_CODES
  from napistu.constants import ENSEMBL_MOLECULE_TYPES_TO_ONTOLOGY
  from napistu.constants import ENSEMBL_MOLECULE_TYPES_FROM_ONTOLOGY
  from napistu.constants import ENSEMBL_SPECIES_FROM_CODE
  from napistu.constants import ENSEMBL_SPECIES_TO_CODE
+ from napistu.constants import SPECIES_IDENTIFIERS_REQUIRED_VARS
 
  logger = logging.getLogger(__name__)
 
@@ -225,24 +229,37 @@ def format_uri_url(uri: str) -> dict:
  elif netloc == "www.ensembl.org" and split_path[-1] == "geneview":
  ontology = "ensembl_gene"
  identifier, id_ontology, _ = parse_ensembl_id(result.query) # type: ignore
- assert ontology == id_ontology
+ if ontology != id_ontology:
+ raise ValueError(
+ f"Ontology mismatch: expected {ontology}, got {id_ontology}"
+ )
  elif netloc == "www.ensembl.org" and split_path[-1] in [
  "transview",
  "Transcript",
  ]:
  ontology = "ensembl_transcript"
  identifier, id_ontology, _ = parse_ensembl_id(result.query) # type: ignore
- assert ontology == id_ontology
+ if ontology != id_ontology:
+ raise ValueError(
+ f"Ontology mismatch: expected {ontology}, got {id_ontology}"
+ )
  elif netloc == "www.ensembl.org" and split_path[-1] == "ProteinSummary":
  ontology = "ensembl_protein"
  identifier, id_ontology, _ = parse_ensembl_id(result.query) # type: ignore
- assert ontology == id_ontology
+ if ontology != id_ontology:
+ raise ValueError(
+ f"Ontology mismatch: expected {ontology}, got {id_ontology}"
+ )
  elif netloc == "www.ensembl.org" and (
  re.search("ENS[GTP]", split_path[-1])
  or re.search("ENS[A-Z]{3}[GTP]", split_path[-1])
  ):
  # format ensembl IDs which lack gene/transview
- identifier, ontology, _ = parse_ensembl_id(split_path[-1])
+ identifier, implied_ontology, _ = parse_ensembl_id(split_path[-1])
+ if implied_ontology != ontology:
+ raise ValueError(
+ f"Implied ontology mismatch: expected {ontology}, got {implied_ontology}"
+ )
  elif netloc == "www.mirbase.org" or netloc == "mirbase.org":
  ontology = "mirbase"
  if re.search("MI[0-9]+", split_path[-1]):
@@ -676,7 +693,10 @@ def ensembl_id_to_url_regex(identifier: str, ontology: str) -> tuple[str, str]:
  # extract the species name from the 3 letter species code in the id
  # (these letters are not present for humans)
  identifier, implied_ontology, species = parse_ensembl_id(identifier) # type: ignore
- assert implied_ontology == ontology
+ if implied_ontology != ontology:
+ raise ValueError(
+ f"Implied ontology mismatch: expected {ontology}, got {implied_ontology}"
+ )
 
  # create an appropriate regex for validating input
  # this provides testing for other identifiers even if it is redundant with other
@@ -794,6 +814,85 @@ def _format_Identifiers_pubmed(pubmed_id: str) -> Identifiers:
  return Identifiers([id_entry])
 
 
+ def _check_species_identifiers_table(
+ species_identifiers: pd.DataFrame,
+ required_vars: set = SPECIES_IDENTIFIERS_REQUIRED_VARS,
+ ):
+ missing_required_vars = required_vars.difference(
+ set(species_identifiers.columns.tolist())
+ )
+ if len(missing_required_vars) > 0:
+ raise ValueError(
+ f"{len(missing_required_vars)} required variables "
+ "were missing from the species_identifiers table: "
+ f"{', '.join(missing_required_vars)}"
+ )
+
+ return None
+
+
+ def _prepare_species_identifiers(
+ sbml_dfs: sbml_dfs_core.SBML_dfs,
+ dogmatic: bool = False,
+ species_identifiers: Optional[pd.DataFrame] = None,
+ ) -> pd.DataFrame:
+ """Accepts and validates species_identifiers, or extracts a fresh table if None."""
+
+ if species_identifiers is None:
+ species_identifiers = sbml_dfs_utils.get_characteristic_species_ids(
+ sbml_dfs, dogmatic=dogmatic
+ )
+ else:
+ # check for compatibility
+ try:
+ # check species_identifiers format
+
+ _check_species_identifiers_table(species_identifiers)
+ # quick check for compatibility between sbml_dfs and species_identifiers
+ _validate_assets_sbml_ids(sbml_dfs, species_identifiers)
+ except ValueError as e:
+ logger.warning(
+ f"The provided identifiers are not compatible with your `sbml_dfs` object. Extracting a fresh species identifier table. {e}"
+ )
+ species_identifiers = sbml_dfs_utils.get_characteristic_species_ids(
+ sbml_dfs, dogmatic=dogmatic
+ )
+
+ return species_identifiers
+
+
+ def _validate_assets_sbml_ids(
+ sbml_dfs: sbml_dfs_core.SBML_dfs, identifiers_df: pd.DataFrame
+ ) -> None:
+ """Check an sbml_dfs file and identifiers table for inconsistencies."""
+
+ joined_species_w_ids = sbml_dfs.species.merge(
+ identifiers_df[["s_id", "s_name"]].drop_duplicates(),
+ left_index=True,
+ right_on="s_id",
+ )
+
+ inconsistent_names_df = joined_species_w_ids.query("s_name_x != s_name_y").dropna()
+ inconsistent_names_list = [
+ f"{x} != {y}"
+ for x, y in zip(
+ inconsistent_names_df["s_name_x"], inconsistent_names_df["s_name_y"]
+ )
+ ]
+
+ if len(inconsistent_names_list):
+ example_inconsistent_names = inconsistent_names_list[
+ 0 : min(10, len(inconsistent_names_list))
+ ]
+
+ raise ValueError(
+ f"{len(inconsistent_names_list)} species names do not match between "
+ f"sbml_dfs and identifiers_df including: {', '.join(example_inconsistent_names)}"
+ )
+
+ return None
+
+
  class _IdentifierValidator(BaseModel):
  ontology: str
  identifier: str
napistu/ingestion/constants.py CHANGED
@@ -196,7 +196,6 @@ PSI_MI_INTACT_SPECIES_TO_BASENAME = {
 
 
  # REACTOME
- REACTOME_SBGN_URL = "https://reactome.org/download/current/homo_sapiens.sbgn.tar.gz"
  REACTOME_SMBL_URL = "https://reactome.org/download/current/all_species.3.1.sbml.tgz"
  REACTOME_PATHWAYS_URL = "https://reactome.org/download/current/ReactomePathways.txt"
  REACTOME_PATHWAY_INDEX_COLUMNS = ["file", "source", "species", "pathway_id", "name"]
napistu/ingestion/obo.py CHANGED
@@ -34,8 +34,14 @@ def create_go_parents_df(go_basic_obo_df: pd.DataFrame) -> pd.DataFrame:
  cc_parents = go_basic_obo_df.query("namespace == 'cellular_component'")["is_a"]
 
  # this is currently at 4496 rows - this is expected to slowly increase
- assert cc_parents.shape[0] >= 4496
- assert cc_parents.shape[0] < 5000
+ if cc_parents.shape[0] < 4496:
+ raise ValueError(
+ f"Expected at least 4496 rows in cc_parents, got {cc_parents.shape[0]}"
+ )
+ if cc_parents.shape[0] >= 5000:
+ raise ValueError(
+ f"Expected fewer than 5000 rows in cc_parents, got {cc_parents.shape[0]}"
+ )
 
  # convert from a list of strings to a list of dicts then expand so each
  # dict is its own row
@@ -48,8 +54,14 @@ def create_go_parents_df(go_basic_obo_df: pd.DataFrame) -> pd.DataFrame:
  go_parents_df["child_id"] = parent_entries.index
 
  # currently at 4688 rows - this may increase or decrease but will do so slowly
- assert go_parents_df.shape[0] > 4600
- assert go_parents_df.shape[0] < 5000
+ if go_parents_df.shape[0] <= 4600:
+ raise ValueError(
+ f"Expected more than 4600 rows in go_parents_df, got {go_parents_df.shape[0]}"
+ )
+ if go_parents_df.shape[0] >= 5000:
+ raise ValueError(
+ f"Expected fewer than 5000 rows in go_parents_df, got {go_parents_df.shape[0]}"
+ )
 
  return go_parents_df
 
@@ -187,8 +199,10 @@ def create_parent_child_graph(go_parents_df: pd.DataFrame) -> ig.Graph:
  )
 
  # is it a fully connected DAG as expected?
- assert parent_child_graph.is_dag()
- assert parent_child_graph.is_connected("weak")
+ if not parent_child_graph.is_dag():
+ raise ValueError("parent_child_graph is not a DAG as expected")
+ if not parent_child_graph.is_connected("weak"):
+ raise ValueError("parent_child_graph is not weakly connected as expected")
 
  return parent_child_graph
 
@@ -243,8 +257,10 @@ def _isa_str_list_to_dict_list(isa_list: list) -> list[dict[str, Any]]:
 
  isa_dict_list = list()
  for split_val in split_vals:
- assert len(split_val) == 2
-
+ if len(split_val) != 2:
+ raise ValueError(
+ f"Expected tuple of length 2, got {len(split_val)}: {split_val}"
+ )
  isa_dict_list.append({"parent_id": split_val[0], "parent_name": split_val[1]})
 
  return isa_dict_list
napistu/ingestion/psi_mi.py CHANGED
@@ -44,7 +44,10 @@ def format_psi(
 
  # the root should be an entrySet if this is a PSI 3.0 file
  entry_set = et.getroot()
- assert entry_set.tag == PSI_MI_INTACT_XML_NAMESPACE + "entrySet"
+ if entry_set.tag != PSI_MI_INTACT_XML_NAMESPACE + "entrySet":
+ raise ValueError(
+ f"Expected root tag to be {PSI_MI_INTACT_XML_NAMESPACE + 'entrySet'}, got {entry_set.tag}"
+ )
 
  entry_nodes = entry_set.findall(f"./{PSI_MI_INTACT_XML_NAMESPACE}entry")
 
@@ -97,7 +100,10 @@ def _download_intact_species(
  def _format_entry(an_entry) -> dict[str, Any]:
  """Extract a single XML entry of interactors and interactions."""
 
- assert an_entry.tag == PSI_MI_INTACT_XML_NAMESPACE + "entry"
+ if an_entry.tag != PSI_MI_INTACT_XML_NAMESPACE + "entry":
+ raise ValueError(
+ f"Expected entry tag to be {PSI_MI_INTACT_XML_NAMESPACE + 'entry'}, got {an_entry.tag}"
+ )
 
  entry_dict = {
  "source": _format_entry_source(an_entry),
@@ -169,7 +175,10 @@ def _format_entry_interactor_list(an_entry) -> list[dict[str, Any]]:
  def _format_entry_interactor(interactor) -> dict[str, Any]:
  """Format a single molecular interactor in an interaction list XML node."""
 
- assert interactor.tag == PSI_MI_INTACT_XML_NAMESPACE + "interactor"
+ if interactor.tag != PSI_MI_INTACT_XML_NAMESPACE + "interactor":
+ raise ValueError(
+ f"Expected interactor tag to be {PSI_MI_INTACT_XML_NAMESPACE + 'interactor'}, got {interactor.tag}"
+ )
 
  # optional full name
  interactor_name_node = interactor.find(
@@ -238,7 +247,10 @@ def _format_entry_interactions(an_entry) -> list[dict[str, Any]]:
  def _format_entry_interaction(interaction) -> dict[str, Any]:
  """Format a single interaction in an XML interaction list."""
 
- assert interaction.tag == PSI_MI_INTACT_XML_NAMESPACE + "interaction"
+ if interaction.tag != PSI_MI_INTACT_XML_NAMESPACE + "interaction":
+ raise ValueError(
+ f"Expected interaction tag to be {PSI_MI_INTACT_XML_NAMESPACE + 'interaction'}, got {interaction.tag}"
+ )
 
  interaction_name = interaction.find(
  f"./{PSI_MI_INTACT_XML_NAMESPACE}names/{PSI_MI_INTACT_XML_NAMESPACE}shortLabel"
@@ -260,7 +272,10 @@ def _format_entry_interaction(interaction) -> dict[str, Any]:
  def _format_entry_interaction_participants(interaction_participant) -> dict[str, str]:
  """Format the participants in an XML interaction."""
 
- assert interaction_participant.tag == PSI_MI_INTACT_XML_NAMESPACE + "participant"
+ if interaction_participant.tag != PSI_MI_INTACT_XML_NAMESPACE + "participant":
+ raise ValueError(
+ f"Expected participant tag to be {PSI_MI_INTACT_XML_NAMESPACE + 'participant'}, got {interaction_participant.tag}"
+ )
 
  out = {
  "interactor_id": interaction_participant.attrib["id"],
napistu/ingestion/reactome.py CHANGED
@@ -17,42 +17,12 @@ from napistu.consensus import construct_sbml_dfs_dict
  from napistu.ingestion.constants import REACTOME_PATHWAY_INDEX_COLUMNS
  from napistu.ingestion.constants import REACTOME_PATHWAY_LIST_COLUMNS
  from napistu.ingestion.constants import REACTOME_PATHWAYS_URL
- from napistu.ingestion.constants import REACTOME_SBGN_URL
  from napistu.ingestion.constants import REACTOME_SMBL_URL
- from napistu.ingestion.constants import SPECIES_FULL_NAME_HUMAN
  from fs import open_fs
 
  logger = logging.getLogger(__name__)
 
 
- def reactome_sbgn_download(output_dir_path: str, overwrite: bool = False):
- """
- Reactome SBGN Download
-
- Download all human Reactome SBGN (systems biology graphical notation) files.
-
- Args:
- output_dir_path (str): Paths to a directory where .sbgn files should be saved.
- overwrite (bool): Overwrite an existing output directory.
- """
- utils.download_and_extract(
- REACTOME_SBGN_URL,
- output_dir_path=output_dir_path,
- overwrite=overwrite,
- )
- # create the pathway index
- pw_index = _build_reactome_pw_index(
- output_dir_path,
- file_ext="sbgn",
- # For sbgn only homo sapiens files are available
- species_filter=(SPECIES_FULL_NAME_HUMAN,),
- )
- # save as tsv
- out_fs = open_fs(output_dir_path)
- with out_fs.open("pw_index.tsv", "wb") as index_path:
- pw_index.to_csv(index_path, sep="\t", index=False)
-
-
  def reactome_sbml_download(output_dir_path: str, overwrite: bool = False):
  """
  Reactome SBML Download
@@ -164,8 +134,14 @@ def _check_reactome_pw_index(pw_index: indices.PWIndex, reactome_pathway_list: l
 
  # check extension in pw_index
  extn = set([os.path.splitext(x)[1] for x in pw_index["file"]])
- assert len(extn) == 1
- assert len(extn.intersection(set([".sbgn", ".sbml"]))) == 1
+ if len(extn) != 1:
+ raise ValueError(
+ f"Expected all files to have the same extension, but found extensions: {extn}"
+ )
+ if len(extn.intersection({".sbml"})) != 1:
+ raise ValueError(
+ f"Expected all files to have the .sbml extension, but found: {extn}"
+ )
  extn_string = extn.pop()
 
  local_reactome_pws = set(pw_index["pathway_id"])
napistu/mcp/__init__.py ADDED
@@ -0,0 +1,69 @@
+ """
+ MCP (Model Context Protocol) Server for Napistu.
+
+ This module requires optional dependencies. Install with:
+ pip install napistu[mcp]
+ """
+
+ import asyncio
+ from typing import Dict, Any
+
+ __all__ = ["start_server", "register_object", "is_available"]
+
+ # Check if MCP dependencies are available
+ try:
+ __import__("mcp")
+ is_available = True
+ except ImportError:
+ is_available = False
+
+ if is_available:
+ from .server import create_server
+ from .profiles import get_profile
+
+ def start_server(profile_name: str = "local", **kwargs) -> Dict[str, Any]:
+ """
+ Start an MCP server with a specific profile.
+
+ Args:
+ profile_name: Name of the profile ('local', 'remote', or 'full')
+ **kwargs: Additional configuration options
+
+ Returns:
+ Server control dictionary
+ """
+ profile = get_profile(profile_name, **kwargs)
+ server = create_server(profile)
+
+ # Start the server
+ asyncio.create_task(server.start())
+
+ # Return control interface
+ return {
+ "status": "running",
+ "server": server,
+ "profile": profile_name,
+ "stop": server.stop,
+ "register_object": (
+ register_object if profile.get_config()["enable_execution"] else None
+ ),
+ }
+
+ # Helper function for registering objects with a running server
+ def register_object(name, obj):
+ """Register an object with the execution component."""
+ from .execution import register_object as _register
+
+ return _register(name, obj)
+
+ else:
+ # Stubs for when MCP is not available
+ def start_server(*args, **kwargs):
+ raise ImportError(
+ "MCP support not installed. Install with 'pip install napistu[mcp]'"
+ )
+
+ def register_object(*args, **kwargs):
+ raise ImportError(
+ "MCP support not installed. Install with 'pip install napistu[mcp]'"
+ )
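Based on the new module above, starting the server from Python might look like the sketch below. Note that start_server schedules server.start() with asyncio.create_task, so it has to be called from inside a running event loop; the "local" profile name is the default shown in the docstring, and the actual runtime behavior depends on napistu.mcp.profiles and napistu.mcp.server, which are added elsewhere in this release:

    import asyncio

    import napistu.mcp as mcp


    async def main():
        if not mcp.is_available:
            raise SystemExit("Install MCP support with: pip install napistu[mcp]")

        # returns a control dictionary with the server, profile name, and stop handle
        control = mcp.start_server(profile_name="local")
        print(control["status"], control["profile"])

        # keep the event loop alive briefly; a real caller would await other work here
        await asyncio.sleep(1)


    asyncio.run(main())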