napistu 0.2.5.dev7__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/__main__.py +126 -96
- napistu/constants.py +35 -41
- napistu/context/__init__.py +10 -0
- napistu/context/discretize.py +462 -0
- napistu/context/filtering.py +387 -0
- napistu/gcs/__init__.py +1 -1
- napistu/identifiers.py +74 -15
- napistu/indices.py +68 -0
- napistu/ingestion/__init__.py +1 -1
- napistu/ingestion/bigg.py +47 -62
- napistu/ingestion/constants.py +18 -133
- napistu/ingestion/gtex.py +113 -0
- napistu/ingestion/hpa.py +147 -0
- napistu/ingestion/sbml.py +0 -97
- napistu/ingestion/string.py +2 -2
- napistu/matching/__init__.py +10 -0
- napistu/matching/constants.py +18 -0
- napistu/matching/interactions.py +518 -0
- napistu/matching/mount.py +529 -0
- napistu/matching/species.py +510 -0
- napistu/mcp/__init__.py +7 -4
- napistu/mcp/__main__.py +128 -72
- napistu/mcp/client.py +16 -25
- napistu/mcp/codebase.py +201 -145
- napistu/mcp/component_base.py +170 -0
- napistu/mcp/config.py +223 -0
- napistu/mcp/constants.py +45 -2
- napistu/mcp/documentation.py +253 -136
- napistu/mcp/documentation_utils.py +13 -48
- napistu/mcp/execution.py +372 -305
- napistu/mcp/health.py +47 -65
- napistu/mcp/profiles.py +10 -6
- napistu/mcp/server.py +161 -80
- napistu/mcp/tutorials.py +139 -87
- napistu/modify/__init__.py +1 -1
- napistu/modify/gaps.py +1 -1
- napistu/network/__init__.py +1 -1
- napistu/network/constants.py +101 -34
- napistu/network/data_handling.py +388 -0
- napistu/network/ig_utils.py +351 -0
- napistu/network/napistu_graph_core.py +354 -0
- napistu/network/neighborhoods.py +40 -40
- napistu/network/net_create.py +373 -309
- napistu/network/net_propagation.py +47 -19
- napistu/network/{net_utils.py → ng_utils.py} +124 -272
- napistu/network/paths.py +67 -51
- napistu/network/precompute.py +11 -11
- napistu/ontologies/__init__.py +10 -0
- napistu/ontologies/constants.py +129 -0
- napistu/ontologies/dogma.py +243 -0
- napistu/ontologies/genodexito.py +649 -0
- napistu/ontologies/mygene.py +369 -0
- napistu/ontologies/renaming.py +198 -0
- napistu/rpy2/__init__.py +229 -86
- napistu/rpy2/callr.py +47 -77
- napistu/rpy2/constants.py +24 -23
- napistu/rpy2/rids.py +61 -648
- napistu/sbml_dfs_core.py +587 -222
- napistu/scverse/__init__.py +15 -0
- napistu/scverse/constants.py +28 -0
- napistu/scverse/loading.py +727 -0
- napistu/utils.py +118 -10
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/METADATA +8 -3
- napistu-0.3.1.dist-info/RECORD +133 -0
- tests/conftest.py +22 -0
- tests/test_context_discretize.py +56 -0
- tests/test_context_filtering.py +267 -0
- tests/test_identifiers.py +100 -0
- tests/test_indices.py +65 -0
- tests/{test_edgelist.py → test_ingestion_napistu_edgelist.py} +2 -2
- tests/test_matching_interactions.py +108 -0
- tests/test_matching_mount.py +305 -0
- tests/test_matching_species.py +394 -0
- tests/test_mcp_config.py +193 -0
- tests/test_mcp_documentation_utils.py +12 -3
- tests/test_mcp_server.py +156 -19
- tests/test_network_data_handling.py +397 -0
- tests/test_network_ig_utils.py +23 -0
- tests/test_network_neighborhoods.py +19 -0
- tests/test_network_net_create.py +459 -0
- tests/test_network_ng_utils.py +30 -0
- tests/test_network_paths.py +56 -0
- tests/{test_precomputed_distances.py → test_network_precompute.py} +8 -6
- tests/test_ontologies_genodexito.py +58 -0
- tests/test_ontologies_mygene.py +39 -0
- tests/test_ontologies_renaming.py +110 -0
- tests/test_rpy2_callr.py +79 -0
- tests/test_rpy2_init.py +151 -0
- tests/test_sbml.py +0 -31
- tests/test_sbml_dfs_core.py +134 -10
- tests/test_scverse_loading.py +778 -0
- tests/test_set_coverage.py +2 -2
- tests/test_utils.py +121 -1
- napistu/mechanism_matching.py +0 -1353
- napistu/rpy2/netcontextr.py +0 -467
- napistu-0.2.5.dev7.dist-info/RECORD +0 -98
- tests/test_igraph.py +0 -367
- tests/test_mechanism_matching.py +0 -784
- tests/test_net_utils.py +0 -149
- tests/test_netcontextr.py +0 -105
- tests/test_rpy2.py +0 -61
- /napistu/ingestion/{cpr_edgelist.py → napistu_edgelist.py} +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/WHEEL +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/entry_points.txt +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/licenses/LICENSE +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/top_level.txt +0 -0
- /tests/{test_obo.py → test_ingestion_obo.py} +0 -0
napistu/ingestion/bigg.py
CHANGED
@@ -1,21 +1,16 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
import datetime
|
4
3
|
import logging
|
5
4
|
import os
|
6
5
|
from typing import Iterable
|
7
6
|
|
8
|
-
import pandas as pd
|
9
7
|
from napistu import indices
|
10
8
|
from napistu import sbml_dfs_core
|
11
9
|
from napistu import utils
|
12
10
|
from napistu.consensus import construct_sbml_dfs_dict
|
13
|
-
from napistu.
|
14
|
-
from napistu.ingestion.constants import BIGG_MODEL_FIELD_SPECIES
|
15
|
-
from napistu.ingestion.constants import BIGG_MODEL_FIELD_URL
|
11
|
+
from napistu.ontologies.renaming import rename_species_ontologies
|
16
12
|
from napistu.ingestion.constants import BIGG_MODEL_KEYS
|
17
13
|
from napistu.ingestion.constants import BIGG_MODEL_URLS
|
18
|
-
from napistu.ingestion.constants import BIGG_RECON3D_FIELD_ANNOTATION
|
19
14
|
from napistu.ingestion.constants import SPECIES_FULL_NAME_HUMAN
|
20
15
|
from napistu.ingestion.constants import SPECIES_FULL_NAME_MOUSE
|
21
16
|
from napistu.ingestion.constants import SPECIES_FULL_NAME_YEAST
|
@@ -40,33 +35,17 @@ def bigg_sbml_download(bg_pathway_root: str, overwrite: bool = False) -> None:
|
|
40
35
|
"""
|
41
36
|
utils.initialize_dir(bg_pathway_root, overwrite)
|
42
37
|
|
43
|
-
|
44
|
-
BIGG_MODEL_KEYS
|
45
|
-
|
46
|
-
|
38
|
+
bigg_models_df = indices.create_pathway_index_df(
|
39
|
+
model_keys=BIGG_MODEL_KEYS,
|
40
|
+
model_urls=BIGG_MODEL_URLS,
|
41
|
+
model_species={
|
42
|
+
SPECIES_FULL_NAME_HUMAN: SPECIES_FULL_NAME_HUMAN,
|
43
|
+
SPECIES_FULL_NAME_MOUSE: SPECIES_FULL_NAME_MOUSE,
|
44
|
+
SPECIES_FULL_NAME_YEAST: SPECIES_FULL_NAME_YEAST,
|
47
45
|
},
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
},
|
52
|
-
BIGG_MODEL_KEYS[SPECIES_FULL_NAME_YEAST]: {
|
53
|
-
BIGG_MODEL_FIELD_URL: BIGG_MODEL_URLS[SPECIES_FULL_NAME_YEAST],
|
54
|
-
BIGG_MODEL_FIELD_SPECIES: SPECIES_FULL_NAME_YEAST,
|
55
|
-
},
|
56
|
-
}
|
57
|
-
bigg_models_df = pd.DataFrame(bigg_models).T
|
58
|
-
bigg_models_df["sbml_path"] = [
|
59
|
-
os.path.join(bg_pathway_root, k) + ".sbml"
|
60
|
-
for k in bigg_models_df.index.tolist()
|
61
|
-
]
|
62
|
-
bigg_models_df["file"] = [os.path.basename(x) for x in bigg_models_df["sbml_path"]]
|
63
|
-
|
64
|
-
# add other attributes which will be used in the pw_index
|
65
|
-
bigg_models_df["date"] = datetime.date.today().strftime("%Y%m%d")
|
66
|
-
bigg_models_df.index = bigg_models_df.index.rename("pathway_id")
|
67
|
-
bigg_models_df = bigg_models_df.reset_index()
|
68
|
-
bigg_models_df["name"] = bigg_models_df["pathway_id"]
|
69
|
-
bigg_models_df = bigg_models_df.assign(source="BiGG")
|
46
|
+
base_path=bg_pathway_root,
|
47
|
+
source_name="BiGG",
|
48
|
+
)
|
70
49
|
|
71
50
|
with open_fs(bg_pathway_root, create=True) as bg_fs:
|
72
51
|
for _, row in bigg_models_df.iterrows():
|
@@ -84,41 +63,46 @@ def bigg_sbml_download(bg_pathway_root: str, overwrite: bool = False) -> None:
|
|
84
63
|
return None
|
85
64
|
|
86
65
|
|
87
|
-
def annotate_recon(raw_model_path: str, annotated_model_path: str) -> None:
|
88
|
-
"""Annotate Recon3D
|
89
|
-
Add compartment annotations to Recon3D so it can be merged with other pathways
|
90
|
-
"""
|
91
|
-
logger.warning(
|
92
|
-
"add_sbml_annotations is deprecated and maybe removed in a future version of rcpr; "
|
93
|
-
"we are now adding these annotation during ingestion by sbml.sbml_df_from_sbml() rather "
|
94
|
-
"than directly appending them to the raw .sbml"
|
95
|
-
)
|
96
|
-
recon_3d_annotations = pd.DataFrame(BIGG_RECON3D_FIELD_ANNOTATION)
|
97
|
-
sbml_model = sbml.SBML(raw_model_path)
|
98
|
-
sbml.add_sbml_annotations(
|
99
|
-
sbml_model, recon_3d_annotations, save_path=annotated_model_path
|
100
|
-
)
|
101
|
-
|
102
|
-
return None
|
103
|
-
|
104
|
-
|
105
66
|
def construct_bigg_consensus(
|
106
67
|
pw_index_inp: str | indices.PWIndex,
|
107
68
|
species: str | Iterable[str] | None = None,
|
108
69
|
outdir: str | None = None,
|
109
70
|
) -> sbml_dfs_core.SBML_dfs:
|
110
|
-
"""
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
71
|
+
"""Construct a BiGG SBML DFs pathway representation.
|
72
|
+
|
73
|
+
Parameters
|
74
|
+
----------
|
75
|
+
pw_index_inp : str or indices.PWIndex
|
76
|
+
PWIndex object or URI pointing to PWIndex
|
77
|
+
species : str or Iterable[str] or None, optional
|
78
|
+
One or more species to filter by, by default None (no filtering)
|
79
|
+
outdir : str or None, optional
|
80
|
+
Output directory used to cache results, by default None
|
81
|
+
|
82
|
+
Returns
|
83
|
+
-------
|
84
|
+
sbml_dfs_core.SBML_dfs
|
85
|
+
A consensus SBML representation
|
86
|
+
|
87
|
+
Notes
|
88
|
+
-----
|
89
|
+
Currently this only works for a single model. Integration of multiple
|
90
|
+
models is not yet supported in BiGG.
|
91
|
+
|
92
|
+
The function:
|
93
|
+
1. Loads/validates the pathway index
|
94
|
+
2. Constructs SBML DFs dictionary
|
95
|
+
3. Processes the single model:
|
96
|
+
- Infers compartmentalization for species without location
|
97
|
+
- Names compartmentalized species
|
98
|
+
- Validates the final model
|
99
|
+
|
100
|
+
Raises
|
101
|
+
------
|
102
|
+
ValueError
|
103
|
+
If pw_index_inp is neither a PWIndex nor a string
|
104
|
+
NotImplementedError
|
105
|
+
If attempting to merge multiple models
|
122
106
|
"""
|
123
107
|
if isinstance(pw_index_inp, str):
|
124
108
|
pw_index = indices.adapt_pw_index(pw_index_inp, species=species, outdir=outdir)
|
@@ -142,5 +126,6 @@ def construct_bigg_consensus(
|
|
142
126
|
# fix missing compartimentalization
|
143
127
|
model = sbml_dfs_core.infer_uncompartmentalized_species_location(model)
|
144
128
|
model = sbml_dfs_core.name_compartmentalized_species(model)
|
129
|
+
rename_species_ontologies(model)
|
145
130
|
model.validate()
|
146
131
|
return model
|
napistu/ingestion/constants.py
CHANGED
@@ -3,12 +3,30 @@ from __future__ import annotations
|
|
3
3
|
|
4
4
|
from types import SimpleNamespace
|
5
5
|
|
6
|
+
|
6
7
|
SPECIES_FULL_NAME_HUMAN = "Homo sapiens"
|
7
8
|
SPECIES_FULL_NAME_MOUSE = "Mus musculus"
|
8
9
|
SPECIES_FULL_NAME_YEAST = "Saccharomyces cerevisiae"
|
9
10
|
SPECIES_FULL_NAME_RAT = "Rattus norvegicus"
|
10
11
|
SPECIES_FULL_NAME_WORM = "Caenorhabditis elegans"
|
11
12
|
|
13
|
+
PROTEINATLAS_SUBCELL_LOC_URL = (
|
14
|
+
"https://www.proteinatlas.org/download/tsv/subcellular_location.tsv.zip"
|
15
|
+
)
|
16
|
+
|
17
|
+
PROTEINATLAS_DEFS = SimpleNamespace(
|
18
|
+
GO_ID="GO id",
|
19
|
+
GENE="Gene",
|
20
|
+
)
|
21
|
+
|
22
|
+
|
23
|
+
# GTEx
|
24
|
+
GTEX_RNASEQ_EXPRESSION_URL = "https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz"
|
25
|
+
|
26
|
+
GTEX_DEFS = SimpleNamespace(
|
27
|
+
NAME="Name",
|
28
|
+
DESCRIPTION="Description",
|
29
|
+
)
|
12
30
|
|
13
31
|
# BIGG
|
14
32
|
BIGG_MODEL_URLS = {
|
@@ -29,134 +47,6 @@ BIGG_RECON3D_FIELD_ID = "id"
|
|
29
47
|
BIGG_RECON3D_FIELD_TYPE = "type"
|
30
48
|
BIGG_RECON3D_FIELD_URI = "uri"
|
31
49
|
|
32
|
-
BIGG_RECON3D_ID_C = "c"
|
33
|
-
BIGG_RECON3D_ID_L = "l"
|
34
|
-
BIGG_RECON3D_ID_E = "e"
|
35
|
-
BIGG_RECON3D_ID_M = "m"
|
36
|
-
BIGG_RECON3D_ID_R = "r"
|
37
|
-
BIGG_RECON3D_ID_X = "x"
|
38
|
-
BIGG_RECON3D_ID_N = "n"
|
39
|
-
BIGG_RECON3D_ID_I = "i"
|
40
|
-
|
41
|
-
BIGG_RECON3D_TYPE_COMPARTMENT = "compartment"
|
42
|
-
|
43
|
-
BIGG_RECON3D_FIELD_ANNOTATION = [
|
44
|
-
{
|
45
|
-
# cytosol
|
46
|
-
BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_C,
|
47
|
-
BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
|
48
|
-
BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005829",
|
49
|
-
},
|
50
|
-
{
|
51
|
-
# cytoplasm
|
52
|
-
BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_C,
|
53
|
-
BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
|
54
|
-
BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005737",
|
55
|
-
},
|
56
|
-
{
|
57
|
-
# plasma membrane
|
58
|
-
BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_C,
|
59
|
-
BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
|
60
|
-
BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005886",
|
61
|
-
},
|
62
|
-
{
|
63
|
-
# lysosome lumen
|
64
|
-
BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_L,
|
65
|
-
BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
|
66
|
-
BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0043202",
|
67
|
-
},
|
68
|
-
{
|
69
|
-
# lysosomal membrane
|
70
|
-
BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_L,
|
71
|
-
BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
|
72
|
-
BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005765",
|
73
|
-
},
|
74
|
-
{
|
75
|
-
# mitochondrial intermembrane space
|
76
|
-
BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_M,
|
77
|
-
BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
|
78
|
-
BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005758",
|
79
|
-
},
|
80
|
-
{
|
81
|
-
# mitochondrial outer membrane
|
82
|
-
BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_M,
|
83
|
-
BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
|
84
|
-
BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005741",
|
85
|
-
},
|
86
|
-
{
|
87
|
-
# ER membrane
|
88
|
-
BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_R,
|
89
|
-
BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
|
90
|
-
BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005789",
|
91
|
-
},
|
92
|
-
{
|
93
|
-
# ER lumen
|
94
|
-
BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_R,
|
95
|
-
BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
|
96
|
-
BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005788",
|
97
|
-
},
|
98
|
-
{
|
99
|
-
# extracellular region
|
100
|
-
BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_E,
|
101
|
-
BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
|
102
|
-
BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005576",
|
103
|
-
},
|
104
|
-
{
|
105
|
-
# peroxosomal membrane
|
106
|
-
BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_X,
|
107
|
-
BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
|
108
|
-
BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005778",
|
109
|
-
},
|
110
|
-
{
|
111
|
-
# peroxosomal matrix
|
112
|
-
BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_X,
|
113
|
-
BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
|
114
|
-
BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005782",
|
115
|
-
},
|
116
|
-
{
|
117
|
-
# nucleolus
|
118
|
-
BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_N,
|
119
|
-
BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
|
120
|
-
BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005730",
|
121
|
-
},
|
122
|
-
{
|
123
|
-
# nuclear envelope
|
124
|
-
BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_N,
|
125
|
-
BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
|
126
|
-
BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005635",
|
127
|
-
},
|
128
|
-
{
|
129
|
-
# nucleoplasm
|
130
|
-
BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_N,
|
131
|
-
BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
|
132
|
-
BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005654",
|
133
|
-
},
|
134
|
-
{
|
135
|
-
# golgi membrane
|
136
|
-
BIGG_RECON3D_FIELD_ID: "g",
|
137
|
-
BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
|
138
|
-
BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0000139",
|
139
|
-
},
|
140
|
-
{
|
141
|
-
# golgi lumen
|
142
|
-
BIGG_RECON3D_FIELD_ID: "g",
|
143
|
-
BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
|
144
|
-
BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005796",
|
145
|
-
},
|
146
|
-
{
|
147
|
-
# mitochondrial matrix
|
148
|
-
BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_I,
|
149
|
-
BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
|
150
|
-
BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005759",
|
151
|
-
},
|
152
|
-
{
|
153
|
-
# mitochondrial inner membrane
|
154
|
-
BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_I,
|
155
|
-
BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
|
156
|
-
BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005743",
|
157
|
-
},
|
158
|
-
]
|
159
|
-
|
160
50
|
# IDENTIFIERS ETL
|
161
51
|
IDENTIFIERS_ETL_YEAST_URL = "https://www.uniprot.org/docs/yeast.txt"
|
162
52
|
IDENTIFIERS_ETL_SBO_URL = (
|
@@ -239,11 +129,6 @@ SBML_COMPARTMENTALIZED_SPECIES_DICT_SOURCE = "sc_Source"
|
|
239
129
|
|
240
130
|
SBML_REACTION_ATTR_GET_GENE_PRODUCT = "getGeneProduct"
|
241
131
|
|
242
|
-
SBML_ANNOTATION_METHOD_GET_SPECIES = "getSpecies"
|
243
|
-
SBML_ANNOTATION_METHOD_GET_COMPARTMENT = "getCompartment"
|
244
|
-
SBML_ANNOTATION_METHOD_GET_REACTION = "getReaction"
|
245
|
-
|
246
|
-
|
247
132
|
# STRING
|
248
133
|
STRING_URL_EXPRESSIONS = {
|
249
134
|
"interactions": "https://stringdb-static.org/download/protein.links.full.v{version}/{taxid}.protein.links.full.v{version}.txt.gz",
|
napistu/ingestion/gtex.py
ADDED
@@ -0,0 +1,113 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import logging
|
4
|
+
import pandas as pd
|
5
|
+
from fs import open_fs
|
6
|
+
from napistu import utils
|
7
|
+
|
8
|
+
from napistu.constants import ONTOLOGIES
|
9
|
+
from napistu.ingestion.constants import GTEX_DEFS, GTEX_RNASEQ_EXPRESSION_URL
|
10
|
+
|
11
|
+
logger = logging.getLogger(__name__)
|
12
|
+
|
13
|
+
|
14
|
+
def download_gtex_rnaseq(
    target_uri: str, url: str = GTEX_RNASEQ_EXPRESSION_URL
) -> None:
    """Fetch the GTEx RNA-seq expression file and store it at ``target_uri``.

    Parameters
    ----------
    target_uri : str
        Destination URI for the downloaded file.
    url : str, optional
        Location of the GTEx expression file. Defaults to
        GTEX_RNASEQ_EXPRESSION_URL (the GTEx Analysis V8 gene-level median
        TPM table).

    Returns
    -------
    None

    Notes
    -----
    The transfer itself is delegated to ``utils.download_wget``; this
    function only logs the request and forwards ``url`` and ``target_uri``.
    """
    logger.info("Start downloading gtex %s to %s", url, target_uri)
    utils.download_wget(url, target_uri)
    return None
|
39
|
+
|
40
|
+
|
41
|
+
def load_and_clean_gtex_data(gtex_data_path: str) -> pd.DataFrame:
    """Read the GTEx median-expression table and tidy its columns.

    Parameters
    ----------
    gtex_data_path : str
        Path or URI of the GTEx tissue-specific expression file (medians).

    Returns
    -------
    pd.DataFrame
        The full GTEx table with the identifier columns moved to the front:

        - ``ONTOLOGIES.ENSEMBL_GENE``: the GTEx ``Name`` value with a trailing
          ``.<digits>`` version suffix removed
        - ``ONTOLOGIES.ENSEMBL_GENE_VERSION``: the original GTEx ``Name`` value
        - ``ONTOLOGIES.SYMBOL``: the GTEx ``Description`` value
        - every remaining column, converted to numeric (values that cannot
          be parsed become NaN)

    Notes
    -----
    The first two lines of the file are skipped before the header row is
    read. All columns are loaded as strings so that identifiers are kept
    verbatim; only the non-identifier columns are converted to numbers
    afterwards.

    Errors from opening the file (e.g. a missing path) are not handled here
    and propagate from the filesystem layer.
    """
    source_root, source_file = utils.get_source_base_and_path(gtex_data_path)

    logger.info("Loading GTEx tissue specific expression data")

    # The table is tab-separated and preceded by two non-tabular lines.
    with open_fs(source_root) as source_fs, source_fs.open(
        source_file, "rb"
    ) as handle:
        expression = pd.read_csv(
            handle,
            sep="\t",
            skiprows=2,
            dtype=str,
            na_values=[""],
            keep_default_na=True,
        )

    # Derive the unversioned gene id before the source column is renamed.
    versioned_ids = expression[GTEX_DEFS.NAME]
    expression[ONTOLOGIES.ENSEMBL_GENE] = versioned_ids.str.replace(
        r"\.[0-9]+$", "", regex=True
    )

    column_renames = {
        GTEX_DEFS.NAME: ONTOLOGIES.ENSEMBL_GENE_VERSION,
        GTEX_DEFS.DESCRIPTION: ONTOLOGIES.SYMBOL,
    }
    expression = expression.rename(columns=column_renames)

    # Identifier columns lead; everything else keeps its file order.
    id_columns = [
        ONTOLOGIES.ENSEMBL_GENE,
        ONTOLOGIES.ENSEMBL_GENE_VERSION,
        ONTOLOGIES.SYMBOL,
    ]
    value_columns = [
        name for name in expression.columns if name not in id_columns
    ]
    expression = expression[id_columns + value_columns]

    # Everything that is not an identifier is treated as a measurement.
    expression[value_columns] = expression[value_columns].apply(
        pd.to_numeric, errors="coerce"
    )

    return expression
|
napistu/ingestion/hpa.py
ADDED
@@ -0,0 +1,147 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import logging
|
4
|
+
import pandas as pd
|
5
|
+
from napistu import utils
|
6
|
+
from fs import open_fs
|
7
|
+
from napistu.constants import ONTOLOGIES
|
8
|
+
from napistu.ingestion.constants import PROTEINATLAS_SUBCELL_LOC_URL, PROTEINATLAS_DEFS
|
9
|
+
|
10
|
+
|
11
|
+
logger = logging.getLogger(__name__)
|
12
|
+
|
13
|
+
|
14
|
+
def download_hpa_data(target_uri: str, url: str = PROTEINATLAS_SUBCELL_LOC_URL) -> None:
    """Fetch the Human Protein Atlas subcellular-localization table.

    Parameters
    ----------
    target_uri : str
        Destination URI for the extracted table; it must end with ``.tsv``.
    url : str, optional
        Location of the zipped localization TSV. Defaults to
        PROTEINATLAS_SUBCELL_LOC_URL.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``target_uri`` does not end with ``.tsv``.

    Notes
    -----
    The archive member to extract is derived from the URL: the last path
    segment with the URL's final extension cut off (for the default URL,
    ``subcellular_location.tsv``). That name is handed to
    ``utils.download_wget`` as ``target_filename``.
    """
    if not target_uri.endswith(".tsv"):
        raise ValueError(f"Target URI must end with .tsv, got {target_uri}")

    # Text after the last "." in the URL, e.g. "zip".
    archive_ext = url.rpartition(".")[2]
    # Last path segment, truncated at the first occurrence of ".<ext>".
    archive_name = url.rpartition("/")[2]
    target_filename = archive_name.partition(f".{archive_ext}")[0]

    logger.info("Start downloading proteinatlas %s to %s", url, target_uri)
    # target_filename is the name of the file in the zip file which will be renamed to target_uri
    utils.download_wget(url, target_uri, target_filename=target_filename)

    return None
|
50
|
+
|
51
|
+
|
52
|
+
def load_and_clean_hpa_data(hpa_data_path: str) -> pd.DataFrame:
    """Load and format Human Protein Atlas subcellular localization data.

    Parameters
    ----------
    hpa_data_path : str
        Path to HPA subcellular localization data TSV file

    Returns
    -------
    pd.DataFrame
        Matrix with genes as rows and GO terms as columns. Each cell is 1 if
        the gene (row) is annotated with that compartment (column) and 0
        otherwise. Genes with no compartment annotations are absent.

    Notes
    -----
    The ``GO id`` column holds ``;``-separated terms. Each gene/term pair is
    counted at most once, so repeated gene rows or a term listed twice in one
    cell still produce a strictly binary matrix. Empty tokens (e.g. from a
    trailing ``;``) are discarded rather than becoming a nameless column.

    The number of genes dropped for lack of annotations is logged at debug
    level and the final matrix dimensions at info level.

    Raises
    ------
    ValueError
        If no gene-compartment associations are found in the data

    Errors from opening the file (e.g. a missing path) are not handled here
    and propagate from the filesystem layer.
    """
    base_path, file_name = utils.get_source_base_and_path(hpa_data_path)

    logger.info("Loading Human Protein Atlas subcellular localization data")

    # Read every column as text so identifiers are kept verbatim.
    with open_fs(base_path) as base_fs:
        with base_fs.open(file_name, "rb") as f:
            protein_subcellular_localizations = pd.read_csv(
                f, sep="\t", dtype=str, na_values=[""], keep_default_na=True
            )

    # Rename Gene column to be more informative
    protein_subcellular_localizations = protein_subcellular_localizations.rename(
        columns={PROTEINATLAS_DEFS.GENE: ONTOLOGIES.ENSEMBL_GENE}
    )

    # Long-format table with one row per (gene, GO term) pair. Rows without
    # GO terms are dropped up front; they contribute no pairs.
    go_col = PROTEINATLAS_DEFS.GO_ID
    gene_go_df = (
        protein_subcellular_localizations[[ONTOLOGIES.ENSEMBL_GENE, go_col]]
        .dropna(subset=[go_col])
        .assign(**{go_col: lambda df: df[go_col].str.split(";")})
        .explode(go_col)
        .rename(columns={go_col: ONTOLOGIES.GO})
    )
    # Discard empty tokens and repeated pairs; crosstab counts occurrences,
    # so duplicates would otherwise yield values above 1.
    gene_go_df = gene_go_df[gene_go_df[ONTOLOGIES.GO] != ""].drop_duplicates()

    if gene_go_df.empty:
        raise ValueError("No gene-compartment associations found in the data")

    localization_matrix = pd.crosstab(
        gene_go_df[ONTOLOGIES.ENSEMBL_GENE], gene_go_df[ONTOLOGIES.GO]
    ).astype(int)

    # Log number of genes without compartments that were filtered
    n_total_genes = len(
        protein_subcellular_localizations[ONTOLOGIES.ENSEMBL_GENE].unique()
    )
    n_genes_with_compartments = len(localization_matrix)
    n_filtered = n_total_genes - n_genes_with_compartments
    if n_filtered > 0:
        logger.debug(
            "Filtered out %d genes with no compartment annotations (from %d total genes)",
            n_filtered,
            n_total_genes,
        )

    logger.info(
        "Created localization matrix with shape %d genes x %d compartments",
        localization_matrix.shape[0],
        localization_matrix.shape[1],
    )

    return localization_matrix
|