napistu 0.2.5.dev6__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. napistu/__main__.py +126 -96
  2. napistu/constants.py +35 -41
  3. napistu/context/__init__.py +10 -0
  4. napistu/context/discretize.py +462 -0
  5. napistu/context/filtering.py +387 -0
  6. napistu/gcs/__init__.py +1 -1
  7. napistu/identifiers.py +74 -15
  8. napistu/indices.py +68 -0
  9. napistu/ingestion/__init__.py +1 -1
  10. napistu/ingestion/bigg.py +47 -62
  11. napistu/ingestion/constants.py +18 -133
  12. napistu/ingestion/gtex.py +113 -0
  13. napistu/ingestion/hpa.py +147 -0
  14. napistu/ingestion/sbml.py +0 -97
  15. napistu/ingestion/string.py +2 -2
  16. napistu/matching/__init__.py +10 -0
  17. napistu/matching/constants.py +18 -0
  18. napistu/matching/interactions.py +518 -0
  19. napistu/matching/mount.py +529 -0
  20. napistu/matching/species.py +510 -0
  21. napistu/mcp/__init__.py +7 -4
  22. napistu/mcp/__main__.py +128 -72
  23. napistu/mcp/client.py +16 -25
  24. napistu/mcp/codebase.py +201 -153
  25. napistu/mcp/component_base.py +170 -0
  26. napistu/mcp/config.py +223 -0
  27. napistu/mcp/constants.py +45 -2
  28. napistu/mcp/documentation.py +253 -136
  29. napistu/mcp/documentation_utils.py +13 -48
  30. napistu/mcp/execution.py +372 -305
  31. napistu/mcp/health.py +49 -67
  32. napistu/mcp/profiles.py +10 -6
  33. napistu/mcp/server.py +161 -80
  34. napistu/mcp/tutorials.py +139 -87
  35. napistu/modify/__init__.py +1 -1
  36. napistu/modify/gaps.py +1 -1
  37. napistu/network/__init__.py +1 -1
  38. napistu/network/constants.py +101 -34
  39. napistu/network/data_handling.py +388 -0
  40. napistu/network/ig_utils.py +351 -0
  41. napistu/network/napistu_graph_core.py +354 -0
  42. napistu/network/neighborhoods.py +40 -40
  43. napistu/network/net_create.py +373 -309
  44. napistu/network/net_propagation.py +47 -19
  45. napistu/network/{net_utils.py → ng_utils.py} +124 -272
  46. napistu/network/paths.py +67 -51
  47. napistu/network/precompute.py +11 -11
  48. napistu/ontologies/__init__.py +10 -0
  49. napistu/ontologies/constants.py +129 -0
  50. napistu/ontologies/dogma.py +243 -0
  51. napistu/ontologies/genodexito.py +649 -0
  52. napistu/ontologies/mygene.py +369 -0
  53. napistu/ontologies/renaming.py +198 -0
  54. napistu/rpy2/__init__.py +229 -86
  55. napistu/rpy2/callr.py +47 -77
  56. napistu/rpy2/constants.py +24 -23
  57. napistu/rpy2/rids.py +61 -648
  58. napistu/sbml_dfs_core.py +587 -222
  59. napistu/scverse/__init__.py +15 -0
  60. napistu/scverse/constants.py +28 -0
  61. napistu/scverse/loading.py +727 -0
  62. napistu/utils.py +118 -10
  63. {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/METADATA +8 -3
  64. napistu-0.3.1.dist-info/RECORD +133 -0
  65. tests/conftest.py +22 -0
  66. tests/test_context_discretize.py +56 -0
  67. tests/test_context_filtering.py +267 -0
  68. tests/test_identifiers.py +100 -0
  69. tests/test_indices.py +65 -0
  70. tests/{test_edgelist.py → test_ingestion_napistu_edgelist.py} +2 -2
  71. tests/test_matching_interactions.py +108 -0
  72. tests/test_matching_mount.py +305 -0
  73. tests/test_matching_species.py +394 -0
  74. tests/test_mcp_config.py +193 -0
  75. tests/test_mcp_documentation_utils.py +12 -3
  76. tests/test_mcp_server.py +356 -0
  77. tests/test_network_data_handling.py +397 -0
  78. tests/test_network_ig_utils.py +23 -0
  79. tests/test_network_neighborhoods.py +19 -0
  80. tests/test_network_net_create.py +459 -0
  81. tests/test_network_ng_utils.py +30 -0
  82. tests/test_network_paths.py +56 -0
  83. tests/{test_precomputed_distances.py → test_network_precompute.py} +8 -6
  84. tests/test_ontologies_genodexito.py +58 -0
  85. tests/test_ontologies_mygene.py +39 -0
  86. tests/test_ontologies_renaming.py +110 -0
  87. tests/test_rpy2_callr.py +79 -0
  88. tests/test_rpy2_init.py +151 -0
  89. tests/test_sbml.py +0 -31
  90. tests/test_sbml_dfs_core.py +134 -10
  91. tests/test_scverse_loading.py +778 -0
  92. tests/test_set_coverage.py +2 -2
  93. tests/test_utils.py +121 -1
  94. napistu/mechanism_matching.py +0 -1353
  95. napistu/rpy2/netcontextr.py +0 -467
  96. napistu-0.2.5.dev6.dist-info/RECORD +0 -97
  97. tests/test_igraph.py +0 -367
  98. tests/test_mechanism_matching.py +0 -784
  99. tests/test_net_utils.py +0 -149
  100. tests/test_netcontextr.py +0 -105
  101. tests/test_rpy2.py +0 -61
  102. /napistu/ingestion/{cpr_edgelist.py → napistu_edgelist.py} +0 -0
  103. {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/WHEEL +0 -0
  104. {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/entry_points.txt +0 -0
  105. {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/licenses/LICENSE +0 -0
  106. {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/top_level.txt +0 -0
  107. /tests/{test_obo.py → test_ingestion_obo.py} +0 -0
napistu/ingestion/bigg.py CHANGED
@@ -1,21 +1,16 @@
1
1
  from __future__ import annotations
2
2
 
3
- import datetime
4
3
  import logging
5
4
  import os
6
5
  from typing import Iterable
7
6
 
8
- import pandas as pd
9
7
  from napistu import indices
10
8
  from napistu import sbml_dfs_core
11
9
  from napistu import utils
12
10
  from napistu.consensus import construct_sbml_dfs_dict
13
- from napistu.ingestion import sbml
14
- from napistu.ingestion.constants import BIGG_MODEL_FIELD_SPECIES
15
- from napistu.ingestion.constants import BIGG_MODEL_FIELD_URL
11
+ from napistu.ontologies.renaming import rename_species_ontologies
16
12
  from napistu.ingestion.constants import BIGG_MODEL_KEYS
17
13
  from napistu.ingestion.constants import BIGG_MODEL_URLS
18
- from napistu.ingestion.constants import BIGG_RECON3D_FIELD_ANNOTATION
19
14
  from napistu.ingestion.constants import SPECIES_FULL_NAME_HUMAN
20
15
  from napistu.ingestion.constants import SPECIES_FULL_NAME_MOUSE
21
16
  from napistu.ingestion.constants import SPECIES_FULL_NAME_YEAST
@@ -40,33 +35,17 @@ def bigg_sbml_download(bg_pathway_root: str, overwrite: bool = False) -> None:
40
35
  """
41
36
  utils.initialize_dir(bg_pathway_root, overwrite)
42
37
 
43
- bigg_models = {
44
- BIGG_MODEL_KEYS[SPECIES_FULL_NAME_HUMAN]: {
45
- BIGG_MODEL_FIELD_URL: BIGG_MODEL_URLS[SPECIES_FULL_NAME_HUMAN],
46
- BIGG_MODEL_FIELD_SPECIES: SPECIES_FULL_NAME_HUMAN,
38
+ bigg_models_df = indices.create_pathway_index_df(
39
+ model_keys=BIGG_MODEL_KEYS,
40
+ model_urls=BIGG_MODEL_URLS,
41
+ model_species={
42
+ SPECIES_FULL_NAME_HUMAN: SPECIES_FULL_NAME_HUMAN,
43
+ SPECIES_FULL_NAME_MOUSE: SPECIES_FULL_NAME_MOUSE,
44
+ SPECIES_FULL_NAME_YEAST: SPECIES_FULL_NAME_YEAST,
47
45
  },
48
- BIGG_MODEL_KEYS[SPECIES_FULL_NAME_MOUSE]: {
49
- BIGG_MODEL_FIELD_URL: BIGG_MODEL_URLS[SPECIES_FULL_NAME_MOUSE],
50
- BIGG_MODEL_FIELD_SPECIES: SPECIES_FULL_NAME_MOUSE,
51
- },
52
- BIGG_MODEL_KEYS[SPECIES_FULL_NAME_YEAST]: {
53
- BIGG_MODEL_FIELD_URL: BIGG_MODEL_URLS[SPECIES_FULL_NAME_YEAST],
54
- BIGG_MODEL_FIELD_SPECIES: SPECIES_FULL_NAME_YEAST,
55
- },
56
- }
57
- bigg_models_df = pd.DataFrame(bigg_models).T
58
- bigg_models_df["sbml_path"] = [
59
- os.path.join(bg_pathway_root, k) + ".sbml"
60
- for k in bigg_models_df.index.tolist()
61
- ]
62
- bigg_models_df["file"] = [os.path.basename(x) for x in bigg_models_df["sbml_path"]]
63
-
64
- # add other attributes which will be used in the pw_index
65
- bigg_models_df["date"] = datetime.date.today().strftime("%Y%m%d")
66
- bigg_models_df.index = bigg_models_df.index.rename("pathway_id")
67
- bigg_models_df = bigg_models_df.reset_index()
68
- bigg_models_df["name"] = bigg_models_df["pathway_id"]
69
- bigg_models_df = bigg_models_df.assign(source="BiGG")
46
+ base_path=bg_pathway_root,
47
+ source_name="BiGG",
48
+ )
70
49
 
71
50
  with open_fs(bg_pathway_root, create=True) as bg_fs:
72
51
  for _, row in bigg_models_df.iterrows():
@@ -84,41 +63,46 @@ def bigg_sbml_download(bg_pathway_root: str, overwrite: bool = False) -> None:
84
63
  return None
85
64
 
86
65
 
87
- def annotate_recon(raw_model_path: str, annotated_model_path: str) -> None:
88
- """Annotate Recon3D
89
- Add compartment annotations to Recon3D so it can be merged with other pathways
90
- """
91
- logger.warning(
92
- "add_sbml_annotations is deprecated and maybe removed in a future version of rcpr; "
93
- "we are now adding these annotation during ingestion by sbml.sbml_df_from_sbml() rather "
94
- "than directly appending them to the raw .sbml"
95
- )
96
- recon_3d_annotations = pd.DataFrame(BIGG_RECON3D_FIELD_ANNOTATION)
97
- sbml_model = sbml.SBML(raw_model_path)
98
- sbml.add_sbml_annotations(
99
- sbml_model, recon_3d_annotations, save_path=annotated_model_path
100
- )
101
-
102
- return None
103
-
104
-
105
66
  def construct_bigg_consensus(
106
67
  pw_index_inp: str | indices.PWIndex,
107
68
  species: str | Iterable[str] | None = None,
108
69
  outdir: str | None = None,
109
70
  ) -> sbml_dfs_core.SBML_dfs:
110
- """Constructs a BiGG SBML DFs Pathway Representation
111
-
112
- Attention: curently this does work only for a singly model. Integraiton of multiple
113
- models is not supported yet in BiGG.
114
-
115
- Args:
116
- pw_index_inp (str | indices.PWIndex): PWIndex or uri pointing to PWIndex
117
- species (str | Iterable[str] | None): one or more species to filter by. Default: no filtering
118
- outdir (str | None, optional): output directory used to cache results. Defaults to None.
119
-
120
- Returns:
121
- sbml_dfs_core.SBML_dfs: A consensus SBML
71
+ """Construct a BiGG SBML DFs pathway representation.
72
+
73
+ Parameters
74
+ ----------
75
+ pw_index_inp : str or indices.PWIndex
76
+ PWIndex object or URI pointing to PWIndex
77
+ species : str or Iterable[str] or None, optional
78
+ One or more species to filter by, by default None (no filtering)
79
+ outdir : str or None, optional
80
+ Output directory used to cache results, by default None
81
+
82
+ Returns
83
+ -------
84
+ sbml_dfs_core.SBML_dfs
85
+ A consensus SBML representation
86
+
87
+ Notes
88
+ -----
89
+ Currently this only works for a single model. Integration of multiple
90
+ models is not yet supported in BiGG.
91
+
92
+ The function:
93
+ 1. Loads/validates the pathway index
94
+ 2. Constructs SBML DFs dictionary
95
+ 3. Processes the single model:
96
+ - Infers compartmentalization for species without location
97
+ - Names compartmentalized species
98
+ - Validates the final model
99
+
100
+ Raises
101
+ ------
102
+ ValueError
103
+ If pw_index_inp is neither a PWIndex nor a string
104
+ NotImplementedError
105
+ If attempting to merge multiple models
122
106
  """
123
107
  if isinstance(pw_index_inp, str):
124
108
  pw_index = indices.adapt_pw_index(pw_index_inp, species=species, outdir=outdir)
@@ -142,5 +126,6 @@ def construct_bigg_consensus(
142
126
  # fix missing compartimentalization
143
127
  model = sbml_dfs_core.infer_uncompartmentalized_species_location(model)
144
128
  model = sbml_dfs_core.name_compartmentalized_species(model)
129
+ rename_species_ontologies(model)
145
130
  model.validate()
146
131
  return model
@@ -3,12 +3,30 @@ from __future__ import annotations
3
3
 
4
4
  from types import SimpleNamespace
5
5
 
6
+
6
7
  SPECIES_FULL_NAME_HUMAN = "Homo sapiens"
7
8
  SPECIES_FULL_NAME_MOUSE = "Mus musculus"
8
9
  SPECIES_FULL_NAME_YEAST = "Saccharomyces cerevisiae"
9
10
  SPECIES_FULL_NAME_RAT = "Rattus norvegicus"
10
11
  SPECIES_FULL_NAME_WORM = "Caenorhabditis elegans"
11
12
 
13
+ PROTEINATLAS_SUBCELL_LOC_URL = (
14
+ "https://www.proteinatlas.org/download/tsv/subcellular_location.tsv.zip"
15
+ )
16
+
17
+ PROTEINATLAS_DEFS = SimpleNamespace(
18
+ GO_ID="GO id",
19
+ GENE="Gene",
20
+ )
21
+
22
+
23
+ # GTEx
24
+ GTEX_RNASEQ_EXPRESSION_URL = "https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz"
25
+
26
+ GTEX_DEFS = SimpleNamespace(
27
+ NAME="Name",
28
+ DESCRIPTION="Description",
29
+ )
12
30
 
13
31
  # BIGG
14
32
  BIGG_MODEL_URLS = {
@@ -29,134 +47,6 @@ BIGG_RECON3D_FIELD_ID = "id"
29
47
  BIGG_RECON3D_FIELD_TYPE = "type"
30
48
  BIGG_RECON3D_FIELD_URI = "uri"
31
49
 
32
- BIGG_RECON3D_ID_C = "c"
33
- BIGG_RECON3D_ID_L = "l"
34
- BIGG_RECON3D_ID_E = "e"
35
- BIGG_RECON3D_ID_M = "m"
36
- BIGG_RECON3D_ID_R = "r"
37
- BIGG_RECON3D_ID_X = "x"
38
- BIGG_RECON3D_ID_N = "n"
39
- BIGG_RECON3D_ID_I = "i"
40
-
41
- BIGG_RECON3D_TYPE_COMPARTMENT = "compartment"
42
-
43
- BIGG_RECON3D_FIELD_ANNOTATION = [
44
- {
45
- # cytosol
46
- BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_C,
47
- BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
48
- BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005829",
49
- },
50
- {
51
- # cytoplasm
52
- BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_C,
53
- BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
54
- BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005737",
55
- },
56
- {
57
- # plasma membrane
58
- BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_C,
59
- BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
60
- BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005886",
61
- },
62
- {
63
- # lysosome lumen
64
- BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_L,
65
- BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
66
- BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0043202",
67
- },
68
- {
69
- # lysosomal membrane
70
- BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_L,
71
- BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
72
- BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005765",
73
- },
74
- {
75
- # mitochondrial intermembrane space
76
- BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_M,
77
- BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
78
- BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005758",
79
- },
80
- {
81
- # mitochondrial outer membrane
82
- BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_M,
83
- BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
84
- BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005741",
85
- },
86
- {
87
- # ER membrane
88
- BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_R,
89
- BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
90
- BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005789",
91
- },
92
- {
93
- # ER lumen
94
- BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_R,
95
- BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
96
- BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005788",
97
- },
98
- {
99
- # extracellular region
100
- BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_E,
101
- BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
102
- BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005576",
103
- },
104
- {
105
- # peroxosomal membrane
106
- BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_X,
107
- BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
108
- BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005778",
109
- },
110
- {
111
- # peroxosomal matrix
112
- BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_X,
113
- BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
114
- BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005782",
115
- },
116
- {
117
- # nucleolus
118
- BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_N,
119
- BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
120
- BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005730",
121
- },
122
- {
123
- # nuclear envelope
124
- BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_N,
125
- BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
126
- BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005635",
127
- },
128
- {
129
- # nucleoplasm
130
- BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_N,
131
- BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
132
- BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005654",
133
- },
134
- {
135
- # golgi membrane
136
- BIGG_RECON3D_FIELD_ID: "g",
137
- BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
138
- BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0000139",
139
- },
140
- {
141
- # golgi lumen
142
- BIGG_RECON3D_FIELD_ID: "g",
143
- BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
144
- BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005796",
145
- },
146
- {
147
- # mitochondrial matrix
148
- BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_I,
149
- BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
150
- BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005759",
151
- },
152
- {
153
- # mitochondrial inner membrane
154
- BIGG_RECON3D_FIELD_ID: BIGG_RECON3D_ID_I,
155
- BIGG_RECON3D_FIELD_TYPE: BIGG_RECON3D_TYPE_COMPARTMENT,
156
- BIGG_RECON3D_FIELD_URI: "https://www.ebi.ac.uk/QuickGO/term/GO:0005743",
157
- },
158
- ]
159
-
160
50
  # IDENTIFIERS ETL
161
51
  IDENTIFIERS_ETL_YEAST_URL = "https://www.uniprot.org/docs/yeast.txt"
162
52
  IDENTIFIERS_ETL_SBO_URL = (
@@ -239,11 +129,6 @@ SBML_COMPARTMENTALIZED_SPECIES_DICT_SOURCE = "sc_Source"
239
129
 
240
130
  SBML_REACTION_ATTR_GET_GENE_PRODUCT = "getGeneProduct"
241
131
 
242
- SBML_ANNOTATION_METHOD_GET_SPECIES = "getSpecies"
243
- SBML_ANNOTATION_METHOD_GET_COMPARTMENT = "getCompartment"
244
- SBML_ANNOTATION_METHOD_GET_REACTION = "getReaction"
245
-
246
-
247
132
  # STRING
248
133
  STRING_URL_EXPRESSIONS = {
249
134
  "interactions": "https://stringdb-static.org/download/protein.links.full.v{version}/{taxid}.protein.links.full.v{version}.txt.gz",
@@ -0,0 +1,113 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import pandas as pd
5
+ from fs import open_fs
6
+ from napistu import utils
7
+
8
+ from napistu.constants import ONTOLOGIES
9
+ from napistu.ingestion.constants import GTEX_DEFS, GTEX_RNASEQ_EXPRESSION_URL
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
def download_gtex_rnaseq(
    target_uri: str, url: str = GTEX_RNASEQ_EXPRESSION_URL
) -> None:
    """Fetch the GTEx RNA-seq expression table and store it at ``target_uri``.

    Parameters
    ----------
    target_uri : str
        Destination URI for the downloaded file.
    url : str, optional
        Location of the GTEx RNA-seq expression data. When omitted,
        ``GTEX_RNASEQ_EXPRESSION_URL`` is used, which points at the GTEx
        Analysis V8 gene-level median TPM table (dbGaP Accession
        phs000424.v8.p2).

    Returns
    -------
    None

    Notes
    -----
    This is a thin wrapper: it logs the transfer and hands both arguments to
    ``utils.download_wget``. Any decompression or error handling is whatever
    that helper provides.
    """
    logger.info("Start downloading gtex %s to %s", url, target_uri)
    utils.download_wget(url, target_uri)
    return None
39
+
40
+
41
def load_and_clean_gtex_data(gtex_data_path: str) -> pd.DataFrame:
    """Read the GTEx per-tissue median expression table into a tidy DataFrame.

    Parameters
    ----------
    gtex_data_path : str
        Path/URI of the GTEx tissue-specific expression file (medians).

    Returns
    -------
    pd.DataFrame
        Every column of the GTEx file, reorganised as follows:

        - ``ONTOLOGIES.ENSEMBL_GENE``: the GTEx ``Name`` value with a trailing
          ``.<digits>`` version suffix removed
        - ``ONTOLOGIES.ENSEMBL_GENE_VERSION``: the original GTEx ``Name`` value
        - ``ONTOLOGIES.SYMBOL``: the GTEx ``Description`` value
        - all remaining (tissue) columns, converted to numeric

    Notes
    -----
    Processing steps:

    1. Skip the first 2 lines of the file and read the rest as tab-separated
       text, with every column initially loaded as a string
    2. Derive the unversioned gene identifier from the ``Name`` column
    3. Rename the ``Name`` and ``Description`` columns
    4. Move the three identifier columns to the front
    5. Convert the remaining columns with ``pd.to_numeric``; values that
       cannot be parsed become NaN (``errors="coerce"``)

    Raises
    ------
    Exception
        Whatever ``utils.get_source_base_and_path``, ``open_fs`` or
        ``pd.read_csv`` raise for a missing or unreadable file. The exact
        type is decided by those libraries and is not checked here.
    KeyError
        If the ``Name`` column is absent from the file.
    """
    # Split the input into a filesystem root and a file name inside it
    # (assumed from the helper's name and how its result is used below).
    fs_root, relative_name = utils.get_source_base_and_path(gtex_data_path)

    logger.info("Loading GTEx tissue specific expression data")

    # The first two lines are skipped; the column header starts on line 3.
    with open_fs(fs_root) as source_fs, source_fs.open(relative_name, "rb") as handle:
        expression = pd.read_csv(
            handle,
            sep="\t",
            skiprows=2,
            dtype=str,
            na_values=[""],
            keep_default_na=True,
        )

    id_columns = [
        ONTOLOGIES.ENSEMBL_GENE,
        ONTOLOGIES.ENSEMBL_GENE_VERSION,
        ONTOLOGIES.SYMBOL,
    ]

    # Strip the trailing ".<version>" from the GTEx identifier.
    unversioned_ids = expression[GTEX_DEFS.NAME].str.replace(
        r"\.[0-9]+$", "", regex=True
    )
    expression[ONTOLOGIES.ENSEMBL_GENE] = unversioned_ids

    column_renames = {
        GTEX_DEFS.NAME: ONTOLOGIES.ENSEMBL_GENE_VERSION,
        GTEX_DEFS.DESCRIPTION: ONTOLOGIES.SYMBOL,
    }
    expression = expression.rename(columns=column_renames)

    # Identifier columns first, every other column after them in file order.
    tissue_columns = [
        name for name in expression.columns if name not in id_columns
    ]
    expression = expression[id_columns + tissue_columns]

    # Everything was read as text; turn the non-identifier columns into numbers.
    expression[tissue_columns] = expression[tissue_columns].apply(
        pd.to_numeric, errors="coerce"
    )

    return expression
@@ -0,0 +1,147 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import pandas as pd
5
+ from napistu import utils
6
+ from fs import open_fs
7
+ from napistu.constants import ONTOLOGIES
8
+ from napistu.ingestion.constants import PROTEINATLAS_SUBCELL_LOC_URL, PROTEINATLAS_DEFS
9
+
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
def download_hpa_data(target_uri: str, url: str = PROTEINATLAS_SUBCELL_LOC_URL) -> None:
    """Download protein localization data from the Human Protein Atlas.

    Parameters
    ----------
    target_uri : str
        The URI where the HPA data should be saved. Must end with ``.tsv``.
    url : str, optional
        URL of the zipped subcellular localization TSV.
        Defaults to ``PROTEINATLAS_SUBCELL_LOC_URL``.

    Returns
    -------
    None

    Notes
    -----
    The name of the file expected inside the archive is the last path segment
    of ``url`` with its final extension removed, e.g.
    ``.../subcellular_location.tsv.zip`` -> ``subcellular_location.tsv``.
    That name is passed to ``utils.download_wget`` as ``target_filename``;
    extraction and renaming to ``target_uri`` are left to that helper.

    Raises
    ------
    ValueError
        If ``target_uri`` does not end with ``.tsv``.
    """
    if not target_uri.endswith(".tsv"):
        raise ValueError(f"Target URI must end with .tsv, got {target_uri}")

    # Strip only the *final* extension of the archive's base name. Splitting
    # on the substring ".<ext>" instead would cut at its first occurrence and
    # mangle names such as "hpa.zipped.tsv.zip".
    archive_name = url.rsplit("/", 1)[-1]
    target_filename = archive_name.rsplit(".", 1)[0]

    logger.info("Start downloading proteinatlas %s to %s", url, target_uri)
    # target_filename is the name of the file in the zip file which will be renamed to target_uri
    utils.download_wget(url, target_uri, target_filename=target_filename)

    return None
50
+
51
+
52
def load_and_clean_hpa_data(hpa_data_path: str) -> pd.DataFrame:
    """Load and format Human Protein Atlas subcellular localization data.

    Parameters
    ----------
    hpa_data_path : str
        Path to HPA subcellular localization data TSV file

    Returns
    -------
    pd.DataFrame
        DataFrame with genes as rows and GO terms as columns. Each cell is a
        binary value (0 or 1) indicating whether that gene (row) is annotated
        with that compartment (column). Genes with no compartment annotations
        do not appear in the matrix.

    Notes
    -----
    The function:

    1. Reads the TSV with every column as a string
    2. Renames the HPA gene column to ``ONTOLOGIES.ENSEMBL_GENE``
    3. Splits the ``;``-separated GO id field into one (gene, term) pair per
       term, ignoring rows without a GO id and blank terms
    4. De-duplicates the pairs so a repeated gene row or repeated term cannot
       produce a count above 1
    5. Cross-tabulates the pairs into the gene x term matrix

    The number of genes dropped for lacking annotations is logged at debug
    level and the final matrix shape at info level.

    Raises
    ------
    ValueError
        If no gene-compartment associations are found in the data
    KeyError
        If the GO id column is absent from the file
    Exception
        Whatever ``utils.get_source_base_and_path``, ``open_fs`` or
        ``pd.read_csv`` raise for a missing or unreadable file. The exact
        type is decided by those libraries and is not checked here.
    """
    # Split the input into a filesystem root and a file name inside it
    # (assumed from the helper's name and how its result is used below).
    base_path, file_name = utils.get_source_base_and_path(hpa_data_path)

    logger.info("Loading Human Protein Atlas subcellular localization data")

    with open_fs(base_path) as base_fs, base_fs.open(file_name, "rb") as f:
        protein_subcellular_localizations = pd.read_csv(
            f, sep="\t", dtype=str, na_values=[""], keep_default_na=True
        )

    # Rename Gene column to be more informative
    protein_subcellular_localizations = protein_subcellular_localizations.rename(
        columns={PROTEINATLAS_DEFS.GENE: ONTOLOGIES.ENSEMBL_GENE}
    )

    # One row per (gene, GO term): drop genes without a GO id, split the
    # ";"-separated field and expand each list element onto its own row.
    gene_go_df = (
        protein_subcellular_localizations[
            [ONTOLOGIES.ENSEMBL_GENE, PROTEINATLAS_DEFS.GO_ID]
        ]
        .dropna(subset=[PROTEINATLAS_DEFS.GO_ID])
        .rename(columns={PROTEINATLAS_DEFS.GO_ID: ONTOLOGIES.GO})
    )
    gene_go_df[ONTOLOGIES.GO] = gene_go_df[ONTOLOGIES.GO].str.split(";")
    gene_go_df = gene_go_df.explode(ONTOLOGIES.GO)
    gene_go_df[ONTOLOGIES.GO] = gene_go_df[ONTOLOGIES.GO].str.strip()

    # Blank terms (e.g. from a trailing ";") would become an empty-named
    # column, and duplicated pairs would make crosstab report counts > 1.
    gene_go_df = (
        gene_go_df[gene_go_df[ONTOLOGIES.GO] != ""]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    if gene_go_df.empty:
        raise ValueError("No gene-compartment associations found in the data")

    # Pairs are unique, so every crosstab count is 0 or 1.
    localization_matrix = pd.crosstab(
        gene_go_df[ONTOLOGIES.ENSEMBL_GENE], gene_go_df[ONTOLOGIES.GO]
    ).astype(int)

    # Log number of genes without compartments that were filtered
    n_total_genes = len(
        protein_subcellular_localizations[ONTOLOGIES.ENSEMBL_GENE].unique()
    )
    n_genes_with_compartments = len(localization_matrix)
    n_filtered = n_total_genes - n_genes_with_compartments
    if n_filtered > 0:
        logger.debug(
            "Filtered out %d genes with no compartment annotations (from %d total genes)",
            n_filtered,
            n_total_genes,
        )

    logger.info(
        "Created localization matrix with shape %d genes x %d compartments",
        localization_matrix.shape[0],
        localization_matrix.shape[1],
    )

    return localization_matrix