napistu 0.2.5.dev7__py3-none-any.whl → 0.3.1.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108) hide show
  1. napistu/__init__.py +1 -3
  2. napistu/__main__.py +126 -96
  3. napistu/constants.py +35 -41
  4. napistu/context/__init__.py +10 -0
  5. napistu/context/discretize.py +462 -0
  6. napistu/context/filtering.py +387 -0
  7. napistu/gcs/__init__.py +1 -1
  8. napistu/identifiers.py +74 -15
  9. napistu/indices.py +68 -0
  10. napistu/ingestion/__init__.py +1 -1
  11. napistu/ingestion/bigg.py +47 -62
  12. napistu/ingestion/constants.py +18 -133
  13. napistu/ingestion/gtex.py +113 -0
  14. napistu/ingestion/hpa.py +147 -0
  15. napistu/ingestion/sbml.py +0 -97
  16. napistu/ingestion/string.py +2 -2
  17. napistu/matching/__init__.py +10 -0
  18. napistu/matching/constants.py +18 -0
  19. napistu/matching/interactions.py +518 -0
  20. napistu/matching/mount.py +529 -0
  21. napistu/matching/species.py +510 -0
  22. napistu/mcp/__init__.py +7 -4
  23. napistu/mcp/__main__.py +128 -72
  24. napistu/mcp/client.py +16 -25
  25. napistu/mcp/codebase.py +201 -145
  26. napistu/mcp/component_base.py +170 -0
  27. napistu/mcp/config.py +223 -0
  28. napistu/mcp/constants.py +45 -2
  29. napistu/mcp/documentation.py +253 -136
  30. napistu/mcp/documentation_utils.py +13 -48
  31. napistu/mcp/execution.py +372 -305
  32. napistu/mcp/health.py +47 -65
  33. napistu/mcp/profiles.py +10 -6
  34. napistu/mcp/server.py +161 -80
  35. napistu/mcp/tutorials.py +139 -87
  36. napistu/modify/__init__.py +1 -1
  37. napistu/modify/gaps.py +1 -1
  38. napistu/network/__init__.py +1 -1
  39. napistu/network/constants.py +101 -34
  40. napistu/network/data_handling.py +388 -0
  41. napistu/network/ig_utils.py +351 -0
  42. napistu/network/napistu_graph_core.py +354 -0
  43. napistu/network/neighborhoods.py +40 -40
  44. napistu/network/net_create.py +373 -309
  45. napistu/network/net_propagation.py +47 -19
  46. napistu/network/{net_utils.py → ng_utils.py} +124 -272
  47. napistu/network/paths.py +67 -51
  48. napistu/network/precompute.py +11 -11
  49. napistu/ontologies/__init__.py +10 -0
  50. napistu/ontologies/constants.py +129 -0
  51. napistu/ontologies/dogma.py +243 -0
  52. napistu/ontologies/genodexito.py +649 -0
  53. napistu/ontologies/mygene.py +369 -0
  54. napistu/ontologies/renaming.py +198 -0
  55. napistu/rpy2/__init__.py +229 -86
  56. napistu/rpy2/callr.py +47 -77
  57. napistu/rpy2/constants.py +24 -23
  58. napistu/rpy2/rids.py +61 -648
  59. napistu/sbml_dfs_core.py +587 -222
  60. napistu/scverse/__init__.py +15 -0
  61. napistu/scverse/constants.py +28 -0
  62. napistu/scverse/loading.py +727 -0
  63. napistu/utils.py +118 -10
  64. {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/METADATA +8 -3
  65. napistu-0.3.1.dev1.dist-info/RECORD +133 -0
  66. tests/conftest.py +22 -0
  67. tests/test_context_discretize.py +56 -0
  68. tests/test_context_filtering.py +267 -0
  69. tests/test_identifiers.py +100 -0
  70. tests/test_indices.py +65 -0
  71. tests/{test_edgelist.py → test_ingestion_napistu_edgelist.py} +2 -2
  72. tests/test_matching_interactions.py +108 -0
  73. tests/test_matching_mount.py +305 -0
  74. tests/test_matching_species.py +394 -0
  75. tests/test_mcp_config.py +193 -0
  76. tests/test_mcp_documentation_utils.py +12 -3
  77. tests/test_mcp_server.py +156 -19
  78. tests/test_network_data_handling.py +397 -0
  79. tests/test_network_ig_utils.py +23 -0
  80. tests/test_network_neighborhoods.py +19 -0
  81. tests/test_network_net_create.py +459 -0
  82. tests/test_network_ng_utils.py +30 -0
  83. tests/test_network_paths.py +56 -0
  84. tests/{test_precomputed_distances.py → test_network_precompute.py} +8 -6
  85. tests/test_ontologies_genodexito.py +58 -0
  86. tests/test_ontologies_mygene.py +39 -0
  87. tests/test_ontologies_renaming.py +110 -0
  88. tests/test_rpy2_callr.py +79 -0
  89. tests/test_rpy2_init.py +151 -0
  90. tests/test_sbml.py +0 -31
  91. tests/test_sbml_dfs_core.py +134 -10
  92. tests/test_scverse_loading.py +778 -0
  93. tests/test_set_coverage.py +2 -2
  94. tests/test_utils.py +121 -1
  95. napistu/mechanism_matching.py +0 -1353
  96. napistu/rpy2/netcontextr.py +0 -467
  97. napistu-0.2.5.dev7.dist-info/RECORD +0 -98
  98. tests/test_igraph.py +0 -367
  99. tests/test_mechanism_matching.py +0 -784
  100. tests/test_net_utils.py +0 -149
  101. tests/test_netcontextr.py +0 -105
  102. tests/test_rpy2.py +0 -61
  103. /napistu/ingestion/{cpr_edgelist.py → napistu_edgelist.py} +0 -0
  104. {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/WHEEL +0 -0
  105. {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/entry_points.txt +0 -0
  106. {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/licenses/LICENSE +0 -0
  107. {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/top_level.txt +0 -0
  108. /tests/{test_obo.py → test_ingestion_obo.py} +0 -0
@@ -0,0 +1,243 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import pandas as pd
5
+
6
+ from napistu import sbml_dfs_core
7
+ from napistu import source
8
+ from napistu import identifiers
9
+ from napistu import utils
10
+ from napistu.ontologies.genodexito import Genodexito
11
+ from napistu.constants import BQB
12
+ from napistu.constants import IDENTIFIERS
13
+ from napistu.constants import MINI_SBO_FROM_NAME
14
+ from napistu.constants import ONTOLOGIES
15
+ from napistu.constants import SBML_DFS
16
+ from napistu.ontologies.constants import INTERCONVERTIBLE_GENIC_ONTOLOGIES
17
+ from napistu.ontologies.constants import GENE_ONTOLOGIES # noqa: F401
18
+ from napistu.ontologies.constants import GENODEXITO_DEFS
19
+ from napistu.ontologies.constants import NAME_ONTOLOGIES
20
+ from napistu.ontologies.constants import PROTEIN_ONTOLOGIES
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
def create_dogmatic_sbml_dfs(
    species: str,
    preferred_method: str = GENODEXITO_DEFS.BIOCONDUCTOR,
    allow_fallback: bool = True,
    r_paths: str | None = None,
) -> sbml_dfs_core.SBML_dfs:
    """
    Create Dogmatic SBML_dfs

    Build an SBML_dfs model which is pretty much just proteins (no real
    reactions), carrying annotations that link proteins to genes along with
    readable labels for the genes/proteins.

    Args:
        species (str):
            An organismal species (e.g., Homo sapiens)
        preferred_method (str):
            Passed through to Genodexito as its preferred mapping method
            (defaults to GENODEXITO_DEFS.BIOCONDUCTOR)
        allow_fallback (bool):
            Passed through to Genodexito; presumably permits an alternative
            method when the preferred one is unavailable (confirm against
            Genodexito)
        r_paths (str or None):
            Optional, path to an R packages directory

    Returns:
        dogmatic_sbml_dfs (sbml_dfs_core.SBML_dfs):
            A pathway model which (pretty much) just contains proteins and
            diverse identifiers
    """

    mappings = _connect_dogmatic_mappings(
        species, preferred_method, allow_fallback, r_paths
    )
    identifiers_by_cluster = mappings["cluster_consensus_identifiers_df"]
    names_by_cluster = mappings["s_name_series"]

    logger.info("Creating inputs for sbml_dfs_from_edgelist()")

    # one row per cluster (molecular species): identifiers plus a display name
    species_df = identifiers_by_cluster.join(names_by_cluster)

    # required-but-invariant inputs are stubbed
    compartments_df = sbml_dfs_core._stub_compartments()
    interaction_source = source.Source(init=True)

    # sbml_dfs_from_edgelist() needs an interactions table, so every species
    # gets a placeholder self-interaction; almost all of them are dropped below
    renamed_species = species_df.rename(
        columns={
            "s_name": "upstream_name",
            SBML_DFS.S_IDENTIFIERS: SBML_DFS.R_IDENTIFIERS,
        }
    )
    interaction_edgelist = renamed_species.assign(
        downstream_name=renamed_species["upstream_name"],
        upstream_compartment="cellular_component",
        downstream_compartment="cellular_component",
        r_name=renamed_species["upstream_name"],
        sbo_term=MINI_SBO_FROM_NAME["reactant"],
        r_isreversible=False,
    )

    dogmatic_sbml_dfs = sbml_dfs_core.sbml_dfs_from_edgelist(
        interaction_edgelist=interaction_edgelist,
        species_df=species_df,
        compartments_df=compartments_df,
        interaction_source=interaction_source,
        upstream_stoichiometry=-1,
        downstream_stoichiometry=1,
        downstream_sbo_name="product",
    )

    # keep only the first reaction so the model still passes
    # sbml_dfs.validate(); this remaining self reaction will be removed when
    # creating the graph
    surplus_reactions = dogmatic_sbml_dfs.reactions.index.tolist()[1:]
    dogmatic_sbml_dfs.remove_reactions(surplus_reactions)

    return dogmatic_sbml_dfs
94
+
95
+
96
def _connect_dogmatic_mappings(
    species: str,
    preferred_method: str = GENODEXITO_DEFS.BIOCONDUCTOR,
    allow_fallback: bool = True,
    r_paths: str | None = None,
) -> dict:
    """
    Connect Dogmatic Mappings

    Merge all ontologies into greedy clusters based on shared associations to entrez ids

    Args:
        species (str):
            An organismal species (e.g., Homo sapiens)
        preferred_method (str):
            Passed through to Genodexito as its preferred mapping method
            (defaults to GENODEXITO_DEFS.BIOCONDUCTOR)
        allow_fallback (bool):
            Passed through to Genodexito; presumably permits an alternative
            method when the preferred one is unavailable (confirm against
            Genodexito)
        r_paths (str or None):
            Optional, path to an R packages directory

    Returns:
        dict with:
            - s_name_series: a series where the index is distinct molecular species and the values are names.
            - cluster_consensus_identifiers_df: a pd.DataFrame where the index is distinct molecular species
            and values are identifiers objects.
    """

    genodexito = Genodexito(
        species=species,
        preferred_method=preferred_method,
        allow_fallback=allow_fallback,
        r_paths=r_paths,
    )

    genodexito.create_mapping_tables(mappings=INTERCONVERTIBLE_GENIC_ONTOLOGIES)

    genodexito.stack_mappings(ontologies=set(PROTEIN_ONTOLOGIES))
    protein_mappings = genodexito.stacked_mappings

    # apply greedy graph-based clustering to connect proteins with a common mapping to entrez
    edgelist_df = utils.format_identifiers_as_edgelist(
        protein_mappings, [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
    )
    connected_indices = utils.find_weakly_connected_subgraphs(
        edgelist_df[["ind", "id"]]
    )

    # add clusters to proteins. Each cluster will be a distinct molecular species
    protein_mappings_w_clusters = protein_mappings.reset_index().merge(
        connected_indices
    )

    # combine entrez + cluster so we can pass cluster to non-protein attributes
    entrez_clusters = protein_mappings_w_clusters[
        [ONTOLOGIES.NCBI_ENTREZ_GENE, "cluster"]
    ].drop_duplicates()

    # check for the other ontologies aside from proteins and entrez (since that's in the index).
    # .difference() returns a new set, so discarding from it leaves the constant untouched
    other_ontologies = INTERCONVERTIBLE_GENIC_ONTOLOGIES.difference(
        set(PROTEIN_ONTOLOGIES)
    )
    other_ontologies.discard(ONTOLOGIES.NCBI_ENTREZ_GENE)

    genodexito.stack_mappings(ontologies=other_ontologies)
    other_mappings = genodexito.stacked_mappings

    other_mappings_w_clusters = entrez_clusters.merge(
        other_mappings, left_on=ONTOLOGIES.NCBI_ENTREZ_GENE, right_index=True
    )

    # candidate display names: identifiers from any ontology listed in NAME_ONTOLOGIES
    possible_names = pd.concat(
        [
            protein_mappings_w_clusters.query("ontology in @NAME_ONTOLOGIES.keys()"),
            other_mappings_w_clusters.query("ontology in @NAME_ONTOLOGIES.keys()"),
        ]
    )[["cluster", IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]]

    possible_names.loc[:, "ontology_preference"] = possible_names[
        IDENTIFIERS.ONTOLOGY
    ].map(NAME_ONTOLOGIES)

    # remove possible names which are present in multiple clusters.
    # all clusters will need unique names to use sbml_dfs_from_edgelist()
    id_counts = (
        possible_names[["cluster", IDENTIFIERS.IDENTIFIER]]
        .drop_duplicates()
        .value_counts(IDENTIFIERS.IDENTIFIER)
    )
    possible_names = possible_names[
        ~possible_names[IDENTIFIERS.IDENTIFIER].isin(
            id_counts[id_counts > 1].index.tolist()
        )
    ]

    # per cluster, take the first candidate after an ascending sort on
    # ontology preference and then nameness score
    s_name_series = (
        utils._add_nameness_score(possible_names, IDENTIFIERS.IDENTIFIER)
        .sort_values(["ontology_preference", "nameness_score"])
        .groupby("cluster")
        .first()
        .rename(columns={IDENTIFIERS.IDENTIFIER: SBML_DFS.S_NAME})[SBML_DFS.S_NAME]
    )

    # shared column layout for the three identifier tables stacked below
    id_columns = [
        "cluster",
        IDENTIFIERS.IDENTIFIER,
        IDENTIFIERS.ONTOLOGY,
        IDENTIFIERS.BQB,
    ]

    # proteins "are" the species; gene-level ids (including entrez) "encode" it
    protein_ids = protein_mappings_w_clusters.assign(bqb=BQB.IS)[id_columns]
    gene_ids = other_mappings_w_clusters.query("ontology in @GENE_ONTOLOGIES").assign(
        bqb=BQB.IS_ENCODED_BY
    )[id_columns]
    entrez_ids = entrez_clusters.assign(
        ontology=ONTOLOGIES.NCBI_ENTREZ_GENE, bqb=BQB.IS_ENCODED_BY
    ).rename(columns={ONTOLOGIES.NCBI_ENTREZ_GENE: IDENTIFIERS.IDENTIFIER})[id_columns]

    # combine all ids to setup a single cluster-level Identifiers
    all_ids = pd.concat([protein_ids, gene_ids, entrez_ids])
    all_ids.loc[:, IDENTIFIERS.URL] = [
        identifiers.create_uri_url(x, y)
        for x, y in zip(all_ids[IDENTIFIERS.ONTOLOGY], all_ids[IDENTIFIERS.IDENTIFIER])
    ]

    # create one Identifiers object for each new species; orient="records"
    # yields one dict per row directly, without transposing each group
    record_columns = [
        IDENTIFIERS.ONTOLOGY,
        IDENTIFIERS.IDENTIFIER,
        IDENTIFIERS.URL,
        IDENTIFIERS.BQB,
    ]
    cluster_consensus_identifiers = {
        cluster: identifiers.Identifiers(
            cluster_ids[record_columns].to_dict(orient="records")
        )
        for cluster, cluster_ids in all_ids.groupby("cluster")
    }

    cluster_consensus_identifiers_df = pd.DataFrame(
        cluster_consensus_identifiers, index=[SBML_DFS.S_IDENTIFIERS]
    ).T
    cluster_consensus_identifiers_df.index.name = "cluster"

    out_dict = {
        "s_name_series": s_name_series,
        "cluster_consensus_identifiers_df": cluster_consensus_identifiers_df,
    }

    return out_dict