napistu 0.2.5.dev6__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/__main__.py +126 -96
- napistu/constants.py +35 -41
- napistu/context/__init__.py +10 -0
- napistu/context/discretize.py +462 -0
- napistu/context/filtering.py +387 -0
- napistu/gcs/__init__.py +1 -1
- napistu/identifiers.py +74 -15
- napistu/indices.py +68 -0
- napistu/ingestion/__init__.py +1 -1
- napistu/ingestion/bigg.py +47 -62
- napistu/ingestion/constants.py +18 -133
- napistu/ingestion/gtex.py +113 -0
- napistu/ingestion/hpa.py +147 -0
- napistu/ingestion/sbml.py +0 -97
- napistu/ingestion/string.py +2 -2
- napistu/matching/__init__.py +10 -0
- napistu/matching/constants.py +18 -0
- napistu/matching/interactions.py +518 -0
- napistu/matching/mount.py +529 -0
- napistu/matching/species.py +510 -0
- napistu/mcp/__init__.py +7 -4
- napistu/mcp/__main__.py +128 -72
- napistu/mcp/client.py +16 -25
- napistu/mcp/codebase.py +201 -153
- napistu/mcp/component_base.py +170 -0
- napistu/mcp/config.py +223 -0
- napistu/mcp/constants.py +45 -2
- napistu/mcp/documentation.py +253 -136
- napistu/mcp/documentation_utils.py +13 -48
- napistu/mcp/execution.py +372 -305
- napistu/mcp/health.py +49 -67
- napistu/mcp/profiles.py +10 -6
- napistu/mcp/server.py +161 -80
- napistu/mcp/tutorials.py +139 -87
- napistu/modify/__init__.py +1 -1
- napistu/modify/gaps.py +1 -1
- napistu/network/__init__.py +1 -1
- napistu/network/constants.py +101 -34
- napistu/network/data_handling.py +388 -0
- napistu/network/ig_utils.py +351 -0
- napistu/network/napistu_graph_core.py +354 -0
- napistu/network/neighborhoods.py +40 -40
- napistu/network/net_create.py +373 -309
- napistu/network/net_propagation.py +47 -19
- napistu/network/{net_utils.py → ng_utils.py} +124 -272
- napistu/network/paths.py +67 -51
- napistu/network/precompute.py +11 -11
- napistu/ontologies/__init__.py +10 -0
- napistu/ontologies/constants.py +129 -0
- napistu/ontologies/dogma.py +243 -0
- napistu/ontologies/genodexito.py +649 -0
- napistu/ontologies/mygene.py +369 -0
- napistu/ontologies/renaming.py +198 -0
- napistu/rpy2/__init__.py +229 -86
- napistu/rpy2/callr.py +47 -77
- napistu/rpy2/constants.py +24 -23
- napistu/rpy2/rids.py +61 -648
- napistu/sbml_dfs_core.py +587 -222
- napistu/scverse/__init__.py +15 -0
- napistu/scverse/constants.py +28 -0
- napistu/scverse/loading.py +727 -0
- napistu/utils.py +118 -10
- {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/METADATA +8 -3
- napistu-0.3.1.dist-info/RECORD +133 -0
- tests/conftest.py +22 -0
- tests/test_context_discretize.py +56 -0
- tests/test_context_filtering.py +267 -0
- tests/test_identifiers.py +100 -0
- tests/test_indices.py +65 -0
- tests/{test_edgelist.py → test_ingestion_napistu_edgelist.py} +2 -2
- tests/test_matching_interactions.py +108 -0
- tests/test_matching_mount.py +305 -0
- tests/test_matching_species.py +394 -0
- tests/test_mcp_config.py +193 -0
- tests/test_mcp_documentation_utils.py +12 -3
- tests/test_mcp_server.py +356 -0
- tests/test_network_data_handling.py +397 -0
- tests/test_network_ig_utils.py +23 -0
- tests/test_network_neighborhoods.py +19 -0
- tests/test_network_net_create.py +459 -0
- tests/test_network_ng_utils.py +30 -0
- tests/test_network_paths.py +56 -0
- tests/{test_precomputed_distances.py → test_network_precompute.py} +8 -6
- tests/test_ontologies_genodexito.py +58 -0
- tests/test_ontologies_mygene.py +39 -0
- tests/test_ontologies_renaming.py +110 -0
- tests/test_rpy2_callr.py +79 -0
- tests/test_rpy2_init.py +151 -0
- tests/test_sbml.py +0 -31
- tests/test_sbml_dfs_core.py +134 -10
- tests/test_scverse_loading.py +778 -0
- tests/test_set_coverage.py +2 -2
- tests/test_utils.py +121 -1
- napistu/mechanism_matching.py +0 -1353
- napistu/rpy2/netcontextr.py +0 -467
- napistu-0.2.5.dev6.dist-info/RECORD +0 -97
- tests/test_igraph.py +0 -367
- tests/test_mechanism_matching.py +0 -784
- tests/test_net_utils.py +0 -149
- tests/test_netcontextr.py +0 -105
- tests/test_rpy2.py +0 -61
- /napistu/ingestion/{cpr_edgelist.py → napistu_edgelist.py} +0 -0
- {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/WHEEL +0 -0
- {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/entry_points.txt +0 -0
- {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/licenses/LICENSE +0 -0
- {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/top_level.txt +0 -0
- /tests/{test_obo.py → test_ingestion_obo.py} +0 -0
@@ -0,0 +1,243 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import logging
|
4
|
+
import pandas as pd
|
5
|
+
|
6
|
+
from napistu import sbml_dfs_core
|
7
|
+
from napistu import source
|
8
|
+
from napistu import identifiers
|
9
|
+
from napistu import utils
|
10
|
+
from napistu.ontologies.genodexito import Genodexito
|
11
|
+
from napistu.constants import BQB
|
12
|
+
from napistu.constants import IDENTIFIERS
|
13
|
+
from napistu.constants import MINI_SBO_FROM_NAME
|
14
|
+
from napistu.constants import ONTOLOGIES
|
15
|
+
from napistu.constants import SBML_DFS
|
16
|
+
from napistu.ontologies.constants import INTERCONVERTIBLE_GENIC_ONTOLOGIES
|
17
|
+
from napistu.ontologies.constants import GENE_ONTOLOGIES # noqa: F401
|
18
|
+
from napistu.ontologies.constants import GENODEXITO_DEFS
|
19
|
+
from napistu.ontologies.constants import NAME_ONTOLOGIES
|
20
|
+
from napistu.ontologies.constants import PROTEIN_ONTOLOGIES
|
21
|
+
|
22
|
+
logger = logging.getLogger(__name__)
|
23
|
+
|
24
|
+
|
25
|
+
def create_dogmatic_sbml_dfs(
    species: str,
    preferred_method: str = GENODEXITO_DEFS.BIOCONDUCTOR,
    allow_fallback: bool = True,
    r_paths: str | None = None,
) -> sbml_dfs_core.SBML_dfs:
    """
    Create Dogmatic SBML_dfs

    Build an SBML_dfs model which is pretty much just proteins and no
    reactions, with annotations linking proteins to genes and readable
    labels for genes/proteins.

    Args:
        species (str):
            An organismal species (e.g., Homo sapiens)
        preferred_method (str):
            Forwarded unchanged to _connect_dogmatic_mappings();
            defaults to GENODEXITO_DEFS.BIOCONDUCTOR
        allow_fallback (bool):
            Forwarded unchanged to _connect_dogmatic_mappings()
        r_paths (str or None):
            Optional, path to an R packages directory

    Returns:
        dogmatic_sbml_dfs (sbml_dfs_core.SBML_dfs)
            A pathway model which (pretty much) just contains proteins and
            diverse identifiers
    """

    mappings = _connect_dogmatic_mappings(
        species, preferred_method, allow_fallback, r_paths
    )

    logger.info("Creating inputs for sbml_dfs_from_edgelist()")

    # one row per cluster: its consensus identifiers joined with its name
    identifiers_by_cluster = mappings["cluster_consensus_identifiers_df"]
    species_df = identifiers_by_cluster.join(mappings["s_name_series"])

    # inputs which sbml_dfs_from_edgelist() requires but which never vary here
    compartments_df = sbml_dfs_core._stub_compartments()
    interaction_source = source.Source(init=True)

    # Placeholder edgelist where every species "interacts" with itself.
    # It is only needed to construct the sbml_dfs; all but one of the
    # resulting reactions are dropped below.
    column_renames = {
        "s_name": "upstream_name",
        SBML_DFS.S_IDENTIFIERS: SBML_DFS.R_IDENTIFIERS,
    }
    interaction_edgelist = species_df.rename(columns=column_renames)

    self_names = interaction_edgelist["upstream_name"]
    placeholder_columns = {
        "downstream_name": self_names,
        "upstream_compartment": "cellular_component",
        "downstream_compartment": "cellular_component",
        "r_name": self_names,
        "sbo_term": MINI_SBO_FROM_NAME["reactant"],
        "r_isreversible": False,
    }
    # dicts preserve insertion order, so columns are appended in this order
    for column, value in placeholder_columns.items():
        interaction_edgelist[column] = value

    dogmatic_sbml_dfs = sbml_dfs_core.sbml_dfs_from_edgelist(
        interaction_edgelist=interaction_edgelist,
        species_df=species_df,
        compartments_df=compartments_df,
        interaction_source=interaction_source,
        upstream_stoichiometry=-1,
        downstream_stoichiometry=1,
        downstream_sbo_name="product",
    )

    # keep only the first reaction so the model still passes
    # sbml_dfs.validate(); this self reaction will be removed when creating
    # the graph
    all_reaction_ids = dogmatic_sbml_dfs.reactions.index.tolist()
    dogmatic_sbml_dfs.remove_reactions(all_reaction_ids[1:])

    return dogmatic_sbml_dfs
|
94
|
+
|
95
|
+
|
96
|
+
def _connect_dogmatic_mappings(
    species: str,
    preferred_method: str = GENODEXITO_DEFS.BIOCONDUCTOR,
    allow_fallback: bool = True,
    r_paths: str | None = None,
) -> dict:
    """
    Connect Dogmatic Mappings

    Merge all ontologies into greedy clusters based on shared associations to entrez ids

    Args:
        species (str):
            An organismal species (e.g., Homo sapiens)
        preferred_method (str):
            Passed to Genodexito as ``preferred_method``;
            defaults to GENODEXITO_DEFS.BIOCONDUCTOR
        allow_fallback (bool):
            Passed to Genodexito as ``allow_fallback``
        r_paths (str or None):
            Optional, path to an R packages directory

    Returns:
        dict with:
        - s_name_series: a series where the index is distinct molecular species and the values are names.
          Clusters without a candidate name that is unique to them are absent from this series.
        - cluster_consensus_identifiers_df: a pd.DataFrame where the index is distinct molecular species
          and values are identifiers objects.
    """

    genodexito = Genodexito(
        species=species,
        preferred_method=preferred_method,
        allow_fallback=allow_fallback,
        r_paths=r_paths,
    )

    genodexito.create_mapping_tables(mappings=INTERCONVERTIBLE_GENIC_ONTOLOGIES)

    genodexito.stack_mappings(ontologies=set(PROTEIN_ONTOLOGIES))
    protein_mappings = genodexito.stacked_mappings

    # apply greedy graph-based clustering to connect proteins with a common mapping to entrez
    edgelist_df = utils.format_identifiers_as_edgelist(
        protein_mappings, [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
    )
    connected_indices = utils.find_weakly_connected_subgraphs(
        edgelist_df[["ind", "id"]]
    )

    # add clusters to proteins. Each cluster will be a distinct molecular species
    protein_mappings_w_clusters = protein_mappings.reset_index().merge(
        connected_indices
    )

    # combine entrez + cluster so we can pass cluster to non-protein attributes
    entrez_clusters = protein_mappings_w_clusters[
        [ONTOLOGIES.NCBI_ENTREZ_GENE, "cluster"]
    ].drop_duplicates()

    # check for the other ontologies aside from proteins and entrez (since that's in the index)
    other_ontologies = INTERCONVERTIBLE_GENIC_ONTOLOGIES.difference(
        set(PROTEIN_ONTOLOGIES)
    )
    other_ontologies.discard(ONTOLOGIES.NCBI_ENTREZ_GENE)

    # NOTE: stacked_mappings is re-read after this second stack_mappings()
    # call, so everything derived from protein_mappings is computed above.
    genodexito.stack_mappings(ontologies=other_ontologies)
    other_mappings = genodexito.stacked_mappings

    other_mappings_w_clusters = entrez_clusters.merge(
        other_mappings, left_on=ONTOLOGIES.NCBI_ENTREZ_GENE, right_index=True
    )

    # candidate names: identifiers from any ontology listed in NAME_ONTOLOGIES
    name_ontologies = list(NAME_ONTOLOGIES.keys())
    possible_names = pd.concat(
        [
            mappings[mappings[IDENTIFIERS.ONTOLOGY].isin(name_ontologies)]
            for mappings in (protein_mappings_w_clusters, other_mappings_w_clusters)
        ]
    )[["cluster", IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]]

    # NAME_ONTOLOGIES maps ontology -> preference; lower values sort first below
    possible_names = possible_names.assign(
        ontology_preference=possible_names[IDENTIFIERS.ONTOLOGY].map(NAME_ONTOLOGIES)
    )

    # remove possible names which are present in multiple clusters.
    # all clusters will need unique names to use sbml_dfs_from_edgelist()
    id_counts = (
        possible_names[["cluster", IDENTIFIERS.IDENTIFIER]]
        .drop_duplicates()
        .value_counts(IDENTIFIERS.IDENTIFIER)
    )
    ambiguous_names = id_counts[id_counts > 1].index.tolist()
    possible_names = possible_names[
        ~possible_names[IDENTIFIERS.IDENTIFIER].isin(ambiguous_names)
    ]

    # pick one name per cluster: the first row after sorting by ontology
    # preference and then by nameness score
    s_name_series = (
        utils._add_nameness_score(possible_names, IDENTIFIERS.IDENTIFIER)
        .sort_values(["ontology_preference", "nameness_score"])
        .groupby("cluster")
        .first()
        .rename(columns={IDENTIFIERS.IDENTIFIER: SBML_DFS.S_NAME})[SBML_DFS.S_NAME]
    )

    # shared layout for the three identifier tables stacked below
    id_columns = [
        "cluster",
        IDENTIFIERS.IDENTIFIER,
        IDENTIFIERS.ONTOLOGY,
        IDENTIFIERS.BQB,
    ]

    protein_ids = protein_mappings_w_clusters.assign(**{IDENTIFIERS.BQB: BQB.IS})[
        id_columns
    ]
    gene_ids = other_mappings_w_clusters[
        other_mappings_w_clusters[IDENTIFIERS.ONTOLOGY].isin(list(GENE_ONTOLOGIES))
    ].assign(**{IDENTIFIERS.BQB: BQB.IS_ENCODED_BY})[id_columns]
    entrez_ids = entrez_clusters.assign(
        **{
            IDENTIFIERS.ONTOLOGY: ONTOLOGIES.NCBI_ENTREZ_GENE,
            IDENTIFIERS.BQB: BQB.IS_ENCODED_BY,
        }
    ).rename(columns={ONTOLOGIES.NCBI_ENTREZ_GENE: IDENTIFIERS.IDENTIFIER})[
        id_columns
    ]

    # combine all ids to setup a single cluster-level Identifiers
    all_ids = pd.concat([protein_ids, gene_ids, entrez_ids])
    all_ids = all_ids.assign(
        **{
            IDENTIFIERS.URL: [
                identifiers.create_uri_url(ontology, identifier)
                for ontology, identifier in zip(
                    all_ids[IDENTIFIERS.ONTOLOGY], all_ids[IDENTIFIERS.IDENTIFIER]
                )
            ]
        }
    )

    # create one Identifiers object for each new species;
    # orient="records" yields one dict per row without transposing each group
    identifier_fields = [
        IDENTIFIERS.ONTOLOGY,
        IDENTIFIERS.IDENTIFIER,
        IDENTIFIERS.URL,
        IDENTIFIERS.BQB,
    ]
    cluster_consensus_identifiers = {
        cluster: identifiers.Identifiers(
            cluster_ids[identifier_fields].to_dict(orient="records")
        )
        for cluster, cluster_ids in all_ids.groupby("cluster")
    }

    cluster_consensus_identifiers_df = pd.DataFrame(
        cluster_consensus_identifiers, index=[SBML_DFS.S_IDENTIFIERS]
    ).T
    cluster_consensus_identifiers_df.index.name = "cluster"

    # Clusters whose candidate names were all missing or shared with another
    # cluster have identifiers but no entry in s_name_series; surface that
    # rather than letting it pass silently.
    unnamed_clusters = set(cluster_consensus_identifiers) - set(s_name_series.index)
    if unnamed_clusters:
        logger.warning(
            "%d cluster(s) have no candidate name unique to the cluster "
            "and are absent from s_name_series",
            len(unnamed_clusters),
        )

    out_dict = {
        "s_name_series": s_name_series,
        "cluster_consensus_identifiers_df": cluster_consensus_identifiers_df,
    }

    return out_dict
|