napistu-0.2.5.dev6-py3-none-any.whl → napistu-0.3.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/__main__.py +126 -96
- napistu/constants.py +35 -41
- napistu/context/__init__.py +10 -0
- napistu/context/discretize.py +462 -0
- napistu/context/filtering.py +387 -0
- napistu/gcs/__init__.py +1 -1
- napistu/identifiers.py +74 -15
- napistu/indices.py +68 -0
- napistu/ingestion/__init__.py +1 -1
- napistu/ingestion/bigg.py +47 -62
- napistu/ingestion/constants.py +18 -133
- napistu/ingestion/gtex.py +113 -0
- napistu/ingestion/hpa.py +147 -0
- napistu/ingestion/sbml.py +0 -97
- napistu/ingestion/string.py +2 -2
- napistu/matching/__init__.py +10 -0
- napistu/matching/constants.py +18 -0
- napistu/matching/interactions.py +518 -0
- napistu/matching/mount.py +529 -0
- napistu/matching/species.py +510 -0
- napistu/mcp/__init__.py +7 -4
- napistu/mcp/__main__.py +128 -72
- napistu/mcp/client.py +16 -25
- napistu/mcp/codebase.py +201 -153
- napistu/mcp/component_base.py +170 -0
- napistu/mcp/config.py +223 -0
- napistu/mcp/constants.py +45 -2
- napistu/mcp/documentation.py +253 -136
- napistu/mcp/documentation_utils.py +13 -48
- napistu/mcp/execution.py +372 -305
- napistu/mcp/health.py +49 -67
- napistu/mcp/profiles.py +10 -6
- napistu/mcp/server.py +161 -80
- napistu/mcp/tutorials.py +139 -87
- napistu/modify/__init__.py +1 -1
- napistu/modify/gaps.py +1 -1
- napistu/network/__init__.py +1 -1
- napistu/network/constants.py +101 -34
- napistu/network/data_handling.py +388 -0
- napistu/network/ig_utils.py +351 -0
- napistu/network/napistu_graph_core.py +354 -0
- napistu/network/neighborhoods.py +40 -40
- napistu/network/net_create.py +373 -309
- napistu/network/net_propagation.py +47 -19
- napistu/network/{net_utils.py → ng_utils.py} +124 -272
- napistu/network/paths.py +67 -51
- napistu/network/precompute.py +11 -11
- napistu/ontologies/__init__.py +10 -0
- napistu/ontologies/constants.py +129 -0
- napistu/ontologies/dogma.py +243 -0
- napistu/ontologies/genodexito.py +649 -0
- napistu/ontologies/mygene.py +369 -0
- napistu/ontologies/renaming.py +198 -0
- napistu/rpy2/__init__.py +229 -86
- napistu/rpy2/callr.py +47 -77
- napistu/rpy2/constants.py +24 -23
- napistu/rpy2/rids.py +61 -648
- napistu/sbml_dfs_core.py +587 -222
- napistu/scverse/__init__.py +15 -0
- napistu/scverse/constants.py +28 -0
- napistu/scverse/loading.py +727 -0
- napistu/utils.py +118 -10
- {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/METADATA +8 -3
- napistu-0.3.1.dist-info/RECORD +133 -0
- tests/conftest.py +22 -0
- tests/test_context_discretize.py +56 -0
- tests/test_context_filtering.py +267 -0
- tests/test_identifiers.py +100 -0
- tests/test_indices.py +65 -0
- tests/{test_edgelist.py → test_ingestion_napistu_edgelist.py} +2 -2
- tests/test_matching_interactions.py +108 -0
- tests/test_matching_mount.py +305 -0
- tests/test_matching_species.py +394 -0
- tests/test_mcp_config.py +193 -0
- tests/test_mcp_documentation_utils.py +12 -3
- tests/test_mcp_server.py +356 -0
- tests/test_network_data_handling.py +397 -0
- tests/test_network_ig_utils.py +23 -0
- tests/test_network_neighborhoods.py +19 -0
- tests/test_network_net_create.py +459 -0
- tests/test_network_ng_utils.py +30 -0
- tests/test_network_paths.py +56 -0
- tests/{test_precomputed_distances.py → test_network_precompute.py} +8 -6
- tests/test_ontologies_genodexito.py +58 -0
- tests/test_ontologies_mygene.py +39 -0
- tests/test_ontologies_renaming.py +110 -0
- tests/test_rpy2_callr.py +79 -0
- tests/test_rpy2_init.py +151 -0
- tests/test_sbml.py +0 -31
- tests/test_sbml_dfs_core.py +134 -10
- tests/test_scverse_loading.py +778 -0
- tests/test_set_coverage.py +2 -2
- tests/test_utils.py +121 -1
- napistu/mechanism_matching.py +0 -1353
- napistu/rpy2/netcontextr.py +0 -467
- napistu-0.2.5.dev6.dist-info/RECORD +0 -97
- tests/test_igraph.py +0 -367
- tests/test_mechanism_matching.py +0 -784
- tests/test_net_utils.py +0 -149
- tests/test_netcontextr.py +0 -105
- tests/test_rpy2.py +0 -61
- /napistu/ingestion/{cpr_edgelist.py → napistu_edgelist.py} +0 -0
- {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/WHEEL +0 -0
- {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/entry_points.txt +0 -0
- {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/licenses/LICENSE +0 -0
- {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/top_level.txt +0 -0
- /tests/{test_obo.py → test_ingestion_obo.py} +0 -0
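
Judging from the listing, this release is largely a reorganization: `napistu/mechanism_matching.py` (-1353 lines) and `napistu/rpy2/netcontextr.py` are removed, while new `napistu/matching/`, `napistu/ontologies/`, `napistu/context/`, and `napistu/scverse/` subpackages appear, with matching tests reappearing as `tests/test_matching_interactions.py`, `tests/test_matching_mount.py`, and `tests/test_matching_species.py`. A hedged sketch of the import migration this implies — the module paths below are confirmed by the listing, but which symbols they export is an assumption:

```python
# Migration sketch inferred from the file listing only; the exact names
# exported by napistu.matching are assumptions, not confirmed by this diff.
# 0.2.5.dev6:
#     from napistu import mechanism_matching
# 0.3.1 (modules added in this diff):
from napistu.matching import interactions, mount, species
```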
tests/test_mechanism_matching.py
DELETED
@@ -1,784 +0,0 @@
```python
from __future__ import annotations

from datetime import datetime

import numpy as np
import pandas as pd
import pytest

from napistu import mechanism_matching
from napistu.network import net_create
from napistu.network import precompute
from napistu.mechanism_matching import _validate_wide_ontologies
from napistu.mechanism_matching import match_by_ontology_and_identifier
from napistu.mechanism_matching import resolve_matches

from napistu.constants import SBML_DFS
from napistu.constants import IDENTIFIERS
from napistu.constants import ONTOLOGIES
from napistu.constants import RESOLVE_MATCHES_AGGREGATORS
from napistu.constants import FEATURE_ID_VAR_DEFAULT


def test_features_to_pathway_species(sbml_dfs):

    species_identifiers = sbml_dfs.get_identifiers("species")
    feature_identifiers = pd.DataFrame({"chebis": ["17627", "15379", "29105", "-1"]})

    matching_df = (
        mechanism_matching.features_to_pathway_species(
            feature_identifiers, species_identifiers, {"chebi"}, "chebis"
        )
        .value_counts("identifier")
        .sort_index()
    )

    assert matching_df.index.tolist() == ["15379", "17627", "29105"]
    assert matching_df.tolist() == [2, 3, 2]


def test_features_to_pathway_species_basic_and_expansion():

    # Mock species_identifiers table
    species_identifiers = pd.DataFrame(
        {
            "ontology": ["chebi", "chebi", "uniprot", "uniprot"],
            "identifier": ["A", "B", "X", "Y"],
            "s_id": [1, 2, 3, 4],
            "s_name": ["foo", "bar", "baz", "qux"],
            "bqb": ["BQB_IS", "BQB_IS", "BQB_IS", "BQB_IS"],
        }
    )
    # Basic: no expansion, single identifier per row
    features = pd.DataFrame({"my_id": ["A", "B", "X"], "other_col": [10, 20, 30]})
    result = mechanism_matching.features_to_pathway_species(
        feature_identifiers=features,
        species_identifiers=species_identifiers,
        ontologies={"chebi", "uniprot"},
        feature_identifiers_var="my_id",
        expand_identifiers=False,
    )
    # Should map all three
    assert set(result["my_id"]) == {"A", "B", "X"}
    assert set(result["identifier"]) == {"A", "B", "X"}
    assert set(result["s_name"]) == {"foo", "bar", "baz"}
    # Expansion: one row with multiple IDs
    features2 = pd.DataFrame({"my_id": ["A / B / X", "Y"], "other_col": [100, 200]})
    result2 = mechanism_matching.features_to_pathway_species(
        feature_identifiers=features2,
        species_identifiers=species_identifiers,
        ontologies={"chebi", "uniprot"},
        feature_identifiers_var="my_id",
        expand_identifiers=True,
        identifier_delimiter="/",
    )
    # Should expand to 4 rows (A, B, X, Y)
    assert set(result2["identifier"]) == {"A", "B", "X", "Y"}
    assert set(result2["s_name"]) == {"foo", "bar", "baz", "qux"}
    # Whitespace trimming
    features3 = pd.DataFrame({"my_id": [" A / B /X ", " Y"], "other_col": [1, 2]})
    result3 = mechanism_matching.features_to_pathway_species(
        feature_identifiers=features3,
        species_identifiers=species_identifiers,
        ontologies={"chebi", "uniprot"},
        feature_identifiers_var="my_id",
        expand_identifiers=True,
        identifier_delimiter="/",
    )
    # Should expand and trim whitespace
    assert set(result3["identifier"]) == {"A", "B", "X", "Y"}
    assert set(result3["s_name"]) == {"foo", "bar", "baz", "qux"}


def test_edgelist_to_pathway_species(sbml_dfs):

    edgelist = pd.DataFrame(
        [
            {"identifier_upstream": "17996", "identifier_downstream": "16526"},
            {"identifier_upstream": "15377", "identifier_downstream": "17544"},
            {"identifier_upstream": "15378", "identifier_downstream": "57945"},
            {"identifier_upstream": "57540", "identifier_downstream": "17996"},
        ]
    )
    species_identifiers = sbml_dfs.get_identifiers("species").query("bqb == 'BQB_IS'")

    edgelist_w_sids = mechanism_matching.edgelist_to_pathway_species(
        edgelist, species_identifiers, ontologies={"chebi", "uniprot"}
    )
    assert edgelist_w_sids.shape == (4, 4)

    egelist_w_scids = mechanism_matching.edgelist_to_scids(
        edgelist, sbml_dfs, species_identifiers, ontologies={"chebi"}
    )

    assert egelist_w_scids.shape == (12, 6)

    direct_interactions = mechanism_matching.filter_to_direct_mechanistic_interactions(
        edgelist, sbml_dfs, species_identifiers, ontologies={"chebi"}
    )

    assert direct_interactions.shape == (2, 10)


def test_direct_and_indirect_mechanism_matching(sbml_dfs_glucose_metabolism):

    cpr_graph = net_create.process_cpr_graph(sbml_dfs_glucose_metabolism)

    edgelist = pd.DataFrame(
        [
            {
                "identifier_upstream": "17925",
                "identifier_downstream": "32966",
            },  # glu, fbp
            {
                "identifier_upstream": "57634",
                "identifier_downstream": "32966",
            },  # f6p, fbp
            {
                "identifier_upstream": "32966",
                "identifier_downstream": "57642",
            },  # fbp, dhap
            {
                "identifier_upstream": "17925",
                "identifier_downstream": "15361",
            },  # glu, pyr
        ]
    )

    species_identifiers = sbml_dfs_glucose_metabolism.get_identifiers("species")

    direct_interactions = mechanism_matching.filter_to_direct_mechanistic_interactions(
        formatted_edgelist=edgelist,
        sbml_dfs=sbml_dfs_glucose_metabolism,
        species_identifiers=species_identifiers,
        ontologies={"chebi"},
    )

    assert direct_interactions.shape == (2, 10)

    indirect_interactions = (
        mechanism_matching.filter_to_indirect_mechanistic_interactions(
            formatted_edgelist=edgelist,
            sbml_dfs=sbml_dfs_glucose_metabolism,
            species_identifiers=species_identifiers,
            cpr_graph=cpr_graph,
            ontologies={"chebi"},
            precomputed_distances=None,
            max_path_length=10,
        )
    )

    assert indirect_interactions.shape == (6, 12)

    # confirm that we get the same thing even when using precomputed distances
    precomputed_distances = precompute.precompute_distances(
        cpr_graph, weights_vars=["weights"]
    )

    indirect_interactions_w_precompute = (
        mechanism_matching.filter_to_indirect_mechanistic_interactions(
            formatted_edgelist=edgelist,
            sbml_dfs=sbml_dfs_glucose_metabolism,
            species_identifiers=species_identifiers,
            cpr_graph=cpr_graph,
            ontologies={"chebi"},
            precomputed_distances=precomputed_distances,
            max_path_length=10,
        )
    )

    assert all(
        indirect_interactions["weight"] == indirect_interactions_w_precompute["weight"]
    )


def test_validate_wide_ontologies():
    """Test the _validate_wide_ontologies function with various input types and error cases."""
    # Setup test data
    example_data_wide = pd.DataFrame(
        {
            "results": [-1.0, 0.0, 1.0],
            "chebi": ["15377", "16810", "17925"],
            "uniprot": ["P12345", "Q67890", "O43826"],
        }
    )

    # Test auto-detection of ontology columns
    assert _validate_wide_ontologies(example_data_wide) == {"chebi", "uniprot"}

    # Test string input
    assert _validate_wide_ontologies(example_data_wide, ontologies="chebi") == {"chebi"}

    # Test set input
    assert _validate_wide_ontologies(example_data_wide, ontologies={"chebi"}) == {
        "chebi"
    }
    assert _validate_wide_ontologies(
        example_data_wide, ontologies={"chebi", "uniprot"}
    ) == {"chebi", "uniprot"}

    # Test dictionary mapping for renaming
    assert _validate_wide_ontologies(
        example_data_wide, ontologies={"chebi": "reactome", "uniprot": "ensembl_gene"}
    ) == {"reactome", "ensembl_gene"}

    # Test error cases

    # Missing column in set input (checks existence first)
    with pytest.raises(
        ValueError, match="Specified ontology columns not found in DataFrame:.*"
    ):
        _validate_wide_ontologies(example_data_wide, ontologies={"invalid_ontology"})

    # Valid column name but invalid ontology
    df_with_invalid = pd.DataFrame(
        {
            "results": [-1.0, 0.0, 1.0],
            "invalid_ontology": ["a", "b", "c"],
        }
    )
    with pytest.raises(ValueError, match="Invalid ontologies in set:.*"):
        _validate_wide_ontologies(df_with_invalid, ontologies={"invalid_ontology"})

    # Missing source column in mapping
    with pytest.raises(ValueError, match="Source columns not found in DataFrame:.*"):
        _validate_wide_ontologies(
            example_data_wide, ontologies={"missing_column": "reactome"}
        )

    # Invalid target ontology in mapping
    with pytest.raises(ValueError, match="Invalid ontologies in mapping:.*"):
        _validate_wide_ontologies(
            example_data_wide, ontologies={"chebi": "invalid_ontology"}
        )

    # DataFrame with no valid ontology columns
    invalid_df = pd.DataFrame(
        {"results": [-1.0, 0.0, 1.0], "col1": ["a", "b", "c"], "col2": ["d", "e", "f"]}
    )
    with pytest.raises(
        ValueError, match="No valid ontology columns found in DataFrame.*"
    ):
        _validate_wide_ontologies(invalid_df)


def test_ensure_feature_id_var():
    """Test the _ensure_feature_id_var function with various input cases."""
    from napistu.mechanism_matching import _ensure_feature_id_var
    from napistu.constants import FEATURE_ID_VAR_DEFAULT

    # Test case 1: DataFrame already has feature_id column
    df1 = pd.DataFrame({"feature_id": [100, 200, 300], "data": ["a", "b", "c"]})
    result1 = _ensure_feature_id_var(df1)
    # Should return unchanged DataFrame
    pd.testing.assert_frame_equal(df1, result1)

    # Test case 2: DataFrame missing feature_id column
    df2 = pd.DataFrame({"data": ["x", "y", "z"]})
    result2 = _ensure_feature_id_var(df2)
    # Should add feature_id column with sequential integers
    assert FEATURE_ID_VAR_DEFAULT in result2.columns
    assert list(result2[FEATURE_ID_VAR_DEFAULT]) == [0, 1, 2]
    assert list(result2["data"]) == ["x", "y", "z"]  # Original data preserved

    # Test case 3: Custom feature_id column name
    df3 = pd.DataFrame({"data": ["p", "q", "r"]})
    custom_id = "custom_feature_id"
    result3 = _ensure_feature_id_var(df3, feature_id_var=custom_id)
    # Should add custom named feature_id column
    assert custom_id in result3.columns
    assert list(result3[custom_id]) == [0, 1, 2]
    assert list(result3["data"]) == ["p", "q", "r"]  # Original data preserved

    # Test case 4: Empty DataFrame
    df4 = pd.DataFrame()
    result4 = _ensure_feature_id_var(df4)
    # Should handle empty DataFrame gracefully
    assert FEATURE_ID_VAR_DEFAULT in result4.columns
    assert len(result4) == 0


def test_match_by_ontology_and_identifier():
    """Test the match_by_ontology_and_identifier function with various input types."""
    # Setup test data
    feature_identifiers = pd.DataFrame(
        {
            "ontology": ["chebi", "chebi", "uniprot", "uniprot", "reactome"],
            "identifier": ["15377", "16810", "P12345", "Q67890", "R12345"],
            "results": [1.0, 2.0, -1.0, -2.0, 0.5],
        }
    )

    species_identifiers = pd.DataFrame(
        {
            "ontology": ["chebi", "chebi", "uniprot", "uniprot", "ensembl_gene"],
            "identifier": ["15377", "17925", "P12345", "O43826", "ENSG123"],
            "s_id": ["s1", "s2", "s3", "s4", "s5"],
            "s_name": ["compound1", "compound2", "protein1", "protein2", "gene1"],
            "bqb": ["BQB_IS"] * 5,  # Add required bqb column with BQB_IS values
        }
    )

    # Test with single ontology (string)
    result = match_by_ontology_and_identifier(
        feature_identifiers=feature_identifiers,
        species_identifiers=species_identifiers,
        ontologies="chebi",
    )
    assert len(result) == 1  # Only one matching chebi identifier
    assert result.iloc[0]["identifier"] == "15377"
    assert result.iloc[0]["results"] == 1.0
    assert result.iloc[0]["ontology"] == "chebi"  # From species_identifiers
    assert result.iloc[0]["s_name"] == "compound1"  # Verify join worked correctly
    assert result.iloc[0]["bqb"] == "BQB_IS"  # Verify bqb column is preserved

    # Test with multiple ontologies (set)
    result = match_by_ontology_and_identifier(
        feature_identifiers=feature_identifiers,
        species_identifiers=species_identifiers,
        ontologies={"chebi", "uniprot"},
    )
    assert len(result) == 2  # One chebi and one uniprot match
    assert set(result["ontology"]) == {"chebi", "uniprot"}  # From species_identifiers
    assert set(result["identifier"]) == {"15377", "P12345"}
    # Verify results are correctly matched
    chebi_row = result[result["ontology"] == "chebi"].iloc[0]
    uniprot_row = result[result["ontology"] == "uniprot"].iloc[0]
    assert chebi_row["results"] == 1.0
    assert uniprot_row["results"] == -1.0
    assert chebi_row["s_name"] == "compound1"
    assert uniprot_row["s_name"] == "protein1"
    assert chebi_row["bqb"] == "BQB_IS"
    assert uniprot_row["bqb"] == "BQB_IS"

    # Test with list of ontologies
    result = match_by_ontology_and_identifier(
        feature_identifiers=feature_identifiers,
        species_identifiers=species_identifiers,
        ontologies=["chebi", "uniprot"],
    )
    assert len(result) == 2
    assert set(result["ontology"]) == {"chebi", "uniprot"}  # From species_identifiers

    # Test with no matches
    no_match_features = pd.DataFrame(
        {"ontology": ["chebi"], "identifier": ["99999"], "results": [1.0]}
    )
    result = match_by_ontology_and_identifier(
        feature_identifiers=no_match_features,
        species_identifiers=species_identifiers,
        ontologies="chebi",
    )
    assert len(result) == 0

    # Test with empty features
    empty_features = pd.DataFrame({"ontology": [], "identifier": [], "results": []})
    result = match_by_ontology_and_identifier(
        feature_identifiers=empty_features,
        species_identifiers=species_identifiers,
        ontologies={"chebi", "uniprot"},
    )
    assert len(result) == 0

    # Test with invalid ontology
    with pytest.raises(ValueError, match="Invalid ontologies specified:.*"):
        match_by_ontology_and_identifier(
            feature_identifiers=feature_identifiers,
            species_identifiers=species_identifiers,
            ontologies="invalid_ontology",
        )

    # Test with ontology not in feature_identifiers
    result = match_by_ontology_and_identifier(
        feature_identifiers=feature_identifiers,
        species_identifiers=species_identifiers,
        ontologies={"ensembl_gene"},  # Only in species_identifiers
    )
    assert len(result) == 0

    # Test with custom feature_identifiers_var
    feature_identifiers_custom = feature_identifiers.rename(
        columns={"identifier": "custom_id"}
    )
    result = match_by_ontology_and_identifier(
        feature_identifiers=feature_identifiers_custom,
        species_identifiers=species_identifiers,
        ontologies={"chebi"},
        feature_identifiers_var="custom_id",
    )
    assert len(result) == 1
    assert result.iloc[0]["custom_id"] == "15377"
    assert result.iloc[0]["ontology"] == "chebi"  # From species_identifiers
    assert result.iloc[0]["s_name"] == "compound1"
    assert result.iloc[0]["bqb"] == "BQB_IS"


def test_match_features_to_wide_pathway_species(sbml_dfs_glucose_metabolism):

    def compare_frame_contents(df1, df2):
        """
        Compare if two DataFrames have the same content, ignoring index and column ordering.

        Parameters
        ----------
        df1 : pd.DataFrame
            First DataFrame to compare
        df2 : pd.DataFrame
            Second DataFrame to compare

        Returns
        -------
        None
        """
        df1_sorted = (
            df1.reindex(columns=sorted(df1.columns))
            .sort_values(sorted(df1.columns))
            .reset_index(drop=True)
        )

        df2_sorted = (
            df2.reindex(columns=sorted(df2.columns))
            .sort_values(sorted(df2.columns))
            .reset_index(drop=True)
        )

        pd.testing.assert_frame_equal(df1_sorted, df2_sorted, check_like=True)

        return None

    species_identifiers = (
        sbml_dfs_glucose_metabolism.get_identifiers("species")
        .query("bqb == 'BQB_IS'")
        .query("ontology != 'reactome'")
    )

    # create a table whose index is s_ids and columns are faux-measurements
    example_data = species_identifiers.groupby("ontology").head(10)[
        ["ontology", "identifier"]
    ]

    example_data["results_a"] = np.random.randn(len(example_data))
    example_data["results_b"] = np.random.randn(len(example_data))
    # add a feature_id column to the example_data which tracks the row of the original data
    example_data["feature_id"] = range(0, len(example_data))

    # pivot (identifier, ontology) to columns for each ontology
    example_data_wide = (
        example_data.pivot(
            columns="ontology",
            values="identifier",
            index=["feature_id", "results_a", "results_b"],
        )
        .reset_index()
        .rename_axis(None, axis=1)
    )

    # options, for matching
    # 1. match by identifier and a set of ontologies (provided by arg).
    matched_s_ids = mechanism_matching.features_to_pathway_species(
        feature_identifiers=example_data.drop(columns="ontology"),
        species_identifiers=species_identifiers,
        ontologies={"uniprot", "chebi"},
        feature_identifiers_var="identifier",
    )

    # 2. match by identifier and ontology.
    matched_s_ids_w_ontologies = mechanism_matching.match_by_ontology_and_identifier(
        feature_identifiers=example_data,
        species_identifiers=species_identifiers,
        ontologies={"uniprot", "chebi"},
        feature_identifiers_var="identifier",
    )

    # 3. format wide identifier sets into a table with a single identifier column and apply strategy #2.
    matched_s_ids_from_wide = mechanism_matching.match_features_to_wide_pathway_species(
        example_data_wide,
        species_identifiers,
        ontologies={"uniprot", "chebi"},
        feature_identifiers_var="identifier",
    )

    compare_frame_contents(
        matched_s_ids.drop(columns="s_Source"),
        matched_s_ids_w_ontologies.drop(columns="s_Source"),
    )
    compare_frame_contents(
        matched_s_ids.drop(columns="s_Source"),
        matched_s_ids_from_wide.drop(columns="s_Source"),
    )


def test_resolve_matches_with_example_data():
    """Test resolve_matches function with example data for all aggregation methods."""
    # Setup example data with overlapping 1-to-many and many-to-1 cases
    example_data = pd.DataFrame(
        {
            FEATURE_ID_VAR_DEFAULT: ["A", "B", "C", "D", "D", "E", "B", "B", "C"],
            SBML_DFS.S_ID: [
                "s_id_1",
                "s_id_1",
                "s_id_1",
                "s_id_4",
                "s_id_5",
                "s_id_6",
                "s_id_2",
                "s_id_3",
                "s_id_3",
            ],
            "results_a": [1, 2, 3, 0.4, 5, 6, 0.7, 0.8, 9],
            "results_b": [
                "foo",
                "foo",
                "bar",
                "bar",
                "baz",
                "baz",
                "not",
                "not",
                "not",
            ],
        }
    )

    # Test that missing feature_id raises KeyError
    data_no_id = pd.DataFrame(
        {
            SBML_DFS.S_ID: ["s_id_1", "s_id_1", "s_id_2"],
            "results_a": [1, 2, 3],
            "results_b": ["foo", "bar", "baz"],
        }
    )
    with pytest.raises(KeyError, match=FEATURE_ID_VAR_DEFAULT):
        resolve_matches(data_no_id)

    # Test with keep_id_col=True (default)
    result_with_id = resolve_matches(
        example_data, keep_id_col=True, numeric_agg=RESOLVE_MATCHES_AGGREGATORS.MEAN
    )

    # Verify feature_id column is present and correctly aggregated
    assert FEATURE_ID_VAR_DEFAULT in result_with_id.columns
    assert result_with_id.loc["s_id_1", FEATURE_ID_VAR_DEFAULT] == "A,B,C"
    assert result_with_id.loc["s_id_3", FEATURE_ID_VAR_DEFAULT] == "B,C"

    # Test with keep_id_col=False
    result_without_id = resolve_matches(
        example_data, keep_id_col=False, numeric_agg=RESOLVE_MATCHES_AGGREGATORS.MEAN
    )

    # Verify feature_id column is not in output
    assert FEATURE_ID_VAR_DEFAULT not in result_without_id.columns

    # Verify other columns are still present and correctly aggregated
    assert "results_a" in result_without_id.columns
    assert "results_b" in result_without_id.columns
    assert "feature_id_match_count" in result_without_id.columns

    # Verify numeric aggregation still works
    actual_mean = result_without_id.loc["s_id_1", "results_a"]
    expected_mean = 2.0  # (1 + 2 + 3) / 3
    assert (
        actual_mean == expected_mean
    ), f"Expected mean {expected_mean}, but got {actual_mean}"

    # Verify string aggregation still works
    assert result_without_id.loc["s_id_1", "results_b"] == "bar,foo"

    # Verify match counts are still present
    assert result_without_id.loc["s_id_1", "feature_id_match_count"] == 3
    assert result_without_id.loc["s_id_3", "feature_id_match_count"] == 2

    # Test maximum aggregation
    max_result = resolve_matches(
        example_data, numeric_agg=RESOLVE_MATCHES_AGGREGATORS.MAX
    )

    # Verify maximum values are correct
    assert max_result.loc["s_id_1", "results_a"] == 3.0  # max of [1, 2, 3]
    assert max_result.loc["s_id_3", "results_a"] == 9.0  # max of [0.8, 9]
    assert max_result.loc["s_id_4", "results_a"] == 0.4  # single value
    assert max_result.loc["s_id_5", "results_a"] == 5.0  # single value
    assert max_result.loc["s_id_6", "results_a"] == 6.0  # single value

    # Test weighted mean (feature_id is used for weights regardless of keep_id_col)
    weighted_result = resolve_matches(
        example_data,
        numeric_agg=RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
        keep_id_col=True,
    )

    # For s_id_1:
    # A appears once in total (weight = 1/1)
    # B appears three times in total (weight = 1/3)
    # C appears twice in total (weight = 1/2)
    # Sum of unnormalized weights = 1 + 1/3 + 1/2 = 1.833
    # Normalized weights:
    # A: (1/1)/1.833 = 0.545
    # B: (1/3)/1.833 = 0.182
    # C: (1/2)/1.833 = 0.273
    # Weighted mean = 1×0.545 + 2×0.182 + 3×0.273 = 1.73
    actual_weighted_mean_1 = weighted_result.loc["s_id_1", "results_a"]
    expected_weighted_mean_1 = 1.73
    assert (
        abs(actual_weighted_mean_1 - expected_weighted_mean_1) < 0.01
    ), f"s_id_1 weighted mean: expected {expected_weighted_mean_1:.3f}, but got {actual_weighted_mean_1:.3f}"

    # For s_id_3:
    # B appears three times in total (weight = 1/3)
    # C appears twice in total (weight = 1/2)
    # Sum of unnormalized weights = 1/3 + 1/2 = 0.833
    # Normalized weights:
    # B: (1/3)/0.833 = 0.4
    # C: (1/2)/0.833 = 0.6
    # Weighted mean = 0.8×0.4 + 9×0.6 = 5.72
    actual_weighted_mean_3 = weighted_result.loc["s_id_3", "results_a"]
    expected_weighted_mean_3 = 5.72
    assert (
        abs(actual_weighted_mean_3 - expected_weighted_mean_3) < 0.01
    ), f"s_id_3 weighted mean: expected {expected_weighted_mean_3:.3f}, but got {actual_weighted_mean_3:.3f}"

    # Test weighted mean with keep_id_col=False (weights still use feature_id)
    weighted_result_no_id = resolve_matches(
        example_data,
        numeric_agg=RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
        keep_id_col=False,
    )

    # Verify weighted means are the same regardless of keep_id_col
    assert (
        abs(weighted_result_no_id.loc["s_id_1", "results_a"] - expected_weighted_mean_1)
        < 0.01
    ), "Weighted mean should be the same regardless of keep_id_col"
    assert (
        abs(weighted_result_no_id.loc["s_id_3", "results_a"] - expected_weighted_mean_3)
        < 0.01
    ), "Weighted mean should be the same regardless of keep_id_col"

    # Test that both versions preserve the same index structure
    expected_index = pd.Index(
        ["s_id_1", "s_id_2", "s_id_3", "s_id_4", "s_id_5", "s_id_6"], name="s_id"
    )
    pd.testing.assert_index_equal(result_with_id.index, expected_index)
    pd.testing.assert_index_equal(result_without_id.index, expected_index)


def test_resolve_matches_invalid_dtypes():
    """Test that resolve_matches raises an error for unsupported dtypes."""
    # Setup data with boolean and datetime columns
    data = pd.DataFrame(
        {
            FEATURE_ID_VAR_DEFAULT: ["A", "B", "B", "C"],
            "bool_col": [True, False, True, False],
            "datetime_col": [
                datetime(2024, 1, 1),
                datetime(2024, 1, 2),
                datetime(2024, 1, 3),
                datetime(2024, 1, 4),
            ],
            "s_id": ["s1", "s1", "s2", "s2"],
        }
    )

    # Should raise TypeError for unsupported dtypes
    with pytest.raises(TypeError, match="Unsupported data types"):
        resolve_matches(data)


def test_resolve_matches_first_method():
    """Test resolve_matches with first method."""
    # Setup data with known order
    data = pd.DataFrame(
        {
            FEATURE_ID_VAR_DEFAULT: ["A", "C", "B", "B", "A"],
            SBML_DFS.S_ID: ["s1", "s1", "s1", "s2", "s2"],
            "value": [1, 2, 3, 4, 5],
        }
    )

    result = resolve_matches(data, numeric_agg=RESOLVE_MATCHES_AGGREGATORS.FIRST)

    # Should take first value after sorting by feature_id
    assert result.loc["s1", "value"] == 1  # A comes first
    assert result.loc["s2", "value"] == 5  # A comes first


def test_resolve_matches_deduplicate_feature_id_within_sid():
    """Test that only the first value for each (s_id, feature_id) is used in mean aggregation."""
    data = pd.DataFrame(
        {
            FEATURE_ID_VAR_DEFAULT: ["A", "A", "B"],
            SBML_DFS.S_ID: ["s1", "s1", "s1"],
            "value": [
                1,
                1,
                2,
            ],  # average should be 1.5 because the two A's are redundant
        }
    )

    result = resolve_matches(data, numeric_agg=RESOLVE_MATCHES_AGGREGATORS.MEAN)
    assert result.loc["s1", "value"] == 1.5


def test_bind_wide_results(sbml_dfs_glucose_metabolism):
    """
    Test that bind_wide_results correctly matches identifiers and adds results to species data.
    """
    # Get species identifiers, excluding reactome
    species_identifiers = (
        sbml_dfs_glucose_metabolism.get_identifiers(SBML_DFS.SPECIES)
        .query("bqb == 'BQB_IS'")
        .query("ontology != 'reactome'")
    )

    # Create example data with identifiers and results
    example_data = species_identifiers.groupby("ontology").head(10)[
        [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
    ]
    example_data["results_a"] = np.random.randn(len(example_data))
    example_data["results_b"] = np.random.randn(len(example_data))
    example_data[FEATURE_ID_VAR_DEFAULT] = range(0, len(example_data))

    # Create wide format data
    example_data_wide = (
        example_data.pivot(
            columns=IDENTIFIERS.ONTOLOGY,
            values=IDENTIFIERS.IDENTIFIER,
            index=[FEATURE_ID_VAR_DEFAULT, "results_a", "results_b"],
        )
        .reset_index()
        .rename_axis(None, axis=1)
    )

    # Call bind_wide_results
    results_name = "test_results"
    sbml_dfs_result = mechanism_matching.bind_wide_results(
        sbml_dfs=sbml_dfs_glucose_metabolism,
        results_df=example_data_wide,
        results_name=results_name,
        ontologies={ONTOLOGIES.UNIPROT, ONTOLOGIES.CHEBI},
        dogmatic=False,
        species_identifiers=None,
        feature_id_var=FEATURE_ID_VAR_DEFAULT,
        verbose=True,
    )

    # Verify the results were added correctly
    assert (
        results_name in sbml_dfs_result.species_data
    ), f"{results_name} not found in species_data"

    # Get the bound results
    bound_results = sbml_dfs_result.species_data[results_name]

    # columns are feature_id, results_a, results_b
    assert set(bound_results.columns) == {
        FEATURE_ID_VAR_DEFAULT,
        "results_a",
        "results_b",
    }

    assert bound_results.shape == (23, 3)
    assert bound_results.loc["S00000056", "feature_id"] == "18,19"
    assert bound_results.loc["S00000057", "feature_id"] == "18"
    assert bound_results.loc["S00000010", "feature_id"] == "9"
```