napistu 0.2.5.dev7__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- napistu/__main__.py +126 -96
- napistu/constants.py +35 -41
- napistu/context/__init__.py +10 -0
- napistu/context/discretize.py +462 -0
- napistu/context/filtering.py +387 -0
- napistu/gcs/__init__.py +1 -1
- napistu/identifiers.py +74 -15
- napistu/indices.py +68 -0
- napistu/ingestion/__init__.py +1 -1
- napistu/ingestion/bigg.py +47 -62
- napistu/ingestion/constants.py +18 -133
- napistu/ingestion/gtex.py +113 -0
- napistu/ingestion/hpa.py +147 -0
- napistu/ingestion/sbml.py +0 -97
- napistu/ingestion/string.py +2 -2
- napistu/matching/__init__.py +10 -0
- napistu/matching/constants.py +18 -0
- napistu/matching/interactions.py +518 -0
- napistu/matching/mount.py +529 -0
- napistu/matching/species.py +510 -0
- napistu/mcp/__init__.py +7 -4
- napistu/mcp/__main__.py +128 -72
- napistu/mcp/client.py +16 -25
- napistu/mcp/codebase.py +201 -145
- napistu/mcp/component_base.py +170 -0
- napistu/mcp/config.py +223 -0
- napistu/mcp/constants.py +45 -2
- napistu/mcp/documentation.py +253 -136
- napistu/mcp/documentation_utils.py +13 -48
- napistu/mcp/execution.py +372 -305
- napistu/mcp/health.py +47 -65
- napistu/mcp/profiles.py +10 -6
- napistu/mcp/server.py +161 -80
- napistu/mcp/tutorials.py +139 -87
- napistu/modify/__init__.py +1 -1
- napistu/modify/gaps.py +1 -1
- napistu/network/__init__.py +1 -1
- napistu/network/constants.py +101 -34
- napistu/network/data_handling.py +388 -0
- napistu/network/ig_utils.py +351 -0
- napistu/network/napistu_graph_core.py +354 -0
- napistu/network/neighborhoods.py +40 -40
- napistu/network/net_create.py +373 -309
- napistu/network/net_propagation.py +47 -19
- napistu/network/{net_utils.py → ng_utils.py} +124 -272
- napistu/network/paths.py +67 -51
- napistu/network/precompute.py +11 -11
- napistu/ontologies/__init__.py +10 -0
- napistu/ontologies/constants.py +129 -0
- napistu/ontologies/dogma.py +243 -0
- napistu/ontologies/genodexito.py +649 -0
- napistu/ontologies/mygene.py +369 -0
- napistu/ontologies/renaming.py +198 -0
- napistu/rpy2/__init__.py +229 -86
- napistu/rpy2/callr.py +47 -77
- napistu/rpy2/constants.py +24 -23
- napistu/rpy2/rids.py +61 -648
- napistu/sbml_dfs_core.py +587 -222
- napistu/scverse/__init__.py +15 -0
- napistu/scverse/constants.py +28 -0
- napistu/scverse/loading.py +727 -0
- napistu/utils.py +118 -10
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/METADATA +8 -3
- napistu-0.3.1.dist-info/RECORD +133 -0
- tests/conftest.py +22 -0
- tests/test_context_discretize.py +56 -0
- tests/test_context_filtering.py +267 -0
- tests/test_identifiers.py +100 -0
- tests/test_indices.py +65 -0
- tests/{test_edgelist.py → test_ingestion_napistu_edgelist.py} +2 -2
- tests/test_matching_interactions.py +108 -0
- tests/test_matching_mount.py +305 -0
- tests/test_matching_species.py +394 -0
- tests/test_mcp_config.py +193 -0
- tests/test_mcp_documentation_utils.py +12 -3
- tests/test_mcp_server.py +156 -19
- tests/test_network_data_handling.py +397 -0
- tests/test_network_ig_utils.py +23 -0
- tests/test_network_neighborhoods.py +19 -0
- tests/test_network_net_create.py +459 -0
- tests/test_network_ng_utils.py +30 -0
- tests/test_network_paths.py +56 -0
- tests/{test_precomputed_distances.py → test_network_precompute.py} +8 -6
- tests/test_ontologies_genodexito.py +58 -0
- tests/test_ontologies_mygene.py +39 -0
- tests/test_ontologies_renaming.py +110 -0
- tests/test_rpy2_callr.py +79 -0
- tests/test_rpy2_init.py +151 -0
- tests/test_sbml.py +0 -31
- tests/test_sbml_dfs_core.py +134 -10
- tests/test_scverse_loading.py +778 -0
- tests/test_set_coverage.py +2 -2
- tests/test_utils.py +121 -1
- napistu/mechanism_matching.py +0 -1353
- napistu/rpy2/netcontextr.py +0 -467
- napistu-0.2.5.dev7.dist-info/RECORD +0 -98
- tests/test_igraph.py +0 -367
- tests/test_mechanism_matching.py +0 -784
- tests/test_net_utils.py +0 -149
- tests/test_netcontextr.py +0 -105
- tests/test_rpy2.py +0 -61
- /napistu/ingestion/{cpr_edgelist.py → napistu_edgelist.py} +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/WHEEL +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/entry_points.txt +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/licenses/LICENSE +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dist-info}/top_level.txt +0 -0
- /tests/{test_obo.py → test_ingestion_obo.py} +0 -0
tests/test_matching_mount.py (new file, +305):

```diff
@@ -0,0 +1,305 @@
+import copy
+import numpy as np
+import pandas as pd
+import pytest
+from datetime import datetime
+
+from napistu.matching import mount
+from napistu.constants import IDENTIFIERS, SBML_DFS, ONTOLOGIES
+from napistu.matching.constants import (
+    FEATURE_ID_VAR_DEFAULT,
+    RESOLVE_MATCHES_AGGREGATORS,
+)
+
+
+def test_bind_wide_results(sbml_dfs_glucose_metabolism):
+    """
+    Test that bind_wide_results correctly matches identifiers and adds results to species data.
+    """
+    # Get species identifiers, excluding reactome
+    species_identifiers = (
+        sbml_dfs_glucose_metabolism.get_identifiers(SBML_DFS.SPECIES)
+        .query("bqb == 'BQB_IS'")
+        .query("ontology != 'reactome'")
+    )
+
+    # Create example data with identifiers and results
+    example_data = species_identifiers.groupby("ontology").head(10)[
+        [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
+    ]
+    example_data["results_a"] = np.random.randn(len(example_data))
+    example_data["results_b"] = np.random.randn(len(example_data))
+    example_data[FEATURE_ID_VAR_DEFAULT] = range(0, len(example_data))
+
+    # Create wide format data
+    example_data_wide = (
+        example_data.pivot(
+            columns=IDENTIFIERS.ONTOLOGY,
+            values=IDENTIFIERS.IDENTIFIER,
+            index=[FEATURE_ID_VAR_DEFAULT, "results_a", "results_b"],
+        )
+        .reset_index()
+        .rename_axis(None, axis=1)
+    )
+
+    # Test inplace=False (default)
+    results_name = "test_results"
+    original_sbml_dfs = copy.deepcopy(sbml_dfs_glucose_metabolism)
+    sbml_dfs_result = mount.bind_wide_results(
+        sbml_dfs=sbml_dfs_glucose_metabolism,
+        results_df=example_data_wide,
+        results_name=results_name,
+        ontologies={ONTOLOGIES.UNIPROT, ONTOLOGIES.CHEBI},
+        dogmatic=False,
+        species_identifiers=None,
+        feature_id_var=FEATURE_ID_VAR_DEFAULT,
+        verbose=True,
+        inplace=False,
+    )
+
+    # Verify original object is unchanged
+    assert results_name not in original_sbml_dfs.species_data
+
+    # Verify the results were added correctly to the new object
+    assert results_name in sbml_dfs_result.species_data
+    bound_results = sbml_dfs_result.species_data[results_name]
+    assert set(bound_results.columns) == {
+        FEATURE_ID_VAR_DEFAULT,
+        "results_a",
+        "results_b",
+    }
+    assert bound_results.shape == (23, 3)
+    assert bound_results.loc["S00000056", "feature_id"] == "18,19"
+    assert bound_results.loc["S00000057", "feature_id"] == "18"
+    assert bound_results.loc["S00000010", "feature_id"] == "9"
+
+    # Test inplace=True
+    results_name_2 = "test_results_2"
+    sbml_dfs_inplace = copy.deepcopy(sbml_dfs_glucose_metabolism)
+    result_inplace = mount.bind_wide_results(
+        sbml_dfs=sbml_dfs_inplace,
+        results_df=example_data_wide,
+        results_name=results_name_2,
+        ontologies={ONTOLOGIES.UNIPROT, ONTOLOGIES.CHEBI},
+        dogmatic=False,
+        species_identifiers=None,
+        feature_id_var=FEATURE_ID_VAR_DEFAULT,
+        verbose=True,
+        inplace=True,
+    )
+
+    # Verify the object was modified and function returned None
+    assert result_inplace is None
+    assert results_name_2 in sbml_dfs_inplace.species_data
+
+
+def test_resolve_matches_with_example_data():
+    """Test resolve_matches function with example data for all aggregation methods."""
+    # Setup example data with overlapping 1-to-many and many-to-1 cases
+    example_data = pd.DataFrame(
+        {
+            FEATURE_ID_VAR_DEFAULT: ["A", "B", "C", "D", "D", "E", "B", "B", "C"],
+            SBML_DFS.S_ID: [
+                "s_id_1",
+                "s_id_1",
+                "s_id_1",
+                "s_id_4",
+                "s_id_5",
+                "s_id_6",
+                "s_id_2",
+                "s_id_3",
+                "s_id_3",
+            ],
+            "results_a": [1, 2, 3, 0.4, 5, 6, 0.7, 0.8, 9],
+            "results_b": [
+                "foo",
+                "foo",
+                "bar",
+                "bar",
+                "baz",
+                "baz",
+                "not",
+                "not",
+                "not",
+            ],
+        }
+    )
+
+    # Test that missing feature_id raises KeyError
+    data_no_id = pd.DataFrame(
+        {
+            SBML_DFS.S_ID: ["s_id_1", "s_id_1", "s_id_2"],
+            "results_a": [1, 2, 3],
+            "results_b": ["foo", "bar", "baz"],
+        }
+    )
+    with pytest.raises(KeyError, match=FEATURE_ID_VAR_DEFAULT):
+        mount.resolve_matches(data_no_id)
+
+    # Test with keep_id_col=True (default)
+    result_with_id = mount.resolve_matches(
+        example_data, keep_id_col=True, numeric_agg=RESOLVE_MATCHES_AGGREGATORS.MEAN
+    )
+
+    # Verify feature_id column is present and correctly aggregated
+    assert FEATURE_ID_VAR_DEFAULT in result_with_id.columns
+    assert result_with_id.loc["s_id_1", FEATURE_ID_VAR_DEFAULT] == "A,B,C"
+    assert result_with_id.loc["s_id_3", FEATURE_ID_VAR_DEFAULT] == "B,C"
+
+    # Test with keep_id_col=False
+    result_without_id = mount.resolve_matches(
+        example_data, keep_id_col=False, numeric_agg=RESOLVE_MATCHES_AGGREGATORS.MEAN
+    )
+
+    # Verify feature_id column is not in output
+    assert FEATURE_ID_VAR_DEFAULT not in result_without_id.columns
+
+    # Verify other columns are still present and correctly aggregated
+    assert "results_a" in result_without_id.columns
+    assert "results_b" in result_without_id.columns
+    assert "feature_id_match_count" in result_without_id.columns
+
+    # Verify numeric aggregation still works
+    actual_mean = result_without_id.loc["s_id_1", "results_a"]
+    expected_mean = 2.0  # (1 + 2 + 3) / 3
+    assert (
+        actual_mean == expected_mean
+    ), f"Expected mean {expected_mean}, but got {actual_mean}"
+
+    # Verify string aggregation still works
+    assert result_without_id.loc["s_id_1", "results_b"] == "bar,foo"
+
+    # Verify match counts are still present
+    assert result_without_id.loc["s_id_1", "feature_id_match_count"] == 3
+    assert result_without_id.loc["s_id_3", "feature_id_match_count"] == 2
+
+    # Test maximum aggregation
+    max_result = mount.resolve_matches(
+        example_data, numeric_agg=RESOLVE_MATCHES_AGGREGATORS.MAX
+    )
+
+    # Verify maximum values are correct
+    assert max_result.loc["s_id_1", "results_a"] == 3.0  # max of [1, 2, 3]
+    assert max_result.loc["s_id_3", "results_a"] == 9.0  # max of [0.8, 9]
+    assert max_result.loc["s_id_4", "results_a"] == 0.4  # single value
+    assert max_result.loc["s_id_5", "results_a"] == 5.0  # single value
+    assert max_result.loc["s_id_6", "results_a"] == 6.0  # single value
+
+    # Test weighted mean (feature_id is used for weights regardless of keep_id_col)
+    weighted_result = mount.resolve_matches(
+        example_data,
+        numeric_agg=RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
+        keep_id_col=True,
+    )
+
+    # For s_id_1:
+    # A appears once in total (weight = 1/1)
+    # B appears three times in total (weight = 1/3)
+    # C appears twice in total (weight = 1/2)
+    # Sum of unnormalized weights = 1 + 1/3 + 1/2 = 1.833
+    # Normalized weights:
+    # A: (1/1)/1.833 = 0.545
+    # B: (1/3)/1.833 = 0.182
+    # C: (1/2)/1.833 = 0.273
+    # Weighted mean = 1×0.545 + 2×0.182 + 3×0.273 = 1.73
+    actual_weighted_mean_1 = weighted_result.loc["s_id_1", "results_a"]
+    expected_weighted_mean_1 = 1.73
+    assert (
+        abs(actual_weighted_mean_1 - expected_weighted_mean_1) < 0.01
+    ), f"s_id_1 weighted mean: expected {expected_weighted_mean_1:.3f}, but got {actual_weighted_mean_1:.3f}"
+
+    # For s_id_3:
+    # B appears three times in total (weight = 1/3)
+    # C appears twice in total (weight = 1/2)
+    # Sum of unnormalized weights = 1/3 + 1/2 = 0.833
+    # Normalized weights:
+    # B: (1/3)/0.833 = 0.4
+    # C: (1/2)/0.833 = 0.6
+    # Weighted mean = 0.8×0.4 + 9×0.6 = 5.72
+    actual_weighted_mean_3 = weighted_result.loc["s_id_3", "results_a"]
+    expected_weighted_mean_3 = 5.72
+    assert (
+        abs(actual_weighted_mean_3 - expected_weighted_mean_3) < 0.01
+    ), f"s_id_3 weighted mean: expected {expected_weighted_mean_3:.3f}, but got {actual_weighted_mean_3:.3f}"
+
+    # Test weighted mean with keep_id_col=False (weights still use feature_id)
+    weighted_result_no_id = mount.resolve_matches(
+        example_data,
+        numeric_agg=RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
+        keep_id_col=False,
+    )
+
+    # Verify weighted means are the same regardless of keep_id_col
+    assert (
+        abs(weighted_result_no_id.loc["s_id_1", "results_a"] - expected_weighted_mean_1)
+        < 0.01
+    ), "Weighted mean should be the same regardless of keep_id_col"
+    assert (
+        abs(weighted_result_no_id.loc["s_id_3", "results_a"] - expected_weighted_mean_3)
+        < 0.01
+    ), "Weighted mean should be the same regardless of keep_id_col"
+
+    # Test that both versions preserve the same index structure
+    expected_index = pd.Index(
+        ["s_id_1", "s_id_2", "s_id_3", "s_id_4", "s_id_5", "s_id_6"], name="s_id"
+    )
+    pd.testing.assert_index_equal(result_with_id.index, expected_index)
+    pd.testing.assert_index_equal(result_without_id.index, expected_index)
+
+
+def test_resolve_matches_invalid_dtypes():
+    """Test that resolve_matches raises an error for unsupported dtypes."""
+    # Setup data with boolean and datetime columns
+    data = pd.DataFrame(
+        {
+            FEATURE_ID_VAR_DEFAULT: ["A", "B", "B", "C"],
+            "bool_col": [True, False, True, False],
+            "datetime_col": [
+                datetime(2024, 1, 1),
+                datetime(2024, 1, 2),
+                datetime(2024, 1, 3),
+                datetime(2024, 1, 4),
+            ],
+            "s_id": ["s1", "s1", "s2", "s2"],
+        }
+    )
+
+    # Should raise TypeError for unsupported dtypes
+    with pytest.raises(TypeError, match="Unsupported data types"):
+        mount.resolve_matches(data)
+
+
+def test_resolve_matches_first_method():
+    """Test resolve_matches with first method."""
+    # Setup data with known order
+    data = pd.DataFrame(
+        {
+            FEATURE_ID_VAR_DEFAULT: ["A", "C", "B", "B", "A"],
+            SBML_DFS.S_ID: ["s1", "s1", "s1", "s2", "s2"],
+            "value": [1, 2, 3, 4, 5],
+        }
+    )
+
+    result = mount.resolve_matches(data, numeric_agg=RESOLVE_MATCHES_AGGREGATORS.FIRST)
+
+    # Should take first value after sorting by feature_id
+    assert result.loc["s1", "value"] == 1  # A comes first
+    assert result.loc["s2", "value"] == 5  # A comes first
+
+
+def test_resolve_matches_deduplicate_feature_id_within_sid():
+    """Test that only the first value for each (s_id, feature_id) is used in mean aggregation."""
+    data = pd.DataFrame(
+        {
+            FEATURE_ID_VAR_DEFAULT: ["A", "A", "B"],
+            SBML_DFS.S_ID: ["s1", "s1", "s1"],
+            "value": [
+                1,
+                1,
+                2,
+            ],  # average should be 1.5 because the two A's are redundant
+        }
+    )
+
+    result = mount.resolve_matches(data, numeric_agg=RESOLVE_MATCHES_AGGREGATORS.MEAN)
+    assert result.loc["s1", "value"] == 1.5
```
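The weighted-mean assertions above encode an inverse-frequency scheme: each row is weighted by the reciprocal of how many times its feature_id appears in the whole table, and the weights are renormalized within each s_id. Below is a minimal pandas sketch that only re-derives the expected values from the test's toy data; the actual `resolve_matches` implementation lives in `napistu/matching/mount.py` and is not shown in this diff.

```python
import pandas as pd

# same toy data as test_resolve_matches_with_example_data above
example = pd.DataFrame(
    {
        "feature_id": ["A", "B", "C", "D", "D", "E", "B", "B", "C"],
        "s_id": ["s_id_1", "s_id_1", "s_id_1", "s_id_4", "s_id_5",
                 "s_id_6", "s_id_2", "s_id_3", "s_id_3"],
        "results_a": [1, 2, 3, 0.4, 5, 6, 0.7, 0.8, 9],
    }
)

# weight = 1 / (total occurrences of the feature_id across the table)
example["w"] = 1.0 / example["feature_id"].map(example["feature_id"].value_counts())

# normalizing weights within each s_id is equivalent to dividing the
# weighted sum by the weight sum per group
example["wv"] = example["results_a"] * example["w"]
sums = example.groupby("s_id")[["wv", "w"]].sum()
print((sums["wv"] / sums["w"]).round(2))  # s_id_1 -> 1.73, s_id_3 -> 5.72
```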
tests/test_matching_species.py (new file, +394):

```diff
@@ -0,0 +1,394 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from napistu.matching.constants import FEATURE_ID_VAR_DEFAULT
+from napistu.matching.species import (
+    match_features_to_wide_pathway_species,
+    match_by_ontology_and_identifier,
+    _validate_wide_ontologies,
+    _ensure_feature_id_var,
+    features_to_pathway_species,
+)
+
+
+def test_features_to_pathway_species(sbml_dfs):
+    species_identifiers = sbml_dfs.get_identifiers("species")
+    feature_identifiers = pd.DataFrame({"chebis": ["17627", "15379", "29105", "-1"]})
+
+    matching_df = (
+        features_to_pathway_species(
+            feature_identifiers, species_identifiers, {"chebi"}, "chebis"
+        )
+        .value_counts("identifier")
+        .sort_index()
+    )
+
+    assert matching_df.index.tolist() == ["15379", "17627", "29105"]
+    assert matching_df.tolist() == [2, 3, 2]
+
+
+def test_features_to_pathway_species_basic_and_expansion():
+
+    # Mock species_identifiers table
+    species_identifiers = pd.DataFrame(
+        {
+            "ontology": ["chebi", "chebi", "uniprot", "uniprot"],
+            "identifier": ["A", "B", "X", "Y"],
+            "s_id": [1, 2, 3, 4],
+            "s_name": ["foo", "bar", "baz", "qux"],
+            "bqb": ["BQB_IS", "BQB_IS", "BQB_IS", "BQB_IS"],
+        }
+    )
+    # Basic: no expansion, single identifier per row
+    features = pd.DataFrame({"my_id": ["A", "B", "X"], "other_col": [10, 20, 30]})
+    result = features_to_pathway_species(
+        feature_identifiers=features,
+        species_identifiers=species_identifiers,
+        ontologies={"chebi", "uniprot"},
+        feature_identifiers_var="my_id",
+        expand_identifiers=False,
+    )
+    # Should map all three
+    assert set(result["my_id"]) == {"A", "B", "X"}
+    assert set(result["identifier"]) == {"A", "B", "X"}
+    assert set(result["s_name"]) == {"foo", "bar", "baz"}
+    # Expansion: one row with multiple IDs
+    features2 = pd.DataFrame({"my_id": ["A / B / X", "Y"], "other_col": [100, 200]})
+    result2 = features_to_pathway_species(
+        feature_identifiers=features2,
+        species_identifiers=species_identifiers,
+        ontologies={"chebi", "uniprot"},
+        feature_identifiers_var="my_id",
+        expand_identifiers=True,
+        identifier_delimiter="/",
+    )
+    # Should expand to 4 rows (A, B, X, Y)
+    assert set(result2["identifier"]) == {"A", "B", "X", "Y"}
+    assert set(result2["s_name"]) == {"foo", "bar", "baz", "qux"}
+    # Whitespace trimming
+    features3 = pd.DataFrame({"my_id": [" A / B /X ", " Y"], "other_col": [1, 2]})
+    result3 = features_to_pathway_species(
+        feature_identifiers=features3,
+        species_identifiers=species_identifiers,
+        ontologies={"chebi", "uniprot"},
+        feature_identifiers_var="my_id",
+        expand_identifiers=True,
+        identifier_delimiter="/",
+    )
+    # Should expand and trim whitespace
+    assert set(result3["identifier"]) == {"A", "B", "X", "Y"}
+    assert set(result3["s_name"]) == {"foo", "bar", "baz", "qux"}
+
+
+def test_validate_wide_ontologies():
+    """Test the _validate_wide_ontologies function with various input types and error cases."""
+    # Setup test data
+    example_data_wide = pd.DataFrame(
+        {
+            "results": [-1.0, 0.0, 1.0],
+            "chebi": ["15377", "16810", "17925"],
+            "uniprot": ["P12345", "Q67890", "O43826"],
+        }
+    )
+
+    # Test auto-detection of ontology columns
+    assert _validate_wide_ontologies(example_data_wide) == {"chebi", "uniprot"}
+
+    # Test string input
+    assert _validate_wide_ontologies(example_data_wide, ontologies="chebi") == {"chebi"}
+
+    # Test set input
+    assert _validate_wide_ontologies(example_data_wide, ontologies={"chebi"}) == {
+        "chebi"
+    }
+    assert _validate_wide_ontologies(
+        example_data_wide, ontologies={"chebi", "uniprot"}
+    ) == {"chebi", "uniprot"}
+
+    # Test dictionary mapping for renaming
+    assert _validate_wide_ontologies(
+        example_data_wide, ontologies={"chebi": "reactome", "uniprot": "ensembl_gene"}
+    ) == {"reactome", "ensembl_gene"}
+
+    # Test error cases
+
+    # Missing column in set input (checks existence first)
+    with pytest.raises(
+        ValueError, match="Specified ontology columns not found in DataFrame:.*"
+    ):
+        _validate_wide_ontologies(example_data_wide, ontologies={"invalid_ontology"})
+
+    # Valid column name but invalid ontology
+    df_with_invalid = pd.DataFrame(
+        {
+            "results": [-1.0, 0.0, 1.0],
+            "invalid_ontology": ["a", "b", "c"],
+        }
+    )
+    with pytest.raises(ValueError, match="Invalid ontologies in set:.*"):
+        _validate_wide_ontologies(df_with_invalid, ontologies={"invalid_ontology"})
+
+    # Missing source column in mapping
+    with pytest.raises(ValueError, match="Source columns not found in DataFrame:.*"):
+        _validate_wide_ontologies(
+            example_data_wide, ontologies={"missing_column": "reactome"}
+        )
+
+    # Invalid target ontology in mapping
+    with pytest.raises(ValueError, match="Invalid ontologies in mapping:.*"):
+        _validate_wide_ontologies(
+            example_data_wide, ontologies={"chebi": "invalid_ontology"}
+        )
+
+    # DataFrame with no valid ontology columns
+    invalid_df = pd.DataFrame(
+        {"results": [-1.0, 0.0, 1.0], "col1": ["a", "b", "c"], "col2": ["d", "e", "f"]}
+    )
+    with pytest.raises(
+        ValueError, match="No valid ontology columns found in DataFrame.*"
+    ):
+        _validate_wide_ontologies(invalid_df)
+
+
+def test_ensure_feature_id_var():
+    """Test the _ensure_feature_id_var function with various input cases."""
+    # Test case 1: DataFrame already has feature_id column
+    df1 = pd.DataFrame({"feature_id": [100, 200, 300], "data": ["a", "b", "c"]})
+    result1 = _ensure_feature_id_var(df1)
+    # Should return unchanged DataFrame
+    pd.testing.assert_frame_equal(df1, result1)
+
+    # Test case 2: DataFrame missing feature_id column
+    df2 = pd.DataFrame({"data": ["x", "y", "z"]})
+    result2 = _ensure_feature_id_var(df2)
+    # Should add feature_id column with sequential integers
+    assert FEATURE_ID_VAR_DEFAULT in result2.columns
+    assert list(result2[FEATURE_ID_VAR_DEFAULT]) == [0, 1, 2]
+    assert list(result2["data"]) == ["x", "y", "z"]  # Original data preserved
+
+    # Test case 3: Custom feature_id column name
+    df3 = pd.DataFrame({"data": ["p", "q", "r"]})
+    custom_id = "custom_feature_id"
+    result3 = _ensure_feature_id_var(df3, feature_id_var=custom_id)
+    # Should add custom named feature_id column
+    assert custom_id in result3.columns
+    assert list(result3[custom_id]) == [0, 1, 2]
+    assert list(result3["data"]) == ["p", "q", "r"]  # Original data preserved
+
+    # Test case 4: Empty DataFrame
+    df4 = pd.DataFrame()
+    result4 = _ensure_feature_id_var(df4)
+    # Should handle empty DataFrame gracefully
+    assert FEATURE_ID_VAR_DEFAULT in result4.columns
+    assert len(result4) == 0
+
+
+def test_match_by_ontology_and_identifier():
+    """Test the match_by_ontology_and_identifier function with various input types."""
+    # Setup test data
+    feature_identifiers = pd.DataFrame(
+        {
+            "ontology": ["chebi", "chebi", "uniprot", "uniprot", "reactome"],
+            "identifier": ["15377", "16810", "P12345", "Q67890", "R12345"],
+            "results": [1.0, 2.0, -1.0, -2.0, 0.5],
+        }
+    )
+
+    species_identifiers = pd.DataFrame(
+        {
+            "ontology": ["chebi", "chebi", "uniprot", "uniprot", "ensembl_gene"],
+            "identifier": ["15377", "17925", "P12345", "O43826", "ENSG123"],
+            "s_id": ["s1", "s2", "s3", "s4", "s5"],
+            "s_name": ["compound1", "compound2", "protein1", "protein2", "gene1"],
+            "bqb": ["BQB_IS"] * 5,  # Add required bqb column with BQB_IS values
+        }
+    )
+
+    # Test with single ontology (string)
+    result = match_by_ontology_and_identifier(
+        feature_identifiers=feature_identifiers,
+        species_identifiers=species_identifiers,
+        ontologies="chebi",
+    )
+    assert len(result) == 1  # Only one matching chebi identifier
+    assert result.iloc[0]["identifier"] == "15377"
+    assert result.iloc[0]["results"] == 1.0
+    assert result.iloc[0]["ontology"] == "chebi"  # From species_identifiers
+    assert result.iloc[0]["s_name"] == "compound1"  # Verify join worked correctly
+    assert result.iloc[0]["bqb"] == "BQB_IS"  # Verify bqb column is preserved
+
+    # Test with multiple ontologies (set)
+    result = match_by_ontology_and_identifier(
+        feature_identifiers=feature_identifiers,
+        species_identifiers=species_identifiers,
+        ontologies={"chebi", "uniprot"},
+    )
+    assert len(result) == 2  # One chebi and one uniprot match
+    assert set(result["ontology"]) == {"chebi", "uniprot"}  # From species_identifiers
+    assert set(result["identifier"]) == {"15377", "P12345"}
+    # Verify results are correctly matched
+    chebi_row = result[result["ontology"] == "chebi"].iloc[0]
+    uniprot_row = result[result["ontology"] == "uniprot"].iloc[0]
+    assert chebi_row["results"] == 1.0
+    assert uniprot_row["results"] == -1.0
+    assert chebi_row["s_name"] == "compound1"
+    assert uniprot_row["s_name"] == "protein1"
+    assert chebi_row["bqb"] == "BQB_IS"
+    assert uniprot_row["bqb"] == "BQB_IS"
+
+    # Test with list of ontologies
+    result = match_by_ontology_and_identifier(
+        feature_identifiers=feature_identifiers,
+        species_identifiers=species_identifiers,
+        ontologies=["chebi", "uniprot"],
+    )
+    assert len(result) == 2
+    assert set(result["ontology"]) == {"chebi", "uniprot"}  # From species_identifiers
+
+    # Test with no matches
+    no_match_features = pd.DataFrame(
+        {"ontology": ["chebi"], "identifier": ["99999"], "results": [1.0]}
+    )
+    result = match_by_ontology_and_identifier(
+        feature_identifiers=no_match_features,
+        species_identifiers=species_identifiers,
+        ontologies="chebi",
+    )
+    assert len(result) == 0
+
+    # Test with empty features
+    empty_features = pd.DataFrame({"ontology": [], "identifier": [], "results": []})
+    result = match_by_ontology_and_identifier(
+        feature_identifiers=empty_features,
+        species_identifiers=species_identifiers,
+        ontologies={"chebi", "uniprot"},
+    )
+    assert len(result) == 0
+
+    # Test with invalid ontology
+    with pytest.raises(ValueError, match="Invalid ontologies specified:.*"):
+        match_by_ontology_and_identifier(
+            feature_identifiers=feature_identifiers,
+            species_identifiers=species_identifiers,
+            ontologies="invalid_ontology",
+        )
+
+    # Test with ontology not in feature_identifiers
+    result = match_by_ontology_and_identifier(
+        feature_identifiers=feature_identifiers,
+        species_identifiers=species_identifiers,
+        ontologies={"ensembl_gene"},  # Only in species_identifiers
+    )
+    assert len(result) == 0
+
+    # Test with custom feature_identifiers_var
+    feature_identifiers_custom = feature_identifiers.rename(
+        columns={"identifier": "custom_id"}
+    )
+    result = match_by_ontology_and_identifier(
+        feature_identifiers=feature_identifiers_custom,
+        species_identifiers=species_identifiers,
+        ontologies={"chebi"},
+        feature_identifiers_var="custom_id",
+    )
+    assert len(result) == 1
+    assert result.iloc[0]["custom_id"] == "15377"
+    assert result.iloc[0]["ontology"] == "chebi"  # From species_identifiers
+    assert result.iloc[0]["s_name"] == "compound1"
+    assert result.iloc[0]["bqb"] == "BQB_IS"
+
+
+def test_match_features_to_wide_pathway_species(sbml_dfs_glucose_metabolism):
+
+    def compare_frame_contents(df1, df2):
+        """
+        Compare if two DataFrames have the same content, ignoring index and column ordering.
+
+        Parameters
+        ----------
+        df1 : pd.DataFrame
+            First DataFrame to compare
+        df2 : pd.DataFrame
+            Second DataFrame to compare
+
+        Returns
+        -------
+        None
+        """
+        df1_sorted = (
+            df1.reindex(columns=sorted(df1.columns))
+            .sort_values(sorted(df1.columns))
+            .reset_index(drop=True)
+        )
+
+        df2_sorted = (
+            df2.reindex(columns=sorted(df2.columns))
+            .sort_values(sorted(df2.columns))
+            .reset_index(drop=True)
+        )
+
+        pd.testing.assert_frame_equal(df1_sorted, df2_sorted, check_like=True)
+
+        return None
+
+    species_identifiers = (
+        sbml_dfs_glucose_metabolism.get_identifiers("species")
+        .query("bqb == 'BQB_IS'")
+        .query("ontology != 'reactome'")
+    )
+
+    # create a table whose index is s_ids and columns are faux-measurements
+    example_data = species_identifiers.groupby("ontology").head(10)[
+        ["ontology", "identifier"]
+    ]
+
+    example_data["results_a"] = np.random.randn(len(example_data))
+    example_data["results_b"] = np.random.randn(len(example_data))
+    # add a feature_id column to the example_data which tracks the row of the original data
+    example_data["feature_id"] = range(0, len(example_data))
+
+    # pivot (identifier, ontology) to columns for each ontology
+    example_data_wide = (
+        example_data.pivot(
+            columns="ontology",
+            values="identifier",
+            index=["feature_id", "results_a", "results_b"],
+        )
+        .reset_index()
+        .rename_axis(None, axis=1)
+    )
+
+    # options, for matching
+    # 1. match by identifier and a set of ontologies (provided by arg).
+    matched_s_ids = features_to_pathway_species(
+        feature_identifiers=example_data.drop(columns="ontology"),
+        species_identifiers=species_identifiers,
+        ontologies={"uniprot", "chebi"},
+        feature_identifiers_var="identifier",
+    )
+
+    # 2. match by identifier and ontology.
+    matched_s_ids_w_ontologies = match_by_ontology_and_identifier(
+        feature_identifiers=example_data,
+        species_identifiers=species_identifiers,
+        ontologies={"uniprot", "chebi"},
+        feature_identifiers_var="identifier",
+    )
+
+    # 3. format wide identifier sets into a table with a single identifier column and apply strategy #2.
+    matched_s_ids_from_wide = match_features_to_wide_pathway_species(
+        example_data_wide,
+        species_identifiers,
+        ontologies={"uniprot", "chebi"},
+        feature_identifiers_var="identifier",
+    )
+
+    compare_frame_contents(
+        matched_s_ids.drop(columns="s_Source"),
+        matched_s_ids_w_ontologies.drop(columns="s_Source"),
+    )
+    compare_frame_contents(
+        matched_s_ids.drop(columns="s_Source"),
+        matched_s_ids_from_wide.drop(columns="s_Source"),
+    )
```
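Strategy 3 above (`match_features_to_wide_pathway_species`) amounts to melting the per-ontology columns into long (ontology, identifier) pairs and then applying the ontology-aware join of strategy 2. Below is a hedged sketch of that reshape with toy tables and hypothetical s_ids; the real function additionally validates ontology names and supports column renaming and default feature_id assignment, none of which is reproduced here.

```python
import pandas as pd

# toy wide table: one identifier column per ontology, as in the tests above
example_wide = pd.DataFrame(
    {
        "feature_id": [0, 1, 2],
        "results": [-1.0, 0.0, 1.0],
        "chebi": ["15377", "16810", "17925"],
        "uniprot": ["P12345", "Q67890", "O43826"],
    }
)

# toy species_identifiers table (hypothetical s_ids)
species_identifiers = pd.DataFrame(
    {
        "ontology": ["chebi", "uniprot"],
        "identifier": ["15377", "P12345"],
        "s_id": ["s1", "s3"],
        "bqb": ["BQB_IS", "BQB_IS"],
    }
)

# wide -> long: each (ontology, identifier) pair becomes its own row
long_ids = example_wide.melt(
    id_vars=["feature_id", "results"],
    value_vars=["chebi", "uniprot"],
    var_name="ontology",
    value_name="identifier",
).dropna(subset=["identifier"])

# then match within each ontology on the identifier, as in strategy 2
matches = long_ids.merge(species_identifiers, on=["ontology", "identifier"])
print(matches)  # two rows: (s1, chebi, 15377) and (s3, uniprot, P12345)
```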