napistu 0.2.5.dev7__py3-none-any.whl → 0.3.1.dev1__py3-none-any.whl

This diff shows the changes between two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (108)
  1. napistu/__init__.py +1 -3
  2. napistu/__main__.py +126 -96
  3. napistu/constants.py +35 -41
  4. napistu/context/__init__.py +10 -0
  5. napistu/context/discretize.py +462 -0
  6. napistu/context/filtering.py +387 -0
  7. napistu/gcs/__init__.py +1 -1
  8. napistu/identifiers.py +74 -15
  9. napistu/indices.py +68 -0
  10. napistu/ingestion/__init__.py +1 -1
  11. napistu/ingestion/bigg.py +47 -62
  12. napistu/ingestion/constants.py +18 -133
  13. napistu/ingestion/gtex.py +113 -0
  14. napistu/ingestion/hpa.py +147 -0
  15. napistu/ingestion/sbml.py +0 -97
  16. napistu/ingestion/string.py +2 -2
  17. napistu/matching/__init__.py +10 -0
  18. napistu/matching/constants.py +18 -0
  19. napistu/matching/interactions.py +518 -0
  20. napistu/matching/mount.py +529 -0
  21. napistu/matching/species.py +510 -0
  22. napistu/mcp/__init__.py +7 -4
  23. napistu/mcp/__main__.py +128 -72
  24. napistu/mcp/client.py +16 -25
  25. napistu/mcp/codebase.py +201 -145
  26. napistu/mcp/component_base.py +170 -0
  27. napistu/mcp/config.py +223 -0
  28. napistu/mcp/constants.py +45 -2
  29. napistu/mcp/documentation.py +253 -136
  30. napistu/mcp/documentation_utils.py +13 -48
  31. napistu/mcp/execution.py +372 -305
  32. napistu/mcp/health.py +47 -65
  33. napistu/mcp/profiles.py +10 -6
  34. napistu/mcp/server.py +161 -80
  35. napistu/mcp/tutorials.py +139 -87
  36. napistu/modify/__init__.py +1 -1
  37. napistu/modify/gaps.py +1 -1
  38. napistu/network/__init__.py +1 -1
  39. napistu/network/constants.py +101 -34
  40. napistu/network/data_handling.py +388 -0
  41. napistu/network/ig_utils.py +351 -0
  42. napistu/network/napistu_graph_core.py +354 -0
  43. napistu/network/neighborhoods.py +40 -40
  44. napistu/network/net_create.py +373 -309
  45. napistu/network/net_propagation.py +47 -19
  46. napistu/network/{net_utils.py → ng_utils.py} +124 -272
  47. napistu/network/paths.py +67 -51
  48. napistu/network/precompute.py +11 -11
  49. napistu/ontologies/__init__.py +10 -0
  50. napistu/ontologies/constants.py +129 -0
  51. napistu/ontologies/dogma.py +243 -0
  52. napistu/ontologies/genodexito.py +649 -0
  53. napistu/ontologies/mygene.py +369 -0
  54. napistu/ontologies/renaming.py +198 -0
  55. napistu/rpy2/__init__.py +229 -86
  56. napistu/rpy2/callr.py +47 -77
  57. napistu/rpy2/constants.py +24 -23
  58. napistu/rpy2/rids.py +61 -648
  59. napistu/sbml_dfs_core.py +587 -222
  60. napistu/scverse/__init__.py +15 -0
  61. napistu/scverse/constants.py +28 -0
  62. napistu/scverse/loading.py +727 -0
  63. napistu/utils.py +118 -10
  64. {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/METADATA +8 -3
  65. napistu-0.3.1.dev1.dist-info/RECORD +133 -0
  66. tests/conftest.py +22 -0
  67. tests/test_context_discretize.py +56 -0
  68. tests/test_context_filtering.py +267 -0
  69. tests/test_identifiers.py +100 -0
  70. tests/test_indices.py +65 -0
  71. tests/{test_edgelist.py → test_ingestion_napistu_edgelist.py} +2 -2
  72. tests/test_matching_interactions.py +108 -0
  73. tests/test_matching_mount.py +305 -0
  74. tests/test_matching_species.py +394 -0
  75. tests/test_mcp_config.py +193 -0
  76. tests/test_mcp_documentation_utils.py +12 -3
  77. tests/test_mcp_server.py +156 -19
  78. tests/test_network_data_handling.py +397 -0
  79. tests/test_network_ig_utils.py +23 -0
  80. tests/test_network_neighborhoods.py +19 -0
  81. tests/test_network_net_create.py +459 -0
  82. tests/test_network_ng_utils.py +30 -0
  83. tests/test_network_paths.py +56 -0
  84. tests/{test_precomputed_distances.py → test_network_precompute.py} +8 -6
  85. tests/test_ontologies_genodexito.py +58 -0
  86. tests/test_ontologies_mygene.py +39 -0
  87. tests/test_ontologies_renaming.py +110 -0
  88. tests/test_rpy2_callr.py +79 -0
  89. tests/test_rpy2_init.py +151 -0
  90. tests/test_sbml.py +0 -31
  91. tests/test_sbml_dfs_core.py +134 -10
  92. tests/test_scverse_loading.py +778 -0
  93. tests/test_set_coverage.py +2 -2
  94. tests/test_utils.py +121 -1
  95. napistu/mechanism_matching.py +0 -1353
  96. napistu/rpy2/netcontextr.py +0 -467
  97. napistu-0.2.5.dev7.dist-info/RECORD +0 -98
  98. tests/test_igraph.py +0 -367
  99. tests/test_mechanism_matching.py +0 -784
  100. tests/test_net_utils.py +0 -149
  101. tests/test_netcontextr.py +0 -105
  102. tests/test_rpy2.py +0 -61
  103. /napistu/ingestion/{cpr_edgelist.py → napistu_edgelist.py} +0 -0
  104. {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/WHEEL +0 -0
  105. {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/entry_points.txt +0 -0
  106. {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/licenses/LICENSE +0 -0
  107. {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/top_level.txt +0 -0
  108. /tests/{test_obo.py → test_ingestion_obo.py} +0 -0
tests/test_matching_mount.py
@@ -0,0 +1,305 @@
+import copy
+import numpy as np
+import pandas as pd
+import pytest
+from datetime import datetime
+
+from napistu.matching import mount
+from napistu.constants import IDENTIFIERS, SBML_DFS, ONTOLOGIES
+from napistu.matching.constants import (
+    FEATURE_ID_VAR_DEFAULT,
+    RESOLVE_MATCHES_AGGREGATORS,
+)
+
+
+def test_bind_wide_results(sbml_dfs_glucose_metabolism):
+    """
+    Test that bind_wide_results correctly matches identifiers and adds results to species data.
+    """
+    # Get species identifiers, excluding reactome
+    species_identifiers = (
+        sbml_dfs_glucose_metabolism.get_identifiers(SBML_DFS.SPECIES)
+        .query("bqb == 'BQB_IS'")
+        .query("ontology != 'reactome'")
+    )
+
+    # Create example data with identifiers and results
+    example_data = species_identifiers.groupby("ontology").head(10)[
+        [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
+    ]
+    example_data["results_a"] = np.random.randn(len(example_data))
+    example_data["results_b"] = np.random.randn(len(example_data))
+    example_data[FEATURE_ID_VAR_DEFAULT] = range(0, len(example_data))
+
+    # Create wide format data
+    example_data_wide = (
+        example_data.pivot(
+            columns=IDENTIFIERS.ONTOLOGY,
+            values=IDENTIFIERS.IDENTIFIER,
+            index=[FEATURE_ID_VAR_DEFAULT, "results_a", "results_b"],
+        )
+        .reset_index()
+        .rename_axis(None, axis=1)
+    )
+
+    # Test inplace=False (default)
+    results_name = "test_results"
+    original_sbml_dfs = copy.deepcopy(sbml_dfs_glucose_metabolism)
+    sbml_dfs_result = mount.bind_wide_results(
+        sbml_dfs=sbml_dfs_glucose_metabolism,
+        results_df=example_data_wide,
+        results_name=results_name,
+        ontologies={ONTOLOGIES.UNIPROT, ONTOLOGIES.CHEBI},
+        dogmatic=False,
+        species_identifiers=None,
+        feature_id_var=FEATURE_ID_VAR_DEFAULT,
+        verbose=True,
+        inplace=False,
+    )
+
+    # Verify original object is unchanged
+    assert results_name not in original_sbml_dfs.species_data
+
+    # Verify the results were added correctly to the new object
+    assert results_name in sbml_dfs_result.species_data
+    bound_results = sbml_dfs_result.species_data[results_name]
+    assert set(bound_results.columns) == {
+        FEATURE_ID_VAR_DEFAULT,
+        "results_a",
+        "results_b",
+    }
+    assert bound_results.shape == (23, 3)
+    assert bound_results.loc["S00000056", "feature_id"] == "18,19"
+    assert bound_results.loc["S00000057", "feature_id"] == "18"
+    assert bound_results.loc["S00000010", "feature_id"] == "9"
+
+    # Test inplace=True
+    results_name_2 = "test_results_2"
+    sbml_dfs_inplace = copy.deepcopy(sbml_dfs_glucose_metabolism)
+    result_inplace = mount.bind_wide_results(
+        sbml_dfs=sbml_dfs_inplace,
+        results_df=example_data_wide,
+        results_name=results_name_2,
+        ontologies={ONTOLOGIES.UNIPROT, ONTOLOGIES.CHEBI},
+        dogmatic=False,
+        species_identifiers=None,
+        feature_id_var=FEATURE_ID_VAR_DEFAULT,
+        verbose=True,
+        inplace=True,
+    )
+
+    # Verify the object was modified and function returned None
+    assert result_inplace is None
+    assert results_name_2 in sbml_dfs_inplace.species_data
+
+
+def test_resolve_matches_with_example_data():
+    """Test resolve_matches function with example data for all aggregation methods."""
+    # Setup example data with overlapping 1-to-many and many-to-1 cases
+    example_data = pd.DataFrame(
+        {
+            FEATURE_ID_VAR_DEFAULT: ["A", "B", "C", "D", "D", "E", "B", "B", "C"],
+            SBML_DFS.S_ID: [
+                "s_id_1",
+                "s_id_1",
+                "s_id_1",
+                "s_id_4",
+                "s_id_5",
+                "s_id_6",
+                "s_id_2",
+                "s_id_3",
+                "s_id_3",
+            ],
+            "results_a": [1, 2, 3, 0.4, 5, 6, 0.7, 0.8, 9],
+            "results_b": [
+                "foo",
+                "foo",
+                "bar",
+                "bar",
+                "baz",
+                "baz",
+                "not",
+                "not",
+                "not",
+            ],
+        }
+    )
+
+    # Test that missing feature_id raises KeyError
+    data_no_id = pd.DataFrame(
+        {
+            SBML_DFS.S_ID: ["s_id_1", "s_id_1", "s_id_2"],
+            "results_a": [1, 2, 3],
+            "results_b": ["foo", "bar", "baz"],
+        }
+    )
+    with pytest.raises(KeyError, match=FEATURE_ID_VAR_DEFAULT):
+        mount.resolve_matches(data_no_id)
+
+    # Test with keep_id_col=True (default)
+    result_with_id = mount.resolve_matches(
+        example_data, keep_id_col=True, numeric_agg=RESOLVE_MATCHES_AGGREGATORS.MEAN
+    )
+
+    # Verify feature_id column is present and correctly aggregated
+    assert FEATURE_ID_VAR_DEFAULT in result_with_id.columns
+    assert result_with_id.loc["s_id_1", FEATURE_ID_VAR_DEFAULT] == "A,B,C"
+    assert result_with_id.loc["s_id_3", FEATURE_ID_VAR_DEFAULT] == "B,C"
+
+    # Test with keep_id_col=False
+    result_without_id = mount.resolve_matches(
+        example_data, keep_id_col=False, numeric_agg=RESOLVE_MATCHES_AGGREGATORS.MEAN
+    )
+
+    # Verify feature_id column is not in output
+    assert FEATURE_ID_VAR_DEFAULT not in result_without_id.columns
+
+    # Verify other columns are still present and correctly aggregated
+    assert "results_a" in result_without_id.columns
+    assert "results_b" in result_without_id.columns
+    assert "feature_id_match_count" in result_without_id.columns
+
+    # Verify numeric aggregation still works
+    actual_mean = result_without_id.loc["s_id_1", "results_a"]
+    expected_mean = 2.0  # (1 + 2 + 3) / 3
+    assert (
+        actual_mean == expected_mean
+    ), f"Expected mean {expected_mean}, but got {actual_mean}"
+
+    # Verify string aggregation still works
+    assert result_without_id.loc["s_id_1", "results_b"] == "bar,foo"
+
+    # Verify match counts are still present
+    assert result_without_id.loc["s_id_1", "feature_id_match_count"] == 3
+    assert result_without_id.loc["s_id_3", "feature_id_match_count"] == 2
+
+    # Test maximum aggregation
+    max_result = mount.resolve_matches(
+        example_data, numeric_agg=RESOLVE_MATCHES_AGGREGATORS.MAX
+    )
+
+    # Verify maximum values are correct
+    assert max_result.loc["s_id_1", "results_a"] == 3.0  # max of [1, 2, 3]
+    assert max_result.loc["s_id_3", "results_a"] == 9.0  # max of [0.8, 9]
+    assert max_result.loc["s_id_4", "results_a"] == 0.4  # single value
+    assert max_result.loc["s_id_5", "results_a"] == 5.0  # single value
+    assert max_result.loc["s_id_6", "results_a"] == 6.0  # single value
+
+    # Test weighted mean (feature_id is used for weights regardless of keep_id_col)
+    weighted_result = mount.resolve_matches(
+        example_data,
+        numeric_agg=RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
+        keep_id_col=True,
+    )
+
+    # For s_id_1:
+    # A appears once in total (weight = 1/1)
+    # B appears three times in total (weight = 1/3)
+    # C appears twice in total (weight = 1/2)
+    # Sum of unnormalized weights = 1 + 1/3 + 1/2 = 1.833
+    # Normalized weights:
+    # A: (1/1)/1.833 = 0.545
+    # B: (1/3)/1.833 = 0.182
+    # C: (1/2)/1.833 = 0.273
+    # Weighted mean = 1×0.545 + 2×0.182 + 3×0.273 = 1.73
+    actual_weighted_mean_1 = weighted_result.loc["s_id_1", "results_a"]
+    expected_weighted_mean_1 = 1.73
+    assert (
+        abs(actual_weighted_mean_1 - expected_weighted_mean_1) < 0.01
+    ), f"s_id_1 weighted mean: expected {expected_weighted_mean_1:.3f}, but got {actual_weighted_mean_1:.3f}"
+
+    # For s_id_3:
+    # B appears three times in total (weight = 1/3)
+    # C appears twice in total (weight = 1/2)
+    # Sum of unnormalized weights = 1/3 + 1/2 = 0.833
+    # Normalized weights:
+    # B: (1/3)/0.833 = 0.4
+    # C: (1/2)/0.833 = 0.6
+    # Weighted mean = 0.8×0.4 + 9×0.6 = 5.72
+    actual_weighted_mean_3 = weighted_result.loc["s_id_3", "results_a"]
+    expected_weighted_mean_3 = 5.72
+    assert (
+        abs(actual_weighted_mean_3 - expected_weighted_mean_3) < 0.01
+    ), f"s_id_3 weighted mean: expected {expected_weighted_mean_3:.3f}, but got {actual_weighted_mean_3:.3f}"
+
+    # Test weighted mean with keep_id_col=False (weights still use feature_id)
+    weighted_result_no_id = mount.resolve_matches(
+        example_data,
+        numeric_agg=RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
+        keep_id_col=False,
+    )
+
+    # Verify weighted means are the same regardless of keep_id_col
+    assert (
+        abs(weighted_result_no_id.loc["s_id_1", "results_a"] - expected_weighted_mean_1)
+        < 0.01
+    ), "Weighted mean should be the same regardless of keep_id_col"
+    assert (
+        abs(weighted_result_no_id.loc["s_id_3", "results_a"] - expected_weighted_mean_3)
+        < 0.01
+    ), "Weighted mean should be the same regardless of keep_id_col"
+
+    # Test that both versions preserve the same index structure
+    expected_index = pd.Index(
+        ["s_id_1", "s_id_2", "s_id_3", "s_id_4", "s_id_5", "s_id_6"], name="s_id"
+    )
+    pd.testing.assert_index_equal(result_with_id.index, expected_index)
+    pd.testing.assert_index_equal(result_without_id.index, expected_index)
+
+
+def test_resolve_matches_invalid_dtypes():
+    """Test that resolve_matches raises an error for unsupported dtypes."""
+    # Setup data with boolean and datetime columns
+    data = pd.DataFrame(
+        {
+            FEATURE_ID_VAR_DEFAULT: ["A", "B", "B", "C"],
+            "bool_col": [True, False, True, False],
+            "datetime_col": [
+                datetime(2024, 1, 1),
+                datetime(2024, 1, 2),
+                datetime(2024, 1, 3),
+                datetime(2024, 1, 4),
+            ],
+            "s_id": ["s1", "s1", "s2", "s2"],
+        }
+    )
+
+    # Should raise TypeError for unsupported dtypes
+    with pytest.raises(TypeError, match="Unsupported data types"):
+        mount.resolve_matches(data)
+
+
+def test_resolve_matches_first_method():
+    """Test resolve_matches with first method."""
+    # Setup data with known order
+    data = pd.DataFrame(
+        {
+            FEATURE_ID_VAR_DEFAULT: ["A", "C", "B", "B", "A"],
+            SBML_DFS.S_ID: ["s1", "s1", "s1", "s2", "s2"],
+            "value": [1, 2, 3, 4, 5],
+        }
+    )
+
+    result = mount.resolve_matches(data, numeric_agg=RESOLVE_MATCHES_AGGREGATORS.FIRST)
+
+    # Should take first value after sorting by feature_id
+    assert result.loc["s1", "value"] == 1  # A comes first
+    assert result.loc["s2", "value"] == 5  # A comes first
+
+
+def test_resolve_matches_deduplicate_feature_id_within_sid():
+    """Test that only the first value for each (s_id, feature_id) is used in mean aggregation."""
+    data = pd.DataFrame(
+        {
+            FEATURE_ID_VAR_DEFAULT: ["A", "A", "B"],
+            SBML_DFS.S_ID: ["s1", "s1", "s1"],
+            "value": [
+                1,
+                1,
+                2,
+            ],  # average should be 1.5 because the two A's are redundant
+        }
+    )
+
+    result = mount.resolve_matches(data, numeric_agg=RESOLVE_MATCHES_AGGREGATORS.MEAN)
+    assert result.loc["s1", "value"] == 1.5
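The weighted-mean expectations above can be re-derived by hand. A minimal sketch (plain numpy, not part of the napistu package) that reproduces the calculation walked through in the comments of test_resolve_matches_with_example_data:

import numpy as np

# Overall occurrence counts of each feature_id in the example table:
# A appears once, B three times, C twice (per the comment walkthrough above).
counts = {"A": 1, "B": 3, "C": 2}

def weighted_mean(features, values):
    # Down-weight each feature by its overall match count, renormalize the
    # weights within the s_id group, then take the weighted average.
    weights = np.array([1 / counts[f] for f in features])
    weights = weights / weights.sum()
    return float(weights @ np.asarray(values, dtype=float))

print(round(weighted_mean(["A", "B", "C"], [1, 2, 3]), 2))  # s_id_1 -> 1.73
print(round(weighted_mean(["B", "C"], [0.8, 9]), 2))        # s_id_3 -> 5.72

This matches what the test asserts for s_id_1 and s_id_3; whether resolve_matches computes it in exactly this form is an assumption based on the test's comments.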
tests/test_matching_species.py
@@ -0,0 +1,394 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from napistu.matching.constants import FEATURE_ID_VAR_DEFAULT
+from napistu.matching.species import (
+    match_features_to_wide_pathway_species,
+    match_by_ontology_and_identifier,
+    _validate_wide_ontologies,
+    _ensure_feature_id_var,
+    features_to_pathway_species,
+)
+
+
+def test_features_to_pathway_species(sbml_dfs):
+    species_identifiers = sbml_dfs.get_identifiers("species")
+    feature_identifiers = pd.DataFrame({"chebis": ["17627", "15379", "29105", "-1"]})
+
+    matching_df = (
+        features_to_pathway_species(
+            feature_identifiers, species_identifiers, {"chebi"}, "chebis"
+        )
+        .value_counts("identifier")
+        .sort_index()
+    )
+
+    assert matching_df.index.tolist() == ["15379", "17627", "29105"]
+    assert matching_df.tolist() == [2, 3, 2]
+
+
+def test_features_to_pathway_species_basic_and_expansion():
+
+    # Mock species_identifiers table
+    species_identifiers = pd.DataFrame(
+        {
+            "ontology": ["chebi", "chebi", "uniprot", "uniprot"],
+            "identifier": ["A", "B", "X", "Y"],
+            "s_id": [1, 2, 3, 4],
+            "s_name": ["foo", "bar", "baz", "qux"],
+            "bqb": ["BQB_IS", "BQB_IS", "BQB_IS", "BQB_IS"],
+        }
+    )
+    # Basic: no expansion, single identifier per row
+    features = pd.DataFrame({"my_id": ["A", "B", "X"], "other_col": [10, 20, 30]})
+    result = features_to_pathway_species(
+        feature_identifiers=features,
+        species_identifiers=species_identifiers,
+        ontologies={"chebi", "uniprot"},
+        feature_identifiers_var="my_id",
+        expand_identifiers=False,
+    )
+    # Should map all three
+    assert set(result["my_id"]) == {"A", "B", "X"}
+    assert set(result["identifier"]) == {"A", "B", "X"}
+    assert set(result["s_name"]) == {"foo", "bar", "baz"}
+    # Expansion: one row with multiple IDs
+    features2 = pd.DataFrame({"my_id": ["A / B / X", "Y"], "other_col": [100, 200]})
+    result2 = features_to_pathway_species(
+        feature_identifiers=features2,
+        species_identifiers=species_identifiers,
+        ontologies={"chebi", "uniprot"},
+        feature_identifiers_var="my_id",
+        expand_identifiers=True,
+        identifier_delimiter="/",
+    )
+    # Should expand to 4 rows (A, B, X, Y)
+    assert set(result2["identifier"]) == {"A", "B", "X", "Y"}
+    assert set(result2["s_name"]) == {"foo", "bar", "baz", "qux"}
+    # Whitespace trimming
+    features3 = pd.DataFrame({"my_id": [" A / B /X ", " Y"], "other_col": [1, 2]})
+    result3 = features_to_pathway_species(
+        feature_identifiers=features3,
+        species_identifiers=species_identifiers,
+        ontologies={"chebi", "uniprot"},
+        feature_identifiers_var="my_id",
+        expand_identifiers=True,
+        identifier_delimiter="/",
+    )
+    # Should expand and trim whitespace
+    assert set(result3["identifier"]) == {"A", "B", "X", "Y"}
+    assert set(result3["s_name"]) == {"foo", "bar", "baz", "qux"}
+
+
+def test_validate_wide_ontologies():
+    """Test the _validate_wide_ontologies function with various input types and error cases."""
+    # Setup test data
+    example_data_wide = pd.DataFrame(
+        {
+            "results": [-1.0, 0.0, 1.0],
+            "chebi": ["15377", "16810", "17925"],
+            "uniprot": ["P12345", "Q67890", "O43826"],
+        }
+    )
+
+    # Test auto-detection of ontology columns
+    assert _validate_wide_ontologies(example_data_wide) == {"chebi", "uniprot"}
+
+    # Test string input
+    assert _validate_wide_ontologies(example_data_wide, ontologies="chebi") == {"chebi"}
+
+    # Test set input
+    assert _validate_wide_ontologies(example_data_wide, ontologies={"chebi"}) == {
+        "chebi"
+    }
+    assert _validate_wide_ontologies(
+        example_data_wide, ontologies={"chebi", "uniprot"}
+    ) == {"chebi", "uniprot"}
+
+    # Test dictionary mapping for renaming
+    assert _validate_wide_ontologies(
+        example_data_wide, ontologies={"chebi": "reactome", "uniprot": "ensembl_gene"}
+    ) == {"reactome", "ensembl_gene"}
+
+    # Test error cases
+
+    # Missing column in set input (checks existence first)
+    with pytest.raises(
+        ValueError, match="Specified ontology columns not found in DataFrame:.*"
+    ):
+        _validate_wide_ontologies(example_data_wide, ontologies={"invalid_ontology"})
+
+    # Valid column name but invalid ontology
+    df_with_invalid = pd.DataFrame(
+        {
+            "results": [-1.0, 0.0, 1.0],
+            "invalid_ontology": ["a", "b", "c"],
+        }
+    )
+    with pytest.raises(ValueError, match="Invalid ontologies in set:.*"):
+        _validate_wide_ontologies(df_with_invalid, ontologies={"invalid_ontology"})
+
+    # Missing source column in mapping
+    with pytest.raises(ValueError, match="Source columns not found in DataFrame:.*"):
+        _validate_wide_ontologies(
+            example_data_wide, ontologies={"missing_column": "reactome"}
+        )
+
+    # Invalid target ontology in mapping
+    with pytest.raises(ValueError, match="Invalid ontologies in mapping:.*"):
+        _validate_wide_ontologies(
+            example_data_wide, ontologies={"chebi": "invalid_ontology"}
+        )
+
+    # DataFrame with no valid ontology columns
+    invalid_df = pd.DataFrame(
+        {"results": [-1.0, 0.0, 1.0], "col1": ["a", "b", "c"], "col2": ["d", "e", "f"]}
+    )
+    with pytest.raises(
+        ValueError, match="No valid ontology columns found in DataFrame.*"
+    ):
+        _validate_wide_ontologies(invalid_df)
+
+
+def test_ensure_feature_id_var():
+    """Test the _ensure_feature_id_var function with various input cases."""
+    # Test case 1: DataFrame already has feature_id column
+    df1 = pd.DataFrame({"feature_id": [100, 200, 300], "data": ["a", "b", "c"]})
+    result1 = _ensure_feature_id_var(df1)
+    # Should return unchanged DataFrame
+    pd.testing.assert_frame_equal(df1, result1)
+
+    # Test case 2: DataFrame missing feature_id column
+    df2 = pd.DataFrame({"data": ["x", "y", "z"]})
+    result2 = _ensure_feature_id_var(df2)
+    # Should add feature_id column with sequential integers
+    assert FEATURE_ID_VAR_DEFAULT in result2.columns
+    assert list(result2[FEATURE_ID_VAR_DEFAULT]) == [0, 1, 2]
+    assert list(result2["data"]) == ["x", "y", "z"]  # Original data preserved
+
+    # Test case 3: Custom feature_id column name
+    df3 = pd.DataFrame({"data": ["p", "q", "r"]})
+    custom_id = "custom_feature_id"
+    result3 = _ensure_feature_id_var(df3, feature_id_var=custom_id)
+    # Should add custom named feature_id column
+    assert custom_id in result3.columns
+    assert list(result3[custom_id]) == [0, 1, 2]
+    assert list(result3["data"]) == ["p", "q", "r"]  # Original data preserved
+
+    # Test case 4: Empty DataFrame
+    df4 = pd.DataFrame()
+    result4 = _ensure_feature_id_var(df4)
+    # Should handle empty DataFrame gracefully
+    assert FEATURE_ID_VAR_DEFAULT in result4.columns
+    assert len(result4) == 0
+
+
+def test_match_by_ontology_and_identifier():
+    """Test the match_by_ontology_and_identifier function with various input types."""
+    # Setup test data
+    feature_identifiers = pd.DataFrame(
+        {
+            "ontology": ["chebi", "chebi", "uniprot", "uniprot", "reactome"],
+            "identifier": ["15377", "16810", "P12345", "Q67890", "R12345"],
+            "results": [1.0, 2.0, -1.0, -2.0, 0.5],
+        }
+    )
+
+    species_identifiers = pd.DataFrame(
+        {
+            "ontology": ["chebi", "chebi", "uniprot", "uniprot", "ensembl_gene"],
+            "identifier": ["15377", "17925", "P12345", "O43826", "ENSG123"],
+            "s_id": ["s1", "s2", "s3", "s4", "s5"],
+            "s_name": ["compound1", "compound2", "protein1", "protein2", "gene1"],
+            "bqb": ["BQB_IS"] * 5,  # Add required bqb column with BQB_IS values
+        }
+    )
+
+    # Test with single ontology (string)
+    result = match_by_ontology_and_identifier(
+        feature_identifiers=feature_identifiers,
+        species_identifiers=species_identifiers,
+        ontologies="chebi",
+    )
+    assert len(result) == 1  # Only one matching chebi identifier
+    assert result.iloc[0]["identifier"] == "15377"
+    assert result.iloc[0]["results"] == 1.0
+    assert result.iloc[0]["ontology"] == "chebi"  # From species_identifiers
+    assert result.iloc[0]["s_name"] == "compound1"  # Verify join worked correctly
+    assert result.iloc[0]["bqb"] == "BQB_IS"  # Verify bqb column is preserved
+
+    # Test with multiple ontologies (set)
+    result = match_by_ontology_and_identifier(
+        feature_identifiers=feature_identifiers,
+        species_identifiers=species_identifiers,
+        ontologies={"chebi", "uniprot"},
+    )
+    assert len(result) == 2  # One chebi and one uniprot match
+    assert set(result["ontology"]) == {"chebi", "uniprot"}  # From species_identifiers
+    assert set(result["identifier"]) == {"15377", "P12345"}
+    # Verify results are correctly matched
+    chebi_row = result[result["ontology"] == "chebi"].iloc[0]
+    uniprot_row = result[result["ontology"] == "uniprot"].iloc[0]
+    assert chebi_row["results"] == 1.0
+    assert uniprot_row["results"] == -1.0
+    assert chebi_row["s_name"] == "compound1"
+    assert uniprot_row["s_name"] == "protein1"
+    assert chebi_row["bqb"] == "BQB_IS"
+    assert uniprot_row["bqb"] == "BQB_IS"
+
+    # Test with list of ontologies
+    result = match_by_ontology_and_identifier(
+        feature_identifiers=feature_identifiers,
+        species_identifiers=species_identifiers,
+        ontologies=["chebi", "uniprot"],
+    )
+    assert len(result) == 2
+    assert set(result["ontology"]) == {"chebi", "uniprot"}  # From species_identifiers
+
+    # Test with no matches
+    no_match_features = pd.DataFrame(
+        {"ontology": ["chebi"], "identifier": ["99999"], "results": [1.0]}
+    )
+    result = match_by_ontology_and_identifier(
+        feature_identifiers=no_match_features,
+        species_identifiers=species_identifiers,
+        ontologies="chebi",
+    )
+    assert len(result) == 0
+
+    # Test with empty features
+    empty_features = pd.DataFrame({"ontology": [], "identifier": [], "results": []})
+    result = match_by_ontology_and_identifier(
+        feature_identifiers=empty_features,
+        species_identifiers=species_identifiers,
+        ontologies={"chebi", "uniprot"},
+    )
+    assert len(result) == 0
+
+    # Test with invalid ontology
+    with pytest.raises(ValueError, match="Invalid ontologies specified:.*"):
+        match_by_ontology_and_identifier(
+            feature_identifiers=feature_identifiers,
+            species_identifiers=species_identifiers,
+            ontologies="invalid_ontology",
+        )
+
+    # Test with ontology not in feature_identifiers
+    result = match_by_ontology_and_identifier(
+        feature_identifiers=feature_identifiers,
+        species_identifiers=species_identifiers,
+        ontologies={"ensembl_gene"},  # Only in species_identifiers
+    )
+    assert len(result) == 0
+
+    # Test with custom feature_identifiers_var
+    feature_identifiers_custom = feature_identifiers.rename(
+        columns={"identifier": "custom_id"}
+    )
+    result = match_by_ontology_and_identifier(
+        feature_identifiers=feature_identifiers_custom,
+        species_identifiers=species_identifiers,
+        ontologies={"chebi"},
+        feature_identifiers_var="custom_id",
+    )
+    assert len(result) == 1
+    assert result.iloc[0]["custom_id"] == "15377"
+    assert result.iloc[0]["ontology"] == "chebi"  # From species_identifiers
+    assert result.iloc[0]["s_name"] == "compound1"
+    assert result.iloc[0]["bqb"] == "BQB_IS"
+
+
+def test_match_features_to_wide_pathway_species(sbml_dfs_glucose_metabolism):
+
+    def compare_frame_contents(df1, df2):
+        """
+        Compare if two DataFrames have the same content, ignoring index and column ordering.
+
+        Parameters
+        ----------
+        df1 : pd.DataFrame
+            First DataFrame to compare
+        df2 : pd.DataFrame
+            Second DataFrame to compare
+
+        Returns
+        -------
+        None
+        """
+        df1_sorted = (
+            df1.reindex(columns=sorted(df1.columns))
+            .sort_values(sorted(df1.columns))
+            .reset_index(drop=True)
+        )
+
+        df2_sorted = (
+            df2.reindex(columns=sorted(df2.columns))
+            .sort_values(sorted(df2.columns))
+            .reset_index(drop=True)
+        )
+
+        pd.testing.assert_frame_equal(df1_sorted, df2_sorted, check_like=True)
+
+        return None
+
+    species_identifiers = (
+        sbml_dfs_glucose_metabolism.get_identifiers("species")
+        .query("bqb == 'BQB_IS'")
+        .query("ontology != 'reactome'")
+    )
+
+    # create a table whose index is s_ids and columns are faux-measurements
+    example_data = species_identifiers.groupby("ontology").head(10)[
+        ["ontology", "identifier"]
+    ]
+
+    example_data["results_a"] = np.random.randn(len(example_data))
+    example_data["results_b"] = np.random.randn(len(example_data))
+    # add a feature_id column to the example_data which tracks the row of the original data
+    example_data["feature_id"] = range(0, len(example_data))
+
+    # pivot (identifier, ontology) to columns for each ontology
+    example_data_wide = (
+        example_data.pivot(
+            columns="ontology",
+            values="identifier",
+            index=["feature_id", "results_a", "results_b"],
+        )
+        .reset_index()
+        .rename_axis(None, axis=1)
+    )
+
+    # Options for matching:
+    # 1. match by identifier and a set of ontologies (provided by arg).
+    matched_s_ids = features_to_pathway_species(
+        feature_identifiers=example_data.drop(columns="ontology"),
+        species_identifiers=species_identifiers,
+        ontologies={"uniprot", "chebi"},
+        feature_identifiers_var="identifier",
+    )
+
+    # 2. match by identifier and ontology.
+    matched_s_ids_w_ontologies = match_by_ontology_and_identifier(
+        feature_identifiers=example_data,
+        species_identifiers=species_identifiers,
+        ontologies={"uniprot", "chebi"},
+        feature_identifiers_var="identifier",
+    )
+
+    # 3. format wide identifier sets into a table with a single identifier column and apply strategy #2.
+    matched_s_ids_from_wide = match_features_to_wide_pathway_species(
+        example_data_wide,
+        species_identifiers,
+        ontologies={"uniprot", "chebi"},
+        feature_identifiers_var="identifier",
+    )
+
+    compare_frame_contents(
+        matched_s_ids.drop(columns="s_Source"),
+        matched_s_ids_w_ontologies.drop(columns="s_Source"),
+    )
+    compare_frame_contents(
+        matched_s_ids.drop(columns="s_Source"),
+        matched_s_ids_from_wide.drop(columns="s_Source"),
+    )
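The three strategies compared in test_match_features_to_wide_pathway_species agree because the wide layout only repackages the long (ontology, identifier) pairs. For intuition, a toy pandas sketch (hypothetical data, not the package's own reshaping code) of the wide-to-long step that strategy #3 implies before delegating to strategy #2:

import pandas as pd

# Hypothetical wide table in the shape the tests build: one column per ontology.
wide = pd.DataFrame(
    {
        "feature_id": [0, 1],
        "results_a": [0.5, -0.2],
        "chebi": ["15377", None],
        "uniprot": [None, "P12345"],
    }
)

# Melt the ontology columns into (ontology, identifier) pairs and drop the
# empty cells left by the pivot; this recovers the long layout consumed by
# match_by_ontology_and_identifier.
long = wide.melt(
    id_vars=["feature_id", "results_a"],
    value_vars=["chebi", "uniprot"],
    var_name="ontology",
    value_name="identifier",
).dropna(subset=["identifier"])

print(long)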