napistu 0.2.5.dev7__py3-none-any.whl → 0.3.1.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108)
  1. napistu/__init__.py +1 -3
  2. napistu/__main__.py +126 -96
  3. napistu/constants.py +35 -41
  4. napistu/context/__init__.py +10 -0
  5. napistu/context/discretize.py +462 -0
  6. napistu/context/filtering.py +387 -0
  7. napistu/gcs/__init__.py +1 -1
  8. napistu/identifiers.py +74 -15
  9. napistu/indices.py +68 -0
  10. napistu/ingestion/__init__.py +1 -1
  11. napistu/ingestion/bigg.py +47 -62
  12. napistu/ingestion/constants.py +18 -133
  13. napistu/ingestion/gtex.py +113 -0
  14. napistu/ingestion/hpa.py +147 -0
  15. napistu/ingestion/sbml.py +0 -97
  16. napistu/ingestion/string.py +2 -2
  17. napistu/matching/__init__.py +10 -0
  18. napistu/matching/constants.py +18 -0
  19. napistu/matching/interactions.py +518 -0
  20. napistu/matching/mount.py +529 -0
  21. napistu/matching/species.py +510 -0
  22. napistu/mcp/__init__.py +7 -4
  23. napistu/mcp/__main__.py +128 -72
  24. napistu/mcp/client.py +16 -25
  25. napistu/mcp/codebase.py +201 -145
  26. napistu/mcp/component_base.py +170 -0
  27. napistu/mcp/config.py +223 -0
  28. napistu/mcp/constants.py +45 -2
  29. napistu/mcp/documentation.py +253 -136
  30. napistu/mcp/documentation_utils.py +13 -48
  31. napistu/mcp/execution.py +372 -305
  32. napistu/mcp/health.py +47 -65
  33. napistu/mcp/profiles.py +10 -6
  34. napistu/mcp/server.py +161 -80
  35. napistu/mcp/tutorials.py +139 -87
  36. napistu/modify/__init__.py +1 -1
  37. napistu/modify/gaps.py +1 -1
  38. napistu/network/__init__.py +1 -1
  39. napistu/network/constants.py +101 -34
  40. napistu/network/data_handling.py +388 -0
  41. napistu/network/ig_utils.py +351 -0
  42. napistu/network/napistu_graph_core.py +354 -0
  43. napistu/network/neighborhoods.py +40 -40
  44. napistu/network/net_create.py +373 -309
  45. napistu/network/net_propagation.py +47 -19
  46. napistu/network/{net_utils.py → ng_utils.py} +124 -272
  47. napistu/network/paths.py +67 -51
  48. napistu/network/precompute.py +11 -11
  49. napistu/ontologies/__init__.py +10 -0
  50. napistu/ontologies/constants.py +129 -0
  51. napistu/ontologies/dogma.py +243 -0
  52. napistu/ontologies/genodexito.py +649 -0
  53. napistu/ontologies/mygene.py +369 -0
  54. napistu/ontologies/renaming.py +198 -0
  55. napistu/rpy2/__init__.py +229 -86
  56. napistu/rpy2/callr.py +47 -77
  57. napistu/rpy2/constants.py +24 -23
  58. napistu/rpy2/rids.py +61 -648
  59. napistu/sbml_dfs_core.py +587 -222
  60. napistu/scverse/__init__.py +15 -0
  61. napistu/scverse/constants.py +28 -0
  62. napistu/scverse/loading.py +727 -0
  63. napistu/utils.py +118 -10
  64. {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/METADATA +8 -3
  65. napistu-0.3.1.dev1.dist-info/RECORD +133 -0
  66. tests/conftest.py +22 -0
  67. tests/test_context_discretize.py +56 -0
  68. tests/test_context_filtering.py +267 -0
  69. tests/test_identifiers.py +100 -0
  70. tests/test_indices.py +65 -0
  71. tests/{test_edgelist.py → test_ingestion_napistu_edgelist.py} +2 -2
  72. tests/test_matching_interactions.py +108 -0
  73. tests/test_matching_mount.py +305 -0
  74. tests/test_matching_species.py +394 -0
  75. tests/test_mcp_config.py +193 -0
  76. tests/test_mcp_documentation_utils.py +12 -3
  77. tests/test_mcp_server.py +156 -19
  78. tests/test_network_data_handling.py +397 -0
  79. tests/test_network_ig_utils.py +23 -0
  80. tests/test_network_neighborhoods.py +19 -0
  81. tests/test_network_net_create.py +459 -0
  82. tests/test_network_ng_utils.py +30 -0
  83. tests/test_network_paths.py +56 -0
  84. tests/{test_precomputed_distances.py → test_network_precompute.py} +8 -6
  85. tests/test_ontologies_genodexito.py +58 -0
  86. tests/test_ontologies_mygene.py +39 -0
  87. tests/test_ontologies_renaming.py +110 -0
  88. tests/test_rpy2_callr.py +79 -0
  89. tests/test_rpy2_init.py +151 -0
  90. tests/test_sbml.py +0 -31
  91. tests/test_sbml_dfs_core.py +134 -10
  92. tests/test_scverse_loading.py +778 -0
  93. tests/test_set_coverage.py +2 -2
  94. tests/test_utils.py +121 -1
  95. napistu/mechanism_matching.py +0 -1353
  96. napistu/rpy2/netcontextr.py +0 -467
  97. napistu-0.2.5.dev7.dist-info/RECORD +0 -98
  98. tests/test_igraph.py +0 -367
  99. tests/test_mechanism_matching.py +0 -784
  100. tests/test_net_utils.py +0 -149
  101. tests/test_netcontextr.py +0 -105
  102. tests/test_rpy2.py +0 -61
  103. /napistu/ingestion/{cpr_edgelist.py → napistu_edgelist.py} +0 -0
  104. {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/WHEEL +0 -0
  105. {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/entry_points.txt +0 -0
  106. {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/licenses/LICENSE +0 -0
  107. {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/top_level.txt +0 -0
  108. /tests/{test_obo.py → test_ingestion_obo.py} +0 -0
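
Most of the churn comes from breaking the monolithic napistu/mechanism_matching.py (and its tests) into the new napistu/matching, napistu/ontologies, and napistu/context subpackages, plus renames such as net_utils.py → ng_utils.py and cpr_edgelist.py → napistu_edgelist.py; the deletion of the old test module is reproduced in full below. For downstream code, a minimal, hypothetical import shim is sketched here — the assumption that features_to_pathway_species now lives in napistu.matching.species is inferred from the new file names above, not confirmed by this diff:

# Hypothetical compatibility shim, not part of the package.
# Assumes the matching helpers moved from napistu.mechanism_matching (0.2.x)
# into napistu.matching.species (0.3.x); verify against the release notes.
try:
    from napistu.matching.species import features_to_pathway_species
except ImportError:
    # fall back to the pre-0.3 monolithic module
    from napistu.mechanism_matching import features_to_pathway_species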
--- a/tests/test_mechanism_matching.py
+++ /dev/null
@@ -1,784 +0,0 @@
-from __future__ import annotations
-
-from datetime import datetime
-
-import numpy as np
-import pandas as pd
-import pytest
-
-from napistu import mechanism_matching
-from napistu.network import net_create
-from napistu.network import precompute
-from napistu.mechanism_matching import _validate_wide_ontologies
-from napistu.mechanism_matching import match_by_ontology_and_identifier
-from napistu.mechanism_matching import resolve_matches
-
-from napistu.constants import SBML_DFS
-from napistu.constants import IDENTIFIERS
-from napistu.constants import ONTOLOGIES
-from napistu.constants import RESOLVE_MATCHES_AGGREGATORS
-from napistu.constants import FEATURE_ID_VAR_DEFAULT
-
-
-def test_features_to_pathway_species(sbml_dfs):
-
-    species_identifiers = sbml_dfs.get_identifiers("species")
-    feature_identifiers = pd.DataFrame({"chebis": ["17627", "15379", "29105", "-1"]})
-
-    matching_df = (
-        mechanism_matching.features_to_pathway_species(
-            feature_identifiers, species_identifiers, {"chebi"}, "chebis"
-        )
-        .value_counts("identifier")
-        .sort_index()
-    )
-
-    assert matching_df.index.tolist() == ["15379", "17627", "29105"]
-    assert matching_df.tolist() == [2, 3, 2]
-
-
-def test_features_to_pathway_species_basic_and_expansion():
-
-    # Mock species_identifiers table
-    species_identifiers = pd.DataFrame(
-        {
-            "ontology": ["chebi", "chebi", "uniprot", "uniprot"],
-            "identifier": ["A", "B", "X", "Y"],
-            "s_id": [1, 2, 3, 4],
-            "s_name": ["foo", "bar", "baz", "qux"],
-            "bqb": ["BQB_IS", "BQB_IS", "BQB_IS", "BQB_IS"],
-        }
-    )
-    # Basic: no expansion, single identifier per row
-    features = pd.DataFrame({"my_id": ["A", "B", "X"], "other_col": [10, 20, 30]})
-    result = mechanism_matching.features_to_pathway_species(
-        feature_identifiers=features,
-        species_identifiers=species_identifiers,
-        ontologies={"chebi", "uniprot"},
-        feature_identifiers_var="my_id",
-        expand_identifiers=False,
-    )
-    # Should map all three
-    assert set(result["my_id"]) == {"A", "B", "X"}
-    assert set(result["identifier"]) == {"A", "B", "X"}
-    assert set(result["s_name"]) == {"foo", "bar", "baz"}
-    # Expansion: one row with multiple IDs
-    features2 = pd.DataFrame({"my_id": ["A / B / X", "Y"], "other_col": [100, 200]})
-    result2 = mechanism_matching.features_to_pathway_species(
-        feature_identifiers=features2,
-        species_identifiers=species_identifiers,
-        ontologies={"chebi", "uniprot"},
-        feature_identifiers_var="my_id",
-        expand_identifiers=True,
-        identifier_delimiter="/",
-    )
-    # Should expand to 4 rows (A, B, X, Y)
-    assert set(result2["identifier"]) == {"A", "B", "X", "Y"}
-    assert set(result2["s_name"]) == {"foo", "bar", "baz", "qux"}
-    # Whitespace trimming
-    features3 = pd.DataFrame({"my_id": [" A / B /X ", " Y"], "other_col": [1, 2]})
-    result3 = mechanism_matching.features_to_pathway_species(
-        feature_identifiers=features3,
-        species_identifiers=species_identifiers,
-        ontologies={"chebi", "uniprot"},
-        feature_identifiers_var="my_id",
-        expand_identifiers=True,
-        identifier_delimiter="/",
-    )
-    # Should expand and trim whitespace
-    assert set(result3["identifier"]) == {"A", "B", "X", "Y"}
-    assert set(result3["s_name"]) == {"foo", "bar", "baz", "qux"}
-
-
-def test_edgelist_to_pathway_species(sbml_dfs):
-
-    edgelist = pd.DataFrame(
-        [
-            {"identifier_upstream": "17996", "identifier_downstream": "16526"},
-            {"identifier_upstream": "15377", "identifier_downstream": "17544"},
-            {"identifier_upstream": "15378", "identifier_downstream": "57945"},
-            {"identifier_upstream": "57540", "identifier_downstream": "17996"},
-        ]
-    )
-    species_identifiers = sbml_dfs.get_identifiers("species").query("bqb == 'BQB_IS'")
-
-    edgelist_w_sids = mechanism_matching.edgelist_to_pathway_species(
-        edgelist, species_identifiers, ontologies={"chebi", "uniprot"}
-    )
-    assert edgelist_w_sids.shape == (4, 4)
-
-    egelist_w_scids = mechanism_matching.edgelist_to_scids(
-        edgelist, sbml_dfs, species_identifiers, ontologies={"chebi"}
-    )
-
-    assert egelist_w_scids.shape == (12, 6)
-
-    direct_interactions = mechanism_matching.filter_to_direct_mechanistic_interactions(
-        edgelist, sbml_dfs, species_identifiers, ontologies={"chebi"}
-    )
-
-    assert direct_interactions.shape == (2, 10)
-
-
-def test_direct_and_indirect_mechanism_matching(sbml_dfs_glucose_metabolism):
-
-    cpr_graph = net_create.process_cpr_graph(sbml_dfs_glucose_metabolism)
-
-    edgelist = pd.DataFrame(
-        [
-            {
-                "identifier_upstream": "17925",
-                "identifier_downstream": "32966",
-            },  # glu, fbp
-            {
-                "identifier_upstream": "57634",
-                "identifier_downstream": "32966",
-            },  # f6p, fbp
-            {
-                "identifier_upstream": "32966",
-                "identifier_downstream": "57642",
-            },  # fbp, dhap
-            {
-                "identifier_upstream": "17925",
-                "identifier_downstream": "15361",
-            },  # glu, pyr
-        ]
-    )
-
-    species_identifiers = sbml_dfs_glucose_metabolism.get_identifiers("species")
-
-    direct_interactions = mechanism_matching.filter_to_direct_mechanistic_interactions(
-        formatted_edgelist=edgelist,
-        sbml_dfs=sbml_dfs_glucose_metabolism,
-        species_identifiers=species_identifiers,
-        ontologies={"chebi"},
-    )
-
-    assert direct_interactions.shape == (2, 10)
-
-    indirect_interactions = (
-        mechanism_matching.filter_to_indirect_mechanistic_interactions(
-            formatted_edgelist=edgelist,
-            sbml_dfs=sbml_dfs_glucose_metabolism,
-            species_identifiers=species_identifiers,
-            cpr_graph=cpr_graph,
-            ontologies={"chebi"},
-            precomputed_distances=None,
-            max_path_length=10,
-        )
-    )
-
-    assert indirect_interactions.shape == (6, 12)
-
-    # confirm that we get the same thing even when using precomputed distances
-    precomputed_distances = precompute.precompute_distances(
-        cpr_graph, weights_vars=["weights"]
-    )
-
-    indirect_interactions_w_precompute = (
-        mechanism_matching.filter_to_indirect_mechanistic_interactions(
-            formatted_edgelist=edgelist,
-            sbml_dfs=sbml_dfs_glucose_metabolism,
-            species_identifiers=species_identifiers,
-            cpr_graph=cpr_graph,
-            ontologies={"chebi"},
-            precomputed_distances=precomputed_distances,
-            max_path_length=10,
-        )
-    )
-
-    assert all(
-        indirect_interactions["weight"] == indirect_interactions_w_precompute["weight"]
-    )
-
-
-def test_validate_wide_ontologies():
-    """Test the _validate_wide_ontologies function with various input types and error cases."""
-    # Setup test data
-    example_data_wide = pd.DataFrame(
-        {
-            "results": [-1.0, 0.0, 1.0],
-            "chebi": ["15377", "16810", "17925"],
-            "uniprot": ["P12345", "Q67890", "O43826"],
-        }
-    )
-
-    # Test auto-detection of ontology columns
-    assert _validate_wide_ontologies(example_data_wide) == {"chebi", "uniprot"}
-
-    # Test string input
-    assert _validate_wide_ontologies(example_data_wide, ontologies="chebi") == {"chebi"}
-
-    # Test set input
-    assert _validate_wide_ontologies(example_data_wide, ontologies={"chebi"}) == {
-        "chebi"
-    }
-    assert _validate_wide_ontologies(
-        example_data_wide, ontologies={"chebi", "uniprot"}
-    ) == {"chebi", "uniprot"}
-
-    # Test dictionary mapping for renaming
-    assert _validate_wide_ontologies(
-        example_data_wide, ontologies={"chebi": "reactome", "uniprot": "ensembl_gene"}
-    ) == {"reactome", "ensembl_gene"}
-
-    # Test error cases
-
-    # Missing column in set input (checks existence first)
-    with pytest.raises(
-        ValueError, match="Specified ontology columns not found in DataFrame:.*"
-    ):
-        _validate_wide_ontologies(example_data_wide, ontologies={"invalid_ontology"})
-
-    # Valid column name but invalid ontology
-    df_with_invalid = pd.DataFrame(
-        {
-            "results": [-1.0, 0.0, 1.0],
-            "invalid_ontology": ["a", "b", "c"],
-        }
-    )
-    with pytest.raises(ValueError, match="Invalid ontologies in set:.*"):
-        _validate_wide_ontologies(df_with_invalid, ontologies={"invalid_ontology"})
-
-    # Missing source column in mapping
-    with pytest.raises(ValueError, match="Source columns not found in DataFrame:.*"):
-        _validate_wide_ontologies(
-            example_data_wide, ontologies={"missing_column": "reactome"}
-        )
-
-    # Invalid target ontology in mapping
-    with pytest.raises(ValueError, match="Invalid ontologies in mapping:.*"):
-        _validate_wide_ontologies(
-            example_data_wide, ontologies={"chebi": "invalid_ontology"}
-        )
-
-    # DataFrame with no valid ontology columns
-    invalid_df = pd.DataFrame(
-        {"results": [-1.0, 0.0, 1.0], "col1": ["a", "b", "c"], "col2": ["d", "e", "f"]}
-    )
-    with pytest.raises(
-        ValueError, match="No valid ontology columns found in DataFrame.*"
-    ):
-        _validate_wide_ontologies(invalid_df)
-
-
-def test_ensure_feature_id_var():
-    """Test the _ensure_feature_id_var function with various input cases."""
-    from napistu.mechanism_matching import _ensure_feature_id_var
-    from napistu.constants import FEATURE_ID_VAR_DEFAULT
-
-    # Test case 1: DataFrame already has feature_id column
-    df1 = pd.DataFrame({"feature_id": [100, 200, 300], "data": ["a", "b", "c"]})
-    result1 = _ensure_feature_id_var(df1)
-    # Should return unchanged DataFrame
-    pd.testing.assert_frame_equal(df1, result1)
-
-    # Test case 2: DataFrame missing feature_id column
-    df2 = pd.DataFrame({"data": ["x", "y", "z"]})
-    result2 = _ensure_feature_id_var(df2)
-    # Should add feature_id column with sequential integers
-    assert FEATURE_ID_VAR_DEFAULT in result2.columns
-    assert list(result2[FEATURE_ID_VAR_DEFAULT]) == [0, 1, 2]
-    assert list(result2["data"]) == ["x", "y", "z"]  # Original data preserved
-
-    # Test case 3: Custom feature_id column name
-    df3 = pd.DataFrame({"data": ["p", "q", "r"]})
-    custom_id = "custom_feature_id"
-    result3 = _ensure_feature_id_var(df3, feature_id_var=custom_id)
-    # Should add custom named feature_id column
-    assert custom_id in result3.columns
-    assert list(result3[custom_id]) == [0, 1, 2]
-    assert list(result3["data"]) == ["p", "q", "r"]  # Original data preserved
-
-    # Test case 4: Empty DataFrame
-    df4 = pd.DataFrame()
-    result4 = _ensure_feature_id_var(df4)
-    # Should handle empty DataFrame gracefully
-    assert FEATURE_ID_VAR_DEFAULT in result4.columns
-    assert len(result4) == 0
-
-
-def test_match_by_ontology_and_identifier():
-    """Test the match_by_ontology_and_identifier function with various input types."""
-    # Setup test data
-    feature_identifiers = pd.DataFrame(
-        {
-            "ontology": ["chebi", "chebi", "uniprot", "uniprot", "reactome"],
-            "identifier": ["15377", "16810", "P12345", "Q67890", "R12345"],
-            "results": [1.0, 2.0, -1.0, -2.0, 0.5],
-        }
-    )
-
-    species_identifiers = pd.DataFrame(
-        {
-            "ontology": ["chebi", "chebi", "uniprot", "uniprot", "ensembl_gene"],
-            "identifier": ["15377", "17925", "P12345", "O43826", "ENSG123"],
-            "s_id": ["s1", "s2", "s3", "s4", "s5"],
-            "s_name": ["compound1", "compound2", "protein1", "protein2", "gene1"],
-            "bqb": ["BQB_IS"] * 5,  # Add required bqb column with BQB_IS values
-        }
-    )
-
-    # Test with single ontology (string)
-    result = match_by_ontology_and_identifier(
-        feature_identifiers=feature_identifiers,
-        species_identifiers=species_identifiers,
-        ontologies="chebi",
-    )
-    assert len(result) == 1  # Only one matching chebi identifier
-    assert result.iloc[0]["identifier"] == "15377"
-    assert result.iloc[0]["results"] == 1.0
-    assert result.iloc[0]["ontology"] == "chebi"  # From species_identifiers
-    assert result.iloc[0]["s_name"] == "compound1"  # Verify join worked correctly
-    assert result.iloc[0]["bqb"] == "BQB_IS"  # Verify bqb column is preserved
-
-    # Test with multiple ontologies (set)
-    result = match_by_ontology_and_identifier(
-        feature_identifiers=feature_identifiers,
-        species_identifiers=species_identifiers,
-        ontologies={"chebi", "uniprot"},
-    )
-    assert len(result) == 2  # One chebi and one uniprot match
-    assert set(result["ontology"]) == {"chebi", "uniprot"}  # From species_identifiers
-    assert set(result["identifier"]) == {"15377", "P12345"}
-    # Verify results are correctly matched
-    chebi_row = result[result["ontology"] == "chebi"].iloc[0]
-    uniprot_row = result[result["ontology"] == "uniprot"].iloc[0]
-    assert chebi_row["results"] == 1.0
-    assert uniprot_row["results"] == -1.0
-    assert chebi_row["s_name"] == "compound1"
-    assert uniprot_row["s_name"] == "protein1"
-    assert chebi_row["bqb"] == "BQB_IS"
-    assert uniprot_row["bqb"] == "BQB_IS"
-
-    # Test with list of ontologies
-    result = match_by_ontology_and_identifier(
-        feature_identifiers=feature_identifiers,
-        species_identifiers=species_identifiers,
-        ontologies=["chebi", "uniprot"],
-    )
-    assert len(result) == 2
-    assert set(result["ontology"]) == {"chebi", "uniprot"}  # From species_identifiers
-
-    # Test with no matches
-    no_match_features = pd.DataFrame(
-        {"ontology": ["chebi"], "identifier": ["99999"], "results": [1.0]}
-    )
-    result = match_by_ontology_and_identifier(
-        feature_identifiers=no_match_features,
-        species_identifiers=species_identifiers,
-        ontologies="chebi",
-    )
-    assert len(result) == 0
-
-    # Test with empty features
-    empty_features = pd.DataFrame({"ontology": [], "identifier": [], "results": []})
-    result = match_by_ontology_and_identifier(
-        feature_identifiers=empty_features,
-        species_identifiers=species_identifiers,
-        ontologies={"chebi", "uniprot"},
-    )
-    assert len(result) == 0
-
-    # Test with invalid ontology
-    with pytest.raises(ValueError, match="Invalid ontologies specified:.*"):
-        match_by_ontology_and_identifier(
-            feature_identifiers=feature_identifiers,
-            species_identifiers=species_identifiers,
-            ontologies="invalid_ontology",
-        )
-
-    # Test with ontology not in feature_identifiers
-    result = match_by_ontology_and_identifier(
-        feature_identifiers=feature_identifiers,
-        species_identifiers=species_identifiers,
-        ontologies={"ensembl_gene"},  # Only in species_identifiers
-    )
-    assert len(result) == 0
-
-    # Test with custom feature_identifiers_var
-    feature_identifiers_custom = feature_identifiers.rename(
-        columns={"identifier": "custom_id"}
-    )
-    result = match_by_ontology_and_identifier(
-        feature_identifiers=feature_identifiers_custom,
-        species_identifiers=species_identifiers,
-        ontologies={"chebi"},
-        feature_identifiers_var="custom_id",
-    )
-    assert len(result) == 1
-    assert result.iloc[0]["custom_id"] == "15377"
-    assert result.iloc[0]["ontology"] == "chebi"  # From species_identifiers
-    assert result.iloc[0]["s_name"] == "compound1"
-    assert result.iloc[0]["bqb"] == "BQB_IS"
-
-
-def test_match_features_to_wide_pathway_species(sbml_dfs_glucose_metabolism):
-
-    def compare_frame_contents(df1, df2):
-        """
-        Compare if two DataFrames have the same content, ignoring index and column ordering.
-
-        Parameters
-        ----------
-        df1 : pd.DataFrame
-            First DataFrame to compare
-        df2 : pd.DataFrame
-            Second DataFrame to compare
-
-        Returns
-        -------
-        None
-        """
-        df1_sorted = (
-            df1.reindex(columns=sorted(df1.columns))
-            .sort_values(sorted(df1.columns))
-            .reset_index(drop=True)
-        )
-
-        df2_sorted = (
-            df2.reindex(columns=sorted(df2.columns))
-            .sort_values(sorted(df2.columns))
-            .reset_index(drop=True)
-        )
-
-        pd.testing.assert_frame_equal(df1_sorted, df2_sorted, check_like=True)
-
-        return None
-
-    species_identifiers = (
-        sbml_dfs_glucose_metabolism.get_identifiers("species")
-        .query("bqb == 'BQB_IS'")
-        .query("ontology != 'reactome'")
-    )
-
-    # create a table whose index is s_ids and columns are faux-measurements
-    example_data = species_identifiers.groupby("ontology").head(10)[
-        ["ontology", "identifier"]
-    ]
-
-    example_data["results_a"] = np.random.randn(len(example_data))
-    example_data["results_b"] = np.random.randn(len(example_data))
-    # add a feature_id column to the example_data which tracks the row of the original data
-    example_data["feature_id"] = range(0, len(example_data))
-
-    # pivot (identifier, ontology) to columns for each ontology
-    example_data_wide = (
-        example_data.pivot(
-            columns="ontology",
-            values="identifier",
-            index=["feature_id", "results_a", "results_b"],
-        )
-        .reset_index()
-        .rename_axis(None, axis=1)
-    )
-
-    # options, for matching
-    # 1. match by identifier and a set of ontologies (provided by arg).
-    matched_s_ids = mechanism_matching.features_to_pathway_species(
-        feature_identifiers=example_data.drop(columns="ontology"),
-        species_identifiers=species_identifiers,
-        ontologies={"uniprot", "chebi"},
-        feature_identifiers_var="identifier",
-    )
-
-    # 2. match by identifier and ontology.
-    matched_s_ids_w_ontologies = mechanism_matching.match_by_ontology_and_identifier(
-        feature_identifiers=example_data,
-        species_identifiers=species_identifiers,
-        ontologies={"uniprot", "chebi"},
-        feature_identifiers_var="identifier",
-    )
-
-    # 3. format wide identifier sets into a table with a single identifier column and apply strategy #2.
-    matched_s_ids_from_wide = mechanism_matching.match_features_to_wide_pathway_species(
-        example_data_wide,
-        species_identifiers,
-        ontologies={"uniprot", "chebi"},
-        feature_identifiers_var="identifier",
-    )
-
-    compare_frame_contents(
-        matched_s_ids.drop(columns="s_Source"),
-        matched_s_ids_w_ontologies.drop(columns="s_Source"),
-    )
-    compare_frame_contents(
-        matched_s_ids.drop(columns="s_Source"),
-        matched_s_ids_from_wide.drop(columns="s_Source"),
-    )
-
-
-def test_resolve_matches_with_example_data():
-    """Test resolve_matches function with example data for all aggregation methods."""
-    # Setup example data with overlapping 1-to-many and many-to-1 cases
-    example_data = pd.DataFrame(
-        {
-            FEATURE_ID_VAR_DEFAULT: ["A", "B", "C", "D", "D", "E", "B", "B", "C"],
-            SBML_DFS.S_ID: [
-                "s_id_1",
-                "s_id_1",
-                "s_id_1",
-                "s_id_4",
-                "s_id_5",
-                "s_id_6",
-                "s_id_2",
-                "s_id_3",
-                "s_id_3",
-            ],
-            "results_a": [1, 2, 3, 0.4, 5, 6, 0.7, 0.8, 9],
-            "results_b": [
-                "foo",
-                "foo",
-                "bar",
-                "bar",
-                "baz",
-                "baz",
-                "not",
-                "not",
-                "not",
-            ],
-        }
-    )
-
-    # Test that missing feature_id raises KeyError
-    data_no_id = pd.DataFrame(
-        {
-            SBML_DFS.S_ID: ["s_id_1", "s_id_1", "s_id_2"],
-            "results_a": [1, 2, 3],
-            "results_b": ["foo", "bar", "baz"],
-        }
-    )
-    with pytest.raises(KeyError, match=FEATURE_ID_VAR_DEFAULT):
-        resolve_matches(data_no_id)
-
-    # Test with keep_id_col=True (default)
-    result_with_id = resolve_matches(
-        example_data, keep_id_col=True, numeric_agg=RESOLVE_MATCHES_AGGREGATORS.MEAN
-    )
-
-    # Verify feature_id column is present and correctly aggregated
-    assert FEATURE_ID_VAR_DEFAULT in result_with_id.columns
-    assert result_with_id.loc["s_id_1", FEATURE_ID_VAR_DEFAULT] == "A,B,C"
-    assert result_with_id.loc["s_id_3", FEATURE_ID_VAR_DEFAULT] == "B,C"
-
-    # Test with keep_id_col=False
-    result_without_id = resolve_matches(
-        example_data, keep_id_col=False, numeric_agg=RESOLVE_MATCHES_AGGREGATORS.MEAN
-    )
-
-    # Verify feature_id column is not in output
-    assert FEATURE_ID_VAR_DEFAULT not in result_without_id.columns
-
-    # Verify other columns are still present and correctly aggregated
-    assert "results_a" in result_without_id.columns
-    assert "results_b" in result_without_id.columns
-    assert "feature_id_match_count" in result_without_id.columns
-
-    # Verify numeric aggregation still works
-    actual_mean = result_without_id.loc["s_id_1", "results_a"]
-    expected_mean = 2.0  # (1 + 2 + 3) / 3
-    assert (
-        actual_mean == expected_mean
-    ), f"Expected mean {expected_mean}, but got {actual_mean}"
-
-    # Verify string aggregation still works
-    assert result_without_id.loc["s_id_1", "results_b"] == "bar,foo"
-
-    # Verify match counts are still present
-    assert result_without_id.loc["s_id_1", "feature_id_match_count"] == 3
-    assert result_without_id.loc["s_id_3", "feature_id_match_count"] == 2
-
-    # Test maximum aggregation
-    max_result = resolve_matches(
-        example_data, numeric_agg=RESOLVE_MATCHES_AGGREGATORS.MAX
-    )
-
-    # Verify maximum values are correct
-    assert max_result.loc["s_id_1", "results_a"] == 3.0  # max of [1, 2, 3]
-    assert max_result.loc["s_id_3", "results_a"] == 9.0  # max of [0.8, 9]
-    assert max_result.loc["s_id_4", "results_a"] == 0.4  # single value
-    assert max_result.loc["s_id_5", "results_a"] == 5.0  # single value
-    assert max_result.loc["s_id_6", "results_a"] == 6.0  # single value
-
-    # Test weighted mean (feature_id is used for weights regardless of keep_id_col)
-    weighted_result = resolve_matches(
-        example_data,
-        numeric_agg=RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
-        keep_id_col=True,
-    )
-
-    # For s_id_1:
-    # A appears once in total (weight = 1/1)
-    # B appears three times in total (weight = 1/3)
-    # C appears twice in total (weight = 1/2)
-    # Sum of unnormalized weights = 1 + 1/3 + 1/2 = 1.833
-    # Normalized weights:
-    # A: (1/1)/1.833 = 0.545
-    # B: (1/3)/1.833 = 0.182
-    # C: (1/2)/1.833 = 0.273
-    # Weighted mean = 1×0.545 + 2×0.182 + 3×0.273 = 1.73
-    actual_weighted_mean_1 = weighted_result.loc["s_id_1", "results_a"]
-    expected_weighted_mean_1 = 1.73
-    assert (
-        abs(actual_weighted_mean_1 - expected_weighted_mean_1) < 0.01
-    ), f"s_id_1 weighted mean: expected {expected_weighted_mean_1:.3f}, but got {actual_weighted_mean_1:.3f}"
-
-    # For s_id_3:
-    # B appears three times in total (weight = 1/3)
-    # C appears twice in total (weight = 1/2)
-    # Sum of unnormalized weights = 1/3 + 1/2 = 0.833
-    # Normalized weights:
-    # B: (1/3)/0.833 = 0.4
-    # C: (1/2)/0.833 = 0.6
-    # Weighted mean = 0.8×0.4 + 9×0.6 = 5.72
-    actual_weighted_mean_3 = weighted_result.loc["s_id_3", "results_a"]
-    expected_weighted_mean_3 = 5.72
-    assert (
-        abs(actual_weighted_mean_3 - expected_weighted_mean_3) < 0.01
-    ), f"s_id_3 weighted mean: expected {expected_weighted_mean_3:.3f}, but got {actual_weighted_mean_3:.3f}"
-
-    # Test weighted mean with keep_id_col=False (weights still use feature_id)
-    weighted_result_no_id = resolve_matches(
-        example_data,
-        numeric_agg=RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
-        keep_id_col=False,
-    )
-
-    # Verify weighted means are the same regardless of keep_id_col
-    assert (
-        abs(weighted_result_no_id.loc["s_id_1", "results_a"] - expected_weighted_mean_1)
-        < 0.01
-    ), "Weighted mean should be the same regardless of keep_id_col"
-    assert (
-        abs(weighted_result_no_id.loc["s_id_3", "results_a"] - expected_weighted_mean_3)
-        < 0.01
-    ), "Weighted mean should be the same regardless of keep_id_col"
-
-    # Test that both versions preserve the same index structure
-    expected_index = pd.Index(
-        ["s_id_1", "s_id_2", "s_id_3", "s_id_4", "s_id_5", "s_id_6"], name="s_id"
-    )
-    pd.testing.assert_index_equal(result_with_id.index, expected_index)
-    pd.testing.assert_index_equal(result_without_id.index, expected_index)
-
-
-def test_resolve_matches_invalid_dtypes():
-    """Test that resolve_matches raises an error for unsupported dtypes."""
-    # Setup data with boolean and datetime columns
-    data = pd.DataFrame(
-        {
-            FEATURE_ID_VAR_DEFAULT: ["A", "B", "B", "C"],
-            "bool_col": [True, False, True, False],
-            "datetime_col": [
-                datetime(2024, 1, 1),
-                datetime(2024, 1, 2),
-                datetime(2024, 1, 3),
-                datetime(2024, 1, 4),
-            ],
-            "s_id": ["s1", "s1", "s2", "s2"],
-        }
-    )
-
-    # Should raise TypeError for unsupported dtypes
-    with pytest.raises(TypeError, match="Unsupported data types"):
-        resolve_matches(data)
-
-
-def test_resolve_matches_first_method():
-    """Test resolve_matches with first method."""
-    # Setup data with known order
-    data = pd.DataFrame(
-        {
-            FEATURE_ID_VAR_DEFAULT: ["A", "C", "B", "B", "A"],
-            SBML_DFS.S_ID: ["s1", "s1", "s1", "s2", "s2"],
-            "value": [1, 2, 3, 4, 5],
-        }
-    )
-
-    result = resolve_matches(data, numeric_agg=RESOLVE_MATCHES_AGGREGATORS.FIRST)
-
-    # Should take first value after sorting by feature_id
-    assert result.loc["s1", "value"] == 1  # A comes first
-    assert result.loc["s2", "value"] == 5  # A comes first
-
-
-def test_resolve_matches_deduplicate_feature_id_within_sid():
-    """Test that only the first value for each (s_id, feature_id) is used in mean aggregation."""
-    data = pd.DataFrame(
-        {
-            FEATURE_ID_VAR_DEFAULT: ["A", "A", "B"],
-            SBML_DFS.S_ID: ["s1", "s1", "s1"],
-            "value": [
-                1,
-                1,
-                2,
-            ],  # average should be 1.5 because the two A's are redundant
-        }
-    )
-
-    result = resolve_matches(data, numeric_agg=RESOLVE_MATCHES_AGGREGATORS.MEAN)
-    assert result.loc["s1", "value"] == 1.5
-
-
-def test_bind_wide_results(sbml_dfs_glucose_metabolism):
-    """
-    Test that bind_wide_results correctly matches identifiers and adds results to species data.
-    """
-    # Get species identifiers, excluding reactome
-    species_identifiers = (
-        sbml_dfs_glucose_metabolism.get_identifiers(SBML_DFS.SPECIES)
-        .query("bqb == 'BQB_IS'")
-        .query("ontology != 'reactome'")
-    )
-
-    # Create example data with identifiers and results
-    example_data = species_identifiers.groupby("ontology").head(10)[
-        [IDENTIFIERS.ONTOLOGY, IDENTIFIERS.IDENTIFIER]
-    ]
-    example_data["results_a"] = np.random.randn(len(example_data))
-    example_data["results_b"] = np.random.randn(len(example_data))
-    example_data[FEATURE_ID_VAR_DEFAULT] = range(0, len(example_data))
-
-    # Create wide format data
-    example_data_wide = (
-        example_data.pivot(
-            columns=IDENTIFIERS.ONTOLOGY,
-            values=IDENTIFIERS.IDENTIFIER,
-            index=[FEATURE_ID_VAR_DEFAULT, "results_a", "results_b"],
-        )
-        .reset_index()
-        .rename_axis(None, axis=1)
-    )
-
-    # Call bind_wide_results
-    results_name = "test_results"
-    sbml_dfs_result = mechanism_matching.bind_wide_results(
-        sbml_dfs=sbml_dfs_glucose_metabolism,
-        results_df=example_data_wide,
-        results_name=results_name,
-        ontologies={ONTOLOGIES.UNIPROT, ONTOLOGIES.CHEBI},
-        dogmatic=False,
-        species_identifiers=None,
-        feature_id_var=FEATURE_ID_VAR_DEFAULT,
-        verbose=True,
-    )
-
-    # Verify the results were added correctly
-    assert (
-        results_name in sbml_dfs_result.species_data
-    ), f"{results_name} not found in species_data"
-
-    # Get the bound results
-    bound_results = sbml_dfs_result.species_data[results_name]
-
-    # columns are feature_id, results_a, results_b
-    assert set(bound_results.columns) == {
-        FEATURE_ID_VAR_DEFAULT,
-        "results_a",
-        "results_b",
-    }
-
-    assert bound_results.shape == (23, 3)
-    assert bound_results.loc["S00000056", "feature_id"] == "18,19"
-    assert bound_results.loc["S00000057", "feature_id"] == "18"
-    assert bound_results.loc["S00000010", "feature_id"] == "9"
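
The weighted-mean bookkeeping spelled out in the comments of test_resolve_matches_with_example_data above can be checked independently. Below is a minimal sketch (plain pandas, not napistu code) of the scheme those comments describe, where each feature's weight is the reciprocal of its total occurrence count and weights are renormalized within each s_id:

import pandas as pd

# Example data from the deleted test (results_a only).
df = pd.DataFrame(
    {
        "feature_id": ["A", "B", "C", "D", "D", "E", "B", "B", "C"],
        "s_id": ["s_id_1", "s_id_1", "s_id_1", "s_id_4", "s_id_5",
                 "s_id_6", "s_id_2", "s_id_3", "s_id_3"],
        "results_a": [1, 2, 3, 0.4, 5, 6, 0.7, 0.8, 9],
    }
)

# Weight each row by 1 / (total occurrences of its feature_id); this example
# has no duplicate (s_id, feature_id) pairs, so no deduplication step is needed.
df["weight"] = 1.0 / df["feature_id"].map(df["feature_id"].value_counts())
df["weighted_value"] = df["results_a"] * df["weight"]

grouped = df.groupby("s_id")
weighted_means = grouped["weighted_value"].sum() / grouped["weight"].sum()
print(weighted_means.round(2))  # s_id_1 -> 1.73, s_id_3 -> 5.72, matching the asserts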