napistu 0.2.5.dev7__py3-none-any.whl → 0.3.1.dev1__py3-none-any.whl
This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- napistu/__init__.py +1 -3
- napistu/__main__.py +126 -96
- napistu/constants.py +35 -41
- napistu/context/__init__.py +10 -0
- napistu/context/discretize.py +462 -0
- napistu/context/filtering.py +387 -0
- napistu/gcs/__init__.py +1 -1
- napistu/identifiers.py +74 -15
- napistu/indices.py +68 -0
- napistu/ingestion/__init__.py +1 -1
- napistu/ingestion/bigg.py +47 -62
- napistu/ingestion/constants.py +18 -133
- napistu/ingestion/gtex.py +113 -0
- napistu/ingestion/hpa.py +147 -0
- napistu/ingestion/sbml.py +0 -97
- napistu/ingestion/string.py +2 -2
- napistu/matching/__init__.py +10 -0
- napistu/matching/constants.py +18 -0
- napistu/matching/interactions.py +518 -0
- napistu/matching/mount.py +529 -0
- napistu/matching/species.py +510 -0
- napistu/mcp/__init__.py +7 -4
- napistu/mcp/__main__.py +128 -72
- napistu/mcp/client.py +16 -25
- napistu/mcp/codebase.py +201 -145
- napistu/mcp/component_base.py +170 -0
- napistu/mcp/config.py +223 -0
- napistu/mcp/constants.py +45 -2
- napistu/mcp/documentation.py +253 -136
- napistu/mcp/documentation_utils.py +13 -48
- napistu/mcp/execution.py +372 -305
- napistu/mcp/health.py +47 -65
- napistu/mcp/profiles.py +10 -6
- napistu/mcp/server.py +161 -80
- napistu/mcp/tutorials.py +139 -87
- napistu/modify/__init__.py +1 -1
- napistu/modify/gaps.py +1 -1
- napistu/network/__init__.py +1 -1
- napistu/network/constants.py +101 -34
- napistu/network/data_handling.py +388 -0
- napistu/network/ig_utils.py +351 -0
- napistu/network/napistu_graph_core.py +354 -0
- napistu/network/neighborhoods.py +40 -40
- napistu/network/net_create.py +373 -309
- napistu/network/net_propagation.py +47 -19
- napistu/network/{net_utils.py → ng_utils.py} +124 -272
- napistu/network/paths.py +67 -51
- napistu/network/precompute.py +11 -11
- napistu/ontologies/__init__.py +10 -0
- napistu/ontologies/constants.py +129 -0
- napistu/ontologies/dogma.py +243 -0
- napistu/ontologies/genodexito.py +649 -0
- napistu/ontologies/mygene.py +369 -0
- napistu/ontologies/renaming.py +198 -0
- napistu/rpy2/__init__.py +229 -86
- napistu/rpy2/callr.py +47 -77
- napistu/rpy2/constants.py +24 -23
- napistu/rpy2/rids.py +61 -648
- napistu/sbml_dfs_core.py +587 -222
- napistu/scverse/__init__.py +15 -0
- napistu/scverse/constants.py +28 -0
- napistu/scverse/loading.py +727 -0
- napistu/utils.py +118 -10
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/METADATA +8 -3
- napistu-0.3.1.dev1.dist-info/RECORD +133 -0
- tests/conftest.py +22 -0
- tests/test_context_discretize.py +56 -0
- tests/test_context_filtering.py +267 -0
- tests/test_identifiers.py +100 -0
- tests/test_indices.py +65 -0
- tests/{test_edgelist.py → test_ingestion_napistu_edgelist.py} +2 -2
- tests/test_matching_interactions.py +108 -0
- tests/test_matching_mount.py +305 -0
- tests/test_matching_species.py +394 -0
- tests/test_mcp_config.py +193 -0
- tests/test_mcp_documentation_utils.py +12 -3
- tests/test_mcp_server.py +156 -19
- tests/test_network_data_handling.py +397 -0
- tests/test_network_ig_utils.py +23 -0
- tests/test_network_neighborhoods.py +19 -0
- tests/test_network_net_create.py +459 -0
- tests/test_network_ng_utils.py +30 -0
- tests/test_network_paths.py +56 -0
- tests/{test_precomputed_distances.py → test_network_precompute.py} +8 -6
- tests/test_ontologies_genodexito.py +58 -0
- tests/test_ontologies_mygene.py +39 -0
- tests/test_ontologies_renaming.py +110 -0
- tests/test_rpy2_callr.py +79 -0
- tests/test_rpy2_init.py +151 -0
- tests/test_sbml.py +0 -31
- tests/test_sbml_dfs_core.py +134 -10
- tests/test_scverse_loading.py +778 -0
- tests/test_set_coverage.py +2 -2
- tests/test_utils.py +121 -1
- napistu/mechanism_matching.py +0 -1353
- napistu/rpy2/netcontextr.py +0 -467
- napistu-0.2.5.dev7.dist-info/RECORD +0 -98
- tests/test_igraph.py +0 -367
- tests/test_mechanism_matching.py +0 -784
- tests/test_net_utils.py +0 -149
- tests/test_netcontextr.py +0 -105
- tests/test_rpy2.py +0 -61
- /napistu/ingestion/{cpr_edgelist.py → napistu_edgelist.py} +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/WHEEL +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/entry_points.txt +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/licenses/LICENSE +0 -0
- {napistu-0.2.5.dev7.dist-info → napistu-0.3.1.dev1.dist-info}/top_level.txt +0 -0
- /tests/{test_obo.py → test_ingestion_obo.py} +0 -0
napistu/mechanism_matching.py
DELETED
@@ -1,1353 +0,0 @@
-from __future__ import annotations
-
-import logging
-from typing import Optional, Union, Set, Dict, List
-
-import igraph as ig
-import numpy as np
-import pandas as pd
-
-from napistu import identifiers
-from napistu import sbml_dfs_core
-from napistu import utils
-from napistu.constants import SBML_DFS
-from napistu.constants import CPR_EDGELIST
-from napistu.constants import CPR_EDGELIST_REQ_VARS
-from napistu.constants import FEATURE_ID_VAR_DEFAULT
-from napistu.constants import RESOLVE_MATCHES_AGGREGATORS
-from napistu.constants import RESOLVE_MATCHES_TMP_WEIGHT_COL
-from napistu.constants import IDENTIFIERS
-from napistu.constants import IDENTIFIER_EDGELIST_REQ_VARS
-from napistu.constants import ONTOLOGIES_LIST
-from napistu.network.constants import CPR_GRAPH_EDGES
-from napistu.network import paths
-
-logger = logging.getLogger(__name__)
-
-
-def bind_wide_results(
-    sbml_dfs: sbml_dfs_core.SBML_dfs,
-    results_df: pd.DataFrame,
-    results_name: str,
-    ontologies: Optional[Union[Set[str], Dict[str, str]]] = None,
-    dogmatic: bool = False,
-    species_identifiers: Optional[pd.DataFrame] = None,
-    feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
-    numeric_agg: str = RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
-    keep_id_col: bool = True,
-    verbose: bool = False,
-) -> sbml_dfs_core.SBML_dfs:
-    """
-    Binds wide results to a sbml_dfs object.
-
-    Take a table with molecular species-level attributes tied to systematic identifiers and match them to an sbml_dfs_model transferring these attributes to species_data
-
-    Parameters
-    ----------
-    sbml_dfs : sbml_dfs_core.SBML_dfs
-        The sbml_dfs object to bind the results to.
-    results_df : pd.DataFrame
-        The table containing the results to bind.
-    results_name : str
-        The name of the results to bind.
-    ontologies : Optional[Union[Set[str], Dict[str, str]]], default=None
-        Either:
-        - Set of columns to treat as ontologies (these should be entries in ONTOLOGIES_LIST )
-        - Dict mapping wide column names to ontology names in the ONTOLOGIES_LIST controlled vocabulary
-        - None to automatically detect valid ontology columns based on ONTOLOGIES_LIST
-    dogmatic : bool
-        Whether to respect differences between genes, transcripts, and proteins (True) or ignore them (False).
-    species_identifiers : Optional[pd.DataFrame]
-        Systematic identifiers for the molecular species "sbml_dfs". If None this will be generate on-the-fly.
-    feature_id_var : str
-        The name of the column in the results_df that contains the feature identifiers. If this does not exist it will be created.
-    numeric_agg : str
-        The aggregation method to use for resolving degeneracy.
-    keep_id_col : bool
-        Whether to keep the identifier column in the results_df.
-    verbose : bool
-        Whether to log cases of 1-to-many and many-to-one mapping and to indicate the behavior for resolving degeneracy
-
-    Returns
-    -------
-    sbml_dfs : sbml_dfs_core.SBML_dfs
-        The sbml_dfs object with the results bound.
-    """
-
-    species_identifiers = identifiers._prepare_species_identifiers(
-        sbml_dfs, dogmatic=dogmatic, species_identifiers=species_identifiers
-    )
-
-    # match
-    matched_s_ids_from_wide = match_features_to_wide_pathway_species(
-        results_df,
-        species_identifiers,
-        ontologies=ontologies,
-        feature_id_var=feature_id_var,
-        verbose=verbose,
-    )
-
-    disambiguated_matches = resolve_matches(
-        matched_data=matched_s_ids_from_wide,
-        feature_id_var=feature_id_var,
-        numeric_agg=numeric_agg,
-        keep_id_col=keep_id_col,
-    )
-
-    clean_species_data = utils.drop_extra_cols(
-        results_df, disambiguated_matches, always_include=[feature_id_var]
-    )
-
-    sbml_dfs.add_species_data(results_name, clean_species_data)
-
-    return sbml_dfs
-
-
-def features_to_pathway_species(
-    feature_identifiers: pd.DataFrame,
-    species_identifiers: pd.DataFrame,
-    ontologies: set,
-    feature_identifiers_var: str = IDENTIFIERS.IDENTIFIER,
-    feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
-    expand_identifiers: bool = False,
-    identifier_delimiter: str = "/",
-    verbose: bool = False,
-) -> pd.DataFrame:
-    """
-    Features to Pathway Species
-
-    Match a table of molecular species to their corresponding species in a pathway representation.
-
-    Parameters:
-    feature_identifiers: pd.DataFrame
-        pd.Dataframe containing a "feature_identifiers_var" variable used to match entries
-    species_identifiers: pd.DataFrame
-        A table of molecular species identifiers produced from sbml_dfs.get_identifiers("species")
-        generally using sbml_dfs_core.export_sbml_dfs()
-    ontologies: set
-        A set of ontologies used to match features to pathway species
-    feature_identifiers_var: str
-        Variable in "feature_identifiers" containing identifiers
-    expand_identifiers: bool, default=False
-        If True, split identifiers in feature_identifiers_var by identifier_delimiter and explode into multiple rows
-    identifier_delimiter: str, default="/"
-        Delimiter to use for splitting identifiers if expand_identifiers is True
-    verbose: bool, default=False
-        If True, log mapping statistics at the end of the function
-
-    Returns:
-    pathway_species: pd.DataFrame
-        species_identifiers joined to feature_identifiers based on shared identifiers
-    """
-
-    # Check for identifier column
-    if feature_identifiers_var not in feature_identifiers.columns.to_list():
-        raise ValueError(
-            f"{feature_identifiers_var} must be a variable in 'feature_identifiers', "
-            f"possible variables are {', '.join(feature_identifiers.columns.tolist())}"
-        )
-
-    # Respect or create feature_id column
-    feature_identifiers = _ensure_feature_id_var(feature_identifiers, feature_id_var)
-
-    # Optionally expand identifiers into multiple rows
-    if expand_identifiers:
-        # Count the number of expansions by counting delimiters
-        n_expansions = (
-            feature_identifiers[feature_identifiers_var]
-            .astype(str)
-            .str.count(identifier_delimiter)
-            .sum()
-        )
-        if n_expansions > 0:
-            logger.info(
-                f"Expanding identifiers: {n_expansions} delimiters found in '{feature_identifiers_var}', will expand to more rows."
-            )
-
-        # Split, strip whitespace, and explode
-        feature_identifiers = feature_identifiers.copy()
-        feature_identifiers[feature_identifiers_var] = (
-            feature_identifiers[feature_identifiers_var]
-            .astype(str)
-            .str.split(identifier_delimiter)
-            .apply(lambda lst: [x.strip() for x in lst])
-        )
-        feature_identifiers = feature_identifiers.explode(
-            feature_identifiers_var, ignore_index=True
-        )
-
-    # check identifiers table
-    identifiers._check_species_identifiers_table(species_identifiers)
-
-    available_ontologies = set(species_identifiers[IDENTIFIERS.ONTOLOGY].tolist())
-    unavailable_ontologies = ontologies.difference(available_ontologies)
-
-    # no ontologies present
-    if len(unavailable_ontologies) == len(ontologies):
-        raise ValueError(
-            f"None of the requested ontologies ({', '.join(ontologies)}) "
-            "were used to annotate pathway species. Available ontologies are: "
-            f"{', '.join(available_ontologies)}"
-        )
-
-    # 1+ desired ontologies are not present
-    if len(unavailable_ontologies) > 0:
-        raise ValueError(
-            f"Some of the requested ontologies ({', '.join(unavailable_ontologies)}) "
-            "were NOT used to annotate pathway species. Available ontologies are: "
-            f"{', '.join(available_ontologies)}"
-        )
-
-    relevant_identifiers = species_identifiers[
-        species_identifiers[IDENTIFIERS.ONTOLOGY].isin(ontologies)
-    ]
-
-    # map features to pathway species
-    pathway_species = feature_identifiers.merge(
-        relevant_identifiers,
-        left_on=feature_identifiers_var,
-        right_on=IDENTIFIERS.IDENTIFIER,
-    )
-
-    if pathway_species.shape[0] == 0:
-        logger.warning(
-            "None of the provided species identifiers matched entries of the pathway; returning None"
-        )
-        None
-
-    # report the fraction of unmapped species
-    if verbose:
-        _log_feature_species_mapping_stats(pathway_species, feature_id_var)
-
-    return pathway_species
-
-
-def edgelist_to_pathway_species(
-    formatted_edgelist: pd.DataFrame,
-    species_identifiers: pd.DataFrame,
-    ontologies: set,
-    feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
-    verbose: bool = False,
-) -> pd.DataFrame:
-    """
-    Edgelist to Pathway Species
-
-    Match an edgelist of molecular species pairs to their corresponding species in a pathway representation.
-
-    Parameters:
-    formatted_edgelist: pd.DataFrame
-        pd.Dataframe containing a "identifier_upstream" and "identifier_downstream" variables used to to match entries
-    species_identifiers: pd.DataFrame
-        A table of molecular species identifiers produced from sbml_dfs.get_identifiers("species") generally using
-        sbml_dfs_core.export_sbml_dfs()
-    ontologies: set
-        A set of ontologies used to match features to pathway species
-    feature_id_var: str, default=FEATURE_ID_VAR_DEFAULT
-        Variable in "formatted_edgelist" containing feature ids
-    verbose: bool, default=False
-        Whether to print verbose output
-
-    Returns:
-    edges_on_pathway: pd.DataFrame
-        formatted_edgelist with upstream features mapped
-        to "s_id_upstream" and downstream species mapped
-        to "s_id_downstream"
-    """
-
-    required_vars_distinct_features = {
-        CPR_EDGELIST.IDENTIFIER_UPSTREAM,
-        CPR_EDGELIST.IDENTIFIER_DOWNSTREAM,
-    }
-    missing_required_vars_distinct_features = (
-        required_vars_distinct_features.difference(
-            set(formatted_edgelist.columns.tolist())
-        )
-    )
-
-    if len(missing_required_vars_distinct_features) > 0:
-        raise ValueError(
-            f"{len(missing_required_vars_distinct_features)} required variables were "
-            "missing from 'formatted_edgelist': "
-            f"{', '.join(missing_required_vars_distinct_features)}"
-        )
-
-    # define all distinct identifiers in edgelist
-    distinct_identifiers = (
-        pd.concat(
-            [
-                formatted_edgelist[CPR_EDGELIST.IDENTIFIER_UPSTREAM],
-                formatted_edgelist[CPR_EDGELIST.IDENTIFIER_DOWNSTREAM],
-            ]
-        )
-        .drop_duplicates()
-        .reset_index(drop=True)
-        .to_frame()
-        .rename({0: feature_id_var}, axis=1)
-    )
-
-    # merge edgelist identifiers with pathway identifiers to map s_ids to identifiers
-    features_on_pathway = features_to_pathway_species(
-        feature_identifiers=distinct_identifiers,
-        species_identifiers=species_identifiers,
-        ontologies=ontologies,
-        feature_identifiers_var=feature_id_var,
-        verbose=verbose,
-    )
-
-    # add s_ids of both upstream and downstream edges to pathway
-    edges_on_pathway = formatted_edgelist.merge(
-        features_on_pathway[[SBML_DFS.S_ID, IDENTIFIERS.IDENTIFIER]].rename(
-            {
-                SBML_DFS.S_ID: CPR_EDGELIST.S_ID_UPSTREAM,
-                IDENTIFIERS.IDENTIFIER: CPR_EDGELIST.IDENTIFIER_UPSTREAM,
-            },
-            axis=1,
-        )
-    ).merge(
-        features_on_pathway[[SBML_DFS.S_ID, IDENTIFIERS.IDENTIFIER]].rename(
-            {
-                SBML_DFS.S_ID: CPR_EDGELIST.S_ID_DOWNSTREAM,
-                IDENTIFIERS.IDENTIFIER: CPR_EDGELIST.IDENTIFIER_DOWNSTREAM,
-            },
-            axis=1,
-        )
-    )
-
-    return edges_on_pathway
-
-
-def match_features_to_wide_pathway_species(
-    wide_df: pd.DataFrame,
-    species_identifiers: pd.DataFrame,
-    ontologies: Optional[Union[Set[str], Dict[str, str]]] = None,
-    feature_identifiers_var: str = IDENTIFIERS.IDENTIFIER,
-    feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
-    verbose: bool = False,
-) -> pd.DataFrame:
-    """
-    Convert a wide-format DataFrame with multiple ontology columns to long format,
-    and match features to pathway species by ontology and identifier.
-
-    Parameters
-    ----------
-    wide_df : pd.DataFrame
-        DataFrame with ontology identifier columns and any number of results columns.
-        All non-ontology columns are treated as results.
-    species_identifiers : pd.DataFrame
-        DataFrame as required by features_to_pathway_species
-    ontologies : Optional[Union[Set[str], Dict[str, str]]], default=None
-        Either:
-        - Set of columns to treat as ontologies (these should be entries in ONTOLOGIES_LIST )
-        - Dict mapping wide column names to ontology names in the ONTOLOGIES_LIST controlled vocabulary
-        - None to automatically detect valid ontology columns based on ONTOLOGIES_LIST
-    feature_identifiers_var : str, default="identifier"
-        Name for the identifier column in the long format
-    feature_id_var: str, default=FEATURE_ID_VAR_DEFAULT
-        Name for the feature id column in the long format
-    verbose : bool, default=False
-        Whether to print verbose output
-
-    Returns
-    -------
-    pd.DataFrame
-        Output of match_by_ontology_and_identifier
-
-    Examples
-    --------
-    >>> # Example with auto-detected ontology columns and multiple results
-    >>> wide_df = pd.DataFrame({
-    ...     'uniprot': ['P12345', 'Q67890'],
-    ...     'chebi': ['15377', '16810'],
-    ...     'log2fc': [1.0, 2.0],
-    ...     'pvalue': [0.01, 0.05]
-    ... })
-    >>> result = match_features_to_wide_pathway_species(
-    ...     wide_df=wide_df,
-    ...     species_identifiers=species_identifiers
-    ... )
-
-    >>> # Example with custom ontology mapping
-    >>> wide_df = pd.DataFrame({
-    ...     'protein_id': ['P12345', 'Q67890'],
-    ...     'compound_id': ['15377', '16810'],
-    ...     'expression': [1.0, 2.0],
-    ...     'confidence': [0.8, 0.9]
-    ... })
-    >>> result = match_features_to_wide_pathway_species(
-    ...     wide_df=wide_df,
-    ...     species_identifiers=species_identifiers,
-    ...     ontologies={'protein_id': 'uniprot', 'compound_id': 'chebi'}
-    ... )
-    """
-    # Make a copy to avoid modifying the input
-    wide_df = wide_df.copy()
-
-    # Validate ontologies and get the set of ontology columns
-    ontology_cols = _validate_wide_ontologies(wide_df, ontologies)
-    melt_cols = list(ontology_cols)
-
-    # Apply renaming if a mapping is provided
-    if isinstance(ontologies, dict):
-        wide_df = wide_df.rename(columns=ontologies)
-
-    # Ensure feature_id column exists
-    wide_df = _ensure_feature_id_var(wide_df, feature_id_var)
-
-    # All non-ontology columns are treated as results
-    results_cols = list(set(wide_df.columns) - set(melt_cols))
-    if not results_cols:
-        raise ValueError("No results columns found in DataFrame")
-
-    logger.info(f"Using columns as results: {results_cols}")
-
-    # Melt ontology columns to long format, keeping all results columns
-    long_df = wide_df.melt(
-        id_vars=results_cols,
-        value_vars=melt_cols,
-        var_name=IDENTIFIERS.ONTOLOGY,
-        value_name=feature_identifiers_var,
-    ).dropna(subset=[feature_identifiers_var])
-
-    logger.debug(f"Final long format shape: {long_df.shape}")
-
-    # Call the matching function with the validated ontologies
-    out = match_by_ontology_and_identifier(
-        feature_identifiers=long_df,
-        species_identifiers=species_identifiers,
-        ontologies=ontology_cols,
-        feature_identifiers_var=feature_identifiers_var,
-    )
-
-    if verbose:
-        _log_feature_species_mapping_stats(out, feature_id_var)
-
-    return out
-
-
-def match_by_ontology_and_identifier(
-    feature_identifiers: pd.DataFrame,
-    species_identifiers: pd.DataFrame,
-    ontologies: Union[str, Set[str], List[str]],
-    feature_identifiers_var: str = IDENTIFIERS.IDENTIFIER,
-    verbose: bool = False,
-) -> pd.DataFrame:
-    """
-    Match features to pathway species based on both ontology and identifier matches.
-    Performs separate matching for each ontology and concatenates the results.
-
-    Parameters
-    ----------
-    feature_identifiers : pd.DataFrame
-        DataFrame containing feature identifiers and results.
-        Must have columns [ontology, feature_identifiers_var, results]
-    species_identifiers : pd.DataFrame
-        DataFrame containing species identifiers from pathway.
-        Must have columns [ontology, identifier]
-    ontologies : Union[str, Set[str], List[str]]
-        Ontologies to match on. Can be:
-        - A single ontology string
-        - A set of ontology strings
-        - A list of ontology strings
-    feature_identifiers_var : str, default="identifier"
-        Name of the identifier column in feature_identifiers
-    verbose : bool, default=False
-        Whether to print verbose output
-
-    Returns
-    -------
-    pd.DataFrame
-        Concatenated results of matching for each ontology.
-        Contains all columns from features_to_pathway_species()
-
-    Examples
-    --------
-    >>> # Match using a single ontology
-    >>> result = match_by_ontology_and_identifier(
-    ...     feature_identifiers=features_df,
-    ...     species_identifiers=species_df,
-    ...     ontologies="uniprot"
-    ... )
-
-    >>> # Match using multiple ontologies
-    >>> result = match_by_ontology_and_identifier(
-    ...     feature_identifiers=features_df,
-    ...     species_identifiers=species_df,
-    ...     ontologies={"uniprot", "chebi"}
-    ... )
-    """
-    # Convert string to set for consistent handling
-    if isinstance(ontologies, str):
-        ontologies = {ontologies}
-    elif isinstance(ontologies, list):
-        ontologies = set(ontologies)
-
-    # Validate ontologies
-    invalid_onts = ontologies - set(ONTOLOGIES_LIST)
-    if invalid_onts:
-        raise ValueError(
-            f"Invalid ontologies specified: {invalid_onts}. Must be one of: {ONTOLOGIES_LIST}"
-        )
-
-    # Initialize list to store results
-    matched_dfs = []
-
-    # Process each ontology separately
-    for ont in ontologies:
-        # Filter feature identifiers to current ontology and drop ontology column
-        ont_features = (
-            feature_identifiers[feature_identifiers[IDENTIFIERS.ONTOLOGY] == ont]
-            .drop(columns=[IDENTIFIERS.ONTOLOGY])
-            .copy()
-        )
-
-        if ont_features.empty:
-            logger.warning(f"No features found for ontology: {ont}")
-            continue
-
-        # Filter species identifiers to current ontology
-        ont_species = species_identifiers[
-            species_identifiers[IDENTIFIERS.ONTOLOGY] == ont
-        ].copy()
-
-        if ont_species.empty:
-            logger.warning(f"No species found for ontology: {ont}")
-            continue
-
-        logger.debug(
-            f"Matching {len(ont_features)} features to {len(ont_species)} species for ontology {ont}"
-        )
-
-        # Match features to species for this ontology
-        matched = features_to_pathway_species(
-            feature_identifiers=ont_features,
-            species_identifiers=ont_species,
-            ontologies={ont},
-            feature_identifiers_var=feature_identifiers_var,
-            verbose=verbose,
-        )
-
-        if matched.empty:
-            logger.warning(f"No matches found for ontology: {ont}")
-            continue
-
-        matched_dfs.append(matched)
-
-    if not matched_dfs:
-        logger.warning("No matches found for any ontology")
-        return pd.DataFrame()  # Return empty DataFrame with correct columns
-
-    # Combine results from all ontologies
-    result = pd.concat(matched_dfs, axis=0, ignore_index=True)
-
-    logger.info(
-        f"Found {len(result)} total matches across {len(matched_dfs)} ontologies"
-    )
-
-    return result
-
-
-def resolve_matches(
-    matched_data: pd.DataFrame,
-    feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
-    index_col: str = SBML_DFS.S_ID,
-    numeric_agg: str = RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
-    keep_id_col: bool = True,
-) -> pd.DataFrame:
-    """
-    Resolve many-to-1 and 1-to-many matches in matched data.
-
-    Parameters
-    ----------
-    matched_data : pd.DataFrame
-        DataFrame containing matched data with columns:
-        - feature_id_var: identifier column (e.g. feature_id)
-        - index_col: index column (e.g. s_id)
-        - other columns: data columns to be aggregated
-    feature_id_var : str, default="feature_id"
-        Name of the identifier column
-    index_col : str, default="s_id"
-        Name of the column to use as index
-    numeric_agg : str, default="weighted_mean"
-        Method to aggregate numeric columns:
-        - "weighted_mean": weighted by inverse of feature_id frequency (default)
-        - "mean": simple arithmetic mean
-        - "first": first value after sorting by feature_id_var (requires feature_id_var)
-        - "max": maximum value
-    keep_id_col : bool, default=True
-        Whether to keep and rollup the feature_id_var in the output.
-        If False, feature_id_var will be dropped from the output.
-
-    Returns
-    -------
-    pd.DataFrame
-        DataFrame with resolved matches:
-        - Many-to-1: numeric columns are aggregated using specified method
-        - 1-to-many: adds a count column showing number of matches
-        - Index is set to index_col and named accordingly
-
-    Raises
-    ------
-    KeyError
-        If feature_id_var is not present in the DataFrame
-    TypeError
-        If DataFrame contains unsupported data types (boolean or datetime)
-    """
-    # Make a copy to avoid modifying input
-    df = matched_data.copy()
-
-    # Check for unsupported data types
-    unsupported_dtypes = df.select_dtypes(include=["bool", "datetime64"]).columns
-    if not unsupported_dtypes.empty:
-        raise TypeError(
-            f"Unsupported data types found in columns: {list(unsupported_dtypes)}. "
-            "Boolean and datetime columns are not supported."
-        )
-
-    # Always require feature_id_var
-    if feature_id_var not in df.columns:
-        raise KeyError(feature_id_var)
-
-    # Deduplicate by feature_id within each s_id using groupby and first BEFORE any further processing
-    df = df.groupby([index_col, feature_id_var], sort=False).first().reset_index()
-
-    # Use a unique temporary column name for weights
-    if RESOLVE_MATCHES_TMP_WEIGHT_COL in df.columns:
-        raise ValueError(
-            f"Temporary weight column name '{RESOLVE_MATCHES_TMP_WEIGHT_COL}' already exists in the input data. Please rename or remove this column and try again."
-        )
-
-    # Calculate weights if needed (after deduplication!)
-    if numeric_agg == RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN:
-        feature_counts = df[feature_id_var].value_counts()
-        df[RESOLVE_MATCHES_TMP_WEIGHT_COL] = (
-            1 / feature_counts[df[feature_id_var]].values
-        )
-
-    # Set index for grouping
-    df = df.set_index(index_col)
-
-    # Use utility to split columns
-    always_non_numeric = [feature_id_var] if keep_id_col else []
-    numeric_cols, non_numeric_cols = _split_numeric_non_numeric_columns(
-        df, always_non_numeric=always_non_numeric
-    )
-
-    # Get aggregator function
-    numeric_aggregator = _get_numeric_aggregator(
-        method=numeric_agg, feature_id_var=feature_id_var
-    )
-    resolved = _aggregate_grouped_columns(
-        df,
-        numeric_cols,
-        non_numeric_cols,
-        numeric_aggregator,
-        feature_id_var=feature_id_var,
-        numeric_agg=numeric_agg,
-    )
-    # Add count of matches per feature_id
-    match_counts = matched_data.groupby(index_col)[feature_id_var].nunique()
-    resolved[f"{feature_id_var}_match_count"] = match_counts
-
-    # Drop feature_id_var if not keeping it
-    if not keep_id_col and feature_id_var in resolved.columns:
-        resolved = resolved.drop(columns=[feature_id_var])
-
-    # Ensure index is named consistently
-    resolved.index.name = index_col
-
-    return resolved
-
-
-def edgelist_to_scids(
-    formatted_edgelist: pd.DataFrame,
-    sbml_dfs: sbml_dfs_core.SBML_dfs,
-    species_identifiers: pd.DataFrame,
-    ontologies: set,
-):
-    """
-
-    Edgelist to Compartmentalized Species IDds
-
-    Map an edgelist of possible mechanistic interactions onto a
-    pathadex pathway
-
-    Parameters:
-    formatted_edgelist: pd.DataFrame
-        pd.Dataframe containing a "identifier_upstream" and
-        "identifier_downstream" variables used to to match entries
-    sbml_dfs: sbml_dfs_core.SBML_dfs
-        A mechanistic model
-    species_identifiers: pd.DataFrame
-        A table of molecular species identifiers produced from
-        sbml_dfs.get_identifiers("species") generally using sbml_dfs_core.export_sbml_dfs()
-    ontologies: set
-        A set of ontologies used to match features to pathway species
-
-    Returns:
-    edgelist_w_scids: pd.DataFrame
-        formatted_edgelist with upstream features mapped to "sc_id_upstream" and
-        downstream species mapped to "sc_id_downstream"
-    """
-
-    identifiers._check_species_identifiers_table(species_identifiers)
-
-    # map edges onto pathway entities based on shared identifiers
-    edges_on_pathway = edgelist_to_pathway_species(
-        formatted_edgelist=formatted_edgelist,
-        species_identifiers=species_identifiers,
-        ontologies=ontologies,
-    )
-
-    # expand from s_ids to sc_ids
-    s_id_pairs = edges_on_pathway[
-        [CPR_EDGELIST.S_ID_UPSTREAM, CPR_EDGELIST.S_ID_DOWNSTREAM]
-    ].drop_duplicates()
-    sc_id_pairs = s_id_pairs.merge(
-        sbml_dfs.compartmentalized_species[[SBML_DFS.S_ID]]
-        .reset_index()
-        .rename(
-            {
-                SBML_DFS.S_ID: CPR_EDGELIST.S_ID_UPSTREAM,
-                SBML_DFS.SC_ID: CPR_EDGELIST.SC_ID_UPSTREAM,
-            },
-            axis=1,
-        )
-    ).merge(
-        sbml_dfs.compartmentalized_species[[SBML_DFS.S_ID]]
-        .reset_index()
-        .rename(
-            {
-                SBML_DFS.S_ID: CPR_EDGELIST.S_ID_DOWNSTREAM,
-                SBML_DFS.SC_ID: CPR_EDGELIST.SC_ID_DOWNSTREAM,
-            },
-            axis=1,
-        )
-    )
-
-    # map sc_ids back to edges_on_pathway
-    # join lookup table of s_id_upstream, s_id_downstream -> sc_ids
-    edgelist_w_scids = edges_on_pathway.merge(sc_id_pairs)
-
-    logger_msg = (
-        f"{edgelist_w_scids.shape[0]} interactions mapped "
-        "onto pairs of compartmentalized species in the mechanistic model"
-    )
-    if edgelist_w_scids.shape[0] == 0:
-        logger.warning(logger_msg)
-    else:
-        logger.info(logger_msg)
-
-    return edgelist_w_scids
-
-
-def filter_to_direct_mechanistic_interactions(
-    formatted_edgelist: pd.DataFrame,
-    sbml_dfs: sbml_dfs_core.SBML_dfs,
-    species_identifiers: pd.DataFrame,
-    ontologies: set,
-) -> pd.DataFrame:
-    """
-    Filter to Direct Mechanistic Interactions
-
-    Filter an edgelist to direct mechanistic interactions
-
-    Parameters:
-    formatted_edgelist: pd.DataFrame
-        pd.Dataframe containing a "identifier_upstream" and "identifier_downstream" variables used to to match entries
-    sbml_dfs: sbml_dfs_core.SBML_dfs
-        A mechanistic model
-    species_identifiers: pd.DataFrame
-        A table of molecular species identifiers
-        produced from sbml_dfs.get_identifiers("species") generally
-        using sbml_dfs_core.export_sbml_dfs()
-    ontologies: set
-        A set of ontologies used to match features to pathway species
-
-    Returns:
-    edgelist_w_direct_mechanistic_interactions: pd.DataFrame
-        formatted_edgelist filtered to mechanistic reactions present in the pathway representation
-    """
-
-    edgelist_w_scids = _edgelist_to_scids_if_needed(
-        formatted_edgelist, sbml_dfs, species_identifiers, ontologies
-    )
-
-    # reduce to distinct sc_id pairs
-    sc_id_pairs = edgelist_w_scids[list(CPR_EDGELIST_REQ_VARS)].drop_duplicates()
-
-    # define all existing direct regulatory interactions
-    pathway_interactions = pd.concat(
-        [
-            # pair 0 -> <0 # modifiers affect substrates
-            sbml_dfs.reaction_species[
-                sbml_dfs.reaction_species[SBML_DFS.STOICHIOMETRY] == 0
-            ][[SBML_DFS.R_ID, SBML_DFS.SC_ID]]
-            .rename({SBML_DFS.SC_ID: CPR_EDGELIST.SC_ID_UPSTREAM}, axis=1)
-            .merge(
-                sbml_dfs.reaction_species[
-                    sbml_dfs.reaction_species[SBML_DFS.STOICHIOMETRY] < 0
-                ][[SBML_DFS.R_ID, SBML_DFS.SC_ID]].rename(
-                    {SBML_DFS.SC_ID: CPR_EDGELIST.SC_ID_DOWNSTREAM}, axis=1
-                )
-            ),
-            # pair <0 -> >0 # substrates affect products
-            sbml_dfs.reaction_species[
-                sbml_dfs.reaction_species[SBML_DFS.STOICHIOMETRY] < 0
-            ][[SBML_DFS.R_ID, SBML_DFS.SC_ID]]
-            .rename({SBML_DFS.SC_ID: CPR_EDGELIST.SC_ID_UPSTREAM}, axis=1)
-            .merge(
-                sbml_dfs.reaction_species[
-                    sbml_dfs.reaction_species[SBML_DFS.STOICHIOMETRY] > 0
-                ][[SBML_DFS.R_ID, SBML_DFS.SC_ID]].rename(
-                    {SBML_DFS.SC_ID: CPR_EDGELIST.SC_ID_DOWNSTREAM}, axis=1
-                )
-            ),
-            # pair 0 -> >0 # modifiers affect products
-            sbml_dfs.reaction_species[
-                sbml_dfs.reaction_species[SBML_DFS.STOICHIOMETRY] == 0
-            ][[SBML_DFS.R_ID, SBML_DFS.SC_ID]]
-            .rename({SBML_DFS.SC_ID: CPR_EDGELIST.SC_ID_UPSTREAM}, axis=1)
-            .merge(
-                sbml_dfs.reaction_species[
-                    sbml_dfs.reaction_species[SBML_DFS.STOICHIOMETRY] > 0
-                ][[SBML_DFS.R_ID, SBML_DFS.SC_ID]].rename(
-                    {SBML_DFS.SC_ID: CPR_EDGELIST.SC_ID_DOWNSTREAM}, axis=1
-                )
-            ),
-        ]
-    ).reset_index(drop=True)
-
-    # filter pathway interactions based on matches to sc_id_pairs
-    direct_edge_interactions = (
-        sc_id_pairs.merge(pathway_interactions)
-        .merge(
-            sbml_dfs.species[SBML_DFS.S_NAME]
-            .to_frame()
-            .rename({SBML_DFS.S_NAME: CPR_EDGELIST.S_NAME_UPSTREAM}, axis=1),
-            left_on=CPR_EDGELIST.S_ID_UPSTREAM,
-            right_index=True,
-            # add species metadata for matches
-        )
-        .merge(
-            sbml_dfs.species[SBML_DFS.S_NAME]
-            .to_frame()
-            .rename({SBML_DFS.S_NAME: CPR_EDGELIST.S_NAME_DOWNSTREAM}, axis=1),
-            left_on=CPR_EDGELIST.S_ID_DOWNSTREAM,
-            right_index=True,
-            # add metadata for reactions where interaction occurs
-        )
-        .merge(
-            sbml_dfs.reactions[SBML_DFS.R_NAME].to_frame(),
-            left_on=SBML_DFS.R_ID,
-            right_index=True,
-        )
-    )
-
-    edgelist_w_direct_mechanistic_interactions = edgelist_w_scids.merge(
-        direct_edge_interactions[
-            [
-                CPR_EDGELIST.SC_ID_UPSTREAM,
-                CPR_EDGELIST.SC_ID_DOWNSTREAM,
-                SBML_DFS.R_ID,
-                CPR_EDGELIST.S_NAME_UPSTREAM,
-                CPR_EDGELIST.S_NAME_DOWNSTREAM,
-                SBML_DFS.R_NAME,
-            ]
-        ]
-    )
-
-    return edgelist_w_direct_mechanistic_interactions
-
-
-def filter_to_indirect_mechanistic_interactions(
-    formatted_edgelist: pd.DataFrame,
-    sbml_dfs: sbml_dfs_core.SBML_dfs,
-    species_identifiers: pd.DataFrame,
-    cpr_graph: ig.Graph,
-    ontologies: set,
-    precomputed_distances=None,
-    max_path_length=10,
-):
-    """
-    Filter to Indirect Mechanistic Interactions
-
-    Filter an edgelist to indirect mechanistic interactions.
-    Indirect relationships are identified by searching a
-    network for paths from an upstream species to a downstream species
-
-    Parameters:
-    formatted_edgelist: pd.DataFrame
-        pd.Dataframe containing a "identifier_upstream" and
-        "identifier_downstream" variables used to to match entries
-    sbml_dfs: sbml_dfs_core.SBML_dfs
-        A mechanistic model
-    species_identifiers: pandas.DataFrame
-        A table of molecular species identifiers produced from
-        sbml_dfs.get_identifiers("species") generally using sbml_dfs_core.export_sbml_dfs()
-    cpr_graph: igraph.Graph
-        A network representation of the sbml_dfs model
-    ontologies: set
-        A set of ontologies used to match features to pathway species
-    precomputed_distances: None or a pd.DataFrame containing path lengths and weights
-        between pairs of cspecies.
-    max_path_length: int
-        Maximum number of steps to consider.
-
-    Returns:
-    edgelist_w_indirect_mechanistic_interactions: pd.DataFrame
-        formatted_edgelist filtered to mechanistic reactions which can be described
-        by an indirect mechanism. The mechanism is described by a path weight, length,
-        and a vpath and epath list of vertices and edges which were traversed to create the path.
-    """
-
-    edgelist_w_scids = _edgelist_to_scids_if_needed(
-        formatted_edgelist, sbml_dfs, species_identifiers, ontologies
-    )
-
-    if precomputed_distances is not None:
-        # rename to match conventions in precomputed_distances
-        # filter by these precomputed distances and then restore naming
-        edgelist_w_scids = paths._filter_paths_by_precomputed_distances(
-            edgelist_w_scids.rename(
-                {
-                    CPR_EDGELIST.SC_ID_UPSTREAM: CPR_EDGELIST.SC_ID_ORIGIN,
-                    CPR_EDGELIST.SC_ID_DOWNSTREAM: CPR_EDGELIST.SC_ID_DEST,
-                },
-                axis=1,
-            ),
-            precomputed_distances,
-        ).rename(
-            {
-                CPR_EDGELIST.SC_ID_ORIGIN: CPR_EDGELIST.SC_ID_UPSTREAM,
-                CPR_EDGELIST.SC_ID_DEST: CPR_EDGELIST.SC_ID_DOWNSTREAM,
-            },
-            axis=1,
-        )
-
-    # find paths from 1 upstream to all desired downstream sc_ids
-    # (this is the convention with igraph)
-    indexed_origin_vertices = edgelist_w_scids.set_index(CPR_EDGELIST.SC_ID_UPSTREAM)
-
-    # loop through upstream cspecies and find paths to all downstream species
-    global_dict = dict()
-    for an_origin_index in indexed_origin_vertices.index.unique():  # type: ignore
-        origin_targets = indexed_origin_vertices.loc[
-            an_origin_index
-        ]  # type: pd.DataFrame
-
-        # if indexing only a single entry pd.DataFrame becomes a pd.Series
-        # convert back to DataFrame for consistency
-        origin_targets = utils.ensure_pd_df(origin_targets)
-
-        # log entry for debugging
-        logger.debug(
-            f"finding paths from {an_origin_index} to "
-            f"{origin_targets.shape[0]} target vertices"
-        )
-
-        # find all paths from indexed_origin to desired destination
-        shortest_paths = paths.find_shortest_reaction_paths(
-            cpr_graph,
-            sbml_dfs,
-            origin=an_origin_index,
-            # find all unique destinations (as a list for compatibility with igraph dest)
-            dest=origin_targets[CPR_EDGELIST.SC_ID_DOWNSTREAM].unique().tolist(),
-            weight_var=CPR_GRAPH_EDGES.WEIGHTS,
-        )
-
-        if shortest_paths is None:
-            continue
-
-        vertices, edges = shortest_paths
-        indexed_edges = edges.set_index("path")
-        indexed_vertices = vertices.set_index("path")
-
-        paths_list = list()
-        for ind in indexed_edges.index.unique():
-            one_path = indexed_edges.loc[ind]
-
-            # make sure that we are working with a DF
-            if type(one_path) is pd.Series:
-                one_path = one_path.to_frame().T
-
-            if one_path.shape[0] > max_path_length:
-                continue
-
-            # find the destination node
-            # this is annoying because if the graph is undirected
-            # its not clear if the from or to edge is the actual destination
-            # when taking advantage of the fact that igraph lets you
-            # look up multiple destinations at once this information is lost
-            ancestor_species = {an_origin_index}
-            if one_path.shape[0] > 1:
-                penultimate_edge = one_path.iloc[one_path.shape[0] - 2]
-                ancestor_species = ancestor_species.union(
-                    {
-                        penultimate_edge[CPR_GRAPH_EDGES.FROM],
-                        penultimate_edge[CPR_GRAPH_EDGES.TO],
-                    }
-                )
-
-            terminal_edge = one_path.iloc[one_path.shape[0] - 1]
-            ending_cspecies = {terminal_edge[CPR_GRAPH_EDGES.FROM], terminal_edge[CPR_GRAPH_EDGES.TO]}.difference(ancestor_species)  # type: ignore
-
-            if len(ending_cspecies) != 1:
-                raise ValueError(
-                    "The terminal edge could not be determined when summarizing paths"
-                )
-            ending_cspecies = ending_cspecies.pop()
-
-            path_series = pd.Series(
-                {
-                    CPR_GRAPH_EDGES.FROM: an_origin_index,
-                    CPR_GRAPH_EDGES.TO: ending_cspecies,
-                    "weight": sum(one_path[CPR_GRAPH_EDGES.WEIGHTS]),
-                    "path_length": one_path.shape[0],
-                    "vpath": indexed_vertices.loc[ind],
-                    "epath": one_path,
-                }  # type: ignore
-            )  # type: pd.Series
-
-            paths_list.append(path_series)
-
-        if len(paths_list) > 0:
-            origin_paths = pd.DataFrame(paths_list)
-            global_dict[an_origin_index] = origin_paths
-
-    if len(global_dict.keys()) == 0:
-        logger.warning(
-            "None of the provide molecular pairs could be mechanistically linked with a network path"
-        )
-        return None
-
-    all_shortest_paths = pd.concat(global_dict.values())
-
-    indirect_shortest_paths = edgelist_w_scids.merge(
-        all_shortest_paths,
-        left_on=[CPR_EDGELIST.SC_ID_UPSTREAM, CPR_EDGELIST.SC_ID_DOWNSTREAM],
-        right_on=[CPR_GRAPH_EDGES.FROM, CPR_GRAPH_EDGES.TO],
-    )
-
-    return indirect_shortest_paths
-
-
-def _edgelist_to_scids_if_needed(
-    edgelist: pd.DataFrame,
-    sbml_dfs: sbml_dfs_core.SBML_dfs,
-    species_identifiers: pd.DataFrame,
-    ontologies: set,
-) -> pd.DataFrame:
-    """Map a set of edgelist species to cspecies or skip if cspecies were provided."""
-
-    if utils.match_pd_vars(edgelist, CPR_EDGELIST_REQ_VARS).are_present:
-        logger.info(
-            f"An edgelist with {', '.join(CPR_EDGELIST_REQ_VARS)} was provided; identifier matching will be skipped"
-        )
-        return edgelist
-    else:
-        utils.match_pd_vars(edgelist, IDENTIFIER_EDGELIST_REQ_VARS).assert_present()
-
-        identifiers._check_species_identifiers_table(species_identifiers)
-
-        edgelist_w_scids = edgelist_to_scids(
-            edgelist,
-            sbml_dfs=sbml_dfs,
-            species_identifiers=species_identifiers,
-            ontologies=ontologies,
-        )
-
-        return edgelist_w_scids
-
-
-def _validate_wide_ontologies(
-    wide_df: pd.DataFrame,
-    ontologies: Optional[Union[str, Set[str], Dict[str, str]]] = None,
-) -> Set[str]:
-    """
-    Validate ontology specifications against the wide DataFrame and ONTOLOGIES_LIST.
-
-    Parameters
-    ----------
-    wide_df : pd.DataFrame
-        DataFrame with one column per ontology and a results column
-    ontologies : Optional[Union[str, Set[str], Dict[str, str]]]
-        Either:
-        - String specifying a single ontology column
-        - Set of columns to treat as ontologies
-        - Dict mapping wide column names to ontology names
-        - None to automatically detect ontology columns based on ONTOLOGIES_LIST
-
-    Returns
-    -------
-    Set[str]
-        Set of validated ontology names. For dictionary mappings, returns the target ontology names.
-
-    Raises
-    ------
-    ValueError
-        If validation fails for any ontology specification or no valid ontologies are found
-    """
-    # Convert string input to set
-    if isinstance(ontologies, str):
-        ontologies = {ontologies}
-
-    # Get the set of ontology columns
-    if isinstance(ontologies, dict):
-        # Check source columns exist in DataFrame
-        missing_cols = set(ontologies.keys()) - set(wide_df.columns)
-        if missing_cols:
-            raise ValueError(f"Source columns not found in DataFrame: {missing_cols}")
-        # Validate target ontologies against ONTOLOGIES_LIST
-        invalid_onts = set(ontologies.values()) - set(ONTOLOGIES_LIST)
-        if invalid_onts:
-            raise ValueError(
-                f"Invalid ontologies in mapping: {invalid_onts}. Must be one of: {ONTOLOGIES_LIST}"
-            )
-        # Return target ontology names instead of source column names
-        ontology_cols = set(ontologies.values())
-
-    elif isinstance(ontologies, set):
-        # Check specified columns exist in DataFrame
-        missing_cols = ontologies - set(wide_df.columns)
-        if missing_cols:
-            raise ValueError(
-                f"Specified ontology columns not found in DataFrame: {missing_cols}"
-            )
-        # Validate specified ontologies against ONTOLOGIES_LIST
-        invalid_onts = ontologies - set(ONTOLOGIES_LIST)
-        if invalid_onts:
-            raise ValueError(
-                f"Invalid ontologies in set: {invalid_onts}. Must be one of: {ONTOLOGIES_LIST}"
-            )
-        ontology_cols = ontologies
-
-    else:
-        # Auto-detect ontology columns by matching against ONTOLOGIES_LIST
-        ontology_cols = set(wide_df.columns) & set(ONTOLOGIES_LIST)
-        if not ontology_cols:
-            raise ValueError(
-                f"No valid ontology columns found in DataFrame. Column names must match one of: {ONTOLOGIES_LIST}"
-            )
-        logger.info(f"Auto-detected ontology columns: {ontology_cols}")
-
-    logger.debug(f"Validated ontology columns: {ontology_cols}")
-    return ontology_cols
-
-
-def _ensure_feature_id_var(
-    df: pd.DataFrame, feature_id_var: str = FEATURE_ID_VAR_DEFAULT
-) -> pd.DataFrame:
-    """
-    Ensure the DataFrame has a feature_id column, creating one if it doesn't exist.
-
-    Parameters
-    ----------
-    df : pd.DataFrame
-        DataFrame to check/modify
-    feature_id_var : str, default=FEATURE_ID_VAR_DEFAULT
-        Name of the feature ID column
-
-    Returns
-    -------
-    pd.DataFrame
-        DataFrame with guaranteed feature_id column
-    """
-    if feature_id_var not in df.columns:
-        logger.warning(f"No {feature_id_var} column found in DataFrame, creating one")
-        df = df.copy()
-        df[feature_id_var] = np.arange(len(df))
-    return df
-
-
-def _get_numeric_aggregator(
-    method: str = RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
-    feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
-) -> callable:
-    """
-    Get aggregation function for numeric columns with various methods.
-
-    Parameters
-    ----------
-    method : str, default="weighted_mean"
-        Aggregation method to use:
-        - "weighted_mean": weighted by inverse of feature_id frequency (default)
-        - "mean": simple arithmetic mean
-        - "first": first value after sorting by feature_id_var (requires feature_id_var)
-        - "max": maximum value
-    feature_id_var : str, default="feature_id"
-        Name of the column specifying a measured feature - used for sorting and weighting
-
-    Returns
-    -------
-    callable
-        Aggregation function to use with groupby
-
-    Raises
-    ------
-    ValueError
-        If method is not recognized
-    """
-
-    def weighted_mean(df: pd.DataFrame) -> float:
-        # Get values and weights for this group
-        values = df["value"]
-        weights = df["weight"]
-        # Weights are already normalized globally, just use them directly
-        return (values * weights).sum() / weights.sum()
-
-    def first_by_id(df: pd.DataFrame) -> float:
-        # Sort by feature_id and take first value
-        return df.sort_values(feature_id_var).iloc[0]["value"]
-
-    def simple_mean(series: pd.Series) -> float:
-        return series.mean()
-
-    def simple_max(series: pd.Series) -> float:
-        return series.max()
-
-    aggregators = {
-        RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN: weighted_mean,
-        RESOLVE_MATCHES_AGGREGATORS.MEAN: simple_mean,
-        RESOLVE_MATCHES_AGGREGATORS.FIRST: first_by_id,
-        RESOLVE_MATCHES_AGGREGATORS.MAX: simple_max,
-    }
-
-    if method not in aggregators:
-        raise ValueError(
-            f"Unknown aggregation method: {method}. Must be one of {list(aggregators.keys())}"
-        )
-
-    return aggregators[method]
-
-
-def _split_numeric_non_numeric_columns(df: pd.DataFrame, always_non_numeric=None):
-    """
-    Utility to split DataFrame columns into numeric and non-numeric, always treating specified columns as non-numeric.
-
-    Parameters
-    ----------
-    df : pd.DataFrame
-        The DataFrame to split.
-    always_non_numeric : list or set, optional
-        Columns to always treat as non-numeric (e.g., ['feature_id']).
-
-    Returns
-    -------
-    numeric_cols : pd.Index
-        Columns considered numeric (int64, float64, and not in always_non_numeric).
-    non_numeric_cols : pd.Index
-        Columns considered non-numeric (object, string, etc., plus always_non_numeric).
-    """
-    if always_non_numeric is None:
-        always_non_numeric = []
-    always_non_numeric = set(always_non_numeric)
-    numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns.difference(
-        always_non_numeric
-    )
-    non_numeric_cols = df.columns.difference(numeric_cols)
-    return numeric_cols, non_numeric_cols
-
-
-def _aggregate_grouped_columns(
-    df: pd.DataFrame,
-    numeric_cols,
-    non_numeric_cols,
-    numeric_aggregator,
-    feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
-    numeric_agg: str = RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
-) -> pd.DataFrame:
-    """
-    Aggregate numeric and non-numeric columns for grouped DataFrame.
-    Assumes deduplication by feature_id within each s_id has already been performed.
-    Returns the combined DataFrame.
-    """
-    results = []
-
-    # Handle non-numeric columns
-    if len(non_numeric_cols) > 0:
-        non_numeric_agg = (
-            df[non_numeric_cols]
-            .groupby(level=0)
-            .agg(lambda x: ",".join(sorted(set(x.astype(str)))))
-        )
-        results.append(non_numeric_agg)
-    # Handle numeric columns
-    if len(numeric_cols) > 0:
-        numeric_results = {}
-        for col in numeric_cols:
-            if numeric_agg in [
-                RESOLVE_MATCHES_AGGREGATORS.FIRST,
-                RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
-            ]:
-                agg_df = pd.DataFrame(
-                    {"value": df[col], feature_id_var: df[feature_id_var]}
-                )
-                if numeric_agg == RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN:
-                    agg_df[RESOLVE_MATCHES_TMP_WEIGHT_COL] = df[
-                        RESOLVE_MATCHES_TMP_WEIGHT_COL
-                    ]
-                numeric_results[col] = agg_df.groupby(level=0).apply(
-                    lambda x: (
-                        numeric_aggregator(x)
-                        if numeric_agg != RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN
-                        else numeric_aggregator(
-                            x.rename(columns={RESOLVE_MATCHES_TMP_WEIGHT_COL: "weight"})
-                        )
-                    )
-                )
-            else:
-                numeric_results[col] = df[col].groupby(level=0).agg(numeric_aggregator)
-        numeric_agg_df = pd.DataFrame(numeric_results)
-        results.append(numeric_agg_df)
-    # Combine results
-    if results:
-        resolved = pd.concat(results, axis=1)
-    else:
-        resolved = pd.DataFrame(index=df.index)
-    return resolved
-
-
-def _log_feature_species_mapping_stats(
-    pathway_species: pd.DataFrame, feature_id_var: str = FEATURE_ID_VAR_DEFAULT
-):
-    """
-    Log statistics about the mapping between feature_id and s_id in the pathway_species DataFrame.
-    """
-
-    # Percent of feature_ids present one or more times in the output
-    n_feature_ids = pathway_species[feature_id_var].nunique()
-    n_input_feature_ids = (
-        pathway_species[feature_id_var].max() + 1
-        if feature_id_var in pathway_species.columns
-        else 0
-    )
-    percent_present = (
-        100 * n_feature_ids / n_input_feature_ids if n_input_feature_ids else 0
-    )
-    logger.info(
-        f"{percent_present:.1f}% of feature_ids are present one or more times in the output ({n_feature_ids}/{n_input_feature_ids})"
-    )
-
-    # Number of times an s_id maps to 1+ feature_ids (with s_name)
-    s_id_counts = pathway_species.groupby(SBML_DFS.S_ID)[feature_id_var].nunique()
-    s_id_multi = s_id_counts[s_id_counts > 1]
-    logger.info(f"{len(s_id_multi)} s_id(s) map to more than one feature_id.")
-    if not s_id_multi.empty:
-        examples = pathway_species[
-            pathway_species[SBML_DFS.S_ID].isin(s_id_multi.index)
-        ][[SBML_DFS.S_ID, SBML_DFS.S_NAME, feature_id_var]]
-        logger.info(
-            f"Examples of s_id mapping to multiple feature_ids (showing up to 3):\n{examples.groupby([SBML_DFS.S_ID, SBML_DFS.S_NAME])[feature_id_var].apply(list).head(3)}"
-        )
-
-    # Number of times a feature_id maps to 1+ s_ids (with s_name)
-    feature_id_counts = pathway_species.groupby(feature_id_var)[SBML_DFS.S_ID].nunique()
-    feature_id_multi = feature_id_counts[feature_id_counts > 1]
-    logger.info(f"{len(feature_id_multi)} feature_id(s) map to more than one s_id.")
-    if not feature_id_multi.empty:
-        examples = pathway_species[
-            pathway_species[feature_id_var].isin(feature_id_multi.index)
-        ][[feature_id_var, SBML_DFS.S_ID, SBML_DFS.S_NAME]]
-        logger.info(
-            f"Examples of feature_id mapping to multiple s_ids (showing up to 3):\n{examples.groupby([feature_id_var])[[SBML_DFS.S_ID, SBML_DFS.S_NAME]].apply(lambda df: list(df.itertuples(index=False, name=None))).head(3)}"
-        )