napistu 0.2.5.dev6__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- napistu/__main__.py +126 -96
- napistu/constants.py +35 -41
- napistu/context/__init__.py +10 -0
- napistu/context/discretize.py +462 -0
- napistu/context/filtering.py +387 -0
- napistu/gcs/__init__.py +1 -1
- napistu/identifiers.py +74 -15
- napistu/indices.py +68 -0
- napistu/ingestion/__init__.py +1 -1
- napistu/ingestion/bigg.py +47 -62
- napistu/ingestion/constants.py +18 -133
- napistu/ingestion/gtex.py +113 -0
- napistu/ingestion/hpa.py +147 -0
- napistu/ingestion/sbml.py +0 -97
- napistu/ingestion/string.py +2 -2
- napistu/matching/__init__.py +10 -0
- napistu/matching/constants.py +18 -0
- napistu/matching/interactions.py +518 -0
- napistu/matching/mount.py +529 -0
- napistu/matching/species.py +510 -0
- napistu/mcp/__init__.py +7 -4
- napistu/mcp/__main__.py +128 -72
- napistu/mcp/client.py +16 -25
- napistu/mcp/codebase.py +201 -153
- napistu/mcp/component_base.py +170 -0
- napistu/mcp/config.py +223 -0
- napistu/mcp/constants.py +45 -2
- napistu/mcp/documentation.py +253 -136
- napistu/mcp/documentation_utils.py +13 -48
- napistu/mcp/execution.py +372 -305
- napistu/mcp/health.py +49 -67
- napistu/mcp/profiles.py +10 -6
- napistu/mcp/server.py +161 -80
- napistu/mcp/tutorials.py +139 -87
- napistu/modify/__init__.py +1 -1
- napistu/modify/gaps.py +1 -1
- napistu/network/__init__.py +1 -1
- napistu/network/constants.py +101 -34
- napistu/network/data_handling.py +388 -0
- napistu/network/ig_utils.py +351 -0
- napistu/network/napistu_graph_core.py +354 -0
- napistu/network/neighborhoods.py +40 -40
- napistu/network/net_create.py +373 -309
- napistu/network/net_propagation.py +47 -19
- napistu/network/{net_utils.py → ng_utils.py} +124 -272
- napistu/network/paths.py +67 -51
- napistu/network/precompute.py +11 -11
- napistu/ontologies/__init__.py +10 -0
- napistu/ontologies/constants.py +129 -0
- napistu/ontologies/dogma.py +243 -0
- napistu/ontologies/genodexito.py +649 -0
- napistu/ontologies/mygene.py +369 -0
- napistu/ontologies/renaming.py +198 -0
- napistu/rpy2/__init__.py +229 -86
- napistu/rpy2/callr.py +47 -77
- napistu/rpy2/constants.py +24 -23
- napistu/rpy2/rids.py +61 -648
- napistu/sbml_dfs_core.py +587 -222
- napistu/scverse/__init__.py +15 -0
- napistu/scverse/constants.py +28 -0
- napistu/scverse/loading.py +727 -0
- napistu/utils.py +118 -10
- {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/METADATA +8 -3
- napistu-0.3.1.dist-info/RECORD +133 -0
- tests/conftest.py +22 -0
- tests/test_context_discretize.py +56 -0
- tests/test_context_filtering.py +267 -0
- tests/test_identifiers.py +100 -0
- tests/test_indices.py +65 -0
- tests/{test_edgelist.py → test_ingestion_napistu_edgelist.py} +2 -2
- tests/test_matching_interactions.py +108 -0
- tests/test_matching_mount.py +305 -0
- tests/test_matching_species.py +394 -0
- tests/test_mcp_config.py +193 -0
- tests/test_mcp_documentation_utils.py +12 -3
- tests/test_mcp_server.py +356 -0
- tests/test_network_data_handling.py +397 -0
- tests/test_network_ig_utils.py +23 -0
- tests/test_network_neighborhoods.py +19 -0
- tests/test_network_net_create.py +459 -0
- tests/test_network_ng_utils.py +30 -0
- tests/test_network_paths.py +56 -0
- tests/{test_precomputed_distances.py → test_network_precompute.py} +8 -6
- tests/test_ontologies_genodexito.py +58 -0
- tests/test_ontologies_mygene.py +39 -0
- tests/test_ontologies_renaming.py +110 -0
- tests/test_rpy2_callr.py +79 -0
- tests/test_rpy2_init.py +151 -0
- tests/test_sbml.py +0 -31
- tests/test_sbml_dfs_core.py +134 -10
- tests/test_scverse_loading.py +778 -0
- tests/test_set_coverage.py +2 -2
- tests/test_utils.py +121 -1
- napistu/mechanism_matching.py +0 -1353
- napistu/rpy2/netcontextr.py +0 -467
- napistu-0.2.5.dev6.dist-info/RECORD +0 -97
- tests/test_igraph.py +0 -367
- tests/test_mechanism_matching.py +0 -784
- tests/test_net_utils.py +0 -149
- tests/test_netcontextr.py +0 -105
- tests/test_rpy2.py +0 -61
- /napistu/ingestion/{cpr_edgelist.py → napistu_edgelist.py} +0 -0
- {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/WHEEL +0 -0
- {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/entry_points.txt +0 -0
- {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/licenses/LICENSE +0 -0
- {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/top_level.txt +0 -0
- /tests/{test_obo.py → test_ingestion_obo.py} +0 -0
napistu/matching/mount.py (new file)
@@ -0,0 +1,529 @@
import copy
import logging
from typing import Optional, Union, Set, Dict

import pandas as pd

from napistu.constants import SBML_DFS, ONTOLOGIES_LIST
from napistu.matching.constants import (
    FEATURE_ID_VAR_DEFAULT,
    RESOLVE_MATCHES_AGGREGATORS,
    RESOLVE_MATCHES_TMP_WEIGHT_COL,
    BIND_DICT_OF_WIDE_RESULTS_STRATEGIES,
    BIND_DICT_OF_WIDE_RESULTS_STRATEGIES_LIST,
)
from napistu import identifiers, utils
from napistu.matching.species import match_features_to_wide_pathway_species
from napistu import sbml_dfs_core

logger = logging.getLogger(__name__)


def bind_wide_results(
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    results_df: pd.DataFrame,
    results_name: str,
    ontologies: Optional[Union[Set[str], Dict[str, str]]] = None,
    dogmatic: bool = False,
    species_identifiers: Optional[pd.DataFrame] = None,
    feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
    numeric_agg: str = RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
    keep_id_col: bool = True,
    verbose: bool = False,
    inplace: bool = True,
) -> Optional[sbml_dfs_core.SBML_dfs]:
    """
    Bind wide results to an sbml_dfs object.

    Take a table of molecular species-level attributes keyed by systematic
    identifiers, match it to the species in an sbml_dfs model, and transfer
    the attributes to the model's species_data.

    Parameters
    ----------
    sbml_dfs : sbml_dfs_core.SBML_dfs
        The sbml_dfs object to bind the results to.
    results_df : pd.DataFrame
        The table containing the results to bind.
    results_name : str
        The name of the results to bind.
    ontologies : Optional[Union[Set[str], Dict[str, str]]], default=None
        Either:
        - a set of columns to treat as ontologies (these should be entries in ONTOLOGIES_LIST)
        - a dict mapping wide column names to ontology names in the ONTOLOGIES_LIST controlled vocabulary
        - None, to automatically detect valid ontology columns based on ONTOLOGIES_LIST
    dogmatic : bool
        Whether to respect differences between genes, transcripts, and proteins (True) or ignore them (False).
    species_identifiers : Optional[pd.DataFrame]
        Systematic identifiers for the molecular species in "sbml_dfs". If None, these will be generated on-the-fly.
    feature_id_var : str
        The name of the column in results_df that contains the feature identifiers. If it does not exist, it will be created.
    numeric_agg : str
        The aggregation method to use for resolving degeneracy.
    keep_id_col : bool
        Whether to keep the identifier column in the results_df.
    verbose : bool
        Whether to log cases of 1-to-many and many-to-1 mapping and to indicate how degeneracy is resolved.
    inplace : bool, default=True
        Whether to modify the sbml_dfs object in place. If False, returns a copy.

    Returns
    -------
    Optional[sbml_dfs_core.SBML_dfs]
        If inplace=True, returns None. Otherwise returns the modified copy of sbml_dfs with the results bound.
    """

    if not inplace:
        sbml_dfs = copy.deepcopy(sbml_dfs)

    species_identifiers = identifiers._prepare_species_identifiers(
        sbml_dfs, dogmatic=dogmatic, species_identifiers=species_identifiers
    )

    # match
    matched_s_ids_from_wide = match_features_to_wide_pathway_species(
        results_df,
        species_identifiers,
        ontologies=ontologies,
        feature_id_var=feature_id_var,
        verbose=verbose,
    )

    disambiguated_matches = resolve_matches(
        matched_data=matched_s_ids_from_wide,
        feature_id_var=feature_id_var,
        numeric_agg=numeric_agg,
        keep_id_col=keep_id_col,
    )

    clean_species_data = utils.drop_extra_cols(
        results_df, disambiguated_matches, always_include=[feature_id_var]
    )

    sbml_dfs.add_species_data(results_name, clean_species_data)

    return None if inplace else sbml_dfs
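For orientation, a minimal usage sketch (not part of the diff): it assumes an existing SBML_dfs model bound to sbml_dfs, and the column names below are hypothetical, with "ensembl_gene" standing in for an entry of ONTOLOGIES_LIST.

import pandas as pd

from napistu.matching import mount

# hypothetical wide results: one row per measured feature, with an
# ontology column ("ensembl_gene") plus numeric attributes to transfer
results_df = pd.DataFrame(
    {
        "ensembl_gene": ["ENSG00000000003", "ENSG00000000419"],
        "log2fc": [1.2, -0.4],
    }
)

# attach the matched attributes to species_data under the name "de_results";
# with ontologies=None, ontology columns are auto-detected from ONTOLOGIES_LIST
mount.bind_wide_results(sbml_dfs, results_df, results_name="de_results")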


def bind_dict_of_wide_results(
    sbml_dfs: sbml_dfs_core.SBML_dfs,
    results_dict: dict,
    results_name: str,
    strategy: str = BIND_DICT_OF_WIDE_RESULTS_STRATEGIES.CONTATENATE,
    species_identifiers: pd.DataFrame = None,
    ontologies: Optional[Union[str, list]] = None,
    dogmatic: bool = False,
    inplace: bool = True,
    verbose=True,
):
    """
    Bind a dictionary of wide results to an SBML_dfs object.

    This function binds a dictionary of wide results to one or more species_data
    attributes of an SBML_dfs object. The dictionary's keys are modality names and
    its values are the corresponding results DataFrames. The "strategy" argument
    controls how the results are added to the SBML_dfs object.

    Parameters
    ----------
    sbml_dfs : SBML_dfs
        The SBML_dfs object to bind the results to.
    results_dict : dict
        A dictionary of results DataFrames keyed by modality name.
    results_name : str
        The name of the species_data attribute to bind the results to.
    strategy : str
        The strategy to use for binding the results.

        Options are:
        - "concatenate" : concatenate the results DataFrames and add them as a single attribute.
        - "multiple_keys" : add each modality's results as a separate attribute named f'{results_name}_{modality}'.
        - "stagger" : rename each modality's non-ontology columns to f'{column}_{modality}', then concatenate the tables column-wise and add them as a single attribute.

    species_identifiers : pd.DataFrame
        A dataframe with species identifiers.
    ontologies : str or list, optional
        The ontology (or ontologies) to use for the species identifiers. If not provided, the column names of the results DataFrames which match ONTOLOGIES_LIST will be used.
    dogmatic : bool
        Whether to use dogmatic mode. Ignored if species_identifiers is provided.
    verbose : bool
        Whether to print verbose output.
    inplace : bool, default=True
        Whether to modify the sbml_dfs object in place. If False, returns a copy.

    Returns
    -------
    Optional[SBML_dfs]
        If inplace=True, returns None. Otherwise returns the modified copy of sbml_dfs.
    """

    # validate strategy
    if strategy not in BIND_DICT_OF_WIDE_RESULTS_STRATEGIES_LIST:
        raise ValueError(
            f"Invalid strategy: {strategy}. Must be one of {BIND_DICT_OF_WIDE_RESULTS_STRATEGIES_LIST}"
        )

    species_identifiers = identifiers._prepare_species_identifiers(
        sbml_dfs, dogmatic=dogmatic, species_identifiers=species_identifiers
    )

    if not inplace:
        sbml_dfs = copy.deepcopy(sbml_dfs)

    if strategy == BIND_DICT_OF_WIDE_RESULTS_STRATEGIES.MULTIPLE_KEYS:
        for modality, results_df in results_dict.items():
            valid_ontologies = _get_wide_results_valid_ontologies(
                results_df, ontologies
            )

            modality_results_name = f"{results_name}_{modality}"

            bind_wide_results(
                sbml_dfs,
                results_df,
                modality_results_name,
                species_identifiers=species_identifiers,
                ontologies=valid_ontologies,
                inplace=True,  # Always use inplace=True here since we handle copying above
                verbose=verbose,
            )

        return None if inplace else sbml_dfs

    # create either a concatenated or staggered results table
    if strategy == BIND_DICT_OF_WIDE_RESULTS_STRATEGIES.CONTATENATE:
        results_df = pd.concat(results_dict.values(), axis=0)
    elif strategy == BIND_DICT_OF_WIDE_RESULTS_STRATEGIES.STAGGER:

        results_dict_copy = results_dict.copy()
        for k, v in results_dict_copy.items():
            valid_ontologies = _get_wide_results_valid_ontologies(v, ontologies)

            if verbose:
                logger.info(
                    f"Modality {k} has ontologies {valid_ontologies}. Other variables will be renamed to <variable>_{k}"
                )

            # rename all the columns besides ontology names
            for var in v.columns:
                if var not in valid_ontologies:
                    results_dict_copy[k].rename(
                        columns={var: f"{var}_{k}"}, inplace=True
                    )

        results_df = pd.concat(results_dict_copy.values(), axis=1)

    valid_ontologies = _get_wide_results_valid_ontologies(results_df, ontologies)

    bind_wide_results(
        sbml_dfs,
        results_df,
        results_name,
        species_identifiers=species_identifiers,
        ontologies=valid_ontologies,
        inplace=True,  # Always use inplace=True here since we handle copying above
        verbose=verbose,
    )

    return None if inplace else sbml_dfs
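A sketch of the dictionary variant under the same assumptions (hypothetical tables; strategy strings per the docstring above), showing "stagger":

# hypothetical per-modality tables sharing the "ensembl_gene" ontology column
results_dict = {
    "rna": pd.DataFrame({"ensembl_gene": ["ENSG00000000003"], "log2fc": [1.2]}),
    "prot": pd.DataFrame({"ensembl_gene": ["ENSG00000000003"], "log2fc": [0.8]}),
}

# "stagger" renames non-ontology columns per modality (log2fc_rna, log2fc_prot)
# and binds the column-wise concatenation as a single species_data attribute
mount.bind_dict_of_wide_results(
    sbml_dfs, results_dict, results_name="omics", strategy="stagger"
)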


def resolve_matches(
    matched_data: pd.DataFrame,
    feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
    index_col: str = SBML_DFS.S_ID,
    numeric_agg: str = RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
    keep_id_col: bool = True,
) -> pd.DataFrame:
    """
    Resolve many-to-1 and 1-to-many matches in matched data.

    Parameters
    ----------
    matched_data : pd.DataFrame
        DataFrame containing matched data with columns:
        - feature_id_var: identifier column (e.g. feature_id)
        - index_col: index column (e.g. s_id)
        - other columns: data columns to be aggregated
    feature_id_var : str, default="feature_id"
        Name of the identifier column
    index_col : str, default="s_id"
        Name of the column to use as index
    numeric_agg : str, default="weighted_mean"
        Method to aggregate numeric columns:
        - "weighted_mean": weighted by inverse of feature_id frequency (default)
        - "mean": simple arithmetic mean
        - "first": first value after sorting by feature_id_var (requires feature_id_var)
        - "max": maximum value
    keep_id_col : bool, default=True
        Whether to keep and roll up the feature_id_var in the output.
        If False, feature_id_var will be dropped from the output.

    Returns
    -------
    pd.DataFrame
        DataFrame with resolved matches:
        - Many-to-1: numeric columns are aggregated using the specified method
        - 1-to-many: adds a count column showing the number of matches
        - Index is set to index_col and named accordingly

    Raises
    ------
    KeyError
        If feature_id_var is not present in the DataFrame
    TypeError
        If the DataFrame contains unsupported data types (boolean or datetime)
    """
    # Make a copy to avoid modifying input
    df = matched_data.copy()

    # Check for unsupported data types
    unsupported_dtypes = df.select_dtypes(include=["bool", "datetime64"]).columns
    if not unsupported_dtypes.empty:
        raise TypeError(
            f"Unsupported data types found in columns: {list(unsupported_dtypes)}. "
            "Boolean and datetime columns are not supported."
        )

    # Always require feature_id_var
    if feature_id_var not in df.columns:
        raise KeyError(feature_id_var)

    # Deduplicate by feature_id within each s_id using groupby and first BEFORE any further processing
    df = df.groupby([index_col, feature_id_var], sort=False).first().reset_index()

    # Use a unique temporary column name for weights
    if RESOLVE_MATCHES_TMP_WEIGHT_COL in df.columns:
        raise ValueError(
            f"Temporary weight column name '{RESOLVE_MATCHES_TMP_WEIGHT_COL}' already exists in the input data. Please rename or remove this column and try again."
        )

    # Calculate weights if needed (after deduplication!)
    if numeric_agg == RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN:
        feature_counts = df[feature_id_var].value_counts()
        df[RESOLVE_MATCHES_TMP_WEIGHT_COL] = (
            1 / feature_counts[df[feature_id_var]].values
        )

    # Set index for grouping
    df = df.set_index(index_col)

    # Use utility to split columns
    always_non_numeric = [feature_id_var] if keep_id_col else []
    numeric_cols, non_numeric_cols = _split_numeric_non_numeric_columns(
        df, always_non_numeric=always_non_numeric
    )

    # Get aggregator function
    numeric_aggregator = _get_numeric_aggregator(
        method=numeric_agg, feature_id_var=feature_id_var
    )
    resolved = _aggregate_grouped_columns(
        df,
        numeric_cols,
        non_numeric_cols,
        numeric_aggregator,
        feature_id_var=feature_id_var,
        numeric_agg=numeric_agg,
    )
    # Add count of matches per feature_id
    match_counts = matched_data.groupby(index_col)[feature_id_var].nunique()
    resolved[f"{feature_id_var}_match_count"] = match_counts

    # Drop feature_id_var if not keeping it
    if not keep_id_col and feature_id_var in resolved.columns:
        resolved = resolved.drop(columns=[feature_id_var])

    # Ensure index is named consistently
    resolved.index.name = index_col

    return resolved
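A self-contained example of the resolution logic on a toy match table (hypothetical values; default "s_id"/"feature_id" column names per the docstring):

import pandas as pd

from napistu.matching import mount

# s1 is matched by two features (many-to-1); f1 also matches s2 (1-to-many)
matched = pd.DataFrame(
    {
        "s_id": ["s1", "s1", "s2"],
        "feature_id": ["f1", "f2", "f1"],
        "score": [1.0, 3.0, 5.0],
    }
)

resolved = mount.resolve_matches(matched)
# Under the default weighted mean, f1's weight is 1/2 (it matched twice),
# so s1's score resolves to (1.0 * 0.5 + 3.0 * 1.0) / 1.5 ≈ 2.33, and
# feature_id_match_count records 2 matches for s1 and 1 for s2.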


def _get_wide_results_valid_ontologies(
    results_df: pd.DataFrame, ontologies: Optional[Union[str, list]] = None
) -> list:
    """
    Get the valid ontologies for a wide results dataframe.

    If ontologies is a string, it will be converted to a list.
    If ontologies is None, the column names of the results dataframe which match ONTOLOGIES_LIST will be used.

    Parameters
    ----------
    results_df : pd.DataFrame
        The results dataframe to get the valid ontologies for.
    ontologies : str or list, optional
        The ontology (or ontologies) to use for the species identifiers. If not provided, the column names of the results dataframe which match ONTOLOGIES_LIST will be used.

    Returns
    -------
    list
        The valid ontologies for the results dataframe.
    """

    if isinstance(ontologies, str):
        ontologies = [ontologies]  # now, it will be None or a list

    if ontologies is None:
        ontologies = [col for col in results_df.columns if col in ONTOLOGIES_LIST]
        if len(ontologies) == 0:
            raise ValueError(
                "No valid ontologies found in results dataframe. Columns are: "
                + str(results_df.columns)
            )

    if isinstance(ontologies, list):
        invalid_ontologies = set(ontologies) - set(ONTOLOGIES_LIST)
        if len(invalid_ontologies) > 0:
            raise ValueError(
                "Invalid ontologies found in ontologies list: "
                + str(invalid_ontologies)
            )

    return ontologies


def _get_numeric_aggregator(
    method: str = RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
    feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
) -> callable:
    """
    Get aggregation function for numeric columns with various methods.

    Parameters
    ----------
    method : str, default="weighted_mean"
        Aggregation method to use:
        - "weighted_mean": weighted by inverse of feature_id frequency (default)
        - "mean": simple arithmetic mean
        - "first": first value after sorting by feature_id_var (requires feature_id_var)
        - "max": maximum value
    feature_id_var : str, default="feature_id"
        Name of the column specifying a measured feature - used for sorting and weighting

    Returns
    -------
    callable
        Aggregation function to use with groupby

    Raises
    ------
    ValueError
        If method is not recognized
    """

    def weighted_mean(df: pd.DataFrame) -> float:
        # Get values and weights for this group
        values = df["value"]
        weights = df["weight"]
        # Weights are already normalized globally, just use them directly
        return (values * weights).sum() / weights.sum()

    def first_by_id(df: pd.DataFrame) -> float:
        # Sort by feature_id and take first value
        return df.sort_values(feature_id_var).iloc[0]["value"]

    def simple_mean(series: pd.Series) -> float:
        return series.mean()

    def simple_max(series: pd.Series) -> float:
        return series.max()

    aggregators = {
        RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN: weighted_mean,
        RESOLVE_MATCHES_AGGREGATORS.MEAN: simple_mean,
        RESOLVE_MATCHES_AGGREGATORS.FIRST: first_by_id,
        RESOLVE_MATCHES_AGGREGATORS.MAX: simple_max,
    }

    if method not in aggregators:
        raise ValueError(
            f"Unknown aggregation method: {method}. Must be one of {list(aggregators.keys())}"
        )

    return aggregators[method]
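Note the convention weighted_mean relies on: the weights are the globally computed inverse feature frequencies from resolve_matches, and each group re-normalizes them by dividing by their within-group sum. A plain-pandas sketch of the same arithmetic (hypothetical numbers):

import pandas as pd

# one group (a single s_id): f1 appeared twice globally -> weight 0.5
group = pd.DataFrame({"value": [1.0, 3.0], "weight": [0.5, 1.0]})

# (1.0 * 0.5 + 3.0 * 1.0) / (0.5 + 1.0) = 2.33...
weighted_mean = (group["value"] * group["weight"]).sum() / group["weight"].sum()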


def _split_numeric_non_numeric_columns(df: pd.DataFrame, always_non_numeric=None):
    """
    Utility to split DataFrame columns into numeric and non-numeric, always treating specified columns as non-numeric.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame to split.
    always_non_numeric : list or set, optional
        Columns to always treat as non-numeric (e.g., ['feature_id']).

    Returns
    -------
    numeric_cols : pd.Index
        Columns considered numeric (int64, float64, and not in always_non_numeric).
    non_numeric_cols : pd.Index
        Columns considered non-numeric (object, string, etc., plus always_non_numeric).
    """
    if always_non_numeric is None:
        always_non_numeric = []
    always_non_numeric = set(always_non_numeric)
    numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns.difference(
        always_non_numeric
    )
    non_numeric_cols = df.columns.difference(numeric_cols)
    return numeric_cols, non_numeric_cols
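An illustrative call (hypothetical data): a float column lands on the numeric side, while object columns and anything in always_non_numeric stay non-numeric.

import pandas as pd

from napistu.matching import mount

df = pd.DataFrame({"feature_id": ["f1"], "score": [1.0], "label": ["a"]})
numeric, non_numeric = mount._split_numeric_non_numeric_columns(
    df, always_non_numeric=["feature_id"]
)
# numeric -> Index(['score']); non_numeric -> Index(['feature_id', 'label'])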


def _aggregate_grouped_columns(
    df: pd.DataFrame,
    numeric_cols,
    non_numeric_cols,
    numeric_aggregator,
    feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
    numeric_agg: str = RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
) -> pd.DataFrame:
    """
    Aggregate numeric and non-numeric columns for a grouped DataFrame.

    Assumes deduplication by feature_id within each s_id has already been performed.
    Returns the combined DataFrame.
    """
    results = []

    # Handle non-numeric columns
    if len(non_numeric_cols) > 0:
        non_numeric_agg = (
            df[non_numeric_cols]
            .groupby(level=0)
            .agg(lambda x: ",".join(sorted(set(x.astype(str)))))
        )
        results.append(non_numeric_agg)

    # Handle numeric columns
    if len(numeric_cols) > 0:
        numeric_results = {}
        for col in numeric_cols:
            if numeric_agg in [
                RESOLVE_MATCHES_AGGREGATORS.FIRST,
                RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
            ]:
                agg_df = pd.DataFrame(
                    {"value": df[col], feature_id_var: df[feature_id_var]}
                )
                if numeric_agg == RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN:
                    agg_df[RESOLVE_MATCHES_TMP_WEIGHT_COL] = df[
                        RESOLVE_MATCHES_TMP_WEIGHT_COL
                    ]
                numeric_results[col] = agg_df.groupby(level=0).apply(
                    lambda x: (
                        numeric_aggregator(x)
                        if numeric_agg != RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN
                        else numeric_aggregator(
                            x.rename(columns={RESOLVE_MATCHES_TMP_WEIGHT_COL: "weight"})
                        )
                    )
                )
            else:
                numeric_results[col] = df[col].groupby(level=0).agg(numeric_aggregator)
        numeric_agg_df = pd.DataFrame(numeric_results)
        results.append(numeric_agg_df)

    # Combine results
    if results:
        resolved = pd.concat(results, axis=1)
    else:
        resolved = pd.DataFrame(index=df.index)
    return resolved