napistu 0.2.5.dev6__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. napistu/__main__.py +126 -96
  2. napistu/constants.py +35 -41
  3. napistu/context/__init__.py +10 -0
  4. napistu/context/discretize.py +462 -0
  5. napistu/context/filtering.py +387 -0
  6. napistu/gcs/__init__.py +1 -1
  7. napistu/identifiers.py +74 -15
  8. napistu/indices.py +68 -0
  9. napistu/ingestion/__init__.py +1 -1
  10. napistu/ingestion/bigg.py +47 -62
  11. napistu/ingestion/constants.py +18 -133
  12. napistu/ingestion/gtex.py +113 -0
  13. napistu/ingestion/hpa.py +147 -0
  14. napistu/ingestion/sbml.py +0 -97
  15. napistu/ingestion/string.py +2 -2
  16. napistu/matching/__init__.py +10 -0
  17. napistu/matching/constants.py +18 -0
  18. napistu/matching/interactions.py +518 -0
  19. napistu/matching/mount.py +529 -0
  20. napistu/matching/species.py +510 -0
  21. napistu/mcp/__init__.py +7 -4
  22. napistu/mcp/__main__.py +128 -72
  23. napistu/mcp/client.py +16 -25
  24. napistu/mcp/codebase.py +201 -153
  25. napistu/mcp/component_base.py +170 -0
  26. napistu/mcp/config.py +223 -0
  27. napistu/mcp/constants.py +45 -2
  28. napistu/mcp/documentation.py +253 -136
  29. napistu/mcp/documentation_utils.py +13 -48
  30. napistu/mcp/execution.py +372 -305
  31. napistu/mcp/health.py +49 -67
  32. napistu/mcp/profiles.py +10 -6
  33. napistu/mcp/server.py +161 -80
  34. napistu/mcp/tutorials.py +139 -87
  35. napistu/modify/__init__.py +1 -1
  36. napistu/modify/gaps.py +1 -1
  37. napistu/network/__init__.py +1 -1
  38. napistu/network/constants.py +101 -34
  39. napistu/network/data_handling.py +388 -0
  40. napistu/network/ig_utils.py +351 -0
  41. napistu/network/napistu_graph_core.py +354 -0
  42. napistu/network/neighborhoods.py +40 -40
  43. napistu/network/net_create.py +373 -309
  44. napistu/network/net_propagation.py +47 -19
  45. napistu/network/{net_utils.py → ng_utils.py} +124 -272
  46. napistu/network/paths.py +67 -51
  47. napistu/network/precompute.py +11 -11
  48. napistu/ontologies/__init__.py +10 -0
  49. napistu/ontologies/constants.py +129 -0
  50. napistu/ontologies/dogma.py +243 -0
  51. napistu/ontologies/genodexito.py +649 -0
  52. napistu/ontologies/mygene.py +369 -0
  53. napistu/ontologies/renaming.py +198 -0
  54. napistu/rpy2/__init__.py +229 -86
  55. napistu/rpy2/callr.py +47 -77
  56. napistu/rpy2/constants.py +24 -23
  57. napistu/rpy2/rids.py +61 -648
  58. napistu/sbml_dfs_core.py +587 -222
  59. napistu/scverse/__init__.py +15 -0
  60. napistu/scverse/constants.py +28 -0
  61. napistu/scverse/loading.py +727 -0
  62. napistu/utils.py +118 -10
  63. {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/METADATA +8 -3
  64. napistu-0.3.1.dist-info/RECORD +133 -0
  65. tests/conftest.py +22 -0
  66. tests/test_context_discretize.py +56 -0
  67. tests/test_context_filtering.py +267 -0
  68. tests/test_identifiers.py +100 -0
  69. tests/test_indices.py +65 -0
  70. tests/{test_edgelist.py → test_ingestion_napistu_edgelist.py} +2 -2
  71. tests/test_matching_interactions.py +108 -0
  72. tests/test_matching_mount.py +305 -0
  73. tests/test_matching_species.py +394 -0
  74. tests/test_mcp_config.py +193 -0
  75. tests/test_mcp_documentation_utils.py +12 -3
  76. tests/test_mcp_server.py +356 -0
  77. tests/test_network_data_handling.py +397 -0
  78. tests/test_network_ig_utils.py +23 -0
  79. tests/test_network_neighborhoods.py +19 -0
  80. tests/test_network_net_create.py +459 -0
  81. tests/test_network_ng_utils.py +30 -0
  82. tests/test_network_paths.py +56 -0
  83. tests/{test_precomputed_distances.py → test_network_precompute.py} +8 -6
  84. tests/test_ontologies_genodexito.py +58 -0
  85. tests/test_ontologies_mygene.py +39 -0
  86. tests/test_ontologies_renaming.py +110 -0
  87. tests/test_rpy2_callr.py +79 -0
  88. tests/test_rpy2_init.py +151 -0
  89. tests/test_sbml.py +0 -31
  90. tests/test_sbml_dfs_core.py +134 -10
  91. tests/test_scverse_loading.py +778 -0
  92. tests/test_set_coverage.py +2 -2
  93. tests/test_utils.py +121 -1
  94. napistu/mechanism_matching.py +0 -1353
  95. napistu/rpy2/netcontextr.py +0 -467
  96. napistu-0.2.5.dev6.dist-info/RECORD +0 -97
  97. tests/test_igraph.py +0 -367
  98. tests/test_mechanism_matching.py +0 -784
  99. tests/test_net_utils.py +0 -149
  100. tests/test_netcontextr.py +0 -105
  101. tests/test_rpy2.py +0 -61
  102. /napistu/ingestion/{cpr_edgelist.py → napistu_edgelist.py} +0 -0
  103. {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/WHEEL +0 -0
  104. {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/entry_points.txt +0 -0
  105. {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/licenses/LICENSE +0 -0
  106. {napistu-0.2.5.dev6.dist-info → napistu-0.3.1.dist-info}/top_level.txt +0 -0
  107. /tests/{test_obo.py → test_ingestion_obo.py} +0 -0
napistu/matching/mount.py (new file)
@@ -0,0 +1,529 @@
+ import copy
+ import logging
+ from typing import Optional, Union, Set, Dict
+
+ import pandas as pd
+
+ from napistu.constants import SBML_DFS, ONTOLOGIES_LIST
+ from napistu.matching.constants import (
+     FEATURE_ID_VAR_DEFAULT,
+     RESOLVE_MATCHES_AGGREGATORS,
+     RESOLVE_MATCHES_TMP_WEIGHT_COL,
+     BIND_DICT_OF_WIDE_RESULTS_STRATEGIES,
+     BIND_DICT_OF_WIDE_RESULTS_STRATEGIES_LIST,
+ )
+ from napistu import identifiers, utils
+ from napistu.matching.species import match_features_to_wide_pathway_species
+ from napistu import sbml_dfs_core
+
+ logger = logging.getLogger(__name__)
+
+
+ def bind_wide_results(
+     sbml_dfs: sbml_dfs_core.SBML_dfs,
+     results_df: pd.DataFrame,
+     results_name: str,
+     ontologies: Optional[Union[Set[str], Dict[str, str]]] = None,
+     dogmatic: bool = False,
+     species_identifiers: Optional[pd.DataFrame] = None,
+     feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
+     numeric_agg: str = RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
+     keep_id_col: bool = True,
+     verbose: bool = False,
+     inplace: bool = True,
+ ) -> Optional[sbml_dfs_core.SBML_dfs]:
+     """
+     Bind wide results to an SBML_dfs object.
+
+     Takes a table of molecular species-level attributes keyed by systematic
+     identifiers, matches it to the species in an SBML_dfs model, and transfers
+     the attributes to that model's species_data.
+
+     Parameters
+     ----------
+     sbml_dfs : sbml_dfs_core.SBML_dfs
+         The sbml_dfs object to bind the results to.
+     results_df : pd.DataFrame
+         The table containing the results to bind.
+     results_name : str
+         The name of the results to bind.
+     ontologies : Optional[Union[Set[str], Dict[str, str]]], default=None
+         Either:
+         - Set of columns to treat as ontologies (these should be entries in ONTOLOGIES_LIST)
+         - Dict mapping wide column names to ontology names in the ONTOLOGIES_LIST controlled vocabulary
+         - None to automatically detect valid ontology columns based on ONTOLOGIES_LIST
+     dogmatic : bool
+         Whether to respect differences between genes, transcripts, and proteins (True) or ignore them (False).
+     species_identifiers : Optional[pd.DataFrame]
+         Systematic identifiers for the molecular species in `sbml_dfs`. If None, these are generated on-the-fly.
+     feature_id_var : str
+         The name of the column in results_df that contains the feature identifiers. If it does not exist, it will be created.
+     numeric_agg : str
+         The aggregation method to use for resolving degeneracy.
+     keep_id_col : bool
+         Whether to keep the identifier column in the results_df.
+     verbose : bool
+         Whether to log cases of 1-to-many and many-to-1 mapping and to indicate the behavior for resolving degeneracy.
+     inplace : bool, default=True
+         Whether to modify the sbml_dfs object in place. If False, returns a copy.
+
+     Returns
+     -------
+     Optional[sbml_dfs_core.SBML_dfs]
+         If inplace=True, returns None. Otherwise returns the sbml_dfs object with the results bound.
+     """
+
+     if not inplace:
+         sbml_dfs = copy.deepcopy(sbml_dfs)
+
+     species_identifiers = identifiers._prepare_species_identifiers(
+         sbml_dfs, dogmatic=dogmatic, species_identifiers=species_identifiers
+     )
+
+     # match features to pathway species via shared ontology identifiers
+     matched_s_ids_from_wide = match_features_to_wide_pathway_species(
+         results_df,
+         species_identifiers,
+         ontologies=ontologies,
+         feature_id_var=feature_id_var,
+         verbose=verbose,
+     )
+
+     disambiguated_matches = resolve_matches(
+         matched_data=matched_s_ids_from_wide,
+         feature_id_var=feature_id_var,
+         numeric_agg=numeric_agg,
+         keep_id_col=keep_id_col,
+     )
+
+     clean_species_data = utils.drop_extra_cols(
+         results_df, disambiguated_matches, always_include=[feature_id_var]
+     )
+
+     sbml_dfs.add_species_data(results_name, clean_species_data)
+
+     return None if inplace else sbml_dfs
+
+
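A minimal usage sketch of `bind_wide_results` (illustrative only, not part of the diff). It assumes an already-constructed `SBML_dfs` model and that `ensembl_gene` is an entry in `ONTOLOGIES_LIST`; the `log2fc` column name is made up:

```python
import pandas as pd
from napistu.matching.mount import bind_wide_results

# Hypothetical wide results: one row per measured feature, one ontology
# column ("ensembl_gene", assumed to be in ONTOLOGIES_LIST) plus numeric data.
results_df = pd.DataFrame(
    {
        "ensembl_gene": ["ENSG00000000003", "ENSG00000000005"],
        "log2fc": [1.4, -0.7],
    }
)

# sbml_dfs is an existing sbml_dfs_core.SBML_dfs model (construction not shown).
# With the default inplace=True, the matched attributes land in
# sbml_dfs.species_data["rnaseq"].
bind_wide_results(sbml_dfs, results_df, results_name="rnaseq")
```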
+ def bind_dict_of_wide_results(
+     sbml_dfs: sbml_dfs_core.SBML_dfs,
+     results_dict: dict,
+     results_name: str,
+     strategy: str = BIND_DICT_OF_WIDE_RESULTS_STRATEGIES.CONTATENATE,
+     species_identifiers: Optional[pd.DataFrame] = None,
+     ontologies: Optional[Union[str, list]] = None,
+     dogmatic: bool = False,
+     inplace: bool = True,
+     verbose: bool = True,
+ ):
+     """
+     Bind a dictionary of wide results to an SBML_dfs object.
+
+     This function binds a dictionary of wide results to one or more species_data attributes of an SBML_dfs object.
+     The dictionary's keys are modality names and its values are the corresponding results dataframes.
+     The "strategy" argument controls how the results are added to the SBML_dfs object.
+
+     Parameters
+     ----------
+     sbml_dfs : SBML_dfs
+         The SBML_dfs object to bind the results to.
+     results_dict : dict
+         A dictionary of results dataframes with modality names as keys.
+     results_name : str
+         The name of the species_data attribute to bind the results to.
+     strategy : str
+         The strategy to use for binding the results.
+
+         Options are:
+         - "concatenate" : stack the results dataframes row-wise and add them as a single attribute.
+         - "multiple_keys" : add each modality's results as a separate attribute named f'{results_name}_{modality}'.
+         - "stagger" : combine all modalities column-wise into a single attribute, renaming each modality's non-ontology columns to f'{column}_{modality}'.
+     species_identifiers : pd.DataFrame
+         A dataframe with species identifiers.
+     ontologies : optional str, list
+         The ontologies to use for the species identifiers. If not provided, the column names of the results dataframes which match ONTOLOGIES_LIST will be used.
+     dogmatic : bool
+         Whether to use dogmatic mode. Ignored if species_identifiers is provided.
+     inplace : bool, default=True
+         Whether to modify the sbml_dfs object in place. If False, returns a copy.
+     verbose : bool
+         Whether to print verbose output.
+
+     Returns
+     -------
+     Optional[SBML_dfs]
+         If inplace=True, returns None. Otherwise returns the modified copy of sbml_dfs.
+     """
+
+     # validate strategy
+     if strategy not in BIND_DICT_OF_WIDE_RESULTS_STRATEGIES_LIST:
+         raise ValueError(
+             f"Invalid strategy: {strategy}. Must be one of {BIND_DICT_OF_WIDE_RESULTS_STRATEGIES_LIST}"
+         )
+
+     species_identifiers = identifiers._prepare_species_identifiers(
+         sbml_dfs, dogmatic=dogmatic, species_identifiers=species_identifiers
+     )
+
+     if not inplace:
+         sbml_dfs = copy.deepcopy(sbml_dfs)
+
+     if strategy == BIND_DICT_OF_WIDE_RESULTS_STRATEGIES.MULTIPLE_KEYS:
+         for modality, results_df in results_dict.items():
+             valid_ontologies = _get_wide_results_valid_ontologies(
+                 results_df, ontologies
+             )
+
+             modality_results_name = f"{results_name}_{modality}"
+
+             bind_wide_results(
+                 sbml_dfs,
+                 results_df,
+                 modality_results_name,
+                 species_identifiers=species_identifiers,
+                 ontologies=valid_ontologies,
+                 inplace=True,  # always use inplace=True here since copying is handled above
+                 verbose=verbose,
+             )
+
+         return None if inplace else sbml_dfs
+
+     # create either a concatenated or staggered results table
+     if strategy == BIND_DICT_OF_WIDE_RESULTS_STRATEGIES.CONTATENATE:
+         results_df = pd.concat(results_dict.values(), axis=0)
+     elif strategy == BIND_DICT_OF_WIDE_RESULTS_STRATEGIES.STAGGER:
+         results_dict_copy = results_dict.copy()
+         for k, v in results_dict_copy.items():
+             valid_ontologies = _get_wide_results_valid_ontologies(v, ontologies)
+
+             if verbose:
+                 logger.info(
+                     f"Modality {k} has ontologies {valid_ontologies}. Other variables will be renamed to <variable>_{k}"
+                 )
+
+             # rename all the columns besides the ontology columns
+             for var in v.columns:
+                 if var not in valid_ontologies:
+                     results_dict_copy[k].rename(
+                         columns={var: f"{var}_{k}"}, inplace=True
+                     )
+
+         results_df = pd.concat(results_dict_copy.values(), axis=1)
+
+     valid_ontologies = _get_wide_results_valid_ontologies(results_df, ontologies)
+
+     bind_wide_results(
+         sbml_dfs,
+         results_df,
+         results_name,
+         species_identifiers=species_identifiers,
+         ontologies=valid_ontologies,
+         inplace=True,  # always use inplace=True here since copying is handled above
+         verbose=verbose,
+     )
+
+     return None if inplace else sbml_dfs
+
+
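The three strategies differ only in how the per-modality tables are combined before matching. A hedged sketch with a hypothetical two-modality dict (again assuming an existing `sbml_dfs` and that `ensembl_gene` is in `ONTOLOGIES_LIST`):

```python
import pandas as pd
from napistu.matching.constants import BIND_DICT_OF_WIDE_RESULTS_STRATEGIES
from napistu.matching.mount import bind_dict_of_wide_results

# Hypothetical modalities sharing an assumed "ensembl_gene" ontology column.
results_dict = {
    "rna": pd.DataFrame({"ensembl_gene": ["ENSG00000000003"], "log2fc": [1.4]}),
    "protein": pd.DataFrame({"ensembl_gene": ["ENSG00000000003"], "log2fc": [0.9]}),
}

# CONTATENATE (sic): rows stacked into a single species_data table "omics".
# MULTIPLE_KEYS: two tables, "omics_rna" and "omics_protein".
# STAGGER: one table whose non-ontology columns become log2fc_rna / log2fc_protein.
bind_dict_of_wide_results(
    sbml_dfs,  # an existing SBML_dfs model (construction not shown)
    results_dict,
    results_name="omics",
    strategy=BIND_DICT_OF_WIDE_RESULTS_STRATEGIES.STAGGER,
)
```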
+ def resolve_matches(
+     matched_data: pd.DataFrame,
+     feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
+     index_col: str = SBML_DFS.S_ID,
+     numeric_agg: str = RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
+     keep_id_col: bool = True,
+ ) -> pd.DataFrame:
+     """
+     Resolve many-to-1 and 1-to-many matches in matched data.
+
+     Parameters
+     ----------
+     matched_data : pd.DataFrame
+         DataFrame containing matched data with columns:
+         - feature_id_var: identifier column (e.g. feature_id)
+         - index_col: index column (e.g. s_id)
+         - other columns: data columns to be aggregated
+     feature_id_var : str, default="feature_id"
+         Name of the identifier column
+     index_col : str, default="s_id"
+         Name of the column to use as index
+     numeric_agg : str, default="weighted_mean"
+         Method to aggregate numeric columns:
+         - "weighted_mean": weighted by inverse of feature_id frequency (default)
+         - "mean": simple arithmetic mean
+         - "first": first value after sorting by feature_id_var (requires feature_id_var)
+         - "max": maximum value
+     keep_id_col : bool, default=True
+         Whether to keep and roll up the feature_id_var in the output.
+         If False, feature_id_var will be dropped from the output.
+
+     Returns
+     -------
+     pd.DataFrame
+         DataFrame with resolved matches:
+         - Many-to-1: numeric columns are aggregated using the specified method
+         - 1-to-many: adds a count column showing the number of matches
+         - Index is set to index_col and named accordingly
+
+     Raises
+     ------
+     KeyError
+         If feature_id_var is not present in the DataFrame
+     TypeError
+         If the DataFrame contains unsupported data types (boolean or datetime)
+     """
+     # Make a copy to avoid modifying input
+     df = matched_data.copy()
+
+     # Check for unsupported data types
+     unsupported_dtypes = df.select_dtypes(include=["bool", "datetime64"]).columns
+     if not unsupported_dtypes.empty:
+         raise TypeError(
+             f"Unsupported data types found in columns: {list(unsupported_dtypes)}. "
+             "Boolean and datetime columns are not supported."
+         )
+
+     # Always require feature_id_var
+     if feature_id_var not in df.columns:
+         raise KeyError(feature_id_var)
+
+     # Deduplicate by feature_id within each s_id using groupby and first BEFORE any further processing
+     df = df.groupby([index_col, feature_id_var], sort=False).first().reset_index()
+
+     # Use a unique temporary column name for weights
+     if RESOLVE_MATCHES_TMP_WEIGHT_COL in df.columns:
+         raise ValueError(
+             f"Temporary weight column name '{RESOLVE_MATCHES_TMP_WEIGHT_COL}' already exists in the input data. Please rename or remove this column and try again."
+         )
+
+     # Calculate weights if needed (after deduplication!)
+     if numeric_agg == RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN:
+         feature_counts = df[feature_id_var].value_counts()
+         df[RESOLVE_MATCHES_TMP_WEIGHT_COL] = (
+             1 / feature_counts[df[feature_id_var]].values
+         )
+
+     # Set index for grouping
+     df = df.set_index(index_col)
+
+     # Use utility to split columns
+     always_non_numeric = [feature_id_var] if keep_id_col else []
+     numeric_cols, non_numeric_cols = _split_numeric_non_numeric_columns(
+         df, always_non_numeric=always_non_numeric
+     )
+
+     # Get aggregator function
+     numeric_aggregator = _get_numeric_aggregator(
+         method=numeric_agg, feature_id_var=feature_id_var
+     )
+     resolved = _aggregate_grouped_columns(
+         df,
+         numeric_cols,
+         non_numeric_cols,
+         numeric_aggregator,
+         feature_id_var=feature_id_var,
+         numeric_agg=numeric_agg,
+     )
+
+     # Add the count of distinct matching features per index entry
+     match_counts = matched_data.groupby(index_col)[feature_id_var].nunique()
+     resolved[f"{feature_id_var}_match_count"] = match_counts
+
+     # Drop feature_id_var if not keeping it
+     if not keep_id_col and feature_id_var in resolved.columns:
+         resolved = resolved.drop(columns=[feature_id_var])
+
+     # Ensure index is named consistently
+     resolved.index.name = index_col
+
+     return resolved
+
+
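The inverse-frequency weighting is easiest to see on a toy table. A sketch using the documented default column names ("s_id", "feature_id"); the "expr" column is made up:

```python
import pandas as pd
from napistu.matching.mount import resolve_matches

# f1 matches one species (weight 1.0); f2 matches two (weight 0.5 each).
matched = pd.DataFrame(
    {
        "s_id": ["S1", "S1", "S2"],
        "feature_id": ["f1", "f2", "f2"],
        "expr": [1.0, 3.0, 3.0],
    }
)

resolved = resolve_matches(matched)
# S1's "expr" is (1.0 * 1.0 + 3.0 * 0.5) / (1.0 + 0.5) ≈ 1.67; S2's is 3.0.
# resolved["feature_id_match_count"] is 2 for S1 and 1 for S2.
```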
+ def _get_wide_results_valid_ontologies(
+     results_df: pd.DataFrame, ontologies: Optional[Union[str, list]] = None
+ ) -> list:
+     """
+     Get the valid ontologies for a wide results dataframe.
+
+     If ontologies is a string, it will be converted to a list.
+     If ontologies is None, the column names of the results dataframe which match ONTOLOGIES_LIST will be used.
+
+     Parameters
+     ----------
+     results_df : pd.DataFrame
+         The results dataframe to get the valid ontologies for.
+     ontologies : optional str, list
+         The ontologies to use for the species identifiers. If not provided, the column names of the results dataframe which match ONTOLOGIES_LIST will be used.
+
+     Returns
+     -------
+     list
+         The valid ontologies for the results dataframe.
+     """
+
+     if isinstance(ontologies, str):
+         ontologies = [ontologies]  # now, it will be None or a list
+
+     if ontologies is None:
+         ontologies = [col for col in results_df.columns if col in ONTOLOGIES_LIST]
+         if len(ontologies) == 0:
+             raise ValueError(
+                 "No valid ontologies found in results dataframe. Columns are: "
+                 + str(results_df.columns)
+             )
+
+     if isinstance(ontologies, list):
+         invalid_ontologies = set(ontologies) - set(ONTOLOGIES_LIST)
+         if len(invalid_ontologies) > 0:
+             raise ValueError(
+                 "Invalid ontologies found in ontologies list: "
+                 + str(invalid_ontologies)
+             )
+
+     return ontologies
+
+
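Ontology resolution is a plain membership test against `ONTOLOGIES_LIST`. A sketch of the private helper's behavior, assuming `ensembl_gene` is a valid entry and `log2fc` is not:

```python
import pandas as pd
from napistu.matching.mount import _get_wide_results_valid_ontologies

df = pd.DataFrame({"ensembl_gene": ["ENSG00000000003"], "log2fc": [1.4]})

_get_wide_results_valid_ontologies(df)                  # ["ensembl_gene"], auto-detected
_get_wide_results_valid_ontologies(df, "ensembl_gene")  # same, passed explicitly
_get_wide_results_valid_ontologies(df, "log2fc")        # ValueError: not in ONTOLOGIES_LIST
```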
+ def _get_numeric_aggregator(
+     method: str = RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
+     feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
+ ) -> callable:
+     """
+     Get aggregation function for numeric columns with various methods.
+
+     Parameters
+     ----------
+     method : str, default="weighted_mean"
+         Aggregation method to use:
+         - "weighted_mean": weighted by inverse of feature_id frequency (default)
+         - "mean": simple arithmetic mean
+         - "first": first value after sorting by feature_id_var (requires feature_id_var)
+         - "max": maximum value
+     feature_id_var : str, default="feature_id"
+         Name of the column specifying a measured feature - used for sorting and weighting
+
+     Returns
+     -------
+     callable
+         Aggregation function to use with groupby
+
+     Raises
+     ------
+     ValueError
+         If method is not recognized
+     """
+
+     def weighted_mean(df: pd.DataFrame) -> float:
+         # Get values and weights for this group
+         values = df["value"]
+         weights = df["weight"]
+         # Weights are already normalized globally, just use them directly
+         return (values * weights).sum() / weights.sum()
+
+     def first_by_id(df: pd.DataFrame) -> float:
+         # Sort by feature_id and take the first value
+         return df.sort_values(feature_id_var).iloc[0]["value"]
+
+     def simple_mean(series: pd.Series) -> float:
+         return series.mean()
+
+     def simple_max(series: pd.Series) -> float:
+         return series.max()
+
+     aggregators = {
+         RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN: weighted_mean,
+         RESOLVE_MATCHES_AGGREGATORS.MEAN: simple_mean,
+         RESOLVE_MATCHES_AGGREGATORS.FIRST: first_by_id,
+         RESOLVE_MATCHES_AGGREGATORS.MAX: simple_max,
+     }
+
+     if method not in aggregators:
+         raise ValueError(
+             f"Unknown aggregation method: {method}. Must be one of {list(aggregators.keys())}"
+         )
+
+     return aggregators[method]
+
+
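The weighted-mean closure operates on a per-group frame with literal "value" and "weight" columns (the caller renames the temporary weight column before applying it). A minimal sketch of the arithmetic:

```python
import pandas as pd
from napistu.matching.constants import RESOLVE_MATCHES_AGGREGATORS
from napistu.matching.mount import _get_numeric_aggregator

agg = _get_numeric_aggregator(RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN)
group = pd.DataFrame({"value": [1.0, 3.0], "weight": [1.0, 0.5]})
agg(group)  # (1.0 * 1.0 + 3.0 * 0.5) / 1.5 ≈ 1.67
```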
+ def _split_numeric_non_numeric_columns(df: pd.DataFrame, always_non_numeric=None):
+     """
+     Utility to split DataFrame columns into numeric and non-numeric, always treating specified columns as non-numeric.
+
+     Parameters
+     ----------
+     df : pd.DataFrame
+         The DataFrame to split.
+     always_non_numeric : list or set, optional
+         Columns to always treat as non-numeric (e.g., ['feature_id']).
+
+     Returns
+     -------
+     numeric_cols : pd.Index
+         Columns considered numeric (int64, float64, and not in always_non_numeric).
+     non_numeric_cols : pd.Index
+         Columns considered non-numeric (object, string, etc., plus always_non_numeric).
+     """
+     if always_non_numeric is None:
+         always_non_numeric = []
+     always_non_numeric = set(always_non_numeric)
+     numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns.difference(
+         always_non_numeric
+     )
+     non_numeric_cols = df.columns.difference(numeric_cols)
+     return numeric_cols, non_numeric_cols
+
+
+ def _aggregate_grouped_columns(
+     df: pd.DataFrame,
+     numeric_cols,
+     non_numeric_cols,
+     numeric_aggregator,
+     feature_id_var: str = FEATURE_ID_VAR_DEFAULT,
+     numeric_agg: str = RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
+ ) -> pd.DataFrame:
+     """
+     Aggregate numeric and non-numeric columns for a grouped DataFrame.
+
+     Assumes deduplication by feature_id within each s_id has already been performed.
+     Returns the combined DataFrame.
+     """
+     results = []
+
+     # Handle non-numeric columns
+     if len(non_numeric_cols) > 0:
+         non_numeric_agg = (
+             df[non_numeric_cols]
+             .groupby(level=0)
+             .agg(lambda x: ",".join(sorted(set(x.astype(str)))))
+         )
+         results.append(non_numeric_agg)
+
+     # Handle numeric columns
+     if len(numeric_cols) > 0:
+         numeric_results = {}
+         for col in numeric_cols:
+             if numeric_agg in [
+                 RESOLVE_MATCHES_AGGREGATORS.FIRST,
+                 RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN,
+             ]:
+                 agg_df = pd.DataFrame(
+                     {"value": df[col], feature_id_var: df[feature_id_var]}
+                 )
+                 if numeric_agg == RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN:
+                     agg_df[RESOLVE_MATCHES_TMP_WEIGHT_COL] = df[
+                         RESOLVE_MATCHES_TMP_WEIGHT_COL
+                     ]
+                 numeric_results[col] = agg_df.groupby(level=0).apply(
+                     lambda x: (
+                         numeric_aggregator(x)
+                         if numeric_agg != RESOLVE_MATCHES_AGGREGATORS.WEIGHTED_MEAN
+                         else numeric_aggregator(
+                             x.rename(columns={RESOLVE_MATCHES_TMP_WEIGHT_COL: "weight"})
+                         )
+                     )
+                 )
+             else:
+                 numeric_results[col] = df[col].groupby(level=0).agg(numeric_aggregator)
+         numeric_agg_df = pd.DataFrame(numeric_results)
+         results.append(numeric_agg_df)
+
+     # Combine results
+     if results:
+         resolved = pd.concat(results, axis=1)
+     else:
+         resolved = pd.DataFrame(index=df.index)
+     return resolved
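For non-numeric columns, degeneracy is resolved by joining the sorted unique string values per group. A self-contained pandas sketch mirroring the lambda above:

```python
import pandas as pd

# Two rows for S1 collapse to a comma-joined, sorted, deduplicated string.
s = pd.DataFrame({"name": ["b", "a", "a"]}, index=["S1", "S1", "S2"])
s.groupby(level=0).agg(lambda x: ",".join(sorted(set(x.astype(str)))))
#     name
# S1   a,b
# S2   a
```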