masster 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (55)
  1. masster/__init__.py +27 -27
  2. masster/_version.py +17 -17
  3. masster/chromatogram.py +497 -503
  4. masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.featureXML +199787 -0
  5. masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.sample5 +0 -0
  6. masster/logger.py +318 -244
  7. masster/sample/__init__.py +9 -9
  8. masster/sample/defaults/__init__.py +15 -15
  9. masster/sample/defaults/find_adducts_def.py +325 -325
  10. masster/sample/defaults/find_features_def.py +366 -366
  11. masster/sample/defaults/find_ms2_def.py +285 -285
  12. masster/sample/defaults/get_spectrum_def.py +314 -318
  13. masster/sample/defaults/sample_def.py +374 -378
  14. masster/sample/h5.py +1321 -1297
  15. masster/sample/helpers.py +833 -364
  16. masster/sample/lib.py +762 -0
  17. masster/sample/load.py +1220 -1187
  18. masster/sample/parameters.py +131 -131
  19. masster/sample/plot.py +1610 -1622
  20. masster/sample/processing.py +1402 -1416
  21. masster/sample/quant.py +209 -0
  22. masster/sample/sample.py +391 -387
  23. masster/sample/sample5_schema.json +181 -181
  24. masster/sample/save.py +737 -719
  25. masster/sample/sciex.py +1213 -0
  26. masster/spectrum.py +1287 -1319
  27. masster/study/__init__.py +9 -9
  28. masster/study/defaults/__init__.py +21 -19
  29. masster/study/defaults/align_def.py +267 -267
  30. masster/study/defaults/export_def.py +41 -40
  31. masster/study/defaults/fill_chrom_def.py +264 -264
  32. masster/study/defaults/fill_def.py +260 -0
  33. masster/study/defaults/find_consensus_def.py +256 -256
  34. masster/study/defaults/find_ms2_def.py +163 -163
  35. masster/study/defaults/integrate_chrom_def.py +225 -225
  36. masster/study/defaults/integrate_def.py +221 -0
  37. masster/study/defaults/merge_def.py +256 -0
  38. masster/study/defaults/study_def.py +272 -269
  39. masster/study/export.py +674 -287
  40. masster/study/h5.py +1398 -886
  41. masster/study/helpers.py +1650 -433
  42. masster/study/helpers_optimized.py +317 -0
  43. masster/study/load.py +1201 -1078
  44. masster/study/parameters.py +99 -99
  45. masster/study/plot.py +632 -645
  46. masster/study/processing.py +1057 -1046
  47. masster/study/save.py +149 -134
  48. masster/study/study.py +606 -522
  49. masster/study/study5_schema.json +247 -241
  50. {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/METADATA +15 -10
  51. masster-0.3.0.dist-info/RECORD +59 -0
  52. {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/licenses/LICENSE +661 -661
  53. masster-0.2.4.dist-info/RECORD +0 -50
  54. {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/WHEEL +0 -0
  55. {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/entry_points.txt +0 -0
masster/sample/helpers.py CHANGED
@@ -1,364 +1,833 @@
- from __future__ import annotations
-
- import polars as pl
-
- import numpy as np
-
- # Parameters removed - using hardcoded defaults
-
-
- def get_dda_stats(self):
-     # filter self.scans_df with mslevel 1
-     ms1 = self.scans_df.filter(pl.col("ms_level") == 1)
-     return ms1
-
-
- # TODO
-
-
- def get_feature(self, feature_uid):
-     # get the feature with feature_uid == feature_uid
-     feature = self.features_df.filter(pl.col("feature_uid") == feature_uid)
-     if len(feature) == 0:
-         self.logger.warning(f"Feature {feature_uid} not found.")
-         return None
-     else:
-         return feature.row(0, named=True)
-
-
- def _get_scan_uids(self, scans=None, verbose=True):
-     if scans is None:
-         # fromuids scan all get_dfans
-         scans_uids = self.scans_df.get_column("scan_uid").to_list()
-     elif isinstance(scans, list):
-         # if scans is a list, ensure all elements are valid scan_uids
-         scans_uids = [
-             s for s in scans if s in self.scans_df.get_column("scan_uid").to_list()
-         ]
-         if verbose and not scans_uids:
-             self.logger.error("No valid scan_uids provided.")
-
-     return scans_uids
-
-
- def _get_feature_uids(self, features=None, verbose=True):
-     if features is None:
-         # fromuids scan all get_dfans
-         feature_uids = self.features_df.get_column("feature_uid").to_list()
-     elif isinstance(features, list):
-         # if features is a list, ensure all elements are valid feature_uids
-         feature_uids = [
-             f
-             for f in features
-             if f in self.features_df.get_column("feature_uid").to_list()
-         ]
-         if verbose and not feature_uids:
-             self.logger.error("No valid feature_uids provided.")
-
-     return feature_uids
-
-
- def get_scan(self, scans: list | None = None, verbose=True):
-     scan_uids = self._get_scan_uids(scans, verbose=False)
-     if not scan_uids:
-         if verbose:
-             self.logger.warning("No valid scan_uids provided.")
-         return None
-
-     scan = self.scans_df.filter(pl.col("scan_uid").is_in(scan_uids))
-     return scan
-
-
- def find_closest_scan(
-     self,
-     rt,
-     prec_mz=None,
-     mz_tol=0.01,
- ):
-     """
-     Find the closest scan based on retention time (rt), applying additional filtering on precursor m/z (prec_mz) if provided.
-     Parameters:
-         rt (float): The target retention time to find the closest scan.
-         prec_mz (float, optional): The precursor m/z value used to filter scans. If given, only scans with ms_level 2 are considered
-             and filtered to include only those within mz_tol of prec_mz.
-         mz_tol (float, optional): The tolerance to apply when filtering scans by precursor m/z. Defaults to 0.01.
-     Returns:
-         dict or None: A dictionary representing the closest scan if a matching scan is found;
-             otherwise, returns None.
-     Notes:
-         - If the scans_df attribute is None, the function prints an error message and returns None.
-         - When prec_mz is provided, it filters scans where ms_level equals 2 and the precursor m/z is within the given mz_tol range.
-         - If prec_mz is not provided, scans with ms_level equal to 1 are considered.
-         - The function calculates the absolute difference between each scan's rt and the given rt, sorting the scans by this difference.
-         - If no scans match the criteria, an error message is printed before returning None.
-     """
-     # check if scans_df is None
-     if self.scans_df is None:
-         self.logger.warning("No scans found.")
-         return None
-     if prec_mz is not None:
-         ms_level = 2
-         scans = self.scans_df.filter(pl.col("ms_level") == ms_level)
-         # find all scans with prec_mz within mz_tol of prec_mz
-         scans = scans.filter(pl.col("prec_mz") > prec_mz - mz_tol)
-         scans = scans.filter(pl.col("prec_mz") < prec_mz + mz_tol)
-         # sort by distance to rt
-         scans = scans.with_columns((pl.col("rt") - rt).abs().alias("rt_diff"))
-         scans = scans.sort("rt_diff")
-         # return the closest scan
-         if len(scans) > 0:
-             scan = scans[0]
-         else:
-             self.logger.warning(
-                 f"No scans found with prec_mz {prec_mz} within {mz_tol} of rt {rt}.",
-             )
-             return None
-     else:
-         mslevel = 1
-         scans = self.scans_df.filter(pl.col("ms_level") == mslevel)
-         # sort by distance to rt
-         scans = scans.with_columns((pl.col("rt") - rt).abs().alias("rt_diff"))
-         scans = scans.sort("rt_diff")
-         # return the closest scan
-         if len(scans) > 0:
-             scan = scans[0]
-         else:
-             self.logger.warning(
-                 f"No scans found with ms_level {mslevel} within {mz_tol} of rt {rt}.",
-             )
-             return None
-     # convert to dict
-
-     return scan.row(0, named=True)
-
-
- # TODO the variables here do not follow the rest (mz, rt being tuples, etc.)
-
-
- def filter_features(
-     self,
-     inplace=False,
-     mz=None,
-     rt=None,
-     coherence=None,
-     inty=None,
-     rt_delta=None,
-     iso=None,
-     iso_of=None,
-     has_MS2=None,
-     prominence_scaled=None,
-     height_scaled=None,
-     prominence=None,
-     height=None,
- ):
-     # remove all features with coherence < coherence
-     if self.features_df is None:
-         # self.logger.info("No features found. R")
-         return
-     feats = self.features_df.clone()
-     if coherence is not None:
-         has_coherence = "chrom_coherence" in self.features_df.columns
-         if not has_coherence:
-             self.logger.warning("No coherence data found in features.")
-         else:
-             # record len for logging
-             feats_len_before_filter = len(feats)
-             if isinstance(coherence, tuple) and len(coherence) == 2:
-                 min_coherence, max_coherence = coherence
-                 feats = feats.filter(
-                     (pl.col("chrom_coherence") >= min_coherence)
-                     & (pl.col("chrom_coherence") <= max_coherence),
-                 )
-             else:
-                 feats = feats.filter(pl.col("chrom_coherence") >= coherence)
-             self.logger.debug(
-                 f"Filtered features by coherence. Features removed: {feats_len_before_filter - len(feats)}",
-             )
-
-     if mz is not None:
-         feats_len_before_filter = len(feats)
-         if isinstance(mz, tuple) and len(mz) == 2:
-             min_mz, max_mz = mz
-             feats = feats.filter((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
-         else:
-             feats = feats.filter(pl.col("mz") >= mz)
-         self.logger.debug(
-             f"Filtered features by mz. Features removed: {feats_len_before_filter - len(feats)}",
-         )
-
-     if rt is not None:
-         feats_len_before_filter = len(feats)
-         if isinstance(rt, tuple) and len(rt) == 2:
-             min_rt, max_rt = rt
-             feats = feats.filter((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
-         else:
-             feats = feats.filter(pl.col("rt") >= rt)
-         self.logger.debug(
-             f"Filtered features by rt. Features removed: {feats_len_before_filter - len(feats)}",
-         )
-
-     if inty is not None:
-         feats_len_before_filter = len(feats)
-         if isinstance(inty, tuple) and len(inty) == 2:
-             min_inty, max_inty = inty
-             feats = feats.filter(
-                 (pl.col("inty") >= min_inty) & (pl.col("inty") <= max_inty),
-             )
-         else:
-             feats = feats.filter(pl.col("inty") >= inty)
-         self.logger.debug(
-             f"Filtered features by intensity. Features removed: {feats_len_before_filter - len(feats)}",
-         )
-
-     if rt_delta is not None:
-         feats_len_before_filter = len(feats)
-         if "rt_delta" not in feats.columns:
-             self.logger.warning("No rt_delta data found in features.")
-             return
-         if isinstance(rt_delta, tuple) and len(rt_delta) == 2:
-             min_rt_delta, max_rt_delta = rt_delta
-             feats = feats.filter(
-                 (pl.col("rt_delta") >= min_rt_delta)
-                 & (pl.col("rt_delta") <= max_rt_delta),
-             )
-         else:
-             feats = feats.filter(pl.col("rt_delta") >= rt_delta)
-         self.logger.debug(
-             f"Filtered features by rt_delta. Features removed: {feats_len_before_filter - len(feats)}",
-         )
-
-     if iso is not None:
-         feats_len_before_filter = len(feats)
-         if isinstance(iso, tuple) and len(iso) == 2:
-             min_iso, max_iso = iso
-             feats = feats.filter(
-                 (pl.col("iso") >= min_iso) & (pl.col("iso") <= max_iso),
-             )
-         else:
-             feats = feats.filter(pl.col("iso") == iso)
-         self.logger.debug(
-             f"Filtered features by iso. Features removed: {feats_len_before_filter - len(feats)}",
-         )
-
-     if iso_of is not None:
-         feats_len_before_filter = len(feats)
-         if isinstance(iso_of, tuple) and len(iso_of) == 2:
-             min_iso_of, max_iso_of = iso_of
-             feats = feats.filter(
-                 (pl.col("iso_of") >= min_iso_of) & (pl.col("iso_of") <= max_iso_of),
-             )
-         else:
-             feats = feats.filter(pl.col("iso_of") == iso_of)
-         self.logger.debug(
-             f"Filtered features by iso_of. Features removed: {feats_len_before_filter - len(feats)}",
-         )
-
-     if has_MS2 is not None:
-         feats_len_before_filter = len(feats)
-         if has_MS2:
-             feats = feats.filter(pl.col("ms2_scans").is_not_null())
-         else:
-             feats = feats.filter(pl.col("ms2_scans").is_null())
-         self.logger.debug(
-             f"Filtered features by MS2 presence. Features removed: {feats_len_before_filter - len(feats)}",
-         )
-
-     if prominence_scaled is not None:
-         feats_len_before_filter = len(feats)
-         if isinstance(prominence_scaled, tuple) and len(prominence_scaled) == 2:
-             min_prominence_scaled, max_prominence_scaled = prominence_scaled
-             feats = feats.filter(
-                 (pl.col("chrom_prominence_scaled") >= min_prominence_scaled)
-                 & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled),
-             )
-         else:
-             feats = feats.filter(pl.col("chrom_prominence_scaled") >= prominence_scaled)
-         self.logger.debug(
-             f"Filtered features by prominence_scaled. Features removed: {feats_len_before_filter - len(feats)}",
-         )
-
-     if height_scaled is not None:
-         feats_len_before_filter = len(feats)
-         if isinstance(height_scaled, tuple) and len(height_scaled) == 2:
-             min_height_scaled, max_height_scaled = height_scaled
-             feats = feats.filter(
-                 (pl.col("chrom_height_scaled") >= min_height_scaled)
-                 & (pl.col("chrom_height_scaled") <= max_height_scaled),
-             )
-         else:
-             feats = feats.filter(pl.col("chrom_height_scaled") >= height_scaled)
-         self.logger.debug(
-             f"Filtered features by height_scaled. Features removed: {feats_len_before_filter - len(feats)}",
-         )
-
-     if prominence is not None:
-         feats_len_before_filter = len(feats)
-         if isinstance(prominence, tuple) and len(prominence) == 2:
-             min_prominence, max_prominence = prominence
-             feats = feats.filter(
-                 (pl.col("chrom_prominence") >= min_prominence)
-                 & (pl.col("chrom_prominence") <= max_prominence),
-             )
-         else:
-             feats = feats.filter(pl.col("chrom_prominence") >= prominence)
-         self.logger.debug(
-             f"Filtered features by prominence. Features removed: {feats_len_before_filter - len(feats)}",
-         )
-
-     if height is not None:
-         feats_len_before_filter = len(feats)
-         # Check if chrom_height column exists, if not use chrom_height_scaled
-         height_col = (
-             "chrom_height" if "chrom_height" in feats.columns else "chrom_height_scaled"
-         )
-         if isinstance(height, tuple) and len(height) == 2:
-             min_height, max_height = height
-             feats = feats.filter(
-                 (pl.col(height_col) >= min_height) & (pl.col(height_col) <= max_height),
-             )
-         else:
-             feats = feats.filter(pl.col(height_col) >= height)
-         self.logger.debug(
-             f"Filtered features by {height_col}. Features removed: {feats_len_before_filter - len(feats)}",
-         )
-
-     self.logger.info(f"Filtered features. Features left: {len(feats)}")
-     if inplace:
-         self.features_df = feats
-     else:
-         return feats
-
-
- def _delete_ms2(self):
-     """
-     Unlinks MS2 spectra from features in the dataset.
-     This method removes the association between MS2 spectra and features in the features dataframe by setting
-     the 'ms2_scans' and 'ms2_specs' columns to None. It also updates the scans dataframe to remove the feature
-     id (feature_uid) association for the linked MS2 spectra.
-     Parameters:
-     Returns:
-         None
-     Side Effects:
-         Updates self.features_df by setting 'ms2_scans' and 'ms2_specs' columns to None. Also, updates self.scans_df
-         by resetting the 'feature_uid' column for linked MS2 spectra.
-     """
-     if self.features_df is None:
-         # self.logger.warning("No features found.")
-         return
-
-     self.logger.debug("Unlinking MS2 spectra from features...")
-
-     # Set ms2_scans and ms2_specs to None using Polars syntax
-     self.features_df = self.features_df.with_columns([
-         pl.lit(None).alias("ms2_scans"),
-         pl.lit(None).alias("ms2_specs"),
-     ])
-
-     # Update scans_df to remove feature_uid association for linked MS2 spectra
-     self.scans_df = self.scans_df.with_columns(
-         pl.when(pl.col("ms_level") == 2)
-         .then(None)
-         .otherwise(pl.col("feature_uid"))
-         .alias("feature_uid"),
-     )
-     self.logger.info("MS2 spectra unlinked from features.")
+ from __future__ import annotations
+
+ import polars as pl
+
+
+ # Parameters removed - using hardcoded defaults
+
+
+ def _estimate_memory_usage(self):
+     """
+     Estimate the memory usage of all dataframes in the Sample object.
+
+     Returns:
+         float: Total estimated memory usage of all dataframes in MB.
+     """
+     memory_usage = {}
+     total_bytes = 0
+
+     # Check features_df
+     if self.features_df is not None and len(self.features_df) > 0:
+         features_bytes = self.features_df.estimated_size()
+         memory_usage['features_df'] = {
+             'rows': len(self.features_df),
+             'columns': len(self.features_df.columns),
+             'bytes': features_bytes,
+             'mb': features_bytes / (1024 * 1024)
+         }
+         total_bytes += features_bytes
+     else:
+         memory_usage['features_df'] = {'rows': 0, 'columns': 0, 'bytes': 0, 'mb': 0}
+
+     # Check scans_df
+     if self.scans_df is not None and len(self.scans_df) > 0:
+         scans_bytes = self.scans_df.estimated_size()
+         memory_usage['scans_df'] = {
+             'rows': len(self.scans_df),
+             'columns': len(self.scans_df.columns),
+             'bytes': scans_bytes,
+             'mb': scans_bytes / (1024 * 1024)
+         }
+         total_bytes += scans_bytes
+     else:
+         memory_usage['scans_df'] = {'rows': 0, 'columns': 0, 'bytes': 0, 'mb': 0}
+
+     # Check ms1_df
+     if self.ms1_df is not None and len(self.ms1_df) > 0:
+         ms1_bytes = self.ms1_df.estimated_size()
+         memory_usage['ms1_df'] = {
+             'rows': len(self.ms1_df),
+             'columns': len(self.ms1_df.columns),
+             'bytes': ms1_bytes,
+             'mb': ms1_bytes / (1024 * 1024)
+         }
+         total_bytes += ms1_bytes
+     else:
+         memory_usage['ms1_df'] = {'rows': 0, 'columns': 0, 'bytes': 0, 'mb': 0}
+
+     # Check chrom_df
+     if self.chrom_df is not None and len(self.chrom_df) > 0:
+         chrom_bytes = self.chrom_df.estimated_size()
+         memory_usage['chrom_df'] = {
+             'rows': len(self.chrom_df),
+             'columns': len(self.chrom_df.columns),
+             'bytes': chrom_bytes,
+             'mb': chrom_bytes / (1024 * 1024)
+         }
+         total_bytes += chrom_bytes
+     else:
+         memory_usage['chrom_df'] = {'rows': 0, 'columns': 0, 'bytes': 0, 'mb': 0}
+
+     # Add total memory usage
+     memory_usage['total'] = {
+         'bytes': total_bytes,
+         'mb': total_bytes / (1024 * 1024),
+         'gb': total_bytes / (1024 * 1024 * 1024)
+     }
+
+     # Log the memory usage summary
+     if hasattr(self, 'logger'):
+         self.logger.debug(f"Total DataFrame memory usage: {memory_usage['total']['mb']:.2f} MB")
+         for df_name, stats in memory_usage.items():
+             if df_name != 'total' and stats['bytes'] > 0:
+                 self.logger.debug(f"{df_name}: {stats['rows']} rows, {stats['mb']:.2f} MB")
+
+     return memory_usage['total']['mb']
+
+
+ def get_dda_stats(self):
+     # filter self.scans_df to ms_level 1
+     ms1 = self.scans_df.filter(pl.col("ms_level") == 1)
+     return ms1
+
+
+ # TODO
+
+
+ def get_feature(self, feature_uid):
+     # get the feature with feature_uid == feature_uid
+     feature = self.features_df.filter(pl.col("feature_uid") == feature_uid)
+     if len(feature) == 0:
+         self.logger.warning(f"Feature {feature_uid} not found.")
+         return None
+     else:
+         return feature.row(0, named=True)
+
+
+ def _get_scan_uids(self, scans=None, verbose=True):
+     if scans is None:
+         # scans is None: return all scan_uids from scans_df
+         scans_uids = self.scans_df.get_column("scan_uid").to_list()
+     elif isinstance(scans, list):
+         # if scans is a list, ensure all elements are valid scan_uids
+         scans_uids = [s for s in scans if s in self.scans_df.get_column("scan_uid").to_list()]
+         if verbose and not scans_uids:
+             self.logger.error("No valid scan_uids provided.")
+
+     return scans_uids
+
+
+ def _get_feature_uids(self, features=None, verbose=True):
+     """
+     Get feature UIDs from various input types.
+
+     Parameters:
+         features: Can be one of the following:
+             - None: Returns all feature UIDs from self.features_df
+             - list: Returns the list if all elements are valid feature UIDs
+             - polars.DataFrame: Extracts unique values from 'feature_uid' or 'feature_id' column
+             - pandas.DataFrame: Extracts unique values from 'feature_uid' or 'feature_id' column
+         verbose (bool): Whether to log errors for invalid inputs
+
+     Returns:
+         list: List of feature UIDs
+     """
+     if features is None:
+         # Get all feature UIDs from self.features_df
+         if self.features_df is None:
+             if verbose:
+                 self.logger.warning("No features_df available.")
+             return []
+         feature_uids = self.features_df.get_column("feature_uid").to_list()
+     elif isinstance(features, list):
+         # If features is a list, ensure all elements are valid feature_uids
+         if self.features_df is None:
+             if verbose:
+                 self.logger.warning("No features_df available to validate feature UIDs.")
+             return []
+
+         valid_feature_uids = self.features_df.get_column("feature_uid").to_list()
+         feature_uids = [f for f in features if f in valid_feature_uids]
+         if verbose and not feature_uids:
+             self.logger.error("No valid feature_uids provided.")
+     else:
+         # Handle polars and pandas DataFrames
+         try:
+             # Check if it's a polars DataFrame
+             if hasattr(features, 'columns') and hasattr(features, 'get_column'):
+                 # Polars DataFrame
+                 feature_column = None
+                 if 'feature_uid' in features.columns:
+                     feature_column = 'feature_uid'
+                 elif 'feature_id' in features.columns:
+                     feature_column = 'feature_id'
+
+                 if feature_column is None:
+                     if verbose:
+                         self.logger.error("No 'feature_uid' or 'feature_id' column found in polars DataFrame.")
+                     return []
+
+                 # Get unique values from the column
+                 feature_uids = features.get_column(feature_column).unique().to_list()
+
+             # Check if it's a pandas DataFrame
+             elif hasattr(features, 'columns') and hasattr(features, 'iloc'):
+                 # Pandas DataFrame
+                 import pandas as pd
+                 if not isinstance(features, pd.DataFrame):
+                     if verbose:
+                         self.logger.error("Invalid input type. Expected None, list, polars DataFrame, or pandas DataFrame.")
+                     return []
+
+                 feature_column = None
+                 if 'feature_uid' in features.columns:
+                     feature_column = 'feature_uid'
+                 elif 'feature_id' in features.columns:
+                     feature_column = 'feature_id'
+
+                 if feature_column is None:
+                     if verbose:
+                         self.logger.error("No 'feature_uid' or 'feature_id' column found in pandas DataFrame.")
+                     return []
+
+                 # Get unique values from the column
+                 feature_uids = features[feature_column].unique().tolist()
+
+             else:
+                 if verbose:
+                     self.logger.error("Invalid input type. Expected None, list, polars DataFrame, or pandas DataFrame.")
+                 return []
+
+         except Exception as e:
+             if verbose:
+                 self.logger.error(f"Error processing DataFrame input: {e}")
+             return []
+
+     return feature_uids
+
+
+ def get_scan(self, scans: list | None = None, verbose=True):
+     scan_uids = self._get_scan_uids(scans, verbose=False)
+     if not scan_uids:
+         if verbose:
+             self.logger.warning("No valid scan_uids provided.")
+         return None
+
+     scan = self.scans_df.filter(pl.col("scan_uid").is_in(scan_uids))
+     return scan
+
+
+ def select_closest_scan(
+     self,
+     rt,
+     prec_mz=None,
+     mz_tol=0.01,
+ ):
+     """
+     Select the closest scan based on retention time (rt), applying additional filtering on precursor m/z (prec_mz) if provided.
+     Parameters:
+         rt (float): The target retention time to find the closest scan.
+         prec_mz (float, optional): The precursor m/z value used to filter scans. If given, only scans with ms_level 2 are considered
+             and filtered to include only those within mz_tol of prec_mz.
+         mz_tol (float, optional): The tolerance to apply when filtering scans by precursor m/z. Defaults to 0.01.
+     Returns:
+         polars.DataFrame or None: A DataFrame slice containing the closest scan if a matching scan is found;
+             otherwise, returns None.
+     Notes:
+         - If the scans_df attribute is None, the function logs a warning and returns None.
+         - When prec_mz is provided, it filters scans where ms_level equals 2 and the precursor m/z is within the given mz_tol range.
+         - If prec_mz is not provided, scans with ms_level equal to 1 are considered.
+         - The function calculates the absolute difference between each scan's rt and the given rt, sorting the scans by this difference.
+         - If no scans match the criteria, a warning is logged before returning None.
+     """
+     # check if scans_df is None
+     if self.scans_df is None:
+         self.logger.warning("No scans found.")
+         return None
+     if prec_mz is not None:
+         ms_level = 2
+         scans = self.scans_df.filter(pl.col("ms_level") == ms_level)
+         # find all scans with prec_mz within mz_tol of prec_mz
+         scans = scans.filter(pl.col("prec_mz") > prec_mz - mz_tol)
+         scans = scans.filter(pl.col("prec_mz") < prec_mz + mz_tol)
+         # sort by distance to rt
+         scans = scans.with_columns((pl.col("rt") - rt).abs().alias("rt_diff"))
+         scans = scans.sort("rt_diff")
+         # return the closest scan
+         if len(scans) > 0:
+             scan = scans.slice(0, 1)
+         else:
+             self.logger.warning(
+                 f"No scans found with prec_mz {prec_mz} within {mz_tol} of rt {rt}.",
+             )
+             return None
+     else:
+         mslevel = 1
+         scans = self.scans_df.filter(pl.col("ms_level") == mslevel)
+         # sort by distance to rt
+         scans = scans.with_columns((pl.col("rt") - rt).abs().alias("rt_diff"))
+         scans = scans.sort("rt_diff")
+         # return the closest scan
+         if len(scans) > 0:
+             scan = scans.slice(0, 1)
+         else:
+             self.logger.warning(
+                 f"No scans found with ms_level {mslevel} within {mz_tol} of rt {rt}.",
+             )
+             return None
+     # return scans_df slice
+
+     return scan
+
+
+ # TODO the variables here do not follow the rest (mz, rt being tuples, etc.)
+
+
+ def select(
+     self,
+     mz=None,
+     rt=None,
+     coherence=None,
+     inty=None,
+     rt_delta=None,
+     iso=None,
+     iso_of=None,
+     has_MS2=None,
+     prominence_scaled=None,
+     height_scaled=None,
+     prominence=None,
+     height=None,
+ ):
+     """
+     Select features based on specified criteria and return the filtered DataFrame.
+
+     Parameters:
+         mz: m/z range filter (tuple for range, single value for minimum)
+         rt: retention time range filter (tuple for range, single value for minimum)
+         coherence: chromatogram coherence filter (tuple for range, single value for minimum)
+         inty: intensity filter (tuple for range, single value for minimum)
+         rt_delta: retention time delta filter (tuple for range, single value for minimum)
+         iso: isotope number filter (tuple for range, single value for exact match)
+         iso_of: isotope parent filter (tuple for range, single value for exact match)
+         has_MS2: filter for features with/without MS2 spectra (bool)
+         prominence_scaled: scaled prominence filter (tuple for range, single value for minimum)
+         height_scaled: scaled height filter (tuple for range, single value for minimum)
+         prominence: prominence filter (tuple for range, single value for minimum)
+         height: height filter (tuple for range, single value for minimum)
+
+     Returns:
+         polars.DataFrame: Filtered features DataFrame
+     """
+     # remove all features with coherence < coherence
+     if self.features_df is None:
+         # self.logger.info("No features found. R")
+         return
+     feats = self.features_df.clone()
+     if coherence is not None:
+         has_coherence = "chrom_coherence" in self.features_df.columns
+         if not has_coherence:
+             self.logger.warning("No coherence data found in features.")
+         else:
+             # record len for logging
+             feats_len_before_filter = len(feats)
+             if isinstance(coherence, tuple) and len(coherence) == 2:
+                 min_coherence, max_coherence = coherence
+                 feats = feats.filter(
+                     (pl.col("chrom_coherence") >= min_coherence) & (pl.col("chrom_coherence") <= max_coherence),
+                 )
+             else:
+                 feats = feats.filter(pl.col("chrom_coherence") >= coherence)
+             self.logger.debug(
+                 f"Selected features by coherence. Features removed: {feats_len_before_filter - len(feats)}",
+             )
+
+     if mz is not None:
+         feats_len_before_filter = len(feats)
+         if isinstance(mz, tuple) and len(mz) == 2:
+             min_mz, max_mz = mz
+             feats = feats.filter((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
+         else:
+             feats = feats.filter(pl.col("mz") >= mz)
+         self.logger.debug(
+             f"Selected features by mz. Features removed: {feats_len_before_filter - len(feats)}",
+         )
+
+     if rt is not None:
+         feats_len_before_filter = len(feats)
+         if isinstance(rt, tuple) and len(rt) == 2:
+             min_rt, max_rt = rt
+             feats = feats.filter((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
+         else:
+             feats = feats.filter(pl.col("rt") >= rt)
+         self.logger.debug(
+             f"Selected features by rt. Features removed: {feats_len_before_filter - len(feats)}",
+         )
+
+     if inty is not None:
+         feats_len_before_filter = len(feats)
+         if isinstance(inty, tuple) and len(inty) == 2:
+             min_inty, max_inty = inty
+             feats = feats.filter(
+                 (pl.col("inty") >= min_inty) & (pl.col("inty") <= max_inty),
+             )
+         else:
+             feats = feats.filter(pl.col("inty") >= inty)
+         self.logger.debug(
+             f"Selected features by intensity. Features removed: {feats_len_before_filter - len(feats)}",
+         )
+
+     if rt_delta is not None:
+         feats_len_before_filter = len(feats)
+         if "rt_delta" not in feats.columns:
+             self.logger.warning("No rt_delta data found in features.")
+             return
+         if isinstance(rt_delta, tuple) and len(rt_delta) == 2:
+             min_rt_delta, max_rt_delta = rt_delta
+             feats = feats.filter(
+                 (pl.col("rt_delta") >= min_rt_delta) & (pl.col("rt_delta") <= max_rt_delta),
+             )
+         else:
+             feats = feats.filter(pl.col("rt_delta") >= rt_delta)
+         self.logger.debug(
+             f"Selected features by rt_delta. Features removed: {feats_len_before_filter - len(feats)}",
+         )
+
+     if iso is not None:
+         feats_len_before_filter = len(feats)
+         if isinstance(iso, tuple) and len(iso) == 2:
+             min_iso, max_iso = iso
+             feats = feats.filter(
+                 (pl.col("iso") >= min_iso) & (pl.col("iso") <= max_iso),
+             )
+         else:
+             feats = feats.filter(pl.col("iso") == iso)
+         self.logger.debug(
+             f"Selected features by iso. Features removed: {feats_len_before_filter - len(feats)}",
+         )
+
+     if iso_of is not None:
+         feats_len_before_filter = len(feats)
+         if isinstance(iso_of, tuple) and len(iso_of) == 2:
+             min_iso_of, max_iso_of = iso_of
+             feats = feats.filter(
+                 (pl.col("iso_of") >= min_iso_of) & (pl.col("iso_of") <= max_iso_of),
+             )
+         else:
+             feats = feats.filter(pl.col("iso_of") == iso_of)
+         self.logger.debug(
+             f"Selected features by iso_of. Features removed: {feats_len_before_filter - len(feats)}",
+         )
+
+     if has_MS2 is not None:
+         feats_len_before_filter = len(feats)
+         if has_MS2:
+             feats = feats.filter(pl.col("ms2_scans").is_not_null())
+         else:
+             feats = feats.filter(pl.col("ms2_scans").is_null())
+         self.logger.debug(
+             f"Selected features by MS2 presence. Features removed: {feats_len_before_filter - len(feats)}",
+         )
+
+     if prominence_scaled is not None:
+         feats_len_before_filter = len(feats)
+         if isinstance(prominence_scaled, tuple) and len(prominence_scaled) == 2:
+             min_prominence_scaled, max_prominence_scaled = prominence_scaled
+             feats = feats.filter(
+                 (pl.col("chrom_prominence_scaled") >= min_prominence_scaled)
+                 & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled),
+             )
+         else:
+             feats = feats.filter(pl.col("chrom_prominence_scaled") >= prominence_scaled)
+         self.logger.debug(
+             f"Selected features by prominence_scaled. Features removed: {feats_len_before_filter - len(feats)}",
+         )
+
+     if height_scaled is not None:
+         feats_len_before_filter = len(feats)
+         if isinstance(height_scaled, tuple) and len(height_scaled) == 2:
+             min_height_scaled, max_height_scaled = height_scaled
+             feats = feats.filter(
+                 (pl.col("chrom_height_scaled") >= min_height_scaled)
+                 & (pl.col("chrom_height_scaled") <= max_height_scaled),
+             )
+         else:
+             feats = feats.filter(pl.col("chrom_height_scaled") >= height_scaled)
+         self.logger.debug(
+             f"Selected features by height_scaled. Features removed: {feats_len_before_filter - len(feats)}",
+         )
+
+     if prominence is not None:
+         feats_len_before_filter = len(feats)
+         if isinstance(prominence, tuple) and len(prominence) == 2:
+             min_prominence, max_prominence = prominence
+             feats = feats.filter(
+                 (pl.col("chrom_prominence") >= min_prominence) & (pl.col("chrom_prominence") <= max_prominence),
+             )
+         else:
+             feats = feats.filter(pl.col("chrom_prominence") >= prominence)
+         self.logger.debug(
+             f"Selected features by prominence. Features removed: {feats_len_before_filter - len(feats)}",
+         )
+
+     if height is not None:
+         feats_len_before_filter = len(feats)
+         # Check if chrom_height column exists, if not use chrom_height_scaled
+         height_col = "chrom_height" if "chrom_height" in feats.columns else "chrom_height_scaled"
+         if isinstance(height, tuple) and len(height) == 2:
+             min_height, max_height = height
+             feats = feats.filter(
+                 (pl.col(height_col) >= min_height) & (pl.col(height_col) <= max_height),
+             )
+         else:
+             feats = feats.filter(pl.col(height_col) >= height)
+         self.logger.debug(
+             f"Selected features by {height_col}. Features removed: {feats_len_before_filter - len(feats)}",
+         )
+     if len(feats) == 0:
+         self.logger.warning("No features remaining after applying selection criteria.")
+     else:
+         self.logger.info(f"Selected features. Features remaining: {len(feats)}")
+     return feats
+
+
+ def _features_sync(self):
+     """
+     Synchronizes the OpenMS FeatureMap and features_df by removing features that exist in one
+     but not the other, using feature_id for mapping between them.
+
+     This function ensures that:
+     - Features in the FeatureMap that don't have corresponding entries in features_df are removed
+     - Features in features_df that don't have corresponding entries in the FeatureMap are removed
+
+     Returns:
+         None
+
+     Side Effects:
+         Updates self.features (OpenMS FeatureMap) by creating a new FeatureMap with synchronized features
+         Updates self.features_df by filtering to only include features present in the FeatureMap
+
+     Note:
+         Uses feature_id as the mapping key. feature_id contains OpenMS unique IDs that correspond
+         to the unique IDs of features in the FeatureMap.
+     """
+     if self.features_df is None or self.features is None:
+         self.logger.warning("Cannot sync: features_df or FeatureMap is None.")
+         return
+
+     try:
+         # Import pyopenms
+         import pyopenms as oms
+
+         # Get feature_ids from features_df
+         df_feature_ids = set(self.features_df.get_column("feature_id").to_list())
+
+         # Get feature unique IDs from FeatureMap
+         feature_map_ids = set()
+         for i in range(self.features.size()):
+             feature = self.features[i]
+             unique_id = str(feature.getUniqueId())  # Convert to string to match DataFrame
+             feature_map_ids.add(unique_id)
+
+         # Find features that exist in both
+         common_feature_ids = df_feature_ids & feature_map_ids
+
+         # Safety check: log error and exit if no features are matching
+         if not common_feature_ids:
+             self.logger.error(
+                 f"No matching features found between FeatureMap and features_df. "
+                 f"FeatureMap has {len(feature_map_ids)} features, "
+                 f"features_df has {len(df_feature_ids)} features. "
+                 f"Cannot synchronize - this indicates a data inconsistency. Exiting without changes."
+             )
+             return
+
+         # Create new synchronized FeatureMap with only common features
+         synced_feature_map = oms.FeatureMap()
+         for i in range(self.features.size()):
+             feature = self.features[i]
+             unique_id = str(feature.getUniqueId())
+             if unique_id in common_feature_ids:
+                 synced_feature_map.push_back(feature)
+
+         # Filter features_df to only include features that exist in FeatureMap
+         synced_features_df = self.features_df.filter(
+             pl.col("feature_id").is_in(list(common_feature_ids))
+         )
+
+         # Update the objects
+         original_map_size = self.features.size()
+         original_df_size = len(self.features_df)
+
+         self.features = synced_feature_map
+         self.features_df = synced_features_df
+
+         # Log the synchronization results
+         map_removed = original_map_size - self.features.size()
+         df_removed = original_df_size - len(self.features_df)
+
+         # log at info level only if features were removed
+         if map_removed > 0 or df_removed > 0:
+             self.logger.info(
+                 f"Features synchronized. FeatureMap: {original_map_size} -> {self.features.size()} "
+                 f"({map_removed} removed), DataFrame: {original_df_size} -> {len(self.features_df)} "
+                 f"({df_removed} removed)"
+             )
+         else:
+             self.logger.debug(
+                 f"Features synchronized. FeatureMap: {original_map_size} -> {self.features.size()} "
+                 f"({map_removed} removed), DataFrame: {original_df_size} -> {len(self.features_df)} "
+                 f"({df_removed} removed)"
+             )
+
+     except ImportError:
+         self.logger.warning("PyOpenMS not available, cannot sync FeatureMap")
+     except Exception as e:
+         self.logger.error(f"Error during feature synchronization: {e}")
+
+
+ def features_delete(self, features: list | None = None):
+     """
+     Delete features from both self.features_df and self.features based on a list of feature UIDs.
+
+     Parameters:
+         features (list, optional): List of feature UIDs to delete. If None, all features will be deleted.
+
+     Returns:
+         None
+
+     Side Effects:
+         Updates self.features_df by removing specified features.
+         Updates self.features (OpenMS FeatureMap) by creating a new FeatureMap with only the remaining features.
+         Updates self.scans_df by removing feature_uid associations for deleted features.
+
+     Note:
+         The function preserves all OpenMS FeatureMap information by creating a new FeatureMap
+         containing only the features that should remain after deletion.
+     """
+     if self.features_df is None:
+         self.logger.warning("No features found.")
+         return
+
+     # Get the feature UIDs to delete
+     feature_uids_to_delete = self._get_feature_uids(features=features, verbose=True)
+
+     if not feature_uids_to_delete:
+         self.logger.warning("No valid feature UIDs provided for deletion.")
+         return
+
+     original_count = len(self.features_df)
+
+     # Update features_df by filtering out the features to delete
+     self.features_df = self.features_df.filter(
+         ~pl.col("feature_uid").is_in(feature_uids_to_delete)
+     )
+
+     # Update the OpenMS FeatureMap by creating a new one with only features to keep
+     if self.features is not None:
+         try:
+             # Import pyopenms
+             import pyopenms as oms
+
+             # Create new FeatureMap with only features to keep
+             filtered_map = oms.FeatureMap()
+
+             # Get the feature UIDs that should remain after deletion
+             remaining_feature_uids = self.features_df.get_column("feature_uid").to_list()
+
+             # Iterate through existing features and keep only those not in deletion list
+             for i in range(self.features.size()):
+                 feature = self.features[i]
+                 # Since feature UIDs in DataFrame are sequential (0, 1, 2, ...) and correspond to indices
+                 # we can check if the current index is in the remaining UIDs
+                 if i in remaining_feature_uids:
+                     filtered_map.push_back(feature)
+
+             # Replace the original FeatureMap with the filtered one
+             self.features = filtered_map
+             self.logger.debug(f"OpenMS FeatureMap updated with {filtered_map.size()} remaining features.")
+
+         except ImportError:
+             self.logger.warning("PyOpenMS not available, only updating features_df")
+         except Exception as e:
+             self.logger.warning(f"Could not update OpenMS FeatureMap: {e}. FeatureMap may be out of sync.")
+
+     # Update scans_df to remove feature_uid associations for deleted features
+     if hasattr(self, 'scans_df') and self.scans_df is not None:
+         self.scans_df = self.scans_df.with_columns(
+             pl.when(pl.col("feature_uid").is_in(feature_uids_to_delete))
+             .then(None)
+             .otherwise(pl.col("feature_uid"))
+             .alias("feature_uid")
+         )
+
+     deleted_count = original_count - len(self.features_df)
+     self.logger.info(f"Deleted {deleted_count} features. Remaining features: {len(self.features_df)}")
+
+
+ def _delete_ms2(self):
+     """
+     Unlinks MS2 spectra from features in the dataset.
+     This method removes the association between MS2 spectra and features in the features dataframe by setting
+     the 'ms2_scans' and 'ms2_specs' columns to None. It also updates the scans dataframe to remove the feature
+     id (feature_uid) association for the linked MS2 spectra.
+     Returns:
+         None
+     Side Effects:
+         Updates self.features_df by setting 'ms2_scans' and 'ms2_specs' columns to None. Also, updates self.scans_df
+         by resetting the 'feature_uid' column for linked MS2 spectra.
+     """
+     if self.features_df is None:
+         # self.logger.warning("No features found.")
+         return
+
+     self.logger.debug("Unlinking MS2 spectra from features...")
+
+     # Set ms2_scans and ms2_specs to None using Polars syntax
+     self.features_df = self.features_df.with_columns([
+         pl.lit(None).alias("ms2_scans"),
+         pl.lit(None).alias("ms2_specs"),
+     ])
+
+     # Update scans_df to remove feature_uid association for linked MS2 spectra
+     self.scans_df = self.scans_df.with_columns(
+         pl.when(pl.col("ms_level") == 2).then(None).otherwise(pl.col("feature_uid")).alias("feature_uid"),
+     )
+     self.logger.info("MS2 spectra unlinked from features.")
+
+
+ def features_filter(self, features):
+     """
+     Keep only the specified features and delete all others. This is the opposite of features_delete().
+
+     Parameters:
+         features: Can be one of the following:
+             - list: List of feature UIDs to keep
+             - polars.DataFrame: DataFrame with 'feature_uid' or 'feature_id' column - extracts unique values to keep
+             - pandas.DataFrame: DataFrame with 'feature_uid' or 'feature_id' column - extracts unique values to keep
+
+     Returns:
+         None
+
+     Side Effects:
+         Updates self.features_df by keeping only the specified features.
+         Updates self.features (OpenMS FeatureMap) by creating a new FeatureMap with only the specified features.
+         Updates self.scans_df by removing feature_uid associations for deleted features.
+
+     Note:
+         The function preserves all OpenMS FeatureMap information by creating a new FeatureMap
+         containing only the features that should be kept.
+     """
+     if self.features_df is None:
+         self.logger.warning("No features found.")
+         return
+
+     if features is None:
+         self.logger.warning("No features specified to keep. Use features_delete() to delete all features.")
+         return
+
+     # Get the feature UIDs to keep
+     feature_uids_to_keep = self._get_feature_uids(features=features, verbose=True)
+
+     if not feature_uids_to_keep:
+         self.logger.warning("No valid feature UIDs provided to keep.")
+         return
+
+     original_count = len(self.features_df)
+
+     # Update features_df by keeping only the specified features
+     self.features_df = self.features_df.filter(
+         pl.col("feature_uid").is_in(feature_uids_to_keep)
+     )
+
+     # Calculate which features were deleted (all except the ones to keep)
+     all_feature_uids = set(range(original_count))  # Assuming sequential UIDs
+     feature_uids_to_delete = list(all_feature_uids - set(feature_uids_to_keep))
+
+     # Update the OpenMS FeatureMap by creating a new one with only features to keep
+     if self.features is not None:
+         try:
+             # Import pyopenms
+             import pyopenms as oms
+
+             # Create new FeatureMap with only features to keep
+             filtered_map = oms.FeatureMap()
+
+             # Iterate through existing features and keep only those in the keep list
+             for i in range(self.features.size()):
+                 feature = self.features[i]
+                 # Since feature UIDs in DataFrame are sequential (0, 1, 2, ...) and correspond to indices
+                 # we can check if the current index is in the keep UIDs
+                 if i in feature_uids_to_keep:
+                     filtered_map.push_back(feature)
+
+             # Replace the original FeatureMap with the filtered one
+             self.features = filtered_map
+             self.logger.debug(f"OpenMS FeatureMap updated with {filtered_map.size()} remaining features.")
+
+         except ImportError:
+             self.logger.warning("PyOpenMS not available, only updating features_df")
+         except Exception as e:
+             self.logger.warning(f"Could not update OpenMS FeatureMap: {e}. FeatureMap may be out of sync.")
+
+     # Update scans_df to remove feature_uid associations for deleted features
+     if hasattr(self, 'scans_df') and self.scans_df is not None and feature_uids_to_delete:
+         self.scans_df = self.scans_df.with_columns(
+             pl.when(pl.col("feature_uid").is_in(feature_uids_to_delete))
+             .then(None)
+             .otherwise(pl.col("feature_uid"))
+             .alias("feature_uid")
+         )
+
+     kept_count = len(self.features_df)
+     deleted_count = original_count - kept_count
+     self.logger.info(f"Kept {kept_count} features, deleted {deleted_count} features. Remaining features: {kept_count}")
+
+
+ def set_source(self, filename):
+     """
+     Reassign file_source. If filename is a directory path, keep the current basename
+     and build an absolute path in that directory. Check that the new file exists
+     before overwriting the old file_source.
+
+     Parameters:
+         filename (str): New file path or directory path
+
+     Returns:
+         None
+     """
+     import os
+
+     # Store the old file_source for logging
+     old_file_source = getattr(self, 'file_source', None)
+
+     # Check if filename is just a directory path
+     if os.path.isdir(filename):
+         if old_file_source is None:
+             self.logger.error("Cannot build path: no current file_source available")
+             return
+
+         # Get the basename from current file_source
+         current_basename = os.path.basename(old_file_source)
+         # Build new absolute path
+         new_file_path = os.path.join(filename, current_basename)
+     else:
+         # filename is a full path, make it absolute
+         new_file_path = os.path.abspath(filename)
+
+     # Check if the new file exists
+     if not os.path.exists(new_file_path):
+         self.logger.error(f"File does not exist: {new_file_path}")
+         return
+
+     # Update file_source
+     self.file_source = new_file_path
+
+     # Log the change
+     if old_file_source is not None:
+         self.logger.info(f"Updated file_source from {old_file_source} to {self.file_source}")
+     else:
+         self.logger.info(f"Set file_source to {self.file_source}")