masster-0.2.4-py3-none-any.whl → masster-0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (55)
  1. masster/__init__.py +27 -27
  2. masster/_version.py +17 -17
  3. masster/chromatogram.py +497 -503
  4. masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.featureXML +199787 -0
  5. masster/data/examples/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.sample5 +0 -0
  6. masster/logger.py +318 -244
  7. masster/sample/__init__.py +9 -9
  8. masster/sample/defaults/__init__.py +15 -15
  9. masster/sample/defaults/find_adducts_def.py +325 -325
  10. masster/sample/defaults/find_features_def.py +366 -366
  11. masster/sample/defaults/find_ms2_def.py +285 -285
  12. masster/sample/defaults/get_spectrum_def.py +314 -318
  13. masster/sample/defaults/sample_def.py +374 -378
  14. masster/sample/h5.py +1321 -1297
  15. masster/sample/helpers.py +833 -364
  16. masster/sample/lib.py +762 -0
  17. masster/sample/load.py +1220 -1187
  18. masster/sample/parameters.py +131 -131
  19. masster/sample/plot.py +1610 -1622
  20. masster/sample/processing.py +1402 -1416
  21. masster/sample/quant.py +209 -0
  22. masster/sample/sample.py +391 -387
  23. masster/sample/sample5_schema.json +181 -181
  24. masster/sample/save.py +737 -719
  25. masster/sample/sciex.py +1213 -0
  26. masster/spectrum.py +1287 -1319
  27. masster/study/__init__.py +9 -9
  28. masster/study/defaults/__init__.py +21 -19
  29. masster/study/defaults/align_def.py +267 -267
  30. masster/study/defaults/export_def.py +41 -40
  31. masster/study/defaults/fill_chrom_def.py +264 -264
  32. masster/study/defaults/fill_def.py +260 -0
  33. masster/study/defaults/find_consensus_def.py +256 -256
  34. masster/study/defaults/find_ms2_def.py +163 -163
  35. masster/study/defaults/integrate_chrom_def.py +225 -225
  36. masster/study/defaults/integrate_def.py +221 -0
  37. masster/study/defaults/merge_def.py +256 -0
  38. masster/study/defaults/study_def.py +272 -269
  39. masster/study/export.py +674 -287
  40. masster/study/h5.py +1398 -886
  41. masster/study/helpers.py +1650 -433
  42. masster/study/helpers_optimized.py +317 -0
  43. masster/study/load.py +1201 -1078
  44. masster/study/parameters.py +99 -99
  45. masster/study/plot.py +632 -645
  46. masster/study/processing.py +1057 -1046
  47. masster/study/save.py +149 -134
  48. masster/study/study.py +606 -522
  49. masster/study/study5_schema.json +247 -241
  50. {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/METADATA +15 -10
  51. masster-0.3.0.dist-info/RECORD +59 -0
  52. {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/licenses/LICENSE +661 -661
  53. masster-0.2.4.dist-info/RECORD +0 -50
  54. {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/WHEEL +0 -0
  55. {masster-0.2.4.dist-info → masster-0.3.0.dist-info}/entry_points.txt +0 -0
masster/study/helpers.py CHANGED
@@ -1,433 +1,1650 @@
- from __future__ import annotations
-
- import os
-
- import numpy as np
- import pandas as pd
- import polars as pl
-
- # Remove StudyParameters import as we'll use hardcoded values for seed
-
-
- def get_chrom(self, uids=None, samples=None):
-     # Check if consensus_df is empty or doesn't have required columns
-     if self.consensus_df.is_empty() or "consensus_uid" not in self.consensus_df.columns:
-         self.logger.error("No consensus data found. Please run find_consensus() first.")
-         return None
-
-     ids = self._get_consensus_uids(uids)
-     sample_uids = self._get_sample_uids(samples)
-
-     if self.consensus_map is None:
-         self.logger.error("No consensus map found.")
-         return None
-
-     # Pre-filter all DataFrames to reduce join sizes
-     filtered_consensus_mapping = self.consensus_mapping_df.filter(
-         pl.col("consensus_uid").is_in(ids),
-     )
-
-     # Get feature_uids that we actually need
-     relevant_feature_uids = filtered_consensus_mapping["feature_uid"].to_list()
-
-     self.logger.debug(
-         f"Filtering features_df for {len(relevant_feature_uids)} relevant feature_uids.",
-     )
-     # Pre-filter features_df to only relevant features and samples
-     filtered_features = self.features_df.filter(
-         pl.col("feature_uid").is_in(relevant_feature_uids)
-         & pl.col("sample_uid").is_in(sample_uids),
-     ).select([
-         "feature_uid",
-         "chrom",
-         "rt",
-         "rt_original",
-         "sample_uid",
-     ])
-
-     # Pre-filter samples_df
-     filtered_samples = self.samples_df.filter(
-         pl.col("sample_uid").is_in(sample_uids),
-     ).select(["sample_uid", "sample_name"])
-
-     # Perform a three-way join to get all needed data
-     self.logger.debug("Joining DataFrames to get complete chromatogram data.")
-     df_combined = (
-         filtered_consensus_mapping.join(
-             filtered_features,
-             on="feature_uid",
-             how="inner",
-         )
-         .join(filtered_samples, on="sample_uid", how="inner")
-         .with_columns(
-             (pl.col("rt") - pl.col("rt_original")).alias("rt_shift"),
-         )
-     )
-
-     # Update chrom objects with rt_shift efficiently
-     self.logger.debug("Updating chromatogram objects with rt_shift values.")
-     chrom_data = df_combined.select(["chrom", "rt_shift"]).to_dict(as_series=False)
-     for chrom_obj, rt_shift in zip(chrom_data["chrom"], chrom_data["rt_shift"]):
-         if chrom_obj is not None:
-             chrom_obj.rt_shift = rt_shift
-
-     # Get all unique combinations for complete matrix
-     all_consensus_uids = sorted(df_combined["consensus_uid"].unique().to_list())
-     all_sample_names = sorted(df_combined["sample_name"].unique().to_list())
-
-     # Create a mapping dictionary for O(1) lookup instead of O(n) filtering
-     self.logger.debug("Creating lookup dictionary for chromatogram objects.")
-     chrom_lookup = {}
-     for row in df_combined.select([
-         "consensus_uid",
-         "sample_name",
-         "chrom",
-     ]).iter_rows():
-         key = (row[0], row[1])  # (consensus_uid, sample_name)
-         chrom_lookup[key] = row[2]  # chrom object
-
-     # Build pivot data efficiently using the lookup dictionary
-     pivot_data = []
-     total_iterations = len(all_consensus_uids)
-     progress_interval = max(1, total_iterations // 10)  # Show progress every 10%
-
-     for i, consensus_uid in enumerate(all_consensus_uids):
-         if i % progress_interval == 0:
-             progress_percent = (i / total_iterations) * 100
-             self.logger.debug(
-                 f"Building pivot data: {progress_percent:.0f}% complete ({i}/{total_iterations})",
-             )
-
-         row_data = {"consensus_uid": consensus_uid}
-         for sample_name in all_sample_names:
-             key = (consensus_uid, sample_name)
-             row_data[sample_name] = chrom_lookup.get(key, None)
-         pivot_data.append(row_data)
-
-     self.logger.debug(
-         f"Building pivot data: 100% complete ({total_iterations}/{total_iterations})",
-     )
-
-     # Create Polars DataFrame with complex objects
-     df2_pivoted = pl.DataFrame(pivot_data)
-
-     # Return as Polars DataFrame (can handle complex objects like Chromatogram)
-     return df2_pivoted
-
- def set_default_folder(self, folder):
-     """
-     Set the default folder for saving and loading files.
-     """
-     if not os.path.exists(folder):
-         os.makedirs(folder)
-     self.default_folder = folder
-
-
- def align_reset(self):
-     if self.alignment_ref_index is None:
-         return
-     self.logger.debug("Resetting alignment.")
-     # iterate over all feature maps and set RT to original RT
-     for feature_map in self.features_maps:
-         for feature in feature_map:
-             rt = feature.getMetaValue("original_RT")
-             if rt is not None:
-                 feature.setRT(rt)
-                 feature.removeMetaValue("original_RT")
-     self.alignment_ref_index = None
-
-
- # TODO I don't get this param
- def get_consensus(self, quant="chrom_area"):
-     if self.consensus_df is None:
-         self.logger.error("No consensus map found.")
-         return None
-
-     # Convert Polars DataFrame to pandas for this operation since the result is used for export
-     df1 = self.consensus_df.to_pandas().copy()
-
-     # set consensus_id as uint64
-     df1["consensus_id"] = df1["consensus_id"].astype("uint64")
-     # set consensus_id as index
-     df1.set_index("consensus_uid", inplace=True)
-     # sort by consensus_id
-     df1 = df1.sort_index()
-
-     df2 = self.get_consensus_matrix(quant=quant)
-     # sort df2 row by consensus_id
-     df2 = df2.sort_index()
-     # merge df and df2 on consensus_id
-     df = pd.merge(df1, df2, left_index=True, right_index=True, how="left")
-
-     return df
-
-
- # TODO I don't get this param
- def get_consensus_matrix(self, quant="chrom_area"):
-     """
-     Get a matrix of consensus features with samples as columns and consensus features as rows.
-     """
-     if quant not in self.features_df.columns:
-         self.logger.error(
-             f"Quantification method {quant} not found in features_df.",
-         )
-         return None
-
-     # Use Polars join instead of pandas merge
-     features_subset = self.features_df.select(["feature_uid", "sample_uid", quant])
-     consensus_mapping_subset = self.consensus_mapping_df.select([
-         "consensus_uid",
-         "feature_uid",
-     ])
-
-     df1 = features_subset.join(
-         consensus_mapping_subset,
-         on="feature_uid",
-         how="left",
-     )
-
-     # Convert to pandas for pivot operation (Polars pivot is still evolving)
-     df1_pd = df1.to_pandas()
-     df2 = df1_pd.pivot_table(
-         index="consensus_uid",
-         columns="sample_uid",
-         values=quant,
-         aggfunc="max",
-     )
-
-     # Create sample_uid to sample_name mapping using Polars
-     sample_mapping = dict(
-         self.samples_df.select(["sample_uid", "sample_name"]).iter_rows(),
-     )
-     # replace sample_uid with sample_name in df2
-     df2 = df2.rename(columns=sample_mapping)
-
-     # round to integer
-     df2 = df2.round()
-     # set consensus_id as uint64
-     df2.index = df2.index.astype("uint64")
-     # set index to consensus_id
-     df2.index.name = "consensus_uid"
-     return df2
-
-
- def get_gaps_matrix(self, uids=None):
-     """
-     Get a matrix of gaps between consensus features with samples as columns and consensus features as rows.
-     """
-     if self.consensus_df is None:
-         self.logger.error("No consensus map found.")
-         return None
-     uids = self._get_consensus_uids(uids)
-
-     df1 = self.get_consensus_matrix(quant="filled")
-     if df1 is None or df1.empty:
-         self.logger.warning("No gap data found.")
-         return None
-     # keep only rows where consensus_id is in ids - use pandas indexing since df1 is already pandas
-     df1 = df1[df1.index.isin(uids)]
-     return df1
-
-
- def get_gaps_stats(self, uids=None):
-     """
-     Get statistics about gaps in the consensus features.
-     """
-
-     df = self.get_gaps_matrix(uids=uids)
-
-     # For each column, count how many times the value is True, False, or None. Summarize in a new df with three rows: True, False, None.
-     if df is None or df.empty:
-         self.logger.warning("No gap data found.")
-         return None
-     gaps_stats = pd.DataFrame(
-         {
-             "aligned": df.apply(lambda x: (~x.astype(bool)).sum()),
-             "filled": df.apply(lambda x: x.astype(bool).sum() - pd.isnull(x).sum()),
-             "missing": df.apply(lambda x: pd.isnull(x).sum()),
-         },
-     )
-     return gaps_stats
-
-
- # TODO is uid not supposed to be a list anymore?
- def get_consensus_matches(self, uids=None):
-     uids = self._get_consensus_uids(uids)
-
-     # find all rows in consensus_mapping_df with consensus_id=id - use Polars filtering
-     fid = (
-         self.consensus_mapping_df.filter(
-             pl.col("consensus_uid").is_in(uids),
-         )
-         .select("feature_uid")
-         .to_series()
-         .to_list()
-     )
-     # select all rows in features_df with uid in fid
-     matches = self.features_df.filter(pl.col("feature_uid").is_in(fid)).clone()
-     return matches
-
-
- def fill_reset(self):
-     # remove all features with filled=True
-     if self.features_df is None:
-         self.logger.warning("No features found.")
-         return
-     l1 = len(self.features_df)
-     self.features_df = self.features_df.filter(~pl.col("filled"))
-     # remove all rows in consensus_mapping_df where feature_uid is not in features_df['uid']
-
-     feature_uids_to_keep = self.features_df["feature_uid"].to_list()
-     self.consensus_mapping_df = self.consensus_mapping_df.filter(
-         pl.col("feature_uid").is_in(feature_uids_to_keep),
-     )
-     self.logger.info(
-         f"Reset filled chromatograms. Chroms removed: {l1 - len(self.features_df)}",
-     )
-
-
- def _get_feature_uids(self, uids=None, seed=42):
-     """
-     Helper function to get feature_uids from features_df based on input uids.
-     If uids is None, returns all feature_uids.
-     If uids is a single integer, returns a random sample of feature_uids.
-     If uids is a list of strings, returns feature_uids corresponding to those feature_uids.
-     If uids is a list of integers, returns feature_uids corresponding to those feature_uids.
-     """
-     if uids is None:
-         # get all feature_uids from features_df
-         return self.features_df["feature_uid"].to_list()
-     elif isinstance(uids, int):
-         # choose a random sample of feature_uids
-         if len(self.features_df) > uids:
-             np.random.seed(seed)
-             return np.random.choice(
-                 self.features_df["feature_uid"].to_list(),
-                 uids,
-                 replace=False,
-             ).tolist()
-         else:
-             return self.features_df["feature_uid"].to_list()
-     else:
-         # iterate over all uids. If the item is a string, assume it's a feature_uid
-         feature_uids = []
-         for uid in uids:
-             if isinstance(uid, str):
-                 matching_rows = self.features_df.filter(pl.col("feature_uid") == uid)
-                 if not matching_rows.is_empty():
-                     feature_uids.append(
-                         matching_rows.row(0, named=True)["feature_uid"],
-                     )
-             elif isinstance(uid, int):
-                 if uid in self.features_df["feature_uid"].to_list():
-                     feature_uids.append(uid)
-         # remove duplicates
-         feature_uids = list(set(feature_uids))
-         return feature_uids
-
-
- def _get_consensus_uids(self, uids=None, seed=42):
-     """
-     Helper function to get consensus_uids from consensus_df based on input uids.
-     If uids is None, returns all consensus_uids.
-     If uids is a single integer, returns a random sample of consensus_uids.
-     If uids is a list of strings, returns consensus_uids corresponding to those consensus_ids.
-     If uids is a list of integers, returns consensus_uids corresponding to those consensus_uids.
-     """
-     # Check if consensus_df is empty or doesn't have required columns
-     if self.consensus_df.is_empty() or "consensus_uid" not in self.consensus_df.columns:
-         return []
-
-     if uids is None:
-         # get all consensus_uids from consensus_df
-         return self.consensus_df["consensus_uid"].to_list()
-     elif isinstance(uids, int):
-         # choose a random sample of consensus_uids
-         if len(self.consensus_df) > uids:
-             np.random.seed(seed)  # for reproducibility
-             return np.random.choice(
-                 self.consensus_df["consensus_uid"].to_list(),
-                 uids,
-                 replace=False,
-             ).tolist()
-         else:
-             return self.consensus_df["consensus_uid"].to_list()
-     else:
-         # iterate over all uids. If the item is a string, assume it's a consensus_id
-         consensus_uids = []
-         for uid in uids:
-             if isinstance(uid, str):
-                 matching_rows = self.consensus_df.filter(pl.col("consensus_id") == uid)
-                 if not matching_rows.is_empty():
-                     consensus_uids.append(
-                         matching_rows.row(0, named=True)["consensus_uid"],
-                     )
-             elif isinstance(uid, int):
-                 if uid in self.consensus_df["consensus_uid"].to_list():
-                     consensus_uids.append(uid)
-         # remove duplicates
-         consensus_uids = list(set(consensus_uids))
-         return consensus_uids
-
-
- def _get_sample_uids(self, samples=None, seed=42):
-     """
-     Helper function to get sample_uids from samples_df based on input samples.
-     If samples is None, returns all sample_uids.
-     If samples is a single integer, returns a random sample of sample_uids.
-     If samples is a list of strings, returns sample_uids corresponding to those sample_names.
-     If samples is a list of integers, returns sample_uids corresponding to those sample_uids.
-     """
-     if samples is None:
-         # get all sample_uids from samples_df
-         return self.samples_df["sample_uid"].to_list()
-     elif isinstance(samples, int):
-         # choose a random sample of sample_uids
-         if len(self.samples_df) > samples:
-             np.random.seed(seed)  # for reproducibility
-             return np.random.choice(
-                 self.samples_df["sample_uid"].to_list(),
-                 samples,
-                 replace=False,
-             ).tolist()
-         else:
-             return self.samples_df["sample_uid"].to_list()
-     else:
-         # iterate over all samples. If the item is a string, assume it's a sample_name
-         sample_uids = []
-         for sample in samples:
-             if isinstance(sample, str):
-                 matching_rows = self.samples_df.filter(pl.col("sample_name") == sample)
-                 if not matching_rows.is_empty():
-                     sample_uids.append(
-                         matching_rows.row(0, named=True)["sample_uid"],
-                     )
-             elif isinstance(sample, int):
-                 if sample in self.samples_df["sample_uid"].to_list():
-                     sample_uids.append(sample)
-         # remove duplicates
-         sample_uids = list(set(sample_uids))
-         return sample_uids
-
- def get_orphans(self):
-     """
-     Get all features that are not in the consensus mapping.
-     """
-     not_in_consensus = self.features_df.filter(~self.features_df['feature_uid'].is_in(self.consensus_mapping_df['feature_uid'].to_list()))
-     return not_in_consensus
-
- def compress(self):
-     """
-     Compress the study data.
-     """
-     self.logger.info("Compressing study data...")
-     # self.features_maps = []
-     # drop all features that are not in consensus_mapping_df
-     if self.features_df is not None and not self.features_df.is_empty():
-         l1 = len(self.features_df)
-         self.features_df = self.features_df.filter(
-             pl.col("feature_uid").is_in(
-                 self.consensus_mapping_df["feature_uid"].to_list(),
-             ),
-         )
-         self.logger.info(f"Removed {l1 - len(self.features_df)} features.")
+ from __future__ import annotations
+
+ import os
+
+ import numpy as np
+ import pandas as pd
+ import polars as pl
+
+ from tqdm import tqdm
+
+
+ def get_chrom(self, uids=None, samples=None):
+     # Check if consensus_df is empty or doesn't have required columns
+     if self.consensus_df.is_empty() or "consensus_uid" not in self.consensus_df.columns:
+         self.logger.error("No consensus data found. Please run merge() first.")
+         return None
+
+     ids = self._get_consensus_uids(uids)
+     sample_uids = self._get_sample_uids(samples)
+
+     if self.consensus_map is None:
+         self.logger.error("No consensus map found.")
+         return None
+
+     # Pre-filter all DataFrames to reduce join sizes
+     filtered_consensus_mapping = self.consensus_mapping_df.filter(
+         pl.col("consensus_uid").is_in(ids),
+     )
+
+     # Get feature_uids that we actually need
+     relevant_feature_uids = filtered_consensus_mapping["feature_uid"].to_list()
+
+     self.logger.debug(
+         f"Filtering features_df for {len(relevant_feature_uids)} relevant feature_uids.",
+     )
+     # Pre-filter features_df to only relevant features and samples
+     filtered_features = self.features_df.filter(
+         pl.col("feature_uid").is_in(relevant_feature_uids) & pl.col("sample_uid").is_in(sample_uids),
+     ).select([
+         "feature_uid",
+         "chrom",
+         "rt",
+         "rt_original",
+         "sample_uid",
+     ])
+
+     # Pre-filter samples_df
+     filtered_samples = self.samples_df.filter(
+         pl.col("sample_uid").is_in(sample_uids),
+     ).select(["sample_uid", "sample_name"])
+
+     # Perform a three-way join to get all needed data
+     self.logger.debug("Joining DataFrames to get complete chromatogram data.")
+     df_combined = (
+         filtered_consensus_mapping.join(
+             filtered_features,
+             on="feature_uid",
+             how="inner",
+         )
+         .join(filtered_samples, on="sample_uid", how="inner")
+         .with_columns(
+             (pl.col("rt") - pl.col("rt_original")).alias("rt_shift"),
+         )
+     )
+
+     # Update chrom objects with rt_shift efficiently
+     self.logger.debug("Updating chromatogram objects with rt_shift values.")
+     chrom_data = df_combined.select(["chrom", "rt_shift"]).to_dict(as_series=False)
+     for chrom_obj, rt_shift in zip(chrom_data["chrom"], chrom_data["rt_shift"]):
+         if chrom_obj is not None:
+             chrom_obj.rt_shift = rt_shift
+
+     # Get all unique combinations for complete matrix
+     all_consensus_uids = sorted(df_combined["consensus_uid"].unique().to_list())
+     all_sample_names = sorted(df_combined["sample_name"].unique().to_list())
+
+     # Create a mapping dictionary for O(1) lookup instead of O(n) filtering
+     self.logger.debug("Creating lookup dictionary for chromatogram objects.")
+     chrom_lookup = {}
+     for row in df_combined.select([
+         "consensus_uid",
+         "sample_name",
+         "chrom",
+     ]).iter_rows():
+         key = (row[0], row[1])  # (consensus_uid, sample_name)
+         chrom_lookup[key] = row[2]  # chrom object
+
+     # Build pivot data efficiently using the lookup dictionary
+     pivot_data = []
+     total_iterations = len(all_consensus_uids)
+     progress_interval = max(1, total_iterations // 10)  # Show progress every 10%
+
+     for i, consensus_uid in enumerate(all_consensus_uids):
+         if i % progress_interval == 0:
+             progress_percent = (i / total_iterations) * 100
+             self.logger.debug(
+                 f"Building pivot data: {progress_percent:.0f}% complete ({i}/{total_iterations})",
+             )
+
+         row_data = {"consensus_uid": consensus_uid}
+         for sample_name in all_sample_names:
+             key = (consensus_uid, sample_name)
+             row_data[sample_name] = chrom_lookup.get(key, None)
+         pivot_data.append(row_data)
+
+     self.logger.debug(
+         f"Building pivot data: 100% complete ({total_iterations}/{total_iterations})",
+     )
+
+     # Create Polars DataFrame with complex objects
+     df2_pivoted = pl.DataFrame(pivot_data)
+
+     # Return as Polars DataFrame (can handle complex objects like Chromatogram)
+     return df2_pivoted
+
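The rewritten get_chrom() returns a Polars DataFrame with one row per consensus feature and one Chromatogram-valued column per sample. A minimal usage sketch, assuming an already aligned and merged Study instance named study (sample names below are illustrative):

    # pick 5 random consensus features across two named samples
    chrom_matrix = study.get_chrom(uids=5, samples=["QC_01", "QC_02"])
    if chrom_matrix is not None:
        row = chrom_matrix.row(0, named=True)
        eic = row["QC_01"]        # Chromatogram object, or None if absent
        if eic is not None:
            print(eic.rt_shift)   # RT correction attached during the join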
+ def set_folder(self, folder):
+     """
+     Set the folder for saving and loading files.
+     """
+     if not os.path.exists(folder):
+         os.makedirs(folder)
+     self.folder = folder
+
+
+ def align_reset(self):
+     if self.alignment_ref_index is None:
+         return
+     self.logger.debug("Resetting alignment.")
+     # iterate over all feature maps and set RT to original RT
+     for feature_map in self.features_maps:
+         for feature in feature_map:
+             rt = feature.getMetaValue("original_RT")
+             if rt is not None:
+                 feature.setRT(rt)
+                 feature.removeMetaValue("original_RT")
+     self.alignment_ref_index = None
+
+
+ # TODO I don't get this param
+ def get_consensus(self, quant="chrom_area"):
+     if self.consensus_df is None:
+         self.logger.error("No consensus map found.")
+         return None
+
+     # Convert Polars DataFrame to pandas for this operation since the result is used for export
+     df1 = self.consensus_df.to_pandas().copy()
+
+     # set consensus_id as uint64
+     df1["consensus_id"] = df1["consensus_id"].astype("uint64")
+     # set consensus_id as index
+     df1.set_index("consensus_uid", inplace=True)
+     # sort by consensus_id
+     df1 = df1.sort_index()
+
+     df2 = self.get_consensus_matrix(quant=quant)
+     # sort df2 row by consensus_id
+     df2 = df2.sort_index()
+     # merge df and df2 on consensus_id
+     df = pd.merge(df1, df2, left_index=True, right_index=True, how="left")
+
+     return df
+
+
+ # TODO I don't get this param
+ def get_consensus_matrix(self, quant="chrom_area"):
+     """
+     Get a matrix of consensus features with samples as columns and consensus features as rows.
+     """
+     if quant not in self.features_df.columns:
+         self.logger.error(
+             f"Quantification method {quant} not found in features_df.",
+         )
+         return None
+
+     # Use Polars join instead of pandas merge
+     features_subset = self.features_df.select(["feature_uid", "sample_uid", quant])
+     consensus_mapping_subset = self.consensus_mapping_df.select([
+         "consensus_uid",
+         "feature_uid",
+     ])
+
+     df1 = features_subset.join(
+         consensus_mapping_subset,
+         on="feature_uid",
+         how="left",
+     )
+
+     # Convert to pandas for pivot operation (Polars pivot is still evolving)
+     df1_pd = df1.to_pandas()
+     df2 = df1_pd.pivot_table(
+         index="consensus_uid",
+         columns="sample_uid",
+         values=quant,
+         aggfunc="max",
+     )
+
+     # Create sample_uid to sample_name mapping using Polars
+     sample_mapping = dict(
+         self.samples_df.select(["sample_uid", "sample_name"]).iter_rows(),
+     )
+     # replace sample_uid with sample_name in df2
+     df2 = df2.rename(columns=sample_mapping)
+
+     # round to integer
+     df2 = df2.round()
+     # set consensus_id as uint64
+     df2.index = df2.index.astype("uint64")
+     # set index to consensus_id
+     df2.index.name = "consensus_uid"
+     return df2
+
+
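get_consensus_matrix() pivots a feature-level quantity into a pandas matrix indexed by consensus_uid with one column per sample_name. A sketch of the expected shape, with purely illustrative values:

    matrix = study.get_consensus_matrix(quant="chrom_area")
    # matrix.shape == (n_consensus_features, n_samples), e.g.:
    #                sample_A   sample_B
    # consensus_uid
    # 1              125043.0    98712.0
    # 2                   NaN    40331.0   # feature absent in sample_A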
+ def get_gaps_matrix(self, uids=None):
+     """
+     Get a matrix of gaps between consensus features with samples as columns and consensus features as rows.
+     """
+     if self.consensus_df is None:
+         self.logger.error("No consensus map found.")
+         return None
+     uids = self._get_consensus_uids(uids)
+
+     df1 = self.get_consensus_matrix(quant="filled")
+     if df1 is None or df1.empty:
+         self.logger.warning("No gap data found.")
+         return None
+     # keep only rows where consensus_id is in ids - use pandas indexing since df1 is already pandas
+     df1 = df1[df1.index.isin(uids)]
+     return df1
+
+
+ def get_gaps_stats(self, uids=None):
+     """
+     Get statistics about gaps in the consensus features.
+     """
+
+     df = self.get_gaps_matrix(uids=uids)
+
+     # For each column, count how many times the value is True, False, or None. Summarize in a new df with three rows: True, False, None.
+     if df is None or df.empty:
+         self.logger.warning("No gap data found.")
+         return None
+     gaps_stats = pd.DataFrame(
+         {
+             "aligned": df.apply(lambda x: (~x.astype(bool)).sum()),
+             "filled": df.apply(lambda x: x.astype(bool).sum() - pd.isnull(x).sum()),
+             "missing": df.apply(lambda x: pd.isnull(x).sum()),
+         },
+     )
+     return gaps_stats
+
+
+ # TODO is uid not supposed to be a list anymore?
+ def get_consensus_matches(self, uids=None):
+     uids = self._get_consensus_uids(uids)
+
+     # find all rows in consensus_mapping_df with consensus_id=id - use Polars filtering
+     fid = (
+         self.consensus_mapping_df.filter(
+             pl.col("consensus_uid").is_in(uids),
+         )
+         .select("feature_uid")
+         .to_series()
+         .to_list()
+     )
+     # select all rows in features_df with uid in fid
+     matches = self.features_df.filter(pl.col("feature_uid").is_in(fid)).clone()
+     return matches
+
+
+ def fill_reset(self):
+     # remove all features with filled=True
+     if self.features_df is None:
+         self.logger.warning("No features found.")
+         return
+     l1 = len(self.features_df)
+     self.features_df = self.features_df.filter(~pl.col("filled"))
+     # remove all rows in consensus_mapping_df where feature_uid is not in features_df['uid']
+
+     feature_uids_to_keep = self.features_df["feature_uid"].to_list()
+     self.consensus_mapping_df = self.consensus_mapping_df.filter(
+         pl.col("feature_uid").is_in(feature_uids_to_keep),
+     )
+     self.logger.info(
+         f"Reset filled chromatograms. Chroms removed: {l1 - len(self.features_df)}",
+     )
+
+
+ def _get_feature_uids(self, uids=None, seed=42):
+     """
+     Helper function to get feature_uids from features_df based on input uids.
+     If uids is None, returns all feature_uids.
+     If uids is a single integer, returns a random sample of feature_uids.
+     If uids is a list of strings, returns feature_uids corresponding to those feature_uids.
+     If uids is a list of integers, returns feature_uids corresponding to those feature_uids.
+     """
+     if uids is None:
+         # get all feature_uids from features_df
+         return self.features_df["feature_uid"].to_list()
+     elif isinstance(uids, int):
+         # choose a random sample of feature_uids
+         if len(self.features_df) > uids:
+             np.random.seed(seed)
+             return np.random.choice(
+                 self.features_df["feature_uid"].to_list(),
+                 uids,
+                 replace=False,
+             ).tolist()
+         else:
+             return self.features_df["feature_uid"].to_list()
+     else:
+         # iterate over all uids. If the item is a string, assume it's a feature_uid
+         feature_uids = []
+         for uid in uids:
+             if isinstance(uid, str):
+                 matching_rows = self.features_df.filter(pl.col("feature_uid") == uid)
+                 if not matching_rows.is_empty():
+                     feature_uids.append(
+                         matching_rows.row(0, named=True)["feature_uid"],
+                     )
+             elif isinstance(uid, int):
+                 if uid in self.features_df["feature_uid"].to_list():
+                     feature_uids.append(uid)
+         # remove duplicates
+         feature_uids = list(set(feature_uids))
+         return feature_uids
+
+
+ def _get_consensus_uids(self, uids=None, seed=42):
+     """
+     Helper function to get consensus_uids from consensus_df based on input uids.
+     If uids is None, returns all consensus_uids.
+     If uids is a single integer, returns a random sample of consensus_uids.
+     If uids is a list of strings, returns consensus_uids corresponding to those consensus_ids.
+     If uids is a list of integers, returns consensus_uids corresponding to those consensus_uids.
+     """
+     # Check if consensus_df is empty or doesn't have required columns
+     if self.consensus_df.is_empty() or "consensus_uid" not in self.consensus_df.columns:
+         return []
+
+     if uids is None:
+         # get all consensus_uids from consensus_df
+         return self.consensus_df["consensus_uid"].to_list()
+     elif isinstance(uids, int):
+         # choose a random sample of consensus_uids
+         if len(self.consensus_df) > uids:
+             np.random.seed(seed)  # for reproducibility
+             return np.random.choice(
+                 self.consensus_df["consensus_uid"].to_list(),
+                 uids,
+                 replace=False,
+             ).tolist()
+         else:
+             return self.consensus_df["consensus_uid"].to_list()
+     else:
+         # iterate over all uids. If the item is a string, assume it's a consensus_id
+         consensus_uids = []
+         for uid in uids:
+             if isinstance(uid, str):
+                 matching_rows = self.consensus_df.filter(pl.col("consensus_id") == uid)
+                 if not matching_rows.is_empty():
+                     consensus_uids.append(
+                         matching_rows.row(0, named=True)["consensus_uid"],
+                     )
+             elif isinstance(uid, int):
+                 if uid in self.consensus_df["consensus_uid"].to_list():
+                     consensus_uids.append(uid)
+         # remove duplicates
+         consensus_uids = list(set(consensus_uids))
+         return consensus_uids
+
+
+ def _get_sample_uids(self, samples=None, seed=42):
+     """
+     Helper function to get sample_uids from samples_df based on input samples.
+     If samples is None, returns all sample_uids.
+     If samples is a single integer, returns a random sample of sample_uids.
+     If samples is a list of strings, returns sample_uids corresponding to those sample_names.
+     If samples is a list of integers, returns sample_uids corresponding to those sample_uids.
+     """
+     if samples is None:
+         # get all sample_uids from samples_df
+         return self.samples_df["sample_uid"].to_list()
+     elif isinstance(samples, int):
+         # choose a random sample of sample_uids
+         if len(self.samples_df) > samples:
+             np.random.seed(seed)  # for reproducibility
+             return np.random.choice(
+                 self.samples_df["sample_uid"].to_list(),
+                 samples,
+                 replace=False,
+             ).tolist()
+         else:
+             return self.samples_df["sample_uid"].to_list()
+     else:
+         # iterate over all samples. If the item is a string, assume it's a sample_name
+         sample_uids = []
+         for sample in samples:
+             if isinstance(sample, str):
+                 matching_rows = self.samples_df.filter(pl.col("sample_name") == sample)
+                 if not matching_rows.is_empty():
+                     sample_uids.append(
+                         matching_rows.row(0, named=True)["sample_uid"],
+                     )
+             elif isinstance(sample, int):
+                 if sample in self.samples_df["sample_uid"].to_list():
+                     sample_uids.append(sample)
+         # remove duplicates
+         sample_uids = list(set(sample_uids))
+         return sample_uids
+
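The three _get_*_uids() helpers share one resolution convention: None selects everything, a bare int draws a seeded random subset, strings are resolved by name (or consensus_id), and ints are passed through as uids. A sketch of the equivalent calls, with hypothetical values:

    study._get_sample_uids()               # all sample_uids
    study._get_sample_uids(3)              # 3 random sample_uids (seed=42)
    study._get_sample_uids(["blank_01"])   # resolve sample_name -> sample_uid
    study._get_sample_uids([7, 9])         # keep matching uids, deduplicated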
+ def get_orphans(self):
+     """
+     Get all features that are not in the consensus mapping.
+     """
+     not_in_consensus = self.features_df.filter(~self.features_df['feature_uid'].is_in(self.consensus_mapping_df['feature_uid'].to_list()))
+     return not_in_consensus
+
+ def compress(self, features=True, ms2=True, chrom=False, ms2_max=5):
+     """
+     Perform compress_features, compress_ms2, and compress_chrom operations.
+
+     Parameters:
+         max_replicates (int): Maximum number of MS2 replicates to keep per consensus_uid and energy combination
+     """
+     self.logger.info("Starting full compression...")
+     if features:
+         self.compress_features()
+     if ms2:
+         self.compress_ms2(max_replicates=ms2_max)
+     if chrom:
+         self.compress_chrom()
+     self.logger.info("Compression completed")
+
+
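compress() is now a dispatcher over the three targeted routines defined below. A usage sketch, assuming the defaults are acceptable:

    study.compress()                        # trim features and MS2 (max 5 per pair), keep chromatograms
    study.compress(chrom=True, ms2_max=3)   # also clear chromatograms, keep 3 MS2 replicates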
+ def compress_features(self):
+     """
+     Compress features_df by:
+     1. Deleting features that are not associated to any consensus (according to consensus_mapping_df)
+     2. Setting the m2_specs column to None to save memory
+     """
+     if self.features_df is None or self.features_df.is_empty():
+         self.logger.warning("No features_df found.")
+         return
+
+     if self.consensus_mapping_df is None or self.consensus_mapping_df.is_empty():
+         self.logger.warning("No consensus_mapping_df found.")
+         return
+
+     initial_count = len(self.features_df)
+
+     # Get feature_uids that are associated with consensus features
+     consensus_feature_uids = self.consensus_mapping_df["feature_uid"].to_list()
+
+     # Filter features_df to keep only features associated with consensus
+     self.features_df = self.features_df.filter(
+         pl.col("feature_uid").is_in(consensus_feature_uids)
+     )
+
+     # Set ms2_specs column to None if it exists
+     if "ms2_specs" in self.features_df.columns:
+         # Create a list of None values with the same length as the dataframe
+         # This preserves the Object dtype instead of converting to Null
+         none_values = [None] * len(self.features_df)
+         self.features_df = self.features_df.with_columns(
+             pl.Series("ms2_specs", none_values, dtype=pl.Object)
+         )
+
+     removed_count = initial_count - len(self.features_df)
+     self.logger.info(f"Compressed features: removed {removed_count} features not in consensus, cleared ms2_specs column")
+
+
+ def restore_features(self, samples=None, maps=False):
+     """
+     Update specific columns (chrom, chrom_area, ms2_scans, ms2_specs) in features_df
+     from the corresponding samples by reading features_df from the sample5 file.
+     Use the feature_id for matching.
+
+     Parameters:
+         samples (list, optional): List of sample_uids or sample_names to restore.
+             If None, restores all samples.
+         maps (bool, optional): If True, also load featureXML data and update study.feature_maps.
+     """
+     import datetime
+     from masster.sample.sample import Sample
+
+     if self.features_df is None or self.features_df.is_empty():
+         self.logger.error("No features_df found in study.")
+         return
+
+     if self.samples_df is None or self.samples_df.is_empty():
+         self.logger.error("No samples_df found in study.")
+         return
+
+     # Get sample_uids to process
+     sample_uids = self._get_sample_uids(samples)
+
+     if not sample_uids:
+         self.logger.warning("No valid samples specified.")
+         return
+
+     # Columns to update from sample data
+     columns_to_update = ['chrom', 'chrom_area', 'ms2_scans', 'ms2_specs']
+
+     self.logger.info(f"Restoring columns {columns_to_update} from {len(sample_uids)} samples...")
+
+     # Create a mapping of (sample_uid, feature_id) to feature_uid from study.features_df
+     study_feature_mapping = {}
+     for row in self.features_df.iter_rows(named=True):
+         if "feature_id" in row and "feature_uid" in row and "sample_uid" in row:
+             key = (row["sample_uid"], row["feature_id"])
+             study_feature_mapping[key] = row["feature_uid"]
+
+     # Process each sample
+     tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
+     for sample_uid in tqdm(sample_uids,
+                            unit="sample",
+                            disable=tqdm_disable,
+                            desc=f"{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Restoring samples"):
+         # Get sample info
+         sample_row = self.samples_df.filter(pl.col("sample_uid") == sample_uid)
+         if sample_row.is_empty():
+             self.logger.warning(f"Sample with uid {sample_uid} not found in samples_df.")
+             continue
+
+         sample_info = sample_row.row(0, named=True)
+         sample_path = sample_info.get("sample_path")
+         sample_name = sample_info.get("sample_name")
+
+         if not sample_path or not os.path.exists(sample_path):
+             self.logger.warning(f"Sample file not found for {sample_name}: {sample_path}")
+             continue
+
+         try:
+             # Load sample to get its features_df
+             # Use a direct load call with map=False to prevent feature synchronization
+             # which would remove filled features that don't exist in the original FeatureMap
+             sample = Sample(log_level='DEBUG')
+             sample._load_sample5(sample_path, map=False)
+
+             if sample.features_df is None or sample.features_df.is_empty():
+                 self.logger.warning(f"No features found in sample {sample_name}")
+                 continue
+
+             # Create update data for this sample
+             updates_made = 0
+             for row in sample.features_df.iter_rows(named=True):
+                 feature_id = row.get("feature_id")
+                 if feature_id is None:
+                     continue
+
+                 key = (sample_uid, feature_id)
+                 if key in study_feature_mapping:
+                     feature_uid = study_feature_mapping[key]
+
+                     # Update the specific columns in study.features_df
+                     for col in columns_to_update:
+                         if col in row and col in self.features_df.columns:
+                             # Get the original column dtype to preserve it
+                             original_dtype = self.features_df[col].dtype
+
+                             # Update the specific row and column, preserving dtype
+                             mask = (pl.col("feature_uid") == feature_uid) & (pl.col("sample_uid") == sample_uid)
+
+                             # Handle object columns (like Chromatogram) differently
+                             if original_dtype == pl.Object:
+                                 self.features_df = self.features_df.with_columns(
+                                     pl.when(mask)
+                                     .then(pl.lit(row[col], dtype=original_dtype, allow_object=True))
+                                     .otherwise(pl.col(col))
+                                     .alias(col)
+                                 )
+                             else:
+                                 self.features_df = self.features_df.with_columns(
+                                     pl.when(mask)
+                                     .then(pl.lit(row[col], dtype=original_dtype))
+                                     .otherwise(pl.col(col))
+                                     .alias(col)
+                                 )
+                     updates_made += 1
+
+             self.logger.debug(f"Updated {updates_made} features from sample {sample_name}")
+
+             # If maps is True, load featureXML data
+             if maps:
+                 if hasattr(sample, 'feature_maps'):
+                     self.feature_maps.extend(sample.feature_maps)
+
+         except Exception as e:
+             self.logger.error(f"Failed to load sample {sample_name}: {e}")
+             continue
+
+     self.logger.info(f"Completed restoring columns {columns_to_update} from {len(sample_uids)} samples")
+
+
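restore_features() reverses the compression above by re-reading each per-sample .sample5 file and matching rows on (sample_uid, feature_id). A sketch, with a hypothetical sample name:

    study.compress()                            # drop bulky per-feature payloads
    # ... later, when the payloads are needed again:
    study.restore_features(samples=["QC_01"])   # re-hydrate chrom/chrom_area/ms2_* for one sample
    study.restore_features(maps=True)           # all samples; also extend study.feature_maps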
+ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
+     """
+     Restore chromatograms from individual .sample5 files and gap-fill missing ones.
+
+     This function combines the functionality of restore_features() and fill_chrom():
+     1. First restores chromatograms from individual .sample5 files (like restore_features)
+     2. Then gap-fills any remaining empty chromatograms (like fill_chrom)
+     3. ONLY updates the 'chrom' column, not chrom_area or other derived values
+
+     Parameters:
+         samples (list, optional): List of sample_uids or sample_names to process.
+             If None, processes all samples.
+         mz_tol (float): m/z tolerance for gap filling (default: 0.010)
+         rt_tol (float): RT tolerance for gap filling (default: 10.0)
+     """
+     import datetime
+     import numpy as np
+     from masster.sample.sample import Sample
+     from masster.chromatogram import Chromatogram
+
+     if self.features_df is None or self.features_df.is_empty():
+         self.logger.error("No features_df found in study.")
+         return
+
+     if self.samples_df is None or self.samples_df.is_empty():
+         self.logger.error("No samples_df found in study.")
+         return
+
+     # Get sample_uids to process
+     sample_uids = self._get_sample_uids(samples)
+     if not sample_uids:
+         self.logger.warning("No valid samples specified.")
+         return
+
+     self.logger.info(f"Restoring chromatograms from {len(sample_uids)} samples...")
+
+     # Create mapping of (sample_uid, feature_id) to feature_uid
+     study_feature_mapping = {}
+     for row in self.features_df.iter_rows(named=True):
+         if "feature_id" in row and "feature_uid" in row and "sample_uid" in row:
+             key = (row["sample_uid"], row["feature_id"])
+             study_feature_mapping[key] = row["feature_uid"]
+
+     # Phase 1: Restore from individual .sample5 files (like restore_features)
+     restored_count = 0
+     tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
+
+     self.logger.info("Phase 1: Restoring chromatograms from .sample5 files...")
+     for sample_uid in tqdm(sample_uids,
+                            desc=f"{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Restoring from samples",
+                            disable=tqdm_disable):
+
+         # Get sample info
+         sample_row = self.samples_df.filter(pl.col("sample_uid") == sample_uid)
+         if sample_row.is_empty():
+             self.logger.warning(f"Sample with uid {sample_uid} not found.")
+             continue
+
+         sample_info = sample_row.row(0, named=True)
+         sample_path = sample_info.get("sample_path")
+         sample_name = sample_info.get("sample_name")
+
+         if not sample_path or not os.path.exists(sample_path):
+             self.logger.warning(f"Sample file not found: {sample_path}")
+             continue
+
+         try:
+             # Load sample (with map=False to prevent feature synchronization)
+             sample = Sample(log_level='WARNING')
+             sample._load_sample5(sample_path, map=False)
+
+             if sample.features_df is None or sample.features_df.is_empty():
+                 self.logger.warning(f"No features found in sample {sample_name}")
+                 continue
+
+             # Update chromatograms from this sample
+             for row in sample.features_df.iter_rows(named=True):
+                 feature_id = row.get("feature_id")
+                 chrom = row.get("chrom")
+
+                 if feature_id is None or chrom is None:
+                     continue
+
+                 key = (sample_uid, feature_id)
+                 if key in study_feature_mapping:
+                     feature_uid = study_feature_mapping[key]
+
+                     # Update only the chrom column
+                     mask = (pl.col("feature_uid") == feature_uid) & (pl.col("sample_uid") == sample_uid)
+                     self.features_df = self.features_df.with_columns(
+                         pl.when(mask)
+                         .then(pl.lit(chrom, dtype=pl.Object, allow_object=True))
+                         .otherwise(pl.col("chrom"))
+                         .alias("chrom")
+                     )
+                     restored_count += 1
+
+         except Exception as e:
+             self.logger.error(f"Failed to load sample {sample_name}: {e}")
+             continue
+
+     self.logger.info(f"Phase 1 complete: Restored {restored_count} chromatograms from .sample5 files")
+
+     # Phase 2: Gap-fill remaining empty chromatograms (like fill_chrom)
+     self.logger.info("Phase 2: Gap-filling remaining empty chromatograms...")
+
+     # Count how many chromatograms are still missing
+     empty_chroms = self.features_df.filter(pl.col("chrom").is_null()).height
+     total_chroms = len(self.features_df)
+
+     self.logger.debug(f"Chromatograms still missing: {empty_chroms}/{total_chroms} ({empty_chroms/total_chroms*100:.1f}%)")
+
+     if empty_chroms == 0:
+         self.logger.info("All chromatograms restored from .sample5 files. No gap-filling needed.")
+         return
+
+     # Get consensus info for gap filling
+     consensus_info = {}
+     for row in self.consensus_df.iter_rows(named=True):
+         consensus_info[row["consensus_uid"]] = {
+             "rt_start_mean": row["rt_start_mean"],
+             "rt_end_mean": row["rt_end_mean"],
+             "mz": row["mz"],
+             "rt": row["rt"],
+         }
+
+     filled_count = 0
+
+     # Process each sample that has missing chromatograms
+     for sample_uid in tqdm(sample_uids,
+                            desc=f"{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Gap-filling missing chromatograms",
+                            disable=tqdm_disable):
+
+         # Get features with missing chromatograms for this sample
+         missing_features = self.features_df.filter(
+             (pl.col("sample_uid") == sample_uid) &
+             (pl.col("chrom").is_null())
+         )
+
+         if missing_features.is_empty():
+             continue
+
+         # Get sample info
+         sample_row = self.samples_df.filter(pl.col("sample_uid") == sample_uid)
+         sample_info = sample_row.row(0, named=True)
+         sample_path = sample_info.get("sample_path")
+         sample_name = sample_info.get("sample_name")
+
+         if not sample_path or not os.path.exists(sample_path):
+             continue
+
+         try:
+             # Load sample for MS1 data extraction
+             sample = Sample(log_level='WARNING')
+             sample._load_sample5(sample_path, map=False)
+
+             if not hasattr(sample, 'ms1_df') or sample.ms1_df is None or sample.ms1_df.is_empty():
+                 continue
+
+             # Process each missing feature
+             for feature_row in missing_features.iter_rows(named=True):
+                 feature_uid = feature_row["feature_uid"]
+                 mz = feature_row["mz"]
+                 rt = feature_row["rt"]
+                 rt_start = feature_row.get("rt_start", rt - rt_tol)
+                 rt_end = feature_row.get("rt_end", rt + rt_tol)
+
+                 # Extract EIC from MS1 data
+                 d = sample.ms1_df.filter(
+                     (pl.col("mz") >= mz - mz_tol) &
+                     (pl.col("mz") <= mz + mz_tol) &
+                     (pl.col("rt") >= rt_start - rt_tol) &
+                     (pl.col("rt") <= rt_end + rt_tol)
+                 )
+
+                 # Create chromatogram
+                 if d.is_empty():
+                     # Create empty chromatogram
+                     eic = Chromatogram(
+                         rt=np.array([rt_start, rt_end]),
+                         inty=np.array([0.0, 0.0]),
+                         label=f"EIC mz={mz:.4f} (gap-filled)",
+                         file=sample_path,
+                         mz=mz,
+                         mz_tol=mz_tol,
+                         feature_start=rt_start,
+                         feature_end=rt_end,
+                         feature_apex=rt,
+                     )
+                 else:
+                     # Create real chromatogram from data
+                     eic_rt = d.group_by("rt").agg(pl.col("inty").max()).sort("rt")
+
+                     if len(eic_rt) > 4:
+                         eic = Chromatogram(
+                             eic_rt["rt"].to_numpy(),
+                             eic_rt["inty"].to_numpy(),
+                             label=f"EIC mz={mz:.4f} (gap-filled)",
+                             file=sample_path,
+                             mz=mz,
+                             mz_tol=mz_tol,
+                             feature_start=rt_start,
+                             feature_end=rt_end,
+                             feature_apex=rt,
+                         ).find_peaks()
+                     else:
+                         eic = Chromatogram(
+                             eic_rt["rt"].to_numpy(),
+                             eic_rt["inty"].to_numpy(),
+                             label=f"EIC mz={mz:.4f} (gap-filled)",
+                             file=sample_path,
+                             mz=mz,
+                             mz_tol=mz_tol,
+                             feature_start=rt_start,
+                             feature_end=rt_end,
+                             feature_apex=rt,
+                         )
+
+                 # Update the chromatogram in the study
+                 mask = pl.col("feature_uid") == feature_uid
+                 self.features_df = self.features_df.with_columns(
+                     pl.when(mask)
+                     .then(pl.lit(eic, dtype=pl.Object, allow_object=True))
+                     .otherwise(pl.col("chrom"))
+                     .alias("chrom")
+                 )
+                 filled_count += 1
+
+         except Exception as e:
+             self.logger.error(f"Failed to gap-fill sample {sample_name}: {e}")
+             continue
+
+     self.logger.info(f"Phase 2 complete: Gap-filled {filled_count} chromatograms")
+
+     # Final summary
+     final_non_null = self.features_df.filter(pl.col("chrom").is_not_null()).height
+     final_total = len(self.features_df)
+
+     self.logger.info(f"Chromatogram restoration complete: {final_non_null}/{final_total} ({final_non_null/final_total*100:.1f}%)")
+     self.logger.info(f"Restored from .sample5 files: {restored_count}, Gap-filled from raw data: {filled_count}")
+
+
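Unlike restore_features(), restore_chrom() touches only the chrom column: phase 1 copies chromatograms back from the .sample5 files, phase 2 rebuilds any still-missing EICs from raw MS1 points within the given tolerances. A sketch with the defaults spelled out:

    study.restore_chrom(mz_tol=0.010, rt_tol=10.0)   # 10 mDa m/z window, 10 s RT padding
    still_empty = study.features_df.filter(pl.col("chrom").is_null()).height
    print(f"chromatograms still empty: {still_empty}")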
+ def compress_ms2(self, max_replicates=5):
+     """
+     Reduce the number of entries matching any pair of (consensus and energy) to max XY rows.
+     Groups all rows by consensus_uid and energy. For each group, sort by number_frags * prec_inty,
+     and then pick the top XY rows. Discard the others.
+
+     Parameters:
+         max_replicates (int): Maximum number of replicates to keep per consensus_uid and energy combination
+     """
+     if self.consensus_ms2 is None or self.consensus_ms2.is_empty():
+         self.logger.warning("No consensus_ms2 found.")
+         return
+
+     initial_count = len(self.consensus_ms2)
+
+     # Create a ranking score based on number_frags * prec_inty
+     # Handle None values by treating them as 0
+     self.consensus_ms2 = self.consensus_ms2.with_columns([
+         (
+             pl.col("number_frags").fill_null(0) *
+             pl.col("prec_inty").fill_null(0)
+         ).alias("ranking_score")
+     ])
+
+     # Group by consensus_uid and energy, then rank by score and keep top max_replicates
+     compressed_ms2 = (
+         self.consensus_ms2
+         .with_row_count("row_id")  # Add row numbers for stable sorting
+         .sort(["consensus_uid", "energy", "ranking_score", "row_id"], descending=[False, False, True, False])
+         .with_columns([
+             pl.int_range(pl.len()).over(["consensus_uid", "energy"]).alias("rank")
+         ])
+         .filter(pl.col("rank") < max_replicates)
+         .drop(["ranking_score", "row_id", "rank"])
+     )
+
+     self.consensus_ms2 = compressed_ms2
+
+     removed_count = initial_count - len(self.consensus_ms2)
+     self.logger.info(f"Compressed MS2 data: removed {removed_count} entries, kept max {max_replicates} per consensus/energy pair")
+
+
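compress_ms2() scores replicates within each (consensus_uid, energy) group as number_frags * prec_inty (nulls count as 0) and keeps the top max_replicates. A worked example with illustrative numbers:

    # three replicates of consensus 42 at energy 35, max_replicates=2:
    #   number_frags=12, prec_inty=1e5   -> score 1.2e6   (kept)
    #   number_frags= 8, prec_inty=2e5   -> score 1.6e6   (kept, ranks first)
    #   number_frags= 4, prec_inty=None  -> score 0       (dropped)
    study.compress_ms2(max_replicates=2)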
+ def compress_chrom(self):
+     """
+     Set the chrom column in study.features_df to null to save memory.
+
+     This function clears all chromatogram objects from the features_df, which can
+     significantly reduce memory usage in large studies.
+     """
+     if self.features_df is None or self.features_df.is_empty():
+         self.logger.warning("No features_df found.")
+         return
+
+     if "chrom" not in self.features_df.columns:
+         self.logger.warning("No 'chrom' column found in features_df.")
+         return
+
+     # Count non-null chromatograms before compression
+     non_null_count = self.features_df.filter(pl.col("chrom").is_not_null()).height
+
+     # Set chrom column to None while keeping dtype as object
+     self.features_df = self.features_df.with_columns(
+         pl.lit(None, dtype=pl.Object).alias("chrom")
+     )
+
+     self.logger.info(f"Compressed chromatograms: cleared {non_null_count} chromatogram objects from features_df")
+
+
+ def set_source(self, filename):
+     """
+     Reassign file_source for all samples in samples_df. If filename contains only a path,
+     keep the current basename and build an absolute path. Check that the new file exists
+     before overwriting the old file_source.
+
+     Parameters:
+         filename (str): New file path or directory path for all samples
+
+     Returns:
+         None
+     """
+     import os
+
+     if self.samples_df is None or len(self.samples_df) == 0:
+         self.logger.warning("No samples found in study.")
+         return
+
+     updated_count = 0
+     failed_count = 0
+
+     # Get all current file_source values
+     current_sources = self.samples_df.get_column("file_source").to_list()
+     sample_names = self.samples_df.get_column("sample_name").to_list()
+
+     new_sources = []
+
+     for i, (current_source, sample_name) in enumerate(zip(current_sources, sample_names)):
+         # Check if filename is just a directory path
+         if os.path.isdir(filename):
+             if current_source is None or current_source == "":
+                 self.logger.warning(f"Cannot build path for sample '{sample_name}': no current file_source available")
+                 new_sources.append(current_source)
+                 failed_count += 1
+                 continue
+
+             # Get the basename from current file_source
+             current_basename = os.path.basename(current_source)
+             # Build new absolute path
+             new_file_path = os.path.join(filename, current_basename)
+         else:
+             # filename is a full path, make it absolute
+             new_file_path = os.path.abspath(filename)
+
+         # Check if the new file exists
+         if not os.path.exists(new_file_path):
+             self.logger.warning(f"File does not exist for sample '{sample_name}': {new_file_path}")
+             new_sources.append(current_source)
+             failed_count += 1
+             continue
+
+         # File exists, update source
+         new_sources.append(new_file_path)
+         updated_count += 1
+
+         # Log individual updates at debug level
+         self.logger.debug(f"Updated file_source for sample '{sample_name}': {current_source} -> {new_file_path}")
+
+     # Update the samples_df with new file_source values
+     self.samples_df = self.samples_df.with_columns(
+         pl.Series("file_source", new_sources).alias("file_source")
+     )
+
+     # Log summary
+     if updated_count > 0:
+         self.logger.info(f"Updated file_source for {updated_count} samples")
+     if failed_count > 0:
+         self.logger.warning(f"Failed to update file_source for {failed_count} samples")
+
+
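set_source() accepts either a directory (each sample keeps its current basename) or one full file path (assigned to every sample), and only overwrites entries whose new target exists on disk. A sketch with hypothetical paths:

    study.set_source("/mnt/relocated_raw")        # directory: rejoin with existing basenames
    study.set_source("/mnt/raw/run_001.wiff2")    # single file: applied to all samples if it exists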
975
+    def features_select(
+        self,
+        mz=None,
+        rt=None,
+        inty=None,
+        sample_uid=None,
+        sample_name=None,
+        consensus_uid=None,
+        feature_uid=None,
+        filled=None,
+        quality=None,
+        chrom_coherence=None,
+        chrom_prominence=None,
+        chrom_prominence_scaled=None,
+        chrom_height_scaled=None,
+    ):
+        """
+        Select features from features_df based on specified criteria and return the filtered DataFrame.
+
+        Optimized version: combines all filters into a single operation for better performance.
+
+        Parameters:
+            mz: m/z range filter (tuple for range, single value for minimum)
+            rt: retention time range filter (tuple for range, single value for minimum)
+            inty: intensity filter (tuple for range, single value for minimum)
+            sample_uid: sample UID filter (list, single value, or tuple for range)
+            sample_name: sample name filter (list or single value)
+            consensus_uid: consensus UID filter (list, single value, or tuple for range)
+            feature_uid: feature UID filter (list, single value, or tuple for range)
+            filled: filter for filled/not filled features (bool)
+            quality: quality score filter (tuple for range, single value for minimum)
+            chrom_coherence: chromatogram coherence filter (tuple for range, single value for minimum)
+            chrom_prominence: chromatogram prominence filter (tuple for range, single value for minimum)
+            chrom_prominence_scaled: scaled chromatogram prominence filter (tuple for range, single value for minimum)
+            chrom_height_scaled: scaled chromatogram height filter (tuple for range, single value for minimum)
+
+        Returns:
+            polars.DataFrame: Filtered features DataFrame
+        """
+        if self.features_df is None or self.features_df.is_empty():
+            self.logger.warning("No features found in study.")
+            return pl.DataFrame()
+
+        # Early return if no filters provided - performance optimization
+        filter_params = [mz, rt, inty, sample_uid, sample_name, consensus_uid,
+                         feature_uid, filled, quality, chrom_coherence,
+                         chrom_prominence, chrom_prominence_scaled, chrom_height_scaled]
+        if all(param is None for param in filter_params):
+            return self.features_df.clone()
+
+        initial_count = len(self.features_df)
+
+        # Pre-check available columns once for efficiency
+        available_columns = set(self.features_df.columns)
+
+        # Build all filter conditions first, then apply them all at once
+        filter_conditions = []
+        warnings = []
+
+        # Filter by m/z
+        if mz is not None:
+            if isinstance(mz, tuple) and len(mz) == 2:
+                min_mz, max_mz = mz
+                filter_conditions.append((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
+            else:
+                filter_conditions.append(pl.col("mz") >= mz)
+
+        # Filter by retention time
+        if rt is not None:
+            if isinstance(rt, tuple) and len(rt) == 2:
+                min_rt, max_rt = rt
+                filter_conditions.append((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
+            else:
+                filter_conditions.append(pl.col("rt") >= rt)
+
+        # Filter by intensity
+        if inty is not None:
+            if isinstance(inty, tuple) and len(inty) == 2:
+                min_inty, max_inty = inty
+                filter_conditions.append((pl.col("inty") >= min_inty) & (pl.col("inty") <= max_inty))
+            else:
+                filter_conditions.append(pl.col("inty") >= inty)
+
+        # Filter by sample_uid
+        if sample_uid is not None:
+            if isinstance(sample_uid, (list, tuple)):
+                if len(sample_uid) == 2 and not isinstance(sample_uid, list):
+                    # A 2-tuple is treated as a range
+                    min_uid, max_uid = sample_uid
+                    filter_conditions.append((pl.col("sample_uid") >= min_uid) & (pl.col("sample_uid") <= max_uid))
+                else:
+                    # A list is treated as a membership test
+                    filter_conditions.append(pl.col("sample_uid").is_in(sample_uid))
+            else:
+                filter_conditions.append(pl.col("sample_uid") == sample_uid)
+
+        # Filter by sample_name (requires resolving names to sample_uids first)
+        if sample_name is not None:
+            # Get sample_uids for the given sample names
+            if isinstance(sample_name, list):
+                sample_uids_for_names = self.samples_df.filter(
+                    pl.col("sample_name").is_in(sample_name)
+                )["sample_uid"].to_list()
+            else:
+                sample_uids_for_names = self.samples_df.filter(
+                    pl.col("sample_name") == sample_name
+                )["sample_uid"].to_list()
+
+            if sample_uids_for_names:
+                filter_conditions.append(pl.col("sample_uid").is_in(sample_uids_for_names))
+            else:
+                filter_conditions.append(pl.lit(False))  # No matching samples
+
+        # Filter by consensus_uid
+        if consensus_uid is not None:
+            if isinstance(consensus_uid, (list, tuple)):
+                if len(consensus_uid) == 2 and not isinstance(consensus_uid, list):
+                    # A 2-tuple is treated as a range
+                    min_uid, max_uid = consensus_uid
+                    filter_conditions.append((pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid))
+                else:
+                    # A list is treated as a membership test
+                    filter_conditions.append(pl.col("consensus_uid").is_in(consensus_uid))
+            else:
+                filter_conditions.append(pl.col("consensus_uid") == consensus_uid)
+
+        # Filter by feature_uid
+        if feature_uid is not None:
+            if isinstance(feature_uid, (list, tuple)):
+                if len(feature_uid) == 2 and not isinstance(feature_uid, list):
+                    # A 2-tuple is treated as a range
+                    min_uid, max_uid = feature_uid
+                    filter_conditions.append((pl.col("feature_uid") >= min_uid) & (pl.col("feature_uid") <= max_uid))
+                else:
+                    # A list is treated as a membership test
+                    filter_conditions.append(pl.col("feature_uid").is_in(feature_uid))
+            else:
+                filter_conditions.append(pl.col("feature_uid") == feature_uid)
+
+        # Filter by filled status
+        if filled is not None:
+            if "filled" in available_columns:
+                if filled:
+                    filter_conditions.append(pl.col("filled"))
+                else:
+                    filter_conditions.append(~pl.col("filled") | pl.col("filled").is_null())
+            else:
+                warnings.append("'filled' column not found in features_df")
+
+        # Filter by quality
+        if quality is not None:
+            if "quality" in available_columns:
+                if isinstance(quality, tuple) and len(quality) == 2:
+                    min_quality, max_quality = quality
+                    filter_conditions.append((pl.col("quality") >= min_quality) & (pl.col("quality") <= max_quality))
+                else:
+                    filter_conditions.append(pl.col("quality") >= quality)
+            else:
+                warnings.append("'quality' column not found in features_df")
+
+        # Filter by chromatogram coherence
+        if chrom_coherence is not None:
+            if "chrom_coherence" in available_columns:
+                if isinstance(chrom_coherence, tuple) and len(chrom_coherence) == 2:
+                    min_coherence, max_coherence = chrom_coherence
+                    filter_conditions.append((pl.col("chrom_coherence") >= min_coherence) & (pl.col("chrom_coherence") <= max_coherence))
+                else:
+                    filter_conditions.append(pl.col("chrom_coherence") >= chrom_coherence)
+            else:
+                warnings.append("'chrom_coherence' column not found in features_df")
+
+        # Filter by chromatogram prominence
+        if chrom_prominence is not None:
+            if "chrom_prominence" in available_columns:
+                if isinstance(chrom_prominence, tuple) and len(chrom_prominence) == 2:
+                    min_prominence, max_prominence = chrom_prominence
+                    filter_conditions.append((pl.col("chrom_prominence") >= min_prominence) & (pl.col("chrom_prominence") <= max_prominence))
+                else:
+                    filter_conditions.append(pl.col("chrom_prominence") >= chrom_prominence)
+            else:
+                warnings.append("'chrom_prominence' column not found in features_df")
+
+        # Filter by scaled chromatogram prominence
+        if chrom_prominence_scaled is not None:
+            if "chrom_prominence_scaled" in available_columns:
+                if isinstance(chrom_prominence_scaled, tuple) and len(chrom_prominence_scaled) == 2:
+                    min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled
+                    filter_conditions.append((pl.col("chrom_prominence_scaled") >= min_prominence_scaled) & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled))
+                else:
+                    filter_conditions.append(pl.col("chrom_prominence_scaled") >= chrom_prominence_scaled)
+            else:
+                warnings.append("'chrom_prominence_scaled' column not found in features_df")
+
+        # Filter by scaled chromatogram height
+        if chrom_height_scaled is not None:
+            if "chrom_height_scaled" in available_columns:
+                if isinstance(chrom_height_scaled, tuple) and len(chrom_height_scaled) == 2:
+                    min_height_scaled, max_height_scaled = chrom_height_scaled
+                    filter_conditions.append((pl.col("chrom_height_scaled") >= min_height_scaled) & (pl.col("chrom_height_scaled") <= max_height_scaled))
+                else:
+                    filter_conditions.append(pl.col("chrom_height_scaled") >= chrom_height_scaled)
+            else:
+                warnings.append("'chrom_height_scaled' column not found in features_df")
+
+        # Log all warnings once at the end for efficiency
+        for warning in warnings:
+            self.logger.warning(warning)
+
+        # Apply all filters at once using lazy evaluation for optimal performance
+        if filter_conditions:
+            # Combine all conditions with AND
+            combined_filter = filter_conditions[0]
+            for condition in filter_conditions[1:]:
+                combined_filter = combined_filter & condition
+
+            # Apply the combined filter using lazy evaluation
+            feats = self.features_df.lazy().filter(combined_filter).collect()
+        else:
+            feats = self.features_df.clone()
+
+        final_count = len(feats)
+
+        if final_count == 0:
+            self.logger.warning("No features remaining after applying selection criteria.")
+        else:
+            removed_count = initial_count - final_count
+            self.logger.info(f"Features selected: {final_count} (removed: {removed_count})")
+
+        return feats
+
+
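A hedged usage sketch for the selector above, assuming a loaded Study instance named `study` (all threshold values are illustrative): 2-tuples are ranges, scalars are minimums (or exact matches for UID columns), and lists are membership tests.

    feats = study.features_select(
        mz=(200.0, 500.0),               # m/z range
        rt=120.0,                        # rt >= 120
        quality=0.5,                     # quality >= 0.5 (warns and skips if the column is absent)
        sample_name=["QC_01", "QC_02"],  # resolved to sample_uids via samples_df
    )
    print(feats.height)                  # number of matching feature rows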
+    def features_filter(self, features):
+        """
+        Filter features_df by removing all features given by the provided identifiers.
+        This is the inverse of features_select - it removes the selected features.
+
+        Optimized version: batch operations and reduced overhead for better performance.
+
+        Parameters:
+            features: Features to remove. Can be:
+                - polars.DataFrame: features DataFrame (the feature_uid column is used)
+                - list: list of feature_uids to remove
+                - int: single feature_uid to remove
+
+        Returns:
+            None (modifies self.features_df in place)
+        """
+        if self.features_df is None or self.features_df.is_empty():
+            self.logger.warning("No features found in study.")
+            return
+
+        # Early return if no features provided
+        if features is None:
+            self.logger.warning("No features provided for filtering.")
+            return
+
+        initial_count = len(self.features_df)
+
+        # Determine feature_uids to remove - optimized type checking
+        if isinstance(features, pl.DataFrame):
+            if "feature_uid" not in features.columns:
+                self.logger.error("features DataFrame must contain 'feature_uid' column")
+                return
+            feature_uids_to_remove = features["feature_uid"].to_list()
+        elif isinstance(features, (list, tuple)):
+            feature_uids_to_remove = list(features)  # convert tuple to list if needed
+        elif isinstance(features, int):
+            feature_uids_to_remove = [features]
+        else:
+            self.logger.error("features parameter must be a DataFrame, list, tuple, or int")
+            return
+
+        # Early return if no UIDs to remove
+        if not feature_uids_to_remove:
+            self.logger.warning("No feature UIDs provided for filtering.")
+            return
+
+        # Deduplicate large UID lists for faster membership checks
+        if len(feature_uids_to_remove) > 100:
+            feature_uids_set = set(feature_uids_to_remove)
+            # Use the deduplicated list only if it is significantly smaller
+            if len(feature_uids_set) < len(feature_uids_to_remove) * 0.8:
+                feature_uids_to_remove = list(feature_uids_set)
+
+        # Create the filter condition once
+        filter_condition = ~pl.col("feature_uid").is_in(feature_uids_to_remove)
+
+        # Apply filter to features_df using lazy evaluation for better performance
+        self.features_df = self.features_df.lazy().filter(filter_condition).collect()
+
+        # Apply filter to consensus_mapping_df if it exists - batch operation
+        mapping_removed_count = 0
+        if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
+            initial_mapping_count = len(self.consensus_mapping_df)
+            self.consensus_mapping_df = (
+                self.consensus_mapping_df
+                .lazy()
+                .filter(filter_condition)
+                .collect()
+            )
+            mapping_removed_count = initial_mapping_count - len(self.consensus_mapping_df)
+
+        # Calculate results once and log efficiently
+        final_count = len(self.features_df)
+        removed_count = initial_count - final_count
+
+        # Single comprehensive log message
+        if mapping_removed_count > 0:
+            self.logger.info(f"Filtered {removed_count} features and {mapping_removed_count} consensus mappings. Remaining features: {final_count}")
+        else:
+            self.logger.info(f"Filtered {removed_count} features. Remaining features: {final_count}")
+
+
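Since features_select returns matching rows and features_filter removes rows in place (together with their consensus mappings), the two compose into a "select what you do not want, then drop it" idiom. A sketch under the same assumed `study` instance:

    low_quality = study.features_select(quality=(0.0, 0.2))  # quality in [0.0, 0.2]
    study.features_filter(low_quality)                       # drops those features and their mappings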
+    def features_delete(self, features):
+        """
+        Delete features from features_df based on feature identifiers.
+        This is an alias for features_filter, for consistency with sample.features_delete().
+
+        Parameters:
+            features: Features to delete. Can be:
+                - polars.DataFrame: features DataFrame (the feature_uid column is used)
+                - list: list of feature_uids to delete
+                - int: single feature_uid to delete
+
+        Returns:
+            None (modifies self.features_df in place)
+        """
+        self.features_filter(features)
+
+
+    def consensus_select(
+        self,
+        mz=None,
+        rt=None,
+        inty_mean=None,
+        consensus_uid=None,
+        consensus_id=None,
+        number_samples=None,
+        number_ms2=None,
+        quality=None,
+        bl=None,
+        chrom_coherence_mean=None,
+        chrom_prominence_mean=None,
+        chrom_prominence_scaled_mean=None,
+        chrom_height_scaled_mean=None,
+        rt_delta_mean=None,
+    ):
+        """
+        Select consensus features from consensus_df based on specified criteria and return the filtered DataFrame.
+
+        Parameters:
+            mz: m/z range filter (tuple for range, single value for minimum)
+            rt: retention time range filter (tuple for range, single value for minimum)
+            inty_mean: mean intensity filter (tuple for range, single value for minimum)
+            consensus_uid: consensus UID filter (list, single value, or tuple for range)
+            consensus_id: consensus ID filter (list or single value)
+            number_samples: number of samples filter (tuple for range, single value for minimum)
+            number_ms2: number of MS2 spectra filter (tuple for range, single value for minimum)
+            quality: quality score filter (tuple for range, single value for minimum)
+            bl: baseline filter (tuple for range, single value for minimum)
+            chrom_coherence_mean: mean chromatogram coherence filter (tuple for range, single value for minimum)
+            chrom_prominence_mean: mean chromatogram prominence filter (tuple for range, single value for minimum)
+            chrom_prominence_scaled_mean: mean scaled chromatogram prominence filter (tuple for range, single value for minimum)
+            chrom_height_scaled_mean: mean scaled chromatogram height filter (tuple for range, single value for minimum)
+            rt_delta_mean: mean RT delta filter (tuple for range, single value for minimum)
+
+        Returns:
+            polars.DataFrame: Filtered consensus DataFrame
+        """
+        if self.consensus_df is None or self.consensus_df.is_empty():
+            self.logger.warning("No consensus features found in study.")
+            return pl.DataFrame()
+
+        consensus = self.consensus_df.clone()
+        initial_count = len(consensus)
+
+        # Filter by m/z
+        if mz is not None:
+            consensus_len_before_filter = len(consensus)
+            if isinstance(mz, tuple) and len(mz) == 2:
+                min_mz, max_mz = mz
+                consensus = consensus.filter((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
+            else:
+                consensus = consensus.filter(pl.col("mz") >= mz)
+            self.logger.debug(
+                f"Selected consensus by mz. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            )
+
+        # Filter by retention time
+        if rt is not None:
+            consensus_len_before_filter = len(consensus)
+            if isinstance(rt, tuple) and len(rt) == 2:
+                min_rt, max_rt = rt
+                consensus = consensus.filter((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
+            else:
+                consensus = consensus.filter(pl.col("rt") >= rt)
+            self.logger.debug(
+                f"Selected consensus by rt. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            )
+
+        # Filter by mean intensity
+        if inty_mean is not None:
+            consensus_len_before_filter = len(consensus)
+            if isinstance(inty_mean, tuple) and len(inty_mean) == 2:
+                min_inty, max_inty = inty_mean
+                consensus = consensus.filter((pl.col("inty_mean") >= min_inty) & (pl.col("inty_mean") <= max_inty))
+            else:
+                consensus = consensus.filter(pl.col("inty_mean") >= inty_mean)
+            self.logger.debug(
+                f"Selected consensus by inty_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            )
+
+        # Filter by consensus_uid
+        if consensus_uid is not None:
+            consensus_len_before_filter = len(consensus)
+            if isinstance(consensus_uid, (list, tuple)):
+                if len(consensus_uid) == 2 and not isinstance(consensus_uid, list):
+                    # A 2-tuple is treated as a range
+                    min_uid, max_uid = consensus_uid
+                    consensus = consensus.filter((pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid))
+                else:
+                    # A list is treated as a membership test
+                    consensus = consensus.filter(pl.col("consensus_uid").is_in(consensus_uid))
+            else:
+                consensus = consensus.filter(pl.col("consensus_uid") == consensus_uid)
+            self.logger.debug(
+                f"Selected consensus by consensus_uid. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            )
+
+        # Filter by consensus_id
+        if consensus_id is not None:
+            consensus_len_before_filter = len(consensus)
+            if isinstance(consensus_id, list):
+                consensus = consensus.filter(pl.col("consensus_id").is_in(consensus_id))
+            else:
+                consensus = consensus.filter(pl.col("consensus_id") == consensus_id)
+            self.logger.debug(
+                f"Selected consensus by consensus_id. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            )
+
+        # Filter by number of samples
+        if number_samples is not None:
+            consensus_len_before_filter = len(consensus)
+            if isinstance(number_samples, tuple) and len(number_samples) == 2:
+                min_samples, max_samples = number_samples
+                consensus = consensus.filter((pl.col("number_samples") >= min_samples) & (pl.col("number_samples") <= max_samples))
+            else:
+                consensus = consensus.filter(pl.col("number_samples") >= number_samples)
+            self.logger.debug(
+                f"Selected consensus by number_samples. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            )
+
+        # Filter by number of MS2 spectra
+        if number_ms2 is not None:
+            consensus_len_before_filter = len(consensus)
+            if "number_ms2" in consensus.columns:
+                if isinstance(number_ms2, tuple) and len(number_ms2) == 2:
+                    min_ms2, max_ms2 = number_ms2
+                    consensus = consensus.filter((pl.col("number_ms2") >= min_ms2) & (pl.col("number_ms2") <= max_ms2))
+                else:
+                    consensus = consensus.filter(pl.col("number_ms2") >= number_ms2)
+            else:
+                self.logger.warning("'number_ms2' column not found in consensus_df")
+            self.logger.debug(
+                f"Selected consensus by number_ms2. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            )
+
+        # Filter by quality
+        if quality is not None:
+            consensus_len_before_filter = len(consensus)
+            if isinstance(quality, tuple) and len(quality) == 2:
+                min_quality, max_quality = quality
+                consensus = consensus.filter((pl.col("quality") >= min_quality) & (pl.col("quality") <= max_quality))
+            else:
+                consensus = consensus.filter(pl.col("quality") >= quality)
+            self.logger.debug(
+                f"Selected consensus by quality. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            )
+
+        # Filter by baseline
+        if bl is not None:
+            consensus_len_before_filter = len(consensus)
+            if "bl" in consensus.columns:
+                if isinstance(bl, tuple) and len(bl) == 2:
+                    min_bl, max_bl = bl
+                    consensus = consensus.filter((pl.col("bl") >= min_bl) & (pl.col("bl") <= max_bl))
+                else:
+                    consensus = consensus.filter(pl.col("bl") >= bl)
+            else:
+                self.logger.warning("'bl' column not found in consensus_df")
+            self.logger.debug(
+                f"Selected consensus by bl. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            )
+
+        # Filter by mean chromatogram coherence
+        if chrom_coherence_mean is not None:
+            consensus_len_before_filter = len(consensus)
+            if "chrom_coherence_mean" in consensus.columns:
+                if isinstance(chrom_coherence_mean, tuple) and len(chrom_coherence_mean) == 2:
+                    min_coherence, max_coherence = chrom_coherence_mean
+                    consensus = consensus.filter((pl.col("chrom_coherence_mean") >= min_coherence) & (pl.col("chrom_coherence_mean") <= max_coherence))
+                else:
+                    consensus = consensus.filter(pl.col("chrom_coherence_mean") >= chrom_coherence_mean)
+            else:
+                self.logger.warning("'chrom_coherence_mean' column not found in consensus_df")
+            self.logger.debug(
+                f"Selected consensus by chrom_coherence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            )
+
+        # Filter by mean chromatogram prominence
+        if chrom_prominence_mean is not None:
+            consensus_len_before_filter = len(consensus)
+            if "chrom_prominence_mean" in consensus.columns:
+                if isinstance(chrom_prominence_mean, tuple) and len(chrom_prominence_mean) == 2:
+                    min_prominence, max_prominence = chrom_prominence_mean
+                    consensus = consensus.filter((pl.col("chrom_prominence_mean") >= min_prominence) & (pl.col("chrom_prominence_mean") <= max_prominence))
+                else:
+                    consensus = consensus.filter(pl.col("chrom_prominence_mean") >= chrom_prominence_mean)
+            else:
+                self.logger.warning("'chrom_prominence_mean' column not found in consensus_df")
+            self.logger.debug(
+                f"Selected consensus by chrom_prominence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            )
+
+        # Filter by mean scaled chromatogram prominence
+        if chrom_prominence_scaled_mean is not None:
+            consensus_len_before_filter = len(consensus)
+            if "chrom_prominence_scaled_mean" in consensus.columns:
+                if isinstance(chrom_prominence_scaled_mean, tuple) and len(chrom_prominence_scaled_mean) == 2:
+                    min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled_mean
+                    consensus = consensus.filter((pl.col("chrom_prominence_scaled_mean") >= min_prominence_scaled) & (pl.col("chrom_prominence_scaled_mean") <= max_prominence_scaled))
+                else:
+                    consensus = consensus.filter(pl.col("chrom_prominence_scaled_mean") >= chrom_prominence_scaled_mean)
+            else:
+                self.logger.warning("'chrom_prominence_scaled_mean' column not found in consensus_df")
+            self.logger.debug(
+                f"Selected consensus by chrom_prominence_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            )
+
+        # Filter by mean scaled chromatogram height
+        if chrom_height_scaled_mean is not None:
+            consensus_len_before_filter = len(consensus)
+            if "chrom_height_scaled_mean" in consensus.columns:
+                if isinstance(chrom_height_scaled_mean, tuple) and len(chrom_height_scaled_mean) == 2:
+                    min_height_scaled, max_height_scaled = chrom_height_scaled_mean
+                    consensus = consensus.filter((pl.col("chrom_height_scaled_mean") >= min_height_scaled) & (pl.col("chrom_height_scaled_mean") <= max_height_scaled))
+                else:
+                    consensus = consensus.filter(pl.col("chrom_height_scaled_mean") >= chrom_height_scaled_mean)
+            else:
+                self.logger.warning("'chrom_height_scaled_mean' column not found in consensus_df")
+            self.logger.debug(
+                f"Selected consensus by chrom_height_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            )
+
+        # Filter by mean RT delta
+        if rt_delta_mean is not None:
+            consensus_len_before_filter = len(consensus)
+            if "rt_delta_mean" in consensus.columns:
+                if isinstance(rt_delta_mean, tuple) and len(rt_delta_mean) == 2:
+                    min_rt_delta, max_rt_delta = rt_delta_mean
+                    consensus = consensus.filter((pl.col("rt_delta_mean") >= min_rt_delta) & (pl.col("rt_delta_mean") <= max_rt_delta))
+                else:
+                    consensus = consensus.filter(pl.col("rt_delta_mean") >= rt_delta_mean)
+            else:
+                self.logger.warning("'rt_delta_mean' column not found in consensus_df")
+            self.logger.debug(
+                f"Selected consensus by rt_delta_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            )
+
+        if len(consensus) == 0:
+            self.logger.warning("No consensus features remaining after applying selection criteria.")
+        else:
+            self.logger.info(f"Selected consensus features. Features remaining: {len(consensus)} (from {initial_count})")
+
+        return consensus
+
+
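A usage sketch for consensus_select under the same assumed `study` instance (values are illustrative); the filter semantics mirror features_select:

    keep = study.consensus_select(
        number_samples=10,      # detected in at least 10 samples
        inty_mean=(1e4, 1e8),   # mean intensity window
        rt=(60.0, 600.0),       # retention time window
    )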
+    def consensus_filter(self, consensus):
+        """
+        Filter consensus_df by removing all consensus features given by the provided identifiers.
+        This also removes related entries from consensus_mapping_df, features_df, and consensus_ms2.
+
+        Parameters:
+            consensus: Consensus features to remove. Can be:
+                - polars.DataFrame: consensus DataFrame (the consensus_uid column is used)
+                - list: list of consensus_uids to remove
+                - int: single consensus_uid to remove
+
+        Returns:
+            None (modifies self.consensus_df and related DataFrames in place)
+        """
+        if self.consensus_df is None or self.consensus_df.is_empty():
+            self.logger.warning("No consensus features found in study.")
+            return
+
+        initial_consensus_count = len(self.consensus_df)
+
+        # Determine consensus_uids to remove
+        if isinstance(consensus, pl.DataFrame):
+            if "consensus_uid" not in consensus.columns:
+                self.logger.error("consensus DataFrame must contain 'consensus_uid' column")
+                return
+            consensus_uids_to_remove = consensus["consensus_uid"].to_list()
+        elif isinstance(consensus, list):
+            consensus_uids_to_remove = consensus
+        elif isinstance(consensus, int):
+            consensus_uids_to_remove = [consensus]
+        else:
+            self.logger.error("consensus parameter must be a DataFrame, list, or int")
+            return
+
+        if not consensus_uids_to_remove:
+            self.logger.warning("No consensus UIDs provided for filtering.")
+            return
+
+        # Collect the feature_uids that must also be removed from features_df
+        feature_uids_to_remove = []
+        if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
+            feature_uids_to_remove = self.consensus_mapping_df.filter(
+                pl.col("consensus_uid").is_in(consensus_uids_to_remove)
+            )["feature_uid"].to_list()
+
+        # Remove consensus features from consensus_df
+        self.consensus_df = self.consensus_df.filter(
+            ~pl.col("consensus_uid").is_in(consensus_uids_to_remove)
+        )
+
+        # Remove from consensus_mapping_df
+        if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
+            initial_mapping_count = len(self.consensus_mapping_df)
+            self.consensus_mapping_df = self.consensus_mapping_df.filter(
+                ~pl.col("consensus_uid").is_in(consensus_uids_to_remove)
+            )
+            removed_mapping_count = initial_mapping_count - len(self.consensus_mapping_df)
+            if removed_mapping_count > 0:
+                self.logger.debug(f"Removed {removed_mapping_count} entries from consensus_mapping_df")
+
+        # Remove the corresponding features from features_df
+        if feature_uids_to_remove and self.features_df is not None and not self.features_df.is_empty():
+            initial_features_count = len(self.features_df)
+            self.features_df = self.features_df.filter(
+                ~pl.col("feature_uid").is_in(feature_uids_to_remove)
+            )
+            removed_features_count = initial_features_count - len(self.features_df)
+            if removed_features_count > 0:
+                self.logger.debug(f"Removed {removed_features_count} entries from features_df")
+
+        # Remove from consensus_ms2 if it exists
+        if hasattr(self, 'consensus_ms2') and self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
+            initial_ms2_count = len(self.consensus_ms2)
+            self.consensus_ms2 = self.consensus_ms2.filter(
+                ~pl.col("consensus_uid").is_in(consensus_uids_to_remove)
+            )
+            removed_ms2_count = initial_ms2_count - len(self.consensus_ms2)
+            if removed_ms2_count > 0:
+                self.logger.debug(f"Removed {removed_ms2_count} entries from consensus_ms2")
+
+        removed_consensus_count = initial_consensus_count - len(self.consensus_df)
+        self.logger.info(f"Filtered {removed_consensus_count} consensus features. Remaining consensus: {len(self.consensus_df)}")
+
+
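Note that consensus_filter cascades: removing a consensus feature also drops its rows from consensus_mapping_df, the mapped entries in features_df, and any matching consensus_ms2 spectra. A sketch with made-up UIDs:

    study.consensus_filter([101, 102, 103])  # remove three consensus features by consensus_uid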
+    def consensus_delete(self, consensus):
+        """
+        Delete consensus features from consensus_df based on consensus identifiers.
+        This is an alias for consensus_filter, for consistency with the other delete methods.
+
+        Parameters:
+            consensus: Consensus features to delete. Can be:
+                - polars.DataFrame: consensus DataFrame (the consensus_uid column is used)
+                - list: list of consensus_uids to delete
+                - int: single consensus_uid to delete
+
+        Returns:
+            None (modifies self.consensus_df and related DataFrames in place)
+        """
+        self.consensus_filter(consensus)
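A closing sketch tying the selectors to the delete aliases (assumed `study` instance; the criterion is illustrative): select the consensus features seen in only one sample, then delete them together with everything attached to them.

    singletons = study.consensus_select(number_samples=(1, 1))  # exactly one sample
    study.consensus_delete(singletons)                          # same effect as consensus_filter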