masster 0.3.10__py3-none-any.whl → 0.3.12__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.

Potentially problematic release.

masster/study/helpers.py CHANGED
@@ -7,7 +7,290 @@ import pandas as pd
 import polars as pl
 
 from tqdm import tqdm
+from masster.chromatogram import Chromatogram
+
+
+def get_bpc(owner, sample=None, rt_unit="s", label=None, original=False):
+    """
+    Return a Chromatogram object containing the Base Peak Chromatogram (BPC).
+
+    The `owner` argument may be either a Study instance or a Sample-like object that
+    exposes `ms1_df` (Polars DataFrame) and optionally `scans_df`.
+
+    If `owner` is a Study, `sample` must be provided (int sample_uid, str sample_name or Sample instance)
+    and the Sample will be retrieved using `get_sample(owner, sample)`.
+
+    Returns:
+        Chromatogram
+    """
+    # resolve sample when owner is a Study-like object (has get_sample)
+    s = None
+    if hasattr(owner, "ms1_df"):
+        s = owner
+    else:
+        # owner is expected to be a Study
+        s = get_sample(owner, sample)
+
+    if s is None:
+        raise ValueError("Could not resolve sample for BPC computation")
+
+    # ensure ms1_df exists
+    if getattr(s, "ms1_df", None) is None:
+        raise ValueError("Sample has no ms1_df for BPC computation")
+
+    # try Polars aggregation first
+    try:
+        cols = s.ms1_df.columns
+        if not all(c in cols for c in ["rt", "inty"]):
+            raise RuntimeError("ms1_df missing required columns")
+
+        bpc = s.ms1_df.select([pl.col("rt"), pl.col("inty")])
+        bpc = bpc.groupby("rt").agg(pl.col("inty").max().alias("inty"))
+        bpc_pd = bpc.to_pandas().sort_values("rt")
+    except Exception:
+        # fallback to pandas
+        try:
+            bpc_pd = s.ms1_df.to_pandas()[["rt", "inty"]]
+            bpc_pd = bpc_pd.groupby("rt").agg({"inty": "max"}).reset_index().sort_values("rt")
+        except Exception:
+            raise
+
+    if bpc_pd.empty:
+        raise ValueError("Computed BPC is empty")
+
+    # If caller requests original RTs (original=True) and we were called from a Study
+    # we can obtain a per-sample mapping between current rt and rt_original from
+    # the study.features_df and apply it to the computed BPC rt values.
+    # Note: original parameter default is False (return current/aligned RTs).
+    if original is True:
+        try:
+            # Only proceed if owner is a Study-like object with features_df
+            study = None
+            if hasattr(owner, "features_df"):
+                study = owner
+            else:
+                # If owner is a Sample, try to find Study via attribute (not guaranteed)
+                study = getattr(owner, "study", None)
+
+            if study is not None and getattr(study, "features_df", None) is not None:
+                # Attempt to select mapping rows for this sample. Prefer matching by sample_uid,
+                # fall back to sample_name when necessary.
+                import numpy as _np
+
+                feats = study.features_df
+                # try filtering by sample identifier provided to this function
+                mapping_rows = None
+                if sample is not None:
+                    try:
+                        mapping_rows = feats.filter(pl.col("sample_uid") == sample)
+                    except Exception:
+                        mapping_rows = pl.DataFrame()
+
+                if mapping_rows is None or mapping_rows.is_empty():
+                    try:
+                        mapping_rows = feats.filter(pl.col("sample_name") == sample)
+                    except Exception:
+                        mapping_rows = pl.DataFrame()
+
+                # If we still have no sample selector, try to infer sample from the Sample object s
+                if (mapping_rows is None or mapping_rows.is_empty()) and hasattr(s, "sample_path"):
+                    # attempt to match by sample_path or file name
+                    try:
+                        sample_paths = feats.select(["sample_uid", "sample_name", "sample_path"])  # type: ignore[arg-type]
+                        # find row where sample_path matches
+                        mapping_rows = feats.filter(pl.col("sample_path") == getattr(s, "file", None))
+                    except Exception:
+                        mapping_rows = pl.DataFrame()
+
+                # If still empty, give up mapping
+                if mapping_rows is not None and not mapping_rows.is_empty():
+                    # collect rt and rt_original pairs
+                    try:
+                        map_pd = mapping_rows.select(["rt", "rt_original"]).to_pandas()
+                    except Exception:
+                        map_pd = mapping_rows.to_pandas()[["rt", "rt_original"]]
+
+                    # drop NA and duplicates
+                    map_pd = map_pd.dropna()
+                    if not map_pd.empty:
+                        # sort by rt (current/aligned)
+                        map_pd = map_pd.sort_values("rt")
+                        x = map_pd["rt"].to_numpy()
+                        y = map_pd["rt_original"].to_numpy()
+                        # require at least 2 points to interpolate
+                        if x.size >= 2:
+                            # apply linear interpolation from current rt -> original rt
+                            # for values outside the known range, numpy.interp will clip to endpoints
+                            new_rt = _np.interp(bpc_pd["rt"].to_numpy(), x, y)
+                            bpc_pd = bpc_pd.copy()
+                            bpc_pd["rt"] = new_rt
+        except Exception:
+            # If mapping fails, silently continue and return the original computed BPC
+            pass
+
+    # build Chromatogram
+    ycol = "inty"
+    try:
+        chrom = Chromatogram(rt=bpc_pd["rt"].to_numpy(), inty=bpc_pd[ycol].to_numpy(), label=label or "Base Peak Chromatogram", rt_unit=rt_unit)
+    except Exception:
+        chrom = Chromatogram(rt=bpc_pd["rt"].values, inty=bpc_pd[ycol].values, label=label or "Base Peak Chromatogram", rt_unit=rt_unit)
+
+    return chrom
+
+
+def get_tic(owner, sample=None, label=None):
+    """
+    Return a Chromatogram object containing the Total Ion Chromatogram (TIC).
+
+    `owner` may be a Sample-like object (has `ms1_df`) or a Study (in which case `sample` selects the sample).
+    The function falls back to `scans_df` when `ms1_df` is not available.
+    """
+    # resolve sample object
+    s = None
+    if hasattr(owner, "ms1_df"):
+        s = owner
+    else:
+        s = get_sample(owner, sample)
+
+    if s is None:
+        raise ValueError("Could not resolve sample for TIC computation")
+
+    # prefer ms1_df
+    try:
+        cols = s.ms1_df.columns
+        if all(c in cols for c in ["rt", "inty"]):
+            tic = s.ms1_df.select([pl.col("rt"), pl.col("inty")])
+            tic = tic.groupby("rt").agg(pl.col("inty").sum().alias("inty_tot"))
+            tic_pd = tic.to_pandas().sort_values("rt")
+        else:
+            raise RuntimeError("ms1_df missing required columns")
+    except Exception:
+        # fallback to scans_df if present
+        if getattr(s, "scans_df", None) is not None:
+            try:
+                scans = s.scans_df.filter(pl.col("ms_level") == 1)
+                data = scans[["rt", "scan_uid", "inty_tot"]].to_pandas()
+                data = data.sort_values("rt")
+                tic_pd = data.rename(columns={"inty_tot": "inty_tot"})
+            except Exception:
+                raise
+        else:
+            raise ValueError("Neither ms1_df nor scans_df available for TIC computation")
+
+    if tic_pd.empty:
+        raise ValueError("Computed TIC is empty")
+
+    # ensure column name
+    if "inty_tot" not in tic_pd.columns:
+        tic_pd = tic_pd.rename(columns={tic_pd.columns[1]: "inty_tot"})
+
+    try:
+        chrom = Chromatogram(rt=tic_pd["rt"].to_numpy(), inty=tic_pd["inty_tot"].to_numpy(), label=label or "Total Ion Chromatogram")
+    except Exception:
+        chrom = Chromatogram(rt=tic_pd["rt"].values, inty=tic_pd["inty_tot"].values, label=label or "Total Ion Chromatogram")
+
+    return chrom
+
 
+def get_eic(owner, sample=None, mz=None, mz_tol=0.01, rt_unit="s", label=None):
+    """
+    Return a Chromatogram object containing the Extracted Ion Chromatogram (EIC) for a target m/z.
+
+    The `owner` argument may be either a Study instance or a Sample-like object that
+    exposes `ms1_df` (Polars DataFrame).
+
+    If `owner` is a Study, `sample` must be provided (int sample_uid, str sample_name or Sample instance)
+    and the Sample will be retrieved using `get_sample(owner, sample)`.
+
+    Parameters:
+        owner: Study or Sample instance
+        sample: Sample identifier (required if owner is Study)
+        mz (float): Target m/z value
+        mz_tol (float): m/z tolerance (default 0.01)
+        rt_unit (str): Retention time unit for the chromatogram
+        label (str): Optional label for the chromatogram
+
+    Returns:
+        Chromatogram
+    """
+    if mz is None:
+        raise ValueError("mz must be provided for EIC computation")
+
+    # resolve sample when owner is a Study-like object (has get_sample)
+    s = None
+    if hasattr(owner, "ms1_df"):
+        s = owner
+    else:
+        # owner is expected to be a Study
+        s = get_sample(owner, sample)
+
+    if s is None:
+        raise ValueError("Could not resolve sample for EIC computation")
+
+    # ensure ms1_df exists
+    if getattr(s, "ms1_df", None) is None:
+        raise ValueError("Sample has no ms1_df for EIC computation")
+
+    # Extract EIC from ms1_df using mz window
+    try:
+        cols = s.ms1_df.columns
+        if not all(c in cols for c in ["rt", "mz", "inty"]):
+            raise RuntimeError("ms1_df missing required columns")
+
+        # Filter by mz window
+        mz_min = mz - mz_tol
+        mz_max = mz + mz_tol
+        eic_data = s.ms1_df.filter(
+            (pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max)
+        )
+
+        if eic_data.is_empty():
+            # Return empty chromatogram if no data found
+            import numpy as _np
+            return Chromatogram(
+                rt=_np.array([0.0]),
+                inty=_np.array([0.0]),
+                label=label or f"EIC m/z={mz:.4f} ± {mz_tol} (empty)",
+                rt_unit=rt_unit
+            )
+
+        # Aggregate intensities per retention time (sum in case of multiple points per rt)
+        eic = eic_data.group_by("rt").agg(pl.col("inty").sum().alias("inty"))
+        eic_pd = eic.sort("rt").to_pandas()
+
+    except Exception:
+        raise RuntimeError("Failed to extract EIC from ms1_df")
+
+    if eic_pd.empty:
+        # Return empty chromatogram if no data found
+        import numpy as _np
+        return Chromatogram(
+            rt=_np.array([0.0]),
+            inty=_np.array([0.0]),
+            label=label or f"EIC m/z={mz:.4f} ± {mz_tol} (empty)",
+            rt_unit=rt_unit
+        )
+
+    # build Chromatogram
+    try:
+        chrom = Chromatogram(
+            rt=eic_pd["rt"].to_numpy(),
+            inty=eic_pd["inty"].to_numpy(),
+            label=label or f"EIC m/z={mz:.4f} ± {mz_tol}",
+            rt_unit=rt_unit
+        )
+    except Exception:
+        chrom = Chromatogram(
+            rt=eic_pd["rt"].values,
+            inty=eic_pd["inty"].values,
+            label=label or f"EIC m/z={mz:.4f} ± {mz_tol}",
+            rt_unit=rt_unit
+        )
+
+    return chrom
+
+
+
 
 def get_chrom(self, uids=None, samples=None):
     # Check if consensus_df is empty or doesn't have required columns
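Note: the three helpers added above share one resolution pattern: if `owner` exposes `ms1_df` it is used directly as the Sample, otherwise it is treated as a Study and the sample is looked up via `get_sample`. A minimal usage sketch, with hypothetical study/sample names (the import path follows this file's location):

    from masster.study.helpers import get_bpc, get_eic, get_tic

    bpc = get_bpc(sample)                 # Sample-like owner: uses its own ms1_df
    tic = get_tic(study, sample="QC_01")  # Study owner; "QC_01" is a hypothetical sample_name
    eic = get_eic(study, sample=3, mz=445.12, mz_tol=0.005)  # sample_uid 3, ±0.005 m/z window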
@@ -113,6 +396,7 @@ def get_chrom(self, uids=None, samples=None):
     # Return as Polars DataFrame (can handle complex objects like Chromatogram)
     return df2_pivoted
 
+
 def set_folder(self, folder):
     """
     Set the folder for saving and loading files.
@@ -123,8 +407,6 @@ def set_folder(self, folder):
 
 
 def align_reset(self):
-    if self.alignment_ref_index is None:
-        return
     self.logger.debug("Resetting alignment.")
     # iterate over all feature maps and set RT to original RT
     for feature_map in self.features_maps:
@@ -134,7 +416,13 @@ def align_reset(self):
             feature.setRT(rt)
             feature.removeMetaValue("original_RT")
     self.alignment_ref_index = None
-
+    # in self.features_df, set rt equal to rt_original
+    self.features_df = self.features_df.with_columns(
+        pl.col("rt_original").alias("rt")
+    )
+
+    # Ensure column order is maintained after with_columns operation
+    self._ensure_features_df_schema_order()
 
 # TODO I don't get this param
 def get_consensus(self, quant="chrom_area"):
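Note: dropping the early return on `alignment_ref_index` means the reset now always runs, and the new tail mirrors the FeatureMap reset inside the features_df table. Aliasing `rt_original` as `rt` in a `with_columns` call overwrites the existing `rt` column; a self-contained sketch of the idiom on toy data (not masster code):

    import polars as pl

    df = pl.DataFrame({"rt": [10.2, 20.5], "rt_original": [10.0, 20.0]})
    # with_columns replaces an existing column of the same name in place;
    # the release still re-checks the schema afterwards via
    # _ensure_features_df_schema_order() as a defensive measure
    df = df.with_columns(pl.col("rt_original").alias("rt"))
    assert df["rt"].to_list() == [10.0, 20.0]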
@@ -408,17 +696,71 @@ def _get_sample_uids(self, samples=None, seed=42):
     sample_uids = list(set(sample_uids))
     return sample_uids
 
+
+
+def get_sample(self, sample):
+    """
+    Return a `Sample` object corresponding to the provided sample identifier.
+
+    Accepted `sample` values:
+    - int: interpreted as `sample_uid`
+    - str: interpreted as `sample_name`
+    - Sample instance: returned as-is
+
+    This helper mirrors the original Study.get_sample method but lives in helpers for reuse.
+    """
+    from masster.sample.sample import Sample
+
+    if isinstance(sample, Sample):
+        return sample
+
+    if isinstance(sample, int):
+        rows = self.samples_df.filter(pl.col("sample_uid") == sample)
+    elif isinstance(sample, str):
+        rows = self.samples_df.filter(pl.col("sample_name") == sample)
+    else:
+        raise ValueError("sample must be an int (sample_uid), str (sample_name) or a Sample instance")
+
+    if rows.is_empty():
+        raise KeyError(f"Sample not found: {sample}")
+
+    row = rows.row(0, named=True)
+    sample_uid = int(row["sample_uid"]) if row["sample_uid"] is not None else None
+
+    # Use a cache on the Study instance if available
+    cache = getattr(self, "_samples_cache", None)
+    if cache is not None and sample_uid in cache:
+        return cache[sample_uid]
+
+    sample_path = row.get("sample_path", None)
+    s = Sample(log_level='ERROR')
+    try:
+        if sample_path:
+            try:
+                s.load(sample_path)
+            except Exception:
+                s = Sample(file=sample_path)
+    except Exception:
+        pass
+
+    if cache is not None and sample_uid is not None:
+        cache[sample_uid] = s
+    return s
+
 def get_orphans(self):
-    """
+    """
     Get all features that are not in the consensus mapping.
     """
-    not_in_consensus = self.features_df.filter(~self.features_df['feature_uid'].is_in(self.consensus_mapping_df['feature_uid'].to_list()))
+    not_in_consensus = self.features_df.filter(
+        ~self.features_df["feature_uid"].is_in(self.consensus_mapping_df["feature_uid"].to_list())
+    )
    return not_in_consensus
 
+
 def compress(self, features=True, ms2=True, chrom=False, ms2_max=5):
     """
     Perform compress_features, compress_ms2, and compress_chrom operations.
-
+
     Parameters:
         max_replicates (int): Maximum number of MS2 replicates to keep per consensus_uid and energy combination
     """
@@ -441,48 +783,50 @@ def compress_features(self):
     if self.features_df is None or self.features_df.is_empty():
         self.logger.warning("No features_df found.")
         return
-
+
     if self.consensus_mapping_df is None or self.consensus_mapping_df.is_empty():
         self.logger.warning("No consensus_mapping_df found.")
         return
-
+
     initial_count = len(self.features_df)
-
+
     # Get feature_uids that are associated with consensus features
     consensus_feature_uids = self.consensus_mapping_df["feature_uid"].to_list()
-
+
     # Filter features_df to keep only features associated with consensus
     self.features_df = self.features_df.filter(
-        pl.col("feature_uid").is_in(consensus_feature_uids)
+        pl.col("feature_uid").is_in(consensus_feature_uids),
     )
-
+
     # Set ms2_specs column to None if it exists
     if "ms2_specs" in self.features_df.columns:
         # Create a list of None values with the same length as the dataframe
         # This preserves the Object dtype instead of converting to Null
         none_values = [None] * len(self.features_df)
         self.features_df = self.features_df.with_columns(
-            pl.Series("ms2_specs", none_values, dtype=pl.Object)
+            pl.Series("ms2_specs", none_values, dtype=pl.Object),
         )
-
+
     removed_count = initial_count - len(self.features_df)
-    self.logger.info(f"Compressed features: removed {removed_count} features not in consensus, cleared ms2_specs column")
+    self.logger.info(
+        f"Compressed features: removed {removed_count} features not in consensus, cleared ms2_specs column"
+    )
 
 
 def restore_features(self, samples=None, maps=False):
     """
-    Update specific columns (chrom, chrom_area, ms2_scans, ms2_specs) in features_df
+    Update specific columns (chrom, chrom_area, ms2_scans, ms2_specs) in features_df
     from the corresponding samples by reading features_df from the sample5 file.
     Use the feature_id for matching.
 
     Parameters:
-        samples (list, optional): List of sample_uids or sample_names to restore.
+        samples (list, optional): List of sample_uids or sample_names to restore.
             If None, restores all samples.
         maps (bool, optional): If True, also load featureXML data and update study.feature_maps.
     """
     import datetime
     from masster.sample.sample import Sample
-
+
     if self.features_df is None or self.features_df.is_empty():
         self.logger.error("No features_df found in study.")
         return
@@ -499,8 +843,8 @@ def restore_features(self, samples=None, maps=False):
         return
 
     # Columns to update from sample data
-    columns_to_update = ['chrom', 'chrom_area', 'ms2_scans', 'ms2_specs']
-
+    columns_to_update = ["chrom", "chrom_area", "ms2_scans", "ms2_specs"]
+
     self.logger.info(f"Restoring columns {columns_to_update} from {len(sample_uids)} samples...")
 
     # Create a mapping of (sample_uid, feature_id) to feature_uid from study.features_df
@@ -512,10 +856,12 @@ def restore_features(self, samples=None, maps=False):
 
     # Process each sample
     tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-    for sample_uid in tqdm(sample_uids,
-                           unit="sample",
-                           disable=tqdm_disable,
-                           desc=f"{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Restoring samples"):
+    for sample_uid in tqdm(
+        sample_uids,
+        unit="sample",
+        disable=tqdm_disable,
+        desc=f"{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Restoring samples",
+    ):
         # Get sample info
         sample_row = self.samples_df.filter(pl.col("sample_uid") == sample_uid)
         if sample_row.is_empty():
@@ -534,7 +880,7 @@ def restore_features(self, samples=None, maps=False):
             # Load sample to get its features_df
             # Use a direct load call with map=False to prevent feature synchronization
             # which would remove filled features that don't exist in the original FeatureMap
-            sample = Sample(log_level='DEBUG')
+            sample = Sample(log_level="DEBUG")
             sample._load_sample5(sample_path, map=False)
 
             if sample.features_df is None or sample.features_df.is_empty():
@@ -547,34 +893,34 @@ def restore_features(self, samples=None, maps=False):
                 feature_id = row.get("feature_id")
                 if feature_id is None:
                     continue
-
+
                 key = (sample_uid, feature_id)
                 if key in study_feature_mapping:
                     feature_uid = study_feature_mapping[key]
-
+
                     # Update the specific columns in study.features_df
                     for col in columns_to_update:
                         if col in row and col in self.features_df.columns:
                             # Get the original column dtype to preserve it
                             original_dtype = self.features_df[col].dtype
-
+
                             # Update the specific row and column, preserving dtype
                             mask = (pl.col("feature_uid") == feature_uid) & (pl.col("sample_uid") == sample_uid)
-
+
                             # Handle object columns (like Chromatogram) differently
                             if original_dtype == pl.Object:
                                 self.features_df = self.features_df.with_columns(
                                     pl.when(mask)
                                     .then(pl.lit(row[col], dtype=original_dtype, allow_object=True))
                                     .otherwise(pl.col(col))
-                                    .alias(col)
+                                    .alias(col),
                                 )
                             else:
                                 self.features_df = self.features_df.with_columns(
                                     pl.when(mask)
                                     .then(pl.lit(row[col], dtype=original_dtype))
                                     .otherwise(pl.col(col))
-                                    .alias(col)
+                                    .alias(col),
                                 )
                             updates_made += 1
 
@@ -582,7 +928,7 @@ def restore_features(self, samples=None, maps=False):
 
             # If maps is True, load featureXML data
             if maps:
-                if hasattr(sample, 'feature_maps'):
+                if hasattr(sample, "feature_maps"):
                     self.feature_maps.extend(sample.feature_maps)
 
         except Exception as e:
@@ -595,14 +941,14 @@ def restore_features(self, samples=None, maps=False):
 def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     """
     Restore chromatograms from individual .sample5 files and gap-fill missing ones.
-
+
     This function combines the functionality of restore_features() and fill_chrom():
     1. First restores chromatograms from individual .sample5 files (like restore_features)
     2. Then gap-fills any remaining empty chromatograms (like fill_chrom)
     3. ONLY updates the 'chrom' column, not chrom_area or other derived values
-
+
     Parameters:
-        samples (list, optional): List of sample_uids or sample_names to process.
+        samples (list, optional): List of sample_uids or sample_names to process.
             If None, processes all samples.
         mz_tol (float): m/z tolerance for gap filling (default: 0.010)
         rt_tol (float): RT tolerance for gap filling (default: 10.0)
@@ -611,7 +957,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     import numpy as np
     from masster.sample.sample import Sample
     from masster.chromatogram import Chromatogram
-
+
     if self.features_df is None or self.features_df.is_empty():
         self.logger.error("No features_df found in study.")
         return
@@ -627,7 +973,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
         return
 
     self.logger.info(f"Restoring chromatograms from {len(sample_uids)} samples...")
-
+
     # Create mapping of (sample_uid, feature_id) to feature_uid
     study_feature_mapping = {}
     for row in self.features_df.iter_rows(named=True):
@@ -638,12 +984,13 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     # Phase 1: Restore from individual .sample5 files (like restore_features)
     restored_count = 0
     tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-
+
     self.logger.info("Phase 1: Restoring chromatograms from .sample5 files...")
-    for sample_uid in tqdm(sample_uids,
-                           desc=f"{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Restoring from samples",
-                           disable=tqdm_disable):
-
+    for sample_uid in tqdm(
+        sample_uids,
+        desc=f"{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Restoring from samples",
+        disable=tqdm_disable,
+    ):
         # Get sample info
         sample_row = self.samples_df.filter(pl.col("sample_uid") == sample_uid)
         if sample_row.is_empty():
@@ -660,7 +1007,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
 
         try:
             # Load sample (with map=False to prevent feature synchronization)
-            sample = Sample(log_level='WARNING')
+            sample = Sample(log_level="WARNING")
             sample._load_sample5(sample_path, map=False)
 
             if sample.features_df is None or sample.features_df.is_empty():
@@ -671,21 +1018,21 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
             for row in sample.features_df.iter_rows(named=True):
                 feature_id = row.get("feature_id")
                 chrom = row.get("chrom")
-
+
                 if feature_id is None or chrom is None:
                     continue
-
+
                 key = (sample_uid, feature_id)
                 if key in study_feature_mapping:
                     feature_uid = study_feature_mapping[key]
-
+
                     # Update only the chrom column
                     mask = (pl.col("feature_uid") == feature_uid) & (pl.col("sample_uid") == sample_uid)
                     self.features_df = self.features_df.with_columns(
                         pl.when(mask)
                         .then(pl.lit(chrom, dtype=pl.Object, allow_object=True))
                         .otherwise(pl.col("chrom"))
-                        .alias("chrom")
+                        .alias("chrom"),
                     )
                     restored_count += 1
@@ -694,20 +1041,22 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
             continue
 
     self.logger.info(f"Phase 1 complete: Restored {restored_count} chromatograms from .sample5 files")
-
+
     # Phase 2: Gap-fill remaining empty chromatograms (like fill_chrom)
     self.logger.info("Phase 2: Gap-filling remaining empty chromatograms...")
-
+
     # Count how many chromatograms are still missing
     empty_chroms = self.features_df.filter(pl.col("chrom").is_null()).height
     total_chroms = len(self.features_df)
-
-    self.logger.debug(f"Chromatograms still missing: {empty_chroms}/{total_chroms} ({empty_chroms/total_chroms*100:.1f}%)")
-
+
+    self.logger.debug(
+        f"Chromatograms still missing: {empty_chroms}/{total_chroms} ({empty_chroms / total_chroms * 100:.1f}%)"
+    )
+
     if empty_chroms == 0:
         self.logger.info("All chromatograms restored from .sample5 files. No gap-filling needed.")
         return
-
+
     # Get consensus info for gap filling
     consensus_info = {}
     for row in self.consensus_df.iter_rows(named=True):
@@ -717,23 +1066,23 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
             "mz": row["mz"],
             "rt": row["rt"],
         }
-
+
     filled_count = 0
-
+
     # Process each sample that has missing chromatograms
-    for sample_uid in tqdm(sample_uids,
-                           desc=f"{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Gap-filling missing chromatograms",
-                           disable=tqdm_disable):
-
+    for sample_uid in tqdm(
+        sample_uids,
+        desc=f"{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Gap-filling missing chromatograms",
+        disable=tqdm_disable,
+    ):
         # Get features with missing chromatograms for this sample
         missing_features = self.features_df.filter(
-            (pl.col("sample_uid") == sample_uid) &
-            (pl.col("chrom").is_null())
+            (pl.col("sample_uid") == sample_uid) & (pl.col("chrom").is_null()),
         )
-
+
         if missing_features.is_empty():
             continue
-
+
         # Get sample info
         sample_row = self.samples_df.filter(pl.col("sample_uid") == sample_uid)
         sample_info = sample_row.row(0, named=True)
@@ -745,10 +1094,10 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
 
         try:
             # Load sample for MS1 data extraction
-            sample = Sample(log_level='WARNING')
+            sample = Sample(log_level="WARNING")
             sample._load_sample5(sample_path, map=False)
 
-            if not hasattr(sample, 'ms1_df') or sample.ms1_df is None or sample.ms1_df.is_empty():
+            if not hasattr(sample, "ms1_df") or sample.ms1_df is None or sample.ms1_df.is_empty():
                 continue
 
             # Process each missing feature
@@ -758,15 +1107,15 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
                 rt = feature_row["rt"]
                 rt_start = feature_row.get("rt_start", rt - rt_tol)
                 rt_end = feature_row.get("rt_end", rt + rt_tol)
-
+
                 # Extract EIC from MS1 data
                 d = sample.ms1_df.filter(
-                    (pl.col("mz") >= mz - mz_tol) &
-                    (pl.col("mz") <= mz + mz_tol) &
-                    (pl.col("rt") >= rt_start - rt_tol) &
-                    (pl.col("rt") <= rt_end + rt_tol)
+                    (pl.col("mz") >= mz - mz_tol)
+                    & (pl.col("mz") <= mz + mz_tol)
+                    & (pl.col("rt") >= rt_start - rt_tol)
+                    & (pl.col("rt") <= rt_end + rt_tol),
                 )
-
+
                 # Create chromatogram
                 if d.is_empty():
                     # Create empty chromatogram
@@ -784,7 +1133,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
                 else:
                     # Create real chromatogram from data
                     eic_rt = d.group_by("rt").agg(pl.col("inty").max()).sort("rt")
-
+
                     if len(eic_rt) > 4:
                         eic = Chromatogram(
                             eic_rt["rt"].to_numpy(),
@@ -809,14 +1158,14 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
                             feature_end=rt_end,
                             feature_apex=rt,
                         )
-
+
                 # Update the chromatogram in the study
                 mask = pl.col("feature_uid") == feature_uid
                 self.features_df = self.features_df.with_columns(
                     pl.when(mask)
                     .then(pl.lit(eic, dtype=pl.Object, allow_object=True))
                     .otherwise(pl.col("chrom"))
-                    .alias("chrom")
+                    .alias("chrom"),
                 )
                 filled_count += 1
@@ -825,12 +1174,14 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
             continue
 
     self.logger.info(f"Phase 2 complete: Gap-filled {filled_count} chromatograms")
-
+
     # Final summary
     final_non_null = self.features_df.filter(pl.col("chrom").is_not_null()).height
     final_total = len(self.features_df)
-
-    self.logger.info(f"Chromatogram restoration complete: {final_non_null}/{final_total} ({final_non_null/final_total*100:.1f}%)")
+
+    self.logger.info(
+        f"Chromatogram restoration complete: {final_non_null}/{final_total} ({final_non_null / final_total * 100:.1f}%)"
+    )
     self.logger.info(f"Restored from .sample5 files: {restored_count}, Gap-filled from raw data: {filled_count}")
 
 
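Note: restore_chrom is deliberately two-phase: phase 1 copies chrom objects straight out of the .sample5 files, and phase 2 re-extracts an EIC from raw MS1 data only for features whose chrom is still null, within the supplied windows. A call sketch (tolerance units follow the study's mz/rt columns):

    # tighter m/z window, wider RT window than the defaults (0.010, 10.0)
    study.restore_chrom(samples=None, mz_tol=0.005, rt_tol=15.0)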
@@ -839,41 +1190,39 @@ def compress_ms2(self, max_replicates=5):
     Reduce the number of entries matching any pair of (consensus and energy) to max XY rows.
     Groups all rows by consensus_uid and energy. For each group, sort by number_frags * prec_inty,
     and then pick the top XY rows. Discard the others.
-
+
     Parameters:
         max_replicates (int): Maximum number of replicates to keep per consensus_uid and energy combination
     """
     if self.consensus_ms2 is None or self.consensus_ms2.is_empty():
         self.logger.warning("No consensus_ms2 found.")
         return
-
+
     initial_count = len(self.consensus_ms2)
-
+
     # Create a ranking score based on number_frags * prec_inty
     # Handle None values by treating them as 0
     self.consensus_ms2 = self.consensus_ms2.with_columns([
-        (
-            pl.col("number_frags").fill_null(0) *
-            pl.col("prec_inty").fill_null(0)
-        ).alias("ranking_score")
+        (pl.col("number_frags").fill_null(0) * pl.col("prec_inty").fill_null(0)).alias("ranking_score"),
     ])
-
+
     # Group by consensus_uid and energy, then rank by score and keep top max_replicates
     compressed_ms2 = (
-        self.consensus_ms2
-        .with_row_count("row_id")  # Add row numbers for stable sorting
+        self.consensus_ms2.with_row_count("row_id")  # Add row numbers for stable sorting
         .sort(["consensus_uid", "energy", "ranking_score", "row_id"], descending=[False, False, True, False])
         .with_columns([
-            pl.int_range(pl.len()).over(["consensus_uid", "energy"]).alias("rank")
+            pl.int_range(pl.len()).over(["consensus_uid", "energy"]).alias("rank"),
         ])
         .filter(pl.col("rank") < max_replicates)
         .drop(["ranking_score", "row_id", "rank"])
     )
-
+
     self.consensus_ms2 = compressed_ms2
-
+
     removed_count = initial_count - len(self.consensus_ms2)
-    self.logger.info(f"Compressed MS2 data: removed {removed_count} entries, kept max {max_replicates} per consensus/energy pair")
+    self.logger.info(
+        f"Compressed MS2 data: removed {removed_count} entries, kept max {max_replicates} per consensus/energy pair"
+    )
 
 
 def compress_chrom(self):
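Note: the compress_ms2 rewrite keeps the same keep-top-N-per-group strategy: score each row as number_frags * prec_inty (nulls treated as 0), sort within each (consensus_uid, energy) group, rank with pl.int_range(pl.len()).over(...), and keep ranks below max_replicates. The same pattern on toy data (column values invented for illustration):

    import polars as pl

    df = pl.DataFrame({
        "consensus_uid": [1, 1, 1, 2, 2],
        "energy":        [20, 20, 20, 20, 20],
        "number_frags":  [5, 12, 8, 3, 9],
        "prec_inty":     [1e4, 2e4, None, 5e3, 1e4],
    })
    max_replicates = 2
    top_n = (
        df.with_columns((pl.col("number_frags").fill_null(0) * pl.col("prec_inty").fill_null(0)).alias("score"))
        .sort(["consensus_uid", "energy", "score"], descending=[False, False, True])
        .with_columns(pl.int_range(pl.len()).over(["consensus_uid", "energy"]).alias("rank"))
        .filter(pl.col("rank") < max_replicates)  # keep the 2 best-scoring rows per group
        .drop(["score", "rank"])
    )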
@@ -886,49 +1235,175 @@ def compress_chrom(self):
     if self.features_df is None or self.features_df.is_empty():
         self.logger.warning("No features_df found.")
         return
-
+
     if "chrom" not in self.features_df.columns:
         self.logger.warning("No 'chrom' column found in features_df.")
         return
-
+
     # Count non-null chromatograms before compression
     non_null_count = self.features_df.filter(pl.col("chrom").is_not_null()).height
-
+
     # Set chrom column to None while keeping dtype as object
     self.features_df = self.features_df.with_columns(
-        pl.lit(None, dtype=pl.Object).alias("chrom")
+        pl.lit(None, dtype=pl.Object).alias("chrom"),
     )
-
+
     self.logger.info(f"Compressed chromatograms: cleared {non_null_count} chromatogram objects from features_df")
 
 
-def set_source(self, filename):
+def name_replace(self, replace_dict):
     """
-    Reassign file_source for all samples in samples_df. If filename contains only a path,
-    keep the current basename and build an absolute path. Check that the new file exists
-    before overwriting the old file_source.
+    Replace sample names in samples_df based on a dictionary mapping.
 
+    Takes all names in self.samples_df['sample_name'], creates a copy, and replaces
+    all keys with their corresponding values from replace_dict. Checks that all
+    resulting sample names are unique. If unique, replaces the values in self.samples_df.
+
     Parameters:
-        filename (str): New file path or directory path for all samples
+        replace_dict (dict): Dictionary mapping old names (keys) to new names (values).
+                             All keys found in sample names will be replaced with their
+                             corresponding values.
+                             e.g., {"old_name1": "new_name1", "old_name2": "new_name2"}
+
+    Returns:
+        None
+
+    Raises:
+        ValueError: If replace_dict is not a dictionary
+        ValueError: If resulting sample names are not unique
+    """
+    if not isinstance(replace_dict, dict):
+        raise ValueError("replace_dict must be a dictionary")
+
+    if self.samples_df is None or len(self.samples_df) == 0:
+        self.logger.warning("No samples found in study.")
+        return
+
+    if not replace_dict:
+        self.logger.warning("Empty replace_dict provided, no changes made.")
+        return
+
+    # Get current sample names
+    current_names = self.samples_df.get_column("sample_name").to_list()
+
+    # Create a copy and apply replacements
+    new_names = []
+    replaced_count = 0
+
+    for name in current_names:
+        if name in replace_dict:
+            new_names.append(replace_dict[name])
+            replaced_count += 1
+            self.logger.debug(f"Replacing sample name: '{name}' -> '{replace_dict[name]}'")
+        else:
+            new_names.append(name)
+
+    # Check that all new names are unique
+    if len(set(new_names)) != len(new_names):
+        duplicates = []
+        seen = set()
+        for name in new_names:
+            if name in seen:
+                duplicates.append(name)
+            else:
+                seen.add(name)
+        raise ValueError(f"Resulting sample names are not unique. Duplicates found: {duplicates}")
+
+    # If we get here, all names are unique - apply the changes
+    self.samples_df = self.samples_df.with_columns(
+        pl.Series("sample_name", new_names).alias("sample_name"),
+    )
 
+    self.logger.info(f"Successfully replaced {replaced_count} sample names")
+
+
+def name_reset(self):
+    """
+    Reset sample names to the basename of sample_path without extensions.
+
+    Takes all paths in self.samples_df['sample_path'], extracts the basename,
+    removes file extensions, and checks that all resulting names are unique.
+    If unique, replaces the values in self.samples_df['sample_name'].
+
     Returns:
         None
+
+    Raises:
+        ValueError: If resulting sample names are not unique
+        RuntimeError: If any sample_path is None or empty
     """
     import os
 
     if self.samples_df is None or len(self.samples_df) == 0:
         self.logger.warning("No samples found in study.")
         return
+
+    # Get current sample paths
+    sample_paths = self.samples_df.get_column("sample_path").to_list()
+
+    # Extract basenames without extensions
+    new_names = []
+
+    for i, path in enumerate(sample_paths):
+        if path is None or path == "":
+            raise RuntimeError(f"Sample at index {i} has no sample_path set")
+
+        # Get basename and remove extension(s)
+        basename = os.path.basename(path)
+        # Remove all extensions (handles cases like .tar.gz, .sample5.gz, etc.)
+        name_without_ext = basename
+        while '.' in name_without_ext:
+            name_without_ext = os.path.splitext(name_without_ext)[0]
+
+        new_names.append(name_without_ext)
+        self.logger.debug(f"Resetting sample name from path: '{path}' -> '{name_without_ext}'")
+
+    # Check that all new names are unique
+    if len(set(new_names)) != len(new_names):
+        duplicates = []
+        seen = set()
+        for name in new_names:
+            if name in seen:
+                duplicates.append(name)
+            else:
+                seen.add(name)
+        raise ValueError(f"Resulting sample names are not unique. Duplicates found: {duplicates}")
 
+    # If we get here, all names are unique - apply the changes
+    self.samples_df = self.samples_df.with_columns(
+        pl.Series("sample_name", new_names).alias("sample_name"),
+    )
+
+    self.logger.info(f"Successfully reset {len(new_names)} sample names from sample paths")
+
+
+def set_source(self, filename):
+    """
+    Reassign file_source for all samples in samples_df. If filename contains only a path,
+    keep the current basename and build an absolute path. Check that the new file exists
+    before overwriting the old file_source.
+
+    Parameters:
+        filename (str): New file path or directory path for all samples
+
+    Returns:
+        None
+    """
+    import os
+
+    if self.samples_df is None or len(self.samples_df) == 0:
+        self.logger.warning("No samples found in study.")
+        return
+
     updated_count = 0
     failed_count = 0
-
+
     # Get all current file_source values
     current_sources = self.samples_df.get_column("file_source").to_list()
     sample_names = self.samples_df.get_column("sample_name").to_list()
-
+
     new_sources = []
-
+
     for i, (current_source, sample_name) in enumerate(zip(current_sources, sample_names)):
         # Check if filename is just a directory path
         if os.path.isdir(filename):
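Note: both renaming helpers added above are validate-then-swap: the full list of new names is built first, a duplicate check runs over the whole list, and samples_df is only touched if every resulting name is unique. Usage sketch (hypothetical names and paths; name_reset strips every extension in a loop, so "run_001.sample5.gz" becomes "run_001"):

    study.name_replace({"run_001_pos": "QC_01", "run_002_pos": "blank_01"})
    study.name_reset()  # sample_name <- basename of sample_path, all extensions removed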
@@ -937,7 +1412,7 @@ def set_source(self, filename):
             new_sources.append(current_source)
             failed_count += 1
             continue
-
+
         # Get the basename from current file_source
         current_basename = os.path.basename(current_source)
         # Build new absolute path
@@ -945,26 +1420,26 @@ def set_source(self, filename):
         else:
             # filename is a full path, make it absolute
             new_file_path = os.path.abspath(filename)
-
+
         # Check if the new file exists
         if not os.path.exists(new_file_path):
             self.logger.warning(f"File does not exist for sample '{sample_name}': {new_file_path}")
             new_sources.append(current_source)
             failed_count += 1
             continue
-
+
         # File exists, update source
         new_sources.append(new_file_path)
         updated_count += 1
-
+
         # Log individual updates at debug level
         self.logger.debug(f"Updated file_source for sample '{sample_name}': {current_source} -> {new_file_path}")
-
+
     # Update the samples_df with new file_source values
     self.samples_df = self.samples_df.with_columns(
-        pl.Series("file_source", new_sources).alias("file_source")
+        pl.Series("file_source", new_sources).alias("file_source"),
     )
-
+
     # Log summary
     if updated_count > 0:
         self.logger.info(f"Updated file_source for {updated_count} samples")
@@ -990,9 +1465,9 @@ def features_select(
 ):
     """
     Select features from features_df based on specified criteria and return the filtered DataFrame.
-
+
     OPTIMIZED VERSION: Combines all filters into a single operation for better performance.
-
+
     Parameters:
         mz: m/z range filter (tuple for range, single value for minimum)
         rt: retention time range filter (tuple for range, single value for minimum)
@@ -1007,30 +1482,42 @@ def features_select(
         chrom_prominence: chromatogram prominence filter (tuple for range, single value for minimum)
         chrom_prominence_scaled: scaled chromatogram prominence filter (tuple for range, single value for minimum)
         chrom_height_scaled: scaled chromatogram height filter (tuple for range, single value for minimum)
-
+
     Returns:
         polars.DataFrame: Filtered features DataFrame
     """
     if self.features_df is None or self.features_df.is_empty():
         self.logger.warning("No features found in study.")
         return pl.DataFrame()
-
+
     # Early return if no filters provided - performance optimization
-    filter_params = [mz, rt, inty, sample_uid, sample_name, consensus_uid,
-                     feature_uid, filled, quality, chrom_coherence,
-                     chrom_prominence, chrom_prominence_scaled, chrom_height_scaled]
+    filter_params = [
+        mz,
+        rt,
+        inty,
+        sample_uid,
+        sample_name,
+        consensus_uid,
+        feature_uid,
+        filled,
+        quality,
+        chrom_coherence,
+        chrom_prominence,
+        chrom_prominence_scaled,
+        chrom_height_scaled,
+    ]
     if all(param is None for param in filter_params):
         return self.features_df.clone()
-
+
     initial_count = len(self.features_df)
-
+
     # Pre-check available columns once for efficiency
     available_columns = set(self.features_df.columns)
-
+
     # Build all filter conditions first, then apply them all at once
     filter_conditions = []
     warnings = []
-
+
     # Filter by m/z
     if mz is not None:
         if isinstance(mz, tuple) and len(mz) == 2:
@@ -1038,7 +1525,7 @@ def features_select(
             filter_conditions.append((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
         else:
             filter_conditions.append(pl.col("mz") >= mz)
-
+
     # Filter by retention time
     if rt is not None:
         if isinstance(rt, tuple) and len(rt) == 2:
@@ -1046,7 +1533,7 @@ def features_select(
             filter_conditions.append((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
         else:
             filter_conditions.append(pl.col("rt") >= rt)
-
+
     # Filter by intensity
     if inty is not None:
         if isinstance(inty, tuple) and len(inty) == 2:
@@ -1054,7 +1541,7 @@ def features_select(
             filter_conditions.append((pl.col("inty") >= min_inty) & (pl.col("inty") <= max_inty))
         else:
             filter_conditions.append(pl.col("inty") >= inty)
-
+
     # Filter by sample_uid
     if sample_uid is not None:
         if isinstance(sample_uid, (list, tuple)):
@@ -1067,24 +1554,24 @@ def features_select(
             filter_conditions.append(pl.col("sample_uid").is_in(sample_uid))
         else:
             filter_conditions.append(pl.col("sample_uid") == sample_uid)
-
+
     # Filter by sample_name (requires pre-processing)
     if sample_name is not None:
         # Get sample_uids for the given sample names
         if isinstance(sample_name, list):
             sample_uids_for_names = self.samples_df.filter(
-                pl.col("sample_name").is_in(sample_name)
+                pl.col("sample_name").is_in(sample_name),
             )["sample_uid"].to_list()
         else:
             sample_uids_for_names = self.samples_df.filter(
-                pl.col("sample_name") == sample_name
+                pl.col("sample_name") == sample_name,
             )["sample_uid"].to_list()
-
+
         if sample_uids_for_names:
             filter_conditions.append(pl.col("sample_uid").is_in(sample_uids_for_names))
         else:
             filter_conditions.append(pl.lit(False))  # No matching samples
-
+
     # Filter by consensus_uid
     if consensus_uid is not None:
         if isinstance(consensus_uid, (list, tuple)):
@@ -1097,7 +1584,7 @@ def features_select(
             filter_conditions.append(pl.col("consensus_uid").is_in(consensus_uid))
         else:
             filter_conditions.append(pl.col("consensus_uid") == consensus_uid)
-
+
     # Filter by feature_uid
     if feature_uid is not None:
         if isinstance(feature_uid, (list, tuple)):
@@ -1110,7 +1597,7 @@ def features_select(
             filter_conditions.append(pl.col("feature_uid").is_in(feature_uid))
         else:
             filter_conditions.append(pl.col("feature_uid") == feature_uid)
-
+
     # Filter by filled status
     if filled is not None:
         if "filled" in available_columns:
@@ -1120,7 +1607,7 @@ def features_select(
                 filter_conditions.append(~pl.col("filled") | pl.col("filled").is_null())
         else:
             warnings.append("'filled' column not found in features_df")
-
+
     # Filter by quality
     if quality is not None:
         if "quality" in available_columns:
@@ -1131,73 +1618,83 @@ def features_select(
                 filter_conditions.append(pl.col("quality") >= quality)
         else:
             warnings.append("'quality' column not found in features_df")
-
+
     # Filter by chromatogram coherence
     if chrom_coherence is not None:
         if "chrom_coherence" in available_columns:
             if isinstance(chrom_coherence, tuple) and len(chrom_coherence) == 2:
                 min_coherence, max_coherence = chrom_coherence
-                filter_conditions.append((pl.col("chrom_coherence") >= min_coherence) & (pl.col("chrom_coherence") <= max_coherence))
+                filter_conditions.append(
+                    (pl.col("chrom_coherence") >= min_coherence) & (pl.col("chrom_coherence") <= max_coherence)
+                )
             else:
                 filter_conditions.append(pl.col("chrom_coherence") >= chrom_coherence)
         else:
             warnings.append("'chrom_coherence' column not found in features_df")
-
+
     # Filter by chromatogram prominence
     if chrom_prominence is not None:
         if "chrom_prominence" in available_columns:
             if isinstance(chrom_prominence, tuple) and len(chrom_prominence) == 2:
                 min_prominence, max_prominence = chrom_prominence
-                filter_conditions.append((pl.col("chrom_prominence") >= min_prominence) & (pl.col("chrom_prominence") <= max_prominence))
+                filter_conditions.append(
+                    (pl.col("chrom_prominence") >= min_prominence) & (pl.col("chrom_prominence") <= max_prominence)
+                )
             else:
                 filter_conditions.append(pl.col("chrom_prominence") >= chrom_prominence)
         else:
             warnings.append("'chrom_prominence' column not found in features_df")
-
+
     # Filter by scaled chromatogram prominence
     if chrom_prominence_scaled is not None:
         if "chrom_prominence_scaled" in available_columns:
             if isinstance(chrom_prominence_scaled, tuple) and len(chrom_prominence_scaled) == 2:
                 min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled
-                filter_conditions.append((pl.col("chrom_prominence_scaled") >= min_prominence_scaled) & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled))
+                filter_conditions.append(
+                    (pl.col("chrom_prominence_scaled") >= min_prominence_scaled)
+                    & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled)
+                )
             else:
                 filter_conditions.append(pl.col("chrom_prominence_scaled") >= chrom_prominence_scaled)
         else:
             warnings.append("'chrom_prominence_scaled' column not found in features_df")
-
+
     # Filter by scaled chromatogram height
     if chrom_height_scaled is not None:
         if "chrom_height_scaled" in available_columns:
             if isinstance(chrom_height_scaled, tuple) and len(chrom_height_scaled) == 2:
                 min_height_scaled, max_height_scaled = chrom_height_scaled
-                filter_conditions.append((pl.col("chrom_height_scaled") >= min_height_scaled) & (pl.col("chrom_height_scaled") <= max_height_scaled))
+                filter_conditions.append(
+                    (pl.col("chrom_height_scaled") >= min_height_scaled)
+                    & (pl.col("chrom_height_scaled") <= max_height_scaled)
+                )
             else:
                 filter_conditions.append(pl.col("chrom_height_scaled") >= chrom_height_scaled)
         else:
             warnings.append("'chrom_height_scaled' column not found in features_df")
-
+
     # Log all warnings once at the end for efficiency
     for warning in warnings:
         self.logger.warning(warning)
-
+
     # Apply all filters at once using lazy evaluation for optimal performance
     if filter_conditions:
         # Combine all conditions with AND
         combined_filter = filter_conditions[0]
         for condition in filter_conditions[1:]:
             combined_filter = combined_filter & condition
-
+
         # Apply the combined filter using lazy evaluation
         feats = self.features_df.lazy().filter(combined_filter).collect()
     else:
         feats = self.features_df.clone()
-
+
     final_count = len(feats)
-
+
     if final_count == 0:
         self.logger.warning("No features remaining after applying selection criteria.")
     else:
-        #removed_count = initial_count - final_count
+        # removed_count = initial_count - final_count
         self.logger.info(f"Features selected: {final_count} (out of {initial_count})")
 
     return feats
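Note: every features_select filter follows one convention: a 2-tuple is an inclusive range, a bare scalar is a minimum bound, lists select by membership, and all supplied conditions are AND-combined into a single lazy filter pass. For example (hypothetical sample names):

    feats = study.features_select(
        mz=(200.0, 800.0),              # inclusive m/z window
        rt=60.0,                        # scalar = lower bound only
        quality=0.8,
        sample_name=["QC_01", "QC_02"],
    )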
@@ -1207,29 +1704,29 @@ def features_filter(self, features):
     """
     Filter features_df by keeping only features that match the given criteria.
     This keeps only the specified features and removes all others.
-
+
     OPTIMIZED VERSION: Batch operations and reduced overhead for better performance.
-
+
     Parameters:
         features: Features to keep. Can be:
             - polars.DataFrame: Features DataFrame (will use feature_uid column)
             - list: List of feature_uids to keep
             - int: Single feature_uid to keep
-
+
     Returns:
         None (modifies self.features_df in place)
     """
     if self.features_df is None or self.features_df.is_empty():
         self.logger.warning("No features found in study.")
         return
-
+
     # Early return if no features provided
     if features is None:
         self.logger.warning("No features provided for filtering.")
         return
-
+
     initial_count = len(self.features_df)
-
+
     # Determine feature_uids to keep - optimized type checking
     if isinstance(features, pl.DataFrame):
         if "feature_uid" not in features.columns:
@@ -1243,44 +1740,41 @@ def features_filter(self, features):
     else:
         self.logger.error("features parameter must be a DataFrame, list, tuple, or int")
         return
-
+
     # Early return if no UIDs to keep
     if not feature_uids_to_keep:
         self.logger.warning("No feature UIDs provided for filtering.")
         return
-
+
     # Convert to set for faster lookup if list is large
     if len(feature_uids_to_keep) > 100:
         feature_uids_set = set(feature_uids_to_keep)
         # Use the set for filtering if it's significantly smaller
         if len(feature_uids_set) < len(feature_uids_to_keep) * 0.8:
             feature_uids_to_keep = list(feature_uids_set)
-
+
     # Create filter condition once - keep only the specified features
     filter_condition = pl.col("feature_uid").is_in(feature_uids_to_keep)
-
+
     # Apply filter to features_df using lazy evaluation for better performance
     self.features_df = self.features_df.lazy().filter(filter_condition).collect()
-
+
     # Apply filter to consensus_mapping_df if it exists - batch operation
     mapping_removed_count = 0
     if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
         initial_mapping_count = len(self.consensus_mapping_df)
-        self.consensus_mapping_df = (
-            self.consensus_mapping_df
-            .lazy()
-            .filter(filter_condition)
-            .collect()
-        )
+        self.consensus_mapping_df = self.consensus_mapping_df.lazy().filter(filter_condition).collect()
         mapping_removed_count = initial_mapping_count - len(self.consensus_mapping_df)
-
+
     # Calculate results once and log efficiently
     final_count = len(self.features_df)
     removed_count = initial_count - final_count
-
+
     # Single comprehensive log message
     if mapping_removed_count > 0:
-        self.logger.info(f"Kept {final_count} features and removed {mapping_removed_count} consensus mappings. Filtered out {removed_count} features.")
+        self.logger.info(
+            f"Kept {final_count} features and removed {mapping_removed_count} consensus mappings. Filtered out {removed_count} features."
+        )
     else:
         self.logger.info(f"Kept {final_count} features. Filtered out {removed_count} features.")
@@ -1289,27 +1783,27 @@ def features_delete(self, features):
     """
     Delete features from features_df based on feature identifiers.
     This removes the specified features and keeps all others (opposite of features_filter).
-
+
     Parameters:
         features: Features to delete. Can be:
             - polars.DataFrame: Features DataFrame (will use feature_uid column)
             - list: List of feature_uids to delete
             - int: Single feature_uid to delete
-
+
     Returns:
         None (modifies self.features_df in place)
     """
     if self.features_df is None or self.features_df.is_empty():
         self.logger.warning("No features found in study.")
         return
-
+
     # Early return if no features provided
     if features is None:
         self.logger.warning("No features provided for deletion.")
         return
-
+
     initial_count = len(self.features_df)
-
+
     # Determine feature_uids to remove - optimized type checking
     if isinstance(features, pl.DataFrame):
         if "feature_uid" not in features.columns:
@@ -1323,44 +1817,41 @@ def features_delete(self, features):
1323
1817
  else:
1324
1818
  self.logger.error("features parameter must be a DataFrame, list, tuple, or int")
1325
1819
  return
1326
-
1820
+
1327
1821
  # Early return if no UIDs to remove
1328
1822
  if not feature_uids_to_remove:
1329
1823
  self.logger.warning("No feature UIDs provided for deletion.")
1330
1824
  return
1331
-
1825
+
1332
1826
  # Convert to set for faster lookup if list is large
1333
1827
  if len(feature_uids_to_remove) > 100:
1334
1828
  feature_uids_set = set(feature_uids_to_remove)
1335
1829
  # Use the set for filtering if it's significantly smaller
1336
1830
  if len(feature_uids_set) < len(feature_uids_to_remove) * 0.8:
1337
1831
  feature_uids_to_remove = list(feature_uids_set)
1338
-
1832
+
1339
1833
  # Create filter condition - remove specified features
1340
1834
  filter_condition = ~pl.col("feature_uid").is_in(feature_uids_to_remove)
1341
-
1835
+
1342
1836
  # Apply filter to features_df using lazy evaluation for better performance
1343
1837
  self.features_df = self.features_df.lazy().filter(filter_condition).collect()
1344
-
1838
+
1345
1839
  # Apply filter to consensus_mapping_df if it exists - batch operation
1346
1840
  mapping_removed_count = 0
1347
1841
  if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
1348
1842
  initial_mapping_count = len(self.consensus_mapping_df)
1349
- self.consensus_mapping_df = (
1350
- self.consensus_mapping_df
1351
- .lazy()
1352
- .filter(filter_condition)
1353
- .collect()
1354
- )
1843
+ self.consensus_mapping_df = self.consensus_mapping_df.lazy().filter(filter_condition).collect()
1355
1844
  mapping_removed_count = initial_mapping_count - len(self.consensus_mapping_df)
1356
-
1845
+
1357
1846
  # Calculate results once and log efficiently
1358
1847
  final_count = len(self.features_df)
1359
1848
  removed_count = initial_count - final_count
1360
-
1849
+
1361
1850
  # Single comprehensive log message
1362
1851
  if mapping_removed_count > 0:
1363
- self.logger.info(f"Deleted {removed_count} features and {mapping_removed_count} consensus mappings. Remaining features: {final_count}")
1852
+ self.logger.info(
1853
+ f"Deleted {removed_count} features and {mapping_removed_count} consensus mappings. Remaining features: {final_count}"
1854
+ )
1364
1855
  else:
1365
1856
  self.logger.info(f"Deleted {removed_count} features. Remaining features: {final_count}")
1366
1857
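
features_delete is the exact complement of features_filter: the only functional difference is the ~ negation on the membership test. A short sketch (toy data; per the docstring, the real method also accepts a DataFrame with a feature_uid column or a single int):

    import polars as pl

    features_df = pl.DataFrame({"feature_uid": [1, 2, 3, 4, 5]})
    uids_to_remove = [2, 4]

    # Negated membership keeps everything except the listed UIDs
    remaining = features_df.lazy().filter(~pl.col("feature_uid").is_in(uids_to_remove)).collect()
    assert remaining["feature_uid"].to_list() == [1, 3, 5]
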
 
@@ -1384,7 +1875,7 @@ def consensus_select(
1384
1875
  ):
1385
1876
  """
1386
1877
  Select consensus features from consensus_df based on specified criteria and return the filtered DataFrame.
1387
-
1878
+
1388
1879
  Parameters:
1389
1880
  mz: m/z range filter (tuple for range, single value for minimum)
1390
1881
  rt: retention time range filter (tuple for range, single value for minimum)
@@ -1400,17 +1891,17 @@ def consensus_select(
1400
1891
  chrom_prominence_scaled_mean: mean scaled chromatogram prominence filter (tuple for range, single value for minimum)
1401
1892
  chrom_height_scaled_mean: mean scaled chromatogram height filter (tuple for range, single value for minimum)
1402
1893
  rt_delta_mean: mean RT delta filter (tuple for range, single value for minimum)
1403
-
1894
+
1404
1895
  Returns:
1405
1896
  polars.DataFrame: Filtered consensus DataFrame
1406
1897
  """
1407
1898
  if self.consensus_df is None or self.consensus_df.is_empty():
1408
1899
  self.logger.warning("No consensus features found in study.")
1409
1900
  return pl.DataFrame()
1410
-
1901
+
1411
1902
  consensus = self.consensus_df.clone()
1412
1903
  initial_count = len(consensus)
1413
-
1904
+
1414
1905
  # Filter by m/z
1415
1906
  if mz is not None:
1416
1907
  consensus_len_before_filter = len(consensus)
@@ -1420,9 +1911,9 @@ def consensus_select(
1420
1911
  else:
1421
1912
  consensus = consensus.filter(pl.col("mz") >= mz)
1422
1913
  self.logger.debug(
1423
- f"Selected consensus by mz. Consensus removed: {consensus_len_before_filter - len(consensus)}"
1914
+ f"Selected consensus by mz. Consensus removed: {consensus_len_before_filter - len(consensus)}",
1424
1915
  )
1425
-
1916
+
1426
1917
  # Filter by retention time
1427
1918
  if rt is not None:
1428
1919
  consensus_len_before_filter = len(consensus)
@@ -1432,9 +1923,9 @@ def consensus_select(
1432
1923
  else:
1433
1924
  consensus = consensus.filter(pl.col("rt") >= rt)
1434
1925
  self.logger.debug(
1435
- f"Selected consensus by rt. Consensus removed: {consensus_len_before_filter - len(consensus)}"
1926
+ f"Selected consensus by rt. Consensus removed: {consensus_len_before_filter - len(consensus)}",
1436
1927
  )
1437
-
1928
+
1438
1929
  # Filter by mean intensity
1439
1930
  if inty_mean is not None:
1440
1931
  consensus_len_before_filter = len(consensus)
@@ -1444,9 +1935,9 @@ def consensus_select(
1444
1935
  else:
1445
1936
  consensus = consensus.filter(pl.col("inty_mean") >= inty_mean)
1446
1937
  self.logger.debug(
1447
- f"Selected consensus by inty_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
1938
+ f"Selected consensus by inty_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
1448
1939
  )
1449
-
1940
+
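
Every numeric criterion in consensus_select follows the same dispatch: a 2-tuple is read as an inclusive (min, max) range, anything else as a minimum threshold. The method inlines this logic for each column; a hedged sketch of the same dispatch as a standalone helper (the helper name is mine):

    import polars as pl

    def range_or_min(column: str, value) -> pl.Expr:
        # 2-tuple -> inclusive range; scalar -> minimum threshold
        if isinstance(value, tuple) and len(value) == 2:
            lo, hi = value
            return (pl.col(column) >= lo) & (pl.col(column) <= hi)
        return pl.col(column) >= value

    # e.g. consensus.filter(range_or_min("mz", (100.0, 500.0)))
    #      consensus.filter(range_or_min("inty_mean", 1e5))
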
1450
1941
  # Filter by consensus_uid
1451
1942
  if consensus_uid is not None:
1452
1943
  consensus_len_before_filter = len(consensus)
@@ -1454,16 +1945,18 @@ def consensus_select(
1454
1945
  if len(consensus_uid) == 2 and not isinstance(consensus_uid, list):
1455
1946
  # Treat as range
1456
1947
  min_uid, max_uid = consensus_uid
1457
- consensus = consensus.filter((pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid))
1948
+ consensus = consensus.filter(
1949
+ (pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid)
1950
+ )
1458
1951
  else:
1459
1952
  # Treat as list
1460
1953
  consensus = consensus.filter(pl.col("consensus_uid").is_in(consensus_uid))
1461
1954
  else:
1462
1955
  consensus = consensus.filter(pl.col("consensus_uid") == consensus_uid)
1463
1956
  self.logger.debug(
1464
- f"Selected consensus by consensus_uid. Consensus removed: {consensus_len_before_filter - len(consensus)}"
1957
+ f"Selected consensus by consensus_uid. Consensus removed: {consensus_len_before_filter - len(consensus)}",
1465
1958
  )
1466
-
1959
+
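
One subtlety worth flagging: for consensus_uid, a 2-tuple is interpreted as a UID range, while a list (even of length 2) is exact membership. Assuming consensus_select is exposed on a loaded Study instance (as its self parameter suggests), the call forms would look like:

    in_range = study.consensus_select(consensus_uid=(100, 200))  # UIDs 100..200 inclusive
    exact = study.consensus_select(consensus_uid=[100, 200])     # only UIDs 100 and 200
    single = study.consensus_select(consensus_uid=150)           # a single UID
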
1467
1960
  # Filter by consensus_id
1468
1961
  if consensus_id is not None:
1469
1962
  consensus_len_before_filter = len(consensus)
@@ -1472,21 +1965,23 @@ def consensus_select(
1472
1965
  else:
1473
1966
  consensus = consensus.filter(pl.col("consensus_id") == consensus_id)
1474
1967
  self.logger.debug(
1475
- f"Selected consensus by consensus_id. Consensus removed: {consensus_len_before_filter - len(consensus)}"
1968
+ f"Selected consensus by consensus_id. Consensus removed: {consensus_len_before_filter - len(consensus)}",
1476
1969
  )
1477
-
1970
+
1478
1971
  # Filter by number of samples
1479
1972
  if number_samples is not None:
1480
1973
  consensus_len_before_filter = len(consensus)
1481
1974
  if isinstance(number_samples, tuple) and len(number_samples) == 2:
1482
1975
  min_samples, max_samples = number_samples
1483
- consensus = consensus.filter((pl.col("number_samples") >= min_samples) & (pl.col("number_samples") <= max_samples))
1976
+ consensus = consensus.filter(
1977
+ (pl.col("number_samples") >= min_samples) & (pl.col("number_samples") <= max_samples)
1978
+ )
1484
1979
  else:
1485
1980
  consensus = consensus.filter(pl.col("number_samples") >= number_samples)
1486
1981
  self.logger.debug(
1487
- f"Selected consensus by number_samples. Consensus removed: {consensus_len_before_filter - len(consensus)}"
1982
+ f"Selected consensus by number_samples. Consensus removed: {consensus_len_before_filter - len(consensus)}",
1488
1983
  )
1489
-
1984
+
1490
1985
  # Filter by number of MS2 spectra
1491
1986
  if number_ms2 is not None:
1492
1987
  consensus_len_before_filter = len(consensus)
@@ -1499,9 +1994,9 @@ def consensus_select(
1499
1994
  else:
1500
1995
  self.logger.warning("'number_ms2' column not found in consensus_df")
1501
1996
  self.logger.debug(
1502
- f"Selected consensus by number_ms2. Consensus removed: {consensus_len_before_filter - len(consensus)}"
1997
+ f"Selected consensus by number_ms2. Consensus removed: {consensus_len_before_filter - len(consensus)}",
1503
1998
  )
1504
-
1999
+
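
The optional-metric filters (number_ms2, bl, the chrom_* means, rt_delta_mean) first check that the column exists and log a warning instead of raising when it does not. A compact sketch of that guard as a reusable helper (helper name and signature are mine):

    import polars as pl

    def guarded_min_filter(df: pl.DataFrame, column: str, threshold, logger) -> pl.DataFrame:
        # Apply a minimum-threshold filter only if the column is present; warn otherwise
        if column not in df.columns:
            logger.warning(f"'{column}' column not found in consensus_df")
            return df
        return df.filter(pl.col(column) >= threshold)
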
1505
2000
  # Filter by quality
1506
2001
  if quality is not None:
1507
2002
  consensus_len_before_filter = len(consensus)
@@ -1511,9 +2006,9 @@ def consensus_select(
1511
2006
  else:
1512
2007
  consensus = consensus.filter(pl.col("quality") >= quality)
1513
2008
  self.logger.debug(
1514
- f"Selected consensus by quality. Consensus removed: {consensus_len_before_filter - len(consensus)}"
2009
+ f"Selected consensus by quality. Consensus removed: {consensus_len_before_filter - len(consensus)}",
1515
2010
  )
1516
-
2011
+
1517
2012
  # Filter by baseline
1518
2013
  if bl is not None:
1519
2014
  consensus_len_before_filter = len(consensus)
@@ -1526,89 +2021,103 @@ def consensus_select(
1526
2021
  else:
1527
2022
  self.logger.warning("'bl' column not found in consensus_df")
1528
2023
  self.logger.debug(
1529
- f"Selected consensus by bl. Consensus removed: {consensus_len_before_filter - len(consensus)}"
2024
+ f"Selected consensus by bl. Consensus removed: {consensus_len_before_filter - len(consensus)}",
1530
2025
  )
1531
-
2026
+
1532
2027
  # Filter by mean chromatogram coherence
1533
2028
  if chrom_coherence_mean is not None:
1534
2029
  consensus_len_before_filter = len(consensus)
1535
2030
  if "chrom_coherence_mean" in consensus.columns:
1536
2031
  if isinstance(chrom_coherence_mean, tuple) and len(chrom_coherence_mean) == 2:
1537
2032
  min_coherence, max_coherence = chrom_coherence_mean
1538
- consensus = consensus.filter((pl.col("chrom_coherence_mean") >= min_coherence) & (pl.col("chrom_coherence_mean") <= max_coherence))
2033
+ consensus = consensus.filter(
2034
+ (pl.col("chrom_coherence_mean") >= min_coherence)
2035
+ & (pl.col("chrom_coherence_mean") <= max_coherence)
2036
+ )
1539
2037
  else:
1540
2038
  consensus = consensus.filter(pl.col("chrom_coherence_mean") >= chrom_coherence_mean)
1541
2039
  else:
1542
2040
  self.logger.warning("'chrom_coherence_mean' column not found in consensus_df")
1543
2041
  self.logger.debug(
1544
- f"Selected consensus by chrom_coherence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
2042
+ f"Selected consensus by chrom_coherence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
1545
2043
  )
1546
-
2044
+
1547
2045
  # Filter by mean chromatogram prominence
1548
2046
  if chrom_prominence_mean is not None:
1549
2047
  consensus_len_before_filter = len(consensus)
1550
2048
  if "chrom_prominence_mean" in consensus.columns:
1551
2049
  if isinstance(chrom_prominence_mean, tuple) and len(chrom_prominence_mean) == 2:
1552
2050
  min_prominence, max_prominence = chrom_prominence_mean
1553
- consensus = consensus.filter((pl.col("chrom_prominence_mean") >= min_prominence) & (pl.col("chrom_prominence_mean") <= max_prominence))
2051
+ consensus = consensus.filter(
2052
+ (pl.col("chrom_prominence_mean") >= min_prominence)
2053
+ & (pl.col("chrom_prominence_mean") <= max_prominence)
2054
+ )
1554
2055
  else:
1555
2056
  consensus = consensus.filter(pl.col("chrom_prominence_mean") >= chrom_prominence_mean)
1556
2057
  else:
1557
2058
  self.logger.warning("'chrom_prominence_mean' column not found in consensus_df")
1558
2059
  self.logger.debug(
1559
- f"Selected consensus by chrom_prominence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
2060
+ f"Selected consensus by chrom_prominence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
1560
2061
  )
1561
-
2062
+
1562
2063
  # Filter by mean scaled chromatogram prominence
1563
2064
  if chrom_prominence_scaled_mean is not None:
1564
2065
  consensus_len_before_filter = len(consensus)
1565
2066
  if "chrom_prominence_scaled_mean" in consensus.columns:
1566
2067
  if isinstance(chrom_prominence_scaled_mean, tuple) and len(chrom_prominence_scaled_mean) == 2:
1567
2068
  min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled_mean
1568
- consensus = consensus.filter((pl.col("chrom_prominence_scaled_mean") >= min_prominence_scaled) & (pl.col("chrom_prominence_scaled_mean") <= max_prominence_scaled))
2069
+ consensus = consensus.filter(
2070
+ (pl.col("chrom_prominence_scaled_mean") >= min_prominence_scaled)
2071
+ & (pl.col("chrom_prominence_scaled_mean") <= max_prominence_scaled)
2072
+ )
1569
2073
  else:
1570
2074
  consensus = consensus.filter(pl.col("chrom_prominence_scaled_mean") >= chrom_prominence_scaled_mean)
1571
2075
  else:
1572
2076
  self.logger.warning("'chrom_prominence_scaled_mean' column not found in consensus_df")
1573
2077
  self.logger.debug(
1574
- f"Selected consensus by chrom_prominence_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
2078
+ f"Selected consensus by chrom_prominence_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
1575
2079
  )
1576
-
2080
+
1577
2081
  # Filter by mean scaled chromatogram height
1578
2082
  if chrom_height_scaled_mean is not None:
1579
2083
  consensus_len_before_filter = len(consensus)
1580
2084
  if "chrom_height_scaled_mean" in consensus.columns:
1581
2085
  if isinstance(chrom_height_scaled_mean, tuple) and len(chrom_height_scaled_mean) == 2:
1582
2086
  min_height_scaled, max_height_scaled = chrom_height_scaled_mean
1583
- consensus = consensus.filter((pl.col("chrom_height_scaled_mean") >= min_height_scaled) & (pl.col("chrom_height_scaled_mean") <= max_height_scaled))
2087
+ consensus = consensus.filter(
2088
+ (pl.col("chrom_height_scaled_mean") >= min_height_scaled)
2089
+ & (pl.col("chrom_height_scaled_mean") <= max_height_scaled)
2090
+ )
1584
2091
  else:
1585
2092
  consensus = consensus.filter(pl.col("chrom_height_scaled_mean") >= chrom_height_scaled_mean)
1586
2093
  else:
1587
2094
  self.logger.warning("'chrom_height_scaled_mean' column not found in consensus_df")
1588
2095
  self.logger.debug(
1589
- f"Selected consensus by chrom_height_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
2096
+ f"Selected consensus by chrom_height_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
1590
2097
  )
1591
-
2098
+
1592
2099
  # Filter by mean RT delta
1593
2100
  if rt_delta_mean is not None:
1594
2101
  consensus_len_before_filter = len(consensus)
1595
2102
  if "rt_delta_mean" in consensus.columns:
1596
2103
  if isinstance(rt_delta_mean, tuple) and len(rt_delta_mean) == 2:
1597
2104
  min_rt_delta, max_rt_delta = rt_delta_mean
1598
- consensus = consensus.filter((pl.col("rt_delta_mean") >= min_rt_delta) & (pl.col("rt_delta_mean") <= max_rt_delta))
2105
+ consensus = consensus.filter(
2106
+ (pl.col("rt_delta_mean") >= min_rt_delta) & (pl.col("rt_delta_mean") <= max_rt_delta)
2107
+ )
1599
2108
  else:
1600
2109
  consensus = consensus.filter(pl.col("rt_delta_mean") >= rt_delta_mean)
1601
2110
  else:
1602
2111
  self.logger.warning("'rt_delta_mean' column not found in consensus_df")
1603
2112
  self.logger.debug(
1604
- f"Selected consensus by rt_delta_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
2113
+ f"Selected consensus by rt_delta_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
1605
2114
  )
1606
-
2115
+
1607
2116
  if len(consensus) == 0:
1608
2117
  self.logger.warning("No consensus features remaining after applying selection criteria.")
1609
2118
  else:
1610
2119
  self.logger.info(f"Selected consensus features. Features remaining: {len(consensus)} (from {initial_count})")
1611
-
2120
+
1612
2121
  return consensus
1613
2122
 
1614
2123
 
@@ -1616,22 +2125,22 @@ def consensus_filter(self, consensus):
1616
2125
  """
1617
2126
  Filter consensus_df by removing the specified consensus features.
1618
2127
  This also removes related entries from consensus_mapping_df, features_df, and consensus_ms2.
1619
-
2128
+
1620
2129
  Parameters:
1621
2130
  consensus: Consensus features to remove. Can be:
1622
2131
  - polars.DataFrame: Consensus DataFrame (will use consensus_uid column)
1623
2132
  - list: List of consensus_uids to remove
1624
2133
  - int: Single consensus_uid to remove
1625
-
2134
+
1626
2135
  Returns:
1627
2136
  None (modifies self.consensus_df and related DataFrames in place)
1628
2137
  """
1629
2138
  if self.consensus_df is None or self.consensus_df.is_empty():
1630
2139
  self.logger.warning("No consensus features found in study.")
1631
2140
  return
1632
-
2141
+
1633
2142
  initial_consensus_count = len(self.consensus_df)
1634
-
2143
+
1635
2144
  # Determine consensus_uids to remove
1636
2145
  if isinstance(consensus, pl.DataFrame):
1637
2146
  if "consensus_uid" not in consensus.columns:
@@ -1645,68 +2154,70 @@ def consensus_filter(self, consensus):
1645
2154
  else:
1646
2155
  self.logger.error("consensus parameter must be a DataFrame, list, or int")
1647
2156
  return
1648
-
2157
+
1649
2158
  if not consensus_uids_to_remove:
1650
2159
  self.logger.warning("No consensus UIDs provided for filtering.")
1651
2160
  return
1652
-
2161
+
1653
2162
  # Get feature_uids that need to be removed from features_df
1654
2163
  feature_uids_to_remove = []
1655
2164
  if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
1656
2165
  feature_uids_to_remove = self.consensus_mapping_df.filter(
1657
- pl.col("consensus_uid").is_in(consensus_uids_to_remove)
2166
+ pl.col("consensus_uid").is_in(consensus_uids_to_remove),
1658
2167
  )["feature_uid"].to_list()
1659
-
2168
+
1660
2169
  # Remove consensus features from consensus_df
1661
2170
  self.consensus_df = self.consensus_df.filter(
1662
- ~pl.col("consensus_uid").is_in(consensus_uids_to_remove)
2171
+ ~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
1663
2172
  )
1664
-
2173
+
1665
2174
  # Remove from consensus_mapping_df
1666
2175
  if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
1667
2176
  initial_mapping_count = len(self.consensus_mapping_df)
1668
2177
  self.consensus_mapping_df = self.consensus_mapping_df.filter(
1669
- ~pl.col("consensus_uid").is_in(consensus_uids_to_remove)
2178
+ ~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
1670
2179
  )
1671
2180
  removed_mapping_count = initial_mapping_count - len(self.consensus_mapping_df)
1672
2181
  if removed_mapping_count > 0:
1673
2182
  self.logger.debug(f"Removed {removed_mapping_count} entries from consensus_mapping_df")
1674
-
2183
+
1675
2184
  # Remove corresponding features from features_df
1676
2185
  if feature_uids_to_remove and self.features_df is not None and not self.features_df.is_empty():
1677
2186
  initial_features_count = len(self.features_df)
1678
2187
  self.features_df = self.features_df.filter(
1679
- ~pl.col("feature_uid").is_in(feature_uids_to_remove)
2188
+ ~pl.col("feature_uid").is_in(feature_uids_to_remove),
1680
2189
  )
1681
2190
  removed_features_count = initial_features_count - len(self.features_df)
1682
2191
  if removed_features_count > 0:
1683
2192
  self.logger.debug(f"Removed {removed_features_count} entries from features_df")
1684
-
2193
+
1685
2194
  # Remove from consensus_ms2 if it exists
1686
- if hasattr(self, 'consensus_ms2') and self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
2195
+ if hasattr(self, "consensus_ms2") and self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
1687
2196
  initial_ms2_count = len(self.consensus_ms2)
1688
2197
  self.consensus_ms2 = self.consensus_ms2.filter(
1689
- ~pl.col("consensus_uid").is_in(consensus_uids_to_remove)
2198
+ ~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
1690
2199
  )
1691
2200
  removed_ms2_count = initial_ms2_count - len(self.consensus_ms2)
1692
2201
  if removed_ms2_count > 0:
1693
2202
  self.logger.debug(f"Removed {removed_ms2_count} entries from consensus_ms2")
1694
-
2203
+
1695
2204
  removed_consensus_count = initial_consensus_count - len(self.consensus_df)
1696
- self.logger.info(f"Filtered {removed_consensus_count} consensus features. Remaining consensus: {len(self.consensus_df)}")
2205
+ self.logger.info(
2206
+ f"Filtered {removed_consensus_count} consensus features. Remaining consensus: {len(self.consensus_df)}"
2207
+ )
1697
2208
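
consensus_filter cascades a single removal across four linked tables, using consensus_mapping_df to translate consensus_uids into the feature_uids that must also be pruned. Note the naming asymmetry with the feature-level API: features_filter keeps the given UIDs, whereas consensus_filter removes them. A self-contained sketch of the cascade (toy data, reduced column set):

    import polars as pl

    consensus_df = pl.DataFrame({"consensus_uid": [10, 11, 12]})
    mapping_df = pl.DataFrame({"consensus_uid": [10, 10, 11, 12], "feature_uid": [1, 2, 3, 4]})
    features_df = pl.DataFrame({"feature_uid": [1, 2, 3, 4]})

    remove = [10]
    # Resolve consensus UIDs to feature UIDs through the mapping table first
    feat_remove = mapping_df.filter(pl.col("consensus_uid").is_in(remove))["feature_uid"].to_list()

    consensus_df = consensus_df.filter(~pl.col("consensus_uid").is_in(remove))
    mapping_df = mapping_df.filter(~pl.col("consensus_uid").is_in(remove))
    features_df = features_df.filter(~pl.col("feature_uid").is_in(feat_remove))
    # consensus 10 and features 1 and 2 are gone; 11/12 and 3/4 remain
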
 
1698
2209
 
1699
2210
  def consensus_delete(self, consensus):
1700
2211
  """
1701
2212
  Delete consensus features from consensus_df based on consensus identifiers.
1702
2213
  This is an alias for consensus_filter for consistency with other delete methods.
1703
-
2214
+
1704
2215
  Parameters:
1705
2216
  consensus: Consensus features to delete. Can be:
1706
2217
  - polars.DataFrame: Consensus DataFrame (will use consensus_uid column)
1707
2218
  - list: List of consensus_uids to delete
1708
2219
  - int: Single consensus_uid to delete
1709
-
2220
+
1710
2221
  Returns:
1711
2222
  None (modifies self.consensus_df and related DataFrames in place)
1712
2223
  """