masster-0.3.9-py3-none-any.whl → masster-0.3.11-py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of masster has been flagged as potentially problematic.

masster/study/helpers.py CHANGED
@@ -7,7 +7,7 @@ import pandas as pd
 import polars as pl

 from tqdm import tqdm
-
+

 def get_chrom(self, uids=None, samples=None):
     # Check if consensus_df is empty or doesn't have required columns
@@ -113,6 +113,7 @@ def get_chrom(self, uids=None, samples=None):
     # Return as Polars DataFrame (can handle complex objects like Chromatogram)
     return df2_pivoted

+
 def set_folder(self, folder):
     """
     Set the folder for saving and loading files.
@@ -408,17 +409,21 @@ def _get_sample_uids(self, samples=None, seed=42):
     sample_uids = list(set(sample_uids))
     return sample_uids

+
 def get_orphans(self):
-    """
+    """
     Get all features that are not in the consensus mapping.
     """
-    not_in_consensus = self.features_df.filter(~self.features_df['feature_uid'].is_in(self.consensus_mapping_df['feature_uid'].to_list()))
+    not_in_consensus = self.features_df.filter(
+        ~self.features_df["feature_uid"].is_in(self.consensus_mapping_df["feature_uid"].to_list())
+    )
     return not_in_consensus

+
 def compress(self, features=True, ms2=True, chrom=False, ms2_max=5):
     """
     Perform compress_features, compress_ms2, and compress_chrom operations.
-
+
     Parameters:
         max_replicates (int): Maximum number of MS2 replicates to keep per consensus_uid and energy combination
     """
@@ -441,48 +446,50 @@ def compress_features(self):
     if self.features_df is None or self.features_df.is_empty():
         self.logger.warning("No features_df found.")
         return
-
+
     if self.consensus_mapping_df is None or self.consensus_mapping_df.is_empty():
         self.logger.warning("No consensus_mapping_df found.")
         return
-
+
     initial_count = len(self.features_df)
-
+
     # Get feature_uids that are associated with consensus features
     consensus_feature_uids = self.consensus_mapping_df["feature_uid"].to_list()
-
+
     # Filter features_df to keep only features associated with consensus
     self.features_df = self.features_df.filter(
-        pl.col("feature_uid").is_in(consensus_feature_uids)
+        pl.col("feature_uid").is_in(consensus_feature_uids),
     )
-
+
     # Set ms2_specs column to None if it exists
     if "ms2_specs" in self.features_df.columns:
         # Create a list of None values with the same length as the dataframe
         # This preserves the Object dtype instead of converting to Null
         none_values = [None] * len(self.features_df)
         self.features_df = self.features_df.with_columns(
-            pl.Series("ms2_specs", none_values, dtype=pl.Object)
+            pl.Series("ms2_specs", none_values, dtype=pl.Object),
         )
-
+
     removed_count = initial_count - len(self.features_df)
-    self.logger.info(f"Compressed features: removed {removed_count} features not in consensus, cleared ms2_specs column")
+    self.logger.info(
+        f"Compressed features: removed {removed_count} features not in consensus, cleared ms2_specs column"
+    )


 def restore_features(self, samples=None, maps=False):
     """
-    Update specific columns (chrom, chrom_area, ms2_scans, ms2_specs) in features_df
+    Update specific columns (chrom, chrom_area, ms2_scans, ms2_specs) in features_df
     from the corresponding samples by reading features_df from the sample5 file.
     Use the feature_id for matching.

     Parameters:
-        samples (list, optional): List of sample_uids or sample_names to restore.
+        samples (list, optional): List of sample_uids or sample_names to restore.
             If None, restores all samples.
         maps (bool, optional): If True, also load featureXML data and update study.feature_maps.
     """
     import datetime
     from masster.sample.sample import Sample
-
+
     if self.features_df is None or self.features_df.is_empty():
         self.logger.error("No features_df found in study.")
         return
@@ -499,8 +506,8 @@ def restore_features(self, samples=None, maps=False):
         return

     # Columns to update from sample data
-    columns_to_update = ['chrom', 'chrom_area', 'ms2_scans', 'ms2_specs']
-
+    columns_to_update = ["chrom", "chrom_area", "ms2_scans", "ms2_specs"]
+
     self.logger.info(f"Restoring columns {columns_to_update} from {len(sample_uids)} samples...")

     # Create a mapping of (sample_uid, feature_id) to feature_uid from study.features_df
@@ -512,10 +519,12 @@ def restore_features(self, samples=None, maps=False):

     # Process each sample
     tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-    for sample_uid in tqdm(sample_uids,
-                           unit="sample",
-                           disable=tqdm_disable,
-                           desc=f"{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Restoring samples"):
+    for sample_uid in tqdm(
+        sample_uids,
+        unit="sample",
+        disable=tqdm_disable,
+        desc=f"{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Restoring samples",
+    ):
         # Get sample info
         sample_row = self.samples_df.filter(pl.col("sample_uid") == sample_uid)
         if sample_row.is_empty():
@@ -534,7 +543,7 @@ def restore_features(self, samples=None, maps=False):
             # Load sample to get its features_df
             # Use a direct load call with map=False to prevent feature synchronization
             # which would remove filled features that don't exist in the original FeatureMap
-            sample = Sample(log_level='DEBUG')
+            sample = Sample(log_level="DEBUG")
             sample._load_sample5(sample_path, map=False)

             if sample.features_df is None or sample.features_df.is_empty():
@@ -547,34 +556,34 @@ def restore_features(self, samples=None, maps=False):
                 feature_id = row.get("feature_id")
                 if feature_id is None:
                     continue
-
+
                 key = (sample_uid, feature_id)
                 if key in study_feature_mapping:
                     feature_uid = study_feature_mapping[key]
-
+
                     # Update the specific columns in study.features_df
                     for col in columns_to_update:
                         if col in row and col in self.features_df.columns:
                             # Get the original column dtype to preserve it
                             original_dtype = self.features_df[col].dtype
-
+
                             # Update the specific row and column, preserving dtype
                             mask = (pl.col("feature_uid") == feature_uid) & (pl.col("sample_uid") == sample_uid)
-
+
                             # Handle object columns (like Chromatogram) differently
                             if original_dtype == pl.Object:
                                 self.features_df = self.features_df.with_columns(
                                     pl.when(mask)
                                     .then(pl.lit(row[col], dtype=original_dtype, allow_object=True))
                                     .otherwise(pl.col(col))
-                                    .alias(col)
+                                    .alias(col),
                                 )
                             else:
                                 self.features_df = self.features_df.with_columns(
                                     pl.when(mask)
                                     .then(pl.lit(row[col], dtype=original_dtype))
                                     .otherwise(pl.col(col))
-                                    .alias(col)
+                                    .alias(col),
                                 )
                             updates_made += 1

@@ -582,7 +591,7 @@ def restore_features(self, samples=None, maps=False):

             # If maps is True, load featureXML data
             if maps:
-                if hasattr(sample, 'feature_maps'):
+                if hasattr(sample, "feature_maps"):
                     self.feature_maps.extend(sample.feature_maps)

         except Exception as e:
@@ -595,14 +604,14 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     """
     Restore chromatograms from individual .sample5 files and gap-fill missing ones.
-
+
     This function combines the functionality of restore_features() and fill_chrom():
     1. First restores chromatograms from individual .sample5 files (like restore_features)
     2. Then gap-fills any remaining empty chromatograms (like fill_chrom)
     3. ONLY updates the 'chrom' column, not chrom_area or other derived values
-
+
     Parameters:
-        samples (list, optional): List of sample_uids or sample_names to process.
+        samples (list, optional): List of sample_uids or sample_names to process.
             If None, processes all samples.
         mz_tol (float): m/z tolerance for gap filling (default: 0.010)
         rt_tol (float): RT tolerance for gap filling (default: 10.0)
@@ -611,7 +620,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     import numpy as np
     from masster.sample.sample import Sample
     from masster.chromatogram import Chromatogram
-
+
     if self.features_df is None or self.features_df.is_empty():
         self.logger.error("No features_df found in study.")
         return
@@ -627,7 +636,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
         return

     self.logger.info(f"Restoring chromatograms from {len(sample_uids)} samples...")
-
+
     # Create mapping of (sample_uid, feature_id) to feature_uid
     study_feature_mapping = {}
     for row in self.features_df.iter_rows(named=True):
@@ -638,12 +647,13 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
     # Phase 1: Restore from individual .sample5 files (like restore_features)
     restored_count = 0
     tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-
+
     self.logger.info("Phase 1: Restoring chromatograms from .sample5 files...")
-    for sample_uid in tqdm(sample_uids,
-                           desc=f"{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Restoring from samples",
-                           disable=tqdm_disable):
-
+    for sample_uid in tqdm(
+        sample_uids,
+        desc=f"{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Restoring from samples",
+        disable=tqdm_disable,
+    ):
         # Get sample info
         sample_row = self.samples_df.filter(pl.col("sample_uid") == sample_uid)
         if sample_row.is_empty():
@@ -660,7 +670,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):

         try:
             # Load sample (with map=False to prevent feature synchronization)
-            sample = Sample(log_level='WARNING')
+            sample = Sample(log_level="WARNING")
             sample._load_sample5(sample_path, map=False)

             if sample.features_df is None or sample.features_df.is_empty():
@@ -671,21 +681,21 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
             for row in sample.features_df.iter_rows(named=True):
                 feature_id = row.get("feature_id")
                 chrom = row.get("chrom")
-
+
                 if feature_id is None or chrom is None:
                     continue
-
+
                 key = (sample_uid, feature_id)
                 if key in study_feature_mapping:
                     feature_uid = study_feature_mapping[key]
-
+
                     # Update only the chrom column
                     mask = (pl.col("feature_uid") == feature_uid) & (pl.col("sample_uid") == sample_uid)
                     self.features_df = self.features_df.with_columns(
                         pl.when(mask)
                         .then(pl.lit(chrom, dtype=pl.Object, allow_object=True))
                         .otherwise(pl.col("chrom"))
-                        .alias("chrom")
+                        .alias("chrom"),
                     )
                     restored_count += 1

@@ -694,20 +704,22 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
             continue

     self.logger.info(f"Phase 1 complete: Restored {restored_count} chromatograms from .sample5 files")
-
+
     # Phase 2: Gap-fill remaining empty chromatograms (like fill_chrom)
     self.logger.info("Phase 2: Gap-filling remaining empty chromatograms...")
-
+
     # Count how many chromatograms are still missing
     empty_chroms = self.features_df.filter(pl.col("chrom").is_null()).height
     total_chroms = len(self.features_df)
-
-    self.logger.debug(f"Chromatograms still missing: {empty_chroms}/{total_chroms} ({empty_chroms/total_chroms*100:.1f}%)")
-
+
+    self.logger.debug(
+        f"Chromatograms still missing: {empty_chroms}/{total_chroms} ({empty_chroms / total_chroms * 100:.1f}%)"
+    )
+
     if empty_chroms == 0:
         self.logger.info("All chromatograms restored from .sample5 files. No gap-filling needed.")
         return
-
+
     # Get consensus info for gap filling
     consensus_info = {}
     for row in self.consensus_df.iter_rows(named=True):
@@ -717,23 +729,23 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
             "mz": row["mz"],
             "rt": row["rt"],
         }
-
+
     filled_count = 0
-
+
     # Process each sample that has missing chromatograms
-    for sample_uid in tqdm(sample_uids,
-                           desc=f"{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Gap-filling missing chromatograms",
-                           disable=tqdm_disable):
-
+    for sample_uid in tqdm(
+        sample_uids,
+        desc=f"{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Gap-filling missing chromatograms",
+        disable=tqdm_disable,
+    ):
         # Get features with missing chromatograms for this sample
         missing_features = self.features_df.filter(
-            (pl.col("sample_uid") == sample_uid) &
-            (pl.col("chrom").is_null())
+            (pl.col("sample_uid") == sample_uid) & (pl.col("chrom").is_null()),
         )
-
+
         if missing_features.is_empty():
             continue
-
+
         # Get sample info
         sample_row = self.samples_df.filter(pl.col("sample_uid") == sample_uid)
         sample_info = sample_row.row(0, named=True)
@@ -745,10 +757,10 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):

         try:
             # Load sample for MS1 data extraction
-            sample = Sample(log_level='WARNING')
+            sample = Sample(log_level="WARNING")
             sample._load_sample5(sample_path, map=False)

-            if not hasattr(sample, 'ms1_df') or sample.ms1_df is None or sample.ms1_df.is_empty():
+            if not hasattr(sample, "ms1_df") or sample.ms1_df is None or sample.ms1_df.is_empty():
                 continue

             # Process each missing feature
@@ -758,15 +770,15 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
                 rt = feature_row["rt"]
                 rt_start = feature_row.get("rt_start", rt - rt_tol)
                 rt_end = feature_row.get("rt_end", rt + rt_tol)
-
+
                 # Extract EIC from MS1 data
                 d = sample.ms1_df.filter(
-                    (pl.col("mz") >= mz - mz_tol) &
-                    (pl.col("mz") <= mz + mz_tol) &
-                    (pl.col("rt") >= rt_start - rt_tol) &
-                    (pl.col("rt") <= rt_end + rt_tol)
+                    (pl.col("mz") >= mz - mz_tol)
+                    & (pl.col("mz") <= mz + mz_tol)
+                    & (pl.col("rt") >= rt_start - rt_tol)
+                    & (pl.col("rt") <= rt_end + rt_tol),
                 )
-
+
                 # Create chromatogram
                 if d.is_empty():
                     # Create empty chromatogram
@@ -784,7 +796,7 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
                 else:
                     # Create real chromatogram from data
                     eic_rt = d.group_by("rt").agg(pl.col("inty").max()).sort("rt")
-
+
                     if len(eic_rt) > 4:
                         eic = Chromatogram(
                             eic_rt["rt"].to_numpy(),
@@ -809,14 +821,14 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
                             feature_end=rt_end,
                             feature_apex=rt,
                         )
-
+
                 # Update the chromatogram in the study
                 mask = pl.col("feature_uid") == feature_uid
                 self.features_df = self.features_df.with_columns(
                     pl.when(mask)
                     .then(pl.lit(eic, dtype=pl.Object, allow_object=True))
                     .otherwise(pl.col("chrom"))
-                    .alias("chrom")
+                    .alias("chrom"),
                 )
                 filled_count += 1

@@ -825,12 +837,14 @@ def restore_chrom(self, samples=None, mz_tol=0.010, rt_tol=10.0):
             continue

     self.logger.info(f"Phase 2 complete: Gap-filled {filled_count} chromatograms")
-
+
     # Final summary
     final_non_null = self.features_df.filter(pl.col("chrom").is_not_null()).height
     final_total = len(self.features_df)
-
-    self.logger.info(f"Chromatogram restoration complete: {final_non_null}/{final_total} ({final_non_null/final_total*100:.1f}%)")
+
+    self.logger.info(
+        f"Chromatogram restoration complete: {final_non_null}/{final_total} ({final_non_null / final_total * 100:.1f}%)"
+    )
     self.logger.info(f"Restored from .sample5 files: {restored_count}, Gap-filled from raw data: {filled_count}")


@@ -839,41 +853,39 @@ def compress_ms2(self, max_replicates=5):
     Reduce the number of entries matching any pair of (consensus and energy) to max XY rows.
     Groups all rows by consensus_uid and energy. For each group, sort by number_frags * prec_inty,
     and then pick the top XY rows. Discard the others.
-
+
     Parameters:
         max_replicates (int): Maximum number of replicates to keep per consensus_uid and energy combination
     """
     if self.consensus_ms2 is None or self.consensus_ms2.is_empty():
         self.logger.warning("No consensus_ms2 found.")
         return
-
+
     initial_count = len(self.consensus_ms2)
-
+
     # Create a ranking score based on number_frags * prec_inty
     # Handle None values by treating them as 0
     self.consensus_ms2 = self.consensus_ms2.with_columns([
-        (
-            pl.col("number_frags").fill_null(0) *
-            pl.col("prec_inty").fill_null(0)
-        ).alias("ranking_score")
+        (pl.col("number_frags").fill_null(0) * pl.col("prec_inty").fill_null(0)).alias("ranking_score"),
     ])
-
+
     # Group by consensus_uid and energy, then rank by score and keep top max_replicates
     compressed_ms2 = (
-        self.consensus_ms2
-        .with_row_count("row_id")  # Add row numbers for stable sorting
+        self.consensus_ms2.with_row_count("row_id")  # Add row numbers for stable sorting
         .sort(["consensus_uid", "energy", "ranking_score", "row_id"], descending=[False, False, True, False])
         .with_columns([
-            pl.int_range(pl.len()).over(["consensus_uid", "energy"]).alias("rank")
+            pl.int_range(pl.len()).over(["consensus_uid", "energy"]).alias("rank"),
         ])
         .filter(pl.col("rank") < max_replicates)
        .drop(["ranking_score", "row_id", "rank"])
     )
-
+
     self.consensus_ms2 = compressed_ms2
-
+
     removed_count = initial_count - len(self.consensus_ms2)
-    self.logger.info(f"Compressed MS2 data: removed {removed_count} entries, kept max {max_replicates} per consensus/energy pair")
+    self.logger.info(
+        f"Compressed MS2 data: removed {removed_count} entries, kept max {max_replicates} per consensus/energy pair"
+    )


 def compress_chrom(self):
@@ -886,49 +898,49 @@ def compress_chrom(self):
     if self.features_df is None or self.features_df.is_empty():
         self.logger.warning("No features_df found.")
         return
-
+
     if "chrom" not in self.features_df.columns:
         self.logger.warning("No 'chrom' column found in features_df.")
         return
-
+
     # Count non-null chromatograms before compression
     non_null_count = self.features_df.filter(pl.col("chrom").is_not_null()).height
-
+
     # Set chrom column to None while keeping dtype as object
     self.features_df = self.features_df.with_columns(
-        pl.lit(None, dtype=pl.Object).alias("chrom")
+        pl.lit(None, dtype=pl.Object).alias("chrom"),
     )
-
+
     self.logger.info(f"Compressed chromatograms: cleared {non_null_count} chromatogram objects from features_df")


 def set_source(self, filename):
     """
-    Reassign file_source for all samples in samples_df. If filename contains only a path,
-    keep the current basename and build an absolute path. Check that the new file exists
+    Reassign file_source for all samples in samples_df. If filename contains only a path,
+    keep the current basename and build an absolute path. Check that the new file exists
     before overwriting the old file_source.
-
+
     Parameters:
         filename (str): New file path or directory path for all samples
-
+
     Returns:
         None
     """
     import os
-
+
     if self.samples_df is None or len(self.samples_df) == 0:
         self.logger.warning("No samples found in study.")
         return
-
+
     updated_count = 0
     failed_count = 0
-
+
     # Get all current file_source values
     current_sources = self.samples_df.get_column("file_source").to_list()
     sample_names = self.samples_df.get_column("sample_name").to_list()
-
+
     new_sources = []
-
+
     for i, (current_source, sample_name) in enumerate(zip(current_sources, sample_names)):
         # Check if filename is just a directory path
         if os.path.isdir(filename):
@@ -937,7 +949,7 @@ def set_source(self, filename):
             new_sources.append(current_source)
             failed_count += 1
             continue
-
+
         # Get the basename from current file_source
         current_basename = os.path.basename(current_source)
         # Build new absolute path
@@ -945,26 +957,26 @@ def set_source(self, filename):
         else:
             # filename is a full path, make it absolute
             new_file_path = os.path.abspath(filename)
-
+
         # Check if the new file exists
         if not os.path.exists(new_file_path):
             self.logger.warning(f"File does not exist for sample '{sample_name}': {new_file_path}")
             new_sources.append(current_source)
             failed_count += 1
             continue
-
+
         # File exists, update source
         new_sources.append(new_file_path)
         updated_count += 1
-
+
         # Log individual updates at debug level
         self.logger.debug(f"Updated file_source for sample '{sample_name}': {current_source} -> {new_file_path}")
-
+
     # Update the samples_df with new file_source values
     self.samples_df = self.samples_df.with_columns(
-        pl.Series("file_source", new_sources).alias("file_source")
+        pl.Series("file_source", new_sources).alias("file_source"),
     )
-
+
     # Log summary
     if updated_count > 0:
         self.logger.info(f"Updated file_source for {updated_count} samples")
@@ -990,9 +1002,9 @@ def features_select(
 ):
     """
     Select features from features_df based on specified criteria and return the filtered DataFrame.
-
+
     OPTIMIZED VERSION: Combines all filters into a single operation for better performance.
-
+
     Parameters:
         mz: m/z range filter (tuple for range, single value for minimum)
         rt: retention time range filter (tuple for range, single value for minimum)
@@ -1007,30 +1019,42 @@ def features_select(
         chrom_prominence: chromatogram prominence filter (tuple for range, single value for minimum)
         chrom_prominence_scaled: scaled chromatogram prominence filter (tuple for range, single value for minimum)
         chrom_height_scaled: scaled chromatogram height filter (tuple for range, single value for minimum)
-
+
     Returns:
         polars.DataFrame: Filtered features DataFrame
     """
     if self.features_df is None or self.features_df.is_empty():
         self.logger.warning("No features found in study.")
         return pl.DataFrame()
-
+
     # Early return if no filters provided - performance optimization
-    filter_params = [mz, rt, inty, sample_uid, sample_name, consensus_uid,
-                     feature_uid, filled, quality, chrom_coherence,
-                     chrom_prominence, chrom_prominence_scaled, chrom_height_scaled]
+    filter_params = [
+        mz,
+        rt,
+        inty,
+        sample_uid,
+        sample_name,
+        consensus_uid,
+        feature_uid,
+        filled,
+        quality,
+        chrom_coherence,
+        chrom_prominence,
+        chrom_prominence_scaled,
+        chrom_height_scaled,
+    ]
     if all(param is None for param in filter_params):
         return self.features_df.clone()
-
+
     initial_count = len(self.features_df)
-
+
     # Pre-check available columns once for efficiency
     available_columns = set(self.features_df.columns)
-
+
     # Build all filter conditions first, then apply them all at once
     filter_conditions = []
     warnings = []
-
+
     # Filter by m/z
     if mz is not None:
         if isinstance(mz, tuple) and len(mz) == 2:
@@ -1038,7 +1062,7 @@ def features_select(
             filter_conditions.append((pl.col("mz") >= min_mz) & (pl.col("mz") <= max_mz))
         else:
             filter_conditions.append(pl.col("mz") >= mz)
-
+
     # Filter by retention time
     if rt is not None:
         if isinstance(rt, tuple) and len(rt) == 2:
@@ -1046,7 +1070,7 @@ def features_select(
             filter_conditions.append((pl.col("rt") >= min_rt) & (pl.col("rt") <= max_rt))
         else:
             filter_conditions.append(pl.col("rt") >= rt)
-
+
     # Filter by intensity
     if inty is not None:
         if isinstance(inty, tuple) and len(inty) == 2:
@@ -1054,7 +1078,7 @@ def features_select(
             filter_conditions.append((pl.col("inty") >= min_inty) & (pl.col("inty") <= max_inty))
         else:
             filter_conditions.append(pl.col("inty") >= inty)
-
+
     # Filter by sample_uid
     if sample_uid is not None:
         if isinstance(sample_uid, (list, tuple)):
@@ -1067,24 +1091,24 @@ def features_select(
             filter_conditions.append(pl.col("sample_uid").is_in(sample_uid))
         else:
             filter_conditions.append(pl.col("sample_uid") == sample_uid)
-
+
     # Filter by sample_name (requires pre-processing)
     if sample_name is not None:
         # Get sample_uids for the given sample names
         if isinstance(sample_name, list):
             sample_uids_for_names = self.samples_df.filter(
-                pl.col("sample_name").is_in(sample_name)
+                pl.col("sample_name").is_in(sample_name),
             )["sample_uid"].to_list()
         else:
             sample_uids_for_names = self.samples_df.filter(
-                pl.col("sample_name") == sample_name
+                pl.col("sample_name") == sample_name,
             )["sample_uid"].to_list()
-
+
         if sample_uids_for_names:
             filter_conditions.append(pl.col("sample_uid").is_in(sample_uids_for_names))
         else:
             filter_conditions.append(pl.lit(False))  # No matching samples
-
+
     # Filter by consensus_uid
     if consensus_uid is not None:
         if isinstance(consensus_uid, (list, tuple)):
@@ -1097,7 +1121,7 @@ def features_select(
             filter_conditions.append(pl.col("consensus_uid").is_in(consensus_uid))
         else:
             filter_conditions.append(pl.col("consensus_uid") == consensus_uid)
-
+
     # Filter by feature_uid
     if feature_uid is not None:
         if isinstance(feature_uid, (list, tuple)):
@@ -1110,7 +1134,7 @@ def features_select(
             filter_conditions.append(pl.col("feature_uid").is_in(feature_uid))
         else:
             filter_conditions.append(pl.col("feature_uid") == feature_uid)
-
+
     # Filter by filled status
     if filled is not None:
         if "filled" in available_columns:
@@ -1120,7 +1144,7 @@ def features_select(
                 filter_conditions.append(~pl.col("filled") | pl.col("filled").is_null())
         else:
             warnings.append("'filled' column not found in features_df")
-
+
     # Filter by quality
     if quality is not None:
         if "quality" in available_columns:
@@ -1131,73 +1155,83 @@ def features_select(
                 filter_conditions.append(pl.col("quality") >= quality)
         else:
             warnings.append("'quality' column not found in features_df")
-
+
     # Filter by chromatogram coherence
     if chrom_coherence is not None:
         if "chrom_coherence" in available_columns:
             if isinstance(chrom_coherence, tuple) and len(chrom_coherence) == 2:
                 min_coherence, max_coherence = chrom_coherence
-                filter_conditions.append((pl.col("chrom_coherence") >= min_coherence) & (pl.col("chrom_coherence") <= max_coherence))
+                filter_conditions.append(
+                    (pl.col("chrom_coherence") >= min_coherence) & (pl.col("chrom_coherence") <= max_coherence)
+                )
             else:
                 filter_conditions.append(pl.col("chrom_coherence") >= chrom_coherence)
         else:
             warnings.append("'chrom_coherence' column not found in features_df")
-
+
     # Filter by chromatogram prominence
     if chrom_prominence is not None:
         if "chrom_prominence" in available_columns:
             if isinstance(chrom_prominence, tuple) and len(chrom_prominence) == 2:
                 min_prominence, max_prominence = chrom_prominence
-                filter_conditions.append((pl.col("chrom_prominence") >= min_prominence) & (pl.col("chrom_prominence") <= max_prominence))
+                filter_conditions.append(
+                    (pl.col("chrom_prominence") >= min_prominence) & (pl.col("chrom_prominence") <= max_prominence)
+                )
             else:
                 filter_conditions.append(pl.col("chrom_prominence") >= chrom_prominence)
         else:
             warnings.append("'chrom_prominence' column not found in features_df")
-
+
     # Filter by scaled chromatogram prominence
     if chrom_prominence_scaled is not None:
         if "chrom_prominence_scaled" in available_columns:
             if isinstance(chrom_prominence_scaled, tuple) and len(chrom_prominence_scaled) == 2:
                 min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled
-                filter_conditions.append((pl.col("chrom_prominence_scaled") >= min_prominence_scaled) & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled))
+                filter_conditions.append(
+                    (pl.col("chrom_prominence_scaled") >= min_prominence_scaled)
+                    & (pl.col("chrom_prominence_scaled") <= max_prominence_scaled)
+                )
             else:
                 filter_conditions.append(pl.col("chrom_prominence_scaled") >= chrom_prominence_scaled)
         else:
             warnings.append("'chrom_prominence_scaled' column not found in features_df")
-
+
     # Filter by scaled chromatogram height
     if chrom_height_scaled is not None:
         if "chrom_height_scaled" in available_columns:
             if isinstance(chrom_height_scaled, tuple) and len(chrom_height_scaled) == 2:
                 min_height_scaled, max_height_scaled = chrom_height_scaled
-                filter_conditions.append((pl.col("chrom_height_scaled") >= min_height_scaled) & (pl.col("chrom_height_scaled") <= max_height_scaled))
+                filter_conditions.append(
+                    (pl.col("chrom_height_scaled") >= min_height_scaled)
+                    & (pl.col("chrom_height_scaled") <= max_height_scaled)
+                )
             else:
                 filter_conditions.append(pl.col("chrom_height_scaled") >= chrom_height_scaled)
         else:
             warnings.append("'chrom_height_scaled' column not found in features_df")
-
+
     # Log all warnings once at the end for efficiency
     for warning in warnings:
         self.logger.warning(warning)
-
+
     # Apply all filters at once using lazy evaluation for optimal performance
     if filter_conditions:
         # Combine all conditions with AND
         combined_filter = filter_conditions[0]
         for condition in filter_conditions[1:]:
             combined_filter = combined_filter & condition
-
+
         # Apply the combined filter using lazy evaluation
         feats = self.features_df.lazy().filter(combined_filter).collect()
     else:
         feats = self.features_df.clone()
-
+
     final_count = len(feats)
-
+
     if final_count == 0:
         self.logger.warning("No features remaining after applying selection criteria.")
     else:
-        #removed_count = initial_count - final_count
+        # removed_count = initial_count - final_count
         self.logger.info(f"Features selected: {final_count} (out of {initial_count})")

     return feats
@@ -1207,29 +1241,29 @@ def features_filter(self, features):
     """
     Filter features_df by keeping only features that match the given criteria.
     This keeps only the specified features and removes all others.
-
+
     OPTIMIZED VERSION: Batch operations and reduced overhead for better performance.
-
+
     Parameters:
         features: Features to keep. Can be:
             - polars.DataFrame: Features DataFrame (will use feature_uid column)
             - list: List of feature_uids to keep
            - int: Single feature_uid to keep
-
+
     Returns:
         None (modifies self.features_df in place)
     """
     if self.features_df is None or self.features_df.is_empty():
         self.logger.warning("No features found in study.")
         return
-
+
     # Early return if no features provided
     if features is None:
         self.logger.warning("No features provided for filtering.")
         return
-
+
     initial_count = len(self.features_df)
-
+
     # Determine feature_uids to keep - optimized type checking
     if isinstance(features, pl.DataFrame):
         if "feature_uid" not in features.columns:
@@ -1243,44 +1277,41 @@ def features_filter(self, features):
     else:
         self.logger.error("features parameter must be a DataFrame, list, tuple, or int")
         return
-
+
     # Early return if no UIDs to keep
     if not feature_uids_to_keep:
         self.logger.warning("No feature UIDs provided for filtering.")
         return
-
+
     # Convert to set for faster lookup if list is large
     if len(feature_uids_to_keep) > 100:
         feature_uids_set = set(feature_uids_to_keep)
         # Use the set for filtering if it's significantly smaller
         if len(feature_uids_set) < len(feature_uids_to_keep) * 0.8:
             feature_uids_to_keep = list(feature_uids_set)
-
+
     # Create filter condition once - keep only the specified features
     filter_condition = pl.col("feature_uid").is_in(feature_uids_to_keep)
-
+
     # Apply filter to features_df using lazy evaluation for better performance
     self.features_df = self.features_df.lazy().filter(filter_condition).collect()
-
+
     # Apply filter to consensus_mapping_df if it exists - batch operation
     mapping_removed_count = 0
     if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
         initial_mapping_count = len(self.consensus_mapping_df)
-        self.consensus_mapping_df = (
-            self.consensus_mapping_df
-            .lazy()
-            .filter(filter_condition)
-            .collect()
-        )
+        self.consensus_mapping_df = self.consensus_mapping_df.lazy().filter(filter_condition).collect()
         mapping_removed_count = initial_mapping_count - len(self.consensus_mapping_df)
-
+
     # Calculate results once and log efficiently
     final_count = len(self.features_df)
     removed_count = initial_count - final_count
-
+
     # Single comprehensive log message
     if mapping_removed_count > 0:
-        self.logger.info(f"Kept {final_count} features and removed {mapping_removed_count} consensus mappings. Filtered out {removed_count} features.")
+        self.logger.info(
+            f"Kept {final_count} features and removed {mapping_removed_count} consensus mappings. Filtered out {removed_count} features."
+        )
     else:
         self.logger.info(f"Kept {final_count} features. Filtered out {removed_count} features.")

@@ -1289,27 +1320,27 @@ def features_delete(self, features):
     """
     Delete features from features_df based on feature identifiers.
     This removes the specified features and keeps all others (opposite of features_filter).
-
+
     Parameters:
         features: Features to delete. Can be:
             - polars.DataFrame: Features DataFrame (will use feature_uid column)
             - list: List of feature_uids to delete
            - int: Single feature_uid to delete
-
+
     Returns:
         None (modifies self.features_df in place)
     """
     if self.features_df is None or self.features_df.is_empty():
         self.logger.warning("No features found in study.")
         return
-
+
     # Early return if no features provided
     if features is None:
         self.logger.warning("No features provided for deletion.")
         return
-
+
     initial_count = len(self.features_df)
-
+
     # Determine feature_uids to remove - optimized type checking
     if isinstance(features, pl.DataFrame):
         if "feature_uid" not in features.columns:
@@ -1323,44 +1354,41 @@ def features_delete(self, features):
     else:
         self.logger.error("features parameter must be a DataFrame, list, tuple, or int")
         return
-
+
     # Early return if no UIDs to remove
     if not feature_uids_to_remove:
         self.logger.warning("No feature UIDs provided for deletion.")
         return
-
+
     # Convert to set for faster lookup if list is large
     if len(feature_uids_to_remove) > 100:
         feature_uids_set = set(feature_uids_to_remove)
         # Use the set for filtering if it's significantly smaller
         if len(feature_uids_set) < len(feature_uids_to_remove) * 0.8:
             feature_uids_to_remove = list(feature_uids_set)
-
+
     # Create filter condition - remove specified features
     filter_condition = ~pl.col("feature_uid").is_in(feature_uids_to_remove)
-
+
     # Apply filter to features_df using lazy evaluation for better performance
     self.features_df = self.features_df.lazy().filter(filter_condition).collect()
-
+
     # Apply filter to consensus_mapping_df if it exists - batch operation
     mapping_removed_count = 0
     if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
         initial_mapping_count = len(self.consensus_mapping_df)
-        self.consensus_mapping_df = (
-            self.consensus_mapping_df
-            .lazy()
-            .filter(filter_condition)
-            .collect()
-        )
+        self.consensus_mapping_df = self.consensus_mapping_df.lazy().filter(filter_condition).collect()
         mapping_removed_count = initial_mapping_count - len(self.consensus_mapping_df)
-
+
     # Calculate results once and log efficiently
     final_count = len(self.features_df)
     removed_count = initial_count - final_count
-
+
     # Single comprehensive log message
     if mapping_removed_count > 0:
-        self.logger.info(f"Deleted {removed_count} features and {mapping_removed_count} consensus mappings. Remaining features: {final_count}")
+        self.logger.info(
+            f"Deleted {removed_count} features and {mapping_removed_count} consensus mappings. Remaining features: {final_count}"
        )
     else:
         self.logger.info(f"Deleted {removed_count} features. Remaining features: {final_count}")

@@ -1384,7 +1412,7 @@ def consensus_select(
 ):
     """
     Select consensus features from consensus_df based on specified criteria and return the filtered DataFrame.
-
+
     Parameters:
         mz: m/z range filter (tuple for range, single value for minimum)
         rt: retention time range filter (tuple for range, single value for minimum)
@@ -1400,17 +1428,17 @@ def consensus_select(
         chrom_prominence_scaled_mean: mean scaled chromatogram prominence filter (tuple for range, single value for minimum)
         chrom_height_scaled_mean: mean scaled chromatogram height filter (tuple for range, single value for minimum)
         rt_delta_mean: mean RT delta filter (tuple for range, single value for minimum)
-
+
     Returns:
         polars.DataFrame: Filtered consensus DataFrame
     """
     if self.consensus_df is None or self.consensus_df.is_empty():
         self.logger.warning("No consensus features found in study.")
         return pl.DataFrame()
-
+
     consensus = self.consensus_df.clone()
     initial_count = len(consensus)
-
+
     # Filter by m/z
     if mz is not None:
         consensus_len_before_filter = len(consensus)
@@ -1420,9 +1448,9 @@ def consensus_select(
         else:
             consensus = consensus.filter(pl.col("mz") >= mz)
         self.logger.debug(
-            f"Selected consensus by mz. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            f"Selected consensus by mz. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
-
+
     # Filter by retention time
     if rt is not None:
         consensus_len_before_filter = len(consensus)
@@ -1432,9 +1460,9 @@ def consensus_select(
         else:
             consensus = consensus.filter(pl.col("rt") >= rt)
         self.logger.debug(
-            f"Selected consensus by rt. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            f"Selected consensus by rt. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
-
+
     # Filter by mean intensity
     if inty_mean is not None:
         consensus_len_before_filter = len(consensus)
@@ -1444,9 +1472,9 @@ def consensus_select(
         else:
             consensus = consensus.filter(pl.col("inty_mean") >= inty_mean)
         self.logger.debug(
-            f"Selected consensus by inty_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            f"Selected consensus by inty_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
-
+
     # Filter by consensus_uid
     if consensus_uid is not None:
         consensus_len_before_filter = len(consensus)
@@ -1454,16 +1482,18 @@ def consensus_select(
             if len(consensus_uid) == 2 and not isinstance(consensus_uid, list):
                 # Treat as range
                 min_uid, max_uid = consensus_uid
-                consensus = consensus.filter((pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid))
+                consensus = consensus.filter(
+                    (pl.col("consensus_uid") >= min_uid) & (pl.col("consensus_uid") <= max_uid)
+                )
             else:
                 # Treat as list
                 consensus = consensus.filter(pl.col("consensus_uid").is_in(consensus_uid))
         else:
             consensus = consensus.filter(pl.col("consensus_uid") == consensus_uid)
         self.logger.debug(
-            f"Selected consensus by consensus_uid. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            f"Selected consensus by consensus_uid. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
-
+
     # Filter by consensus_id
     if consensus_id is not None:
         consensus_len_before_filter = len(consensus)
@@ -1472,21 +1502,23 @@ def consensus_select(
         else:
             consensus = consensus.filter(pl.col("consensus_id") == consensus_id)
         self.logger.debug(
-            f"Selected consensus by consensus_id. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            f"Selected consensus by consensus_id. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
-
+
     # Filter by number of samples
     if number_samples is not None:
         consensus_len_before_filter = len(consensus)
         if isinstance(number_samples, tuple) and len(number_samples) == 2:
             min_samples, max_samples = number_samples
-            consensus = consensus.filter((pl.col("number_samples") >= min_samples) & (pl.col("number_samples") <= max_samples))
+            consensus = consensus.filter(
+                (pl.col("number_samples") >= min_samples) & (pl.col("number_samples") <= max_samples)
+            )
         else:
             consensus = consensus.filter(pl.col("number_samples") >= number_samples)
         self.logger.debug(
-            f"Selected consensus by number_samples. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            f"Selected consensus by number_samples. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
-
+
     # Filter by number of MS2 spectra
     if number_ms2 is not None:
         consensus_len_before_filter = len(consensus)
@@ -1499,9 +1531,9 @@ def consensus_select(
         else:
             self.logger.warning("'number_ms2' column not found in consensus_df")
         self.logger.debug(
-            f"Selected consensus by number_ms2. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            f"Selected consensus by number_ms2. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
-
+
     # Filter by quality
     if quality is not None:
         consensus_len_before_filter = len(consensus)
@@ -1511,9 +1543,9 @@ def consensus_select(
         else:
             consensus = consensus.filter(pl.col("quality") >= quality)
         self.logger.debug(
-            f"Selected consensus by quality. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            f"Selected consensus by quality. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
-
+
     # Filter by baseline
     if bl is not None:
         consensus_len_before_filter = len(consensus)
@@ -1526,89 +1558,103 @@ def consensus_select(
         else:
             self.logger.warning("'bl' column not found in consensus_df")
         self.logger.debug(
-            f"Selected consensus by bl. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            f"Selected consensus by bl. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
-
+
     # Filter by mean chromatogram coherence
     if chrom_coherence_mean is not None:
         consensus_len_before_filter = len(consensus)
         if "chrom_coherence_mean" in consensus.columns:
             if isinstance(chrom_coherence_mean, tuple) and len(chrom_coherence_mean) == 2:
                 min_coherence, max_coherence = chrom_coherence_mean
-                consensus = consensus.filter((pl.col("chrom_coherence_mean") >= min_coherence) & (pl.col("chrom_coherence_mean") <= max_coherence))
+                consensus = consensus.filter(
+                    (pl.col("chrom_coherence_mean") >= min_coherence)
+                    & (pl.col("chrom_coherence_mean") <= max_coherence)
+                )
             else:
                 consensus = consensus.filter(pl.col("chrom_coherence_mean") >= chrom_coherence_mean)
         else:
             self.logger.warning("'chrom_coherence_mean' column not found in consensus_df")
         self.logger.debug(
-            f"Selected consensus by chrom_coherence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            f"Selected consensus by chrom_coherence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
-
+
     # Filter by mean chromatogram prominence
     if chrom_prominence_mean is not None:
         consensus_len_before_filter = len(consensus)
         if "chrom_prominence_mean" in consensus.columns:
             if isinstance(chrom_prominence_mean, tuple) and len(chrom_prominence_mean) == 2:
                 min_prominence, max_prominence = chrom_prominence_mean
-                consensus = consensus.filter((pl.col("chrom_prominence_mean") >= min_prominence) & (pl.col("chrom_prominence_mean") <= max_prominence))
+                consensus = consensus.filter(
+                    (pl.col("chrom_prominence_mean") >= min_prominence)
+                    & (pl.col("chrom_prominence_mean") <= max_prominence)
+                )
             else:
                 consensus = consensus.filter(pl.col("chrom_prominence_mean") >= chrom_prominence_mean)
         else:
             self.logger.warning("'chrom_prominence_mean' column not found in consensus_df")
         self.logger.debug(
-            f"Selected consensus by chrom_prominence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            f"Selected consensus by chrom_prominence_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
        )
-
+
     # Filter by mean scaled chromatogram prominence
     if chrom_prominence_scaled_mean is not None:
         consensus_len_before_filter = len(consensus)
         if "chrom_prominence_scaled_mean" in consensus.columns:
             if isinstance(chrom_prominence_scaled_mean, tuple) and len(chrom_prominence_scaled_mean) == 2:
                 min_prominence_scaled, max_prominence_scaled = chrom_prominence_scaled_mean
-                consensus = consensus.filter((pl.col("chrom_prominence_scaled_mean") >= min_prominence_scaled) & (pl.col("chrom_prominence_scaled_mean") <= max_prominence_scaled))
+                consensus = consensus.filter(
+                    (pl.col("chrom_prominence_scaled_mean") >= min_prominence_scaled)
+                    & (pl.col("chrom_prominence_scaled_mean") <= max_prominence_scaled)
+                )
             else:
                 consensus = consensus.filter(pl.col("chrom_prominence_scaled_mean") >= chrom_prominence_scaled_mean)
         else:
             self.logger.warning("'chrom_prominence_scaled_mean' column not found in consensus_df")
         self.logger.debug(
-            f"Selected consensus by chrom_prominence_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            f"Selected consensus by chrom_prominence_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
-
+
     # Filter by mean scaled chromatogram height
     if chrom_height_scaled_mean is not None:
         consensus_len_before_filter = len(consensus)
         if "chrom_height_scaled_mean" in consensus.columns:
             if isinstance(chrom_height_scaled_mean, tuple) and len(chrom_height_scaled_mean) == 2:
                 min_height_scaled, max_height_scaled = chrom_height_scaled_mean
-                consensus = consensus.filter((pl.col("chrom_height_scaled_mean") >= min_height_scaled) & (pl.col("chrom_height_scaled_mean") <= max_height_scaled))
+                consensus = consensus.filter(
+                    (pl.col("chrom_height_scaled_mean") >= min_height_scaled)
+                    & (pl.col("chrom_height_scaled_mean") <= max_height_scaled)
+                )
             else:
                 consensus = consensus.filter(pl.col("chrom_height_scaled_mean") >= chrom_height_scaled_mean)
         else:
             self.logger.warning("'chrom_height_scaled_mean' column not found in consensus_df")
         self.logger.debug(
-            f"Selected consensus by chrom_height_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            f"Selected consensus by chrom_height_scaled_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
-
+
     # Filter by mean RT delta
     if rt_delta_mean is not None:
         consensus_len_before_filter = len(consensus)
         if "rt_delta_mean" in consensus.columns:
             if isinstance(rt_delta_mean, tuple) and len(rt_delta_mean) == 2:
                 min_rt_delta, max_rt_delta = rt_delta_mean
-                consensus = consensus.filter((pl.col("rt_delta_mean") >= min_rt_delta) & (pl.col("rt_delta_mean") <= max_rt_delta))
+                consensus = consensus.filter(
+                    (pl.col("rt_delta_mean") >= min_rt_delta) & (pl.col("rt_delta_mean") <= max_rt_delta)
+                )
             else:
                 consensus = consensus.filter(pl.col("rt_delta_mean") >= rt_delta_mean)
         else:
             self.logger.warning("'rt_delta_mean' column not found in consensus_df")
         self.logger.debug(
-            f"Selected consensus by rt_delta_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}"
+            f"Selected consensus by rt_delta_mean. Consensus removed: {consensus_len_before_filter - len(consensus)}",
         )
-
+
     if len(consensus) == 0:
         self.logger.warning("No consensus features remaining after applying selection criteria.")
     else:
         self.logger.info(f"Selected consensus features. Features remaining: {len(consensus)} (from {initial_count})")
-
+
     return consensus


@@ -1616,22 +1662,22 @@ def consensus_filter(self, consensus):
     """
     Filter consensus_df by removing all consensus features that match the given criteria.
     This also removes related entries from consensus_mapping_df, features_df, and consensus_ms2.
-
+
     Parameters:
         consensus: Consensus features to remove. Can be:
             - polars.DataFrame: Consensus DataFrame (will use consensus_uid column)
             - list: List of consensus_uids to remove
            - int: Single consensus_uid to remove
-
+
     Returns:
         None (modifies self.consensus_df and related DataFrames in place)
     """
     if self.consensus_df is None or self.consensus_df.is_empty():
         self.logger.warning("No consensus features found in study.")
         return
-
+
     initial_consensus_count = len(self.consensus_df)
-
+
     # Determine consensus_uids to remove
     if isinstance(consensus, pl.DataFrame):
         if "consensus_uid" not in consensus.columns:
@@ -1645,68 +1691,70 @@ def consensus_filter(self, consensus):
     else:
         self.logger.error("consensus parameter must be a DataFrame, list, or int")
         return
-
+
     if not consensus_uids_to_remove:
         self.logger.warning("No consensus UIDs provided for filtering.")
         return
-
+
     # Get feature_uids that need to be removed from features_df
     feature_uids_to_remove = []
     if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
         feature_uids_to_remove = self.consensus_mapping_df.filter(
-            pl.col("consensus_uid").is_in(consensus_uids_to_remove)
+            pl.col("consensus_uid").is_in(consensus_uids_to_remove),
         )["feature_uid"].to_list()
-
+
     # Remove consensus features from consensus_df
     self.consensus_df = self.consensus_df.filter(
-        ~pl.col("consensus_uid").is_in(consensus_uids_to_remove)
+        ~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
     )
-
+
     # Remove from consensus_mapping_df
     if self.consensus_mapping_df is not None and not self.consensus_mapping_df.is_empty():
         initial_mapping_count = len(self.consensus_mapping_df)
         self.consensus_mapping_df = self.consensus_mapping_df.filter(
-            ~pl.col("consensus_uid").is_in(consensus_uids_to_remove)
+            ~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
        )
         removed_mapping_count = initial_mapping_count - len(self.consensus_mapping_df)
         if removed_mapping_count > 0:
             self.logger.debug(f"Removed {removed_mapping_count} entries from consensus_mapping_df")
-
+
     # Remove corresponding features from features_df
     if feature_uids_to_remove and self.features_df is not None and not self.features_df.is_empty():
         initial_features_count = len(self.features_df)
         self.features_df = self.features_df.filter(
-            ~pl.col("feature_uid").is_in(feature_uids_to_remove)
+            ~pl.col("feature_uid").is_in(feature_uids_to_remove),
         )
         removed_features_count = initial_features_count - len(self.features_df)
         if removed_features_count > 0:
             self.logger.debug(f"Removed {removed_features_count} entries from features_df")
-
+
     # Remove from consensus_ms2 if it exists
-    if hasattr(self, 'consensus_ms2') and self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
+    if hasattr(self, "consensus_ms2") and self.consensus_ms2 is not None and not self.consensus_ms2.is_empty():
         initial_ms2_count = len(self.consensus_ms2)
         self.consensus_ms2 = self.consensus_ms2.filter(
-            ~pl.col("consensus_uid").is_in(consensus_uids_to_remove)
+            ~pl.col("consensus_uid").is_in(consensus_uids_to_remove),
         )
         removed_ms2_count = initial_ms2_count - len(self.consensus_ms2)
         if removed_ms2_count > 0:
             self.logger.debug(f"Removed {removed_ms2_count} entries from consensus_ms2")
-
+
     removed_consensus_count = initial_consensus_count - len(self.consensus_df)
-    self.logger.info(f"Filtered {removed_consensus_count} consensus features. Remaining consensus: {len(self.consensus_df)}")
+    self.logger.info(
+        f"Filtered {removed_consensus_count} consensus features. Remaining consensus: {len(self.consensus_df)}"
    )


 def consensus_delete(self, consensus):
     """
     Delete consensus features from consensus_df based on consensus identifiers.
     This is an alias for consensus_filter for consistency with other delete methods.
-
+
     Parameters:
         consensus: Consensus features to delete. Can be:
             - polars.DataFrame: Consensus DataFrame (will use consensus_uid column)
             - list: List of consensus_uids to delete
            - int: Single consensus_uid to delete
-
+
     Returns:
         None (modifies self.consensus_df and related DataFrames in place)
     """