masster 0.5.1__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of masster might be problematic.

masster/study/load.py CHANGED
@@ -39,8 +39,7 @@ def add(
  folder=None,
  reset=False,
  adducts=None,
- max_files=None,
- fast=True,
+ max_files=None
  ):
  """Add samples from a folder to the study.
 
@@ -52,8 +51,6 @@ def add(
  adducts (optional): Adducts to use for sample loading. Defaults to None.
  max_files (int, optional): Maximum number of files to process.
  Defaults to None (no limit).
- fast (bool, optional): Whether to use optimized loading that skips ms1_df
- for better performance. Defaults to True.
  """
  if folder is None:
  if self.folder is not None:
@@ -122,12 +119,11 @@ def add(
  self.logger.debug(
  f"Batch processing {len(files_to_process)} {ext} files",
  )
- successful = self._add_samples_batch(
+ successful = _add_samples_batch(self,
  files_to_process,
  reset=reset,
  adducts=adducts,
- blacklist=blacklist,
- fast=fast,
+ blacklist=blacklist
  )
  counter += successful
  if successful > 0:
@@ -149,8 +145,7 @@ def add(
  return f"Added {counter} samples to study"
 
 
- # TODO type is not used
- def add_sample(self, file, type=None, reset=False, adducts=None, fast=True):
+ def add_sample(self, file, type=None, reset=False, adducts=None):
  """
  Add a single sample to the study.
 
@@ -165,26 +160,16 @@ def add_sample(self, file, type=None, reset=False, adducts=None, fast=True):
  Returns:
  bool: True if successful, False otherwise.
  """
- if fast:
- # Use optimized method for better performance
- success = self._add_sample_optimized(
- file,
- type=type,
- reset=reset,
- adducts=adducts,
- skip_color_reset=False, # Do color reset for individual calls
- skip_schema_check=True, # Skip schema check for performance (safe with diagonal concat)
- )
- else:
- # Use standard method with full ms1_df loading
- success = self._add_sample_standard(
- file,
- type=type,
- reset=reset,
- adducts=adducts,
- skip_color_reset=False, # Do color reset for individual calls
- skip_schema_check=True, # Skip schema check for performance
- )
+
+ success = self._add_sample_optimized(
+ file,
+ type=type,
+ reset=reset,
+ adducts=adducts,
+ skip_color_reset=False, # Do color reset for individual calls
+ skip_schema_check=True, # Skip schema check for performance (safe with diagonal concat)
+ )
+
 
  return success
 
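For orientation, a minimal call-site sketch of the API change above; `study` is assumed to be an existing masster Study object and the paths are illustrative only:

# Illustrative only: in 0.5.4 the `fast` keyword is gone from add() and add_sample();
# loading now always goes through the optimized path.
study.add(folder="./data", max_files=10)
ok = study.add_sample("./data/sample_01.raw", type="sample")  # returns True on success, False otherwise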
@@ -311,11 +296,12 @@ def _fill_chrom_single_impl(
  # Process each sample individually
  # Group missing combinations by sample for efficient processing
  missing_by_sample = {}
- for consensus_uid, sample_uid, sample_name, sample_path in missing_combinations:
+ for consensus_uid, sample_uid, sample_name, sample_path, sample_source in missing_combinations:
  if sample_name not in missing_by_sample:
  missing_by_sample[sample_name] = {
  "sample_uid": sample_uid,
  "sample_path": sample_path,
+ "sample_source": sample_source,
  "missing_consensus_uids": [],
  }
  missing_by_sample[sample_name]["missing_consensus_uids"].append(consensus_uid)
@@ -338,13 +324,23 @@ def _fill_chrom_single_impl(
  # Load this sample
  sample_uid = sample_info["sample_uid"]
  sample_path = sample_info["sample_path"]
+ sample_source = sample_info["sample_source"]
  missing_consensus_uids = sample_info["missing_consensus_uids"]
 
  try:
- # self.logger.debug(f"Loading sample: {sample_path}")
- file = Sample()
- file.logger_update("WARNING")
- file.load(sample_path)
+ # Load this sample using study._load_ms1() as suggested by user
+ # Use sample_path (points to .sample5 files) not sample_source (points to .raw files)
+ ms1_data = self._load_ms1(filename=sample_path)
+ if ms1_data is None or ms1_data.is_empty():
+ self.logger.warning(f"No MS1 data found for sample {sample_name}")
+ continue
+
+ # Create a temporary object to hold the MS1 data for processing
+ class TempSample:
+ def __init__(self, ms1_df):
+ self.ms1_df = ms1_df
+
+ file = TempSample(ms1_data)
  except Exception as e:
  self.logger.warning(f"Failed to load sample {sample_name}: {e}")
  continue
@@ -363,12 +359,50 @@ def _fill_chrom_single_impl(
 
  # Filter MS1 data for this feature
  if hasattr(file, "ms1_df") and not file.ms1_df.is_empty():
- d = file.ms1_df.filter(
- (pl.col("mz") >= mz - mz_tol)
- & (pl.col("mz") <= mz + mz_tol)
- & (pl.col("rt") >= rt_start_mean - rt_tol)
- & (pl.col("rt") <= rt_end_mean + rt_tol),
- )
+ # Special handling for RT=0 (library-derived features)
+ if rt == 0.0:
+ # Step 1: Retrieve full chromatogram for the m/z
+ d_full = file.ms1_df.filter(
+ (pl.col("mz") >= mz - mz_tol)
+ & (pl.col("mz") <= mz + mz_tol)
+ )
+
+ if not d_full.is_empty():
+ # Step 2: Find maximum intensity and its RT
+ max_inty_row = d_full.filter(
+ pl.col("inty") == d_full["inty"].max()
+ ).head(1)
+
+ if not max_inty_row.is_empty():
+ max_rt = max_inty_row["rt"].item()
+
+ # Get eic_rt_tol from sample parameters if available
+ eic_rt_tol = rt_tol # Default fallback
+ if hasattr(file, 'parameters') and hasattr(file.parameters, 'eic_rt_tol'):
+ eic_rt_tol = file.parameters.eic_rt_tol
+
+ # Step 3: Trim around max intensity using eic_rt_tol
+ d = d_full.filter(
+ (pl.col("rt") >= max_rt - eic_rt_tol)
+ & (pl.col("rt") <= max_rt + eic_rt_tol)
+ )
+
+ # Update consensus RT info based on discovered peak
+ rt = max_rt
+ rt_start_mean = max_rt - eic_rt_tol
+ rt_end_mean = max_rt + eic_rt_tol
+ else:
+ d = pl.DataFrame()
+ else:
+ d = pl.DataFrame()
+ else:
+ # Normal RT-based filtering for non-zero RT
+ d = file.ms1_df.filter(
+ (pl.col("mz") >= mz - mz_tol)
+ & (pl.col("mz") <= mz + mz_tol)
+ & (pl.col("rt") >= rt_start_mean - rt_tol)
+ & (pl.col("rt") <= rt_end_mean + rt_tol),
+ )
  else:
  d = pl.DataFrame()
 
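The RT=0 branch added above builds the EIC differently from normal features: it takes the full trace for the m/z window, treats the most intense point as the apex, and trims to apex ± eic_rt_tol. A self-contained polars sketch of that logic (function name and signature are illustrative, column names follow the diff):

import polars as pl

def eic_for_library_feature(ms1_df: pl.DataFrame, mz: float, mz_tol: float, eic_rt_tol: float) -> pl.DataFrame:
    # Step 1: full chromatogram for the m/z window
    d_full = ms1_df.filter((pl.col("mz") >= mz - mz_tol) & (pl.col("mz") <= mz + mz_tol))
    if d_full.is_empty():
        return pl.DataFrame()
    # Step 2: RT of the most intense point becomes the apex
    max_rt = d_full.filter(pl.col("inty") == d_full["inty"].max()).head(1)["rt"].item()
    # Step 3: trim the trace to apex ± eic_rt_tol
    return d_full.filter((pl.col("rt") >= max_rt - eic_rt_tol) & (pl.col("rt") <= max_rt + eic_rt_tol))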
@@ -579,6 +613,134 @@ def fill_single(self, **kwargs):
579
613
  )
580
614
 
581
615
 
616
+ def _build_rt_correction_mapping_per_sample(self, sample_uid):
617
+ """
618
+ Pre-compute RT correction mapping for a sample by getting all non-filled features.
619
+ This avoids repeated DataFrame filtering for each feature.
620
+
621
+ Args:
622
+ sample_uid: Sample UID to build mapping for
623
+
624
+ Returns:
625
+ Polars DataFrame with rt, rt_original, and rt_delta columns, sorted by rt
626
+ Returns empty DataFrame if no reference features found
627
+ """
628
+ # Get non-filled features from the same sample
629
+ if 'filled' in self.features_df.columns:
630
+ sample_features = self.features_df.filter(
631
+ (pl.col('sample_uid') == sample_uid) &
632
+ (pl.col('filled') == False) &
633
+ (pl.col('rt_original').is_not_null()) &
634
+ (pl.col('rt').is_not_null())
635
+ )
636
+ else:
637
+ # If no filled column, assume all existing features are non-filled
638
+ sample_features = self.features_df.filter(
639
+ (pl.col('sample_uid') == sample_uid) &
640
+ (pl.col('rt_original').is_not_null()) &
641
+ (pl.col('rt').is_not_null())
642
+ )
643
+
644
+ if sample_features.is_empty():
645
+ return pl.DataFrame(schema={'rt': pl.Float64, 'rt_original': pl.Float64, 'rt_delta': pl.Float64})
646
+
647
+ # Pre-compute RT deltas and sort by RT for efficient lookup
648
+ rt_mapping = sample_features.select([
649
+ pl.col('rt'),
650
+ pl.col('rt_original'),
651
+ (pl.col('rt') - pl.col('rt_original')).alias('rt_delta')
652
+ ]).sort('rt')
653
+
654
+ return rt_mapping
655
+
656
+ def _estimate_rt_original_from_mapping(self, rt_mapping, target_rt):
657
+ """
658
+ Fast RT original estimation using pre-computed mapping.
659
+
660
+ Args:
661
+ rt_mapping: Pre-computed RT mapping DataFrame from _build_rt_correction_mapping_per_sample
662
+ target_rt: Target aligned RT for the filled feature
663
+
664
+ Returns:
665
+ Estimated rt_original value, or None if no mapping available
666
+ """
667
+ if rt_mapping.is_empty():
668
+ return None
669
+
670
+ # Find closest RT using vectorized operations
671
+ rt_mapping_with_diff = rt_mapping.with_columns([
672
+ (pl.col('rt') - target_rt).abs().alias('rt_diff')
673
+ ])
674
+
675
+ # Get the RT delta from the closest feature
676
+ closest_row = rt_mapping_with_diff.sort('rt_diff').head(1)
677
+ if closest_row.is_empty():
678
+ return None
679
+
680
+ closest_rt_delta = closest_row['rt_delta'].item()
681
+ return target_rt - closest_rt_delta
682
+
683
+
684
+ def _estimate_rt_original_for_filled_feature(self, sample_uid, target_rt, logger=None):
685
+ """
686
+ Estimate rt_original for a filled feature by finding the closest non-filled feature
687
+ from the same sample and using its RT delta (rt - rt_original).
688
+
689
+ Args:
690
+ sample_uid: Sample UID to search within
691
+ target_rt: Target aligned RT for the filled feature
692
+ logger: Optional logger for debug messages
693
+
694
+ Returns:
695
+ Estimated rt_original value, or None if no suitable reference found
696
+ """
697
+ # Get non-filled features from the same sample
698
+ if 'filled' in self.features_df.columns:
699
+ sample_features = self.features_df.filter(
700
+ (pl.col('sample_uid') == sample_uid) &
701
+ (pl.col('filled') == False) &
702
+ (pl.col('rt_original').is_not_null()) &
703
+ (pl.col('rt').is_not_null())
704
+ )
705
+ else:
706
+ # If no filled column, assume all existing features are non-filled
707
+ sample_features = self.features_df.filter(
708
+ (pl.col('sample_uid') == sample_uid) &
709
+ (pl.col('rt_original').is_not_null()) &
710
+ (pl.col('rt').is_not_null())
711
+ )
712
+
713
+ if sample_features.is_empty():
714
+ if logger:
715
+ logger.debug(f"No reference features found for sample {sample_uid} to estimate rt_original")
716
+ return None
717
+
718
+ # Calculate RT differences and find the closest feature
719
+ sample_features_with_diff = sample_features.with_columns([
720
+ (pl.col('rt') - target_rt).abs().alias('rt_diff'),
721
+ (pl.col('rt') - pl.col('rt_original')).alias('rt_delta')
722
+ ])
723
+
724
+ # Find the feature with minimum RT difference
725
+ closest_feature = sample_features_with_diff.sort('rt_diff').head(1)
726
+
727
+ if closest_feature.is_empty():
728
+ return None
729
+
730
+ # Get the RT delta from the closest feature
731
+ closest_rt_diff = closest_feature['rt_diff'].item()
732
+ closest_rt_delta = closest_feature['rt_delta'].item()
733
+
734
+ # Estimate rt_original using the same delta: rt_original = rt - rt_delta
735
+ estimated_rt_original = target_rt - closest_rt_delta
736
+
737
+ if self.logger:
738
+ self.logger.debug(f"Estimated rt_original={estimated_rt_original:.3f} for sample {sample_uid}, rt={target_rt:.3f} "
739
+ f"using closest feature (rt_diff={closest_rt_diff:.3f}, rt_delta={closest_rt_delta:.3f})")
740
+
741
+ return estimated_rt_original
742
+
743
+
582
744
  def _process_sample_for_parallel_fill(
583
745
  self,
584
746
  sample_info,
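The new rt_original helpers above estimate the pre-alignment retention time of a filled feature from the nearest non-filled feature of the same sample. A standalone sketch of that nearest-neighbour lookup, assuming a pre-computed mapping with rt, rt_original and rt_delta = rt - rt_original columns (as built by _build_rt_correction_mapping_per_sample):

import polars as pl

def estimate_rt_original(rt_mapping: pl.DataFrame, target_rt: float):
    # No reference features -> nothing to estimate
    if rt_mapping.is_empty():
        return None
    # Closest non-filled feature by aligned RT
    closest = (
        rt_mapping.with_columns((pl.col("rt") - target_rt).abs().alias("rt_diff"))
        .sort("rt_diff")
        .head(1)
    )
    # Apply the same alignment shift to the filled feature
    return target_rt - closest["rt_delta"].item()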
@@ -589,31 +751,606 @@ def _process_sample_for_parallel_fill(
589
751
  missing_combinations_df,
590
752
  features_df_max_uid,
591
753
  ):
592
- """Process a single sample for parallel gap filling."""
593
754
  sample_uid = sample_info["sample_uid"]
594
755
  sample_path = sample_info["sample_path"]
756
+ sample_source = sample_info["sample_source"]
595
757
 
596
758
  new_features: list[dict] = []
597
759
  new_mapping: list[dict] = []
598
760
  counter = 0
599
761
 
600
- try:
601
- # Load this sample
602
- file = Sample()
603
- file.logger_update(level="WARNING")
604
- file.load(sample_path)
605
- except Exception:
606
- # Skip this sample if loading fails
762
+ # Get missing features for this sample from precomputed combinations
763
+ sample_missing_df = missing_combinations_df.filter(pl.col("sample_uid") == sample_uid)
764
+ sample_consensus_uids = sample_missing_df["consensus_uid"].to_list()
765
+
766
+ if not sample_consensus_uids:
607
767
  return new_features, new_mapping, counter
608
768
 
609
- # Find missing features for this sample from precomputed combinations
610
- sample_missing = missing_combinations_df.filter(
611
- pl.col("sample_uid") == sample_uid,
612
- )["consensus_uid"].to_list()
769
+ # OPTIMIZATION: Pre-compute RT correction mapping per sample to avoid repeated DataFrame filtering
770
+ rt_mapping = _build_rt_correction_mapping_per_sample(self, sample_uid)
613
771
 
614
- if not sample_missing:
772
+ # OPTIMIZATION 1: Load MS1 data ONCE per sample instead of per feature
773
+ try:
774
+ ms1_data = self._load_ms1(filename=sample_path)
775
+ if ms1_data is None or ms1_data.is_empty():
776
+ # Create empty features for all missing consensus UIDs
777
+ for i, consensus_uid in enumerate(sample_consensus_uids):
778
+ info = consensus_info[consensus_uid]
779
+ empty_eic = Chromatogram(
780
+ rt=np.array([info["rt_start_mean"], info["rt_end_mean"]]),
781
+ inty=np.array([0.0, 0.0]),
782
+ label=f"EIC mz={info['mz']:.4f}",
783
+ file=sample_path,
784
+ mz=info["mz"],
785
+ feature_start=info["rt_start_mean"],
786
+ feature_end=info["rt_end_mean"],
787
+ feature_apex=info["rt"],
788
+ )
789
+
790
+ new_feature = {
791
+ "uid": features_df_max_uid + counter,
792
+ "sample_uid": sample_uid,
793
+ "mz": info["mz"],
794
+ "rt": info["rt"],
795
+ "rt_original": 0.0 if info["rt"] == 0.0 else _estimate_rt_original_from_mapping(self, rt_mapping, info["rt"]),
796
+ "mz_centroid": None,
797
+ "rt_centroid": None,
798
+ "iso": None,
799
+ "iso_of": None,
800
+ "adduct": None,
801
+ "adduct_mass": None,
802
+ "adduct_group": None,
803
+ "chrom": empty_eic,
804
+ "filled": True,
805
+ "chrom_area": 0.0,
806
+ "chrom_coherence": None,
807
+ "chrom_prominence": None,
808
+ "chrom_prominence_scaled": None,
809
+ "chrom_height_scaled": None,
810
+ "ms2_scans": None,
811
+ "ms2_specs": None,
812
+ }
813
+
814
+ new_features.append(new_feature)
815
+ new_mapping.append({
816
+ "consensus_uid": consensus_uid,
817
+ "sample_uid": sample_uid,
818
+ "feature_uid": features_df_max_uid + counter,
819
+ })
820
+ counter += 1
821
+ return new_features, new_mapping, counter
822
+
823
+ except Exception as e:
824
+ # If MS1 loading fails, create empty features
825
+ self.logger.debug(f"Failed to load MS1 data from {sample_path}: {e}")
826
+ for i, consensus_uid in enumerate(sample_consensus_uids):
827
+ info = consensus_info[consensus_uid]
828
+ empty_eic = Chromatogram(
829
+ rt=np.array([info["rt_start_mean"], info["rt_end_mean"]]),
830
+ inty=np.array([0.0, 0.0]),
831
+ label=f"EIC mz={info['mz']:.4f}",
832
+ file=sample_path,
833
+ mz=info["mz"],
834
+ feature_start=info["rt_start_mean"],
835
+ feature_end=info["rt_end_mean"],
836
+ feature_apex=info["rt"],
837
+ )
838
+
839
+ new_feature = {
840
+ "uid": features_df_max_uid + counter,
841
+ "sample_uid": sample_uid,
842
+ "mz": info["mz"],
843
+ "rt": info["rt"],
844
+ "rt_original": 0.0 if info["rt"] == 0.0 else _estimate_rt_original_from_mapping(self, rt_mapping, info["rt"]),
845
+ "mz_centroid": None,
846
+ "rt_centroid": None,
847
+ "iso": None,
848
+ "iso_of": None,
849
+ "adduct": None,
850
+ "adduct_mass": None,
851
+ "adduct_group": None,
852
+ "chrom": empty_eic,
853
+ "filled": True,
854
+ "chrom_area": 0.0,
855
+ "chrom_coherence": None,
856
+ "chrom_prominence": None,
857
+ "chrom_prominence_scaled": None,
858
+ "chrom_height_scaled": None,
859
+ "ms2_scans": None,
860
+ "ms2_specs": None,
861
+ }
862
+
863
+ new_features.append(new_feature)
864
+ new_mapping.append({
865
+ "consensus_uid": consensus_uid,
866
+ "sample_uid": sample_uid,
867
+ "feature_uid": features_df_max_uid + counter,
868
+ })
869
+ counter += 1
870
+ return new_features, new_mapping, counter
871
+
872
+ # OPTIMIZATION 2: Pre-filter MS1 data by m/z ranges to reduce memory and processing
873
+ all_mzs = [consensus_info[uid]["mz"] for uid in sample_consensus_uids]
874
+ mz_min = min(all_mzs) - mz_tol
875
+ mz_max = max(all_mzs) + mz_tol
876
+
877
+ # Pre-filter by broad m/z range
878
+ ms1_filtered = ms1_data.filter(
879
+ (pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max)
880
+ )
881
+
882
+ # Early exit if no data in m/z range
883
+ if ms1_filtered.is_empty():
884
+ for i, consensus_uid in enumerate(sample_consensus_uids):
885
+ info = consensus_info[consensus_uid]
886
+ empty_eic = Chromatogram(
887
+ rt=np.array([info["rt_start_mean"], info["rt_end_mean"]]),
888
+ inty=np.array([0.0, 0.0]),
889
+ label=f"EIC mz={info['mz']:.4f}",
890
+ file=sample_path,
891
+ mz=info["mz"],
892
+ feature_start=info["rt_start_mean"],
893
+ feature_end=info["rt_end_mean"],
894
+ feature_apex=info["rt"],
895
+ )
896
+
897
+ new_feature = {
898
+ "uid": features_df_max_uid + counter,
899
+ "sample_uid": sample_uid,
900
+ "mz": info["mz"],
901
+ "rt": info["rt"],
902
+ "rt_original": 0.0 if info["rt"] == 0.0 else _estimate_rt_original_from_mapping(self, rt_mapping, info["rt"]),
903
+ "mz_centroid": None,
904
+ "rt_centroid": None,
905
+ "iso": None,
906
+ "iso_of": None,
907
+ "adduct": None,
908
+ "adduct_mass": None,
909
+ "adduct_group": None,
910
+ "chrom": empty_eic,
911
+ "filled": True,
912
+ "chrom_area": 0.0,
913
+ "chrom_coherence": None,
914
+ "chrom_prominence": None,
915
+ "chrom_prominence_scaled": None,
916
+ "chrom_height_scaled": None,
917
+ "ms2_scans": None,
918
+ "ms2_specs": None,
919
+ }
920
+
921
+ new_features.append(new_feature)
922
+ new_mapping.append({
923
+ "consensus_uid": consensus_uid,
924
+ "sample_uid": sample_uid,
925
+ "feature_uid": features_df_max_uid + counter,
926
+ })
927
+ counter += 1
615
928
  return new_features, new_mapping, counter
616
929
 
930
+ # OPTIMIZATION 3: Process all features using the pre-loaded and filtered MS1 data
931
+ for consensus_uid in sample_consensus_uids:
932
+ info = consensus_info[consensus_uid]
933
+ mz, rt = info["mz"], info["rt"]
934
+
935
+ try:
936
+ if rt == 0.0:
937
+ # Handle RT=0 features - create empty chromatogram
938
+ empty_eic = Chromatogram(
939
+ rt=np.array([info["rt_start_mean"], info["rt_end_mean"]]),
940
+ inty=np.array([0.0, 0.0]),
941
+ label=f"EIC mz={mz:.4f}",
942
+ file=sample_path,
943
+ mz=mz,
944
+ feature_start=info["rt_start_mean"],
945
+ feature_end=info["rt_end_mean"],
946
+ feature_apex=rt,
947
+ )
948
+ eic = empty_eic
949
+ best_peak = None
950
+ else:
951
+ # Extract real chromatogram using pre-filtered MS1 data
952
+ d = ms1_filtered.filter(
953
+ (pl.col("mz") >= mz - mz_tol) & (pl.col("mz") <= mz + mz_tol) &
954
+ (pl.col("rt") >= rt - rt_tol) & (pl.col("rt") <= rt + rt_tol)
955
+ )
956
+
957
+ # Create chromatogram from filtered data
958
+ if d.is_empty():
959
+ # No MS1 data found - create empty chromatogram
960
+ eic = Chromatogram(
961
+ rt=np.array([info["rt_start_mean"], info["rt_end_mean"]]),
962
+ inty=np.array([0.0, 0.0]),
963
+ label=f"EIC mz={mz:.4f}",
964
+ file=sample_path,
965
+ mz=mz,
966
+ feature_start=info["rt_start_mean"],
967
+ feature_end=info["rt_end_mean"],
968
+ feature_apex=rt,
969
+ )
970
+ best_peak = None
971
+ else:
972
+ # Aggregate intensities per retention time (get max inty per RT)
973
+ eic_rt = d.group_by("rt").agg(pl.col("inty").max()).sort("rt")
974
+
975
+ # Create chromatogram with real data and find peaks
976
+ eic = Chromatogram(
977
+ eic_rt["rt"].to_numpy(),
978
+ eic_rt["inty"].to_numpy(),
979
+ label=f"EIC mz={mz:.4f}",
980
+ file=sample_path,
981
+ mz=mz,
982
+ feature_start=info["rt_start_mean"],
983
+ feature_end=info["rt_end_mean"],
984
+ feature_apex=rt,
985
+ ).find_peaks()
986
+ best_peak = self._find_best_peak_in_eic(eic, rt, rt_tol) if hasattr(self, '_find_best_peak_in_eic') else None
987
+
988
+ # Create feature with optimized RT original estimation
989
+ rt_original_estimated = None
990
+ if rt == 0.0:
991
+ rt_original_estimated = 0.0 # RT=0 features
992
+ else:
993
+ rt_original_estimated = _estimate_rt_original_from_mapping(self, rt_mapping, rt)
994
+
995
+ new_feature = {
996
+ "uid": features_df_max_uid + counter,
997
+ "sample_uid": sample_uid,
998
+ "mz": mz,
999
+ "rt": rt,
1000
+ "rt_original": rt_original_estimated,
1001
+ "mz_centroid": None,
1002
+ "rt_centroid": None,
1003
+ "iso": None,
1004
+ "iso_of": None,
1005
+ "adduct": None,
1006
+ "adduct_mass": None,
1007
+ "adduct_group": None,
1008
+ "chrom": eic,
1009
+ "filled": True,
1010
+ "chrom_area": best_peak.get("area", 0.0) if best_peak else 0.0,
1011
+ "chrom_coherence": best_peak.get("coherence") if best_peak else None,
1012
+ "chrom_prominence": best_peak.get("prominence") if best_peak else None,
1013
+ "chrom_prominence_scaled": best_peak.get("prominence_scaled") if best_peak else None,
1014
+ "chrom_height_scaled": best_peak.get("height_scaled") if best_peak else None,
1015
+ "ms2_scans": None,
1016
+ "ms2_specs": None,
1017
+ }
1018
+
1019
+ new_features.append(new_feature)
1020
+ new_mapping.append({
1021
+ "consensus_uid": consensus_uid,
1022
+ "sample_uid": sample_uid,
1023
+ "feature_uid": features_df_max_uid + counter,
1024
+ })
1025
+ counter += 1
1026
+
1027
+ except Exception as e:
1028
+ # Skip this feature if extraction fails but log the error
1029
+ self.logger.debug(f"Failed to extract feature {consensus_uid} from {sample_path}: {e}")
1030
+ continue
1031
+
1032
+ return new_features, new_mapping, counter
1033
+
1034
+ '''
1035
+ def _load_ms1_optimized(self, sample_path, mz_ranges, rt_ranges):
1036
+ """
1037
+ OPTIMIZED: Load only the MS1 data we actually need instead of the entire file.
1038
+ Pre-filter by m/z and RT ranges to reduce memory usage and processing time.
1039
+ """
1040
+ try:
1041
+ # Load full MS1 data (we'll optimize this further later)
1042
+ ms1_data = self._load_ms1(filename=sample_path)
1043
+ if ms1_data is None or ms1_data.is_empty():
1044
+ return ms1_data
1045
+
1046
+ # OPTIMIZATION: Pre-filter to only relevant m/z ranges to reduce data size
1047
+ if mz_ranges:
1048
+ # Build comprehensive m/z filter covering all ranges
1049
+ mz_min = min(r[0] for r in mz_ranges)
1050
+ mz_max = max(r[1] for r in mz_ranges)
1051
+
1052
+ # Pre-filter by broad m/z range first (much faster than multiple OR conditions)
1053
+ ms1_filtered = ms1_data.filter(
1054
+ (pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max)
1055
+ )
1056
+
1057
+ # If we have RT ranges, also pre-filter by RT
1058
+ if rt_ranges and len(rt_ranges) > 0:
1059
+ rt_min = min(r[0] for r in rt_ranges)
1060
+ rt_max = max(r[1] for r in rt_ranges)
1061
+ ms1_filtered = ms1_filtered.filter(
1062
+ (pl.col("rt") >= rt_min) & (pl.col("rt") <= rt_max)
1063
+ )
1064
+
1065
+ return ms1_filtered
1066
+
1067
+ return ms1_data
1068
+
1069
+ except Exception:
1070
+ return pl.DataFrame()
1071
+ '''
1072
+
1073
+ '''
1074
+ def _create_empty_features(self, consensus_uids, consensus_info, sample_uid, features_df_max_uid):
1075
+ """Create empty features for consensus UIDs when no MS1 data is available."""
1076
+ new_features = []
1077
+ new_mapping = []
1078
+
1079
+ for i, consensus_uid in enumerate(consensus_uids):
1080
+ cons = consensus_info[consensus_uid]
1081
+ feature_uid = features_df_max_uid + i + 1
1082
+
1083
+ # Create minimal empty feature
1084
+ empty_eic = Chromatogram(
1085
+ rt=np.array([cons["rt_start_mean"], cons["rt_end_mean"]]),
1086
+ inty=np.array([0.0, 0.0]),
1087
+ label=f"EIC mz={cons['mz']:.4f}",
1088
+ file="",
1089
+ mz=cons["mz"],
1090
+ feature_start=cons["rt_start_mean"],
1091
+ feature_end=cons["rt_end_mean"],
1092
+ feature_apex=cons["rt"],
1093
+ )
1094
+
1095
+ new_feature = {
1096
+ "sample_uid": sample_uid,
1097
+ "feature_uid": feature_uid,
1098
+ "feature_id": None,
1099
+ "mz": cons["mz"],
1100
+ "rt": cons["rt"],
1101
+ "rt_original": 0.0 if cons["rt"] == 0.0 else None,
1102
+ "rt_start": cons["rt_start_mean"],
1103
+ "rt_end": cons["rt_end_mean"],
1104
+ "rt_delta": cons["rt_end_mean"] - cons["rt_start_mean"],
1105
+ "mz_start": None,
1106
+ "mz_end": None,
1107
+ "inty": 0.0,
1108
+ "quality": None,
1109
+ "charge": None,
1110
+ "iso": None,
1111
+ "iso_of": None,
1112
+ "adduct": None,
1113
+ "adduct_mass": None,
1114
+ "adduct_group": None,
1115
+ "chrom": empty_eic,
1116
+ "filled": True,
1117
+ "chrom_area": 0.0,
1118
+ "chrom_coherence": None,
1119
+ "chrom_prominence": None,
1120
+ "chrom_prominence_scaled": None,
1121
+ "chrom_height_scaled": None,
1122
+ "ms2_scans": None,
1123
+ "ms2_specs": None,
1124
+ }
1125
+
1126
+ new_features.append(new_feature)
1127
+ new_mapping.append({
1128
+ "consensus_uid": consensus_uid,
1129
+ "sample_uid": sample_uid,
1130
+ "feature_uid": feature_uid,
1131
+ })
1132
+
1133
+ return new_features, new_mapping, len(new_features)
1134
+ '''
1135
+
1136
+ '''
1137
+ def _create_feature_fast(self, consensus_uid, sample_uid, features_df_max_uid, consensus_info):
1138
+ """
1139
+ OPTIMIZED: Create a minimal empty feature quickly without expensive operations.
1140
+ Used for RT=0 features and other cases where we just need a placeholder feature.
1141
+ """
1142
+ cons = consensus_info[consensus_uid]
1143
+ feature_uid = features_df_max_uid
1144
+
1145
+ # Create minimal empty feature
1146
+ empty_eic = Chromatogram(
1147
+ rt=np.array([cons["rt_start_mean"], cons["rt_end_mean"]]),
1148
+ inty=np.array([0.0, 0.0]),
1149
+ label=f"EIC mz={cons['mz']:.4f}",
1150
+ file="",
1151
+ mz=cons["mz"],
1152
+ feature_start=cons["rt_start_mean"],
1153
+ feature_end=cons["rt_end_mean"]
1154
+ )
1155
+
1156
+ new_feature = {
1157
+ "uid": feature_uid,
1158
+ "sample_uid": sample_uid,
1159
+ "mz": cons["mz"],
1160
+ "rt": cons["rt"],
1161
+ "mz_centroid": None,
1162
+ "rt_centroid": None,
1163
+ "iso": None,
1164
+ "iso_of": None,
1165
+ "adduct": None,
1166
+ "adduct_mass": None,
1167
+ "adduct_group": None,
1168
+ "chrom": empty_eic,
1169
+ "filled": True,
1170
+ "chrom_area": 0.0,
1171
+ "chrom_coherence": None,
1172
+ "chrom_prominence": None,
1173
+ "chrom_prominence_scaled": None,
1174
+ "chrom_height_scaled": None,
1175
+ "ms2_scans": None,
1176
+ "ms2_specs": None,
1177
+ }
1178
+
1179
+ new_features = [new_feature]
1180
+ new_mapping = [{
1181
+ "consensus_uid": consensus_uid,
1182
+ "sample_uid": sample_uid,
1183
+ "feature_uid": feature_uid,
1184
+ }]
1185
+
1186
+ return new_features, new_mapping, 1
1187
+ '''
1188
+
1189
+ '''
1190
+ def _process_rt_zero_features_batch(self, rt_zero_consensus_uids, consensus_info, sample_uid,
1191
+ features_df_max_uid, rt_zero_features):
1192
+ """
1193
+ OPTIMIZED: Process all RT=0 features in a batch since they share similar characteristics.
1194
+ RT=0 features are typically not real peaks but artifacts or noise.
1195
+ """
1196
+ new_features = []
1197
+ new_mapping = []
1198
+
1199
+ for consensus_uid in rt_zero_consensus_uids:
1200
+ new_features_batch, new_mapping_batch, _ = self._create_feature_fast(
1201
+ consensus_uid, sample_uid, features_df_max_uid, consensus_info
1202
+ )
1203
+ new_features.extend(new_features_batch)
1204
+ new_mapping.extend(new_mapping_batch)
1205
+ features_df_max_uid += 1
1206
+
1207
+ # Track RT=0 features for statistics
1208
+ rt_zero_features.append(1)
1209
+
1210
+ return new_features, new_mapping, features_df_max_uid
1211
+ '''
1212
+
1213
+ '''
1214
+ def _process_normal_rt_features_batch(self, normal_rt_consensus_uids, consensus_info, ms1_data,
1215
+ sample_uid, sample_path, mz_tol, rt_tol, features_df_max_uid):
1216
+ """
1217
+ OPTIMIZED: Process normal RT features in batch with pre-filtered MS1 data.
1218
+ Only loads chromatograms once per batch instead of per feature.
1219
+ """
1220
+ new_features = []
1221
+ new_mapping = []
1222
+
1223
+ if len(normal_rt_consensus_uids) == 0:
1224
+ return new_features, new_mapping, features_df_max_uid
1225
+
1226
+ # OPTIMIZATION: Pre-filter MS1 data by m/z range to reduce data size
1227
+ all_mzs = [consensus_info[cuid]["mz"] for cuid in normal_rt_consensus_uids]
1228
+ mz_min = min(all_mzs) - max(0.01, min(all_mzs) * mz_tol / 1e6)
1229
+ mz_max = max(all_mzs) + max(0.01, max(all_mzs) * mz_tol / 1e6)
1230
+
1231
+ # Pre-filter MS1 data once for all features
1232
+ ms1_filtered = ms1_data.filter(
1233
+ (pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max)
1234
+ )
1235
+
1236
+ # Early exit if no data in m/z range
1237
+ if ms1_filtered.shape[0] == 0:
1238
+ # Create empty features for all consensus UIDs
1239
+ for consensus_uid in normal_rt_consensus_uids:
1240
+ new_features_batch, new_mapping_batch, _ = self._create_feature_fast(
1241
+ consensus_uid, sample_uid, features_df_max_uid, consensus_info
1242
+ )
1243
+ new_features.extend(new_features_batch)
1244
+ new_mapping.extend(new_mapping_batch)
1245
+ features_df_max_uid += 1
1246
+ return new_features, new_mapping, features_df_max_uid
1247
+
1248
+ # Process each feature with pre-filtered data
1249
+ for consensus_uid in normal_rt_consensus_uids:
1250
+ info = consensus_info[consensus_uid]
1251
+ mz, rt = info["mz"], info["rt"]
1252
+
1253
+ # Extract chromatogram using pre-loaded MS1 data (FIXED!)
1254
+ sample_obj = self._load_ms1(sample_path) # Get the sample object for extract_eic method
1255
+ eic = sample_obj.extract_eic(
1256
+ mz, mz_tol, rt, rt_tol, ms1_data=ms1_filtered # Use the pre-filtered data!
1257
+ )
1258
+
1259
+ # Find best peak
1260
+ best_peak = self._find_best_peak_in_eic(eic, rt, rt_tol)
1261
+
1262
+ # Create feature
1263
+ new_feature = {
1264
+ "uid": features_df_max_uid,
1265
+ "sample_uid": sample_uid,
1266
+ "mz": mz,
1267
+ "rt": rt,
1268
+ "mz_centroid": None,
1269
+ "rt_centroid": None,
1270
+ "iso": None,
1271
+ "iso_of": None,
1272
+ "adduct": None,
1273
+ "adduct_mass": None,
1274
+ "adduct_group": None,
1275
+ "chrom": eic if best_peak else Chromatogram(
1276
+ rt=np.array([rt, rt]),
1277
+ inty=np.array([0.0, 0.0]),
1278
+ label=f"EIC mz={mz:.4f}",
1279
+ file="",
1280
+ mz=mz,
1281
+ feature_start=rt,
1282
+ feature_end=rt
1283
+ ),
1284
+ "filled": True,
1285
+ "chrom_area": best_peak.get("area", 0.0) if best_peak else 0.0,
1286
+ "chrom_coherence": best_peak.get("coherence") if best_peak else None,
1287
+ "chrom_prominence": best_peak.get("prominence") if best_peak else None,
1288
+ "chrom_prominence_scaled": best_peak.get("prominence_scaled") if best_peak else None,
1289
+ "chrom_height_scaled": best_peak.get("height_scaled") if best_peak else None,
1290
+ "ms2_scans": None,
1291
+ "ms2_specs": None,
1292
+ }
1293
+
1294
+ new_features.append(new_feature)
1295
+ new_mapping.append({
1296
+ "consensus_uid": consensus_uid,
1297
+ "sample_uid": sample_uid,
1298
+ "feature_uid": features_df_max_uid,
1299
+ })
1300
+ features_df_max_uid += 1
1301
+
1302
+ return new_features, new_mapping, features_df_max_uid
1303
+ '''
1304
+
1305
+ '''def _batch_process_features(self, consensus_uids, consensus_info, ms1_data, sample_uid, sample_path,
1306
+ mz_tol, rt_tol, features_df_max_uid, rt_zero_features):
1307
+ """
1308
+ OPTIMIZED: Process all missing features for a sample in a single batch operation.
1309
+ This avoids repeated filtering of the MS1 dataframe.
1310
+ """
1311
+ new_features = []
1312
+ new_mapping = []
1313
+
1314
+ # OPTIMIZATION: Process RT=0 features separately (they need special handling)
1315
+ rt_zero_data = {}
1316
+ if rt_zero_features:
1317
+ rt_zero_data = self._process_rt_zero_features_batch(
1318
+ rt_zero_features, consensus_info, ms1_data, mz_tol, rt_tol
1319
+ )
1320
+
1321
+ # OPTIMIZATION: Build comprehensive filter for all normal RT features at once
1322
+ normal_rt_features = [uid for uid in consensus_uids if uid not in rt_zero_features]
1323
+ normal_rt_data = {}
1324
+ if normal_rt_features:
1325
+ normal_rt_data = self._process_normal_rt_features_batch(
1326
+ normal_rt_features, consensus_info, ms1_data, mz_tol, rt_tol
1327
+ )
1328
+
1329
+ # Combine results and create features
1330
+ all_feature_data = {**rt_zero_data, **normal_rt_data}
1331
+
1332
+ for i, consensus_uid in enumerate(consensus_uids):
1333
+ feature_uid = features_df_max_uid + i + 1
1334
+ cons = consensus_info[consensus_uid]
1335
+
1336
+ # Get pre-processed data for this feature
1337
+ feature_ms1_data = all_feature_data.get(consensus_uid, pl.DataFrame())
1338
+
1339
+ # Create feature using optimized chromatogram creation
1340
+ new_feature, area = self._create_feature_fast(
1341
+ consensus_uid, cons, feature_ms1_data, sample_uid, sample_path,
1342
+ feature_uid, mz_tol, rt_tol
1343
+ )
1344
+
1345
+ new_features.append(new_feature)
1346
+ new_mapping.append({
1347
+ "consensus_uid": consensus_uid,
1348
+ "sample_uid": sample_uid,
1349
+ "feature_uid": feature_uid,
1350
+ })
1351
+
1352
+ return new_features, new_mapping, len(new_features)
1353
+
617
1354
  # Process each missing feature
618
1355
  for consensus_uid in sample_missing:
619
1356
  cons = consensus_info[consensus_uid]
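The rewritten _process_sample_for_parallel_fill above loads MS1 data once per sample, pre-filters it to the union of all requested m/z windows, and only then slices out each feature. A reduced sketch of that two-stage filtering (function names are illustrative):

import polars as pl

def prefilter_ms1(ms1_df: pl.DataFrame, target_mzs: list[float], mz_tol: float) -> pl.DataFrame:
    # One broad m/z filter per sample instead of one full scan per missing feature
    return ms1_df.filter(
        (pl.col("mz") >= min(target_mzs) - mz_tol) & (pl.col("mz") <= max(target_mzs) + mz_tol)
    )

def slice_feature(ms1_broad: pl.DataFrame, mz: float, mz_tol: float, rt: float, rt_tol: float) -> pl.DataFrame:
    # Narrow per-feature window on the already reduced frame
    return ms1_broad.filter(
        (pl.col("mz") >= mz - mz_tol) & (pl.col("mz") <= mz + mz_tol)
        & (pl.col("rt") >= rt - rt_tol) & (pl.col("rt") <= rt + rt_tol)
    )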
@@ -624,12 +1361,43 @@ def _process_sample_for_parallel_fill(
624
1361
 
625
1362
  # Filter MS1 data for this feature
626
1363
  if hasattr(file, "ms1_df") and not file.ms1_df.is_empty():
627
- d = file.ms1_df.filter(
628
- (pl.col("mz") >= mz - mz_tol)
629
- & (pl.col("mz") <= mz + mz_tol)
630
- & (pl.col("rt") >= rt_start_mean - rt_tol)
631
- & (pl.col("rt") <= rt_end_mean + rt_tol),
632
- )
1364
+ # Special handling for RT=0 (library-derived features)
1365
+ if rt == 0.0:
1366
+ # Simple RT=0 processing: find max intensity across full m/z range
1367
+ d_full = file.ms1_df.filter(
1368
+ (pl.col("mz") >= mz - mz_tol) & (pl.col("mz") <= mz + mz_tol)
1369
+ )
1370
+
1371
+ if not d_full.is_empty():
1372
+ max_inty = d_full["inty"].max()
1373
+ if max_inty > 0:
1374
+ max_rt = d_full.filter(pl.col("inty") == max_inty)["rt"].min()
1375
+
1376
+ # Use default rt_tol for RT=0 features
1377
+ eic_rt_tol = rt_tol
1378
+
1379
+ # Filter around max RT
1380
+ d = d_full.filter(
1381
+ (pl.col("rt") >= max_rt - eic_rt_tol) &
1382
+ (pl.col("rt") <= max_rt + eic_rt_tol)
1383
+ )
1384
+
1385
+ # Update consensus RT info
1386
+ rt = max_rt
1387
+ rt_start_mean = max_rt - eic_rt_tol
1388
+ rt_end_mean = max_rt + eic_rt_tol
1389
+ else:
1390
+ d = pl.DataFrame()
1391
+ else:
1392
+ d = pl.DataFrame()
1393
+ else:
1394
+ # Normal RT-based filtering for non-zero RT
1395
+ d = file.ms1_df.filter(
1396
+ (pl.col("mz") >= mz - mz_tol)
1397
+ & (pl.col("mz") <= mz + mz_tol)
1398
+ & (pl.col("rt") >= rt_start_mean - rt_tol)
1399
+ & (pl.col("rt") <= rt_end_mean + rt_tol),
1400
+ )
633
1401
  else:
634
1402
  d = pl.DataFrame()
635
1403
 
@@ -648,6 +1416,13 @@ def _process_sample_for_parallel_fill(
648
1416
  )
649
1417
  max_inty = 0.0
650
1418
  area = 0.0
1419
+ chrom_coherence = None
1420
+ chrom_prominence = None
1421
+ chrom_prominence_scaled = None
1422
+ chrom_height_scaled = None
1423
+ peak_rt_start = rt_start_mean
1424
+ peak_rt_end = rt_end_mean
1425
+ peak_rt_delta = rt_end_mean - rt_start_mean
651
1426
  else:
652
1427
  eic_rt = d.group_by("rt").agg(pl.col("inty").max()).sort("rt")
653
1428
 
@@ -665,6 +1440,24 @@ def _process_sample_for_parallel_fill(
665
1440
  ).find_peaks()
666
1441
  max_inty = np.max(eic.inty)
667
1442
  area = eic.feature_area
1443
+
1444
+ # Extract chromatogram peak properties from first peak (if available)
1445
+ if len(eic.peak_rts) > 0 and eic.feature_start is not None and eic.feature_end is not None:
1446
+ chrom_coherence = round(eic.feature_coherence, 3) if eic.feature_coherence is not None else None
1447
+ chrom_prominence = round(eic.peak_prominences[0], 3) if len(eic.peak_prominences) > 0 else None
1448
+ chrom_prominence_scaled = round(eic.peak_prominences[0] / (np.mean(eic.inty) + 1e-10), 3) if len(eic.peak_prominences) > 0 else None
1449
+ chrom_height_scaled = round(eic.peak_heights[0] / (np.mean(eic.inty) + 1e-10), 3) if len(eic.peak_heights) > 0 else None
1450
+ peak_rt_start = eic.feature_start
1451
+ peak_rt_end = eic.feature_end
1452
+ peak_rt_delta = peak_rt_end - peak_rt_start
1453
+ else:
1454
+ chrom_coherence = None
1455
+ chrom_prominence = None
1456
+ chrom_prominence_scaled = None
1457
+ chrom_height_scaled = None
1458
+ peak_rt_start = rt_start_mean
1459
+ peak_rt_end = rt_end_mean
1460
+ peak_rt_delta = rt_end_mean - rt_start_mean
668
1461
  else:
669
1462
  eic = Chromatogram(
670
1463
  eic_rt["rt"].to_numpy(),
@@ -679,21 +1472,36 @@ def _process_sample_for_parallel_fill(
679
1472
  )
680
1473
  max_inty = 0.0
681
1474
  area = 0.0
1475
+ chrom_coherence = None
1476
+ chrom_prominence = None
1477
+ chrom_prominence_scaled = None
1478
+ chrom_height_scaled = None
1479
+ peak_rt_start = rt_start_mean
1480
+ peak_rt_end = rt_end_mean
1481
+ peak_rt_delta = rt_end_mean - rt_start_mean
682
1482
 
683
1483
  # Generate feature UID (will be adjusted later to ensure global uniqueness)
684
1484
  feature_uid = features_df_max_uid + len(new_features) + 1
685
1485
 
686
- # Create new feature entry
1486
+ # Handle rt_original: for RT=0 features, set to 0; otherwise estimate from closest feature
1487
+ if rt == 0.0 or (hasattr(cons, 'get') and cons.get("rt") == 0.0):
1488
+ estimated_rt_original = 0.0
1489
+ else:
1490
+ estimated_rt_original = _estimate_rt_original_for_filled_feature(
1491
+ self, sample_uid, rt, logger=self.logger if hasattr(self, 'logger') else None
1492
+ )
1493
+
1494
+ # Create new feature entry with updated chromatogram properties
687
1495
  new_feature = {
688
1496
  "sample_uid": sample_uid,
689
1497
  "feature_uid": feature_uid,
690
1498
  "feature_id": None,
691
1499
  "mz": mz,
692
1500
  "rt": rt,
693
- "rt_original": None,
694
- "rt_start": rt_start_mean,
695
- "rt_end": rt_end_mean,
696
- "rt_delta": rt_end_mean - rt_start_mean,
1501
+ "rt_original": estimated_rt_original,
1502
+ "rt_start": peak_rt_start,
1503
+ "rt_end": peak_rt_end,
1504
+ "rt_delta": peak_rt_delta,
697
1505
  "mz_start": None,
698
1506
  "mz_end": None,
699
1507
  "inty": max_inty,
@@ -707,10 +1515,10 @@ def _process_sample_for_parallel_fill(
707
1515
  "chrom": eic,
708
1516
  "filled": True,
709
1517
  "chrom_area": area,
710
- "chrom_coherence": None,
711
- "chrom_prominence": None,
712
- "chrom_prominence_scaled": None,
713
- "chrom_height_scaled": None,
1518
+ "chrom_coherence": chrom_coherence,
1519
+ "chrom_prominence": chrom_prominence,
1520
+ "chrom_prominence_scaled": chrom_prominence_scaled,
1521
+ "chrom_height_scaled": chrom_height_scaled,
714
1522
  "ms2_scans": None,
715
1523
  "ms2_specs": None,
716
1524
  }
@@ -726,7 +1534,7 @@ def _process_sample_for_parallel_fill(
726
1534
  counter += 1
727
1535
 
728
1536
  return new_features, new_mapping, counter
729
-
1537
+ '''
730
1538
 
731
1539
  def _fill_chrom_impl(
732
1540
  self,
@@ -735,7 +1543,7 @@ def _fill_chrom_impl(
  rt_tol: float = 10.0,
  min_samples_rel: float = 0.0,
  min_samples_abs: int = 2,
- num_workers=4,
+ threads=6,
  ):
  """Fill missing chromatograms by extracting from raw data using parallel processing.
 
@@ -745,13 +1553,13 @@ def _fill_chrom_impl(
  rt_tol: RT tolerance for extraction (default: 10.0 seconds)
  min_samples_rel: Relative minimum sample threshold (default: 0.0)
  min_samples_abs: Absolute minimum sample threshold (default: 2)
- num_workers: Number of parallel workers (default: 4)
+ threads: Number of parallel threads (default: 6)
  """
  uids = self._get_consensus_uids(uids)
 
- self.logger.info(f"Gap filling with {num_workers} workers...")
+ self.logger.info(f"Gap filling with {threads} threads...")
  self.logger.debug(
- f"Parameters: mz_tol={mz_tol}, rt_tol={rt_tol}, min_samples_rel={min_samples_rel}, min_samples_abs={min_samples_abs}, num_workers={num_workers}",
+ f"Parameters: mz_tol={mz_tol}, rt_tol={rt_tol}, min_samples_rel={min_samples_rel}, min_samples_abs={min_samples_abs}, threads={threads}",
  )
 
  # Apply minimum sample filters
@@ -793,6 +1601,7 @@ def _fill_chrom_impl(
  "sample_uid": pl.Int64,
  "sample_name": pl.Utf8,
  "sample_path": pl.Utf8,
+ "sample_source": pl.Utf8,
  },
  orient="row",
  )
@@ -830,12 +1639,13 @@ def _fill_chrom_impl(
  "sample_name": row["sample_name"],
  "sample_uid": row["sample_uid"],
  "sample_path": row["sample_path"],
+ "sample_source": row["sample_source"],
  },
  )
 
  total_missing = len(missing_combinations_df)
  self.logger.debug(
- f"Gap filling for {total_missing} missing features...",
+ f"Gap filling for {total_missing} missing features across {len(samples_to_process)} samples...",
  )
 
  # Calculate current max feature_uid to avoid conflicts
@@ -850,7 +1660,7 @@ def _fill_chrom_impl(
 
  tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
 
- with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
+ with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
  # Submit all samples for processing
  future_to_sample = {}
  for sample_info in samples_to_process:
@@ -883,6 +1693,8 @@ def _fill_chrom_impl(
  for i, mapping in enumerate(new_mapping):
  mapping["feature_uid"] = uid_offset + i + 1
 
+ # RT original estimation is now done inside parallel processing - PERFORMANCE OPTIMIZED!
+
  all_new_features.extend(new_features)
  all_new_mapping.extend(new_mapping)
  total_counter += counter
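A condensed sketch of the parallel driver shown in this and the previous hunk: one task per sample, submitted to a thread pool sized by the new threads parameter, with results merged as they complete (process_sample stands in for self._process_sample_for_parallel_fill; the UID offsetting shown in the diff is omitted here):

import concurrent.futures

def run_parallel_fill(samples_to_process, process_sample, threads=6):
    all_new_features, all_new_mapping, total_counter = [], [], 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        futures = {executor.submit(process_sample, s): s for s in samples_to_process}
        for future in concurrent.futures.as_completed(futures):
            new_features, new_mapping, counter = future.result()
            all_new_features.extend(new_features)
            all_new_mapping.extend(new_mapping)
            total_counter += counter
    return all_new_features, all_new_mapping, total_counter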
@@ -944,8 +1756,15 @@ def _fill_chrom_impl(
  how="diagonal",
  )
 
+ # Log statistics about rt_original estimation
+ if all_new_features:
+ estimated_count = sum(1 for feature in all_new_features if feature.get('rt_original') is not None)
+ none_count = sum(1 for feature in all_new_features if feature.get('rt_original') is None)
+ self.logger.debug(f"Features with estimated rt_original: {estimated_count}")
+ self.logger.debug(f"Features with None rt_original: {none_count}")
+
  self.logger.info(
- f"Filled {total_counter} chromatograms from raw data using {num_workers} parallel workers.",
+ f"Filled {total_counter} chromatograms from raw data.",
  )
 
 
@@ -963,14 +1782,18 @@ def fill(self, **kwargs):
  rt_tol: RT tolerance for extraction (default: 10.0 seconds)
  min_samples_rel: Relative minimum sample threshold (default: 0.05)
  min_samples_abs: Absolute minimum sample threshold (default: 5)
- num_workers: Number of parallel workers (default: 4)
+ threads: Number of parallel threads (default: 6)
  """
  # parameters initialization
  params = fill_defaults()
- num_workers = kwargs.get(
- "num_workers",
- 4,
- ) # Default parameter not in defaults class
+
+ # Handle backward compatibility for old parameter names
+ if "workers" in kwargs:
+ kwargs["threads"] = kwargs.pop("workers")
+ self.logger.debug("Converted 'workers' parameter to 'threads' for backward compatibility")
+ if "num_workers" in kwargs:
+ kwargs["threads"] = kwargs.pop("num_workers")
+ self.logger.debug("Converted 'num_workers' parameter to 'threads' for backward compatibility")
 
  for key, value in kwargs.items():
  if isinstance(value, fill_defaults):
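A hedged usage note on the rename: fill() now takes its thread count from fill_defaults via the threads parameter, while the old workers/num_workers keywords are remapped for backward compatibility. Illustrative calls (assuming an existing Study object named study):

study.fill(rt_tol=10.0, threads=6)   # new parameter name
study.fill(num_workers=4)            # still accepted; remapped to threads=4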
@@ -984,7 +1807,7 @@ def fill(self, **kwargs):
  self.logger.warning(
  f"Failed to set parameter {key} = {value} (validation failed)",
  )
- elif key != "num_workers": # Allow num_workers as valid parameter
+ else:
  self.logger.debug(f"Unknown parameter {key} ignored")
  # end of parameter initialization
 
@@ -1000,14 +1823,10 @@ def fill(self, **kwargs):
  rt_tol=params.get("rt_tol"),
  min_samples_rel=params.get("min_samples_rel"),
  min_samples_abs=params.get("min_samples_abs"),
- num_workers=num_workers,
+ threads=params.get("threads"),
  )
 
 
- # Backward compatibility alias
- fill_chrom = fill
-
-
  def _get_missing_consensus_sample_combinations(self, uids):
  """
  Efficiently identify which consensus_uid/sample combinations are missing.
@@ -1017,6 +1836,7 @@ def _get_missing_consensus_sample_combinations(self, uids):
  - Early termination for fully-filled studies
  - Efficient dictionary lookups instead of expensive DataFrame joins
  - Smart handling of sparse vs dense missing data patterns
+ - Special handling for consensus features with no mappings (e.g., library-derived RT=0 features)
  """
  if not uids:
  return []
@@ -1025,10 +1845,42 @@ def _get_missing_consensus_sample_combinations(self, uids):
1025
1845
  n_samples = len(self.samples_df)
1026
1846
  total_possible = n_consensus * n_samples
1027
1847
 
1848
+ # Identify consensus features that have NO mappings at all (e.g., library-derived RT=0 features)
1849
+ uids_set = set(uids)
1850
+ mapped_consensus_uids = set(
1851
+ self.consensus_mapping_df.filter(pl.col("consensus_uid").is_in(uids))["consensus_uid"].to_list()
1852
+ )
1853
+ unmapped_consensus_uids = uids_set - mapped_consensus_uids
1854
+
1855
+ # Get all sample info once for efficiency
1856
+ all_samples = list(
1857
+ self.samples_df.select(
1858
+ ["sample_uid", "sample_name", "sample_path", "sample_source"],
1859
+ ).iter_rows(),
1860
+ )
1861
+
1862
+ missing_combinations = []
1863
+
1864
+ # For unmapped consensus features (e.g., RT=0), ALL samples are missing
1865
+ if unmapped_consensus_uids:
1866
+ self.logger.debug(f"Found {len(unmapped_consensus_uids)} consensus features with no mappings (e.g., RT=0 library features)")
1867
+ for consensus_uid in unmapped_consensus_uids:
1868
+ for sample_uid, sample_name, sample_path, sample_source in all_samples:
1869
+ missing_combinations.append(
1870
+ (consensus_uid, sample_uid, sample_name, sample_path, sample_source)
1871
+ )
1872
+
1873
+ # If all consensus features are unmapped, return early
1874
+ if len(mapped_consensus_uids) == 0:
1875
+ return missing_combinations
1876
+
1877
+ # Continue with existing logic for mapped consensus features
1878
+ mapped_uids_list = list(mapped_consensus_uids)
1879
+
1028
1880
  # Quick early termination check for fully/nearly filled studies
1029
1881
  # This handles the common case where fill() is run on an already-filled study
1030
1882
  consensus_counts = (
1031
- self.consensus_mapping_df.filter(pl.col("consensus_uid").is_in(uids))
1883
+ self.consensus_mapping_df.filter(pl.col("consensus_uid").is_in(mapped_uids_list))
1032
1884
  .group_by("consensus_uid")
1033
1885
  .agg(pl.count("feature_uid").alias("count"))
1034
1886
  )
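The hunk above splits the requested consensus UIDs into mapped and unmapped sets before gap filling; unmapped features (typically RT=0 library entries) are by definition missing in every sample. A minimal sketch of that split as a standalone function, with the mapping DataFrame passed in explicitly:

import polars as pl

def split_mapped_unmapped(uids, consensus_mapping_df: pl.DataFrame):
    uids_set = set(uids)
    mapped = set(
        consensus_mapping_df.filter(pl.col("consensus_uid").is_in(list(uids_set)))["consensus_uid"].to_list()
    )
    # Unmapped consensus features need a filled chromatogram in every sample
    return mapped, uids_set - mapped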
@@ -1037,22 +1889,22 @@ def _get_missing_consensus_sample_combinations(self, uids):
1037
1889
  consensus_counts["count"].sum() if not consensus_counts.is_empty() else 0
1038
1890
  )
1039
1891
 
1892
+ # Calculate total possible for mapped features only
1893
+ mapped_total_possible = len(mapped_uids_list) * n_samples
1894
+
1040
1895
  # If >95% filled, likely no gaps (common case)
1041
- if total_existing >= total_possible * 0.95:
1896
+ if total_existing >= mapped_total_possible * 0.95:
1042
1897
  self.logger.debug(
1043
- f"Study appears {total_existing / total_possible * 100:.1f}% filled, using sparse optimization",
1898
+ f"Study appears {total_existing / mapped_total_possible * 100:.1f}% filled, using sparse optimization",
1044
1899
  )
1045
1900
 
1046
1901
  # For sparse missing data, check each consensus feature individually
1047
- missing_combinations = []
1048
- uids_set = set(uids)
1049
-
1050
1902
  # Build efficient lookups
1051
1903
  feature_to_sample = dict(
1052
1904
  self.features_df.select(["feature_uid", "sample_uid"]).iter_rows(),
1053
1905
  )
1054
1906
 
1055
- # Get existing combinations for target UIDs only
1907
+ # Get existing combinations for target UIDs only (mapped features)
1056
1908
  existing_by_consensus = {}
1057
1909
  for consensus_uid, feature_uid in self.consensus_mapping_df.select(
1058
1910
  [
@@ -1060,25 +1912,18 @@ def _get_missing_consensus_sample_combinations(self, uids):
1060
1912
  "feature_uid",
1061
1913
  ],
1062
1914
  ).iter_rows():
1063
- if consensus_uid in uids_set and feature_uid in feature_to_sample:
1915
+ if consensus_uid in mapped_consensus_uids and feature_uid in feature_to_sample:
1064
1916
  if consensus_uid not in existing_by_consensus:
1065
1917
  existing_by_consensus[consensus_uid] = set()
1066
1918
  existing_by_consensus[consensus_uid].add(feature_to_sample[feature_uid])
1067
1919
 
1068
- # Get sample info once
1069
- all_samples = list(
1070
- self.samples_df.select(
1071
- ["sample_uid", "sample_name", "sample_path"],
1072
- ).iter_rows(),
1073
- )
1074
-
1075
- # Check for missing combinations
1076
- for consensus_uid in uids:
1920
+ # Check for missing combinations for mapped features
1921
+ for consensus_uid in mapped_uids_list:
1077
1922
  existing_samples = existing_by_consensus.get(consensus_uid, set())
1078
- for sample_uid, sample_name, sample_path in all_samples:
1923
+ for sample_uid, sample_name, sample_path, sample_source in all_samples:
1079
1924
  if sample_uid not in existing_samples:
1080
1925
  missing_combinations.append(
1081
- (consensus_uid, sample_uid, sample_name, sample_path),
1926
+ (consensus_uid, sample_uid, sample_name, sample_path, sample_source),
1082
1927
  )
1083
1928
 
1084
1929
  return missing_combinations
@@ -1086,16 +1931,15 @@ def _get_missing_consensus_sample_combinations(self, uids):
1086
1931
  else:
1087
1932
  # For studies with many gaps, use bulk operations
1088
1933
  self.logger.debug(
1089
- f"Study {total_existing / total_possible * 100:.1f}% filled, using bulk optimization",
1934
+ f"Study {total_existing / mapped_total_possible * 100:.1f}% filled, using bulk optimization",
1090
1935
  )
1091
1936
 
1092
1937
  # Build efficient lookups
1093
- uids_set = set(uids)
1094
1938
  feature_to_sample = dict(
1095
1939
  self.features_df.select(["feature_uid", "sample_uid"]).iter_rows(),
1096
1940
  )
1097
1941
 
1098
- # Build existing combinations set
1942
+ # Build existing combinations set for mapped features only
1099
1943
  existing_combinations = {
1100
1944
  (consensus_uid, feature_to_sample[feature_uid])
1101
1945
  for consensus_uid, feature_uid in self.consensus_mapping_df.select(
@@ -1104,23 +1948,16 @@ def _get_missing_consensus_sample_combinations(self, uids):
1104
1948
  "feature_uid",
1105
1949
  ],
1106
1950
  ).iter_rows()
1107
- if consensus_uid in uids_set and feature_uid in feature_to_sample
1951
+ if consensus_uid in mapped_consensus_uids and feature_uid in feature_to_sample
1108
1952
  }
1109
1953
 
1110
- # Get all sample info
1111
- all_samples = list(
1112
- self.samples_df.select(
1113
- ["sample_uid", "sample_name", "sample_path"],
1114
- ).iter_rows(),
1115
- )
1116
-
1117
- # Generate all missing combinations
1118
- missing_combinations = [
1119
- (consensus_uid, sample_uid, sample_name, sample_path)
1120
- for consensus_uid in uids
1121
- for sample_uid, sample_name, sample_path in all_samples
1122
- if (consensus_uid, sample_uid) not in existing_combinations
1123
- ]
1954
+ # Generate missing combinations for mapped features
1955
+ for consensus_uid in mapped_uids_list:
1956
+ for sample_uid, sample_name, sample_path, sample_source in all_samples:
1957
+ if (consensus_uid, sample_uid) not in existing_combinations:
1958
+ missing_combinations.append(
1959
+ (consensus_uid, sample_uid, sample_name, sample_path, sample_source)
1960
+ )
1124
1961
 
1125
1962
  return missing_combinations
1126
1963
 
@@ -1218,7 +2055,7 @@ def _sanitize(self):
1218
2055
  except Exception as e:
1219
2056
  self.logger.error(f"Failed to recreate sanitized DataFrame: {e}")
1220
2057
 
1221
-
2058
+ '''
1222
2059
  def _load_features(self):
1223
2060
  """
1224
2061
  Load features by reconstructing FeatureMaps from the processed features_df data.
@@ -1326,8 +2163,9 @@ def _load_features(self):
1326
2163
  self.logger.debug(
1327
2164
  f"Successfully reconstructed {len(self.features_maps)} FeatureMaps from features_df.",
1328
2165
  )
2166
+ '''
1329
2167
 
1330
-
2168
+ '''
1331
2169
  def _load_features_from_xml(self):
1332
2170
  """
1333
2171
  Original load_features method that loads from .featureXML files.
@@ -1365,8 +2203,8 @@ def _load_features_from_xml(self):
1365
2203
  fh.load(filename, fm)
1366
2204
  self.features_maps.append(fm)
1367
2205
  self.logger.debug("Features loaded successfully.")
1368
-
1369
-
2206
+ '''
2207
+ '''
1370
2208
  def _load_consensusXML(self, filename="alignment.consensusXML"):
1371
2209
  """
1372
2210
  Load a consensus map from a file.
@@ -1378,15 +2216,14 @@ def _load_consensusXML(self, filename="alignment.consensusXML"):
1378
2216
  self.consensus_map = oms.ConsensusMap()
1379
2217
  fh.load(filename, self.consensus_map)
1380
2218
  self.logger.debug(f"Loaded consensus map from {filename}.")
1381
-
2219
+ '''
1382
2220
 
1383
2221
  def _add_samples_batch(
1384
2222
  self,
1385
2223
  files,
1386
2224
  reset=False,
1387
2225
  adducts=None,
1388
- blacklist=None,
1389
- fast=True,
2226
+ blacklist=None
1390
2227
  ):
1391
2228
  """
1392
2229
  Optimized batch addition of samples.
@@ -1396,7 +2233,6 @@ def _add_samples_batch(
1396
2233
  reset (bool): Whether to reset features before processing
1397
2234
  adducts: Adducts to use for sample loading
1398
2235
  blacklist (set): Set of filenames already processed
1399
- fast (bool): Whether to use optimized loading (skips ms1_df) or standard loading
1400
2236
 
1401
2237
  Performance optimizations:
1402
2238
  1. No per-sample color reset
@@ -1411,7 +2247,7 @@ def _add_samples_batch(
1411
2247
  blacklist = set()
1412
2248
 
1413
2249
  self.logger.debug(
1414
- f"Starting batch addition of {len(files)} samples (skip_ms1={fast})...",
2250
+ f"Starting batch addition of {len(files)} samples...",
1415
2251
  )
1416
2252
 
1417
2253
  successful_additions = 0
@@ -1430,22 +2266,13 @@ def _add_samples_batch(
1430
2266
  ):
1431
2267
  try:
1432
2268
  # Choose between optimized and standard loading
1433
- if fast:
1434
- success = self._add_sample_optimized(
1435
- file,
1436
- reset=reset,
1437
- adducts=adducts,
1438
- skip_color_reset=True, # Skip color reset during batch
1439
- skip_schema_check=True, # Skip schema enforcement
1440
- )
1441
- else:
1442
- success = self._add_sample_standard(
1443
- file,
1444
- reset=reset,
1445
- adducts=adducts,
1446
- skip_color_reset=True, # Skip color reset during batch
1447
- skip_schema_check=True, # Skip schema enforcement
1448
- )
2269
+ success = _add_sample_noms1(self,
2270
+ file,
2271
+ reset=reset,
2272
+ adducts=adducts,
2273
+ skip_color_reset=True, # Skip color reset during batch
2274
+ skip_schema_check=True, # Skip schema enforcement
2275
+ )
1449
2276
 
1450
2277
  if success:
1451
2278
  # Add to blacklist for filename tracking
@@ -1467,7 +2294,7 @@ def _add_samples_batch(
1467
2294
  # self._ensure_features_df_schema_order()
1468
2295
 
1469
2296
  # Color assignment done once for all samples
1470
- self._sample_color_reset_optimized()
2297
+ self.set_samples_color()
1471
2298
 
1472
2299
  self.logger.debug(
1473
2300
  f"Add samples complete: {successful_additions} successful, {failed_additions} failed",
@@ -1476,7 +2303,7 @@ def _add_samples_batch(
1476
2303
  return successful_additions
1477
2304
 
1478
2305
 
1479
- def _add_sample_optimized(
2306
+ def _add_sample_noms1(
1480
2307
  self,
1481
2308
  file,
1482
2309
  type=None,
@@ -1535,11 +2362,11 @@ def _add_sample_optimized(
1535
2362
  return False
1536
2363
 
1537
2364
  # Check if features map was created successfully
1538
- if ddaobj._oms_features_map is None:
1539
- self.logger.warning(f"Failed to add sample {file}: No features map created")
1540
- return False
2365
+ #if ddaobj._oms_features_map is None:
2366
+ # self.logger.warning(f"Failed to add sample {file}: No features map created")
2367
+ # return False
1541
2368
 
1542
- self.features_maps.append(ddaobj._oms_features_map)
2369
+ #self.features_maps.append(ddaobj._oms_features_map)
1543
2370
 
1544
2371
  # Determine sample type
1545
2372
  sample_type = "sample" if type is None else type
@@ -1647,7 +2474,7 @@ def _add_sample_optimized(
1647
2474
  )
1648
2475
  return True
1649
2476
 
1650
-
2477
+ '''
1651
2478
  def _add_sample_standard(
1652
2479
  self,
1653
2480
  file,
@@ -1921,9 +2748,10 @@ def _add_sample_standard(
1921
2748
  )
1922
2749
  return True
1923
2750
 
1924
-
1925
- def _sample_color_reset_optimized(self):
2751
+ '''
2752
+ '''def _sample_color_reset_optimized(self):
1926
2753
  """
1927
2754
  Optimized version of sample color reset using set_samples_color.
1928
2755
  """
1929
2756
  return self.set_samples_color(by=None)
2757
+ '''