masster 0.5.22__py3-none-any.whl → 0.5.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic.

masster/study/load.py CHANGED
@@ -34,13 +34,7 @@ except ImportError:
  import glob


- def add(
- self,
- folder=None,
- reset=False,
- adducts=None,
- max_files=None
- ):
+ def add(self, folder=None, reset=False, adducts=None, max_files=None):
  """Add samples from a folder to the study.

  Args:
@@ -91,9 +85,7 @@ def add(

  if len(files) > 0:
  # Limit files if max_files is specified
- remaining_slots = (
- max_files - counter if max_files is not None else len(files)
- )
+ remaining_slots = max_files - counter if max_files is not None else len(files)
  files = files[:remaining_slots]

  self.logger.debug(f"Found {len(files)} {ext} files")
@@ -119,11 +111,8 @@ def add(
  self.logger.debug(
  f"Batch processing {len(files_to_process)} {ext} files",
  )
- successful = _add_samples_batch(self,
- files_to_process,
- reset=reset,
- adducts=adducts,
- blacklist=blacklist
+ successful = _add_samples_batch(
+ self, files_to_process, reset=reset, adducts=adducts, blacklist=blacklist
  )
  counter += successful
  if successful > 0:
@@ -171,7 +160,6 @@ def add_sample(self, file, type=None, reset=False, adducts=None):
  skip_schema_check=True, # Skip schema check for performance (safe with diagonal concat)
  )

-
  return success


@@ -200,19 +188,20 @@ def load(self, filename=None):

  # self.logger.info(f"Loading study from {filename}")
  from masster.study.h5 import _load_study5
+
  _load_study5(self, filename)
-
+
  # After loading the study, check if we have consensus features before loading consensus XML
- #if (self.consensus_df is not None and not self.consensus_df.is_empty()):
+ # if (self.consensus_df is not None and not self.consensus_df.is_empty()):
  # consensus_xml_path = filename.replace(".study5", ".consensusXML")
  # if os.path.exists(consensus_xml_path):
  # self._load_consensusXML(filename=consensus_xml_path)
- # self.logger.info(f"Automatically loaded consensus from {consensus_xml_path}")
+ # self.logger.info(f"Automatically loaded consensus from {consensus_xml_path}")
  # else:
  # self.logger.warning(f"No consensus XML file found at {consensus_xml_path}")
- #else:
+ # else:
  # self.logger.debug("No consensus features found, skipping consensusXML loading")
-
+
  self.filename = filename


@@ -250,25 +239,24 @@ def _fill_chrom_single_impl(
  if isinstance(min_samples_abs, int) and min_samples_abs >= 0:
  min_number_abs = int(min_samples_abs) if min_samples_abs > 0 else 0
  min_number = max(min_number_rel, min_number_abs)
-
+
  # Special case: if min_samples_abs is explicitly 0, allow 0-sample features (like library features)
  if isinstance(min_samples_abs, int) and min_samples_abs == 0:
  min_number = 0
-
+
  self.logger.debug(f"Threshold for gap filling: number_samples>={min_number}")

  if min_number > 0:
  original_count = len(uids)
  uids = self.consensus_df.filter(
- (pl.col("number_samples") >= min_number)
- & (pl.col("consensus_uid").is_in(uids)),
+ (pl.col("number_samples") >= min_number) & (pl.col("consensus_uid").is_in(uids)),
  )["consensus_uid"].to_list()
  self.logger.debug(
  f"Features to fill: {original_count} -> {len(uids)}",
  )
  self.logger.debug("Identifying missing features...")
  # Instead of building full chromatogram matrix, identify missing consensus/sample combinations directly
- missing_combinations = _get_missing_consensus_sample_combinations(self,uids)
+ missing_combinations = _get_missing_consensus_sample_combinations(self, uids)
  if not missing_combinations:
  self.logger.info("No missing features found to fill.")
  return
@@ -335,12 +323,12 @@ def _fill_chrom_single_impl(
  if ms1_data is None or ms1_data.is_empty():
  self.logger.warning(f"No MS1 data found for sample {sample_name}")
  continue
-
+
  # Create a temporary object to hold the MS1 data for processing
  class TempSample:
  def __init__(self, ms1_df):
  self.ms1_df = ms1_df
-
+
  file = TempSample(ms1_data)
  except Exception as e:
  self.logger.warning(f"Failed to load sample {sample_name}: {e}")
@@ -363,31 +351,25 @@ def _fill_chrom_single_impl(
  # Special handling for RT=0 (library-derived features)
  if rt == 0.0:
  # Step 1: Retrieve full chromatogram for the m/z
- d_full = file.ms1_df.filter(
- (pl.col("mz") >= mz - mz_tol)
- & (pl.col("mz") <= mz + mz_tol)
- )
-
+ d_full = file.ms1_df.filter((pl.col("mz") >= mz - mz_tol) & (pl.col("mz") <= mz + mz_tol))
+
  if not d_full.is_empty():
  # Step 2: Find maximum intensity and its RT
- max_inty_row = d_full.filter(
- pl.col("inty") == d_full["inty"].max()
- ).head(1)
-
+ max_inty_row = d_full.filter(pl.col("inty") == d_full["inty"].max()).head(1)
+
  if not max_inty_row.is_empty():
  max_rt = max_inty_row["rt"].item()
-
+
  # Get eic_rt_tol from sample parameters if available
  eic_rt_tol = rt_tol # Default fallback
- if hasattr(file, 'parameters') and hasattr(file.parameters, 'eic_rt_tol'):
+ if hasattr(file, "parameters") and hasattr(file.parameters, "eic_rt_tol"):
  eic_rt_tol = file.parameters.eic_rt_tol
-
+
  # Step 3: Trim around max intensity using eic_rt_tol
  d = d_full.filter(
- (pl.col("rt") >= max_rt - eic_rt_tol)
- & (pl.col("rt") <= max_rt + eic_rt_tol)
+ (pl.col("rt") >= max_rt - eic_rt_tol) & (pl.col("rt") <= max_rt + eic_rt_tol)
  )
-
+
  # Update consensus RT info based on discovered peak
  rt = max_rt
  rt_start_mean = max_rt - eic_rt_tol
@@ -529,10 +511,7 @@ def _fill_chrom_single_impl(
  for row in rows_to_add:
  # Cast numeric columns to ensure consistency
  for key, value in row.items():
- if (
- key in ["mz", "rt", "intensity", "area", "height"]
- and value is not None
- ):
+ if key in ["mz", "rt", "intensity", "area", "height"] and value is not None:
  row[key] = float(value)
  elif key in ["sample_id", "feature_id"] and value is not None:
  row[key] = int(value)
@@ -618,67 +597,64 @@ def _build_rt_correction_mapping_per_sample(self, sample_uid):
  """
  Pre-compute RT correction mapping for a sample by getting all non-filled features.
  This avoids repeated DataFrame filtering for each feature.
-
+
  Args:
  sample_uid: Sample UID to build mapping for
-
+
  Returns:
  Polars DataFrame with rt, rt_original, and rt_delta columns, sorted by rt
  Returns empty DataFrame if no reference features found
  """
  # Get non-filled features from the same sample
- if 'filled' in self.features_df.columns:
+ if "filled" in self.features_df.columns:
  sample_features = self.features_df.filter(
- (pl.col('sample_uid') == sample_uid) &
- (pl.col('filled') == False) &
- (pl.col('rt_original').is_not_null()) &
- (pl.col('rt').is_not_null())
+ (pl.col("sample_uid") == sample_uid)
+ & (pl.col("filled") == False)
+ & (pl.col("rt_original").is_not_null())
+ & (pl.col("rt").is_not_null())
  )
  else:
  # If no filled column, assume all existing features are non-filled
  sample_features = self.features_df.filter(
- (pl.col('sample_uid') == sample_uid) &
- (pl.col('rt_original').is_not_null()) &
- (pl.col('rt').is_not_null())
+ (pl.col("sample_uid") == sample_uid) & (pl.col("rt_original").is_not_null()) & (pl.col("rt").is_not_null())
  )
-
+
  if sample_features.is_empty():
- return pl.DataFrame(schema={'rt': pl.Float64, 'rt_original': pl.Float64, 'rt_delta': pl.Float64})
-
+ return pl.DataFrame(schema={"rt": pl.Float64, "rt_original": pl.Float64, "rt_delta": pl.Float64})
+
  # Pre-compute RT deltas and sort by RT for efficient lookup
  rt_mapping = sample_features.select([
- pl.col('rt'),
- pl.col('rt_original'),
- (pl.col('rt') - pl.col('rt_original')).alias('rt_delta')
- ]).sort('rt')
-
+ pl.col("rt"),
+ pl.col("rt_original"),
+ (pl.col("rt") - pl.col("rt_original")).alias("rt_delta"),
+ ]).sort("rt")
+
  return rt_mapping

+
  def _estimate_rt_original_from_mapping(self, rt_mapping, target_rt):
  """
  Fast RT original estimation using pre-computed mapping.
-
+
  Args:
  rt_mapping: Pre-computed RT mapping DataFrame from _build_rt_correction_mapping_per_sample
  target_rt: Target aligned RT for the filled feature
-
+
  Returns:
  Estimated rt_original value, or None if no mapping available
  """
  if rt_mapping.is_empty():
  return None
-
+
  # Find closest RT using vectorized operations
- rt_mapping_with_diff = rt_mapping.with_columns([
- (pl.col('rt') - target_rt).abs().alias('rt_diff')
- ])
-
+ rt_mapping_with_diff = rt_mapping.with_columns([(pl.col("rt") - target_rt).abs().alias("rt_diff")])
+
  # Get the RT delta from the closest feature
- closest_row = rt_mapping_with_diff.sort('rt_diff').head(1)
+ closest_row = rt_mapping_with_diff.sort("rt_diff").head(1)
  if closest_row.is_empty():
  return None
-
- closest_rt_delta = closest_row['rt_delta'].item()
+
+ closest_rt_delta = closest_row["rt_delta"].item()
  return target_rt - closest_rt_delta


@@ -686,59 +662,59 @@ def _estimate_rt_original_for_filled_feature(self, sample_uid, target_rt, logger
  """
  Estimate rt_original for a filled feature by finding the closest non-filled feature
  from the same sample and using its RT delta (rt - rt_original).
-
+
  Args:
  sample_uid: Sample UID to search within
  target_rt: Target aligned RT for the filled feature
  logger: Optional logger for debug messages
-
+
  Returns:
  Estimated rt_original value, or None if no suitable reference found
  """
  # Get non-filled features from the same sample
- if 'filled' in self.features_df.columns:
+ if "filled" in self.features_df.columns:
  sample_features = self.features_df.filter(
- (pl.col('sample_uid') == sample_uid) &
- (pl.col('filled') == False) &
- (pl.col('rt_original').is_not_null()) &
- (pl.col('rt').is_not_null())
+ (pl.col("sample_uid") == sample_uid)
+ & (pl.col("filled") == False)
+ & (pl.col("rt_original").is_not_null())
+ & (pl.col("rt").is_not_null())
  )
  else:
  # If no filled column, assume all existing features are non-filled
  sample_features = self.features_df.filter(
- (pl.col('sample_uid') == sample_uid) &
- (pl.col('rt_original').is_not_null()) &
- (pl.col('rt').is_not_null())
+ (pl.col("sample_uid") == sample_uid) & (pl.col("rt_original").is_not_null()) & (pl.col("rt").is_not_null())
  )
-
+
  if sample_features.is_empty():
  if logger:
  logger.debug(f"No reference features found for sample {sample_uid} to estimate rt_original")
  return None
-
+
  # Calculate RT differences and find the closest feature
  sample_features_with_diff = sample_features.with_columns([
- (pl.col('rt') - target_rt).abs().alias('rt_diff'),
- (pl.col('rt') - pl.col('rt_original')).alias('rt_delta')
+ (pl.col("rt") - target_rt).abs().alias("rt_diff"),
+ (pl.col("rt") - pl.col("rt_original")).alias("rt_delta"),
  ])
-
+
  # Find the feature with minimum RT difference
- closest_feature = sample_features_with_diff.sort('rt_diff').head(1)
-
+ closest_feature = sample_features_with_diff.sort("rt_diff").head(1)
+
  if closest_feature.is_empty():
  return None
-
+
  # Get the RT delta from the closest feature
- closest_rt_diff = closest_feature['rt_diff'].item()
- closest_rt_delta = closest_feature['rt_delta'].item()
-
+ closest_rt_diff = closest_feature["rt_diff"].item()
+ closest_rt_delta = closest_feature["rt_delta"].item()
+
  # Estimate rt_original using the same delta: rt_original = rt - rt_delta
  estimated_rt_original = target_rt - closest_rt_delta
-
+
  if self.logger:
- self.logger.debug(f"Estimated rt_original={estimated_rt_original:.3f} for sample {sample_uid}, rt={target_rt:.3f} "
- f"using closest feature (rt_diff={closest_rt_diff:.3f}, rt_delta={closest_rt_delta:.3f})")
-
+ self.logger.debug(
+ f"Estimated rt_original={estimated_rt_original:.3f} for sample {sample_uid}, rt={target_rt:.3f} "
+ f"using closest feature (rt_diff={closest_rt_diff:.3f}, rt_delta={closest_rt_delta:.3f})"
+ )
+
  return estimated_rt_original


@@ -763,7 +739,7 @@ def _process_sample_for_parallel_fill(
  # Get missing features for this sample from precomputed combinations
  sample_missing_df = missing_combinations_df.filter(pl.col("sample_uid") == sample_uid)
  sample_consensus_uids = sample_missing_df["consensus_uid"].to_list()
-
+
  if not sample_consensus_uids:
  return new_features, new_mapping, counter

@@ -787,13 +763,15 @@ def _process_sample_for_parallel_fill(
  feature_end=info["rt_end_mean"],
  feature_apex=info["rt"],
  )
-
+
  new_feature = {
  "uid": features_df_max_uid + counter,
  "sample_uid": sample_uid,
  "mz": info["mz"],
  "rt": info["rt"],
- "rt_original": 0.0 if info["rt"] == 0.0 else _estimate_rt_original_from_mapping(self, rt_mapping, info["rt"]),
+ "rt_original": 0.0
+ if info["rt"] == 0.0
+ else _estimate_rt_original_from_mapping(self, rt_mapping, info["rt"]),
  "mz_centroid": None,
  "rt_centroid": None,
  "iso": None,
@@ -811,7 +789,7 @@ def _process_sample_for_parallel_fill(
  "ms2_scans": None,
  "ms2_specs": None,
  }
-
+
  new_features.append(new_feature)
  new_mapping.append({
  "consensus_uid": consensus_uid,
@@ -820,7 +798,7 @@ def _process_sample_for_parallel_fill(
  })
  counter += 1
  return new_features, new_mapping, counter
-
+
  except Exception as e:
  # If MS1 loading fails, create empty features
  self.logger.debug(f"Failed to load MS1 data from {sample_path}: {e}")
@@ -836,13 +814,15 @@ def _process_sample_for_parallel_fill(
  feature_end=info["rt_end_mean"],
  feature_apex=info["rt"],
  )
-
+
  new_feature = {
  "uid": features_df_max_uid + counter,
  "sample_uid": sample_uid,
  "mz": info["mz"],
  "rt": info["rt"],
- "rt_original": 0.0 if info["rt"] == 0.0 else _estimate_rt_original_from_mapping(self, rt_mapping, info["rt"]),
+ "rt_original": 0.0
+ if info["rt"] == 0.0
+ else _estimate_rt_original_from_mapping(self, rt_mapping, info["rt"]),
  "mz_centroid": None,
  "rt_centroid": None,
  "iso": None,
@@ -860,7 +840,7 @@ def _process_sample_for_parallel_fill(
  "ms2_scans": None,
  "ms2_specs": None,
  }
-
+
  new_features.append(new_feature)
  new_mapping.append({
  "consensus_uid": consensus_uid,
@@ -874,12 +854,10 @@ def _process_sample_for_parallel_fill(
  all_mzs = [consensus_info[uid]["mz"] for uid in sample_consensus_uids]
  mz_min = min(all_mzs) - mz_tol
  mz_max = max(all_mzs) + mz_tol
-
+
  # Pre-filter by broad m/z range
- ms1_filtered = ms1_data.filter(
- (pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max)
- )
-
+ ms1_filtered = ms1_data.filter((pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max))
+
  # Early exit if no data in m/z range
  if ms1_filtered.is_empty():
  for i, consensus_uid in enumerate(sample_consensus_uids):
@@ -894,13 +872,15 @@ def _process_sample_for_parallel_fill(
  feature_end=info["rt_end_mean"],
  feature_apex=info["rt"],
  )
-
+
  new_feature = {
  "uid": features_df_max_uid + counter,
  "sample_uid": sample_uid,
  "mz": info["mz"],
  "rt": info["rt"],
- "rt_original": 0.0 if info["rt"] == 0.0 else _estimate_rt_original_from_mapping(self, rt_mapping, info["rt"]),
+ "rt_original": 0.0
+ if info["rt"] == 0.0
+ else _estimate_rt_original_from_mapping(self, rt_mapping, info["rt"]),
  "mz_centroid": None,
  "rt_centroid": None,
  "iso": None,
@@ -918,7 +898,7 @@ def _process_sample_for_parallel_fill(
  "ms2_scans": None,
  "ms2_specs": None,
  }
-
+
  new_features.append(new_feature)
  new_mapping.append({
  "consensus_uid": consensus_uid,
@@ -932,7 +912,7 @@ def _process_sample_for_parallel_fill(
  for consensus_uid in sample_consensus_uids:
  info = consensus_info[consensus_uid]
  mz, rt = info["mz"], info["rt"]
-
+
  try:
  if rt == 0.0:
  # Handle RT=0 features - create empty chromatogram
@@ -951,10 +931,12 @@ def _process_sample_for_parallel_fill(
  else:
  # Extract real chromatogram using pre-filtered MS1 data
  d = ms1_filtered.filter(
- (pl.col("mz") >= mz - mz_tol) & (pl.col("mz") <= mz + mz_tol) &
- (pl.col("rt") >= rt - rt_tol) & (pl.col("rt") <= rt + rt_tol)
+ (pl.col("mz") >= mz - mz_tol)
+ & (pl.col("mz") <= mz + mz_tol)
+ & (pl.col("rt") >= rt - rt_tol)
+ & (pl.col("rt") <= rt + rt_tol)
  )
-
+
  # Create chromatogram from filtered data
  if d.is_empty():
  # No MS1 data found - create empty chromatogram
@@ -972,7 +954,7 @@ def _process_sample_for_parallel_fill(
  else:
  # Aggregate intensities per retention time (get max inty per RT)
  eic_rt = d.group_by("rt").agg(pl.col("inty").max()).sort("rt")
-
+
  # Create chromatogram with real data and find peaks
  eic = Chromatogram(
  eic_rt["rt"].to_numpy(),
@@ -984,15 +966,19 @@ def _process_sample_for_parallel_fill(
  feature_end=info["rt_end_mean"],
  feature_apex=rt,
  ).find_peaks()
- best_peak = self._find_best_peak_in_eic(eic, rt, rt_tol) if hasattr(self, '_find_best_peak_in_eic') else None
-
+ best_peak = (
+ self._find_best_peak_in_eic(eic, rt, rt_tol)
+ if hasattr(self, "_find_best_peak_in_eic")
+ else None
+ )
+
  # Create feature with optimized RT original estimation
  rt_original_estimated = None
  if rt == 0.0:
  rt_original_estimated = 0.0 # RT=0 features
  else:
  rt_original_estimated = _estimate_rt_original_from_mapping(self, rt_mapping, rt)
-
+
  new_feature = {
  "uid": features_df_max_uid + counter,
  "sample_uid": sample_uid,
@@ -1016,7 +1002,7 @@ def _process_sample_for_parallel_fill(
  "ms2_scans": None,
  "ms2_specs": None,
  }
-
+
  new_features.append(new_feature)
  new_mapping.append({
  "consensus_uid": consensus_uid,
@@ -1024,7 +1010,7 @@ def _process_sample_for_parallel_fill(
  "feature_uid": features_df_max_uid + counter,
  })
  counter += 1
-
+
  except Exception as e:
  # Skip this feature if extraction fails but log the error
  self.logger.debug(f"Failed to extract feature {consensus_uid} from {sample_path}: {e}")
@@ -1032,6 +1018,7 @@ def _process_sample_for_parallel_fill(

  return new_features, new_mapping, counter

+
  def _fill_chrom_impl(
  self,
  uids=None,
@@ -1076,8 +1063,7 @@ def _fill_chrom_impl(
  if min_number > 0:
  original_count = len(uids)
  uids = self.consensus_df.filter(
- (pl.col("number_samples") >= min_number)
- & (pl.col("consensus_uid").is_in(uids)),
+ (pl.col("number_samples") >= min_number) & (pl.col("consensus_uid").is_in(uids)),
  )["consensus_uid"].to_list()
  self.logger.debug(f"Features to fill: {original_count} -> {len(uids)}")

@@ -1145,9 +1131,7 @@ def _fill_chrom_impl(
  )

  # Calculate current max feature_uid to avoid conflicts
- features_df_max_uid = (
- self.features_df["feature_uid"].max() if not self.features_df.is_empty() else 0
- )
+ features_df_max_uid = self.features_df["feature_uid"].max() if not self.features_df.is_empty() else 0

  # Process samples in parallel
  all_new_features: list[dict] = []
@@ -1161,7 +1145,8 @@ def _fill_chrom_impl(
  future_to_sample = {}
  for sample_info in samples_to_process:
  future = executor.submit(
- _process_sample_for_parallel_fill, self,
+ _process_sample_for_parallel_fill,
+ self,
  sample_info,
  consensus_info,
  uids,
@@ -1223,10 +1208,7 @@ def _fill_chrom_impl(
  for row in rows_to_add:
  # Cast numeric columns to ensure consistency
  for key, value in row.items():
- if (
- key in ["mz", "rt", "intensity", "area", "height"]
- and value is not None
- ):
+ if key in ["mz", "rt", "intensity", "area", "height"] and value is not None:
  row[key] = float(value)
  elif key in ["sample_id", "feature_id"] and value is not None:
  row[key] = int(value)
@@ -1254,8 +1236,8 @@ def _fill_chrom_impl(

  # Log statistics about rt_original estimation
  if all_new_features:
- estimated_count = sum(1 for feature in all_new_features if feature.get('rt_original') is not None)
- none_count = sum(1 for feature in all_new_features if feature.get('rt_original') is None)
+ estimated_count = sum(1 for feature in all_new_features if feature.get("rt_original") is not None)
+ none_count = sum(1 for feature in all_new_features if feature.get("rt_original") is None)
  self.logger.debug(f"Features with estimated rt_original: {estimated_count}")
  self.logger.debug(f"Features with None rt_original: {none_count}")

@@ -1288,7 +1270,7 @@ def fill(self, **kwargs):
  kwargs["threads"] = kwargs.pop("workers")
  self.logger.debug("Converted 'workers' parameter to 'threads' for backward compatibility")
  if "num_workers" in kwargs:
- kwargs["threads"] = kwargs.pop("num_workers")
+ kwargs["threads"] = kwargs.pop("num_workers")
  self.logger.debug("Converted 'num_workers' parameter to 'threads' for backward compatibility")

  for key, value in kwargs.items():
@@ -1347,24 +1329,24 @@ def _get_missing_consensus_sample_combinations(self, uids):
  self.consensus_mapping_df.filter(pl.col("consensus_uid").is_in(uids))["consensus_uid"].to_list()
  )
  unmapped_consensus_uids = uids_set - mapped_consensus_uids
-
+
  # Get all sample info once for efficiency
  all_samples = list(
  self.samples_df.select(
  ["sample_uid", "sample_name", "sample_path", "sample_source"],
  ).iter_rows(),
  )
-
+
  missing_combinations = []
-
+
  # For unmapped consensus features (e.g., RT=0), ALL samples are missing
  if unmapped_consensus_uids:
- self.logger.debug(f"Found {len(unmapped_consensus_uids)} consensus features with no mappings (e.g., RT=0 library features)")
+ self.logger.debug(
+ f"Found {len(unmapped_consensus_uids)} consensus features with no mappings (e.g., RT=0 library features)"
+ )
  for consensus_uid in unmapped_consensus_uids:
  for sample_uid, sample_name, sample_path, sample_source in all_samples:
- missing_combinations.append(
- (consensus_uid, sample_uid, sample_name, sample_path, sample_source)
- )
+ missing_combinations.append((consensus_uid, sample_uid, sample_name, sample_path, sample_source))

  # If all consensus features are unmapped, return early
  if len(mapped_consensus_uids) == 0:
@@ -1372,7 +1354,7 @@ def _get_missing_consensus_sample_combinations(self, uids):

  # Continue with existing logic for mapped consensus features
  mapped_uids_list = list(mapped_consensus_uids)
-
+
  # Quick early termination check for fully/nearly filled studies
  # This handles the common case where fill() is run on an already-filled study
  consensus_counts = (
@@ -1381,9 +1363,7 @@ def _get_missing_consensus_sample_combinations(self, uids):
  .agg(pl.count("feature_uid").alias("count"))
  )

- total_existing = (
- consensus_counts["count"].sum() if not consensus_counts.is_empty() else 0
- )
+ total_existing = consensus_counts["count"].sum() if not consensus_counts.is_empty() else 0

  # Calculate total possible for mapped features only
  mapped_total_possible = len(mapped_uids_list) * n_samples
@@ -1451,9 +1431,7 @@ def _get_missing_consensus_sample_combinations(self, uids):
  for consensus_uid in mapped_uids_list:
  for sample_uid, sample_name, sample_path, sample_source in all_samples:
  if (consensus_uid, sample_uid) not in existing_combinations:
- missing_combinations.append(
- (consensus_uid, sample_uid, sample_name, sample_path, sample_source)
- )
+ missing_combinations.append((consensus_uid, sample_uid, sample_name, sample_path, sample_source))

  return missing_combinations

@@ -1551,13 +1529,8 @@ def _sanitize(self):
  except Exception as e:
  self.logger.error(f"Failed to recreate sanitized DataFrame: {e}")

- def _add_samples_batch(
- self,
- files,
- reset=False,
- adducts=None,
- blacklist=None
- ):
+
+ def _add_samples_batch(self, files, reset=False, adducts=None, blacklist=None):
  """
  Optimized batch addition of samples.

@@ -1599,7 +1572,8 @@ def _add_samples_batch(
  ):
  try:
  # Choose between optimized and standard loading
- success = _add_sample_noms1(self,
+ success = _add_sample_noms1(
+ self,
  file,
  reset=reset,
  adducts=adducts,
@@ -1695,20 +1669,33 @@ def _add_sample_noms1(
  return False

  # Check polarity compatibility
- sample_polarity = getattr(ddaobj, 'polarity', None)
- study_polarity = getattr(self, 'polarity', None)
-
+ sample_polarity = getattr(ddaobj, "polarity", None)
+ study_polarity = getattr(self, "polarity", None)
+
  if sample_polarity is not None and study_polarity is not None:
  # Normalize polarity names for comparison
- sample_pol_norm = "positive" if sample_polarity in ["pos", "positive"] else "negative" if sample_polarity in ["neg", "negative"] else sample_polarity
- study_pol_norm = "positive" if study_polarity in ["pos", "positive"] else "negative" if study_polarity in ["neg", "negative"] else study_polarity
-
+ sample_pol_norm = (
+ "positive"
+ if sample_polarity in ["pos", "positive"]
+ else "negative"
+ if sample_polarity in ["neg", "negative"]
+ else sample_polarity
+ )
+ study_pol_norm = (
+ "positive"
+ if study_polarity in ["pos", "positive"]
+ else "negative"
+ if study_polarity in ["neg", "negative"]
+ else study_polarity
+ )
+
  if sample_pol_norm != study_pol_norm:
- self.logger.warning(f"Sample {sample_name} polarity ({sample_polarity}) differs from study polarity ({study_polarity}). Skipping sample.")
+ self.logger.warning(
+ f"Sample {sample_name} polarity ({sample_polarity}) differs from study polarity ({study_polarity}). Skipping sample."
+ )
  return False

-
- #self.features_maps.append(ddaobj._oms_features_map)
+ # self.features_maps.append(ddaobj._oms_features_map)

  # Determine sample type
  sample_type = "sample" if type is None else type
@@ -1735,14 +1722,8 @@ def _add_sample_noms1(

  # Efficient scan counting
  ms1_count = ms2_count = 0
- if (
- hasattr(ddaobj, "scans_df")
- and ddaobj.scans_df is not None
- and not ddaobj.scans_df.is_empty()
- ):
- scan_counts = (
- ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
- )
+ if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
+ scan_counts = ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
  ms_levels = scan_counts.get("ms_level", [])
  counts = scan_counts.get("len", [])
  for level, count in zip(ms_levels, counts):
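
The public parameters shown in this diff are unchanged; the edits appear to be formatter-style reflow and quote normalization, plus the one-line signatures for add() and _add_samples_batch(). A minimal usage sketch, assuming `study` is an existing masster study object that exposes the module-level add() and fill() above as methods; the folder path, adduct strings, and thread count below are hypothetical:

    # Add up to five files from a hypothetical folder (adduct strings are illustrative only)
    study.add(folder="data/mzml", reset=False, adducts=["[M+H]+"], max_files=5)
    # 'num_workers' is still accepted and remapped to 'threads', per the fill() hunk above
    study.fill(num_workers=4)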