masster 0.4.22__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in their public registry.

Potentially problematic release.


This version of masster might be problematic.

masster/study/merge.py CHANGED
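For orientation before reading the hunks: in 0.4.22 these routines were bound methods taking self; 0.5.0 rewrites them as module-level functions that take the study object as their first argument, and the history call is renamed from store_history to update_history. A minimal sketch of the new calling convention, assuming merge() is importable from masster.study.merge as the hunks below suggest and that study is an already-built Study instance (its construction is not part of this diff); the keyword arguments shown are parameter names that appear in the hunks:

    from masster.study.merge import merge

    def run_merge(study):
        # 0.4.22 (removed lines): merge was a bound method, e.g. study.merge(...)
        # 0.5.0 (added lines): merge is a free function taking the study first.
        merge(study, method="kd_chunked", min_samples=2)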
@@ -274,7 +274,7 @@ def _serialize_feature_map(feature_map):
274
274
  return features_data
275
275
 
276
276
 
277
- def merge(self, **kwargs) -> None:
277
+ def merge(study, **kwargs) -> None:
278
278
  """
279
279
  Group features across samples into consensus features using various algorithms.
280
280
 
@@ -342,7 +342,7 @@ def merge(self, **kwargs) -> None:
342
342
  if key in valid_params:
343
343
  setattr(params, key, value)
344
344
  else:
345
- self.logger.warning(f"Unknown parameter '{key}' ignored")
345
+ study.logger.warning(f"Unknown parameter '{key}' ignored")
346
346
 
347
347
  # Backward compatibility: Map old method names to new names
348
348
  method_mapping = {
@@ -362,18 +362,18 @@ def merge(self, **kwargs) -> None:
362
362
  if params.method in method_mapping:
363
363
  old_method = params.method
364
364
  params.method = method_mapping[old_method]
365
- self.logger.info(f"Method '{old_method}' is deprecated. Using '{params.method}' instead.")
365
+ study.logger.info(f"Method '{old_method}' is deprecated. Using '{params.method}' instead.")
366
366
 
367
367
  # Validate method
368
368
  if params.method not in ['sensitivity', 'qt', 'nowarp', 'kd_chunked', 'qt_chunked', 'quality']:
369
369
  raise ValueError(f"Invalid method '{params.method}'. Must be one of: ['sensitivity', 'qt', 'nowarp', 'kd_chunked', 'qt_chunked', 'quality']")
370
370
 
371
371
  # Check if chunked method is advisable for large datasets
372
- num_samples = len(self.samples_df) if hasattr(self, 'samples_df') and self.samples_df is not None else 0
372
+ num_samples = len(study.samples_df) if hasattr(study, 'samples_df') and study.samples_df is not None else 0
373
373
  if num_samples > 500:
374
374
  chunked_methods = {'kd_chunked', 'qt_chunked'}
375
375
  if params.method not in chunked_methods:
376
- self.logger.warning(
376
+ study.logger.warning(
377
377
  f"Large dataset detected ({num_samples} samples > 500). "
378
378
  f"For better performance and memory efficiency, consider using a chunked method: "
379
379
  f"'kd_chunked' or 'qt_chunked' instead of '{params.method}'"
@@ -381,42 +381,43 @@ def merge(self, **kwargs) -> None:
381
381
 
382
382
  # Persist last used params for diagnostics
383
383
  try:
384
- self._merge_params_last = params.to_dict()
384
+ study._merge_params_last = params.to_dict()
385
385
  except Exception:
386
- self._merge_params_last = {}
386
+ study._merge_params_last = {}
387
387
 
388
388
  # Store merge parameters in history
389
389
  try:
390
- if hasattr(self, 'store_history'):
391
- self.store_history(['merge'], params.to_dict())
390
+ if hasattr(study, 'store_history'):
391
+ study.update_history(['merge'], params.to_dict())
392
392
  else:
393
- self.logger.warning("History storage not available - parameters not saved to history")
393
+ study.logger.warning("History storage not available - parameters not saved to history")
394
394
  except Exception as e:
395
- self.logger.warning(f"Failed to store merge parameters in history: {e}")
395
+ study.logger.warning(f"Failed to store merge parameters in history: {e}")
396
396
 
397
397
  # Ensure feature maps are available for merging (regenerate if needed)
398
- if len(self.features_maps) < len(self.samples_df):
399
- self.features_maps = []
398
+ if len(study.features_maps) < len(study.samples_df):
399
+ study.features_maps = []
400
400
  # Feature maps will be generated on-demand within each merge method
401
401
 
402
- self.logger.info(
402
+ study.logger.info(
403
403
  f"Merge: {params.method}, samples={params.min_samples}, rt_tol={params.rt_tol}s, mz_tol={params.mz_tol}Da"
404
404
  )
405
405
 
406
406
  # Initialize
407
- self._reset_consensus_data()
407
+ _reset_consensus_data(study)
408
408
 
409
409
  # Cache adducts for performance (avoid repeated _get_adducts() calls)
410
410
  cached_adducts_df = None
411
411
  cached_valid_adducts = None
412
412
  try:
413
- cached_adducts_df = self._get_adducts()
413
+ from masster.study.id import _get_adducts
414
+ cached_adducts_df = _get_adducts(study)
414
415
  if not cached_adducts_df.is_empty():
415
416
  cached_valid_adducts = set(cached_adducts_df["name"].to_list())
416
417
  else:
417
418
  cached_valid_adducts = set()
418
419
  except Exception as e:
419
- self.logger.warning(f"Could not retrieve study adducts: {e}")
420
+ study.logger.warning(f"Could not retrieve study adducts: {e}")
420
421
  cached_valid_adducts = set()
421
422
 
422
423
  # Always allow '?' adducts
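Reviewer note on the history block in this hunk: the added code still gates on hasattr(study, 'store_history') but then calls study.update_history(...). Whether both names exist on Study in 0.5.0 is not visible here, but if only update_history does, the call would never be reached. A hedged sketch of a guard that matches the method actually invoked (the helper name _record_merge_history is hypothetical; the attribute names and the warning message are taken from the hunk):

    def _record_merge_history(study, params):
        # Sketch only: gate on the method that is actually called.
        if hasattr(study, 'update_history'):
            study.update_history(['merge'], params.to_dict())
        else:
            study.logger.warning("History storage not available - parameters not saved to history")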
@@ -424,58 +425,58 @@ def merge(self, **kwargs) -> None:
424
425
 
425
426
  # Route to algorithm implementation
426
427
  if params.method == 'sensitivity':
427
- consensus_map = _merge_kd(self, params)
428
+ consensus_map = _merge_kd(study, params)
428
429
  # Extract consensus features
429
- self._extract_consensus_features(consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
430
+ _extract_consensus_features(study, consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
430
431
  elif params.method == 'qt':
431
- consensus_map = _merge_qt(self, params)
432
+ consensus_map = _merge_qt(study, params)
432
433
  # Extract consensus features
433
- self._extract_consensus_features(consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
434
+ _extract_consensus_features(study, consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
434
435
  elif params.method == 'nowarp':
435
- consensus_map = _merge_kd_nowarp(self, params)
436
+ consensus_map = _merge_kd_nowarp(study, params)
436
437
  # Extract consensus features
437
- self._extract_consensus_features(consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
438
+ _extract_consensus_features(study, consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
438
439
  elif params.method == 'quality':
439
- consensus_map = _merge_kd_strict(self, params)
440
+ consensus_map = _merge_kd_strict(study, params)
440
441
  # Note: _merge_kd_strict handles both consensus_df and consensus_mapping_df directly
441
442
  elif params.method == 'kd_chunked':
442
- consensus_map = _merge_kd_chunked(self, params, cached_adducts_df, cached_valid_adducts)
443
+ consensus_map = _merge_kd_chunked(study, params, cached_adducts_df, cached_valid_adducts)
443
444
  # Note: _merge_kd_chunked populates consensus_df directly, no need to extract
444
445
  elif params.method == 'qt_chunked':
445
- consensus_map = _merge_qt_chunked(self, params, cached_adducts_df, cached_valid_adducts)
446
+ consensus_map = _merge_qt_chunked(study, params, cached_adducts_df, cached_valid_adducts)
446
447
  # Note: _merge_qt_chunked populates consensus_df directly, no need to extract
447
448
 
448
449
  # Enhanced post-clustering to merge over-segmented features (for qt and kd methods)
449
450
  if params.method in ['qt', 'sensitivity', 'qt_chunked', 'kd_chunked', 'quality']:
450
- self._consensus_cleanup(params.rt_tol, params.mz_tol)
451
+ _consensus_cleanup(study, params.rt_tol, params.mz_tol)
451
452
 
452
453
  # Perform adduct grouping
453
- self._perform_adduct_grouping(params.rt_tol, params.mz_tol)
454
+ _perform_adduct_grouping(study, params.rt_tol, params.mz_tol)
454
455
 
455
456
  # Identify coeluting consensus features by mass shifts and update adduct information
456
- self._identify_adduct_by_mass_shift(params.rt_tol, cached_adducts_df)
457
+ _identify_adduct_by_mass_shift(study, params.rt_tol, cached_adducts_df)
457
458
 
458
459
  # Link MS2 if requested
459
460
  if params.link_ms2:
460
- self._finalize_merge(params.link_ms2, params.min_samples)
461
+ _finalize_merge(study, params.link_ms2, params.min_samples)
461
462
 
462
463
  # Log completion without the misleading feature count
463
464
  elapsed = time.time() - start_time
464
- self.logger.debug(f"Merge process completed in {elapsed:.1f}s")
465
+ study.logger.debug(f"Merge process completed in {elapsed:.1f}s")
465
466
 
466
467
 
467
- def _merge_kd(self, params: merge_defaults) -> oms.ConsensusMap:
468
+ def _merge_kd(study, params: merge_defaults) -> oms.ConsensusMap:
468
469
  """KD-tree based merge (fast, recommended)"""
469
470
 
470
471
  # Generate temporary feature maps on-demand from features_df
471
- temp_feature_maps = _generate_feature_maps_on_demand(self)
472
+ temp_feature_maps = _generate_feature_maps_on_demand(study)
472
473
 
473
474
  consensus_map = oms.ConsensusMap()
474
475
  file_descriptions = consensus_map.getColumnHeaders()
475
476
 
476
477
  for i, feature_map in enumerate(temp_feature_maps):
477
478
  file_description = file_descriptions.get(i, oms.ColumnHeader())
478
- file_description.filename = self.samples_df.row(i, named=True)["sample_name"]
479
+ file_description.filename = study.samples_df.row(i, named=True)["sample_name"]
479
480
  file_description.size = feature_map.size()
480
481
  file_description.unique_id = feature_map.getUniqueId()
481
482
  file_descriptions[i] = file_description
@@ -624,22 +625,22 @@ def _generate_feature_maps_on_demand(study):
624
625
  return temp_feature_maps
625
626
 
626
627
 
627
- def _merge_qt(self, params: merge_defaults) -> oms.ConsensusMap:
628
+ def _merge_qt(study, params: merge_defaults) -> oms.ConsensusMap:
628
629
  """QT (Quality Threshold) based merge"""
629
630
 
630
631
  # Generate temporary feature maps on-demand from features_df
631
- temp_feature_maps = _generate_feature_maps_on_demand(self)
632
+ temp_feature_maps = _generate_feature_maps_on_demand(study)
632
633
 
633
634
  n_samples = len(temp_feature_maps)
634
635
  if n_samples > 1000:
635
- self.logger.warning(f"QT with {n_samples} samples may be slow [O(n²)]. Consider KD [O(n log n)]")
636
+ study.logger.warning(f"QT with {n_samples} samples may be slow [O(n²)]. Consider KD [O(n log n)]")
636
637
 
637
638
  consensus_map = oms.ConsensusMap()
638
639
  file_descriptions = consensus_map.getColumnHeaders()
639
640
 
640
641
  for i, feature_map in enumerate(temp_feature_maps):
641
642
  file_description = file_descriptions.get(i, oms.ColumnHeader())
642
- file_description.filename = self.samples_df.row(i, named=True)["sample_name"]
643
+ file_description.filename = study.samples_df.row(i, named=True)["sample_name"]
643
644
  file_description.size = feature_map.size()
644
645
  file_description.unique_id = feature_map.getUniqueId()
645
646
  file_descriptions[i] = file_description
@@ -665,7 +666,7 @@ def _merge_qt(self, params: merge_defaults) -> oms.ConsensusMap:
665
666
  return consensus_map
666
667
 
667
668
 
668
- def _merge_kd_strict(self, params: merge_defaults) -> oms.ConsensusMap:
669
+ def _merge_kd_strict(study, params: merge_defaults) -> oms.ConsensusMap:
669
670
  """
670
671
  Quality merge: Standard KD algorithm with post-processing quality control.
671
672
 
@@ -695,8 +696,8 @@ def _merge_kd_strict(self, params: merge_defaults) -> oms.ConsensusMap:
695
696
 
696
697
  if optimize_rt_tol:
697
698
  # Optimize RT tolerance first
698
- optimal_rt_tol = _optimize_rt_tolerance(self, params)
699
- self.logger.info(f"RT tolerance optimization: {params.rt_tol}s → {optimal_rt_tol}s")
699
+ optimal_rt_tol = _optimize_rt_tolerance(study, params)
700
+ study.logger.info(f"RT tolerance optimization: {params.rt_tol}s → {optimal_rt_tol}s")
700
701
  # Create modified params with optimal RT tolerance
701
702
  import copy
702
703
  optimized_params = copy.deepcopy(params)
@@ -705,22 +706,22 @@ def _merge_kd_strict(self, params: merge_defaults) -> oms.ConsensusMap:
705
706
  optimized_params = params
706
707
 
707
708
  # Phase 1: Standard KD clustering
708
- self.logger.debug("Initial KD clustering")
709
- consensus_map = _merge_kd(self, optimized_params)
709
+ study.logger.debug("Initial KD clustering")
710
+ consensus_map = _merge_kd(study, optimized_params)
710
711
 
711
712
  # Phase 2: Post-processing quality control
712
- self.logger.debug("Post-processing quality control")
713
- consensus_map = _apply_kd_strict_postprocessing(self, consensus_map, optimized_params)
713
+ study.logger.debug("Post-processing quality control")
714
+ consensus_map = _apply_kd_strict_postprocessing(study, consensus_map, optimized_params)
714
715
 
715
716
  return consensus_map
716
717
 
717
718
 
718
- def _optimize_rt_tolerance(self, params: merge_defaults) -> float:
719
+ def _optimize_rt_tolerance(study, params: merge_defaults) -> float:
719
720
  """
720
721
  Optimize RT tolerance by testing different values and measuring oversegmentation.
721
722
 
722
723
  Args:
723
- self: Study object
724
+ study: Study object
724
725
  params: Merge parameters
725
726
 
726
727
  Returns:
@@ -729,7 +730,7 @@ def _optimize_rt_tolerance(self, params: merge_defaults) -> float:
729
730
  rt_tol_range = getattr(params, 'rt_tol_range', (0.8, 2.0))
730
731
  rt_tol_steps = getattr(params, 'rt_tol_steps', 5)
731
732
 
732
- self.logger.info(f"Optimizing RT tolerance in range {rt_tol_range} with {rt_tol_steps} steps")
733
+ study.logger.info(f"Optimizing RT tolerance in range {rt_tol_range} with {rt_tol_steps} steps")
733
734
 
734
735
  # Generate test values
735
736
  test_rt_tols = [rt_tol_range[0] + i * (rt_tol_range[1] - rt_tol_range[0]) / (rt_tol_steps - 1)
@@ -739,8 +740,8 @@ def _optimize_rt_tolerance(self, params: merge_defaults) -> float:
739
740
  best_score = float('inf')
740
741
 
741
742
  # Store original features for restoration
742
- original_consensus_df = getattr(self, 'consensus_df', pl.DataFrame())
743
- original_consensus_mapping_df = getattr(self, 'consensus_mapping_df', pl.DataFrame())
743
+ original_consensus_df = getattr(study, 'consensus_df', pl.DataFrame())
744
+ original_consensus_mapping_df = getattr(study, 'consensus_mapping_df', pl.DataFrame())
744
745
 
745
746
  for test_rt_tol in test_rt_tols:
746
747
  try:
@@ -750,18 +751,18 @@ def _optimize_rt_tolerance(self, params: merge_defaults) -> float:
750
751
  test_params.rt_tol = test_rt_tol
751
752
 
752
753
  # Run KD merge with test parameters
753
- test_consensus_map = _merge_kd(self, test_params)
754
+ test_consensus_map = _merge_kd(study, test_params)
754
755
 
755
756
  # Extract consensus features temporarily for analysis
756
- self._extract_consensus_features(test_consensus_map, test_params.min_samples)
757
+ _extract_consensus_features(study, test_consensus_map, test_params.min_samples)
757
758
 
758
- if len(self.consensus_df) == 0:
759
+ if len(study.consensus_df) == 0:
759
760
  continue
760
761
 
761
762
  # Calculate oversegmentation metrics
762
- oversegmentation_score = _calculate_oversegmentation_score(self, test_rt_tol)
763
+ oversegmentation_score = _calculate_oversegmentation_score(study, test_rt_tol)
763
764
 
764
- self.logger.debug(f"RT tol {test_rt_tol:.1f}s: {len(self.consensus_df)} features, score: {oversegmentation_score:.3f}")
765
+ study.logger.debug(f"RT tol {test_rt_tol:.1f}s: {len(study.consensus_df)} features, score: {oversegmentation_score:.3f}")
765
766
 
766
767
  # Lower score is better (less oversegmentation)
767
768
  if oversegmentation_score < best_score:
@@ -769,50 +770,50 @@ def _optimize_rt_tolerance(self, params: merge_defaults) -> float:
769
770
  best_rt_tol = test_rt_tol
770
771
 
771
772
  except Exception as e:
772
- self.logger.warning(f"RT tolerance optimization failed for {test_rt_tol}s: {e}")
773
+ study.logger.warning(f"RT tolerance optimization failed for {test_rt_tol}s: {e}")
773
774
  continue
774
775
 
775
776
  # Restore original consensus data
776
- self.consensus_df = original_consensus_df
777
- self.consensus_mapping_df = original_consensus_mapping_df
777
+ study.consensus_df = original_consensus_df
778
+ study.consensus_mapping_df = original_consensus_mapping_df
778
779
 
779
- self.logger.info(f"Optimal RT tolerance: {best_rt_tol:.1f}s (score: {best_score:.3f})")
780
+ study.logger.info(f"Optimal RT tolerance: {best_rt_tol:.1f}s (score: {best_score:.3f})")
780
781
  return best_rt_tol
781
782
 
782
783
 
783
- def _calculate_oversegmentation_score(self, rt_tol: float) -> float:
784
+ def _calculate_oversegmentation_score(study, rt_tol: float) -> float:
784
785
  """
785
786
  Calculate oversegmentation score based on feature density and RT spread metrics.
786
787
  Lower scores indicate less oversegmentation.
787
788
 
788
789
  Args:
789
- self: Study object
790
+ study: Study object
790
791
  rt_tol: RT tolerance used
791
792
 
792
793
  Returns:
793
794
  Oversegmentation score (lower = better)
794
795
  """
795
- if len(self.consensus_df) == 0:
796
+ if len(study.consensus_df) == 0:
796
797
  return float('inf')
797
798
 
798
799
  # Metric 1: Feature density (features per RT second)
799
- rt_range = self.consensus_df['rt'].max() - self.consensus_df['rt'].min()
800
+ rt_range = study.consensus_df['rt'].max() - study.consensus_df['rt'].min()
800
801
  if rt_range <= 0:
801
802
  return float('inf')
802
803
 
803
- feature_density = len(self.consensus_df) / rt_range
804
+ feature_density = len(study.consensus_df) / rt_range
804
805
 
805
806
  # Metric 2: Average RT spread relative to tolerance
806
- rt_spreads = (self.consensus_df['rt_max'] - self.consensus_df['rt_min'])
807
+ rt_spreads = (study.consensus_df['rt_max'] - study.consensus_df['rt_min'])
807
808
  avg_rt_spread_ratio = rt_spreads.mean() / rt_tol if rt_tol > 0 else float('inf')
808
809
 
809
810
  # Metric 3: Proportion of features with low sample counts (indicates fragmentation)
810
- low_sample_features = len(self.consensus_df.filter(pl.col('number_samples') <= 5))
811
- low_sample_ratio = low_sample_features / len(self.consensus_df)
811
+ low_sample_features = len(study.consensus_df.filter(pl.col('number_samples') <= 5))
812
+ low_sample_ratio = low_sample_features / len(study.consensus_df)
812
813
 
813
814
  # Metric 4: Number of features with excessive RT spread
814
815
  excessive_spread_features = len(rt_spreads.filter(rt_spreads > rt_tol * 2))
815
- excessive_spread_ratio = excessive_spread_features / len(self.consensus_df)
816
+ excessive_spread_ratio = excessive_spread_features / len(study.consensus_df)
816
817
 
817
818
  # Combined score (weighted combination)
818
819
  oversegmentation_score = (
@@ -825,7 +826,7 @@ def _calculate_oversegmentation_score(self, rt_tol: float) -> float:
825
826
  return oversegmentation_score
826
827
 
827
828
 
828
- def _apply_kd_strict_postprocessing(self, consensus_map: oms.ConsensusMap, params: merge_defaults) -> oms.ConsensusMap:
829
+ def _apply_kd_strict_postprocessing(study, consensus_map: oms.ConsensusMap, params: merge_defaults) -> oms.ConsensusMap:
829
830
  """
830
831
  Apply post-processing quality control to KD consensus map.
831
832
 
@@ -837,20 +838,20 @@ def _apply_kd_strict_postprocessing(self, consensus_map: oms.ConsensusMap, param
837
838
  Processed consensus map with reduced oversegmentation
838
839
  """
839
840
  if consensus_map.size() == 0:
840
- self.logger.warning("Empty consensus map provided to post-processing")
841
+ study.logger.warning("Empty consensus map provided to post-processing")
841
842
  return consensus_map
842
843
 
843
- self.logger.debug(f"Post-processing {consensus_map.size()} initial consensus features")
844
+ study.logger.debug(f"Post-processing {consensus_map.size()} initial consensus features")
844
845
 
845
846
  # Step 1: Extract initial consensus features
846
847
  original_min_samples = params.min_samples
847
848
  params.min_samples = 1 # Extract all features initially
848
849
 
849
- self._extract_consensus_features(consensus_map, params.min_samples)
850
- initial_feature_count = len(self.consensus_df)
850
+ _extract_consensus_features(study, consensus_map, params.min_samples)
851
+ initial_feature_count = len(study.consensus_df)
851
852
 
852
853
  if initial_feature_count == 0:
853
- self.logger.warning("No consensus features extracted for post-processing")
854
+ study.logger.warning("No consensus features extracted for post-processing")
854
855
  params.min_samples = original_min_samples
855
856
  return consensus_map
856
857
 
@@ -858,67 +859,67 @@ def _apply_kd_strict_postprocessing(self, consensus_map: oms.ConsensusMap, param
858
859
  secondary_merge_rt_tol = getattr(params, 'secondary_merge_rt_tol', 0.5)
859
860
  secondary_merge_mz_tol = getattr(params, 'secondary_merge_mz_tol', 0.005)
860
861
 
861
- self.logger.debug(f"Secondary clustering with RT≤{secondary_merge_rt_tol}s, m/z≤{secondary_merge_mz_tol}")
862
- merged_features = _perform_secondary_clustering(self, secondary_merge_rt_tol, secondary_merge_mz_tol)
862
+ study.logger.debug(f"Secondary clustering with RT≤{secondary_merge_rt_tol}s, m/z≤{secondary_merge_mz_tol}")
863
+ merged_features = _perform_secondary_clustering(study, secondary_merge_rt_tol, secondary_merge_mz_tol)
863
864
 
864
865
  # Step 3: Sample overlap validation
865
866
  min_sample_overlap = getattr(params, 'min_sample_overlap', 0.8)
866
867
  if min_sample_overlap > 0:
867
- self.logger.debug(f"Sample overlap validation (threshold: {min_sample_overlap})")
868
- merged_features = _validate_sample_overlap(self, merged_features, min_sample_overlap)
868
+ study.logger.debug(f"Sample overlap validation (threshold: {min_sample_overlap})")
869
+ merged_features = _validate_sample_overlap(study, merged_features, min_sample_overlap)
869
870
 
870
871
  # Step 4: RT spread quality filtering
871
872
  if params.rt_tol is not None:
872
873
  max_rt_spread = getattr(params, 'max_rt_spread', params.rt_tol * 2)
873
874
  if max_rt_spread is not None:
874
- self.logger.debug(f"RT spread filtering (max: {max_rt_spread:.1f}s)")
875
- merged_features = _filter_rt_spread(self, merged_features, max_rt_spread)
875
+ study.logger.debug(f"RT spread filtering (max: {max_rt_spread:.1f}s)")
876
+ merged_features = _filter_rt_spread(study, merged_features, max_rt_spread)
876
877
  else:
877
- self.logger.debug("Skipping RT spread filtering - max_rt_spread is None")
878
+ study.logger.debug("Skipping RT spread filtering - max_rt_spread is None")
878
879
  else:
879
- self.logger.debug("Skipping RT spread filtering - rt_tol is None")
880
+ study.logger.debug("Skipping RT spread filtering - rt_tol is None")
880
881
 
881
882
  # Step 5: Chromatographic coherence filtering (optional)
882
883
  min_coherence = getattr(params, 'min_coherence', 0.0)
883
884
  if min_coherence > 0:
884
- self.logger.debug(f"Chromatographic coherence filtering (min: {min_coherence})")
885
- merged_features = _filter_coherence(self, merged_features, min_coherence)
885
+ study.logger.debug(f"Chromatographic coherence filtering (min: {min_coherence})")
886
+ merged_features = _filter_coherence(study, merged_features, min_coherence)
886
887
 
887
888
  # Step 6: Rebuild consensus_df with filtered features and preserve mapping
888
- original_mapping_df = self.consensus_mapping_df.clone() # Save original mapping
889
- self.consensus_df = pl.DataFrame(merged_features, strict=False)
889
+ original_mapping_df = study.consensus_mapping_df.clone() # Save original mapping
890
+ study.consensus_df = pl.DataFrame(merged_features, strict=False)
890
891
 
891
892
  # Step 7: Apply original min_samples filter
892
893
  params.min_samples = original_min_samples
893
894
  if params.min_samples > 1:
894
- l1 = len(self.consensus_df)
895
- self.consensus_df = self.consensus_df.filter(
895
+ l1 = len(study.consensus_df)
896
+ study.consensus_df = study.consensus_df.filter(
896
897
  pl.col("number_samples") >= params.min_samples
897
898
  )
898
- filtered_count = l1 - len(self.consensus_df)
899
+ filtered_count = l1 - len(study.consensus_df)
899
900
  if filtered_count > 0:
900
- self.logger.debug(f"Filtered {filtered_count} features below min_samples threshold ({params.min_samples})")
901
+ study.logger.debug(f"Filtered {filtered_count} features below min_samples threshold ({params.min_samples})")
901
902
 
902
903
  # Step 8: Update consensus_mapping_df to match final consensus_df
903
- if len(self.consensus_df) > 0 and len(original_mapping_df) > 0:
904
- valid_consensus_ids = set(self.consensus_df['consensus_uid'].to_list())
905
- self.consensus_mapping_df = original_mapping_df.filter(
904
+ if len(study.consensus_df) > 0 and len(original_mapping_df) > 0:
905
+ valid_consensus_ids = set(study.consensus_df['consensus_uid'].to_list())
906
+ study.consensus_mapping_df = original_mapping_df.filter(
906
907
  pl.col('consensus_uid').is_in(list(valid_consensus_ids))
907
908
  )
908
909
  else:
909
- self.consensus_mapping_df = pl.DataFrame()
910
+ study.consensus_mapping_df = pl.DataFrame()
910
911
 
911
- final_feature_count = len(self.consensus_df)
912
+ final_feature_count = len(study.consensus_df)
912
913
  reduction_pct = ((initial_feature_count - final_feature_count) / initial_feature_count * 100) if initial_feature_count > 0 else 0
913
914
 
914
- self.logger.info(f"Consensus cleanup complete: {initial_feature_count} → {final_feature_count} features ({reduction_pct:.1f}% reduction)")
915
+ study.logger.info(f"Consensus cleanup complete: {initial_feature_count} → {final_feature_count} features ({reduction_pct:.1f}% reduction)")
915
916
 
916
917
  # Create a new consensus map for compatibility (the processed data is in consensus_df)
917
918
  processed_consensus_map = oms.ConsensusMap()
918
919
  return processed_consensus_map
919
920
 
920
921
 
921
- def _perform_secondary_clustering(self, rt_tol: float, mz_tol: float) -> list:
922
+ def _perform_secondary_clustering(study, rt_tol: float, mz_tol: float) -> list:
922
923
  """
923
924
  Perform secondary clustering to merge very close features.
924
925
 
@@ -929,34 +930,34 @@ def _perform_secondary_clustering(self, rt_tol: float, mz_tol: float) -> list:
929
930
  Returns:
930
931
  List of merged consensus feature dictionaries
931
932
  """
932
- if len(self.consensus_df) == 0:
933
+ if len(study.consensus_df) == 0:
933
934
  return []
934
935
 
935
936
  # Convert consensus_df to list of dictionaries for clustering
936
937
  consensus_features = []
937
- for i, row in enumerate(self.consensus_df.iter_rows(named=True)):
938
+ for i, row in enumerate(study.consensus_df.iter_rows(named=True)):
938
939
  consensus_features.append(dict(row))
939
940
 
940
941
  # Use Union-Find for efficient clustering
941
942
  class UnionFind:
942
- def __init__(self, n):
943
- self.parent = list(range(n))
944
- self.rank = [0] * n
943
+ def __init__(study, n):
944
+ study.parent = list(range(n))
945
+ study.rank = [0] * n
945
946
 
946
- def find(self, x):
947
- if self.parent[x] != x:
948
- self.parent[x] = self.find(self.parent[x])
949
- return self.parent[x]
947
+ def find(study, x):
948
+ if study.parent[x] != x:
949
+ study.parent[x] = study.find(study.parent[x])
950
+ return study.parent[x]
950
951
 
951
- def union(self, x, y):
952
- px, py = self.find(x), self.find(y)
952
+ def union(study, x, y):
953
+ px, py = study.find(x), study.find(y)
953
954
  if px == py:
954
955
  return
955
- if self.rank[px] < self.rank[py]:
956
+ if study.rank[px] < study.rank[py]:
956
957
  px, py = py, px
957
- self.parent[py] = px
958
- if self.rank[px] == self.rank[py]:
959
- self.rank[px] += 1
958
+ study.parent[py] = px
959
+ if study.rank[px] == study.rank[py]:
960
+ study.rank[px] += 1
960
961
 
961
962
  n_features = len(consensus_features)
962
963
  uf = UnionFind(n_features)
@@ -992,7 +993,7 @@ def _perform_secondary_clustering(self, rt_tol: float, mz_tol: float) -> list:
992
993
  merged_feature = _merge_feature_group(group)
993
994
  merged_features.append(merged_feature)
994
995
 
995
- self.logger.debug(f"Secondary clustering: {n_features} → {len(merged_features)} features ({n_features - len(merged_features)} merged)")
996
+ study.logger.debug(f"Secondary clustering: {n_features} → {len(merged_features)} features ({n_features - len(merged_features)} merged)")
996
997
  return merged_features
997
998
 
998
999
 
@@ -1066,7 +1067,7 @@ def _merge_feature_group(feature_group: list) -> dict:
1066
1067
  return merged
1067
1068
 
1068
1069
 
1069
- def _validate_sample_overlap(self, features: list, min_overlap: float) -> list:
1070
+ def _validate_sample_overlap(study, features: list, min_overlap: float) -> list:
1070
1071
  """
1071
1072
  Validate that merged features have sufficient sample overlap.
1072
1073
 
@@ -1097,7 +1098,7 @@ def _validate_sample_overlap(self, features: list, min_overlap: float) -> list:
1097
1098
  return validated_features
1098
1099
 
1099
1100
 
1100
- def _filter_rt_spread(self, features: list, max_rt_spread: float) -> list:
1101
+ def _filter_rt_spread(study, features: list, max_rt_spread: float) -> list:
1101
1102
  """
1102
1103
  Filter out features with excessive RT spread.
1103
1104
 
@@ -1122,12 +1123,12 @@ def _filter_rt_spread(self, features: list, max_rt_spread: float) -> list:
1122
1123
  filtered_count += 1
1123
1124
 
1124
1125
  if filtered_count > 0:
1125
- self.logger.debug(f"Filtered {filtered_count} features with excessive RT spread (>{max_rt_spread:.1f}s)")
1126
+ study.logger.debug(f"Filtered {filtered_count} features with excessive RT spread (>{max_rt_spread:.1f}s)")
1126
1127
 
1127
1128
  return filtered_features
1128
1129
 
1129
1130
 
1130
- def _filter_coherence(self, features: list, min_coherence: float) -> list:
1131
+ def _filter_coherence(study, features: list, min_coherence: float) -> list:
1131
1132
  """
1132
1133
  Filter out features with low chromatographic coherence.
1133
1134
 
@@ -1150,23 +1151,23 @@ def _filter_coherence(self, features: list, min_coherence: float) -> list:
1150
1151
  filtered_count += 1
1151
1152
 
1152
1153
  if filtered_count > 0:
1153
- self.logger.debug(f"Filtered {filtered_count} features with low coherence (<{min_coherence})")
1154
+ study.logger.debug(f"Filtered {filtered_count} features with low coherence (<{min_coherence})")
1154
1155
 
1155
1156
  return filtered_features
1156
1157
 
1157
1158
 
1158
- def _merge_kd_nowarp(self, params: merge_defaults) -> oms.ConsensusMap:
1159
+ def _merge_kd_nowarp(study, params: merge_defaults) -> oms.ConsensusMap:
1159
1160
  """KD-tree based merge without RT warping"""
1160
1161
 
1161
1162
  # Generate temporary feature maps on-demand from features_df
1162
- temp_feature_maps = _generate_feature_maps_on_demand(self)
1163
+ temp_feature_maps = _generate_feature_maps_on_demand(study)
1163
1164
 
1164
1165
  consensus_map = oms.ConsensusMap()
1165
1166
  file_descriptions = consensus_map.getColumnHeaders()
1166
1167
 
1167
1168
  for i, feature_map in enumerate(temp_feature_maps):
1168
1169
  file_description = file_descriptions.get(i, oms.ColumnHeader())
1169
- file_description.filename = self.samples_df.row(i, named=True)["sample_name"]
1170
+ file_description.filename = study.samples_df.row(i, named=True)["sample_name"]
1170
1171
  file_description.size = feature_map.size()
1171
1172
  file_description.unique_id = feature_map.getUniqueId()
1172
1173
  file_descriptions[i] = file_description
@@ -1193,18 +1194,18 @@ def _merge_kd_nowarp(self, params: merge_defaults) -> oms.ConsensusMap:
1193
1194
  return consensus_map
1194
1195
 
1195
1196
 
1196
- def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
1197
+ def _merge_kd_chunked(study, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
1197
1198
  """KD-based chunked merge with proper cross-chunk consensus building and optional parallel processing"""
1198
1199
 
1199
1200
  # Generate temporary feature maps on-demand from features_df
1200
- temp_feature_maps = _generate_feature_maps_on_demand(self)
1201
+ temp_feature_maps = _generate_feature_maps_on_demand(study)
1201
1202
 
1202
1203
  n_samples = len(temp_feature_maps)
1203
1204
  if n_samples <= params.chunk_size:
1204
- self.logger.info(f"Dataset size ({n_samples}) ≤ chunk_size, using KD merge")
1205
- consensus_map = _merge_kd(self, params)
1205
+ study.logger.info(f"Dataset size ({n_samples}) ≤ chunk_size, using KD merge")
1206
+ consensus_map = _merge_kd(study, params)
1206
1207
  # Extract consensus features to populate consensus_df for chunked method consistency
1207
- self._extract_consensus_features(consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
1208
+ _extract_consensus_features(study, consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
1208
1209
  return consensus_map
1209
1210
 
1210
1211
  # Process in chunks
@@ -1213,21 +1214,21 @@ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
1213
1214
  chunk_end = min(i + params.chunk_size, n_samples)
1214
1215
  chunks.append((i, temp_feature_maps[i:chunk_end]))
1215
1216
 
1216
- self.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples using {params.threads or 'sequential'} thread(s)")
1217
+ study.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples using {params.threads or 'sequential'} thread(s)")
1217
1218
 
1218
1219
  # Process each chunk to create chunk consensus maps
1219
1220
  chunk_consensus_maps = []
1220
1221
 
1221
1222
  if params.threads is None:
1222
1223
  # Sequential processing (original behavior)
1223
- for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}KD Chunk", disable=self.log_level not in ["TRACE", "DEBUG", "INFO"])):
1224
+ for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {study.log_label}KD Chunk", disable=study.log_level not in ["TRACE", "DEBUG", "INFO"])):
1224
1225
  chunk_consensus_map = oms.ConsensusMap()
1225
1226
 
1226
1227
  # Set up file descriptions for chunk
1227
1228
  file_descriptions = chunk_consensus_map.getColumnHeaders()
1228
1229
  for j, feature_map in enumerate(chunk_maps):
1229
1230
  file_description = file_descriptions.get(j, oms.ColumnHeader())
1230
- file_description.filename = self.samples_df.row(chunk_start_idx + j, named=True)["sample_name"]
1231
+ file_description.filename = study.samples_df.row(chunk_start_idx + j, named=True)["sample_name"]
1231
1232
  file_description.size = feature_map.size()
1232
1233
  file_description.unique_id = feature_map.getUniqueId()
1233
1234
  file_descriptions[j] = file_description
@@ -1255,7 +1256,7 @@ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
1255
1256
 
1256
1257
  else:
1257
1258
  # Parallel processing
1258
- self.logger.info(f"Processing chunks in parallel using {params.threads} processes")
1259
+ study.logger.info(f"Processing chunks in parallel using {params.threads} processes")
1259
1260
 
1260
1261
  # Prepare chunk data for parallel processing using features_df slices
1261
1262
  chunk_data_list = []
@@ -1264,7 +1265,7 @@ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
1264
1265
  chunk_sample_uids = []
1265
1266
  chunk_samples_df_rows = []
1266
1267
  for j in range(len(chunk_maps)):
1267
- sample_row = self.samples_df.row(chunk_start_idx + j, named=True)
1268
+ sample_row = study.samples_df.row(chunk_start_idx + j, named=True)
1268
1269
  chunk_sample_uids.append(sample_row['sample_uid'])
1269
1270
  chunk_samples_df_rows.append(sample_row)
1270
1271
 
@@ -1272,7 +1273,7 @@ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
1272
1273
  chunk_samples_df = pl.DataFrame(chunk_samples_df_rows)
1273
1274
 
1274
1275
  # Filter features_df for this chunk's samples and select only necessary columns
1275
- chunk_features_df = self.features_df.filter(
1276
+ chunk_features_df = study.features_df.filter(
1276
1277
  pl.col('sample_uid').is_in(chunk_sample_uids)
1277
1278
  ).select([
1278
1279
  'sample_uid', 'rt', 'mz', 'inty', 'charge', 'feature_id'
@@ -1316,22 +1317,22 @@ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
1316
1317
  serialized_chunk_results.append((chunk_start_idx, consensus_features))
1317
1318
  completed_chunks += 1
1318
1319
  n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
1319
- self.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
1320
+ study.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
1320
1321
  except Exception as exc:
1321
1322
  # Check if this is a BrokenProcessPool exception from Windows multiprocessing issues
1322
1323
  if isinstance(exc, BrokenProcessPool) or "process pool" in str(exc).lower():
1323
1324
  # Convert to RuntimeError so outer except block can catch it for fallback
1324
1325
  raise RuntimeError(f"Windows multiprocessing failure: {exc}")
1325
1326
  else:
1326
- self.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
1327
+ study.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
1327
1328
  raise exc
1328
1329
 
1329
1330
  except (RuntimeError, OSError, BrokenProcessPool) as e:
1330
1331
  # Handle Windows multiprocessing issues - fallback to ThreadPoolExecutor
1331
1332
  if ("freeze_support" in str(e) or "spawn" in str(e) or "bootstrapping" in str(e) or
1332
1333
  "process pool" in str(e).lower() or "Windows multiprocessing failure" in str(e)):
1333
- self.logger.warning(f"ProcessPoolExecutor failed (likely Windows multiprocessing issue): {e}")
1334
- self.logger.info(f"Falling back to ThreadPoolExecutor with {params.threads} threads")
1334
+ study.logger.warning(f"ProcessPoolExecutor failed (likely Windows multiprocessing issue): {e}")
1335
+ study.logger.info(f"Falling back to ThreadPoolExecutor with {params.threads} threads")
1335
1336
 
1336
1337
  with ThreadPoolExecutor(max_workers=params.threads) as executor:
1337
1338
  # Submit all chunk processing tasks
@@ -1350,9 +1351,9 @@ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
1350
1351
  serialized_chunk_results.append((chunk_start_idx, consensus_features))
1351
1352
  completed_chunks += 1
1352
1353
  n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
1353
- self.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
1354
+ study.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
1354
1355
  except Exception as exc:
1355
- self.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
1356
+ study.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
1356
1357
  raise exc
1357
1358
  else:
1358
1359
  # Re-raise other exceptions
@@ -1366,25 +1367,25 @@ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
1366
1367
 
1367
1368
  # Merge chunk results with proper cross-chunk consensus building
1368
1369
  # _merge_chunk_results now handles both ConsensusMap objects (sequential) and serialized data (parallel)
1369
- _merge_chunk_results(self, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)
1370
+ _merge_chunk_results(study, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)
1370
1371
 
1371
- # Return a dummy consensus map for compatibility (consensus features are stored in self.consensus_df)
1372
+ # Return a dummy consensus map for compatibility (consensus features are stored in study.consensus_df)
1372
1373
  consensus_map = oms.ConsensusMap()
1373
1374
  return consensus_map
1374
1375
 
1375
1376
 
1376
- def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
1377
+ def _merge_qt_chunked(study, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
1377
1378
  """QT-based chunked merge with proper cross-chunk consensus building and optional parallel processing"""
1378
1379
 
1379
1380
  # Generate temporary feature maps on-demand from features_df
1380
- temp_feature_maps = _generate_feature_maps_on_demand(self)
1381
+ temp_feature_maps = _generate_feature_maps_on_demand(study)
1381
1382
 
1382
1383
  n_samples = len(temp_feature_maps)
1383
1384
  if n_samples <= params.chunk_size:
1384
- self.logger.info(f"Dataset size ({n_samples}) ≤ chunk_size, using QT merge")
1385
- consensus_map = _merge_qt(self, params)
1385
+ study.logger.info(f"Dataset size ({n_samples}) ≤ chunk_size, using QT merge")
1386
+ consensus_map = _merge_qt(study, params)
1386
1387
  # Extract consensus features to populate consensus_df for chunked method consistency
1387
- self._extract_consensus_features(consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
1388
+ _extract_consensus_features(study, consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
1388
1389
  return consensus_map
1389
1390
 
1390
1391
  # Process in chunks
@@ -1393,21 +1394,21 @@ def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
1393
1394
  chunk_end = min(i + params.chunk_size, n_samples)
1394
1395
  chunks.append((i, temp_feature_maps[i:chunk_end]))
1395
1396
 
1396
- self.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples using {params.threads or 'sequential'} thread(s)")
1397
+ study.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples using {params.threads or 'sequential'} thread(s)")
1397
1398
 
1398
1399
  # Process each chunk to create chunk consensus maps
1399
1400
  chunk_consensus_maps = []
1400
1401
 
1401
1402
  if params.threads is None:
1402
1403
  # Sequential processing (original behavior)
1403
- for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}QT Chunk", disable=self.log_level not in ["TRACE", "DEBUG", "INFO"])):
1404
+ for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {study.log_label}QT Chunk", disable=study.log_level not in ["TRACE", "DEBUG", "INFO"])):
1404
1405
  chunk_consensus_map = oms.ConsensusMap()
1405
1406
 
1406
1407
  # Set up file descriptions for chunk
1407
1408
  file_descriptions = chunk_consensus_map.getColumnHeaders()
1408
1409
  for j, feature_map in enumerate(chunk_maps):
1409
1410
  file_description = file_descriptions.get(j, oms.ColumnHeader())
1410
- file_description.filename = self.samples_df.row(chunk_start_idx + j, named=True)["sample_name"]
1411
+ file_description.filename = study.samples_df.row(chunk_start_idx + j, named=True)["sample_name"]
1411
1412
  file_description.size = feature_map.size()
1412
1413
  file_description.unique_id = feature_map.getUniqueId()
1413
1414
  file_descriptions[j] = file_description
@@ -1430,7 +1431,7 @@ def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
1430
1431
 
1431
1432
  else:
1432
1433
  # Parallel processing
1433
- self.logger.info(f"Processing chunks in parallel using {params.threads} processes")
1434
+ study.logger.info(f"Processing chunks in parallel using {params.threads} processes")
1434
1435
 
1435
1436
  # Prepare chunk data for parallel processing using features_df slices
1436
1437
  chunk_data_list = []
@@ -1439,7 +1440,7 @@ def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
1439
1440
  chunk_sample_uids = []
1440
1441
  chunk_samples_df_rows = []
1441
1442
  for j in range(len(chunk_maps)):
1442
- sample_row = self.samples_df.row(chunk_start_idx + j, named=True)
1443
+ sample_row = study.samples_df.row(chunk_start_idx + j, named=True)
1443
1444
  chunk_sample_uids.append(sample_row['sample_uid'])
1444
1445
  chunk_samples_df_rows.append(sample_row)
1445
1446
 
@@ -1447,7 +1448,7 @@ def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
1447
1448
  chunk_samples_df = pl.DataFrame(chunk_samples_df_rows)
1448
1449
 
1449
1450
  # Filter features_df for this chunk's samples and select only necessary columns
1450
- chunk_features_df = self.features_df.filter(
1451
+ chunk_features_df = study.features_df.filter(
1451
1452
  pl.col('sample_uid').is_in(chunk_sample_uids)
1452
1453
  ).select([
1453
1454
  'sample_uid', 'rt', 'mz', 'inty', 'charge', 'feature_id'
@@ -1491,22 +1492,22 @@ def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
1491
1492
  serialized_chunk_results.append((chunk_start_idx, consensus_features))
1492
1493
  completed_chunks += 1
1493
1494
  n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
1494
- self.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
1495
+ study.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
1495
1496
  except Exception as exc:
1496
1497
  # Check if this is a BrokenProcessPool exception from Windows multiprocessing issues
1497
1498
  if isinstance(exc, BrokenProcessPool) or "process pool" in str(exc).lower():
1498
1499
  # Convert to RuntimeError so outer except block can catch it for fallback
1499
1500
  raise RuntimeError(f"Windows multiprocessing failure: {exc}")
1500
1501
  else:
1501
- self.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
1502
+ study.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
1502
1503
  raise exc
1503
1504
 
1504
1505
  except (RuntimeError, OSError, BrokenProcessPool) as e:
1505
1506
  # Handle Windows multiprocessing issues - fallback to ThreadPoolExecutor
1506
1507
  if ("freeze_support" in str(e) or "spawn" in str(e) or "bootstrapping" in str(e) or
1507
1508
  "process pool" in str(e).lower() or "Windows multiprocessing failure" in str(e)):
1508
- self.logger.warning(f"ProcessPoolExecutor failed (likely Windows multiprocessing issue): {e}")
1509
- self.logger.info(f"Falling back to ThreadPoolExecutor with {params.threads} threads")
1509
+ study.logger.warning(f"ProcessPoolExecutor failed (likely Windows multiprocessing issue): {e}")
1510
+ study.logger.info(f"Falling back to ThreadPoolExecutor with {params.threads} threads")
1510
1511
 
1511
1512
  with ThreadPoolExecutor(max_workers=params.threads) as executor:
1512
1513
  # Submit all chunk processing tasks
@@ -1525,9 +1526,9 @@ def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
1525
1526
  serialized_chunk_results.append((chunk_start_idx, consensus_features))
1526
1527
  completed_chunks += 1
1527
1528
  n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
1528
- self.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
1529
+ study.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
1529
1530
  except Exception as exc:
1530
- self.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
1531
+ study.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
1531
1532
  raise exc
1532
1533
  else:
1533
1534
  # Re-raise other exceptions
@@ -1541,14 +1542,14 @@ def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
1541
1542
 
1542
1543
  # Merge chunk results with proper cross-chunk consensus building
1543
1544
  # _merge_chunk_results now handles both ConsensusMap objects (sequential) and serialized data (parallel)
1544
- _merge_chunk_results(self, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)
1545
+ _merge_chunk_results(study, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)
1545
1546
 
1546
- # Return a dummy consensus map for compatibility (consensus features are stored in self.consensus_df)
1547
+ # Return a dummy consensus map for compatibility (consensus features are stored in study.consensus_df)
1547
1548
  consensus_map = oms.ConsensusMap()
1548
1549
  return consensus_map
1549
1550
 
1550
1551
 
1551
- def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> None:
1552
+ def _merge_chunk_results(study, chunk_consensus_maps: list, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> None:
1552
1553
  """
1553
1554
  Scalable aggregation of chunk consensus maps into final consensus_df.
1554
1555
 
@@ -1561,7 +1562,8 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
1561
1562
  if len(chunk_consensus_maps) == 1:
1562
1563
  # Single chunk case - just extract using the true global min_samples.
1563
1564
  # No need for permissive threshold because we are not discarding singletons pre-aggregation.
1564
- self._extract_consensus_features(
1565
+ _extract_consensus_features(
1566
+ study,
1565
1567
  chunk_consensus_maps[0][1],
1566
1568
  params.min_samples,
1567
1569
  cached_adducts_df,
@@ -1572,10 +1574,10 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
1572
1574
  # Build feature_uid to feature_data lookup for fast access
1573
1575
  feature_uid_map = {
1574
1576
  row["feature_id"]: row["feature_uid"]
1575
- for row in self.features_df.iter_rows(named=True)
1577
+ for row in study.features_df.iter_rows(named=True)
1576
1578
  }
1577
1579
 
1578
- features_lookup = _optimized_feature_lookup(self, self.features_df)
1580
+ features_lookup = _optimized_feature_lookup(study, study.features_df)
1579
1581
 
1580
1582
  # Extract all consensus features from chunks with their feature_uids
1581
1583
  all_chunk_consensus = []
@@ -1717,8 +1719,8 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
1717
1719
 
1718
1720
  if not all_chunk_consensus:
1719
1721
  # No valid consensus features found
1720
- self.consensus_df = pl.DataFrame()
1721
- self.consensus_mapping_df = pl.DataFrame()
1722
+ study.consensus_df = pl.DataFrame()
1723
+ study.consensus_mapping_df = pl.DataFrame()
1722
1724
  return
1723
1725
 
1724
1726
  # Perform cross-chunk clustering using optimized spatial indexing
@@ -1744,22 +1746,22 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
1744
1746
  features_by_bin[(rt_bin, mz_bin)].append(i)
1745
1747
 
1746
1748
  class UF:
1747
- def __init__(self, n):
1748
- self.p = list(range(n))
1749
- self.r = [0]*n
1750
- def find(self, x):
1751
- if self.p[x] != x:
1752
- self.p[x] = self.find(self.p[x])
1753
- return self.p[x]
1754
- def union(self, a,b):
1755
- pa, pb = self.find(a), self.find(b)
1749
+ def __init__(study, n):
1750
+ study.p = list(range(n))
1751
+ study.r = [0]*n
1752
+ def find(study, x):
1753
+ if study.p[x] != x:
1754
+ study.p[x] = study.find(study.p[x])
1755
+ return study.p[x]
1756
+ def union(study, a,b):
1757
+ pa, pb = study.find(a), study.find(b)
1756
1758
  if pa == pb:
1757
1759
  return
1758
- if self.r[pa] < self.r[pb]:
1760
+ if study.r[pa] < study.r[pb]:
1759
1761
  pa, pb = pb, pa
1760
- self.p[pb] = pa
1761
- if self.r[pa] == self.r[pb]:
1762
- self.r[pa] += 1
1762
+ study.p[pb] = pa
1763
+ if study.r[pa] == study.r[pb]:
1764
+ study.r[pa] += 1
1763
1765
 
1764
1766
  uf = UF(n_features)
1765
1767
  checked = set()
@@ -1918,7 +1920,7 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
1918
1920
  # This allows proper cross-chunk consensus building before final filtering
1919
1921
 
1920
1922
  metadata = _calculate_consensus_statistics(
1921
- self,
1923
+ study,
1922
1924
  consensus_uid_counter,
1923
1925
  list(feature_data_acc.values()),
1924
1926
  rt_values_chunk,
@@ -1937,7 +1939,7 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
1937
1939
 
1938
1940
  if rt_spread > max_allowed_spread:
1939
1941
  # Skip consensus features with excessive RT spread
1940
- self.logger.debug(f"Skipping consensus feature {consensus_uid_counter} with RT spread {rt_spread:.3f}s > {max_allowed_spread:.3f}s")
1942
+ study.logger.debug(f"Skipping consensus feature {consensus_uid_counter} with RT spread {rt_spread:.3f}s > {max_allowed_spread:.3f}s")
1941
1943
  consensus_uid_counter += 1
1942
1944
  continue
1943
1945
 
@@ -1969,27 +1971,27 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
1969
1971
  consensus_uid_counter += 1
1970
1972
 
1971
1973
  # Assign DataFrames
1972
- self.consensus_df = pl.DataFrame(consensus_metadata, strict=False)
1973
- self.consensus_mapping_df = pl.DataFrame(consensus_mapping_list, strict=False)
1974
+ study.consensus_df = pl.DataFrame(consensus_metadata, strict=False)
1975
+ study.consensus_mapping_df = pl.DataFrame(consensus_mapping_list, strict=False)
1974
1976
 
1975
1977
  # Ensure mapping only contains features from retained consensus_df
1976
- if len(self.consensus_df) > 0:
1977
- valid_consensus_ids = set(self.consensus_df['consensus_uid'].to_list())
1978
- self.consensus_mapping_df = self.consensus_mapping_df.filter(
1978
+ if len(study.consensus_df) > 0:
1979
+ valid_consensus_ids = set(study.consensus_df['consensus_uid'].to_list())
1980
+ study.consensus_mapping_df = study.consensus_mapping_df.filter(
1979
1981
  pl.col('consensus_uid').is_in(list(valid_consensus_ids))
1980
1982
  )
1981
1983
  else:
1982
- self.consensus_mapping_df = pl.DataFrame()
1984
+ study.consensus_mapping_df = pl.DataFrame()
1983
1985
 
1984
1986
  # Attach empty consensus_map placeholder for downstream compatibility
1985
- self.consensus_map = oms.ConsensusMap()
1987
+ study.consensus_map = oms.ConsensusMap()
1986
1988
  return
1987
1989
 
1988
1990
 
1989
1991
  def _calculate_consensus_statistics(study_obj, consensus_uid: int, feature_data_list: list,
1990
1992
  rt_values: list, mz_values: list,
1991
1993
  intensity_values: list, quality_values: list,
1992
- number_features: int = None, number_samples: int = None,
1994
+ number_features: int | None = None, number_samples: int | None = None,
1993
1995
  cached_adducts_df=None, cached_valid_adducts=None) -> dict:
1994
1996
  """
1995
1997
  Calculate comprehensive statistics for a consensus feature from aggregated feature data.
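The signature change above replaces number_features: int = None with the explicit optional form. One compatibility note: the PEP 604 union int | None is evaluated when the function is defined, so it requires Python 3.10+ unless postponed annotation evaluation (from __future__ import annotations) is active in the module. An equivalent pre-3.10 spelling, shown as a stub with only the two renamed parameters (the real function takes many more, and its body is omitted here):

    from typing import Optional

    def _calculate_consensus_statistics_stub(number_features: Optional[int] = None,
                                             number_samples: Optional[int] = None) -> dict:
        # Illustration only; parameter names come from the hunk above.
        return {"number_features": number_features, "number_samples": number_samples}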
@@ -2158,24 +2160,24 @@ def _cluster_consensus_features(features: list, rt_tol: float, mz_tol: float) ->
2158
2160
 
2159
2161
  # Use Union-Find for efficient clustering
2160
2162
  class UnionFind:
2161
- def __init__(self, n):
2162
- self.parent = list(range(n))
2163
- self.rank = [0] * n
2163
+ def __init__(study, n):
2164
+ study.parent = list(range(n))
2165
+ study.rank = [0] * n
2164
2166
 
2165
- def find(self, x):
2166
- if self.parent[x] != x:
2167
- self.parent[x] = self.find(self.parent[x])
2168
- return self.parent[x]
2167
+ def find(study, x):
2168
+ if study.parent[x] != x:
2169
+ study.parent[x] = study.find(study.parent[x])
2170
+ return study.parent[x]
2169
2171
 
2170
- def union(self, x, y):
2171
- px, py = self.find(x), self.find(y)
2172
+ def union(study, x, y):
2173
+ px, py = study.find(x), study.find(y)
2172
2174
  if px == py:
2173
2175
  return
2174
- if self.rank[px] < self.rank[py]:
2176
+ if study.rank[px] < study.rank[py]:
2175
2177
  px, py = py, px
2176
- self.parent[py] = px
2177
- if self.rank[px] == self.rank[py]:
2178
- self.rank[px] += 1
2178
+ study.parent[py] = px
2179
+ if study.rank[px] == study.rank[py]:
2180
+ study.rank[px] += 1
2179
2181
 
2180
2182
  n_features = len(features)
2181
2183
  uf = UnionFind(n_features)
@@ -2208,39 +2210,39 @@ def _cluster_consensus_features(features: list, rt_tol: float, mz_tol: float) ->
2208
2210
  return list(groups_by_root.values())
2209
2211
 
2210
2212
 
2211
- def _reset_consensus_data(self):
2213
+ def _reset_consensus_data(study):
2212
2214
  """Reset consensus-related DataFrames at the start of merge."""
2213
- self.consensus_df = pl.DataFrame()
2214
- self.consensus_ms2 = pl.DataFrame()
2215
- self.consensus_mapping_df = pl.DataFrame()
2215
+ study.consensus_df = pl.DataFrame()
2216
+ study.consensus_ms2 = pl.DataFrame()
2217
+ study.consensus_mapping_df = pl.DataFrame()
2216
2218
 
2217
2219
 
2218
- def _extract_consensus_features(self, consensus_map, min_samples, cached_adducts_df=None, cached_valid_adducts=None):
2220
+ def _extract_consensus_features(study, consensus_map, min_samples, cached_adducts_df=None, cached_valid_adducts=None):
2219
2221
  """Extract consensus features and build metadata."""
2220
- # create a dict to map uid to feature_uid using self.features_df
2222
+ # create a dict to map uid to feature_uid using study.features_df
2221
2223
  feature_uid_map = {
2222
2224
  row["feature_id"]: row["feature_uid"]
2223
- for row in self.features_df.iter_rows(named=True)
2225
+ for row in study.features_df.iter_rows(named=True)
2224
2226
  }
2225
2227
  imax = consensus_map.size()
2226
2228
 
2227
- self.logger.debug(f"Found {imax} feature groups by clustering.")
2229
+ study.logger.debug(f"Found {imax} feature groups by clustering.")
2228
2230
 
2229
2231
  # Pre-build fast lookup tables for features_df data using optimized approach
2230
- features_lookup = _optimized_feature_lookup(self, self.features_df)
2232
+ features_lookup = _optimized_feature_lookup(study, study.features_df)
2231
2233
 
2232
2234
  # create a list to store the consensus mapping
2233
2235
  consensus_mapping = []
2234
2236
  metadata_list = []
2235
2237
 
2236
- tqdm_disable = self.log_level not in ["TRACE", "DEBUG"]
2238
+ tqdm_disable = study.log_level not in ["TRACE", "DEBUG"]
2237
2239
 
2238
2240
  for i, feature in enumerate(
2239
2241
  tqdm(
2240
2242
  consensus_map,
2241
2243
  total=imax,
2242
2244
  disable=tqdm_disable,
2243
- desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Extract metadata",
2245
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {study.log_label}Extract metadata",
2244
2246
  ),
2245
2247
  ):
2246
2248
  # get all features in the feature map with the same unique id as the consensus feature
@@ -2486,7 +2488,7 @@ def _extract_consensus_features(self, consensus_map, min_samples, cached_adducts
2486
2488
  adduct_mass_shift_top = 1.007825
2487
2489
  else:
2488
2490
  # No valid adducts found - assign default based on study polarity
2489
- study_polarity = getattr(self, "polarity", "positive")
2491
+ study_polarity = getattr(study, "polarity", "positive")
2490
2492
  if study_polarity in ["negative", "neg"]:
2491
2493
  # Negative mode default
2492
2494
  adduct_top = "[M-?]1-"
@@ -2618,55 +2620,55 @@ def _extract_consensus_features(self, consensus_map, min_samples, cached_adducts
2618
2620
  )
2619
2621
 
2620
2622
  consensus_mapping_df = pl.DataFrame(consensus_mapping)
2621
- # remove all rows in consensus_mapping_df where consensus_id is not in self.featured_df['uid']
2623
+ # remove all rows in consensus_mapping_df whose feature_uid is not in study.features_df['feature_uid']
2622
2624
  l1 = len(consensus_mapping_df)
2623
2625
  consensus_mapping_df = consensus_mapping_df.filter(
2624
- pl.col("feature_uid").is_in(self.features_df["feature_uid"].to_list()),
2626
+ pl.col("feature_uid").is_in(study.features_df["feature_uid"].to_list()),
2625
2627
  )
2626
- self.logger.debug(
2628
+ study.logger.debug(
2627
2629
  f"Filtered {l1 - len(consensus_mapping_df)} orphan features from maps.",
2628
2630
  )
2629
- self.consensus_mapping_df = consensus_mapping_df
2630
- self.consensus_df = pl.DataFrame(metadata_list, strict=False)
2631
+ study.consensus_mapping_df = consensus_mapping_df
2632
+ study.consensus_df = pl.DataFrame(metadata_list, strict=False)
2631
2633
 
2632
2634
  if min_samples is None:
2633
2635
  min_samples = 1
2634
2636
  if min_samples < 1:
2635
- min_samples = int(min_samples * len(self.samples_df))
2637
+ min_samples = int(min_samples * len(study.samples_df))
2636
2638
 
2637
2639
  # Validate that min_samples doesn't exceed the number of samples
2638
- if min_samples > len(self.samples_df):
2639
- self.logger.warning(
2640
- f"min_samples ({min_samples}) exceeds the number of samples ({len(self.samples_df)}). "
2641
- f"Setting min_samples to {len(self.samples_df)}.",
2640
+ if min_samples > len(study.samples_df):
2641
+ study.logger.warning(
2642
+ f"min_samples ({min_samples}) exceeds the number of samples ({len(study.samples_df)}). "
2643
+ f"Setting min_samples to {len(study.samples_df)}.",
2642
2644
  )
2643
- min_samples = len(self.samples_df)
2645
+ min_samples = len(study.samples_df)
2644
2646
 
2645
2647
  # filter out consensus features with less than min_samples features
2646
- l1 = len(self.consensus_df)
2647
- self.consensus_df = self.consensus_df.filter(
2648
+ l1 = len(study.consensus_df)
2649
+ study.consensus_df = study.consensus_df.filter(
2648
2650
  pl.col("number_samples") >= min_samples,
2649
2651
  )
2650
- self.logger.debug(
2651
- f"Filtered {l1 - len(self.consensus_df)} consensus features with less than {min_samples} samples.",
2652
+ study.logger.debug(
2653
+ f"Filtered {l1 - len(study.consensus_df)} consensus features with less than {min_samples} samples.",
2652
2654
  )
2653
2655
  # filter out consensus mapping with less than min_samples features
2654
- self.consensus_mapping_df = self.consensus_mapping_df.filter(
2655
- pl.col("consensus_uid").is_in(self.consensus_df["consensus_uid"].to_list()),
2656
+ study.consensus_mapping_df = study.consensus_mapping_df.filter(
2657
+ pl.col("consensus_uid").is_in(study.consensus_df["consensus_uid"].to_list()),
2656
2658
  )
2657
2659
 
2658
- self.consensus_map = consensus_map
2660
+ study.consensus_map = consensus_map
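
The `min_samples` handling above accepts either an absolute count or a fraction of the number of samples, caps it at the sample count, and then filters both the consensus table and the mapping. A compact sketch of that normalization with illustrative numbers:

```python
import polars as pl

samples_df = pl.DataFrame({"sample_name": ["a", "b", "c", "d"]})
consensus_df = pl.DataFrame(
    {"consensus_uid": [1, 2, 3], "number_samples": [1, 3, 4]}
)

min_samples = 0.5                 # fractions are scaled by the sample count
if min_samples is None:
    min_samples = 1
if min_samples < 1:
    min_samples = int(min_samples * len(samples_df))   # -> 2
min_samples = min(min_samples, len(samples_df))        # never exceed sample count

consensus_df = consensus_df.filter(pl.col("number_samples") >= min_samples)
print(consensus_df["consensus_uid"].to_list())          # [2, 3]
```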
2659
2661
 
2660
2662
 
2661
- def _perform_adduct_grouping(self, rt_tol, mz_tol):
2663
+ def _perform_adduct_grouping(study, rt_tol, mz_tol):
2662
2664
  """Perform adduct grouping on consensus features."""
2663
2665
  import polars as pl
2664
2666
 
2665
2667
  # Add adduct grouping and adduct_of assignment
2666
- if len(self.consensus_df) > 0:
2668
+ if len(study.consensus_df) > 0:
2667
2669
  # Get relevant columns for grouping
2668
2670
  consensus_data = []
2669
- for row in self.consensus_df.iter_rows(named=True):
2671
+ for row in study.consensus_df.iter_rows(named=True):
2670
2672
  consensus_data.append(
2671
2673
  {
2672
2674
  "consensus_uid": row["consensus_uid"],
@@ -2679,11 +2681,11 @@ def _perform_adduct_grouping(self, rt_tol, mz_tol):
2679
2681
 
2680
2682
  # Use optimized adduct grouping
2681
2683
  adduct_group_list, adduct_of_list = _optimized_adduct_grouping(
2682
- self, consensus_data, rt_tol, mz_tol
2684
+ study, consensus_data, rt_tol, mz_tol
2683
2685
  )
2684
2686
 
2685
2687
  # Add the new columns to consensus_df
2686
- self.consensus_df = self.consensus_df.with_columns(
2688
+ study.consensus_df = study.consensus_df.with_columns(
2687
2689
  [
2688
2690
  pl.Series("adduct_group", adduct_group_list, dtype=pl.Int64),
2689
2691
  pl.Series("adduct_of", adduct_of_list, dtype=pl.Int64),
@@ -2691,7 +2693,7 @@ def _perform_adduct_grouping(self, rt_tol, mz_tol):
2691
2693
  )
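
The grouping results are written back as typed columns via `with_columns`. A small stand-alone sketch of that pattern (column values are made up):

```python
import polars as pl

consensus_df = pl.DataFrame({"consensus_uid": [101, 102, 103]})
adduct_group_list = [1, 1, 2]      # hypothetical group ids
adduct_of_list = [101, 101, 103]   # hypothetical "parent" consensus uids

consensus_df = consensus_df.with_columns(
    [
        pl.Series("adduct_group", adduct_group_list, dtype=pl.Int64),
        pl.Series("adduct_of", adduct_of_list, dtype=pl.Int64),
    ]
)
print(consensus_df.columns)  # ['consensus_uid', 'adduct_group', 'adduct_of']
```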
2692
2694
 
2693
2695
 
2694
- def _count_tight_clusters(self, mz_tol: float = 0.04, rt_tol: float = 0.3) -> int:
2696
+ def _count_tight_clusters(study, mz_tol: float = 0.04, rt_tol: float = 0.3) -> int:
2695
2697
  """
2696
2698
  Count consensus features grouped in tight clusters.
2697
2699
 
@@ -2702,12 +2704,12 @@ def _count_tight_clusters(self, mz_tol: float = 0.04, rt_tol: float = 0.3) -> in
2702
2704
  Returns:
2703
2705
  Number of tight clusters found
2704
2706
  """
2705
- if len(self.consensus_df) < 2:
2707
+ if len(study.consensus_df) < 2:
2706
2708
  return 0
2707
2709
 
2708
2710
  # Extract consensus feature data
2709
2711
  consensus_data = []
2710
- for row in self.consensus_df.iter_rows(named=True):
2712
+ for row in study.consensus_df.iter_rows(named=True):
2711
2713
  consensus_data.append({
2712
2714
  'consensus_uid': row['consensus_uid'],
2713
2715
  'mz': row['mz'],
@@ -2768,7 +2770,7 @@ def _count_tight_clusters(self, mz_tol: float = 0.04, rt_tol: float = 0.3) -> in
2768
2770
  return tight_clusters_count
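
The binning inside `_count_tight_clusters` is not fully visible in this hunk; the sketch below shows one straightforward way to count tight clusters by bucketing features on m/z and RT. Bin widths and data are illustrative, and edge effects (features split across adjacent bins) are ignored.

```python
from collections import defaultdict

def count_tight_clusters(features, mz_tol=0.04, rt_tol=0.3):
    """features: list of (mz, rt) tuples; counts bins holding >= 2 features."""
    bins = defaultdict(int)
    for mz, rt in features:
        bins[(round(mz / mz_tol), round(rt / rt_tol))] += 1
    return sum(1 for n in bins.values() if n >= 2)

print(count_tight_clusters([(200.00, 10.0), (200.01, 10.04), (300.0, 50.0)]))  # 1
```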
2769
2771
 
2770
2772
 
2771
- def _consensus_cleanup(self, rt_tol, mz_tol):
2773
+ def _consensus_cleanup(study, rt_tol, mz_tol):
2772
2774
  """
2773
2775
  Consensus cleanup to merge over-segmented consensus features and remove isotopic features.
2774
2776
 
@@ -2777,20 +2779,20 @@ def _consensus_cleanup(self, rt_tol, mz_tol):
2777
2779
  (too many features in very tight m/z and RT windows)
2778
2780
  2. Performs deisotoping to remove +1 and +2 isotopic features
2779
2781
  """
2780
- if len(self.consensus_df) == 0:
2782
+ if len(study.consensus_df) == 0:
2781
2783
  return
2782
2784
 
2783
- initial_count = len(self.consensus_df)
2785
+ initial_count = len(study.consensus_df)
2784
2786
 
2785
2787
  # Only perform enhanced post-clustering if there are many features
2786
2788
  if initial_count < 50:
2787
2789
  return
2788
2790
 
2789
- self.logger.debug(f"Enhanced post-clustering: processing {initial_count} consensus features")
2791
+ study.logger.debug(f"Enhanced post-clustering: processing {initial_count} consensus features")
2790
2792
 
2791
2793
  # Find tight clusters using spatial binning
2792
2794
  consensus_data = []
2793
- for row in self.consensus_df.iter_rows(named=True):
2795
+ for row in study.consensus_df.iter_rows(named=True):
2794
2796
  consensus_data.append({
2795
2797
  'consensus_uid': row['consensus_uid'],
2796
2798
  'mz': row['mz'],
@@ -2873,7 +2875,7 @@ def _consensus_cleanup(self, rt_tol, mz_tol):
2873
2875
  if not merge_groups:
2874
2876
  return
2875
2877
 
2876
- self.logger.debug(f"Found {len(merge_groups)} over-segmented clusters to merge")
2878
+ study.logger.debug(f"Found {len(merge_groups)} over-segmented clusters to merge")
2877
2879
 
2878
2880
  # Merge clusters by keeping the most representative feature
2879
2881
  uids_to_remove = set()
@@ -2892,25 +2894,25 @@ def _consensus_cleanup(self, rt_tol, mz_tol):
2892
2894
 
2893
2895
  if uids_to_remove:
2894
2896
  # Remove merged features from consensus_df
2895
- self.consensus_df = self.consensus_df.filter(
2897
+ study.consensus_df = study.consensus_df.filter(
2896
2898
  ~pl.col('consensus_uid').is_in(list(uids_to_remove))
2897
2899
  )
2898
2900
 
2899
2901
  # Also update consensus_mapping_df if it exists
2900
- if hasattr(self, 'consensus_mapping_df') and not self.consensus_mapping_df.is_empty():
2901
- self.consensus_mapping_df = self.consensus_mapping_df.filter(
2902
+ if hasattr(study, 'consensus_mapping_df') and not study.consensus_mapping_df.is_empty():
2903
+ study.consensus_mapping_df = study.consensus_mapping_df.filter(
2902
2904
  ~pl.col('consensus_uid').is_in(list(uids_to_remove))
2903
2905
  )
2904
2906
 
2905
- final_count = len(self.consensus_df)
2907
+ final_count = len(study.consensus_df)
2906
2908
  reduction = initial_count - final_count
2907
2909
  reduction_pct = (reduction / initial_count) * 100
2908
2910
 
2909
2911
  if reduction > 0:
2910
- self.logger.debug(f"Enhanced post-clustering: {initial_count} → {final_count} features ({reduction_pct:.1f}% reduction)")
2912
+ study.logger.debug(f"Enhanced post-clustering: {initial_count} → {final_count} features ({reduction_pct:.1f}% reduction)")
2911
2913
 
2912
2914
  # Step 2: Deisotoping - Remove +1 and +2 isotopic consensus features
2913
- pre_deisotoping_count = len(self.consensus_df)
2915
+ pre_deisotoping_count = len(study.consensus_df)
2914
2916
  isotope_uids_to_remove = set()
2915
2917
 
2916
2918
  # Use strict tolerances for deisotoping (same as declustering)
@@ -2919,7 +2921,7 @@ def _consensus_cleanup(self, rt_tol, mz_tol):
2919
2921
 
2920
2922
  # Get current consensus data for isotope detection
2921
2923
  current_consensus_data = []
2922
- for row in self.consensus_df.iter_rows(named=True):
2924
+ for row in study.consensus_df.iter_rows(named=True):
2923
2925
  current_consensus_data.append({
2924
2926
  'consensus_uid': row['consensus_uid'],
2925
2927
  'mz': row['mz'],
@@ -2970,31 +2972,31 @@ def _consensus_cleanup(self, rt_tol, mz_tol):
2970
2972
 
2971
2973
  # Remove isotopic features
2972
2974
  if isotope_uids_to_remove:
2973
- self.consensus_df = self.consensus_df.filter(
2975
+ study.consensus_df = study.consensus_df.filter(
2974
2976
  ~pl.col('consensus_uid').is_in(list(isotope_uids_to_remove))
2975
2977
  )
2976
2978
 
2977
2979
  # Also update consensus_mapping_df if it exists
2978
- if hasattr(self, 'consensus_mapping_df') and not self.consensus_mapping_df.is_empty():
2979
- self.consensus_mapping_df = self.consensus_mapping_df.filter(
2980
+ if hasattr(study, 'consensus_mapping_df') and not study.consensus_mapping_df.is_empty():
2981
+ study.consensus_mapping_df = study.consensus_mapping_df.filter(
2980
2982
  ~pl.col('consensus_uid').is_in(list(isotope_uids_to_remove))
2981
2983
  )
2982
2984
 
2983
- post_deisotoping_count = len(self.consensus_df)
2985
+ post_deisotoping_count = len(study.consensus_df)
2984
2986
  isotope_reduction = pre_deisotoping_count - post_deisotoping_count
2985
2987
 
2986
2988
  if isotope_reduction > 0:
2987
- self.logger.debug(f"Deisotoping: {pre_deisotoping_count} → {post_deisotoping_count} features ({isotope_reduction} isotopic features removed)")
2989
+ study.logger.debug(f"Deisotoping: {pre_deisotoping_count} → {post_deisotoping_count} features ({isotope_reduction} isotopic features removed)")
2988
2990
 
2989
2991
  # Final summary
2990
- final_count = len(self.consensus_df)
2992
+ final_count = len(study.consensus_df)
2991
2993
  total_reduction = initial_count - final_count
2992
2994
  if total_reduction > 0:
2993
2995
  total_reduction_pct = (total_reduction / initial_count) * 100
2994
- self.logger.debug(f"Consensus cleanup complete: {initial_count} → {final_count} features ({total_reduction_pct:.1f}% total reduction)")
2996
+ study.logger.debug(f"Consensus cleanup complete: {initial_count} → {final_count} features ({total_reduction_pct:.1f}% total reduction)")
2995
2997
 
2996
2998
 
2997
- def _identify_adduct_by_mass_shift(self, rt_tol, cached_adducts_df=None):
2999
+ def _identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
2998
3000
  """
2999
3001
  Identify coeluting consensus features by characteristic mass shifts between adducts
3000
3002
  and update their adduct information accordingly.
@@ -3014,23 +3016,24 @@ def _identify_adduct_by_mass_shift(self, rt_tol, cached_adducts_df=None):
3014
3016
  from collections import defaultdict
3015
3017
 
3016
3018
  # Check if consensus_df exists and has features
3017
- if len(self.consensus_df) == 0:
3018
- self.logger.debug("No consensus features for adduct identification by mass shift")
3019
+ if len(study.consensus_df) == 0:
3020
+ study.logger.debug("No consensus features for adduct identification by mass shift")
3019
3021
  return
3020
3022
 
3021
- self.logger.info(f"Identifying coeluting adducts by mass shifts in {len(self.consensus_df)} consensus features...")
3023
+ study.logger.info(f"Identifying coeluting adducts by mass shifts in {len(study.consensus_df)} consensus features...")
3022
3024
 
3023
3025
  # Get adducts DataFrame if not provided
3024
3026
  if cached_adducts_df is None or cached_adducts_df.is_empty():
3025
3027
  try:
3026
3028
  # Use lower min_probability for better adduct coverage in mass shift identification
3027
- cached_adducts_df = self._get_adducts(min_probability=0.01)
3029
+ from masster.study.id import _get_adducts
3030
+ cached_adducts_df = _get_adducts(study, min_probability=0.01)
3028
3031
  except Exception as e:
3029
- self.logger.warning(f"Could not retrieve adducts for mass shift identification: {e}")
3032
+ study.logger.warning(f"Could not retrieve adducts for mass shift identification: {e}")
3030
3033
  return
3031
3034
 
3032
3035
  if cached_adducts_df.is_empty():
3033
- self.logger.debug("No adducts available for mass shift identification")
3036
+ study.logger.debug("No adducts available for mass shift identification")
3034
3037
  return
3035
3038
 
3036
3039
  # Build catalogue of mass shifts between adducts
@@ -3081,11 +3084,11 @@ def _identify_adduct_by_mass_shift(self, rt_tol, cached_adducts_df=None):
3081
3084
  "to_charge": charge2
3082
3085
  })
3083
3086
 
3084
- self.logger.debug(f"Generated mass shift catalog with {len(mass_shift_catalog)} unique shifts")
3087
+ study.logger.debug(f"Generated mass shift catalog with {len(mass_shift_catalog)} unique shifts")
3085
3088
 
3086
3089
  # Get consensus features data
3087
3090
  consensus_data = []
3088
- for i, row in enumerate(self.consensus_df.iter_rows(named=True)):
3091
+ for i, row in enumerate(study.consensus_df.iter_rows(named=True)):
3089
3092
  consensus_data.append({
3090
3093
  "index": i,
3091
3094
  "consensus_uid": row["consensus_uid"],
@@ -3234,7 +3237,7 @@ def _identify_adduct_by_mass_shift(self, rt_tol, cached_adducts_df=None):
3234
3237
  }
3235
3238
 
3236
3239
  updated_count += 2
3237
- self.logger.debug(
3240
+ study.logger.debug(
3238
3241
  f"Identified adduct pair: {from_adduct_name} (m/z {from_feature['mz']:.4f}) "
3239
3242
  f"<-> {to_adduct_name} (m/z {to_feature['mz']:.4f}), "
3240
3243
  f"RT {rt1:.2f}s, Δm/z {mz_diff:.4f}"
@@ -3244,7 +3247,7 @@ def _identify_adduct_by_mass_shift(self, rt_tol, cached_adducts_df=None):
3244
3247
  # Apply updates to consensus_df
3245
3248
  if adduct_updates:
3246
3249
  # Prepare update data
3247
- consensus_uids = self.consensus_df["consensus_uid"].to_list()
3250
+ consensus_uids = study.consensus_df["consensus_uid"].to_list()
3248
3251
 
3249
3252
  new_adduct_top = []
3250
3253
  new_adduct_charge_top = []
@@ -3261,88 +3264,88 @@ def _identify_adduct_by_mass_shift(self, rt_tol, cached_adducts_df=None):
3261
3264
  else:
3262
3265
  # Keep existing values
3263
3266
  row_idx = consensus_uids.index(uid)
3264
- row = self.consensus_df.row(row_idx, named=True)
3267
+ row = study.consensus_df.row(row_idx, named=True)
3265
3268
  new_adduct_top.append(row.get("adduct_top"))
3266
3269
  new_adduct_charge_top.append(row.get("adduct_charge_top"))
3267
3270
  new_adduct_mass_neutral_top.append(row.get("adduct_mass_neutral_top"))
3268
3271
  new_adduct_mass_shift_top.append(row.get("adduct_mass_shift_top"))
3269
3272
 
3270
3273
  # Update the DataFrame
3271
- self.consensus_df = self.consensus_df.with_columns([
3274
+ study.consensus_df = study.consensus_df.with_columns([
3272
3275
  pl.Series("adduct_top", new_adduct_top),
3273
3276
  pl.Series("adduct_charge_top", new_adduct_charge_top),
3274
3277
  pl.Series("adduct_mass_neutral_top", new_adduct_mass_neutral_top),
3275
3278
  pl.Series("adduct_mass_shift_top", new_adduct_mass_shift_top)
3276
3279
  ])
3277
3280
 
3278
- self.logger.info(f"Updated adduct assignments for {updated_count} consensus features based on mass shifts")
3281
+ study.logger.info(f"Updated adduct assignments for {updated_count} consensus features based on mass shifts")
3279
3282
  else:
3280
- self.logger.debug("No consensus features updated based on mass shift analysis")
3283
+ study.logger.debug("No consensus features updated based on mass shift analysis")
3281
3284
 
3282
3285
 
3283
- def _finalize_merge(self, link_ms2, min_samples):
3286
+ def _finalize_merge(study, link_ms2, min_samples):
3284
3287
  """Complete the merge process with final calculations and cleanup."""
3285
3288
  import polars as pl
3286
3289
 
3287
3290
  # Check if consensus_df is empty or missing required columns
3288
- if len(self.consensus_df) == 0 or "number_samples" not in self.consensus_df.columns:
3289
- self.logger.debug("No consensus features found or consensus_df is empty. Skipping finalize merge.")
3291
+ if len(study.consensus_df) == 0 or "number_samples" not in study.consensus_df.columns:
3292
+ study.logger.debug("No consensus features found or consensus_df is empty. Skipping finalize merge.")
3290
3293
  return
3291
3294
 
3292
3295
  # Validate min_samples parameter
3293
3296
  if min_samples is None:
3294
3297
  min_samples = 1
3295
3298
  if min_samples < 1:
3296
- min_samples = int(min_samples * len(self.samples_df))
3299
+ min_samples = int(min_samples * len(study.samples_df))
3297
3300
 
3298
3301
  # Validate that min_samples doesn't exceed the number of samples
3299
- if min_samples > len(self.samples_df):
3300
- self.logger.warning(
3301
- f"min_samples ({min_samples}) exceeds the number of samples ({len(self.samples_df)}). "
3302
- f"Setting min_samples to {len(self.samples_df)}.",
3302
+ if min_samples > len(study.samples_df):
3303
+ study.logger.warning(
3304
+ f"min_samples ({min_samples}) exceeds the number of samples ({len(study.samples_df)}). "
3305
+ f"Setting min_samples to {len(study.samples_df)}.",
3303
3306
  )
3304
- min_samples = len(self.samples_df)
3307
+ min_samples = len(study.samples_df)
3305
3308
 
3306
3309
  # Filter out consensus features with less than min_samples features
3307
- l1 = len(self.consensus_df)
3308
- self.consensus_df = self.consensus_df.filter(
3310
+ l1 = len(study.consensus_df)
3311
+ study.consensus_df = study.consensus_df.filter(
3309
3312
  pl.col("number_samples") >= min_samples,
3310
3313
  )
3311
- self.logger.debug(
3312
- f"Filtered {l1 - len(self.consensus_df)} consensus features with less than {min_samples} samples.",
3314
+ study.logger.debug(
3315
+ f"Filtered {l1 - len(study.consensus_df)} consensus features with less than {min_samples} samples.",
3313
3316
  )
3314
3317
 
3315
3318
  # Filter out consensus mapping with less than min_samples features
3316
- self.consensus_mapping_df = self.consensus_mapping_df.filter(
3317
- pl.col("consensus_uid").is_in(self.consensus_df["consensus_uid"].to_list()),
3319
+ study.consensus_mapping_df = study.consensus_mapping_df.filter(
3320
+ pl.col("consensus_uid").is_in(study.consensus_df["consensus_uid"].to_list()),
3318
3321
  )
3319
3322
 
3320
3323
  # Calculate the completeness of the consensus map
3321
3324
  # Log completion with tight cluster metrics
3322
- if len(self.consensus_df) > 0 and len(self.samples_df) > 0:
3325
+ if len(study.consensus_df) > 0 and len(study.samples_df) > 0:
3323
3326
  c = (
3324
- len(self.consensus_mapping_df)
3325
- / len(self.consensus_df)
3326
- / len(self.samples_df)
3327
+ len(study.consensus_mapping_df)
3328
+ / len(study.consensus_df)
3329
+ / len(study.samples_df)
3327
3330
  )
3328
3331
 
3329
3332
  # Count tight clusters with specified thresholds
3330
- tight_clusters = _count_tight_clusters(self,mz_tol=0.04, rt_tol=0.3)
3333
+ tight_clusters = _count_tight_clusters(study, mz_tol=0.04, rt_tol=0.3)
3331
3334
 
3332
- self.logger.info(
3333
- f"Merging completed. Consensus features: {len(self.consensus_df)}. "
3335
+ study.logger.info(
3336
+ f"Merging completed. Consensus features: {len(study.consensus_df)}. "
3334
3337
  f"Completeness: {c:.2f}. Tight clusters left: {tight_clusters}.",
3335
3338
  )
3336
3339
  else:
3337
- self.logger.warning(
3338
- f"Merging completed with empty result. Consensus features: {len(self.consensus_df)}. "
3340
+ study.logger.warning(
3341
+ f"Merging completed with empty result. Consensus features: {len(study.consensus_df)}. "
3339
3342
  f"This may be due to min_samples ({min_samples}) being too high for the available data.",
3340
3343
  )
3341
3344
 
3342
3345
  # add iso data from raw files.
3343
- self.find_iso()
3346
+ study.find_iso()
3344
3347
  if link_ms2:
3345
- self.find_ms2()
3348
+ study.find_ms2()
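
The completeness figure logged by `_finalize_merge` is the fraction of possible (consensus feature, sample) pairs that actually received a mapped feature. Worked through with made-up counts:

```python
n_mapping_rows = 900   # rows in consensus_mapping_df (illustrative)
n_consensus = 250      # retained consensus features
n_samples = 4          # samples in the study

completeness = n_mapping_rows / n_consensus / n_samples
print(f"Completeness: {completeness:.2f}")  # Completeness: 0.90
```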
3346
3349
 
3347
3350
 
3348
3351
  def _optimized_feature_lookup(study_obj, features_df):
@@ -3419,24 +3422,24 @@ def _optimized_adduct_grouping(study_obj, consensus_data, rt_tol, mz_tol):
3419
3422
 
3420
3423
  # Union-Find for efficient grouping
3421
3424
  class UnionFind:
3422
- def __init__(self, n):
3423
- self.parent = list(range(n))
3424
- self.rank = [0] * n
3425
+ def __init__(study, n):
3426
+ study.parent = list(range(n))
3427
+ study.rank = [0] * n
3425
3428
 
3426
- def find(self, x):
3427
- if self.parent[x] != x:
3428
- self.parent[x] = self.find(self.parent[x])
3429
- return self.parent[x]
3429
+ def find(study, x):
3430
+ if study.parent[x] != x:
3431
+ study.parent[x] = study.find(study.parent[x])
3432
+ return study.parent[x]
3430
3433
 
3431
- def union(self, x, y):
3432
- px, py = self.find(x), self.find(y)
3434
+ def union(study, x, y):
3435
+ px, py = study.find(x), study.find(y)
3433
3436
  if px == py:
3434
3437
  return
3435
- if self.rank[px] < self.rank[py]:
3438
+ if study.rank[px] < study.rank[py]:
3436
3439
  px, py = py, px
3437
- self.parent[py] = px
3438
- if self.rank[px] == self.rank[py]:
3439
- self.rank[px] += 1
3440
+ study.parent[py] = px
3441
+ if study.rank[px] == study.rank[py]:
3442
+ study.rank[px] += 1
3440
3443
 
3441
3444
  uid_to_idx = {feature[0]: i for i, feature in enumerate(valid_features)}
3442
3445
  uf = UnionFind(len(valid_features))
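
A side note on the `self` → `study` renames visible throughout this file, including inside the nested `UnionFind` helpers: Python binds the instance to a method's first positional parameter regardless of its name, so the rename is legal, if unconventional. A toy demonstration (not package code):

```python
class Counter:
    def __init__(study, start=0):   # 'study' plays the role normally named 'self'
        study.value = start

    def bump(study):
        study.value += 1
        return study.value

c = Counter()
print(c.bump())  # 1
```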