masster 0.4.22__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



masster/study/merge.py CHANGED
@@ -274,7 +274,7 @@ def _serialize_feature_map(feature_map):
274
274
  return features_data
275
275
 
276
276
 
277
- def merge(self, **kwargs) -> None:
277
+ def merge(study, **kwargs) -> None:
278
278
  """
279
279
  Group features across samples into consensus features using various algorithms.
280
280
 
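The hunk above converts `merge` from a bound method (`self`) into a module-level function that takes the `Study` object explicitly. A minimal usage sketch under that reading — `study` is assumed to be an already-populated Study instance, the keyword names come from later hunks in this diff, and the values are illustrative only:

```python
# Sketch only: assumes a Study object named `study` and that `merge` is
# importable from the module changed in this diff.
from masster.study.merge import merge

merge(study, method="kd_chunked", min_samples=3, rt_tol=2.0, mz_tol=0.01)
```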
@@ -342,7 +342,7 @@ def merge(self, **kwargs) -> None:
342
342
  if key in valid_params:
343
343
  setattr(params, key, value)
344
344
  else:
345
- self.logger.warning(f"Unknown parameter '{key}' ignored")
345
+ study.logger.warning(f"Unknown parameter '{key}' ignored")
346
346
 
347
347
  # Backward compatibility: Map old method names to new names
348
348
  method_mapping = {
@@ -362,18 +362,18 @@ def merge(self, **kwargs) -> None:
362
362
  if params.method in method_mapping:
363
363
  old_method = params.method
364
364
  params.method = method_mapping[old_method]
365
- self.logger.info(f"Method '{old_method}' is deprecated. Using '{params.method}' instead.")
365
+ study.logger.info(f"Method '{old_method}' is deprecated. Using '{params.method}' instead.")
366
366
 
367
367
  # Validate method
368
368
  if params.method not in ['sensitivity', 'qt', 'nowarp', 'kd_chunked', 'qt_chunked', 'quality']:
369
369
  raise ValueError(f"Invalid method '{params.method}'. Must be one of: ['sensitivity', 'qt', 'nowarp', 'kd_chunked', 'qt_chunked', 'quality']")
370
370
 
371
371
  # Check if chunked method is advisable for large datasets
372
- num_samples = len(self.samples_df) if hasattr(self, 'samples_df') and self.samples_df is not None else 0
372
+ num_samples = len(study.samples_df) if hasattr(study, 'samples_df') and study.samples_df is not None else 0
373
373
  if num_samples > 500:
374
374
  chunked_methods = {'kd_chunked', 'qt_chunked'}
375
375
  if params.method not in chunked_methods:
376
- self.logger.warning(
376
+ study.logger.warning(
377
377
  f"Large dataset detected ({num_samples} samples > 500). "
378
378
  f"For better performance and memory efficiency, consider using a chunked method: "
379
379
  f"'kd_chunked' or 'qt_chunked' instead of '{params.method}'"
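The deprecation mapping and validation above reduce to a small resolver. The sketch below is illustrative only — `resolve_method` is a hypothetical helper, and the contents of `method_mapping` are not visible in this hunk:

```python
VALID_METHODS = ['sensitivity', 'qt', 'nowarp', 'kd_chunked', 'qt_chunked', 'quality']

def resolve_method(name: str, method_mapping: dict) -> str:
    # Hypothetical helper mirroring the logic above: translate deprecated
    # aliases first, then validate against the accepted method names.
    name = method_mapping.get(name, name)
    if name not in VALID_METHODS:
        raise ValueError(f"Invalid method '{name}'. Must be one of: {VALID_METHODS}")
    return name
```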
@@ -381,42 +381,43 @@ def merge(self, **kwargs) -> None:
381
381
 
382
382
  # Persist last used params for diagnostics
383
383
  try:
384
- self._merge_params_last = params.to_dict()
384
+ study._merge_params_last = params.to_dict()
385
385
  except Exception:
386
- self._merge_params_last = {}
386
+ study._merge_params_last = {}
387
387
 
388
388
  # Store merge parameters in history
389
389
  try:
390
- if hasattr(self, 'store_history'):
391
- self.store_history(['merge'], params.to_dict())
390
+ if hasattr(study, 'store_history'):
391
+ study.update_history(['merge'], params.to_dict())
392
392
  else:
393
- self.logger.warning("History storage not available - parameters not saved to history")
393
+ study.logger.warning("History storage not available - parameters not saved to history")
394
394
  except Exception as e:
395
- self.logger.warning(f"Failed to store merge parameters in history: {e}")
395
+ study.logger.warning(f"Failed to store merge parameters in history: {e}")
396
396
 
397
397
  # Ensure feature maps are available for merging (regenerate if needed)
398
- if len(self.features_maps) < len(self.samples_df):
399
- self.features_maps = []
398
+ if len(study.features_maps) < len(study.samples_df):
399
+ study.features_maps = []
400
400
  # Feature maps will be generated on-demand within each merge method
401
401
 
402
- self.logger.info(
402
+ study.logger.info(
403
403
  f"Merge: {params.method}, samples={params.min_samples}, rt_tol={params.rt_tol}s, mz_tol={params.mz_tol}Da"
404
404
  )
405
405
 
406
406
  # Initialize
407
- self._reset_consensus_data()
407
+ _reset_consensus_data(study)
408
408
 
409
409
  # Cache adducts for performance (avoid repeated _get_adducts() calls)
410
410
  cached_adducts_df = None
411
411
  cached_valid_adducts = None
412
412
  try:
413
- cached_adducts_df = self._get_adducts()
413
+ from masster.study.id import _get_adducts
414
+ cached_adducts_df = _get_adducts(study)
414
415
  if not cached_adducts_df.is_empty():
415
416
  cached_valid_adducts = set(cached_adducts_df["name"].to_list())
416
417
  else:
417
418
  cached_valid_adducts = set()
418
419
  except Exception as e:
419
- self.logger.warning(f"Could not retrieve study adducts: {e}")
420
+ study.logger.warning(f"Could not retrieve study adducts: {e}")
420
421
  cached_valid_adducts = set()
421
422
 
422
423
  # Always allow '?' adducts
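The adduct handling above fetches the adduct table once and reuses it for the rest of the merge. A condensed sketch of that caching pattern (polars is assumed as the DataFrame library, matching the `pl.` usage elsewhere in this file; the helper name is illustrative):

```python
import polars as pl

def cache_valid_adducts(adducts_df: pl.DataFrame) -> set:
    # Keep a set of adduct names for fast membership checks; '?' (unknown)
    # is always permitted, as noted in the comment above.
    valid = set() if adducts_df.is_empty() else set(adducts_df["name"].to_list())
    valid.add("?")
    return valid
```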
@@ -424,58 +425,58 @@ def merge(self, **kwargs) -> None:
424
425
 
425
426
  # Route to algorithm implementation
426
427
  if params.method == 'sensitivity':
427
- consensus_map = _merge_kd(self, params)
428
+ consensus_map = _merge_kd(study, params)
428
429
  # Extract consensus features
429
- self._extract_consensus_features(consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
430
+ _extract_consensus_features(study, consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
430
431
  elif params.method == 'qt':
431
- consensus_map = _merge_qt(self, params)
432
+ consensus_map = _merge_qt(study, params)
432
433
  # Extract consensus features
433
- self._extract_consensus_features(consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
434
+ _extract_consensus_features(study, consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
434
435
  elif params.method == 'nowarp':
435
- consensus_map = _merge_kd_nowarp(self, params)
436
+ consensus_map = _merge_kd_nowarp(study, params)
436
437
  # Extract consensus features
437
- self._extract_consensus_features(consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
438
+ _extract_consensus_features(study, consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
438
439
  elif params.method == 'quality':
439
- consensus_map = _merge_kd_strict(self, params)
440
+ consensus_map = _merge_kd_strict(study, params)
440
441
  # Note: _merge_kd_strict handles both consensus_df and consensus_mapping_df directly
441
442
  elif params.method == 'kd_chunked':
442
- consensus_map = _merge_kd_chunked(self, params, cached_adducts_df, cached_valid_adducts)
443
+ consensus_map = _merge_kd_chunked(study, params, cached_adducts_df, cached_valid_adducts)
443
444
  # Note: _merge_kd_chunked populates consensus_df directly, no need to extract
444
445
  elif params.method == 'qt_chunked':
445
- consensus_map = _merge_qt_chunked(self, params, cached_adducts_df, cached_valid_adducts)
446
+ consensus_map = _merge_qt_chunked(study, params, cached_adducts_df, cached_valid_adducts)
446
447
  # Note: _merge_qt_chunked populates consensus_df directly, no need to extract
447
448
 
448
449
  # Enhanced post-clustering to merge over-segmented features (for qt and kd methods)
449
450
  if params.method in ['qt', 'sensitivity', 'qt_chunked', 'kd_chunked', 'quality']:
450
- self._consensus_cleanup(params.rt_tol, params.mz_tol)
451
+ _consensus_cleanup(study, params.rt_tol, params.mz_tol)
451
452
 
452
453
  # Perform adduct grouping
453
- self._perform_adduct_grouping(params.rt_tol, params.mz_tol)
454
+ _perform_adduct_grouping(study, params.rt_tol, params.mz_tol)
454
455
 
455
456
  # Identify coeluting consensus features by mass shifts and update adduct information
456
- self._identify_adduct_by_mass_shift(params.rt_tol, cached_adducts_df)
457
+ _identify_adduct_by_mass_shift(study, params.rt_tol, cached_adducts_df)
457
458
 
458
459
  # Link MS2 if requested
459
460
  if params.link_ms2:
460
- self._finalize_merge(params.link_ms2, params.min_samples)
461
+ _finalize_merge(study, params.link_ms2, params.min_samples)
461
462
 
462
463
  # Log completion without the misleading feature count
463
464
  elapsed = time.time() - start_time
464
- self.logger.debug(f"Merge process completed in {elapsed:.1f}s")
465
+ study.logger.debug(f"Merge process completed in {elapsed:.1f}s")
465
466
 
466
467
 
467
- def _merge_kd(self, params: merge_defaults) -> oms.ConsensusMap:
468
+ def _merge_kd(study, params: merge_defaults) -> oms.ConsensusMap:
468
469
  """KD-tree based merge (fast, recommended)"""
469
470
 
470
471
  # Generate temporary feature maps on-demand from features_df
471
- temp_feature_maps = _generate_feature_maps_on_demand(self)
472
+ temp_feature_maps = _generate_feature_maps_on_demand(study)
472
473
 
473
474
  consensus_map = oms.ConsensusMap()
474
475
  file_descriptions = consensus_map.getColumnHeaders()
475
476
 
476
477
  for i, feature_map in enumerate(temp_feature_maps):
477
478
  file_description = file_descriptions.get(i, oms.ColumnHeader())
478
- file_description.filename = self.samples_df.row(i, named=True)["sample_name"]
479
+ file_description.filename = study.samples_df.row(i, named=True)["sample_name"]
479
480
  file_description.size = feature_map.size()
480
481
  file_description.unique_id = feature_map.getUniqueId()
481
482
  file_descriptions[i] = file_description
@@ -504,13 +505,99 @@ def _merge_kd(self, params: merge_defaults) -> oms.ConsensusMap:
504
505
  return consensus_map
505
506
 
506
507
 
508
+ def _generate_feature_maps_from_samples(study):
509
+ """
510
+ Generate feature maps using Study-level features_df instead of Sample-level loading.
511
+ This uses the study's existing features_df which is already loaded.
512
+
513
+ Args:
514
+ study: Study object containing features_df
515
+
516
+ Returns:
517
+ list: List of temporary FeatureMap objects built from Study-level data
518
+ """
519
+ import pyopenms as oms
520
+
521
+ temp_feature_maps = []
522
+
523
+ study.logger.info(f"Building feature maps using Study-level features_df from {len(study.samples_df)} samples")
524
+
525
+ # Use the features_df from the study that's already loaded
526
+ if not hasattr(study, 'features_df') or study.features_df is None or study.features_df.is_empty():
527
+ study.logger.warning("No features_df available - features must be loaded first")
528
+ return temp_feature_maps
529
+
530
+ # Group features by sample
531
+ study.logger.info(f"Processing {len(study.features_df)} features grouped by sample")
532
+
533
+ # Get unique sample names/indices
534
+ if 'sample_uid' in study.features_df.columns:
535
+ sample_groups = study.features_df.group_by('sample_uid')
536
+ study.logger.debug("Grouping features by 'sample_uid' column")
537
+ elif 'sample_id' in study.features_df.columns:
538
+ sample_groups = study.features_df.group_by('sample_id')
539
+ study.logger.debug("Grouping features by 'sample_id' column")
540
+ elif 'sample' in study.features_df.columns:
541
+ sample_groups = study.features_df.group_by('sample')
542
+ study.logger.debug("Grouping features by 'sample' column")
543
+ else:
544
+ study.logger.warning("No sample grouping column found in features_df")
545
+ study.logger.info(f"Available columns: {study.features_df.columns}")
546
+ return temp_feature_maps
547
+
548
+ # Process each sample group
549
+ processed_samples = 0
550
+ for sample_key, sample_features in sample_groups:
551
+ try:
552
+ feature_map = oms.FeatureMap()
553
+ feature_count = 0
554
+
555
+ # Build features from this sample's features
556
+ for row in sample_features.iter_rows(named=True):
557
+ try:
558
+ feature = oms.Feature()
559
+
560
+ # Set feature properties
561
+ if row.get("feature_id") is not None:
562
+ feature.setUniqueId(int(row["feature_id"]))
563
+ if row.get("mz") is not None:
564
+ feature.setMZ(float(row["mz"]))
565
+ if row.get("rt") is not None:
566
+ feature.setRT(float(row["rt"]))
567
+ if row.get("inty") is not None:
568
+ feature.setIntensity(float(row["inty"]))
569
+ if row.get("quality") is not None:
570
+ feature.setOverallQuality(float(row["quality"]))
571
+ if row.get("charge") is not None:
572
+ feature.setCharge(int(row["charge"]))
573
+
574
+ feature_map.push_back(feature)
575
+ feature_count += 1
576
+
577
+ except (ValueError, TypeError) as e:
578
+ study.logger.warning(f"Skipping feature in sample {sample_key} due to conversion error: {e}")
579
+ continue
580
+
581
+ temp_feature_maps.append(feature_map)
582
+ processed_samples += 1
583
+ study.logger.debug(f"Built feature map for sample {sample_key} with {feature_count} features")
584
+
585
+ except Exception as e:
586
+ study.logger.warning(f"Failed to process sample group {sample_key}: {e}")
587
+ # Add empty feature map for failed samples to maintain sample order
588
+ temp_feature_maps.append(oms.FeatureMap())
589
+
590
+ study.logger.info(f"Generated {len(temp_feature_maps)} feature maps from {processed_samples} samples using Study-level features_df")
591
+ return temp_feature_maps
592
+
593
+
507
594
  def _generate_feature_maps_on_demand(study):
508
595
  """
509
- Generate feature maps on-demand from study.features_df for merge operations.
596
+ Generate feature maps on-demand using Sample-level _load_ms1() for merge operations.
510
597
  Returns temporary feature maps that are not cached in the study.
511
598
 
512
599
  Args:
513
- study: Study object containing features_df and samples_df
600
+ study: Study object containing samples
514
601
 
515
602
  Returns:
516
603
  list: List of temporary FeatureMap objects
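The new `_generate_feature_maps_from_samples` helper above iterates a polars `group_by`; for readers unfamiliar with that API, iterating a group-by yields `(key, sub_frame)` pairs (on recent polars versions the key is a tuple). A toy illustration, unrelated to masster's data model:

```python
import polars as pl

df = pl.DataFrame({
    "sample_uid": ["s1", "s1", "s2"],
    "rt": [10.0, 12.0, 10.1],
    "mz": [200.1, 300.2, 200.1],
})

for sample_key, sample_features in df.group_by("sample_uid"):
    # sample_key identifies the group (a tuple on newer polars versions);
    # sample_features is the sub-DataFrame holding that sample's rows.
    print(sample_key, sample_features.height)
```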
@@ -519,6 +606,15 @@ def _generate_feature_maps_on_demand(study):
519
606
  import pyopenms as oms
520
607
  import numpy as np
521
608
 
609
+ # Check if we should use Sample-level loading instead of features_df
610
+ use_sample_loading = True # Default to Sample-level loading as requested
611
+
612
+ # Use Sample-level loading if requested and samples_df is available
613
+ if use_sample_loading and hasattr(study, 'samples_df') and study.samples_df is not None and len(study.samples_df) > 0:
614
+ study.logger.debug("Building feature maps using Sample-level _load_ms1() instead of features_df")
615
+ return _generate_feature_maps_from_samples(study)
616
+
617
+ # Fallback to original features_df approach
522
618
  if study.features_df is None or len(study.features_df) == 0:
523
619
  study.logger.error("No features_df available for generating feature maps")
524
620
  return []
@@ -624,22 +720,22 @@ def _generate_feature_maps_on_demand(study):
624
720
  return temp_feature_maps
625
721
 
626
722
 
627
- def _merge_qt(self, params: merge_defaults) -> oms.ConsensusMap:
723
+ def _merge_qt(study, params: merge_defaults) -> oms.ConsensusMap:
628
724
  """QT (Quality Threshold) based merge"""
629
725
 
630
726
  # Generate temporary feature maps on-demand from features_df
631
- temp_feature_maps = _generate_feature_maps_on_demand(self)
727
+ temp_feature_maps = _generate_feature_maps_on_demand(study)
632
728
 
633
729
  n_samples = len(temp_feature_maps)
634
730
  if n_samples > 1000:
635
- self.logger.warning(f"QT with {n_samples} samples may be slow [O(n²)]. Consider KD [O(n log n)]")
731
+ study.logger.warning(f"QT with {n_samples} samples may be slow [O(n²)]. Consider KD [O(n log n)]")
636
732
 
637
733
  consensus_map = oms.ConsensusMap()
638
734
  file_descriptions = consensus_map.getColumnHeaders()
639
735
 
640
736
  for i, feature_map in enumerate(temp_feature_maps):
641
737
  file_description = file_descriptions.get(i, oms.ColumnHeader())
642
- file_description.filename = self.samples_df.row(i, named=True)["sample_name"]
738
+ file_description.filename = study.samples_df.row(i, named=True)["sample_name"]
643
739
  file_description.size = feature_map.size()
644
740
  file_description.unique_id = feature_map.getUniqueId()
645
741
  file_descriptions[i] = file_description
@@ -665,7 +761,7 @@ def _merge_qt(self, params: merge_defaults) -> oms.ConsensusMap:
665
761
  return consensus_map
666
762
 
667
763
 
668
- def _merge_kd_strict(self, params: merge_defaults) -> oms.ConsensusMap:
764
+ def _merge_kd_strict(study, params: merge_defaults) -> oms.ConsensusMap:
669
765
  """
670
766
  Quality merge: Standard KD algorithm with post-processing quality control.
671
767
 
@@ -695,8 +791,8 @@ def _merge_kd_strict(self, params: merge_defaults) -> oms.ConsensusMap:
695
791
 
696
792
  if optimize_rt_tol:
697
793
  # Optimize RT tolerance first
698
- optimal_rt_tol = _optimize_rt_tolerance(self, params)
699
- self.logger.info(f"RT tolerance optimization: {params.rt_tol}s → {optimal_rt_tol}s")
794
+ optimal_rt_tol = _optimize_rt_tolerance(study, params)
795
+ study.logger.info(f"RT tolerance optimization: {params.rt_tol}s → {optimal_rt_tol}s")
700
796
  # Create modified params with optimal RT tolerance
701
797
  import copy
702
798
  optimized_params = copy.deepcopy(params)
@@ -705,22 +801,22 @@ def _merge_kd_strict(self, params: merge_defaults) -> oms.ConsensusMap:
705
801
  optimized_params = params
706
802
 
707
803
  # Phase 1: Standard KD clustering
708
- self.logger.debug("Initial KD clustering")
709
- consensus_map = _merge_kd(self, optimized_params)
804
+ study.logger.debug("Initial KD clustering")
805
+ consensus_map = _merge_kd(study, optimized_params)
710
806
 
711
807
  # Phase 2: Post-processing quality control
712
- self.logger.debug("Post-processing quality control")
713
- consensus_map = _apply_kd_strict_postprocessing(self, consensus_map, optimized_params)
808
+ study.logger.debug("Post-processing quality control")
809
+ consensus_map = _apply_kd_strict_postprocessing(study, consensus_map, optimized_params)
714
810
 
715
811
  return consensus_map
716
812
 
717
813
 
718
- def _optimize_rt_tolerance(self, params: merge_defaults) -> float:
814
+ def _optimize_rt_tolerance(study, params: merge_defaults) -> float:
719
815
  """
720
816
  Optimize RT tolerance by testing different values and measuring oversegmentation.
721
817
 
722
818
  Args:
723
- self: Study object
819
+ study: Study object
724
820
  params: Merge parameters
725
821
 
726
822
  Returns:
@@ -729,7 +825,7 @@ def _optimize_rt_tolerance(self, params: merge_defaults) -> float:
729
825
  rt_tol_range = getattr(params, 'rt_tol_range', (0.8, 2.0))
730
826
  rt_tol_steps = getattr(params, 'rt_tol_steps', 5)
731
827
 
732
- self.logger.info(f"Optimizing RT tolerance in range {rt_tol_range} with {rt_tol_steps} steps")
828
+ study.logger.info(f"Optimizing RT tolerance in range {rt_tol_range} with {rt_tol_steps} steps")
733
829
 
734
830
  # Generate test values
735
831
  test_rt_tols = [rt_tol_range[0] + i * (rt_tol_range[1] - rt_tol_range[0]) / (rt_tol_steps - 1)
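With the defaults read just above (`rt_tol_range=(0.8, 2.0)`, `rt_tol_steps=5`), the test grid is evenly spaced from 0.8 s to 2.0 s; a quick check of the arithmetic:

```python
lo, hi, steps = 0.8, 2.0, 5
test_rt_tols = [round(lo + i * (hi - lo) / (steps - 1), 3) for i in range(steps)]
print(test_rt_tols)  # [0.8, 1.1, 1.4, 1.7, 2.0]
```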
@@ -739,8 +835,8 @@ def _optimize_rt_tolerance(self, params: merge_defaults) -> float:
739
835
  best_score = float('inf')
740
836
 
741
837
  # Store original features for restoration
742
- original_consensus_df = getattr(self, 'consensus_df', pl.DataFrame())
743
- original_consensus_mapping_df = getattr(self, 'consensus_mapping_df', pl.DataFrame())
838
+ original_consensus_df = getattr(study, 'consensus_df', pl.DataFrame())
839
+ original_consensus_mapping_df = getattr(study, 'consensus_mapping_df', pl.DataFrame())
744
840
 
745
841
  for test_rt_tol in test_rt_tols:
746
842
  try:
@@ -750,18 +846,18 @@ def _optimize_rt_tolerance(self, params: merge_defaults) -> float:
750
846
  test_params.rt_tol = test_rt_tol
751
847
 
752
848
  # Run KD merge with test parameters
753
- test_consensus_map = _merge_kd(self, test_params)
849
+ test_consensus_map = _merge_kd(study, test_params)
754
850
 
755
851
  # Extract consensus features temporarily for analysis
756
- self._extract_consensus_features(test_consensus_map, test_params.min_samples)
852
+ _extract_consensus_features(study, test_consensus_map, test_params.min_samples)
757
853
 
758
- if len(self.consensus_df) == 0:
854
+ if len(study.consensus_df) == 0:
759
855
  continue
760
856
 
761
857
  # Calculate oversegmentation metrics
762
- oversegmentation_score = _calculate_oversegmentation_score(self, test_rt_tol)
858
+ oversegmentation_score = _calculate_oversegmentation_score(study, test_rt_tol)
763
859
 
764
- self.logger.debug(f"RT tol {test_rt_tol:.1f}s: {len(self.consensus_df)} features, score: {oversegmentation_score:.3f}")
860
+ study.logger.debug(f"RT tol {test_rt_tol:.1f}s: {len(study.consensus_df)} features, score: {oversegmentation_score:.3f}")
765
861
 
766
862
  # Lower score is better (less oversegmentation)
767
863
  if oversegmentation_score < best_score:
@@ -769,50 +865,50 @@ def _optimize_rt_tolerance(self, params: merge_defaults) -> float:
769
865
  best_rt_tol = test_rt_tol
770
866
 
771
867
  except Exception as e:
772
- self.logger.warning(f"RT tolerance optimization failed for {test_rt_tol}s: {e}")
868
+ study.logger.warning(f"RT tolerance optimization failed for {test_rt_tol}s: {e}")
773
869
  continue
774
870
 
775
871
  # Restore original consensus data
776
- self.consensus_df = original_consensus_df
777
- self.consensus_mapping_df = original_consensus_mapping_df
872
+ study.consensus_df = original_consensus_df
873
+ study.consensus_mapping_df = original_consensus_mapping_df
778
874
 
779
- self.logger.info(f"Optimal RT tolerance: {best_rt_tol:.1f}s (score: {best_score:.3f})")
875
+ study.logger.info(f"Optimal RT tolerance: {best_rt_tol:.1f}s (score: {best_score:.3f})")
780
876
  return best_rt_tol
781
877
 
782
878
 
783
- def _calculate_oversegmentation_score(self, rt_tol: float) -> float:
879
+ def _calculate_oversegmentation_score(study, rt_tol: float) -> float:
784
880
  """
785
881
  Calculate oversegmentation score based on feature density and RT spread metrics.
786
882
  Lower scores indicate less oversegmentation.
787
883
 
788
884
  Args:
789
- self: Study object
885
+ study: Study object
790
886
  rt_tol: RT tolerance used
791
887
 
792
888
  Returns:
793
889
  Oversegmentation score (lower = better)
794
890
  """
795
- if len(self.consensus_df) == 0:
891
+ if len(study.consensus_df) == 0:
796
892
  return float('inf')
797
893
 
798
894
  # Metric 1: Feature density (features per RT second)
799
- rt_range = self.consensus_df['rt'].max() - self.consensus_df['rt'].min()
895
+ rt_range = study.consensus_df['rt'].max() - study.consensus_df['rt'].min()
800
896
  if rt_range <= 0:
801
897
  return float('inf')
802
898
 
803
- feature_density = len(self.consensus_df) / rt_range
899
+ feature_density = len(study.consensus_df) / rt_range
804
900
 
805
901
  # Metric 2: Average RT spread relative to tolerance
806
- rt_spreads = (self.consensus_df['rt_max'] - self.consensus_df['rt_min'])
902
+ rt_spreads = (study.consensus_df['rt_max'] - study.consensus_df['rt_min'])
807
903
  avg_rt_spread_ratio = rt_spreads.mean() / rt_tol if rt_tol > 0 else float('inf')
808
904
 
809
905
  # Metric 3: Proportion of features with low sample counts (indicates fragmentation)
810
- low_sample_features = len(self.consensus_df.filter(pl.col('number_samples') <= 5))
811
- low_sample_ratio = low_sample_features / len(self.consensus_df)
906
+ low_sample_features = len(study.consensus_df.filter(pl.col('number_samples') <= 5))
907
+ low_sample_ratio = low_sample_features / len(study.consensus_df)
812
908
 
813
909
  # Metric 4: Number of features with excessive RT spread
814
910
  excessive_spread_features = len(rt_spreads.filter(rt_spreads > rt_tol * 2))
815
- excessive_spread_ratio = excessive_spread_features / len(self.consensus_df)
911
+ excessive_spread_ratio = excessive_spread_features / len(study.consensus_df)
816
912
 
817
913
  # Combined score (weighted combination)
818
914
  oversegmentation_score = (
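The weighted combination itself is cut off at the end of this hunk; the sketch below only shows the shape such a score could take — the weights are placeholders, not the package's actual coefficients:

```python
def combined_score(feature_density, avg_rt_spread_ratio,
                   low_sample_ratio, excessive_spread_ratio,
                   weights=(0.25, 0.25, 0.25, 0.25)):
    # Placeholder weights: the real coefficients lie outside the visible hunk.
    w1, w2, w3, w4 = weights
    return (w1 * feature_density + w2 * avg_rt_spread_ratio
            + w3 * low_sample_ratio + w4 * excessive_spread_ratio)
```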
@@ -825,7 +921,7 @@ def _calculate_oversegmentation_score(self, rt_tol: float) -> float:
825
921
  return oversegmentation_score
826
922
 
827
923
 
828
- def _apply_kd_strict_postprocessing(self, consensus_map: oms.ConsensusMap, params: merge_defaults) -> oms.ConsensusMap:
924
+ def _apply_kd_strict_postprocessing(study, consensus_map: oms.ConsensusMap, params: merge_defaults) -> oms.ConsensusMap:
829
925
  """
830
926
  Apply post-processing quality control to KD consensus map.
831
927
 
@@ -837,20 +933,20 @@ def _apply_kd_strict_postprocessing(self, consensus_map: oms.ConsensusMap, param
837
933
  Processed consensus map with reduced oversegmentation
838
934
  """
839
935
  if consensus_map.size() == 0:
840
- self.logger.warning("Empty consensus map provided to post-processing")
936
+ study.logger.warning("Empty consensus map provided to post-processing")
841
937
  return consensus_map
842
938
 
843
- self.logger.debug(f"Post-processing {consensus_map.size()} initial consensus features")
939
+ study.logger.debug(f"Post-processing {consensus_map.size()} initial consensus features")
844
940
 
845
941
  # Step 1: Extract initial consensus features
846
942
  original_min_samples = params.min_samples
847
943
  params.min_samples = 1 # Extract all features initially
848
944
 
849
- self._extract_consensus_features(consensus_map, params.min_samples)
850
- initial_feature_count = len(self.consensus_df)
945
+ _extract_consensus_features(study, consensus_map, params.min_samples)
946
+ initial_feature_count = len(study.consensus_df)
851
947
 
852
948
  if initial_feature_count == 0:
853
- self.logger.warning("No consensus features extracted for post-processing")
949
+ study.logger.warning("No consensus features extracted for post-processing")
854
950
  params.min_samples = original_min_samples
855
951
  return consensus_map
856
952
 
@@ -858,67 +954,67 @@ def _apply_kd_strict_postprocessing(self, consensus_map: oms.ConsensusMap, param
858
954
  secondary_merge_rt_tol = getattr(params, 'secondary_merge_rt_tol', 0.5)
859
955
  secondary_merge_mz_tol = getattr(params, 'secondary_merge_mz_tol', 0.005)
860
956
 
861
- self.logger.debug(f"Secondary clustering with RT≤{secondary_merge_rt_tol}s, m/z≤{secondary_merge_mz_tol}")
862
- merged_features = _perform_secondary_clustering(self, secondary_merge_rt_tol, secondary_merge_mz_tol)
957
+ study.logger.debug(f"Secondary clustering with RT≤{secondary_merge_rt_tol}s, m/z≤{secondary_merge_mz_tol}")
958
+ merged_features = _perform_secondary_clustering(study, secondary_merge_rt_tol, secondary_merge_mz_tol)
863
959
 
864
960
  # Step 3: Sample overlap validation
865
961
  min_sample_overlap = getattr(params, 'min_sample_overlap', 0.8)
866
962
  if min_sample_overlap > 0:
867
- self.logger.debug(f"Sample overlap validation (threshold: {min_sample_overlap})")
868
- merged_features = _validate_sample_overlap(self, merged_features, min_sample_overlap)
963
+ study.logger.debug(f"Sample overlap validation (threshold: {min_sample_overlap})")
964
+ merged_features = _validate_sample_overlap(study, merged_features, min_sample_overlap)
869
965
 
870
966
  # Step 4: RT spread quality filtering
871
967
  if params.rt_tol is not None:
872
968
  max_rt_spread = getattr(params, 'max_rt_spread', params.rt_tol * 2)
873
969
  if max_rt_spread is not None:
874
- self.logger.debug(f"RT spread filtering (max: {max_rt_spread:.1f}s)")
875
- merged_features = _filter_rt_spread(self, merged_features, max_rt_spread)
970
+ study.logger.debug(f"RT spread filtering (max: {max_rt_spread:.1f}s)")
971
+ merged_features = _filter_rt_spread(study, merged_features, max_rt_spread)
876
972
  else:
877
- self.logger.debug("Skipping RT spread filtering - max_rt_spread is None")
973
+ study.logger.debug("Skipping RT spread filtering - max_rt_spread is None")
878
974
  else:
879
- self.logger.debug("Skipping RT spread filtering - rt_tol is None")
975
+ study.logger.debug("Skipping RT spread filtering - rt_tol is None")
880
976
 
881
977
  # Step 5: Chromatographic coherence filtering (optional)
882
978
  min_coherence = getattr(params, 'min_coherence', 0.0)
883
979
  if min_coherence > 0:
884
- self.logger.debug(f"Chromatographic coherence filtering (min: {min_coherence})")
885
- merged_features = _filter_coherence(self, merged_features, min_coherence)
980
+ study.logger.debug(f"Chromatographic coherence filtering (min: {min_coherence})")
981
+ merged_features = _filter_coherence(study, merged_features, min_coherence)
886
982
 
887
983
  # Step 6: Rebuild consensus_df with filtered features and preserve mapping
888
- original_mapping_df = self.consensus_mapping_df.clone() # Save original mapping
889
- self.consensus_df = pl.DataFrame(merged_features, strict=False)
984
+ original_mapping_df = study.consensus_mapping_df.clone() # Save original mapping
985
+ study.consensus_df = pl.DataFrame(merged_features, strict=False)
890
986
 
891
987
  # Step 7: Apply original min_samples filter
892
988
  params.min_samples = original_min_samples
893
989
  if params.min_samples > 1:
894
- l1 = len(self.consensus_df)
895
- self.consensus_df = self.consensus_df.filter(
990
+ l1 = len(study.consensus_df)
991
+ study.consensus_df = study.consensus_df.filter(
896
992
  pl.col("number_samples") >= params.min_samples
897
993
  )
898
- filtered_count = l1 - len(self.consensus_df)
994
+ filtered_count = l1 - len(study.consensus_df)
899
995
  if filtered_count > 0:
900
- self.logger.debug(f"Filtered {filtered_count} features below min_samples threshold ({params.min_samples})")
996
+ study.logger.debug(f"Filtered {filtered_count} features below min_samples threshold ({params.min_samples})")
901
997
 
902
998
  # Step 8: Update consensus_mapping_df to match final consensus_df
903
- if len(self.consensus_df) > 0 and len(original_mapping_df) > 0:
904
- valid_consensus_ids = set(self.consensus_df['consensus_uid'].to_list())
905
- self.consensus_mapping_df = original_mapping_df.filter(
999
+ if len(study.consensus_df) > 0 and len(original_mapping_df) > 0:
1000
+ valid_consensus_ids = set(study.consensus_df['consensus_uid'].to_list())
1001
+ study.consensus_mapping_df = original_mapping_df.filter(
906
1002
  pl.col('consensus_uid').is_in(list(valid_consensus_ids))
907
1003
  )
908
1004
  else:
909
- self.consensus_mapping_df = pl.DataFrame()
1005
+ study.consensus_mapping_df = pl.DataFrame()
910
1006
 
911
- final_feature_count = len(self.consensus_df)
1007
+ final_feature_count = len(study.consensus_df)
912
1008
  reduction_pct = ((initial_feature_count - final_feature_count) / initial_feature_count * 100) if initial_feature_count > 0 else 0
913
1009
 
914
- self.logger.info(f"Consensus cleanup complete: {initial_feature_count} → {final_feature_count} features ({reduction_pct:.1f}% reduction)")
1010
+ study.logger.info(f"Consensus cleanup complete: {initial_feature_count} → {final_feature_count} features ({reduction_pct:.1f}% reduction)")
915
1011
 
916
1012
  # Create a new consensus map for compatibility (the processed data is in consensus_df)
917
1013
  processed_consensus_map = oms.ConsensusMap()
918
1014
  return processed_consensus_map
919
1015
 
920
1016
 
921
- def _perform_secondary_clustering(self, rt_tol: float, mz_tol: float) -> list:
1017
+ def _perform_secondary_clustering(study, rt_tol: float, mz_tol: float) -> list:
922
1018
  """
923
1019
  Perform secondary clustering to merge very close features.
924
1020
 
@@ -929,34 +1025,34 @@ def _perform_secondary_clustering(self, rt_tol: float, mz_tol: float) -> list:
929
1025
  Returns:
930
1026
  List of merged consensus feature dictionaries
931
1027
  """
932
- if len(self.consensus_df) == 0:
1028
+ if len(study.consensus_df) == 0:
933
1029
  return []
934
1030
 
935
1031
  # Convert consensus_df to list of dictionaries for clustering
936
1032
  consensus_features = []
937
- for i, row in enumerate(self.consensus_df.iter_rows(named=True)):
1033
+ for i, row in enumerate(study.consensus_df.iter_rows(named=True)):
938
1034
  consensus_features.append(dict(row))
939
1035
 
940
1036
  # Use Union-Find for efficient clustering
941
1037
  class UnionFind:
942
- def __init__(self, n):
943
- self.parent = list(range(n))
944
- self.rank = [0] * n
1038
+ def __init__(study, n):
1039
+ study.parent = list(range(n))
1040
+ study.rank = [0] * n
945
1041
 
946
- def find(self, x):
947
- if self.parent[x] != x:
948
- self.parent[x] = self.find(self.parent[x])
949
- return self.parent[x]
1042
+ def find(study, x):
1043
+ if study.parent[x] != x:
1044
+ study.parent[x] = study.find(study.parent[x])
1045
+ return study.parent[x]
950
1046
 
951
- def union(self, x, y):
952
- px, py = self.find(x), self.find(y)
1047
+ def union(study, x, y):
1048
+ px, py = study.find(x), study.find(y)
953
1049
  if px == py:
954
1050
  return
955
- if self.rank[px] < self.rank[py]:
1051
+ if study.rank[px] < study.rank[py]:
956
1052
  px, py = py, px
957
- self.parent[py] = px
958
- if self.rank[px] == self.rank[py]:
959
- self.rank[px] += 1
1053
+ study.parent[py] = px
1054
+ if study.rank[px] == study.rank[py]:
1055
+ study.rank[px] += 1
960
1056
 
961
1057
  n_features = len(consensus_features)
962
1058
  uf = UnionFind(n_features)
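Union-Find (disjoint sets) is what keeps this secondary clustering cheap: every pair of features within both the RT and m/z tolerances is unioned, and each resulting root becomes one merged cluster. A self-contained toy illustration of that idea (not masster code; points and tolerances are made up):

```python
def cluster(points, rt_tol=0.5, mz_tol=0.005):
    # points: list of (rt, mz) tuples; returns the cluster root per point.
    parent = list(range(len(points)))

    def find(x):
        while parent[x] != x:
            parent[x] = parent[parent[x]]  # path compression
            x = parent[x]
        return x

    def union(a, b):
        ra, rb = find(a), find(b)
        if ra != rb:
            parent[rb] = ra

    for i in range(len(points)):
        for j in range(i + 1, len(points)):
            if (abs(points[i][0] - points[j][0]) <= rt_tol
                    and abs(points[i][1] - points[j][1]) <= mz_tol):
                union(i, j)
    return [find(i) for i in range(len(points))]

print(cluster([(10.0, 200.100), (10.2, 200.102), (35.0, 200.100)]))
# [0, 0, 2] -- the first two points merge, the third stays separate
```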
@@ -992,7 +1088,7 @@ def _perform_secondary_clustering(self, rt_tol: float, mz_tol: float) -> list:
992
1088
  merged_feature = _merge_feature_group(group)
993
1089
  merged_features.append(merged_feature)
994
1090
 
995
- self.logger.debug(f"Secondary clustering: {n_features} → {len(merged_features)} features ({n_features - len(merged_features)} merged)")
1091
+ study.logger.debug(f"Secondary clustering: {n_features} → {len(merged_features)} features ({n_features - len(merged_features)} merged)")
996
1092
  return merged_features
997
1093
 
998
1094
 
@@ -1066,7 +1162,7 @@ def _merge_feature_group(feature_group: list) -> dict:
1066
1162
  return merged
1067
1163
 
1068
1164
 
1069
- def _validate_sample_overlap(self, features: list, min_overlap: float) -> list:
1165
+ def _validate_sample_overlap(study, features: list, min_overlap: float) -> list:
1070
1166
  """
1071
1167
  Validate that merged features have sufficient sample overlap.
1072
1168
 
@@ -1097,7 +1193,7 @@ def _validate_sample_overlap(self, features: list, min_overlap: float) -> list:
1097
1193
  return validated_features
1098
1194
 
1099
1195
 
1100
- def _filter_rt_spread(self, features: list, max_rt_spread: float) -> list:
1196
+ def _filter_rt_spread(study, features: list, max_rt_spread: float) -> list:
1101
1197
  """
1102
1198
  Filter out features with excessive RT spread.
1103
1199
 
@@ -1122,12 +1218,12 @@ def _filter_rt_spread(self, features: list, max_rt_spread: float) -> list:
1122
1218
  filtered_count += 1
1123
1219
 
1124
1220
  if filtered_count > 0:
1125
- self.logger.debug(f"Filtered {filtered_count} features with excessive RT spread (>{max_rt_spread:.1f}s)")
1221
+ study.logger.debug(f"Filtered {filtered_count} features with excessive RT spread (>{max_rt_spread:.1f}s)")
1126
1222
 
1127
1223
  return filtered_features
1128
1224
 
1129
1225
 
1130
- def _filter_coherence(self, features: list, min_coherence: float) -> list:
1226
+ def _filter_coherence(study, features: list, min_coherence: float) -> list:
1131
1227
  """
1132
1228
  Filter out features with low chromatographic coherence.
1133
1229
 
@@ -1150,23 +1246,23 @@ def _filter_coherence(self, features: list, min_coherence: float) -> list:
1150
1246
  filtered_count += 1
1151
1247
 
1152
1248
  if filtered_count > 0:
1153
- self.logger.debug(f"Filtered {filtered_count} features with low coherence (<{min_coherence})")
1249
+ study.logger.debug(f"Filtered {filtered_count} features with low coherence (<{min_coherence})")
1154
1250
 
1155
1251
  return filtered_features
1156
1252
 
1157
1253
 
1158
- def _merge_kd_nowarp(self, params: merge_defaults) -> oms.ConsensusMap:
1254
+ def _merge_kd_nowarp(study, params: merge_defaults) -> oms.ConsensusMap:
1159
1255
  """KD-tree based merge without RT warping"""
1160
1256
 
1161
1257
  # Generate temporary feature maps on-demand from features_df
1162
- temp_feature_maps = _generate_feature_maps_on_demand(self)
1258
+ temp_feature_maps = _generate_feature_maps_on_demand(study)
1163
1259
 
1164
1260
  consensus_map = oms.ConsensusMap()
1165
1261
  file_descriptions = consensus_map.getColumnHeaders()
1166
1262
 
1167
1263
  for i, feature_map in enumerate(temp_feature_maps):
1168
1264
  file_description = file_descriptions.get(i, oms.ColumnHeader())
1169
- file_description.filename = self.samples_df.row(i, named=True)["sample_name"]
1265
+ file_description.filename = study.samples_df.row(i, named=True)["sample_name"]
1170
1266
  file_description.size = feature_map.size()
1171
1267
  file_description.unique_id = feature_map.getUniqueId()
1172
1268
  file_descriptions[i] = file_description
@@ -1193,18 +1289,18 @@ def _merge_kd_nowarp(self, params: merge_defaults) -> oms.ConsensusMap:
1193
1289
  return consensus_map
1194
1290
 
1195
1291
 
1196
- def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
1292
+ def _merge_kd_chunked(study, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
1197
1293
  """KD-based chunked merge with proper cross-chunk consensus building and optional parallel processing"""
1198
1294
 
1199
1295
  # Generate temporary feature maps on-demand from features_df
1200
- temp_feature_maps = _generate_feature_maps_on_demand(self)
1296
+ temp_feature_maps = _generate_feature_maps_on_demand(study)
1201
1297
 
1202
1298
  n_samples = len(temp_feature_maps)
1203
1299
  if n_samples <= params.chunk_size:
1204
- self.logger.info(f"Dataset size ({n_samples}) ≤ chunk_size, using KD merge")
1205
- consensus_map = _merge_kd(self, params)
1300
+ study.logger.info(f"Dataset size ({n_samples}) ≤ chunk_size, using KD merge")
1301
+ consensus_map = _merge_kd(study, params)
1206
1302
  # Extract consensus features to populate consensus_df for chunked method consistency
1207
- self._extract_consensus_features(consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
1303
+ _extract_consensus_features(study, consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
1208
1304
  return consensus_map
1209
1305
 
1210
1306
  # Process in chunks
@@ -1213,21 +1309,21 @@ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
1213
1309
  chunk_end = min(i + params.chunk_size, n_samples)
1214
1310
  chunks.append((i, temp_feature_maps[i:chunk_end]))
1215
1311
 
1216
- self.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples using {params.threads or 'sequential'} thread(s)")
1312
+ study.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples using {params.threads or 'sequential'} thread(s)")
1217
1313
 
1218
1314
  # Process each chunk to create chunk consensus maps
1219
1315
  chunk_consensus_maps = []
1220
1316
 
1221
1317
  if params.threads is None:
1222
1318
  # Sequential processing (original behavior)
1223
- for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}KD Chunk", disable=self.log_level not in ["TRACE", "DEBUG", "INFO"])):
1319
+ for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {study.log_label}KD Chunk", disable=study.log_level not in ["TRACE", "DEBUG", "INFO"])):
1224
1320
  chunk_consensus_map = oms.ConsensusMap()
1225
1321
 
1226
1322
  # Set up file descriptions for chunk
1227
1323
  file_descriptions = chunk_consensus_map.getColumnHeaders()
1228
1324
  for j, feature_map in enumerate(chunk_maps):
1229
1325
  file_description = file_descriptions.get(j, oms.ColumnHeader())
1230
- file_description.filename = self.samples_df.row(chunk_start_idx + j, named=True)["sample_name"]
1326
+ file_description.filename = study.samples_df.row(chunk_start_idx + j, named=True)["sample_name"]
1231
1327
  file_description.size = feature_map.size()
1232
1328
  file_description.unique_id = feature_map.getUniqueId()
1233
1329
  file_descriptions[j] = file_description
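The chunking loop near the top of this hunk advances the start index by `chunk_size` and truncates the last chunk with `min(...)`; a toy run of that arithmetic (loop bounds inferred from the visible lines, sizes illustrative):

```python
n_samples, chunk_size = 7, 3
chunks = []
for i in range(0, n_samples, chunk_size):
    chunk_end = min(i + chunk_size, n_samples)
    chunks.append((i, list(range(i, chunk_end))))
print(chunks)  # [(0, [0, 1, 2]), (3, [3, 4, 5]), (6, [6])]
```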
@@ -1255,7 +1351,7 @@ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
1255
1351
 
1256
1352
  else:
1257
1353
  # Parallel processing
1258
- self.logger.info(f"Processing chunks in parallel using {params.threads} processes")
1354
+ study.logger.info(f"Processing chunks in parallel using {params.threads} processes")
1259
1355
 
1260
1356
  # Prepare chunk data for parallel processing using features_df slices
1261
1357
  chunk_data_list = []
@@ -1264,7 +1360,7 @@ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
1264
1360
  chunk_sample_uids = []
1265
1361
  chunk_samples_df_rows = []
1266
1362
  for j in range(len(chunk_maps)):
1267
- sample_row = self.samples_df.row(chunk_start_idx + j, named=True)
1363
+ sample_row = study.samples_df.row(chunk_start_idx + j, named=True)
1268
1364
  chunk_sample_uids.append(sample_row['sample_uid'])
1269
1365
  chunk_samples_df_rows.append(sample_row)
1270
1366
 
@@ -1272,7 +1368,7 @@ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
1272
1368
  chunk_samples_df = pl.DataFrame(chunk_samples_df_rows)
1273
1369
 
1274
1370
  # Filter features_df for this chunk's samples and select only necessary columns
1275
- chunk_features_df = self.features_df.filter(
1371
+ chunk_features_df = study.features_df.filter(
1276
1372
  pl.col('sample_uid').is_in(chunk_sample_uids)
1277
1373
  ).select([
1278
1374
  'sample_uid', 'rt', 'mz', 'inty', 'charge', 'feature_id'
@@ -1316,22 +1412,22 @@ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
1316
1412
  serialized_chunk_results.append((chunk_start_idx, consensus_features))
1317
1413
  completed_chunks += 1
1318
1414
  n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
1319
- self.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
1415
+ study.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
1320
1416
  except Exception as exc:
1321
1417
  # Check if this is a BrokenProcessPool exception from Windows multiprocessing issues
1322
1418
  if isinstance(exc, BrokenProcessPool) or "process pool" in str(exc).lower():
1323
1419
  # Convert to RuntimeError so outer except block can catch it for fallback
1324
1420
  raise RuntimeError(f"Windows multiprocessing failure: {exc}")
1325
1421
  else:
1326
- self.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
1422
+ study.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
1327
1423
  raise exc
1328
1424
 
1329
1425
  except (RuntimeError, OSError, BrokenProcessPool) as e:
1330
1426
  # Handle Windows multiprocessing issues - fallback to ThreadPoolExecutor
1331
1427
  if ("freeze_support" in str(e) or "spawn" in str(e) or "bootstrapping" in str(e) or
1332
1428
  "process pool" in str(e).lower() or "Windows multiprocessing failure" in str(e)):
1333
- self.logger.warning(f"ProcessPoolExecutor failed (likely Windows multiprocessing issue): {e}")
1334
- self.logger.info(f"Falling back to ThreadPoolExecutor with {params.threads} threads")
1429
+ study.logger.warning(f"ProcessPoolExecutor failed (likely Windows multiprocessing issue): {e}")
1430
+ study.logger.info(f"Falling back to ThreadPoolExecutor with {params.threads} threads")
1335
1431
 
1336
1432
  with ThreadPoolExecutor(max_workers=params.threads) as executor:
1337
1433
  # Submit all chunk processing tasks
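Both chunked methods fall back from process-based to thread-based execution when the process pool cannot be bootstrapped (the Windows spawn/freeze_support failures caught above). A condensed sketch of that fallback pattern, independent of masster's chunk worker (the worker must be picklable for the process path):

```python
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from concurrent.futures.process import BrokenProcessPool

def run_in_parallel(worker, inputs, threads):
    # Prefer processes; degrade to threads if the pool breaks, which is the
    # typical failure mode on Windows when spawn/freeze_support is mishandled.
    try:
        with ProcessPoolExecutor(max_workers=threads) as pool:
            return list(pool.map(worker, inputs))
    except (BrokenProcessPool, RuntimeError, OSError):
        with ThreadPoolExecutor(max_workers=threads) as pool:
            return list(pool.map(worker, inputs))
```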
@@ -1350,9 +1446,9 @@ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
1350
1446
  serialized_chunk_results.append((chunk_start_idx, consensus_features))
1351
1447
  completed_chunks += 1
1352
1448
  n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
1353
- self.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
1449
+ study.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
1354
1450
  except Exception as exc:
1355
- self.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
1451
+ study.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
1356
1452
  raise exc
1357
1453
  else:
1358
1454
  # Re-raise other exceptions
@@ -1366,25 +1462,25 @@ def _merge_kd_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
1366
1462
 
1367
1463
  # Merge chunk results with proper cross-chunk consensus building
1368
1464
  # _merge_chunk_results now handles both ConsensusMap objects (sequential) and serialized data (parallel)
1369
- _merge_chunk_results(self, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)
1465
+ _merge_chunk_results(study, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)
1370
1466
 
1371
- # Return a dummy consensus map for compatibility (consensus features are stored in self.consensus_df)
1467
+ # Return a dummy consensus map for compatibility (consensus features are stored in study.consensus_df)
1372
1468
  consensus_map = oms.ConsensusMap()
1373
1469
  return consensus_map
1374
1470
 
1375
1471
 
1376
- def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
1472
+ def _merge_qt_chunked(study, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> oms.ConsensusMap:
1377
1473
  """QT-based chunked merge with proper cross-chunk consensus building and optional parallel processing"""
1378
1474
 
1379
1475
  # Generate temporary feature maps on-demand from features_df
1380
- temp_feature_maps = _generate_feature_maps_on_demand(self)
1476
+ temp_feature_maps = _generate_feature_maps_on_demand(study)
1381
1477
 
1382
1478
  n_samples = len(temp_feature_maps)
1383
1479
  if n_samples <= params.chunk_size:
1384
- self.logger.info(f"Dataset size ({n_samples}) ≤ chunk_size, using QT merge")
1385
- consensus_map = _merge_qt(self, params)
1480
+ study.logger.info(f"Dataset size ({n_samples}) ≤ chunk_size, using QT merge")
1481
+ consensus_map = _merge_qt(study, params)
1386
1482
  # Extract consensus features to populate consensus_df for chunked method consistency
1387
- self._extract_consensus_features(consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
1483
+ _extract_consensus_features(study, consensus_map, params.min_samples, cached_adducts_df, cached_valid_adducts)
1388
1484
  return consensus_map
1389
1485
 
1390
1486
  # Process in chunks
@@ -1393,21 +1489,21 @@ def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
1393
1489
  chunk_end = min(i + params.chunk_size, n_samples)
1394
1490
  chunks.append((i, temp_feature_maps[i:chunk_end]))
1395
1491
 
1396
- self.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples using {params.threads or 'sequential'} thread(s)")
1492
+ study.logger.debug(f"Processing {len(chunks)} chunks of max {params.chunk_size} samples using {params.threads or 'sequential'} thread(s)")
1397
1493
 
1398
1494
  # Process each chunk to create chunk consensus maps
1399
1495
  chunk_consensus_maps = []
1400
1496
 
1401
1497
  if params.threads is None:
1402
1498
  # Sequential processing (original behavior)
1403
- for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}QT Chunk", disable=self.log_level not in ["TRACE", "DEBUG", "INFO"])):
1499
+ for chunk_idx, (chunk_start_idx, chunk_maps) in enumerate(tqdm(chunks, desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {study.log_label}QT Chunk", disable=study.log_level not in ["TRACE", "DEBUG", "INFO"])):
1404
1500
  chunk_consensus_map = oms.ConsensusMap()
1405
1501
 
1406
1502
  # Set up file descriptions for chunk
1407
1503
  file_descriptions = chunk_consensus_map.getColumnHeaders()
1408
1504
  for j, feature_map in enumerate(chunk_maps):
1409
1505
  file_description = file_descriptions.get(j, oms.ColumnHeader())
1410
- file_description.filename = self.samples_df.row(chunk_start_idx + j, named=True)["sample_name"]
1506
+ file_description.filename = study.samples_df.row(chunk_start_idx + j, named=True)["sample_name"]
1411
1507
  file_description.size = feature_map.size()
1412
1508
  file_description.unique_id = feature_map.getUniqueId()
1413
1509
  file_descriptions[j] = file_description
@@ -1430,7 +1526,7 @@ def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
1430
1526
 
1431
1527
  else:
1432
1528
  # Parallel processing
1433
- self.logger.info(f"Processing chunks in parallel using {params.threads} processes")
1529
+ study.logger.info(f"Processing chunks in parallel using {params.threads} processes")
1434
1530
 
1435
1531
  # Prepare chunk data for parallel processing using features_df slices
1436
1532
  chunk_data_list = []
@@ -1439,7 +1535,7 @@ def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
1439
1535
  chunk_sample_uids = []
1440
1536
  chunk_samples_df_rows = []
1441
1537
  for j in range(len(chunk_maps)):
1442
- sample_row = self.samples_df.row(chunk_start_idx + j, named=True)
1538
+ sample_row = study.samples_df.row(chunk_start_idx + j, named=True)
1443
1539
  chunk_sample_uids.append(sample_row['sample_uid'])
1444
1540
  chunk_samples_df_rows.append(sample_row)
1445
1541
 
@@ -1447,7 +1543,7 @@ def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
1447
1543
  chunk_samples_df = pl.DataFrame(chunk_samples_df_rows)
1448
1544
 
1449
1545
  # Filter features_df for this chunk's samples and select only necessary columns
1450
- chunk_features_df = self.features_df.filter(
1546
+ chunk_features_df = study.features_df.filter(
1451
1547
  pl.col('sample_uid').is_in(chunk_sample_uids)
1452
1548
  ).select([
1453
1549
  'sample_uid', 'rt', 'mz', 'inty', 'charge', 'feature_id'
@@ -1491,22 +1587,22 @@ def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
1491
1587
  serialized_chunk_results.append((chunk_start_idx, consensus_features))
1492
1588
  completed_chunks += 1
1493
1589
  n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
1494
- self.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
1590
+ study.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
1495
1591
  except Exception as exc:
1496
1592
  # Check if this is a BrokenProcessPool exception from Windows multiprocessing issues
1497
1593
  if isinstance(exc, BrokenProcessPool) or "process pool" in str(exc).lower():
1498
1594
  # Convert to RuntimeError so outer except block can catch it for fallback
1499
1595
  raise RuntimeError(f"Windows multiprocessing failure: {exc}")
1500
1596
  else:
1501
- self.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
1597
+ study.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
1502
1598
  raise exc
1503
1599
 
1504
1600
  except (RuntimeError, OSError, BrokenProcessPool) as e:
1505
1601
  # Handle Windows multiprocessing issues - fallback to ThreadPoolExecutor
1506
1602
  if ("freeze_support" in str(e) or "spawn" in str(e) or "bootstrapping" in str(e) or
1507
1603
  "process pool" in str(e).lower() or "Windows multiprocessing failure" in str(e)):
1508
- self.logger.warning(f"ProcessPoolExecutor failed (likely Windows multiprocessing issue): {e}")
1509
- self.logger.info(f"Falling back to ThreadPoolExecutor with {params.threads} threads")
1604
+ study.logger.warning(f"ProcessPoolExecutor failed (likely Windows multiprocessing issue): {e}")
1605
+ study.logger.info(f"Falling back to ThreadPoolExecutor with {params.threads} threads")
1510
1606
 
1511
1607
  with ThreadPoolExecutor(max_workers=params.threads) as executor:
1512
1608
  # Submit all chunk processing tasks
@@ -1525,9 +1621,9 @@ def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
1525
1621
  serialized_chunk_results.append((chunk_start_idx, consensus_features))
1526
1622
  completed_chunks += 1
1527
1623
  n_samples_in_chunk = len(chunk_data_list[chunk_idx]['chunk_samples_data'])
1528
- self.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
1624
+ study.logger.info(f"Completed chunk {completed_chunks}/{total_chunks} (samples {chunk_start_idx + 1}-{chunk_start_idx + n_samples_in_chunk})")
1529
1625
  except Exception as exc:
1530
- self.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
1626
+ study.logger.error(f"Chunk {chunk_idx} generated an exception: {exc}")
1531
1627
  raise exc
1532
1628
  else:
1533
1629
  # Re-raise other exceptions
@@ -1541,14 +1637,14 @@ def _merge_qt_chunked(self, params: merge_defaults, cached_adducts_df=None, cach
1541
1637
 
1542
1638
  # Merge chunk results with proper cross-chunk consensus building
1543
1639
  # _merge_chunk_results now handles both ConsensusMap objects (sequential) and serialized data (parallel)
1544
- _merge_chunk_results(self, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)
1640
+ _merge_chunk_results(study, chunk_consensus_maps, params, cached_adducts_df, cached_valid_adducts)
1545
1641
 
1546
- # Return a dummy consensus map for compatibility (consensus features are stored in self.consensus_df)
1642
+ # Return a dummy consensus map for compatibility (consensus features are stored in study.consensus_df)
1547
1643
  consensus_map = oms.ConsensusMap()
1548
1644
  return consensus_map
1549
1645
 
1550
1646
 
1551
- def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> None:
1647
+ def _merge_chunk_results(study, chunk_consensus_maps: list, params: merge_defaults, cached_adducts_df=None, cached_valid_adducts=None) -> None:
1552
1648
  """
1553
1649
  Scalable aggregation of chunk consensus maps into final consensus_df.
1554
1650
 
@@ -1561,7 +1657,8 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
1561
1657
  if len(chunk_consensus_maps) == 1:
1562
1658
  # Single chunk case - just extract using the true global min_samples.
1563
1659
  # No need for permissive threshold because we are not discarding singletons pre-aggregation.
1564
- self._extract_consensus_features(
1660
+ _extract_consensus_features(
1661
+ study,
1565
1662
  chunk_consensus_maps[0][1],
1566
1663
  params.min_samples,
1567
1664
  cached_adducts_df,
@@ -1572,10 +1669,10 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
1572
1669
  # Build feature_uid to feature_data lookup for fast access
1573
1670
  feature_uid_map = {
1574
1671
  row["feature_id"]: row["feature_uid"]
1575
- for row in self.features_df.iter_rows(named=True)
1672
+ for row in study.features_df.iter_rows(named=True)
1576
1673
  }
1577
1674
 
1578
- features_lookup = _optimized_feature_lookup(self, self.features_df)
1675
+ features_lookup = _optimized_feature_lookup(study, study.features_df)
1579
1676
 
1580
1677
  # Extract all consensus features from chunks with their feature_uids
1581
1678
  all_chunk_consensus = []
@@ -1717,8 +1814,8 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
1717
1814
 
1718
1815
  if not all_chunk_consensus:
1719
1816
  # No valid consensus features found
1720
- self.consensus_df = pl.DataFrame()
1721
- self.consensus_mapping_df = pl.DataFrame()
1817
+ study.consensus_df = pl.DataFrame()
1818
+ study.consensus_mapping_df = pl.DataFrame()
1722
1819
  return
1723
1820
 
1724
1821
  # Perform cross-chunk clustering using optimized spatial indexing
@@ -1744,22 +1841,22 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
1744
1841
  features_by_bin[(rt_bin, mz_bin)].append(i)
1745
1842
 
1746
1843
  class UF:
1747
- def __init__(self, n):
1748
- self.p = list(range(n))
1749
- self.r = [0]*n
1750
- def find(self, x):
1751
- if self.p[x] != x:
1752
- self.p[x] = self.find(self.p[x])
1753
- return self.p[x]
1754
- def union(self, a,b):
1755
- pa, pb = self.find(a), self.find(b)
1844
+ def __init__(study, n):
1845
+ study.p = list(range(n))
1846
+ study.r = [0]*n
1847
+ def find(study, x):
1848
+ if study.p[x] != x:
1849
+ study.p[x] = study.find(study.p[x])
1850
+ return study.p[x]
1851
+ def union(study, a,b):
1852
+ pa, pb = study.find(a), study.find(b)
1756
1853
  if pa == pb:
1757
1854
  return
1758
- if self.r[pa] < self.r[pb]:
1855
+ if study.r[pa] < study.r[pb]:
1759
1856
  pa, pb = pb, pa
1760
- self.p[pb] = pa
1761
- if self.r[pa] == self.r[pb]:
1762
- self.r[pa] += 1
1857
+ study.p[pb] = pa
1858
+ if study.r[pa] == study.r[pb]:
1859
+ study.r[pa] += 1
1763
1860
 
1764
1861
  uf = UF(n_features)
1765
1862
  checked = set()
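Two details in the hunk above are easy to misread. First, inside the nested UF helper the first parameter was renamed from self to study along with the rest of the module; Python binds the instance to the first positional parameter regardless of its name, so the union-find still works, it is just unconventional naming. Second, features_by_bin is a coarse RT/m/z grid, so only features in the same or neighbouring bins need a pairwise tolerance check. A self-contained sketch of that binning-plus-union-find clustering with conventional naming (tolerances and data are illustrative):

    from collections import defaultdict

    class UnionFind:
        def __init__(self, n):
            self.parent = list(range(n))
            self.rank = [0] * n

        def find(self, x):
            if self.parent[x] != x:
                self.parent[x] = self.find(self.parent[x])  # path compression
            return self.parent[x]

        def union(self, a, b):
            pa, pb = self.find(a), self.find(b)
            if pa == pb:
                return
            if self.rank[pa] < self.rank[pb]:
                pa, pb = pb, pa
            self.parent[pb] = pa
            if self.rank[pa] == self.rank[pb]:
                self.rank[pa] += 1

    def cluster(features, rt_tol, mz_tol):
        # features: list of (rt, mz) pairs; group those within both tolerances.
        bins = defaultdict(list)
        for i, (rt, mz) in enumerate(features):
            bins[(int(rt // rt_tol), int(mz // mz_tol))].append(i)

        uf = UnionFind(len(features))
        for (rb, mb), idxs in bins.items():
            # Only the same and neighbouring bins can contain partners.
            candidates = []
            for drb in (-1, 0, 1):
                for dmb in (-1, 0, 1):
                    candidates.extend(bins.get((rb + drb, mb + dmb), []))
            for i in idxs:
                for j in candidates:
                    if i < j:
                        rt_i, mz_i = features[i]
                        rt_j, mz_j = features[j]
                        if abs(rt_i - rt_j) <= rt_tol and abs(mz_i - mz_j) <= mz_tol:
                            uf.union(i, j)

        groups = defaultdict(list)
        for i in range(len(features)):
            groups[uf.find(i)].append(i)
        return list(groups.values())

    print(cluster([(100.0, 300.10), (100.2, 300.11), (250.0, 450.5)], rt_tol=0.5, mz_tol=0.02))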
@@ -1918,7 +2015,7 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
1918
2015
  # This allows proper cross-chunk consensus building before final filtering
1919
2016
 
1920
2017
  metadata = _calculate_consensus_statistics(
1921
- self,
2018
+ study,
1922
2019
  consensus_uid_counter,
1923
2020
  list(feature_data_acc.values()),
1924
2021
  rt_values_chunk,
@@ -1937,7 +2034,7 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
1937
2034
 
1938
2035
  if rt_spread > max_allowed_spread:
1939
2036
  # Skip consensus features with excessive RT spread
1940
- self.logger.debug(f"Skipping consensus feature {consensus_uid_counter} with RT spread {rt_spread:.3f}s > {max_allowed_spread:.3f}s")
2037
+ study.logger.debug(f"Skipping consensus feature {consensus_uid_counter} with RT spread {rt_spread:.3f}s > {max_allowed_spread:.3f}s")
1941
2038
  consensus_uid_counter += 1
1942
2039
  continue
1943
2040
 
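The guard above drops a candidate consensus feature whose member RTs are spread over too wide a window. A one-line worked example (the threshold value here is illustrative):

    rt_values = [120.4, 120.9, 124.2]        # member feature RTs in seconds
    max_allowed_spread = 3.0                 # illustrative limit
    rt_spread = max(rt_values) - min(rt_values)
    print(rt_spread, rt_spread <= max_allowed_spread)  # 3.8 s > 3.0 s -> dropped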
@@ -1969,27 +2066,27 @@ def _merge_chunk_results(self, chunk_consensus_maps: list, params: merge_default
1969
2066
  consensus_uid_counter += 1
1970
2067
 
1971
2068
  # Assign DataFrames
1972
- self.consensus_df = pl.DataFrame(consensus_metadata, strict=False)
1973
- self.consensus_mapping_df = pl.DataFrame(consensus_mapping_list, strict=False)
2069
+ study.consensus_df = pl.DataFrame(consensus_metadata, strict=False)
2070
+ study.consensus_mapping_df = pl.DataFrame(consensus_mapping_list, strict=False)
1974
2071
 
1975
2072
  # Ensure mapping only contains features from retained consensus_df
1976
- if len(self.consensus_df) > 0:
1977
- valid_consensus_ids = set(self.consensus_df['consensus_uid'].to_list())
1978
- self.consensus_mapping_df = self.consensus_mapping_df.filter(
2073
+ if len(study.consensus_df) > 0:
2074
+ valid_consensus_ids = set(study.consensus_df['consensus_uid'].to_list())
2075
+ study.consensus_mapping_df = study.consensus_mapping_df.filter(
1979
2076
  pl.col('consensus_uid').is_in(list(valid_consensus_ids))
1980
2077
  )
1981
2078
  else:
1982
- self.consensus_mapping_df = pl.DataFrame()
2079
+ study.consensus_mapping_df = pl.DataFrame()
1983
2080
 
1984
2081
  # Attach empty consensus_map placeholder for downstream compatibility
1985
- self.consensus_map = oms.ConsensusMap()
2082
+ study.consensus_map = oms.ConsensusMap()
1986
2083
  return
1987
2084
 
1988
2085
 
1989
2086
  def _calculate_consensus_statistics(study_obj, consensus_uid: int, feature_data_list: list,
1990
2087
  rt_values: list, mz_values: list,
1991
2088
  intensity_values: list, quality_values: list,
1992
- number_features: int = None, number_samples: int = None,
2089
+ number_features: int | None = None, number_samples: int | None = None,
1993
2090
  cached_adducts_df=None, cached_valid_adducts=None) -> dict:
1994
2091
  """
1995
2092
  Calculate comprehensive statistics for a consensus feature from aggregated feature data.
@@ -2158,24 +2255,24 @@ def _cluster_consensus_features(features: list, rt_tol: float, mz_tol: float) ->
2158
2255
 
2159
2256
  # Use Union-Find for efficient clustering
2160
2257
  class UnionFind:
2161
- def __init__(self, n):
2162
- self.parent = list(range(n))
2163
- self.rank = [0] * n
2258
+ def __init__(study, n):
2259
+ study.parent = list(range(n))
2260
+ study.rank = [0] * n
2164
2261
 
2165
- def find(self, x):
2166
- if self.parent[x] != x:
2167
- self.parent[x] = self.find(self.parent[x])
2168
- return self.parent[x]
2262
+ def find(study, x):
2263
+ if study.parent[x] != x:
2264
+ study.parent[x] = study.find(study.parent[x])
2265
+ return study.parent[x]
2169
2266
 
2170
- def union(self, x, y):
2171
- px, py = self.find(x), self.find(y)
2267
+ def union(study, x, y):
2268
+ px, py = study.find(x), study.find(y)
2172
2269
  if px == py:
2173
2270
  return
2174
- if self.rank[px] < self.rank[py]:
2271
+ if study.rank[px] < study.rank[py]:
2175
2272
  px, py = py, px
2176
- self.parent[py] = px
2177
- if self.rank[px] == self.rank[py]:
2178
- self.rank[px] += 1
2273
+ study.parent[py] = px
2274
+ if study.rank[px] == study.rank[py]:
2275
+ study.rank[px] += 1
2179
2276
 
2180
2277
  n_features = len(features)
2181
2278
  uf = UnionFind(n_features)
@@ -2208,39 +2305,39 @@ def _cluster_consensus_features(features: list, rt_tol: float, mz_tol: float) ->
2208
2305
  return list(groups_by_root.values())
2209
2306
 
2210
2307
 
2211
- def _reset_consensus_data(self):
2308
+ def _reset_consensus_data(study):
2212
2309
  """Reset consensus-related DataFrames at the start of merge."""
2213
- self.consensus_df = pl.DataFrame()
2214
- self.consensus_ms2 = pl.DataFrame()
2215
- self.consensus_mapping_df = pl.DataFrame()
2310
+ study.consensus_df = pl.DataFrame()
2311
+ study.consensus_ms2 = pl.DataFrame()
2312
+ study.consensus_mapping_df = pl.DataFrame()
2216
2313
 
2217
2314
 
2218
- def _extract_consensus_features(self, consensus_map, min_samples, cached_adducts_df=None, cached_valid_adducts=None):
2315
+ def _extract_consensus_features(study, consensus_map, min_samples, cached_adducts_df=None, cached_valid_adducts=None):
2219
2316
  """Extract consensus features and build metadata."""
2220
- # create a dict to map uid to feature_uid using self.features_df
2317
+ # create a dict to map uid to feature_uid using study.features_df
2221
2318
  feature_uid_map = {
2222
2319
  row["feature_id"]: row["feature_uid"]
2223
- for row in self.features_df.iter_rows(named=True)
2320
+ for row in study.features_df.iter_rows(named=True)
2224
2321
  }
2225
2322
  imax = consensus_map.size()
2226
2323
 
2227
- self.logger.debug(f"Found {imax} feature groups by clustering.")
2324
+ study.logger.debug(f"Found {imax} feature groups by clustering.")
2228
2325
 
2229
2326
  # Pre-build fast lookup tables for features_df data using optimized approach
2230
- features_lookup = _optimized_feature_lookup(self, self.features_df)
2327
+ features_lookup = _optimized_feature_lookup(study, study.features_df)
2231
2328
 
2232
2329
  # create a list to store the consensus mapping
2233
2330
  consensus_mapping = []
2234
2331
  metadata_list = []
2235
2332
 
2236
- tqdm_disable = self.log_level not in ["TRACE", "DEBUG"]
2333
+ tqdm_disable = study.log_level not in ["TRACE", "DEBUG"]
2237
2334
 
2238
2335
  for i, feature in enumerate(
2239
2336
  tqdm(
2240
2337
  consensus_map,
2241
2338
  total=imax,
2242
2339
  disable=tqdm_disable,
2243
- desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Extract metadata",
2340
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {study.log_label}Extract metadata",
2244
2341
  ),
2245
2342
  ):
2246
2343
  # get all features in the feature map with the same unique id as the consensus feature
@@ -2486,7 +2583,7 @@ def _extract_consensus_features(self, consensus_map, min_samples, cached_adducts
2486
2583
  adduct_mass_shift_top = 1.007825
2487
2584
  else:
2488
2585
  # No valid adducts found - assign default based on study polarity
2489
- study_polarity = getattr(self, "polarity", "positive")
2586
+ study_polarity = getattr(study, "polarity", "positive")
2490
2587
  if study_polarity in ["negative", "neg"]:
2491
2588
  # Negative mode default
2492
2589
  adduct_top = "[M-?]1-"
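When no valid adduct can be assigned, the fallback above picks a generic placeholder from the study polarity; only the negative-mode branch is visible in this hunk. A hedged sketch of that selection, with the positive-mode label assumed for illustration:

    def default_adduct_label(polarity: str) -> str:
        # Negative-mode placeholder mirrors the hunk above; the
        # positive-mode label is an assumption, not taken from the diff.
        if polarity in ("negative", "neg"):
            return "[M-?]1-"
        return "[M+?]1+"

    print(default_adduct_label(getattr(object(), "polarity", "positive")))  # [M+?]1+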
@@ -2618,55 +2715,55 @@ def _extract_consensus_features(self, consensus_map, min_samples, cached_adducts
2618
2715
  )
2619
2716
 
2620
2717
  consensus_mapping_df = pl.DataFrame(consensus_mapping)
2621
- # remove all rows in consensus_mapping_df where consensus_id is not in self.featured_df['uid']
2718
+ # remove all rows in consensus_mapping_df where feature_uid is not in study.features_df['feature_uid']

2622
2719
  l1 = len(consensus_mapping_df)
2623
2720
  consensus_mapping_df = consensus_mapping_df.filter(
2624
- pl.col("feature_uid").is_in(self.features_df["feature_uid"].to_list()),
2721
+ pl.col("feature_uid").is_in(study.features_df["feature_uid"].to_list()),
2625
2722
  )
2626
- self.logger.debug(
2723
+ study.logger.debug(
2627
2724
  f"Filtered {l1 - len(consensus_mapping_df)} orphan features from maps.",
2628
2725
  )
2629
- self.consensus_mapping_df = consensus_mapping_df
2630
- self.consensus_df = pl.DataFrame(metadata_list, strict=False)
2726
+ study.consensus_mapping_df = consensus_mapping_df
2727
+ study.consensus_df = pl.DataFrame(metadata_list, strict=False)
2631
2728
 
2632
2729
  if min_samples is None:
2633
2730
  min_samples = 1
2634
2731
  if min_samples < 1:
2635
- min_samples = int(min_samples * len(self.samples_df))
2732
+ min_samples = int(min_samples * len(study.samples_df))
2636
2733
 
2637
2734
  # Validate that min_samples doesn't exceed the number of samples
2638
- if min_samples > len(self.samples_df):
2639
- self.logger.warning(
2640
- f"min_samples ({min_samples}) exceeds the number of samples ({len(self.samples_df)}). "
2641
- f"Setting min_samples to {len(self.samples_df)}.",
2735
+ if min_samples > len(study.samples_df):
2736
+ study.logger.warning(
2737
+ f"min_samples ({min_samples}) exceeds the number of samples ({len(study.samples_df)}). "
2738
+ f"Setting min_samples to {len(study.samples_df)}.",
2642
2739
  )
2643
- min_samples = len(self.samples_df)
2740
+ min_samples = len(study.samples_df)
2644
2741
 
2645
2742
  # filter out consensus features with less than min_samples features
2646
- l1 = len(self.consensus_df)
2647
- self.consensus_df = self.consensus_df.filter(
2743
+ l1 = len(study.consensus_df)
2744
+ study.consensus_df = study.consensus_df.filter(
2648
2745
  pl.col("number_samples") >= min_samples,
2649
2746
  )
2650
- self.logger.debug(
2651
- f"Filtered {l1 - len(self.consensus_df)} consensus features with less than {min_samples} samples.",
2747
+ study.logger.debug(
2748
+ f"Filtered {l1 - len(study.consensus_df)} consensus features with less than {min_samples} samples.",
2652
2749
  )
2653
2750
  # filter out consensus mapping with less than min_samples features
2654
- self.consensus_mapping_df = self.consensus_mapping_df.filter(
2655
- pl.col("consensus_uid").is_in(self.consensus_df["consensus_uid"].to_list()),
2751
+ study.consensus_mapping_df = study.consensus_mapping_df.filter(
2752
+ pl.col("consensus_uid").is_in(study.consensus_df["consensus_uid"].to_list()),
2656
2753
  )
2657
2754
 
2658
- self.consensus_map = consensus_map
2755
+ study.consensus_map = consensus_map
2659
2756
 
2660
2757
 
2661
- def _perform_adduct_grouping(self, rt_tol, mz_tol):
2758
+ def _perform_adduct_grouping(study, rt_tol, mz_tol):
2662
2759
  """Perform adduct grouping on consensus features."""
2663
2760
  import polars as pl
2664
2761
 
2665
2762
  # Add adduct grouping and adduct_of assignment
2666
- if len(self.consensus_df) > 0:
2763
+ if len(study.consensus_df) > 0:
2667
2764
  # Get relevant columns for grouping
2668
2765
  consensus_data = []
2669
- for row in self.consensus_df.iter_rows(named=True):
2766
+ for row in study.consensus_df.iter_rows(named=True):
2670
2767
  consensus_data.append(
2671
2768
  {
2672
2769
  "consensus_uid": row["consensus_uid"],
@@ -2679,11 +2776,11 @@ def _perform_adduct_grouping(self, rt_tol, mz_tol):
2679
2776
 
2680
2777
  # Use optimized adduct grouping
2681
2778
  adduct_group_list, adduct_of_list = _optimized_adduct_grouping(
2682
- self, consensus_data, rt_tol, mz_tol
2779
+ study, consensus_data, rt_tol, mz_tol
2683
2780
  )
2684
2781
 
2685
2782
  # Add the new columns to consensus_df
2686
- self.consensus_df = self.consensus_df.with_columns(
2783
+ study.consensus_df = study.consensus_df.with_columns(
2687
2784
  [
2688
2785
  pl.Series("adduct_group", adduct_group_list, dtype=pl.Int64),
2689
2786
  pl.Series("adduct_of", adduct_of_list, dtype=pl.Int64),
@@ -2691,7 +2788,7 @@ def _perform_adduct_grouping(self, rt_tol, mz_tol):
2691
2788
  )
2692
2789
 
2693
2790
 
2694
- def _count_tight_clusters(self, mz_tol: float = 0.04, rt_tol: float = 0.3) -> int:
2791
+ def _count_tight_clusters(study, mz_tol: float = 0.04, rt_tol: float = 0.3) -> int:
2695
2792
  """
2696
2793
  Count consensus features grouped in tight clusters.
2697
2794
 
@@ -2702,12 +2799,12 @@ def _count_tight_clusters(self, mz_tol: float = 0.04, rt_tol: float = 0.3) -> in
2702
2799
  Returns:
2703
2800
  Number of tight clusters found
2704
2801
  """
2705
- if len(self.consensus_df) < 2:
2802
+ if len(study.consensus_df) < 2:
2706
2803
  return 0
2707
2804
 
2708
2805
  # Extract consensus feature data
2709
2806
  consensus_data = []
2710
- for row in self.consensus_df.iter_rows(named=True):
2807
+ for row in study.consensus_df.iter_rows(named=True):
2711
2808
  consensus_data.append({
2712
2809
  'consensus_uid': row['consensus_uid'],
2713
2810
  'mz': row['mz'],
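_count_tight_clusters serves as a quality metric in the final merge log: it counts groups of consensus features that still sit within a very small m/z (default 0.04) and RT (default 0.3) window of each other, i.e. likely over-segmentation. The exact algorithm is not visible between these hunks; a minimal sketch of one way to compute such a count:

    def count_tight_clusters(features, mz_tol=0.04, rt_tol=0.3):
        # features: list of (mz, rt); count groups of >= 2 features whose
        # members fall within both tolerances of a seed feature (greedy).
        used = set()
        clusters = 0
        for i, (mz_i, rt_i) in enumerate(features):
            if i in used:
                continue
            members = [i]
            for j in range(i + 1, len(features)):
                if j in used:
                    continue
                mz_j, rt_j = features[j]
                if abs(mz_i - mz_j) <= mz_tol and abs(rt_i - rt_j) <= rt_tol:
                    members.append(j)
            if len(members) > 1:
                clusters += 1
                used.update(members)
        return clusters

    print(count_tight_clusters([(300.10, 12.0), (300.12, 12.1), (450.5, 30.0)]))  # 1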
@@ -2768,7 +2865,7 @@ def _count_tight_clusters(self, mz_tol: float = 0.04, rt_tol: float = 0.3) -> in
2768
2865
  return tight_clusters_count
2769
2866
 
2770
2867
 
2771
- def _consensus_cleanup(self, rt_tol, mz_tol):
2868
+ def _consensus_cleanup(study, rt_tol, mz_tol):
2772
2869
  """
2773
2870
  Consensus cleanup to merge over-segmented consensus features and remove isotopic features.
2774
2871
 
@@ -2777,20 +2874,20 @@ def _consensus_cleanup(self, rt_tol, mz_tol):
2777
2874
  (too many features in very tight m/z and RT windows)
2778
2875
  2. Performs deisotoping to remove +1 and +2 isotopic features
2779
2876
  """
2780
- if len(self.consensus_df) == 0:
2877
+ if len(study.consensus_df) == 0:
2781
2878
  return
2782
2879
 
2783
- initial_count = len(self.consensus_df)
2880
+ initial_count = len(study.consensus_df)
2784
2881
 
2785
2882
  # Only perform enhanced post-clustering if there are many features
2786
2883
  if initial_count < 50:
2787
2884
  return
2788
2885
 
2789
- self.logger.debug(f"Enhanced post-clustering: processing {initial_count} consensus features")
2886
+ study.logger.debug(f"Enhanced post-clustering: processing {initial_count} consensus features")
2790
2887
 
2791
2888
  # Find tight clusters using spatial binning
2792
2889
  consensus_data = []
2793
- for row in self.consensus_df.iter_rows(named=True):
2890
+ for row in study.consensus_df.iter_rows(named=True):
2794
2891
  consensus_data.append({
2795
2892
  'consensus_uid': row['consensus_uid'],
2796
2893
  'mz': row['mz'],
@@ -2873,7 +2970,7 @@ def _consensus_cleanup(self, rt_tol, mz_tol):
2873
2970
  if not merge_groups:
2874
2971
  return
2875
2972
 
2876
- self.logger.debug(f"Found {len(merge_groups)} over-segmented clusters to merge")
2973
+ study.logger.debug(f"Found {len(merge_groups)} over-segmented clusters to merge")
2877
2974
 
2878
2975
  # Merge clusters by keeping the most representative feature
2879
2976
  uids_to_remove = set()
@@ -2892,25 +2989,25 @@ def _consensus_cleanup(self, rt_tol, mz_tol):
2892
2989
 
2893
2990
  if uids_to_remove:
2894
2991
  # Remove merged features from consensus_df
2895
- self.consensus_df = self.consensus_df.filter(
2992
+ study.consensus_df = study.consensus_df.filter(
2896
2993
  ~pl.col('consensus_uid').is_in(list(uids_to_remove))
2897
2994
  )
2898
2995
 
2899
2996
  # Also update consensus_mapping_df if it exists
2900
- if hasattr(self, 'consensus_mapping_df') and not self.consensus_mapping_df.is_empty():
2901
- self.consensus_mapping_df = self.consensus_mapping_df.filter(
2997
+ if hasattr(study, 'consensus_mapping_df') and not study.consensus_mapping_df.is_empty():
2998
+ study.consensus_mapping_df = study.consensus_mapping_df.filter(
2902
2999
  ~pl.col('consensus_uid').is_in(list(uids_to_remove))
2903
3000
  )
2904
3001
 
2905
- final_count = len(self.consensus_df)
3002
+ final_count = len(study.consensus_df)
2906
3003
  reduction = initial_count - final_count
2907
3004
  reduction_pct = (reduction / initial_count) * 100
2908
3005
 
2909
3006
  if reduction > 0:
2910
- self.logger.debug(f"Enhanced post-clustering: {initial_count} → {final_count} features ({reduction_pct:.1f}% reduction)")
3007
+ study.logger.debug(f"Enhanced post-clustering: {initial_count} → {final_count} features ({reduction_pct:.1f}% reduction)")
2911
3008
 
2912
3009
  # Step 2: Deisotoping - Remove +1 and +2 isotopic consensus features
2913
- pre_deisotoping_count = len(self.consensus_df)
3010
+ pre_deisotoping_count = len(study.consensus_df)
2914
3011
  isotope_uids_to_remove = set()
2915
3012
 
2916
3013
  # Use strict tolerances for deisotoping (same as declustering)
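Deisotoping removes consensus features that look like +1 or +2 isotopologues of a nearby feature: co-eluting within a strict RT window, shifted by roughly one or two 13C-12C mass differences (about 1.0034 Da for singly charged ions), and typically lower in intensity. A self-contained sketch under those standard assumptions (tolerances and the intensity check are illustrative, not the package's exact rules):

    C13_SHIFT = 1.003355  # mass difference between 13C and 12C in Da

    def find_isotope_uids(features, mz_tol=0.01, rt_tol=0.5):
        # features: dicts with uid, mz, rt, intensity; charge 1 assumed.
        isotopes = set()
        for f in features:
            for g in features:
                if f is g:
                    continue
                for n in (1, 2):  # +1 and +2 isotopologues
                    expected_mz = g["mz"] + n * C13_SHIFT
                    if (abs(f["mz"] - expected_mz) <= mz_tol
                            and abs(f["rt"] - g["rt"]) <= rt_tol
                            and f["intensity"] < g["intensity"]):
                        isotopes.add(f["uid"])
        return isotopes

    features = [
        {"uid": 1, "mz": 300.1000, "rt": 12.0, "intensity": 1e6},
        {"uid": 2, "mz": 301.1034, "rt": 12.1, "intensity": 2e5},  # +1 isotope of uid 1
        {"uid": 3, "mz": 310.2000, "rt": 20.0, "intensity": 5e5},
    ]
    print(find_isotope_uids(features))  # {2}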
@@ -2919,7 +3016,7 @@ def _consensus_cleanup(self, rt_tol, mz_tol):
2919
3016
 
2920
3017
  # Get current consensus data for isotope detection
2921
3018
  current_consensus_data = []
2922
- for row in self.consensus_df.iter_rows(named=True):
3019
+ for row in study.consensus_df.iter_rows(named=True):
2923
3020
  current_consensus_data.append({
2924
3021
  'consensus_uid': row['consensus_uid'],
2925
3022
  'mz': row['mz'],
@@ -2970,31 +3067,31 @@ def _consensus_cleanup(self, rt_tol, mz_tol):
2970
3067
 
2971
3068
  # Remove isotopic features
2972
3069
  if isotope_uids_to_remove:
2973
- self.consensus_df = self.consensus_df.filter(
3070
+ study.consensus_df = study.consensus_df.filter(
2974
3071
  ~pl.col('consensus_uid').is_in(list(isotope_uids_to_remove))
2975
3072
  )
2976
3073
 
2977
3074
  # Also update consensus_mapping_df if it exists
2978
- if hasattr(self, 'consensus_mapping_df') and not self.consensus_mapping_df.is_empty():
2979
- self.consensus_mapping_df = self.consensus_mapping_df.filter(
3075
+ if hasattr(study, 'consensus_mapping_df') and not study.consensus_mapping_df.is_empty():
3076
+ study.consensus_mapping_df = study.consensus_mapping_df.filter(
2980
3077
  ~pl.col('consensus_uid').is_in(list(isotope_uids_to_remove))
2981
3078
  )
2982
3079
 
2983
- post_deisotoping_count = len(self.consensus_df)
3080
+ post_deisotoping_count = len(study.consensus_df)
2984
3081
  isotope_reduction = pre_deisotoping_count - post_deisotoping_count
2985
3082
 
2986
3083
  if isotope_reduction > 0:
2987
- self.logger.debug(f"Deisotoping: {pre_deisotoping_count} → {post_deisotoping_count} features ({isotope_reduction} isotopic features removed)")
3084
+ study.logger.debug(f"Deisotoping: {pre_deisotoping_count} → {post_deisotoping_count} features ({isotope_reduction} isotopic features removed)")
2988
3085
 
2989
3086
  # Final summary
2990
- final_count = len(self.consensus_df)
3087
+ final_count = len(study.consensus_df)
2991
3088
  total_reduction = initial_count - final_count
2992
3089
  if total_reduction > 0:
2993
3090
  total_reduction_pct = (total_reduction / initial_count) * 100
2994
- self.logger.debug(f"Consensus cleanup complete: {initial_count} → {final_count} features ({total_reduction_pct:.1f}% total reduction)")
3091
+ study.logger.debug(f"Consensus cleanup complete: {initial_count} → {final_count} features ({total_reduction_pct:.1f}% total reduction)")
2995
3092
 
2996
3093
 
2997
- def _identify_adduct_by_mass_shift(self, rt_tol, cached_adducts_df=None):
3094
+ def _identify_adduct_by_mass_shift(study, rt_tol, cached_adducts_df=None):
2998
3095
  """
2999
3096
  Identify coeluting consensus features by characteristic mass shifts between adducts
3000
3097
  and update their adduct information accordingly.
@@ -3014,23 +3111,24 @@ def _identify_adduct_by_mass_shift(self, rt_tol, cached_adducts_df=None):
3014
3111
  from collections import defaultdict
3015
3112
 
3016
3113
  # Check if consensus_df exists and has features
3017
- if len(self.consensus_df) == 0:
3018
- self.logger.debug("No consensus features for adduct identification by mass shift")
3114
+ if len(study.consensus_df) == 0:
3115
+ study.logger.debug("No consensus features for adduct identification by mass shift")
3019
3116
  return
3020
3117
 
3021
- self.logger.info(f"Identifying coeluting adducts by mass shifts in {len(self.consensus_df)} consensus features...")
3118
+ study.logger.info(f"Identifying coeluting adducts by mass shifts in {len(study.consensus_df)} consensus features...")
3022
3119
 
3023
3120
  # Get adducts DataFrame if not provided
3024
3121
  if cached_adducts_df is None or cached_adducts_df.is_empty():
3025
3122
  try:
3026
3123
  # Use lower min_probability for better adduct coverage in mass shift identification
3027
- cached_adducts_df = self._get_adducts(min_probability=0.01)
3124
+ from masster.study.id import _get_adducts
3125
+ cached_adducts_df = _get_adducts(study, min_probability=0.01)
3028
3126
  except Exception as e:
3029
- self.logger.warning(f"Could not retrieve adducts for mass shift identification: {e}")
3127
+ study.logger.warning(f"Could not retrieve adducts for mass shift identification: {e}")
3030
3128
  return
3031
3129
 
3032
3130
  if cached_adducts_df.is_empty():
3033
- self.logger.debug("No adducts available for mass shift identification")
3131
+ study.logger.debug("No adducts available for mass shift identification")
3034
3132
  return
3035
3133
 
3036
3134
  # Build catalogue of mass shifts between adducts
@@ -3081,11 +3179,11 @@ def _identify_adduct_by_mass_shift(self, rt_tol, cached_adducts_df=None):
3081
3179
  "to_charge": charge2
3082
3180
  })
3083
3181
 
3084
- self.logger.debug(f"Generated mass shift catalog with {len(mass_shift_catalog)} unique shifts")
3182
+ study.logger.debug(f"Generated mass shift catalog with {len(mass_shift_catalog)} unique shifts")
3085
3183
 
3086
3184
  # Get consensus features data
3087
3185
  consensus_data = []
3088
- for i, row in enumerate(self.consensus_df.iter_rows(named=True)):
3186
+ for i, row in enumerate(study.consensus_df.iter_rows(named=True)):
3089
3187
  consensus_data.append({
3090
3188
  "index": i,
3091
3189
  "consensus_uid": row["consensus_uid"],
@@ -3234,7 +3332,7 @@ def _identify_adduct_by_mass_shift(self, rt_tol, cached_adducts_df=None):
3234
3332
  }
3235
3333
 
3236
3334
  updated_count += 2
3237
- self.logger.debug(
3335
+ study.logger.debug(
3238
3336
  f"Identified adduct pair: {from_adduct_name} (m/z {from_feature['mz']:.4f}) "
3239
3337
  f"<-> {to_adduct_name} (m/z {to_feature['mz']:.4f}), "
3240
3338
  f"RT {rt1:.2f}s, Δm/z {mz_diff:.4f}"
@@ -3244,7 +3342,7 @@ def _identify_adduct_by_mass_shift(self, rt_tol, cached_adducts_df=None):
3244
3342
  # Apply updates to consensus_df
3245
3343
  if adduct_updates:
3246
3344
  # Prepare update data
3247
- consensus_uids = self.consensus_df["consensus_uid"].to_list()
3345
+ consensus_uids = study.consensus_df["consensus_uid"].to_list()
3248
3346
 
3249
3347
  new_adduct_top = []
3250
3348
  new_adduct_charge_top = []
@@ -3261,88 +3359,88 @@ def _identify_adduct_by_mass_shift(self, rt_tol, cached_adducts_df=None):
3261
3359
  else:
3262
3360
  # Keep existing values
3263
3361
  row_idx = consensus_uids.index(uid)
3264
- row = self.consensus_df.row(row_idx, named=True)
3362
+ row = study.consensus_df.row(row_idx, named=True)
3265
3363
  new_adduct_top.append(row.get("adduct_top"))
3266
3364
  new_adduct_charge_top.append(row.get("adduct_charge_top"))
3267
3365
  new_adduct_mass_neutral_top.append(row.get("adduct_mass_neutral_top"))
3268
3366
  new_adduct_mass_shift_top.append(row.get("adduct_mass_shift_top"))
3269
3367
 
3270
3368
  # Update the DataFrame
3271
- self.consensus_df = self.consensus_df.with_columns([
3369
+ study.consensus_df = study.consensus_df.with_columns([
3272
3370
  pl.Series("adduct_top", new_adduct_top),
3273
3371
  pl.Series("adduct_charge_top", new_adduct_charge_top),
3274
3372
  pl.Series("adduct_mass_neutral_top", new_adduct_mass_neutral_top),
3275
3373
  pl.Series("adduct_mass_shift_top", new_adduct_mass_shift_top)
3276
3374
  ])
3277
3375
 
3278
- self.logger.info(f"Updated adduct assignments for {updated_count} consensus features based on mass shifts")
3376
+ study.logger.info(f"Updated adduct assignments for {updated_count} consensus features based on mass shifts")
3279
3377
  else:
3280
- self.logger.debug("No consensus features updated based on mass shift analysis")
3378
+ study.logger.debug("No consensus features updated based on mass shift analysis")
3281
3379
 
3282
3380
 
3283
- def _finalize_merge(self, link_ms2, min_samples):
3381
+ def _finalize_merge(study, link_ms2, min_samples):
3284
3382
  """Complete the merge process with final calculations and cleanup."""
3285
3383
  import polars as pl
3286
3384
 
3287
3385
  # Check if consensus_df is empty or missing required columns
3288
- if len(self.consensus_df) == 0 or "number_samples" not in self.consensus_df.columns:
3289
- self.logger.debug("No consensus features found or consensus_df is empty. Skipping finalize merge.")
3386
+ if len(study.consensus_df) == 0 or "number_samples" not in study.consensus_df.columns:
3387
+ study.logger.debug("No consensus features found or consensus_df is empty. Skipping finalize merge.")
3290
3388
  return
3291
3389
 
3292
3390
  # Validate min_samples parameter
3293
3391
  if min_samples is None:
3294
3392
  min_samples = 1
3295
3393
  if min_samples < 1:
3296
- min_samples = int(min_samples * len(self.samples_df))
3394
+ min_samples = int(min_samples * len(study.samples_df))
3297
3395
 
3298
3396
  # Validate that min_samples doesn't exceed the number of samples
3299
- if min_samples > len(self.samples_df):
3300
- self.logger.warning(
3301
- f"min_samples ({min_samples}) exceeds the number of samples ({len(self.samples_df)}). "
3302
- f"Setting min_samples to {len(self.samples_df)}.",
3397
+ if min_samples > len(study.samples_df):
3398
+ study.logger.warning(
3399
+ f"min_samples ({min_samples}) exceeds the number of samples ({len(study.samples_df)}). "
3400
+ f"Setting min_samples to {len(study.samples_df)}.",
3303
3401
  )
3304
- min_samples = len(self.samples_df)
3402
+ min_samples = len(study.samples_df)
3305
3403
 
3306
3404
  # Filter out consensus features with less than min_samples features
3307
- l1 = len(self.consensus_df)
3308
- self.consensus_df = self.consensus_df.filter(
3405
+ l1 = len(study.consensus_df)
3406
+ study.consensus_df = study.consensus_df.filter(
3309
3407
  pl.col("number_samples") >= min_samples,
3310
3408
  )
3311
- self.logger.debug(
3312
- f"Filtered {l1 - len(self.consensus_df)} consensus features with less than {min_samples} samples.",
3409
+ study.logger.debug(
3410
+ f"Filtered {l1 - len(study.consensus_df)} consensus features with less than {min_samples} samples.",
3313
3411
  )
3314
3412
 
3315
3413
  # Filter out consensus mapping with less than min_samples features
3316
- self.consensus_mapping_df = self.consensus_mapping_df.filter(
3317
- pl.col("consensus_uid").is_in(self.consensus_df["consensus_uid"].to_list()),
3414
+ study.consensus_mapping_df = study.consensus_mapping_df.filter(
3415
+ pl.col("consensus_uid").is_in(study.consensus_df["consensus_uid"].to_list()),
3318
3416
  )
3319
3417
 
3320
3418
  # Calculate the completeness of the consensus map
3321
3419
  # Log completion with tight cluster metrics
3322
- if len(self.consensus_df) > 0 and len(self.samples_df) > 0:
3420
+ if len(study.consensus_df) > 0 and len(study.samples_df) > 0:
3323
3421
  c = (
3324
- len(self.consensus_mapping_df)
3325
- / len(self.consensus_df)
3326
- / len(self.samples_df)
3422
+ len(study.consensus_mapping_df)
3423
+ / len(study.consensus_df)
3424
+ / len(study.samples_df)
3327
3425
  )
3328
3426
 
3329
3427
  # Count tight clusters with specified thresholds
3330
- tight_clusters = _count_tight_clusters(self,mz_tol=0.04, rt_tol=0.3)
3428
+ tight_clusters = _count_tight_clusters(study, mz_tol=0.04, rt_tol=0.3)
3331
3429
 
3332
- self.logger.info(
3333
- f"Merging completed. Consensus features: {len(self.consensus_df)}. "
3430
+ study.logger.info(
3431
+ f"Merging completed. Consensus features: {len(study.consensus_df)}. "
3334
3432
  f"Completeness: {c:.2f}. Tight clusters left: {tight_clusters}.",
3335
3433
  )
3336
3434
  else:
3337
- self.logger.warning(
3338
- f"Merging completed with empty result. Consensus features: {len(self.consensus_df)}. "
3435
+ study.logger.warning(
3436
+ f"Merging completed with empty result. Consensus features: {len(study.consensus_df)}. "
3339
3437
  f"This may be due to min_samples ({min_samples}) being too high for the available data.",
3340
3438
  )
3341
3439
 
3342
3440
  # add iso data from raw files.
3343
- self.find_iso()
3441
+ study.find_iso()
3344
3442
  if link_ms2:
3345
- self.find_ms2()
3443
+ study.find_ms2()
3346
3444
 
3347
3445
 
3348
3446
  def _optimized_feature_lookup(study_obj, features_df):
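The completeness figure logged by _finalize_merge above is the filled fraction of the consensus-feature-by-sample matrix: completeness = n_mappings / (n_consensus_features * n_samples), written in the code as two successive divisions. A worked example:

    n_mappings = 1800            # rows in consensus_mapping_df
    n_consensus_features = 250   # rows in consensus_df
    n_samples = 10               # rows in samples_df

    completeness = n_mappings / n_consensus_features / n_samples
    print(f"Completeness: {completeness:.2f}")  # 1800 / (250 * 10) = 0.72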
@@ -3419,24 +3517,24 @@ def _optimized_adduct_grouping(study_obj, consensus_data, rt_tol, mz_tol):
3419
3517
 
3420
3518
  # Union-Find for efficient grouping
3421
3519
  class UnionFind:
3422
- def __init__(self, n):
3423
- self.parent = list(range(n))
3424
- self.rank = [0] * n
3520
+ def __init__(study, n):
3521
+ study.parent = list(range(n))
3522
+ study.rank = [0] * n
3425
3523
 
3426
- def find(self, x):
3427
- if self.parent[x] != x:
3428
- self.parent[x] = self.find(self.parent[x])
3429
- return self.parent[x]
3524
+ def find(study, x):
3525
+ if study.parent[x] != x:
3526
+ study.parent[x] = study.find(study.parent[x])
3527
+ return study.parent[x]
3430
3528
 
3431
- def union(self, x, y):
3432
- px, py = self.find(x), self.find(y)
3529
+ def union(study, x, y):
3530
+ px, py = study.find(x), study.find(y)
3433
3531
  if px == py:
3434
3532
  return
3435
- if self.rank[px] < self.rank[py]:
3533
+ if study.rank[px] < study.rank[py]:
3436
3534
  px, py = py, px
3437
- self.parent[py] = px
3438
- if self.rank[px] == self.rank[py]:
3439
- self.rank[px] += 1
3535
+ study.parent[py] = px
3536
+ if study.rank[px] == study.rank[py]:
3537
+ study.rank[px] += 1
3440
3538
 
3441
3539
  uid_to_idx = {feature[0]: i for i, feature in enumerate(valid_features)}
3442
3540
  uf = UnionFind(len(valid_features))