masster 0.5.1__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

@@ -15,77 +15,6 @@ from masster.study.defaults import (
 )
 
 
-def _generate_feature_maps_on_demand_for_align(study):
-    """
-    Generate feature maps on-demand from study.features_df for alignment operations.
-    Returns temporary feature maps that are not cached in the study.
-
-    Args:
-        study: Study object containing features_df and samples_df
-
-    Returns:
-        list: List of temporary FeatureMap objects
-    """
-    import polars as pl
-    import pyopenms as oms
-
-    if study.features_df is None or len(study.features_df) == 0:
-        study.logger.error("No features_df available for generating feature maps")
-        return []
-
-    temp_feature_maps = []
-
-    # Process each sample in order
-    for sample_index, row_dict in enumerate(study.samples_df.iter_rows(named=True)):
-        sample_uid = row_dict["sample_uid"]
-        sample_name = row_dict["sample_name"]
-
-        # Get features for this sample from features_df
-        sample_features = study.features_df.filter(pl.col("sample_uid") == sample_uid)
-
-        # Create new FeatureMap
-        feature_map = oms.FeatureMap()
-
-        # Convert DataFrame features to OpenMS Features
-        for feature_row in sample_features.iter_rows(named=True):
-            feature = oms.Feature()
-
-            # Set properties from DataFrame (handle missing values gracefully)
-            try:
-                # Skip features with missing critical data
-                if feature_row["mz"] is None:
-                    study.logger.warning("Skipping feature due to missing mz")
-                    continue
-                if feature_row["rt"] is None:
-                    study.logger.warning("Skipping feature due to missing rt")
-                    continue
-                if feature_row["inty"] is None:
-                    study.logger.warning("Skipping feature due to missing inty")
-                    continue
-
-                feature.setUniqueId(int(feature_row["feature_id"]))
-                feature.setMZ(float(feature_row["mz"]))
-                feature.setRT(float(feature_row["rt"]))
-                feature.setIntensity(float(feature_row["inty"]))
-
-                # Handle optional fields that might be None
-                if feature_row.get("quality") is not None:
-                    feature.setOverallQuality(float(feature_row["quality"]))
-                if feature_row.get("charge") is not None:
-                    feature.setCharge(int(feature_row["charge"]))
-
-                # Add to feature map
-                feature_map.push_back(feature)
-            except (ValueError, TypeError) as e:
-                study.logger.warning(f"Skipping feature due to conversion error: {e}")
-                continue
-
-        temp_feature_maps.append(feature_map)
-
-    study.logger.debug(f"Generated {len(temp_feature_maps)} temporary feature maps from features_df for alignment")
-    return temp_feature_maps
-
-
 def align(self, **kwargs):
     """Align feature maps using pose clustering or KD algorithm and update feature RTs.
 
@@ -103,30 +32,7 @@ def align(self, **kwargs):
         - algorithm (str): Alignment algorithm ('pc' for PoseClustering, 'kd' for KD).
 
         KD algorithm specific parameters:
-        - min_samples (int): Minimum number of samples required for KD alignment.
-        - nr_partitions (int): Number of partitions in m/z dimension.
-        - warp_enabled (bool): Enable non-linear retention time transformation.
-        - warp_rt_tol (float): RT tolerance for the LOWESS fit.
         - warp_mz_tol (float): m/z tolerance for the LOWESS fit.
-        - warp_max_pairwise_log_fc (float): Maximum absolute log10 fold-change threshold for pairing.
-        - warp_min_rel_cc_size (float): Minimum relative connected component size.
-        - warp_max_nr_conflicts (int): Allow up to this many conflicts per connected component for alignment.
-        - link_rt_tol (float): Width of RT tolerance window for linking features.
-        - link_mz_tol (float): m/z tolerance for linking features.
-        - link_charge_merging (str): Charge merging strategy for linking features.
-        - link_adduct_merging (str): Adduct merging strategy for linking features.
-        - distance_RT_exponent (float): Exponent for normalized RT differences.
-        - distance_RT_weight (float): Weight factor for final RT distances.
-        - distance_MZ_exponent (float): Exponent for normalized m/z differences.
-        - distance_MZ_weight (float): Weight factor for final m/z distances.
-        - distance_intensity_exponent (float): Exponent for differences in relative intensity.
-        - distance_intensity_weight (float): Weight factor for final intensity distances.
-        - distance_intensity_log_transform (str): Log-transform intensities.
-        - LOWESS_span (float): Fraction of datapoints for each local regression.
-        - LOWESS_num_iterations (int): Number of robustifying iterations for LOWESS fitting.
-        - LOWESS_delta (float): Parameter for LOWESS computations (negative auto-computes).
-        - LOWESS_interpolation_type (str): Method for interpolation between datapoints.
-        - LOWESS_extrapolation_type (str): Method for extrapolation outside data range.
     """
     # parameters initialization
     params = align_defaults()
@@ -155,145 +61,31 @@ def align(self, **kwargs):
             )
         else:
             self.logger.warning(f"Unknown parameter '{key}' ignored")
-    # end of parameter initialization
 
     # Store parameters in the Study object
     self.update_history(["align"], params.to_dict())
     self.logger.debug("Parameters stored to align")
 
-    # Generate temporary feature maps on-demand from features_df instead of using cached data
-    self.logger.debug("Generating feature maps on-demand from features_df for alignment")
-    fmaps = _generate_feature_maps_on_demand_for_align(self)
+    # Ensure rt_original exists before starting alignment (both algorithms need this)
+    if "rt_original" not in self.features_df.columns:
+        # add column 'rt_original' after 'rt'
+        rt_index = self.features_df.columns.get_loc("rt") + 1
+        self.features_df.insert(rt_index, "rt_original", 0)
+        self.features_df["rt_original"] = self.features_df["rt"]
+        self.logger.debug("Created rt_original column from current rt values")
 
     # Choose alignment algorithm
     algorithm = params.get("algorithm").lower()
 
     if algorithm == "pc":
-        _align_pose_clustering(self, fmaps, params)
+        _align_pose_clustering(self, params)
 
     elif algorithm == "kd":
-        _align_kd_algorithm(self, fmaps, params)
+        _align_kd_algorithm(self, params)
     else:
        self.logger.error(f"Unknown alignment algorithm '{algorithm}'")
-        # Clean up temporary feature maps to release memory
-        del fmaps
        return
 
-    # check if rt_original exists in features_df, if not, add it after rt
-    if "rt_original" not in self.features_df.columns:
-        # add column 'rt_original' after 'rt'
-        rt_index = self.features_df.columns.get_loc("rt") + 1
-        self.features_df.insert(rt_index, "rt_original", 0)
-        self.features_df["rt_original"] = self.features_df["rt"]
-
-    # iterate through all feature_maps and add the transformed retention times to the features_df
-
-    # Build a fast lookup for (sample_uid, featureUid) to index in features_df
-    feats = self.features_df
-
-    # Pre-build sample_uid lookup for faster access
-    self.logger.debug("Build sample_uid lookup for fast access...")
-    sample_uid_lookup = {
-        idx: row_dict["sample_uid"]
-        for idx, row_dict in enumerate(self.samples_df.iter_rows(named=True))
-    }
-
-    # Build the main lookup using feature_uid (not feature_id)
-    if "feature_id" in feats.columns:
-        # Create lookup mapping (sample_uid, feature_id) to DataFrame index using Polars
-        # Since we need a pandas-style index lookup, we'll create a simple dict
-        sample_uids = feats.get_column("sample_uid").to_list()
-
-        # Handle feature_id column - it might be Object type due to conversion
-        feature_id_col = feats.get_column("feature_id")
-        if feature_id_col.dtype == pl.Object:
-            # If it's Object type, convert to list and let Python handle the conversion
-            feature_ids = feature_id_col.to_list()
-            # Convert to strings if they're not already
-            feature_ids = [str(fid) if fid is not None else None for fid in feature_ids]
-        else:
-            # Safe to cast normally
-            feature_ids = feature_id_col.cast(pl.Utf8).to_list()
-
-        lookup = {
-            (sample_uid, feature_id): idx
-            for idx, (sample_uid, feature_id) in enumerate(
-                zip(sample_uids, feature_ids, strict=True),
-            )
-        }
-    else:
-        # fallback: skip if feature_uid column missing
-        lookup = {}
-        self.logger.warning("feature_id column not found in features_df")
-
-    # Pre-allocate update lists for better performance
-    all_update_idx = []
-    all_update_rt = []
-    all_update_rt_original = []
-
-    tdqm_disable = self.log_level not in ["TRACE", "DEBUG"]
-
-    for index, fm in tqdm(
-        list(enumerate(fmaps)),
-        total=len(fmaps),
-        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Extract RTs",
-        disable=tdqm_disable,
-    ):
-        sample_uid = sample_uid_lookup.get(index)
-        if sample_uid is None:
-            continue
-
-        # Collect all updates for this feature map
-        for f in fm:
-            feature_uid = str(f.getUniqueId())
-            idx = lookup.get((sample_uid, feature_uid))
-            if idx is not None:
-                rt = round(f.getRT(), 3)
-                # rt_or = round(f.getMetaValue("original_RT"), 3) if f.metaValueExists("original_RT") else rt
-                all_update_idx.append(idx)
-                all_update_rt.append(rt)
-                # all_update_rt_original.append(rt_or)
-
-    # Single batch update for all features at once
-    if all_update_idx:
-        # Build a full-length Python list of rt values, update specified indices,
-        # then replace the DataFrame column with a Series that has the same length
-        try:
-            current_rt = self.features_df["rt"].to_list()
-        except Exception:
-            current_rt = [None] * self.features_df.height
-
-        # Defensive: ensure list length equals dataframe height
-        if len(current_rt) != self.features_df.height:
-            current_rt = [None] * self.features_df.height
-
-        for idx, new_rt in zip(all_update_idx, all_update_rt):
-            current_rt[idx] = new_rt
-
-        new_cols = [pl.Series("rt", current_rt)]
-
-        # Update rt_original if corresponding updates were collected
-        if "all_update_rt_original" in locals() and all_update_rt_original:
-            try:
-                current_rt_orig = (
-                    self.features_df["rt_original"].to_list()
-                    if "rt_original" in self.features_df.columns
-                    else [None] * self.features_df.height
-                )
-            except Exception:
-                current_rt_orig = [None] * self.features_df.height
-
-            if len(current_rt_orig) != self.features_df.height:
-                current_rt_orig = [None] * self.features_df.height
-
-            for idx, new_orig in zip(all_update_idx, all_update_rt_original):
-                current_rt_orig[idx] = new_orig
-
-            new_cols.append(pl.Series("rt_original", current_rt_orig))
-
-        # Replace columns in one call
-        self.features_df = self.features_df.with_columns(*new_cols)
-
     self.logger.debug("Alignment completed successfully.")
 
     # Reset consensus data structures after alignment since RT changes invalidate consensus
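
The release snapshots pre-alignment retention times into rt_original before dispatching to either algorithm, so repeated align() calls keep warping from the same baseline. A minimal polars-only sketch of that snapshot step (the helper name is illustrative, not part of the package):

```python
import polars as pl

def ensure_rt_original(features_df: pl.DataFrame) -> pl.DataFrame:
    # Snapshot the current rt values once; later alignment runs read
    # rt_original as their input so a warp is never applied twice.
    if "rt_original" not in features_df.columns:
        features_df = features_df.with_columns(pl.col("rt").alias("rt_original"))
    return features_df
```
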
@@ -307,6 +99,9 @@ def align(self, **kwargs):
     if not self.consensus_ms2.is_empty():
         self.consensus_ms2 = pl.DataFrame()
         consensus_reset_count += 1
+    if not self.id_df.is_empty():
+        self.id_df = pl.DataFrame()
+        consensus_reset_count += 1
 
     # Remove merge and find_ms2 parameters from history since they need to be re-run
     keys_to_remove = ["merge", "find_ms2"]
@@ -319,17 +114,13 @@ def align(self, **kwargs):
             self.logger.debug(f"Removed {key} from history")
 
     if consensus_reset_count > 0 or history_removed_count > 0:
-        self.logger.info(
+        self.logger.debug(
             f"Alignment reset: {consensus_reset_count} consensus structures cleared, {history_removed_count} history entries removed",
         )
 
     if params.get("save_features"):
         self.save_samples()
 
-    # Clean up temporary feature maps to release memory
-    del fmaps
-    self.logger.debug("Temporary feature maps deleted to release memory")
-
 
 def find_ms2(self, **kwargs):
     """
@@ -803,12 +594,73 @@ def _find_closest_valley(chrom, rt, dir="left", threshold=0.9):
     return chrom.rt[idx]
 
 
-def _align_pose_clustering(study_obj, fmaps, params):
+def _align_pose_clustering(study_obj, params):
     """Perform alignment using PoseClustering algorithm."""
     import pyopenms as oms
+    import polars as pl
     from tqdm import tqdm
     from datetime import datetime
 
+    # Generate temporary feature maps on-demand from features_df for PoseClustering
+    study_obj.logger.debug("Generating feature maps on-demand from features_df for PoseClustering alignment")
+
+    tdqm_disable = study_obj.log_level not in ["TRACE", "DEBUG", "INFO"]
+    fmaps = []
+
+    # Process each sample in order with progress bar
+    for sample_index, row_dict in tqdm(
+        list(enumerate(study_obj.samples_df.iter_rows(named=True))),
+        total=len(study_obj.samples_df),
+        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {study_obj.log_label}Generate feature maps",
+        disable=tdqm_disable,
+    ):
+        sample_uid = row_dict["sample_uid"]
+        sample_name = row_dict["sample_name"]
+
+        # Get features for this sample from features_df
+        sample_features = study_obj.features_df.filter(pl.col("sample_uid") == sample_uid)
+
+        # Create new FeatureMap
+        feature_map = oms.FeatureMap()
+
+        # Convert DataFrame features to OpenMS Features
+        for feature_row in sample_features.iter_rows(named=True):
+            feature = oms.Feature()
+
+            # Set properties from DataFrame (handle missing values gracefully)
+            try:
+                # Skip features with missing critical data
+                if feature_row["mz"] is None:
+                    study_obj.logger.warning("Skipping feature due to missing mz")
+                    continue
+                if feature_row["rt"] is None:
+                    study_obj.logger.warning("Skipping feature due to missing rt")
+                    continue
+                if feature_row["inty"] is None:
+                    study_obj.logger.warning("Skipping feature due to missing inty")
+                    continue
+
+                feature.setUniqueId(int(feature_row["feature_id"]))
+                feature.setMZ(float(feature_row["mz"]))
+                feature.setRT(float(feature_row["rt"]))
+                feature.setIntensity(float(feature_row["inty"]))
+
+                # Handle optional fields that might be None
+                if feature_row.get("quality") is not None:
+                    feature.setOverallQuality(float(feature_row["quality"]))
+                if feature_row.get("charge") is not None:
+                    feature.setCharge(int(feature_row["charge"]))
+
+                # Add to feature map
+                feature_map.push_back(feature)
+            except (ValueError, TypeError) as e:
+                study_obj.logger.warning(f"Skipping feature due to conversion error: {e}")
+                continue
+
+        fmaps.append(feature_map)
+
+    study_obj.logger.debug(f"Generated {len(fmaps)} feature maps from features_df for PoseClustering alignment")
+
     # Create PC-specific OpenMS parameters
     params_oms = oms.Param()
     params_oms.setValue("pairfinder:distance_intensity:log_transform", "disabled")
@@ -847,7 +699,6 @@ def _align_pose_clustering(study_obj, fmaps, params):
     study_obj.logger.debug(f"Parameters for alignment: {params}")
 
     # Perform alignment and transformation of feature maps to the reference map (exclude reference map)
-    tdqm_disable = study_obj.log_level not in ["TRACE", "DEBUG", "INFO"]
     for index, fm in tqdm(
         list(enumerate(fmaps)),
         total=len(fmaps),
@@ -879,14 +730,126 @@ def _align_pose_clustering(study_obj, fmaps, params):
             continue
 
     study_obj.alignment_ref_index = ref_index
+
+    # Process feature maps and update features_df with transformed retention times
+    # Build a fast lookup for (sample_uid, featureUid) to index in features_df
+    feats = study_obj.features_df
+
+    # Pre-build sample_uid lookup for faster access
+    study_obj.logger.debug("Build sample_uid lookup for fast access...")
+    sample_uid_lookup = {
+        idx: row_dict["sample_uid"]
+        for idx, row_dict in enumerate(study_obj.samples_df.iter_rows(named=True))
+    }
+
+    # Build the main lookup using feature_uid (not feature_id)
+    if "feature_id" in feats.columns:
+        # Create lookup mapping (sample_uid, feature_id) to DataFrame index using Polars
+        # Since we need a pandas-style index lookup, we'll create a simple dict
+        sample_uids = feats.get_column("sample_uid").to_list()
+
+        # Handle feature_id column - it might be Object type due to conversion
+        feature_id_col = feats.get_column("feature_id")
+        if feature_id_col.dtype == pl.Object:
+            # If it's Object type, convert to list and let Python handle the conversion
+            feature_ids = feature_id_col.to_list()
+            # Convert to strings if they're not already
+            feature_ids = [str(fid) if fid is not None else None for fid in feature_ids]
+        else:
+            # Safe to cast normally
+            feature_ids = feature_id_col.cast(pl.Utf8).to_list()
+
+        lookup = {
+            (sample_uid, feature_id): idx
+            for idx, (sample_uid, feature_id) in enumerate(
+                zip(sample_uids, feature_ids, strict=True),
+            )
+        }
+    else:
+        # fallback: skip if feature_uid column missing
+        lookup = {}
+        study_obj.logger.warning("feature_id column not found in features_df")
+
+    # Pre-allocate update lists for better performance
+    all_update_idx = []
+    all_update_rt = []
+    all_update_rt_original = []
+
+    for index, fm in tqdm(
+        list(enumerate(fmaps)),
+        total=len(fmaps),
+        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {study_obj.log_label}Extract RTs",
+        disable=tdqm_disable,
+    ):
+        sample_uid = sample_uid_lookup.get(index)
+        if sample_uid is None:
+            continue
 
+        # Collect all updates for this feature map
+        for f in fm:
+            feature_uid = str(f.getUniqueId())
+            idx = lookup.get((sample_uid, feature_uid))
+            if idx is not None:
+                rt = round(f.getRT(), 3)
+                # rt_or = round(f.getMetaValue("original_RT"), 3) if f.metaValueExists("original_RT") else rt
+                all_update_idx.append(idx)
+                all_update_rt.append(rt)
+                # all_update_rt_original.append(rt_or)
+
+    # Single batch update for all features at once
+    if all_update_idx:
+        # Build a full-length Python list of rt values, update specified indices,
+        # then replace the DataFrame column with a Series that has the same length
+        try:
+            current_rt = study_obj.features_df["rt"].to_list()
+        except Exception:
+            current_rt = [None] * study_obj.features_df.height
+
+        # Defensive: ensure list length equals dataframe height
+        if len(current_rt) != study_obj.features_df.height:
+            current_rt = [None] * study_obj.features_df.height
+
+        for idx, new_rt in zip(all_update_idx, all_update_rt):
+            current_rt[idx] = new_rt
+
+        new_cols = [pl.Series("rt", current_rt)]
+
+        # Update rt_original if corresponding updates were collected
+        if "all_update_rt_original" in locals() and all_update_rt_original:
+            try:
+                current_rt_orig = (
+                    study_obj.features_df["rt_original"].to_list()
+                    if "rt_original" in study_obj.features_df.columns
+                    else [None] * study_obj.features_df.height
+                )
+            except Exception:
+                current_rt_orig = [None] * study_obj.features_df.height
+
+            if len(current_rt_orig) != study_obj.features_df.height:
+                current_rt_orig = [None] * study_obj.features_df.height
+
+            for idx, new_orig in zip(all_update_idx, all_update_rt_original):
+                current_rt_orig[idx] = new_orig
+
+            new_cols.append(pl.Series("rt_original", current_rt_orig))
+
+        # Replace columns in one call
+        study_obj.features_df = study_obj.features_df.with_columns(*new_cols)
+
+    # Clean up temporary feature maps to release memory
+    del fmaps
+    study_obj.logger.debug("Temporary feature maps deleted to release memory")
 
-def _align_kd_algorithm(study_obj, fmaps, params):
+
+def _align_kd_algorithm(study_obj, params):
     """
-    Custom KD-tree / reference-based alignment.
+    Custom KD-tree / reference-based alignment working directly with features_df.
     """
     import bisect
     import statistics
+    import pyopenms as oms
+    import polars as pl
+    from datetime import datetime
 
     # Pull parameter values - map standard align params to our algorithm
     # Use rt_tol (standard align param) instead of warp_rt_tol for RT tolerance
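
The rewritten KD path (next hunk) gathers each sample's m/z and rt_original arrays in a single group_by pass instead of filtering features_df once per sample. The aggregation pattern in isolation, on a toy frame:

```python
import polars as pl

df = pl.DataFrame({
    "sample_uid": [1, 1, 2],
    "mz": [100.05, 200.10, 100.06],
    "rt_original": [12.3, 45.6, 12.9],
})

# One pass: per-sample feature counts plus m/z and RT arrays, sorted so
# the feature-richest sample (the alignment reference) comes first.
per_sample = (
    df.group_by("sample_uid", maintain_order=True)
    .agg(
        pl.len().alias("feature_count"),
        pl.col("mz").alias("mzs"),
        pl.col("rt_original").alias("rt_originals"),
    )
    .sort("feature_count", descending=True)
)
```
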
@@ -919,26 +882,64 @@ def _align_kd_algorithm(study_obj, fmaps, params):
         _raw_mp = None
     max_points = int(_raw_mp) if _raw_mp is not None else 1000
     study_obj.logger.info(
-        f"Align time axes with rt_tol={params.get('rt_tol')}, min_samples={params.get('min_samples')}, max_points={max_points}",
+        f"KD align: rt_tol={params.get('rt_tol')}, max_points={max_points}",
     )
 
-    # Check if feature maps are empty before proceeding
-    if not fmaps:
-        study_obj.logger.error("No feature maps available for alignment. Cannot proceed with alignment.")
-        raise ValueError("No feature maps available for alignment. This usually indicates that all samples failed to load properly.")
+    # Work directly with features_df instead of feature maps
+    if study_obj.features_df is None or study_obj.features_df.is_empty():
+        study_obj.logger.error("No features_df available for alignment. Cannot proceed with KD alignment.")
+        raise ValueError("No features_df available for alignment. This usually indicates that features were not detected properly.")
+
+    # OPTIMIZATION 1: Group all features by sample_uid in ONE operation instead of filtering repeatedly
+    study_obj.logger.debug("Grouping features efficiently (major speedup)...")
+
+    # rt_original should already exist (created in main align() function)
+    if "rt_original" not in study_obj.features_df.columns:
+        raise ValueError("rt_original column missing - this should have been created by align() function")
+
+    sample_groups = study_obj.features_df.group_by("sample_uid", maintain_order=True)
+    sample_feature_data = sample_groups.agg([
+        pl.len().alias("feature_count"),
+        pl.col("mz").alias("mzs"),
+        pl.col("rt_original").alias("rt_originals")  # Use original RT values for alignment
+    ]).sort("feature_count", descending=True)
+
+    if sample_feature_data.is_empty():
+        study_obj.logger.error("No features found in any sample for alignment.")
+        raise ValueError("No features found in any sample for alignment.")
+
+    # Choose reference sample (sample with most features)
+    ref_sample_uid = sample_feature_data.row(0, named=True)["sample_uid"]
+
+    # Find the index of this sample in samples_df
+    ref_index = None
+    sample_uid_to_index = {}
+    for idx, row_dict in enumerate(study_obj.samples_df.iter_rows(named=True)):
+        sample_uid = row_dict["sample_uid"]
+        sample_uid_to_index[sample_uid] = idx
+        if sample_uid == ref_sample_uid:
+            ref_index = idx
+
+    if ref_index is None:
+        study_obj.logger.error(f"Could not find reference sample {ref_sample_uid} in samples_df")
+        raise ValueError(f"Could not find reference sample {ref_sample_uid} in samples_df")
 
-    # Choose reference map (largest number of features)
-    ref_index = max(range(len(fmaps)), key=lambda i: fmaps[i].size())
-    ref_map = fmaps[ref_index]
     study_obj.alignment_ref_index = ref_index
-    study_obj.logger.debug(
-        f"Reference map index {ref_index} (sample: {study_obj.samples_df.row(ref_index, named=True)['sample_name']}) size={ref_map.size()}",
-    )
-
-    # Extract and sort reference features by m/z for binary search
-    ref_features = [(f.getMZ(), f.getRT()) for f in ref_map]
+
+    # OPTIMIZATION 2: Get reference features efficiently from pre-grouped data
+    # Always use rt_original for alignment input to ensure consistent results
+    ref_row = sample_feature_data.filter(pl.col("sample_uid") == ref_sample_uid).row(0, named=True)
+    ref_mzs_list = ref_row["mzs"]
+    ref_rts_list = ref_row["rt_originals"]  # Use original RT values
+
+    # Create sorted reference features for binary search
+    ref_features = list(zip(ref_mzs_list, ref_rts_list))
     ref_features.sort(key=lambda x: x[0])
     ref_mzs = [mz for mz, _ in ref_features]
+
+    study_obj.logger.debug(
+        f"Reference sample UID {ref_sample_uid} (index {ref_index}, sample: {study_obj.samples_df.row(ref_index, named=True)['sample_name']}) has {len(ref_features)} features",
+    )
 
     def find_best_match(mz: float, rt: float):
         mz_tol_abs = mz * ppm_tol * 1e-6
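
Only the head of find_best_match appears in the hunk above; the function anchors each sample feature to the reference via binary search over the m/z-sorted reference list. A sketch of the matching idea, reusing the surrounding names (ppm_tol, rt_pair_tol, ref_mzs, ref_features) but not the verbatim package body:

```python
import bisect

def find_best_match_sketch(mz, rt, ref_mzs, ref_features, ppm_tol, rt_pair_tol):
    # Candidate reference features inside the ppm window around mz
    # (ref_mzs is sorted ascending; ref_features is the parallel list).
    mz_tol_abs = mz * ppm_tol * 1e-6
    lo = bisect.bisect_left(ref_mzs, mz - mz_tol_abs)
    hi = bisect.bisect_right(ref_mzs, mz + mz_tol_abs)
    best = None
    for _ref_mz, ref_rt in ref_features[lo:hi]:
        # Keep the candidate whose reference RT is closest to the observed RT.
        if abs(rt - ref_rt) <= rt_pair_tol and (
            best is None or abs(rt - ref_rt) < abs(rt - best[1])
        ):
            best = (rt, ref_rt)  # (observed RT, reference RT) anchor pair
    return best
```
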
@@ -971,45 +972,59 @@ def _align_kd_algorithm(study_obj, fmaps, params):
         except Exception:
             pass
 
-    transformations: list[oms.TransformationDescription] = []
+    # OPTIMIZATION 3: Process samples using pre-grouped data (eliminates expensive filtering)
+    transformations = {}
 
-    for i, fmap in enumerate(fmaps):
+    for row in sample_feature_data.iter_rows(named=True):
+        sample_uid = row["sample_uid"]
+        sample_mzs = row["mzs"]
+        sample_rts = row["rt_originals"]  # Use original RT values for alignment input
+
         td = oms.TransformationDescription()
-        if fmap.size() == 0:
-            transformations.append(td)
+        sample_index = sample_uid_to_index.get(sample_uid)
+
+        if sample_index is None:
+            study_obj.logger.warning(f"Sample UID {sample_uid} not found in samples_df, skipping")
+            continue
+
+        # Skip empty samples
+        if not sample_mzs or not sample_rts:
+            transformations[sample_uid] = td
             continue
-        # Identity for reference map
-        if i == ref_index:
-            rts = [f.getRT() for f in fmap]
+
+        # Identity for reference sample
+        if sample_uid == ref_sample_uid:
+            rts = [rt for rt in sample_rts if rt is not None]
             lo, hi = (min(rts), max(rts)) if rts else (0.0, 0.0)
             try:
                 _set_pairs(td, [(lo, lo), (hi, hi)])
                 td.fitModel("linear", oms.Param())
             except Exception:
                 pass
-            transformations.append(td)
+            transformations[sample_uid] = td
             continue
 
-        # Collect candidate pairs
+        # OPTIMIZATION 4: Process pairs using pre-loaded data arrays (no DataFrame operations)
         pairs_raw = []
-        for f in fmap:
-            match = find_best_match(f.getMZ(), f.getRT())
-            if match:
-                obs_rt, ref_rt = match
-                if abs(obs_rt - ref_rt) <= rt_pair_tol:
-                    pairs_raw.append((obs_rt, ref_rt))
+        for mz, rt in zip(sample_mzs, sample_rts):
+            if mz is not None and rt is not None:
+                match = find_best_match(mz, rt)
+                if match:
+                    obs_rt, ref_rt = match
+                    if abs(obs_rt - ref_rt) <= rt_pair_tol:
+                        pairs_raw.append((obs_rt, ref_rt))
 
         if not pairs_raw:
             # Fallback identity
-            rts = [f.getRT() for f in fmap]
+            rts = [rt for rt in sample_rts if rt is not None]
             lo, hi = (min(rts), max(rts)) if rts else (0.0, 0.0)
             try:
                 _set_pairs(td, [(lo, lo), (hi, hi)])
                 td.fitModel("linear", oms.Param())
             except Exception:
                 pass
-            transformations.append(td)
-            study_obj.logger.debug(f"Map {i}: no anchors -> identity transform")
+            transformations[sample_uid] = td
+            study_obj.logger.debug(f"Sample {sample_uid}: no anchors -> identity transform")
             continue
 
         # Deduplicate and downsample
@@ -1041,9 +1056,9 @@ def _align_kd_algorithm(study_obj, fmaps, params):
             td.fitModel(model, oms.Param())
         except Exception as e:
             study_obj.logger.debug(
-                f"Map {i}: {model} fitting failed ({e}); fallback to linear two-point shift",
+                f"Sample {sample_uid}: {model} fitting failed ({e}); fallback to linear two-point shift",
             )
-            rts = [f.getRT() for f in fmap]
+            rts = [rt for rt in sample_rts if rt is not None]
             lo, hi = (min(rts), max(rts)) if rts else (0.0, 1.0)
             td = oms.TransformationDescription()
             try:
@@ -1053,28 +1068,39 @@ def _align_kd_algorithm(study_obj, fmaps, params):
                pass
 
         study_obj.logger.debug(
-            f"Map {i}: anchors raw={len(pairs_raw)} used={len(pairs_use)} model={model} median_shift={med_shift:.4f}s",
+            f"Sample {sample_uid}: anchors raw={len(pairs_raw)} used={len(pairs_use)} model={model} median_shift={med_shift:.4f}s",
         )
-        transformations.append(td)
+        transformations[sample_uid] = td
 
-    # Apply transformations to feature maps; store original rt as meta value if absent
-    for i, (fmap, trafo) in enumerate(zip(fmaps, transformations)):
-        try:
-            for feat in fmap:
-                if not feat.metaValueExists("original_RT"):
-                    try:
-                        feat.setMetaValue("original_RT", float(feat.getRT()))
-                    except Exception:
-                        pass
-            oms.MapAlignmentTransformer().transformRetentionTimes(fmap, trafo, True)
-        except Exception as e:
-            study_obj.logger.warning(f"Map {i}: failed applying transformation ({e})")
+    # OPTIMIZATION 5: Apply transformations efficiently using vectorized operations
+    study_obj.logger.debug("Applying RT transformations efficiently...")
+
+    # Apply transformations to RT values starting from rt_original
+    def transform_rt_vectorized(sample_uid: int, rt_original: float) -> float:
+        if sample_uid in transformations and rt_original is not None:
+            try:
+                trafo = transformations[sample_uid]
+                return trafo.apply(float(rt_original))
+            except Exception:
+                return rt_original
+        return rt_original
+
+    # Use Polars' efficient struct operations for vectorized transformation
+    # Apply transformation to rt_original and store result in rt column
+    study_obj.features_df = study_obj.features_df.with_columns(
+        pl.struct(["sample_uid", "rt_original"]).map_elements(
+            lambda x: transform_rt_vectorized(x["sample_uid"], x["rt_original"]),
+            return_dtype=pl.Float64
+        ).alias("rt")
+    )
 
     study_obj.logger.info(
-        f"Alignment completed. Reference index {ref_index}.",
+        f"Alignment completed. Reference sample UID {ref_sample_uid} (index {ref_index}).",
     )
 
 
+
+
 def _align_pose_clustering_fallback(study_obj, fmaps, params):
     """Fallback PoseClustering alignment with minimal parameters."""
     import pyopenms as oms
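
The OPTIMIZATION 5 block above routes each row's rt_original through its sample's fitted transformation via a struct column. The same pl.struct + map_elements pattern in isolation, with toy callables standing in for the fitted oms.TransformationDescription objects:

```python
import polars as pl

# Toy per-sample transforms; the release keys fitted
# TransformationDescription objects by sample_uid instead.
transforms = {1: lambda rt: rt + 0.5, 2: lambda rt: rt * 1.01}

df = pl.DataFrame({"sample_uid": [1, 1, 2], "rt_original": [10.0, 20.0, 30.0]})
df = df.with_columns(
    pl.struct(["sample_uid", "rt_original"])
    .map_elements(
        lambda s: transforms.get(s["sample_uid"], lambda rt: rt)(s["rt_original"]),
        return_dtype=pl.Float64,
    )
    .alias("rt")
)
# df["rt"] is now [10.5, 20.5, 30.3]
```
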
@@ -1107,7 +1133,7 @@ def _align_pose_clustering_fallback(study_obj, fmaps, params):
     study_obj.alignment_ref_index = ref_index
 
 
-def find_iso(self, rt_tol=0.1, mz_tol=0.01):
+def find_iso(self, rt_tol=0.1, mz_tol=0.01, uids=None):
     """
     Find isotope patterns for consensus features by searching raw MS1 data.
     OPTIMIZED VERSION: Each sample file is loaded only once for maximum efficiency.
@@ -1123,6 +1149,7 @@ def find_iso(self, rt_tol=0.1, mz_tol=0.01):
     Parameters:
         rt_tol (float): RT tolerance for scan matching in seconds
         mz_tol (float): Additional m/z tolerance for isotope matching in Da
+        uids (list, optional): List of consensus_uid values to process. If None, process all consensus features.
     """
     if self.consensus_df is None or self.consensus_df.is_empty():
         self.logger.error("No consensus features found. Please run merge() first.")
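
With the new uids parameter, isotope extraction can be re-run for a subset of consensus features rather than the whole study. A usage sketch (the study object and UID values are illustrative):

```python
# Re-extract isotope patterns for two consensus features only.
study.find_iso(rt_tol=0.1, mz_tol=0.01, uids=[101, 102])

# A single UID is also accepted; it is wrapped into a list internally.
study.find_iso(uids=101)
```
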
@@ -1148,6 +1175,20 @@ def find_iso(self, rt_tol=0.1, mz_tol=0.01):
 
     self.logger.info("Extracting isotopomers from raw MS1 data...")
 
+    # Filter consensus features if uids is specified
+    if uids is not None:
+        if not isinstance(uids, (list, tuple)):
+            uids = [uids]
+        # Filter consensus_df to only process specified UIDs
+        consensus_df_filtered = self.consensus_df.filter(pl.col("consensus_uid").is_in(uids))
+        if consensus_df_filtered.is_empty():
+            self.logger.warning(f"No consensus features found with specified UIDs: {uids}")
+            return
+        self.logger.debug(f"Processing {len(consensus_df_filtered)} consensus features (UIDs: {uids})")
+    else:
+        consensus_df_filtered = self.consensus_df
+        self.logger.debug(f"Processing all {len(consensus_df_filtered)} consensus features")
+
     # Isotope mass shifts to search for (up to 7x 13C isotopes)
     isotope_shifts = [
         0.33,
@@ -1169,7 +1210,13 @@ def find_iso(self, rt_tol=0.1, mz_tol=0.01):
     self.logger.debug("Building sample-to-consensus mapping using vectorized operations...")
 
     # Step 1: Join consensus_mapping with features to get intensities in one operation
-    consensus_with_features = self.consensus_mapping_df.join(
+    # Apply UID filtering if specified
+    if uids is not None:
+        consensus_mapping_filtered = self.consensus_mapping_df.filter(pl.col("consensus_uid").is_in(uids))
+    else:
+        consensus_mapping_filtered = self.consensus_mapping_df
+
+    consensus_with_features = consensus_mapping_filtered.join(
         self.features_df.select(['feature_uid', 'sample_uid', 'inty', 'mz', 'rt', 'rt_original']),
         on=['feature_uid', 'sample_uid'],
         how='left'
@@ -1214,19 +1261,19 @@ def find_iso(self, rt_tol=0.1, mz_tol=0.01):
 
     # Initialize failed consensus features (those not in the mapping)
     processed_consensus_uids = set(best_features_with_paths['consensus_uid'].to_list())
-    for consensus_row in self.consensus_df.iter_rows(named=True):
+    for consensus_row in consensus_df_filtered.iter_rows(named=True):
         consensus_uid = consensus_row["consensus_uid"]
         if consensus_uid not in processed_consensus_uids:
             consensus_iso_data[consensus_uid] = None
 
-    self.logger.debug(f"Will read {len(sample_to_consensus)} unique sample files for {len(self.consensus_df)} consensus features")
+    self.logger.debug(f"Will read {len(sample_to_consensus)} unique sample files for {len(consensus_df_filtered)} consensus features")
 
     tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
 
     # OPTIMIZATION 2: Process by sample file (load each file only once)
     for sample_path, consensus_list in tqdm(
         sample_to_consensus.items(),
-        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Read files",
+        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Process from files",
         disable=tdqm_disable,
     ):
         try:
@@ -1245,6 +1292,16 @@ def find_iso(self, rt_tol=0.1, mz_tol=0.01):
             base_mz = best_feature["mz"]
             original_rt = best_feature.get("rt_original", best_feature["rt"])
 
+            # Skip if RT or mz is None or invalid
+            if original_rt is None:
+                original_rt = best_feature["rt"]
+                self.logger.debug(f"original_rt is None. Using aligned rt instead")
+
+            if base_mz is None:
+                self.logger.warning(f"Skipping consensus_uid {consensus_uid}: base_mz is None")
+                consensus_iso_data[consensus_uid] = None
+                continue
+
             # Find MS1 scans near the original RT
             rt_min = original_rt - rt_tol
             rt_max = original_rt + rt_tol