masster 0.5.22-py3-none-any.whl → 0.5.24-py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.

Potentially problematic release.


This version of masster might be problematic.

@@ -36,16 +36,16 @@ def align(self, **kwargs):
  """
  # parameters initialization
  params = align_defaults()
-
+
  # Handle 'params' keyword argument specifically (like merge does)
- if 'params' in kwargs:
- provided_params = kwargs.pop('params')
+ if "params" in kwargs:
+ provided_params = kwargs.pop("params")
  if isinstance(provided_params, align_defaults):
  params = provided_params
  self.logger.debug("Using provided align_defaults parameters from 'params' argument")
  else:
  self.logger.warning("'params' argument is not an align_defaults instance, ignoring")
-
+
  # Process remaining kwargs
  for key, value in kwargs.items():
  if isinstance(value, align_defaults):
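
For context on what this hunk reformats: align() seeds a defaults object, lets a 'params' keyword replace it wholesale, and then lets the remaining keyword arguments override individual fields. A minimal, self-contained sketch of that precedence order follows; AlignDefaults and resolve_align_params are hypothetical stand-ins for masster's align_defaults and the body of align(), not its actual API.

from dataclasses import dataclass, fields

@dataclass
class AlignDefaults:
    # hypothetical stand-in for masster's align_defaults container
    rt_tol: float = 2.0
    mz_max_diff: float = 0.02
    skip_blanks: bool = False

def resolve_align_params(**kwargs) -> AlignDefaults:
    params = AlignDefaults()                # 1) start from defaults
    provided = kwargs.pop("params", None)   # 2) a whole defaults object replaces them
    if isinstance(provided, AlignDefaults):
        params = provided
    valid = {f.name for f in fields(AlignDefaults)}
    for key, value in kwargs.items():       # 3) individual keyword overrides win last
        if key in valid:
            setattr(params, key, value)
    return params

print(resolve_align_params(params=AlignDefaults(rt_tol=5.0), mz_max_diff=0.01))
# AlignDefaults(rt_tol=5.0, mz_max_diff=0.01, skip_blanks=False)
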
@@ -68,7 +68,7 @@ def align(self, **kwargs):

  # Ensure rt_original exists before starting alignment (both algorithms need this)
  if "rt_original" not in self.features_df.columns:
- # add column 'rt_original' after 'rt'
+ # add column 'rt_original' after 'rt'
  rt_index = self.features_df.columns.get_loc("rt") + 1
  self.features_df.insert(rt_index, "rt_original", 0)
  self.features_df["rt_original"] = self.features_df["rt"]
@@ -174,9 +174,7 @@ def find_ms2(self, **kwargs):
  ]
  for row in feats.iter_rows(named=True):
  feature_uid = row["feature_uid"]
- feature_lookup[feature_uid] = {
- col: row[col] for col in relevant_cols if col in feats.columns
- }
+ feature_lookup[feature_uid] = {col: row[col] for col in relevant_cols if col in feats.columns}
  tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]

  # Process consensus mapping in batch
@@ -204,13 +202,9 @@ def find_ms2(self, **kwargs):
  "feature_uid": int(mapping_row["feature_uid"]),
  "sample_uid": int(mapping_row["sample_uid"]),
  "scan_id": int(scanid),
- "energy": round(spec.energy, 1)
- if hasattr(spec, "energy") and spec.energy is not None
- else None,
+ "energy": round(spec.energy, 1) if hasattr(spec, "energy") and spec.energy is not None else None,
  "prec_inty": round(inty, 0) if inty is not None else None,
- "prec_coherence": round(chrom_coherence, 3)
- if chrom_coherence is not None
- else None,
+ "prec_coherence": round(chrom_coherence, 3) if chrom_coherence is not None else None,
  "prec_prominence_scaled": round(chrom_prominence_scaled, 3)
  if chrom_prominence_scaled is not None
  else None,
@@ -250,10 +244,7 @@ def filter_consensus(
  else:
  if isinstance(coherence, tuple) and len(coherence) == 2:
  min_coherence, max_coherence = coherence
- cons = cons[
- (cons["chrom_coherence"] >= min_coherence)
- & (cons["chrom_coherence"] <= max_coherence)
- ]
+ cons = cons[(cons["chrom_coherence"] >= min_coherence) & (cons["chrom_coherence"] <= max_coherence)]
  else:
  cons = cons[cons["chrom_coherence"] >= coherence]
  after_coherence = len(cons)
@@ -264,9 +255,7 @@ def filter_consensus(
  if quality is not None:
  if isinstance(quality, tuple) and len(quality) == 2:
  min_quality, max_quality = quality
- cons = cons[
- (cons["quality"] >= min_quality) & (cons["quality"] <= max_quality)
- ]
+ cons = cons[(cons["quality"] >= min_quality) & (cons["quality"] <= max_quality)]
  else:
  cons = cons[cons["quality"] >= quality]
  after_quality = len(cons)
@@ -277,10 +266,7 @@ def filter_consensus(
  if number_samples is not None:
  if isinstance(number_samples, tuple) and len(number_samples) == 2:
  min_number, max_number = number_samples
- cons = cons[
- (cons["number_samples"] >= min_number)
- & (cons["number_samples"] <= max_number)
- ]
+ cons = cons[(cons["number_samples"] >= min_number) & (cons["number_samples"] <= max_number)]
  else:
  cons = cons[cons["number_samples"] >= number_samples]
  after_number_samples = len(cons)
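
The three filter_consensus hunks above all reflow the same convention: each threshold argument accepts either a scalar lower bound or a (min, max) tuple. A small illustration of that pattern, assuming a pandas-style DataFrame (the bracket-and-mask indexing in the hunks suggests pandas); the helper and data below are invented for the example.

import pandas as pd

def filter_by(df: pd.DataFrame, column: str, threshold) -> pd.DataFrame:
    # Keep rows where `column` >= a scalar threshold, or within an inclusive (min, max) range
    if isinstance(threshold, tuple) and len(threshold) == 2:
        lo, hi = threshold
        return df[(df[column] >= lo) & (df[column] <= hi)]
    return df[df[column] >= threshold]

cons = pd.DataFrame({"quality": [0.2, 0.6, 0.9], "number_samples": [1, 3, 5]})
print(filter_by(cons, "quality", 0.5))            # scalar: quality >= 0.5
print(filter_by(cons, "number_samples", (2, 4)))  # tuple: 2 <= number_samples <= 4
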
@@ -437,13 +423,9 @@ def _integrate_chrom_impl(self, **kwargs):
  if update_rows:
  # Create mapping from row index to new values
  row_to_chrom = {update_rows[i]: chroms[i] for i in range(len(update_rows))}
- row_to_rt_start = {
- update_rows[i]: rt_starts[i] for i in range(len(update_rows))
- }
+ row_to_rt_start = {update_rows[i]: rt_starts[i] for i in range(len(update_rows))}
  row_to_rt_end = {update_rows[i]: rt_ends[i] for i in range(len(update_rows))}
- row_to_rt_delta = {
- update_rows[i]: rt_deltas[i] for i in range(len(update_rows))
- }
+ row_to_rt_delta = {update_rows[i]: rt_deltas[i] for i in range(len(update_rows))}
  row_to_chrom_area = {
  update_rows[i]: float(chrom_areas[i]) if chrom_areas[i] is not None else 0.0
  for i in range(len(update_rows))
@@ -598,10 +580,10 @@ def _align_pose_clustering(study_obj, params):

  # Generate temporary feature maps on-demand from features_df for PoseClustering
  study_obj.logger.debug("Generating feature maps on-demand from features_df for PoseClustering alignment")
-
+
  tdqm_disable = study_obj.log_level not in ["TRACE", "DEBUG", "INFO"]
  fmaps = []
-
+
  # Process each sample in order with progress bar
  for sample_index, row_dict in tqdm(
  list(enumerate(study_obj.samples_df.iter_rows(named=True))),
@@ -611,17 +593,17 @@ def _align_pose_clustering(study_obj, params):
  ):
  sample_uid = row_dict["sample_uid"]
  sample_name = row_dict["sample_name"]
-
+
  # Get features for this sample from features_df
  sample_features = study_obj.features_df.filter(pl.col("sample_uid") == sample_uid)
-
+
  # Create new FeatureMap
  feature_map = oms.FeatureMap()
-
+
  # Convert DataFrame features to OpenMS Features
  for feature_row in sample_features.iter_rows(named=True):
  feature = oms.Feature()
-
+
  # Set properties from DataFrame (handle missing values gracefully)
  try:
  # Skip features with missing critical data
@@ -639,7 +621,7 @@ def _align_pose_clustering(study_obj, params):
  feature.setMZ(float(feature_row["mz"]))
  feature.setRT(float(feature_row["rt"]))
  feature.setIntensity(float(feature_row["inty"]))
-
+
  # Handle optional fields that might be None
  if feature_row.get("quality") is not None:
  feature.setOverallQuality(float(feature_row["quality"]))
@@ -651,9 +633,9 @@ def _align_pose_clustering(study_obj, params):
  except (ValueError, TypeError) as e:
  study_obj.logger.warning(f"Skipping feature due to conversion error: {e}")
  continue
-
+
  fmaps.append(feature_map)
-
+
  study_obj.logger.debug(f"Generated {len(fmaps)} feature maps from features_df for PoseClustering alignment")

  # Create PC-specific OpenMS parameters
@@ -684,10 +666,8 @@ def _align_pose_clustering(study_obj, params):
  )

  # Set ref_index to feature map index with largest number of features
- ref_index = [
- i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])
- ][-1]
-
+ ref_index = [i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])][-1]
+
  aligner.setParameters(params_oms)
  aligner.setReference(fmaps[ref_index])
  study_obj.logger.debug(f"Parameters for alignment: {params}")
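
The reflowed one-liner above picks, as the alignment reference, the index of the feature map with the most features. A quick sketch of what it computes, using a stub class with a size() method so the snippet runs without pyopenms; the max-based form at the end is an equivalent, arguably clearer, alternative.

class StubMap:
    # hypothetical stand-in exposing only the size() interface of a pyopenms FeatureMap
    def __init__(self, n: int):
        self._n = n
    def size(self) -> int:
        return self._n

fmaps = [StubMap(120), StubMap(340), StubMap(95)]

# sort (index, size) pairs by size and take the index of the last (largest) entry
ref_index = [i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])][-1]

# equivalent: pick the index whose map has the maximum size
ref_index_alt = max(range(len(fmaps)), key=lambda i: fmaps[i].size())
assert ref_index == ref_index_alt == 1
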
@@ -701,18 +681,17 @@ def _align_pose_clustering(study_obj, params):
  ):
  if index == ref_index:
  continue
- if (
- params.get("skip_blanks")
- and study_obj.samples_df.row(index, named=True)["sample_type"] == "blank"
- ):
+ if params.get("skip_blanks") and study_obj.samples_df.row(index, named=True)["sample_type"] == "blank":
  continue
-
+
  # Skip feature maps with insufficient data points for alignment
  if fm.size() < 2:
  sample_name = study_obj.samples_df.row(index, named=True)["sample_name"]
- study_obj.logger.warning(f"Skipping alignment for sample '{sample_name}' - insufficient features ({fm.size()} features)")
+ study_obj.logger.warning(
+ f"Skipping alignment for sample '{sample_name}' - insufficient features ({fm.size()} features)"
+ )
  continue
-
+
  try:
  trafo = oms.TransformationDescription()
  aligner.align(fm, trafo)
@@ -724,7 +703,7 @@ def _align_pose_clustering(study_obj, params):
  continue

  study_obj.alignment_ref_index = ref_index
-
+
  # Process feature maps and update features_df with transformed retention times
  # Build a fast lookup for (sample_uid, featureUid) to index in features_df
  feats = study_obj.features_df
@@ -732,8 +711,7 @@ def _align_pose_clustering(study_obj, params):
  # Pre-build sample_uid lookup for faster access
  study_obj.logger.debug("Build sample_uid lookup for fast access...")
  sample_uid_lookup = {
- idx: row_dict["sample_uid"]
- for idx, row_dict in enumerate(study_obj.samples_df.iter_rows(named=True))
+ idx: row_dict["sample_uid"] for idx, row_dict in enumerate(study_obj.samples_df.iter_rows(named=True))
  }

  # Build the main lookup using feature_uid (not feature_id)
@@ -833,7 +811,7 @@ def _align_pose_clustering(study_obj, params):
  # Clean up temporary feature maps to release memory
  del fmaps
  study_obj.logger.debug("Temporary feature maps deleted to release memory")
-
+
  # Resolve reference sample UID from the reference index
  ref_sample_uid = sample_uid_lookup.get(ref_index)
  study_obj.logger.success(
@@ -853,24 +831,15 @@ def _align_kd_algorithm(study_obj, params):

  # Pull parameter values - map standard align params to our algorithm
  # Use rt_tol (standard align param) instead of warp_rt_tol for RT tolerance
- rt_pair_tol = (
- float(params.get("rt_tol")) if params.get("rt_tol") is not None else 2.0
- )
+ rt_pair_tol = float(params.get("rt_tol")) if params.get("rt_tol") is not None else 2.0
  # Use mz_max_diff (standard align param) converted to ppm
- mz_max_diff_da = (
- float(params.get("mz_max_diff"))
- if params.get("mz_max_diff") is not None
- else 0.02
- )
+ mz_max_diff_da = float(params.get("mz_max_diff")) if params.get("mz_max_diff") is not None else 0.02
  # Convert Da to ppm (assuming ~400 m/z average for metabolomics): 0.01 Da / 400 * 1e6 = 25 ppm
  ppm_tol = mz_max_diff_da / 400.0 * 1e6
  # Allow override with warp_mz_tol if specifically set (but not from defaults)
  try:
  warp_mz_from_params = params.get("warp_mz_tol")
- if (
- warp_mz_from_params is not None
- and warp_mz_from_params != params.__class__().warp_mz_tol
- ):
+ if warp_mz_from_params is not None and warp_mz_from_params != params.__class__().warp_mz_tol:
  ppm_tol = float(warp_mz_from_params)
  except (KeyError, AttributeError):
  pass
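
The Da-to-ppm conversion in this hunk assumes an average m/z of ~400, as the inline comment notes (0.01 Da at 400 m/z is 25 ppm). A two-line check of that arithmetic with the hunk's 0.02 Da default:

mz_max_diff_da = 0.02                   # default Da tolerance used in the hunk
ppm_tol = mz_max_diff_da / 400.0 * 1e6  # assume ~400 m/z average, per the code comment
print(ppm_tol)                          # 50.0 ppm (0.01 Da would give 25.0 ppm)
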
@@ -888,29 +857,31 @@ def _align_kd_algorithm(study_obj, params):
  # Work directly with features_df instead of feature maps
  if study_obj.features_df is None or study_obj.features_df.is_empty():
  study_obj.logger.error("No features_df available for alignment. Cannot proceed with KD alignment.")
- raise ValueError("No features_df available for alignment. This usually indicates that features were not detected properly.")
-
+ raise ValueError(
+ "No features_df available for alignment. This usually indicates that features were not detected properly."
+ )
+
  # OPTIMIZATION 1: Group all features by sample_uid in ONE operation instead of filtering repeatedly
  study_obj.logger.debug("Grouping features efficiently (major speedup)...")
-
+
  # rt_original should already exist (created in main align() function)
  if "rt_original" not in study_obj.features_df.columns:
  raise ValueError("rt_original column missing - this should have been created by align() function")
-
+
  sample_groups = study_obj.features_df.group_by("sample_uid", maintain_order=True)
  sample_feature_data = sample_groups.agg([
  pl.len().alias("feature_count"),
  pl.col("mz").alias("mzs"),
- pl.col("rt_original").alias("rt_originals") # Use original RT values for alignment
+ pl.col("rt_original").alias("rt_originals"), # Use original RT values for alignment
  ]).sort("feature_count", descending=True)
-
+
  if sample_feature_data.is_empty():
  study_obj.logger.error("No features found in any sample for alignment.")
  raise ValueError("No features found in any sample for alignment.")
-
+
  # Choose reference sample (sample with most features)
  ref_sample_uid = sample_feature_data.row(0, named=True)["sample_uid"]
-
+
  # Find the index of this sample in samples_df
  ref_index = None
  sample_uid_to_index = {}
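
The group_by/agg step in this hunk collects each sample's m/z and original-RT values into list columns in a single pass, then sorts so the sample with the most features comes first and becomes the alignment reference. A minimal Polars sketch of that pattern on invented toy data, reusing the hunk's column names:

import polars as pl

features_df = pl.DataFrame({
    "sample_uid":  [1, 1, 1, 2, 2],
    "mz":          [100.1, 200.2, 300.3, 100.1, 200.2],
    "rt_original": [10.0, 20.0, 30.0, 10.5, 20.5],
})

sample_feature_data = (
    features_df.group_by("sample_uid", maintain_order=True)
    .agg([
        pl.len().alias("feature_count"),
        pl.col("mz").alias("mzs"),                    # per-sample list of m/z values
        pl.col("rt_original").alias("rt_originals"),  # per-sample list of original RTs
    ])
    .sort("feature_count", descending=True)
)
ref_sample_uid = sample_feature_data.row(0, named=True)["sample_uid"]
print(ref_sample_uid)  # 1, the sample with the most (3) features
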
@@ -919,24 +890,24 @@ def _align_kd_algorithm(study_obj, params):
  sample_uid_to_index[sample_uid] = idx
  if sample_uid == ref_sample_uid:
  ref_index = idx
-
+
  if ref_index is None:
  study_obj.logger.error(f"Could not find reference sample {ref_sample_uid} in samples_df")
  raise ValueError(f"Could not find reference sample {ref_sample_uid} in samples_df")
-
+
  study_obj.alignment_ref_index = ref_index
-
+
  # OPTIMIZATION 2: Get reference features efficiently from pre-grouped data
  # Always use rt_original for alignment input to ensure consistent results
  ref_row = sample_feature_data.filter(pl.col("sample_uid") == ref_sample_uid).row(0, named=True)
  ref_mzs_list = ref_row["mzs"]
  ref_rts_list = ref_row["rt_originals"] # Use original RT values
-
+
  # Create sorted reference features for binary search
  ref_features = list(zip(ref_mzs_list, ref_rts_list))
  ref_features.sort(key=lambda x: x[0])
  ref_mzs = [mz for mz, _ in ref_features]
-
+
  study_obj.logger.debug(
  f"Reference sample UID {ref_sample_uid} (index {ref_index}, sample: {study_obj.samples_df.row(ref_index, named=True)['sample_name']}) has {len(ref_features)} features",
  )
@@ -979,19 +950,19 @@ def _align_kd_algorithm(study_obj, params):
  sample_uid = row["sample_uid"]
  sample_mzs = row["mzs"]
  sample_rts = row["rt_originals"] # Use original RT values for alignment input
-
+
  td = oms.TransformationDescription()
  sample_index = sample_uid_to_index.get(sample_uid)
-
+
  if sample_index is None:
  study_obj.logger.warning(f"Sample UID {sample_uid} not found in samples_df, skipping")
  continue
-
+
  # Skip empty samples
  if not sample_mzs or not sample_rts:
  transformations[sample_uid] = td
  continue
-
+
  # Identity for reference sample
  if sample_uid == ref_sample_uid:
  rts = [rt for rt in sample_rts if rt is not None]
@@ -1074,7 +1045,7 @@ def _align_kd_algorithm(study_obj, params):

  # OPTIMIZATION 5: Apply transformations efficiently using vectorized operations
  study_obj.logger.debug("Applying RT transformations efficiently...")
-
+
  # Apply transformations to RT values starting from rt_original
  def transform_rt_vectorized(sample_uid: int, rt_original: float) -> float:
  if sample_uid in transformations and rt_original is not None:
@@ -1084,14 +1055,13 @@ def _align_kd_algorithm(study_obj, params):
  except Exception:
  return rt_original
  return rt_original
-
+
  # Use Polars' efficient struct operations for vectorized transformation
  # Apply transformation to rt_original and store result in rt column
  study_obj.features_df = study_obj.features_df.with_columns(
- pl.struct(["sample_uid", "rt_original"]).map_elements(
- lambda x: transform_rt_vectorized(x["sample_uid"], x["rt_original"]),
- return_dtype=pl.Float64
- ).alias("rt")
+ pl.struct(["sample_uid", "rt_original"])
+ .map_elements(lambda x: transform_rt_vectorized(x["sample_uid"], x["rt_original"]), return_dtype=pl.Float64)
+ .alias("rt")
  )

  study_obj.logger.success(
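
The expression reflowed here applies a per-sample RT correction row by row via pl.struct(...).map_elements(...). A compact sketch of the same idiom, with an invented per-sample shift table standing in for the fitted OpenMS transformations:

import polars as pl

shifts = {1: 0.0, 2: -0.3}  # hypothetical per-sample RT shifts (stand-in for the fitted transformations)

df = pl.DataFrame({
    "sample_uid":  [1, 2, 2],
    "rt_original": [10.0, 10.2, 20.4],
})

df = df.with_columns(
    pl.struct(["sample_uid", "rt_original"])
    .map_elements(
        lambda x: x["rt_original"] + shifts.get(x["sample_uid"], 0.0),
        return_dtype=pl.Float64,
    )
    .alias("rt")
)
print(df)  # rt column = rt_original shifted per sample (10.0, 9.9, 20.1)
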
@@ -1099,16 +1069,12 @@ def _align_kd_algorithm(study_obj, params):
  )


-
-
  def _align_pose_clustering_fallback(study_obj, fmaps, params):
  """Fallback PoseClustering alignment with minimal parameters."""
  import pyopenms as oms

  aligner = oms.MapAlignmentAlgorithmPoseClustering()
- ref_index = [
- i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])
- ][-1]
+ ref_index = [i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])][-1]

  # Set up basic parameters for pose clustering
  pc_params = oms.Param()
@@ -1137,15 +1103,15 @@ def find_iso(self, rt_tol=0.1, mz_tol=0.01, uids=None):
  """
  Find isotope patterns for consensus features by searching raw MS1 data.
  OPTIMIZED VERSION: Each sample file is loaded only once for maximum efficiency.
-
+
  For each consensus feature:
- 1. Find the associated feature with highest intensity
+ 1. Find the associated feature with highest intensity
  2. Load the corresponding sample5 file to access raw MS1 data
  3. Use original_rt (before alignment) to find the correct scan
  4. Search for isotope patterns in raw MS1 spectra
  5. Look for isotope patterns: 0.33, 0.50, 0.66, 1.00, 1.50, 2.00, 3.00, 4.00, 5.00 Da
  6. Store results as numpy arrays with [mz, inty] in the iso column
-
+
  Parameters:
  rt_tol (float): RT tolerance for scan matching in seconds
  mz_tol (float): Additional m/z tolerance for isotope matching in Da
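
The docstring above outlines find_iso(); the core of its steps 4-6 (visible in the later hunks of this function) is a per-shift m/z window lookup in the RT-matched MS1 table that keeps the most intense peak per shift. A toy Polars sketch of that lookup; the column names mirror the diff, while the spectrum values and the two shifts used here are invented for illustration:

import numpy as np
import polars as pl

ms1_window = pl.DataFrame({           # MS1 peaks already restricted to the RT window
    "mz":   [200.000, 201.003, 202.006],
    "inty": [1000.0, 270.0, 40.0],
})
base_mz, mz_tol = 200.000, 0.01
isotope_shifts = [1.00335, 2.00671]   # e.g. one and two 13C substitutions

isotope_matches = []
for shift in isotope_shifts:
    target = base_mz + shift
    peaks = ms1_window.filter((pl.col("mz") >= target - mz_tol) & (pl.col("mz") <= target + mz_tol))
    if not peaks.is_empty():
        best = peaks.filter(pl.col("inty") == pl.col("inty").max()).row(0, named=True)
        isotope_matches.append([round(best["mz"], 4), float(round(best["inty"]))])

iso = np.array(isotope_matches) if isotope_matches else None
print(iso)  # [[201.003, 270.0], [202.006, 40.0]]
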
@@ -1154,27 +1120,25 @@ def find_iso(self, rt_tol=0.1, mz_tol=0.01, uids=None):
  if self.consensus_df is None or self.consensus_df.is_empty():
  self.logger.error("No consensus features found. Please run merge() first.")
  return
-
+
  if self.consensus_mapping_df is None or self.consensus_mapping_df.is_empty():
  self.logger.error("No consensus mapping found. Please run merge() first.")
  return
-
+
  if self.features_df is None or self.features_df.is_empty():
  self.logger.error("No features found.")
  return
-
+
  if self.samples_df is None or self.samples_df.is_empty():
  self.logger.error("No samples found.")
  return
-
+
  # Add iso column if it doesn't exist
  if "iso" not in self.consensus_df.columns:
- self.consensus_df = self.consensus_df.with_columns(
- pl.lit(None, dtype=pl.Object).alias("iso")
- )
-
+ self.consensus_df = self.consensus_df.with_columns(pl.lit(None, dtype=pl.Object).alias("iso"))
+
  self.logger.info("Extracting isotopomers from raw MS1 data...")
-
+
  # Filter consensus features if uids is specified
  if uids is not None:
  if not isinstance(uids, (list, tuple)):
@@ -1188,7 +1152,7 @@ def find_iso(self, rt_tol=0.1, mz_tol=0.01, uids=None):
  else:
  consensus_df_filtered = self.consensus_df
  self.logger.debug(f"Processing all {len(consensus_df_filtered)} consensus features")
-
+
  # Isotope mass shifts to search for (up to 7x 13C isotopes)
  isotope_shifts = [
  0.33,
@@ -1203,73 +1167,73 @@ def find_iso(self, rt_tol=0.1, mz_tol=0.01, uids=None):
  6.02010,
  7.02345,
  ]
-
+
  consensus_iso_data = {}
-
+
  # SUPER OPTIMIZATION: Vectorized pre-calculation using joins (10-100x faster)
  self.logger.debug("Building sample-to-consensus mapping using vectorized operations...")
-
+
  # Step 1: Join consensus_mapping with features to get intensities in one operation
  # Apply UID filtering if specified
  if uids is not None:
  consensus_mapping_filtered = self.consensus_mapping_df.filter(pl.col("consensus_uid").is_in(uids))
  else:
  consensus_mapping_filtered = self.consensus_mapping_df
-
+
  consensus_with_features = consensus_mapping_filtered.join(
- self.features_df.select(['feature_uid', 'sample_uid', 'inty', 'mz', 'rt', 'rt_original']),
- on=['feature_uid', 'sample_uid'],
- how='left'
+ self.features_df.select(["feature_uid", "sample_uid", "inty", "mz", "rt", "rt_original"]),
+ on=["feature_uid", "sample_uid"],
+ how="left",
  )
-
+
  # Step 2: Find the best feature (highest intensity) for each consensus using window functions
- best_features = consensus_with_features.with_columns(
- pl.col('inty').fill_null(0) # Handle null intensities
- ).with_columns(
- pl.col('inty').max().over('consensus_uid').alias('max_inty')
- ).filter(
- pl.col('inty') == pl.col('max_inty')
- ).group_by('consensus_uid').first() # Take first if there are ties
-
+ best_features = (
+ consensus_with_features.with_columns(
+ pl.col("inty").fill_null(0) # Handle null intensities
+ )
+ .with_columns(pl.col("inty").max().over("consensus_uid").alias("max_inty"))
+ .filter(pl.col("inty") == pl.col("max_inty"))
+ .group_by("consensus_uid")
+ .first()
+ ) # Take first if there are ties
+
  # Step 3: Join with samples to get sample paths in one operation
  best_features_with_paths = best_features.join(
- self.samples_df.select(['sample_uid', 'sample_path']),
- on='sample_uid',
- how='left'
- ).filter(
- pl.col('sample_path').is_not_null()
- )
-
+ self.samples_df.select(["sample_uid", "sample_path"]), on="sample_uid", how="left"
+ ).filter(pl.col("sample_path").is_not_null())
+
  # Step 4: Group by sample path for batch processing (much faster than nested loops)
  sample_to_consensus = {}
  for row in best_features_with_paths.iter_rows(named=True):
- sample_path = row['sample_path']
- consensus_uid = row['consensus_uid']
-
+ sample_path = row["sample_path"]
+ consensus_uid = row["consensus_uid"]
+
  # Create feature data dictionary for compatibility
  feature_data = {
- 'mz': row['mz'],
- 'rt': row['rt'],
- 'rt_original': row.get('rt_original', row['rt']),
- 'inty': row['inty']
+ "mz": row["mz"],
+ "rt": row["rt"],
+ "rt_original": row.get("rt_original", row["rt"]),
+ "inty": row["inty"],
  }
-
+
  if sample_path not in sample_to_consensus:
  sample_to_consensus[sample_path] = []
-
+
  sample_to_consensus[sample_path].append((consensus_uid, feature_data))
-
+
  # Initialize failed consensus features (those not in the mapping)
- processed_consensus_uids = set(best_features_with_paths['consensus_uid'].to_list())
+ processed_consensus_uids = set(best_features_with_paths["consensus_uid"].to_list())
  for consensus_row in consensus_df_filtered.iter_rows(named=True):
  consensus_uid = consensus_row["consensus_uid"]
  if consensus_uid not in processed_consensus_uids:
  consensus_iso_data[consensus_uid] = None
-
- self.logger.debug(f"Will read {len(sample_to_consensus)} unique sample files for {len(consensus_df_filtered)} consensus features")
-
+
+ self.logger.debug(
+ f"Will read {len(sample_to_consensus)} unique sample files for {len(consensus_df_filtered)} consensus features"
+ )
+
  tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-
+
  # OPTIMIZATION 2: Process by sample file (load each file only once)
  for sample_path, consensus_list in tqdm(
  sample_to_consensus.items(),
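
Step 2 of the vectorized pre-calculation in this hunk selects, for every consensus feature, the associated feature row with the highest intensity by computing a window max over consensus_uid. A small Polars sketch of that max-over-group idiom on invented data:

import polars as pl

consensus_with_features = pl.DataFrame({
    "consensus_uid": [1, 1, 2, 2],
    "feature_uid":   [10, 11, 20, 21],
    "inty":          [500.0, None, 120.0, 340.0],
})

best_features = (
    consensus_with_features.with_columns(pl.col("inty").fill_null(0))
    .with_columns(pl.col("inty").max().over("consensus_uid").alias("max_inty"))
    .filter(pl.col("inty") == pl.col("max_inty"))
    .group_by("consensus_uid")
    .first()  # break ties by keeping the first matching row
)
print(best_features.sort("consensus_uid"))  # feature 10 wins for consensus 1, feature 21 for consensus 2
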
@@ -1279,126 +1243,115 @@ def find_iso(self, rt_tol=0.1, mz_tol=0.01, uids=None):
  try:
  # Load MS1 data once per sample
  ms1_df = self._load_ms1(sample_path)
-
+
  if ms1_df is None or ms1_df.is_empty():
  # Mark all consensus features from this sample as failed
  for consensus_uid, _ in consensus_list:
  consensus_iso_data[consensus_uid] = None
  continue
-
+
  # Process all consensus features for this sample
  for consensus_uid, best_feature in consensus_list:
  # Get the original RT (before alignment correction)
  base_mz = best_feature["mz"]
  original_rt = best_feature.get("rt_original", best_feature["rt"])
-
+
  # Skip if RT or mz is None or invalid
  if original_rt is None:
- original_rt = best_feature["rt"]
+ original_rt = best_feature["rt"]
  self.logger.debug(f"original_rt is None. Using aligned rt instead")
-
+
  if base_mz is None:
  self.logger.warning(f"Skipping consensus_uid {consensus_uid}: base_mz is None")
  consensus_iso_data[consensus_uid] = None
  continue
-
+
  # Find MS1 scans near the original RT
  rt_min = original_rt - rt_tol
  rt_max = original_rt + rt_tol
-
+
  # Filter MS1 data for scans within RT window
- ms1_window = ms1_df.filter(
- (pl.col("rt") >= rt_min) & (pl.col("rt") <= rt_max)
- )
-
+ ms1_window = ms1_df.filter((pl.col("rt") >= rt_min) & (pl.col("rt") <= rt_max))
+
  if ms1_window.is_empty():
  consensus_iso_data[consensus_uid] = None
  continue
-
+
  isotope_matches = []
-
+
  # Search for each isotope shift
  for shift in isotope_shifts:
  target_mz = base_mz + shift
  mz_min_iso = target_mz - mz_tol
  mz_max_iso = target_mz + mz_tol
-
+
  # Find peaks in MS1 data within m/z tolerance
- isotope_peaks = ms1_window.filter(
- (pl.col("mz") >= mz_min_iso) & (pl.col("mz") <= mz_max_iso)
- )
-
+ isotope_peaks = ms1_window.filter((pl.col("mz") >= mz_min_iso) & (pl.col("mz") <= mz_max_iso))
+
  if not isotope_peaks.is_empty():
  # Get the peak with maximum intensity for this isotope
- max_peak = isotope_peaks.filter(
- pl.col("inty") == pl.col("inty").max()
- ).row(0, named=True)
-
+ max_peak = isotope_peaks.filter(pl.col("inty") == pl.col("inty").max()).row(0, named=True)
+
  # Store as float with specific precision: m/z to 4 decimals, intensity rounded to integer
  mz_formatted = round(float(max_peak["mz"]), 4)
  inty_formatted = float(round(max_peak["inty"])) # Round to integer, but keep as float
  isotope_matches.append([mz_formatted, inty_formatted])
-
+
  # Store results as numpy array
  if isotope_matches:
  consensus_iso_data[consensus_uid] = np.array(isotope_matches)
  else:
  consensus_iso_data[consensus_uid] = None
-
+
  except Exception as e:
  self.logger.warning(f"Failed to load MS1 data from {sample_path}: {e}")
  # Mark all consensus features from this sample as failed
  for consensus_uid, _ in consensus_list:
  consensus_iso_data[consensus_uid] = None
  continue
-
+
  # Update consensus_df with isotope data
  # Create mapping function for update
  def get_iso_data(uid):
  return consensus_iso_data.get(uid, None)
-
+
  # Update the iso column
  self.consensus_df = self.consensus_df.with_columns(
- pl.col("consensus_uid").map_elements(
- lambda uid: get_iso_data(uid),
- return_dtype=pl.Object
- ).alias("iso")
+ pl.col("consensus_uid").map_elements(lambda uid: get_iso_data(uid), return_dtype=pl.Object).alias("iso")
  )
-
+
  # Count how many consensus features have isotope data
  iso_count = sum(1 for data in consensus_iso_data.values() if data is not None and len(data) > 0)
-
- self.logger.success(f"Isotope detection completed. Found isotope patterns for {iso_count}/{len(self.consensus_df)} consensus features.")
+
+ self.logger.success(
+ f"Isotope detection completed. Found isotope patterns for {iso_count}/{len(self.consensus_df)} consensus features."
+ )


  def reset_iso(self):
  """
  Reset the iso column in consensus_df to None, clearing all isotope data.
-
+
  This function clears any previously computed isotope patterns from the
  consensus_df, setting the 'iso' column to None for all features. This
  is useful before re-running isotope detection with different parameters
  or to clear isotope data entirely.
-
+
  Returns:
  None
  """
  if self.consensus_df is None:
  self.logger.warning("No consensus_df found. Nothing to reset.")
  return
-
+
  if "iso" not in self.consensus_df.columns:
  self.logger.warning("No 'iso' column found in consensus_df. Nothing to reset.")
  return
-
+
  # Count how many features currently have isotope data
- iso_count = self.consensus_df.select(
- pl.col("iso").is_not_null().sum().alias("count")
- ).item(0, "count")
-
+ iso_count = self.consensus_df.select(pl.col("iso").is_not_null().sum().alias("count")).item(0, "count")
+
  # Reset the iso column to None
- self.consensus_df = self.consensus_df.with_columns(
- pl.lit(None, dtype=pl.Object).alias("iso")
- )
-
+ self.consensus_df = self.consensus_df.with_columns(pl.lit(None, dtype=pl.Object).alias("iso"))
+
  self.logger.info(f"Reset isotope data for {iso_count} features. All 'iso' values set to None.")