masster 0.5.22__py3-none-any.whl → 0.5.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster might be problematic.
- masster/_version.py +1 -1
- masster/logger.py +35 -19
- masster/sample/adducts.py +15 -29
- masster/sample/defaults/find_adducts_def.py +1 -3
- masster/sample/defaults/sample_def.py +4 -4
- masster/sample/h5.py +203 -361
- masster/sample/helpers.py +14 -30
- masster/sample/lib.py +3 -3
- masster/sample/load.py +21 -29
- masster/sample/plot.py +222 -132
- masster/sample/processing.py +42 -55
- masster/sample/sample.py +37 -46
- masster/sample/save.py +37 -61
- masster/sample/sciex.py +13 -11
- masster/sample/thermo.py +69 -74
- masster/spectrum.py +15 -15
- masster/study/analysis.py +650 -586
- masster/study/defaults/identify_def.py +1 -3
- masster/study/defaults/merge_def.py +6 -7
- masster/study/defaults/study_def.py +1 -5
- masster/study/export.py +35 -96
- masster/study/h5.py +134 -211
- masster/study/helpers.py +385 -459
- masster/study/id.py +239 -290
- masster/study/importers.py +84 -93
- masster/study/load.py +159 -178
- masster/study/merge.py +1112 -1098
- masster/study/plot.py +195 -149
- masster/study/processing.py +144 -191
- masster/study/save.py +14 -13
- masster/study/study.py +89 -130
- masster/wizard/wizard.py +764 -714
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/METADATA +27 -1
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/RECORD +37 -37
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/WHEEL +0 -0
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/entry_points.txt +0 -0
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/licenses/LICENSE +0 -0
masster/study/processing.py
CHANGED
@@ -36,16 +36,16 @@ def align(self, **kwargs):
     """
     # parameters initialization
     params = align_defaults()
-
+
     # Handle 'params' keyword argument specifically (like merge does)
-    if
-        provided_params = kwargs.pop(
+    if "params" in kwargs:
+        provided_params = kwargs.pop("params")
         if isinstance(provided_params, align_defaults):
             params = provided_params
             self.logger.debug("Using provided align_defaults parameters from 'params' argument")
         else:
             self.logger.warning("'params' argument is not an align_defaults instance, ignoring")
-
+
     # Process remaining kwargs
     for key, value in kwargs.items():
         if isinstance(value, align_defaults):
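The reworked block above follows a common pattern: accept either a prepared defaults object through a `params` keyword or plain keyword overrides. A minimal sketch of that pattern, using a hypothetical `AlignDefaults` dataclass as a stand-in for masster's own `align_defaults` class:

```python
from dataclasses import dataclass


@dataclass
class AlignDefaults:
    # Hypothetical stand-in for masster's align_defaults.
    rt_tol: float = 2.0
    mz_max_diff: float = 0.02


def align(**kwargs):
    params = AlignDefaults()
    # A whole defaults object can be passed under the 'params' key ...
    if "params" in kwargs:
        provided = kwargs.pop("params")
        if isinstance(provided, AlignDefaults):
            params = provided
    # ... and any remaining keywords override individual fields.
    for key, value in kwargs.items():
        if hasattr(params, key):
            setattr(params, key, value)
    return params


print(align(params=AlignDefaults(rt_tol=5.0), mz_max_diff=0.01))
```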
@@ -68,7 +68,7 @@ def align(self, **kwargs):
 
     # Ensure rt_original exists before starting alignment (both algorithms need this)
     if "rt_original" not in self.features_df.columns:
-        # add column 'rt_original' after 'rt'
+        # add column 'rt_original' after 'rt'
         rt_index = self.features_df.columns.get_loc("rt") + 1
         self.features_df.insert(rt_index, "rt_original", 0)
         self.features_df["rt_original"] = self.features_df["rt"]
@@ -174,9 +174,7 @@ def find_ms2(self, **kwargs):
     ]
     for row in feats.iter_rows(named=True):
         feature_uid = row["feature_uid"]
-        feature_lookup[feature_uid] = {
-            col: row[col] for col in relevant_cols if col in feats.columns
-        }
+        feature_lookup[feature_uid] = {col: row[col] for col in relevant_cols if col in feats.columns}
     tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
 
     # Process consensus mapping in batch
@@ -204,13 +202,9 @@ def find_ms2(self, **kwargs):
     "feature_uid": int(mapping_row["feature_uid"]),
     "sample_uid": int(mapping_row["sample_uid"]),
     "scan_id": int(scanid),
-    "energy": round(spec.energy, 1)
-    if hasattr(spec, "energy") and spec.energy is not None
-    else None,
+    "energy": round(spec.energy, 1) if hasattr(spec, "energy") and spec.energy is not None else None,
     "prec_inty": round(inty, 0) if inty is not None else None,
-    "prec_coherence": round(chrom_coherence, 3)
-    if chrom_coherence is not None
-    else None,
+    "prec_coherence": round(chrom_coherence, 3) if chrom_coherence is not None else None,
     "prec_prominence_scaled": round(chrom_prominence_scaled, 3)
     if chrom_prominence_scaled is not None
     else None,
@@ -250,10 +244,7 @@ def filter_consensus(
     else:
         if isinstance(coherence, tuple) and len(coherence) == 2:
             min_coherence, max_coherence = coherence
-            cons = cons[
-                (cons["chrom_coherence"] >= min_coherence)
-                & (cons["chrom_coherence"] <= max_coherence)
-            ]
+            cons = cons[(cons["chrom_coherence"] >= min_coherence) & (cons["chrom_coherence"] <= max_coherence)]
         else:
             cons = cons[cons["chrom_coherence"] >= coherence]
     after_coherence = len(cons)
@@ -264,9 +255,7 @@ def filter_consensus(
     if quality is not None:
         if isinstance(quality, tuple) and len(quality) == 2:
             min_quality, max_quality = quality
-            cons = cons[
-                (cons["quality"] >= min_quality) & (cons["quality"] <= max_quality)
-            ]
+            cons = cons[(cons["quality"] >= min_quality) & (cons["quality"] <= max_quality)]
         else:
             cons = cons[cons["quality"] >= quality]
     after_quality = len(cons)
@@ -277,10 +266,7 @@ def filter_consensus(
     if number_samples is not None:
         if isinstance(number_samples, tuple) and len(number_samples) == 2:
             min_number, max_number = number_samples
-            cons = cons[
-                (cons["number_samples"] >= min_number)
-                & (cons["number_samples"] <= max_number)
-            ]
+            cons = cons[(cons["number_samples"] >= min_number) & (cons["number_samples"] <= max_number)]
         else:
             cons = cons[cons["number_samples"] >= number_samples]
     after_number_samples = len(cons)
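All three thresholds in filter_consensus follow the same scalar-or-(min, max) convention. A small standalone sketch of that convention as a helper (pandas assumed, since the hunks index `cons[...]` with boolean masks; the helper name is illustrative, not masster API):

```python
import pandas as pd


def filter_range(df: pd.DataFrame, column: str, threshold) -> pd.DataFrame:
    """Keep rows where column >= a scalar threshold, or within a (min, max) tuple."""
    if isinstance(threshold, tuple) and len(threshold) == 2:
        lo, hi = threshold
        return df[(df[column] >= lo) & (df[column] <= hi)]
    return df[df[column] >= threshold]


cons = pd.DataFrame({"quality": [0.2, 0.6, 0.9], "number_samples": [1, 3, 7]})
print(filter_range(cons, "quality", 0.5))            # scalar: lower bound only
print(filter_range(cons, "number_samples", (2, 5)))  # tuple: closed range
```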
@@ -437,13 +423,9 @@ def _integrate_chrom_impl(self, **kwargs):
     if update_rows:
         # Create mapping from row index to new values
         row_to_chrom = {update_rows[i]: chroms[i] for i in range(len(update_rows))}
-        row_to_rt_start = {
-            update_rows[i]: rt_starts[i] for i in range(len(update_rows))
-        }
+        row_to_rt_start = {update_rows[i]: rt_starts[i] for i in range(len(update_rows))}
         row_to_rt_end = {update_rows[i]: rt_ends[i] for i in range(len(update_rows))}
-        row_to_rt_delta = {
-            update_rows[i]: rt_deltas[i] for i in range(len(update_rows))
-        }
+        row_to_rt_delta = {update_rows[i]: rt_deltas[i] for i in range(len(update_rows))}
         row_to_chrom_area = {
             update_rows[i]: float(chrom_areas[i]) if chrom_areas[i] is not None else 0.0
             for i in range(len(update_rows))
@@ -598,10 +580,10 @@ def _align_pose_clustering(study_obj, params):
 
     # Generate temporary feature maps on-demand from features_df for PoseClustering
     study_obj.logger.debug("Generating feature maps on-demand from features_df for PoseClustering alignment")
-
+
     tdqm_disable = study_obj.log_level not in ["TRACE", "DEBUG", "INFO"]
     fmaps = []
-
+
     # Process each sample in order with progress bar
     for sample_index, row_dict in tqdm(
         list(enumerate(study_obj.samples_df.iter_rows(named=True))),
@@ -611,17 +593,17 @@ def _align_pose_clustering(study_obj, params):
     ):
         sample_uid = row_dict["sample_uid"]
         sample_name = row_dict["sample_name"]
-
+
         # Get features for this sample from features_df
         sample_features = study_obj.features_df.filter(pl.col("sample_uid") == sample_uid)
-
+
         # Create new FeatureMap
         feature_map = oms.FeatureMap()
-
+
         # Convert DataFrame features to OpenMS Features
         for feature_row in sample_features.iter_rows(named=True):
             feature = oms.Feature()
-
+
             # Set properties from DataFrame (handle missing values gracefully)
             try:
                 # Skip features with missing critical data
@@ -639,7 +621,7 @@ def _align_pose_clustering(study_obj, params):
                 feature.setMZ(float(feature_row["mz"]))
                 feature.setRT(float(feature_row["rt"]))
                 feature.setIntensity(float(feature_row["inty"]))
-
+
                 # Handle optional fields that might be None
                 if feature_row.get("quality") is not None:
                     feature.setOverallQuality(float(feature_row["quality"]))
@@ -651,9 +633,9 @@ def _align_pose_clustering(study_obj, params):
             except (ValueError, TypeError) as e:
                 study_obj.logger.warning(f"Skipping feature due to conversion error: {e}")
                 continue
-
+
         fmaps.append(feature_map)
-
+
     study_obj.logger.debug(f"Generated {len(fmaps)} feature maps from features_df for PoseClustering alignment")
 
     # Create PC-specific OpenMS parameters
@@ -684,10 +666,8 @@ def _align_pose_clustering(study_obj, params):
     )
 
     # Set ref_index to feature map index with largest number of features
-    ref_index = [
-        i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])
-    ][-1]
-
+    ref_index = [i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])][-1]
+
     aligner.setParameters(params_oms)
     aligner.setReference(fmaps[ref_index])
     study_obj.logger.debug(f"Parameters for alignment: {params}")
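The collapsed one-liner above picks the index of the feature map with the most features by sorting all sizes and taking the last index. A sketch of the same selection (not the code the package uses) alongside the arguably clearer `max(..., key=...)` form:

```python
# Sizes of hypothetical feature maps; fm.size() supplies these in the real code.
sizes = [120, 450, 87, 450]

# Sort-based form, as in the diff: index of the last element after sorting by size.
ref_index_sorted = [i for i, _ in sorted(enumerate(sizes), key=lambda x: x[1])][-1]

# Equivalent max-based form (ties may resolve to a different index than the sort-based form).
ref_index_max = max(range(len(sizes)), key=lambda i: sizes[i])

print(ref_index_sorted, ref_index_max)
```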
@@ -701,18 +681,17 @@ def _align_pose_clustering(study_obj, params):
     ):
         if index == ref_index:
             continue
-        if (
-            params.get("skip_blanks")
-            and study_obj.samples_df.row(index, named=True)["sample_type"] == "blank"
-        ):
+        if params.get("skip_blanks") and study_obj.samples_df.row(index, named=True)["sample_type"] == "blank":
             continue
-
+
         # Skip feature maps with insufficient data points for alignment
         if fm.size() < 2:
             sample_name = study_obj.samples_df.row(index, named=True)["sample_name"]
-            study_obj.logger.warning(
+            study_obj.logger.warning(
+                f"Skipping alignment for sample '{sample_name}' - insufficient features ({fm.size()} features)"
+            )
             continue
-
+
         try:
             trafo = oms.TransformationDescription()
             aligner.align(fm, trafo)
@@ -724,7 +703,7 @@ def _align_pose_clustering(study_obj, params):
             continue
 
     study_obj.alignment_ref_index = ref_index
-
+
     # Process feature maps and update features_df with transformed retention times
     # Build a fast lookup for (sample_uid, featureUid) to index in features_df
     feats = study_obj.features_df
@@ -732,8 +711,7 @@ def _align_pose_clustering(study_obj, params):
     # Pre-build sample_uid lookup for faster access
     study_obj.logger.debug("Build sample_uid lookup for fast access...")
     sample_uid_lookup = {
-        idx: row_dict["sample_uid"]
-        for idx, row_dict in enumerate(study_obj.samples_df.iter_rows(named=True))
+        idx: row_dict["sample_uid"] for idx, row_dict in enumerate(study_obj.samples_df.iter_rows(named=True))
     }
 
     # Build the main lookup using feature_uid (not feature_id)
@@ -833,7 +811,7 @@ def _align_pose_clustering(study_obj, params):
     # Clean up temporary feature maps to release memory
     del fmaps
     study_obj.logger.debug("Temporary feature maps deleted to release memory")
-
+
     # Resolve reference sample UID from the reference index
     ref_sample_uid = sample_uid_lookup.get(ref_index)
     study_obj.logger.success(
@@ -853,24 +831,15 @@ def _align_kd_algorithm(study_obj, params):
 
     # Pull parameter values - map standard align params to our algorithm
     # Use rt_tol (standard align param) instead of warp_rt_tol for RT tolerance
-    rt_pair_tol = (
-        float(params.get("rt_tol")) if params.get("rt_tol") is not None else 2.0
-    )
+    rt_pair_tol = float(params.get("rt_tol")) if params.get("rt_tol") is not None else 2.0
     # Use mz_max_diff (standard align param) converted to ppm
-    mz_max_diff_da = (
-        float(params.get("mz_max_diff"))
-        if params.get("mz_max_diff") is not None
-        else 0.02
-    )
+    mz_max_diff_da = float(params.get("mz_max_diff")) if params.get("mz_max_diff") is not None else 0.02
     # Convert Da to ppm (assuming ~400 m/z average for metabolomics): 0.01 Da / 400 * 1e6 = 25 ppm
     ppm_tol = mz_max_diff_da / 400.0 * 1e6
     # Allow override with warp_mz_tol if specifically set (but not from defaults)
     try:
         warp_mz_from_params = params.get("warp_mz_tol")
-        if (
-            warp_mz_from_params is not None
-            and warp_mz_from_params != params.__class__().warp_mz_tol
-        ):
+        if warp_mz_from_params is not None and warp_mz_from_params != params.__class__().warp_mz_tol:
             ppm_tol = float(warp_mz_from_params)
     except (KeyError, AttributeError):
         pass
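The Da-to-ppm conversion in this hunk assumes a representative m/z of about 400. As a worked check, 0.01 Da at m/z 400 is 0.01 / 400 × 1e6 = 25 ppm (matching the comment), and the default mz_max_diff of 0.02 Da gives 50 ppm. A small sketch of the same conversion as a function:

```python
def da_to_ppm(mz_tol_da: float, reference_mz: float = 400.0) -> float:
    """Convert an absolute m/z tolerance in Da to ppm at a reference m/z."""
    return mz_tol_da / reference_mz * 1e6


print(da_to_ppm(0.01))  # 25.0 ppm, as in the comment in the diff
print(da_to_ppm(0.02))  # 50.0 ppm, the default mz_max_diff of 0.02 Da
```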
@@ -888,29 +857,31 @@ def _align_kd_algorithm(study_obj, params):
     # Work directly with features_df instead of feature maps
     if study_obj.features_df is None or study_obj.features_df.is_empty():
         study_obj.logger.error("No features_df available for alignment. Cannot proceed with KD alignment.")
-        raise ValueError(
-
+        raise ValueError(
+            "No features_df available for alignment. This usually indicates that features were not detected properly."
+        )
+
     # OPTIMIZATION 1: Group all features by sample_uid in ONE operation instead of filtering repeatedly
     study_obj.logger.debug("Grouping features efficiently (major speedup)...")
-
+
     # rt_original should already exist (created in main align() function)
     if "rt_original" not in study_obj.features_df.columns:
         raise ValueError("rt_original column missing - this should have been created by align() function")
-
+
     sample_groups = study_obj.features_df.group_by("sample_uid", maintain_order=True)
     sample_feature_data = sample_groups.agg([
         pl.len().alias("feature_count"),
         pl.col("mz").alias("mzs"),
-        pl.col("rt_original").alias("rt_originals")  # Use original RT values for alignment
+        pl.col("rt_original").alias("rt_originals"),  # Use original RT values for alignment
     ]).sort("feature_count", descending=True)
-
+
     if sample_feature_data.is_empty():
         study_obj.logger.error("No features found in any sample for alignment.")
         raise ValueError("No features found in any sample for alignment.")
-
+
     # Choose reference sample (sample with most features)
     ref_sample_uid = sample_feature_data.row(0, named=True)["sample_uid"]
-
+
     # Find the index of this sample in samples_df
     ref_index = None
     sample_uid_to_index = {}
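The `group_by(...).agg(...)` call above collects each sample's m/z and original RT values into list columns in a single pass, then sorts so the sample with the most features comes first. A self-contained sketch of the same Polars pattern on toy data (column names mirror the diff; the values are made up):

```python
import polars as pl

features = pl.DataFrame({
    "sample_uid": [1, 1, 2, 2, 2],
    "mz": [100.05, 200.10, 100.06, 200.09, 300.20],
    "rt_original": [30.1, 65.4, 30.3, 65.0, 120.7],
})

# One pass: per-sample feature count plus list columns of m/z and original RT.
per_sample = (
    features.group_by("sample_uid", maintain_order=True)
    .agg([
        pl.len().alias("feature_count"),
        pl.col("mz").alias("mzs"),
        pl.col("rt_original").alias("rt_originals"),
    ])
    .sort("feature_count", descending=True)
)
print(per_sample)  # the first row is the sample with the most features (the reference)
```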
@@ -919,24 +890,24 @@ def _align_kd_algorithm(study_obj, params):
         sample_uid_to_index[sample_uid] = idx
         if sample_uid == ref_sample_uid:
             ref_index = idx
-
+
     if ref_index is None:
         study_obj.logger.error(f"Could not find reference sample {ref_sample_uid} in samples_df")
         raise ValueError(f"Could not find reference sample {ref_sample_uid} in samples_df")
-
+
     study_obj.alignment_ref_index = ref_index
-
+
     # OPTIMIZATION 2: Get reference features efficiently from pre-grouped data
     # Always use rt_original for alignment input to ensure consistent results
     ref_row = sample_feature_data.filter(pl.col("sample_uid") == ref_sample_uid).row(0, named=True)
     ref_mzs_list = ref_row["mzs"]
     ref_rts_list = ref_row["rt_originals"]  # Use original RT values
-
+
     # Create sorted reference features for binary search
     ref_features = list(zip(ref_mzs_list, ref_rts_list))
     ref_features.sort(key=lambda x: x[0])
     ref_mzs = [mz for mz, _ in ref_features]
-
+
     study_obj.logger.debug(
         f"Reference sample UID {ref_sample_uid} (index {ref_index}, sample: {study_obj.samples_df.row(ref_index, named=True)['sample_name']}) has {len(ref_features)} features",
     )
@@ -979,19 +950,19 @@ def _align_kd_algorithm(study_obj, params):
         sample_uid = row["sample_uid"]
         sample_mzs = row["mzs"]
         sample_rts = row["rt_originals"]  # Use original RT values for alignment input
-
+
         td = oms.TransformationDescription()
         sample_index = sample_uid_to_index.get(sample_uid)
-
+
         if sample_index is None:
             study_obj.logger.warning(f"Sample UID {sample_uid} not found in samples_df, skipping")
             continue
-
+
         # Skip empty samples
         if not sample_mzs or not sample_rts:
            transformations[sample_uid] = td
            continue
-
+
         # Identity for reference sample
         if sample_uid == ref_sample_uid:
             rts = [rt for rt in sample_rts if rt is not None]
@@ -1074,7 +1045,7 @@ def _align_kd_algorithm(study_obj, params):
 
     # OPTIMIZATION 5: Apply transformations efficiently using vectorized operations
     study_obj.logger.debug("Applying RT transformations efficiently...")
-
+
     # Apply transformations to RT values starting from rt_original
     def transform_rt_vectorized(sample_uid: int, rt_original: float) -> float:
         if sample_uid in transformations and rt_original is not None:
@@ -1084,14 +1055,13 @@ def _align_kd_algorithm(study_obj, params):
             except Exception:
                 return rt_original
         return rt_original
-
+
     # Use Polars' efficient struct operations for vectorized transformation
     # Apply transformation to rt_original and store result in rt column
     study_obj.features_df = study_obj.features_df.with_columns(
-        pl.struct(["sample_uid", "rt_original"])
-
-
-        ).alias("rt")
+        pl.struct(["sample_uid", "rt_original"])
+        .map_elements(lambda x: transform_rt_vectorized(x["sample_uid"], x["rt_original"]), return_dtype=pl.Float64)
+        .alias("rt")
     )
 
     study_obj.logger.success(
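The transformation above is applied row-wise through `pl.struct(...).map_elements(...)`, which packs the needed columns into one struct so a Python callback can see both values at once. A minimal sketch of that idiom (the offset dictionary is a stand-in for the per-sample OpenMS `TransformationDescription` objects):

```python
import polars as pl

# Hypothetical per-sample RT offsets standing in for OpenMS transformations.
offsets = {1: 0.0, 2: -1.5}

df = pl.DataFrame({
    "sample_uid": [1, 2, 2],
    "rt_original": [30.0, 31.4, 66.2],
})


def transform_rt(sample_uid: int, rt_original: float) -> float:
    return rt_original + offsets.get(sample_uid, 0.0)


df = df.with_columns(
    pl.struct(["sample_uid", "rt_original"])
    .map_elements(lambda row: transform_rt(row["sample_uid"], row["rt_original"]), return_dtype=pl.Float64)
    .alias("rt")
)
print(df)
```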
@@ -1099,16 +1069,12 @@ def _align_kd_algorithm(study_obj, params):
     )
 
 
-
-
 def _align_pose_clustering_fallback(study_obj, fmaps, params):
     """Fallback PoseClustering alignment with minimal parameters."""
     import pyopenms as oms
 
     aligner = oms.MapAlignmentAlgorithmPoseClustering()
-    ref_index = [
-        i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])
-    ][-1]
+    ref_index = [i[0] for i in sorted(enumerate([fm.size() for fm in fmaps]), key=lambda x: x[1])][-1]
 
     # Set up basic parameters for pose clustering
     pc_params = oms.Param()
@@ -1137,15 +1103,15 @@ def find_iso(self, rt_tol=0.1, mz_tol=0.01, uids=None):
     """
     Find isotope patterns for consensus features by searching raw MS1 data.
     OPTIMIZED VERSION: Each sample file is loaded only once for maximum efficiency.
-
+
     For each consensus feature:
-    1. Find the associated feature with highest intensity
+    1. Find the associated feature with highest intensity
     2. Load the corresponding sample5 file to access raw MS1 data
     3. Use original_rt (before alignment) to find the correct scan
     4. Search for isotope patterns in raw MS1 spectra
     5. Look for isotope patterns: 0.33, 0.50, 0.66, 1.00, 1.50, 2.00, 3.00, 4.00, 5.00 Da
     6. Store results as numpy arrays with [mz, inty] in the iso column
-
+
     Parameters:
         rt_tol (float): RT tolerance for scan matching in seconds
         mz_tol (float): Additional m/z tolerance for isotope matching in Da
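Step 5 of the docstring boils down to scanning fixed mass offsets above the monoisotopic m/z. A small sketch of how such search windows can be derived from a shift list (the shift values are copied from the docstring; the helper is illustrative, not part of masster):

```python
# Mass shifts (Da) listed in the docstring above.
isotope_shifts = [0.33, 0.50, 0.66, 1.00, 1.50, 2.00, 3.00, 4.00, 5.00]


def isotope_windows(base_mz: float, mz_tol: float = 0.01):
    """Yield (target_mz, mz_min, mz_max) search windows for each isotope shift."""
    for shift in isotope_shifts:
        target = base_mz + shift
        yield target, target - mz_tol, target + mz_tol


for target, lo, hi in isotope_windows(400.2005):
    print(f"search {lo:.4f}-{hi:.4f} for an isotopologue near {target:.4f}")
```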
@@ -1154,27 +1120,25 @@ def find_iso(self, rt_tol=0.1, mz_tol=0.01, uids=None):
     if self.consensus_df is None or self.consensus_df.is_empty():
         self.logger.error("No consensus features found. Please run merge() first.")
         return
-
+
     if self.consensus_mapping_df is None or self.consensus_mapping_df.is_empty():
         self.logger.error("No consensus mapping found. Please run merge() first.")
         return
-
+
     if self.features_df is None or self.features_df.is_empty():
         self.logger.error("No features found.")
         return
-
+
     if self.samples_df is None or self.samples_df.is_empty():
         self.logger.error("No samples found.")
         return
-
+
     # Add iso column if it doesn't exist
     if "iso" not in self.consensus_df.columns:
-        self.consensus_df = self.consensus_df.with_columns(
-
-        )
-
+        self.consensus_df = self.consensus_df.with_columns(pl.lit(None, dtype=pl.Object).alias("iso"))
+
     self.logger.info("Extracting isotopomers from raw MS1 data...")
-
+
     # Filter consensus features if uids is specified
     if uids is not None:
         if not isinstance(uids, (list, tuple)):
@@ -1188,7 +1152,7 @@ def find_iso(self, rt_tol=0.1, mz_tol=0.01, uids=None):
     else:
         consensus_df_filtered = self.consensus_df
         self.logger.debug(f"Processing all {len(consensus_df_filtered)} consensus features")
-
+
     # Isotope mass shifts to search for (up to 7x 13C isotopes)
     isotope_shifts = [
         0.33,
@@ -1203,73 +1167,73 @@ def find_iso(self, rt_tol=0.1, mz_tol=0.01, uids=None):
         6.02010,
         7.02345,
     ]
-
+
     consensus_iso_data = {}
-
+
     # SUPER OPTIMIZATION: Vectorized pre-calculation using joins (10-100x faster)
     self.logger.debug("Building sample-to-consensus mapping using vectorized operations...")
-
+
     # Step 1: Join consensus_mapping with features to get intensities in one operation
     # Apply UID filtering if specified
     if uids is not None:
         consensus_mapping_filtered = self.consensus_mapping_df.filter(pl.col("consensus_uid").is_in(uids))
     else:
         consensus_mapping_filtered = self.consensus_mapping_df
-
+
     consensus_with_features = consensus_mapping_filtered.join(
-        self.features_df.select([
-        on=[
-        how=
+        self.features_df.select(["feature_uid", "sample_uid", "inty", "mz", "rt", "rt_original"]),
+        on=["feature_uid", "sample_uid"],
+        how="left",
     )
-
+
     # Step 2: Find the best feature (highest intensity) for each consensus using window functions
-    best_features =
-
-
-
-
-        pl.col(
-
-
+    best_features = (
+        consensus_with_features.with_columns(
+            pl.col("inty").fill_null(0)  # Handle null intensities
+        )
+        .with_columns(pl.col("inty").max().over("consensus_uid").alias("max_inty"))
+        .filter(pl.col("inty") == pl.col("max_inty"))
+        .group_by("consensus_uid")
+        .first()
+    )  # Take first if there are ties
+
     # Step 3: Join with samples to get sample paths in one operation
     best_features_with_paths = best_features.join(
-        self.samples_df.select([
-
-
-    ).filter(
-        pl.col('sample_path').is_not_null()
-    )
-
+        self.samples_df.select(["sample_uid", "sample_path"]), on="sample_uid", how="left"
+    ).filter(pl.col("sample_path").is_not_null())
+
     # Step 4: Group by sample path for batch processing (much faster than nested loops)
     sample_to_consensus = {}
     for row in best_features_with_paths.iter_rows(named=True):
-        sample_path = row[
-        consensus_uid = row[
-
+        sample_path = row["sample_path"]
+        consensus_uid = row["consensus_uid"]
+
         # Create feature data dictionary for compatibility
         feature_data = {
-
-
-
-
+            "mz": row["mz"],
+            "rt": row["rt"],
+            "rt_original": row.get("rt_original", row["rt"]),
+            "inty": row["inty"],
        }
-
+
        if sample_path not in sample_to_consensus:
            sample_to_consensus[sample_path] = []
-
+
        sample_to_consensus[sample_path].append((consensus_uid, feature_data))
-
+
     # Initialize failed consensus features (those not in the mapping)
-    processed_consensus_uids = set(best_features_with_paths[
+    processed_consensus_uids = set(best_features_with_paths["consensus_uid"].to_list())
     for consensus_row in consensus_df_filtered.iter_rows(named=True):
         consensus_uid = consensus_row["consensus_uid"]
         if consensus_uid not in processed_consensus_uids:
             consensus_iso_data[consensus_uid] = None
-
-    self.logger.debug(
-
+
+    self.logger.debug(
+        f"Will read {len(sample_to_consensus)} unique sample files for {len(consensus_df_filtered)} consensus features"
+    )
+
     tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
-
+
     # OPTIMIZATION 2: Process by sample file (load each file only once)
     for sample_path, consensus_list in tqdm(
         sample_to_consensus.items(),
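Step 2 above selects, for each consensus feature, the mapped feature with the highest intensity by combining a window maximum with a filter. A compact sketch of that "best row per group" idiom on toy data:

```python
import polars as pl

mapped = pl.DataFrame({
    "consensus_uid": [10, 10, 11, 11],
    "feature_uid": [1, 2, 3, 4],
    "inty": [5.0e4, 9.0e4, None, 1.2e5],
})

best = (
    mapped.with_columns(pl.col("inty").fill_null(0))  # treat missing intensities as 0
    .with_columns(pl.col("inty").max().over("consensus_uid").alias("max_inty"))
    .filter(pl.col("inty") == pl.col("max_inty"))
    .group_by("consensus_uid")
    .first()  # break ties by taking the first matching row
)
print(best.sort("consensus_uid"))
```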
@@ -1279,126 +1243,115 @@ def find_iso(self, rt_tol=0.1, mz_tol=0.01, uids=None):
         try:
             # Load MS1 data once per sample
             ms1_df = self._load_ms1(sample_path)
-
+
             if ms1_df is None or ms1_df.is_empty():
                 # Mark all consensus features from this sample as failed
                 for consensus_uid, _ in consensus_list:
                     consensus_iso_data[consensus_uid] = None
                 continue
-
+
             # Process all consensus features for this sample
             for consensus_uid, best_feature in consensus_list:
                 # Get the original RT (before alignment correction)
                 base_mz = best_feature["mz"]
                 original_rt = best_feature.get("rt_original", best_feature["rt"])
-
+
                 # Skip if RT or mz is None or invalid
                 if original_rt is None:
-                    original_rt = best_feature["rt"]
+                    original_rt = best_feature["rt"]
                     self.logger.debug(f"original_rt is None. Using aligned rt instead")
-
+
                 if base_mz is None:
                     self.logger.warning(f"Skipping consensus_uid {consensus_uid}: base_mz is None")
                     consensus_iso_data[consensus_uid] = None
                     continue
-
+
                 # Find MS1 scans near the original RT
                 rt_min = original_rt - rt_tol
                 rt_max = original_rt + rt_tol
-
+
                 # Filter MS1 data for scans within RT window
-                ms1_window = ms1_df.filter(
-
-                )
-
+                ms1_window = ms1_df.filter((pl.col("rt") >= rt_min) & (pl.col("rt") <= rt_max))
+
                 if ms1_window.is_empty():
                     consensus_iso_data[consensus_uid] = None
                     continue
-
+
                 isotope_matches = []
-
+
                 # Search for each isotope shift
                 for shift in isotope_shifts:
                     target_mz = base_mz + shift
                     mz_min_iso = target_mz - mz_tol
                     mz_max_iso = target_mz + mz_tol
-
+
                     # Find peaks in MS1 data within m/z tolerance
-                    isotope_peaks = ms1_window.filter(
-
-                    )
-
+                    isotope_peaks = ms1_window.filter((pl.col("mz") >= mz_min_iso) & (pl.col("mz") <= mz_max_iso))
+
                     if not isotope_peaks.is_empty():
                         # Get the peak with maximum intensity for this isotope
-                        max_peak = isotope_peaks.filter(
-
-                        ).row(0, named=True)
-
+                        max_peak = isotope_peaks.filter(pl.col("inty") == pl.col("inty").max()).row(0, named=True)
+
                         # Store as float with specific precision: m/z to 4 decimals, intensity rounded to integer
                         mz_formatted = round(float(max_peak["mz"]), 4)
                         inty_formatted = float(round(max_peak["inty"]))  # Round to integer, but keep as float
                         isotope_matches.append([mz_formatted, inty_formatted])
-
+
                 # Store results as numpy array
                 if isotope_matches:
                     consensus_iso_data[consensus_uid] = np.array(isotope_matches)
                 else:
                     consensus_iso_data[consensus_uid] = None
-
+
         except Exception as e:
             self.logger.warning(f"Failed to load MS1 data from {sample_path}: {e}")
             # Mark all consensus features from this sample as failed
             for consensus_uid, _ in consensus_list:
                 consensus_iso_data[consensus_uid] = None
             continue
-
+
     # Update consensus_df with isotope data
     # Create mapping function for update
     def get_iso_data(uid):
         return consensus_iso_data.get(uid, None)
-
+
     # Update the iso column
     self.consensus_df = self.consensus_df.with_columns(
-        pl.col("consensus_uid").map_elements(
-            lambda uid: get_iso_data(uid),
-            return_dtype=pl.Object
-        ).alias("iso")
+        pl.col("consensus_uid").map_elements(lambda uid: get_iso_data(uid), return_dtype=pl.Object).alias("iso")
     )
-
+
     # Count how many consensus features have isotope data
     iso_count = sum(1 for data in consensus_iso_data.values() if data is not None and len(data) > 0)
-
-    self.logger.success(
+
+    self.logger.success(
+        f"Isotope detection completed. Found isotope patterns for {iso_count}/{len(self.consensus_df)} consensus features."
+    )
 
 
 def reset_iso(self):
     """
     Reset the iso column in consensus_df to None, clearing all isotope data.
-
+
     This function clears any previously computed isotope patterns from the
     consensus_df, setting the 'iso' column to None for all features. This
     is useful before re-running isotope detection with different parameters
     or to clear isotope data entirely.
-
+
     Returns:
         None
     """
     if self.consensus_df is None:
         self.logger.warning("No consensus_df found. Nothing to reset.")
         return
-
+
     if "iso" not in self.consensus_df.columns:
         self.logger.warning("No 'iso' column found in consensus_df. Nothing to reset.")
         return
-
+
     # Count how many features currently have isotope data
-    iso_count = self.consensus_df.select(
-
-    ).item(0, "count")
-
+    iso_count = self.consensus_df.select(pl.col("iso").is_not_null().sum().alias("count")).item(0, "count")
+
     # Reset the iso column to None
-    self.consensus_df = self.consensus_df.with_columns(
-
-    )
-
+    self.consensus_df = self.consensus_df.with_columns(pl.lit(None, dtype=pl.Object).alias("iso"))
+
     self.logger.info(f"Reset isotope data for {iso_count} features. All 'iso' values set to None.")