masster 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster might be problematic.
- masster/_version.py +1 -1
- masster/sample/load.py +5 -4
- masster/study/defaults/align_def.py +0 -204
- masster/study/defaults/fill_def.py +9 -1
- masster/study/defaults/merge_def.py +20 -69
- masster/study/export.py +25 -5
- masster/study/h5.py +160 -42
- masster/study/helpers.py +430 -53
- masster/study/load.py +986 -158
- masster/study/merge.py +683 -1076
- masster/study/plot.py +43 -38
- masster/study/processing.py +337 -280
- masster/study/study.py +58 -135
- masster/wizard/wizard.py +20 -6
- {masster-0.5.1.dist-info → masster-0.5.3.dist-info}/METADATA +1 -1
- {masster-0.5.1.dist-info → masster-0.5.3.dist-info}/RECORD +19 -20
- masster/study/defaults/fill_chrom_def.py +0 -260
- {masster-0.5.1.dist-info → masster-0.5.3.dist-info}/WHEEL +0 -0
- {masster-0.5.1.dist-info → masster-0.5.3.dist-info}/entry_points.txt +0 -0
- {masster-0.5.1.dist-info → masster-0.5.3.dist-info}/licenses/LICENSE +0 -0
masster/study/processing.py
CHANGED
@@ -15,77 +15,6 @@ from masster.study.defaults import (
 )


-def _generate_feature_maps_on_demand_for_align(study):
-    """
-    Generate feature maps on-demand from study.features_df for alignment operations.
-    Returns temporary feature maps that are not cached in the study.
-
-    Args:
-        study: Study object containing features_df and samples_df
-
-    Returns:
-        list: List of temporary FeatureMap objects
-    """
-    import polars as pl
-    import pyopenms as oms
-
-    if study.features_df is None or len(study.features_df) == 0:
-        study.logger.error("No features_df available for generating feature maps")
-        return []
-
-    temp_feature_maps = []
-
-    # Process each sample in order
-    for sample_index, row_dict in enumerate(study.samples_df.iter_rows(named=True)):
-        sample_uid = row_dict["sample_uid"]
-        sample_name = row_dict["sample_name"]
-
-        # Get features for this sample from features_df
-        sample_features = study.features_df.filter(pl.col("sample_uid") == sample_uid)
-
-        # Create new FeatureMap
-        feature_map = oms.FeatureMap()
-
-        # Convert DataFrame features to OpenMS Features
-        for feature_row in sample_features.iter_rows(named=True):
-            feature = oms.Feature()
-
-            # Set properties from DataFrame (handle missing values gracefully)
-            try:
-                # Skip features with missing critical data
-                if feature_row["mz"] is None:
-                    study.logger.warning("Skipping feature due to missing mz")
-                    continue
-                if feature_row["rt"] is None:
-                    study.logger.warning("Skipping feature due to missing rt")
-                    continue
-                if feature_row["inty"] is None:
-                    study.logger.warning("Skipping feature due to missing inty")
-                    continue
-
-                feature.setUniqueId(int(feature_row["feature_id"]))
-                feature.setMZ(float(feature_row["mz"]))
-                feature.setRT(float(feature_row["rt"]))
-                feature.setIntensity(float(feature_row["inty"]))
-
-                # Handle optional fields that might be None
-                if feature_row.get("quality") is not None:
-                    feature.setOverallQuality(float(feature_row["quality"]))
-                if feature_row.get("charge") is not None:
-                    feature.setCharge(int(feature_row["charge"]))
-
-                # Add to feature map
-                feature_map.push_back(feature)
-            except (ValueError, TypeError) as e:
-                study.logger.warning(f"Skipping feature due to conversion error: {e}")
-                continue
-
-        temp_feature_maps.append(feature_map)
-
-    study.logger.debug(f"Generated {len(temp_feature_maps)} temporary feature maps from features_df for alignment")
-    return temp_feature_maps
-
-
 def align(self, **kwargs):
     """Align feature maps using pose clustering or KD algorithm and update feature RTs.

@@ -103,30 +32,7 @@ def align(self, **kwargs):
         - algorithm (str): Alignment algorithm ('pc' for PoseClustering, 'kd' for KD).

     KD algorithm specific parameters:
-        - min_samples (int): Minimum number of samples required for KD alignment.
-        - nr_partitions (int): Number of partitions in m/z dimension.
-        - warp_enabled (bool): Enable non-linear retention time transformation.
-        - warp_rt_tol (float): RT tolerance for the LOWESS fit.
         - warp_mz_tol (float): m/z tolerance for the LOWESS fit.
-        - warp_max_pairwise_log_fc (float): Maximum absolute log10 fold-change threshold for pairing.
-        - warp_min_rel_cc_size (float): Minimum relative connected component size.
-        - warp_max_nr_conflicts (int): Allow up to this many conflicts per connected component for alignment.
-        - link_rt_tol (float): Width of RT tolerance window for linking features.
-        - link_mz_tol (float): m/z tolerance for linking features.
-        - link_charge_merging (str): Charge merging strategy for linking features.
-        - link_adduct_merging (str): Adduct merging strategy for linking features.
-        - distance_RT_exponent (float): Exponent for normalized RT differences.
-        - distance_RT_weight (float): Weight factor for final RT distances.
-        - distance_MZ_exponent (float): Exponent for normalized m/z differences.
-        - distance_MZ_weight (float): Weight factor for final m/z distances.
-        - distance_intensity_exponent (float): Exponent for differences in relative intensity.
-        - distance_intensity_weight (float): Weight factor for final intensity distances.
-        - distance_intensity_log_transform (str): Log-transform intensities.
-        - LOWESS_span (float): Fraction of datapoints for each local regression.
-        - LOWESS_num_iterations (int): Number of robustifying iterations for LOWESS fitting.
-        - LOWESS_delta (float): Parameter for LOWESS computations (negative auto-computes).
-        - LOWESS_interpolation_type (str): Method for interpolation between datapoints.
-        - LOWESS_extrapolation_type (str): Method for extrapolation outside data range.
     """
     # parameters initialization
     params = align_defaults()
@@ -155,145 +61,31 @@ def align(self, **kwargs):
            )
        else:
            self.logger.warning(f"Unknown parameter '{key}' ignored")
-    # end of parameter initialization

    # Store parameters in the Study object
    self.update_history(["align"], params.to_dict())
    self.logger.debug("Parameters stored to align")

-    #
-
-
+    # Ensure rt_original exists before starting alignment (both algorithms need this)
+    if "rt_original" not in self.features_df.columns:
+        # add column 'rt_original' after 'rt'
+        rt_index = self.features_df.columns.get_loc("rt") + 1
+        self.features_df.insert(rt_index, "rt_original", 0)
+        self.features_df["rt_original"] = self.features_df["rt"]
+        self.logger.debug("Created rt_original column from current rt values")

    # Choose alignment algorithm
    algorithm = params.get("algorithm").lower()

    if algorithm == "pc":
-        _align_pose_clustering(self,
+        _align_pose_clustering(self, params)

    elif algorithm == "kd":
-        _align_kd_algorithm(self,
+        _align_kd_algorithm(self, params)
    else:
        self.logger.error(f"Unknown alignment algorithm '{algorithm}'")
-        # Clean up temporary feature maps to release memory
-        del fmaps
        return

-    # check if rt_original exists in features_df, if not, add it after rt
-    if "rt_original" not in self.features_df.columns:
-        # add column 'rt_original' after 'rt'
-        rt_index = self.features_df.columns.get_loc("rt") + 1
-        self.features_df.insert(rt_index, "rt_original", 0)
-        self.features_df["rt_original"] = self.features_df["rt"]
-
-    # iterate through all feature_maps and add the transformed retention times to the features_df
-
-    # Build a fast lookup for (sample_uid, featureUid) to index in features_df
-    feats = self.features_df
-
-    # Pre-build sample_uid lookup for faster access
-    self.logger.debug("Build sample_uid lookup for fast access...")
-    sample_uid_lookup = {
-        idx: row_dict["sample_uid"]
-        for idx, row_dict in enumerate(self.samples_df.iter_rows(named=True))
-    }
-
-    # Build the main lookup using feature_uid (not feature_id)
-    if "feature_id" in feats.columns:
-        # Create lookup mapping (sample_uid, feature_id) to DataFrame index using Polars
-        # Since we need a pandas-style index lookup, we'll create a simple dict
-        sample_uids = feats.get_column("sample_uid").to_list()
-
-        # Handle feature_id column - it might be Object type due to conversion
-        feature_id_col = feats.get_column("feature_id")
-        if feature_id_col.dtype == pl.Object:
-            # If it's Object type, convert to list and let Python handle the conversion
-            feature_ids = feature_id_col.to_list()
-            # Convert to strings if they're not already
-            feature_ids = [str(fid) if fid is not None else None for fid in feature_ids]
-        else:
-            # Safe to cast normally
-            feature_ids = feature_id_col.cast(pl.Utf8).to_list()
-
-        lookup = {
-            (sample_uid, feature_id): idx
-            for idx, (sample_uid, feature_id) in enumerate(
-                zip(sample_uids, feature_ids, strict=True),
-            )
-        }
-    else:
-        # fallback: skip if feature_uid column missing
-        lookup = {}
-        self.logger.warning("feature_id column not found in features_df")
-
-    # Pre-allocate update lists for better performance
-    all_update_idx = []
-    all_update_rt = []
-    all_update_rt_original = []
-
-    tdqm_disable = self.log_level not in ["TRACE", "DEBUG"]
-
-    for index, fm in tqdm(
-        list(enumerate(fmaps)),
-        total=len(fmaps),
-        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Extract RTs",
-        disable=tdqm_disable,
-    ):
-        sample_uid = sample_uid_lookup.get(index)
-        if sample_uid is None:
-            continue
-
-        # Collect all updates for this feature map
-        for f in fm:
-            feature_uid = str(f.getUniqueId())
-            idx = lookup.get((sample_uid, feature_uid))
-            if idx is not None:
-                rt = round(f.getRT(), 3)
-                # rt_or = round(f.getMetaValue("original_RT"), 3) if f.metaValueExists("original_RT") else rt
-                all_update_idx.append(idx)
-                all_update_rt.append(rt)
-                # all_update_rt_original.append(rt_or)
-
-    # Single batch update for all features at once
-    if all_update_idx:
-        # Build a full-length Python list of rt values, update specified indices,
-        # then replace the DataFrame column with a Series that has the same length
-        try:
-            current_rt = self.features_df["rt"].to_list()
-        except Exception:
-            current_rt = [None] * self.features_df.height
-
-        # Defensive: ensure list length equals dataframe height
-        if len(current_rt) != self.features_df.height:
-            current_rt = [None] * self.features_df.height
-
-        for idx, new_rt in zip(all_update_idx, all_update_rt):
-            current_rt[idx] = new_rt
-
-        new_cols = [pl.Series("rt", current_rt)]
-
-        # Update rt_original if corresponding updates were collected
-        if "all_update_rt_original" in locals() and all_update_rt_original:
-            try:
-                current_rt_orig = (
-                    self.features_df["rt_original"].to_list()
-                    if "rt_original" in self.features_df.columns
-                    else [None] * self.features_df.height
-                )
-            except Exception:
-                current_rt_orig = [None] * self.features_df.height
-
-            if len(current_rt_orig) != self.features_df.height:
-                current_rt_orig = [None] * self.features_df.height
-
-            for idx, new_orig in zip(all_update_idx, all_update_rt_original):
-                current_rt_orig[idx] = new_orig
-
-            new_cols.append(pl.Series("rt_original", current_rt_orig))
-
-        # Replace columns in one call
-        self.features_df = self.features_df.with_columns(*new_cols)
-
    self.logger.debug("Alignment completed successfully.")

    # Reset consensus data structures after alignment since RT changes invalidate consensus
@@ -307,6 +99,9 @@ def align(self, **kwargs):
    if not self.consensus_ms2.is_empty():
        self.consensus_ms2 = pl.DataFrame()
        consensus_reset_count += 1
+    if not self.id_df.is_empty():
+        self.id_df = pl.DataFrame()
+        consensus_reset_count += 1

    # Remove merge and find_ms2 parameters from history since they need to be re-run
    keys_to_remove = ["merge", "find_ms2"]
@@ -319,17 +114,13 @@ def align(self, **kwargs):
            self.logger.debug(f"Removed {key} from history")

    if consensus_reset_count > 0 or history_removed_count > 0:
-        self.logger.
+        self.logger.debug(
            f"Alignment reset: {consensus_reset_count} consensus structures cleared, {history_removed_count} history entries removed",
        )

    if params.get("save_features"):
        self.save_samples()

-    # Clean up temporary feature maps to release memory
-    del fmaps
-    self.logger.debug("Temporary feature maps deleted to release memory")
-

 def find_ms2(self, **kwargs):
    """
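Taken together, the hunks above change align() so that feature maps are regenerated on demand inside the algorithm-specific helpers and all consensus-level state (including the new id_df reset) is invalidated after RT changes. For orientation, a minimal usage sketch; the Study setup shown here is hypothetical, and parameter names follow the docstring above:

# Sketch only: assumes a Study with samples and features already loaded;
# the import path and constructor arguments are assumptions, not from the diff.
from masster import Study

study = Study(folder="./my_study")        # hypothetical setup
study.align(algorithm="pc")               # PoseClustering alignment
study.align(algorithm="kd", rt_tol=10.0)  # KD alignment; rt_tol value is illustrative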
@@ -803,12 +594,73 @@ def _find_closest_valley(chrom, rt, dir="left", threshold=0.9):
    return chrom.rt[idx]


-def _align_pose_clustering(study_obj,
+def _align_pose_clustering(study_obj, params):
    """Perform alignment using PoseClustering algorithm."""
    import pyopenms as oms
+    import polars as pl
    from tqdm import tqdm
    from datetime import datetime

+    # Generate temporary feature maps on-demand from features_df for PoseClustering
+    study_obj.logger.debug("Generating feature maps on-demand from features_df for PoseClustering alignment")
+
+    tdqm_disable = study_obj.log_level not in ["TRACE", "DEBUG", "INFO"]
+    fmaps = []
+
+    # Process each sample in order with progress bar
+    for sample_index, row_dict in tqdm(
+        list(enumerate(study_obj.samples_df.iter_rows(named=True))),
+        total=len(study_obj.samples_df),
+        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {study_obj.log_label}Generate feature maps",
+        disable=tdqm_disable,
+    ):
+        sample_uid = row_dict["sample_uid"]
+        sample_name = row_dict["sample_name"]
+
+        # Get features for this sample from features_df
+        sample_features = study_obj.features_df.filter(pl.col("sample_uid") == sample_uid)
+
+        # Create new FeatureMap
+        feature_map = oms.FeatureMap()
+
+        # Convert DataFrame features to OpenMS Features
+        for feature_row in sample_features.iter_rows(named=True):
+            feature = oms.Feature()
+
+            # Set properties from DataFrame (handle missing values gracefully)
+            try:
+                # Skip features with missing critical data
+                if feature_row["mz"] is None:
+                    study_obj.logger.warning("Skipping feature due to missing mz")
+                    continue
+                if feature_row["rt"] is None:
+                    study_obj.logger.warning("Skipping feature due to missing rt")
+                    continue
+                if feature_row["inty"] is None:
+                    study_obj.logger.warning("Skipping feature due to missing inty")
+                    continue
+
+                feature.setUniqueId(int(feature_row["feature_id"]))
+                feature.setMZ(float(feature_row["mz"]))
+                feature.setRT(float(feature_row["rt"]))
+                feature.setIntensity(float(feature_row["inty"]))
+
+                # Handle optional fields that might be None
+                if feature_row.get("quality") is not None:
+                    feature.setOverallQuality(float(feature_row["quality"]))
+                if feature_row.get("charge") is not None:
+                    feature.setCharge(int(feature_row["charge"]))
+
+                # Add to feature map
+                feature_map.push_back(feature)
+            except (ValueError, TypeError) as e:
+                study_obj.logger.warning(f"Skipping feature due to conversion error: {e}")
+                continue
+
+        fmaps.append(feature_map)
+
+    study_obj.logger.debug(f"Generated {len(fmaps)} feature maps from features_df for PoseClustering alignment")
+
    # Create PC-specific OpenMS parameters
    params_oms = oms.Param()
    params_oms.setValue("pairfinder:distance_intensity:log_transform", "disabled")
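The block above inlines the features_df-to-FeatureMap conversion that 0.5.1 kept in the now-removed _generate_feature_maps_on_demand_for_align helper. A self-contained sketch of the same conversion pattern, with illustrative column values (requires pyopenms and polars):

import polars as pl
import pyopenms as oms

# Illustrative stand-in for study.features_df
features_df = pl.DataFrame({
    "sample_uid": [1, 1],
    "feature_id": [101, 102],
    "mz": [445.12, 446.13],
    "rt": [120.5, 121.0],
    "inty": [1e6, 2e5],
})

fmap = oms.FeatureMap()
for row in features_df.filter(pl.col("sample_uid") == 1).iter_rows(named=True):
    f = oms.Feature()
    f.setUniqueId(int(row["feature_id"]))
    f.setMZ(float(row["mz"]))
    f.setRT(float(row["rt"]))
    f.setIntensity(float(row["inty"]))
    fmap.push_back(f)  # same push_back API as used in the diff
print(fmap.size())  # -> 2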
@@ -847,7 +699,6 @@ def _align_pose_clustering(study_obj, fmaps, params):
    study_obj.logger.debug(f"Parameters for alignment: {params}")

    # Perform alignment and transformation of feature maps to the reference map (exclude reference map)
-    tdqm_disable = study_obj.log_level not in ["TRACE", "DEBUG", "INFO"]
    for index, fm in tqdm(
        list(enumerate(fmaps)),
        total=len(fmaps),
@@ -879,14 +730,126 @@ def _align_pose_clustering(study_obj, fmaps, params):
            continue

    study_obj.alignment_ref_index = ref_index
+
+    # Process feature maps and update features_df with transformed retention times
+    # Build a fast lookup for (sample_uid, featureUid) to index in features_df
+    feats = study_obj.features_df
+
+    # Pre-build sample_uid lookup for faster access
+    study_obj.logger.debug("Build sample_uid lookup for fast access...")
+    sample_uid_lookup = {
+        idx: row_dict["sample_uid"]
+        for idx, row_dict in enumerate(study_obj.samples_df.iter_rows(named=True))
+    }
+
+    # Build the main lookup using feature_uid (not feature_id)
+    if "feature_id" in feats.columns:
+        # Create lookup mapping (sample_uid, feature_id) to DataFrame index using Polars
+        # Since we need a pandas-style index lookup, we'll create a simple dict
+        sample_uids = feats.get_column("sample_uid").to_list()
+
+        # Handle feature_id column - it might be Object type due to conversion
+        feature_id_col = feats.get_column("feature_id")
+        if feature_id_col.dtype == pl.Object:
+            # If it's Object type, convert to list and let Python handle the conversion
+            feature_ids = feature_id_col.to_list()
+            # Convert to strings if they're not already
+            feature_ids = [str(fid) if fid is not None else None for fid in feature_ids]
+        else:
+            # Safe to cast normally
+            feature_ids = feature_id_col.cast(pl.Utf8).to_list()
+
+        lookup = {
+            (sample_uid, feature_id): idx
+            for idx, (sample_uid, feature_id) in enumerate(
+                zip(sample_uids, feature_ids, strict=True),
+            )
+        }
+    else:
+        # fallback: skip if feature_uid column missing
+        lookup = {}
+        study_obj.logger.warning("feature_id column not found in features_df")
+
+    # Pre-allocate update lists for better performance
+    all_update_idx = []
+    all_update_rt = []
+    all_update_rt_original = []
+
+    for index, fm in tqdm(
+        list(enumerate(fmaps)),
+        total=len(fmaps),
+        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {study_obj.log_label}Extract RTs",
+        disable=tdqm_disable,
+    ):
+        sample_uid = sample_uid_lookup.get(index)
+        if sample_uid is None:
+            continue

+        # Collect all updates for this feature map
+        for f in fm:
+            feature_uid = str(f.getUniqueId())
+            idx = lookup.get((sample_uid, feature_uid))
+            if idx is not None:
+                rt = round(f.getRT(), 3)
+                # rt_or = round(f.getMetaValue("original_RT"), 3) if f.metaValueExists("original_RT") else rt
+                all_update_idx.append(idx)
+                all_update_rt.append(rt)
+                # all_update_rt_original.append(rt_or)
+
+    # Single batch update for all features at once
+    if all_update_idx:
+        # Build a full-length Python list of rt values, update specified indices,
+        # then replace the DataFrame column with a Series that has the same length
+        try:
+            current_rt = study_obj.features_df["rt"].to_list()
+        except Exception:
+            current_rt = [None] * study_obj.features_df.height
+
+        # Defensive: ensure list length equals dataframe height
+        if len(current_rt) != study_obj.features_df.height:
+            current_rt = [None] * study_obj.features_df.height
+
+        for idx, new_rt in zip(all_update_idx, all_update_rt):
+            current_rt[idx] = new_rt
+
+        new_cols = [pl.Series("rt", current_rt)]
+
+        # Update rt_original if corresponding updates were collected
+        if "all_update_rt_original" in locals() and all_update_rt_original:
+            try:
+                current_rt_orig = (
+                    study_obj.features_df["rt_original"].to_list()
+                    if "rt_original" in study_obj.features_df.columns
+                    else [None] * study_obj.features_df.height
+                )
+            except Exception:
+                current_rt_orig = [None] * study_obj.features_df.height
+
+            if len(current_rt_orig) != study_obj.features_df.height:
+                current_rt_orig = [None] * study_obj.features_df.height
+
+            for idx, new_orig in zip(all_update_idx, all_update_rt_original):
+                current_rt_orig[idx] = new_orig
+
+            new_cols.append(pl.Series("rt_original", current_rt_orig))
+
+        # Replace columns in one call
+        study_obj.features_df = study_obj.features_df.with_columns(*new_cols)
+
+    # Clean up temporary feature maps to release memory
+    del fmaps
+    study_obj.logger.debug("Temporary feature maps deleted to release memory")

-
+
+def _align_kd_algorithm(study_obj, params):
    """
-    Custom KD-tree / reference-based alignment.
+    Custom KD-tree / reference-based alignment working directly with features_df.
    """
    import bisect
    import statistics
+    import pyopenms as oms
+    import polars as pl
+    from datetime import datetime

    # Pull parameter values - map standard align params to our algorithm
    # Use rt_tol (standard align param) instead of warp_rt_tol for RT tolerance
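The "single batch update" in the hunk above avoids per-row DataFrame writes: it reads the rt column out once, patches it positionally in plain Python, and swaps the whole column back in one with_columns call. A minimal sketch of that pattern on toy data:

import polars as pl

df = pl.DataFrame({"rt": [10.0, 20.0, 30.0, 40.0]})
update_idx = [1, 3]          # positional indices to patch
update_rt = [21.5, 39.8]     # new values

current_rt = df["rt"].to_list()          # one read of the full column
for i, v in zip(update_idx, update_rt):
    current_rt[i] = v                    # cheap in-place list updates
df = df.with_columns(pl.Series("rt", current_rt))  # one write back
print(df["rt"].to_list())  # [10.0, 21.5, 30.0, 39.8]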
@@ -919,26 +882,64 @@ def _align_kd_algorithm(study_obj, fmaps, params):
        _raw_mp = None
    max_points = int(_raw_mp) if _raw_mp is not None else 1000
    study_obj.logger.info(
-        f"
+        f"KD align: rt_tol={params.get('rt_tol')}, max_points={max_points}",
    )

-    #
-    if
-        study_obj.logger.error("No
-        raise ValueError("No
+    # Work directly with features_df instead of feature maps
+    if study_obj.features_df is None or study_obj.features_df.is_empty():
+        study_obj.logger.error("No features_df available for alignment. Cannot proceed with KD alignment.")
+        raise ValueError("No features_df available for alignment. This usually indicates that features were not detected properly.")
+
+    # OPTIMIZATION 1: Group all features by sample_uid in ONE operation instead of filtering repeatedly
+    study_obj.logger.debug("Grouping features efficiently (major speedup)...")
+
+    # rt_original should already exist (created in main align() function)
+    if "rt_original" not in study_obj.features_df.columns:
+        raise ValueError("rt_original column missing - this should have been created by align() function")
+
+    sample_groups = study_obj.features_df.group_by("sample_uid", maintain_order=True)
+    sample_feature_data = sample_groups.agg([
+        pl.len().alias("feature_count"),
+        pl.col("mz").alias("mzs"),
+        pl.col("rt_original").alias("rt_originals")  # Use original RT values for alignment
+    ]).sort("feature_count", descending=True)
+
+    if sample_feature_data.is_empty():
+        study_obj.logger.error("No features found in any sample for alignment.")
+        raise ValueError("No features found in any sample for alignment.")
+
+    # Choose reference sample (sample with most features)
+    ref_sample_uid = sample_feature_data.row(0, named=True)["sample_uid"]
+
+    # Find the index of this sample in samples_df
+    ref_index = None
+    sample_uid_to_index = {}
+    for idx, row_dict in enumerate(study_obj.samples_df.iter_rows(named=True)):
+        sample_uid = row_dict["sample_uid"]
+        sample_uid_to_index[sample_uid] = idx
+        if sample_uid == ref_sample_uid:
+            ref_index = idx
+
+    if ref_index is None:
+        study_obj.logger.error(f"Could not find reference sample {ref_sample_uid} in samples_df")
+        raise ValueError(f"Could not find reference sample {ref_sample_uid} in samples_df")

-    # Choose reference map (largest number of features)
-    ref_index = max(range(len(fmaps)), key=lambda i: fmaps[i].size())
-    ref_map = fmaps[ref_index]
    study_obj.alignment_ref_index = ref_index
-
-
-
-
-
-
+
+    # OPTIMIZATION 2: Get reference features efficiently from pre-grouped data
+    # Always use rt_original for alignment input to ensure consistent results
+    ref_row = sample_feature_data.filter(pl.col("sample_uid") == ref_sample_uid).row(0, named=True)
+    ref_mzs_list = ref_row["mzs"]
+    ref_rts_list = ref_row["rt_originals"]  # Use original RT values
+
+    # Create sorted reference features for binary search
+    ref_features = list(zip(ref_mzs_list, ref_rts_list))
    ref_features.sort(key=lambda x: x[0])
    ref_mzs = [mz for mz, _ in ref_features]
+
+    study_obj.logger.debug(
+        f"Reference sample UID {ref_sample_uid} (index {ref_index}, sample: {study_obj.samples_df.row(ref_index, named=True)['sample_name']}) has {len(ref_features)} features",
+    )

    def find_best_match(mz: float, rt: float):
        mz_tol_abs = mz * ppm_tol * 1e-6
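The group_by/agg step above collects each sample's m/z and original RT values into list columns in a single pass, so the reference sample (the one with the most features) can be picked without re-filtering features_df per sample. A toy sketch of the same Polars pattern:

import polars as pl

features_df = pl.DataFrame({
    "sample_uid": [1, 1, 2, 2, 2],
    "mz": [100.0, 200.0, 100.1, 200.1, 300.1],
    "rt_original": [10.0, 20.0, 10.2, 20.2, 30.2],
})

sample_feature_data = (
    features_df.group_by("sample_uid", maintain_order=True)
    .agg(
        pl.len().alias("feature_count"),
        pl.col("mz").alias("mzs"),
        pl.col("rt_original").alias("rt_originals"),
    )
    .sort("feature_count", descending=True)
)
ref_sample_uid = sample_feature_data.row(0, named=True)["sample_uid"]
print(ref_sample_uid)  # 2 (the sample with the most features)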
@@ -971,45 +972,59 @@ def _align_kd_algorithm(study_obj, fmaps, params):
        except Exception:
            pass

-
+    # OPTIMIZATION 3: Process samples using pre-grouped data (eliminates expensive filtering)
+    transformations = {}

-    for
+    for row in sample_feature_data.iter_rows(named=True):
+        sample_uid = row["sample_uid"]
+        sample_mzs = row["mzs"]
+        sample_rts = row["rt_originals"]  # Use original RT values for alignment input
+
        td = oms.TransformationDescription()
-
-
+        sample_index = sample_uid_to_index.get(sample_uid)
+
+        if sample_index is None:
+            study_obj.logger.warning(f"Sample UID {sample_uid} not found in samples_df, skipping")
+            continue
+
+        # Skip empty samples
+        if not sample_mzs or not sample_rts:
+            transformations[sample_uid] = td
            continue
-
-
-
+
+        # Identity for reference sample
+        if sample_uid == ref_sample_uid:
+            rts = [rt for rt in sample_rts if rt is not None]
            lo, hi = (min(rts), max(rts)) if rts else (0.0, 0.0)
            try:
                _set_pairs(td, [(lo, lo), (hi, hi)])
                td.fitModel("linear", oms.Param())
            except Exception:
                pass
-            transformations
+            transformations[sample_uid] = td
            continue

-        #
+        # OPTIMIZATION 4: Process pairs using pre-loaded data arrays (no DataFrame operations)
        pairs_raw = []
-        for
-
-
-
-
-
+        for mz, rt in zip(sample_mzs, sample_rts):
+            if mz is not None and rt is not None:
+                match = find_best_match(mz, rt)
+                if match:
+                    obs_rt, ref_rt = match
+                    if abs(obs_rt - ref_rt) <= rt_pair_tol:
+                        pairs_raw.append((obs_rt, ref_rt))

        if not pairs_raw:
            # Fallback identity
-            rts = [
+            rts = [rt for rt in sample_rts if rt is not None]
            lo, hi = (min(rts), max(rts)) if rts else (0.0, 0.0)
            try:
                _set_pairs(td, [(lo, lo), (hi, hi)])
                td.fitModel("linear", oms.Param())
            except Exception:
                pass
-            transformations
-            study_obj.logger.debug(f"
+            transformations[sample_uid] = td
+            study_obj.logger.debug(f"Sample {sample_uid}: no anchors -> identity transform")
            continue

        # Deduplicate and downsample
@@ -1041,9 +1056,9 @@ def _align_kd_algorithm(study_obj, fmaps, params):
            td.fitModel(model, oms.Param())
        except Exception as e:
            study_obj.logger.debug(
-                f"
+                f"Sample {sample_uid}: {model} fitting failed ({e}); fallback to linear two-point shift",
            )
-            rts = [
+            rts = [rt for rt in sample_rts if rt is not None]
            lo, hi = (min(rts), max(rts)) if rts else (0.0, 1.0)
            td = oms.TransformationDescription()
            try:
@@ -1053,28 +1068,39 @@ def _align_kd_algorithm(study_obj, fmaps, params):
                pass

        study_obj.logger.debug(
-            f"
+            f"Sample {sample_uid}: anchors raw={len(pairs_raw)} used={len(pairs_use)} model={model} median_shift={med_shift:.4f}s",
        )
-        transformations
+        transformations[sample_uid] = td

-    # Apply transformations
-
-
-
-
-
-
-
-
-
-
-
+    # OPTIMIZATION 5: Apply transformations efficiently using vectorized operations
+    study_obj.logger.debug("Applying RT transformations efficiently...")
+
+    # Apply transformations to RT values starting from rt_original
+    def transform_rt_vectorized(sample_uid: int, rt_original: float) -> float:
+        if sample_uid in transformations and rt_original is not None:
+            try:
+                trafo = transformations[sample_uid]
+                return trafo.apply(float(rt_original))
+            except Exception:
+                return rt_original
+        return rt_original
+
+    # Use Polars' efficient struct operations for vectorized transformation
+    # Apply transformation to rt_original and store result in rt column
+    study_obj.features_df = study_obj.features_df.with_columns(
+        pl.struct(["sample_uid", "rt_original"]).map_elements(
+            lambda x: transform_rt_vectorized(x["sample_uid"], x["rt_original"]),
+            return_dtype=pl.Float64
+        ).alias("rt")
+    )

    study_obj.logger.info(
-        f"Alignment completed. Reference index {ref_index}.",
+        f"Alignment completed. Reference sample UID {ref_sample_uid} (index {ref_index}).",
    )


+
+
 def _align_pose_clustering_fallback(study_obj, fmaps, params):
    """Fallback PoseClustering alignment with minimal parameters."""
    import pyopenms as oms
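Despite the "vectorized" label, the transformation above applies a Python callback row by row via map_elements over a struct of the two input columns; the gain over the old path is a single column rewrite instead of many filtered updates. A toy sketch of the struct/map_elements pattern, with a per-sample shift dict standing in for the fitted OpenMS transforms:

import polars as pl

df = pl.DataFrame({"sample_uid": [1, 1, 2], "rt_original": [10.0, 20.0, 10.0]})
shifts = {1: 0.5, 2: -0.3}  # illustrative stand-in for the fitted transformations

df = df.with_columns(
    pl.struct(["sample_uid", "rt_original"])
    .map_elements(
        lambda x: x["rt_original"] + shifts.get(x["sample_uid"], 0.0),
        return_dtype=pl.Float64,
    )
    .alias("rt")
)
print(df["rt"].to_list())  # [10.5, 20.5, 9.7]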
@@ -1107,7 +1133,7 @@ def _align_pose_clustering_fallback(study_obj, fmaps, params):
    study_obj.alignment_ref_index = ref_index


-def find_iso(self, rt_tol=0.1, mz_tol=0.01):
+def find_iso(self, rt_tol=0.1, mz_tol=0.01, uids=None):
    """
    Find isotope patterns for consensus features by searching raw MS1 data.
    OPTIMIZED VERSION: Each sample file is loaded only once for maximum efficiency.
@@ -1123,6 +1149,7 @@ def find_iso(self, rt_tol=0.1, mz_tol=0.01):
    Parameters:
        rt_tol (float): RT tolerance for scan matching in seconds
        mz_tol (float): Additional m/z tolerance for isotope matching in Da
+        uids (list, optional): List of consensus_uid values to process. If None, process all consensus features.
    """
    if self.consensus_df is None or self.consensus_df.is_empty():
        self.logger.error("No consensus features found. Please run merge() first.")
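A minimal call sketch for the new uids filter, continuing the hypothetical study object from the earlier sketch (UID values are illustrative; the diff shows that a scalar is wrapped into a list internally):

study.find_iso(rt_tol=0.1, mz_tol=0.01)  # all consensus features
study.find_iso(uids=[42, 57])            # only these consensus_uid values
study.find_iso(uids=7)                   # scalar is wrapped into [7] internally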
@@ -1148,6 +1175,20 @@ def find_iso(self, rt_tol=0.1, mz_tol=0.01):

    self.logger.info("Extracting isotopomers from raw MS1 data...")

+    # Filter consensus features if uids is specified
+    if uids is not None:
+        if not isinstance(uids, (list, tuple)):
+            uids = [uids]
+        # Filter consensus_df to only process specified UIDs
+        consensus_df_filtered = self.consensus_df.filter(pl.col("consensus_uid").is_in(uids))
+        if consensus_df_filtered.is_empty():
+            self.logger.warning(f"No consensus features found with specified UIDs: {uids}")
+            return
+        self.logger.debug(f"Processing {len(consensus_df_filtered)} consensus features (UIDs: {uids})")
+    else:
+        consensus_df_filtered = self.consensus_df
+        self.logger.debug(f"Processing all {len(consensus_df_filtered)} consensus features")
+
    # Isotope mass shifts to search for (up to 7x 13C isotopes)
    isotope_shifts = [
        0.33,
@@ -1169,7 +1210,13 @@ def find_iso(self, rt_tol=0.1, mz_tol=0.01):
    self.logger.debug("Building sample-to-consensus mapping using vectorized operations...")

    # Step 1: Join consensus_mapping with features to get intensities in one operation
-
+    # Apply UID filtering if specified
+    if uids is not None:
+        consensus_mapping_filtered = self.consensus_mapping_df.filter(pl.col("consensus_uid").is_in(uids))
+    else:
+        consensus_mapping_filtered = self.consensus_mapping_df
+
+    consensus_with_features = consensus_mapping_filtered.join(
        self.features_df.select(['feature_uid', 'sample_uid', 'inty', 'mz', 'rt', 'rt_original']),
        on=['feature_uid', 'sample_uid'],
        how='left'
@@ -1214,19 +1261,19 @@ def find_iso(self, rt_tol=0.1, mz_tol=0.01):

    # Initialize failed consensus features (those not in the mapping)
    processed_consensus_uids = set(best_features_with_paths['consensus_uid'].to_list())
-    for consensus_row in
+    for consensus_row in consensus_df_filtered.iter_rows(named=True):
        consensus_uid = consensus_row["consensus_uid"]
        if consensus_uid not in processed_consensus_uids:
            consensus_iso_data[consensus_uid] = None

-    self.logger.debug(f"Will read {len(sample_to_consensus)} unique sample files for {len(
+    self.logger.debug(f"Will read {len(sample_to_consensus)} unique sample files for {len(consensus_df_filtered)} consensus features")

    tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]

    # OPTIMIZATION 2: Process by sample file (load each file only once)
    for sample_path, consensus_list in tqdm(
        sample_to_consensus.items(),
-        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}
+        desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Process from files",
        disable=tdqm_disable,
    ):
        try:
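The loop above iterates a sample_to_consensus mapping so each raw file is opened once, however many consensus features it contributes to. The diff does not show how that mapping is built; one plausible construction of such a path-to-items mapping, purely as an assumed sketch:

from collections import defaultdict

# Toy stand-ins for (sample file, consensus feature) pairs
pairs = [("a.mzML", 1), ("a.mzML", 2), ("b.mzML", 3)]

sample_to_consensus = defaultdict(list)
for path, consensus_uid in pairs:
    sample_to_consensus[path].append(consensus_uid)

for path, consensus_list in sample_to_consensus.items():
    # open `path` once, then handle every consensus feature that needs it
    print(path, consensus_list)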
@@ -1245,6 +1292,16 @@ def find_iso(self, rt_tol=0.1, mz_tol=0.01):
            base_mz = best_feature["mz"]
            original_rt = best_feature.get("rt_original", best_feature["rt"])

+            # Skip if RT or mz is None or invalid
+            if original_rt is None:
+                original_rt = best_feature["rt"]
+                self.logger.debug(f"original_rt is None. Using aligned rt instead")
+
+            if base_mz is None:
+                self.logger.warning(f"Skipping consensus_uid {consensus_uid}: base_mz is None")
+                consensus_iso_data[consensus_uid] = None
+                continue
+
            # Find MS1 scans near the original RT
            rt_min = original_rt - rt_tol
            rt_max = original_rt + rt_tol