masster-0.5.22-py3-none-any.whl → masster-0.5.24-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- masster/_version.py +1 -1
- masster/logger.py +35 -19
- masster/sample/adducts.py +15 -29
- masster/sample/defaults/find_adducts_def.py +1 -3
- masster/sample/defaults/sample_def.py +4 -4
- masster/sample/h5.py +203 -361
- masster/sample/helpers.py +14 -30
- masster/sample/lib.py +3 -3
- masster/sample/load.py +21 -29
- masster/sample/plot.py +222 -132
- masster/sample/processing.py +42 -55
- masster/sample/sample.py +37 -46
- masster/sample/save.py +37 -61
- masster/sample/sciex.py +13 -11
- masster/sample/thermo.py +69 -74
- masster/spectrum.py +15 -15
- masster/study/analysis.py +650 -586
- masster/study/defaults/identify_def.py +1 -3
- masster/study/defaults/merge_def.py +6 -7
- masster/study/defaults/study_def.py +1 -5
- masster/study/export.py +35 -96
- masster/study/h5.py +134 -211
- masster/study/helpers.py +385 -459
- masster/study/id.py +239 -290
- masster/study/importers.py +84 -93
- masster/study/load.py +159 -178
- masster/study/merge.py +1112 -1098
- masster/study/plot.py +195 -149
- masster/study/processing.py +144 -191
- masster/study/save.py +14 -13
- masster/study/study.py +89 -130
- masster/wizard/wizard.py +764 -714
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/METADATA +27 -1
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/RECORD +37 -37
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/WHEEL +0 -0
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/entry_points.txt +0 -0
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/licenses/LICENSE +0 -0
masster/study/load.py
CHANGED
@@ -34,13 +34,7 @@ except ImportError:
 import glob
 
 
-def add(
-    self,
-    folder=None,
-    reset=False,
-    adducts=None,
-    max_files=None
-):
+def add(self, folder=None, reset=False, adducts=None, max_files=None):
     """Add samples from a folder to the study.
 
     Args:
@@ -91,9 +85,7 @@ def add(
 
         if len(files) > 0:
             # Limit files if max_files is specified
-            remaining_slots = (
-                max_files - counter if max_files is not None else len(files)
-            )
+            remaining_slots = max_files - counter if max_files is not None else len(files)
             files = files[:remaining_slots]
 
             self.logger.debug(f"Found {len(files)} {ext} files")
@@ -119,11 +111,8 @@ def add(
             self.logger.debug(
                 f"Batch processing {len(files_to_process)} {ext} files",
             )
-            successful = _add_samples_batch(
-                files_to_process,
-                reset=reset,
-                adducts=adducts,
-                blacklist=blacklist
+            successful = _add_samples_batch(
+                self, files_to_process, reset=reset, adducts=adducts, blacklist=blacklist
             )
             counter += successful
             if successful > 0:
@@ -171,7 +160,6 @@ def add_sample(self, file, type=None, reset=False, adducts=None):
         skip_schema_check=True,  # Skip schema check for performance (safe with diagonal concat)
     )
 
-
     return success
 
 
@@ -200,19 +188,20 @@ def load(self, filename=None):
 
     # self.logger.info(f"Loading study from {filename}")
     from masster.study.h5 import _load_study5
+
     _load_study5(self, filename)
-
+
     # After loading the study, check if we have consensus features before loading consensus XML
-    #if (self.consensus_df is not None and not self.consensus_df.is_empty()):
+    # if (self.consensus_df is not None and not self.consensus_df.is_empty()):
     #     consensus_xml_path = filename.replace(".study5", ".consensusXML")
     #     if os.path.exists(consensus_xml_path):
     #         self._load_consensusXML(filename=consensus_xml_path)
-
+    #         self.logger.info(f"Automatically loaded consensus from {consensus_xml_path}")
     #     else:
     #         self.logger.warning(f"No consensus XML file found at {consensus_xml_path}")
-    #else:
+    # else:
     #     self.logger.debug("No consensus features found, skipping consensusXML loading")
-
+
     self.filename = filename
 
 
@@ -250,25 +239,24 @@ def _fill_chrom_single_impl(
     if isinstance(min_samples_abs, int) and min_samples_abs >= 0:
         min_number_abs = int(min_samples_abs) if min_samples_abs > 0 else 0
         min_number = max(min_number_rel, min_number_abs)
-
+
     # Special case: if min_samples_abs is explicitly 0, allow 0-sample features (like library features)
     if isinstance(min_samples_abs, int) and min_samples_abs == 0:
         min_number = 0
-
+
     self.logger.debug(f"Threshold for gap filling: number_samples>={min_number}")
 
     if min_number > 0:
         original_count = len(uids)
         uids = self.consensus_df.filter(
-            (pl.col("number_samples") >= min_number)
-            & (pl.col("consensus_uid").is_in(uids)),
+            (pl.col("number_samples") >= min_number) & (pl.col("consensus_uid").is_in(uids)),
        )["consensus_uid"].to_list()
         self.logger.debug(
             f"Features to fill: {original_count} -> {len(uids)}",
         )
     self.logger.debug("Identifying missing features...")
     # Instead of building full chromatogram matrix, identify missing consensus/sample combinations directly
-    missing_combinations = _get_missing_consensus_sample_combinations(self,uids)
+    missing_combinations = _get_missing_consensus_sample_combinations(self, uids)
     if not missing_combinations:
         self.logger.info("No missing features found to fill.")
         return
@@ -335,12 +323,12 @@ def _fill_chrom_single_impl(
             if ms1_data is None or ms1_data.is_empty():
                 self.logger.warning(f"No MS1 data found for sample {sample_name}")
                 continue
-
+
             # Create a temporary object to hold the MS1 data for processing
             class TempSample:
                 def __init__(self, ms1_df):
                     self.ms1_df = ms1_df
-
+
             file = TempSample(ms1_data)
         except Exception as e:
             self.logger.warning(f"Failed to load sample {sample_name}: {e}")
@@ -363,31 +351,25 @@ def _fill_chrom_single_impl(
             # Special handling for RT=0 (library-derived features)
             if rt == 0.0:
                 # Step 1: Retrieve full chromatogram for the m/z
-                d_full = file.ms1_df.filter(
-                    (pl.col("mz") >= mz - mz_tol)
-                    & (pl.col("mz") <= mz + mz_tol)
-                )
-
+                d_full = file.ms1_df.filter((pl.col("mz") >= mz - mz_tol) & (pl.col("mz") <= mz + mz_tol))
+
                 if not d_full.is_empty():
                     # Step 2: Find maximum intensity and its RT
-                    max_inty_row = d_full.filter(
-                        pl.col("inty") == d_full["inty"].max()
-                    ).head(1)
-
+                    max_inty_row = d_full.filter(pl.col("inty") == d_full["inty"].max()).head(1)
+
                     if not max_inty_row.is_empty():
                         max_rt = max_inty_row["rt"].item()
-
+
                         # Get eic_rt_tol from sample parameters if available
                         eic_rt_tol = rt_tol  # Default fallback
-                        if hasattr(file, 'parameters') and hasattr(file.parameters, 'eic_rt_tol'):
+                        if hasattr(file, "parameters") and hasattr(file.parameters, "eic_rt_tol"):
                             eic_rt_tol = file.parameters.eic_rt_tol
-
+
                         # Step 3: Trim around max intensity using eic_rt_tol
                         d = d_full.filter(
-                            (pl.col("rt") >= max_rt - eic_rt_tol)
-                            & (pl.col("rt") <= max_rt + eic_rt_tol)
+                            (pl.col("rt") >= max_rt - eic_rt_tol) & (pl.col("rt") <= max_rt + eic_rt_tol)
                         )
-
+
                         # Update consensus RT info based on discovered peak
                         rt = max_rt
                         rt_start_mean = max_rt - eic_rt_tol
@@ -529,10 +511,7 @@ def _fill_chrom_single_impl(
     for row in rows_to_add:
         # Cast numeric columns to ensure consistency
        for key, value in row.items():
-            if (
-                key in ["mz", "rt", "intensity", "area", "height"]
-                and value is not None
-            ):
+            if key in ["mz", "rt", "intensity", "area", "height"] and value is not None:
                 row[key] = float(value)
             elif key in ["sample_id", "feature_id"] and value is not None:
                 row[key] = int(value)
@@ -618,67 +597,64 @@ def _build_rt_correction_mapping_per_sample(self, sample_uid):
     """
     Pre-compute RT correction mapping for a sample by getting all non-filled features.
     This avoids repeated DataFrame filtering for each feature.
-
+
     Args:
         sample_uid: Sample UID to build mapping for
-
+
     Returns:
         Polars DataFrame with rt, rt_original, and rt_delta columns, sorted by rt
         Returns empty DataFrame if no reference features found
     """
     # Get non-filled features from the same sample
-    if 'filled' in self.features_df.columns:
+    if "filled" in self.features_df.columns:
         sample_features = self.features_df.filter(
-            (pl.col('sample_uid') == sample_uid) &
-            (pl.col('filled') == False) &
-            (pl.col('rt_original').is_not_null()) &
-            (pl.col('rt').is_not_null())
+            (pl.col("sample_uid") == sample_uid)
+            & (pl.col("filled") == False)
+            & (pl.col("rt_original").is_not_null())
+            & (pl.col("rt").is_not_null())
         )
     else:
         # If no filled column, assume all existing features are non-filled
         sample_features = self.features_df.filter(
-            (pl.col('sample_uid') == sample_uid) &
-            (pl.col('rt_original').is_not_null()) &
-            (pl.col('rt').is_not_null())
+            (pl.col("sample_uid") == sample_uid) & (pl.col("rt_original").is_not_null()) & (pl.col("rt").is_not_null())
         )
-
+
     if sample_features.is_empty():
-        return pl.DataFrame(schema={
-            'rt': pl.Float64, 'rt_original': pl.Float64, 'rt_delta': pl.Float64})
+        return pl.DataFrame(schema={"rt": pl.Float64, "rt_original": pl.Float64, "rt_delta": pl.Float64})
+
     # Pre-compute RT deltas and sort by RT for efficient lookup
     rt_mapping = sample_features.select([
-        pl.col('rt'),
-        pl.col('rt_original'),
-        (pl.col('rt') - pl.col('rt_original')).alias('rt_delta'),
-    ]).sort('rt')
-
+        pl.col("rt"),
+        pl.col("rt_original"),
+        (pl.col("rt") - pl.col("rt_original")).alias("rt_delta"),
+    ]).sort("rt")
+
     return rt_mapping
 
+
 def _estimate_rt_original_from_mapping(self, rt_mapping, target_rt):
     """
     Fast RT original estimation using pre-computed mapping.
-
+
     Args:
         rt_mapping: Pre-computed RT mapping DataFrame from _build_rt_correction_mapping_per_sample
         target_rt: Target aligned RT for the filled feature
-
+
     Returns:
         Estimated rt_original value, or None if no mapping available
     """
     if rt_mapping.is_empty():
         return None
-
+
     # Find closest RT using vectorized operations
-    rt_mapping_with_diff = rt_mapping.with_columns([
-        (pl.col('rt') - target_rt).abs().alias('rt_diff')
-    ])
-
+    rt_mapping_with_diff = rt_mapping.with_columns([(pl.col("rt") - target_rt).abs().alias("rt_diff")])
+
     # Get the RT delta from the closest feature
-    closest_row = rt_mapping_with_diff.sort('rt_diff').head(1)
+    closest_row = rt_mapping_with_diff.sort("rt_diff").head(1)
     if closest_row.is_empty():
         return None
-
-    closest_rt_delta = closest_row['rt_delta'].item()
+
+    closest_rt_delta = closest_row["rt_delta"].item()
     return target_rt - closest_rt_delta
 
 
@@ -686,59 +662,59 @@ def _estimate_rt_original_for_filled_feature(self, sample_uid, target_rt, logger=None):
     """
     Estimate rt_original for a filled feature by finding the closest non-filled feature
     from the same sample and using its RT delta (rt - rt_original).
-
+
     Args:
         sample_uid: Sample UID to search within
         target_rt: Target aligned RT for the filled feature
         logger: Optional logger for debug messages
-
+
     Returns:
         Estimated rt_original value, or None if no suitable reference found
     """
     # Get non-filled features from the same sample
-    if 'filled' in self.features_df.columns:
+    if "filled" in self.features_df.columns:
         sample_features = self.features_df.filter(
-            (pl.col('sample_uid') == sample_uid) &
-            (pl.col('filled') == False) &
-            (pl.col('rt_original').is_not_null()) &
-            (pl.col('rt').is_not_null())
+            (pl.col("sample_uid") == sample_uid)
+            & (pl.col("filled") == False)
+            & (pl.col("rt_original").is_not_null())
+            & (pl.col("rt").is_not_null())
         )
     else:
         # If no filled column, assume all existing features are non-filled
         sample_features = self.features_df.filter(
-            (pl.col('sample_uid') == sample_uid) &
-            (pl.col('rt_original').is_not_null()) &
-            (pl.col('rt').is_not_null())
+            (pl.col("sample_uid") == sample_uid) & (pl.col("rt_original").is_not_null()) & (pl.col("rt").is_not_null())
         )
-
+
     if sample_features.is_empty():
         if logger:
             logger.debug(f"No reference features found for sample {sample_uid} to estimate rt_original")
         return None
-
+
     # Calculate RT differences and find the closest feature
     sample_features_with_diff = sample_features.with_columns([
-        (pl.col('rt') - target_rt).abs().alias('rt_diff'),
-        (pl.col('rt') - pl.col('rt_original')).alias('rt_delta'),
+        (pl.col("rt") - target_rt).abs().alias("rt_diff"),
+        (pl.col("rt") - pl.col("rt_original")).alias("rt_delta"),
     ])
-
+
     # Find the feature with minimum RT difference
-    closest_feature = sample_features_with_diff.sort(
-        'rt_diff').head(1)
+    closest_feature = sample_features_with_diff.sort("rt_diff").head(1)
+
     if closest_feature.is_empty():
         return None
-
+
     # Get the RT delta from the closest feature
-    closest_rt_diff = closest_feature['rt_diff'].item()
-    closest_rt_delta = closest_feature['rt_delta'].item()
-
+    closest_rt_diff = closest_feature["rt_diff"].item()
+    closest_rt_delta = closest_feature["rt_delta"].item()
+
     # Estimate rt_original using the same delta: rt_original = rt - rt_delta
     estimated_rt_original = target_rt - closest_rt_delta
-
+
     if self.logger:
-        self.logger.debug(
-            f"Estimated rt_original={estimated_rt_original:.3f} for sample {sample_uid}, rt={target_rt:.3f} "
-            f"using closest feature (rt_diff={closest_rt_diff:.3f}, rt_delta={closest_rt_delta:.3f})")
+        self.logger.debug(
+            f"Estimated rt_original={estimated_rt_original:.3f} for sample {sample_uid}, rt={target_rt:.3f} "
+            f"using closest feature (rt_diff={closest_rt_diff:.3f}, rt_delta={closest_rt_delta:.3f})"
+        )
+
     return estimated_rt_original
 
 
@@ -763,7 +739,7 @@ def _process_sample_for_parallel_fill(
     # Get missing features for this sample from precomputed combinations
     sample_missing_df = missing_combinations_df.filter(pl.col("sample_uid") == sample_uid)
     sample_consensus_uids = sample_missing_df["consensus_uid"].to_list()
-
+
     if not sample_consensus_uids:
         return new_features, new_mapping, counter
 
@@ -787,13 +763,15 @@ def _process_sample_for_parallel_fill(
                     feature_end=info["rt_end_mean"],
                     feature_apex=info["rt"],
                 )
-
+
                 new_feature = {
                     "uid": features_df_max_uid + counter,
                     "sample_uid": sample_uid,
                     "mz": info["mz"],
                     "rt": info["rt"],
-                    "rt_original": 0.0 if info["rt"] == 0.0 else _estimate_rt_original_from_mapping(self, rt_mapping, info["rt"]),
+                    "rt_original": 0.0
+                    if info["rt"] == 0.0
+                    else _estimate_rt_original_from_mapping(self, rt_mapping, info["rt"]),
                     "mz_centroid": None,
                     "rt_centroid": None,
                     "iso": None,
@@ -811,7 +789,7 @@ def _process_sample_for_parallel_fill(
                     "ms2_scans": None,
                     "ms2_specs": None,
                 }
-
+
                 new_features.append(new_feature)
                 new_mapping.append({
                     "consensus_uid": consensus_uid,
@@ -820,7 +798,7 @@ def _process_sample_for_parallel_fill(
                 })
                 counter += 1
             return new_features, new_mapping, counter
-
+
         except Exception as e:
             # If MS1 loading fails, create empty features
             self.logger.debug(f"Failed to load MS1 data from {sample_path}: {e}")
@@ -836,13 +814,15 @@ def _process_sample_for_parallel_fill(
                     feature_end=info["rt_end_mean"],
                     feature_apex=info["rt"],
                 )
-
+
                 new_feature = {
                     "uid": features_df_max_uid + counter,
                     "sample_uid": sample_uid,
                     "mz": info["mz"],
                     "rt": info["rt"],
-                    "rt_original": 0.0 if info["rt"] == 0.0 else _estimate_rt_original_from_mapping(self, rt_mapping, info["rt"]),
+                    "rt_original": 0.0
+                    if info["rt"] == 0.0
+                    else _estimate_rt_original_from_mapping(self, rt_mapping, info["rt"]),
                     "mz_centroid": None,
                     "rt_centroid": None,
                     "iso": None,
@@ -860,7 +840,7 @@ def _process_sample_for_parallel_fill(
                     "ms2_scans": None,
                     "ms2_specs": None,
                 }
-
+
                 new_features.append(new_feature)
                 new_mapping.append({
                     "consensus_uid": consensus_uid,
@@ -874,12 +854,10 @@ def _process_sample_for_parallel_fill(
         all_mzs = [consensus_info[uid]["mz"] for uid in sample_consensus_uids]
         mz_min = min(all_mzs) - mz_tol
         mz_max = max(all_mzs) + mz_tol
-
+
         # Pre-filter by broad m/z range
-        ms1_filtered = ms1_data.filter(
-            (pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max)
-        )
-
+        ms1_filtered = ms1_data.filter((pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max))
+
         # Early exit if no data in m/z range
         if ms1_filtered.is_empty():
             for i, consensus_uid in enumerate(sample_consensus_uids):
@@ -894,13 +872,15 @@ def _process_sample_for_parallel_fill(
                     feature_end=info["rt_end_mean"],
                     feature_apex=info["rt"],
                 )
-
+
                 new_feature = {
                     "uid": features_df_max_uid + counter,
                     "sample_uid": sample_uid,
                     "mz": info["mz"],
                     "rt": info["rt"],
-                    "rt_original": 0.0 if info["rt"] == 0.0 else _estimate_rt_original_from_mapping(self, rt_mapping, info["rt"]),
+                    "rt_original": 0.0
+                    if info["rt"] == 0.0
+                    else _estimate_rt_original_from_mapping(self, rt_mapping, info["rt"]),
                     "mz_centroid": None,
                     "rt_centroid": None,
                     "iso": None,
@@ -918,7 +898,7 @@ def _process_sample_for_parallel_fill(
                     "ms2_scans": None,
                     "ms2_specs": None,
                 }
-
+
                 new_features.append(new_feature)
                 new_mapping.append({
                     "consensus_uid": consensus_uid,
@@ -932,7 +912,7 @@ def _process_sample_for_parallel_fill(
         for consensus_uid in sample_consensus_uids:
            info = consensus_info[consensus_uid]
             mz, rt = info["mz"], info["rt"]
-
+
             try:
                 if rt == 0.0:
                     # Handle RT=0 features - create empty chromatogram
@@ -951,10 +931,12 @@ def _process_sample_for_parallel_fill(
                 else:
                     # Extract real chromatogram using pre-filtered MS1 data
                     d = ms1_filtered.filter(
-                        (pl.col("mz") >= mz - mz_tol)
-                        & (pl.col("mz") <= mz + mz_tol) & (pl.col("rt") >= rt - rt_tol) & (pl.col("rt") <= rt + rt_tol)
+                        (pl.col("mz") >= mz - mz_tol)
+                        & (pl.col("mz") <= mz + mz_tol)
+                        & (pl.col("rt") >= rt - rt_tol)
+                        & (pl.col("rt") <= rt + rt_tol)
                     )
-
+
                     # Create chromatogram from filtered data
                     if d.is_empty():
                         # No MS1 data found - create empty chromatogram
@@ -972,7 +954,7 @@ def _process_sample_for_parallel_fill(
                     else:
                         # Aggregate intensities per retention time (get max inty per RT)
                         eic_rt = d.group_by("rt").agg(pl.col("inty").max()).sort("rt")
-
+
                         # Create chromatogram with real data and find peaks
                         eic = Chromatogram(
                             eic_rt["rt"].to_numpy(),
@@ -984,15 +966,19 @@ def _process_sample_for_parallel_fill(
                             feature_end=info["rt_end_mean"],
                             feature_apex=rt,
                         ).find_peaks()
-                        best_peak = self._find_best_peak_in_eic(eic, rt, rt_tol) if hasattr(self, "_find_best_peak_in_eic") else None
-
+                        best_peak = (
+                            self._find_best_peak_in_eic(eic, rt, rt_tol)
+                            if hasattr(self, "_find_best_peak_in_eic")
+                            else None
+                        )
+
                         # Create feature with optimized RT original estimation
                         rt_original_estimated = None
                         if rt == 0.0:
                             rt_original_estimated = 0.0  # RT=0 features
                         else:
                             rt_original_estimated = _estimate_rt_original_from_mapping(self, rt_mapping, rt)
-
+
                         new_feature = {
                             "uid": features_df_max_uid + counter,
                             "sample_uid": sample_uid,
@@ -1016,7 +1002,7 @@ def _process_sample_for_parallel_fill(
                             "ms2_scans": None,
                             "ms2_specs": None,
                         }
-
+
                         new_features.append(new_feature)
                         new_mapping.append({
                             "consensus_uid": consensus_uid,
@@ -1024,7 +1010,7 @@ def _process_sample_for_parallel_fill(
                             "feature_uid": features_df_max_uid + counter,
                         })
                         counter += 1
-
+
             except Exception as e:
                 # Skip this feature if extraction fails but log the error
                 self.logger.debug(f"Failed to extract feature {consensus_uid} from {sample_path}: {e}")
@@ -1032,6 +1018,7 @@ def _process_sample_for_parallel_fill(
 
     return new_features, new_mapping, counter
 
+
 def _fill_chrom_impl(
     self,
     uids=None,
@@ -1076,8 +1063,7 @@ def _fill_chrom_impl(
     if min_number > 0:
         original_count = len(uids)
         uids = self.consensus_df.filter(
-            (pl.col("number_samples") >= min_number)
-            & (pl.col("consensus_uid").is_in(uids)),
+            (pl.col("number_samples") >= min_number) & (pl.col("consensus_uid").is_in(uids)),
         )["consensus_uid"].to_list()
         self.logger.debug(f"Features to fill: {original_count} -> {len(uids)}")
 
@@ -1145,9 +1131,7 @@ def _fill_chrom_impl(
     )
 
     # Calculate current max feature_uid to avoid conflicts
-    features_df_max_uid = (
-        self.features_df["feature_uid"].max() if not self.features_df.is_empty() else 0
-    )
+    features_df_max_uid = self.features_df["feature_uid"].max() if not self.features_df.is_empty() else 0
 
     # Process samples in parallel
     all_new_features: list[dict] = []
@@ -1161,7 +1145,8 @@ def _fill_chrom_impl(
         future_to_sample = {}
         for sample_info in samples_to_process:
             future = executor.submit(
-                _process_sample_for_parallel_fill,
+                _process_sample_for_parallel_fill,
+                self,
                 sample_info,
                 consensus_info,
                 uids,
@@ -1223,10 +1208,7 @@ def _fill_chrom_impl(
     for row in rows_to_add:
         # Cast numeric columns to ensure consistency
        for key, value in row.items():
-            if (
-                key in ["mz", "rt", "intensity", "area", "height"]
-                and value is not None
-            ):
+            if key in ["mz", "rt", "intensity", "area", "height"] and value is not None:
                 row[key] = float(value)
             elif key in ["sample_id", "feature_id"] and value is not None:
                 row[key] = int(value)
@@ -1254,8 +1236,8 @@ def _fill_chrom_impl(
 
     # Log statistics about rt_original estimation
     if all_new_features:
-        estimated_count = sum(1 for feature in all_new_features if feature.get('rt_original') is not None)
-        none_count = sum(1 for feature in all_new_features if feature.get('rt_original') is None)
+        estimated_count = sum(1 for feature in all_new_features if feature.get("rt_original") is not None)
+        none_count = sum(1 for feature in all_new_features if feature.get("rt_original") is None)
         self.logger.debug(f"Features with estimated rt_original: {estimated_count}")
         self.logger.debug(f"Features with None rt_original: {none_count}")
 
@@ -1288,7 +1270,7 @@ def fill(self, **kwargs):
         kwargs["threads"] = kwargs.pop("workers")
         self.logger.debug("Converted 'workers' parameter to 'threads' for backward compatibility")
     if "num_workers" in kwargs:
-        kwargs["threads"] = kwargs.pop("num_workers")
+        kwargs["threads"] = kwargs.pop("num_workers")
         self.logger.debug("Converted 'num_workers' parameter to 'threads' for backward compatibility")
 
     for key, value in kwargs.items():
@@ -1347,24 +1329,24 @@ def _get_missing_consensus_sample_combinations(self, uids):
         self.consensus_mapping_df.filter(pl.col("consensus_uid").is_in(uids))["consensus_uid"].to_list()
     )
     unmapped_consensus_uids = uids_set - mapped_consensus_uids
-
+
     # Get all sample info once for efficiency
     all_samples = list(
         self.samples_df.select(
             ["sample_uid", "sample_name", "sample_path", "sample_source"],
        ).iter_rows(),
     )
-
+
     missing_combinations = []
-
+
     # For unmapped consensus features (e.g., RT=0), ALL samples are missing
     if unmapped_consensus_uids:
-        self.logger.debug(f"Found {len(unmapped_consensus_uids)} consensus features with no mappings (e.g., RT=0 library features)")
+        self.logger.debug(
+            f"Found {len(unmapped_consensus_uids)} consensus features with no mappings (e.g., RT=0 library features)"
+        )
         for consensus_uid in unmapped_consensus_uids:
             for sample_uid, sample_name, sample_path, sample_source in all_samples:
-                missing_combinations.append(
-                    (consensus_uid, sample_uid, sample_name, sample_path, sample_source)
-                )
+                missing_combinations.append((consensus_uid, sample_uid, sample_name, sample_path, sample_source))
 
     # If all consensus features are unmapped, return early
     if len(mapped_consensus_uids) == 0:
@@ -1372,7 +1354,7 @@ def _get_missing_consensus_sample_combinations(self, uids):
 
     # Continue with existing logic for mapped consensus features
     mapped_uids_list = list(mapped_consensus_uids)
-
+
     # Quick early termination check for fully/nearly filled studies
     # This handles the common case where fill() is run on an already-filled study
     consensus_counts = (
@@ -1381,9 +1363,7 @@ def _get_missing_consensus_sample_combinations(self, uids):
         .agg(pl.count("feature_uid").alias("count"))
     )
 
-    total_existing = (
-        consensus_counts["count"].sum() if not consensus_counts.is_empty() else 0
-    )
+    total_existing = consensus_counts["count"].sum() if not consensus_counts.is_empty() else 0
 
     # Calculate total possible for mapped features only
     mapped_total_possible = len(mapped_uids_list) * n_samples
@@ -1451,9 +1431,7 @@ def _get_missing_consensus_sample_combinations(self, uids):
     for consensus_uid in mapped_uids_list:
         for sample_uid, sample_name, sample_path, sample_source in all_samples:
             if (consensus_uid, sample_uid) not in existing_combinations:
-                missing_combinations.append(
-                    (consensus_uid, sample_uid, sample_name, sample_path, sample_source)
-                )
+                missing_combinations.append((consensus_uid, sample_uid, sample_name, sample_path, sample_source))
 
     return missing_combinations
 
@@ -1551,13 +1529,8 @@ def _sanitize(self):
     except Exception as e:
         self.logger.error(f"Failed to recreate sanitized DataFrame: {e}")
 
-
-def _add_samples_batch(
-    files,
-    reset=False,
-    adducts=None,
-    blacklist=None
-):
+
+def _add_samples_batch(self, files, reset=False, adducts=None, blacklist=None):
     """
     Optimized batch addition of samples.
 
@@ -1599,7 +1572,8 @@ def _add_samples_batch(
     ):
         try:
             # Choose between optimized and standard loading
-            success = _add_sample_noms1(
+            success = _add_sample_noms1(
+                self,
                 file,
                 reset=reset,
                 adducts=adducts,
@@ -1695,20 +1669,33 @@ def _add_sample_noms1(
         return False
 
     # Check polarity compatibility
-    sample_polarity = getattr(ddaobj, 'polarity', None)
-    study_polarity = getattr(self, 'polarity', None)
-
+    sample_polarity = getattr(ddaobj, "polarity", None)
+    study_polarity = getattr(self, "polarity", None)
+
     if sample_polarity is not None and study_polarity is not None:
         # Normalize polarity names for comparison
-        sample_pol_norm = 'positive' if sample_polarity in ['pos', 'positive'] else 'negative' if sample_polarity in ['neg', 'negative'] else sample_polarity
-        study_pol_norm = 'positive' if study_polarity in ['pos', 'positive'] else 'negative' if study_polarity in ['neg', 'negative'] else study_polarity
-
+        sample_pol_norm = (
+            "positive"
+            if sample_polarity in ["pos", "positive"]
+            else "negative"
+            if sample_polarity in ["neg", "negative"]
+            else sample_polarity
+        )
+        study_pol_norm = (
+            "positive"
+            if study_polarity in ["pos", "positive"]
+            else "negative"
+            if study_polarity in ["neg", "negative"]
+            else study_polarity
+        )
+
         if sample_pol_norm != study_pol_norm:
-            self.logger.warning(f"Sample {sample_name} polarity ({sample_polarity}) differs from study polarity ({study_polarity}). Skipping sample.")
+            self.logger.warning(
+                f"Sample {sample_name} polarity ({sample_polarity}) differs from study polarity ({study_polarity}). Skipping sample."
+            )
             return False
 
-
-    #self.features_maps.append(ddaobj._oms_features_map)
+    # self.features_maps.append(ddaobj._oms_features_map)
 
     # Determine sample type
     sample_type = "sample" if type is None else type
@@ -1735,14 +1722,8 @@ def _add_sample_noms1(
 
     # Efficient scan counting
     ms1_count = ms2_count = 0
-    if (
-        hasattr(ddaobj, "scans_df")
-        and ddaobj.scans_df is not None
-        and not ddaobj.scans_df.is_empty()
-    ):
-        scan_counts = (
-            ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
-        )
+    if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
+        scan_counts = ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
         ms_levels = scan_counts.get("ms_level", [])
         counts = scan_counts.get("len", [])
        for level, count in zip(ms_levels, counts):
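The main algorithmic change in this file is the gap-filling RT correction: `_build_rt_correction_mapping_per_sample` precomputes, per sample, the drift `rt_delta = rt - rt_original` of every non-filled feature, and `_estimate_rt_original_from_mapping` assigns a filled feature the drift of its nearest-RT neighbour. The following is a minimal, self-contained sketch of that lookup; the toy feature table and the standalone `estimate_rt_original` helper are invented for illustration and are not part of the masster API.

```python
import polars as pl

# Toy non-filled features: aligned rt vs. raw rt_original (invented values).
features = pl.DataFrame({
    "rt":          [60.0, 120.0, 180.0],
    "rt_original": [58.5, 119.0, 181.2],
})

# Mapping as in _build_rt_correction_mapping_per_sample: rt, rt_original,
# and the per-feature drift rt_delta = rt - rt_original, sorted by rt.
rt_mapping = features.select([
    pl.col("rt"),
    pl.col("rt_original"),
    (pl.col("rt") - pl.col("rt_original")).alias("rt_delta"),
]).sort("rt")

def estimate_rt_original(rt_mapping: pl.DataFrame, target_rt: float):
    """Nearest-neighbour lookup mirroring _estimate_rt_original_from_mapping."""
    if rt_mapping.is_empty():
        return None
    closest = (
        rt_mapping
        .with_columns((pl.col("rt") - target_rt).abs().alias("rt_diff"))
        .sort("rt_diff")
        .head(1)
    )
    # Apply the neighbour's drift to the target: rt_original = rt - rt_delta
    return target_rt - closest["rt_delta"].item()

print(estimate_rt_original(rt_mapping, 125.0))  # 124.0: drift of the 120 s neighbour
```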
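`_get_missing_consensus_sample_combinations` decides what to fill with set arithmetic over `(consensus_uid, sample_uid)` pairs instead of materializing a full chromatogram matrix: pairs present in `consensus_mapping_df` already have a feature, every other pair is missing, and consensus features with no mappings at all (e.g., RT=0 library features) are missing in every sample. A hedged sketch of that logic with invented toy data:

```python
import polars as pl

consensus_uids = [1, 2, 3]            # uids selected for gap filling
samples = [(10, "A"), (20, "B")]      # (sample_uid, sample_name)
consensus_mapping_df = pl.DataFrame({
    "consensus_uid": [1, 1, 2],
    "sample_uid":    [10, 20, 10],
    "feature_uid":   [100, 101, 102],
})

# Pairs that already have a detected feature.
existing = set(
    consensus_mapping_df.select(["consensus_uid", "sample_uid"]).iter_rows()
)

# Everything else needs filling; uid 3 has no mappings, so it is
# missing in every sample.
missing = [
    (c_uid, s_uid, s_name)
    for c_uid in consensus_uids
    for s_uid, s_name in samples
    if (c_uid, s_uid) not in existing
]
print(missing)  # [(2, 20, 'B'), (3, 10, 'A'), (3, 20, 'B')]
```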
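The polarity check in `_add_sample_noms1` normalizes "pos"/"positive" and "neg"/"negative" spellings before comparing sample and study polarity, letting unknown values pass through unchanged. The nested conditional expression in the diff is equivalent to this hypothetical helper (not part of the package):

```python
def normalize_polarity(polarity):
    """Map 'pos'/'positive' -> 'positive', 'neg'/'negative' -> 'negative'."""
    if polarity in ["pos", "positive"]:
        return "positive"
    if polarity in ["neg", "negative"]:
        return "negative"
    return polarity  # unknown values are compared as-is

assert normalize_polarity("pos") == normalize_polarity("positive")
assert normalize_polarity("neg") != normalize_polarity("pos")
```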
|