masster 0.5.1__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their public registries, and is provided for informational purposes only.
Potentially problematic release: this version of masster might be problematic.
- masster/_version.py +1 -1
- masster/sample/load.py +5 -4
- masster/study/defaults/align_def.py +0 -204
- masster/study/defaults/fill_def.py +9 -1
- masster/study/defaults/merge_def.py +20 -69
- masster/study/export.py +25 -5
- masster/study/h5.py +160 -42
- masster/study/helpers.py +430 -53
- masster/study/load.py +986 -158
- masster/study/merge.py +683 -1076
- masster/study/plot.py +43 -38
- masster/study/processing.py +337 -280
- masster/study/study.py +58 -135
- masster/wizard/wizard.py +20 -6
- {masster-0.5.1.dist-info → masster-0.5.3.dist-info}/METADATA +1 -1
- {masster-0.5.1.dist-info → masster-0.5.3.dist-info}/RECORD +19 -20
- masster/study/defaults/fill_chrom_def.py +0 -260
- {masster-0.5.1.dist-info → masster-0.5.3.dist-info}/WHEEL +0 -0
- {masster-0.5.1.dist-info → masster-0.5.3.dist-info}/entry_points.txt +0 -0
- {masster-0.5.1.dist-info → masster-0.5.3.dist-info}/licenses/LICENSE +0 -0
masster/study/load.py
CHANGED
@@ -39,8 +39,7 @@ def add(
     folder=None,
     reset=False,
     adducts=None,
-    max_files=None
-    fast=True,
+    max_files=None
 ):
     """Add samples from a folder to the study.
 
@@ -52,8 +51,6 @@ def add(
         adducts (optional): Adducts to use for sample loading. Defaults to None.
         max_files (int, optional): Maximum number of files to process.
             Defaults to None (no limit).
-        fast (bool, optional): Whether to use optimized loading that skips ms1_df
-            for better performance. Defaults to True.
     """
     if folder is None:
         if self.folder is not None:
@@ -122,12 +119,11 @@ def add(
             self.logger.debug(
                 f"Batch processing {len(files_to_process)} {ext} files",
             )
-            successful =
+            successful = _add_samples_batch(self,
                 files_to_process,
                 reset=reset,
                 adducts=adducts,
-                blacklist=blacklist
-                fast=fast,
+                blacklist=blacklist
             )
             counter += successful
             if successful > 0:
@@ -149,8 +145,7 @@ def add(
     return f"Added {counter} samples to study"
 
 
-
-def add_sample(self, file, type=None, reset=False, adducts=None, fast=True):
+def add_sample(self, file, type=None, reset=False, adducts=None):
     """
     Add a single sample to the study.
 
@@ -165,26 +160,16 @@ def add_sample(self, file, type=None, reset=False, adducts=None, fast=True):
     Returns:
         bool: True if successful, False otherwise.
     """
-
-
-
-
-
-
-
-
-
-
-    else:
-        # Use standard method with full ms1_df loading
-        success = self._add_sample_standard(
-            file,
-            type=type,
-            reset=reset,
-            adducts=adducts,
-            skip_color_reset=False,  # Do color reset for individual calls
-            skip_schema_check=True,  # Skip schema check for performance
-        )
+
+    success = self._add_sample_optimized(
+        file,
+        type=type,
+        reset=reset,
+        adducts=adducts,
+        skip_color_reset=False,  # Do color reset for individual calls
+        skip_schema_check=True,  # Skip schema check for performance (safe with diagonal concat)
+    )
+
 
     return success
 
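Note: the hunks above change the public API of add() and add_sample() by dropping the fast flag; the optimized path is now the only one. A minimal usage sketch, assuming an existing masster Study instance named study (not part of the diff):

    # 0.5.1 accepted fast=True/False; in 0.5.3 passing fast= raises TypeError.
    study.add(folder="data/raw", max_files=10)   # was: study.add(..., fast=True)
    study.add_sample("data/raw/sample01.mzML")   # was: study.add_sample(..., fast=True)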
@@ -311,11 +296,12 @@ def _fill_chrom_single_impl(
     # Process each sample individually
     # Group missing combinations by sample for efficient processing
     missing_by_sample = {}
-    for consensus_uid, sample_uid, sample_name, sample_path in missing_combinations:
+    for consensus_uid, sample_uid, sample_name, sample_path, sample_source in missing_combinations:
         if sample_name not in missing_by_sample:
             missing_by_sample[sample_name] = {
                 "sample_uid": sample_uid,
                 "sample_path": sample_path,
+                "sample_source": sample_source,
                 "missing_consensus_uids": [],
             }
         missing_by_sample[sample_name]["missing_consensus_uids"].append(consensus_uid)
@@ -338,13 +324,23 @@ def _fill_chrom_single_impl(
         # Load this sample
         sample_uid = sample_info["sample_uid"]
         sample_path = sample_info["sample_path"]
+        sample_source = sample_info["sample_source"]
        missing_consensus_uids = sample_info["missing_consensus_uids"]
 
         try:
-            #
-
-
-
+            # Load this sample using study._load_ms1() as suggested by user
+            # Use sample_path (points to .sample5 files) not sample_source (points to .raw files)
+            ms1_data = self._load_ms1(filename=sample_path)
+            if ms1_data is None or ms1_data.is_empty():
+                self.logger.warning(f"No MS1 data found for sample {sample_name}")
+                continue
+
+            # Create a temporary object to hold the MS1 data for processing
+            class TempSample:
+                def __init__(self, ms1_df):
+                    self.ms1_df = ms1_df
+
+            file = TempSample(ms1_data)
         except Exception as e:
             self.logger.warning(f"Failed to load sample {sample_name}: {e}")
             continue
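The TempSample wrapper added above relies on duck typing: downstream code only reads file.ms1_df, so any object exposing that attribute can stand in for a fully loaded sample. A self-contained sketch of the pattern (column names assumed from the diff):

    import polars as pl

    class TempSample:
        def __init__(self, ms1_df: pl.DataFrame):
            self.ms1_df = ms1_df

    ms1 = pl.DataFrame({"rt": [10.0, 11.0], "mz": [200.1, 200.1], "inty": [5e4, 7e4]})
    file = TempSample(ms1)
    assert hasattr(file, "ms1_df") and not file.ms1_df.is_empty()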
@@ -363,12 +359,50 @@ def _fill_chrom_single_impl(
 
             # Filter MS1 data for this feature
             if hasattr(file, "ms1_df") and not file.ms1_df.is_empty():
-
-
-
-
-
-
+                # Special handling for RT=0 (library-derived features)
+                if rt == 0.0:
+                    # Step 1: Retrieve full chromatogram for the m/z
+                    d_full = file.ms1_df.filter(
+                        (pl.col("mz") >= mz - mz_tol)
+                        & (pl.col("mz") <= mz + mz_tol)
+                    )
+
+                    if not d_full.is_empty():
+                        # Step 2: Find maximum intensity and its RT
+                        max_inty_row = d_full.filter(
+                            pl.col("inty") == d_full["inty"].max()
+                        ).head(1)
+
+                        if not max_inty_row.is_empty():
+                            max_rt = max_inty_row["rt"].item()
+
+                            # Get eic_rt_tol from sample parameters if available
+                            eic_rt_tol = rt_tol  # Default fallback
+                            if hasattr(file, 'parameters') and hasattr(file.parameters, 'eic_rt_tol'):
+                                eic_rt_tol = file.parameters.eic_rt_tol
+
+                            # Step 3: Trim around max intensity using eic_rt_tol
+                            d = d_full.filter(
+                                (pl.col("rt") >= max_rt - eic_rt_tol)
+                                & (pl.col("rt") <= max_rt + eic_rt_tol)
+                            )
+
+                            # Update consensus RT info based on discovered peak
+                            rt = max_rt
+                            rt_start_mean = max_rt - eic_rt_tol
+                            rt_end_mean = max_rt + eic_rt_tol
+                        else:
+                            d = pl.DataFrame()
+                    else:
+                        d = pl.DataFrame()
+                else:
+                    # Normal RT-based filtering for non-zero RT
+                    d = file.ms1_df.filter(
+                        (pl.col("mz") >= mz - mz_tol)
+                        & (pl.col("mz") <= mz + mz_tol)
+                        & (pl.col("rt") >= rt_start_mean - rt_tol)
+                        & (pl.col("rt") <= rt_end_mean + rt_tol),
+                    )
             else:
                 d = pl.DataFrame()
 
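The RT=0 branch added above implements apex discovery for library-derived features that carry no retention time: filter by m/z only, locate the intensity maximum, then trim the chromatogram to apex +/- tolerance. A standalone sketch with synthetic numbers (not from the package):

    import polars as pl

    ms1_df = pl.DataFrame({
        "mz":   [150.05, 150.05, 150.05, 150.05, 350.20],
        "rt":   [30.0, 60.0, 90.0, 120.0, 60.0],
        "inty": [1e3, 8e4, 2e3, 5e2, 9e5],
    })
    mz, mz_tol, eic_rt_tol = 150.05, 0.01, 15.0

    # Step 1: m/z-only filter, since rt == 0 carries no RT information
    d_full = ms1_df.filter((pl.col("mz") >= mz - mz_tol) & (pl.col("mz") <= mz + mz_tol))
    # Step 2: RT of the intensity apex
    max_rt = d_full.filter(pl.col("inty") == d_full["inty"].max()).head(1)["rt"].item()
    # Step 3: trim to apex +/- tolerance
    d = d_full.filter((pl.col("rt") >= max_rt - eic_rt_tol) & (pl.col("rt") <= max_rt + eic_rt_tol))
    print(max_rt, d.height)  # 60.0 1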
@@ -579,6 +613,134 @@ def fill_single(self, **kwargs):
     )
 
 
+def _build_rt_correction_mapping_per_sample(self, sample_uid):
+    """
+    Pre-compute RT correction mapping for a sample by getting all non-filled features.
+    This avoids repeated DataFrame filtering for each feature.
+
+    Args:
+        sample_uid: Sample UID to build mapping for
+
+    Returns:
+        Polars DataFrame with rt, rt_original, and rt_delta columns, sorted by rt
+        Returns empty DataFrame if no reference features found
+    """
+    # Get non-filled features from the same sample
+    if 'filled' in self.features_df.columns:
+        sample_features = self.features_df.filter(
+            (pl.col('sample_uid') == sample_uid) &
+            (pl.col('filled') == False) &
+            (pl.col('rt_original').is_not_null()) &
+            (pl.col('rt').is_not_null())
+        )
+    else:
+        # If no filled column, assume all existing features are non-filled
+        sample_features = self.features_df.filter(
+            (pl.col('sample_uid') == sample_uid) &
+            (pl.col('rt_original').is_not_null()) &
+            (pl.col('rt').is_not_null())
+        )
+
+    if sample_features.is_empty():
+        return pl.DataFrame(schema={'rt': pl.Float64, 'rt_original': pl.Float64, 'rt_delta': pl.Float64})
+
+    # Pre-compute RT deltas and sort by RT for efficient lookup
+    rt_mapping = sample_features.select([
+        pl.col('rt'),
+        pl.col('rt_original'),
+        (pl.col('rt') - pl.col('rt_original')).alias('rt_delta')
+    ]).sort('rt')
+
+    return rt_mapping
+
+def _estimate_rt_original_from_mapping(self, rt_mapping, target_rt):
+    """
+    Fast RT original estimation using pre-computed mapping.
+
+    Args:
+        rt_mapping: Pre-computed RT mapping DataFrame from _build_rt_correction_mapping_per_sample
+        target_rt: Target aligned RT for the filled feature
+
+    Returns:
+        Estimated rt_original value, or None if no mapping available
+    """
+    if rt_mapping.is_empty():
+        return None
+
+    # Find closest RT using vectorized operations
+    rt_mapping_with_diff = rt_mapping.with_columns([
+        (pl.col('rt') - target_rt).abs().alias('rt_diff')
+    ])
+
+    # Get the RT delta from the closest feature
+    closest_row = rt_mapping_with_diff.sort('rt_diff').head(1)
+    if closest_row.is_empty():
+        return None
+
+    closest_rt_delta = closest_row['rt_delta'].item()
+    return target_rt - closest_rt_delta
+
+
+def _estimate_rt_original_for_filled_feature(self, sample_uid, target_rt, logger=None):
+    """
+    Estimate rt_original for a filled feature by finding the closest non-filled feature
+    from the same sample and using its RT delta (rt - rt_original).
+
+    Args:
+        sample_uid: Sample UID to search within
+        target_rt: Target aligned RT for the filled feature
+        logger: Optional logger for debug messages
+
+    Returns:
+        Estimated rt_original value, or None if no suitable reference found
+    """
+    # Get non-filled features from the same sample
+    if 'filled' in self.features_df.columns:
+        sample_features = self.features_df.filter(
+            (pl.col('sample_uid') == sample_uid) &
+            (pl.col('filled') == False) &
+            (pl.col('rt_original').is_not_null()) &
+            (pl.col('rt').is_not_null())
+        )
+    else:
+        # If no filled column, assume all existing features are non-filled
+        sample_features = self.features_df.filter(
+            (pl.col('sample_uid') == sample_uid) &
+            (pl.col('rt_original').is_not_null()) &
+            (pl.col('rt').is_not_null())
+        )
+
+    if sample_features.is_empty():
+        if logger:
+            logger.debug(f"No reference features found for sample {sample_uid} to estimate rt_original")
+        return None
+
+    # Calculate RT differences and find the closest feature
+    sample_features_with_diff = sample_features.with_columns([
+        (pl.col('rt') - target_rt).abs().alias('rt_diff'),
+        (pl.col('rt') - pl.col('rt_original')).alias('rt_delta')
+    ])
+
+    # Find the feature with minimum RT difference
+    closest_feature = sample_features_with_diff.sort('rt_diff').head(1)
+
+    if closest_feature.is_empty():
+        return None
+
+    # Get the RT delta from the closest feature
+    closest_rt_diff = closest_feature['rt_diff'].item()
+    closest_rt_delta = closest_feature['rt_delta'].item()
+
+    # Estimate rt_original using the same delta: rt_original = rt - rt_delta
+    estimated_rt_original = target_rt - closest_rt_delta
+
+    if self.logger:
+        self.logger.debug(f"Estimated rt_original={estimated_rt_original:.3f} for sample {sample_uid}, rt={target_rt:.3f} "
+                          f"using closest feature (rt_diff={closest_rt_diff:.3f}, rt_delta={closest_rt_delta:.3f})")
+
+    return estimated_rt_original
+
+
 def _process_sample_for_parallel_fill(
     self,
     sample_info,
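The helpers added above estimate rt_original for gap-filled features via a nearest-neighbour lookup: take the RT shift (rt - rt_original) of the closest non-filled feature in the same sample and apply the same shift. A self-contained sketch of the arithmetic (synthetic values):

    import polars as pl

    rt_mapping = pl.DataFrame({
        "rt":          [100.0, 200.0, 300.0],
        "rt_original": [ 98.0, 197.5, 296.0],
    }).with_columns((pl.col("rt") - pl.col("rt_original")).alias("rt_delta")).sort("rt")

    target_rt = 210.0
    closest = rt_mapping.with_columns(
        (pl.col("rt") - target_rt).abs().alias("rt_diff")
    ).sort("rt_diff").head(1)
    estimated_rt_original = target_rt - closest["rt_delta"].item()
    print(estimated_rt_original)  # 207.5, via the 2.5 s delta of the feature at rt=200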
@@ -589,31 +751,606 @@ def _process_sample_for_parallel_fill(
     missing_combinations_df,
     features_df_max_uid,
 ):
-    """Process a single sample for parallel gap filling."""
     sample_uid = sample_info["sample_uid"]
     sample_path = sample_info["sample_path"]
+    sample_source = sample_info["sample_source"]
 
     new_features: list[dict] = []
     new_mapping: list[dict] = []
     counter = 0
 
-
-
-
-
-
-    except Exception:
-        # Skip this sample if loading fails
+    # Get missing features for this sample from precomputed combinations
+    sample_missing_df = missing_combinations_df.filter(pl.col("sample_uid") == sample_uid)
+    sample_consensus_uids = sample_missing_df["consensus_uid"].to_list()
+
+    if not sample_consensus_uids:
         return new_features, new_mapping, counter
 
-    #
-
-        pl.col("sample_uid") == sample_uid,
-    )["consensus_uid"].to_list()
+    # OPTIMIZATION: Pre-compute RT correction mapping per sample to avoid repeated DataFrame filtering
+    rt_mapping = _build_rt_correction_mapping_per_sample(self, sample_uid)
 
-
+    # OPTIMIZATION 1: Load MS1 data ONCE per sample instead of per feature
+    try:
+        ms1_data = self._load_ms1(filename=sample_path)
+        if ms1_data is None or ms1_data.is_empty():
+            # Create empty features for all missing consensus UIDs
+            for i, consensus_uid in enumerate(sample_consensus_uids):
+                info = consensus_info[consensus_uid]
+                empty_eic = Chromatogram(
+                    rt=np.array([info["rt_start_mean"], info["rt_end_mean"]]),
+                    inty=np.array([0.0, 0.0]),
+                    label=f"EIC mz={info['mz']:.4f}",
+                    file=sample_path,
+                    mz=info["mz"],
+                    feature_start=info["rt_start_mean"],
+                    feature_end=info["rt_end_mean"],
+                    feature_apex=info["rt"],
+                )
+
+                new_feature = {
+                    "uid": features_df_max_uid + counter,
+                    "sample_uid": sample_uid,
+                    "mz": info["mz"],
+                    "rt": info["rt"],
+                    "rt_original": 0.0 if info["rt"] == 0.0 else _estimate_rt_original_from_mapping(self, rt_mapping, info["rt"]),
+                    "mz_centroid": None,
+                    "rt_centroid": None,
+                    "iso": None,
+                    "iso_of": None,
+                    "adduct": None,
+                    "adduct_mass": None,
+                    "adduct_group": None,
+                    "chrom": empty_eic,
+                    "filled": True,
+                    "chrom_area": 0.0,
+                    "chrom_coherence": None,
+                    "chrom_prominence": None,
+                    "chrom_prominence_scaled": None,
+                    "chrom_height_scaled": None,
+                    "ms2_scans": None,
+                    "ms2_specs": None,
+                }
+
+                new_features.append(new_feature)
+                new_mapping.append({
+                    "consensus_uid": consensus_uid,
+                    "sample_uid": sample_uid,
+                    "feature_uid": features_df_max_uid + counter,
+                })
+                counter += 1
+            return new_features, new_mapping, counter
+
+    except Exception as e:
+        # If MS1 loading fails, create empty features
+        self.logger.debug(f"Failed to load MS1 data from {sample_path}: {e}")
+        for i, consensus_uid in enumerate(sample_consensus_uids):
+            info = consensus_info[consensus_uid]
+            empty_eic = Chromatogram(
+                rt=np.array([info["rt_start_mean"], info["rt_end_mean"]]),
+                inty=np.array([0.0, 0.0]),
+                label=f"EIC mz={info['mz']:.4f}",
+                file=sample_path,
+                mz=info["mz"],
+                feature_start=info["rt_start_mean"],
+                feature_end=info["rt_end_mean"],
+                feature_apex=info["rt"],
+            )
+
+            new_feature = {
+                "uid": features_df_max_uid + counter,
+                "sample_uid": sample_uid,
+                "mz": info["mz"],
+                "rt": info["rt"],
+                "rt_original": 0.0 if info["rt"] == 0.0 else _estimate_rt_original_from_mapping(self, rt_mapping, info["rt"]),
+                "mz_centroid": None,
+                "rt_centroid": None,
+                "iso": None,
+                "iso_of": None,
+                "adduct": None,
+                "adduct_mass": None,
+                "adduct_group": None,
+                "chrom": empty_eic,
+                "filled": True,
+                "chrom_area": 0.0,
+                "chrom_coherence": None,
+                "chrom_prominence": None,
+                "chrom_prominence_scaled": None,
+                "chrom_height_scaled": None,
+                "ms2_scans": None,
+                "ms2_specs": None,
+            }
+
+            new_features.append(new_feature)
+            new_mapping.append({
+                "consensus_uid": consensus_uid,
+                "sample_uid": sample_uid,
+                "feature_uid": features_df_max_uid + counter,
+            })
+            counter += 1
+        return new_features, new_mapping, counter
+
+    # OPTIMIZATION 2: Pre-filter MS1 data by m/z ranges to reduce memory and processing
+    all_mzs = [consensus_info[uid]["mz"] for uid in sample_consensus_uids]
+    mz_min = min(all_mzs) - mz_tol
+    mz_max = max(all_mzs) + mz_tol
+
+    # Pre-filter by broad m/z range
+    ms1_filtered = ms1_data.filter(
+        (pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max)
+    )
+
+    # Early exit if no data in m/z range
+    if ms1_filtered.is_empty():
+        for i, consensus_uid in enumerate(sample_consensus_uids):
+            info = consensus_info[consensus_uid]
+            empty_eic = Chromatogram(
+                rt=np.array([info["rt_start_mean"], info["rt_end_mean"]]),
+                inty=np.array([0.0, 0.0]),
+                label=f"EIC mz={info['mz']:.4f}",
+                file=sample_path,
+                mz=info["mz"],
+                feature_start=info["rt_start_mean"],
+                feature_end=info["rt_end_mean"],
+                feature_apex=info["rt"],
+            )
+
+            new_feature = {
+                "uid": features_df_max_uid + counter,
+                "sample_uid": sample_uid,
+                "mz": info["mz"],
+                "rt": info["rt"],
+                "rt_original": 0.0 if info["rt"] == 0.0 else _estimate_rt_original_from_mapping(self, rt_mapping, info["rt"]),
+                "mz_centroid": None,
+                "rt_centroid": None,
+                "iso": None,
+                "iso_of": None,
+                "adduct": None,
+                "adduct_mass": None,
+                "adduct_group": None,
+                "chrom": empty_eic,
+                "filled": True,
+                "chrom_area": 0.0,
+                "chrom_coherence": None,
+                "chrom_prominence": None,
+                "chrom_prominence_scaled": None,
+                "chrom_height_scaled": None,
+                "ms2_scans": None,
+                "ms2_specs": None,
+            }
+
+            new_features.append(new_feature)
+            new_mapping.append({
+                "consensus_uid": consensus_uid,
+                "sample_uid": sample_uid,
+                "feature_uid": features_df_max_uid + counter,
+            })
+            counter += 1
         return new_features, new_mapping, counter
 
+    # OPTIMIZATION 3: Process all features using the pre-loaded and filtered MS1 data
+    for consensus_uid in sample_consensus_uids:
+        info = consensus_info[consensus_uid]
+        mz, rt = info["mz"], info["rt"]
+
+        try:
+            if rt == 0.0:
+                # Handle RT=0 features - create empty chromatogram
+                empty_eic = Chromatogram(
+                    rt=np.array([info["rt_start_mean"], info["rt_end_mean"]]),
+                    inty=np.array([0.0, 0.0]),
+                    label=f"EIC mz={mz:.4f}",
+                    file=sample_path,
+                    mz=mz,
+                    feature_start=info["rt_start_mean"],
+                    feature_end=info["rt_end_mean"],
+                    feature_apex=rt,
+                )
+                eic = empty_eic
+                best_peak = None
+            else:
+                # Extract real chromatogram using pre-filtered MS1 data
+                d = ms1_filtered.filter(
+                    (pl.col("mz") >= mz - mz_tol) & (pl.col("mz") <= mz + mz_tol) &
+                    (pl.col("rt") >= rt - rt_tol) & (pl.col("rt") <= rt + rt_tol)
+                )
+
+                # Create chromatogram from filtered data
+                if d.is_empty():
+                    # No MS1 data found - create empty chromatogram
+                    eic = Chromatogram(
+                        rt=np.array([info["rt_start_mean"], info["rt_end_mean"]]),
+                        inty=np.array([0.0, 0.0]),
+                        label=f"EIC mz={mz:.4f}",
+                        file=sample_path,
+                        mz=mz,
+                        feature_start=info["rt_start_mean"],
+                        feature_end=info["rt_end_mean"],
+                        feature_apex=rt,
+                    )
+                    best_peak = None
+                else:
+                    # Aggregate intensities per retention time (get max inty per RT)
+                    eic_rt = d.group_by("rt").agg(pl.col("inty").max()).sort("rt")
+
+                    # Create chromatogram with real data and find peaks
+                    eic = Chromatogram(
+                        eic_rt["rt"].to_numpy(),
+                        eic_rt["inty"].to_numpy(),
+                        label=f"EIC mz={mz:.4f}",
+                        file=sample_path,
+                        mz=mz,
+                        feature_start=info["rt_start_mean"],
+                        feature_end=info["rt_end_mean"],
+                        feature_apex=rt,
+                    ).find_peaks()
+                    best_peak = self._find_best_peak_in_eic(eic, rt, rt_tol) if hasattr(self, '_find_best_peak_in_eic') else None
+
+            # Create feature with optimized RT original estimation
+            rt_original_estimated = None
+            if rt == 0.0:
+                rt_original_estimated = 0.0  # RT=0 features
+            else:
+                rt_original_estimated = _estimate_rt_original_from_mapping(self, rt_mapping, rt)
+
+            new_feature = {
+                "uid": features_df_max_uid + counter,
+                "sample_uid": sample_uid,
+                "mz": mz,
+                "rt": rt,
+                "rt_original": rt_original_estimated,
+                "mz_centroid": None,
+                "rt_centroid": None,
+                "iso": None,
+                "iso_of": None,
+                "adduct": None,
+                "adduct_mass": None,
+                "adduct_group": None,
+                "chrom": eic,
+                "filled": True,
+                "chrom_area": best_peak.get("area", 0.0) if best_peak else 0.0,
+                "chrom_coherence": best_peak.get("coherence") if best_peak else None,
+                "chrom_prominence": best_peak.get("prominence") if best_peak else None,
+                "chrom_prominence_scaled": best_peak.get("prominence_scaled") if best_peak else None,
+                "chrom_height_scaled": best_peak.get("height_scaled") if best_peak else None,
+                "ms2_scans": None,
+                "ms2_specs": None,
+            }
+
+            new_features.append(new_feature)
+            new_mapping.append({
+                "consensus_uid": consensus_uid,
+                "sample_uid": sample_uid,
+                "feature_uid": features_df_max_uid + counter,
+            })
+            counter += 1
+
+        except Exception as e:
+            # Skip this feature if extraction fails but log the error
+            self.logger.debug(f"Failed to extract feature {consensus_uid} from {sample_path}: {e}")
+            continue
+
+    return new_features, new_mapping, counter
+
+'''
+def _load_ms1_optimized(self, sample_path, mz_ranges, rt_ranges):
+    """
+    OPTIMIZED: Load only the MS1 data we actually need instead of the entire file.
+    Pre-filter by m/z and RT ranges to reduce memory usage and processing time.
+    """
+    try:
+        # Load full MS1 data (we'll optimize this further later)
+        ms1_data = self._load_ms1(filename=sample_path)
+        if ms1_data is None or ms1_data.is_empty():
+            return ms1_data
+
+        # OPTIMIZATION: Pre-filter to only relevant m/z ranges to reduce data size
+        if mz_ranges:
+            # Build comprehensive m/z filter covering all ranges
+            mz_min = min(r[0] for r in mz_ranges)
+            mz_max = max(r[1] for r in mz_ranges)
+
+            # Pre-filter by broad m/z range first (much faster than multiple OR conditions)
+            ms1_filtered = ms1_data.filter(
+                (pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max)
+            )
+
+            # If we have RT ranges, also pre-filter by RT
+            if rt_ranges and len(rt_ranges) > 0:
+                rt_min = min(r[0] for r in rt_ranges)
+                rt_max = max(r[1] for r in rt_ranges)
+                ms1_filtered = ms1_filtered.filter(
+                    (pl.col("rt") >= rt_min) & (pl.col("rt") <= rt_max)
+                )
+
+            return ms1_filtered
+
+        return ms1_data
+
+    except Exception:
+        return pl.DataFrame()
+'''
+
+'''
+def _create_empty_features(self, consensus_uids, consensus_info, sample_uid, features_df_max_uid):
+    """Create empty features for consensus UIDs when no MS1 data is available."""
+    new_features = []
+    new_mapping = []
+
+    for i, consensus_uid in enumerate(consensus_uids):
+        cons = consensus_info[consensus_uid]
+        feature_uid = features_df_max_uid + i + 1
+
+        # Create minimal empty feature
+        empty_eic = Chromatogram(
+            rt=np.array([cons["rt_start_mean"], cons["rt_end_mean"]]),
+            inty=np.array([0.0, 0.0]),
+            label=f"EIC mz={cons['mz']:.4f}",
+            file="",
+            mz=cons["mz"],
+            feature_start=cons["rt_start_mean"],
+            feature_end=cons["rt_end_mean"],
+            feature_apex=cons["rt"],
+        )
+
+        new_feature = {
+            "sample_uid": sample_uid,
+            "feature_uid": feature_uid,
+            "feature_id": None,
+            "mz": cons["mz"],
+            "rt": cons["rt"],
+            "rt_original": 0.0 if cons["rt"] == 0.0 else None,
+            "rt_start": cons["rt_start_mean"],
+            "rt_end": cons["rt_end_mean"],
+            "rt_delta": cons["rt_end_mean"] - cons["rt_start_mean"],
+            "mz_start": None,
+            "mz_end": None,
+            "inty": 0.0,
+            "quality": None,
+            "charge": None,
+            "iso": None,
+            "iso_of": None,
+            "adduct": None,
+            "adduct_mass": None,
+            "adduct_group": None,
+            "chrom": empty_eic,
+            "filled": True,
+            "chrom_area": 0.0,
+            "chrom_coherence": None,
+            "chrom_prominence": None,
+            "chrom_prominence_scaled": None,
+            "chrom_height_scaled": None,
+            "ms2_scans": None,
+            "ms2_specs": None,
+        }
+
+        new_features.append(new_feature)
+        new_mapping.append({
+            "consensus_uid": consensus_uid,
+            "sample_uid": sample_uid,
+            "feature_uid": feature_uid,
+        })
+
+    return new_features, new_mapping, len(new_features)
+'''
+
+'''
+def _create_feature_fast(self, consensus_uid, sample_uid, features_df_max_uid, consensus_info):
+    """
+    OPTIMIZED: Create a minimal empty feature quickly without expensive operations.
+    Used for RT=0 features and other cases where we just need a placeholder feature.
+    """
+    cons = consensus_info[consensus_uid]
+    feature_uid = features_df_max_uid
+
+    # Create minimal empty feature
+    empty_eic = Chromatogram(
+        rt=np.array([cons["rt_start_mean"], cons["rt_end_mean"]]),
+        inty=np.array([0.0, 0.0]),
+        label=f"EIC mz={cons['mz']:.4f}",
+        file="",
+        mz=cons["mz"],
+        feature_start=cons["rt_start_mean"],
+        feature_end=cons["rt_end_mean"]
+    )
+
+    new_feature = {
+        "uid": feature_uid,
+        "sample_uid": sample_uid,
+        "mz": cons["mz"],
+        "rt": cons["rt"],
+        "mz_centroid": None,
+        "rt_centroid": None,
+        "iso": None,
+        "iso_of": None,
+        "adduct": None,
+        "adduct_mass": None,
+        "adduct_group": None,
+        "chrom": empty_eic,
+        "filled": True,
+        "chrom_area": 0.0,
+        "chrom_coherence": None,
+        "chrom_prominence": None,
+        "chrom_prominence_scaled": None,
+        "chrom_height_scaled": None,
+        "ms2_scans": None,
+        "ms2_specs": None,
+    }
+
+    new_features = [new_feature]
+    new_mapping = [{
+        "consensus_uid": consensus_uid,
+        "sample_uid": sample_uid,
+        "feature_uid": feature_uid,
+    }]
+
+    return new_features, new_mapping, 1
+'''
+
+'''
+def _process_rt_zero_features_batch(self, rt_zero_consensus_uids, consensus_info, sample_uid,
+                                    features_df_max_uid, rt_zero_features):
+    """
+    OPTIMIZED: Process all RT=0 features in a batch since they share similar characteristics.
+    RT=0 features are typically not real peaks but artifacts or noise.
+    """
+    new_features = []
+    new_mapping = []
+
+    for consensus_uid in rt_zero_consensus_uids:
+        new_features_batch, new_mapping_batch, _ = self._create_feature_fast(
+            consensus_uid, sample_uid, features_df_max_uid, consensus_info
+        )
+        new_features.extend(new_features_batch)
+        new_mapping.extend(new_mapping_batch)
+        features_df_max_uid += 1
+
+        # Track RT=0 features for statistics
+        rt_zero_features.append(1)
+
+    return new_features, new_mapping, features_df_max_uid
+'''
+
+'''
+def _process_normal_rt_features_batch(self, normal_rt_consensus_uids, consensus_info, ms1_data,
+                                      sample_uid, sample_path, mz_tol, rt_tol, features_df_max_uid):
+    """
+    OPTIMIZED: Process normal RT features in batch with pre-filtered MS1 data.
+    Only loads chromatograms once per batch instead of per feature.
+    """
+    new_features = []
+    new_mapping = []
+
+    if len(normal_rt_consensus_uids) == 0:
+        return new_features, new_mapping, features_df_max_uid
+
+    # OPTIMIZATION: Pre-filter MS1 data by m/z range to reduce data size
+    all_mzs = [consensus_info[cuid]["mz"] for cuid in normal_rt_consensus_uids]
+    mz_min = min(all_mzs) - max(0.01, min(all_mzs) * mz_tol / 1e6)
+    mz_max = max(all_mzs) + max(0.01, max(all_mzs) * mz_tol / 1e6)
+
+    # Pre-filter MS1 data once for all features
+    ms1_filtered = ms1_data.filter(
+        (pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max)
+    )
+
+    # Early exit if no data in m/z range
+    if ms1_filtered.shape[0] == 0:
+        # Create empty features for all consensus UIDs
+        for consensus_uid in normal_rt_consensus_uids:
+            new_features_batch, new_mapping_batch, _ = self._create_feature_fast(
+                consensus_uid, sample_uid, features_df_max_uid, consensus_info
+            )
+            new_features.extend(new_features_batch)
+            new_mapping.extend(new_mapping_batch)
+            features_df_max_uid += 1
+        return new_features, new_mapping, features_df_max_uid
+
+    # Process each feature with pre-filtered data
+    for consensus_uid in normal_rt_consensus_uids:
+        info = consensus_info[consensus_uid]
+        mz, rt = info["mz"], info["rt"]
+
+        # Extract chromatogram using pre-loaded MS1 data (FIXED!)
+        sample_obj = self._load_ms1(sample_path)  # Get the sample object for extract_eic method
+        eic = sample_obj.extract_eic(
+            mz, mz_tol, rt, rt_tol, ms1_data=ms1_filtered  # Use the pre-filtered data!
+        )
+
+        # Find best peak
+        best_peak = self._find_best_peak_in_eic(eic, rt, rt_tol)
+
+        # Create feature
+        new_feature = {
+            "uid": features_df_max_uid,
+            "sample_uid": sample_uid,
+            "mz": mz,
+            "rt": rt,
+            "mz_centroid": None,
+            "rt_centroid": None,
+            "iso": None,
+            "iso_of": None,
+            "adduct": None,
+            "adduct_mass": None,
+            "adduct_group": None,
+            "chrom": eic if best_peak else Chromatogram(
+                rt=np.array([rt, rt]),
+                inty=np.array([0.0, 0.0]),
+                label=f"EIC mz={mz:.4f}",
+                file="",
+                mz=mz,
+                feature_start=rt,
+                feature_end=rt
+            ),
+            "filled": True,
+            "chrom_area": best_peak.get("area", 0.0) if best_peak else 0.0,
+            "chrom_coherence": best_peak.get("coherence") if best_peak else None,
+            "chrom_prominence": best_peak.get("prominence") if best_peak else None,
+            "chrom_prominence_scaled": best_peak.get("prominence_scaled") if best_peak else None,
+            "chrom_height_scaled": best_peak.get("height_scaled") if best_peak else None,
+            "ms2_scans": None,
+            "ms2_specs": None,
+        }
+
+        new_features.append(new_feature)
+        new_mapping.append({
+            "consensus_uid": consensus_uid,
+            "sample_uid": sample_uid,
+            "feature_uid": features_df_max_uid,
+        })
+        features_df_max_uid += 1
+
+    return new_features, new_mapping, features_df_max_uid
+'''
+
+'''def _batch_process_features(self, consensus_uids, consensus_info, ms1_data, sample_uid, sample_path,
+                            mz_tol, rt_tol, features_df_max_uid, rt_zero_features):
+    """
+    OPTIMIZED: Process all missing features for a sample in a single batch operation.
+    This avoids repeated filtering of the MS1 dataframe.
+    """
+    new_features = []
+    new_mapping = []
+
+    # OPTIMIZATION: Process RT=0 features separately (they need special handling)
+    rt_zero_data = {}
+    if rt_zero_features:
+        rt_zero_data = self._process_rt_zero_features_batch(
+            rt_zero_features, consensus_info, ms1_data, mz_tol, rt_tol
+        )
+
+    # OPTIMIZATION: Build comprehensive filter for all normal RT features at once
+    normal_rt_features = [uid for uid in consensus_uids if uid not in rt_zero_features]
+    normal_rt_data = {}
+    if normal_rt_features:
+        normal_rt_data = self._process_normal_rt_features_batch(
+            normal_rt_features, consensus_info, ms1_data, mz_tol, rt_tol
+        )
+
+    # Combine results and create features
+    all_feature_data = {**rt_zero_data, **normal_rt_data}
+
+    for i, consensus_uid in enumerate(consensus_uids):
+        feature_uid = features_df_max_uid + i + 1
+        cons = consensus_info[consensus_uid]
+
+        # Get pre-processed data for this feature
+        feature_ms1_data = all_feature_data.get(consensus_uid, pl.DataFrame())
+
+        # Create feature using optimized chromatogram creation
+        new_feature, area = self._create_feature_fast(
+            consensus_uid, cons, feature_ms1_data, sample_uid, sample_path,
+            feature_uid, mz_tol, rt_tol
+        )
+
+        new_features.append(new_feature)
+        new_mapping.append({
+            "consensus_uid": consensus_uid,
+            "sample_uid": sample_uid,
+            "feature_uid": feature_uid,
+        })
+
+    return new_features, new_mapping, len(new_features)
+
     # Process each missing feature
     for consensus_uid in sample_missing:
         cons = consensus_info[consensus_uid]
@@ -624,12 +1361,43 @@ def _process_sample_for_parallel_fill(
 
         # Filter MS1 data for this feature
         if hasattr(file, "ms1_df") and not file.ms1_df.is_empty():
-
-
-
-
-
-
+            # Special handling for RT=0 (library-derived features)
+            if rt == 0.0:
+                # Simple RT=0 processing: find max intensity across full m/z range
+                d_full = file.ms1_df.filter(
+                    (pl.col("mz") >= mz - mz_tol) & (pl.col("mz") <= mz + mz_tol)
+                )
+
+                if not d_full.is_empty():
+                    max_inty = d_full["inty"].max()
+                    if max_inty > 0:
+                        max_rt = d_full.filter(pl.col("inty") == max_inty)["rt"].min()
+
+                        # Use default rt_tol for RT=0 features
+                        eic_rt_tol = rt_tol
+
+                        # Filter around max RT
+                        d = d_full.filter(
+                            (pl.col("rt") >= max_rt - eic_rt_tol) &
+                            (pl.col("rt") <= max_rt + eic_rt_tol)
+                        )
+
+                        # Update consensus RT info
+                        rt = max_rt
+                        rt_start_mean = max_rt - eic_rt_tol
+                        rt_end_mean = max_rt + eic_rt_tol
+                    else:
+                        d = pl.DataFrame()
+                else:
+                    d = pl.DataFrame()
+            else:
+                # Normal RT-based filtering for non-zero RT
+                d = file.ms1_df.filter(
+                    (pl.col("mz") >= mz - mz_tol)
+                    & (pl.col("mz") <= mz + mz_tol)
+                    & (pl.col("rt") >= rt_start_mean - rt_tol)
+                    & (pl.col("rt") <= rt_end_mean + rt_tol),
+                )
         else:
             d = pl.DataFrame()
 
@@ -648,6 +1416,13 @@ def _process_sample_for_parallel_fill(
             )
             max_inty = 0.0
             area = 0.0
+            chrom_coherence = None
+            chrom_prominence = None
+            chrom_prominence_scaled = None
+            chrom_height_scaled = None
+            peak_rt_start = rt_start_mean
+            peak_rt_end = rt_end_mean
+            peak_rt_delta = rt_end_mean - rt_start_mean
         else:
             eic_rt = d.group_by("rt").agg(pl.col("inty").max()).sort("rt")
 
@@ -665,6 +1440,24 @@ def _process_sample_for_parallel_fill(
                 ).find_peaks()
                 max_inty = np.max(eic.inty)
                 area = eic.feature_area
+
+                # Extract chromatogram peak properties from first peak (if available)
+                if len(eic.peak_rts) > 0 and eic.feature_start is not None and eic.feature_end is not None:
+                    chrom_coherence = round(eic.feature_coherence, 3) if eic.feature_coherence is not None else None
+                    chrom_prominence = round(eic.peak_prominences[0], 3) if len(eic.peak_prominences) > 0 else None
+                    chrom_prominence_scaled = round(eic.peak_prominences[0] / (np.mean(eic.inty) + 1e-10), 3) if len(eic.peak_prominences) > 0 else None
+                    chrom_height_scaled = round(eic.peak_heights[0] / (np.mean(eic.inty) + 1e-10), 3) if len(eic.peak_heights) > 0 else None
+                    peak_rt_start = eic.feature_start
+                    peak_rt_end = eic.feature_end
+                    peak_rt_delta = peak_rt_end - peak_rt_start
+                else:
+                    chrom_coherence = None
+                    chrom_prominence = None
+                    chrom_prominence_scaled = None
+                    chrom_height_scaled = None
+                    peak_rt_start = rt_start_mean
+                    peak_rt_end = rt_end_mean
+                    peak_rt_delta = rt_end_mean - rt_start_mean
             else:
                 eic = Chromatogram(
                     eic_rt["rt"].to_numpy(),
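The block above derives scaled peak metrics from the first detected peak: prominence and height normalised by the mean EIC intensity, with a small epsilon guarding against division by zero. A standalone arithmetic sketch (synthetic arrays; peak_prominences/peak_heights stand in for the Chromatogram attributes):

    import numpy as np

    inty = np.array([100.0, 900.0, 150.0, 120.0])
    peak_prominences = np.array([780.0])
    peak_heights = np.array([900.0])

    mean_inty = np.mean(inty) + 1e-10  # epsilon avoids division by zero on flat traces
    prominence_scaled = round(float(peak_prominences[0] / mean_inty), 3)
    height_scaled = round(float(peak_heights[0] / mean_inty), 3)
    print(prominence_scaled, height_scaled)  # 2.457 2.835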
@@ -679,21 +1472,36 @@ def _process_sample_for_parallel_fill(
                 )
                 max_inty = 0.0
                 area = 0.0
+                chrom_coherence = None
+                chrom_prominence = None
+                chrom_prominence_scaled = None
+                chrom_height_scaled = None
+                peak_rt_start = rt_start_mean
+                peak_rt_end = rt_end_mean
+                peak_rt_delta = rt_end_mean - rt_start_mean
 
         # Generate feature UID (will be adjusted later to ensure global uniqueness)
         feature_uid = features_df_max_uid + len(new_features) + 1
 
-        #
+        # Handle rt_original: for RT=0 features, set to 0; otherwise estimate from closest feature
+        if rt == 0.0 or (hasattr(cons, 'get') and cons.get("rt") == 0.0):
+            estimated_rt_original = 0.0
+        else:
+            estimated_rt_original = _estimate_rt_original_for_filled_feature(
+                self, sample_uid, rt, logger=self.logger if hasattr(self, 'logger') else None
+            )
+
+        # Create new feature entry with updated chromatogram properties
         new_feature = {
             "sample_uid": sample_uid,
             "feature_uid": feature_uid,
             "feature_id": None,
             "mz": mz,
             "rt": rt,
-            "rt_original":
-            "rt_start":
-            "rt_end":
-            "rt_delta":
+            "rt_original": estimated_rt_original,
+            "rt_start": peak_rt_start,
+            "rt_end": peak_rt_end,
+            "rt_delta": peak_rt_delta,
             "mz_start": None,
             "mz_end": None,
             "inty": max_inty,
@@ -707,10 +1515,10 @@ def _process_sample_for_parallel_fill(
             "chrom": eic,
             "filled": True,
             "chrom_area": area,
-            "chrom_coherence":
-            "chrom_prominence":
-            "chrom_prominence_scaled":
-            "chrom_height_scaled":
+            "chrom_coherence": chrom_coherence,
+            "chrom_prominence": chrom_prominence,
+            "chrom_prominence_scaled": chrom_prominence_scaled,
+            "chrom_height_scaled": chrom_height_scaled,
             "ms2_scans": None,
             "ms2_specs": None,
         }
@@ -726,7 +1534,7 @@ def _process_sample_for_parallel_fill(
         counter += 1
 
     return new_features, new_mapping, counter
-
+'''
 
 def _fill_chrom_impl(
     self,
@@ -735,7 +1543,7 @@ def _fill_chrom_impl(
     rt_tol: float = 10.0,
     min_samples_rel: float = 0.0,
     min_samples_abs: int = 2,
-
+    threads=6,
 ):
     """Fill missing chromatograms by extracting from raw data using parallel processing.
 
@@ -745,13 +1553,13 @@ def _fill_chrom_impl(
         rt_tol: RT tolerance for extraction (default: 10.0 seconds)
         min_samples_rel: Relative minimum sample threshold (default: 0.0)
         min_samples_abs: Absolute minimum sample threshold (default: 2)
-
+        threads: Number of parallel threads (default: 6)
     """
     uids = self._get_consensus_uids(uids)
 
-    self.logger.info(f"Gap filling with {
+    self.logger.info(f"Gap filling with {threads} threads...")
     self.logger.debug(
-        f"Parameters: mz_tol={mz_tol}, rt_tol={rt_tol}, min_samples_rel={min_samples_rel}, min_samples_abs={min_samples_abs},
+        f"Parameters: mz_tol={mz_tol}, rt_tol={rt_tol}, min_samples_rel={min_samples_rel}, min_samples_abs={min_samples_abs}, threads={threads}",
     )
 
     # Apply minimum sample filters
@@ -793,6 +1601,7 @@ def _fill_chrom_impl(
             "sample_uid": pl.Int64,
             "sample_name": pl.Utf8,
             "sample_path": pl.Utf8,
+            "sample_source": pl.Utf8,
         },
         orient="row",
     )
@@ -830,12 +1639,13 @@ def _fill_chrom_impl(
                 "sample_name": row["sample_name"],
                 "sample_uid": row["sample_uid"],
                 "sample_path": row["sample_path"],
+                "sample_source": row["sample_source"],
             },
         )
 
     total_missing = len(missing_combinations_df)
     self.logger.debug(
-        f"Gap filling for {total_missing} missing features...",
+        f"Gap filling for {total_missing} missing features across {len(samples_to_process)} samples...",
     )
 
     # Calculate current max feature_uid to avoid conflicts
@@ -850,7 +1660,7 @@ def _fill_chrom_impl(
 
     tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
 
-    with concurrent.futures.ThreadPoolExecutor(max_workers=
+    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
         # Submit all samples for processing
         future_to_sample = {}
         for sample_info in samples_to_process:
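The executor above fans samples out to a bounded thread pool; the new threads argument caps max_workers. A minimal self-contained sketch of the submit/as_completed pattern (the worker is a stand-in, not the real one):

    import concurrent.futures

    def process_sample(sample_info):  # stand-in worker
        return sample_info["sample_uid"], []

    samples_to_process = [{"sample_uid": i} for i in range(8)]
    threads = 6

    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        future_to_sample = {
            executor.submit(process_sample, info): info for info in samples_to_process
        }
        for future in concurrent.futures.as_completed(future_to_sample):
            sample_uid, new_features = future.result()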
@@ -883,6 +1693,8 @@ def _fill_chrom_impl(
             for i, mapping in enumerate(new_mapping):
                 mapping["feature_uid"] = uid_offset + i + 1
 
+            # RT original estimation is now done inside parallel processing - PERFORMANCE OPTIMIZED!
+
             all_new_features.extend(new_features)
             all_new_mapping.extend(new_mapping)
             total_counter += counter
@@ -944,8 +1756,15 @@ def _fill_chrom_impl(
         how="diagonal",
     )
 
+    # Log statistics about rt_original estimation
+    if all_new_features:
+        estimated_count = sum(1 for feature in all_new_features if feature.get('rt_original') is not None)
+        none_count = sum(1 for feature in all_new_features if feature.get('rt_original') is None)
+        self.logger.debug(f"Features with estimated rt_original: {estimated_count}")
+        self.logger.debug(f"Features with None rt_original: {none_count}")
+
     self.logger.info(
-        f"Filled {total_counter} chromatograms from raw data
+        f"Filled {total_counter} chromatograms from raw data.",
     )
 
 
@@ -963,14 +1782,18 @@ def fill(self, **kwargs):
         rt_tol: RT tolerance for extraction (default: 10.0 seconds)
         min_samples_rel: Relative minimum sample threshold (default: 0.05)
         min_samples_abs: Absolute minimum sample threshold (default: 5)
-
+        threads: Number of parallel threads (default: 6)
     """
     # parameters initialization
     params = fill_defaults()
-
-
-
-
+
+    # Handle backward compatibility for old parameter names
+    if "workers" in kwargs:
+        kwargs["threads"] = kwargs.pop("workers")
+        self.logger.debug("Converted 'workers' parameter to 'threads' for backward compatibility")
+    if "num_workers" in kwargs:
+        kwargs["threads"] = kwargs.pop("num_workers")
+        self.logger.debug("Converted 'num_workers' parameter to 'threads' for backward compatibility")
 
     for key, value in kwargs.items():
         if isinstance(value, fill_defaults):
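The compatibility shim above renames deprecated keyword arguments before parameter validation. Shown standalone (a sketch of the pattern, not masster's actual function):

    def fill(**kwargs):
        for old in ("workers", "num_workers"):  # deprecated spellings
            if old in kwargs:
                kwargs["threads"] = kwargs.pop(old)
        return kwargs.get("threads", 6)

    assert fill(workers=4) == 4
    assert fill(num_workers=2) == 2
    assert fill() == 6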
@@ -984,7 +1807,7 @@ def fill(self, **kwargs):
                 self.logger.warning(
                     f"Failed to set parameter {key} = {value} (validation failed)",
                 )
-
+        else:
             self.logger.debug(f"Unknown parameter {key} ignored")
     # end of parameter initialization
 
@@ -1000,14 +1823,10 @@ def fill(self, **kwargs):
         rt_tol=params.get("rt_tol"),
         min_samples_rel=params.get("min_samples_rel"),
         min_samples_abs=params.get("min_samples_abs"),
-
+        threads=params.get("threads"),
     )
 
 
-# Backward compatibility alias
-fill_chrom = fill
-
-
 def _get_missing_consensus_sample_combinations(self, uids):
     """
     Efficiently identify which consensus_uid/sample combinations are missing.
@@ -1017,6 +1836,7 @@ def _get_missing_consensus_sample_combinations(self, uids):
     - Early termination for fully-filled studies
     - Efficient dictionary lookups instead of expensive DataFrame joins
     - Smart handling of sparse vs dense missing data patterns
+    - Special handling for consensus features with no mappings (e.g., library-derived RT=0 features)
     """
     if not uids:
         return []
@@ -1025,10 +1845,42 @@ def _get_missing_consensus_sample_combinations(self, uids):
     n_samples = len(self.samples_df)
     total_possible = n_consensus * n_samples
 
+    # Identify consensus features that have NO mappings at all (e.g., library-derived RT=0 features)
+    uids_set = set(uids)
+    mapped_consensus_uids = set(
+        self.consensus_mapping_df.filter(pl.col("consensus_uid").is_in(uids))["consensus_uid"].to_list()
+    )
+    unmapped_consensus_uids = uids_set - mapped_consensus_uids
+
+    # Get all sample info once for efficiency
+    all_samples = list(
+        self.samples_df.select(
+            ["sample_uid", "sample_name", "sample_path", "sample_source"],
+        ).iter_rows(),
+    )
+
+    missing_combinations = []
+
+    # For unmapped consensus features (e.g., RT=0), ALL samples are missing
+    if unmapped_consensus_uids:
+        self.logger.debug(f"Found {len(unmapped_consensus_uids)} consensus features with no mappings (e.g., RT=0 library features)")
+        for consensus_uid in unmapped_consensus_uids:
+            for sample_uid, sample_name, sample_path, sample_source in all_samples:
+                missing_combinations.append(
+                    (consensus_uid, sample_uid, sample_name, sample_path, sample_source)
+                )
+
+    # If all consensus features are unmapped, return early
+    if len(mapped_consensus_uids) == 0:
+        return missing_combinations
+
+    # Continue with existing logic for mapped consensus features
+    mapped_uids_list = list(mapped_consensus_uids)
+
     # Quick early termination check for fully/nearly filled studies
     # This handles the common case where fill() is run on an already-filled study
     consensus_counts = (
-        self.consensus_mapping_df.filter(pl.col("consensus_uid").is_in(
+        self.consensus_mapping_df.filter(pl.col("consensus_uid").is_in(mapped_uids_list))
         .group_by("consensus_uid")
         .agg(pl.count("feature_uid").alias("count"))
     )
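The new logic above first separates consensus features that never appear in the mapping table (e.g., RT=0 library entries); for those, every sample is by definition missing. A standalone sketch of the set arithmetic (toy data):

    import polars as pl

    uids = [1, 2, 3]
    consensus_mapping_df = pl.DataFrame(
        {"consensus_uid": [1, 1, 2], "feature_uid": [10, 11, 12]}
    )

    mapped = set(
        consensus_mapping_df.filter(pl.col("consensus_uid").is_in(uids))["consensus_uid"].to_list()
    )
    unmapped = set(uids) - mapped
    print(unmapped)  # {3}: gap-fill this feature in every sample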
@@ -1037,22 +1889,22 @@ def _get_missing_consensus_sample_combinations(self, uids):
         consensus_counts["count"].sum() if not consensus_counts.is_empty() else 0
     )
 
+    # Calculate total possible for mapped features only
+    mapped_total_possible = len(mapped_uids_list) * n_samples
+
     # If >95% filled, likely no gaps (common case)
-    if total_existing >=
+    if total_existing >= mapped_total_possible * 0.95:
         self.logger.debug(
-            f"Study appears {total_existing /
+            f"Study appears {total_existing / mapped_total_possible * 100:.1f}% filled, using sparse optimization",
         )
 
         # For sparse missing data, check each consensus feature individually
-        missing_combinations = []
-        uids_set = set(uids)
-
         # Build efficient lookups
         feature_to_sample = dict(
             self.features_df.select(["feature_uid", "sample_uid"]).iter_rows(),
         )
 
-        # Get existing combinations for target UIDs only
+        # Get existing combinations for target UIDs only (mapped features)
         existing_by_consensus = {}
         for consensus_uid, feature_uid in self.consensus_mapping_df.select(
             [
@@ -1060,25 +1912,18 @@ def _get_missing_consensus_sample_combinations(self, uids):
                 "feature_uid",
             ],
         ).iter_rows():
-            if consensus_uid in uids_set and feature_uid in feature_to_sample:
+            if consensus_uid in mapped_consensus_uids and feature_uid in feature_to_sample:
                 if consensus_uid not in existing_by_consensus:
                     existing_by_consensus[consensus_uid] = set()
                 existing_by_consensus[consensus_uid].add(feature_to_sample[feature_uid])
 
-        # Get all samples
-        all_samples = list(
-            self.samples_df.select(
-                ["sample_uid", "sample_name", "sample_path"],
-            ).iter_rows(),
-        )
-
-        # Check for missing combinations
-        for consensus_uid in uids:
+        # Check for missing combinations for mapped features
+        for consensus_uid in mapped_uids_list:
             existing_samples = existing_by_consensus.get(consensus_uid, set())
-            for sample_uid, sample_name, sample_path in all_samples:
+            for sample_uid, sample_name, sample_path, sample_source in all_samples:
                 if sample_uid not in existing_samples:
                     missing_combinations.append(
-                        (consensus_uid, sample_uid, sample_name, sample_path),
+                        (consensus_uid, sample_uid, sample_name, sample_path, sample_source),
                     )
 
         return missing_combinations
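
The sparse path builds a per-consensus set of sample_uids that already have a feature, then scans each requested consensus feature against the sample list. A self-contained sketch with plain-Python stand-ins (`setdefault` compresses the diff's explicit membership check; the data is illustrative):

```python
feature_to_sample = {"f1": "s1", "f2": "s2"}                  # feature_uid -> sample_uid
consensus_mapping = [("c1", "f1"), ("c1", "f2"), ("c2", "f1")]
all_samples = [("s1", "A", "/a", "src"), ("s2", "B", "/b", "src")]
mapped_uids_list = ["c1", "c2"]

existing_by_consensus = {}
for consensus_uid, feature_uid in consensus_mapping:
    if feature_uid in feature_to_sample:
        existing_by_consensus.setdefault(consensus_uid, set()).add(
            feature_to_sample[feature_uid]
        )

missing_combinations = [
    (c, s_uid, s_name, s_path, s_source)
    for c in mapped_uids_list
    for s_uid, s_name, s_path, s_source in all_samples
    if s_uid not in existing_by_consensus.get(c, set())
]
print(missing_combinations)  # [('c2', 's2', 'B', '/b', 'src')]
```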
@@ -1086,16 +1931,15 @@ def _get_missing_consensus_sample_combinations(self, uids):
     else:
         # For studies with many gaps, use bulk operations
         self.logger.debug(
-            f"Study {total_existing / total_possible * 100:.1f}% filled, using bulk optimization",
+            f"Study {total_existing / mapped_total_possible * 100:.1f}% filled, using bulk optimization",
         )
 
         # Build efficient lookups
-        uids_set = set(uids)
         feature_to_sample = dict(
             self.features_df.select(["feature_uid", "sample_uid"]).iter_rows(),
         )
 
-        # Build existing combinations set
+        # Build existing combinations set for mapped features only
         existing_combinations = {
             (consensus_uid, feature_to_sample[feature_uid])
             for consensus_uid, feature_uid in self.consensus_mapping_df.select(
@@ -1104,23 +1948,16 @@ def _get_missing_consensus_sample_combinations(self, uids):
                 "feature_uid",
             ],
         ).iter_rows()
-            if consensus_uid in uids_set and feature_uid in feature_to_sample
+            if consensus_uid in mapped_consensus_uids and feature_uid in feature_to_sample
         }
 
-        # Get all samples
-        all_samples = list(
-            self.samples_df.select(
-                ["sample_uid", "sample_name", "sample_path"],
-            ).iter_rows(),
-        )
-
-        # Generate all missing combinations
-        missing_combinations = [
-            (consensus_uid, sample_uid, sample_name, sample_path)
-            for consensus_uid in uids
-            for sample_uid, sample_name, sample_path in all_samples
-            if (consensus_uid, sample_uid) not in existing_combinations
-        ]
+        # Generate missing combinations for mapped features
+        for consensus_uid in mapped_uids_list:
+            for sample_uid, sample_name, sample_path, sample_source in all_samples:
+                if (consensus_uid, sample_uid) not in existing_combinations:
+                    missing_combinations.append(
+                        (consensus_uid, sample_uid, sample_name, sample_path, sample_source)
+                    )
 
     return missing_combinations
 
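
The bulk path trades memory for speed: one flat set of every existing (consensus_uid, sample_uid) pair, then a full cross-product scan against it. The diff appends in a loop; the comprehension below is equivalent. A self-contained sketch with illustrative stand-ins:

```python
feature_to_sample = {"f1": "s1"}                  # feature_uid -> sample_uid
consensus_mapping = [("c1", "f1")]
all_samples = [("s1", "A", "/a", "src"), ("s2", "B", "/b", "src")]
mapped_uids_list = ["c1"]
mapped_consensus_uids = set(mapped_uids_list)

existing_combinations = {
    (c, feature_to_sample[f])
    for c, f in consensus_mapping
    if c in mapped_consensus_uids and f in feature_to_sample
}
missing_combinations = [
    (c, s_uid, s_name, s_path, s_source)
    for c in mapped_uids_list
    for s_uid, s_name, s_path, s_source in all_samples
    if (c, s_uid) not in existing_combinations
]
print(missing_combinations)  # [('c1', 's2', 'B', '/b', 'src')]
```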
@@ -1218,7 +2055,7 @@ def _sanitize(self):
         except Exception as e:
             self.logger.error(f"Failed to recreate sanitized DataFrame: {e}")
 
-
+'''
 def _load_features(self):
     """
     Load features by reconstructing FeatureMaps from the processed features_df data.
@@ -1326,8 +2163,9 @@ def _load_features(self):
     self.logger.debug(
         f"Successfully reconstructed {len(self.features_maps)} FeatureMaps from features_df.",
     )
+'''
 
-
+'''
 def _load_features_from_xml(self):
     """
     Original load_features method that loads from .featureXML files.
@@ -1365,8 +2203,8 @@ def _load_features_from_xml(self):
         fh.load(filename, fm)
         self.features_maps.append(fm)
     self.logger.debug("Features loaded successfully.")
-
-
+'''
+'''
 def _load_consensusXML(self, filename="alignment.consensusXML"):
     """
     Load a consensus map from a file.
@@ -1378,15 +2216,14 @@ def _load_consensusXML(self, filename="alignment.consensusXML"):
     self.consensus_map = oms.ConsensusMap()
     fh.load(filename, self.consensus_map)
     self.logger.debug(f"Loaded consensus map from {filename}.")
-
+'''
 
 def _add_samples_batch(
     self,
     files,
     reset=False,
     adducts=None,
-    blacklist=None,
-    fast=True,
+    blacklist=None
 ):
     """
     Optimized batch addition of samples.
@@ -1396,7 +2233,6 @@ def _add_samples_batch(
         reset (bool): Whether to reset features before processing
         adducts: Adducts to use for sample loading
         blacklist (set): Set of filenames already processed
-        fast (bool): Whether to use optimized loading (skips ms1_df) or standard loading
 
     Performance optimizations:
     1. No per-sample color reset
@@ -1411,7 +2247,7 @@ def _add_samples_batch(
         blacklist = set()
 
     self.logger.debug(
-        f"Starting batch addition of {len(files)} samples",
+        f"Starting batch addition of {len(files)} samples...",
     )
 
     successful_additions = 0
@@ -1430,22 +2266,13 @@ def _add_samples_batch(
     ):
         try:
             # Choose between optimized and standard loading
-            if fast:
-                success = self._add_sample_optimized(
-                    file,
-                    reset=reset,
-                    adducts=adducts,
-                    skip_color_reset=True,  # Skip color reset during batch
-                    skip_schema_check=True,  # Skip schema enforcement
-                )
-            else:
-                success = self._add_sample_standard(
-                    file,
-                    reset=reset,
-                    adducts=adducts,
-                    skip_color_reset=True,  # Skip color reset during batch
-                    skip_schema_check=True,  # Skip schema enforcement
-                )
+            success = _add_sample_noms1(self,
+                file,
+                reset=reset,
+                adducts=adducts,
+                skip_color_reset=True,  # Skip color reset during batch
+                skip_schema_check=True,  # Skip schema enforcement
+            )
 
             if success:
                 # Add to blacklist for filename tracking
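
After this hunk there is a single loading path: every file goes through the module-level `_add_sample_noms1`, and the old `fast`/standard fork is gone. A condensed sketch of the resulting batch loop; the helper body and the exception handling are stand-ins, only the call shape and the deferred color pass come from this diff:

```python
def _add_sample_noms1(study, file, reset=False, adducts=None,
                      skip_color_reset=False, skip_schema_check=False):
    # Stand-in body: the real helper loads the sample without building ms1_df.
    return True

def add_samples_batch_sketch(study, files, reset=False, adducts=None, blacklist=None):
    blacklist = set() if blacklist is None else blacklist
    successful = 0
    for file in files:
        try:
            ok = _add_sample_noms1(study, file, reset=reset, adducts=adducts,
                                   skip_color_reset=True,   # defer coloring
                                   skip_schema_check=True)  # defer schema checks
            if ok:
                blacklist.add(file)  # filename tracking, as in the diff
                successful += 1
        except Exception:
            continue  # one bad file should not abort the batch
    study.set_samples_color()  # single color pass for the whole batch
    return successful
```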
@@ -1467,7 +2294,7 @@ def _add_samples_batch(
     # self._ensure_features_df_schema_order()
 
     # Color assignment done once for all samples
-    self._sample_color_reset_optimized()
+    self.set_samples_color()
 
     self.logger.debug(
         f"Add samples complete: {successful_additions} successful, {failed_additions} failed",
@@ -1476,7 +2303,7 @@ def _add_samples_batch(
     return successful_additions
 
 
-def _add_sample_optimized(
+def _add_sample_noms1(
     self,
     file,
     type=None,
@@ -1535,11 +2362,11 @@ def _add_sample_optimized(
         return False
 
     # Check if features map was created successfully
-    if ddaobj._oms_features_map is None:
-        self.logger.warning(f"Failed to add sample {file}: No features map created")
-        return False
+    #if ddaobj._oms_features_map is None:
+    #    self.logger.warning(f"Failed to add sample {file}: No features map created")
+    #    return False
 
-    self.features_maps.append(ddaobj._oms_features_map)
+    #self.features_maps.append(ddaobj._oms_features_map)
 
     # Determine sample type
     sample_type = "sample" if type is None else type
@@ -1647,7 +2474,7 @@ def _add_sample_optimized(
         )
         return True
 
-
+'''
 def _add_sample_standard(
     self,
     file,
@@ -1921,9 +2748,10 @@ def _add_sample_standard(
         )
         return True
 
-
-def _sample_color_reset_optimized(self):
+'''
+'''def _sample_color_reset_optimized(self):
     """
     Optimized version of sample color reset using set_samples_color.
     """
     return self.set_samples_color(by=None)
+'''
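
These last hunks retire the `_sample_color_reset_optimized` wrapper (now fenced off in a `'''` block, like the legacy loaders above) and call `set_samples_color()` directly, once per batch. A toy illustration of why one deferred pass beats recoloring after every addition; the helper below is a stand-in, not masster's implementation:

```python
def assign_colors(samples):
    # Stand-in for Study.set_samples_color(): one color per sample.
    for i, s in enumerate(samples):
        s["color"] = f"C{i % 10}"

# Per-sample recoloring (the old pattern): n passes over a growing list,
# O(n^2) total work.
samples = []
for name in ["a", "b", "c"]:
    samples.append({"name": name})
    assign_colors(samples)

# Deferred recoloring (this diff): one pass after the whole batch, O(n).
batch = [{"name": n} for n in ["a", "b", "c"]]
assign_colors(batch)
```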